From e41f81d435aae59a05005104a478c17eafbce9e8 Mon Sep 17 00:00:00 2001 From: "Philipp A." Date: Tue, 21 Apr 2026 12:10:21 +0200 Subject: [PATCH 1/3] style: pandas lint --- pyproject.toml | 1 + src/scanpy/_utils/__init__.py | 60 +++++++++---------- .../experimental/pp/_highly_variable_genes.py | 21 +++---- src/scanpy/external/exporting.py | 2 +- src/scanpy/external/pp/_hashsolo.py | 2 +- src/scanpy/get/get.py | 4 +- src/scanpy/plotting/_anndata.py | 8 +-- src/scanpy/plotting/_dotplot.py | 8 +-- src/scanpy/plotting/_stacked_violin.py | 21 ++----- src/scanpy/plotting/_tools/__init__.py | 8 +-- src/scanpy/plotting/_tools/paga.py | 23 +++---- src/scanpy/plotting/_tools/scatterplots.py | 4 +- src/scanpy/plotting/_utils.py | 2 +- src/scanpy/preprocessing/_combat.py | 13 ++-- .../preprocessing/_highly_variable_genes.py | 5 +- src/scanpy/preprocessing/_qc.py | 2 +- .../preprocessing/_scrublet/__init__.py | 2 +- src/scanpy/readwrite.py | 8 +-- src/scanpy/tools/_dendrogram.py | 2 +- src/scanpy/tools/_rank_genes_groups.py | 8 +-- src/scanpy/tools/_utils_clustering.py | 2 +- tests/test_get.py | 52 +++++++++++----- tests/test_highly_variable_genes.py | 16 ++--- tests/test_plotting.py | 2 +- tests/test_read_10x.py | 2 +- 25 files changed, 142 insertions(+), 136 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index b1e586eab7..fa6c66534e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -167,6 +167,7 @@ lint.select = [ "ICN", # Follow import conventions "ISC", # Implicit string concatenation "N", # Naming conventions + "PD", # Pandas "PERF", # Performance "PIE", # Syntax simplifications "PL", # Pylint diff --git a/src/scanpy/_utils/__init__.py b/src/scanpy/_utils/__init__.py index 519a732ad4..7bb457c8ae 100644 --- a/src/scanpy/_utils/__init__.py +++ b/src/scanpy/_utils/__init__.py @@ -357,11 +357,11 @@ def compute_association_matrix_of_groups( if "?" 
in pred_group: pred_group = str(ipred_group) # noqa: PLW2901 # starting from numpy version 1.13, subtractions of boolean arrays are deprecated - mask_pred = adata.obs[prediction].values == pred_group + mask_pred = adata.obs[prediction].to_numpy() == pred_group mask_pred_int = mask_pred.astype(np.int8) asso_matrix += [[]] for ref_group in adata.obs[reference].cat.categories: - mask_ref = (adata.obs[reference].values == ref_group).astype(np.int8) + mask_ref = (adata.obs[reference].to_numpy() == ref_group).astype(np.int8) mask_ref_or_pred = mask_ref.copy() mask_ref_or_pred[mask_pred] = 1 # e.g. if the pred group is contained in mask_ref, mask_ref and @@ -796,42 +796,38 @@ def select_groups( groups_masks_obs = adata.uns[f"{key}_masks"] else: groups_masks_obs = np.zeros( - (len(adata.obs[key].cat.categories), adata.obs[key].values.size), dtype=bool + (len(adata.obs[key].cat.categories), adata.obs[key].size), dtype=bool ) for iname, name in enumerate(adata.obs[key].cat.categories): # if the name is not found, fallback to index retrieval - if name in adata.obs[key].values: - mask_obs = name == adata.obs[key].values + if name in adata.obs[key].array: + mask_obs = name == adata.obs[key].to_numpy() else: - mask_obs = str(iname) == adata.obs[key].values + mask_obs = str(iname) == adata.obs[key].to_numpy() groups_masks_obs[iname] = mask_obs - groups_ids = list(range(len(groups_order))) - if groups_order_subset != "all": - groups_ids = [] - for name in groups_order_subset: - groups_ids.append( - np.where(adata.obs[key].cat.categories.values == name)[0][0] - ) - if len(groups_ids) == 0: - # fallback to index retrieval - groups_ids = np.where( - np.isin( - np.arange(len(adata.obs[key].cat.categories)).astype(str), - np.array(groups_order_subset), - ) - )[0] - if len(groups_ids) == 0: - logg.debug( - f"{np.array(groups_order_subset)} invalid! 
specify valid " - f"groups_order (or indices) from {adata.obs[key].cat.categories}", - ) - from sys import exit + if groups_order_subset == "all": + return groups_order.to_numpy(), groups_masks_obs - exit(0) - groups_masks_obs = groups_masks_obs[groups_ids] - groups_order_subset = adata.obs[key].cat.categories[groups_ids].values - else: - groups_order_subset = groups_order.values + groups_ids = [ + np.flatnonzero(adata.obs[key].cat.categories.array == name)[0] + for name in groups_order_subset + ] + if len(groups_ids) == 0: + # fallback to index retrieval + groups_ids = np.flatnonzero( + np.isin( + np.arange(len(adata.obs[key].cat.categories)).astype(str), + np.array(groups_order_subset), + ) + ) + if len(groups_ids) == 0: + msg = ( + f"{np.array(groups_order_subset)} invalid! specify valid " + f"groups_order (or indices) from {adata.obs[key].cat.categories}" + ) + raise RuntimeError(msg) + groups_masks_obs = groups_masks_obs[groups_ids] + groups_order_subset = adata.obs[key].cat.categories[groups_ids].to_numpy() return groups_order_subset, groups_masks_obs diff --git a/src/scanpy/experimental/pp/_highly_variable_genes.py b/src/scanpy/experimental/pp/_highly_variable_genes.py index 990b213eb0..eb8b582233 100644 --- a/src/scanpy/experimental/pp/_highly_variable_genes.py +++ b/src/scanpy/experimental/pp/_highly_variable_genes.py @@ -158,7 +158,7 @@ def _highly_variable_pearson_residuals( # noqa: PLR0912, PLR0915 if batch_key is None: batch_info = np.zeros(adata.shape[0], dtype=int) else: - batch_info = adata.obs[batch_key].values + batch_info = adata.obs[batch_key].to_numpy() n_batches = len(np.unique(batch_info)) # Get pearson residuals for each batch separately @@ -239,11 +239,10 @@ def _highly_variable_pearson_residuals( # noqa: PLR0912, PLR0915 # Sort genes by how often they selected as hvg within each batch and # break ties with median rank of residual variance across batches - df.sort_values( + df = df.sort_values( ["highly_variable_nbatches", 
"highly_variable_rank"], ascending=[False, True], na_position="last", - inplace=True, ) high_var = np.zeros(df.shape[0], dtype=bool) @@ -263,18 +262,16 @@ def _highly_variable_pearson_residuals( # noqa: PLR0912, PLR0915 " 'variances', float vector (adata.var)\n" " 'residual_variances', float vector (adata.var)" ) - adata.var["means"] = df["means"].values - adata.var["variances"] = df["variances"].values + adata.var["means"] = df["means"].array + adata.var["variances"] = df["variances"].array adata.var["residual_variances"] = df["residual_variances"] - adata.var["highly_variable_rank"] = df["highly_variable_rank"].values + adata.var["highly_variable_rank"] = df["highly_variable_rank"].array if batch_key is not None: - adata.var["highly_variable_nbatches"] = df[ - "highly_variable_nbatches" - ].values + adata.var["highly_variable_nbatches"] = df["highly_variable_nbatches"].array adata.var["highly_variable_intersection"] = df[ "highly_variable_intersection" - ].values - adata.var["highly_variable"] = df["highly_variable"].values + ].array + adata.var["highly_variable"] = df["highly_variable"].array if subset: adata._inplace_subset_var(df["highly_variable"].values) @@ -285,7 +282,7 @@ def _highly_variable_pearson_residuals( # noqa: PLR0912, PLR0915 ["highly_variable_nbatches", "highly_variable_intersection"], axis=1 ) if subset: - df = df.iloc[df.highly_variable.values, :] + df = df.iloc[df.highly_variable.array, :] return df diff --git a/src/scanpy/external/exporting.py b/src/scanpy/external/exporting.py index e936060caa..55d39eb2b2 100644 --- a/src/scanpy/external/exporting.py +++ b/src/scanpy/external/exporting.py @@ -394,7 +394,7 @@ def _export_paga_to_spring(adata, paga_coords, outpath) -> None: coords = [list(xy) for xy in paga_coords] sizes = list(adata.uns[f"{group_key}_sizes"]) - clus_labels = adata.obs[group_key].cat.codes.values + clus_labels = adata.obs[group_key].cat.codes.to_numpy() cell_groups = [ [int(j) for j in np.nonzero(clus_labels == i)[0]] for i 
in range(len(names)) ] diff --git a/src/scanpy/external/pp/_hashsolo.py b/src/scanpy/external/pp/_hashsolo.py index 210ffa05e8..5f81102b95 100644 --- a/src/scanpy/external/pp/_hashsolo.py +++ b/src/scanpy/external/pp/_hashsolo.py @@ -363,7 +363,7 @@ def hashsolo( "Please cite HashSolo paper:\nhttps://www.cell.com/cell-systems/fulltext/S2405-4712(20)30195-2" ) adata = adata.copy() if not inplace else adata - data = adata.obs[cell_hashing_columns].values + data = adata.obs[cell_hashing_columns].to_numpy() if not check_nonnegative_integers(data): msg = "Cell hashing counts must be non-negative" raise ValueError(msg) diff --git a/src/scanpy/get/get.py b/src/scanpy/get/get.py index fd5ead5ce6..86fda4a906 100644 --- a/src/scanpy/get/get.py +++ b/src/scanpy/get/get.py @@ -79,7 +79,7 @@ def rank_genes_groups_df( d = [pd.DataFrame(adata.uns[key][c])[group] for c in colnames] d = pd.concat(d, axis=1, names=[None, "group"], keys=colnames) - d = d.stack(level=1, future_stack=True).reset_index() + d = d.stack(level=1, future_stack=True).reset_index() # noqa: PD013 d["group"] = pd.Categorical(d["group"], categories=group) d = d.sort_values(["group", "level_0"]).drop(columns="level_0") @@ -106,7 +106,7 @@ def rank_genes_groups_df( # remove group column for backward compat if len(group) == 1 if len(group) == 1: - d.drop(columns="group", inplace=True) + del d["group"] return d.reset_index(drop=True) diff --git a/src/scanpy/plotting/_anndata.py b/src/scanpy/plotting/_anndata.py index b187891b31..e61f1ed8f8 100755 --- a/src/scanpy/plotting/_anndata.py +++ b/src/scanpy/plotting/_anndata.py @@ -460,9 +460,9 @@ def add_centroid(centroids, name, xy, mask) -> None: ) raise ValueError(msg) else: - iname = np.flatnonzero( - adata.obs[key].cat.categories.values == name - )[0] + iname = np.flatnonzero(adata.obs[key].cat.categories.array == name)[ + 0 + ] mask = scatter_group( axs[ikey], key, @@ -1992,7 +1992,7 @@ def _prepare_dataframe( # noqa: PLR0912 if groupby_index is not None: # reset 
index to treat all columns the same way. - obs_tidy.reset_index(inplace=True) + obs_tidy = obs_tidy.reset_index() groupby.append(groupby_index) if groupby is None: diff --git a/src/scanpy/plotting/_dotplot.py b/src/scanpy/plotting/_dotplot.py index 1553d214ad..1ba66aea4e 100644 --- a/src/scanpy/plotting/_dotplot.py +++ b/src/scanpy/plotting/_dotplot.py @@ -608,8 +608,8 @@ def _plot_stacked_colorbars(self, fig, colorbar_area_spec, normalize): ) # Create a dedicated normalizer for the legend - vmin = self.dot_color_df.values.min() - vmax = self.dot_color_df.values.max() + vmin = self.dot_color_df.to_numpy().min() + vmax = self.dot_color_df.to_numpy().max() legend_norm = mpl.colors.Normalize(vmin=vmin, vmax=vmax) for i, group_name in enumerate(groups_to_plot): @@ -799,8 +799,8 @@ def _dotplot( # noqa: PLR0912, PLR0913, PLR0915 y, x = np.indices(dot_color.shape) y = y.flatten() + 0.5 x = x.flatten() + 0.5 - frac = dot_size.values.flatten() - mean_flat = dot_color.values.flatten() + frac = dot_size.to_numpy().flatten() + mean_flat = dot_color.to_numpy().flatten() cmap = colormaps.get_cmap(cmap) if dot_max is None: dot_max = np.ceil(max(frac) * 10) / 10 diff --git a/src/scanpy/plotting/_stacked_violin.py b/src/scanpy/plotting/_stacked_violin.py index 329c7b8f16..a6f2ee6b59 100644 --- a/src/scanpy/plotting/_stacked_violin.py +++ b/src/scanpy/plotting/_stacked_violin.py @@ -3,7 +3,6 @@ from typing import TYPE_CHECKING import numpy as np -import pandas as pd from matplotlib import colormaps from matplotlib.colors import is_color_like @@ -25,6 +24,7 @@ from collections.abc import Mapping, Sequence from typing import Literal, Self + import pandas as pd from anndata import AnnData from matplotlib.axes import Axes from matplotlib.colors import Colormap, Normalize @@ -440,7 +440,7 @@ def _mainplot(self, ax: Axes): def _make_rows_of_violinplots( self, ax, - _matrix, + _matrix: pd.DataFrame, colormap_array, _color_df, x_spacer_size: float, @@ -466,18 +466,9 @@ def 
_make_rows_of_violinplots( # the expression value # This format is convenient to aggregate per gene or per category # while making the violin plots. - df = ( - pd - .DataFrame(_matrix.stack(future_stack=True)) - .reset_index() - .rename( - columns={ - "level_1": "genes", - _matrix.index.name: "categories", - 0: "values", - } - ) - ) + df = _matrix.melt( + var_name="genes", value_name="values", ignore_index=False + ).reset_index(names="categories") df["genes"] = ( df["genes"].astype("category").cat.reorder_categories(_matrix.columns) ) @@ -514,7 +505,7 @@ def _make_rows_of_violinplots( if not self.are_axes_swapped: x = "genes" - _df = df[df.categories == row_label] + _df = df[df["categories"] == row_label] else: x = "categories" # because of the renamed matrix columns here diff --git a/src/scanpy/plotting/_tools/__init__.py b/src/scanpy/plotting/_tools/__init__.py index 3ceba630f8..3a13625678 100644 --- a/src/scanpy/plotting/_tools/__init__.py +++ b/src/scanpy/plotting/_tools/__init__.py @@ -266,7 +266,7 @@ def dpt_timeseries( if as_heatmap: # plot time series as heatmap, as in Haghverdi et al. (2016), Fig. 
1d timeseries_as_heatmap( - adata.X[adata.obs["dpt_order_indices"].values], + adata.X[adata.obs["dpt_order_indices"].to_numpy()], var_names=adata.var_names, highlights_x=adata.uns["dpt_changepoints"], color_map=color_map, @@ -274,7 +274,7 @@ def dpt_timeseries( else: # plot time series as gene expression vs time timeseries( - adata.X[adata.obs["dpt_order_indices"].values], + adata.X[adata.obs["dpt_order_indices"].to_numpy()], var_names=adata.var_names, highlights_x=adata.uns["dpt_changepoints"], xlim=[0, 1.3 * adata.X.shape[0]], @@ -587,7 +587,7 @@ def _rank_genes_groups_plot( # noqa: PLR0912, PLR0913, PLR0915 if gene_symbols is not None: df["names"] = df[gene_symbols] - genes_list = df.names[df.names.notnull()].tolist() + genes_list = df.names[df.names.notna()].tolist() if len(genes_list) == 0: logg.warning(f"No genes found for group {group}") @@ -1740,7 +1740,7 @@ def _get_values_to_plot( column = values_to_plot.replace("log10_", "") else: column = values_to_plot - values_df = pd.pivot( + values_df = pd.pivot_table( values_df, index="names", columns="group", values=column ).fillna(1) diff --git a/src/scanpy/plotting/_tools/paga.py b/src/scanpy/plotting/_tools/paga.py index 0fb077fbd8..ed50ec6147 100644 --- a/src/scanpy/plotting/_tools/paga.py +++ b/src/scanpy/plotting/_tools/paga.py @@ -782,7 +782,7 @@ def _paga_graph( # noqa: PLR0912, PLR0913, PLR0915 from io import StringIO df = pd.read_csv(StringIO(s), header=-1) - pos_array = df[[4, 5]].values + pos_array = df[[4, 5]].to_numpy() # convert to dictionary pos = {n: [p[0], p[1]] for n, p in enumerate(pos_array)} @@ -809,7 +809,7 @@ def _paga_graph( # noqa: PLR0912, PLR0913, PLR0915 x_color = [] cats = adata.obs[groups_key].cat.categories for cat in cats: - subset = (cat == adata.obs[groups_key]).values + subset = (cat == adata.obs[groups_key]).array if adata.raw is not None and use_raw: adata_gene = adata.raw[:, colors] else: @@ -826,7 +826,7 @@ def _paga_graph( # noqa: PLR0912, PLR0913, PLR0915 x_color = [] cats 
= adata.obs[groups_key].cat.categories for cat in cats: - subset = (cat == adata.obs[groups_key]).values + subset = (cat == adata.obs[groups_key]).array x_color.append(adata.obs.loc[subset, colors].mean()) colors = x_color @@ -1200,7 +1200,7 @@ def moving_average(a): x = [] for igroup, group in enumerate(nodes_ints): idcs = np.arange(adata.n_obs)[ - adata.obs[groups_key].values == nodes_strs[igroup] + adata.obs[groups_key].array == nodes_strs[igroup] ] if len(idcs) == 0: msg = ( @@ -1211,14 +1211,15 @@ def moving_average(a): ) raise ValueError(msg) idcs_group = np.argsort( - adata.obs["dpt_pseudotime"].values[ - adata.obs[groups_key].values == nodes_strs[igroup] - ] + adata + .obs["dpt_pseudotime"] + .iloc[adata.obs[groups_key].array == nodes_strs[igroup]] + .to_numpy() ) idcs = idcs[idcs_group] - values = (adata.obs[key].values if key in adata.obs else adata_x[:, key].X)[ - idcs - ] + values = ( + adata.obs[key].to_numpy() if key in adata.obs else adata_x[:, key].X + )[idcs] x += (values.toarray() if isinstance(values, CSBase) else values).tolist() if ikey == 0: groups += [group] * len(idcs) @@ -1227,7 +1228,7 @@ def moving_average(a): series = adata.obs[anno] if isinstance(series.dtype, CategoricalDtype): series = series.cat.codes - anno_dict[anno] += list(series.values[idcs]) + anno_dict[anno] += series.iloc[idcs].to_list() if n_avg > 1: x = moving_average(x) if ikey == 0: diff --git a/src/scanpy/plotting/_tools/scatterplots.py b/src/scanpy/plotting/_tools/scatterplots.py index 840c13a3eb..85acff45fd 100644 --- a/src/scanpy/plotting/_tools/scatterplots.py +++ b/src/scanpy/plotting/_tools/scatterplots.py @@ -293,7 +293,7 @@ def embedding( # noqa: PLR0912, PLR0913, PLR0915 order = np.argsort(-color_vector, kind="stable")[::-1] elif sort_order and color_type == "cat": # Null points go on bottom - order = np.argsort(~pd.isnull(color_source_vector), kind="stable") + order = np.argsort(~pd.isna(color_source_vector), kind="stable") # Set orders — use a local to avoid 
cumulative reordering across # subplots when multiple color keys are given. _size = np.array(size)[order] if isinstance(size, np.ndarray) else size @@ -1130,7 +1130,7 @@ def _add_categorical_legend( # noqa: PLR0913 scatter_array=None, ): """Add a legend to the passed Axes.""" - if na_in_legend and pd.isnull(color_source_vector).any(): + if na_in_legend and pd.isna(color_source_vector).any(): if "NA" in color_source_vector: msg = "No fallback for null labels has been defined if NA already in categories." raise NotImplementedError(msg) diff --git a/src/scanpy/plotting/_utils.py b/src/scanpy/plotting/_utils.py index 47d054ad54..70a8adcf84 100644 --- a/src/scanpy/plotting/_utils.py +++ b/src/scanpy/plotting/_utils.py @@ -630,7 +630,7 @@ def scatter_group( marker: MarkerType = ".", ): """Scatter of group using representation of data Y.""" - mask_obs = adata.obs[key].cat.categories[cat_code] == adata.obs[key].values + mask_obs = adata.obs[key].cat.categories[cat_code] == adata.obs[key].array color = adata.uns[f"{key}_colors"][cat_code] if not isinstance(color[0], str): from matplotlib.colors import rgb2hex diff --git a/src/scanpy/preprocessing/_combat.py b/src/scanpy/preprocessing/_combat.py index 7aa38ccc86..4600074cfd 100644 --- a/src/scanpy/preprocessing/_combat.py +++ b/src/scanpy/preprocessing/_combat.py @@ -43,10 +43,10 @@ def _design_matrix( return_type="dataframe", ) model = model.drop([batch_key], axis=1) - numerical_covariates = model.select_dtypes("number").columns.values + numerical_covariates = model.select_dtypes("number").columns.array logg.info(f"Found {design.shape[1]} batches\n") - other_cols = [c for c in model.columns.values if c not in numerical_covariates] + other_cols = [c for c in model.columns.array if c not in numerical_covariates] if other_cols: col_repr = " + ".join(f"Q('{x}')" for x in other_cols) @@ -230,7 +230,7 @@ def combat( # noqa: PLR0915 # first estimate of the additive batch effect gamma_hat = ( la.inv(batch_design.T @ batch_design) @ 
batch_design.T @ s_data.T - ).values + ).to_numpy() # first estimate for the multiplicative batch effect delta_hat = [ s_data.iloc[:, batch_idxs].var(axis=1) for batch_idxs in batch_info.values() @@ -288,10 +288,11 @@ def combat( # noqa: PLR0915 bayesdata = bayesdata * np.dot(vpsq, np.ones((1, int(n_array)))) + stand_mean # put back into the adata object or return + x = bayesdata.to_numpy().transpose() if inplace: - adata.X = bayesdata.values.transpose() - else: - return bayesdata.values.transpose() + adata.X = x + return None + return x def _it_sol( diff --git a/src/scanpy/preprocessing/_highly_variable_genes.py b/src/scanpy/preprocessing/_highly_variable_genes.py index e0b1f0ec08..21fc5da45a 100644 --- a/src/scanpy/preprocessing/_highly_variable_genes.py +++ b/src/scanpy/preprocessing/_highly_variable_genes.py @@ -452,7 +452,7 @@ def _postprocess_dispersions_seurat( # retrieve those genes that have nan std, these are the ones where # only a single gene fell in the bin and implicitly set them to have # a normalized disperion of 1 - one_gene_per_bin = disp_bin_stats["dev"].isnull() + one_gene_per_bin = disp_bin_stats["dev"].isna() gen_indices = np.flatnonzero(one_gene_per_bin.loc[mean_bin]) if len(gen_indices) == 0: return @@ -577,11 +577,10 @@ def _highly_variable_genes_batched( # break ties with normalized dispersion across batches df_orig_ind = adata.var.index.copy() - df.sort_values( + df = df.sort_values( ["highly_variable_nbatches", "dispersions_norm"], ascending=False, na_position="last", - inplace=True, ) df["highly_variable"] = np.arange(df.shape[0]) < cutoff df = df.loc[df_orig_ind] diff --git a/src/scanpy/preprocessing/_qc.py b/src/scanpy/preprocessing/_qc.py index 17705eead0..04340ec7e9 100644 --- a/src/scanpy/preprocessing/_qc.py +++ b/src/scanpy/preprocessing/_qc.py @@ -111,7 +111,7 @@ def describe_obs( # noqa: PLR0913 ) for qc_var in qc_vars: obs_metrics[f"total_{expr_type}_{qc_var}"] = stats.sum( - x[:, adata.var[qc_var].values], axis=1 + x[:, 
adata.var[qc_var].to_numpy()], axis=1 ) if log1p: obs_metrics[f"log1p_total_{expr_type}_{qc_var}"] = np.log1p( diff --git a/src/scanpy/preprocessing/_scrublet/__init__.py b/src/scanpy/preprocessing/_scrublet/__init__.py index 2fce77cc35..6aae43c608 100644 --- a/src/scanpy/preprocessing/_scrublet/__init__.py +++ b/src/scanpy/preprocessing/_scrublet/__init__.py @@ -267,7 +267,7 @@ def _run_scrublet( ) # Now reset the obs to get the scrublet scores - adata.obs = scrubbed_obs.loc[adata.obs_names.values] + adata.obs = scrubbed_obs.loc[adata.obs_names.array] # Save the .uns from each batch separately adata.uns["scrublet"] = {} diff --git a/src/scanpy/readwrite.py b/src/scanpy/readwrite.py index d01520f404..adf4fc80b6 100644 --- a/src/scanpy/readwrite.py +++ b/src/scanpy/readwrite.py @@ -223,7 +223,7 @@ def read_10x_h5( ) adata = _read_10x_h5(path, _read_v3_10x_h5) if genome: - if genome not in adata.var["genome"].values: + if genome not in adata.var["genome"].array: msg = ( f"Could not find data corresponding to genome {genome!r} in {path}. " f"Available genomes are: {list(adata.var['genome'].unique())}." 
@@ -506,10 +506,8 @@ def read_visium( adata.obsm["spatial"] = adata.obs[ ["pxl_row_in_fullres", "pxl_col_in_fullres"] ].to_numpy() - adata.obs.drop( - columns=["pxl_row_in_fullres", "pxl_col_in_fullres"], - inplace=True, - ) + del adata.obs["pxl_row_in_fullres"] + del adata.obs["pxl_col_in_fullres"] # put image path in uns if source_image_path is not None: diff --git a/src/scanpy/tools/_dendrogram.py b/src/scanpy/tools/_dendrogram.py index 80882cf3fb..a130ecc791 100644 --- a/src/scanpy/tools/_dendrogram.py +++ b/src/scanpy/tools/_dendrogram.py @@ -136,7 +136,7 @@ def dendrogram( # noqa: PLR0913 ).astype("category") categorical.name = "_".join(groupby) - rep_df.set_index(categorical, inplace=True) + rep_df.index = categorical categories: pd.Index = rep_df.index.categories else: gene_names = adata.raw.var_names if use_raw else adata.var_names diff --git a/src/scanpy/tools/_rank_genes_groups.py b/src/scanpy/tools/_rank_genes_groups.py index 02d2e3ebad..d1eec5d4a0 100644 --- a/src/scanpy/tools/_rank_genes_groups.py +++ b/src/scanpy/tools/_rank_genes_groups.py @@ -396,7 +396,7 @@ def logreg( from sklearn.linear_model import LogisticRegression # Indexing with a series causes issues, possibly segfault - x = self.X[self.grouping_mask.values, :] + x = self.X[self.grouping_mask.to_numpy(), :] if len(self.groups_order) == 1: msg = "Cannot perform logistic regression on a single cluster." 
@@ -882,7 +882,7 @@ def filter_rank_genes_groups( # noqa: PLR0912 for cluster in gene_names.columns: # iterate per column - var_names = gene_names[cluster].values + var_names = gene_names[cluster].array if not use_logfolds or not use_fraction: var_idx = (adata.raw if use_raw else adata).var_names.get_indexer(var_names) @@ -893,10 +893,10 @@ def filter_rank_genes_groups( # noqa: PLR0912 if use_fraction: fraction_in_cluster_matrix.loc[:, cluster] = ( - adata.uns[key]["pts"][cluster].loc[var_names].values + adata.uns[key]["pts"][cluster].loc[var_names].array ) fraction_out_cluster_matrix.loc[:, cluster] = ( - adata.uns[key]["pts_rest"][cluster].loc[var_names].values + adata.uns[key]["pts_rest"][cluster].loc[var_names].array ) else: fraction_in_cluster_matrix.loc[:, cluster] = _calc_frac(x_in) diff --git a/src/scanpy/tools/_utils_clustering.py b/src/scanpy/tools/_utils_clustering.py index eb6dc2ecb9..f5e82e72d5 100644 --- a/src/scanpy/tools/_utils_clustering.py +++ b/src/scanpy/tools/_utils_clustering.py @@ -44,7 +44,7 @@ def restrict_adjacency( if c not in adata.obs[restrict_key].cat.categories: msg = f"{c!r} is not a valid category for {restrict_key!r}" raise ValueError(msg) - restrict_indices = adata.obs[restrict_key].isin(restrict_categories).values + restrict_indices = adata.obs[restrict_key].isin(restrict_categories).to_numpy() adjacency = adjacency[restrict_indices, :] adjacency = adjacency[:, restrict_indices] return adjacency, restrict_indices diff --git a/tests/test_get.py b/tests/test_get.py index 0f8d28c22f..814a760912 100644 --- a/tests/test_get.py +++ b/tests/test_get.py @@ -478,7 +478,8 @@ def test_shared_key_errors(shared_key_adata): ############################## -def test_rank_genes_groups_df(): +@pytest.fixture(scope="module") +def adata_rgg_module() -> AnnData: a = np.zeros((20, 3)) a[:10, 0] = 5 adata = AnnData( @@ -490,32 +491,51 @@ def test_rank_genes_groups_df(): var=pd.DataFrame(index=[f"gene{i}" for i in range(a.shape[1])]), ) 
sc.tl.rank_genes_groups(adata, groupby="celltype", method="wilcoxon", pts=True) - dedf = sc.get.rank_genes_groups_df(adata, "a") + return adata + + +@pytest.fixture +def adata_rgg(adata_rgg_module: AnnData) -> AnnData: + return adata_rgg_module.copy() + + +def test_rank_genes_groups_df(adata_rgg: AnnData): + dedf = sc.get.rank_genes_groups_df(adata_rgg, "a") assert dedf["pvals"].value_counts()[1.0] == 2 - assert sc.get.rank_genes_groups_df(adata, "a", log2fc_max=0.1).shape[0] == 2 - assert sc.get.rank_genes_groups_df(adata, "a", log2fc_min=0.1).shape[0] == 1 - assert sc.get.rank_genes_groups_df(adata, "a", pval_cutoff=0.9).shape[0] == 1 - del adata.uns["rank_genes_groups"] + assert sc.get.rank_genes_groups_df(adata_rgg, "a", log2fc_max=0.1).shape[0] == 2 + assert sc.get.rank_genes_groups_df(adata_rgg, "a", log2fc_min=0.1).shape[0] == 1 + assert sc.get.rank_genes_groups_df(adata_rgg, "a", pval_cutoff=0.9).shape[0] == 1 + + +def test_rank_genes_groups_df_error(adata_rgg: AnnData): + with pytest.raises(KeyError): + sc.get.rank_genes_groups_df(adata_rgg, "missing") + + +def test_rank_genes_groups_df_explicit_key(adata_rgg: AnnData): + dedf = sc.get.rank_genes_groups_df(adata_rgg, "a") + del adata_rgg.uns["rank_genes_groups"] sc.tl.rank_genes_groups( - adata, + adata_rgg, groupby="celltype", method="wilcoxon", key_added="different_key", pts=True, ) - with pytest.raises(KeyError): - sc.get.rank_genes_groups_df(adata, "a") - dedf2 = sc.get.rank_genes_groups_df(adata, "a", key="different_key") + dedf2 = sc.get.rank_genes_groups_df(adata_rgg, "a", key="different_key") + pd.testing.assert_frame_equal(dedf, dedf2) assert "pct_nz_group" in dedf2.columns assert "pct_nz_reference" in dedf2.columns - # get all groups - dedf3 = sc.get.rank_genes_groups_df(adata, group=None, key="different_key") - assert "a" in dedf3["group"].unique() - assert "b" in dedf3["group"].unique() - adata.var_names.name = "pr1388" - sc.get.rank_genes_groups_df(adata, group=None, key="different_key") + 
+@pytest.mark.parametrize("index_name", [None, "pr1388"]) +def test_rank_genes_groups_df_all_groups(adata_rgg: AnnData, index_name: str | None): + if index_name is not None: + adata_rgg.var_names.name = index_name + dedf = sc.get.rank_genes_groups_df(adata_rgg, group=None) + assert "a" in dedf["group"].unique() + assert "b" in dedf["group"].unique() ###################### diff --git a/tests/test_highly_variable_genes.py b/tests/test_highly_variable_genes.py index f021ee8584..6ee9893413 100644 --- a/tests/test_highly_variable_genes.py +++ b/tests/test_highly_variable_genes.py @@ -510,7 +510,7 @@ def test_seurat_v3_degenerate() -> None: sc.pp.highly_variable_genes(adata, flavor="seurat_v3") -def test_batches(): +def test_batches() -> None: adata = pbmc68k_reduced() adata.X[:100, :100] = np.zeros((100, 100)) @@ -537,20 +537,22 @@ def test_batches(): assert hvg2 is not None np.testing.assert_allclose( - adata.var["dispersions_norm"].iat[100], - 0.5 * hvg1["dispersions_norm"].iat[0] + 0.5 * hvg2["dispersions_norm"].iat[100], + adata.var["dispersions_norm"].iloc[100], + 0.5 * hvg1["dispersions_norm"].iloc[0] + + 0.5 * hvg2["dispersions_norm"].iloc[100], rtol=1.0e-7, atol=1.0e-7, ) np.testing.assert_allclose( - adata.var["dispersions_norm"].iat[101], - 0.5 * hvg1["dispersions_norm"].iat[1] + 0.5 * hvg2["dispersions_norm"].iat[101], + adata.var["dispersions_norm"].iloc[101], + 0.5 * hvg1["dispersions_norm"].iloc[1] + + 0.5 * hvg2["dispersions_norm"].iloc[101], rtol=1.0e-7, atol=1.0e-7, ) np.testing.assert_allclose( - adata.var["dispersions_norm"].iat[0], - 0.5 * hvg2["dispersions_norm"].iat[0], + adata.var["dispersions_norm"].iloc[0], + 0.5 * hvg2["dispersions_norm"].iloc[0], rtol=1.0e-7, atol=1.0e-7, ) diff --git a/tests/test_plotting.py b/tests/test_plotting.py index af35236c6a..ad3bc849b3 100644 --- a/tests/test_plotting.py +++ b/tests/test_plotting.py @@ -895,7 +895,7 @@ def test_rank_genes_groups( def test_rank_genes_group_axes(image_comparer): - fn = next(p.values[0] 
for p in _RANK_GENES_GROUPS_PARAMS if p.id == "basic") + fn = next(p.values[0] for p in _RANK_GENES_GROUPS_PARAMS if p.id == "basic") # noqa: PD011 save_and_compare_images = partial(image_comparer, ROOT, tol=23) diff --git a/tests/test_read_10x.py b/tests/test_read_10x.py index 0358894991..6055e06cdc 100644 --- a/tests/test_read_10x.py +++ b/tests/test_read_10x.py @@ -62,7 +62,7 @@ def test_read_10x( # Drop genome column for comparing v3 if "3.0.0" in str(h5_path): - h5.var.drop(columns="genome", inplace=True) + del h5.var["genome"] # Verify CSR format (not CSC from transpose) assert isinstance(mtx.X, CSRBase), f"Expected CSR matrix, got {type(mtx.X)}" From 85dc01f29241487dd142c429d5081099d830c3bc Mon Sep 17 00:00:00 2001 From: "Philipp A." Date: Tue, 21 Apr 2026 12:40:52 +0200 Subject: [PATCH 2/3] fix subset --- src/scanpy/experimental/pp/_highly_variable_genes.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/scanpy/experimental/pp/_highly_variable_genes.py b/src/scanpy/experimental/pp/_highly_variable_genes.py index eb8b582233..ac829ed8ff 100644 --- a/src/scanpy/experimental/pp/_highly_variable_genes.py +++ b/src/scanpy/experimental/pp/_highly_variable_genes.py @@ -264,7 +264,7 @@ def _highly_variable_pearson_residuals( # noqa: PLR0912, PLR0915 ) adata.var["means"] = df["means"].array adata.var["variances"] = df["variances"].array - adata.var["residual_variances"] = df["residual_variances"] + adata.var["residual_variances"] = df["residual_variances"].array adata.var["highly_variable_rank"] = df["highly_variable_rank"].array if batch_key is not None: adata.var["highly_variable_nbatches"] = df["highly_variable_nbatches"].array @@ -274,7 +274,7 @@ def _highly_variable_pearson_residuals( # noqa: PLR0912, PLR0915 adata.var["highly_variable"] = df["highly_variable"].array if subset: - adata._inplace_subset_var(df["highly_variable"].values) + adata._inplace_subset_var(df["highly_variable"].to_numpy()) else: if batch_key is None: @@ -282,7 
+282,7 @@ def _highly_variable_pearson_residuals( # noqa: PLR0912, PLR0915 ["highly_variable_nbatches", "highly_variable_intersection"], axis=1 ) if subset: - df = df.iloc[df.highly_variable.array, :] + df = df.iloc[df["highly_variable"].to_numpy(), :] return df From be10189a254bee02f773354d5da00dc9015b2321 Mon Sep 17 00:00:00 2001 From: "Philipp A." Date: Tue, 21 Apr 2026 13:09:23 +0200 Subject: [PATCH 3/3] cleanup --- src/scanpy/plotting/_anndata.py | 4 +--- src/scanpy/plotting/_tools/paga.py | 16 +++++----------- src/scanpy/plotting/_utils.py | 2 +- 3 files changed, 7 insertions(+), 15 deletions(-) diff --git a/src/scanpy/plotting/_anndata.py b/src/scanpy/plotting/_anndata.py index e61f1ed8f8..5d36075a57 100755 --- a/src/scanpy/plotting/_anndata.py +++ b/src/scanpy/plotting/_anndata.py @@ -460,9 +460,7 @@ def add_centroid(centroids, name, xy, mask) -> None: ) raise ValueError(msg) else: - iname = np.flatnonzero(adata.obs[key].cat.categories.array == name)[ - 0 - ] + iname = np.flatnonzero(adata.obs[key].cat.categories == name)[0] mask = scatter_group( axs[ikey], key, diff --git a/src/scanpy/plotting/_tools/paga.py b/src/scanpy/plotting/_tools/paga.py index ed50ec6147..3403487a15 100644 --- a/src/scanpy/plotting/_tools/paga.py +++ b/src/scanpy/plotting/_tools/paga.py @@ -809,7 +809,7 @@ def _paga_graph( # noqa: PLR0912, PLR0913, PLR0915 x_color = [] cats = adata.obs[groups_key].cat.categories for cat in cats: - subset = (cat == adata.obs[groups_key]).array + subset = (cat == adata.obs[groups_key]).to_numpy() if adata.raw is not None and use_raw: adata_gene = adata.raw[:, colors] else: @@ -826,7 +826,7 @@ def _paga_graph( # noqa: PLR0912, PLR0913, PLR0915 x_color = [] cats = adata.obs[groups_key].cat.categories for cat in cats: - subset = (cat == adata.obs[groups_key]).array + subset = (cat == adata.obs[groups_key]).to_numpy() x_color.append(adata.obs.loc[subset, colors].mean()) colors = x_color @@ -1199,9 +1199,8 @@ def moving_average(a): for ikey, key in 
enumerate(keys): x = [] for igroup, group in enumerate(nodes_ints): - idcs = np.arange(adata.n_obs)[ - adata.obs[groups_key].array == nodes_strs[igroup] - ] + mask = (adata.obs[groups_key] == nodes_strs[igroup]).to_numpy() + idcs = np.flatnonzero(mask) if len(idcs) == 0: msg = ( "Did not find data points that match " @@ -1210,12 +1209,7 @@ def moving_average(a): "actually contains what you expect." ) raise ValueError(msg) - idcs_group = np.argsort( - adata - .obs["dpt_pseudotime"] - .iloc[adata.obs[groups_key].array == nodes_strs[igroup]] - .to_numpy() - ) + idcs_group = np.argsort(adata.obs["dpt_pseudotime"].iloc[mask].to_numpy()) idcs = idcs[idcs_group] values = ( adata.obs[key].to_numpy() if key in adata.obs else adata_x[:, key].X diff --git a/src/scanpy/plotting/_utils.py b/src/scanpy/plotting/_utils.py index 70a8adcf84..23cad1c111 100644 --- a/src/scanpy/plotting/_utils.py +++ b/src/scanpy/plotting/_utils.py @@ -630,7 +630,7 @@ def scatter_group( marker: MarkerType = ".", ): """Scatter of group using representation of data Y.""" - mask_obs = adata.obs[key].cat.categories[cat_code] == adata.obs[key].array + mask_obs = (adata.obs[key].cat.categories[cat_code] == adata.obs[key]).to_numpy() color = adata.uns[f"{key}_colors"][cat_code] if not isinstance(color[0], str): from matplotlib.colors import rgb2hex