diff --git a/src/spatialdata_io/readers/xenium.py b/src/spatialdata_io/readers/xenium.py
index 32455175..6b946ff8 100644
--- a/src/spatialdata_io/readers/xenium.py
+++ b/src/spatialdata_io/readers/xenium.py
@@ -384,6 +384,9 @@ def filter(self, record: logging.LogRecord) -> bool:
 
     if table is not None:
         tables["table"] = table
+        # valid_nucleus_mask = ~table.obs[XeniumKeys.CELL_ID].isin(invalid_nuc_ids)
+        # valid_cell_mask = ~table.obs[XeniumKeys.CELL_ID].isin(invalid_cell_ids)
+        # tables["table"] = table[valid_nucleus_mask & valid_cell_mask].copy()
 
     elements_dict = {
         "images": images,
@@ -418,19 +421,38 @@ def _get_polygons(
     n_jobs: int,
     idx: ArrayLike | None = None,
 ) -> GeoDataFrame:
-    def _poly(arr: ArrayLike) -> Polygon:
-        return Polygon(arr[:-1])
-
     # seems to be faster than pd.read_parquet
     df = pq.read_table(path / file).to_pandas()
 
+    # df[XeniumKeys.CELL_ID] = _decode_cell_id_column(df[XeniumKeys.CELL_ID])
+    # # filter out cell ids with too few vertices to form a valid polygon.
+    # invalid_ids = df.groupby(XeniumKeys.CELL_ID).filter(lambda x: len(x) < 3)[
+    #     XeniumKeys.CELL_ID].unique()
+    # invalid_ids = [] if len(invalid_ids) == 0 else invalid_ids
+    #
+    # if len(invalid_ids) > 0:
+    #     logging.warning(
+    #         f"Found {len(invalid_ids)} invalid polygons for {file}, removing the masks corresponding to the IDs: {invalid_ids}"
+    #     )
+    #
+    # # Filter based on valid cell IDs if idx is provided
+    # if idx is not None:
+    #     idx = idx[~idx.isin(invalid_ids)]
+    #     if len(invalid_ids) > 0:
+    #         idx = idx.reset_index(drop=True)
+    #     df = df[df[XeniumKeys.CELL_ID].isin(idx)]
+    # else:
+    #     # If no idx provided, just (potentially) filter out invalid IDs
+    #     df = df[~df[XeniumKeys.CELL_ID].isin(invalid_ids)]
+
     group_by = df.groupby(XeniumKeys.CELL_ID)
     index = pd.Series(group_by.indices.keys())
     # convert the index to str since we will compare it with an AnnData object, where the index is a str
     index.index = index.index.astype(str)
     index = _decode_cell_id_column(index)
+
     out = Parallel(n_jobs=n_jobs)(
-        delayed(_poly)(i.to_numpy())
+        delayed(Polygon)(i.to_numpy())
         for _, i in group_by[[XeniumKeys.BOUNDARIES_VERTEX_X, XeniumKeys.BOUNDARIES_VERTEX_Y]]
     )
     geo_df = GeoDataFrame({"geometry": out})
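
Below is a minimal, standalone sketch of the vertex-count filtering that the commented-out block in the second hunk proposes: drop any cell whose boundary has fewer than 3 vertices before building shapely polygons. The column names cell_id, vertex_x, and vertex_y and the toy DataFrame are hypothetical stand-ins for the XeniumKeys constants and the parquet input used in the patch; this is an illustration of the approach, not the final implementation.

    import pandas as pd
    from shapely.geometry import Polygon

    # Toy boundary table: cell "b" has only 2 vertices and cannot form a polygon.
    df = pd.DataFrame(
        {
            "cell_id": ["a", "a", "a", "a", "b", "b"],
            "vertex_x": [0.0, 1.0, 1.0, 0.0, 5.0, 6.0],
            "vertex_y": [0.0, 0.0, 1.0, 1.0, 5.0, 5.0],
        }
    )

    # Identify cells with fewer than 3 boundary vertices and drop them,
    # loosely mirroring the commented-out groupby/filter logic in the patch.
    vertex_counts = df.groupby("cell_id")["cell_id"].transform("size")
    invalid_ids = df.loc[vertex_counts < 3, "cell_id"].unique()
    if len(invalid_ids) > 0:
        print(f"Removing {len(invalid_ids)} cells with too few vertices: {list(invalid_ids)}")
    df = df[vertex_counts >= 3]

    # Build one Polygon per remaining cell, as the Parallel(delayed(Polygon)(...)) call does.
    polygons = {
        cell_id: Polygon(group[["vertex_x", "vertex_y"]].to_numpy())
        for cell_id, group in df.groupby("cell_id")
    }
    print(polygons)  # only cell "a" remains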