-
-
Notifications
You must be signed in to change notification settings - Fork 1.2k
Open
Labels
bug · extension-arrays · needs triage (Issue that has not been reviewed by xarray team member)
Description
What happened?
Based on this comment, I ran a mini test suite to check the current dtype support. It produced a number of failures for pandas extension array (EA) and numpy.StringDType edge cases that xarray doesn't cover yet.
The tests are AI-generated, but they are meaningful and relevant.
Some of the test cases might already be covered in #10423.
This is a generalization of #10964 (which is fairly specific and self-contained).
What did you expect to happen?
Clean pass
Minimal Complete Verifiable Example
# /// script
# requires-python = ">=3.11"
# dependencies = [
# "xarray[complete]@git+https://github.com/pydata/xarray.git@main",
# "pyarrow",
# ]
# ///
#
# This script automatically imports the development branch of xarray to check for issues.
# Please delete this header if you have _not_ tested this script with `uv run`!
import xarray as xr
xr.show_versions()
# your reproducer code ...
import numpy as np
import pandas as pd
import xarray as xr
import pyarrow as pa # noqa: F401
from numpy.dtypes import StringDType
def print_header():
    """Print the xarray, pandas, numpy and pyarrow versions, then a blank line."""
    libraries = (("xarray", xr), ("pandas", pd), ("numpy", np), ("pyarrow", pa))
    for label, module in libraries:
        print(f"{label} version: {module.__version__}")
    print()
def run_test(name, func):
    """Run one check and print a ``PASS | ...`` or ``FAIL | ...`` line for it.

    Parameters
    ----------
    name : str
        Label shown in the printed report line.
    func : callable
        Zero-argument callable. It passes if it returns without raising.

    Returns
    -------
    bool
        True if ``func`` completed, False if it raised an ``Exception``.
        Callers that ignore the return value behave exactly as before.
    """
    try:
        func()
    except Exception as exc:
        # Deliberate report-and-continue boundary: one failing check must
        # not prevent the remaining checks from running.
        print(f"FAIL | {name} ({type(exc).__name__}: {exc})")
        return False
    print(f"PASS | {name}")
    return True
# =============================================================================
# NumPy StringDType tests
# =============================================================================
def test_numpy_stringdtype_values_where():
    """Smoke test: ``where`` on a DataArray whose values use ``StringDType``.

    The result is discarded; only a raised exception matters.
    """
    values = np.array(["a", "b", "c"], dtype=StringDType())
    arr = xr.DataArray(values, dims="x", coords={"x": [0, 1, 2]}, name="str_val")
    keep = arr != "b"
    arr.where(keep)
def test_numpy_stringdtype_values_concat():
    """Smoke test: concatenate a ``StringDType``-valued DataArray with itself.

    The two copies are stacked along a new ``rep`` dimension; the result is
    discarded.
    """
    values = np.array(["a", "b", "c"], dtype=StringDType())
    arr = xr.DataArray(values, dims="x", coords={"x": [0, 1, 2]}, name="str_val")
    xr.concat([arr, arr], dim="rep")
def test_numpy_stringdtype_coord_align():
    """Smoke test: outer-align a ``StringDType`` coordinate with a plain list one.

    The result is discarded; only a raised exception matters.
    """
    labels = np.array(["A", "B", "C"], dtype=StringDType())
    left = xr.DataArray([1, 2, 3], dims="label", coords={"label": labels}, name="v1")
    right = xr.DataArray(
        [10, 20], dims="label", coords={"label": ["B", "C"]}, name="v2"
    )
    xr.align(left, right, join="outer")
def test_numpy_stringdtype_values_where_null():
    """Smoke test: ``where`` with a not-null mask on ``StringDType`` values.

    The dtype is created with ``na_object=pd.NA`` and the data contains one
    ``pd.NA`` entry. The result is discarded.
    """
    na_aware = StringDType(na_object=pd.NA)
    values = np.array(["a", pd.NA, "c"], dtype=na_aware)
    arr = xr.DataArray(
        values, dims="x", coords={"x": [0, 1, 2]}, name="str_val_na"
    )
    present = ~arr.isnull()
    arr.where(present)
# =============================================================================
# string[pyarrow] tests
# =============================================================================
def test_string_pyarrow_values_where():
    """Smoke test: ``where`` on ``string[pyarrow]`` values using an inequality mask.

    The series contains one missing value. The result is discarded.
    """
    series = pd.Series(["foo", "bar", None], dtype="string[pyarrow]", name="s")
    arr = series.to_xarray()
    arr.where(arr != "foo")
def test_string_pyarrow_values_concat():
    """Smoke test: concatenate a ``string[pyarrow]``-valued DataArray with itself.

    The copies are stacked along a new ``rep`` dimension; the result is discarded.
    """
    series = pd.Series(["foo", "bar", None], dtype="string[pyarrow]", name="s")
    arr = series.to_xarray()
    xr.concat([arr, arr], dim="rep")
def test_string_pyarrow_values_align():
    """Smoke test: outer-align a ``string[pyarrow]`` DataArray with a slice of itself.

    The slice keeps the first two positions along ``index``. The result is
    discarded.
    """
    series = pd.Series(["foo", "bar", None], dtype="string[pyarrow]", name="s")
    full = series.to_xarray()
    head = full.isel(index=[0, 1])
    xr.align(full, head, join="outer")
def test_string_pyarrow_values_where_null():
    """Smoke test: ``where`` with a not-null mask on ``string[pyarrow]`` values.

    The mask is the negation of ``isnull()``. The result is discarded.
    """
    series = pd.Series(["foo", None, "bar"], dtype="string[pyarrow]", name="s")
    arr = series.to_xarray()
    present = ~arr.isnull()
    arr.where(present)
def test_string_pyarrow_coord_where():
    """Smoke test: ``where`` on integer data indexed by a ``string[pyarrow]`` coord.

    The result is discarded; only a raised exception matters.
    """
    labels = pd.Index(["A", "B", "C"], dtype="string[pyarrow]", name="label")
    arr = xr.DataArray([1, 2, 3], dims="label", coords={"label": labels}, name="v")
    arr.where(arr > 1)
def test_string_pyarrow_coord_concat():
    """Smoke test: concatenate a DataArray with a ``string[pyarrow]`` coord with itself.

    The copies are stacked along a new ``rep`` dimension; the result is discarded.
    """
    labels = pd.Index(["A", "B", "C"], dtype="string[pyarrow]", name="label")
    arr = xr.DataArray([1, 2, 3], dims="label", coords={"label": labels}, name="v")
    xr.concat([arr, arr], dim="rep")
def test_string_pyarrow_coord_align():
    """Smoke test: outer-align a ``string[pyarrow]`` coord with a plain list coord.

    The second array's labels are given as a Python list of strings. The
    result is discarded.
    """
    labels = pd.Index(["A", "B", "C"], dtype="string[pyarrow]", name="label")
    left = xr.DataArray([1, 2, 3], dims="label", coords={"label": labels}, name="v1")
    right = xr.DataArray(
        [10, 20], dims="label", coords={"label": ["B", "D"]}, name="v2"
    )
    xr.align(left, right, join="outer")
def test_string_pyarrow_coord_align_with_null():
    """Smoke test: outer-align two ``string[pyarrow]`` coords, one holding a null.

    Both indexes are built with ``dtype="string[pyarrow]"``; the first has a
    ``None`` entry. The result is discarded.
    """
    labels_with_null = pd.Index(
        ["A", None, "C"], dtype="string[pyarrow]", name="label"
    )
    left = xr.DataArray(
        [1, 2, 3], dims="label", coords={"label": labels_with_null}, name="v1"
    )
    labels_clean = pd.Index(["A", "B"], dtype="string[pyarrow]", name="label")
    right = xr.DataArray(
        [10, 20], dims="label", coords={"label": labels_clean}, name="v2"
    )
    xr.align(left, right, join="outer")
# =============================================================================
# date32[pyarrow] as coordinate
# =============================================================================
def test_date32_coord_where():
    """Smoke test: ``where`` using a comparison on a ``date32[pyarrow]`` coord.

    The mask compares the ``time`` coordinate against the index's second
    element. The result is discarded.
    """
    days = pd.date_range("2024-01-01", periods=3, freq="D")
    as_date32 = pd.Series(days, name="date").astype("date32[pyarrow]")
    time_index = pd.Index(as_date32, name="time")
    arr = xr.DataArray(
        [10.0, 20.0, 30.0], dims="time", coords={"time": time_index}, name="val"
    )
    arr.where(arr["time"] >= time_index[1])
def test_date32_coord_concat():
    """Smoke test: concatenate a DataArray with a ``date32[pyarrow]`` coord with itself.

    The copies are stacked along a new ``rep`` dimension; the result is discarded.
    """
    days = pd.date_range("2024-01-01", periods=3, freq="D")
    as_date32 = pd.Series(days, name="date").astype("date32[pyarrow]")
    time_index = pd.Index(as_date32, name="time")
    arr = xr.DataArray(
        [10.0, 20.0, 30.0], dims="time", coords={"time": time_index}, name="val"
    )
    xr.concat([arr, arr], dim="rep")
def test_date32_coord_align_vs_datetime64():
    """Smoke test: outer-align a ``date32[pyarrow]`` coord with a ``date_range`` coord.

    Both coordinates cover the same three days starting 2024-01-01. The
    result is discarded.
    """
    days = pd.date_range("2024-01-01", periods=3, freq="D")
    as_date32 = pd.Series(days, name="date").astype("date32[pyarrow]")
    arrow_index = pd.Index(as_date32, name="time")
    left = xr.DataArray(
        [10.0, 20.0, 30.0], dims="time", coords={"time": arrow_index}, name="v1"
    )
    # Built from a fresh date_range call, exactly as in the original reproducer.
    right_coords = {"time": pd.date_range("2024-01-01", periods=3, freq="D")}
    right = xr.DataArray([1.0, 2.0, 3.0], dims="time", coords=right_coords, name="v2")
    xr.align(left, right, join="outer")
def test_date32_coord_where_null():
    """Smoke test: ``where`` with a not-null mask from a ``date32[pyarrow]`` coord.

    The middle timestamp is ``pd.NaT``; the mask is the negation of the
    coordinate's ``isnull()``. The result is discarded.
    """
    stamps = [pd.Timestamp("2024-01-01"), pd.NaT, pd.Timestamp("2024-01-03")]
    as_date32 = pd.Series(stamps, name="date").astype("date32[pyarrow]")
    time_index = pd.Index(as_date32, name="time")
    arr = xr.DataArray(
        [10.0, 20.0, 30.0], dims="time", coords={"time": time_index}, name="val"
    )
    present = ~arr["time"].isnull()
    arr.where(present)
# =============================================================================
# Nullable Int64 and int64[pyarrow]
# =============================================================================
def test_int64_nullable_values_where():
    """Smoke test: ``where`` on nullable ``Int64`` values using a ``> 1`` mask.

    The series contains one missing value. The result is discarded.
    """
    series = pd.Series([1, 2, None], dtype="Int64", name="v")
    arr = series.to_xarray()
    arr.where(arr > 1)
def test_int64_nullable_values_concat():
    """Smoke test: concatenate a nullable ``Int64``-valued DataArray with itself.

    The copies are stacked along a new ``rep`` dimension; the result is discarded.
    """
    series = pd.Series([1, 2, None], dtype="Int64", name="v")
    arr = series.to_xarray()
    xr.concat([arr, arr], dim="rep")
def test_int64_nullable_coord_align():
    """Smoke test: outer-align a nullable ``Int64`` coord with a plain list coord.

    The result is discarded; only a raised exception matters.
    """
    nullable_index = pd.Index([1, 2, 3], dtype="Int64", name="i")
    left = xr.DataArray(
        [10, 20, 30], dims="i", coords={"i": nullable_index}, name="v1"
    )
    right = xr.DataArray([100, 200], dims="i", coords={"i": [2, 4]}, name="v2")
    xr.align(left, right, join="outer")
def test_int64_nullable_values_where_null():
    """Smoke test: ``where`` with a ``notnull()`` mask on nullable ``Int64`` values.

    The result is discarded; only a raised exception matters.
    """
    series = pd.Series([1, None, 3], dtype="Int64", name="v")
    arr = series.to_xarray()
    present = arr.notnull()
    arr.where(present)
def test_int64_pyarrow_values_where():
    """Smoke test: ``where`` on ``int64[pyarrow]`` values using a ``> 1`` mask.

    The series contains one missing value. The result is discarded.
    """
    series = pd.Series([1, 2, None], dtype="int64[pyarrow]", name="v")
    arr = series.to_xarray()
    arr.where(arr > 1)
def test_int64_pyarrow_values_concat():
    """Smoke test: concatenate an ``int64[pyarrow]``-valued DataArray with itself.

    The copies are stacked along a new ``rep`` dimension; the result is discarded.
    """
    series = pd.Series([1, 2, None], dtype="int64[pyarrow]", name="v")
    arr = series.to_xarray()
    xr.concat([arr, arr], dim="rep")
def test_int64_pyarrow_coord_align():
    """Smoke test: outer-align an ``int64[pyarrow]`` coord with a plain list coord.

    The result is discarded; only a raised exception matters.
    """
    arrow_index = pd.Index([1, 2, 3], dtype="int64[pyarrow]", name="i_arrow")
    left = xr.DataArray(
        [5, 6, 7], dims="i_arrow", coords={"i_arrow": arrow_index}, name="v1"
    )
    right = xr.DataArray(
        [10, 20], dims="i_arrow", coords={"i_arrow": [2, 4]}, name="v2"
    )
    xr.align(left, right, join="outer")
def test_int64_pyarrow_values_where_null():
    """Smoke test: ``where`` with a ``notnull()`` mask on ``int64[pyarrow]`` values.

    The result is discarded; only a raised exception matters.
    """
    series = pd.Series([1, None, 3], dtype="int64[pyarrow]", name="v")
    arr = series.to_xarray()
    present = arr.notnull()
    arr.where(present)
# =============================================================================
# Categorical as values and coordinate
# =============================================================================
def test_categorical_values_where():
    """Smoke test: ``where`` on ``pd.Categorical`` values using an inequality mask.

    The result is discarded; only a raised exception matters.
    """
    categories = pd.Categorical(["a", "b", "a", "c"], categories=["a", "b", "c"])
    arr = xr.DataArray(
        categories, dims="x", coords={"x": [0, 1, 2, 3]}, name="cat_val"
    )
    arr.where(arr != "a")
def test_categorical_values_concat():
    """Smoke test: concatenate a ``pd.Categorical``-valued DataArray with itself.

    The copies are stacked along a new ``rep`` dimension; the result is discarded.
    """
    categories = pd.Categorical(["a", "b", "a", "c"], categories=["a", "b", "c"])
    arr = xr.DataArray(
        categories, dims="x", coords={"x": [0, 1, 2, 3]}, name="cat_val"
    )
    xr.concat([arr, arr], dim="rep")
def test_categorical_values_align_vs_object():
    """Smoke test: outer-align categorical-valued data with object-dtype data.

    Both arrays share the integer dimension ``x`` but differ in length. The
    result is discarded.
    """
    categories = pd.Categorical(["a", "b", "a", "c"], categories=["a", "b", "c"])
    left = xr.DataArray(
        categories, dims="x", coords={"x": [0, 1, 2, 3]}, name="cat_val"
    )
    object_values = np.array(["a", "c", "d"], dtype=object)
    right = xr.DataArray(
        object_values, dims="x", coords={"x": [0, 1, 2]}, name="obj_val"
    )
    xr.align(left, right, join="outer")
def test_categorical_values_where_null():
    """Smoke test: ``where`` with a not-null mask on categorical values with a null.

    The mask is the negation of ``isnull()``. The result is discarded.
    """
    categories = pd.Categorical(["a", None, "b", "c"], categories=["a", "b", "c"])
    arr = xr.DataArray(
        categories, dims="x", coords={"x": [0, 1, 2, 3]}, name="cat_val_na"
    )
    present = ~arr.isnull()
    arr.where(present)
def test_categorical_coord_where():
    """Smoke test: ``where`` on integer data indexed by a ``CategoricalIndex``.

    The result is discarded; only a raised exception matters.
    """
    labels = pd.CategoricalIndex(
        ["A", "B", "C"], categories=["A", "B", "C"], name="lab"
    )
    arr = xr.DataArray([1, 2, 3], dims="lab", coords={"lab": labels}, name="v")
    arr.where(arr > 1)
def test_categorical_coord_concat():
    """Smoke test: concatenate a DataArray with a ``CategoricalIndex`` coord with itself.

    The copies are stacked along a new ``rep`` dimension; the result is discarded.
    """
    labels = pd.CategoricalIndex(
        ["A", "B", "C"], categories=["A", "B", "C"], name="lab"
    )
    arr = xr.DataArray([1, 2, 3], dims="lab", coords={"lab": labels}, name="v")
    xr.concat([arr, arr], dim="rep")
def test_categorical_coord_align_vs_object_index():
    """Smoke test: inner-align an object-dtype coord with a ``CategoricalIndex`` coord.

    Both inputs are Datasets with a single variable ``v`` along ``lab``. The
    result is discarded.
    """
    object_labels = pd.Index(["A", "B"], dtype="object", name="lab")
    object_ds = xr.Dataset({"v": ("lab", [1, 2])}, coords={"lab": object_labels})
    categorical_labels = pd.CategoricalIndex(
        ["B", "C"], categories=["A", "B", "C"], name="lab"
    )
    categorical_ds = xr.Dataset(
        {"v": ("lab", [3, 4])}, coords={"lab": categorical_labels}
    )
    xr.align(object_ds, categorical_ds, join="inner")
def test_categorical_coord_align_with_null():
    """Smoke test: outer-align two ``CategoricalIndex`` coords, one holding a null.

    Both indexes declare the categories ``A``, ``B``, ``C``; the first has a
    ``None`` entry. The result is discarded.
    """
    shared_categories = ["A", "B", "C"]
    labels_with_null = pd.CategoricalIndex(
        ["A", None, "C"], categories=shared_categories, name="lab"
    )
    left = xr.DataArray(
        [1, 2, 3], dims="lab", coords={"lab": labels_with_null}, name="v1"
    )
    labels_clean = pd.CategoricalIndex(
        ["A", "B"], categories=shared_categories, name="lab"
    )
    right = xr.DataArray([10, 20], dims="lab", coords={"lab": labels_clean}, name="v2")
    xr.align(left, right, join="outer")
# =============================================================================
# Main
# =============================================================================
def main():
    """Print the version header, then run every dtype check in a fixed order.

    Each entry pairs the label shown in the report with the zero-argument
    check to execute; ``run_test`` prints one result line per entry.
    """
    print_header()

    cases = (
        # NumPy StringDType
        ("NumPy StringDType values: where", test_numpy_stringdtype_values_where),
        ("NumPy StringDType values: concat", test_numpy_stringdtype_values_concat),
        ("NumPy StringDType coord vs string: align", test_numpy_stringdtype_coord_align),
        ("NumPy StringDType values: where with null", test_numpy_stringdtype_values_where_null),
        # string[pyarrow]
        ("string[pyarrow] values: where", test_string_pyarrow_values_where),
        ("string[pyarrow] values: concat", test_string_pyarrow_values_concat),
        ("string[pyarrow] values: align self vs slice", test_string_pyarrow_values_align),
        ("string[pyarrow] values: where with null", test_string_pyarrow_values_where_null),
        ("string[pyarrow] coord: where", test_string_pyarrow_coord_where),
        ("string[pyarrow] coord: concat", test_string_pyarrow_coord_concat),
        ("string[pyarrow] coord vs object: align", test_string_pyarrow_coord_align),
        ("string[pyarrow] coord vs object with null: align", test_string_pyarrow_coord_align_with_null),
        # date32[pyarrow]
        ("date32[pyarrow] coord: where", test_date32_coord_where),
        ("date32[pyarrow] coord: concat", test_date32_coord_concat),
        ("date32[pyarrow] coord vs datetime64: align", test_date32_coord_align_vs_datetime64),
        ("date32[pyarrow] coord: where with null", test_date32_coord_where_null),
        # Int64 / int64[pyarrow]
        ("Int64 values: where", test_int64_nullable_values_where),
        ("Int64 values: concat", test_int64_nullable_values_concat),
        ("Int64 coord vs int64: align", test_int64_nullable_coord_align),
        ("Int64 values: where with null", test_int64_nullable_values_where_null),
        ("int64[pyarrow] values: where", test_int64_pyarrow_values_where),
        ("int64[pyarrow] values: concat", test_int64_pyarrow_values_concat),
        ("int64[pyarrow] coord vs int64: align", test_int64_pyarrow_coord_align),
        ("int64[pyarrow] values: where with null", test_int64_pyarrow_values_where_null),
        # Categorical
        ("Categorical values: where", test_categorical_values_where),
        ("Categorical values: concat", test_categorical_values_concat),
        ("Categorical values vs object: align", test_categorical_values_align_vs_object),
        ("Categorical values: where with null", test_categorical_values_where_null),
        ("CategoricalIndex coord: where", test_categorical_coord_where),
        ("CategoricalIndex coord: concat", test_categorical_coord_concat),
        ("object vs CategoricalIndex coord: align", test_categorical_coord_align_vs_object_index),
        ("CategoricalIndex coord with null: align", test_categorical_coord_align_with_null),
    )

    for label, check in cases:
        run_test(label, check)
if __name__ == "__main__":
    main()

Steps to reproduce
No response
MVCE confirmation
- Minimal example — the example is as focused as reasonably possible to demonstrate the underlying issue in xarray.
- Complete example — the example is self-contained, including all data and the text of any traceback.
- Verifiable example — the example copy & pastes into an IPython prompt or Binder notebook, returning the result.
- New issue — a search of GitHub Issues suggests this is not a duplicate.
- Recent environment — the issue occurs with the latest version of xarray and its dependencies.
Relevant log output
xarray version: 2025.11.0
pandas version: 2.3.3
numpy version: 2.3.5
pyarrow version: 22.0.0
FAIL | NumPy StringDType values: where (DTypePromotionError: The DType <class 'numpy.dtypes.StringDType'> could not be promoted by <class 'numpy.dtypes._PyFloatDType'>. This means that no common DType exists for the given inputs. For example they cannot be stored in a single array unless the dtype is `object`. The full list of DTypes is: (<class 'numpy.dtypes.StringDType'>, <class 'numpy.dtypes._PyFloatDType'>))
PASS | NumPy StringDType values: concat
PASS | NumPy StringDType coord vs string: align
FAIL | NumPy StringDType values: where with null (TypeError: dtype argument must be a NumPy dtype, but it is a <class 'numpy.dtypes.StringDType'>.)
FAIL | string[pyarrow] values: where (TypeError: boolean value of NA is ambiguous)
PASS | string[pyarrow] values: concat
PASS | string[pyarrow] values: align self vs slice
PASS | string[pyarrow] values: where with null
PASS | string[pyarrow] coord: where
PASS | string[pyarrow] coord: concat
PASS | string[pyarrow] coord vs object: align
PASS | string[pyarrow] coord vs object with null: align
PASS | date32[pyarrow] coord: where
PASS | date32[pyarrow] coord: concat
FAIL | date32[pyarrow] coord vs datetime64: align (TypeError: Cannot interpret 'date32[day][pyarrow]' as a data type)
PASS | date32[pyarrow] coord: where with null
FAIL | Int64 values: where (TypeError: Cannot interpret 'Int64Dtype()' as a data type)
PASS | Int64 values: concat
FAIL | Int64 coord vs int64: align (TypeError: Cannot interpret 'Int64Dtype()' as a data type)
FAIL | Int64 values: where with null (TypeError: Cannot interpret 'Int64Dtype()' as a data type)
FAIL | int64[pyarrow] values: where (TypeError: Cannot interpret 'int64[pyarrow]' as a data type)
FAIL | int64[pyarrow] values: concat (IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices)
FAIL | int64[pyarrow] coord vs int64: align (TypeError: Cannot interpret 'int64[pyarrow]' as a data type)
FAIL | int64[pyarrow] values: where with null (TypeError: Cannot interpret 'int64[pyarrow]' as a data type)
FAIL | Categorical values: where (TypeError: Cannot interpret 'CategoricalDtype(categories=['a', 'b', 'c'], ordered=False, categories_dtype=object)' as a data type)
PASS | Categorical values: concat
PASS | Categorical values vs object: align
FAIL | Categorical values: where with null (TypeError: Cannot interpret 'CategoricalDtype(categories=['a', 'b', 'c'], ordered=False, categories_dtype=object)' as a data type)
PASS | CategoricalIndex coord: where
PASS | CategoricalIndex coord: concat
FAIL | object vs CategoricalIndex coord: align (TypeError: Cannot interpret 'CategoricalDtype(categories=['A', 'B', 'C'], ordered=False, categories_dtype=object)' as a data type)
FAIL | CategoricalIndex coord with null: align (TypeError: Cannot interpret 'CategoricalDtype(categories=['A', 'B', 'C'], ordered=False, categories_dtype=object)' as a data type)

Anything else we need to know?
No response
Environment
xarray version: 2025.11.0
pandas version: 2.3.3
numpy version: 2.3.5
pyarrow version: 22.0.0
Metadata
Metadata
Assignees
Labels
bug · extension-arrays · needs triage (Issue that has not been reviewed by xarray team member)