Skip to content

where/join/concat failures using esoteric dtypes #10965

@alippai

Description

@alippai

What happened?

Based on this comment, I ran a mini test suite to check the current dtype support. It produced a few failures for extension array (EA) and numpy.StringDType edge cases that xarray doesn't cover yet.

The tests are AI-generated, but they are meaningful and relevant.

Some of the test cases might already be covered in #10423.
This is a generalization of #10964 (which is fairly specific and self-contained).

cc @dcherian @ilan-gold

What did you expect to happen?

Clean pass

Minimal Complete Verifiable Example

# /// script
# requires-python = ">=3.11"
# dependencies = [
#   "xarray[complete]@git+https://github.com/pydata/xarray.git@main",
#   "pyarrow",
# ]
# ///
#
# This script automatically imports the development branch of xarray to check for issues.
# Please delete this header if you have _not_ tested this script with `uv run`!

import xarray as xr
xr.show_versions()
# your reproducer code ...
import numpy as np
import pandas as pd
import xarray as xr
import pyarrow as pa  # noqa: F401
from numpy.dtypes import StringDType


def print_header():
    """Print the xarray, pandas, numpy and pyarrow versions, then a blank line."""
    # One print call: the pieces are joined by newlines and the trailing empty
    # string yields the blank separator line.
    print(
        f"xarray version: {xr.__version__}",
        f"pandas version: {pd.__version__}",
        f"numpy version: {np.__version__}",
        f"pyarrow version: {pa.__version__}",
        "",
        sep="\n",
    )


def run_test(name, func):
    """Call ``func`` with no arguments and print a PASS/FAIL line for ``name``.

    Any ``Exception`` raised by ``func`` is caught and reported on the FAIL
    line together with its type name and message; nothing is re-raised.
    """
    try:
        func()
    except Exception as exc:
        # Reporting boundary: keep going so that every case gets a result line.
        report = f"FAIL | {name}  ({type(exc).__name__}: {exc})"
    else:
        report = f"PASS | {name}"
    print(report)


# =============================================================================
# NumPy StringDType tests
# =============================================================================

def test_numpy_stringdtype_values_where():
    """Apply ``where`` to StringDType values, keeping entries not equal to "b"."""
    values = np.array(["a", "b", "c"], dtype=StringDType())
    arr = xr.DataArray(
        values,
        dims="x",
        coords={"x": [0, 1, 2]},
        name="str_val",
    )
    keep = arr != "b"
    arr.where(keep)


def test_numpy_stringdtype_values_concat():
    """Concatenate a StringDType-valued DataArray with itself along a new "rep" dim."""
    values = np.array(["a", "b", "c"], dtype=StringDType())
    arr = xr.DataArray(
        values,
        dims="x",
        coords={"x": [0, 1, 2]},
        name="str_val",
    )
    xr.concat([arr, arr], dim="rep")


def test_numpy_stringdtype_coord_align():
    """Outer-align a StringDType "label" coordinate with a plain list-of-str one."""
    labels = np.array(["A", "B", "C"], dtype=StringDType())
    left = xr.DataArray(
        [1, 2, 3], dims="label", coords={"label": labels}, name="v1"
    )
    right = xr.DataArray(
        [10, 20], dims="label", coords={"label": ["B", "C"]}, name="v2"
    )
    xr.align(left, right, join="outer")


def test_numpy_stringdtype_values_where_null():
    """Apply ``where`` with a not-null mask to StringDType values holding ``pd.NA``."""
    # The dtype is built with pd.NA as its missing-value sentinel.
    nullable_str = StringDType(na_object=pd.NA)
    values = np.array(["a", pd.NA, "c"], dtype=nullable_str)
    arr = xr.DataArray(
        values,
        dims="x",
        coords={"x": [0, 1, 2]},
        name="str_val_na",
    )
    present = ~arr.isnull()
    arr.where(present)


# =============================================================================
# string[pyarrow] tests
# =============================================================================

def test_string_pyarrow_values_where():
    """Apply ``where`` to string[pyarrow] values (one None), keeping entries != "foo"."""
    series = pd.Series(["foo", "bar", None], dtype="string[pyarrow]", name="s")
    arr = series.to_xarray()
    keep = arr != "foo"
    arr.where(keep)


def test_string_pyarrow_values_concat():
    """Concatenate a string[pyarrow]-valued DataArray with itself along "rep"."""
    series = pd.Series(["foo", "bar", None], dtype="string[pyarrow]", name="s")
    arr = series.to_xarray()
    xr.concat([arr, arr], dim="rep")


def test_string_pyarrow_values_align():
    """Outer-align a string[pyarrow]-valued DataArray with its own first two rows."""
    series = pd.Series(["foo", "bar", None], dtype="string[pyarrow]", name="s")
    full = series.to_xarray()
    # Positional selection of the first two entries along the "index" dim.
    head = full.isel(index=[0, 1])
    xr.align(full, head, join="outer")


def test_string_pyarrow_values_where_null():
    """Apply ``where`` with a not-null mask to string[pyarrow] values holding None."""
    series = pd.Series(["foo", None, "bar"], dtype="string[pyarrow]", name="s")
    arr = series.to_xarray()
    present = ~arr.isnull()
    arr.where(present)


def test_string_pyarrow_coord_where():
    """Apply ``where(values > 1)`` to integers indexed by a string[pyarrow] coord."""
    labels = pd.Index(["A", "B", "C"], dtype="string[pyarrow]", name="label")
    arr = xr.DataArray(
        [1, 2, 3], dims="label", coords={"label": labels}, name="v"
    )
    keep = arr > 1
    arr.where(keep)


def test_string_pyarrow_coord_concat():
    """Concatenate along "rep" two copies of an array with a string[pyarrow] coord."""
    labels = pd.Index(["A", "B", "C"], dtype="string[pyarrow]", name="label")
    arr = xr.DataArray(
        [1, 2, 3], dims="label", coords={"label": labels}, name="v"
    )
    xr.concat([arr, arr], dim="rep")


def test_string_pyarrow_coord_align():
    """Outer-align a string[pyarrow] "label" coord with a plain list-of-str coord."""
    labels = pd.Index(["A", "B", "C"], dtype="string[pyarrow]", name="label")
    left = xr.DataArray(
        [1, 2, 3], dims="label", coords={"label": labels}, name="v1"
    )
    right = xr.DataArray(
        [10, 20], dims="label", coords={"label": ["B", "D"]}, name="v2"
    )
    xr.align(left, right, join="outer")


def test_string_pyarrow_coord_align_with_null():
    """Outer-align two string[pyarrow] "label" coords, the first containing None."""
    labels_with_null = pd.Index(
        ["A", None, "C"], dtype="string[pyarrow]", name="label"
    )
    left = xr.DataArray(
        [1, 2, 3], dims="label", coords={"label": labels_with_null}, name="v1"
    )
    labels_plain = pd.Index(["A", "B"], dtype="string[pyarrow]", name="label")
    right = xr.DataArray(
        [10, 20], dims="label", coords={"label": labels_plain}, name="v2"
    )
    xr.align(left, right, join="outer")


# =============================================================================
# date32[pyarrow] as coordinate
# =============================================================================

def test_date32_coord_where():
    """Apply ``where`` using a comparison of a date32[pyarrow] coord to its 2nd entry."""
    daily = pd.date_range("2024-01-01", periods=3, freq="D")
    as_date32 = pd.Series(daily, name="date").astype("date32[pyarrow]")
    time_index = pd.Index(as_date32, name="time")
    arr = xr.DataArray(
        [10.0, 20.0, 30.0],
        dims="time",
        coords={"time": time_index},
        name="val",
    )
    on_or_after = arr["time"] >= time_index[1]
    arr.where(on_or_after)


def test_date32_coord_concat():
    """Concatenate along "rep" two copies of an array with a date32[pyarrow] coord."""
    daily = pd.date_range("2024-01-01", periods=3, freq="D")
    as_date32 = pd.Series(daily, name="date").astype("date32[pyarrow]")
    time_index = pd.Index(as_date32, name="time")
    arr = xr.DataArray(
        [10.0, 20.0, 30.0],
        dims="time",
        coords={"time": time_index},
        name="val",
    )
    xr.concat([arr, arr], dim="rep")


def test_date32_coord_align_vs_datetime64():
    """Outer-align a date32[pyarrow] "time" coord with a ``pd.date_range`` coord."""
    daily = pd.date_range("2024-01-01", periods=3, freq="D")
    as_date32 = pd.Series(daily, name="date").astype("date32[pyarrow]")
    arrow_index = pd.Index(as_date32, name="time")
    left = xr.DataArray(
        [10.0, 20.0, 30.0],
        dims="time",
        coords={"time": arrow_index},
        name="v1",
    )
    # Same three days, but passed directly as the date_range result.
    plain_index = pd.date_range("2024-01-01", periods=3, freq="D")
    right = xr.DataArray(
        [1.0, 2.0, 3.0], dims="time", coords={"time": plain_index}, name="v2"
    )
    xr.align(left, right, join="outer")


def test_date32_coord_where_null():
    """Apply ``where`` with a not-null mask built from a date32[pyarrow] coord with NaT."""
    stamps = [pd.Timestamp("2024-01-01"), pd.NaT, pd.Timestamp("2024-01-03")]
    as_date32 = pd.Series(stamps, name="date").astype("date32[pyarrow]")
    time_index = pd.Index(as_date32, name="time")
    arr = xr.DataArray(
        [10.0, 20.0, 30.0],
        dims="time",
        coords={"time": time_index},
        name="val",
    )
    present = ~arr["time"].isnull()
    arr.where(present)


# =============================================================================
# Nullable Int64 and int64[pyarrow]
# =============================================================================

def test_int64_nullable_values_where():
    """Apply ``where(values > 1)`` to nullable Int64 values that include None."""
    series = pd.Series([1, 2, None], dtype="Int64", name="v")
    arr = series.to_xarray()
    keep = arr > 1
    arr.where(keep)


def test_int64_nullable_values_concat():
    """Concatenate a nullable-Int64-valued DataArray with itself along "rep"."""
    series = pd.Series([1, 2, None], dtype="Int64", name="v")
    arr = series.to_xarray()
    xr.concat([arr, arr], dim="rep")


def test_int64_nullable_coord_align():
    """Outer-align a nullable Int64 "i" coord with a plain list-of-int coord."""
    nullable_index = pd.Index([1, 2, 3], dtype="Int64", name="i")
    left = xr.DataArray(
        [10, 20, 30], dims="i", coords={"i": nullable_index}, name="v1"
    )
    right = xr.DataArray([100, 200], dims="i", coords={"i": [2, 4]}, name="v2")
    xr.align(left, right, join="outer")


def test_int64_nullable_values_where_null():
    """Apply ``where`` with a ``notnull`` mask to nullable Int64 values holding None."""
    series = pd.Series([1, None, 3], dtype="Int64", name="v")
    arr = series.to_xarray()
    present = arr.notnull()
    arr.where(present)


def test_int64_pyarrow_values_where():
    """Apply ``where(values > 1)`` to int64[pyarrow] values that include None."""
    series = pd.Series([1, 2, None], dtype="int64[pyarrow]", name="v")
    arr = series.to_xarray()
    keep = arr > 1
    arr.where(keep)


def test_int64_pyarrow_values_concat():
    """Concatenate an int64[pyarrow]-valued DataArray with itself along "rep"."""
    series = pd.Series([1, 2, None], dtype="int64[pyarrow]", name="v")
    arr = series.to_xarray()
    xr.concat([arr, arr], dim="rep")


def test_int64_pyarrow_coord_align():
    """Outer-align an int64[pyarrow] "i_arrow" coord with a plain list-of-int coord."""
    arrow_index = pd.Index([1, 2, 3], dtype="int64[pyarrow]", name="i_arrow")
    left = xr.DataArray(
        [5, 6, 7], dims="i_arrow", coords={"i_arrow": arrow_index}, name="v1"
    )
    right = xr.DataArray(
        [10, 20], dims="i_arrow", coords={"i_arrow": [2, 4]}, name="v2"
    )
    xr.align(left, right, join="outer")


def test_int64_pyarrow_values_where_null():
    """Apply ``where`` with a ``notnull`` mask to int64[pyarrow] values holding None."""
    series = pd.Series([1, None, 3], dtype="int64[pyarrow]", name="v")
    arr = series.to_xarray()
    present = arr.notnull()
    arr.where(present)


# =============================================================================
# Categorical as values and coordinate
# =============================================================================

def test_categorical_values_where():
    """Apply ``where`` to Categorical values, keeping entries not equal to "a"."""
    categories = pd.Categorical(
        ["a", "b", "a", "c"], categories=["a", "b", "c"]
    )
    arr = xr.DataArray(
        categories, dims="x", coords={"x": [0, 1, 2, 3]}, name="cat_val"
    )
    keep = arr != "a"
    arr.where(keep)


def test_categorical_values_concat():
    """Concatenate a Categorical-valued DataArray with itself along "rep"."""
    categories = pd.Categorical(
        ["a", "b", "a", "c"], categories=["a", "b", "c"]
    )
    arr = xr.DataArray(
        categories, dims="x", coords={"x": [0, 1, 2, 3]}, name="cat_val"
    )
    xr.concat([arr, arr], dim="rep")


def test_categorical_values_align_vs_object():
    """Outer-align a Categorical-valued array with an object-dtype string array."""
    categories = pd.Categorical(
        ["a", "b", "a", "c"], categories=["a", "b", "c"]
    )
    left = xr.DataArray(
        categories, dims="x", coords={"x": [0, 1, 2, 3]}, name="cat_val"
    )
    object_values = np.array(["a", "c", "d"], dtype=object)
    right = xr.DataArray(
        object_values, dims="x", coords={"x": [0, 1, 2]}, name="obj_val"
    )
    xr.align(left, right, join="outer")


def test_categorical_values_where_null():
    """Apply ``where`` with a not-null mask to Categorical values holding None."""
    categories = pd.Categorical(
        ["a", None, "b", "c"], categories=["a", "b", "c"]
    )
    arr = xr.DataArray(
        categories, dims="x", coords={"x": [0, 1, 2, 3]}, name="cat_val_na"
    )
    present = ~arr.isnull()
    arr.where(present)


def test_categorical_coord_where():
    """Apply ``where(values > 1)`` to integers indexed by a CategoricalIndex coord."""
    labels = pd.CategoricalIndex(
        ["A", "B", "C"], categories=["A", "B", "C"], name="lab"
    )
    arr = xr.DataArray([1, 2, 3], dims="lab", coords={"lab": labels}, name="v")
    keep = arr > 1
    arr.where(keep)


def test_categorical_coord_concat():
    """Concatenate along "rep" two copies of an array with a CategoricalIndex coord."""
    labels = pd.CategoricalIndex(
        ["A", "B", "C"], categories=["A", "B", "C"], name="lab"
    )
    arr = xr.DataArray([1, 2, 3], dims="lab", coords={"lab": labels}, name="v")
    xr.concat([arr, arr], dim="rep")


def test_categorical_coord_align_vs_object_index():
    """Inner-align a Dataset on an object "lab" index with one on a CategoricalIndex."""
    object_labels = pd.Index(["A", "B"], dtype="object", name="lab")
    object_ds = xr.Dataset(
        {"v": ("lab", [1, 2])},
        coords={"lab": object_labels},
    )

    categorical_labels = pd.CategoricalIndex(
        ["B", "C"], categories=["A", "B", "C"], name="lab"
    )
    categorical_ds = xr.Dataset(
        {"v": ("lab", [3, 4])},
        coords={"lab": categorical_labels},
    )

    xr.align(object_ds, categorical_ds, join="inner")


def test_categorical_coord_align_with_null():
    """Outer-align two CategoricalIndex "lab" coords, the first containing None."""
    shared_categories = ["A", "B", "C"]

    labels_with_null = pd.CategoricalIndex(
        ["A", None, "C"], categories=shared_categories, name="lab"
    )
    left = xr.DataArray(
        [1, 2, 3], dims="lab", coords={"lab": labels_with_null}, name="v1"
    )

    labels_plain = pd.CategoricalIndex(
        ["A", "B"], categories=shared_categories, name="lab"
    )
    right = xr.DataArray(
        [10, 20], dims="lab", coords={"lab": labels_plain}, name="v2"
    )

    xr.align(left, right, join="outer")


# =============================================================================
# Main
# =============================================================================

def main():
    """Print the version header, then run every reproducer case in order.

    Each case is a ``(label, callable)`` pair handed to ``run_test``, which
    prints one PASS/FAIL line per case. Cases are grouped by dtype family and
    the groups are executed in the order they are listed below.
    """
    print_header()

    numpy_stringdtype_cases = (
        ("NumPy StringDType values: where", test_numpy_stringdtype_values_where),
        ("NumPy StringDType values: concat", test_numpy_stringdtype_values_concat),
        ("NumPy StringDType coord vs string: align", test_numpy_stringdtype_coord_align),
        ("NumPy StringDType values: where with null", test_numpy_stringdtype_values_where_null),
    )

    # NOTE(review): in the last case of this group both indexes are built with
    # dtype="string[pyarrow]", so the "vs object" part of its label looks
    # inaccurate -- confirm before renaming.
    string_pyarrow_cases = (
        ("string[pyarrow] values: where", test_string_pyarrow_values_where),
        ("string[pyarrow] values: concat", test_string_pyarrow_values_concat),
        ("string[pyarrow] values: align self vs slice", test_string_pyarrow_values_align),
        ("string[pyarrow] values: where with null", test_string_pyarrow_values_where_null),
        ("string[pyarrow] coord: where", test_string_pyarrow_coord_where),
        ("string[pyarrow] coord: concat", test_string_pyarrow_coord_concat),
        ("string[pyarrow] coord vs object: align", test_string_pyarrow_coord_align),
        ("string[pyarrow] coord vs object with null: align", test_string_pyarrow_coord_align_with_null),
    )

    date32_pyarrow_cases = (
        ("date32[pyarrow] coord: where", test_date32_coord_where),
        ("date32[pyarrow] coord: concat", test_date32_coord_concat),
        ("date32[pyarrow] coord vs datetime64: align", test_date32_coord_align_vs_datetime64),
        ("date32[pyarrow] coord: where with null", test_date32_coord_where_null),
    )

    # Pandas nullable Int64 first, then the pyarrow-backed int64.
    int64_cases = (
        ("Int64 values: where", test_int64_nullable_values_where),
        ("Int64 values: concat", test_int64_nullable_values_concat),
        ("Int64 coord vs int64: align", test_int64_nullable_coord_align),
        ("Int64 values: where with null", test_int64_nullable_values_where_null),
        ("int64[pyarrow] values: where", test_int64_pyarrow_values_where),
        ("int64[pyarrow] values: concat", test_int64_pyarrow_values_concat),
        ("int64[pyarrow] coord vs int64: align", test_int64_pyarrow_coord_align),
        ("int64[pyarrow] values: where with null", test_int64_pyarrow_values_where_null),
    )

    categorical_cases = (
        ("Categorical values: where", test_categorical_values_where),
        ("Categorical values: concat", test_categorical_values_concat),
        ("Categorical values vs object: align", test_categorical_values_align_vs_object),
        ("Categorical values: where with null", test_categorical_values_where_null),
        ("CategoricalIndex coord: where", test_categorical_coord_where),
        ("CategoricalIndex coord: concat", test_categorical_coord_concat),
        ("object vs CategoricalIndex coord: align", test_categorical_coord_align_vs_object_index),
        ("CategoricalIndex coord with null: align", test_categorical_coord_align_with_null),
    )

    case_groups = (
        numpy_stringdtype_cases,
        string_pyarrow_cases,
        date32_pyarrow_cases,
        int64_cases,
        categorical_cases,
    )
    for group in case_groups:
        for label, case in group:
            run_test(label, case)


# Run the reproducer suite only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()

Steps to reproduce

No response

MVCE confirmation

  • Minimal example — the example is as focused as reasonably possible to demonstrate the underlying issue in xarray.
  • Complete example — the example is self-contained, including all data and the text of any traceback.
  • Verifiable example — the example copy & pastes into an IPython prompt or Binder notebook, returning the result.
  • New issue — a search of GitHub Issues suggests this is not a duplicate.
  • Recent environment — the issue occurs with the latest version of xarray and its dependencies.

Relevant log output

xarray version: 2025.11.0                                                                                                                                                                                                                                                                                             
pandas version: 2.3.3
numpy version: 2.3.5
pyarrow version: 22.0.0

FAIL | NumPy StringDType values: where  (DTypePromotionError: The DType <class 'numpy.dtypes.StringDType'> could not be promoted by <class 'numpy.dtypes._PyFloatDType'>. This means that no common DType exists for the given inputs. For example they cannot be stored in a single array unless the dtype is `object`. The full list of DTypes is: (<class 'numpy.dtypes.StringDType'>, <class 'numpy.dtypes._PyFloatDType'>))
PASS | NumPy StringDType values: concat
PASS | NumPy StringDType coord vs string: align
FAIL | NumPy StringDType values: where with null  (TypeError: dtype argument must be a NumPy dtype, but it is a <class 'numpy.dtypes.StringDType'>.)
FAIL | string[pyarrow] values: where  (TypeError: boolean value of NA is ambiguous)
PASS | string[pyarrow] values: concat
PASS | string[pyarrow] values: align self vs slice
PASS | string[pyarrow] values: where with null
PASS | string[pyarrow] coord: where
PASS | string[pyarrow] coord: concat
PASS | string[pyarrow] coord vs object: align
PASS | string[pyarrow] coord vs object with null: align
PASS | date32[pyarrow] coord: where
PASS | date32[pyarrow] coord: concat
FAIL | date32[pyarrow] coord vs datetime64: align  (TypeError: Cannot interpret 'date32[day][pyarrow]' as a data type)
PASS | date32[pyarrow] coord: where with null
FAIL | Int64 values: where  (TypeError: Cannot interpret 'Int64Dtype()' as a data type)
PASS | Int64 values: concat
FAIL | Int64 coord vs int64: align  (TypeError: Cannot interpret 'Int64Dtype()' as a data type)
FAIL | Int64 values: where with null  (TypeError: Cannot interpret 'Int64Dtype()' as a data type)
FAIL | int64[pyarrow] values: where  (TypeError: Cannot interpret 'int64[pyarrow]' as a data type)
FAIL | int64[pyarrow] values: concat  (IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices)
FAIL | int64[pyarrow] coord vs int64: align  (TypeError: Cannot interpret 'int64[pyarrow]' as a data type)
FAIL | int64[pyarrow] values: where with null  (TypeError: Cannot interpret 'int64[pyarrow]' as a data type)
FAIL | Categorical values: where  (TypeError: Cannot interpret 'CategoricalDtype(categories=['a', 'b', 'c'], ordered=False, categories_dtype=object)' as a data type)
PASS | Categorical values: concat
PASS | Categorical values vs object: align
FAIL | Categorical values: where with null  (TypeError: Cannot interpret 'CategoricalDtype(categories=['a', 'b', 'c'], ordered=False, categories_dtype=object)' as a data type)
PASS | CategoricalIndex coord: where
PASS | CategoricalIndex coord: concat
FAIL | object vs CategoricalIndex coord: align  (TypeError: Cannot interpret 'CategoricalDtype(categories=['A', 'B', 'C'], ordered=False, categories_dtype=object)' as a data type)
FAIL | CategoricalIndex coord with null: align  (TypeError: Cannot interpret 'CategoricalDtype(categories=['A', 'B', 'C'], ordered=False, categories_dtype=object)' as a data type)

Anything else we need to know?

No response

Environment

xarray version: 2025.11.0 pandas version: 2.3.3 numpy version: 2.3.5 pyarrow version: 22.0.0

Metadata

Metadata

Assignees

No one assigned

    Labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions