diff --git a/README.rst b/README.rst
index ac6d1047..78e24af2 100644
--- a/README.rst
+++ b/README.rst
@@ -41,6 +41,9 @@ What is New in ArrayKit
 
 Now building free-threaded compatible wheels for Python 3.13.
 
+Added ``is_objectable()`` and ``is_objectable_dt64()``.
+
+Added ``astype_array()``.
 
 
 1.0.9
diff --git a/src/__init__.py b/src/__init__.py
index dc4ed2e8..9c8bfb8d 100644
--- a/src/__init__.py
+++ b/src/__init__.py
@@ -31,7 +31,10 @@ from ._arraykit import array_to_tuple_array as array_to_tuple_array
 from ._arraykit import array_to_tuple_iter as array_to_tuple_iter
 from ._arraykit import nonzero_1d as nonzero_1d
 
-
+from ._arraykit import is_objectable_dt64 as is_objectable_dt64
+from ._arraykit import is_objectable as is_objectable
+from ._arraykit import astype_array as astype_array
 from ._arraykit import AutoMap as AutoMap
 from ._arraykit import FrozenAutoMap as FrozenAutoMap
 from ._arraykit import NonUniqueError as NonUniqueError
+
diff --git a/src/__init__.pyi b/src/__init__.pyi
index 26f2cc29..888bb58c 100644
--- a/src/__init__.pyi
+++ b/src/__init__.pyi
@@ -203,6 +203,9 @@ def get_new_indexers_and_screen(indexers: np.ndarray, positions: np.ndarray) ->
 def first_true_1d(__array: np.ndarray, *, forward: bool) -> int: ...
 def first_true_2d(__array: np.ndarray, *, forward: bool, axis: int) -> np.ndarray: ...
 def nonzero_1d(__array: np.ndarray, /) -> np.ndarray: ...
+def is_objectable_dt64(__array: np.ndarray, /) -> bool: ...
+def is_objectable(__array: np.ndarray, /) -> bool: ...
+def astype_array(__array: np.ndarray, __dtype: tp.Any = None, /) -> np.ndarray: ...  # __dtype: any dtype specifier; optional, None selects NumPy's default dtype
 def slice_to_ascending_slice(__slice: slice, __size: int) -> slice: ...
 def array_to_tuple_array(__array: np.ndarray) -> np.ndarray: ...
 def array_to_tuple_iter(__array: np.ndarray) -> tp.Iterator[tp.Tuple[tp.Any, ...]]: ...
\ No newline at end of file
diff --git a/src/_arraykit.c b/src/_arraykit.c
index 7ce767ad..49b30298 100644
--- a/src/_arraykit.c
+++ b/src/_arraykit.c
@@ -52,6 +52,9 @@ static PyMethodDef arraykit_methods[] = {
      NULL},
     {"count_iteration", count_iteration, METH_O, NULL},
     {"nonzero_1d", nonzero_1d, METH_O, NULL},
+    {"is_objectable_dt64", is_objectable_dt64, METH_O, NULL},
+    {"is_objectable", is_objectable, METH_O, NULL},
+    {"astype_array", astype_array, METH_VARARGS, NULL},
     {"isna_element",
      (PyCFunction)isna_element,
      METH_VARARGS | METH_KEYWORDS,
@@ -95,6 +98,7 @@ PyInit__arraykit(void)
         return NULL;
     }
 
+    // store a reference to the deepcopy function
     PyObject *copy = PyImport_ImportModule("copy");
     if (copy == NULL) {
         return NULL;
@@ -105,6 +109,18 @@ PyInit__arraykit(void)
         return NULL;
     }
 
+    // store a year dtype object
+    PyObject* dt_year_str = PyUnicode_FromString("datetime64[Y]");
+    if (!dt_year_str) return NULL;
+
+    PyArray_Descr* dt_year = NULL;
+    if (!PyArray_DescrConverter2(dt_year_str, &dt_year)) {
+        Py_DECREF(dt_year_str);
+        return NULL;
+    }
+    Py_DECREF(dt_year_str);
+
+
     PyObject *m = PyModule_Create(&arraykit_module);
     if (!m ||
         PyModule_AddStringConstant(m, "__version__", Py_STRINGIFY(AK_VERSION)) ||
@@ -128,9 +144,11 @@ PyInit__arraykit(void)
         PyModule_AddObject(m, "ErrorInitTypeBlocks", ErrorInitTypeBlocks) ||
         PyModule_AddObject(m, "AutoMap", (PyObject *)&AMType) ||
         PyModule_AddObject(m, "FrozenAutoMap", (PyObject *)&FAMType) ||
-        PyModule_AddObject(m, "NonUniqueError", NonUniqueError)
+        PyModule_AddObject(m, "NonUniqueError", NonUniqueError) ||
+        PyModule_AddObject(m, "dt_year", (PyObject *)dt_year)
     ){
-        Py_DECREF(deepcopy);
+        Py_XDECREF(deepcopy);
+        Py_XDECREF(dt_year);
         Py_XDECREF(m);
         return NULL;
     }
diff --git a/src/methods.c b/src/methods.c
index 21bcbeb4..d98f75cd 100644
--- a/src/methods.c
+++ b/src/methods.c
@@ -201,6 +201,139 @@ nonzero_1d(PyObject *Py_UNUSED(m), PyObject *a) {
     return AK_nonzero_1d(array);
 }
 
+// Fetch the module's "dt_year" dtype attribute and evaluate AK_is_objectable_dt64() for `array`. Returns 1 or 0, or -1 with an exception set.
+static int
+AK_is_objectable_dt64_from_module(PyObject *m, PyArrayObject *array) {
+    // this returns a new reference, or NULL if the attribute lookup fails
+    PyObject* dt_year = PyObject_GetAttrString(m, "dt_year");
+    if (!dt_year) {
+        return -1;
+    }
+    int is_objectable = AK_is_objectable_dt64(array, dt_year);
+    Py_DECREF(dt_year);
+    return is_objectable;
+}
+
+// Return True if a datetime64 or timedelta64 array can be converted to object dtype without data loss. Raises TypeError for any other dtype kind.
+PyObject*
+is_objectable_dt64(PyObject *m, PyObject *a) {
+    AK_CHECK_NUMPY_ARRAY(a);
+    PyArrayObject* array = (PyArrayObject*)a;
+
+    // AK_dt_unit_from_array reads datetime metadata without checking the dtype, so reject other kinds here
+    char kind = PyArray_DESCR(array)->kind;
+    if (kind != 'M' && kind != 'm') {
+        PyErr_SetString(PyExc_TypeError, "array must have a datetime64 or timedelta64 dtype");
+        return NULL;
+    }
+
+    switch (AK_is_objectable_dt64_from_module(m, array)) {
+    case 0:
+        Py_RETURN_FALSE;
+    case 1:
+        Py_RETURN_TRUE;
+    }
+    return NULL; // -1: an exception is already set
+}
+
+
+// Return True if the array can be converted to object dtype without data loss. Only datetime64 and timedelta64 arrays are inspected; every other dtype kind is reported as objectable.
+PyObject*
+is_objectable(PyObject *m, PyObject *a) {
+    AK_CHECK_NUMPY_ARRAY(a);
+    PyArrayObject* array = (PyArrayObject*)a;
+
+    char kind = PyArray_DESCR(array)->kind;
+    if (kind == 'M' || kind == 'm') {
+        switch (AK_is_objectable_dt64_from_module(m, array)) {
+        case -1:
+            return NULL;
+        case 0:
+            Py_RETURN_FALSE;
+        }
+    }
+    Py_RETURN_TRUE;
+}
+
+// Convert array to the dtype provided (the NumPy default dtype if omitted or None). NOTE: mutable arrays will be returned unless the input array is immutable and no dtype change is needed
+PyObject*
+astype_array(PyObject* m, PyObject* args) {
+
+    PyObject* a = NULL;
+    PyObject* dtype_spec = Py_None;
+
+    if (!PyArg_ParseTuple(args, "O!|O:astype_array",
+            &PyArray_Type, &a,
+            &dtype_spec)) {
+        return NULL;
+    }
+    PyArrayObject* array = (PyArrayObject*)a;
+
+    // dtype is an owned reference from here until it is released or stolen by a NumPy call
+    PyArray_Descr* dtype = NULL;
+    if (dtype_spec == Py_None) {
+        dtype = PyArray_DescrFromType(NPY_DEFAULT_TYPE);
+    } else {
+        if (!PyArray_DescrConverter(dtype_spec, &dtype)) {
+            return NULL;
+        }
+    }
+
+    if (PyArray_EquivTypes(PyArray_DESCR(array), dtype)) {
+        Py_DECREF(dtype);
+
+        if (PyArray_ISWRITEABLE(array)) {
+            return PyArray_NewCopy(array, NPY_ANYORDER); // NULL propagates on failure
+        }
+        // already immutable
+        Py_INCREF(a);
+        return a;
+    }
+    // if converting to an object
+    if (dtype->type_num == NPY_OBJECT) {
+        char kind = PyArray_DESCR(array)->kind;
+        if (kind == 'M' || kind == 'm') {
+            int is_objectable = AK_is_objectable_dt64_from_module(m, array);
+            if (is_objectable < 0) {
+                Py_DECREF(dtype);
+                return NULL;
+            }
+            if (!is_objectable) {
+                // NPY_CORDER because the buffer is filled linearly in C iteration order below; NewLikeArray steals the dtype reference, even on failure
+                PyObject* result = PyArray_NewLikeArray(array, NPY_CORDER, dtype, 0);
+                if (!result) {
+                    return NULL;
+                }
+                PyObject** data = (PyObject**)PyArray_DATA((PyArrayObject*)result);
+
+                PyArrayIterObject* it = (PyArrayIterObject*)PyArray_IterNew(a);
+                if (!it) {
+                    Py_DECREF(result);
+                    return NULL;
+                }
+
+                npy_intp i = 0;
+                while (it->index < it->size) {
+                    PyObject* item = PyArray_ToScalar(it->dataptr, array);
+                    if (!item) {
+                        Py_DECREF(result);
+                        Py_DECREF(it);
+                        return NULL;
+                    }
+                    data[i++] = item;
+                    PyArray_ITER_NEXT(it);
+                }
+                Py_DECREF(it);
+                return result;
+            }
+        }
+    }
+    // all other cases: do a standard cast conversion; CastToType steals the dtype reference, even on failure
+    return PyArray_CastToType(array, dtype, 0);
+}
+
+
+
 static char *first_true_1d_kwarg_names[] = {
     "array",
     "forward",
diff --git a/src/methods.h b/src/methods.h
index 751ccf85..1d33a558 100644
--- a/src/methods.h
+++ b/src/methods.h
@@ -47,6 +47,15 @@ resolve_dtype_iter(PyObject *Py_UNUSED(m), PyObject *arg);
 PyObject *
 nonzero_1d(PyObject *Py_UNUSED(m), PyObject *a);
 
+PyObject *
+is_objectable_dt64(PyObject *m, PyObject *a);
+
+PyObject *
+is_objectable(PyObject *m, PyObject *a);
+
+PyObject *
+astype_array(PyObject *m, PyObject *args);
+
 PyObject *
 first_true_1d(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kwargs);
 
diff --git a/src/utilities.h b/src/utilities.h
index 660681c0..05d81bcf 100644
--- a/src/utilities.h
+++ b/src/utilities.h
@@ -223,6 +223,69 @@ AK_slice_to_ascending_slice(PyObject* slice, Py_ssize_t size)
             -step);
 }
 
+
+static inline NPY_DATETIMEUNIT
+AK_dt_unit_from_array(PyArrayObject* a) {
+    // This is based on get_datetime_metadata_from_dtype in the NumPy source, but that function is private. This does not check that the dtype is of the appropriate type.
+    PyArray_Descr* dt = PyArray_DESCR(a); // borrowed ref
+    PyArray_DatetimeMetaData* dma = &(((PyArray_DatetimeDTypeMetaData *)PyDataType_C_METADATA(dt))->meta);
+    return dma->base;
+}
+
+// Given a dt64 array, determine if it can be cast to an object without data loss. Returns 1 if it can, 0 if it cannot, and -1 on error. NOTE: dt_year is a borrowed reference; it is increfed before being passed to a call that steals it
+static inline int
+AK_is_objectable_dt64(PyArrayObject* a, PyObject* dt_year)
+{
+    NPY_DATETIMEUNIT unit = AK_dt_unit_from_array(a);
+    switch (unit) {
+        case NPY_FR_ERROR:
+        case NPY_FR_Y:
+        case NPY_FR_M:
+        case NPY_FR_W:
+            return 0; // error marker, or a unit coarser than a day: never objectable
+        case NPY_FR_D:
+        case NPY_FR_h:
+        case NPY_FR_m:
+        case NPY_FR_s:
+        case NPY_FR_ms:
+        case NPY_FR_us:
+            break; // day through microsecond: objectable if every year is in range (checked below)
+        case NPY_FR_ns:
+        case NPY_FR_ps:
+        case NPY_FR_fs:
+        case NPY_FR_as:
+        case NPY_FR_GENERIC:
+            return 0; // finer than a microsecond, or generic: never objectable
+    }
+
+    // PyArray_CastToType steals a reference to the dtype, even on failure, so hand it one of our own
+    Py_INCREF(dt_year);
+    PyObject* a_year = PyArray_CastToType(a, (PyArray_Descr*)dt_year, 0);
+    if (!a_year) {
+        return -1;
+    }
+
+    npy_int64* data = (npy_int64*)PyArray_DATA((PyArrayObject*)a_year);
+    npy_intp size = PyArray_SIZE((PyArrayObject*)a_year);
+
+    for (npy_intp i = 0; i < size; ++i) {
+        npy_int64 v = data[i];
+        if (v == NPY_DATETIME_NAT) {
+            continue;
+        }
+        // offset: 1-1970, 9999-1970
+        if (v < -1969 || v > 8029) {
+            Py_DECREF(a_year);
+            return 0;
+        }
+    }
+    Py_DECREF(a_year);
+    return 1;
+}
+
+
+
+
 // Given a Boolean, contiguous 1D array, return the index positions in an int64 array. Through experimentation it has been verified that doing full-size allocation of memory provides the best performance at all scales. Using NpyIter, or using, bit masks does not improve performance over pointer arithmetic. Prescanning for all empty is very effective. Note that NumPy benefits from first counting the nonzeros, then allocating only enough data for the expexted number of indices.
 static inline PyObject *
 AK_nonzero_1d(PyArrayObject* array) {
@@ -319,15 +382,6 @@ AK_nonzero_1d(PyArrayObject* array) {
     return final;
 }
 
-static inline NPY_DATETIMEUNIT
-AK_dt_unit_from_array(PyArrayObject* a) {
-    // This is based on get_datetime_metadata_from_dtype in the NumPy source, but that function is private. This does not check that the dtype is of the appropriate type.
-    PyArray_Descr* dt = PyArray_DESCR(a); // borrowed ref
-    PyArray_DatetimeMetaData* dma = &(((PyArray_DatetimeDTypeMetaData *)PyDataType_C_METADATA(dt))->meta);
-    // PyArray_DatetimeMetaData* dma = &(((PyArray_DatetimeDTypeMetaData *)PyArray_DESCR(a)->c_metadata)->meta);
-    return dma->base;
-}
-
 static inline NPY_DATETIMEUNIT
 AK_dt_unit_from_scalar(PyDatetimeScalarObject* dts) {
     // Based on convert_pyobject_to_datetime and related usage in datetime.c
diff --git a/test/test_astype_array.py b/test/test_astype_array.py
new file mode 100644
index 00000000..acfd32fb
--- /dev/null
+++ b/test/test_astype_array.py
@@ -0,0 +1,123 @@
+import unittest
+
+import numpy as np
+
+from arraykit import astype_array
+
+class TestUnit(unittest.TestCase):
+
+    def test_astype_array_a1(self) -> None:  # immutable input, same dtype: the same object is returned
+        a1 = np.array([10, 20, 30], dtype=np.int64)
+        a1.flags.writeable = False
+
+        a2 = astype_array(a1, np.int64)
+        self.assertEqual(id(a1), id(a2))
+
+
+    def test_astype_array_a2(self) -> None:  # immutable input, different dtype: a new float64 array is returned
+        a1 = np.array([10, 20, 30], dtype=np.int64)
+        a1.flags.writeable = False
+
+        a2 = astype_array(a1, np.float64)
+        self.assertNotEqual(id(a1), id(a2))
+        self.assertEqual(a2.dtype, np.dtype(np.float64))
+
+
+    def test_astype_array_a3(self) -> None:  # bool to int8: result has the new dtype and is writeable
+        a1 = np.array([False, True, False])
+
+        a2 = astype_array(a1, np.int8)
+        self.assertEqual(a2.dtype, np.dtype(np.int8))
+        self.assertTrue(a2.flags.writeable)
+
+    def test_astype_array_b1(self) -> None:  # year-only datetime64 to object: elements stay np.datetime64
+        a1 = np.array(['2021', '2024'], dtype=np.datetime64)
+
+        a2 = astype_array(a1, np.object_)
+        self.assertEqual(a2.dtype, np.dtype(np.object_))
+        self.assertTrue(a2.flags.writeable)
+        self.assertEqual(list(a2), [np.datetime64('2021'), np.datetime64('2024')])
+
+
+    def test_astype_array_b2(self) -> None:  # as b1, with a year before 1970
+        a1 = np.array(['2021', '1642'], dtype=np.datetime64)
+
+        a2 = astype_array(a1, np.object_)
+        self.assertEqual(a2.dtype, np.dtype(np.object_))
+        self.assertTrue(a2.flags.writeable)
+        self.assertEqual(list(a2), [np.datetime64('2021'), np.datetime64('1642')])
+
+
+    def test_astype_array_b3(self) -> None:  # 2D (2, 2) year-only datetime64 to object: element positions preserved
+        a1 = np.array(['2021', '2024', '1984', '1642'], dtype=np.datetime64).reshape((2, 2))
+
+        a2 = astype_array(a1, np.object_)
+        self.assertEqual(a2.dtype, np.dtype(np.object_))
+        self.assertTrue(a2.flags.writeable)
+        self.assertEqual(
+                list(list(a) for a in a2),
+                [[np.datetime64('2021'), np.datetime64('2024')], [np.datetime64('1984'), np.datetime64('1642')]])
+
+    def test_astype_array_b4(self) -> None:  # 2D (2, 3) case: shape and element positions preserved
+        a1 = np.array(['2021', '2024', '1532', '1984', '1642', '899'], dtype=np.datetime64).reshape((2, 3))
+
+        a2 = astype_array(a1, np.object_)
+        self.assertEqual(a2.dtype, np.dtype(np.object_))
+        self.assertEqual(a2.shape, (2, 3))
+        self.assertTrue(a2.flags.writeable)
+        self.assertEqual(
+                list(list(a) for a in a2),
+                [[np.datetime64('2021'), np.datetime64('2024'), np.datetime64('1532')],
+                 [np.datetime64('1984'), np.datetime64('1642'), np.datetime64('899')]])
+
+    def test_astype_array_c(self) -> None:  # a non-array first argument raises TypeError
+        with self.assertRaises(TypeError):
+            _ = astype_array([3, 4, 5], np.int64)
+
+
+    def test_astype_array_d1(self) -> None:  # dtype omitted: converts to float64
+        a1 = np.array([10, 20, 30], dtype=np.int64)
+        a2 = astype_array(a1)
+
+        self.assertEqual(a2.dtype, np.dtype(np.float64))
+        self.assertEqual(a2.shape, (3,))
+        self.assertTrue(a2.flags.writeable)
+
+
+    def test_astype_array_d2(self) -> None:  # dtype given as None: converts to float64
+        a1 = np.array([10, 20, 30], dtype=np.int64)
+        a2 = astype_array(a1, None)
+
+        self.assertEqual(a2.dtype, np.dtype(np.float64))
+        self.assertEqual(a2.shape, (3,))
+        self.assertTrue(a2.flags.writeable)
+
+
+
+    def test_astype_array_d3(self) -> None:  # writeable input, same dtype: a distinct writeable copy is returned
+        a1 = np.array([10, 20, 30], dtype=np.int64)
+        a2 = astype_array(a1, np.int64)
+
+        self.assertEqual(a2.dtype, np.dtype(np.int64))
+        self.assertEqual(a2.shape, (3,))
+        self.assertTrue(a2.flags.writeable)
+
+        self.assertNotEqual(id(a1), id(a2))
+
+    def test_astype_array_e(self) -> None:  # nanosecond datetime64 to object: elements stay np.datetime64
+        a1 = np.array(['2021', '2024', '1997', '1984', '2000', '1999'], dtype='datetime64[ns]').reshape((2, 3))
+
+        a2 = astype_array(a1, np.object_)
+        self.assertEqual(a2.dtype, np.dtype(np.object_))
+        self.assertEqual(a2.shape, (2, 3))
+        self.assertTrue(a2.flags.writeable)
+        self.assertEqual(
+            list(list(a) for a in a2),
+            [[np.datetime64('2021-01-01T00:00:00.000000000'),
+              np.datetime64('2024-01-01T00:00:00.000000000'),
+              np.datetime64('1997-01-01T00:00:00.000000000')],
+             [np.datetime64('1984-01-01T00:00:00.000000000'),
+              np.datetime64('2000-01-01T00:00:00.000000000'),
+              np.datetime64('1999-01-01T00:00:00.000000000')]]
+            )
+
diff --git a/test/test_objectable.py b/test/test_objectable.py
new file mode 100644
index 00000000..9d617632
--- /dev/null
+++ b/test/test_objectable.py
@@ -0,0 +1,68 @@
+import unittest
+
+import numpy as np
+
+from arraykit import is_objectable_dt64
+from arraykit import is_objectable
+
+class TestUnit(unittest.TestCase):
+
+    def test_is_objectable_a1(self) -> None:  # dates in 2022 and 1954 are objectable
+        a1 = np.array(['2022-01-04', '1954-04-12'], dtype=np.datetime64)
+        self.assertTrue(is_objectable(a1))
+
+    def test_is_objectable_a2(self) -> None:  # a date in year 10000 is not objectable
+        a1 = np.array(['10000-01-04', '1954-04-12'], dtype=np.datetime64)
+        self.assertFalse(is_objectable(a1))
+
+    def test_is_objectable_b(self) -> None:  # integer arrays are objectable
+        a1 = np.array([10, 20])
+        self.assertTrue(is_objectable(a1))
+
+    def test_is_objectable_c(self) -> None:  # Boolean arrays are objectable
+        a1 = np.array([True, False])
+        self.assertTrue(is_objectable(a1))
+
+    def test_is_objectable_d(self) -> None:  # string arrays are objectable
+        a1 = np.array(['b', 'ccc'])
+        self.assertTrue(is_objectable(a1))
+
+    def test_is_objectable_e(self) -> None:  # object arrays are objectable
+        a1 = np.array(['b', None, False], dtype=object)
+        self.assertTrue(is_objectable(a1))
+
+
+    #---------------------------------------------------------------------------
+
+    def test_is_objectable_dt64_a1(self) -> None:  # dates in 2022 and 1954 are objectable
+        a1 = np.array(['2022-01-04', '1954-04-12'], dtype=np.datetime64)
+        self.assertTrue(is_objectable_dt64(a1))
+
+    def test_is_objectable_dt64_a2(self) -> None:  # an empty-string entry (presumably parsed as NaT) does not prevent objectability
+        a1 = np.array(['2022-01-04', '', '1954-04-12'], dtype=np.datetime64)
+        self.assertTrue(is_objectable_dt64(a1))
+
+    def test_is_objectable_dt64_a3(self) -> None:  # as a2, with several trailing empty-string entries
+        a1 = np.array(['2022-01-04', '1954-04-12', '', ''], dtype=np.datetime64)
+        self.assertTrue(is_objectable_dt64(a1))
+
+
+    def test_is_objectable_dt64_b(self) -> None:
+        # years are never objectable
+        a1 = np.array(['2022', '2023'], dtype=np.datetime64)
+        self.assertFalse(is_objectable_dt64(a1))
+
+
+    def test_is_objectable_dt64_c(self) -> None:  # minute unit with a negative year is not objectable
+        a1 = np.array(['-120-01-01', '2023-04-05'], dtype='datetime64[m]')
+        self.assertFalse(is_objectable_dt64(a1))
+
+    def test_is_objectable_dt64_d(self) -> None:  # second unit with a date in year 10000 is not objectable
+        a1 = np.array(['2024-01-01', '2023-04-05', '10000-01-01'], dtype='datetime64[s]')
+        self.assertFalse(is_objectable_dt64(a1))
+
+
+    def test_is_objectable_dt64_e(self) -> None:  # nanosecond unit is not objectable
+        a1 = np.array(['2024-01-01', '2023-04-05'], dtype='datetime64[ns]')
+        self.assertFalse(is_objectable_dt64(a1))
+
diff --git a/test/test_util.py b/test/test_util.py
index b300dbfd..7cabbb32 100644
--- a/test/test_util.py
+++ b/test/test_util.py
@@ -17,6 +17,7 @@ from arraykit import immutable_filter
 
 from arraykit import array_deepcopy
 from arraykit import isna_element
+from arraykit import is_objectable_dt64
 from arraykit import dtype_from_element
 from arraykit import count_iteration
 from arraykit import first_true_1d
@@ -953,7 +954,5 @@ def test_slice_to_ascending_slice_i(self) -> None:
             )
 
 
-
-
 if __name__ == '__main__':
     unittest.main()