From d3f473bf439eaf1c5b872f932584359046364727 Mon Sep 17 00:00:00 2001 From: Marek Dabek Date: Wed, 8 Apr 2026 14:39:07 +0200 Subject: [PATCH 01/11] Torchvision API RandomApply implementation Signed-off-by: Marek Dabek --- .../dali/experimental/torchvision/__init__.py | 2 + .../experimental/torchvision/v2/rand_apply.py | 58 ++++++++ .../python/torchvision/test_tv_rand_apply.py | 135 ++++++++++++++++++ 3 files changed, 195 insertions(+) create mode 100644 dali/python/nvidia/dali/experimental/torchvision/v2/rand_apply.py create mode 100644 dali/test/python/torchvision/test_tv_rand_apply.py diff --git a/dali/python/nvidia/dali/experimental/torchvision/__init__.py b/dali/python/nvidia/dali/experimental/torchvision/__init__.py index a87fc3f47f8..de4d4c47977 100644 --- a/dali/python/nvidia/dali/experimental/torchvision/__init__.py +++ b/dali/python/nvidia/dali/experimental/torchvision/__init__.py @@ -19,6 +19,7 @@ from .v2.gaussian_blur import GaussianBlur from .v2.normalize import Normalize from .v2.pad import Pad +from .v2.rand_apply import RandomApply from .v2.resize import Resize from .v2.totensor import ToPureTensor, PILToTensor, ToPILImage @@ -31,6 +32,7 @@ "Normalize", "Pad", "PILToTensor", + "RandomApply", "RandomGrayscale", "RandomHorizontalFlip", "RandomVerticalFlip", diff --git a/dali/python/nvidia/dali/experimental/torchvision/v2/rand_apply.py b/dali/python/nvidia/dali/experimental/torchvision/v2/rand_apply.py new file mode 100644 index 00000000000..cb09dadede6 --- /dev/null +++ b/dali/python/nvidia/dali/experimental/torchvision/v2/rand_apply.py @@ -0,0 +1,58 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Callable, Literal, Sequence + +from .operator import Operator, _ValidateIfZeroOneRange +import nvidia.dali.fn as fn +import nvidia.dali as dali + + +class RandomApply(Operator): + """ + Apply randomly a list of transformations with a given probability. + + Parameters + ---------- + op_list : Sequence[Callable] + List of transformations to apply. + p : float, optional, default = 0.5 + Probability of applying the transformations. + device : Literal["cpu", "gpu"], optional, default = "cpu" + Device to use for the operator. Can be ``"cpu"`` or ``"gpu"``. + """ + + arg_rules = [_ValidateIfZeroOneRange] + + def __init__( + self, + op_list: Sequence[Callable], + p: float = 0.5, + device: Literal["cpu", "gpu"] = "cpu", + ): + super().__init__(device=device, p=p) + self.p = p + self.op_list = op_list + + def _kernel(self, data_input): + """ + Randomly applies each operator in op_list sequentially. + """ + output = data_input + convert = fn.random.coin_flip(dtype=dali.types.DALIDataType.BOOL, probability=self.p) + if convert: + for op in self.op_list: + output = op(output) + + return output diff --git a/dali/test/python/torchvision/test_tv_rand_apply.py b/dali/test/python/torchvision/test_tv_rand_apply.py new file mode 100644 index 00000000000..cbf0367ee3b --- /dev/null +++ b/dali/test/python/torchvision/test_tv_rand_apply.py @@ -0,0 +1,135 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +from nose2.tools import params +from nose_utils import assert_raises +from PIL import Image +import torch +import torchvision.transforms.v2 as transforms + +from nvidia.dali.experimental.torchvision import ( + Compose, + Grayscale, + RandomApply, + RandomHorizontalFlip, +) + + +def verify_non_one_off(t1: torch.Tensor, t2: torch.Tensor): + if t1.dtype == torch.uint8: + t1 = t1.to(torch.int16) + t2 = t2.to(torch.int16) + + diff = (t1 - t2).abs() + more_than_one_mask = diff > 1 + + return more_than_one_mask.sum().item() == 0 + + +dali_extra = os.environ["DALI_EXTRA_PATH"] +jpeg = os.path.join(dali_extra, "db", "single", "jpeg") +jpeg_113 = os.path.join(jpeg, "113") +test_files = [ + os.path.join(jpeg_113, f) + for f in ["snail-4291306_1280.jpg", "snail-4345504_1280.jpg", "snail-4368154_1280.jpg"] +] + + +@params("cpu", "gpu") +def test_random_apply_p1(device): + """p=1.0: transformations always applied — output must match torchvision.""" + td = Compose( + [RandomApply([Grayscale(num_output_channels=3, device=device)], p=1.0, device=device)] + ) + t = transforms.Compose( + [transforms.RandomApply([transforms.Grayscale(num_output_channels=3)], p=1.0)] + ) + + for fn in test_files: + img = Image.open(fn) + out_tv = transforms.functional.pil_to_tensor(t(img)) + out_dali = transforms.functional.pil_to_tensor(td(img)) + assert verify_non_one_off(out_tv, out_dali), f"Images differ: {fn}" + + +@params("cpu", "gpu") 
+def test_random_apply_p0(device): + """p=0.0: transformations never applied — output must equal input.""" + td = Compose( + [RandomApply([Grayscale(num_output_channels=3, device=device)], p=0.0, device=device)] + ) + t = transforms.Compose( + [transforms.RandomApply([transforms.Grayscale(num_output_channels=3)], p=0.0)] + ) + + for fn in test_files: + img = Image.open(fn) + out_tv = transforms.functional.pil_to_tensor(t(img)) + out_dali = transforms.functional.pil_to_tensor(td(img)) + assert verify_non_one_off(out_tv, out_dali), f"Images differ: {fn}" + + +@params(-0.1, 2.0, [0.0, 0.8]) +def test_invalid_random_apply_probability(p): + with assert_raises(ValueError): + RandomApply([Grayscale(num_output_channels=3)], p=p) + + +@params("cpu", "gpu") +def test_random_apply_multi_ops(device): + """p=1.0 with multiple operators — all applied in sequence.""" + td = Compose( + [ + RandomApply( + [ + RandomHorizontalFlip(p=1.0, device=device), + Grayscale(num_output_channels=3, device=device), + ], + p=1.0, + device=device, + ) + ] + ) + t = transforms.Compose( + [ + transforms.RandomApply( + [ + transforms.RandomHorizontalFlip(p=1.0), + transforms.Grayscale(num_output_channels=3), + ], + p=1.0, + ) + ] + ) + + for fn in test_files: + img = Image.open(fn) + out_tv = transforms.functional.pil_to_tensor(t(img)) + out_dali = transforms.functional.pil_to_tensor(td(img)) + assert verify_non_one_off(out_tv, out_dali), f"Images differ: {fn}" + + +def test_random_apply_preserves_shape(): + """Output shape must match input shape regardless of p.""" + td_apply = Compose([RandomApply([RandomHorizontalFlip(p=1.0)], p=1.0)]) + td_skip = Compose([RandomApply([RandomHorizontalFlip(p=1.0)], p=0.0)]) + + for fn in test_files: + img = Image.open(fn) + out_apply = td_apply(img) + out_skip = td_skip(img) + assert out_apply.size == img.size, f"Shape mismatch after apply: {fn}" + assert out_skip.size == img.size, f"Shape mismatch after skip: {fn}" From b0c754b9cc12d77f1775e736025aab9dac953ee9 
Mon Sep 17 00:00:00 2001 From: Marek Dabek Date: Fri, 8 May 2026 14:06:32 +0200 Subject: [PATCH 02/11] Greptile review fixes Signed-off-by: Marek Dabek --- .../dali/experimental/torchvision/v2/rand_apply.py | 4 ++-- dali/test/python/torchvision/test_tv_rand_apply.py | 13 +++++++++---- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/dali/python/nvidia/dali/experimental/torchvision/v2/rand_apply.py b/dali/python/nvidia/dali/experimental/torchvision/v2/rand_apply.py index cb09dadede6..75890376ecc 100644 --- a/dali/python/nvidia/dali/experimental/torchvision/v2/rand_apply.py +++ b/dali/python/nvidia/dali/experimental/torchvision/v2/rand_apply.py @@ -43,7 +43,7 @@ def __init__( ): super().__init__(device=device, p=p) self.p = p - self.op_list = op_list + self._op_list = op_list def _kernel(self, data_input): """ @@ -52,7 +52,7 @@ def _kernel(self, data_input): output = data_input convert = fn.random.coin_flip(dtype=dali.types.DALIDataType.BOOL, probability=self.p) if convert: - for op in self.op_list: + for op in self._op_list: output = op(output) return output diff --git a/dali/test/python/torchvision/test_tv_rand_apply.py b/dali/test/python/torchvision/test_tv_rand_apply.py index cbf0367ee3b..fb5aee450b9 100644 --- a/dali/test/python/torchvision/test_tv_rand_apply.py +++ b/dali/test/python/torchvision/test_tv_rand_apply.py @@ -84,7 +84,7 @@ def test_random_apply_p0(device): @params(-0.1, 2.0, [0.0, 0.8]) def test_invalid_random_apply_probability(p): - with assert_raises(ValueError): + with assert_raises(ValueError, regex="p should be a floating point value in the interval"): RandomApply([Grayscale(num_output_channels=3)], p=p) @@ -122,10 +122,15 @@ def test_random_apply_multi_ops(device): assert verify_non_one_off(out_tv, out_dali), f"Images differ: {fn}" -def test_random_apply_preserves_shape(): +@params("cpu", "gpu") +def test_random_apply_preserves_shape(device): """Output shape must match input shape regardless of p.""" - td_apply = 
Compose([RandomApply([RandomHorizontalFlip(p=1.0)], p=1.0)]) - td_skip = Compose([RandomApply([RandomHorizontalFlip(p=1.0)], p=0.0)]) + td_apply = Compose( + [RandomApply([RandomHorizontalFlip(p=1.0, device=device)], p=1.0, device=device)] + ) + td_skip = Compose( + [RandomApply([RandomHorizontalFlip(p=1.0, device=device)], p=0.0, device=device)] + ) for fn in test_files: img = Image.open(fn) From 57302e8a0a2a185de7f29e7ba6b50ebf67a25c91 Mon Sep 17 00:00:00 2001 From: Marek Dabek Date: Wed, 13 May 2026 09:54:26 +0200 Subject: [PATCH 03/11] Adding 0 < p < 1 tests Signed-off-by: Marek Dabek --- .../python/torchvision/test_tv_rand_apply.py | 31 ++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/dali/test/python/torchvision/test_tv_rand_apply.py b/dali/test/python/torchvision/test_tv_rand_apply.py index fb5aee450b9..a1c7e68c00d 100644 --- a/dali/test/python/torchvision/test_tv_rand_apply.py +++ b/dali/test/python/torchvision/test_tv_rand_apply.py @@ -14,7 +14,7 @@ import os -from nose2.tools import params +from nose2.tools import params, cartesian_params from nose_utils import assert_raises from PIL import Image import torch @@ -138,3 +138,32 @@ def test_random_apply_preserves_shape(device): out_skip = td_skip(img) assert out_apply.size == img.size, f"Shape mismatch after apply: {fn}" assert out_skip.size == img.size, f"Shape mismatch after skip: {fn}" + + +@cartesian_params((0.01, 0.1, 0.25, 0.3, 0.8, 0.99), ("cpu", "gpu")) +def test_random_apply_p_sanity(p, device): + """Sanity test to verify if 0 < p < 1.""" + td = Compose( + [RandomApply([Grayscale(num_output_channels=3, device=device)], p=p, device=device)] + ) + for fn in test_files: + img = Image.open(fn) + _ = td(img) + + +@cartesian_params((0.3, 0.5, 0.8), ("cpu", "gpu")) +def test_random_apply_p(p, device): + """Sanity test to verify if p value varies application.""" + td = Compose( + [RandomApply([Grayscale(num_output_channels=3, device=device)], p=p, device=device)] + ) + reps = 
10 + for fn in test_files: + img = Image.open(fn) + tensor_img = transforms.functional.pil_to_tensor(img) + proc = 0 + for i in range(reps): + out_dali = transforms.functional.pil_to_tensor(td(img)) + if not verify_non_one_off(out_dali, tensor_img): + proc += 1 + assert proc > 0, f"RandomApply did not apply any operation in {reps} runs" From a4d62091033cc1cdd0d5e226fd54c00c0b33ea0b Mon Sep 17 00:00:00 2001 From: Marek Dabek Date: Wed, 13 May 2026 14:11:23 +0200 Subject: [PATCH 04/11] Review fixes Signed-off-by: Marek Dabek --- dali/test/python/torchvision/test_tv_rand_apply.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dali/test/python/torchvision/test_tv_rand_apply.py b/dali/test/python/torchvision/test_tv_rand_apply.py index a1c7e68c00d..23377711491 100644 --- a/dali/test/python/torchvision/test_tv_rand_apply.py +++ b/dali/test/python/torchvision/test_tv_rand_apply.py @@ -155,15 +155,15 @@ def test_random_apply_p_sanity(p, device): def test_random_apply_p(p, device): """Sanity test to verify if p value varies application.""" td = Compose( - [RandomApply([Grayscale(num_output_channels=3, device=device)], p=p, device=device)] + [RandomApply([Grayscale(num_output_channels=1, device=device)], p=p, device=device)] ) reps = 10 for fn in test_files: img = Image.open(fn) - tensor_img = transforms.functional.pil_to_tensor(img) proc = 0 for i in range(reps): out_dali = transforms.functional.pil_to_tensor(td(img)) - if not verify_non_one_off(out_dali, tensor_img): + # If grayscale was applied it will result in a single channel image + if out_dali.shape[0] == 1: proc += 1 assert proc > 0, f"RandomApply did not apply any operation in {reps} runs" From 693e9affe4da9af93312e44257d157b14fdff2c4 Mon Sep 17 00:00:00 2001 From: Marek Dabek Date: Mon, 18 May 2026 18:45:20 +0200 Subject: [PATCH 05/11] Torchvision API RandomCrop and crop operators Signed-off-by: Marek Dabek --- .../dali/experimental/torchvision/__init__.py | 2 + 
.../torchvision/v2/functional/__init__.py | 2 + .../torchvision/v2/functional/crop.py | 69 ++++ .../experimental/torchvision/v2/randomcrop.py | 283 ++++++++++++++++ dali/test/python/torchvision/test_tv_crop.py | 157 +++++++++ .../python/torchvision/test_tv_randomcrop.py | 304 ++++++++++++++++++ 6 files changed, 817 insertions(+) create mode 100644 dali/python/nvidia/dali/experimental/torchvision/v2/functional/crop.py create mode 100644 dali/python/nvidia/dali/experimental/torchvision/v2/randomcrop.py create mode 100644 dali/test/python/torchvision/test_tv_crop.py create mode 100644 dali/test/python/torchvision/test_tv_randomcrop.py diff --git a/dali/python/nvidia/dali/experimental/torchvision/__init__.py b/dali/python/nvidia/dali/experimental/torchvision/__init__.py index de4d4c47977..550dfd57bc5 100644 --- a/dali/python/nvidia/dali/experimental/torchvision/__init__.py +++ b/dali/python/nvidia/dali/experimental/torchvision/__init__.py @@ -20,6 +20,7 @@ from .v2.normalize import Normalize from .v2.pad import Pad from .v2.rand_apply import RandomApply +from .v2.randomcrop import RandomCrop from .v2.resize import Resize from .v2.totensor import ToPureTensor, PILToTensor, ToPILImage @@ -33,6 +34,7 @@ "Pad", "PILToTensor", "RandomApply", + "RandomCrop", "RandomGrayscale", "RandomHorizontalFlip", "RandomVerticalFlip", diff --git a/dali/python/nvidia/dali/experimental/torchvision/v2/functional/__init__.py b/dali/python/nvidia/dali/experimental/torchvision/v2/functional/__init__.py index ec19014a2d7..18003740b00 100644 --- a/dali/python/nvidia/dali/experimental/torchvision/v2/functional/__init__.py +++ b/dali/python/nvidia/dali/experimental/torchvision/v2/functional/__init__.py @@ -14,6 +14,7 @@ from .centercrop import center_crop from .color import to_grayscale, rgb_to_grayscale +from .crop import crop from .flips import horizontal_flip, vertical_flip from .gaussian_blur import gaussian_blur from .normalize import normalize @@ -23,6 +24,7 @@ __all__ = [ "center_crop", + 
"crop", "gaussian_blur", "horizontal_flip", "normalize", diff --git a/dali/python/nvidia/dali/experimental/torchvision/v2/functional/crop.py b/dali/python/nvidia/dali/experimental/torchvision/v2/functional/crop.py new file mode 100644 index 00000000000..d084b6aee0c --- /dev/null +++ b/dali/python/nvidia/dali/experimental/torchvision/v2/functional/crop.py @@ -0,0 +1,69 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import nvidia.dali.experimental.dynamic as ndd +from nvidia.dali._typing import TensorLike +from nvidia.dali.experimental.dynamic._device import DeviceLike + +from ..operator import adjust_input +from ..randomcrop import RandomCrop + + +def _get_crop_axes(inpt: TensorLike | ndd.Batch) -> list[int]: + layout = inpt.layout[-3:] + if layout == "HWC": + return [-3, -2] + if layout == "CHW": + return [-2, -1] + if inpt.layout[-2:] == "HW": + return [-2, -1] + raise ValueError(f"Unsupported layout: {inpt.layout!r}. Expected one of HWC, CHW, HW.") + + +def _verify_crop_coordinate(value, name: str) -> None: + if not isinstance(value, int): + raise TypeError(f"{name} must be int, got {type(value)}") + + +@adjust_input +def crop( + inpt: TensorLike | ndd.Batch, + top: int, + left: int, + height: int, + width: int, + device: DeviceLike = "cpu", +) -> ndd.Tensor | ndd.Batch: + """ + Please refer to the ``RandomCrop`` operator for more details. 
+ """ + _verify_crop_coordinate(top, "top") + _verify_crop_coordinate(left, "left") + RandomCrop.verify_args( + size=(height, width), + padding=None, + pad_if_needed=False, + padding_mode="constant", + fill=0, + ) + + return ndd.slice( + inpt, + (top, left), + (height, width), + axes=_get_crop_axes(inpt), + out_of_bounds_policy="pad", + fill_values=0, + device=device, + ) diff --git a/dali/python/nvidia/dali/experimental/torchvision/v2/randomcrop.py b/dali/python/nvidia/dali/experimental/torchvision/v2/randomcrop.py new file mode 100644 index 00000000000..9fb673d7de7 --- /dev/null +++ b/dali/python/nvidia/dali/experimental/torchvision/v2/randomcrop.py @@ -0,0 +1,283 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections +import numbers +from types import NoneType +from typing import Literal, Sequence, Union + +from PIL import Image +import nvidia.dali as dali +import nvidia.dali.fn as fn +import numpy as np +import torch + +from .centercrop import CenterCrop +from .operator import ( + Operator, + _ArgumentValidateRule, + _ValidateIfNonNegative, + _ValidateSizeDescriptor, + get_HWC_from_layout_pipeline, +) +from .pad import PADDING_CLASS, _ValidatePaddingMode + + +class _ValidateCropSize(_ArgumentValidateRule): + """ + Verify RandomCrop size values. 
+ """ + + @classmethod + def verify(cls, *, size, **_) -> None: + if isinstance(size, (list, tuple)) and any(not isinstance(value, int) for value in size): + raise ValueError(f"Size values must be integers, got {size}") + + +class _ValidatePadding(_ArgumentValidateRule): + """ + Verify RandomCrop padding arguments. + """ + + @classmethod + def verify(cls, *, padding, pad_if_needed, padding_mode, **_) -> None: + if not isinstance(pad_if_needed, bool): + raise TypeError(f"pad_if_needed must be bool, got {type(pad_if_needed)}") + + if padding is not None: + if not isinstance(padding, (int, list, tuple)): + raise TypeError( + f"Padding must be an int or a sequence of length 1, 2 or 4, " + f"got {type(padding)}" + ) + if isinstance(padding, (list, tuple)) and len(padding) not in (1, 2, 4): + raise ValueError(f"Padding sequence must have length 1, 2 or 4, got {len(padding)}") + if isinstance(padding, (list, tuple)) and any( + not isinstance(value, int) for value in padding + ): + raise ValueError(f"Padding values must be integers, got {padding}") + _ValidateIfNonNegative.verify(values=padding, name="padding") + + if pad_if_needed or padding is not None: + _ValidatePaddingMode.verify(padding_mode=padding_mode) + + +class _ValidateFill(_ArgumentValidateRule): + """ + Verify RandomCrop fill argument. 
+ """ + + @classmethod + def _verify_fill_value(cls, fill) -> None: + if fill is None or isinstance(fill, numbers.Number): + return + if isinstance(fill, (list, tuple)) and all( + isinstance(value, numbers.Number) for value in fill + ): + return + raise TypeError(f"fill must be a number, sequence of numbers, None or a dict, got {fill!r}") + + @classmethod + def verify(cls, *, fill, **_) -> None: + if isinstance(fill, dict): + for key, value in fill.items(): + if not isinstance(key, (type, str)): + raise TypeError(f"fill dictionary keys must be types or strings, got {key!r}") + cls._verify_fill_value(value) + else: + cls._verify_fill_value(fill) + + +class RandomCrop(Operator): + """ + Crop the input at a random location. + + If the input is a ``torch.Tensor`` it can have an arbitrary number of leading batch dimensions. + For example, the image tensor can have [..., C, H, W] shape. + + Parameters + ---------- + size : sequence or int + Desired output size of the crop. If size is an int instead of sequence like (h, w), + a square crop (size, size) is made. If provided a sequence of length 1, it will be + interpreted as (size[0], size[0]). + padding : int or sequence, optional, default = None + Optional padding on each border of the image, applied before cropping. If a single int + or a sequence of length 1 is provided this is used to pad all borders. If sequence of + length 2 is provided this is the padding on left/right and top/bottom respectively. If + a sequence of length 4 is provided this is the padding for the left, top, right and + bottom borders respectively. + pad_if_needed : bool, optional, default = False + Pad the image if it is smaller than the desired size. + fill : number or tuple or dict, optional, default = 0 + Pixel fill value used when the padding_mode is constant. + padding_mode : Literal["constant", "edge", "reflect", "symmetric"], optional, + Type of padding. Should be: constant, edge, reflect or symmetric. 
+ device : Literal["cpu", "gpu"], optional, default = "cpu" + Device to use for the crop. Can be ``"cpu"`` or ``"gpu"``. + """ + + arg_rules = [_ValidateSizeDescriptor, _ValidateCropSize, _ValidatePadding, _ValidateFill] + preprocess_data = get_HWC_from_layout_pipeline + + @classmethod + def adjust_size(cls, size: int | Sequence[int]) -> Sequence[int]: + return CenterCrop.adjust_size(size) + + @classmethod + def adjust_padding(cls, padding: None | int | Sequence[int]) -> tuple[int, int, int, int]: + if padding is None: + return 0, 0, 0, 0 + if isinstance(padding, int): + return padding, padding, padding, padding + if isinstance(padding, (list, tuple)): + if len(padding) == 1: + return padding[0], padding[0], padding[0], padding[0] + if len(padding) == 2: + return padding[0], padding[1], padding[0], padding[1] + if len(padding) == 4: + return tuple(padding) + + raise TypeError( + f"Padding must be an int or a sequence of length 1, 2 or 4, got {type(padding)}" + ) + + @staticmethod + def adjust_fill(fill): + if isinstance(fill, dict): + return {key: RandomCrop.adjust_fill(value) for key, value in fill.items()} + if fill is None: + return 0 + if isinstance(fill, numbers.Number): + return fill + return tuple(fill) + + @staticmethod + def _get_input_type(tensor): + layout = tensor.property("layout")[0] + if layout == np.frombuffer(bytes("F", "utf-8"), dtype=np.uint8)[0]: + layout = tensor.property("layout")[1] + if layout == np.frombuffer(bytes("C", "utf-8"), dtype=np.uint8)[0]: + return torch.Tensor + return Image.Image + + @staticmethod + def _get_fill(fill, tensor): + if not isinstance(fill, dict): + return fill + + input_type = RandomCrop._get_input_type(tensor) + string_keys = (input_type.__name__, f"{input_type.__module__}.{input_type.__name__}") + for key in (input_type, *string_keys): + if key in fill: + return fill[key] + if "others" in fill: + return fill["others"] + raise ValueError(f"fill dictionary does not contain a value for {input_type}") + + 
@staticmethod + def _randint(max_value): + range_start = fn.cast(max_value * 0, dtype=dali.types.FLOAT) + range_end = fn.cast(max_value + 1, dtype=dali.types.FLOAT) + value = dali.math.floor(fn.random.uniform(range=fn.stack(range_start, range_end))) + return fn.cast(value, dtype=dali.types.INT32) + + def __init__( + self, + size: int | Sequence[int], + padding: None | int | Sequence[int] = None, + pad_if_needed: bool = False, + fill: Union[ + int, + float, + Sequence[int], + Sequence[float], + None, + dict[ + type | str, + int + | float + | collections.abc.Sequence[int] + | collections.abc.Sequence[float] + | NoneType, + ], + ] = 0, + padding_mode: Literal["constant", "edge", "reflect", "symmetric"] = "constant", + device: Literal["cpu", "gpu"] = "cpu", + ): + super().__init__( + device=device, + size=size, + padding=padding, + pad_if_needed=pad_if_needed, + padding_mode=padding_mode, + fill=fill, + ) + + self.size = RandomCrop.adjust_size(size) + self.padding = RandomCrop.adjust_padding(padding) + self.pad_if_needed = pad_if_needed + self.fill = RandomCrop.adjust_fill(fill) + self.padding_mode = padding_mode + self.needs_padding = pad_if_needed or any(self.padding) + + def _kernel(self, data_input): + """ + Applies the random crop to the input data. 
+ """ + in_h, in_w, _, tensor = data_input + crop_h, crop_w = self.size + pad_left, pad_top, pad_right, pad_bottom = self.padding + + if self.needs_padding: + padded_h = in_h + pad_top + pad_bottom + padded_w = in_w + pad_left + pad_right + + if self.pad_if_needed: + pad_h = dali.math.max(crop_h - padded_h, 0) + pad_w = dali.math.max(crop_w - padded_w, 0) + pad_top = pad_top + pad_h + pad_bottom = pad_bottom + pad_h + pad_left = pad_left + pad_w + pad_right = pad_right + pad_w + + tensor = fn.slice( + tensor, + fn.stack( + fn.cast(-pad_left, dtype=dali.types.INT64), + fn.cast(-pad_top, dtype=dali.types.INT64), + ), + fn.stack(in_w + pad_left + pad_right, in_h + pad_top + pad_bottom), + out_of_bounds_policy=PADDING_CLASS[self.padding_mode].border_type, + fill_values=self.fill, + device=self.device, + axis_names="WH", + ) + + in_h = in_h + pad_top + pad_bottom + in_w = in_w + pad_left + pad_right + + max_top = fn.cast(in_h, dtype=dali.types.INT32) - crop_h + max_left = fn.cast(in_w, dtype=dali.types.INT32) - crop_w + + top = RandomCrop._randint(max_top) + left = RandomCrop._randint(max_left) + + return fn.slice( + tensor, + fn.stack(left, top), + fn.stack(crop_w, crop_h), + device=self.device, + axis_names="WH", + ) diff --git a/dali/test/python/torchvision/test_tv_crop.py b/dali/test/python/torchvision/test_tv_crop.py new file mode 100644 index 00000000000..07d45d0ed2e --- /dev/null +++ b/dali/test/python/torchvision/test_tv_crop.py @@ -0,0 +1,157 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import unittest + +from nose2.tools import params +from nose_utils import assert_raises +import numpy as np +from PIL import Image +import torch +import torchvision.transforms.v2.functional as tv_fn + +from nvidia.dali.experimental.torchvision.v2.functional import crop + + +def make_test_tensor(shape=(3, 8, 10), dtype=torch.uint8): + return torch.arange(math.prod(shape), dtype=dtype).reshape(shape) + + +def _make_pil_image(mode, h=8, w=10, seed=42): + rng = np.random.default_rng(seed) + if mode == "L": + data = rng.integers(0, 256, (h, w), dtype=np.uint8) + elif mode == "RGB": + data = rng.integers(0, 256, (h, w, 3), dtype=np.uint8) + elif mode == "RGBA": + data = rng.integers(0, 256, (h, w, 4), dtype=np.uint8) + else: + raise ValueError(f"Unsupported mode: {mode}") + return Image.fromarray(data, mode=mode) + + +def _assert_crop_matches_torchvision(inpt, top, left, height, width, device="cpu"): + dali_out = crop(inpt, top, left, height, width, device=device) + tv_out = tv_fn.crop(inpt, top, left, height, width) + + if device == "gpu" and not isinstance(dali_out, Image.Image): + dali_out = dali_out.cpu() + if isinstance(tv_out, torch.Tensor): + tv_out = tv_out.cpu() + + if isinstance(inpt, Image.Image): + assert isinstance(dali_out, Image.Image), f"Expected PIL Image, got {type(dali_out)}" + assert dali_out.mode == tv_out.mode, f"Expected mode {tv_out.mode}, got {dali_out.mode}" + dali_out = tv_fn.pil_to_tensor(dali_out) + tv_out = tv_fn.pil_to_tensor(tv_out) + + assert dali_out.shape == tv_out.shape, f"Shape mismatch: {dali_out.shape} != {tv_out.shape}" + assert torch.equal(dali_out, tv_out), "DALI crop output differs from torchvision" + + +@params( + (1, 2, 4, 5), + (0, 0, 8, 10), + (3, 4, 2, 3), +) +def test_crop_tensor_cpu(top, left, height, width): + _assert_crop_matches_torchvision(make_test_tensor(), top, left, height, width) + + 
+@params("L", "RGB", "RGBA") +def test_crop_pil_cpu(mode): + _assert_crop_matches_torchvision(_make_pil_image(mode), top=1, left=2, height=4, width=5) + + +@params( + (-1, -2, 6, 8), + (6, 8, 5, 6), + (0, 0, 12, 14), +) +def test_crop_out_of_bounds_tensor_cpu(top, left, height, width): + _assert_crop_matches_torchvision(make_test_tensor(), top, left, height, width) + + +@params("L", "RGB", "RGBA") +def test_crop_out_of_bounds_pil_cpu(mode): + _assert_crop_matches_torchvision(_make_pil_image(mode), top=-2, left=-3, height=12, width=14) + + +def test_crop_batched_tensor_cpu(): + tensor = make_test_tensor(shape=(4, 3, 8, 10)) + _assert_crop_matches_torchvision(tensor, top=2, left=3, height=4, width=5) + + +@unittest.skipUnless(torch.cuda.is_available(), "CUDA is not available") +def test_crop_batched_tensor_gpu(): + tensor = make_test_tensor(shape=(4, 3, 8, 10)).cuda() + _assert_crop_matches_torchvision(tensor, top=2, left=3, height=4, width=5, device="gpu") + + +@params(torch.float32, torch.int16, torch.int32) +def test_crop_preserves_tensor_dtype_cpu(dtype): + tensor = make_test_tensor(dtype=dtype) + dali_out = crop(tensor, top=1, left=1, height=4, width=5) + tv_out = tv_fn.crop(tensor, top=1, left=1, height=4, width=5) + + assert dali_out.dtype == tv_out.dtype, f"Expected dtype {tv_out.dtype}, got {dali_out.dtype}" + assert torch.equal(dali_out, tv_out), "DALI crop output differs from torchvision" + + +def test_crop_invalid_input_type(): + with assert_raises(TypeError): + _ = crop([1, 2, 3], top=0, left=0, height=1, width=1) + + +@params( + (0, 1), + (1, 0), + (-1, 1), + (1, -1), + (1.0, 1), + (1, 1.0), +) +def test_crop_invalid_output_size(height, width): + with assert_raises((TypeError, ValueError)): + _ = crop(make_test_tensor(), top=0, left=0, height=height, width=width) + + +@params( + (0.5, 0), + ("0", 0), + (0, 0.5), + (0, "0"), +) +def test_crop_invalid_coordinates(top, left): + with assert_raises(TypeError): + _ = crop(make_test_tensor(), top=top, 
left=left, height=1, width=1) + + +@unittest.skipUnless(torch.cuda.is_available(), "CUDA is not available") +@params( + (1, 2, 4, 5), + (-1, -2, 6, 8), +) +def test_crop_tensor_gpu(top, left, height, width): + tensor = make_test_tensor().cuda() + _assert_crop_matches_torchvision(tensor, top, left, height, width, device="gpu") + + +@unittest.skipUnless(torch.cuda.is_available(), "CUDA is not available") +@params("L", "RGB", "RGBA") +def test_crop_pil_gpu(mode): + _assert_crop_matches_torchvision( + _make_pil_image(mode), top=-2, left=-3, height=12, width=14, device="gpu" + ) diff --git a/dali/test/python/torchvision/test_tv_randomcrop.py b/dali/test/python/torchvision/test_tv_randomcrop.py new file mode 100644 index 00000000000..df1129a7dea --- /dev/null +++ b/dali/test/python/torchvision/test_tv_randomcrop.py @@ -0,0 +1,304 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import math +import unittest + +from nose2.tools import params +from nose_utils import assert_raises +import numpy as np +from PIL import Image +import torch +import torchvision.transforms.v2 as transforms +import torchvision.transforms.v2.functional as tv_fn + +from nvidia.dali.experimental.torchvision import Compose, RandomCrop +from nvidia.dali.experimental.torchvision.v2.operator import Operator + + +def make_tensor(shape=(3, 8, 10), dtype=torch.uint8): + return torch.arange(math.prod(shape), dtype=dtype).reshape(shape) + + +def make_pil_image(mode="RGB", h=8, w=10, seed=42): + rng = np.random.default_rng(seed) + if mode == "L": + data = rng.integers(0, 256, (h, w), dtype=np.uint8) + elif mode == "RGB": + data = rng.integers(0, 256, (h, w, 3), dtype=np.uint8) + elif mode == "RGBA": + data = rng.integers(0, 256, (h, w, 4), dtype=np.uint8) + else: + raise ValueError(f"Unsupported mode: {mode}") + return Image.fromarray(data, mode=mode) + + +def _to_tensor(inpt): + if isinstance(inpt, Image.Image): + return tv_fn.pil_to_tensor(inpt) + return inpt + + +def _assert_equal_to_torchvision(inpt, dali_transform, tv_transform, device="cpu"): + out = dali_transform(inpt) + tv_out = tv_transform(inpt) + + out = _to_tensor(out) + tv_out = _to_tensor(tv_out) + if device == "gpu": + out = out.cpu() + if isinstance(tv_out, torch.Tensor): + tv_out = tv_out.cpu() + + assert out.shape == tv_out.shape, f"Shape mismatch: {out.shape} != {tv_out.shape}" + assert torch.equal(out, tv_out), "DALI RandomCrop output differs from torchvision" + + +def _build_dali_random_crop(**kwargs): + batch_size = kwargs.pop("batch_size", 1) + return Compose([RandomCrop(**kwargs)], batch_size=batch_size) + + +def test_random_crop_is_operator(): + assert issubclass(RandomCrop, Operator) + + +@params( + (make_tensor(), (8, 10)), + (make_tensor(shape=(4, 3, 8, 10)), (8, 10)), + (make_pil_image("L"), (8, 10)), + (make_pil_image("RGB"), (8, 10)), + (make_pil_image("RGBA"), (8, 10)), +) +def 
test_random_crop_identity_matches_torchvision_cpu(inpt, size): + batch_size = inpt.shape[0] if isinstance(inpt, torch.Tensor) and inpt.ndim > 3 else 1 + _assert_equal_to_torchvision( + inpt, + _build_dali_random_crop(size=size, batch_size=batch_size), + transforms.RandomCrop(size=size), + ) + + +@unittest.skipUnless(torch.cuda.is_available(), "CUDA is not available") +@params( + ("tensor", (3, 8, 10), (8, 10)), + ("tensor", (4, 3, 8, 10), (8, 10)), + ("pil", "RGB", (8, 10)), +) +def test_random_crop_identity_matches_torchvision_gpu(input_type, input_arg, size): + inpt = make_pil_image(input_arg) if input_type == "pil" else make_tensor(shape=input_arg).cuda() + batch_size = inpt.shape[0] if isinstance(inpt, torch.Tensor) and inpt.ndim > 3 else 1 + _assert_equal_to_torchvision( + inpt, + _build_dali_random_crop(size=size, device="gpu", batch_size=batch_size), + transforms.RandomCrop(size=size), + device="gpu", + ) + + +@params( + (None, 0, "constant"), + (1, 0, "constant"), + ([1], 0, "constant"), + ([1, 1], 0, "constant"), + ([1, 1, 1, 1], 0, "constant"), + (1, 7, "constant"), + (1, (1, 2, 3), "constant"), + (1, None, "constant"), + (1, 0, "edge"), + (1, 0, "reflect"), + (1, 0, "symmetric"), +) +def test_random_crop_padding_matches_torchvision_tensor_cpu(padding, fill, padding_mode): + tensor = make_tensor(shape=(3, 4, 5)) + size = (4, 5) if padding is None else (6, 7) + + _assert_equal_to_torchvision( + tensor, + _build_dali_random_crop( + size=size, + padding=padding, + fill=fill, + padding_mode=padding_mode, + ), + transforms.RandomCrop( + size=size, + padding=padding, + fill=fill, + padding_mode=padding_mode, + ), + ) + + +@unittest.skipUnless(torch.cuda.is_available(), "CUDA is not available") +@params( + (1, 7, "constant"), + (1, 0, "edge"), + (1, 0, "reflect"), + (1, 0, "symmetric"), +) +def test_random_crop_padding_matches_torchvision_tensor_gpu(padding, fill, padding_mode): + tensor = make_tensor(shape=(3, 4, 5)).cuda() + _assert_equal_to_torchvision( + 
tensor, + _build_dali_random_crop( + size=(6, 7), + padding=padding, + fill=fill, + padding_mode=padding_mode, + device="gpu", + ), + transforms.RandomCrop( + size=(6, 7), + padding=padding, + fill=fill, + padding_mode=padding_mode, + ), + device="gpu", + ) + + +@params("L", "RGB", "RGBA") +def test_random_crop_padding_matches_torchvision_pil_cpu(mode): + img = make_pil_image(mode=mode, h=4, w=5) + _assert_equal_to_torchvision( + img, + _build_dali_random_crop(size=(6, 7), padding=1, fill=3), + transforms.RandomCrop(size=(6, 7), padding=1, fill=3), + ) + + +@unittest.skipUnless(torch.cuda.is_available(), "CUDA is not available") +@params("L", "RGB", "RGBA") +def test_random_crop_padding_matches_torchvision_pil_gpu(mode): + img = make_pil_image(mode=mode, h=4, w=5) + _assert_equal_to_torchvision( + img, + _build_dali_random_crop(size=(6, 7), padding=1, fill=3, device="gpu"), + transforms.RandomCrop(size=(6, 7), padding=1, fill=3), + device="gpu", + ) + + +""" +# TODO: Tensor fill pattern is not currently supported +def test_random_crop_fill_dict_matches_torchvision_tensor_cpu(): + tensor = make_tensor(shape=(3, 4, 5)) + fill = {torch.Tensor: 9} + _assert_equal_to_torchvision( + tensor, + _build_dali_random_crop(size=(6, 7), padding=1, fill=fill), + transforms.RandomCrop(size=(6, 7), padding=1, fill=fill), + ) + +# TODO: fill pattern as tensor is not currently supported +def test_random_crop_fill_dict_matches_torchvision_pil_cpu(): + img = make_pil_image(mode="RGB", h=4, w=5) + fill = {Image.Image: (1, 2, 3)} + _assert_equal_to_torchvision( + img, + _build_dali_random_crop(size=(6, 7), padding=1, fill=fill), + transforms.RandomCrop(size=(6, 7), padding=1, fill=fill), + ) +""" + + +@params( + (4, (4, 4)), + ([4, 5], (4, 5)), +) +def test_random_crop_tensor_shape_cpu(size, expected_hw): + tensor = make_tensor() + out = _build_dali_random_crop(size=size)(tensor) + + assert out.shape == (3, *expected_hw) + + +@unittest.skipUnless(torch.cuda.is_available(), "CUDA is not 
available") +@params( + (4, (4, 4)), + ([4, 5], (4, 5)), +) +def test_random_crop_tensor_shape_gpu(size, expected_hw): + tensor = make_tensor().cuda() + out = _build_dali_random_crop(size=size, device="gpu")(tensor) + + assert out.shape == (3, *expected_hw) + + +@params("cpu", "gpu") +def test_random_crop_pad_if_needed_shape(device): + if device == "gpu" and not torch.cuda.is_available(): + raise unittest.SkipTest("CUDA is not available") + + tensor = make_tensor(shape=(3, 4, 5)) + if device == "gpu": + tensor = tensor.cuda() + out = _build_dali_random_crop(size=(6, 7), pad_if_needed=True, device=device)(tensor) + + assert out.shape == (3, 6, 7) + + +@params( + [], + [0, 5], + [5, 0], + [1.0, 2], + [1, 2, 3], + -1, + 1.0, + {"bad": "value"}, +) +def test_random_crop_invalid_size(size): + with assert_raises((TypeError, ValueError)): + _ = RandomCrop(size=size) + + +@params( + -1, + [1, -1], + [1, 2, 3], + [1.0], + "bad", +) +def test_random_crop_invalid_padding(padding): + with assert_raises((TypeError, ValueError)): + _ = RandomCrop(size=3, padding=padding) + + +def test_random_crop_invalid_pad_if_needed(): + with assert_raises(TypeError): + _ = RandomCrop(size=3, pad_if_needed="yes") + + +@params( + object(), + "bad", + [1, object()], + {object(): 1}, + {torch.Tensor: object()}, +) +def test_random_crop_invalid_fill(fill): + with assert_raises(TypeError): + _ = RandomCrop(size=3, padding=1, fill=fill) + + +def test_random_crop_invalid_padding_mode_when_padding_is_used(): + with assert_raises(ValueError): + _ = RandomCrop(size=3, padding=1, padding_mode="bad") + + +def test_random_crop_invalid_padding_mode_when_pad_if_needed_is_used(): + with assert_raises(ValueError): + _ = RandomCrop(size=3, pad_if_needed=True, padding_mode="bad") From 2c7e9ef4332954420b539fa53d4f87ec86a4a859 Mon Sep 17 00:00:00 2001 From: Marek Dabek Date: Tue, 19 May 2026 13:18:58 +0200 Subject: [PATCH 06/11] Greptile review comments and "cpu"/"gpu" unit tests Signed-off-by: Marek Dabek --- 
.../experimental/torchvision/v2/randomcrop.py | 25 +-- .../python/torchvision/test_tv_randomcrop.py | 170 +++++++----------- 2 files changed, 68 insertions(+), 127 deletions(-) diff --git a/dali/python/nvidia/dali/experimental/torchvision/v2/randomcrop.py b/dali/python/nvidia/dali/experimental/torchvision/v2/randomcrop.py index 9fb673d7de7..55f8fed339c 100644 --- a/dali/python/nvidia/dali/experimental/torchvision/v2/randomcrop.py +++ b/dali/python/nvidia/dali/experimental/torchvision/v2/randomcrop.py @@ -163,32 +163,9 @@ def adjust_fill(fill): return fill return tuple(fill) - @staticmethod - def _get_input_type(tensor): - layout = tensor.property("layout")[0] - if layout == np.frombuffer(bytes("F", "utf-8"), dtype=np.uint8)[0]: - layout = tensor.property("layout")[1] - if layout == np.frombuffer(bytes("C", "utf-8"), dtype=np.uint8)[0]: - return torch.Tensor - return Image.Image - - @staticmethod - def _get_fill(fill, tensor): - if not isinstance(fill, dict): - return fill - - input_type = RandomCrop._get_input_type(tensor) - string_keys = (input_type.__name__, f"{input_type.__module__}.{input_type.__name__}") - for key in (input_type, *string_keys): - if key in fill: - return fill[key] - if "others" in fill: - return fill["others"] - raise ValueError(f"fill dictionary does not contain a value for {input_type}") - @staticmethod def _randint(max_value): - range_start = fn.cast(max_value * 0, dtype=dali.types.FLOAT) + range_start = fn.cast(0, dtype=dali.types.FLOAT) range_end = fn.cast(max_value + 1, dtype=dali.types.FLOAT) value = dali.math.floor(fn.random.uniform(range=fn.stack(range_start, range_end))) return fn.cast(value, dtype=dali.types.INT32) diff --git a/dali/test/python/torchvision/test_tv_randomcrop.py b/dali/test/python/torchvision/test_tv_randomcrop.py index df1129a7dea..0fac3565916 100644 --- a/dali/test/python/torchvision/test_tv_randomcrop.py +++ b/dali/test/python/torchvision/test_tv_randomcrop.py @@ -15,7 +15,7 @@ import math import unittest -from 
nose2.tools import params +from nose2.tools import cartesian_params, params from nose_utils import assert_raises import numpy as np from PIL import Image @@ -70,58 +70,65 @@ def _build_dali_random_crop(**kwargs): return Compose([RandomCrop(**kwargs)], batch_size=batch_size) -def test_random_crop_is_operator(): - assert issubclass(RandomCrop, Operator) +def _skip_if_gpu_unavailable(device): + if device == "gpu" and not torch.cuda.is_available(): + raise unittest.SkipTest("CUDA is not available") -@params( - (make_tensor(), (8, 10)), - (make_tensor(shape=(4, 3, 8, 10)), (8, 10)), - (make_pil_image("L"), (8, 10)), - (make_pil_image("RGB"), (8, 10)), - (make_pil_image("RGBA"), (8, 10)), -) -def test_random_crop_identity_matches_torchvision_cpu(inpt, size): - batch_size = inpt.shape[0] if isinstance(inpt, torch.Tensor) and inpt.ndim > 3 else 1 - _assert_equal_to_torchvision( - inpt, - _build_dali_random_crop(size=size, batch_size=batch_size), - transforms.RandomCrop(size=size), - ) +def _move_tensor_to_device(inpt, device): + if device == "gpu" and isinstance(inpt, torch.Tensor): + return inpt.cuda() + return inpt -@unittest.skipUnless(torch.cuda.is_available(), "CUDA is not available") -@params( - ("tensor", (3, 8, 10), (8, 10)), - ("tensor", (4, 3, 8, 10), (8, 10)), - ("pil", "RGB", (8, 10)), +def test_random_crop_is_operator(): + assert issubclass(RandomCrop, Operator) + + +@cartesian_params( + ("cpu", "gpu"), + ( + ("tensor", (3, 8, 10), (8, 10)), + ("tensor", (4, 3, 8, 10), (8, 10)), + ("pil", "L", (8, 10)), + ("pil", "RGB", (8, 10)), + ("pil", "RGBA", (8, 10)), + ), ) -def test_random_crop_identity_matches_torchvision_gpu(input_type, input_arg, size): - inpt = make_pil_image(input_arg) if input_type == "pil" else make_tensor(shape=input_arg).cuda() +def test_random_crop_identity_matches_torchvision(device, input_case): + _skip_if_gpu_unavailable(device) + input_type, input_arg, size = input_case + inpt = make_pil_image(input_arg) if input_type == "pil" else 
make_tensor(shape=input_arg) + inpt = _move_tensor_to_device(inpt, device) batch_size = inpt.shape[0] if isinstance(inpt, torch.Tensor) and inpt.ndim > 3 else 1 _assert_equal_to_torchvision( inpt, - _build_dali_random_crop(size=size, device="gpu", batch_size=batch_size), + _build_dali_random_crop(size=size, device=device, batch_size=batch_size), transforms.RandomCrop(size=size), - device="gpu", + device=device, ) -@params( - (None, 0, "constant"), - (1, 0, "constant"), - ([1], 0, "constant"), - ([1, 1], 0, "constant"), - ([1, 1, 1, 1], 0, "constant"), - (1, 7, "constant"), - (1, (1, 2, 3), "constant"), - (1, None, "constant"), - (1, 0, "edge"), - (1, 0, "reflect"), - (1, 0, "symmetric"), +@cartesian_params( + ("cpu", "gpu"), + ( + (None, 0, "constant"), + (1, 0, "constant"), + ([1], 0, "constant"), + ([1, 1], 0, "constant"), + ([1, 1, 1, 1], 0, "constant"), + (1, 7, "constant"), + (1, (1, 2, 3), "constant"), + (1, None, "constant"), + (1, 0, "edge"), + (1, 0, "reflect"), + (1, 0, "symmetric"), + ), ) -def test_random_crop_padding_matches_torchvision_tensor_cpu(padding, fill, padding_mode): - tensor = make_tensor(shape=(3, 4, 5)) +def test_random_crop_padding_matches_torchvision_tensor(device, padding_case): + _skip_if_gpu_unavailable(device) + padding, fill, padding_mode = padding_case + tensor = _move_tensor_to_device(make_tensor(shape=(3, 4, 5)), device) size = (4, 5) if padding is None else (6, 7) _assert_equal_to_torchvision( @@ -131,6 +138,7 @@ def test_random_crop_padding_matches_torchvision_tensor_cpu(padding, fill, paddi padding=padding, fill=fill, padding_mode=padding_mode, + device=device, ), transforms.RandomCrop( size=size, @@ -138,62 +146,25 @@ def test_random_crop_padding_matches_torchvision_tensor_cpu(padding, fill, paddi fill=fill, padding_mode=padding_mode, ), + device=device, ) -@unittest.skipUnless(torch.cuda.is_available(), "CUDA is not available") -@params( - (1, 7, "constant"), - (1, 0, "edge"), - (1, 0, "reflect"), - (1, 0, "symmetric"), -) 
-def test_random_crop_padding_matches_torchvision_tensor_gpu(padding, fill, padding_mode): - tensor = make_tensor(shape=(3, 4, 5)).cuda() - _assert_equal_to_torchvision( - tensor, - _build_dali_random_crop( - size=(6, 7), - padding=padding, - fill=fill, - padding_mode=padding_mode, - device="gpu", - ), - transforms.RandomCrop( - size=(6, 7), - padding=padding, - fill=fill, - padding_mode=padding_mode, - ), - device="gpu", - ) - - -@params("L", "RGB", "RGBA") -def test_random_crop_padding_matches_torchvision_pil_cpu(mode): +@cartesian_params(("cpu", "gpu"), ("L", "RGB", "RGBA")) +def test_random_crop_padding_matches_torchvision_pil(device, mode): + _skip_if_gpu_unavailable(device) img = make_pil_image(mode=mode, h=4, w=5) _assert_equal_to_torchvision( img, - _build_dali_random_crop(size=(6, 7), padding=1, fill=3), + _build_dali_random_crop(size=(6, 7), padding=1, fill=3, device=device), transforms.RandomCrop(size=(6, 7), padding=1, fill=3), - ) - - -@unittest.skipUnless(torch.cuda.is_available(), "CUDA is not available") -@params("L", "RGB", "RGBA") -def test_random_crop_padding_matches_torchvision_pil_gpu(mode): - img = make_pil_image(mode=mode, h=4, w=5) - _assert_equal_to_torchvision( - img, - _build_dali_random_crop(size=(6, 7), padding=1, fill=3, device="gpu"), - transforms.RandomCrop(size=(6, 7), padding=1, fill=3), - device="gpu", + device=device, ) """ # TODO: Tensor fill pattern is not currently supported -def test_random_crop_fill_dict_matches_torchvision_tensor_cpu(): +def test_random_crop_fill_dict_matches_torchvision_tensor(): tensor = make_tensor(shape=(3, 4, 5)) fill = {torch.Tensor: 9} _assert_equal_to_torchvision( @@ -203,7 +174,7 @@ def test_random_crop_fill_dict_matches_torchvision_tensor_cpu(): ) # TODO: fill pattern as tensor is not currently supported -def test_random_crop_fill_dict_matches_torchvision_pil_cpu(): +def test_random_crop_fill_dict_matches_torchvision_pil(): img = make_pil_image(mode="RGB", h=4, w=5) fill = {Image.Image: (1, 2, 3)} 
_assert_equal_to_torchvision( @@ -214,25 +185,18 @@ def test_random_crop_fill_dict_matches_torchvision_pil_cpu(): """ -@params( - (4, (4, 4)), - ([4, 5], (4, 5)), -) -def test_random_crop_tensor_shape_cpu(size, expected_hw): - tensor = make_tensor() - out = _build_dali_random_crop(size=size)(tensor) - - assert out.shape == (3, *expected_hw) - - -@unittest.skipUnless(torch.cuda.is_available(), "CUDA is not available") -@params( - (4, (4, 4)), - ([4, 5], (4, 5)), +@cartesian_params( + ("cpu", "gpu"), + ( + (4, (4, 4)), + ([4, 5], (4, 5)), + ), ) -def test_random_crop_tensor_shape_gpu(size, expected_hw): - tensor = make_tensor().cuda() - out = _build_dali_random_crop(size=size, device="gpu")(tensor) +def test_random_crop_tensor_shape(device, shape_case): + _skip_if_gpu_unavailable(device) + size, expected_hw = shape_case + tensor = _move_tensor_to_device(make_tensor(), device) + out = _build_dali_random_crop(size=size, device=device)(tensor) assert out.shape == (3, *expected_hw) From 08ebc425711921de449878f17cc25fb0fd380a82 Mon Sep 17 00:00:00 2001 From: Marek Dabek Date: Tue, 19 May 2026 14:43:56 +0200 Subject: [PATCH 07/11] Lint fixes Signed-off-by: Marek Dabek --- .../nvidia/dali/experimental/torchvision/v2/randomcrop.py | 3 --- dali/test/python/torchvision/test_tv_randomcrop.py | 3 +-- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/dali/python/nvidia/dali/experimental/torchvision/v2/randomcrop.py b/dali/python/nvidia/dali/experimental/torchvision/v2/randomcrop.py index 55f8fed339c..01bf27b4037 100644 --- a/dali/python/nvidia/dali/experimental/torchvision/v2/randomcrop.py +++ b/dali/python/nvidia/dali/experimental/torchvision/v2/randomcrop.py @@ -17,11 +17,8 @@ from types import NoneType from typing import Literal, Sequence, Union -from PIL import Image import nvidia.dali as dali import nvidia.dali.fn as fn -import numpy as np -import torch from .centercrop import CenterCrop from .operator import ( diff --git 
a/dali/test/python/torchvision/test_tv_randomcrop.py b/dali/test/python/torchvision/test_tv_randomcrop.py index 0fac3565916..9ac5425dd45 100644 --- a/dali/test/python/torchvision/test_tv_randomcrop.py +++ b/dali/test/python/torchvision/test_tv_randomcrop.py @@ -163,7 +163,7 @@ def test_random_crop_padding_matches_torchvision_pil(device, mode): """ -# TODO: Tensor fill pattern is not currently supported +# TODO: Fill using dictionary pattern is currently not supported def test_random_crop_fill_dict_matches_torchvision_tensor(): tensor = make_tensor(shape=(3, 4, 5)) fill = {torch.Tensor: 9} @@ -173,7 +173,6 @@ def test_random_crop_fill_dict_matches_torchvision_tensor(): transforms.RandomCrop(size=(6, 7), padding=1, fill=fill), ) -# TODO: fill pattern as tensor is not currently supported def test_random_crop_fill_dict_matches_torchvision_pil(): img = make_pil_image(mode="RGB", h=4, w=5) fill = {Image.Image: (1, 2, 3)} From 12dddd3b3362a6d274ffe5e4a75b27fd519ae9ac Mon Sep 17 00:00:00 2001 From: Marek Dabek Date: Tue, 19 May 2026 15:36:25 +0200 Subject: [PATCH 08/11] More tests Signed-off-by: Marek Dabek --- dali/test/python/torchvision/test_tv_crop.py | 91 +++++++-------- .../python/torchvision/test_tv_randomcrop.py | 106 ++++++++++++++++++ 2 files changed, 147 insertions(+), 50 deletions(-) diff --git a/dali/test/python/torchvision/test_tv_crop.py b/dali/test/python/torchvision/test_tv_crop.py index 07d45d0ed2e..3648c6c07c7 100644 --- a/dali/test/python/torchvision/test_tv_crop.py +++ b/dali/test/python/torchvision/test_tv_crop.py @@ -15,7 +15,7 @@ import math import unittest -from nose2.tools import params +from nose2.tools import cartesian_params, params from nose_utils import assert_raises import numpy as np from PIL import Image @@ -61,47 +61,56 @@ def _assert_crop_matches_torchvision(inpt, top, left, height, width, device="cpu assert torch.equal(dali_out, tv_out), "DALI crop output differs from torchvision" -@params( - (1, 2, 4, 5), - (0, 0, 8, 10), - (3, 4, 2, 
3), -) -def test_crop_tensor_cpu(top, left, height, width): - _assert_crop_matches_torchvision(make_test_tensor(), top, left, height, width) +def _skip_if_gpu_unavailable(device): + if device == "gpu" and not torch.cuda.is_available(): + raise unittest.SkipTest("CUDA is not available") -@params("L", "RGB", "RGBA") -def test_crop_pil_cpu(mode): - _assert_crop_matches_torchvision(_make_pil_image(mode), top=1, left=2, height=4, width=5) +def _move_tensor_to_device(tensor, device): + if device == "gpu": + return tensor.cuda() + return tensor -@params( - (-1, -2, 6, 8), - (6, 8, 5, 6), - (0, 0, 12, 14), +@cartesian_params( + ("cpu", "gpu"), + ( + (1, 2, 4, 5), + (0, 0, 8, 10), + (3, 4, 2, 3), + (-1, -2, 6, 8), + (6, 8, 5, 6), + (0, 0, 12, 14), + ), ) -def test_crop_out_of_bounds_tensor_cpu(top, left, height, width): - _assert_crop_matches_torchvision(make_test_tensor(), top, left, height, width) - - -@params("L", "RGB", "RGBA") -def test_crop_out_of_bounds_pil_cpu(mode): - _assert_crop_matches_torchvision(_make_pil_image(mode), top=-2, left=-3, height=12, width=14) - - -def test_crop_batched_tensor_cpu(): - tensor = make_test_tensor(shape=(4, 3, 8, 10)) - _assert_crop_matches_torchvision(tensor, top=2, left=3, height=4, width=5) +def test_crop_tensor(device, crop_case): + _skip_if_gpu_unavailable(device) + tensor = _move_tensor_to_device(make_test_tensor(), device) + _assert_crop_matches_torchvision(tensor, *crop_case, device=device) + + +@cartesian_params( + ("cpu", "gpu"), + ("L", "RGB", "RGBA"), + ( + (1, 2, 4, 5), + (-2, -3, 12, 14), + ), +) +def test_crop_pil(device, mode, crop_case): + _skip_if_gpu_unavailable(device) + _assert_crop_matches_torchvision(_make_pil_image(mode), *crop_case, device=device) -@unittest.skipUnless(torch.cuda.is_available(), "CUDA is not available") -def test_crop_batched_tensor_gpu(): - tensor = make_test_tensor(shape=(4, 3, 8, 10)).cuda() - _assert_crop_matches_torchvision(tensor, top=2, left=3, height=4, width=5, device="gpu") 
+@cartesian_params(("cpu", "gpu"), ((2, 3, 4, 5),)) +def test_crop_batched_tensor(device, crop_case): + _skip_if_gpu_unavailable(device) + tensor = _move_tensor_to_device(make_test_tensor(shape=(4, 3, 8, 10)), device) + _assert_crop_matches_torchvision(tensor, *crop_case, device=device) @params(torch.float32, torch.int16, torch.int32) -def test_crop_preserves_tensor_dtype_cpu(dtype): +def test_crop_preserves_tensor_dtype(dtype): tensor = make_test_tensor(dtype=dtype) dali_out = crop(tensor, top=1, left=1, height=4, width=5) tv_out = tv_fn.crop(tensor, top=1, left=1, height=4, width=5) @@ -137,21 +146,3 @@ def test_crop_invalid_output_size(height, width): def test_crop_invalid_coordinates(top, left): with assert_raises(TypeError): _ = crop(make_test_tensor(), top=top, left=left, height=1, width=1) - - -@unittest.skipUnless(torch.cuda.is_available(), "CUDA is not available") -@params( - (1, 2, 4, 5), - (-1, -2, 6, 8), -) -def test_crop_tensor_gpu(top, left, height, width): - tensor = make_test_tensor().cuda() - _assert_crop_matches_torchvision(tensor, top, left, height, width, device="gpu") - - -@unittest.skipUnless(torch.cuda.is_available(), "CUDA is not available") -@params("L", "RGB", "RGBA") -def test_crop_pil_gpu(mode): - _assert_crop_matches_torchvision( - _make_pil_image(mode), top=-2, left=-3, height=12, width=14, device="gpu" - ) diff --git a/dali/test/python/torchvision/test_tv_randomcrop.py b/dali/test/python/torchvision/test_tv_randomcrop.py index 9ac5425dd45..618125215e1 100644 --- a/dali/test/python/torchvision/test_tv_randomcrop.py +++ b/dali/test/python/torchvision/test_tv_randomcrop.py @@ -81,6 +81,42 @@ def _move_tensor_to_device(inpt, device): return inpt +def _possible_torchvision_random_crop_outputs(inpt, size, padding, fill=0, padding_mode="constant"): + crop_h, crop_w = RandomCrop.adjust_size(size) + pad_left, pad_top, pad_right, pad_bottom = RandomCrop.adjust_padding(padding) + + padded_h = inpt.shape[-2] + pad_top + pad_bottom + padded_w = 
inpt.shape[-1] + pad_left + pad_right + + if padded_h < crop_h: + diff = crop_h - padded_h + pad_top += diff + pad_bottom += diff + padded_h += 2 * diff + + if padded_w < crop_w: + diff = crop_w - padded_w + pad_left += diff + pad_right += diff + padded_w += 2 * diff + + padded = tv_fn.pad( + inpt, + padding=[pad_left, pad_top, pad_right, pad_bottom], + fill=fill, + padding_mode=padding_mode, + ) + + top_values = range(padded_h - crop_h + 1) if padded_h > crop_h else range(1) + left_values = range(padded_w - crop_w + 1) if padded_w > crop_w else range(1) + + return [ + tv_fn.crop(padded, top=top, left=left, height=crop_h, width=crop_w) + for top in top_values + for left in left_values + ] + + def test_random_crop_is_operator(): assert issubclass(RandomCrop, Operator) @@ -162,6 +198,65 @@ def test_random_crop_padding_matches_torchvision_pil(device, mode): ) +@cartesian_params( + ("cpu", "gpu"), + ( + ([0, 1, 2, 0], (7, 8)), + ([2, 0, 0, 1], (6, 9)), + ([1, 2, 0, 3], (10, 7)), + ), +) +def test_random_crop_asymmetric_padding_with_pad_if_needed(device, padding_case): + _skip_if_gpu_unavailable(device) + padding, size = padding_case + tensor = make_tensor(shape=(3, 4, 5)) + expected_outputs = _possible_torchvision_random_crop_outputs( + tensor, + size=size, + padding=padding, + ) + + dali_out = _build_dali_random_crop( + size=size, + padding=padding, + pad_if_needed=True, + device=device, + )(_move_tensor_to_device(tensor, device)).cpu() + + assert any( + torch.equal(dali_out, expected) for expected in expected_outputs + ), "DALI RandomCrop output is not a valid torchvision crop" + + +@cartesian_params(("cpu", "gpu")) +def test_random_crop_pad_if_needed_matches_torchvision_random_offsets(device): + _skip_if_gpu_unavailable(device) + tensor = make_tensor(shape=(3, 4, 5)) + size = (6, 7) + + expected_outputs = { + out.numpy().tobytes() + for out in _possible_torchvision_random_crop_outputs( + tensor, + size=size, + padding=None, + ) + } + tv_transform = 
transforms.RandomCrop(size=size, pad_if_needed=True) + tv_outputs = {tv_transform(tensor).numpy().tobytes() for _ in range(100)} + + assert len(tv_outputs) > 1, "Torchvision RandomCrop did not sample multiple offsets" + assert tv_outputs <= expected_outputs, "Torchvision produced an unexpected pad_if_needed crop" + + dali_tensor = _move_tensor_to_device(tensor, device) + dali_transform = _build_dali_random_crop(size=size, pad_if_needed=True, device=device) + dali_outputs = {dali_transform(dali_tensor).cpu().numpy().tobytes() for _ in range(20)} + + assert ( + dali_outputs <= expected_outputs + ), "DALI RandomCrop produced an invalid pad_if_needed crop" + + """ # TODO: Fill using dictionary pattern is currently not supported def test_random_crop_fill_dict_matches_torchvision_tensor(): @@ -200,6 +295,17 @@ def test_random_crop_tensor_shape(device, shape_case): assert out.shape == (3, *expected_hw) +@cartesian_params(("cpu", "gpu")) +def test_random_crop_samples_different_offsets(device): + _skip_if_gpu_unavailable(device) + tensor = _move_tensor_to_device(make_tensor(), device) + transform = _build_dali_random_crop(size=(4, 5), device=device) + + outputs = {bytes(transform(tensor).cpu().numpy().tobytes()) for _ in range(20)} + + assert len(outputs) > 1, "RandomCrop produced the same crop for every run" + + @params("cpu", "gpu") def test_random_crop_pad_if_needed_shape(device): if device == "gpu" and not torch.cuda.is_available(): From 5c32f8fb4a7aab15ae3c49f7dd720d895a407b92 Mon Sep 17 00:00:00 2001 From: Marek Dabek Date: Fri, 22 May 2026 19:35:05 +0200 Subject: [PATCH 09/11] Review comments Signed-off-by: Marek Dabek --- .../experimental/torchvision/v2/randomcrop.py | 100 ++++++------------ .../python/torchvision/test_tv_randomcrop.py | 69 ++++++++++-- 2 files changed, 96 insertions(+), 73 deletions(-) diff --git a/dali/python/nvidia/dali/experimental/torchvision/v2/randomcrop.py b/dali/python/nvidia/dali/experimental/torchvision/v2/randomcrop.py index 
01bf27b4037..957029b9685 100644 --- a/dali/python/nvidia/dali/experimental/torchvision/v2/randomcrop.py +++ b/dali/python/nvidia/dali/experimental/torchvision/v2/randomcrop.py @@ -12,10 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -import collections import numbers -from types import NoneType -from typing import Literal, Sequence, Union +from typing import Literal, Sequence import nvidia.dali as dali import nvidia.dali.fn as fn @@ -76,24 +74,16 @@ class _ValidateFill(_ArgumentValidateRule): """ @classmethod - def _verify_fill_value(cls, fill) -> None: + def verify(cls, *, fill, **_) -> None: if fill is None or isinstance(fill, numbers.Number): return if isinstance(fill, (list, tuple)) and all( isinstance(value, numbers.Number) for value in fill ): + if len(fill) == 0: + raise ValueError("fill sequence must be non-empty") return - raise TypeError(f"fill must be a number, sequence of numbers, None or a dict, got {fill!r}") - - @classmethod - def verify(cls, *, fill, **_) -> None: - if isinstance(fill, dict): - for key, value in fill.items(): - if not isinstance(key, (type, str)): - raise TypeError(f"fill dictionary keys must be types or strings, got {key!r}") - cls._verify_fill_value(value) - else: - cls._verify_fill_value(fill) + raise TypeError(f"fill must be a number, sequence of numbers, or None, got {fill!r}") class RandomCrop(Operator): @@ -117,9 +107,9 @@ class RandomCrop(Operator): bottom borders respectively. pad_if_needed : bool, optional, default = False Pad the image if it is smaller than the desired size. - fill : number or tuple or dict, optional, default = 0 + fill : number or tuple, optional, default = 0 Pixel fill value used when the padding_mode is constant. - padding_mode : Literal["constant", "edge", "reflect", "symmetric"], optional, + padding_mode : Literal["constant", "edge", "reflect", "symmetric"], optional, default="constant" Type of padding. 
Should be: constant, edge, reflect or symmetric. device : Literal["cpu", "gpu"], optional, default = "cpu" Device to use for the crop. Can be ``"cpu"`` or ``"gpu"``. @@ -152,8 +142,6 @@ def adjust_padding(cls, padding: None | int | Sequence[int]) -> tuple[int, int, @staticmethod def adjust_fill(fill): - if isinstance(fill, dict): - return {key: RandomCrop.adjust_fill(value) for key, value in fill.items()} if fill is None: return 0 if isinstance(fill, numbers.Number): @@ -172,21 +160,7 @@ def __init__( size: int | Sequence[int], padding: None | int | Sequence[int] = None, pad_if_needed: bool = False, - fill: Union[ - int, - float, - Sequence[int], - Sequence[float], - None, - dict[ - type | str, - int - | float - | collections.abc.Sequence[int] - | collections.abc.Sequence[float] - | NoneType, - ], - ] = 0, + fill: int | float | Sequence[int] | Sequence[float] | None = 0, padding_mode: Literal["constant", "edge", "reflect", "symmetric"] = "constant", device: Literal["cpu", "gpu"] = "cpu", ): @@ -214,44 +188,40 @@ def _kernel(self, data_input): crop_h, crop_w = self.size pad_left, pad_top, pad_right, pad_bottom = self.padding - if self.needs_padding: - padded_h = in_h + pad_top + pad_bottom - padded_w = in_w + pad_left + pad_right - - if self.pad_if_needed: - pad_h = dali.math.max(crop_h - padded_h, 0) - pad_w = dali.math.max(crop_w - padded_w, 0) - pad_top = pad_top + pad_h - pad_bottom = pad_bottom + pad_h - pad_left = pad_left + pad_w - pad_right = pad_right + pad_w - - tensor = fn.slice( - tensor, - fn.stack( - fn.cast(-pad_left, dtype=dali.types.INT64), - fn.cast(-pad_top, dtype=dali.types.INT64), - ), - fn.stack(in_w + pad_left + pad_right, in_h + pad_top + pad_bottom), - out_of_bounds_policy=PADDING_CLASS[self.padding_mode].border_type, - fill_values=self.fill, - device=self.device, - axis_names="WH", - ) + padded_h = in_h + pad_top + pad_bottom + padded_w = in_w + pad_left + pad_right - in_h = in_h + pad_top + pad_bottom - in_w = in_w + pad_left + pad_right + 
if self.pad_if_needed: + pad_h = dali.math.max(crop_h - padded_h, 0) + pad_w = dali.math.max(crop_w - padded_w, 0) + pad_top = pad_top + pad_h + pad_left = pad_left + pad_w + # Only pad_top / pad_left are read below; pad_bottom / pad_right are dropped. + padded_h = padded_h + 2 * pad_h + padded_w = padded_w + 2 * pad_w - max_top = fn.cast(in_h, dtype=dali.types.INT32) - crop_h - max_left = fn.cast(in_w, dtype=dali.types.INT32) - crop_w + max_top = fn.cast(padded_h, dtype=dali.types.INT32) - crop_h + max_left = fn.cast(padded_w, dtype=dali.types.INT32) - crop_w top = RandomCrop._randint(max_top) left = RandomCrop._randint(max_left) + slice_kwargs = { + "device": self.device, + "axis_names": "WH", + } + if self.needs_padding: + slice_kwargs.update( + out_of_bounds_policy=PADDING_CLASS[self.padding_mode].border_type, + fill_values=self.fill, + ) + return fn.slice( tensor, - fn.stack(left, top), + fn.stack( + fn.cast(left - pad_left, dtype=dali.types.INT32), + fn.cast(top - pad_top, dtype=dali.types.INT32), + ), fn.stack(crop_w, crop_h), - device=self.device, - axis_names="WH", + **slice_kwargs, ) diff --git a/dali/test/python/torchvision/test_tv_randomcrop.py b/dali/test/python/torchvision/test_tv_randomcrop.py index 618125215e1..e2201841526 100644 --- a/dali/test/python/torchvision/test_tv_randomcrop.py +++ b/dali/test/python/torchvision/test_tv_randomcrop.py @@ -23,6 +23,8 @@ import torchvision.transforms.v2 as transforms import torchvision.transforms.v2.functional as tv_fn +import nvidia.dali as dali +import nvidia.dali.experimental.torchvision.v2.randomcrop as randomcrop_module from nvidia.dali.experimental.torchvision import Compose, RandomCrop from nvidia.dali.experimental.torchvision.v2.operator import Operator @@ -121,6 +123,47 @@ def test_random_crop_is_operator(): assert issubclass(RandomCrop, Operator) +def test_random_crop_fuses_padding_into_crop_slice(): + transform = RandomCrop(size=(4, 5), padding=1) + slice_calls = [] + cast_calls = [] + + def 
fake_slice(tensor, anchor, shape, **kwargs): + slice_calls.append((tensor, anchor, shape, kwargs)) + return "cropped" + + def fake_cast(value, dtype): + cast_calls.append((value, dtype)) + return value + + old_slice = randomcrop_module.fn.slice + old_stack = randomcrop_module.fn.stack + old_cast = randomcrop_module.fn.cast + old_randint = RandomCrop._randint + try: + randomcrop_module.fn.slice = fake_slice + randomcrop_module.fn.stack = lambda *args: args + randomcrop_module.fn.cast = fake_cast + RandomCrop._randint = staticmethod(lambda max_value: 0) + + out = transform._kernel((4, 5, 3, "input")) + finally: + randomcrop_module.fn.slice = old_slice + randomcrop_module.fn.stack = old_stack + randomcrop_module.fn.cast = old_cast + RandomCrop._randint = staticmethod(old_randint) + + assert out == "cropped" + assert len(slice_calls) == 1 + tensor, anchor, shape, kwargs = slice_calls[0] + assert tensor == "input" + assert anchor == (-1, -1) + assert shape == (5, 4) + assert kwargs["out_of_bounds_policy"] == "pad" + assert kwargs["fill_values"] == 0 + assert cast_calls[-2:] == [(-1, dali.types.INT32), (-1, dali.types.INT32)] + + @cartesian_params( ("cpu", "gpu"), ( @@ -257,8 +300,8 @@ def test_random_crop_pad_if_needed_matches_torchvision_random_offsets(device): ), "DALI RandomCrop produced an invalid pad_if_needed crop" -""" # TODO: Fill using dictionary pattern is currently not supported +@unittest.skip("dict fill not supported") def test_random_crop_fill_dict_matches_torchvision_tensor(): tensor = make_tensor(shape=(3, 4, 5)) fill = {torch.Tensor: 9} @@ -268,6 +311,9 @@ def test_random_crop_fill_dict_matches_torchvision_tensor(): transforms.RandomCrop(size=(6, 7), padding=1, fill=fill), ) + +# TODO: Fill using dictionary pattern is currently not supported +@unittest.skip("dict fill not supported") def test_random_crop_fill_dict_matches_torchvision_pil(): img = make_pil_image(mode="RGB", h=4, w=5) fill = {Image.Image: (1, 2, 3)} @@ -276,7 +322,6 @@ def 
test_random_crop_fill_dict_matches_torchvision_pil(): _build_dali_random_crop(size=(6, 7), padding=1, fill=fill), transforms.RandomCrop(size=(6, 7), padding=1, fill=fill), ) -""" @cartesian_params( @@ -330,7 +375,7 @@ def test_random_crop_pad_if_needed_shape(device): {"bad": "value"}, ) def test_random_crop_invalid_size(size): - with assert_raises((TypeError, ValueError)): + with assert_raises((TypeError, ValueError), glob="*size*"): _ = RandomCrop(size=size) @@ -342,12 +387,12 @@ def test_random_crop_invalid_size(size): "bad", ) def test_random_crop_invalid_padding(padding): - with assert_raises((TypeError, ValueError)): + with assert_raises((TypeError, ValueError), glob="*padding*"): _ = RandomCrop(size=3, padding=padding) def test_random_crop_invalid_pad_if_needed(): - with assert_raises(TypeError): + with assert_raises(TypeError, glob="*pad_if_needed must be bool*"): _ = RandomCrop(size=3, pad_if_needed="yes") @@ -357,17 +402,25 @@ def test_random_crop_invalid_pad_if_needed(): [1, object()], {object(): 1}, {torch.Tensor: object()}, + {Image.Image: (1, 2, 3)}, # TODO: dict fill patterns are not supported + {torch.Tensor: 9}, # TODO: dict fill patterns are not supported ) def test_random_crop_invalid_fill(fill): - with assert_raises(TypeError): + with assert_raises(TypeError, glob="*fill must be*"): + _ = RandomCrop(size=3, padding=1, fill=fill) + + +@params(([],), ((),)) +def test_random_crop_empty_fill_sequence(fill): + with assert_raises(ValueError, glob="*fill sequence must be non-empty*"): _ = RandomCrop(size=3, padding=1, fill=fill) def test_random_crop_invalid_padding_mode_when_padding_is_used(): - with assert_raises(ValueError): + with assert_raises(ValueError, glob="*Invalid padding mode*"): _ = RandomCrop(size=3, padding=1, padding_mode="bad") def test_random_crop_invalid_padding_mode_when_pad_if_needed_is_used(): - with assert_raises(ValueError): + with assert_raises(ValueError, glob="*Invalid padding mode*"): _ = RandomCrop(size=3, pad_if_needed=True, 
padding_mode="bad") From 4fd51eb091da31f2ef0503575062401d24571f5b Mon Sep 17 00:00:00 2001 From: Marek Dabek Date: Thu, 9 Apr 2026 13:30:38 +0200 Subject: [PATCH 10/11] Image information Torchvision's functional API Signed-off-by: Marek Dabek --- .../torchvision/v2/functional/__init__.py | 3 + .../v2/functional/image_metadata.py | 82 ++++++++ .../torchvision/test_tv_image_metadata.py | 186 ++++++++++++++++++ 3 files changed, 271 insertions(+) create mode 100644 dali/python/nvidia/dali/experimental/torchvision/v2/functional/image_metadata.py create mode 100644 dali/test/python/torchvision/test_tv_image_metadata.py diff --git a/dali/python/nvidia/dali/experimental/torchvision/v2/functional/__init__.py b/dali/python/nvidia/dali/experimental/torchvision/v2/functional/__init__.py index 18003740b00..6064709cd61 100644 --- a/dali/python/nvidia/dali/experimental/torchvision/v2/functional/__init__.py +++ b/dali/python/nvidia/dali/experimental/torchvision/v2/functional/__init__.py @@ -17,6 +17,7 @@ from .crop import crop from .flips import horizontal_flip, vertical_flip from .gaussian_blur import gaussian_blur +from .image_metadata import get_dimensions, get_image_size from .normalize import normalize from .pad import pad from .resize import resize @@ -26,6 +27,8 @@ "center_crop", "crop", "gaussian_blur", + "get_dimensions", + "get_image_size", "horizontal_flip", "normalize", "pad", diff --git a/dali/python/nvidia/dali/experimental/torchvision/v2/functional/image_metadata.py b/dali/python/nvidia/dali/experimental/torchvision/v2/functional/image_metadata.py new file mode 100644 index 00000000000..4b62db09f20 --- /dev/null +++ b/dali/python/nvidia/dali/experimental/torchvision/v2/functional/image_metadata.py @@ -0,0 +1,82 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List + +from PIL import Image +import torch + + +def get_image_size(inpt: Image.Image | torch.Tensor) -> List[int]: + """ + Return the spatial size of an image as ``[width, height]``. + + Mirrors ``torchvision.transforms.v2.functional.get_image_size``. + + .. note:: + This function is provided for compatibility. The torchvision successor + ``get_size`` returns ``[height, width]`` instead. + + Parameters + ---------- + inpt : PIL Image or torch.Tensor + Input image. Tensors are expected in ``[…, H, W]`` layout (leading + channel / batch dimensions are ignored). + + Returns + ------- + List[int] + ``[width, height]`` + """ + if isinstance(inpt, Image.Image): + return list(inpt.size) # PIL .size is (W, H) + elif isinstance(inpt, torch.Tensor): + if inpt.ndim < 2: + raise TypeError( + f"get_image_size requires a tensor with at least 2 dimensions, got {inpt.ndim}" + ) + return [inpt.shape[-1], inpt.shape[-2]] # [W, H] + raise TypeError(f"Unsupported input type: {type(inpt)}") + + +def get_dimensions(inpt: Image.Image | torch.Tensor) -> List[int]: + """ + Return the number of channels, height, and width of an image as + ``[channels, height, width]``. + + Mirrors ``torchvision.transforms.v2.functional.get_dimensions``. + + Parameters + ---------- + inpt : PIL Image or torch.Tensor + Input image. Tensors are expected in ``[H, W]`` or ``[…, C, H, W]`` layout + (leading batch dimensions are ignored). 
+ + Returns + ------- + List[int] + ``[channels, height, width]`` + """ + if isinstance(inpt, Image.Image): + w, h = inpt.size + return [len(inpt.getbands()), h, w] + elif isinstance(inpt, torch.Tensor): + if inpt.ndim < 2: + raise TypeError( + f"get_dimensions requires a tensor with at least 2 dimensions, got {inpt.ndim}" + ) + if inpt.ndim == 2: + return [1, inpt.shape[-2], inpt.shape[-1]] + return [inpt.shape[-3], inpt.shape[-2], inpt.shape[-1]] # [C, H, W] + raise TypeError(f"Unsupported input type: {type(inpt)}") diff --git a/dali/test/python/torchvision/test_tv_image_metadata.py b/dali/test/python/torchvision/test_tv_image_metadata.py new file mode 100644 index 00000000000..3f24ebcd359 --- /dev/null +++ b/dali/test/python/torchvision/test_tv_image_metadata.py @@ -0,0 +1,186 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import warnings +import unittest + +from nose2.tools import cartesian_params, params +from nose_utils import assert_raises +from PIL import Image +import torch +from torchvision import tv_tensors +import torchvision.transforms.v2.functional as fn_tv + +from nvidia.dali.experimental.torchvision.v2.functional import get_image_size, get_dimensions + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _tv_get_image_size(inpt): + """Call torchvision get_image_size while suppressing its deprecation warning.""" + with warnings.catch_warnings(): + warnings.simplefilter("ignore", UserWarning) + return fn_tv.get_image_size(inpt) + + +def _skip_if_gpu_unavailable(device): + if device == "gpu" and not torch.cuda.is_available(): + raise unittest.SkipTest("CUDA is not available") + + +def _move_tensor_to_device(tensor, device): + if device == "gpu": + return tensor.cuda() + return tensor + + +def _make_compatibility_input(input_kind, shape): + tensor = torch.zeros(*shape) + if input_kind == "tensor": + return tensor + if input_kind == "tv_image": + return tv_tensors.Image(tensor) + raise ValueError(f"Unsupported input kind: {input_kind}") + + +# PIL images with known exact dimensions (W x H) +PIL_CASES = [ + Image.new("RGB", (320, 240)), # 3 channels + Image.new("L", (100, 50)), # 1 channel, non-square + Image.new("RGBA", (64, 32)), # 4 channels + Image.new("RGB", (1, 1)), # minimal + Image.new("L", (512, 1)), # extreme aspect ratio +] + +# Tensors in CHW / NCHW layout — deliberately use H≠W to catch W/H swap bugs +TENSOR_CASES = [ + torch.zeros(3, 240, 320), # CHW + torch.zeros(1, 3, 240, 320), # NCHW, N=1 + torch.zeros(8, 3, 240, 320), # NCHW, N=8 + torch.zeros(1, 50, 100), # CHW, 1 channel + torch.zeros(4, 32, 64), # CHW, 4 channels + torch.zeros(10, 11, 12, 8, 3, 240, 320), # ...NCHW, N=8 +] + +TORCHVISION_COMPATIBILITY_CASES = [ + ("tensor", (240, 
320)), # HW, implicit single channel + ("tensor", (3, 240, 320)), # CHW + ("tensor", (8, 3, 240, 320)), # NCHW + ("tv_image", (240, 320)), # torchvision Image converts HW to 1HW + ("tv_image", (3, 240, 320)), # torchvision Image, CHW +] + + +# --------------------------------------------------------------------------- +# get_image_size — PIL +# --------------------------------------------------------------------------- + + +@params(*PIL_CASES) +def test_get_image_size_pil(img): + expected = _tv_get_image_size(img) + assert ( + get_image_size(img) == expected + ), f"mode={img.mode} size={img.size}: got {get_image_size(img)}, expected {expected}" + + +# --------------------------------------------------------------------------- +# get_image_size — tensors +# --------------------------------------------------------------------------- + + +@cartesian_params(("cpu", "gpu"), TENSOR_CASES) +def test_get_image_size_tensor(device, t): + _skip_if_gpu_unavailable(device) + t = _move_tensor_to_device(t, device) + expected = _tv_get_image_size(t) + assert ( + get_image_size(t) == expected + ), f"device={device} shape={t.shape}: got {get_image_size(t)}, expected {expected}" + + +# --------------------------------------------------------------------------- +# get_dimensions — PIL +# --------------------------------------------------------------------------- + + +@params(*PIL_CASES) +def test_get_dimensions_pil(img): + expected = fn_tv.get_dimensions(img) + assert ( + get_dimensions(img) == expected + ), f"mode={img.mode} size={img.size}: got {get_dimensions(img)}, expected {expected}" + + +# --------------------------------------------------------------------------- +# get_dimensions — tensors +# --------------------------------------------------------------------------- + + +@cartesian_params(("cpu", "gpu"), TENSOR_CASES) +def test_get_dimensions_tensor(device, t): + _skip_if_gpu_unavailable(device) + t = _move_tensor_to_device(t, device) + expected = fn_tv.get_dimensions(t) + 
assert ( + get_dimensions(t) == expected + ), f"device={device} shape={t.shape}: got {get_dimensions(t)}, expected {expected}" + + +# --------------------------------------------------------------------------- +# Torchvision compatibility +# --------------------------------------------------------------------------- + + +@params(*PIL_CASES) +def test_image_metadata_pil_matches_torchvision(img): + assert get_image_size(img) == _tv_get_image_size(img) + assert get_dimensions(img) == fn_tv.get_dimensions(img) + + +@cartesian_params(("cpu", "gpu"), TORCHVISION_COMPATIBILITY_CASES) +def test_image_metadata_tensor_inputs_match_torchvision(device, input_case): + _skip_if_gpu_unavailable(device) + input_kind, shape = input_case + inpt = _move_tensor_to_device(_make_compatibility_input(input_kind, shape), device) + + assert get_image_size(inpt) == _tv_get_image_size(inpt) + assert get_dimensions(inpt) == fn_tv.get_dimensions(inpt) + + +# --------------------------------------------------------------------------- +# Error cases +# --------------------------------------------------------------------------- + + +def test_get_image_size_1d_tensor_raises(): + with assert_raises(TypeError): + get_image_size(torch.zeros(10)) + + +def test_get_dimensions_1d_tensor_raises(): + with assert_raises(TypeError): + get_dimensions(torch.zeros(10)) + + +def test_get_image_size_unsupported_type_raises(): + with assert_raises(TypeError): + get_image_size("not_an_image") + + +def test_get_dimensions_unsupported_type_raises(): + with assert_raises(TypeError): + get_dimensions("not_an_image") From c024bac1d22cca60d7895ed8f63386a7f02231eb Mon Sep 17 00:00:00 2001 From: Marek Dabek Date: Mon, 25 May 2026 10:37:11 +0200 Subject: [PATCH 11/11] Review fixes Signed-off-by: Marek Dabek --- .../torchvision/v2/functional/image_metadata.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dali/python/nvidia/dali/experimental/torchvision/v2/functional/image_metadata.py 
b/dali/python/nvidia/dali/experimental/torchvision/v2/functional/image_metadata.py index 4b62db09f20..9ec4c85891d 100644 --- a/dali/python/nvidia/dali/experimental/torchvision/v2/functional/image_metadata.py +++ b/dali/python/nvidia/dali/experimental/torchvision/v2/functional/image_metadata.py @@ -44,10 +44,10 @@ def get_image_size(inpt: Image.Image | torch.Tensor) -> List[int]: elif isinstance(inpt, torch.Tensor): if inpt.ndim < 2: raise TypeError( - f"get_image_size requires a tensor with at least 2 dimensions, got {inpt.ndim}" + f"get_image_size requires a tensor with at least 2 dimensions, got {inpt.ndim}." ) return [inpt.shape[-1], inpt.shape[-2]] # [W, H] - raise TypeError(f"Unsupported input type: {type(inpt)}") + raise TypeError(f"Unsupported input type: {type(inpt)}.") def get_dimensions(inpt: Image.Image | torch.Tensor) -> List[int]: @@ -74,9 +74,9 @@ def get_dimensions(inpt: Image.Image | torch.Tensor) -> List[int]: elif isinstance(inpt, torch.Tensor): if inpt.ndim < 2: raise TypeError( - f"get_dimensions requires a tensor with at least 2 dimensions, got {inpt.ndim}" + f"get_dimensions requires a tensor with at least 2 dimensions, got {inpt.ndim}." ) if inpt.ndim == 2: return [1, inpt.shape[-2], inpt.shape[-1]] return [inpt.shape[-3], inpt.shape[-2], inpt.shape[-1]] # [C, H, W] - raise TypeError(f"Unsupported input type: {type(inpt)}") + raise TypeError(f"Unsupported input type: {type(inpt)}.")