diff --git a/.claude/sweep-accuracy-state.csv b/.claude/sweep-accuracy-state.csv index f4b17e1e..939ccd0d 100644 --- a/.claude/sweep-accuracy-state.csv +++ b/.claude/sweep-accuracy-state.csv @@ -12,7 +12,7 @@ emerging_hotspots,2026-04-30,,MEDIUM,2;3,MEDIUM: threshold_90 uses int() (trunca fire,2026-04-30,,,,All ops per-pixel (no accumulation/stencil/projected distance). NaN handled via x!=x; CUDA bounds use strict <; rdnbr and ros divisions guarded; CPU/GPU/dask paths algorithmically identical. No accuracy issues found. flood,2026-04-30,,MEDIUM,2;5,"MEDIUM (not fixed): dask backend preserves float32 input dtype while numpy promotes to float64 in flood_depth and curve_number_runoff; DataArray inputs for curve_number, mannings_n bypass scalar > 0 (and CN <= 100) range validation, silently producing NaN/garbage." focal,2026-03-30T13:00:00Z,1092,,, -geotiff,2026-04-23,1247,HIGH,1;3;4;5,HIGH fixed: CPU fp_predictor_decode wrong byte-lane layout for multi-sample predictor=3 (GPU was correct). MEDIUMs also fixed on same PR: eager and streaming writers emit LONG8 strip/tile offsets in BigTIFF output (were LONG); VRT read honors AREA_OR_POINT=Point; VRT nodata cast uses source dtype instead of float32. LOW fixed: duplicate LERC codec block removed from _compression.py. +geotiff,2026-05-05,1498,HIGH,1;5,HIGH fixed: CPU fp_predictor_decode wrong byte-lane layout for multi-sample predictor=3 (GPU was correct). MEDIUMs also fixed on same PR: eager and streaming writers emit LONG8 strip/tile offsets in BigTIFF output (were LONG); VRT read honors AREA_OR_POINT=Point; VRT nodata cast uses source dtype instead of float32. LOW fixed: duplicate LERC codec block removed from _compression.py. 2026-05-05 deep pass: HIGH fixed in #1498 -- predictor=2 (horizontal differencing) did byte-wise differencing instead of sample-wise; uint8 was correct but uint16/int16/uint32/int32 silently corrupted pixel values both on read of GDAL/libtiff/tifffile/rasterio-written files and on write (their readers saw garbage). CPU + GPU + writer all reworked to dispatch per-dtype kernels at sample resolution with stride=samples. LOW deferred: ModelTiepointTag with multiple tiepoints uses only the first 6 doubles (no warning); rare in practice but silently produces wrong georeferencing for control-point-georeferenced files. No CRIT findings beyond the predictor=2 fix. glcm,2026-05-01,1408,HIGH,2,"angle=None averaged NaN as 0, masking no-valid-pairs as zero texture; fixed via nanmean-style averaging" hillshade,2026-04-10T12:00:00Z,,,,"Horn's method correct. All backends consistent. NaN propagation correct. float32 adequate for [0,1] output." hydro,2026-04-30,,LOW,1,Only LOW: twi log(0)=-inf if fa=0 (out-of-contract); MFD weighted sum no Kahan (negligible). No CRIT/HIGH issues. diff --git a/xrspatial/geotiff/_compression.py b/xrspatial/geotiff/_compression.py index 087ebbe9..ef604ed1 100644 --- a/xrspatial/geotiff/_compression.py +++ b/xrspatial/geotiff/_compression.py @@ -333,80 +333,231 @@ def lzw_compress(data: bytes) -> bytes: # -- Horizontal predictor (Numba) -------------------------------------------- +# +# TIFF predictor=2 (horizontal differencing) operates at the *sample* level, +# not the byte level. Per TIFF Technical Note: "the difference is taken +# between adjacent samples (not between adjacent bytes) of the same +# component." This means for multi-byte samples the encoder computes +# +# diff[i] = sample[i] - sample[i - samples_per_pixel] (mod 2^bits) +# +# in the sample's natural bit width (uint16 wraps at 65536, etc.), not +# byte-by-byte. A byte-wise implementation drops the inter-byte carry +# and produces wrong values for any sample wider than one byte. This +# is what libtiff's horAcc16/horAcc32 do, and what GDAL/rasterio/tifffile +# write to disk. The byte-wise path remains correct (and faster) for the +# 8-bit case. + @ngjit -def _predictor_decode(data, width, height, bytes_per_sample): - """Undo horizontal differencing predictor (TIFF predictor=2). +def _predictor_decode_u8(data, width, height, samples_per_pixel): + """Undo predictor=2 for 8-bit samples (one byte per sample). - Operates in-place on the flat byte array, performing cumulative sum - per row at the sample level. + Stride is samples_per_pixel bytes. The byte-wise modular sum is + correct because each sample fits in a single byte. """ - row_bytes = width * bytes_per_sample + row_bytes = width * samples_per_pixel for row in range(height): row_start = row * row_bytes - for col in range(bytes_per_sample, row_bytes): + for col in range(samples_per_pixel, row_bytes): idx = row_start + col - data[idx] = np.uint8((np.int32(data[idx]) + np.int32(data[idx - bytes_per_sample])) & 0xFF) + data[idx] = np.uint8( + (np.int32(data[idx]) + np.int32(data[idx - samples_per_pixel])) & 0xFF) @ngjit -def _predictor_encode(data, width, height, bytes_per_sample): - """Apply horizontal differencing predictor (TIFF predictor=2). +def _predictor_decode_u16(view, width, height, samples_per_pixel): + """Undo predictor=2 on a uint16 view (in-place, sample-resolution). - Operates in-place, converting absolute values to differences. - Process right-to-left to avoid overwriting values we still need. + *view* indexes by sample. Stride is *samples_per_pixel* samples. """ - row_bytes = width * bytes_per_sample + row_samples = width * samples_per_pixel + for row in range(height): + row_start = row * row_samples + for col in range(samples_per_pixel, row_samples): + idx = row_start + col + view[idx] = np.uint16( + (np.int32(view[idx]) + np.int32(view[idx - samples_per_pixel])) & 0xFFFF) + + +@ngjit +def _predictor_decode_u32(view, width, height, samples_per_pixel): + """Undo predictor=2 on a uint32 view (in-place, sample-resolution).""" + row_samples = width * samples_per_pixel + for row in range(height): + row_start = row * row_samples + for col in range(samples_per_pixel, row_samples): + idx = row_start + col + view[idx] = np.uint32( + (np.uint64(view[idx]) + np.uint64(view[idx - samples_per_pixel])) + & np.uint64(0xFFFFFFFF)) + + +@ngjit +def _predictor_decode_u64(view, width, height, samples_per_pixel): + """Undo predictor=2 on a uint64 view (in-place, sample-resolution).""" + row_samples = width * samples_per_pixel + for row in range(height): + row_start = row * row_samples + for col in range(samples_per_pixel, row_samples): + idx = row_start + col + view[idx] = np.uint64(view[idx]) + np.uint64(view[idx - samples_per_pixel]) + + +@ngjit +def _predictor_encode_u8(data, width, height, samples_per_pixel): + """Apply predictor=2 for 8-bit samples (right-to-left, in-place).""" + row_bytes = width * samples_per_pixel for row in range(height): row_start = row * row_bytes - for col in range(row_bytes - 1, bytes_per_sample - 1, -1): + for col in range(row_bytes - 1, samples_per_pixel - 1, -1): + idx = row_start + col + data[idx] = np.uint8( + (np.int32(data[idx]) - np.int32(data[idx - samples_per_pixel])) & 0xFF) + + +@ngjit +def _predictor_encode_u16(view, width, height, samples_per_pixel): + """Apply predictor=2 on a uint16 view (right-to-left, in-place).""" + row_samples = width * samples_per_pixel + for row in range(height): + row_start = row * row_samples + for col in range(row_samples - 1, samples_per_pixel - 1, -1): idx = row_start + col - data[idx] = np.uint8((np.int32(data[idx]) - np.int32(data[idx - bytes_per_sample])) & 0xFF) + view[idx] = np.uint16( + (np.int32(view[idx]) - np.int32(view[idx - samples_per_pixel])) & 0xFFFF) + + +@ngjit +def _predictor_encode_u32(view, width, height, samples_per_pixel): + """Apply predictor=2 on a uint32 view (right-to-left, in-place).""" + row_samples = width * samples_per_pixel + for row in range(height): + row_start = row * row_samples + for col in range(row_samples - 1, samples_per_pixel - 1, -1): + idx = row_start + col + view[idx] = np.uint32( + (np.uint64(view[idx]) - np.uint64(view[idx - samples_per_pixel])) + & np.uint64(0xFFFFFFFF)) + + +@ngjit +def _predictor_encode_u64(view, width, height, samples_per_pixel): + """Apply predictor=2 on a uint64 view (right-to-left, in-place).""" + row_samples = width * samples_per_pixel + for row in range(height): + row_start = row * row_samples + for col in range(row_samples - 1, samples_per_pixel - 1, -1): + idx = row_start + col + view[idx] = np.uint64(view[idx]) - np.uint64(view[idx - samples_per_pixel]) + + +def _sample_view(buf: np.ndarray, bytes_per_sample: int, + byte_order: str = '<') -> np.ndarray: + """Return *buf* viewed as a 1-D array of unsigned samples. + + The view shares memory with *buf* and uses the file's byte order so + that the sample-resolution differencing wraps at the correct width. + """ + if bytes_per_sample == 1: + return buf + sample_dtype = np.dtype(f'{byte_order}u{bytes_per_sample}') + return buf.view(sample_dtype) def predictor_decode(data: np.ndarray, width: int, height: int, - bytes_per_sample: int) -> np.ndarray: + bytes_per_sample: int, samples: int = 1, + byte_order: str = '<') -> np.ndarray: """Undo horizontal differencing predictor (predictor=2). + Operates at the sample level, per TIFF Technical Note: differences + are taken between adjacent same-component samples in the sample's + natural bit width. For multi-band chunky data the stride is the + number of samples per pixel. + Parameters ---------- data : np.ndarray Flat uint8 array of decompressed pixel data (modified in-place). width, height : int - Image dimensions. + Image dimensions in pixels. bytes_per_sample : int - Bytes per sample (e.g. 1 for uint8, 4 for float32). + Bytes per sample (1, 2, 4, or 8). + samples : int + Samples per pixel (1 for single-band, 3 for RGB, ...). + byte_order : str + '<' for little-endian, '>' for big-endian. Used to interpret + multi-byte sample bytes during the in-place differencing. Returns ------- np.ndarray - Same array, modified in-place. + The same uint8 array with predictor=2 inverted. """ buf = np.ascontiguousarray(data) - _predictor_decode(buf, width, height, bytes_per_sample) + if bytes_per_sample == 1: + _predictor_decode_u8(buf, width, height, samples) + return buf + + view = _sample_view(buf, bytes_per_sample, byte_order) + if bytes_per_sample == 2: + _predictor_decode_u16(view, width, height, samples) + elif bytes_per_sample == 4: + _predictor_decode_u32(view, width, height, samples) + elif bytes_per_sample == 8: + _predictor_decode_u64(view, width, height, samples) + else: + raise ValueError( + f"predictor=2 with bytes_per_sample={bytes_per_sample} is not " + "supported (TIFF specifies sample-level differencing for 1/2/4/8 " + "byte samples)." + ) return buf def predictor_encode(data: np.ndarray, width: int, height: int, - bytes_per_sample: int) -> np.ndarray: + bytes_per_sample: int, samples: int = 1, + byte_order: str = '<') -> np.ndarray: """Apply horizontal differencing predictor (predictor=2). + See :func:`predictor_decode` for the sample-level semantics. + Parameters ---------- data : np.ndarray Flat uint8 array of pixel data (modified in-place). width, height : int - Image dimensions. + Image dimensions in pixels. bytes_per_sample : int Bytes per sample. + samples : int + Samples per pixel. + byte_order : str + '<' or '>'. Defaults to little-endian, the writer's output order. Returns ------- np.ndarray - Same array, modified in-place. + The same uint8 array with predictor=2 applied. """ buf = np.ascontiguousarray(data) - _predictor_encode(buf, width, height, bytes_per_sample) + if bytes_per_sample == 1: + _predictor_encode_u8(buf, width, height, samples) + return buf + + view = _sample_view(buf, bytes_per_sample, byte_order) + if bytes_per_sample == 2: + _predictor_encode_u16(view, width, height, samples) + elif bytes_per_sample == 4: + _predictor_encode_u32(view, width, height, samples) + elif bytes_per_sample == 8: + _predictor_encode_u64(view, width, height, samples) + else: + raise ValueError( + f"predictor=2 with bytes_per_sample={bytes_per_sample} is not " + "supported (TIFF specifies sample-level differencing for 1/2/4/8 " + "byte samples)." + ) return buf diff --git a/xrspatial/geotiff/_gpu_decode.py b/xrspatial/geotiff/_gpu_decode.py index 67c36829..0598a6f9 100644 --- a/xrspatial/geotiff/_gpu_decode.py +++ b/xrspatial/geotiff/_gpu_decode.py @@ -208,7 +208,14 @@ def _lzw_decode_tiles_kernel( # Type aliases for Numba CUDA local arrays -from numba import int32 as numba_int32, uint8 as numba_uint8, int64 as numba_int64 +from numba import ( + int32 as numba_int32, + uint8 as numba_uint8, + uint16 as numba_uint16, + uint32 as numba_uint32, + uint64 as numba_uint64, + int64 as numba_int64, +) # --------------------------------------------------------------------------- @@ -628,19 +635,68 @@ def _inflate_tiles_kernel( # --------------------------------------------------------------------------- @cuda.jit -def _predictor_decode_kernel(data, width, height, bytes_per_sample): - """Undo horizontal differencing (predictor=2), one thread per row.""" +def _predictor_decode_kernel_u8(data, width, height, samples_per_pixel): + """Undo predictor=2 for 8-bit samples, one thread per row. + + Stride is ``samples_per_pixel`` bytes. Byte-wise modular sum is + correct here because each sample fits in a single byte. + """ row = cuda.grid(1) if row >= height: return - row_bytes = width * bytes_per_sample + row_bytes = width * samples_per_pixel row_start = row * row_bytes - for col in range(bytes_per_sample, row_bytes): + for col in range(samples_per_pixel, row_bytes): idx = row_start + col data[idx] = numba_uint8( - (numba_int32(data[idx]) + numba_int32(data[idx - bytes_per_sample])) & 0xFF) + (numba_int32(data[idx]) + numba_int32(data[idx - samples_per_pixel])) & 0xFF) + + +@cuda.jit +def _predictor_decode_kernel_u16(view, width, height, samples_per_pixel): + """Undo predictor=2 on a uint16 view, one thread per row.""" + row = cuda.grid(1) + if row >= height: + return + + row_samples = width * samples_per_pixel + row_start = row * row_samples + + for col in range(samples_per_pixel, row_samples): + idx = row_start + col + view[idx] = (view[idx] + view[idx - samples_per_pixel]) & numba_int32(0xFFFF) + + +@cuda.jit +def _predictor_decode_kernel_u32(view, width, height, samples_per_pixel): + """Undo predictor=2 on a uint32 view, one thread per row.""" + row = cuda.grid(1) + if row >= height: + return + + row_samples = width * samples_per_pixel + row_start = row * row_samples + + for col in range(samples_per_pixel, row_samples): + idx = row_start + col + view[idx] = (view[idx] + view[idx - samples_per_pixel]) & numba_uint32(0xFFFFFFFF) + + +@cuda.jit +def _predictor_decode_kernel_u64(view, width, height, samples_per_pixel): + """Undo predictor=2 on a uint64 view, one thread per row.""" + row = cuda.grid(1) + if row >= height: + return + + row_samples = width * samples_per_pixel + row_start = row * row_samples + + for col in range(samples_per_pixel, row_samples): + idx = row_start + col + view[idx] = view[idx] + view[idx - samples_per_pixel] @cuda.jit @@ -1370,6 +1426,70 @@ class _NvcompDecompOpts(ctypes.Structure): return None +def _gpu_predictor2_decode(d_decomp, tile_width, total_rows, dtype, samples): + """Run the right predictor=2 decode kernel for *dtype*. + + TIFF predictor=2 differences adjacent same-component samples in the + sample's natural width (uint8/16/32/64). We view the byte buffer as + the matching unsigned dtype and dispatch to a per-width kernel so + the modular wrap matches what GDAL/libtiff write. + """ + import cupy + + tpb = min(256, total_rows) if total_rows > 0 else 1 + bpg = math.ceil(total_rows / tpb) if tpb > 0 else 1 + bps = dtype.itemsize + + if bps == 1: + _predictor_decode_kernel_u8[bpg, tpb]( + d_decomp, tile_width, total_rows, samples) + elif bps == 2: + view = d_decomp.view(cupy.uint16) + _predictor_decode_kernel_u16[bpg, tpb]( + view, tile_width, total_rows, samples) + elif bps == 4: + view = d_decomp.view(cupy.uint32) + _predictor_decode_kernel_u32[bpg, tpb]( + view, tile_width, total_rows, samples) + elif bps == 8: + view = d_decomp.view(cupy.uint64) + _predictor_decode_kernel_u64[bpg, tpb]( + view, tile_width, total_rows, samples) + else: + raise ValueError( + f"GPU predictor=2 unsupported for bytes_per_sample={bps}") + cuda.synchronize() + + +def _gpu_predictor2_encode(d_decomp, tile_width, total_rows, dtype, samples): + """Run the right predictor=2 encode kernel for *dtype*.""" + import cupy + + tpb = min(256, total_rows) if total_rows > 0 else 1 + bpg = math.ceil(total_rows / tpb) if tpb > 0 else 1 + bps = dtype.itemsize + + if bps == 1: + _predictor_encode_kernel_u8[bpg, tpb]( + d_decomp, tile_width, total_rows, samples) + elif bps == 2: + view = d_decomp.view(cupy.uint16) + _predictor_encode_kernel_u16[bpg, tpb]( + view, tile_width, total_rows, samples) + elif bps == 4: + view = d_decomp.view(cupy.uint32) + _predictor_encode_kernel_u32[bpg, tpb]( + view, tile_width, total_rows, samples) + elif bps == 8: + view = d_decomp.view(cupy.uint64) + _predictor_encode_kernel_u64[bpg, tpb]( + view, tile_width, total_rows, samples) + else: + raise ValueError( + f"GPU predictor=2 unsupported for bytes_per_sample={bps}") + cuda.synchronize() + + def _apply_predictor_and_assemble(d_decomp, d_decomp_offsets, n_tiles, tile_width, tile_height, image_width, image_height, @@ -1381,14 +1501,11 @@ def _apply_predictor_and_assemble(d_decomp, d_decomp_offsets, n_tiles, if predictor == 2: total_rows = n_tiles * tile_height - tpb = min(256, total_rows) - bpg = math.ceil(total_rows / tpb) - # Kernel uses row_bytes = width * bytes_per_sample, so pass pixel - # width and full per-pixel size (itemsize * samples). Matches CPU - # call at _reader.py _apply_predictor(..., bytes_per_sample * samples). - _predictor_decode_kernel[bpg, tpb]( - d_decomp, tile_width, total_rows, dtype.itemsize * samples) - cuda.synchronize() + # Sample-level differencing: stride is samples_per_pixel samples, + # row width is tile_width pixels. Per-dtype kernels handle the + # modular wrap at the sample's natural bit width. + _gpu_predictor2_decode( + d_decomp, tile_width, total_rows, dtype, samples) elif predictor == 3: total_rows = n_tiles * tile_height tpb = min(256, total_rows) @@ -1629,16 +1746,11 @@ def gpu_decode_tiles( # Apply predictor on GPU if predictor == 2: - # Horizontal differencing: one thread per row across all tiles + # Sample-level horizontal differencing: stride is samples_per_pixel + # samples; per-dtype kernels handle the natural-width modular wrap. total_rows = n_tiles * tile_height - tpb = min(256, total_rows) - bpg = math.ceil(total_rows / tpb) - # Reshape so each tile's rows are contiguous (they already are). - # Kernel uses row_bytes = width * bytes_per_sample, so pass pixel - # width and full per-pixel size (itemsize * samples). - _predictor_decode_kernel[bpg, tpb]( - d_decomp, tile_width, total_rows, dtype.itemsize * samples) - cuda.synchronize() + _gpu_predictor2_decode( + d_decomp, tile_width, total_rows, dtype, samples) elif predictor == 3: # Float predictor: one thread per row @@ -1720,21 +1832,64 @@ def _extract_tiles_kernel( # --------------------------------------------------------------------------- @cuda.jit -def _predictor_encode_kernel(data, width, height, bytes_per_sample): - """Apply horizontal differencing (predictor=2), one thread per row. - Process right-to-left to avoid overwriting values we still need. - """ +def _predictor_encode_kernel_u8(data, width, height, samples_per_pixel): + """Apply predictor=2 for 8-bit samples (right-to-left), one thread per row.""" row = cuda.grid(1) if row >= height: return - row_bytes = width * bytes_per_sample + row_bytes = width * samples_per_pixel row_start = row * row_bytes - for col in range(row_bytes - 1, bytes_per_sample - 1, -1): + for col in range(row_bytes - 1, samples_per_pixel - 1, -1): idx = row_start + col data[idx] = numba_uint8( - (numba_int32(data[idx]) - numba_int32(data[idx - bytes_per_sample])) & 0xFF) + (numba_int32(data[idx]) - numba_int32(data[idx - samples_per_pixel])) & 0xFF) + + +@cuda.jit +def _predictor_encode_kernel_u16(view, width, height, samples_per_pixel): + """Apply predictor=2 on a uint16 view (right-to-left), one thread per row.""" + row = cuda.grid(1) + if row >= height: + return + + row_samples = width * samples_per_pixel + row_start = row * row_samples + + for col in range(row_samples - 1, samples_per_pixel - 1, -1): + idx = row_start + col + view[idx] = (view[idx] - view[idx - samples_per_pixel]) & numba_int32(0xFFFF) + + +@cuda.jit +def _predictor_encode_kernel_u32(view, width, height, samples_per_pixel): + """Apply predictor=2 on a uint32 view (right-to-left), one thread per row.""" + row = cuda.grid(1) + if row >= height: + return + + row_samples = width * samples_per_pixel + row_start = row * row_samples + + for col in range(row_samples - 1, samples_per_pixel - 1, -1): + idx = row_start + col + view[idx] = (view[idx] - view[idx - samples_per_pixel]) & numba_uint32(0xFFFFFFFF) + + +@cuda.jit +def _predictor_encode_kernel_u64(view, width, height, samples_per_pixel): + """Apply predictor=2 on a uint64 view (right-to-left), one thread per row.""" + row = cuda.grid(1) + if row >= height: + return + + row_samples = width * samples_per_pixel + row_start = row * row_samples + + for col in range(row_samples - 1, samples_per_pixel - 1, -1): + idx = row_start + col + view[idx] = view[idx] - view[idx - samples_per_pixel] @cuda.jit @@ -2305,11 +2460,10 @@ def gpu_compress_tiles(d_image, tile_width, tile_height, # Apply predictor encode on GPU total_rows = n_tiles * tile_height if predictor == 2: - tpb_r = min(256, total_rows) - bpg_r = math.ceil(total_rows / tpb_r) - _predictor_encode_kernel[bpg_r, tpb_r]( - d_tile_buf, tile_width * samples, total_rows, dtype.itemsize * samples) - cuda.synchronize() + # Sample-level differencing: stride is samples_per_pixel samples, + # row width is tile_width pixels. + _gpu_predictor2_encode( + d_tile_buf, tile_width, total_rows, dtype, samples) elif predictor == 3: tpb_r = min(256, total_rows) bpg_r = math.ceil(total_rows / tpb_r) diff --git a/xrspatial/geotiff/_reader.py b/xrspatial/geotiff/_reader.py index feb59574..2770c250 100644 --- a/xrspatial/geotiff/_reader.py +++ b/xrspatial/geotiff/_reader.py @@ -341,15 +341,20 @@ def _open_source(source: str): def _apply_predictor(chunk: np.ndarray, pred: int, width: int, height: int, bytes_per_sample: int, - samples: int = 1) -> np.ndarray: + samples: int = 1, + byte_order: str = '<') -> np.ndarray: """Apply the appropriate predictor decode to decompressed data. ``width``, ``height``, ``bytes_per_sample``, and ``samples`` describe the raw pixel layout before predictor inversion: ``width * samples`` samples per row, each ``bytes_per_sample`` bytes wide. - Predictor=2 (horizontal differencing) works byte-wise on a stride of - ``bytes_per_sample * samples``. + Predictor=2 (horizontal differencing) operates at the *sample* level + per TIFF Technical Note (libtiff/GDAL convention): the difference is + taken between adjacent same-component samples in the sample's + natural bit width, with stride equal to ``samples`` samples. A + byte-wise implementation drops the inter-byte carry for multi-byte + samples and produces wrong values. Predictor=3 (floating-point) byte-swizzles each row into ``bytes_per_sample`` interleaved lanes of length ``width * samples``, @@ -359,7 +364,8 @@ def _apply_predictor(chunk: np.ndarray, pred: int, width: int, """ if pred == 2: return predictor_decode(chunk, width, height, - bytes_per_sample * samples) + bytes_per_sample, samples=samples, + byte_order=byte_order) elif pred == 3: return fp_predictor_decode(chunk, width * samples, height, bytes_per_sample) @@ -413,7 +419,8 @@ def _decode_strip_or_tile(data_slice, compression, width, height, samples, if not chunk.flags.writeable: chunk = chunk.copy() chunk = _apply_predictor(chunk, pred, width, height, - bytes_per_sample, samples=samples) + bytes_per_sample, samples=samples, + byte_order=byte_order) if is_sub_byte: pixels = unpack_bits(chunk, bps, pixel_count) diff --git a/xrspatial/geotiff/_writer.py b/xrspatial/geotiff/_writer.py index 8f0d7197..a04b1673 100644 --- a/xrspatial/geotiff/_writer.py +++ b/xrspatial/geotiff/_writer.py @@ -94,10 +94,15 @@ def normalize_predictor(predictor, dtype, compression: int) -> int: def _apply_predictor_encode(buf: np.ndarray, predictor: int, width: int, height: int, bytes_per_sample: int, samples: int) -> np.ndarray: - """Apply the chosen predictor to a flat uint8 buffer.""" + """Apply the chosen predictor to a flat uint8 buffer. + + Files always go to disk in little-endian order (see ``BO``), so + ``predictor_encode`` is invoked with ``byte_order='<'``. + """ if predictor == 2: return predictor_encode(buf, width, height, - bytes_per_sample * samples) + bytes_per_sample, samples=samples, + byte_order=BO) if predictor == 3: return fp_predictor_encode(buf, width * samples, height, bytes_per_sample) diff --git a/xrspatial/geotiff/tests/test_predictor_multisample.py b/xrspatial/geotiff/tests/test_predictor_multisample.py index 31f52af0..92828b9a 100644 --- a/xrspatial/geotiff/tests/test_predictor_multisample.py +++ b/xrspatial/geotiff/tests/test_predictor_multisample.py @@ -402,6 +402,194 @@ def test_apply_predictor3_matches_tn3_reference_1247(): # Issue #1479: GPU predictor=3 multi-sample coverage gap # --------------------------------------------------------------------------- +# --------------------------------------------------------------------------- +# Sample-level predictor=2 (libtiff/GDAL convention) for multi-byte dtypes +# --------------------------------------------------------------------------- +# +# Per TIFF Technical Note: predictor=2 differences are taken between +# adjacent same-component samples in the sample's natural bit width +# (uint16 wraps at 65536, uint32 at 2^32, ...). A byte-wise +# implementation drops the inter-byte carry for any sample wider than +# one byte, so xrspatial must read uint16/int16/uint32/int32 TIFFs +# written with predictor=2 by libtiff-compatible tools (rasterio, GDAL, +# tifffile) without corruption, and write TIFFs that those tools can +# read back without corruption. + +try: + import tifffile as _tifffile # noqa: F401 + _HAS_TIFFFILE = True +except Exception: + _HAS_TIFFFILE = False + + +tifffile_required = pytest.mark.skipif( + not _HAS_TIFFFILE, reason="Requires the tifffile package") + + +@tifffile_required +@pytest.mark.parametrize("dtype_str", ["uint16", "int16", "uint32", "int32"]) +def test_predictor2_reads_libtiff_multibyte_correctly(tmp_path, dtype_str): + """xrspatial reads predictor=2 TIFFs with multi-byte samples correctly. + + The bug: the byte-wise predictor decode dropped inter-byte carry, + so libtiff-style sample-wise differences came back as garbage. We + write a known array via tifffile (which uses libtiff conventions) + and verify xrspatial reads it back exactly. + """ + import tifffile + + dtype = np.dtype(dtype_str) + arr = np.array([[1000, 2000, 3000, 4000], + [5000, 6000, 7000, 8000], + [9000, 10000, 11000, 12000], + [13000, 14000, 15000, 16000]], dtype=dtype) + + path = str(tmp_path / f"libtiff_pred2_{dtype_str}.tif") + tifffile.imwrite(path, arr, compression='deflate', predictor=2) + + out = open_geotiff(path).values + np.testing.assert_array_equal(out, arr) + + +@tifffile_required +def test_predictor2_reads_libtiff_multiband_uint16(tmp_path): + """Multi-band chunky uint16 with predictor=2 round-trips through tifffile.""" + import tifffile + + arr = (np.arange(48).reshape(4, 4, 3) * 100).astype(np.uint16) + path = str(tmp_path / "libtiff_pred2_rgb_uint16.tif") + tifffile.imwrite(path, arr, compression='deflate', + predictor=2, photometric='rgb') + + out = open_geotiff(path).values + np.testing.assert_array_equal(out, arr) + + +@tifffile_required +@pytest.mark.parametrize("dtype_str", ["uint16", "int16", "uint32", "int32"]) +def test_predictor2_writer_interops_with_libtiff(tmp_path, dtype_str): + """xrspatial-written predictor=2 TIFFs decode correctly under tifffile. + + The encoder must produce sample-level differences so that + libtiff/GDAL/rasterio can decode the file. Round-trip through + xrspatial alone is not enough -- a byte-wise encoder paired with a + byte-wise decoder agrees with itself but corrupts the file for + everyone else. + """ + import tifffile + + dtype = np.dtype(dtype_str) + arr = (np.arange(16, dtype=dtype) * 250).reshape(4, 4) + da = xr.DataArray(arr, dims=('y', 'x')) + + path = str(tmp_path / f"xrs_pred2_{dtype_str}.tif") + to_geotiff(da, path, compression='deflate', predictor=2) + + out_xrs = open_geotiff(path).values + np.testing.assert_array_equal(out_xrs, arr) + + out_tiff = tifffile.imread(path) + np.testing.assert_array_equal(out_tiff, arr) + + +@gpu_only +@tifffile_required +@pytest.mark.parametrize("dtype_str", ["uint16", "int16", "uint32"]) +def test_gpu_predictor2_multibyte_matches_cpu(tmp_path, dtype_str): + """GPU decode of predictor=2 with multi-byte samples matches CPU. + + Regression for the same sample-level differencing fix on the GPU + kernels. The image is written via tifffile (libtiff convention) so + both backends must walk the predictor inverse at sample resolution. + """ + import tifffile + + dtype = np.dtype(dtype_str) + h, w = 32, 32 + rng = np.random.RandomState(42) + high = np.iinfo(dtype).max // 4 + low = np.iinfo(dtype).min // 4 if dtype.kind == 'i' else 0 + data = rng.randint(low, high, size=(h, w), dtype=dtype) + + path = str(tmp_path / f"gpu_pred2_{dtype_str}.tif") + tifffile.imwrite(path, data, compression='deflate', predictor=2, + tile=(16, 16)) + + cpu_arr = open_geotiff(path).values + np.testing.assert_array_equal(cpu_arr, data) + + gpu_arr = _gpu_to_numpy(open_geotiff(path, gpu=True)) + np.testing.assert_array_equal(gpu_arr, cpu_arr) + + +@gpu_only +@pytest.mark.parametrize("dtype_str", ["uint16", "int16", "uint32"]) +def test_gpu_predictor2_multibyte_writer_round_trip(tmp_path, dtype_str): + """xrspatial writer + GPU reader round-trip for multi-byte predictor=2.""" + dtype = np.dtype(dtype_str) + h, w = 32, 32 + rng = np.random.RandomState(7) + high = np.iinfo(dtype).max // 4 + low = np.iinfo(dtype).min // 4 if dtype.kind == 'i' else 0 + data = rng.randint(low, high, size=(h, w), dtype=dtype) + da = xr.DataArray(data, dims=['y', 'x']) + + path = str(tmp_path / f"gpu_pred2_writer_{dtype_str}.tif") + to_geotiff(da, path, compression='deflate', tile_size=16, predictor=2) + + cpu_arr = open_geotiff(path).values + np.testing.assert_array_equal(cpu_arr, data) + + gpu_arr = _gpu_to_numpy(open_geotiff(path, gpu=True)) + np.testing.assert_array_equal(gpu_arr, cpu_arr) + + +@gpu_only +@tifffile_required +def test_gpu_predictor2_multiband_uint16_matches_cpu(tmp_path): + """GPU decode of multi-band uint16 predictor=2 matches CPU and source.""" + import tifffile + + arr = (np.arange(32 * 32 * 3).reshape(32, 32, 3) % 50000).astype(np.uint16) + path = str(tmp_path / "gpu_pred2_rgb_uint16.tif") + tifffile.imwrite(path, arr, compression='deflate', predictor=2, + photometric='rgb', tile=(16, 16)) + + cpu_arr = open_geotiff(path).values + np.testing.assert_array_equal(cpu_arr, arr) + + gpu_arr = _gpu_to_numpy(open_geotiff(path, gpu=True)) + np.testing.assert_array_equal(gpu_arr, cpu_arr) + + +@gpu_only +@pytest.mark.parametrize("dtype_str", ["uint16", "int16", "uint32"]) +def test_gpu_predictor2_writer_round_trip(tmp_path, dtype_str): + """xrspatial writer + GPU encode path round-trip for multi-byte predictor=2. + + With ``gpu=True`` the writer takes the CUDA encode path; the file + that lands on disk must still decode correctly on both CPU and GPU + readers. + """ + dtype = np.dtype(dtype_str) + h, w = 32, 32 + rng = np.random.RandomState(1234) + high = np.iinfo(dtype).max // 4 + low = np.iinfo(dtype).min // 4 if dtype.kind == 'i' else 0 + data = rng.randint(low, high, size=(h, w), dtype=dtype) + da = xr.DataArray(data, dims=['y', 'x']) + + path = str(tmp_path / f"gpu_pred2_enc_{dtype_str}.tif") + to_geotiff(da, path, compression='deflate', tile_size=16, + predictor=2, gpu=True) + + cpu_arr = open_geotiff(path).values + np.testing.assert_array_equal(cpu_arr, data) + + gpu_arr = _gpu_to_numpy(open_geotiff(path, gpu=True)) + np.testing.assert_array_equal(gpu_arr, cpu_arr) + + @gpu_only @pytest.mark.parametrize("samples,dtype_str", [ (3, "float32"),