diff --git a/.claude/sweep-accuracy-state.csv b/.claude/sweep-accuracy-state.csv
index f4b17e1e..939ccd0d 100644
--- a/.claude/sweep-accuracy-state.csv
+++ b/.claude/sweep-accuracy-state.csv
@@ -12,7 +12,7 @@ emerging_hotspots,2026-04-30,,MEDIUM,2;3,MEDIUM: threshold_90 uses int() (trunca
 fire,2026-04-30,,,,All ops per-pixel (no accumulation/stencil/projected distance). NaN handled via x!=x; CUDA bounds use strict <; rdnbr and ros divisions guarded; CPU/GPU/dask paths algorithmically identical. No accuracy issues found.
 flood,2026-04-30,,MEDIUM,2;5,"MEDIUM (not fixed): dask backend preserves float32 input dtype while numpy promotes to float64 in flood_depth and curve_number_runoff; DataArray inputs for curve_number, mannings_n bypass scalar > 0 (and CN <= 100) range validation, silently producing NaN/garbage."
 focal,2026-03-30T13:00:00Z,1092,,,
-geotiff,2026-04-23,1247,HIGH,1;3;4;5,HIGH fixed: CPU fp_predictor_decode wrong byte-lane layout for multi-sample predictor=3 (GPU was correct). MEDIUMs also fixed on same PR: eager and streaming writers emit LONG8 strip/tile offsets in BigTIFF output (were LONG); VRT read honors AREA_OR_POINT=Point; VRT nodata cast uses source dtype instead of float32. LOW fixed: duplicate LERC codec block removed from _compression.py.
+geotiff,2026-05-05,1498,HIGH,1;5,HIGH fixed: CPU fp_predictor_decode wrong byte-lane layout for multi-sample predictor=3 (GPU was correct). MEDIUMs also fixed on same PR: eager and streaming writers emit LONG8 strip/tile offsets in BigTIFF output (were LONG); VRT read honors AREA_OR_POINT=Point; VRT nodata cast uses source dtype instead of float32. LOW fixed: duplicate LERC codec block removed from _compression.py. 2026-05-05 deep pass: HIGH fixed in #1498 -- predictor=2 (horizontal differencing) did byte-wise differencing instead of sample-wise; uint8 was correct but uint16/int16/uint32/int32 silently corrupted pixel values both on read of GDAL/libtiff/tifffile/rasterio-written files and on write (their readers saw garbage). CPU + GPU + writer all reworked to dispatch per-dtype kernels at sample resolution with stride=samples. LOW deferred: ModelTiepointTag with multiple tiepoints uses only the first 6 doubles (no warning); rare in practice but silently produces wrong georeferencing for control-point-georeferenced files. No CRIT findings beyond the predictor=2 fix.
 glcm,2026-05-01,1408,HIGH,2,"angle=None averaged NaN as 0, masking no-valid-pairs as zero texture; fixed via nanmean-style averaging"
 hillshade,2026-04-10T12:00:00Z,,,,"Horn's method correct. All backends consistent. NaN propagation correct. float32 adequate for [0,1] output."
 hydro,2026-04-30,,LOW,1,Only LOW: twi log(0)=-inf if fa=0 (out-of-contract); MFD weighted sum no Kahan (negligible). No CRIT/HIGH issues.
diff --git a/xrspatial/geotiff/_compression.py b/xrspatial/geotiff/_compression.py
index 087ebbe9..ef604ed1 100644
--- a/xrspatial/geotiff/_compression.py
+++ b/xrspatial/geotiff/_compression.py
@@ -333,80 +333,231 @@ def lzw_compress(data: bytes) -> bytes:
 
 
 # -- Horizontal predictor (Numba) --------------------------------------------
+#
+# TIFF predictor=2 (horizontal differencing) operates at the *sample* level,
+# not the byte level.  Per TIFF Technical Note: "the difference is taken
+# between adjacent samples (not between adjacent bytes) of the same
+# component."  This means for multi-byte samples the encoder computes
+#
+#     diff[i] = sample[i] - sample[i - samples_per_pixel]   (mod 2^bits)
+#
+# in the sample's natural bit width (uint16 wraps at 65536, etc.), not
+# byte-by-byte.  A byte-wise implementation drops the inter-byte carry
+# and produces wrong values for any sample wider than one byte.  This
+# is what libtiff's horAcc16/horAcc32 do, and what GDAL/rasterio/tifffile
+# write to disk.  The byte-wise path remains correct (and faster) for the
+# 8-bit case.
+
 
 @ngjit
-def _predictor_decode(data, width, height, bytes_per_sample):
-    """Undo horizontal differencing predictor (TIFF predictor=2).
+def _predictor_decode_u8(data, width, height, samples_per_pixel):
+    """Undo predictor=2 for 8-bit samples (one byte per sample).
 
-    Operates in-place on the flat byte array, performing cumulative sum
-    per row at the sample level.
+    Stride is samples_per_pixel bytes.  The byte-wise modular sum is
+    correct because each sample fits in a single byte.
     """
-    row_bytes = width * bytes_per_sample
+    row_bytes = width * samples_per_pixel
     for row in range(height):
         row_start = row * row_bytes
-        for col in range(bytes_per_sample, row_bytes):
+        for col in range(samples_per_pixel, row_bytes):
             idx = row_start + col
-            data[idx] = np.uint8((np.int32(data[idx]) + np.int32(data[idx - bytes_per_sample])) & 0xFF)
+            data[idx] = np.uint8(
+                (np.int32(data[idx]) + np.int32(data[idx - samples_per_pixel])) & 0xFF)
 
 
 @ngjit
-def _predictor_encode(data, width, height, bytes_per_sample):
-    """Apply horizontal differencing predictor (TIFF predictor=2).
+def _predictor_decode_u16(view, width, height, samples_per_pixel):
+    """Undo predictor=2 on a uint16 view (in-place, sample-resolution).
 
-    Operates in-place, converting absolute values to differences.
-    Process right-to-left to avoid overwriting values we still need.
+    *view* indexes by sample.  Stride is *samples_per_pixel* samples.
     """
-    row_bytes = width * bytes_per_sample
+    row_samples = width * samples_per_pixel
+    for row in range(height):
+        row_start = row * row_samples
+        for col in range(samples_per_pixel, row_samples):
+            idx = row_start + col
+            view[idx] = np.uint16(
+                (np.int32(view[idx]) + np.int32(view[idx - samples_per_pixel])) & 0xFFFF)
+
+
+@ngjit
+def _predictor_decode_u32(view, width, height, samples_per_pixel):
+    """Undo predictor=2 on a uint32 view (in-place, sample-resolution)."""
+    row_samples = width * samples_per_pixel
+    for row in range(height):
+        row_start = row * row_samples
+        for col in range(samples_per_pixel, row_samples):
+            idx = row_start + col
+            view[idx] = np.uint32(
+                (np.uint64(view[idx]) + np.uint64(view[idx - samples_per_pixel]))
+                & np.uint64(0xFFFFFFFF))
+
+
+@ngjit
+def _predictor_decode_u64(view, width, height, samples_per_pixel):
+    """Undo predictor=2 on a uint64 view (in-place, sample-resolution)."""
+    row_samples = width * samples_per_pixel
+    for row in range(height):
+        row_start = row * row_samples
+        for col in range(samples_per_pixel, row_samples):
+            idx = row_start + col
+            view[idx] = np.uint64(view[idx]) + np.uint64(view[idx - samples_per_pixel])
+
+
+@ngjit
+def _predictor_encode_u8(data, width, height, samples_per_pixel):
+    """Apply predictor=2 for 8-bit samples (right-to-left, in-place)."""
+    row_bytes = width * samples_per_pixel
     for row in range(height):
         row_start = row * row_bytes
-        for col in range(row_bytes - 1, bytes_per_sample - 1, -1):
+        for col in range(row_bytes - 1, samples_per_pixel - 1, -1):
+            idx = row_start + col
+            data[idx] = np.uint8(
+                (np.int32(data[idx]) - np.int32(data[idx - samples_per_pixel])) & 0xFF)
+
+
+@ngjit
+def _predictor_encode_u16(view, width, height, samples_per_pixel):
+    """Apply predictor=2 on a uint16 view (right-to-left, in-place)."""
+    row_samples = width * samples_per_pixel
+    for row in range(height):
+        row_start = row * row_samples
+        for col in range(row_samples - 1, samples_per_pixel - 1, -1):
             idx = row_start + col
-            data[idx] = np.uint8((np.int32(data[idx]) - np.int32(data[idx - bytes_per_sample])) & 0xFF)
+            view[idx] = np.uint16(
+                (np.int32(view[idx]) - np.int32(view[idx - samples_per_pixel])) & 0xFFFF)
+
+
+@ngjit
+def _predictor_encode_u32(view, width, height, samples_per_pixel):
+    """Apply predictor=2 on a uint32 view (right-to-left, in-place)."""
+    row_samples = width * samples_per_pixel
+    for row in range(height):
+        row_start = row * row_samples
+        for col in range(row_samples - 1, samples_per_pixel - 1, -1):
+            idx = row_start + col
+            view[idx] = np.uint32(
+                (np.uint64(view[idx]) - np.uint64(view[idx - samples_per_pixel]))
+                & np.uint64(0xFFFFFFFF))
+
+
+@ngjit
+def _predictor_encode_u64(view, width, height, samples_per_pixel):
+    """Apply predictor=2 on a uint64 view (right-to-left, in-place)."""
+    row_samples = width * samples_per_pixel
+    for row in range(height):
+        row_start = row * row_samples
+        for col in range(row_samples - 1, samples_per_pixel - 1, -1):
+            idx = row_start + col
+            view[idx] = np.uint64(view[idx]) - np.uint64(view[idx - samples_per_pixel])
+
+
+def _sample_view(buf: np.ndarray, bytes_per_sample: int,
+                 byte_order: str = '<') -> np.ndarray:
+    """Return *buf* viewed as a 1-D array of unsigned samples.
+
+    The view shares memory with *buf* and uses the file's byte order so
+    that the sample-resolution differencing wraps at the correct width.
+    """
+    if bytes_per_sample == 1:
+        return buf
+    sample_dtype = np.dtype(f'{byte_order}u{bytes_per_sample}')
+    return buf.view(sample_dtype)
 
 
 def predictor_decode(data: np.ndarray, width: int, height: int,
-                     bytes_per_sample: int) -> np.ndarray:
+                     bytes_per_sample: int, samples: int = 1,
+                     byte_order: str = '<') -> np.ndarray:
     """Undo horizontal differencing predictor (predictor=2).
 
+    Operates at the sample level, per TIFF Technical Note: differences
+    are taken between adjacent same-component samples in the sample's
+    natural bit width.  For multi-band chunky data the stride is the
+    number of samples per pixel.
+
     Parameters
     ----------
     data : np.ndarray
         Flat uint8 array of decompressed pixel data (modified in-place).
     width, height : int
-        Image dimensions.
+        Image dimensions in pixels.
     bytes_per_sample : int
-        Bytes per sample (e.g. 1 for uint8, 4 for float32).
+        Bytes per sample (1, 2, 4, or 8).
+    samples : int
+        Samples per pixel (1 for single-band, 3 for RGB, ...).
+    byte_order : str
+        '<' for little-endian, '>' for big-endian.  Used to interpret
+        multi-byte sample bytes during the in-place differencing.
 
     Returns
     -------
     np.ndarray
-        Same array, modified in-place.
+        The same uint8 array with predictor=2 inverted.
     """
     buf = np.ascontiguousarray(data)
-    _predictor_decode(buf, width, height, bytes_per_sample)
+    if bytes_per_sample == 1:
+        _predictor_decode_u8(buf, width, height, samples)
+        return buf
+
+    view = _sample_view(buf, bytes_per_sample, byte_order)
+    if bytes_per_sample == 2:
+        _predictor_decode_u16(view, width, height, samples)
+    elif bytes_per_sample == 4:
+        _predictor_decode_u32(view, width, height, samples)
+    elif bytes_per_sample == 8:
+        _predictor_decode_u64(view, width, height, samples)
+    else:
+        raise ValueError(
+            f"predictor=2 with bytes_per_sample={bytes_per_sample} is not "
+            "supported (TIFF specifies sample-level differencing for 1/2/4/8 "
+            "byte samples)."
+        )
     return buf
 
 
 def predictor_encode(data: np.ndarray, width: int, height: int,
-                     bytes_per_sample: int) -> np.ndarray:
+                     bytes_per_sample: int, samples: int = 1,
+                     byte_order: str = '<') -> np.ndarray:
     """Apply horizontal differencing predictor (predictor=2).
 
+    See :func:`predictor_decode` for the sample-level semantics.
+
     Parameters
     ----------
     data : np.ndarray
         Flat uint8 array of pixel data (modified in-place).
     width, height : int
-        Image dimensions.
+        Image dimensions in pixels.
     bytes_per_sample : int
         Bytes per sample.
+    samples : int
+        Samples per pixel.
+    byte_order : str
+        '<' or '>'.  Defaults to little-endian, the writer's output order.
 
     Returns
     -------
     np.ndarray
-        Same array, modified in-place.
+        The same uint8 array with predictor=2 applied.
     """
     buf = np.ascontiguousarray(data)
-    _predictor_encode(buf, width, height, bytes_per_sample)
+    if bytes_per_sample == 1:
+        _predictor_encode_u8(buf, width, height, samples)
+        return buf
+
+    view = _sample_view(buf, bytes_per_sample, byte_order)
+    if bytes_per_sample == 2:
+        _predictor_encode_u16(view, width, height, samples)
+    elif bytes_per_sample == 4:
+        _predictor_encode_u32(view, width, height, samples)
+    elif bytes_per_sample == 8:
+        _predictor_encode_u64(view, width, height, samples)
+    else:
+        raise ValueError(
+            f"predictor=2 with bytes_per_sample={bytes_per_sample} is not "
+            "supported (TIFF specifies sample-level differencing for 1/2/4/8 "
+            "byte samples)."
+        )
     return buf
 
 
diff --git a/xrspatial/geotiff/_gpu_decode.py b/xrspatial/geotiff/_gpu_decode.py
index 67c36829..0598a6f9 100644
--- a/xrspatial/geotiff/_gpu_decode.py
+++ b/xrspatial/geotiff/_gpu_decode.py
@@ -208,7 +208,14 @@ def _lzw_decode_tiles_kernel(
 
 
 # Type aliases for Numba CUDA local arrays
-from numba import int32 as numba_int32, uint8 as numba_uint8, int64 as numba_int64
+from numba import (
+    int32 as numba_int32,
+    uint8 as numba_uint8,
+    uint16 as numba_uint16,
+    uint32 as numba_uint32,
+    uint64 as numba_uint64,
+    int64 as numba_int64,
+)
 
 
 # ---------------------------------------------------------------------------
@@ -628,19 +635,68 @@ def _inflate_tiles_kernel(
 # ---------------------------------------------------------------------------
 
 @cuda.jit
-def _predictor_decode_kernel(data, width, height, bytes_per_sample):
-    """Undo horizontal differencing (predictor=2), one thread per row."""
+def _predictor_decode_kernel_u8(data, width, height, samples_per_pixel):
+    """Undo predictor=2 for 8-bit samples, one thread per row.
+
+    Stride is ``samples_per_pixel`` bytes.  Byte-wise modular sum is
+    correct here because each sample fits in a single byte.
+    """
     row = cuda.grid(1)
     if row >= height:
         return
 
-    row_bytes = width * bytes_per_sample
+    row_bytes = width * samples_per_pixel
     row_start = row * row_bytes
 
-    for col in range(bytes_per_sample, row_bytes):
+    for col in range(samples_per_pixel, row_bytes):
         idx = row_start + col
         data[idx] = numba_uint8(
-            (numba_int32(data[idx]) + numba_int32(data[idx - bytes_per_sample])) & 0xFF)
+            (numba_int32(data[idx]) + numba_int32(data[idx - samples_per_pixel])) & 0xFF)
+
+
+@cuda.jit
+def _predictor_decode_kernel_u16(view, width, height, samples_per_pixel):
+    """Undo predictor=2 on a uint16 view, one thread per row."""
+    row = cuda.grid(1)
+    if row >= height:
+        return
+
+    row_samples = width * samples_per_pixel
+    row_start = row * row_samples
+
+    for col in range(samples_per_pixel, row_samples):
+        idx = row_start + col
+        view[idx] = (view[idx] + view[idx - samples_per_pixel]) & numba_int32(0xFFFF)
+
+
+@cuda.jit
+def _predictor_decode_kernel_u32(view, width, height, samples_per_pixel):
+    """Undo predictor=2 on a uint32 view, one thread per row."""
+    row = cuda.grid(1)
+    if row >= height:
+        return
+
+    row_samples = width * samples_per_pixel
+    row_start = row * row_samples
+
+    for col in range(samples_per_pixel, row_samples):
+        idx = row_start + col
+        view[idx] = (view[idx] + view[idx - samples_per_pixel]) & numba_uint32(0xFFFFFFFF)
+
+
+@cuda.jit
+def _predictor_decode_kernel_u64(view, width, height, samples_per_pixel):
+    """Undo predictor=2 on a uint64 view, one thread per row."""
+    row = cuda.grid(1)
+    if row >= height:
+        return
+
+    row_samples = width * samples_per_pixel
+    row_start = row * row_samples
+
+    for col in range(samples_per_pixel, row_samples):
+        idx = row_start + col
+        view[idx] = view[idx] + view[idx - samples_per_pixel]
 
 
 @cuda.jit
@@ -1370,6 +1426,70 @@ class _NvcompDecompOpts(ctypes.Structure):
         return None
 
 
+def _gpu_predictor2_decode(d_decomp, tile_width, total_rows, dtype, samples):
+    """Run the right predictor=2 decode kernel for *dtype*.
+
+    TIFF predictor=2 differences adjacent same-component samples in the
+    sample's natural width (uint8/16/32/64).  We view the byte buffer as
+    the matching unsigned dtype and dispatch to a per-width kernel so
+    the modular wrap matches what GDAL/libtiff write.
+    """
+    import cupy
+
+    tpb = min(256, total_rows) if total_rows > 0 else 1
+    bpg = math.ceil(total_rows / tpb) if tpb > 0 else 1
+    bps = dtype.itemsize
+
+    if bps == 1:
+        _predictor_decode_kernel_u8[bpg, tpb](
+            d_decomp, tile_width, total_rows, samples)
+    elif bps == 2:
+        view = d_decomp.view(cupy.uint16)
+        _predictor_decode_kernel_u16[bpg, tpb](
+            view, tile_width, total_rows, samples)
+    elif bps == 4:
+        view = d_decomp.view(cupy.uint32)
+        _predictor_decode_kernel_u32[bpg, tpb](
+            view, tile_width, total_rows, samples)
+    elif bps == 8:
+        view = d_decomp.view(cupy.uint64)
+        _predictor_decode_kernel_u64[bpg, tpb](
+            view, tile_width, total_rows, samples)
+    else:
+        raise ValueError(
+            f"GPU predictor=2 unsupported for bytes_per_sample={bps}")
+    cuda.synchronize()
+
+
+def _gpu_predictor2_encode(d_decomp, tile_width, total_rows, dtype, samples):
+    """Run the right predictor=2 encode kernel for *dtype*."""
+    import cupy
+
+    tpb = min(256, total_rows) if total_rows > 0 else 1
+    bpg = math.ceil(total_rows / tpb) if tpb > 0 else 1
+    bps = dtype.itemsize
+
+    if bps == 1:
+        _predictor_encode_kernel_u8[bpg, tpb](
+            d_decomp, tile_width, total_rows, samples)
+    elif bps == 2:
+        view = d_decomp.view(cupy.uint16)
+        _predictor_encode_kernel_u16[bpg, tpb](
+            view, tile_width, total_rows, samples)
+    elif bps == 4:
+        view = d_decomp.view(cupy.uint32)
+        _predictor_encode_kernel_u32[bpg, tpb](
+            view, tile_width, total_rows, samples)
+    elif bps == 8:
+        view = d_decomp.view(cupy.uint64)
+        _predictor_encode_kernel_u64[bpg, tpb](
+            view, tile_width, total_rows, samples)
+    else:
+        raise ValueError(
+            f"GPU predictor=2 unsupported for bytes_per_sample={bps}")
+    cuda.synchronize()
+
+
 def _apply_predictor_and_assemble(d_decomp, d_decomp_offsets, n_tiles,
                                     tile_width, tile_height,
                                     image_width, image_height,
@@ -1381,14 +1501,11 @@ def _apply_predictor_and_assemble(d_decomp, d_decomp_offsets, n_tiles,
 
     if predictor == 2:
         total_rows = n_tiles * tile_height
-        tpb = min(256, total_rows)
-        bpg = math.ceil(total_rows / tpb)
-        # Kernel uses row_bytes = width * bytes_per_sample, so pass pixel
-        # width and full per-pixel size (itemsize * samples). Matches CPU
-        # call at _reader.py _apply_predictor(..., bytes_per_sample * samples).
-        _predictor_decode_kernel[bpg, tpb](
-            d_decomp, tile_width, total_rows, dtype.itemsize * samples)
-        cuda.synchronize()
+        # Sample-level differencing: stride is samples_per_pixel samples,
+        # row width is tile_width pixels.  Per-dtype kernels handle the
+        # modular wrap at the sample's natural bit width.
+        _gpu_predictor2_decode(
+            d_decomp, tile_width, total_rows, dtype, samples)
     elif predictor == 3:
         total_rows = n_tiles * tile_height
         tpb = min(256, total_rows)
@@ -1629,16 +1746,11 @@ def gpu_decode_tiles(
 
     # Apply predictor on GPU
     if predictor == 2:
-        # Horizontal differencing: one thread per row across all tiles
+        # Sample-level horizontal differencing: stride is samples_per_pixel
+        # samples; per-dtype kernels handle the natural-width modular wrap.
         total_rows = n_tiles * tile_height
-        tpb = min(256, total_rows)
-        bpg = math.ceil(total_rows / tpb)
-        # Reshape so each tile's rows are contiguous (they already are).
-        # Kernel uses row_bytes = width * bytes_per_sample, so pass pixel
-        # width and full per-pixel size (itemsize * samples).
-        _predictor_decode_kernel[bpg, tpb](
-            d_decomp, tile_width, total_rows, dtype.itemsize * samples)
-        cuda.synchronize()
+        _gpu_predictor2_decode(
+            d_decomp, tile_width, total_rows, dtype, samples)
 
     elif predictor == 3:
         # Float predictor: one thread per row
@@ -1720,21 +1832,64 @@ def _extract_tiles_kernel(
 # ---------------------------------------------------------------------------
 
 @cuda.jit
-def _predictor_encode_kernel(data, width, height, bytes_per_sample):
-    """Apply horizontal differencing (predictor=2), one thread per row.
-    Process right-to-left to avoid overwriting values we still need.
-    """
+def _predictor_encode_kernel_u8(data, width, height, samples_per_pixel):
+    """Apply predictor=2 for 8-bit samples (right-to-left), one thread per row."""
     row = cuda.grid(1)
     if row >= height:
         return
 
-    row_bytes = width * bytes_per_sample
+    row_bytes = width * samples_per_pixel
     row_start = row * row_bytes
 
-    for col in range(row_bytes - 1, bytes_per_sample - 1, -1):
+    for col in range(row_bytes - 1, samples_per_pixel - 1, -1):
         idx = row_start + col
         data[idx] = numba_uint8(
-            (numba_int32(data[idx]) - numba_int32(data[idx - bytes_per_sample])) & 0xFF)
+            (numba_int32(data[idx]) - numba_int32(data[idx - samples_per_pixel])) & 0xFF)
+
+
+@cuda.jit
+def _predictor_encode_kernel_u16(view, width, height, samples_per_pixel):
+    """Apply predictor=2 on a uint16 view (right-to-left), one thread per row."""
+    row = cuda.grid(1)
+    if row >= height:
+        return
+
+    row_samples = width * samples_per_pixel
+    row_start = row * row_samples
+
+    for col in range(row_samples - 1, samples_per_pixel - 1, -1):
+        idx = row_start + col
+        view[idx] = (view[idx] - view[idx - samples_per_pixel]) & numba_int32(0xFFFF)
+
+
+@cuda.jit
+def _predictor_encode_kernel_u32(view, width, height, samples_per_pixel):
+    """Apply predictor=2 on a uint32 view (right-to-left), one thread per row."""
+    row = cuda.grid(1)
+    if row >= height:
+        return
+
+    row_samples = width * samples_per_pixel
+    row_start = row * row_samples
+
+    for col in range(row_samples - 1, samples_per_pixel - 1, -1):
+        idx = row_start + col
+        view[idx] = (view[idx] - view[idx - samples_per_pixel]) & numba_uint32(0xFFFFFFFF)
+
+
+@cuda.jit
+def _predictor_encode_kernel_u64(view, width, height, samples_per_pixel):
+    """Apply predictor=2 on a uint64 view (right-to-left), one thread per row."""
+    row = cuda.grid(1)
+    if row >= height:
+        return
+
+    row_samples = width * samples_per_pixel
+    row_start = row * row_samples
+
+    for col in range(row_samples - 1, samples_per_pixel - 1, -1):
+        idx = row_start + col
+        view[idx] = view[idx] - view[idx - samples_per_pixel]
 
 
 @cuda.jit
@@ -2305,11 +2460,10 @@ def gpu_compress_tiles(d_image, tile_width, tile_height,
     # Apply predictor encode on GPU
     total_rows = n_tiles * tile_height
     if predictor == 2:
-        tpb_r = min(256, total_rows)
-        bpg_r = math.ceil(total_rows / tpb_r)
-        _predictor_encode_kernel[bpg_r, tpb_r](
-            d_tile_buf, tile_width * samples, total_rows, dtype.itemsize * samples)
-        cuda.synchronize()
+        # Sample-level differencing: stride is samples_per_pixel samples,
+        # row width is tile_width pixels.
+        _gpu_predictor2_encode(
+            d_tile_buf, tile_width, total_rows, dtype, samples)
     elif predictor == 3:
         tpb_r = min(256, total_rows)
         bpg_r = math.ceil(total_rows / tpb_r)
diff --git a/xrspatial/geotiff/_reader.py b/xrspatial/geotiff/_reader.py
index feb59574..2770c250 100644
--- a/xrspatial/geotiff/_reader.py
+++ b/xrspatial/geotiff/_reader.py
@@ -341,15 +341,20 @@ def _open_source(source: str):
 
 def _apply_predictor(chunk: np.ndarray, pred: int, width: int,
                      height: int, bytes_per_sample: int,
-                     samples: int = 1) -> np.ndarray:
+                     samples: int = 1,
+                     byte_order: str = '<') -> np.ndarray:
     """Apply the appropriate predictor decode to decompressed data.
 
     ``width``, ``height``, ``bytes_per_sample``, and ``samples`` describe
     the raw pixel layout before predictor inversion: ``width * samples``
     samples per row, each ``bytes_per_sample`` bytes wide.
 
-    Predictor=2 (horizontal differencing) works byte-wise on a stride of
-    ``bytes_per_sample * samples``.
+    Predictor=2 (horizontal differencing) operates at the *sample* level
+    per TIFF Technical Note (libtiff/GDAL convention): the difference is
+    taken between adjacent same-component samples in the sample's
+    natural bit width, with stride equal to ``samples`` samples.  A
+    byte-wise implementation drops the inter-byte carry for multi-byte
+    samples and produces wrong values.
 
     Predictor=3 (floating-point) byte-swizzles each row into
     ``bytes_per_sample`` interleaved lanes of length ``width * samples``,
@@ -359,7 +364,8 @@ def _apply_predictor(chunk: np.ndarray, pred: int, width: int,
     """
     if pred == 2:
         return predictor_decode(chunk, width, height,
-                                bytes_per_sample * samples)
+                                bytes_per_sample, samples=samples,
+                                byte_order=byte_order)
     elif pred == 3:
         return fp_predictor_decode(chunk, width * samples, height,
                                    bytes_per_sample)
@@ -413,7 +419,8 @@ def _decode_strip_or_tile(data_slice, compression, width, height, samples,
         if not chunk.flags.writeable:
             chunk = chunk.copy()
         chunk = _apply_predictor(chunk, pred, width, height,
-                                 bytes_per_sample, samples=samples)
+                                 bytes_per_sample, samples=samples,
+                                 byte_order=byte_order)
 
     if is_sub_byte:
         pixels = unpack_bits(chunk, bps, pixel_count)
diff --git a/xrspatial/geotiff/_writer.py b/xrspatial/geotiff/_writer.py
index 8f0d7197..a04b1673 100644
--- a/xrspatial/geotiff/_writer.py
+++ b/xrspatial/geotiff/_writer.py
@@ -94,10 +94,15 @@ def normalize_predictor(predictor, dtype, compression: int) -> int:
 def _apply_predictor_encode(buf: np.ndarray, predictor: int,
                             width: int, height: int,
                             bytes_per_sample: int, samples: int) -> np.ndarray:
-    """Apply the chosen predictor to a flat uint8 buffer."""
+    """Apply the chosen predictor to a flat uint8 buffer.
+
+    Files always go to disk in little-endian order (see ``BO``), so
+    ``predictor_encode`` is invoked with ``byte_order='<'``.
+    """
     if predictor == 2:
         return predictor_encode(buf, width, height,
-                                bytes_per_sample * samples)
+                                bytes_per_sample, samples=samples,
+                                byte_order=BO)
     if predictor == 3:
         return fp_predictor_encode(buf, width * samples, height,
                                    bytes_per_sample)
diff --git a/xrspatial/geotiff/tests/test_predictor_multisample.py b/xrspatial/geotiff/tests/test_predictor_multisample.py
index 31f52af0..92828b9a 100644
--- a/xrspatial/geotiff/tests/test_predictor_multisample.py
+++ b/xrspatial/geotiff/tests/test_predictor_multisample.py
@@ -402,6 +402,194 @@ def test_apply_predictor3_matches_tn3_reference_1247():
 # Issue #1479: GPU predictor=3 multi-sample coverage gap
 # ---------------------------------------------------------------------------
 
+# ---------------------------------------------------------------------------
+# Sample-level predictor=2 (libtiff/GDAL convention) for multi-byte dtypes
+# ---------------------------------------------------------------------------
+#
+# Per TIFF Technical Note: predictor=2 differences are taken between
+# adjacent same-component samples in the sample's natural bit width
+# (uint16 wraps at 65536, uint32 at 2^32, ...).  A byte-wise
+# implementation drops the inter-byte carry for any sample wider than
+# one byte, so xrspatial must read uint16/int16/uint32/int32 TIFFs
+# written with predictor=2 by libtiff-compatible tools (rasterio, GDAL,
+# tifffile) without corruption, and write TIFFs that those tools can
+# read back without corruption.
+
+try:
+    import tifffile as _tifffile  # noqa: F401
+    _HAS_TIFFFILE = True
+except Exception:
+    _HAS_TIFFFILE = False
+
+
+tifffile_required = pytest.mark.skipif(
+    not _HAS_TIFFFILE, reason="Requires the tifffile package")
+
+
+@tifffile_required
+@pytest.mark.parametrize("dtype_str", ["uint16", "int16", "uint32", "int32"])
+def test_predictor2_reads_libtiff_multibyte_correctly(tmp_path, dtype_str):
+    """xrspatial reads predictor=2 TIFFs with multi-byte samples correctly.
+
+    The bug: the byte-wise predictor decode dropped inter-byte carry,
+    so libtiff-style sample-wise differences came back as garbage.  We
+    write a known array via tifffile (which uses libtiff conventions)
+    and verify xrspatial reads it back exactly.
+    """
+    import tifffile
+
+    dtype = np.dtype(dtype_str)
+    arr = np.array([[1000, 2000, 3000, 4000],
+                    [5000, 6000, 7000, 8000],
+                    [9000, 10000, 11000, 12000],
+                    [13000, 14000, 15000, 16000]], dtype=dtype)
+
+    path = str(tmp_path / f"libtiff_pred2_{dtype_str}.tif")
+    tifffile.imwrite(path, arr, compression='deflate', predictor=2)
+
+    out = open_geotiff(path).values
+    np.testing.assert_array_equal(out, arr)
+
+
+@tifffile_required
+def test_predictor2_reads_libtiff_multiband_uint16(tmp_path):
+    """Multi-band chunky uint16 with predictor=2 round-trips through tifffile."""
+    import tifffile
+
+    arr = (np.arange(48).reshape(4, 4, 3) * 100).astype(np.uint16)
+    path = str(tmp_path / "libtiff_pred2_rgb_uint16.tif")
+    tifffile.imwrite(path, arr, compression='deflate',
+                     predictor=2, photometric='rgb')
+
+    out = open_geotiff(path).values
+    np.testing.assert_array_equal(out, arr)
+
+
+@tifffile_required
+@pytest.mark.parametrize("dtype_str", ["uint16", "int16", "uint32", "int32"])
+def test_predictor2_writer_interops_with_libtiff(tmp_path, dtype_str):
+    """xrspatial-written predictor=2 TIFFs decode correctly under tifffile.
+
+    The encoder must produce sample-level differences so that
+    libtiff/GDAL/rasterio can decode the file.  Round-trip through
+    xrspatial alone is not enough -- a byte-wise encoder paired with a
+    byte-wise decoder agrees with itself but corrupts the file for
+    everyone else.
+    """
+    import tifffile
+
+    dtype = np.dtype(dtype_str)
+    arr = (np.arange(16, dtype=dtype) * 250).reshape(4, 4)
+    da = xr.DataArray(arr, dims=('y', 'x'))
+
+    path = str(tmp_path / f"xrs_pred2_{dtype_str}.tif")
+    to_geotiff(da, path, compression='deflate', predictor=2)
+
+    out_xrs = open_geotiff(path).values
+    np.testing.assert_array_equal(out_xrs, arr)
+
+    out_tiff = tifffile.imread(path)
+    np.testing.assert_array_equal(out_tiff, arr)
+
+
+@gpu_only
+@tifffile_required
+@pytest.mark.parametrize("dtype_str", ["uint16", "int16", "uint32"])
+def test_gpu_predictor2_multibyte_matches_cpu(tmp_path, dtype_str):
+    """GPU decode of predictor=2 with multi-byte samples matches CPU.
+
+    Regression for the same sample-level differencing fix on the GPU
+    kernels.  The image is written via tifffile (libtiff convention) so
+    both backends must walk the predictor inverse at sample resolution.
+    """
+    import tifffile
+
+    dtype = np.dtype(dtype_str)
+    h, w = 32, 32
+    rng = np.random.RandomState(42)
+    high = np.iinfo(dtype).max // 4
+    low = np.iinfo(dtype).min // 4 if dtype.kind == 'i' else 0
+    data = rng.randint(low, high, size=(h, w), dtype=dtype)
+
+    path = str(tmp_path / f"gpu_pred2_{dtype_str}.tif")
+    tifffile.imwrite(path, data, compression='deflate', predictor=2,
+                     tile=(16, 16))
+
+    cpu_arr = open_geotiff(path).values
+    np.testing.assert_array_equal(cpu_arr, data)
+
+    gpu_arr = _gpu_to_numpy(open_geotiff(path, gpu=True))
+    np.testing.assert_array_equal(gpu_arr, cpu_arr)
+
+
+@gpu_only
+@pytest.mark.parametrize("dtype_str", ["uint16", "int16", "uint32"])
+def test_gpu_predictor2_multibyte_writer_round_trip(tmp_path, dtype_str):
+    """xrspatial writer + GPU reader round-trip for multi-byte predictor=2."""
+    dtype = np.dtype(dtype_str)
+    h, w = 32, 32
+    rng = np.random.RandomState(7)
+    high = np.iinfo(dtype).max // 4
+    low = np.iinfo(dtype).min // 4 if dtype.kind == 'i' else 0
+    data = rng.randint(low, high, size=(h, w), dtype=dtype)
+    da = xr.DataArray(data, dims=['y', 'x'])
+
+    path = str(tmp_path / f"gpu_pred2_writer_{dtype_str}.tif")
+    to_geotiff(da, path, compression='deflate', tile_size=16, predictor=2)
+
+    cpu_arr = open_geotiff(path).values
+    np.testing.assert_array_equal(cpu_arr, data)
+
+    gpu_arr = _gpu_to_numpy(open_geotiff(path, gpu=True))
+    np.testing.assert_array_equal(gpu_arr, cpu_arr)
+
+
+@gpu_only
+@tifffile_required
+def test_gpu_predictor2_multiband_uint16_matches_cpu(tmp_path):
+    """GPU decode of multi-band uint16 predictor=2 matches CPU and source."""
+    import tifffile
+
+    arr = (np.arange(32 * 32 * 3).reshape(32, 32, 3) % 50000).astype(np.uint16)
+    path = str(tmp_path / "gpu_pred2_rgb_uint16.tif")
+    tifffile.imwrite(path, arr, compression='deflate', predictor=2,
+                     photometric='rgb', tile=(16, 16))
+
+    cpu_arr = open_geotiff(path).values
+    np.testing.assert_array_equal(cpu_arr, arr)
+
+    gpu_arr = _gpu_to_numpy(open_geotiff(path, gpu=True))
+    np.testing.assert_array_equal(gpu_arr, cpu_arr)
+
+
+@gpu_only
+@pytest.mark.parametrize("dtype_str", ["uint16", "int16", "uint32"])
+def test_gpu_predictor2_writer_round_trip(tmp_path, dtype_str):
+    """xrspatial writer + GPU encode path round-trip for multi-byte predictor=2.
+
+    With ``gpu=True`` the writer takes the CUDA encode path; the file
+    that lands on disk must still decode correctly on both CPU and GPU
+    readers.
+    """
+    dtype = np.dtype(dtype_str)
+    h, w = 32, 32
+    rng = np.random.RandomState(1234)
+    high = np.iinfo(dtype).max // 4
+    low = np.iinfo(dtype).min // 4 if dtype.kind == 'i' else 0
+    data = rng.randint(low, high, size=(h, w), dtype=dtype)
+    da = xr.DataArray(data, dims=['y', 'x'])
+
+    path = str(tmp_path / f"gpu_pred2_enc_{dtype_str}.tif")
+    to_geotiff(da, path, compression='deflate', tile_size=16,
+               predictor=2, gpu=True)
+
+    cpu_arr = open_geotiff(path).values
+    np.testing.assert_array_equal(cpu_arr, data)
+
+    gpu_arr = _gpu_to_numpy(open_geotiff(path, gpu=True))
+    np.testing.assert_array_equal(gpu_arr, cpu_arr)
+
+
 @gpu_only
 @pytest.mark.parametrize("samples,dtype_str", [
     (3, "float32"),