From 985d71b59abcf7791cdcfb6e4da46780427b036f Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Mon, 11 May 2026 07:24:01 -0700 Subject: [PATCH 1/2] Fix geotiff metadata propagation gaps (#1580, #1582) Two metadata-propagation defects flagged by the geotiff sweep: * #1580 -- write_geotiff_gpu treated arr.shape[:2] as (height, width) regardless of dim names, so a rioxarray-style (band, y, x) CuPy DataArray was written with the band axis stored as image width. Mirror the CPU writer's moveaxis remap when dims[0] is band/bands/ channel so both backends produce the same file for the same input. * #1582 -- to_geotiff (and the GPU writer it dispatches to) only consulted attrs['nodata'] when extracting the NoData sentinel, silently dropping rioxarray's attrs['nodatavals'] and CF-style attrs['_FillValue']. Add a _resolve_nodata_attr helper that walks those aliases in priority order (nodata -> nodatavals -> _FillValue) so a rioxarray-sourced DataArray round-trips with its sentinel intact. xrspatial's own nodata key still wins when both are set. Tests cover band-first dim ordering across all three accepted band dim names, byte-for-byte CPU/GPU parity, alias resolution priority, NaN-in-nodatavals (no GDAL_NODATA tag emitted), and the GPU writer honouring the same aliases. --- .claude/sweep-metadata-state.csv | 1 + xrspatial/geotiff/__init__.py | 63 +++++- .../tests/test_gpu_writer_band_first_1580.py | 169 ++++++++++++++++ .../tests/test_nodata_attr_aliases_1582.py | 189 ++++++++++++++++++ 4 files changed, 420 insertions(+), 2 deletions(-) create mode 100644 xrspatial/geotiff/tests/test_gpu_writer_band_first_1580.py create mode 100644 xrspatial/geotiff/tests/test_nodata_attr_aliases_1582.py diff --git a/.claude/sweep-metadata-state.csv b/.claude/sweep-metadata-state.csv index c9ce285d..a454a620 100644 --- a/.claude/sweep-metadata-state.csv +++ b/.claude/sweep-metadata-state.csv @@ -1,2 +1,3 @@ module,last_inspected,issue,severity_max,categories_found,notes +geotiff,2026-05-11,1580;1582,HIGH,1;3;4;5,"write_geotiff_gpu silently mis-ordered (band, y, x) DataArrays (#1580). to_geotiff ignored rioxarray nodatavals and CF _FillValue attrs (#1582). Both fixed in same branch." reproject,2026-05-10,1572;1573,HIGH,1;3;4,geoid_height_raster dropped input attrs and used dims[-2:] for 3D inputs (#1572). reproject/merge ignored nodatavals (rasterio convention) when rioxarray absent (#1573). Fixed in same branch. diff --git a/xrspatial/geotiff/__init__.py b/xrspatial/geotiff/__init__.py index 58dc73a7..39da5c14 100644 --- a/xrspatial/geotiff/__init__.py +++ b/xrspatial/geotiff/__init__.py @@ -653,6 +653,57 @@ def _apply_nodata_mask_gpu(arr_gpu, nodata): _TIFF_SHORT = 3 +def _resolve_nodata_attr(attrs: dict): + """Resolve a NoData sentinel from DataArray attrs. + + xrspatial's own readers always emit ``attrs['nodata']`` (a scalar), + so that key is checked first for a clean intra-library round-trip. + Falls back to two ecosystem conventions on miss: + + * ``attrs['nodatavals']`` -- rioxarray's per-band tuple. Uses the + first band's value; bands with mixed sentinels are uncommon and + would need an explicit ``nodata=`` argument anyway. + * ``attrs['_FillValue']`` -- CF-style xarray pipelines. + + Returns ``None`` when none of the keys carry a usable value. NaN / + None entries in ``nodatavals`` are skipped rather than treated as + a sentinel (NaN means "the float NaN is the sentinel", which is + already the default and doesn't need a GDAL_NODATA tag). + """ + nodata = attrs.get('nodata') + if nodata is not None: + return nodata + + vals = attrs.get('nodatavals') + if vals is not None: + try: + seq = list(vals) + except TypeError: + seq = [vals] + for v in seq: + if v is None: + continue + try: + fv = float(v) + except (TypeError, ValueError): + continue + if np.isnan(fv): + continue + return v + + fill = attrs.get('_FillValue') + if fill is not None: + try: + ffv = float(fill) + except (TypeError, ValueError): + return fill # non-numeric -- pass through verbatim + if np.isnan(ffv): + return None + return fill + + return None + + def _merge_friendly_extra_tags(extra_tags_list, attrs: dict) -> list | None: """Combine ``attrs['extra_tags']`` with friendly tag attrs. @@ -954,7 +1005,7 @@ def to_geotiff(data: xr.DataArray | np.ndarray, path, *, if epsg is None and wkt_fallback is None: wkt_fallback = wkt if nodata is None: - nodata = data.attrs.get('nodata') + nodata = _resolve_nodata_attr(data.attrs) if data.attrs.get('raster_type') == 'point': raster_type = RASTER_PIXEL_IS_POINT gdal_meta_xml = data.attrs.get('gdal_metadata_xml') @@ -2237,6 +2288,14 @@ def write_geotiff_gpu(data, path: str, *, else: arr = cupy.asarray(np.asarray(arr)) # numpy -> GPU + # Handle band-first dimension order (band, y, x) -> (y, x, band). + # rioxarray and CF-style multi-band rasters land here; without + # this remap the writer treats arr.shape[2] as the band axis and + # produces a transposed file (issue #1580). The CPU writer does + # the same remap at the matching step in to_geotiff(). + if arr.ndim == 3 and data.dims[0] in ('band', 'bands', 'channel'): + arr = cupy.ascontiguousarray(cupy.moveaxis(arr, 0, -1)) + # Prefer attrs['transform'] over the coord-derived transform: it # is bit-stable across round-trips, while _coords_to_transform # can drift on fractional pixel sizes (the same reasoning the @@ -2264,7 +2323,7 @@ def write_geotiff_gpu(data, path: str, *, if epsg is None and wkt_fallback is None: wkt_fallback = wkt if nodata is None: - nodata = data.attrs.get('nodata') + nodata = _resolve_nodata_attr(data.attrs) if data.attrs.get('raster_type') == 'point': raster_type = RASTER_PIXEL_IS_POINT # Mirror the CPU writer's pass-through of GDAL metadata, the diff --git a/xrspatial/geotiff/tests/test_gpu_writer_band_first_1580.py b/xrspatial/geotiff/tests/test_gpu_writer_band_first_1580.py new file mode 100644 index 00000000..6a6053eb --- /dev/null +++ b/xrspatial/geotiff/tests/test_gpu_writer_band_first_1580.py @@ -0,0 +1,169 @@ +"""Regression test for issue #1580. + +``write_geotiff_gpu`` used to assume the CuPy-backed DataArray was +``(y, x[, band])`` and computed ``height, width = arr.shape[:2]`` +unconditionally. A rioxarray-style ``(band, y, x)`` 3D DataArray -- +auto-dispatched into the GPU writer from ``to_geotiff`` -- ended up +written with the band axis stored as image width and the actual width +stored as samples-per-pixel. + +The CPU eager path in ``to_geotiff`` already handles this with a +``np.moveaxis(arr, 0, -1)`` when ``data.dims[0] in ('band', 'bands', +'channel')``; the GPU writer now mirrors that remap so both backends +produce the same file for the same input. +""" +from __future__ import annotations + +import importlib.util + +import numpy as np +import pytest +import xarray as xr + +from xrspatial.geotiff import open_geotiff, to_geotiff, write_geotiff_gpu + + +def _gpu_available() -> bool: + if importlib.util.find_spec("cupy") is None: + return False + try: + import cupy + return bool(cupy.cuda.is_available()) + except Exception: + return False + + +_HAS_GPU = _gpu_available() +_gpu_only = pytest.mark.skipif(not _HAS_GPU, reason="cupy + CUDA required") + + +@_gpu_only +@pytest.mark.parametrize("band_dim_name", ["band", "bands", "channel"]) +def test_band_first_layout_written_correctly_via_write_geotiff_gpu( + tmp_path, band_dim_name): + """Direct call to write_geotiff_gpu with a (band, y, x) CuPy + DataArray must produce the same file dimensions as the CPU writer + would (height=y, width=x, samples=band). + """ + import cupy + rng = np.random.default_rng(seed=1580) + arr_bhw = rng.integers(0, 255, (3, 16, 32), dtype=np.uint8) + da = xr.DataArray( + cupy.asarray(arr_bhw), + dims=[band_dim_name, "y", "x"], + coords={ + band_dim_name: np.arange(3), + "y": np.arange(16, dtype=np.float64), + "x": np.arange(32, dtype=np.float64), + }, + attrs={"crs": 4326}, + ) + + out = str(tmp_path / f"band_first_1580_{band_dim_name}.tif") + write_geotiff_gpu(da, out, compression="none") + + rd = open_geotiff(out) + assert rd.sizes["y"] == 16, ( + f"expected height=16, got {rd.sizes}; the writer is treating the " + f"band axis as height" + ) + assert rd.sizes["x"] == 32, f"expected width=32, got {rd.sizes}" + assert rd.sizes["band"] == 3, ( + f"expected 3 bands, got {rd.sizes}; the writer is treating the " + f"width as samples-per-pixel" + ) + + # Pixel values should match the source after the (band, y, x) -> + # (y, x, band) remap. + np.testing.assert_array_equal(rd.values, np.moveaxis(arr_bhw, 0, -1)) + + +@_gpu_only +def test_band_first_layout_via_to_geotiff_auto_dispatch(tmp_path): + """The user-facing path: pass a CuPy (band, y, x) DataArray to + ``to_geotiff`` and let auto-detection pick the GPU writer. + """ + import cupy + rng = np.random.default_rng(seed=1580 + 1) + arr_bhw = rng.integers(0, 255, (2, 8, 12), dtype=np.uint8) + da = xr.DataArray( + cupy.asarray(arr_bhw), + dims=["band", "y", "x"], + coords={ + "band": np.arange(2), + "y": np.arange(8, dtype=np.float64), + "x": np.arange(12, dtype=np.float64), + }, + attrs={"crs": 4326}, + ) + + out = str(tmp_path / "band_first_1580_autodispatch.tif") + to_geotiff(da, out, compression="none") + + rd = open_geotiff(out) + assert rd.sizes == {"y": 8, "x": 12, "band": 2}, ( + f"auto-dispatched GPU write produced wrong dims: {rd.sizes}" + ) + np.testing.assert_array_equal(rd.values, np.moveaxis(arr_bhw, 0, -1)) + + +@_gpu_only +def test_yxbands_layout_unchanged(tmp_path): + """Regression guard: the original (y, x, band) layout must still + write correctly after the band-first remap was added. + """ + import cupy + rng = np.random.default_rng(seed=1580 + 2) + arr_yxb = rng.integers(0, 255, (8, 12, 2), dtype=np.uint8) + da = xr.DataArray( + cupy.asarray(arr_yxb), + dims=["y", "x", "band"], + coords={ + "y": np.arange(8, dtype=np.float64), + "x": np.arange(12, dtype=np.float64), + "band": np.arange(2), + }, + attrs={"crs": 4326}, + ) + + out = str(tmp_path / "yxb_1580.tif") + write_geotiff_gpu(da, out, compression="none") + + rd = open_geotiff(out) + assert rd.sizes == {"y": 8, "x": 12, "band": 2} + np.testing.assert_array_equal(rd.values, arr_yxb) + + +@_gpu_only +def test_gpu_band_first_matches_cpu_byte_for_byte_on_pixel_values(tmp_path): + """Cross-backend parity: GPU and CPU writers must emit the same + pixel values for the same (band, y, x) input. + """ + import cupy + rng = np.random.default_rng(seed=1580 + 3) + arr_bhw = rng.integers(0, 255, (3, 24, 40), dtype=np.uint8) + da_cpu = xr.DataArray( + arr_bhw, + dims=["band", "y", "x"], + coords={"band": np.arange(3), + "y": np.arange(24, dtype=np.float64), + "x": np.arange(40, dtype=np.float64)}, + attrs={"crs": 4326}, + ) + da_gpu = xr.DataArray( + cupy.asarray(arr_bhw), + dims=["band", "y", "x"], + coords={"band": np.arange(3), + "y": np.arange(24, dtype=np.float64), + "x": np.arange(40, dtype=np.float64)}, + attrs={"crs": 4326}, + ) + + cpu_path = str(tmp_path / "band_first_1580_cpu.tif") + gpu_path = str(tmp_path / "band_first_1580_gpu.tif") + to_geotiff(da_cpu, cpu_path, compression="none", gpu=False) + write_geotiff_gpu(da_gpu, gpu_path, compression="none") + + cpu_rd = open_geotiff(cpu_path).values + gpu_rd = open_geotiff(gpu_path).values + np.testing.assert_array_equal(cpu_rd, gpu_rd) diff --git a/xrspatial/geotiff/tests/test_nodata_attr_aliases_1582.py b/xrspatial/geotiff/tests/test_nodata_attr_aliases_1582.py new file mode 100644 index 00000000..d2a205be --- /dev/null +++ b/xrspatial/geotiff/tests/test_nodata_attr_aliases_1582.py @@ -0,0 +1,189 @@ +"""Regression test for issue #1582. + +``to_geotiff`` originally only consulted ``attrs['nodata']`` when +resolving the NoData sentinel from a DataArray. Two ecosystem +conventions were silently dropped: + +* ``attrs['nodatavals']`` -- rioxarray's per-band tuple. A user reading + a GeoTIFF with rioxarray and passing the result straight back to + ``to_geotiff`` ended up with a file lacking a GDAL_NODATA tag, and + the sentinel pixels stored as ordinary data. +* ``attrs['_FillValue']`` -- the CF-style xarray pipeline convention. + +Both paths now route through ``_resolve_nodata_attr`` so the eager +CPU writer (``to_geotiff``) and the GPU writer +(``write_geotiff_gpu``) honour either alias. xrspatial's own +``attrs['nodata']`` still wins when set, preserving the existing +intra-library round-trip. +""" +from __future__ import annotations + +import importlib.util + +import numpy as np +import pytest +import xarray as xr + +from xrspatial.geotiff import open_geotiff, to_geotiff + + +_SENTINEL = -9999.0 + + +def _da_float(arr, **attrs): + return xr.DataArray( + arr, dims=["y", "x"], + coords={"y": np.arange(arr.shape[0], dtype=np.float64), + "x": np.arange(arr.shape[1], dtype=np.float64)}, + attrs=attrs, + ) + + +@pytest.fixture +def _arr_with_sentinel(): + return np.array( + [[1.0, 2.0, _SENTINEL], [3.0, _SENTINEL, 5.0]], + dtype=np.float32, + ) + + +def test_nodatavals_tuple_resolves_to_nodata_tag(tmp_path, _arr_with_sentinel): + """rioxarray-style ``nodatavals`` tuple lands as the file's nodata.""" + da = _da_float(_arr_with_sentinel, + crs=4326, nodatavals=(_SENTINEL,)) + out = str(tmp_path / "nodatavals_tuple.tif") + to_geotiff(da, out) + + rd = open_geotiff(out) + assert rd.attrs.get("nodata") == _SENTINEL + + +def test_nodatavals_list_resolves_to_nodata_tag(tmp_path, _arr_with_sentinel): + """List variant of nodatavals (some readers return list, not tuple).""" + da = _da_float(_arr_with_sentinel, + crs=4326, nodatavals=[_SENTINEL]) + out = str(tmp_path / "nodatavals_list.tif") + to_geotiff(da, out) + + rd = open_geotiff(out) + assert rd.attrs.get("nodata") == _SENTINEL + + +def test_nodatavals_scalar_resolves_to_nodata_tag(tmp_path, + _arr_with_sentinel): + """Single-band variant where the attr is a scalar, not a sequence.""" + da = _da_float(_arr_with_sentinel, + crs=4326, nodatavals=_SENTINEL) + out = str(tmp_path / "nodatavals_scalar.tif") + to_geotiff(da, out) + + rd = open_geotiff(out) + assert rd.attrs.get("nodata") == _SENTINEL + + +def test_fill_value_resolves_to_nodata_tag(tmp_path, _arr_with_sentinel): + """CF-style ``_FillValue`` lands as the file's nodata.""" + da = _da_float(_arr_with_sentinel, + crs=4326, **{"_FillValue": _SENTINEL}) + out = str(tmp_path / "fillvalue.tif") + to_geotiff(da, out) + + rd = open_geotiff(out) + assert rd.attrs.get("nodata") == _SENTINEL + + +def test_explicit_nodata_attr_wins_over_aliases(tmp_path, _arr_with_sentinel): + """``attrs['nodata']`` is xrspatial's canonical key; it must win over + rioxarray and CF aliases when both are present.""" + da = _da_float( + _arr_with_sentinel, crs=4326, + nodata=-8888.0, + nodatavals=(_SENTINEL,), + **{"_FillValue": -7777.0}, + ) + out = str(tmp_path / "explicit_wins.tif") + to_geotiff(da, out) + + rd = open_geotiff(out) + assert rd.attrs.get("nodata") == -8888.0 + + +def test_kwarg_nodata_wins_over_attrs(tmp_path, _arr_with_sentinel): + """The ``nodata=`` keyword overrides anything in attrs.""" + da = _da_float(_arr_with_sentinel, + crs=4326, nodatavals=(_SENTINEL,)) + out = str(tmp_path / "kwarg_wins.tif") + to_geotiff(da, out, nodata=-1234.0) + + rd = open_geotiff(out) + assert rd.attrs.get("nodata") == -1234.0 + + +def test_nan_nodatavals_does_not_emit_tag(tmp_path): + """NaN in nodatavals means the float NaN is already the sentinel; + no GDAL_NODATA tag should be written for that case (mirrors + rioxarray behaviour, and keeps the file's metadata clean).""" + arr = np.array([[1.0, np.nan], [3.0, 4.0]], dtype=np.float32) + da = _da_float(arr, crs=4326, nodatavals=(float("nan"),)) + out = str(tmp_path / "nan_nodatavals.tif") + to_geotiff(da, out) + + rd = open_geotiff(out) + assert rd.attrs.get("nodata") is None + + +def test_no_nodata_attrs_means_no_tag(tmp_path, _arr_with_sentinel): + """Sanity guard: a DataArray with no nodata-related attrs still + writes without a GDAL_NODATA tag.""" + da = _da_float(_arr_with_sentinel, crs=4326) + out = str(tmp_path / "no_nodata.tif") + to_geotiff(da, out) + + rd = open_geotiff(out) + assert rd.attrs.get("nodata") is None + + +# --------------------------------------------------------------------------- +# GPU writer parity -- same alias resolution must apply to +# write_geotiff_gpu / the auto-dispatch from to_geotiff. +# --------------------------------------------------------------------------- + + +def _gpu_available() -> bool: + if importlib.util.find_spec("cupy") is None: + return False + try: + import cupy + return bool(cupy.cuda.is_available()) + except Exception: + return False + + +_HAS_GPU = _gpu_available() +_gpu_only = pytest.mark.skipif(not _HAS_GPU, reason="cupy + CUDA required") + + +@_gpu_only +@pytest.mark.parametrize("attr_key,attr_value", [ + ("nodatavals", (_SENTINEL,)), + ("nodatavals", [_SENTINEL]), + ("_FillValue", _SENTINEL), +]) +def test_gpu_writer_resolves_alias(tmp_path, _arr_with_sentinel, + attr_key, attr_value): + """The GPU write path (write_geotiff_gpu) honours the same aliases.""" + import cupy + from xrspatial.geotiff import write_geotiff_gpu + + da = xr.DataArray( + cupy.asarray(_arr_with_sentinel), + dims=["y", "x"], + coords={"y": np.arange(2, dtype=np.float64), + "x": np.arange(3, dtype=np.float64)}, + attrs={"crs": 4326, attr_key: attr_value}, + ) + out = str(tmp_path / f"gpu_{attr_key}.tif") + write_geotiff_gpu(da, out, compression="none") + + rd = open_geotiff(out) + assert rd.attrs.get("nodata") == _SENTINEL From 2e87151e385c108e48606a0fd6b130d62b1ede34 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Mon, 11 May 2026 07:39:24 -0700 Subject: [PATCH 2/2] Clarify _resolve_nodata_attr docstring on nodatavals selection Docstring said "Uses the first band's value" but the implementation returns the first usable entry, skipping None / non-numeric / NaN slots. Reword so callers don't assume only index 0 is consulted. --- xrspatial/geotiff/__init__.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/xrspatial/geotiff/__init__.py b/xrspatial/geotiff/__init__.py index 39da5c14..cd338e57 100644 --- a/xrspatial/geotiff/__init__.py +++ b/xrspatial/geotiff/__init__.py @@ -660,14 +660,18 @@ def _resolve_nodata_attr(attrs: dict): so that key is checked first for a clean intra-library round-trip. Falls back to two ecosystem conventions on miss: - * ``attrs['nodatavals']`` -- rioxarray's per-band tuple. Uses the - first band's value; bands with mixed sentinels are uncommon and - would need an explicit ``nodata=`` argument anyway. + * ``attrs['nodatavals']`` -- rioxarray's per-band tuple. Returns + the first entry that is not None, not non-numeric, and not NaN. + In practice this is band 0 for almost every real file; the skip + logic only matters when band 0 is missing a sentinel (NaN / + None) while a later band declares one. Bands with mixed concrete + sentinels are uncommon and would need an explicit ``nodata=`` + argument anyway. * ``attrs['_FillValue']`` -- CF-style xarray pipelines. - Returns ``None`` when none of the keys carry a usable value. NaN / - None entries in ``nodatavals`` are skipped rather than treated as - a sentinel (NaN means "the float NaN is the sentinel", which is + Returns ``None`` when none of the keys carry a usable value. NaN + entries in ``nodatavals`` are skipped rather than treated as a + sentinel (NaN means "the float NaN is the sentinel", which is already the default and doesn't need a GDAL_NODATA tag). """ nodata = attrs.get('nodata')