From 02d42a64b12298d46d4812bdc22b1444dd934b60 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Wed, 13 May 2026 21:06:39 -0700 Subject: [PATCH] geotiff: route read_vrt(chunks=) XML read through cap (closes #1831) PR #1818 capped VRT XML reads at 64 MiB (configurable via XRSPATIAL_VRT_MAX_XML_BYTES) in xrspatial/geotiff/_vrt.py::_read_vrt_xml to keep a multi-GB attacker-supplied VRT file from exhausting memory at parse time. The chunked dispatcher merged in #1822 (_read_vrt_chunked in xrspatial/geotiff/__init__.py) parses the VRT independently with a bare open().read() before calling parse_vrt, so the cap did not apply to read_vrt(path, chunks=...). The per-task _vrt_chunk_read worker calls _read_vrt_internal (which goes through _read_vrt_xml), so only the parent dispatch path was uncapped. Route the chunked-path XML read through _read_vrt_xml so the same cap and env-var override apply on both eager and chunked entry points. Regression test pins the small-cap rejection, the default-cap success, and the raised-cap success for chunks=. --- xrspatial/geotiff/__init__.py | 13 +-- .../test_vrt_xml_size_cap_chunked_1831.py | 86 +++++++++++++++++++ 2 files changed, 94 insertions(+), 5 deletions(-) create mode 100644 xrspatial/geotiff/tests/test_vrt_xml_size_cap_chunked_1831.py diff --git a/xrspatial/geotiff/__init__.py b/xrspatial/geotiff/__init__.py index e1c29f22..4fd230f3 100644 --- a/xrspatial/geotiff/__init__.py +++ b/xrspatial/geotiff/__init__.py @@ -4431,11 +4431,14 @@ def _read_vrt_chunked(source, *, window, band, name, chunks, gpu, dtype, import dask.array as da from ._reader import MAX_PIXELS_DEFAULT - from ._vrt import parse_vrt - - # Parse the VRT XML up-front (cheap; no pixel decode). - with open(source, 'r') as f: - xml_str = f.read() + from ._vrt import _read_vrt_xml, parse_vrt + + # Parse the VRT XML up-front (cheap; no pixel decode). Route through + # ``_read_vrt_xml`` so the 64 MiB ``XRSPATIAL_VRT_MAX_XML_BYTES`` cap + # added in #1818 applies to the chunked dispatcher too; a raw + # ``open().read()`` here would let a multi-GB attacker-supplied VRT + # exhaust memory before any parser-side guard fires (issue #1831). + xml_str = _read_vrt_xml(source) vrt_dir = _os.path.dirname(_os.path.abspath(source)) vrt = parse_vrt(xml_str, vrt_dir) diff --git a/xrspatial/geotiff/tests/test_vrt_xml_size_cap_chunked_1831.py b/xrspatial/geotiff/tests/test_vrt_xml_size_cap_chunked_1831.py new file mode 100644 index 00000000..2a36514d --- /dev/null +++ b/xrspatial/geotiff/tests/test_vrt_xml_size_cap_chunked_1831.py @@ -0,0 +1,86 @@ +"""VRT XML reads from the chunked dispatcher must honor the size cap. + +Regression test for issue #1831: ``read_vrt(path, chunks=...)`` (added in +#1822) parsed the VRT XML with an unbounded ``open().read()``, bypassing +the 64 MiB cap that #1818 added in ``_vrt._read_vrt_xml``. An attacker +supplying a multi-GB VRT file plus a chunked workflow would exhaust +host memory before any parser-side guard fired. + +The eager path is already covered by ``test_vrt_xml_size_cap_1815.py``; +this file pins the same behavior for ``chunks=``. +""" +from __future__ import annotations + +import os + +import numpy as np +import pytest + +from xrspatial.geotiff import read_vrt, to_geotiff + + +def _write_source(td: str) -> str: + src_path = os.path.join(td, 'tmp_1831_src.tif') + to_geotiff(np.zeros((10, 10), dtype=np.uint8), src_path, + compression='none') + return src_path + + +def _write_vrt(td: str, *, pad_bytes: int = 0) -> str: + """Write a VRT, optionally padded with a large XML comment.""" + vrt_path = os.path.join(td, 'tmp_1831_mosaic.vrt') + comment = '' + if pad_bytes > 0: + comment = '\n' + vrt_xml = ( + '\n' + + comment + + ' \n' + ' \n' + ' ' + 'tmp_1831_src.tif\n' + ' 1\n' + ' \n' + ' \n' + ' \n' + ' \n' + '\n' + ) + with open(vrt_path, 'w') as f: + f.write(vrt_xml) + return vrt_path + + +def test_chunked_read_vrt_honors_xml_cap(tmp_path, monkeypatch): + """``read_vrt(chunks=...)`` rejects oversized VRT XML.""" + td = str(tmp_path) + _write_source(td) + # 1 KiB cap, 4 KiB pad. The cap message must reference the env var so + # operators know how to raise it. + monkeypatch.setenv('XRSPATIAL_VRT_MAX_XML_BYTES', '1024') + vrt_path = _write_vrt(td, pad_bytes=4096) + with pytest.raises(ValueError) as exc_info: + read_vrt(vrt_path, chunks=10) + msg = str(exc_info.value) + assert 'XRSPATIAL_VRT_MAX_XML_BYTES' in msg + assert '1,024' in msg + + +def test_chunked_read_vrt_under_default_cap(tmp_path): + """A normal-sized VRT parses successfully under the default cap.""" + td = str(tmp_path) + _write_source(td) + vrt_path = _write_vrt(td) + arr = read_vrt(vrt_path, chunks=10) + assert arr.shape == (10, 10) + assert arr.dtype == np.uint8 + + +def test_chunked_read_vrt_raised_cap_allows_padded(tmp_path, monkeypatch): + """Raising ``XRSPATIAL_VRT_MAX_XML_BYTES`` lets a padded VRT parse.""" + td = str(tmp_path) + _write_source(td) + vrt_path = _write_vrt(td, pad_bytes=4096) + monkeypatch.setenv('XRSPATIAL_VRT_MAX_XML_BYTES', str(1024 * 1024)) + arr = read_vrt(vrt_path, chunks=10) + assert arr.shape == (10, 10)