From 7c3dd30dfd127fce51a5595bbd212144dc1e3492 Mon Sep 17 00:00:00 2001 From: LeSingh1 Date: Mon, 18 May 2026 16:29:36 -0700 Subject: [PATCH] Fix progress bar against non-UTF-8 encodings For #439. `file_progress()` sets the progress bar length to the file size in bytes (`os.path.getsize(file.name)`), but `UpdateWrapper.__iter__` called `update(len(line))`, which is the decoded character count. With UTF-16-LE input every character is 2 bytes, so the bar capped at 50%; UTF-32 capped at 25%; etc. Simon noted in the issue that the obvious fix (calling `.tell()` on the wrapped text stream) doesn't work because text mode disables it during iteration. The underlying binary buffer doesn't have that restriction though, so this tracks progress against `TextIOWrapper.buffer.tell()` when the wrapped object exposes one. For raw binary streams (no `.buffer` attribute) we keep the old behaviour, which was already byte-accurate. Added six regression tests in tests/test_utils.py covering UTF-8, UTF-16-LE, BOM-prefixed UTF-16, the sniff-style `BufferedReader` chain, a raw binary fallback, and the `.read()` path used by the JSON loader. Each asserts that the sum of update() calls equals the on-disk file size, which is what `click.progressbar` needs to reach 100%. --- sqlite_utils/utils.py | 41 ++++++++++++++++++- tests/test_utils.py | 93 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 132 insertions(+), 2 deletions(-) diff --git a/sqlite_utils/utils.py b/sqlite_utils/utils.py index 0ca98fc9a..fb254c8d6 100644 --- a/sqlite_utils/utils.py +++ b/sqlite_utils/utils.py @@ -215,15 +215,52 @@ class UpdateWrapper: def __init__(self, wrapped: io.IOBase, update: Callable[[int], None]) -> None: self._wrapped = wrapped self._update = update + # `file_progress` sets the progress bar length to the file size in + # bytes, but iterating a text-mode stream yields decoded characters, + # so reporting `len(line)` undercounts for any multi-byte encoding + # (UTF-16-LE caps the bar at ~50%, UTF-32 at ~25%, etc.). When the + # wrapped object is a text wrapper, track progress against the + # underlying binary buffer's position instead. See #439. + self._byte_source = getattr(wrapped, "buffer", None) + self._last_byte_pos = 0 + if self._byte_source is not None: + try: + self._last_byte_pos = self._byte_source.tell() + except (io.UnsupportedOperation, OSError): + self._byte_source = None + + def _advance_to_buffer_pos(self) -> None: + # Bring the progress bar up to the current byte position of the + # underlying binary buffer (which may have read ahead). + assert self._byte_source is not None + try: + pos = self._byte_source.tell() + except OSError: + return + delta = pos - self._last_byte_pos + if delta > 0: + self._update(delta) + self._last_byte_pos = pos def __iter__(self) -> Iterator[bytes]: + if self._byte_source is None: + for line in self._wrapped: + self._update(len(line)) + yield line + return for line in self._wrapped: - self._update(len(line)) + self._advance_to_buffer_pos() yield line + # The wrapper may have buffered the last chunk without emitting any + # more lines; flush the remaining bytes so the bar reaches 100%. + self._advance_to_buffer_pos() def read(self, size: int = -1) -> bytes: data = self._wrapped.read(size) - self._update(len(data)) + if self._byte_source is not None: + self._advance_to_buffer_pos() + else: + self._update(len(data)) return data diff --git a/tests/test_utils.py b/tests/test_utils.py index f728bcd38..5cbd836ae 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -83,3 +83,96 @@ def test_maximize_csv_field_size_limit(): ) def test_flatten(input, expected): assert utils.flatten(input) == expected + + +# Regression tests for #439: progress bar against multi-byte encodings + + +def _collect_updates(rows): + """Iterate the wrapper, capturing every update() value.""" + return list(rows) + + +def _make_temp(content_bytes, tmp_path, name): + path = tmp_path / name + path.write_bytes(content_bytes) + return path + + +def test_updatewrapper_utf8_reports_byte_lengths(tmp_path): + # Sanity: ASCII / UTF-8 still hits 100% (this was already correct, + # but we want a baseline to protect.) + raw = b"a,b\n1,2\n3,4\n" + path = _make_temp(raw, tmp_path, "in.csv") + updates = [] + with open(path, "rb") as fp: + wrapper = utils.UpdateWrapper(io.TextIOWrapper(fp, encoding="utf-8"), updates.append) + _collect_updates(wrapper) + assert sum(updates) == len(raw) + + +def test_updatewrapper_utf16le_reports_byte_lengths(tmp_path): + # Without the fix this test fails: the bar only reaches len(decoded) + # which is half the raw byte length for UTF-16-LE. + raw = "a,b\n1,2\n3,4\n".encode("utf-16-le") + path = _make_temp(raw, tmp_path, "in.csv") + updates = [] + with open(path, "rb") as fp: + wrapper = utils.UpdateWrapper(io.TextIOWrapper(fp, encoding="utf-16-le"), updates.append) + _collect_updates(wrapper) + assert sum(updates) == len(raw) + + +def test_updatewrapper_utf16le_with_bom_reaches_total_bytes(tmp_path): + # BOM-prefixed UTF-16. The BOM byte is consumed by the TextIOWrapper + # before iteration starts; we should still account for the full file + # size so the bar reaches 100%. + raw = "" + "a,b\n1,2\n3,4\n" + raw_bytes = raw.encode("utf-16-le") + path = _make_temp(raw_bytes, tmp_path, "in.csv") + updates = [] + with open(path, "rb") as fp: + wrapper = utils.UpdateWrapper(io.TextIOWrapper(fp, encoding="utf-16"), updates.append) + _collect_updates(wrapper) + assert sum(updates) == len(raw_bytes) + + +def test_updatewrapper_through_buffered_reader(tmp_path): + # The --sniff path wraps the raw file in io.BufferedReader before the + # TextIOWrapper. Progress reporting must still resolve to the binary + # file's byte count. + raw = "a,b\n1,2\n3,4\n".encode("utf-16-le") + path = _make_temp(raw, tmp_path, "in.csv") + updates = [] + with open(path, "rb") as fp: + buffered = io.BufferedReader(fp, buffer_size=4096) + wrapper = utils.UpdateWrapper( + io.TextIOWrapper(buffered, encoding="utf-16-le"), updates.append + ) + _collect_updates(wrapper) + assert sum(updates) == len(raw) + + +def test_updatewrapper_binary_file_unchanged(tmp_path): + # If the wrapped object is itself a raw binary file (no .buffer attr), + # we should keep the old behaviour: iterate yields bytes and len() is + # already the byte count. + raw = b"a,b\n1,2\n3,4\n" + path = _make_temp(raw, tmp_path, "in.csv") + updates = [] + with open(path, "rb") as fp: + wrapper = utils.UpdateWrapper(fp, updates.append) + _collect_updates(wrapper) + assert sum(updates) == len(raw) + + +def test_updatewrapper_read_path_utf16le(tmp_path): + # The .read() path is used by the JSON loader (not the CSV iterator), + # but must agree with the iterator path on byte accounting. + raw = '{"a": 1}'.encode("utf-16-le") + path = _make_temp(raw, tmp_path, "in.json") + updates = [] + with open(path, "rb") as fp: + wrapper = utils.UpdateWrapper(io.TextIOWrapper(fp, encoding="utf-16-le"), updates.append) + wrapper.read() + assert sum(updates) == len(raw)