simonw · LeSingh1 · May 18, 2026
diff --git a/sqlite_utils/utils.py b/sqlite_utils/utils.py
@@ -215,15 +215,52 @@ class UpdateWrapper:
     def __init__(self, wrapped: io.IOBase, update: Callable[[int], None]) -> None:
         self._wrapped = wrapped
         self._update = update
+        # `file_progress` sets the progress bar length to the file size in
+        # bytes, but iterating a text-mode stream yields decoded characters,
+        # so reporting `len(line)` undercounts for any multi-byte encoding
+        # (UTF-16-LE caps the bar at ~50%, UTF-32 at ~25%, etc.). When the
+        # wrapped object is a text wrapper, track progress against the
+        # underlying binary buffer's position instead. See #439.
+        self._byte_source = getattr(wrapped, "buffer", None)
+        self._last_byte_pos = 0
+        if self._byte_source is not None:
+            try:
+                self._last_byte_pos = self._byte_source.tell()
+            except (io.UnsupportedOperation, OSError):
+                self._byte_source = None
+
+    def _advance_to_buffer_pos(self) -> None:
+        # Bring the progress bar up to the current byte position of the
+        # underlying binary buffer (which may have read ahead).
+        assert self._byte_source is not None
+        try:
+            pos = self._byte_source.tell()
+        except OSError:
+            return
+        delta = pos - self._last_byte_pos
+        if delta > 0:
+            self._update(delta)
+            self._last_byte_pos = pos
 
     def __iter__(self) -> Iterator[bytes]:
+        if self._byte_source is None:
+            for line in self._wrapped:
+                self._update(len(line))
+                yield line
+            return
         for line in self._wrapped:
-            self._update(len(line))
+            self._advance_to_buffer_pos()
             yield line
+        # The wrapper may have buffered the last chunk without emitting any
+        # more lines; flush the remaining bytes so the bar reaches 100%.
+        self._advance_to_buffer_pos()
 
     def read(self, size: int = -1) -> bytes:
         data = self._wrapped.read(size)
-        self._update(len(data))
+        if self._byte_source is not None:
+            self._advance_to_buffer_pos()
+        else:
+            self._update(len(data))
         return data
 
 

diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -83,3 +83,96 @@ def test_maximize_csv_field_size_limit():
 )
 def test_flatten(input, expected):
     assert utils.flatten(input) == expected
+
+
+# Regression tests for #439: progress bar against multi-byte encodings
+
+
+def _collect_updates(rows):
+    """Exhaust the wrapper so every update() callback fires; return the rows."""
+    return list(rows)
+
+
+def _make_temp(content_bytes, tmp_path, name):
+    path = tmp_path / name
+    path.write_bytes(content_bytes)
+    return path
+
+
+def test_updatewrapper_utf8_reports_byte_lengths(tmp_path):
+    # Sanity: ASCII / UTF-8 still hits 100% (this was already correct,
+    # but we want a baseline to protect.)
+    raw = b"a,b\n1,2\n3,4\n"
+    path = _make_temp(raw, tmp_path, "in.csv")
+    updates = []
+    with open(path, "rb") as fp:
+        wrapper = utils.UpdateWrapper(io.TextIOWrapper(fp, encoding="utf-8"), updates.append)
+        _collect_updates(wrapper)
+    assert sum(updates) == len(raw)
+
+
+def test_updatewrapper_utf16le_reports_byte_lengths(tmp_path):
+    # Without the fix this test fails: the bar only reaches len(decoded)
+    # which is half the raw byte length for UTF-16-LE.
+    raw = "a,b\n1,2\n3,4\n".encode("utf-16-le")
+    path = _make_temp(raw, tmp_path, "in.csv")
+    updates = []
+    with open(path, "rb") as fp:
+        wrapper = utils.UpdateWrapper(io.TextIOWrapper(fp, encoding="utf-16-le"), updates.append)
+        _collect_updates(wrapper)
+    assert sum(updates) == len(raw)
+
+
+def test_updatewrapper_utf16le_with_bom_reaches_total_bytes(tmp_path):
+    # BOM-prefixed UTF-16. The two-byte BOM is consumed by the decoder and
+    # never appears in the yielded text; we should still account for the
+    # full file size so the bar reaches 100%.
+    raw = "\ufeff" + "a,b\n1,2\n3,4\n"
+    raw_bytes = raw.encode("utf-16-le")
+    path = _make_temp(raw_bytes, tmp_path, "in.csv")
+    updates = []
+    with open(path, "rb") as fp:
+        wrapper = utils.UpdateWrapper(io.TextIOWrapper(fp, encoding="utf-16"), updates.append)
+        _collect_updates(wrapper)
+    assert sum(updates) == len(raw_bytes)
+
+
+def test_updatewrapper_through_buffered_reader(tmp_path):
+    # The --sniff path wraps the raw file in io.BufferedReader before the
+    # TextIOWrapper. Progress reporting must still resolve to the binary
+    # file's byte count.
+    raw = "a,b\n1,2\n3,4\n".encode("utf-16-le")
+    path = _make_temp(raw, tmp_path, "in.csv")
+    updates = []
+    with open(path, "rb") as fp:
+        buffered = io.BufferedReader(fp, buffer_size=4096)
+        wrapper = utils.UpdateWrapper(
+            io.TextIOWrapper(buffered, encoding="utf-16-le"), updates.append
+        )
+        _collect_updates(wrapper)
+    assert sum(updates) == len(raw)
+
+
+def test_updatewrapper_binary_file_unchanged(tmp_path):
+    # If the wrapped object is itself a raw binary file (no .buffer attr),
+    # we should keep the old behaviour: iterate yields bytes and len() is
+    # already the byte count.
+    raw = b"a,b\n1,2\n3,4\n"
+    path = _make_temp(raw, tmp_path, "in.csv")
+    updates = []
+    with open(path, "rb") as fp:
+        wrapper = utils.UpdateWrapper(fp, updates.append)
+        _collect_updates(wrapper)
+    assert sum(updates) == len(raw)
+
+
+def test_updatewrapper_read_path_utf16le(tmp_path):
+    # The .read() path is used by the JSON loader (not the CSV iterator),
+    # but must agree with the iterator path on byte accounting.
+    raw = '{"a": 1}'.encode("utf-16-le")
+    path = _make_temp(raw, tmp_path, "in.json")
+    updates = []
+    with open(path, "rb") as fp:
+        wrapper = utils.UpdateWrapper(io.TextIOWrapper(fp, encoding="utf-16-le"), updates.append)
+        wrapper.read()
+    assert sum(updates) == len(raw)