Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 39 additions & 2 deletions sqlite_utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,15 +215,52 @@ class UpdateWrapper:
def __init__(self, wrapped: io.IOBase, update: Callable[[int], None]) -> None:
    """Wrap a stream so that consuming it reports progress through ``update``.

    Args:
        wrapped: The stream to read from. If it exposes a ``buffer``
            attribute (as text wrappers do), that object is used as the
            source of byte positions.
        update: Callback invoked with the amount of progress just made.
    """
    self._wrapped = wrapped
    self._update = update
    # Why track a byte source at all (see #439): `file_progress` sizes the
    # progress bar by the file's length in bytes, whereas iterating a
    # text-mode stream produces decoded characters. Reporting `len(line)`
    # therefore falls short for multi-byte encodings (roughly 50% for
    # UTF-16-LE, 25% for UTF-32). Following the position of the underlying
    # binary buffer avoids that mismatch.
    binary_layer = getattr(wrapped, "buffer", None)
    start_pos = 0
    if binary_layer is not None:
        try:
            start_pos = binary_layer.tell()
        except (io.UnsupportedOperation, OSError):
            # The position cannot be queried, so behave as though the
            # stream had no binary layer.
            binary_layer = None
    self._byte_source = binary_layer
    self._last_byte_pos = start_pos

def _advance_to_buffer_pos(self) -> None:
    """Report the bytes the binary buffer has consumed since the last report.

    The buffer may have read ahead of what has been handed to the caller,
    so progress follows the buffer's position rather than the decoded
    output. Nothing is reported if the position is unavailable or has not
    moved past the last reported position.
    """
    assert self._byte_source is not None
    try:
        current = self._byte_source.tell()
    except OSError:
        # Position unavailable right now: skip this report instead of failing.
        return
    advanced = current - self._last_byte_pos
    if advanced <= 0:
        return
    self._update(advanced)
    self._last_byte_pos = current

def __iter__(self) -> Iterator[bytes]:
    """Yield each line of the wrapped stream, reporting progress as it goes.

    Without a tracked byte source, progress is the length of each line
    yielded. With one, progress is the movement of the binary buffer's
    position, so the total reported matches the number of bytes consumed
    regardless of the text encoding.
    """
    if self._byte_source is None:
        for line in self._wrapped:
            self._update(len(line))
            yield line
        return
    for line in self._wrapped:
        # Report the byte delta only. Also reporting len(line) here would
        # add the decoded character count on top of the bytes and push the
        # total past the size of the file.
        self._advance_to_buffer_pos()
        yield line
    # The wrapper may have buffered the last chunk without emitting any
    # more lines; flush the remaining bytes so the bar reaches 100%.
    self._advance_to_buffer_pos()

def read(self, size: int = -1) -> bytes:
    """Read from the wrapped stream and report the progress made once.

    Args:
        size: Passed through to the wrapped stream's ``read``; the default
            of -1 is forwarded unchanged.

    Returns:
        Whatever the wrapped stream's ``read`` returned.
    """
    data = self._wrapped.read(size)
    # Exactly one report per read: the byte delta when a binary buffer is
    # tracked, otherwise the length of the data. Reporting len(data) in
    # addition to either branch would count the same read twice.
    if self._byte_source is not None:
        self._advance_to_buffer_pos()
    else:
        self._update(len(data))
    return data


Expand Down
93 changes: 93 additions & 0 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,3 +83,96 @@ def test_maximize_csv_field_size_limit():
)
def test_flatten(input, expected):
assert utils.flatten(input) == expected


# Regression tests for #439: progress bar against multi-byte encodings


def _collect_updates(rows):
    """Exhaust ``rows`` and return the items it yielded as a list.

    The tests call this to drive a wrapper to the end of its input; the
    update values themselves are captured by the callback each test
    passes to the wrapper, not by this helper.
    """
    drained = []
    drained.extend(rows)
    return drained


def _make_temp(content_bytes, tmp_path, name):
    """Write ``content_bytes`` to ``tmp_path / name`` and return that path."""
    target = tmp_path.joinpath(name)
    target.write_bytes(content_bytes)
    return target


def test_updatewrapper_utf8_reports_byte_lengths(tmp_path):
    """Baseline: an ASCII/UTF-8 file reports exactly its byte length."""
    # This case was already correct before the change; it is kept so the
    # existing behaviour stays protected.
    payload = b"a,b\n1,2\n3,4\n"
    csv_path = _make_temp(payload, tmp_path, "in.csv")
    reported = []
    with open(csv_path, "rb") as binary:
        text = io.TextIOWrapper(binary, encoding="utf-8")
        _collect_updates(utils.UpdateWrapper(text, reported.append))
    assert sum(reported) == len(payload)


def test_updatewrapper_utf16le_reports_byte_lengths(tmp_path):
    """UTF-16-LE input must report the raw byte count, not the character count."""
    # Counting decoded characters would stop at half the raw length here,
    # because every character in this payload occupies two bytes.
    payload = "a,b\n1,2\n3,4\n".encode("utf-16-le")
    csv_path = _make_temp(payload, tmp_path, "in.csv")
    reported = []
    with open(csv_path, "rb") as binary:
        text = io.TextIOWrapper(binary, encoding="utf-16-le")
        _collect_updates(utils.UpdateWrapper(text, reported.append))
    assert sum(reported) == len(payload)


def test_updatewrapper_utf16le_with_bom_reaches_total_bytes(tmp_path):
    """A BOM-prefixed UTF-16 file must still report its full byte length."""
    # The BOM is written as an explicit escape so it cannot be lost or
    # overlooked as an invisible character in the source. Encoded as
    # UTF-16-LE it occupies two bytes that the "utf-16" decoder consumes
    # without producing a character, so the reported total must still
    # cover them for the bar to reach 100%.
    raw = "\ufeff" + "a,b\n1,2\n3,4\n"
    raw_bytes = raw.encode("utf-16-le")
    path = _make_temp(raw_bytes, tmp_path, "in.csv")
    updates = []
    with open(path, "rb") as fp:
        wrapper = utils.UpdateWrapper(
            io.TextIOWrapper(fp, encoding="utf-16"), updates.append
        )
        _collect_updates(wrapper)
    assert sum(updates) == len(raw_bytes)


def test_updatewrapper_through_buffered_reader(tmp_path):
    """Progress stays byte-accurate with an extra BufferedReader layer."""
    # Mirrors the --sniff path, which places an io.BufferedReader between
    # the opened file and the TextIOWrapper; the reported total must still
    # equal the size of the binary file.
    payload = "a,b\n1,2\n3,4\n".encode("utf-16-le")
    csv_path = _make_temp(payload, tmp_path, "in.csv")
    reported = []
    with open(csv_path, "rb") as binary:
        text = io.TextIOWrapper(
            io.BufferedReader(binary, buffer_size=4096), encoding="utf-16-le"
        )
        _collect_updates(utils.UpdateWrapper(text, reported.append))
    assert sum(reported) == len(payload)


def test_updatewrapper_binary_file_unchanged(tmp_path):
    """A binary file wrapped directly keeps the original length-based counting."""
    # The file object has no .buffer attribute, iteration yields bytes,
    # and len() of each line is therefore already a byte count.
    payload = b"a,b\n1,2\n3,4\n"
    csv_path = _make_temp(payload, tmp_path, "in.csv")
    reported = []
    with open(csv_path, "rb") as binary:
        _collect_updates(utils.UpdateWrapper(binary, reported.append))
    assert sum(reported) == len(payload)


def test_updatewrapper_read_path_utf16le(tmp_path):
    """read() must account for bytes the same way iteration does."""
    # The JSON loader goes through .read() rather than iterating lines.
    payload = '{"a": 1}'.encode("utf-16-le")
    json_path = _make_temp(payload, tmp_path, "in.json")
    reported = []
    with open(json_path, "rb") as binary:
        text = io.TextIOWrapper(binary, encoding="utf-16-le")
        utils.UpdateWrapper(text, reported.append).read()
    assert sum(reported) == len(payload)
Loading