Skip to content

Commit 589b54e

Browse files
TomiBelan and miss-islington
authored and committed
gh-121109: Fix performance of tarfile reading with "r|*" (GH-121296)
(cherry picked from commit 6d7a19e) Co-authored-by: Tomi Belan <tomi.belan@gmail.com>
1 parent 2f91315 commit 589b54e

3 files changed

Lines changed: 39 additions & 22 deletions

File tree

Lib/tarfile.py

Lines changed: 36 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -380,7 +380,6 @@ def __init__(self, name, mode, comptype, fileobj, bufsize,
380380
except ImportError:
381381
raise CompressionError("bz2 module is not available") from None
382382
if mode == "r":
383-
self.dbuf = b""
384383
self.cmp = bz2.BZ2Decompressor()
385384
self.exception = OSError
386385
else:
@@ -392,7 +391,6 @@ def __init__(self, name, mode, comptype, fileobj, bufsize,
392391
except ImportError:
393392
raise CompressionError("lzma module is not available") from None
394393
if mode == "r":
395-
self.dbuf = b""
396394
self.cmp = lzma.LZMADecompressor()
397395
self.exception = lzma.LZMAError
398396
else:
@@ -403,7 +401,6 @@ def __init__(self, name, mode, comptype, fileobj, bufsize,
403401
except ImportError:
404402
raise CompressionError("compression.zstd module is not available") from None
405403
if mode == "r":
406-
self.dbuf = b""
407404
self.cmp = zstd.ZstdDecompressor()
408405
self.exception = zstd.ZstdError
409406
else:
@@ -485,7 +482,6 @@ def _init_read_gz(self):
485482
"""Initialize for reading a gzip compressed fileobj.
486483
"""
487484
self.cmp = self.zlib.decompressobj(-self.zlib.MAX_WBITS)
488-
self.dbuf = b""
489485

490486
# taken from gzip.GzipFile with some alterations
491487
if self.__read(2) != b"\037\213":
@@ -543,26 +539,44 @@ def _read(self, size):
543539
if self.comptype == "tar":
544540
return self.__read(size)
545541

546-
c = len(self.dbuf)
547-
t = [self.dbuf]
542+
c = 0
543+
t = []
548544
while c < size:
549-
# Skip underlying buffer to avoid unaligned double buffering.
550-
if self.buf:
551-
buf = self.buf
552-
self.buf = b""
545+
if self.comptype == "gz":
546+
# zlib interface is different than others.
547+
# It returns data in unconsumed_tail.
548+
if self.buf:
549+
cbuf = self.buf
550+
self.buf = b""
551+
else:
552+
cbuf = self.fileobj.read(self.bufsize)
553+
if not cbuf:
554+
break
555+
556+
try:
557+
dbuf = self.cmp.decompress(cbuf, size - c)
558+
self.buf = self.cmp.unconsumed_tail
559+
except self.exception as e:
560+
raise ReadError("invalid compressed data") from e
553561
else:
554-
buf = self.fileobj.read(self.bufsize)
555-
if not buf:
556-
break
557-
try:
558-
buf = self.cmp.decompress(buf)
559-
except self.exception as e:
560-
raise ReadError("invalid compressed data") from e
561-
t.append(buf)
562-
c += len(buf)
563-
t = b"".join(t)
564-
self.dbuf = t[size:]
565-
return t[:size]
562+
# Other decompressors have needs_input.
563+
# decompress() can buffer data internally.
564+
if self.cmp.needs_input:
565+
cbuf = self.fileobj.read(self.bufsize)
566+
if not cbuf:
567+
break
568+
else:
569+
cbuf = b""
570+
571+
try:
572+
dbuf = self.cmp.decompress(cbuf, size - c)
573+
except self.exception as e:
574+
raise ReadError("invalid compressed data") from e
575+
576+
t.append(dbuf)
577+
c += len(dbuf)
578+
579+
return b"".join(t)
566580

567581
def __read(self, size):
568582
"""Return size bytes from stream. If internal buffer is empty,

Misc/ACKS

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -144,6 +144,7 @@ Bas van Beek
144144
Ian Beer
145145
Stefan Behnel
146146
Reimer Behrends
147+
Tomi Belan
147148
Maxime Bélanger
148149
Ben Bell
149150
Thomas Bellman
Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,2 @@
1+
Fix :mod:`tarfile` performance issue when reading archives in streaming mode
2+
(e.g. ``r|*``).

0 commit comments

Comments
 (0)