Skip to content

Commit 6b72dd4

Browse files
committed
gh-151308: Avoid huge pre-allocation in wave.readframes() for crafted files
A WAV data chunk records its size in a 4-byte header field that is not validated against the data actually present in the file. A small, truncated, or maliciously crafted file could therefore claim a chunk of several gigabytes and make wave.Wave_read.readframes() pre-allocate that much memory via a single file.read(chunksize) call, leading to a MemoryError (or memory exhaustion) from a tiny input. When the underlying file is seekable, clamp each read in the internal _Chunk.read() to the number of bytes physically available, so we never allocate more than the file can actually provide. The data returned for valid files is unchanged.
1 parent 5b38519 commit 6b72dd4

3 files changed

Lines changed: 71 additions & 0 deletions

File tree

Lib/test/test_wave.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,30 @@
1010
import wave
1111

1212

13+
class _ReadSizeRecorder(io.BytesIO):
14+
# A seekable file that remembers the largest size ever passed to read()
15+
# (so a test can check that wave does not request far more data than the
16+
# file actually holds, which on a real file would pre-allocate it), and
17+
# that rejects seeks to offsets overflowing a C ssize_t the way a 32-bit
18+
# platform such as WASI does (so a test can check that wave never seeks
19+
# to an untrusted chunk size).
20+
_SSIZE_MAX = (1 << 31) - 1
21+
22+
def __init__(self, *args, **kwargs):
23+
super().__init__(*args, **kwargs)
24+
self.max_read_size = 0
25+
26+
def read(self, size=-1):
27+
if size is not None and size >= 0:
28+
self.max_read_size = max(self.max_read_size, size)
29+
return super().read(size)
30+
31+
def seek(self, pos, whence=0):
32+
if abs(pos) > self._SSIZE_MAX:
33+
raise OverflowError("Python int too large to convert to C ssize_t")
34+
return super().seek(pos, whence)
35+
36+
1337
class WaveTest(audiotests.AudioWriteTests,
1438
audiotests.AudioTestsWithSourceFile):
1539
module = wave
@@ -333,6 +357,25 @@ def test_read_wrong_sample_width(self):
333357
with self.assertRaisesRegex(wave.Error, 'bad sample width'):
334358
wave.open(io.BytesIO(b))
335359

360+
def test_read_data_chunk_size_larger_than_file(self):
    # gh-151308: a data chunk header may claim far more data than the
    # file actually contains.  readframes() must not request (and so,
    # on a real file, pre-allocate) the claimed size; reads on a
    # seekable file are clamped to the bytes actually available.
    payload = b'\x00' * 10
    bogus_size = struct.pack('<L', 0xFFFFFFFF)  # ~4 GiB, far beyond the file
    fmt_chunk = b'fmt ' + struct.pack('<LHHLLHH',
                                      16, 1, 1, 11025, 11025, 1, 8)
    b = b''.join([
        b'RIFF', bogus_size, b'WAVE',
        fmt_chunk,
        b'data', bogus_size,
        payload,
    ])
    # _ReadSizeRecorder also raises OverflowError on a huge seek offset,
    # so this exercises the 32-bit (e.g. WASI) path too.
    recorder = _ReadSizeRecorder(b)
    with wave.open(recorder, 'rb') as reader:
        frames = reader.readframes(reader.getnframes())
    # Only the bytes physically present come back.
    self.assertEqual(frames, payload)
    # The bogus ~4 GiB size must never reach the underlying read().
    self.assertLessEqual(recorder.max_read_size, len(b))
378+
336379
def test_open_in_write_raises(self):
337380
# gh-136523: Wave_write.__del__ should not throw
338381
with support.catch_unraisable_exception() as cm:

Lib/wave.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -189,6 +189,30 @@ def read(self, size=-1):
189189
size = self.chunksize - self.size_read
190190
if size > self.chunksize - self.size_read:
191191
size = self.chunksize - self.size_read
192+
# The chunk size comes from the file header and is not trustworthy:
193+
# a truncated or maliciously crafted file can claim a size far larger
194+
# than the data actually present, which would make the read() below
195+
# pre-allocate that many bytes (gh-151308). When the underlying file
196+
# is seekable, clamp the request to the bytes physically available so
197+
# we never allocate more than the file can provide. This leaves the
198+
# data returned for valid files unchanged, since the requested bytes
199+
# are always present. We probe with tell()/seek() rather than trust
200+
# seekable(), since some file objects report being seekable yet raise
201+
# on the actual call; on any failure we fall back to the original
202+
# behaviour. We only probe the raw file object, never a parent
203+
# _Chunk: seeking a _Chunk would seek the raw file to its (untrusted)
204+
# chunk size, which may overflow on 32-bit platforms. Clamping the
205+
# raw read protects the nested chunks too, as they read through it.
206+
if size > 0 and not isinstance(self.file, _Chunk):
207+
try:
208+
here = self.file.tell()
209+
end = self.file.seek(0, 2)
210+
self.file.seek(here, 0)
211+
except (OSError, ValueError):
212+
pass
213+
else:
214+
if isinstance(end, int):
215+
size = min(size, max(0, end - here))
192216
data = self.file.read(size)
193217
self.size_read = self.size_read + len(data)
194218
if self.size_read == self.chunksize and \
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
:meth:`wave.Wave_read.readframes` no longer attempts to pre-allocate a huge
2+
buffer when the data chunk header of a truncated or malformed WAV file claims
3+
a size larger than the file actually contains. When the underlying file is
4+
seekable, reads are now clamped to the number of bytes really available.

0 commit comments

Comments
 (0)