Skip to content

Commit 232ea59

Browse files
committed
gh-151308: Avoid huge pre-allocation in wave.readframes() for crafted files
A WAV data chunk records its size in a 4-byte header field that is not validated against the data actually present in the file. A small, truncated, or maliciously crafted file could therefore claim a chunk of several gigabytes and make wave.Wave_read.readframes() pre-allocate that much memory via a single file.read(chunksize) call, leading to a MemoryError (or memory exhaustion) from a tiny input. When the underlying file is seekable, clamp each read in the internal _Chunk.read() to the number of bytes physically available, so we never allocate more than the file can actually provide. The data returned for valid files is unchanged.
1 parent 5b38519 commit 232ea59

3 files changed

Lines changed: 57 additions & 0 deletions

File tree

Lib/test/test_wave.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,20 @@
1010
import wave
1111

1212

13+
class _ReadSizeRecorder(io.BytesIO):
14+
# A seekable file that remembers the largest size ever passed to read(),
15+
# so a test can check that wave does not request far more data than the
16+
# file actually holds (which on a real file would pre-allocate it).
17+
def __init__(self, *args, **kwargs):
18+
super().__init__(*args, **kwargs)
19+
self.max_read_size = 0
20+
21+
def read(self, size=-1):
22+
if size is not None and size >= 0:
23+
self.max_read_size = max(self.max_read_size, size)
24+
return super().read(size)
25+
26+
1327
class WaveTest(audiotests.AudioWriteTests,
1428
audiotests.AudioTestsWithSourceFile):
1529
module = wave
@@ -333,6 +347,23 @@ def test_read_wrong_sample_width(self):
333347
with self.assertRaisesRegex(wave.Error, 'bad sample width'):
334348
wave.open(io.BytesIO(b))
335349

350+
def test_read_data_chunk_size_larger_than_file(self):
    # gh-151308: the size field of a data chunk header can claim far more
    # data than the file really holds.  readframes() must not pass that
    # claimed size down to the file (where a real file would pre-allocate
    # it); on a seekable file each read is clamped to what is available.
    payload = b'\x00' * 10
    # Bogus size field: ~4 GiB, versus the 10 payload bytes present.
    bogus_size = struct.pack('<L', 0xFFFFFFFF)
    fmt_fields = struct.pack('<LHHLLHH', 16, 1, 1, 11025, 11025, 1, 8)
    image = b''.join((
        b'RIFF', bogus_size, b'WAVE',
        b'fmt ', fmt_fields,
        b'data', bogus_size,
        payload,
    ))
    recorder = _ReadSizeRecorder(image)
    with wave.open(recorder, 'rb') as reader:
        frames = reader.readframes(reader.getnframes())
    self.assertEqual(frames, payload)
    # The bogus ~4 GiB size must never reach the underlying read().
    self.assertLessEqual(recorder.max_read_size, len(image))
366+
336367
def test_open_in_write_raises(self):
337368
# gh-136523: Wave_write.__del__ should not throw
338369
with support.catch_unraisable_exception() as cm:

Lib/wave.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -189,6 +189,28 @@ def read(self, size=-1):
189189
size = self.chunksize - self.size_read
190190
if size > self.chunksize - self.size_read:
191191
size = self.chunksize - self.size_read
192+
# The chunk size comes from the file header and is not trustworthy:
193+
# a truncated or maliciously crafted file can claim a size far larger
194+
# than the data actually present, which would make the read() below
195+
# pre-allocate that many bytes (gh-151308). When the underlying file
196+
# is seekable, clamp the request to the bytes physically available so
197+
# we never allocate more than the file can provide. This leaves the
198+
# data returned for valid files unchanged, since the requested bytes
199+
# are always present. We probe with tell()/seek() rather than trust
200+
# seekable(), since some file objects report being seekable yet raise
201+
# on the actual call; on any failure we fall back to the original
202+
# behaviour. This also covers the nested chunks, whose seek() returns
203+
# None (so the clamp is skipped) and which read through the raw file.
204+
if size > 0:
205+
try:
206+
here = self.file.tell()
207+
end = self.file.seek(0, 2)
208+
self.file.seek(here, 0)
209+
except (OSError, ValueError):
210+
pass
211+
else:
212+
if isinstance(end, int):
213+
size = min(size, max(0, end - here))
192214
data = self.file.read(size)
193215
self.size_read = self.size_read + len(data)
194216
if self.size_read == self.chunksize and \
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
:meth:`wave.Wave_read.readframes` no longer attempts to pre-allocate a huge
2+
buffer when the data chunk header of a truncated or malformed WAV file claims
3+
a size larger than the file actually contains. When the underlying file is
4+
seekable, reads are now clamped to the number of bytes really available.

0 commit comments

Comments
 (0)