diff --git a/Lib/test/test_wave.py b/Lib/test/test_wave.py index d3723c04820d9d4..709db77fb65d5a3 100644 --- a/Lib/test/test_wave.py +++ b/Lib/test/test_wave.py @@ -10,6 +10,30 @@ import wave +class _ReadSizeRecorder(io.BytesIO): + # A seekable file that remembers the largest size ever passed to read() + # (so a test can check that wave does not request far more data than the + # file actually holds, which on a real file would pre-allocate it), and + # that rejects seeks to offsets overflowing a C ssize_t the way a 32-bit + # platform such as WASI does (so a test can check that wave never seeks + # to an untrusted chunk size). + _SSIZE_MAX = (1 << 31) - 1 + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.max_read_size = 0 + + def read(self, size=-1): + if size is not None and size >= 0: + self.max_read_size = max(self.max_read_size, size) + return super().read(size) + + def seek(self, pos, whence=0): + if abs(pos) > self._SSIZE_MAX: + raise OverflowError("Python int too large to convert to C ssize_t") + return super().seek(pos, whence) + + class WaveTest(audiotests.AudioWriteTests, audiotests.AudioTestsWithSourceFile): module = wave @@ -333,6 +357,25 @@ def test_read_wrong_sample_width(self): with self.assertRaisesRegex(wave.Error, 'bad sample width'): wave.open(io.BytesIO(b)) + def test_read_data_chunk_size_larger_than_file(self): + # gh-151308: a data chunk header may claim far more data than the + # file actually contains. readframes() must not request (and so, + # on a real file, pre-allocate) the claimed size; reads on a + # seekable file are clamped to the bytes actually available. + real_data = b'\x00' * 10 + b = b'RIFF' + struct.pack(' self.chunksize - self.size_read: size = self.chunksize - self.size_read + # The chunk size comes from the file header and is not trustworthy: + # a truncated or maliciously crafted file can claim a size far larger + # than the data actually present, which would make the read() below + # pre-allocate that many bytes (gh-151308). When the underlying file + # is seekable, clamp the request to the bytes physically available so + # we never allocate more than the file can provide. This leaves the + # data returned for valid files unchanged, since the requested bytes + # are always present. We probe with tell()/seek() rather than trust + # seekable(), since some file objects report being seekable yet raise + # on the actual call; on any failure we fall back to the original + # behaviour. We only probe the raw file object, never a parent + # _Chunk: seeking a _Chunk would seek the raw file to its (untrusted) + # chunk size, which may overflow on 32-bit platforms. Clamping the + # raw read protects the nested chunks too, as they read through it. + if size > 0 and not isinstance(self.file, _Chunk): + try: + here = self.file.tell() + end = self.file.seek(0, 2) + self.file.seek(here, 0) + except (OSError, ValueError): + pass + else: + if isinstance(end, int): + size = min(size, max(0, end - here)) data = self.file.read(size) self.size_read = self.size_read + len(data) if self.size_read == self.chunksize and \ diff --git a/Misc/NEWS.d/next/Library/2026-06-15-13-04-03.gh-issue-151308.5gc0g-.rst b/Misc/NEWS.d/next/Library/2026-06-15-13-04-03.gh-issue-151308.5gc0g-.rst new file mode 100644 index 000000000000000..41891f5794c0459 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2026-06-15-13-04-03.gh-issue-151308.5gc0g-.rst @@ -0,0 +1,4 @@ +:meth:`wave.Wave_read.readframes` no longer attempts to pre-allocate a huge +buffer when the data chunk header of a truncated or malformed WAV file claims +a size larger than the file actually contains. When the underlying file is +seekable, reads are now clamped to the number of bytes really available.