From 5c73eeda2439da105c401e85dd7f2b4e11f846d7 Mon Sep 17 00:00:00 2001 From: jmestwa-coder Date: Sat, 23 May 2026 16:03:20 +0530 Subject: [PATCH 1/3] reject too-short statistics values in FormatStatValue --- cpp/src/parquet/types.cc | 37 +++++++++++++++++++++++++++++++++++ cpp/src/parquet/types_test.cc | 23 ++++++++++++++++++++++ 2 files changed, 60 insertions(+) diff --git a/cpp/src/parquet/types.cc b/cpp/src/parquet/types.cc index fb4eb92a7544..824913117b5f 100644 --- a/cpp/src/parquet/types.cc +++ b/cpp/src/parquet/types.cc @@ -184,6 +184,43 @@ std::string FormatFloat16Value(::std::string_view val) { std::string FormatStatValue(Type::type parquet_type, ::std::string_view val, const std::shared_ptr& logical_type) { + // Statistics blobs come from the file's Thrift-encoded metadata and may have + // arbitrary length under a malicious writer. Reject any value that is shorter + // than what the physical (and, for FLOAT16, logical) type requires so the + // memcpy/byte loads below cannot run past the buffer. + size_t required = 0; + switch (parquet_type) { + case Type::BOOLEAN: + required = sizeof(bool); + break; + case Type::INT32: + case Type::FLOAT: + required = 4; + break; + case Type::INT64: + case Type::DOUBLE: + required = 8; + break; + case Type::INT96: + required = 3 * sizeof(int32_t); + break; + case Type::BYTE_ARRAY: + case Type::FIXED_LEN_BYTE_ARRAY: + if (logical_type != nullptr && logical_type->is_float16()) { + required = 2; + } + break; + case Type::UNDEFINED: + default: + break; + } + if (val.size() < required) { + std::stringstream ss; + ss << "Statistics value of " << val.size() << " bytes is too small for " + << TypeToString(parquet_type); + throw ParquetException(ss.str()); + } + std::stringstream result; const char* bytes = val.data(); switch (parquet_type) { diff --git a/cpp/src/parquet/types_test.cc b/cpp/src/parquet/types_test.cc index 6c77662d58f7..6803dfd136a0 100644 --- a/cpp/src/parquet/types_test.cc +++ b/cpp/src/parquet/types_test.cc @@ -20,6 +20,7 @@ #include #include "arrow/util/endian.h" +#include "parquet/exception.h" #include "parquet/types.h" namespace parquet { @@ -179,6 +180,28 @@ TEST(TypePrinter, StatisticsTypes) { FormatStatValue(Type::FIXED_LEN_BYTE_ARRAY, smin, LogicalType::Float16())); } +TEST(TypePrinter, StatisticsTypesShortValue) { + // A maliciously crafted Parquet file may set a statistics min/max shorter + // than the column's physical type. FormatStatValue must reject it instead of + // memcpy'ing past the buffer. + ASSERT_THROW(FormatStatValue(Type::BOOLEAN, std::string()), ParquetException); + ASSERT_THROW(FormatStatValue(Type::INT32, std::string("abc")), ParquetException); + ASSERT_THROW(FormatStatValue(Type::INT64, std::string("abcdefg")), ParquetException); + ASSERT_THROW(FormatStatValue(Type::FLOAT, std::string()), ParquetException); + ASSERT_THROW(FormatStatValue(Type::DOUBLE, std::string("1234567")), ParquetException); + ASSERT_THROW(FormatStatValue(Type::INT96, std::string("12345678901")), + ParquetException); + ASSERT_THROW(FormatStatValue(Type::INT32, std::string("abc"), + LogicalType::Decimal(6, 2)), + ParquetException); + ASSERT_THROW(FormatStatValue(Type::INT64, std::string("abcdefg"), + LogicalType::Decimal(18, 4)), + ParquetException); + ASSERT_THROW(FormatStatValue(Type::FIXED_LEN_BYTE_ARRAY, std::string("a"), + LogicalType::Float16()), + ParquetException); +} + TEST(TestInt96Timestamp, Decoding) { auto check = [](int32_t julian_day, uint64_t nanoseconds) { #if ARROW_LITTLE_ENDIAN From 4897dee3afc563dd78f9d9225f8642aca327e41b Mon Sep 17 00:00:00 2001 From: jmestwa-coder Date: Wed, 27 May 2026 00:59:55 +0530 Subject: [PATCH 2/3] Group BYTE_ARRAY with skipped types in stat length guard --- cpp/src/parquet/types.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/parquet/types.cc b/cpp/src/parquet/types.cc index 824913117b5f..3aee0a57fe63 100644 --- a/cpp/src/parquet/types.cc +++ b/cpp/src/parquet/types.cc @@ -204,12 +204,12 @@ std::string FormatStatValue(Type::type parquet_type, ::std::string_view val, case Type::INT96: required = 3 * sizeof(int32_t); break; - case Type::BYTE_ARRAY: case Type::FIXED_LEN_BYTE_ARRAY: if (logical_type != nullptr && logical_type->is_float16()) { required = 2; } break; + case Type::BYTE_ARRAY: case Type::UNDEFINED: default: break; From e230d6e07213b593b39ef8e367408414d2eca672 Mon Sep 17 00:00:00 2001 From: jmestwa-coder Date: Tue, 16 Jun 2026 20:56:34 +0530 Subject: [PATCH 3/3] GH-50184: [C++][Parquet] clamp stat value reads to the value length The Parquet spec allows truncated statistics, so a too-short min/max is valid and must not throw. Bound every fixed-width memcpy/Float16 load in FormatStatValue to val.size() and zero-pad the rest instead of reading past the buffer. --- cpp/src/parquet/types.cc | 62 ++++++++++------------------------- cpp/src/parquet/types_test.cc | 35 +++++++++----------- 2 files changed, 33 insertions(+), 64 deletions(-) diff --git a/cpp/src/parquet/types.cc b/cpp/src/parquet/types.cc index 3aee0a57fe63..b772462082d1 100644 --- a/cpp/src/parquet/types.cc +++ b/cpp/src/parquet/types.cc @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. +#include #include #include #include @@ -110,8 +111,11 @@ template std::enable_if_t, std::string> FormatNumericValue( ::std::string_view val) { std::stringstream result; + // The statistics value comes from the file's Thrift metadata, so its length + // is attacker controlled and the spec allows it to be truncated. Only copy + // the bytes that are actually present and leave the rest zero-padded. T value{}; - std::memcpy(&value, val.data(), sizeof(T)); + std::memcpy(&value, val.data(), std::min(sizeof(T), val.size())); result << value; return result.str(); } @@ -128,14 +132,14 @@ std::string FormatDecimalValue(Type::type parquet_type, ::std::string_view val, switch (parquet_type) { case Type::INT32: { int32_t int_value{}; - std::memcpy(&int_value, val.data(), sizeof(int32_t)); + std::memcpy(&int_value, val.data(), std::min(sizeof(int32_t), val.size())); ::arrow::Decimal128 decimal_value(int_value); result << decimal_value.ToString(scale); break; } case Type::INT64: { int64_t long_value{}; - std::memcpy(&long_value, val.data(), sizeof(int64_t)); + std::memcpy(&long_value, val.data(), std::min(sizeof(int64_t), val.size())); ::arrow::Decimal128 decimal_value(long_value); result << decimal_value.ToString(scale); break; @@ -174,8 +178,11 @@ std::string FormatNonUTF8Value(::std::string_view val) { std::string FormatFloat16Value(::std::string_view val) { std::stringstream result; - auto float16 = ::arrow::util::Float16::FromLittleEndian( - reinterpret_cast(val.data())); + // FromLittleEndian reads two bytes unconditionally; a truncated stat value + // may be shorter, so copy into a zero-padded buffer first. + std::array bytes{}; + std::memcpy(bytes.data(), val.data(), std::min(bytes.size(), val.size())); + auto float16 = ::arrow::util::Float16::FromLittleEndian(bytes.data()); result << float16.ToFloat(); return result.str(); } @@ -184,49 +191,16 @@ std::string FormatFloat16Value(::std::string_view val) { std::string FormatStatValue(Type::type parquet_type, ::std::string_view val, const std::shared_ptr& logical_type) { - // Statistics blobs come from the file's Thrift-encoded metadata and may have - // arbitrary length under a malicious writer. Reject any value that is shorter - // than what the physical (and, for FLOAT16, logical) type requires so the - // memcpy/byte loads below cannot run past the buffer. - size_t required = 0; - switch (parquet_type) { - case Type::BOOLEAN: - required = sizeof(bool); - break; - case Type::INT32: - case Type::FLOAT: - required = 4; - break; - case Type::INT64: - case Type::DOUBLE: - required = 8; - break; - case Type::INT96: - required = 3 * sizeof(int32_t); - break; - case Type::FIXED_LEN_BYTE_ARRAY: - if (logical_type != nullptr && logical_type->is_float16()) { - required = 2; - } - break; - case Type::BYTE_ARRAY: - case Type::UNDEFINED: - default: - break; - } - if (val.size() < required) { - std::stringstream ss; - ss << "Statistics value of " << val.size() << " bytes is too small for " - << TypeToString(parquet_type); - throw ParquetException(ss.str()); - } - + // Statistics values come straight from the file's Thrift metadata, so their + // length is attacker controlled and the spec allows them to be truncated. + // Every fixed-width read below clamps its copy to val.size() so a too-short + // value cannot drive a load past the end of the buffer. std::stringstream result; const char* bytes = val.data(); switch (parquet_type) { case Type::BOOLEAN: { bool value{}; - std::memcpy(&value, bytes, sizeof(bool)); + std::memcpy(&value, bytes, std::min(sizeof(bool), val.size())); result << value; break; } @@ -250,7 +224,7 @@ std::string FormatStatValue(Type::type parquet_type, ::std::string_view val, } case Type::INT96: { std::array values{}; - std::memcpy(values.data(), bytes, 3 * sizeof(int32_t)); + std::memcpy(values.data(), bytes, std::min(sizeof(values), val.size())); result << values[0] << " " << values[1] << " " << values[2]; break; } diff --git a/cpp/src/parquet/types_test.cc b/cpp/src/parquet/types_test.cc index 6803dfd136a0..14814a18c409 100644 --- a/cpp/src/parquet/types_test.cc +++ b/cpp/src/parquet/types_test.cc @@ -20,7 +20,6 @@ #include #include "arrow/util/endian.h" -#include "parquet/exception.h" #include "parquet/types.h" namespace parquet { @@ -181,25 +180,21 @@ TEST(TypePrinter, StatisticsTypes) { } TEST(TypePrinter, StatisticsTypesShortValue) { - // A maliciously crafted Parquet file may set a statistics min/max shorter - // than the column's physical type. FormatStatValue must reject it instead of - // memcpy'ing past the buffer. - ASSERT_THROW(FormatStatValue(Type::BOOLEAN, std::string()), ParquetException); - ASSERT_THROW(FormatStatValue(Type::INT32, std::string("abc")), ParquetException); - ASSERT_THROW(FormatStatValue(Type::INT64, std::string("abcdefg")), ParquetException); - ASSERT_THROW(FormatStatValue(Type::FLOAT, std::string()), ParquetException); - ASSERT_THROW(FormatStatValue(Type::DOUBLE, std::string("1234567")), ParquetException); - ASSERT_THROW(FormatStatValue(Type::INT96, std::string("12345678901")), - ParquetException); - ASSERT_THROW(FormatStatValue(Type::INT32, std::string("abc"), - LogicalType::Decimal(6, 2)), - ParquetException); - ASSERT_THROW(FormatStatValue(Type::INT64, std::string("abcdefg"), - LogicalType::Decimal(18, 4)), - ParquetException); - ASSERT_THROW(FormatStatValue(Type::FIXED_LEN_BYTE_ARRAY, std::string("a"), - LogicalType::Float16()), - ParquetException); + // The Parquet spec allows truncated statistics, so a min/max may be shorter + // than the column's physical type. FormatStatValue must format it without + // reading past the end of the value buffer (caught here under ASan). + ASSERT_NO_THROW(FormatStatValue(Type::BOOLEAN, std::string())); + ASSERT_NO_THROW(FormatStatValue(Type::INT32, std::string("abc"))); + ASSERT_NO_THROW(FormatStatValue(Type::INT64, std::string("abcdefg"))); + ASSERT_NO_THROW(FormatStatValue(Type::FLOAT, std::string())); + ASSERT_NO_THROW(FormatStatValue(Type::DOUBLE, std::string("1234567"))); + ASSERT_NO_THROW(FormatStatValue(Type::INT96, std::string("12345678901"))); + ASSERT_NO_THROW( + FormatStatValue(Type::INT32, std::string("abc"), LogicalType::Decimal(6, 2))); + ASSERT_NO_THROW( + FormatStatValue(Type::INT64, std::string("abcdefg"), LogicalType::Decimal(18, 4))); + ASSERT_NO_THROW( + FormatStatValue(Type::FIXED_LEN_BYTE_ARRAY, std::string("a"), LogicalType::Float16())); } TEST(TestInt96Timestamp, Decoding) {