diff --git a/cpp/src/parquet/types.cc b/cpp/src/parquet/types.cc index fb4eb92a7544..b772462082d1 100644 --- a/cpp/src/parquet/types.cc +++ b/cpp/src/parquet/types.cc @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. +#include #include #include #include @@ -110,8 +111,11 @@ template std::enable_if_t, std::string> FormatNumericValue( ::std::string_view val) { std::stringstream result; + // The statistics value comes from the file's Thrift metadata, so its length + // is attacker controlled and the spec allows it to be truncated. Only copy + // the bytes that are actually present and leave the rest zero-padded. T value{}; - std::memcpy(&value, val.data(), sizeof(T)); + std::memcpy(&value, val.data(), std::min(sizeof(T), val.size())); result << value; return result.str(); } @@ -128,14 +132,14 @@ std::string FormatDecimalValue(Type::type parquet_type, ::std::string_view val, switch (parquet_type) { case Type::INT32: { int32_t int_value{}; - std::memcpy(&int_value, val.data(), sizeof(int32_t)); + std::memcpy(&int_value, val.data(), std::min(sizeof(int32_t), val.size())); ::arrow::Decimal128 decimal_value(int_value); result << decimal_value.ToString(scale); break; } case Type::INT64: { int64_t long_value{}; - std::memcpy(&long_value, val.data(), sizeof(int64_t)); + std::memcpy(&long_value, val.data(), std::min(sizeof(int64_t), val.size())); ::arrow::Decimal128 decimal_value(long_value); result << decimal_value.ToString(scale); break; @@ -174,8 +178,11 @@ std::string FormatNonUTF8Value(::std::string_view val) { std::string FormatFloat16Value(::std::string_view val) { std::stringstream result; - auto float16 = ::arrow::util::Float16::FromLittleEndian( - reinterpret_cast(val.data())); + // FromLittleEndian reads two bytes unconditionally; a truncated stat value + // may be shorter, so copy into a zero-padded buffer first. + std::array bytes{}; + std::memcpy(bytes.data(), val.data(), std::min(bytes.size(), val.size())); + auto float16 = ::arrow::util::Float16::FromLittleEndian(bytes.data()); result << float16.ToFloat(); return result.str(); } @@ -184,12 +191,16 @@ std::string FormatFloat16Value(::std::string_view val) { std::string FormatStatValue(Type::type parquet_type, ::std::string_view val, const std::shared_ptr& logical_type) { + // Statistics values come straight from the file's Thrift metadata, so their + // length is attacker controlled and the spec allows them to be truncated. + // Every fixed-width read below clamps its copy to val.size() so a too-short + // value cannot drive a load past the end of the buffer. std::stringstream result; const char* bytes = val.data(); switch (parquet_type) { case Type::BOOLEAN: { bool value{}; - std::memcpy(&value, bytes, sizeof(bool)); + std::memcpy(&value, bytes, std::min(sizeof(bool), val.size())); result << value; break; } @@ -213,7 +224,7 @@ std::string FormatStatValue(Type::type parquet_type, ::std::string_view val, } case Type::INT96: { std::array values{}; - std::memcpy(values.data(), bytes, 3 * sizeof(int32_t)); + std::memcpy(values.data(), bytes, std::min(sizeof(values), val.size())); result << values[0] << " " << values[1] << " " << values[2]; break; } diff --git a/cpp/src/parquet/types_test.cc b/cpp/src/parquet/types_test.cc index 6c77662d58f7..14814a18c409 100644 --- a/cpp/src/parquet/types_test.cc +++ b/cpp/src/parquet/types_test.cc @@ -179,6 +179,24 @@ TEST(TypePrinter, StatisticsTypes) { FormatStatValue(Type::FIXED_LEN_BYTE_ARRAY, smin, LogicalType::Float16())); } +TEST(TypePrinter, StatisticsTypesShortValue) { + // The Parquet spec allows truncated statistics, so a min/max may be shorter + // than the column's physical type. FormatStatValue must format it without + // reading past the end of the value buffer (caught here under ASan). + ASSERT_NO_THROW(FormatStatValue(Type::BOOLEAN, std::string())); + ASSERT_NO_THROW(FormatStatValue(Type::INT32, std::string("abc"))); + ASSERT_NO_THROW(FormatStatValue(Type::INT64, std::string("abcdefg"))); + ASSERT_NO_THROW(FormatStatValue(Type::FLOAT, std::string())); + ASSERT_NO_THROW(FormatStatValue(Type::DOUBLE, std::string("1234567"))); + ASSERT_NO_THROW(FormatStatValue(Type::INT96, std::string("12345678901"))); + ASSERT_NO_THROW( + FormatStatValue(Type::INT32, std::string("abc"), LogicalType::Decimal(6, 2))); + ASSERT_NO_THROW( + FormatStatValue(Type::INT64, std::string("abcdefg"), LogicalType::Decimal(18, 4))); + ASSERT_NO_THROW( + FormatStatValue(Type::FIXED_LEN_BYTE_ARRAY, std::string("a"), LogicalType::Float16())); +} + TEST(TestInt96Timestamp, Decoding) { auto check = [](int32_t julian_day, uint64_t nanoseconds) { #if ARROW_LITTLE_ENDIAN