Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 18 additions & 7 deletions cpp/src/parquet/types.cc
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
// specific language governing permissions and limitations
// under the License.

#include <algorithm>
#include <array>
#include <cmath>
#include <cstdint>
Expand Down Expand Up @@ -110,8 +111,11 @@ template <typename T>
std::enable_if_t<std::is_arithmetic_v<T>, std::string> FormatNumericValue(
::std::string_view val) {
std::stringstream result;
// The statistics value comes from the file's Thrift metadata, so its length
// is attacker controlled and the spec allows it to be truncated. Only copy
// the bytes that are actually present and leave the rest zero-padded.
T value{};
std::memcpy(&value, val.data(), sizeof(T));
std::memcpy(&value, val.data(), std::min(sizeof(T), val.size()));
result << value;
return result.str();
}
Expand All @@ -128,14 +132,14 @@ std::string FormatDecimalValue(Type::type parquet_type, ::std::string_view val,
switch (parquet_type) {
case Type::INT32: {
int32_t int_value{};
std::memcpy(&int_value, val.data(), sizeof(int32_t));
std::memcpy(&int_value, val.data(), std::min(sizeof(int32_t), val.size()));
::arrow::Decimal128 decimal_value(int_value);
result << decimal_value.ToString(scale);
break;
}
case Type::INT64: {
int64_t long_value{};
std::memcpy(&long_value, val.data(), sizeof(int64_t));
std::memcpy(&long_value, val.data(), std::min(sizeof(int64_t), val.size()));
::arrow::Decimal128 decimal_value(long_value);
result << decimal_value.ToString(scale);
break;
Expand Down Expand Up @@ -174,8 +178,11 @@ std::string FormatNonUTF8Value(::std::string_view val) {

std::string FormatFloat16Value(::std::string_view val) {
std::stringstream result;
auto float16 = ::arrow::util::Float16::FromLittleEndian(
reinterpret_cast<const uint8_t*>(val.data()));
// FromLittleEndian reads two bytes unconditionally; a truncated stat value
// may be shorter, so copy into a zero-padded buffer first.
std::array<uint8_t, 2> bytes{};
std::memcpy(bytes.data(), val.data(), std::min(bytes.size(), val.size()));
auto float16 = ::arrow::util::Float16::FromLittleEndian(bytes.data());
result << float16.ToFloat();
return result.str();
}
Expand All @@ -184,12 +191,16 @@ std::string FormatFloat16Value(::std::string_view val) {

std::string FormatStatValue(Type::type parquet_type, ::std::string_view val,
const std::shared_ptr<const LogicalType>& logical_type) {
// Statistics values come straight from the file's Thrift metadata, so their
// length is attacker controlled and the spec allows them to be truncated.
// Every fixed-width read below clamps its copy to val.size() so a too-short
// value cannot drive a load past the end of the buffer.
std::stringstream result;
const char* bytes = val.data();
switch (parquet_type) {
case Type::BOOLEAN: {
bool value{};
std::memcpy(&value, bytes, sizeof(bool));
std::memcpy(&value, bytes, std::min(sizeof(bool), val.size()));
result << value;
break;
}
Expand All @@ -213,7 +224,7 @@ std::string FormatStatValue(Type::type parquet_type, ::std::string_view val,
}
case Type::INT96: {
std::array<int32_t, 3> values{};
std::memcpy(values.data(), bytes, 3 * sizeof(int32_t));
std::memcpy(values.data(), bytes, std::min(sizeof(values), val.size()));
result << values[0] << " " << values[1] << " " << values[2];
break;
}
Expand Down
18 changes: 18 additions & 0 deletions cpp/src/parquet/types_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,24 @@ TEST(TypePrinter, StatisticsTypes) {
FormatStatValue(Type::FIXED_LEN_BYTE_ARRAY, smin, LogicalType::Float16()));
}

TEST(TypePrinter, StatisticsTypesShortValue) {
// The Parquet spec allows truncated statistics, so a min/max may be shorter
// than the column's physical type. FormatStatValue must format it without
// reading past the end of the value buffer (caught here under ASan).
ASSERT_NO_THROW(FormatStatValue(Type::BOOLEAN, std::string()));
ASSERT_NO_THROW(FormatStatValue(Type::INT32, std::string("abc")));
ASSERT_NO_THROW(FormatStatValue(Type::INT64, std::string("abcdefg")));
ASSERT_NO_THROW(FormatStatValue(Type::FLOAT, std::string()));
ASSERT_NO_THROW(FormatStatValue(Type::DOUBLE, std::string("1234567")));
ASSERT_NO_THROW(FormatStatValue(Type::INT96, std::string("12345678901")));
ASSERT_NO_THROW(
FormatStatValue(Type::INT32, std::string("abc"), LogicalType::Decimal(6, 2)));
ASSERT_NO_THROW(
FormatStatValue(Type::INT64, std::string("abcdefg"), LogicalType::Decimal(18, 4)));
ASSERT_NO_THROW(
FormatStatValue(Type::FIXED_LEN_BYTE_ARRAY, std::string("a"), LogicalType::Float16()));
}

TEST(TestInt96Timestamp, Decoding) {
auto check = [](int32_t julian_day, uint64_t nanoseconds) {
#if ARROW_LITTLE_ENDIAN
Expand Down
Loading