diff --git a/iceberg/iceberg-catalog/src/main/java/org/apache/iceberg/hive/HiveSchemaConverter.java b/iceberg/iceberg-catalog/src/main/java/org/apache/iceberg/hive/HiveSchemaConverter.java index 1b743343163e..07631516cd49 100644 --- a/iceberg/iceberg-catalog/src/main/java/org/apache/iceberg/hive/HiveSchemaConverter.java +++ b/iceberg/iceberg-catalog/src/main/java/org/apache/iceberg/hive/HiveSchemaConverter.java @@ -58,21 +58,22 @@ private HiveSchemaConverter(boolean autoConvert) { static Schema convert(List names, List typeInfos, List comments, boolean autoConvert, Map defaultValues) { HiveSchemaConverter converter = new HiveSchemaConverter(autoConvert); - return new Schema(converter.convertInternal(names, typeInfos, defaultValues, comments)); + return new Schema(converter.convertInternal(names, typeInfos, defaultValues, comments, false)); } - public static Type convert(TypeInfo typeInfo, boolean autoConvert, String defaultValue) { + public static Type convert(TypeInfo typeInfo, boolean autoConvert, String defaultValue, + boolean shouldAddInitialDefault) { HiveSchemaConverter converter = new HiveSchemaConverter(autoConvert); - return converter.convertType(typeInfo, defaultValue); + return converter.convertType(typeInfo, defaultValue, shouldAddInitialDefault); } List convertInternal(List names, List typeInfos, - Map defaultValues, List comments) { + Map defaultValues, List comments, boolean shouldAddInitialDefault) { List result = Lists.newArrayListWithExpectedSize(names.size()); int outerId = id + names.size(); id = outerId; for (int i = 0; i < names.size(); ++i) { - Type type = convertType(typeInfos.get(i), defaultValues.get(names.get(i))); + Type type = convertType(typeInfos.get(i), defaultValues.get(names.get(i)), shouldAddInitialDefault); String columnName = names.get(i); Types.NestedField.Builder fieldBuilder = Types.NestedField.builder() @@ -87,6 +88,9 @@ List convertInternal(List names, List typeI Object icebergDefaultValue = 
HiveSchemaUtil.getDefaultValue(defaultValues.get(columnName), type); if (icebergDefaultValue != null) { fieldBuilder.withWriteDefault(Expressions.lit(icebergDefaultValue)); + if (shouldAddInitialDefault) { + fieldBuilder.withInitialDefault(Expressions.lit(icebergDefaultValue)); + } } } else if (!type.isStructType()) { throw new UnsupportedOperationException( @@ -99,7 +103,7 @@ List convertInternal(List names, List typeI return result; } - Type convertType(TypeInfo typeInfo, String defaultValue) { + Type convertType(TypeInfo typeInfo, String defaultValue, boolean shouldAddInitialDefault) { switch (typeInfo.getCategory()) { case PRIMITIVE: switch (((PrimitiveTypeInfo) typeInfo).getPrimitiveCategory()) { @@ -161,21 +165,24 @@ Type convertType(TypeInfo typeInfo, String defaultValue) { } case STRUCT: StructTypeInfo structTypeInfo = (StructTypeInfo) typeInfo; - List fields = - convertInternal(structTypeInfo.getAllStructFieldNames(), structTypeInfo.getAllStructFieldTypeInfos(), - HiveSchemaUtil.getDefaultValuesMap(null, defaultValue), Collections.emptyList()); + List fields = convertInternal( + structTypeInfo.getAllStructFieldNames(), + structTypeInfo.getAllStructFieldTypeInfos(), + HiveSchemaUtil.getDefaultValuesMap(null, defaultValue), + Collections.emptyList(), + shouldAddInitialDefault); return Types.StructType.of(fields); case MAP: MapTypeInfo mapTypeInfo = (MapTypeInfo) typeInfo; int keyId = id++; - Type keyType = convertType(mapTypeInfo.getMapKeyTypeInfo(), defaultValue); + Type keyType = convertType(mapTypeInfo.getMapKeyTypeInfo(), defaultValue, shouldAddInitialDefault); int valueId = id++; - Type valueType = convertType(mapTypeInfo.getMapValueTypeInfo(), defaultValue); + Type valueType = convertType(mapTypeInfo.getMapValueTypeInfo(), defaultValue, shouldAddInitialDefault); return Types.MapType.ofOptional(keyId, valueId, keyType, valueType); case LIST: ListTypeInfo listTypeInfo = (ListTypeInfo) typeInfo; int listId = id++; - Type listType = 
convertType(listTypeInfo.getListElementTypeInfo(), defaultValue); + Type listType = convertType(listTypeInfo.getListElementTypeInfo(), defaultValue, shouldAddInitialDefault); return Types.ListType.ofOptional(listId, listType); case VARIANT: return Types.VariantType.get(); diff --git a/iceberg/iceberg-catalog/src/main/java/org/apache/iceberg/hive/HiveSchemaUtil.java b/iceberg/iceberg-catalog/src/main/java/org/apache/iceberg/hive/HiveSchemaUtil.java index b1563040bcb9..3f18deab4eaf 100644 --- a/iceberg/iceberg-catalog/src/main/java/org/apache/iceberg/hive/HiveSchemaUtil.java +++ b/iceberg/iceberg-catalog/src/main/java/org/apache/iceberg/hive/HiveSchemaUtil.java @@ -25,6 +25,7 @@ import java.util.Map; import java.util.Objects; import java.util.Optional; +import java.util.function.Function; import java.util.stream.Collectors; import org.apache.hadoop.hive.metastore.api.FieldSchema; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; @@ -148,10 +149,11 @@ public static TypeInfo convert(Type type) { * * @param typeInfo The Hive type * @param defaultValue the default value for the column, if any + * @param shouldAddInitialDefault whether to set initial default for the column or not * @return The Iceberg type */ - public static Type convert(TypeInfo typeInfo, String defaultValue) { - return HiveSchemaConverter.convert(typeInfo, false, defaultValue); + public static Type convert(TypeInfo typeInfo, String defaultValue, boolean shouldAddInitialDefault) { + return HiveSchemaConverter.convert(typeInfo, false, defaultValue, shouldAddInitialDefault); } /** @@ -418,7 +420,8 @@ public static void setDefaultValues(Record record, List missi for (Types.NestedField field : missingFields) { if (field.type().isStructType()) { // Attempt to build the nested struct with its defaults - Record nestedRecord = buildStructWithDefaults(field.type().asStructType()); + Record nestedRecord = buildStructFromDefaults( + field.type().asStructType(), Types.NestedField::writeDefault); if 
(nestedRecord != null) { record.setField(field.name(), nestedRecord); } @@ -430,23 +433,39 @@ public static void setDefaultValues(Record record, List missi } /** - * Recursively builds a struct populated with write defaults. - * * @return A populated Record, or null if no nested fields have defaults. + * Backfills struct columns that are null on read using their nested {@code initialDefault} metadata. + * This applies to rows written before {@code ADD COLUMNS} added the struct. + * The spec allows a struct default of {@code {}} (see https://iceberg.apache.org/spec/#default-values), but + * {@code UpdateSchema} add column only supports defaults on the underlying primitives and keeps the + * struct default itself null, so the nested default record has to be backfilled while reading; + * if empty struct defaults become allowed, this backfill can be removed. */ - private static Record buildStructWithDefaults(Types.StructType structType) { + public static void backfillStructInitialDefaults( + Record iceRecord, Map initialDefaultStructsByColumn) { + for (Map.Entry columnAndInitialDefaultStruct : initialDefaultStructsByColumn.entrySet()) { + String columnName = columnAndInitialDefaultStruct.getKey(); + if (iceRecord.getField(columnName) == null) { + iceRecord.setField(columnName, columnAndInitialDefaultStruct.getValue()); + } + } + } + + /** + * Recursively builds a struct populated with underlying field defaults. + * @return A populated Record, or null if no nested fields have defaults. 
+ */ + public static Record buildStructFromDefaults( + Types.StructType structType, Function defaultForField) { Record nestedRecord = GenericRecord.create(structType); boolean hasAnyDefault = false; for (Types.NestedField field : structType.fields()) { - if (field.writeDefault() != null) { - Object defaultValue = convertToWriteType(field.writeDefault(), field.type()); - nestedRecord.setField(field.name(), defaultValue); + Object defaultValue = defaultForField.apply(field); + if (defaultValue != null) { + nestedRecord.setField(field.name(), convertToWriteType(defaultValue, field.type())); hasAnyDefault = true; } else if (field.type().isStructType()) { - // Recursively process deeper nested structs - Record deeperRecord = buildStructWithDefaults(field.type().asStructType()); - - // If the deeper struct has defaults, attach it and flag this current struct as populated + Record deeperRecord = buildStructFromDefaults(field.type().asStructType(), defaultForField); if (deeperRecord != null) { nestedRecord.setField(field.name(), deeperRecord); hasAnyDefault = true; @@ -454,10 +473,24 @@ private static Record buildStructWithDefaults(Types.StructType structType) { } } - // If no fields (or nested fields) had defaults, return null to avoid an empty struct return hasAnyDefault ? nestedRecord : null; } + public static Map getStructInitialDefaults(Types.StructType structType) { + Map result = Maps.newHashMap(); + for (Types.NestedField field : structType.fields()) { + if (field.initialDefault() != null) { + result.put(field.name(), field.initialDefault()); + } else if (field.type().isStructType()) { + Map nested = getStructInitialDefaults(field.type().asStructType()); + if (!nested.isEmpty()) { + result.put(field.name(), nested); + } + } + } + return result; + } + /** * Sets a value into a {@link Record} using a struct-only field path (top-level column or nested * through structs). Intermediate struct records are created as needed. 
@@ -496,21 +529,6 @@ private static Record getOrCreateStructRecord( return record; } - // Special method for nested structs that always applies defaults to null fields - private static void setDefaultValuesForNestedStruct(Record record, List fields) { - for (Types.NestedField field : fields) { - Object fieldValue = record.getField(field.name()); - - if (field.writeDefault() != null) { - Object defaultValue = convertToWriteType(field.writeDefault(), field.type()); - record.setField(field.name(), defaultValue); - } else if (field.type().isStructType()) { - // Recursively process nested structs - setDefaultValuesForNestedStruct((Record) fieldValue, field.type().asStructType().fields()); - } - } - } - public static Object convertToWriteType(Object value, Type type) { if (value == null) { return null; diff --git a/iceberg/iceberg-catalog/src/test/java/org/apache/iceberg/hive/TestHiveSchemaUtil.java b/iceberg/iceberg-catalog/src/test/java/org/apache/iceberg/hive/TestHiveSchemaUtil.java index 6daf3aeca5d7..6234a6f8db50 100644 --- a/iceberg/iceberg-catalog/src/test/java/org/apache/iceberg/hive/TestHiveSchemaUtil.java +++ b/iceberg/iceberg-catalog/src/test/java/org/apache/iceberg/hive/TestHiveSchemaUtil.java @@ -220,7 +220,7 @@ private void checkConvert(TypeInfo typeInfo, Type type) { // Convert to TypeInfo assertThat(HiveSchemaUtil.convert(type)).isEqualTo(typeInfo); // Convert to Type - assertEquals(type, HiveSchemaUtil.convert(typeInfo, null)); + assertEquals(type, HiveSchemaUtil.convert(typeInfo, null, true)); } /** diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergMetaHook.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergMetaHook.java index 5a2bbbf70219..271ce74bfb7e 100644 --- a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergMetaHook.java +++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergMetaHook.java @@ -741,16 +741,16 @@ private void 
handleAddColumns(org.apache.hadoop.hive.metastore.api.Table hmsTabl (List) SessionStateUtil.getResource(conf, SessionStateUtil.COLUMN_DEFAULTS).orElse(null); Map defaultValues = Stream.ofNullable(sqlDefaultConstraints).flatMap(Collection::stream) .collect(Collectors.toMap(SQLDefaultConstraint::getColumn_name, SQLDefaultConstraint::getDefault_value)); - boolean isORc = isOrcFileFormat(hmsTable); + boolean isOrc = isOrcFileFormat(hmsTable); for (FieldSchema addedCol : addedCols) { String defaultValue = defaultValues.get(addedCol.getName()); - Type type = HiveSchemaUtil.convert(TypeInfoUtils.getTypeInfoFromTypeString(addedCol.getType()), defaultValue); + Type type = HiveSchemaUtil.convert(TypeInfoUtils.getTypeInfoFromTypeString(addedCol.getType()), defaultValue, + !isOrc); Literal defaultVal = Optional.ofNullable(defaultValue).filter(v -> !type.isStructType()) .map(v -> Expressions.lit(HiveSchemaUtil.getDefaultValue(v, type))).orElse(null); - // ORC doesn't have support for initialDefault from iceberg layer, we only need to set default for writeDefault. - updateSchema.addColumn(addedCol.getName(), type, addedCol.getComment(), isORc ? null : defaultVal); - if (isORc && defaultVal != null) { + updateSchema.addColumn(addedCol.getName(), type, addedCol.getComment(), isOrc ? 
null : defaultVal); + if (isOrc && defaultVal != null) { updateSchema.updateColumnDefault(addedCol.getName(), defaultVal); } } @@ -933,7 +933,7 @@ private void handlePartitionRename(HiveSchemaUtil.SchemaDifference schemaDiffere } private Type.PrimitiveType getPrimitiveTypeOrThrow(FieldSchema field) throws MetaException { - Type newType = HiveSchemaUtil.convert(TypeInfoUtils.getTypeInfoFromTypeString(field.getType()), null); + Type newType = HiveSchemaUtil.convert(TypeInfoUtils.getTypeInfoFromTypeString(field.getType()), null, true); if (!(newType instanceof Type.PrimitiveType)) { throw new MetaException(String.format("Cannot promote type of column: '%s' to a non-primitive type: %s.", field.getName(), newType)); diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/vector/HiveVectorizedReader.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/vector/HiveVectorizedReader.java index 834d762062da..4189b3aea209 100644 --- a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/vector/HiveVectorizedReader.java +++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/vector/HiveVectorizedReader.java @@ -52,6 +52,7 @@ import org.apache.iceberg.Schema; import org.apache.iceberg.Table; import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.hive.HiveSchemaUtil; import org.apache.iceberg.io.CloseableIterable; import org.apache.iceberg.io.CloseableIterator; import org.apache.iceberg.mr.InputFormatConfig; @@ -187,6 +188,12 @@ static Map getInitialColumnDefaults(List colu for (Types.NestedField column : columns) { if (column.initialDefault() != null) { columnDefaults.put(column.name(), column.initialDefault()); + } else if (column.type().isStructType()) { + Map structDefaults = + HiveSchemaUtil.getStructInitialDefaults(column.type().asStructType()); + if (!structDefaults.isEmpty()) { + columnDefaults.put(column.name(), structDefaults); + } } } return columnDefaults; diff --git 
a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/mapreduce/IcebergRecordReader.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/mapreduce/IcebergRecordReader.java index e3d03a6bee66..96f8705a3f7e 100644 --- a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/mapreduce/IcebergRecordReader.java +++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/mapreduce/IcebergRecordReader.java @@ -53,6 +53,7 @@ import org.apache.iceberg.data.parquet.GenericParquetReaders; import org.apache.iceberg.encryption.EncryptedFiles; import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.hive.HiveSchemaUtil; import org.apache.iceberg.io.CloseableIterable; import org.apache.iceberg.io.CloseableIterator; import org.apache.iceberg.io.InputFile; @@ -63,6 +64,7 @@ import org.apache.iceberg.orc.ORC; import org.apache.iceberg.parquet.Parquet; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.types.Type; import org.apache.iceberg.types.TypeUtil; import org.apache.iceberg.types.Types; @@ -173,7 +175,29 @@ private CloseableIterable openGeneric(FileScanTask task, Schema readSchema) { default -> throw new UnsupportedOperationException( String.format("Cannot read %s file: %s", file.format().name(), file.location())); }; - return applyResidualFiltering(iterable, residual, readSchema); + return applyResidualFiltering(withStructInitialDefaultsBackfill(iterable, readSchema), residual, readSchema); + } + + private CloseableIterable withStructInitialDefaultsBackfill(CloseableIterable iterable, Schema readSchema) { + Map initialDefaultStructsByColumn = Maps.newHashMap(); + for (Types.NestedField column : readSchema.columns()) { + if (column.type().isStructType()) { + Record initialDefaultStruct = HiveSchemaUtil + .buildStructFromDefaults(column.type().asStructType(), Types.NestedField::initialDefault); + if 
(initialDefaultStruct != null) { + initialDefaultStructsByColumn.put(column.name(), initialDefaultStruct); + } + } + } + if (initialDefaultStructsByColumn.isEmpty()) { + return iterable; + } + return CloseableIterable.transform(iterable, row -> { + if (row instanceof Record curIceRecord) { + HiveSchemaUtil.backfillStructInitialDefaults(curIceRecord, initialDefaultStructsByColumn); + } + return row; + }); } private CloseableIterable newAvroIterable( diff --git a/iceberg/iceberg-handler/src/test/queries/positive/iceberg_initial_default.q b/iceberg/iceberg-handler/src/test/queries/positive/iceberg_initial_default.q index c0c058bcd5dc..c7007114e5be 100644 --- a/iceberg/iceberg-handler/src/test/queries/positive/iceberg_initial_default.q +++ b/iceberg/iceberg-handler/src/test/queries/positive/iceberg_initial_default.q @@ -13,7 +13,14 @@ ALTER TABLE ice_parq ADD COLUMNS (point STRUCT DEFAULT '{"x":100," created_date DATE DEFAULT '2024-01-01', created_ts TIMESTAMP DEFAULT '2024-01-01T10:00:00', score DECIMAL(5,2) DEFAULT 100.00, - category STRING DEFAULT 'general'); + category STRING DEFAULT 'general', + person STRUCT< + name: STRING, + address: STRUCT< + street: STRING, + city: STRING + > + > DEFAULT '{"name":"John","address":{"street":"Main St","city":"New York"}}'); INSERT INTO ice_parq (id) VALUES (2); @@ -45,7 +52,14 @@ ALTER TABLE ice_avro ADD COLUMNS (point STRUCT DEFAULT '{"x":100," created_date DATE DEFAULT '2024-01-01', created_ts TIMESTAMP DEFAULT '2024-01-01T10:00:00', score DECIMAL(5,2) DEFAULT 100.00, - category STRING DEFAULT 'general'); + category STRING DEFAULT 'general', + person STRUCT< + name: STRING, + address: STRUCT< + street: STRING, + city: STRING + > + > DEFAULT '{"name":"John","address":{"street":"Main St","city":"New York"}}'); INSERT INTO ice_avro (id) VALUES (2); @@ -77,7 +91,14 @@ ALTER TABLE ice_orc ADD COLUMNS (point STRUCT DEFAULT '{"x":100,"y created_date DATE DEFAULT '2024-01-01', created_ts TIMESTAMP DEFAULT '2024-01-01T10:00:00', score 
DECIMAL(5,2) DEFAULT 100.00, - category STRING DEFAULT 'general'); + category STRING DEFAULT 'general', + person STRUCT< + name: STRING, + address: STRUCT< + street: STRING, + city: STRING + > + > DEFAULT '{"name":"John","address":{"street":"Main St","city":"New York"}}'); INSERT INTO ice_orc (id) VALUES (2); diff --git a/iceberg/iceberg-handler/src/test/results/positive/iceberg_alter_default_column.q.out b/iceberg/iceberg-handler/src/test/results/positive/iceberg_alter_default_column.q.out index bb1ae6aa0606..ee405875b382 100644 --- a/iceberg/iceberg-handler/src/test/results/positive/iceberg_alter_default_column.q.out +++ b/iceberg/iceberg-handler/src/test/results/positive/iceberg_alter_default_column.q.out @@ -60,7 +60,7 @@ POSTHOOK: query: SELECT * FROM ice_t ORDER BY id POSTHOOK: type: QUERY POSTHOOK: Input: default@ice_t POSTHOOK: Output: hdfs://### HDFS PATH ### -1 NULL unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general +1 {"x":100,"y":99} unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general 2 {"x":100,"y":99} unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general PREHOOK: query: ALTER TABLE ice_t REPLACE COLUMNS (id INT, point STRUCT DEFAULT '{"x":100,"y":99}', @@ -94,7 +94,7 @@ POSTHOOK: query: SELECT * FROM ice_t ORDER BY id POSTHOOK: type: QUERY POSTHOOK: Input: default@ice_t POSTHOOK: Output: hdfs://### HDFS PATH ### -1 NULL unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 general +1 {"x":100,"y":99} unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 general 2 {"x":100,"y":99} unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 general PREHOOK: query: ALTER TABLE ice_t CHANGE COLUMN point point STRUCT DEFAULT '{"x":100,"y":88}' PREHOOK: type: ALTERTABLE_RENAMECOL @@ -128,7 +128,7 @@ POSTHOOK: query: SELECT * FROM ice_t ORDER BY id POSTHOOK: type: QUERY POSTHOOK: Input: default@ice_t POSTHOOK: Output: hdfs://### HDFS PATH ### -1 NULL unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 
general +1 {"x":100,"y":99} unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 general 2 {"x":100,"y":99} unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 general 3 {"x":100,"y":88} unknown 21 50000.0 true 2024-01-01 2024-01-01 10:00:00 general PREHOOK: query: ALTER TABLE ice_t CHANGE COLUMN point point_new STRUCT DEFAULT '{"x":55,"y":88}' @@ -155,7 +155,7 @@ POSTHOOK: query: SELECT * FROM ice_t ORDER BY id POSTHOOK: type: QUERY POSTHOOK: Input: default@ice_t POSTHOOK: Output: hdfs://### HDFS PATH ### -1 NULL unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 general +1 {"x":100,"y":99} unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 general 2 {"x":100,"y":99} unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 general 3 {"x":100,"y":88} unknown 21 50000.0 true 2024-01-01 2024-01-01 10:00:00 general 4 {"x":55,"y":88} unknown 21 50000.0 true 2024-01-01 2024-01-01 10:00:00 general diff --git a/iceberg/iceberg-handler/src/test/results/positive/iceberg_default_column.q.out b/iceberg/iceberg-handler/src/test/results/positive/iceberg_default_column.q.out index 8400d21d3572..5ff787ef67bd 100644 --- a/iceberg/iceberg-handler/src/test/results/positive/iceberg_default_column.q.out +++ b/iceberg/iceberg-handler/src/test/results/positive/iceberg_default_column.q.out @@ -170,11 +170,11 @@ POSTHOOK: query: SELECT * FROM t3 ORDER BY id POSTHOOK: type: QUERY POSTHOOK: Input: default@t3 POSTHOOK: Output: hdfs://### HDFS PATH ### -2 {"x":null,"y":7} unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general NULL -3 {"x":100,"y":99} unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general NULL -4 {"x":100,"y":99} NULL 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general NULL -5 {"x":100,"y":99} custom_name 30 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general NULL -6 {"x":null,"y":null} unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general NULL -7 NULL null NULL 50000.0 true 2024-01-01 2024-01-01 
10:00:00 100.00 general NULL +2 {"x":null,"y":7} unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general {"name":"John","address":{"street":"Main St","city":"New York"}} +3 {"x":100,"y":99} unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general {"name":"John","address":{"street":"Main St","city":"New York"}} +4 {"x":100,"y":99} NULL 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general {"name":"John","address":{"street":"Main St","city":"New York"}} +5 {"x":100,"y":99} custom_name 30 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general {"name":"John","address":{"street":"Main St","city":"New York"}} +6 {"x":null,"y":null} unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general {"name":"John","address":{"street":"Main St","city":"New York"}} +7 NULL null NULL 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general {"name":"John","address":{"street":"Main St","city":"New York"}} 8 NULL null NULL 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general {"name":"John","address":{"street":"Main St","city":"New York"}} 9 NULL null NULL 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general {"name":null,"address":{"street":null,"city":"Bangalore"}} diff --git a/iceberg/iceberg-handler/src/test/results/positive/iceberg_initial_default.q.out b/iceberg/iceberg-handler/src/test/results/positive/iceberg_initial_default.q.out index da857354d811..d15997e7d81f 100644 --- a/iceberg/iceberg-handler/src/test/results/positive/iceberg_initial_default.q.out +++ b/iceberg/iceberg-handler/src/test/results/positive/iceberg_initial_default.q.out @@ -28,7 +28,14 @@ PREHOOK: query: ALTER TABLE ice_parq ADD COLUMNS (point STRUCT DEF created_date DATE DEFAULT '2024-01-01', created_ts TIMESTAMP DEFAULT '2024-01-01T10:00:00', score DECIMAL(5,2) DEFAULT 100.00, - category STRING DEFAULT 'general') + category STRING DEFAULT 'general', + person STRUCT< + name: STRING, + address: STRUCT< + street: STRING, + city: STRING + > + > 
DEFAULT '{"name":"John","address":{"street":"Main St","city":"New York"}}') PREHOOK: type: ALTERTABLE_ADDCOLS PREHOOK: Input: default@ice_parq PREHOOK: Output: default@ice_parq @@ -40,7 +47,14 @@ POSTHOOK: query: ALTER TABLE ice_parq ADD COLUMNS (point STRUCT DE created_date DATE DEFAULT '2024-01-01', created_ts TIMESTAMP DEFAULT '2024-01-01T10:00:00', score DECIMAL(5,2) DEFAULT 100.00, - category STRING DEFAULT 'general') + category STRING DEFAULT 'general', + person STRUCT< + name: STRING, + address: STRUCT< + street: STRING, + city: STRING + > + > DEFAULT '{"name":"John","address":{"street":"Main St","city":"New York"}}') POSTHOOK: type: ALTERTABLE_ADDCOLS POSTHOOK: Input: default@ice_parq POSTHOOK: Output: default@ice_parq @@ -60,8 +74,8 @@ POSTHOOK: query: SELECT * FROM ice_parq ORDER BY id POSTHOOK: type: QUERY POSTHOOK: Input: default@ice_parq POSTHOOK: Output: hdfs://### HDFS PATH ### -1 NULL unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general -2 {"x":100,"y":99} unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general +1 {"x":100,"y":99} unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general {"name":"John","address":{"street":"Main St","city":"New York"}} +2 {"x":100,"y":99} unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general {"name":"John","address":{"street":"Main St","city":"New York"}} PREHOOK: query: ALTER TABLE ice_parq CHANGE COLUMN point point STRUCT DEFAULT '{"x":100,"y":88}' PREHOOK: type: ALTERTABLE_RENAMECOL PREHOOK: Input: default@ice_parq @@ -94,9 +108,9 @@ POSTHOOK: query: SELECT * FROM ice_parq ORDER BY id POSTHOOK: type: QUERY POSTHOOK: Input: default@ice_parq POSTHOOK: Output: hdfs://### HDFS PATH ### -1 NULL unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general -2 {"x":100,"y":99} unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general -3 {"x":100,"y":88} unknown 21 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general +1 {"x":100,"y":99} 
unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general {"name":"John","address":{"street":"Main St","city":"New York"}} +2 {"x":100,"y":99} unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general {"name":"John","address":{"street":"Main St","city":"New York"}} +3 {"x":100,"y":88} unknown 21 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general {"name":"John","address":{"street":"Main St","city":"New York"}} PREHOOK: query: CREATE TABLE ice_avro ( id INT) STORED BY ICEBERG stored as avro @@ -127,7 +141,14 @@ PREHOOK: query: ALTER TABLE ice_avro ADD COLUMNS (point STRUCT DEF created_date DATE DEFAULT '2024-01-01', created_ts TIMESTAMP DEFAULT '2024-01-01T10:00:00', score DECIMAL(5,2) DEFAULT 100.00, - category STRING DEFAULT 'general') + category STRING DEFAULT 'general', + person STRUCT< + name: STRING, + address: STRUCT< + street: STRING, + city: STRING + > + > DEFAULT '{"name":"John","address":{"street":"Main St","city":"New York"}}') PREHOOK: type: ALTERTABLE_ADDCOLS PREHOOK: Input: default@ice_avro PREHOOK: Output: default@ice_avro @@ -139,7 +160,14 @@ POSTHOOK: query: ALTER TABLE ice_avro ADD COLUMNS (point STRUCT DE created_date DATE DEFAULT '2024-01-01', created_ts TIMESTAMP DEFAULT '2024-01-01T10:00:00', score DECIMAL(5,2) DEFAULT 100.00, - category STRING DEFAULT 'general') + category STRING DEFAULT 'general', + person STRUCT< + name: STRING, + address: STRUCT< + street: STRING, + city: STRING + > + > DEFAULT '{"name":"John","address":{"street":"Main St","city":"New York"}}') POSTHOOK: type: ALTERTABLE_ADDCOLS POSTHOOK: Input: default@ice_avro POSTHOOK: Output: default@ice_avro @@ -159,8 +187,8 @@ POSTHOOK: query: SELECT * FROM ice_avro ORDER BY id POSTHOOK: type: QUERY POSTHOOK: Input: default@ice_avro POSTHOOK: Output: hdfs://### HDFS PATH ### -1 NULL unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general -2 {"x":100,"y":99} unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general +1 
{"x":100,"y":99} unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general {"name":"John","address":{"street":"Main St","city":"New York"}} +2 {"x":100,"y":99} unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general {"name":"John","address":{"street":"Main St","city":"New York"}} PREHOOK: query: ALTER TABLE ice_avro CHANGE COLUMN point point STRUCT DEFAULT '{"x":100,"y":88}' PREHOOK: type: ALTERTABLE_RENAMECOL PREHOOK: Input: default@ice_avro @@ -193,9 +221,9 @@ POSTHOOK: query: SELECT * FROM ice_avro ORDER BY id POSTHOOK: type: QUERY POSTHOOK: Input: default@ice_avro POSTHOOK: Output: hdfs://### HDFS PATH ### -1 NULL unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general -2 {"x":100,"y":99} unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general -3 {"x":100,"y":88} unknown 21 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general +1 {"x":100,"y":99} unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general {"name":"John","address":{"street":"Main St","city":"New York"}} +2 {"x":100,"y":99} unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general {"name":"John","address":{"street":"Main St","city":"New York"}} +3 {"x":100,"y":88} unknown 21 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general {"name":"John","address":{"street":"Main St","city":"New York"}} PREHOOK: query: CREATE TABLE ice_orc ( id INT) STORED BY ICEBERG stored as orc @@ -226,7 +254,14 @@ PREHOOK: query: ALTER TABLE ice_orc ADD COLUMNS (point STRUCT DEFA created_date DATE DEFAULT '2024-01-01', created_ts TIMESTAMP DEFAULT '2024-01-01T10:00:00', score DECIMAL(5,2) DEFAULT 100.00, - category STRING DEFAULT 'general') + category STRING DEFAULT 'general', + person STRUCT< + name: STRING, + address: STRUCT< + street: STRING, + city: STRING + > + > DEFAULT '{"name":"John","address":{"street":"Main St","city":"New York"}}') PREHOOK: type: ALTERTABLE_ADDCOLS PREHOOK: Input: default@ice_orc PREHOOK: Output: 
default@ice_orc @@ -238,7 +273,14 @@ POSTHOOK: query: ALTER TABLE ice_orc ADD COLUMNS (point STRUCT DEF created_date DATE DEFAULT '2024-01-01', created_ts TIMESTAMP DEFAULT '2024-01-01T10:00:00', score DECIMAL(5,2) DEFAULT 100.00, - category STRING DEFAULT 'general') + category STRING DEFAULT 'general', + person STRUCT< + name: STRING, + address: STRUCT< + street: STRING, + city: STRING + > + > DEFAULT '{"name":"John","address":{"street":"Main St","city":"New York"}}') POSTHOOK: type: ALTERTABLE_ADDCOLS POSTHOOK: Input: default@ice_orc POSTHOOK: Output: default@ice_orc @@ -258,8 +300,8 @@ POSTHOOK: query: SELECT * FROM ice_orc ORDER BY id POSTHOOK: type: QUERY POSTHOOK: Input: default@ice_orc POSTHOOK: Output: hdfs://### HDFS PATH ### -1 NULL NULL NULL NULL NULL NULL NULL NULL NULL -2 {"x":100,"y":99} unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general +1 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL +2 {"x":100,"y":99} unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general {"name":"John","address":{"street":"Main St","city":"New York"}} PREHOOK: query: ALTER TABLE ice_orc CHANGE COLUMN point point STRUCT DEFAULT '{"x":100,"y":88}' PREHOOK: type: ALTERTABLE_RENAMECOL PREHOOK: Input: default@ice_orc @@ -292,9 +334,9 @@ POSTHOOK: query: SELECT * FROM ice_orc ORDER BY id POSTHOOK: type: QUERY POSTHOOK: Input: default@ice_orc POSTHOOK: Output: hdfs://### HDFS PATH ### -1 NULL NULL NULL NULL NULL NULL NULL NULL NULL -2 {"x":100,"y":99} unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general -3 {"x":100,"y":88} unknown 21 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general +1 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL +2 {"x":100,"y":99} unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general {"name":"John","address":{"street":"Main St","city":"New York"}} +3 {"x":100,"y":88} unknown 21 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general {"name":"John","address":{"street":"Main 
St","city":"New York"}} PREHOOK: query: SELECT * FROM ice_parq ORDER BY id PREHOOK: type: QUERY PREHOOK: Input: default@ice_parq @@ -303,9 +345,9 @@ POSTHOOK: query: SELECT * FROM ice_parq ORDER BY id POSTHOOK: type: QUERY POSTHOOK: Input: default@ice_parq POSTHOOK: Output: hdfs://### HDFS PATH ### -1 NULL unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general -2 {"x":100,"y":99} unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general -3 {"x":100,"y":88} unknown 21 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general +1 {"x":100,"y":99} unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general {"name":"John","address":{"street":"Main St","city":"New York"}} +2 {"x":100,"y":99} unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general {"name":"John","address":{"street":"Main St","city":"New York"}} +3 {"x":100,"y":88} unknown 21 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general {"name":"John","address":{"street":"Main St","city":"New York"}} PREHOOK: query: SELECT * FROM ice_avro ORDER BY id PREHOOK: type: QUERY PREHOOK: Input: default@ice_avro @@ -314,9 +356,9 @@ POSTHOOK: query: SELECT * FROM ice_avro ORDER BY id POSTHOOK: type: QUERY POSTHOOK: Input: default@ice_avro POSTHOOK: Output: hdfs://### HDFS PATH ### -1 NULL unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general -2 {"x":100,"y":99} unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general -3 {"x":100,"y":88} unknown 21 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general +1 {"x":100,"y":99} unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general {"name":"John","address":{"street":"Main St","city":"New York"}} +2 {"x":100,"y":99} unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general {"name":"John","address":{"street":"Main St","city":"New York"}} +3 {"x":100,"y":88} unknown 21 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general {"name":"John","address":{"street":"Main 
St","city":"New York"}} PREHOOK: query: SELECT * FROM ice_orc ORDER BY id PREHOOK: type: QUERY PREHOOK: Input: default@ice_orc @@ -325,6 +367,6 @@ POSTHOOK: query: SELECT * FROM ice_orc ORDER BY id POSTHOOK: type: QUERY POSTHOOK: Input: default@ice_orc POSTHOOK: Output: hdfs://### HDFS PATH ### -1 NULL NULL NULL NULL NULL NULL NULL NULL NULL -2 {"x":100,"y":99} unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general -3 {"x":100,"y":88} unknown 21 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general +1 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL +2 {"x":100,"y":99} unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general {"name":"John","address":{"street":"Main St","city":"New York"}} +3 {"x":100,"y":88} unknown 21 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general {"name":"John","address":{"street":"Main St","city":"New York"}} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedDummyColumnReader.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedDummyColumnReader.java index e8d95ccd1587..e4a2e931ea0b 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedDummyColumnReader.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedDummyColumnReader.java @@ -24,14 +24,19 @@ import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector; import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector; import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector; import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; import java.io.IOException; import java.nio.charset.StandardCharsets; import 
java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.Map; /** * A dummy vectorized parquet reader used for schema evolution. @@ -49,10 +54,13 @@ public VectorizedDummyColumnReader(Object defaultValue) { @Override public void readBatch(int total, ColumnVector col, TypeInfo typeInfo) throws IOException { + fillColumnWithDefault(col, typeInfo, defaultValue); + } + private void fillColumnWithDefault(ColumnVector col, TypeInfo typeInfo, Object value) throws IOException { col.isRepeating = true; // Case 1: No default → (all nulls) - if (defaultValue == null) { + if (value == null) { Arrays.fill(col.isNull, true); col.noNulls = false; return; @@ -63,12 +71,27 @@ public void readBatch(int total, ColumnVector col, TypeInfo typeInfo) throws IOE col.isNull[0] = false; if (typeInfo.getCategory() == ObjectInspector.Category.PRIMITIVE) { - fillPrimitive(col, (PrimitiveTypeInfo) typeInfo, defaultValue); + fillPrimitive(col, (PrimitiveTypeInfo) typeInfo, value); + } else if (typeInfo.getCategory() == ObjectInspector.Category.STRUCT) { + fillStruct(col, (StructTypeInfo) typeInfo, value); } else { throw new IOException("Unsupported type category in DummyColumnReader: " + typeInfo.getCategory()); } } + private void fillStruct(ColumnVector col, StructTypeInfo structTypeInfo, Object defaultValue) throws IOException { + StructColumnVector structCol = (StructColumnVector) col; + List fieldNames = structTypeInfo.getAllStructFieldNames(); + List fieldTypes = structTypeInfo.getAllStructFieldTypeInfos(); + Map fieldDefaults = defaultValue instanceof Map ? + (Map) defaultValue : + Collections.emptyMap(); + + for (int i = 0; i < fieldNames.size(); i++) { + fillColumnWithDefault(structCol.fields[i], fieldTypes.get(i), fieldDefaults.get(fieldNames.get(i))); + } + } + /** * Fill the column with the given value. * @param col the column to fill