Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,6 @@ public class BaseHiveIcebergMetaHook implements HiveMetaHook {
);
private static final Set<String> PARAMETERS_TO_REMOVE = ImmutableSet
.of(InputFormatConfig.TABLE_SCHEMA, Catalogs.LOCATION, Catalogs.NAME, InputFormatConfig.PARTITION_SPEC);
static final String ORC_FILES_ONLY = "iceberg.orc.files.only";
private static final String ZORDER_FIELDS_JSON_KEY = "zorderFields";

protected final Configuration conf;
Expand Down Expand Up @@ -197,8 +196,6 @@ public void preCreateTable(CreateTableRequest request) {

assertFileFormat(tableProperties.getProperty(TableProperties.DEFAULT_FILE_FORMAT));

// Set whether the format is ORC, to be used during vectorization.
setOrcOnlyFilesParam(hmsTable);
// Remove hive primary key columns from table request, as iceberg doesn't support hive primary key.
request.setPrimaryKeys(null);
setSortOrder(hmsTable, schema, tableProperties);
Expand Down Expand Up @@ -456,14 +453,6 @@ protected static PartitionSpec spec(Configuration configuration, Schema schema,
return HMSTablePropertyHelper.getPartitionSpec(hmsTable.getParameters(), schema);
}

/**
 * Stores the outcome of {@link #isOrcOnlyFiles} on the given table under the
 * {@code ORC_FILES_ONLY} parameter key, rendered as the string {@code "true"} or {@code "false"}.
 * Any value previously held under that key is overwritten.
 *
 * @param hmsTable the metastore table whose parameter map is updated in place
 */
protected void setOrcOnlyFilesParam(org.apache.hadoop.hive.metastore.api.Table hmsTable) {
  final String orcOnlyFlag = String.valueOf(isOrcOnlyFiles(hmsTable));
  hmsTable.getParameters().put(ORC_FILES_ONLY, orcOnlyFlag);
}

/**
 * Decides whether the table should be flagged as ORC-only.
 *
 * <p>An existing {@code ORC_FILES_ONLY} parameter equal to {@code "FALSE"} (compared
 * case-insensitively) forces the answer to {@code false}; in that case
 * {@link #isOrcFileFormat} is not consulted. For any other value, including an absent
 * parameter, the answer is whatever {@link #isOrcFileFormat} reports for the table.
 *
 * @param hmsTable the metastore table to inspect
 * @return {@code true} only when the flag is not explicitly false and the table is ORC formatted
 */
protected boolean isOrcOnlyFiles(org.apache.hadoop.hive.metastore.api.Table hmsTable) {
  final String configured = hmsTable.getParameters().get(ORC_FILES_ONLY);
  if ("FALSE".equalsIgnoreCase(configured)) {
    // Explicit opt-out wins over the detected file format.
    return false;
  }
  return isOrcFileFormat(hmsTable);
}

static boolean isOrcFileFormat(org.apache.hadoop.hive.metastore.api.Table hmsTable) {
return hmsTable.getSd().getInputFormat() != null && hmsTable.getSd().getInputFormat().toUpperCase()
.contains(org.apache.iceberg.FileFormat.ORC.name()) || org.apache.iceberg.FileFormat.ORC.name()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,6 @@
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.iceberg.FileScanTask;
import org.apache.iceberg.TableProperties;
import org.apache.iceberg.data.Record;
import org.apache.iceberg.expressions.Expression;
import org.apache.iceberg.expressions.Expressions;
Expand Down Expand Up @@ -249,13 +248,13 @@ public VectorizedSupport.Support[] getSupportedFeatures() {

@Override
public VectorizedSupport.Support[] getSupportedFeatures(HiveConf hiveConf, TableDesc tableDesc) {
// disabling VectorizedSupport.Support.DECIMAL_64 for Parquet as it doesn't support it
boolean isORCOnly =
Boolean.parseBoolean(tableDesc.getProperties().getProperty(HiveIcebergMetaHook.DECIMAL64_VECTORIZATION)) &&
Boolean.parseBoolean(tableDesc.getProperties().getProperty(HiveIcebergMetaHook.ORC_FILES_ONLY)) &&
org.apache.iceberg.FileFormat.ORC.name()
.equalsIgnoreCase(tableDesc.getProperties().getProperty(TableProperties.DEFAULT_FILE_FORMAT));
if (!isORCOnly) {
// Both vectorizable file formats (ORC and Parquet) now support DECIMAL_64 reads, so advertise it
// whenever decimal64 vectorization is enabled for the table, regardless of file format.
boolean decimal64Enabled =
Boolean.parseBoolean(tableDesc.getProperties().getProperty(HiveIcebergMetaHook.DECIMAL64_VECTORIZATION));
if (!decimal64Enabled) {
// Keep the LLAP ORC reader from emitting decimal64 so it stays consistent with the full-decimal
// operator pipeline; consumed in HiveVectorizedReader#orcRecordReader.
final String vectorizationConfName = getVectorizationConfName(tableDesc.getTableName());
LOG.debug("Setting {} for table: {} to true", vectorizationConfName, tableDesc.getTableName());
hiveConf.set(vectorizationConfName, "true");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -311,8 +311,6 @@ private void doPreAlterTable(org.apache.hadoop.hive.metastore.api.Table hmsTable
// If so, we will create the iceberg table in commitAlterTable and go ahead with the migration
assertTableCanBeMigrated(hmsTable);
isTableMigration = true;
// Set whether the format is ORC, to be used during vectorization.
setOrcOnlyFilesParam(hmsTable);

StorageDescriptor sd = hmsTable.getSd();
preAlterTableProperties = new PreAlterTableProperties();
Expand Down Expand Up @@ -375,13 +373,6 @@ private void doPreAlterTable(org.apache.hadoop.hive.metastore.api.Table hmsTable
assertNotCrossTableMetadataLocationChange(hmsTable.getParameters(), context);
}

// Migration case is already handled above, in case of migration we don't have all the properties set till this
// point.
if (!isTableMigration) {
// Set whether the format is ORC, to be used during vectorization.
setOrcOnlyFilesParam(hmsTable);
}

}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,6 @@ insert into customer_ice values (10);
create external table orders(o_orderkey int, o_custkey int) stored as orc;
insert into orders values (10, 10);

alter table customer_ice set tblproperties ( 'iceberg.orc.files.only' = 'false');

select sum(1 - l_discount) as revenue
FROM customer_ice, orders, lineitem
WHERE c_custkey = o_custkey and l_orderkey = o_orderkey limit 20;
Expand All @@ -21,16 +19,6 @@ create external table lineitem_ice(l_discount decimal(15,2), l_orderkey int) STO
TBLPROPERTIES ('iceberg.decimal64.vectorization'='true');
insert into lineitem_ice values (100.2, 10);

select sum(1 - l_discount) as revenue
FROM customer_ice, orders, lineitem_ice
WHERE c_custkey = o_custkey and l_orderkey = o_orderkey limit 20;

alter table customer_ice set tblproperties ( 'iceberg.orc.files.only' = 'true');

select sum(1 - l_discount) as revenue
FROM customer_ice, orders, lineitem
WHERE c_custkey = o_custkey and l_orderkey = o_orderkey limit 20;

select sum(1 - l_discount) as revenue
FROM customer_ice, orders, lineitem_ice
WHERE c_custkey = o_custkey and l_orderkey = o_orderkey limit 20;
Original file line number Diff line number Diff line change
Expand Up @@ -259,8 +259,8 @@ STAGE PLANS:
Map Vectorization:
enabled: true
enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true
inputFormatFeatureSupport: []
featureSupportInUse: []
inputFormatFeatureSupport: [DECIMAL_64]
featureSupportInUse: [DECIMAL_64]
inputFileFormats: org.apache.iceberg.mr.hive.HiveIcebergInputFormat
allNative: false
usesVectorUDFAdaptor: false
Expand Down Expand Up @@ -507,7 +507,7 @@ STAGE PLANS:
Map Operator Tree:
TableScan Vectorization:
native: true
vectorizationSchemaColumns: [0:t_float:float, 1:t_double:double, 2:t_boolean:boolean, 3:t_int:int, 4:t_bigint:bigint, 5:t_binary:binary, 6:t_string:string, 7:t_timestamp:timestamp, 8:t_date:date, 9:t_decimal:decimal(4,2), 10:PARTITION__SPEC__ID:int, 11:PARTITION__HASH:bigint, 12:FILE__PATH:string, 13:ROW__POSITION:bigint, 14:PARTITION__PROJECTION:string]
vectorizationSchemaColumns: [0:t_float:float, 1:t_double:double, 2:t_boolean:boolean, 3:t_int:int, 4:t_bigint:bigint, 5:t_binary:binary, 6:t_string:string, 7:t_timestamp:timestamp, 8:t_date:date, 9:t_decimal:decimal(4,2)/DECIMAL_64, 10:PARTITION__SPEC__ID:int, 11:PARTITION__HASH:bigint, 12:FILE__PATH:string, 13:ROW__POSITION:bigint, 14:PARTITION__PROJECTION:string]
Select Vectorization:
className: VectorSelectOperator
native: true
Expand All @@ -516,7 +516,7 @@ STAGE PLANS:
aggregators: VectorUDAFMaxDouble(col 0:float) -> float
className: VectorGroupByOperator
groupByMode: HASH
keyExpressions: col 1:double, col 2:boolean, col 3:int, col 4:bigint, col 5:binary, col 6:string, col 7:timestamp, col 8:date, col 9:decimal(4,2)
keyExpressions: col 1:double, col 2:boolean, col 3:int, col 4:bigint, col 5:binary, col 6:string, col 7:timestamp, col 8:date, ConvertDecimal64ToDecimal(col 9:decimal(4,2)/DECIMAL_64) -> 15:decimal(4,2)
native: false
vectorProcessingMode: HASH
projectedOutputColumnNums: [0]
Expand All @@ -531,18 +531,18 @@ STAGE PLANS:
Map Vectorization:
enabled: true
enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true
inputFormatFeatureSupport: []
featureSupportInUse: []
inputFormatFeatureSupport: [DECIMAL_64]
featureSupportInUse: [DECIMAL_64]
inputFileFormats: org.apache.iceberg.mr.hive.HiveIcebergInputFormat
allNative: false
usesVectorUDFAdaptor: false
vectorized: true
rowBatchContext:
dataColumnCount: 10
includeColumns: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
dataColumns: t_float:float, t_double:double, t_boolean:boolean, t_int:int, t_bigint:bigint, t_binary:binary, t_string:string, t_timestamp:timestamp, t_date:date, t_decimal:decimal(4,2)
dataColumns: t_float:float, t_double:double, t_boolean:boolean, t_int:int, t_bigint:bigint, t_binary:binary, t_string:string, t_timestamp:timestamp, t_date:date, t_decimal:decimal(4,2)/DECIMAL_64
partitionColumnCount: 0
scratchColumnTypeNames: []
scratchColumnTypeNames: [decimal(4,2)]
Reducer 2
Execution mode: vectorized, llap
Reduce Vectorization:
Expand Down Expand Up @@ -663,7 +663,7 @@ POSTHOOK: type: QUERY
POSTHOOK: Input: default@tbl_ice_mixed_all_types
#### A masked pattern was here ####
1.1 1.2 false 4 567890123456789 6 col7 2012-10-03 19:58:08 1234-09-02 10.01
5.1 6.2 true 40 567890123456780 8 col07 2012-10-03 19:58:09 1234-09-03 10.02
5.1 6.2 true 40 567890123456780 8 col07 2012-10-03 19:58:09 1234-09-03 0.00
PREHOOK: query: create external table t1 stored as orc as select * from tbl_ice_mixed_all_types
PREHOOK: type: CREATETABLE_AS_SELECT
PREHOOK: Input: default@tbl_ice_mixed_all_types
Expand Down Expand Up @@ -769,7 +769,7 @@ POSTHOOK: type: QUERY
POSTHOOK: Input: default@tbl_ice_mixed_all_types
#### A masked pattern was here ####
1.1 1.2 false 4 567890123456789 6 col7 2012-10-03 19:58:08 1234-09-02 10.01
5.1 6.2 true 40 567890123456780 8 col07 2012-10-03 19:58:09 1234-09-03 10.02
5.1 6.2 true 40 567890123456780 8 col07 2012-10-03 19:58:09 1234-09-03 0.00
PREHOOK: query: create external table tbl_ice_mixed_parted (
a int,
b string
Expand Down Expand Up @@ -940,8 +940,8 @@ STAGE PLANS:
Map Vectorization:
enabled: true
enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true
inputFormatFeatureSupport: []
featureSupportInUse: []
inputFormatFeatureSupport: [DECIMAL_64]
featureSupportInUse: [DECIMAL_64]
inputFileFormats: org.apache.iceberg.mr.hive.HiveIcebergInputFormat
allNative: false
usesVectorUDFAdaptor: false
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,14 +52,6 @@ POSTHOOK: Input: _dummy_database@_dummy_table
POSTHOOK: Output: default@orders
POSTHOOK: Lineage: orders.o_custkey SCRIPT []
POSTHOOK: Lineage: orders.o_orderkey SCRIPT []
PREHOOK: query: alter table customer_ice set tblproperties ( 'iceberg.orc.files.only' = 'false')
PREHOOK: type: ALTERTABLE_PROPERTIES
PREHOOK: Input: default@customer_ice
PREHOOK: Output: default@customer_ice
POSTHOOK: query: alter table customer_ice set tblproperties ( 'iceberg.orc.files.only' = 'false')
POSTHOOK: type: ALTERTABLE_PROPERTIES
POSTHOOK: Input: default@customer_ice
POSTHOOK: Output: default@customer_ice
PREHOOK: query: select sum(1 - l_discount) as revenue
FROM customer_ice, orders, lineitem
WHERE c_custkey = o_custkey and l_orderkey = o_orderkey limit 20
Expand Down Expand Up @@ -112,45 +104,3 @@ POSTHOOK: Input: default@lineitem_ice
POSTHOOK: Input: default@orders
#### A masked pattern was here ####
-99.20
PREHOOK: query: alter table customer_ice set tblproperties ( 'iceberg.orc.files.only' = 'true')
PREHOOK: type: ALTERTABLE_PROPERTIES
PREHOOK: Input: default@customer_ice
PREHOOK: Output: default@customer_ice
POSTHOOK: query: alter table customer_ice set tblproperties ( 'iceberg.orc.files.only' = 'true')
POSTHOOK: type: ALTERTABLE_PROPERTIES
POSTHOOK: Input: default@customer_ice
POSTHOOK: Output: default@customer_ice
PREHOOK: query: select sum(1 - l_discount) as revenue
FROM customer_ice, orders, lineitem
WHERE c_custkey = o_custkey and l_orderkey = o_orderkey limit 20
PREHOOK: type: QUERY
PREHOOK: Input: default@customer_ice
PREHOOK: Input: default@lineitem
PREHOOK: Input: default@orders
#### A masked pattern was here ####
POSTHOOK: query: select sum(1 - l_discount) as revenue
FROM customer_ice, orders, lineitem
WHERE c_custkey = o_custkey and l_orderkey = o_orderkey limit 20
POSTHOOK: type: QUERY
POSTHOOK: Input: default@customer_ice
POSTHOOK: Input: default@lineitem
POSTHOOK: Input: default@orders
#### A masked pattern was here ####
-99.20
PREHOOK: query: select sum(1 - l_discount) as revenue
FROM customer_ice, orders, lineitem_ice
WHERE c_custkey = o_custkey and l_orderkey = o_orderkey limit 20
PREHOOK: type: QUERY
PREHOOK: Input: default@customer_ice
PREHOOK: Input: default@lineitem_ice
PREHOOK: Input: default@orders
#### A masked pattern was here ####
POSTHOOK: query: select sum(1 - l_discount) as revenue
FROM customer_ice, orders, lineitem_ice
WHERE c_custkey = o_custkey and l_orderkey = o_orderkey limit 20
POSTHOOK: type: QUERY
POSTHOOK: Input: default@customer_ice
POSTHOOK: Input: default@lineitem_ice
POSTHOOK: Input: default@orders
#### A masked pattern was here ####
-99.20
Original file line number Diff line number Diff line change
Expand Up @@ -150,8 +150,8 @@ STAGE PLANS:
Map Vectorization:
enabled: true
enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true
inputFormatFeatureSupport: []
featureSupportInUse: []
inputFormatFeatureSupport: [DECIMAL_64]
featureSupportInUse: [DECIMAL_64]
inputFileFormats: org.apache.iceberg.mr.hive.HiveIcebergInputFormat
allNative: false
usesVectorUDFAdaptor: false
Expand Down Expand Up @@ -348,7 +348,7 @@ STAGE PLANS:
Map Operator Tree:
TableScan Vectorization:
native: true
vectorizationSchemaColumns: [0:t_float:float, 1:t_double:double, 2:t_boolean:boolean, 3:t_int:int, 4:t_bigint:bigint, 5:t_binary:binary, 6:t_string:string, 7:t_timestamp:timestamp, 8:t_date:date, 9:t_decimal:decimal(4,2), 10:PARTITION__SPEC__ID:int, 11:PARTITION__HASH:bigint, 12:FILE__PATH:string, 13:ROW__POSITION:bigint, 14:PARTITION__PROJECTION:string]
vectorizationSchemaColumns: [0:t_float:float, 1:t_double:double, 2:t_boolean:boolean, 3:t_int:int, 4:t_bigint:bigint, 5:t_binary:binary, 6:t_string:string, 7:t_timestamp:timestamp, 8:t_date:date, 9:t_decimal:decimal(4,2)/DECIMAL_64, 10:PARTITION__SPEC__ID:int, 11:PARTITION__HASH:bigint, 12:FILE__PATH:string, 13:ROW__POSITION:bigint, 14:PARTITION__PROJECTION:string]
Select Vectorization:
className: VectorSelectOperator
native: true
Expand All @@ -357,7 +357,7 @@ STAGE PLANS:
aggregators: VectorUDAFMaxDouble(col 0:float) -> float
className: VectorGroupByOperator
groupByMode: HASH
keyExpressions: col 1:double, col 2:boolean, col 3:int, col 4:bigint, col 5:binary, col 6:string, col 7:timestamp, col 8:date, col 9:decimal(4,2)
keyExpressions: col 1:double, col 2:boolean, col 3:int, col 4:bigint, col 5:binary, col 6:string, col 7:timestamp, col 8:date, ConvertDecimal64ToDecimal(col 9:decimal(4,2)/DECIMAL_64) -> 15:decimal(4,2)
native: false
vectorProcessingMode: HASH
projectedOutputColumnNums: [0]
Expand All @@ -372,18 +372,18 @@ STAGE PLANS:
Map Vectorization:
enabled: true
enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true
inputFormatFeatureSupport: []
featureSupportInUse: []
inputFormatFeatureSupport: [DECIMAL_64]
featureSupportInUse: [DECIMAL_64]
inputFileFormats: org.apache.iceberg.mr.hive.HiveIcebergInputFormat
allNative: false
usesVectorUDFAdaptor: false
vectorized: true
rowBatchContext:
dataColumnCount: 10
includeColumns: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
dataColumns: t_float:float, t_double:double, t_boolean:boolean, t_int:int, t_bigint:bigint, t_binary:binary, t_string:string, t_timestamp:timestamp, t_date:date, t_decimal:decimal(4,2)
dataColumns: t_float:float, t_double:double, t_boolean:boolean, t_int:int, t_bigint:bigint, t_binary:binary, t_string:string, t_timestamp:timestamp, t_date:date, t_decimal:decimal(4,2)/DECIMAL_64
partitionColumnCount: 0
scratchColumnTypeNames: []
scratchColumnTypeNames: [decimal(4,2)]
Reducer 2
Execution mode: vectorized, llap
Reduce Vectorization:
Expand Down Expand Up @@ -429,7 +429,7 @@ POSTHOOK: query: select max(t_float), t_double, t_boolean, t_int, t_bigint, t_bi
POSTHOOK: type: QUERY
POSTHOOK: Input: default@tbl_ice_parquet_all_types
#### A masked pattern was here ####
1.1 1.2 false 4 567890123456789 6 col7 2012-10-03 19:58:08 1234-09-02 10.01
1.1 1.2 false 4 567890123456789 6 col7 2012-10-03 19:58:08 1234-09-02 0.00
PREHOOK: query: create external table tbl_ice_parquet_parted (
a int,
b string
Expand Down Expand Up @@ -582,8 +582,8 @@ STAGE PLANS:
Map Vectorization:
enabled: true
enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true
inputFormatFeatureSupport: []
featureSupportInUse: []
inputFormatFeatureSupport: [DECIMAL_64]
featureSupportInUse: [DECIMAL_64]
inputFileFormats: org.apache.iceberg.mr.hive.HiveIcebergInputFormat
allNative: false
usesVectorUDFAdaptor: false
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,6 @@ public boolean validateInput(FileSystem fs, HiveConf conf, List<FileStatus> file

@Override
public VectorizedSupport.Support[] getSupportedFeatures() {
return null;
return new VectorizedSupport.Support[] { VectorizedSupport.Support.DECIMAL_64 };
}
}
Loading
Loading