Skip to content
Original file line number Diff line number Diff line change
Expand Up @@ -48,10 +48,10 @@ STAGE PLANS:
Map Operator Tree:
TableScan
alias: tbl_ice
filterExpr: (((b) IN ('four', 'one') or (a = 22)) is null or (((b < 'four') or ((b > 'four') and (b < 'one')) or (b > 'one')) and (a <> 22)) or (b) IN ('four', 'one') or (a = 22)) (type: boolean)
filterExpr: (((b) IN ('four', 'one') or (a = 22)) is null or ((b <> 'four') and (b <> 'one') and (a <> 22)) or (b) IN ('four', 'one') or (a = 22)) (type: boolean)
Statistics: Num rows: 7 Data size: 672 Basic stats: COMPLETE Column stats: COMPLETE
Filter Operator
predicate: ((((b) IN ('four', 'one') or (a = 22)) is null or (((b < 'four') or ((b > 'four') and (b < 'one')) or (b > 'one')) and (a <> 22))) and FILE__PATH is not null) (type: boolean)
predicate: ((((b) IN ('four', 'one') or (a = 22)) is null or ((b <> 'four') and (b <> 'one') and (a <> 22))) and FILE__PATH is not null) (type: boolean)
Statistics: Num rows: 7 Data size: 672 Basic stats: COMPLETE Column stats: COMPLETE
Select Operator
expressions: a (type: int), b (type: string), c (type: int), PARTITION__SPEC__ID (type: int), PARTITION__HASH (type: bigint), FILE__PATH (type: string), ROW__POSITION (type: bigint), PARTITION__PROJECTION (type: string)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -150,27 +150,27 @@ Stage-0
File Output Operator [FS_61]
Limit [LIM_60] (rows=20 width=447)
Number of rows:20
Select Operator [SEL_59] (rows=473 width=447)
Select Operator [SEL_59] (rows=791 width=447)
Output:["_col0","_col1","_col2","_col3","_col4"]
<-Map 1 [SIMPLE_EDGE] vectorized, llap
SHUFFLE [RS_58]
Top N Key Operator [TNK_57] (rows=473 width=447)
Top N Key Operator [TNK_57] (rows=791 width=447)
keys:_col0,top n:20
Map Join Operator [MAPJOIN_56] (rows=473 width=447)
Map Join Operator [MAPJOIN_56] (rows=791 width=447)
BucketMapJoin:true,Conds:SEL_55._col0, _col1=RS_53._col0, _col1(Inner),Output:["_col0","_col1","_col2","_col3","_col4"]
<-Map 3 [CUSTOM_EDGE] vectorized, llap
MULTICAST [RS_53]
PartitionCols:_col0, _col1
Select Operator [SEL_52] (rows=387 width=178)
Select Operator [SEL_52] (rows=500 width=178)
Output:["_col0","_col1"]
Filter Operator [FIL_51] (rows=387 width=178)
predicate:(((key < '0') or ((key > '0') and (key < '100')) or (key > '100')) and value is not null)
Filter Operator [FIL_51] (rows=500 width=178)
predicate:((key <> '0') and (key <> '100') and value is not null)
TableScan [TS_3] (rows=500 width=178)
default@src,b,Tbl:COMPLETE,Col:COMPLETE,Output:["key","value"]
<-Select Operator [SEL_55] (rows=387 width=269)
<-Select Operator [SEL_55] (rows=500 width=269)
Output:["_col0","_col1","_col2"]
Filter Operator [FIL_54] (rows=387 width=269)
predicate:(((key1 < '0') or ((key1 > '0') and (key1 < '100')) or (key1 > '100')) and key2 is not null)
Filter Operator [FIL_54] (rows=500 width=269)
predicate:((key1 <> '0') and (key1 <> '100') and key2 is not null)
TableScan [TS_0] (rows=500 width=269)
default@srcbucket_big,a,Tbl:COMPLETE,Col:COMPLETE,Grouping Num Buckets:8,Grouping Partition Columns:["key1","key2"],Output:["key1","key2","value"]

Expand Down Expand Up @@ -346,27 +346,27 @@ Stage-0
File Output Operator [FS_41]
Limit [LIM_40] (rows=20 width=447)
Number of rows:20
Select Operator [SEL_39] (rows=473 width=447)
Select Operator [SEL_39] (rows=791 width=447)
Output:["_col0","_col1","_col2","_col3","_col4"]
<-Map 1 [SIMPLE_EDGE] vectorized, llap
SHUFFLE [RS_38]
Top N Key Operator [TNK_37] (rows=473 width=447)
Top N Key Operator [TNK_37] (rows=791 width=447)
keys:_col0,top n:20
Map Join Operator [MAPJOIN_36] (rows=473 width=447)
Map Join Operator [MAPJOIN_36] (rows=791 width=447)
BucketMapJoin:true,Conds:SEL_35._col0=RS_33._col0(Inner),Output:["_col0","_col1","_col2","_col3","_col4"]
<-Map 3 [CUSTOM_EDGE] vectorized, llap
MULTICAST [RS_33]
PartitionCols:_col0
Select Operator [SEL_32] (rows=387 width=178)
Select Operator [SEL_32] (rows=500 width=178)
Output:["_col0","_col1"]
Filter Operator [FIL_31] (rows=387 width=178)
predicate:((key < '0') or (key > '100') or ((key > '0') and (key < '100')))
Filter Operator [FIL_31] (rows=500 width=178)
predicate:((key <> '0') and (key <> '100'))
TableScan [TS_3] (rows=500 width=178)
default@src,b,Tbl:COMPLETE,Col:COMPLETE,Output:["key","value"]
<-Select Operator [SEL_35] (rows=387 width=269)
<-Select Operator [SEL_35] (rows=500 width=269)
Output:["_col0","_col1","_col2"]
Filter Operator [FIL_34] (rows=387 width=269)
predicate:((key1 < '0') or (key1 > '100') or ((key1 > '0') and (key1 < '100')))
Filter Operator [FIL_34] (rows=500 width=269)
predicate:((key1 <> '0') and (key1 <> '100'))
TableScan [TS_0] (rows=500 width=269)
default@srcbucket_big,a,Tbl:COMPLETE,Col:COMPLETE,Grouping Num Buckets:4,Grouping Partition Columns:["key1"],Output:["key1","key2","value"]

Expand Down Expand Up @@ -435,40 +435,40 @@ POSTHOOK: Input: default@srcbucket_big
Plan optimized by CBO.

Vertex dependency in root stage
Map 2 <- Map 1 (BROADCAST_EDGE)
Reducer 3 <- Map 2 (SIMPLE_EDGE)
Map 1 <- Map 3 (CUSTOM_EDGE)
Reducer 2 <- Map 1 (SIMPLE_EDGE)

Stage-0
Fetch Operator
limit:20
Stage-1
Reducer 3 vectorized, llap
Reducer 2 vectorized, llap
File Output Operator [FS_41]
Limit [LIM_40] (rows=20 width=447)
Number of rows:20
Select Operator [SEL_39] (rows=612 width=447)
Select Operator [SEL_39] (rows=791 width=447)
Output:["_col0","_col1","_col2","_col3","_col4"]
<-Map 2 [SIMPLE_EDGE] vectorized, llap
<-Map 1 [SIMPLE_EDGE] vectorized, llap
SHUFFLE [RS_38]
Top N Key Operator [TNK_37] (rows=612 width=447)
Top N Key Operator [TNK_37] (rows=791 width=447)
keys:_col0,top n:20
Map Join Operator [MAPJOIN_36] (rows=612 width=447)
Conds:RS_33._col0=SEL_35._col0(Inner),Output:["_col0","_col1","_col2","_col3","_col4"]
<-Map 1 [BROADCAST_EDGE] vectorized, llap
BROADCAST [RS_33]
Map Join Operator [MAPJOIN_36] (rows=791 width=447)
BucketMapJoin:true,Conds:SEL_35._col0=RS_33._col0(Inner),Output:["_col0","_col1","_col2","_col3","_col4"]
<-Map 3 [CUSTOM_EDGE] vectorized, llap
MULTICAST [RS_33]
PartitionCols:_col0
Select Operator [SEL_32] (rows=387 width=269)
Output:["_col0","_col1","_col2"]
Filter Operator [FIL_31] (rows=387 width=269)
predicate:(((key2 < 'val_0') or ((key2 > 'val_0') and (key2 < 'val_100')) or (key2 > 'val_100')) and key1 is not null)
TableScan [TS_0] (rows=500 width=269)
default@srcbucket_big,a,Tbl:COMPLETE,Col:COMPLETE,Output:["key1","key2","value"]
<-Select Operator [SEL_35] (rows=500 width=178)
Output:["_col0","_col1"]
Filter Operator [FIL_34] (rows=500 width=178)
predicate:key is not null
TableScan [TS_3] (rows=500 width=178)
default@src,b,Tbl:COMPLETE,Col:COMPLETE,Output:["key","value"]
Select Operator [SEL_32] (rows=500 width=178)
Output:["_col0","_col1"]
Filter Operator [FIL_31] (rows=500 width=178)
predicate:key is not null
TableScan [TS_3] (rows=500 width=178)
default@src,b,Tbl:COMPLETE,Col:COMPLETE,Output:["key","value"]
<-Select Operator [SEL_35] (rows=500 width=269)
Output:["_col0","_col1","_col2"]
Filter Operator [FIL_34] (rows=500 width=269)
predicate:((key2 <> 'val_0') and (key2 <> 'val_100') and key1 is not null)
TableScan [TS_0] (rows=500 width=269)
default@srcbucket_big,a,Tbl:COMPLETE,Col:COMPLETE,Grouping Num Buckets:4,Grouping Partition Columns:["key1"],Output:["key1","key2","value"]

PREHOOK: query: SELECT *
FROM srcbucket_big a
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -71,10 +71,10 @@ STAGE PLANS:
Map Operator Tree:
TableScan
alias: tbl_ice
filterExpr: (((b) IN ('four', 'one') or (a = 22)) is null or (((b < 'four') or ((b > 'four') and (b < 'one')) or (b > 'one')) and (a <> 22))) (type: boolean)
filterExpr: (((b) IN ('four', 'one') or (a = 22)) is null or ((b <> 'four') and (b <> 'one') and (a <> 22))) (type: boolean)
Statistics: Num rows: 1 Data size: 84 Basic stats: COMPLETE Column stats: PARTIAL
Filter Operator
predicate: ((((b) IN ('four', 'one') or (a = 22)) is null or (((b < 'four') or ((b > 'four') and (b < 'one')) or (b > 'one')) and (a <> 22))) and FILE__PATH is not null) (type: boolean)
predicate: ((((b) IN ('four', 'one') or (a = 22)) is null or ((b <> 'four') and (b <> 'one') and (a <> 22))) and FILE__PATH is not null) (type: boolean)
Statistics: Num rows: 1 Data size: 84 Basic stats: COMPLETE Column stats: PARTIAL
Select Operator
expressions: a (type: int), b (type: string), c (type: int), PARTITION__SPEC__ID (type: int), PARTITION__HASH (type: bigint), FILE__PATH (type: string), ROW__POSITION (type: bigint), PARTITION__PROJECTION (type: string)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ STAGE PLANS:
Map Operator Tree:
TableScan
alias: tbl_ice
filterExpr: ((a = 22) or (b) IN ('four', 'one') or ((b) IN ('four', 'one') or (a = 22)) is null or (((b < 'four') or ((b > 'four') and (b < 'one')) or (b > 'one')) and (a <> 22))) (type: boolean)
filterExpr: ((a = 22) or (b) IN ('four', 'one') or ((b) IN ('four', 'one') or (a = 22)) is null or ((b <> 'four') and (b <> 'one') and (a <> 22))) (type: boolean)
Statistics: Num rows: 7 Data size: 672 Basic stats: COMPLETE Column stats: COMPLETE
Filter Operator
predicate: ((a = 22) or (b) IN ('four', 'one')) (type: boolean)
Expand All @@ -93,7 +93,7 @@ STAGE PLANS:
Map-reduce partition columns: FILE__PATH (type: string)
Statistics: Num rows: 4 Data size: 368 Basic stats: COMPLETE Column stats: COMPLETE
Filter Operator
predicate: ((((b) IN ('four', 'one') or (a = 22)) is null or (((b < 'four') or ((b > 'four') and (b < 'one')) or (b > 'one')) and (a <> 22))) and FILE__PATH is not null) (type: boolean)
predicate: ((((b) IN ('four', 'one') or (a = 22)) is null or ((b <> 'four') and (b <> 'one') and (a <> 22))) and FILE__PATH is not null) (type: boolean)
Statistics: Num rows: 7 Data size: 672 Basic stats: COMPLETE Column stats: COMPLETE
Select Operator
expressions: a (type: int), b (type: string), c (type: int), PARTITION__SPEC__ID (type: int), PARTITION__HASH (type: bigint), FILE__PATH (type: string), ROW__POSITION (type: bigint), PARTITION__PROJECTION (type: string)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -72,30 +72,44 @@ public SearchTransformer(RexBuilder rexBuilder, RexCall search, final RexUnknown
this.unknownContext = unknownContext;
}

/**
* Transforms the SEARCH expression into an equivalent RexNode expression.
* Warning: when called from a shuttle, callers of this method should consider flattening AND/OR expressions
* afterward, to get the same result as applying {@link SearchTransformer.Shuttle}.
*/
public RexNode transform() {
PerfLogger perfLogger = SessionState.getPerfLogger();
perfLogger.perfLogBegin(this.getClass().getName(), PerfLogger.SEARCH_TRANSFORMER);

RangeConverter<C> consumer = new RangeConverter<>(rexBuilder, operandType, ref);
RangeSets.forEach(sarg.rangeSet, consumer);

List<RexNode> orList = new ArrayList<>();
if (sarg.nullAs == RexUnknownAs.TRUE && unknownContext != RexUnknownAs.TRUE) {
orList.add(rexBuilder.makeCall(SqlStdOperatorTable.IS_NULL, ref));
}
switch (consumer.inLiterals.size()) {
case 0:
break;
case 1:
orList.add(rexBuilder.makeCall(SqlStdOperatorTable.EQUALS, ref, consumer.inLiterals.get(0)));
break;
default:
List<RexNode> operands = new ArrayList<>(consumer.inLiterals.size() + 1);
operands.add(ref);
operands.addAll(consumer.inLiterals);
orList.add(rexBuilder.makeCall(HiveIn.INSTANCE, operands));

if (sarg.isComplementedPoints()) {
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When the sarg is not strictly a complement then I guess we cannot handle it. Examples:

... WHERE NOT ( x = 10 OR x = 20 OR (x > 30 AND x < 50))
... WHERE (x <> 10 AND x <> 20 AND (x <= 30 OR x >= 50))

One way to cover those would be to apply the existing RangeConverter on sarg.rangeSet.complement() and invert the EQUALS and HiveIn operators in the switch. The challenge here is when to use the sarg.rangeSet and when its complement. A naive choice could be to base the decision on sarg.rangeSet.asRanges().size() or something along these lines.

Anyways, the change here is fine as it is so we can log a another ticket and follow-up there if its worth to do it or not.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In fact we could negate the entire sarg (not only the rangeset) if that leads to a simpler more efficient expansion. This would evolve negating the operators (IS NULL, =, IN, etc.), and changing the orList into an andList.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What that would look like? Try systematically the "normal" sarg expansion and the negated one, and pick the "simpler" one?
I guess we could try... but could we consider that a follow-up, separate task, out of the scope of the current PR?

// Generate 'ref <> value1 AND ... AND ref <> valueN'
List<RexNode> list = sarg.rangeSet.complement().asRanges().stream().map(
range -> rexBuilder.makeCall(SqlStdOperatorTable.NOT_EQUALS, ref,
rexBuilder.makeLiteral(range.lowerEndpoint(), operandType, true, true))).toList();
orList.add(RexUtil.composeConjunction(rexBuilder, list));
} else {
RangeConverter<C> consumer = new RangeConverter<>(rexBuilder, operandType, ref);
RangeSets.forEach(sarg.rangeSet, consumer);

switch (consumer.inLiterals.size()) {
case 0:
break;
case 1:
orList.add(rexBuilder.makeCall(SqlStdOperatorTable.EQUALS, ref, consumer.inLiterals.get(0)));
break;
default:
List<RexNode> operands = new ArrayList<>(consumer.inLiterals.size() + 1);
operands.add(ref);
operands.addAll(consumer.inLiterals);
orList.add(rexBuilder.makeCall(HiveIn.INSTANCE, operands));
}
orList.addAll(consumer.nodes);
}
orList.addAll(consumer.nodes);
RexNode x = RexUtil.composeDisjunction(rexBuilder, orList);

if (sarg.nullAs == RexUnknownAs.FALSE && unknownContext != RexUnknownAs.FALSE) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -628,14 +628,23 @@ private RexNode makeLiteral(C value) {
private double compute() {
final List<RexNode> inLiterals = new ArrayList<>();
final List<Double> rangeSelectivities = new ArrayList<>();
for (Range<C> range : sarg.rangeSet.asRanges()) {
if (!range.hasLowerBound() && !range.hasUpperBound()) {
return 1.0; // "all" range
final List<Double> searchSelectivities = new ArrayList<>();

if (sarg.isComplementedPoints()) {
// Generate 'ref <> value1 AND ... AND ref <> valueN'
List<RexNode> notEq = sarg.rangeSet.complement().asRanges().stream()
.map(range -> rexBuilder.makeCall(SqlStdOperatorTable.NOT_EQUALS, ref, makeLiteral(range.lowerEndpoint())))
.toList();
searchSelectivities.add(RexUtil.composeConjunction(rexBuilder, notEq).accept(FilterSelectivityEstimator.this));
} else {
Comment on lines +633 to +639
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this form lead to better selectivity estimates than what we had before? If not then we don't necessarily need to change it. The purpose of this code is to compute a good selectivity for the SEARCH predicate not to transform it to something else.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Uhmmm... you're right, the original version (which uses histograms) would have a more accurate estimation than the proposed one with NOT_EQUALS (which is estimated simply with ndv-1/ndv , which can be quite off). However, it might be possible that histograms are not available in general (so the original version would default to a hadcoded selectivity), whereas the sub-optimal optimization with NOT_EQUALS uses a more generally available ndv estimated value (and this estimation, although not perfect, would be better than the hardcoded value of the original version).

Having said that, I guess we should try to aim for the better solution, and trust that statistics would be available, so I lean towards reverting the change in this file.

for (Range<C> range : sarg.rangeSet.asRanges()) {
if (!range.hasLowerBound() && !range.hasUpperBound()) {
return 1.0; // "all" range
}
processRangeSelectivity(range, rangeSelectivities, inLiterals);
}
processRangeSelectivity(range, rangeSelectivities, inLiterals);
}

final List<Double> searchSelectivities = new ArrayList<>();
if (!rangeSelectivities.isEmpty() && rangeSelectivities.stream().noneMatch(Objects::isNull)) {
// Aggregate all ranges selectivity, respecting the max value of 1
double total = Math.min(1.0, rangeSelectivities.stream().mapToDouble(Double::doubleValue).sum());
Expand All @@ -655,7 +664,8 @@ private double compute() {
List<RexNode> operands = new ArrayList<>(inLiterals.size() + 1);
operands.add(ref);
operands.addAll(inLiterals);
searchSelectivities.add(rexBuilder.makeCall(HiveIn.INSTANCE, operands).accept(FilterSelectivityEstimator.this));
searchSelectivities.add(
rexBuilder.makeCall(HiveIn.INSTANCE, operands).accept(FilterSelectivityEstimator.this));
}
}

Expand All @@ -664,7 +674,9 @@ private double compute() {
rexBuilder.makeCall(SqlStdOperatorTable.IS_NULL, ref).accept(FilterSelectivityEstimator.this));
}

return searchSelectivities.size() == 1 ? searchSelectivities.get(0) : computeDisjunctionSelectivity(searchSelectivities);
return searchSelectivities.size() == 1
? searchSelectivities.get(0)
: computeDisjunctionSelectivity(searchSelectivities);
}

private void processRangeSelectivity(Range<C> range, List<Double> rangeSelectivities, List<RexNode> inLiterals) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@
import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDescUtils;
import org.apache.hadoop.hive.ql.plan.ExprNodeFieldDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
Expand Down Expand Up @@ -214,6 +215,12 @@ public ExprNodeDesc visitCall(RexCall call) {
&& SqlTypeUtil.equalSansNullability(dTFactory, call.getType(),
call.operands.get(0).getType())) {
return args.get(0);
} else if (call.isA(SqlKind.AND)) {
// Make sure AND is flattened (we may have nested ANDs due to SearchTransformer conversion above)
return ExprNodeDescUtils.and(args);
} else if (call.isA(SqlKind.OR)) {
// Make sure OR is flattened (we may have nested ORs due to SearchTransformer conversion above)
return ExprNodeDescUtils.or(args);
} else {
GenericUDF hiveUdf = SqlFunctionConverter.getHiveUDF(call.getOperator(), call.getType(),
args.size());
Expand Down
Loading
Loading