diff --git a/mysql-test/main/delete_use_source_engines.result b/mysql-test/main/delete_use_source_engines.result
index b1ad9c009d924..7b8097bbd8d92 100644
--- a/mysql-test/main/delete_use_source_engines.result
+++ b/mysql-test/main/delete_use_source_engines.result
@@ -731,15 +731,15 @@ create table tmp as select * from t1 where c1 in (select a.c2 from t1 a where a.
 explain select * from t1 where c1 in (select a.c2 from t1 a where a.c2 = t1.c3) limit 1;
 id select_type table type possible_keys key key_len ref rows Extra
 1 PRIMARY t1 ALL NULL NULL NULL NULL 32 Using where
-1 PRIMARY a ref t1_c2 t1_c2 5 test.t1.c1 5 Using index; FirstMatch(t1)
+1 PRIMARY a ref t1_c2 t1_c2 5 test.t1.c1 4 Using index; FirstMatch(t1)
 explain delete from t1 where c1 in (select a.c2 from t1 a where a.c2 = t1.c3) limit 1;
 id select_type table type possible_keys key key_len ref rows Extra
 1 PRIMARY t1 ALL NULL NULL NULL NULL 32 Using where
-1 PRIMARY a ref t1_c2 t1_c2 5 test.t1.c1 5 Using index; FirstMatch(t1)
+1 PRIMARY a ref t1_c2 t1_c2 5 test.t1.c1 4 Using index; FirstMatch(t1)
 analyze delete from t1 where c1 in (select a.c2 from t1 a where a.c2 = t1.c3) limit 1;
 id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
 1 PRIMARY t1 ALL NULL NULL NULL NULL 32 32.00 100.00 12.50 Using where
-1 PRIMARY a ref t1_c2 t1_c2 5 test.t1.c1 5 0.25 18.75 100.00 Using index; FirstMatch(t1)
+1 PRIMARY a ref t1_c2 t1_c2 5 test.t1.c1 4 0.25 25.00 100.00 Using index; FirstMatch(t1)
 select * from t1;
 c1 c2 c3
 1 2 2
@@ -2709,15 +2709,15 @@ create table tmp as select * from t1 where c1 in (select a.c2 from t1 a where a.
 explain select * from t1 where c1 in (select a.c2 from t1 a where a.c2 = t1.c3) limit 1;
 id select_type table type possible_keys key key_len ref rows Extra
 1 PRIMARY t1 ALL NULL NULL NULL NULL 32 Using where
-1 PRIMARY a ref t1_c2 t1_c2 5 test.t1.c1 5 Using index; FirstMatch(t1)
+1 PRIMARY a ref t1_c2 t1_c2 5 test.t1.c1 4 Using index; FirstMatch(t1)
 explain delete from t1 where c1 in (select a.c2 from t1 a where a.c2 = t1.c3) limit 1;
 id select_type table type possible_keys key key_len ref rows Extra
 1 PRIMARY t1 ALL NULL NULL NULL NULL 32 Using where
-1 PRIMARY a ref t1_c2 t1_c2 5 test.t1.c1 5 Using index; FirstMatch(t1)
+1 PRIMARY a ref t1_c2 t1_c2 5 test.t1.c1 4 Using index; FirstMatch(t1)
 analyze delete from t1 where c1 in (select a.c2 from t1 a where a.c2 = t1.c3) limit 1;
 id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
 1 PRIMARY t1 ALL NULL NULL NULL NULL 32 27.00 100.00 14.81 Using where
-1 PRIMARY a ref t1_c2 t1_c2 5 test.t1.c1 5 0.25 18.75 100.00 Using index; FirstMatch(t1)
+1 PRIMARY a ref t1_c2 t1_c2 5 test.t1.c1 4 0.25 25.00 100.00 Using index; FirstMatch(t1)
 select * from t1;
 c1 c2 c3
 1 2 2
@@ -2762,7 +2762,7 @@ explain select * from t1 where c1 in (select a.c2 from t1 a where a.c3 = t1.c3)
 order by c3 desc limit 1;
 id select_type table type possible_keys key key_len ref rows Extra
 1 PRIMARY t1 ALL NULL NULL NULL NULL 32 Using where; Using filesort
-1 PRIMARY a ref t1_c2 t1_c2 5 test.t1.c1 5 Using where; FirstMatch(t1)
+1 PRIMARY a ref t1_c2 t1_c2 5 test.t1.c1 4 Using where; FirstMatch(t1)
 explain delete from t1 where c1 in (select a.c2 from t1 a where a.c3 = t1.c3)
 order by c3 desc limit 1;
 id select_type table type possible_keys key key_len ref rows Extra
@@ -5114,15 +5114,15 @@ create table tmp as select * from t1 where c1 in (select a.c2 from t1 a where a.
 explain select * from t1 where c1 in (select a.c2 from t1 a where a.c2 = t1.c3) limit 1;
 id select_type table type possible_keys key key_len ref rows Extra
 1 PRIMARY t1 ALL NULL NULL NULL NULL 32 Using where
-1 PRIMARY a ref t1_c2 t1_c2 5 test.t1.c1 5 Using index; FirstMatch(t1)
+1 PRIMARY a ref t1_c2 t1_c2 5 test.t1.c1 4 Using index; FirstMatch(t1)
 explain delete from t1 where c1 in (select a.c2 from t1 a where a.c2 = t1.c3) limit 1;
 id select_type table type possible_keys key key_len ref rows Extra
 1 PRIMARY t1 ALL NULL NULL NULL NULL 32 Using where
-1 PRIMARY a ref t1_c2 t1_c2 5 test.t1.c1 5 Using index; FirstMatch(t1)
+1 PRIMARY a ref t1_c2 t1_c2 5 test.t1.c1 4 Using index; FirstMatch(t1)
 analyze delete from t1 where c1 in (select a.c2 from t1 a where a.c2 = t1.c3) limit 1;
 id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
 1 PRIMARY t1 ALL NULL NULL NULL NULL 32 29.00 100.00 13.79 Using where
-1 PRIMARY a ref t1_c2 t1_c2 5 test.t1.c1 5 0.25 18.75 100.00 Using index; FirstMatch(t1)
+1 PRIMARY a ref t1_c2 t1_c2 5 test.t1.c1 4 0.25 25.00 100.00 Using index; FirstMatch(t1)
 select * from t1;
 c1 c2 c3
 1 2 2
diff --git a/mysql-test/main/derived_cond_pushdown.result b/mysql-test/main/derived_cond_pushdown.result
index 1fa1eea5dd307..0593d8a10d1f6 100644
--- a/mysql-test/main/derived_cond_pushdown.result
+++ b/mysql-test/main/derived_cond_pushdown.result
@@ -20147,7 +20147,7 @@ from t2, t3, (select c, max(b) max, min(b) min from t4 group by c) t
 where t2.b < 40 and t2.a=t3.a and t3.c=t.c;
 id select_type table type possible_keys key key_len ref rows filtered Extra
 1 PRIMARY t2 ALL NULL NULL NULL NULL 90 60.00 Using where
-1 PRIMARY t3 ref idx_a idx_a 5 test.t2.a 1 100.00 Using where
+1 PRIMARY t3 ref idx_a idx_a 5 test.t2.a 0 100.00 Using where
 1 PRIMARY <derived2> ref key0 key0 128 test.t3.c 5 100.00
 2 DERIVED t4 ALL idx_c NULL NULL NULL 160 100.00 Using temporary; Using filesort
 Warnings:
@@ -20182,7 +20182,7 @@ EXPLAIN
       "used_key_parts": ["a"],
       "ref": ["test.t2.a"],
       "loops": 53.99999991,
-      "rows": 1,
+      "rows": 0,
       "cost": "COST_REPLACED",
       "filtered": 100,
       "attached_condition": "t3.c is not null"
@@ -20197,7 +20197,7 @@ EXPLAIN
       "key_length": "128",
       "used_key_parts": ["c"],
       "ref": ["test.t3.c"],
-      "loops": 80.99999987,
+      "loops": 52.89695991,
       "rows": 5,
       "cost": "COST_REPLACED",
       "filtered": 100,
@@ -20690,7 +20690,7 @@ from t2, t3, (select c, b, sum(b) over (partition by c) from t4 ) t
 where t2.b < 40 and t2.a=t3.a and t3.c=t.c;
 id select_type table type possible_keys key key_len ref rows filtered Extra
 1 PRIMARY t2 ALL NULL NULL NULL NULL 90 60.00 Using where
-1 PRIMARY t3 ref idx_a idx_a 5 test.t2.a 1 100.00 Using where
+1 PRIMARY t3 ref idx_a idx_a 5 test.t2.a 0 100.00 Using where
 1 PRIMARY <derived2> ref key0 key0 128 test.t3.c 10 100.00
 2 DERIVED t4 ALL idx_c NULL NULL NULL 160 100.00 Using temporary
 Warnings:
@@ -20725,7 +20725,7 @@ EXPLAIN
       "used_key_parts": ["a"],
       "ref": ["test.t2.a"],
       "loops": 53.99999991,
-      "rows": 1,
+      "rows": 0,
       "cost": "COST_REPLACED",
       "filtered": 100,
       "attached_condition": "t3.c is not null"
@@ -20740,7 +20740,7 @@ EXPLAIN
       "key_length": "128",
       "used_key_parts": ["c"],
       "ref": ["test.t3.c"],
-      "loops": 80.99999987,
+      "loops": 52.89695991,
       "rows": 10,
       "cost": "COST_REPLACED",
       "filtered": 100,
diff --git a/mysql-test/main/join_cache.result b/mysql-test/main/join_cache.result
index aa12ac8942d39..ddd82d5ce3933 100644
--- a/mysql-test/main/join_cache.result
+++ b/mysql-test/main/join_cache.result
@@ -6365,7 +6365,7 @@ LEFT JOIN t3 ON t2.d = t3.c LEFT
 JOIN t4 ON t3.c=1;
 id select_type table type possible_keys key key_len ref rows Extra
 1 SIMPLE t1 index NULL PRIMARY 4 NULL 10000 Using index
-1 SIMPLE t2 ref b b 4 test.t1.b 1
+1 SIMPLE t2 ref b b 4 test.t1.b 0
 1 SIMPLE t3 eq_ref PRIMARY PRIMARY 4 test.t2.d 1 Using where; Using index
 1 SIMPLE t4 index NULL PRIMARY 4 NULL 3000 Using where; Using index
 SELECT COUNT(*)
@@ -6383,7 +6383,7 @@ LEFT JOIN t3 ON t2.d = t3.c LEFT
 JOIN t4 ON t3.c=1;
 id select_type table type possible_keys key key_len ref rows Extra
 1 SIMPLE t1 index NULL PRIMARY 4 NULL 10000 Using index
-1 SIMPLE t2 ref b b 4 test.t1.b 1
+1 SIMPLE t2 ref b b 4 test.t1.b 0
 1 SIMPLE t3 eq_ref PRIMARY PRIMARY 4 test.t2.d 1 Using where; Using index
 1 SIMPLE t4 index NULL PRIMARY 4 NULL 3000 Using where; Using index; Using join buffer (flat, BNL join)
 SELECT COUNT(*)
diff --git a/mysql-test/main/match_probability.result b/mysql-test/main/match_probability.result
new file mode 100644
index 0000000000000..f2d2bea1286cd
--- /dev/null
+++ b/mysql-test/main/match_probability.result
@@ -0,0 +1,148 @@
+# Setup: Two tables with skewed foreign key distribution
+CREATE TABLE t_ndv100 (a INT);
+CREATE TABLE t_ndv3 (a INT, b VARCHAR(50), KEY idx_a (a));
+# All values in t_ndv100.a are different (100 distinct values)
+INSERT INTO t_ndv100 (a) SELECT seq FROM seq_1_to_100;
+# There are only 3 distinct values in t_ndv3.a
+INSERT INTO t_ndv3 (a, b) SELECT seq/100, 'def' FROM seq_1_to_300;
+set optimizer_trace=1;
+# Inefficient plan until EITS is collected (full scan of t_ndv3)
+EXPLAIN SELECT t_ndv100.a, t_ndv3.b
+FROM t_ndv100
+STRAIGHT_JOIN t_ndv3 ON t_ndv100.a = t_ndv3.a;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t_ndv100 ALL NULL NULL NULL NULL 100
+1 SIMPLE t_ndv3 ALL idx_a NULL NULL NULL 300 Using where; Using join buffer (flat, BNL join)
+# Collect statistics only for the driving table (t_ndv100)
+ANALYZE TABLE t_ndv100 PERSISTENT FOR ALL;
+Table Op Msg_type Msg_text
+test.t_ndv100 analyze status Engine-independent statistics collected
+test.t_ndv100 analyze status OK
+# It is not enough until there are statistics for the inner table (t_ndv3)
+EXPLAIN SELECT t_ndv100.a, t_ndv3.b
+FROM t_ndv100
+STRAIGHT_JOIN t_ndv3 ON t_ndv100.a = t_ndv3.a;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t_ndv100 ALL NULL NULL NULL NULL 100
+1 SIMPLE t_ndv3 ALL idx_a NULL NULL NULL 300 Using where; Using join buffer (flat, BNL join)
+ANALYZE TABLE t_ndv3 PERSISTENT FOR ALL;
+Table Op Msg_type Msg_text
+test.t_ndv3 analyze status Engine-independent statistics collected
+test.t_ndv3 analyze status Table is already up to date
+# After EITS is collected for both tables, the plan is efficient
"rows" in the output are close to actual "r_rows" +ANALYZE SELECT t_ndv100.a, t_ndv3.b +FROM t_ndv100 +STRAIGHT_JOIN t_ndv3 ON t_ndv100.a = t_ndv3.a; +id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra +1 SIMPLE t_ndv100 ALL NULL NULL NULL NULL 100 100.00 100.00 100.00 Using where +1 SIMPLE t_ndv3 ref idx_a idx_a 5 test.t_ndv100.a 3 2.51 100.00 100.00 +# Optimization must be reflected in the trace +SELECT json_detailed(json_extract(trace, '$**.match_probability', +'$**.rows_before_adjustment')) +FROM information_schema.optimizer_trace; +json_detailed(json_extract(trace, '$**.match_probability', +'$**.rows_before_adjustment')) +[ + 0.04, + 75 +] +# Match probability is not applicable because the left part of the +# join condition is not a field but an expression (t_ndv100.a + 10). See the +# difference between "rows" and "r_rows" +ANALYZE SELECT t_ndv100.a, t_ndv3.b +FROM t_ndv100 +STRAIGHT_JOIN t_ndv3 ON t_ndv100.a + 10 = t_ndv3.a; +id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra +1 SIMPLE t_ndv100 ALL NULL NULL NULL NULL 100 100.00 100.00 100.00 +1 SIMPLE t_ndv3 ALL idx_a NULL NULL NULL 300 300.00 25.00 0.00 Using where; Using join buffer (flat, BNL join) +# Must be no records in the trace +SELECT json_detailed(json_extract(trace, '$**.match_probability', +'$**.rows_before_adjustment')) +FROM information_schema.optimizer_trace; +json_detailed(json_extract(trace, '$**.match_probability', +'$**.rows_before_adjustment')) +NULL +# If all values are NULL, match probability is not applicable +CREATE TABLE t_nulls (a INT, b VARCHAR(50), KEY idx_a (a)); +INSERT INTO t_nulls (a, b) SELECT NULL, 'def' FROM seq_1_to_300; +ANALYZE TABLE t_nulls PERSISTENT FOR ALL; +Table Op Msg_type Msg_text +test.t_nulls analyze status Engine-independent statistics collected +test.t_nulls analyze status Table is already up to date +EXPLAIN SELECT t_ndv100.a, t_nulls.b +FROM t_ndv100 +STRAIGHT_JOIN t_nulls ON t_ndv100.a = t_nulls.a; +id select_type table type possible_keys key key_len ref rows Extra +1 SIMPLE t_ndv100 ALL NULL NULL NULL NULL 100 Using where +1 SIMPLE t_nulls ref idx_a idx_a 5 test.t_ndv100.a 1 +SELECT json_detailed(json_extract(trace, '$**.match_probability', +'$**.rows_before_adjustment')) +FROM information_schema.optimizer_trace; +json_detailed(json_extract(trace, '$**.match_probability', +'$**.rows_before_adjustment')) +NULL +#------------------------------ +# Test multi-part index +#------------------------------ +CREATE TABLE t_ndv30 (a INT, b INT, c VARCHAR(50), KEY idx_ab (a, b)); +INSERT INTO t_ndv30 (a, b, c) SELECT seq/10, seq/10, 'def' FROM seq_1_to_300; +ANALYZE TABLE t_ndv30 PERSISTENT FOR ALL; +Table Op Msg_type Msg_text +test.t_ndv30 analyze status Engine-independent statistics collected +test.t_ndv30 analyze status Table is already up to date +# If more than one key part is used, match probability is not applicable. 
+# See the mismatch between "rows" and "r_rows" in ANALYZE output
+ANALYZE SELECT t_ndv100.a, t_ndv30.b
+FROM t_ndv100
+STRAIGHT_JOIN t_ndv30 ON t_ndv100.a = t_ndv30.a
+AND t_ndv100.a = t_ndv30.b;
+id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
+1 SIMPLE t_ndv100 ALL NULL NULL NULL NULL 100 100.00 100.00 100.00 Using where
+1 SIMPLE t_ndv30 ref idx_ab idx_ab 10 test.t_ndv100.a,test.t_ndv100.a 9 2.96 100.00 100.00 Using index
+# There must be no records in the trace
+SELECT json_detailed(json_extract(trace, '$**.match_probability',
+'$**.rows_before_adjustment'))
+FROM information_schema.optimizer_trace;
+json_detailed(json_extract(trace, '$**.match_probability',
+'$**.rows_before_adjustment'))
+NULL
+# Only the first key part of `idx_ab` is used, so match probability is applicable.
+# "rows" is now much closer to actual "r_rows".
+ANALYZE SELECT t_ndv100.a, t_ndv30.b
+FROM t_ndv100
+STRAIGHT_JOIN t_ndv30 ON t_ndv100.a = t_ndv30.a;
+id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
+1 SIMPLE t_ndv100 ALL NULL NULL NULL NULL 100 100.00 100.00 100.00 Using where
+1 SIMPLE t_ndv30 ref idx_ab idx_ab 5 test.t_ndv100.a 3 2.96 100.00 100.00 Using index
+# This must be reflected in the trace
+SELECT json_detailed(json_extract(trace, '$**.match_probability',
+'$**.rows_before_adjustment'))
+FROM information_schema.optimizer_trace;
+json_detailed(json_extract(trace, '$**.match_probability',
+'$**.rows_before_adjustment'))
+[
+    0.31000062,
+    9.6774
+]
+# Three tables with equality propagation.
+# min_driving_ndv = min(NDV(t_ndv30.a), NDV(t_ndv100.a)) = min(30, 100) = 30
+ANALYZE SELECT t_ndv100.a, t_ndv3.b
+FROM t_ndv30
+STRAIGHT_JOIN t_ndv100 ON t_ndv30.a = t_ndv100.a
+STRAIGHT_JOIN t_ndv3 ON t_ndv100.a = t_ndv3.a;
+id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
+1 SIMPLE t_ndv30 index idx_ab idx_ab 10 NULL 300 300.00 100.00 100.00 Using where; Using index
+1 SIMPLE t_ndv100 ALL NULL NULL NULL NULL 100 100.00 100.00 0.99 Using where; Using join buffer (flat, BNL join)
+1 SIMPLE t_ndv3 ref idx_a idx_a 5 test.t_ndv30.a 9 8.48 100.00 100.00
+# This must be reflected in the trace (match_prob ~= 0.1)
+SELECT json_detailed(json_extract(trace, '$**.match_probability',
+'$**.rows_before_adjustment'))
+FROM information_schema.optimizer_trace;
+json_detailed(json_extract(trace, '$**.match_probability',
+'$**.rows_before_adjustment'))
+[
+    0.129032,
+    75
+]
+DROP TABLE t_ndv100, t_ndv3, t_ndv30, t_nulls;
diff --git a/mysql-test/main/match_probability.test b/mysql-test/main/match_probability.test
new file mode 100644
index 0000000000000..7890a5dc99eaf
--- /dev/null
+++ b/mysql-test/main/match_probability.test
@@ -0,0 +1,113 @@
+--source include/have_sequence.inc
+
+--echo # Setup: Two tables with skewed foreign key distribution
+CREATE TABLE t_ndv100 (a INT);
+
+CREATE TABLE t_ndv3 (a INT, b VARCHAR(50), KEY idx_a (a));
+
+--echo # All values in t_ndv100.a are different (100 distinct values)
+INSERT INTO t_ndv100 (a) SELECT seq FROM seq_1_to_100;
+
+--echo # There are only 3 distinct values in t_ndv3.a
+INSERT INTO t_ndv3 (a, b) SELECT seq/100, 'def' FROM seq_1_to_300;
+
+set optimizer_trace=1;
+
+--echo # Inefficient plan until EITS is collected (full scan of t_ndv3)
+EXPLAIN SELECT t_ndv100.a, t_ndv3.b
+FROM t_ndv100
+STRAIGHT_JOIN t_ndv3 ON t_ndv100.a = t_ndv3.a;
+
+--echo # Collect statistics only for the driving table (t_ndv100)
+ANALYZE TABLE t_ndv100 PERSISTENT FOR ALL;
+
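+# EITS ("engine-independent table statistics") stores per-column
+# avg_frequency; the optimizer estimates NDV as rows / avg_frequency.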
+--echo # It is not enough until there are statistics for the inner table (t_ndv3)
+EXPLAIN SELECT t_ndv100.a, t_ndv3.b
+FROM t_ndv100
+STRAIGHT_JOIN t_ndv3 ON t_ndv100.a = t_ndv3.a;
+
+ANALYZE TABLE t_ndv3 PERSISTENT FOR ALL;
+
+--echo # After EITS is collected for both tables, the plan is efficient
+--echo # (index scan of t_ndv3). "rows" in the output is close to the actual "r_rows"
+ANALYZE SELECT t_ndv100.a, t_ndv3.b
+FROM t_ndv100
+STRAIGHT_JOIN t_ndv3 ON t_ndv100.a = t_ndv3.a;
+
+--echo # Optimization must be reflected in the trace
+SELECT json_detailed(json_extract(trace, '$**.match_probability',
+                     '$**.rows_before_adjustment'))
+FROM information_schema.optimizer_trace;
+
+--echo # Match probability is not applicable because the left part of the
+--echo # join condition is not a field but an expression (t_ndv100.a + 10). See the
+--echo # difference between "rows" and "r_rows"
+ANALYZE SELECT t_ndv100.a, t_ndv3.b
+FROM t_ndv100
+STRAIGHT_JOIN t_ndv3 ON t_ndv100.a + 10 = t_ndv3.a;
+
+--echo # There must be no records in the trace
+SELECT json_detailed(json_extract(trace, '$**.match_probability',
+                     '$**.rows_before_adjustment'))
+FROM information_schema.optimizer_trace;
+
+--echo # If all values are NULL, match probability is not applicable
+CREATE TABLE t_nulls (a INT, b VARCHAR(50), KEY idx_a (a));
+
+INSERT INTO t_nulls (a, b) SELECT NULL, 'def' FROM seq_1_to_300;
+
+ANALYZE TABLE t_nulls PERSISTENT FOR ALL;
+
+EXPLAIN SELECT t_ndv100.a, t_nulls.b
+FROM t_ndv100
+STRAIGHT_JOIN t_nulls ON t_ndv100.a = t_nulls.a;
+
+SELECT json_detailed(json_extract(trace, '$**.match_probability',
+                     '$**.rows_before_adjustment'))
+FROM information_schema.optimizer_trace;
+
+--echo #------------------------------
+--echo # Test multi-part index
+--echo #------------------------------
+CREATE TABLE t_ndv30 (a INT, b INT, c VARCHAR(50), KEY idx_ab (a, b));
+
+INSERT INTO t_ndv30 (a, b, c) SELECT seq/10, seq/10, 'def' FROM seq_1_to_300;
+
+ANALYZE TABLE t_ndv30 PERSISTENT FOR ALL;
+
+--echo # If more than one key part is used, match probability is not applicable.
+--echo # See the mismatch between "rows" and "r_rows" in ANALYZE output
+ANALYZE SELECT t_ndv100.a, t_ndv30.b
+FROM t_ndv100
+STRAIGHT_JOIN t_ndv30 ON t_ndv100.a = t_ndv30.a
+    AND t_ndv100.a = t_ndv30.b;
+
+--echo # There must be no records in the trace
+SELECT json_detailed(json_extract(trace, '$**.match_probability',
+                     '$**.rows_before_adjustment'))
+FROM information_schema.optimizer_trace;
+
+--echo # Only the first key part of `idx_ab` is used, so match probability is applicable.
+--echo # "rows" is now much closer to actual "r_rows".
+ANALYZE SELECT t_ndv100.a, t_ndv30.b
+FROM t_ndv100
+STRAIGHT_JOIN t_ndv30 ON t_ndv100.a = t_ndv30.a;
+
+--echo # This must be reflected in the trace
+SELECT json_detailed(json_extract(trace, '$**.match_probability',
+                     '$**.rows_before_adjustment'))
+FROM information_schema.optimizer_trace;
+
+--echo # Three tables with equality propagation.
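+# With t_ndv30.a = t_ndv100.a = t_ndv3.a, either preceding column can drive
+# the lookup into t_ndv3, so the smallest driving NDV is used (tracked as
+# min_driving_ndv in best_access_path).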
+--echo # min_driving_ndv = min(NDV(t_ndv30.a), NDV(t_ndv100.a)) = min(30, 100) = 30
+ANALYZE SELECT t_ndv100.a, t_ndv3.b
+FROM t_ndv30
+STRAIGHT_JOIN t_ndv100 ON t_ndv30.a = t_ndv100.a
+STRAIGHT_JOIN t_ndv3 ON t_ndv100.a = t_ndv3.a;
+
+--echo # This must be reflected in the trace (match_prob ~= 0.1)
+SELECT json_detailed(json_extract(trace, '$**.match_probability',
+                     '$**.rows_before_adjustment'))
+FROM information_schema.optimizer_trace;
+
+DROP TABLE t_ndv100, t_ndv3, t_ndv30, t_nulls;
\ No newline at end of file
diff --git a/mysql-test/main/null_key.result b/mysql-test/main/null_key.result
index 0b1446fd72e1c..9bc20427487b1 100644
--- a/mysql-test/main/null_key.result
+++ b/mysql-test/main/null_key.result
@@ -421,7 +421,7 @@ EXPLAIN SELECT SQL_CALC_FOUND_ROWS * FROM t1 LEFT JOIN t2 ON t1.a=t2.a
 LEFT JOIN t3 ON t2.b=t3.b;
 id select_type table type possible_keys key key_len ref rows Extra
 1 SIMPLE t1 ALL NULL NULL NULL NULL 4
-1 SIMPLE t2 ref idx idx 5 test.t1.a 1 Using where
+1 SIMPLE t2 ref idx idx 5 test.t1.a 0 Using where
 1 SIMPLE t3 ref idx idx 5 test.t2.b 1 Using where; Using index
 FLUSH STATUS ;
 SELECT SQL_CALC_FOUND_ROWS * FROM t1 LEFT JOIN t2 ON t1.a=t2.a
diff --git a/mysql-test/main/selectivity.result b/mysql-test/main/selectivity.result
index ef2d2e85b8681..b8f5da7537f1a 100644
--- a/mysql-test/main/selectivity.result
+++ b/mysql-test/main/selectivity.result
@@ -325,13 +325,13 @@ and o_orderkey = l_orderkey
 group by c_name, c_custkey, o_orderkey, o_orderdate, o_totalprice
 order by o_totalprice desc, o_orderdate;
 id select_type table type possible_keys key key_len ref rows filtered Extra
-1 PRIMARY orders ALL PRIMARY,i_o_custkey NULL NULL NULL 1500 100.00 Using where; Using temporary; Using filesort
-1 PRIMARY customer eq_ref PRIMARY PRIMARY 4 dbt3_s001.orders.o_custkey 1 100.00
+1 PRIMARY customer ALL PRIMARY NULL NULL NULL 150 100.00 Using temporary; Using filesort
+1 PRIMARY orders ref PRIMARY,i_o_custkey i_o_custkey 5 dbt3_s001.customer.c_custkey 10 100.00
 1 PRIMARY <subquery2> eq_ref distinct_key distinct_key 4 dbt3_s001.orders.o_orderkey 1 100.00
 1 PRIMARY lineitem ref PRIMARY,i_l_orderkey,i_l_orderkey_quantity i_l_orderkey_quantity 4 dbt3_s001.orders.o_orderkey 4 100.00 Using index
 2 MATERIALIZED lineitem index NULL i_l_orderkey_quantity 13 NULL 6005 100.00 Using index
 Warnings:
-Note 1003 /* select#1 */ select `dbt3_s001`.`customer`.`c_name` AS `c_name`,`dbt3_s001`.`customer`.`c_custkey` AS `c_custkey`,`dbt3_s001`.`orders`.`o_orderkey` AS `o_orderkey`,`dbt3_s001`.`orders`.`o_orderDATE` AS `o_orderdate`,`dbt3_s001`.`orders`.`o_totalprice` AS `o_totalprice`,sum(`dbt3_s001`.`lineitem`.`l_quantity`) AS `sum(l_quantity)` from (/* select#2 */ select `dbt3_s001`.`lineitem`.`l_orderkey` from `dbt3_s001`.`lineitem` group by `dbt3_s001`.`lineitem`.`l_orderkey` having sum(`dbt3_s001`.`lineitem`.`l_quantity`) > 250) join `dbt3_s001`.`customer` join `dbt3_s001`.`orders` join `dbt3_s001`.`lineitem` where `dbt3_s001`.`customer`.`c_custkey` = `dbt3_s001`.`orders`.`o_custkey` and `<subquery2>`.`l_orderkey` = `dbt3_s001`.`orders`.`o_orderkey` and `dbt3_s001`.`lineitem`.`l_orderkey` = `dbt3_s001`.`orders`.`o_orderkey` group by `dbt3_s001`.`customer`.`c_name`,`dbt3_s001`.`customer`.`c_custkey`,`dbt3_s001`.`orders`.`o_orderkey`,`dbt3_s001`.`orders`.`o_orderDATE`,`dbt3_s001`.`orders`.`o_totalprice` order by `dbt3_s001`.`orders`.`o_totalprice` desc,`dbt3_s001`.`orders`.`o_orderDATE`
+Note 1003 /* select#1 */ select `dbt3_s001`.`customer`.`c_name` AS `c_name`,`dbt3_s001`.`customer`.`c_custkey` AS `c_custkey`,`dbt3_s001`.`orders`.`o_orderkey` AS `o_orderkey`,`dbt3_s001`.`orders`.`o_orderDATE` AS `o_orderdate`,`dbt3_s001`.`orders`.`o_totalprice` AS `o_totalprice`,sum(`dbt3_s001`.`lineitem`.`l_quantity`) AS `sum(l_quantity)` from (/* select#2 */ select `dbt3_s001`.`lineitem`.`l_orderkey` from `dbt3_s001`.`lineitem` group by `dbt3_s001`.`lineitem`.`l_orderkey` having sum(`dbt3_s001`.`lineitem`.`l_quantity`) > 250) join `dbt3_s001`.`customer` join `dbt3_s001`.`orders` join `dbt3_s001`.`lineitem` where `dbt3_s001`.`orders`.`o_custkey` = `dbt3_s001`.`customer`.`c_custkey` and `<subquery2>`.`l_orderkey` = `dbt3_s001`.`orders`.`o_orderkey` and `dbt3_s001`.`lineitem`.`l_orderkey` = `dbt3_s001`.`orders`.`o_orderkey` group by `dbt3_s001`.`customer`.`c_name`,`dbt3_s001`.`customer`.`c_custkey`,`dbt3_s001`.`orders`.`o_orderkey`,`dbt3_s001`.`orders`.`o_orderDATE`,`dbt3_s001`.`orders`.`o_totalprice` order by `dbt3_s001`.`orders`.`o_totalprice` desc,`dbt3_s001`.`orders`.`o_orderDATE`
 select c_name, c_custkey, o_orderkey, o_orderdate, o_totalprice,
 sum(l_quantity)
 from customer, orders, lineitem
@@ -359,13 +359,13 @@ and o_orderkey = l_orderkey
 group by c_name, c_custkey, o_orderkey, o_orderdate, o_totalprice
 order by o_totalprice desc, o_orderdate;
 id select_type table type possible_keys key key_len ref rows filtered Extra
-1 PRIMARY orders ALL PRIMARY,i_o_custkey NULL NULL NULL 1500 100.00 Using where; Using temporary; Using filesort
-1 PRIMARY customer eq_ref PRIMARY PRIMARY 4 dbt3_s001.orders.o_custkey 1 100.00
+1 PRIMARY customer ALL PRIMARY NULL NULL NULL 150 100.00 Using temporary; Using filesort
+1 PRIMARY orders ref PRIMARY,i_o_custkey i_o_custkey 5 dbt3_s001.customer.c_custkey 10 100.00
 1 PRIMARY <subquery2> eq_ref distinct_key distinct_key 4 dbt3_s001.orders.o_orderkey 1 100.00
 1 PRIMARY lineitem ref PRIMARY,i_l_orderkey,i_l_orderkey_quantity i_l_orderkey_quantity 4 dbt3_s001.orders.o_orderkey 4 100.00 Using index
 2 MATERIALIZED lineitem index NULL i_l_orderkey_quantity 13 NULL 6005 100.00 Using index
 Warnings:
-Note 1003 /* select#1 */ select `dbt3_s001`.`customer`.`c_name` AS `c_name`,`dbt3_s001`.`customer`.`c_custkey` AS `c_custkey`,`dbt3_s001`.`orders`.`o_orderkey` AS `o_orderkey`,`dbt3_s001`.`orders`.`o_orderDATE` AS `o_orderdate`,`dbt3_s001`.`orders`.`o_totalprice` AS `o_totalprice`,sum(`dbt3_s001`.`lineitem`.`l_quantity`) AS `sum(l_quantity)` from (/* select#2 */ select `dbt3_s001`.`lineitem`.`l_orderkey` from `dbt3_s001`.`lineitem` group by `dbt3_s001`.`lineitem`.`l_orderkey` having sum(`dbt3_s001`.`lineitem`.`l_quantity`) > 250) join `dbt3_s001`.`customer` join `dbt3_s001`.`orders` join `dbt3_s001`.`lineitem` where `dbt3_s001`.`customer`.`c_custkey` = `dbt3_s001`.`orders`.`o_custkey` and `<subquery2>`.`l_orderkey` = `dbt3_s001`.`orders`.`o_orderkey` and `dbt3_s001`.`lineitem`.`l_orderkey` = `dbt3_s001`.`orders`.`o_orderkey` group by `dbt3_s001`.`customer`.`c_name`,`dbt3_s001`.`customer`.`c_custkey`,`dbt3_s001`.`orders`.`o_orderkey`,`dbt3_s001`.`orders`.`o_orderDATE`,`dbt3_s001`.`orders`.`o_totalprice` order by `dbt3_s001`.`orders`.`o_totalprice` desc,`dbt3_s001`.`orders`.`o_orderDATE`
+Note 1003 /* select#1 */ select `dbt3_s001`.`customer`.`c_name` AS `c_name`,`dbt3_s001`.`customer`.`c_custkey` AS `c_custkey`,`dbt3_s001`.`orders`.`o_orderkey` AS `o_orderkey`,`dbt3_s001`.`orders`.`o_orderDATE` AS `o_orderdate`,`dbt3_s001`.`orders`.`o_totalprice` AS `o_totalprice`,sum(`dbt3_s001`.`lineitem`.`l_quantity`) AS `sum(l_quantity)` from (/* select#2 */ select `dbt3_s001`.`lineitem`.`l_orderkey` from `dbt3_s001`.`lineitem` group by `dbt3_s001`.`lineitem`.`l_orderkey` having sum(`dbt3_s001`.`lineitem`.`l_quantity`) > 250) join `dbt3_s001`.`customer` join `dbt3_s001`.`orders` join `dbt3_s001`.`lineitem` where `dbt3_s001`.`orders`.`o_custkey` = `dbt3_s001`.`customer`.`c_custkey` and `<subquery2>`.`l_orderkey` = `dbt3_s001`.`orders`.`o_orderkey` and `dbt3_s001`.`lineitem`.`l_orderkey` = `dbt3_s001`.`orders`.`o_orderkey` group by `dbt3_s001`.`customer`.`c_name`,`dbt3_s001`.`customer`.`c_custkey`,`dbt3_s001`.`orders`.`o_orderkey`,`dbt3_s001`.`orders`.`o_orderDATE`,`dbt3_s001`.`orders`.`o_totalprice` order by `dbt3_s001`.`orders`.`o_totalprice` desc,`dbt3_s001`.`orders`.`o_orderDATE`
 select c_name, c_custkey, o_orderkey, o_orderdate, o_totalprice,
 sum(l_quantity)
 from customer, orders, lineitem
@@ -1294,7 +1294,7 @@ select * from t1, t2, t1 as t3
 where t1.b=t2.c and t2.d=t3.a and t3.b<5 and t1.a < 2000;
 id select_type table type possible_keys key key_len ref rows filtered Extra
 1 SIMPLE t1 ALL NULL NULL NULL NULL 262144 100.00 Using where
-1 SIMPLE t2 ref c,d c 5 test.t1.b 5 100.00
+1 SIMPLE t2 ref c,d c 5 test.t1.b 2 100.00
 1 SIMPLE t3 ALL NULL NULL NULL NULL 262144 100.00 Using where; Using join buffer (flat, BNL join)
 Warnings:
 Note 1003 select `test`.`t1`.`a` AS `a`,`test`.`t1`.`b` AS `b`,`test`.`t2`.`c` AS `c`,`test`.`t2`.`d` AS `d`,`test`.`t3`.`a` AS `a`,`test`.`t3`.`b` AS `b` from `test`.`t1` join `test`.`t2` join `test`.`t1` `t3` where `test`.`t2`.`c` = `test`.`t1`.`b` and `test`.`t3`.`a` = `test`.`t2`.`d` and `test`.`t3`.`b` < 5 and `test`.`t1`.`a` < 2000
diff --git a/sql/sql_select.cc b/sql/sql_select.cc
index b986e59656e08..ba88584ac098d 100644
--- a/sql/sql_select.cc
+++ b/sql/sql_select.cc
@@ -7297,6 +7297,26 @@ add_keyuse(DYNAMIC_ARRAY *keyuse_array, KEY_FIELD *key_field,
   keyuse.cond_guard= key_field->cond_guard;
   keyuse.sj_pred_no= key_field->sj_pred_no;
   keyuse.validity_ref= 0;
+
+  /* Compute driving_ndv from EITS statistics if val is a field */
+  keyuse.driving_ndv= 0;
+  {
+    Item *driving_item= key_field->val->real_item();
+    if (driving_item->type() == Item::FIELD_ITEM)
+    {
+      Field *driving_field= ((Item_field*) driving_item)->field;
+      if (driving_field->read_stats)
+      {
+        double avg_freq= driving_field->read_stats->get_avg_frequency();
+        if (avg_freq > 0)
+        {
+          double rows= (double) driving_field->table->stat_records();
+          keyuse.driving_ndv= MY_MIN(rows / avg_freq, rows);
+        }
+      }
+    }
+  }
+
   return (insert_dynamic(keyuse_array,(uchar*) &keyuse));
 }
@@ -7450,6 +7470,7 @@ add_ft_keys(DYNAMIC_ARRAY *keyuse_array,
   keyuse.sj_pred_no= UINT_MAX;
   keyuse.validity_ref= 0;
   keyuse.null_rejecting= FALSE;
+  keyuse.driving_ndv= 0; /* Not applicable for fulltext keys */
   return insert_dynamic(keyuse_array,(uchar*) &keyuse);
 }
@@ -8222,6 +8243,52 @@ inline double use_found_constraint(double records)
 }
 
+
+/*
+  Compute the probability that a value from the driving table will find
+  a match in the inner table's index, based on NDV (number of distinct values).
+
+  @param inner_table      The inner (looked-up) table
+  @param key              The key being used for ref access
+  @param min_driving_ndv  Minimum NDV across all usable driving columns
+                          (precomputed during KEYUSE iteration)
+
+  @return match_probability in range (0, 1.0]
+          1.0 if statistics are not available
+
+  The idea: if the driving table has more distinct values than the inner
+  table's key, only a fraction of driving rows will find matches.
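+
+  Worked example (the numbers from the match_probability test): an inner
+  table with 300 rows and avg_frequency 75 has ndv(inner) = 4; a driving
+  column with ndv(outer) = 100 lets only ~4 of every 100 driving values
+  match, so a per-key estimate of 75 rows is scaled by 0.04 down to 3.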
+
+    match_probability = min(1.0, ndv(inner.key) / ndv(outer.col))
+
+  where ndv = table_rows / avg_frequency
+*/
+
+static double
+get_ref_match_probability(TABLE *inner_table, uint key, double min_driving_ndv)
+{
+  /* Get NDV for inner table's key (first key part) */
+  KEY *keyinfo= inner_table->key_info + key;
+  Field *inner_field= keyinfo->key_part[0].field;
+  if (!inner_field->read_stats)
+    return 1.0;
+
+  double inner_avg_freq= inner_field->read_stats->get_avg_frequency();
+  if (inner_avg_freq <= 0)
+    return 1.0;
+
+  double inner_rows= (double) inner_table->stat_records();
+  double inner_ndv= inner_rows / inner_avg_freq;
+
+  /*
+    match_probability = min(1.0, ndv(inner) / ndv(outer))
+
+    If inner has fewer distinct values than outer, only a fraction
+    of outer rows will find a match.
+  */
+  return MY_MIN(1.0, inner_ndv / min_driving_ndv);
+}
+
 
 /*
   Calculate the cost of reading a set of rows trough an index
@@ -8801,6 +8868,8 @@ best_access_path(JOIN *join,
       DBUG_PRINT("info", ("Considering ref access on key %s",
                           keyuse->table->key_info[keyuse->key].name.str));
 
+      double min_driving_ndv= DBL_MAX; /* Track min driving NDV for keypart 0 */
+
       do /* For each keypart */
       {
         uint keypart= keyuse->keypart;
@@ -8844,6 +8913,10 @@ best_access_path(JOIN *join,
             best_part_found_ref= (keyuse->used_tables & ~join->const_table_map);
         }
 
+        /* Track minimum driving NDV for keypart 0 */
+        if (keypart == 0 && keyuse->driving_ndv > 0)
+          set_if_smaller(min_driving_ndv, keyuse->driving_ndv);
+
        if (rec > keyuse->ref_table_rows)
          rec= keyuse->ref_table_rows;
 	/*
@@ -9059,6 +9132,36 @@
               }
             }
           }
+
+          /*
+            Apply match_probability based on NDV to adjust records.
+            If the driving table has more distinct values than the inner
+            table's key, only a fraction of driving rows will find matches.
+          */
+          if (key_parts == 1 &&
+              min_driving_ndv > 0 && min_driving_ndv < DBL_MAX)
+          {
+            /*
+              Bound effective NDV by record_count - NDV can't exceed the
+              number of rows from driving tables after filtering.
+            */
+            double effective_driving_ndv= min_driving_ndv;
+            set_if_smaller(effective_driving_ndv, record_count);
+
+            double match_prob= get_ref_match_probability(table, key,
+                                                         effective_driving_ndv);
+            if (match_prob < 1.0)
+            {
+              if (unlikely(trace_access_idx.trace_started()))
+              {
+                trace_access_idx.
+                  add("match_probability", match_prob).
+                  add("rows_before_adjustment", records);
+              }
+              records *= match_prob;
+            }
+          }
+
           /* Calculate the cost of the index access */
           tmp= cost_for_index_read(thd, table, key, (ha_rows) records,
                                    0);
@@ -9270,6 +9373,36 @@
           }
           set_if_smaller(records, (double) s->records);
+
+          /*
+            Apply match_probability based on NDV to adjust records.
+            If the driving table has more distinct values than the inner
+            table's key, only a fraction of driving rows will find matches.
+          */
+          if (max_key_part == 1 &&
+              min_driving_ndv > 0 && min_driving_ndv < DBL_MAX)
+          {
+            /*
+              Bound effective NDV by record_count - NDV can't exceed the
+              number of rows from driving tables after filtering.
+            */
+            double effective_driving_ndv= min_driving_ndv;
+            set_if_smaller(effective_driving_ndv, record_count);
+
+            double match_prob= get_ref_match_probability(table, key,
+                                                         effective_driving_ndv);
+            if (match_prob < 1.0)
+            {
+              if (unlikely(trace_access_idx.trace_started()))
+              {
+                trace_access_idx.
+                  add("match_probability", match_prob).
+ add("rows_before_adjustment", records); + } + records *= match_prob; + } + } + tmp= cost_for_index_read(thd, table, key, (ha_rows)records, 0); tmp.copy_cost+= extra_cost; } diff --git a/sql/sql_select.h b/sql/sql_select.h index 82e975407edda..06d92b76e0d07 100644 --- a/sql/sql_select.h +++ b/sql/sql_select.h @@ -81,6 +81,13 @@ typedef struct keyuse_t { */ bool *validity_ref; + /* + Number of distinct values in the driving column (val). + Computed from EITS statistics when KEYUSE is created. + 0 means statistics not available. + */ + double driving_ndv; + bool is_for_hash_join() { return is_hash_join_key_no(key); } } KEYUSE; diff --git a/sql/sql_test.cc b/sql/sql_test.cc index 24736007f083c..52d5bf97177d9 100644 --- a/sql/sql_test.cc +++ b/sql/sql_test.cc @@ -705,5 +705,7 @@ void print_keyuse_array_for_trace(THD *thd, DYNAMIC_ARRAY *keyuse_array) .field->field_name.str)). add("equals",keyuse->val). add("null_rejecting",keyuse->null_rejecting); + if (keyuse->driving_ndv > 0) + keyuse_elem.add("driving_ndv", keyuse->driving_ndv); } }