From 8ffdb9ded77ce49e6a187666a0aab3af7bf5a07b Mon Sep 17 00:00:00 2001
From: Oleg Smirnov
Date: Fri, 23 Jan 2026 17:39:02 +0700
Subject: [PATCH] MDEV-38728 Improve join size estimation for ref access

When estimating the number of rows produced by a join after `ref`
access, the optimizer assumes that every driving-table value will find
a match in the inner table. This causes overestimation when the driving
table has more distinct values than the inner table's key.

Fix: use the number of distinct values (NDV) of the columns in the join
predicate to calculate a match probability:

  match_prob = min(1.0, NDV(inner) / NDV(driving))

The expected number of records after `ref` access is then multiplied by
the match probability, which gives a more accurate estimate.

Limitations:
- EITS statistics must be available for both columns in the join
  predicate
- both columns must be real table fields
- only single-column ref access is supported
- only the first key part of the inner table's index is used

TODO:
- a WHERE filter on the driving table may reduce its NDV and thus
  affect the estimate. Currently this is handled only in a basic way
  (driving_ndv is capped at the number of records of the current
  partial join)

This commit updates only those test results that have been verified,
i.e. where the new plan gives a better join size estimate. Other
failing tests have not been verified yet.
---
 .../main/delete_use_source_engines.result    |  20 +--
 mysql-test/main/derived_cond_pushdown.result |  12 +-
 mysql-test/main/join_cache.result            |   4 +-
 mysql-test/main/match_probability.result     | 148 ++++++++++++++++++
 mysql-test/main/match_probability.test       | 113 +++++++++++++
 mysql-test/main/null_key.result              |   2 +-
 mysql-test/main/selectivity.result           |  14 +-
 sql/sql_select.cc                            | 133 ++++++++++++++++
 sql/sql_select.h                             |   7 +
 sql/sql_test.cc                              |   2 +
 10 files changed, 429 insertions(+), 26 deletions(-)
 create mode 100644 mysql-test/main/match_probability.result
 create mode 100644 mysql-test/main/match_probability.test

diff --git a/mysql-test/main/delete_use_source_engines.result b/mysql-test/main/delete_use_source_engines.result
index b1ad9c009d924..7b8097bbd8d92 100644
--- a/mysql-test/main/delete_use_source_engines.result
+++ b/mysql-test/main/delete_use_source_engines.result
@@ -731,15 +731,15 @@ create table tmp as select * from t1 where c1 in (select a.c2 from t1 a where a.
explain select * from t1 where c1 in (select a.c2 from t1 a where a.c2 = t1.c3) limit 1; id select_type table type possible_keys key key_len ref rows Extra 1 PRIMARY t1 ALL NULL NULL NULL NULL 32 Using where -1 PRIMARY a ref t1_c2 t1_c2 5 test.t1.c1 5 Using index; FirstMatch(t1) +1 PRIMARY a ref t1_c2 t1_c2 5 test.t1.c1 4 Using index; FirstMatch(t1) explain delete from t1 where c1 in (select a.c2 from t1 a where a.c2 = t1.c3) limit 1; id select_type table type possible_keys key key_len ref rows Extra 1 PRIMARY t1 ALL NULL NULL NULL NULL 32 Using where -1 PRIMARY a ref t1_c2 t1_c2 5 test.t1.c1 5 Using index; FirstMatch(t1) +1 PRIMARY a ref t1_c2 t1_c2 5 test.t1.c1 4 Using index; FirstMatch(t1) analyze delete from t1 where c1 in (select a.c2 from t1 a where a.c2 = t1.c3) limit 1; id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra 1 PRIMARY t1 ALL NULL NULL NULL NULL 32 32.00 100.00 12.50 Using where -1 PRIMARY a ref t1_c2 t1_c2 5 test.t1.c1 5 0.25 18.75 100.00 Using index; FirstMatch(t1) +1 PRIMARY a ref t1_c2 t1_c2 5 test.t1.c1 4 0.25 25.00 100.00 Using index; FirstMatch(t1) select * from t1; c1 c2 c3 1 2 2 @@ -2709,15 +2709,15 @@ create table tmp as select * from t1 where c1 in (select a.c2 from t1 a where a. explain select * from t1 where c1 in (select a.c2 from t1 a where a.c2 = t1.c3) limit 1; id select_type table type possible_keys key key_len ref rows Extra 1 PRIMARY t1 ALL NULL NULL NULL NULL 32 Using where -1 PRIMARY a ref t1_c2 t1_c2 5 test.t1.c1 5 Using index; FirstMatch(t1) +1 PRIMARY a ref t1_c2 t1_c2 5 test.t1.c1 4 Using index; FirstMatch(t1) explain delete from t1 where c1 in (select a.c2 from t1 a where a.c2 = t1.c3) limit 1; id select_type table type possible_keys key key_len ref rows Extra 1 PRIMARY t1 ALL NULL NULL NULL NULL 32 Using where -1 PRIMARY a ref t1_c2 t1_c2 5 test.t1.c1 5 Using index; FirstMatch(t1) +1 PRIMARY a ref t1_c2 t1_c2 5 test.t1.c1 4 Using index; FirstMatch(t1) analyze delete from t1 where c1 in (select a.c2 from t1 a where a.c2 = t1.c3) limit 1; id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra 1 PRIMARY t1 ALL NULL NULL NULL NULL 32 27.00 100.00 14.81 Using where -1 PRIMARY a ref t1_c2 t1_c2 5 test.t1.c1 5 0.25 18.75 100.00 Using index; FirstMatch(t1) +1 PRIMARY a ref t1_c2 t1_c2 5 test.t1.c1 4 0.25 25.00 100.00 Using index; FirstMatch(t1) select * from t1; c1 c2 c3 1 2 2 @@ -2762,7 +2762,7 @@ explain select * from t1 where c1 in (select a.c2 from t1 a where a.c3 = t1.c3) order by c3 desc limit 1; id select_type table type possible_keys key key_len ref rows Extra 1 PRIMARY t1 ALL NULL NULL NULL NULL 32 Using where; Using filesort -1 PRIMARY a ref t1_c2 t1_c2 5 test.t1.c1 5 Using where; FirstMatch(t1) +1 PRIMARY a ref t1_c2 t1_c2 5 test.t1.c1 4 Using where; FirstMatch(t1) explain delete from t1 where c1 in (select a.c2 from t1 a where a.c3 = t1.c3) order by c3 desc limit 1; id select_type table type possible_keys key key_len ref rows Extra @@ -5114,15 +5114,15 @@ create table tmp as select * from t1 where c1 in (select a.c2 from t1 a where a. 
explain select * from t1 where c1 in (select a.c2 from t1 a where a.c2 = t1.c3) limit 1; id select_type table type possible_keys key key_len ref rows Extra 1 PRIMARY t1 ALL NULL NULL NULL NULL 32 Using where -1 PRIMARY a ref t1_c2 t1_c2 5 test.t1.c1 5 Using index; FirstMatch(t1) +1 PRIMARY a ref t1_c2 t1_c2 5 test.t1.c1 4 Using index; FirstMatch(t1) explain delete from t1 where c1 in (select a.c2 from t1 a where a.c2 = t1.c3) limit 1; id select_type table type possible_keys key key_len ref rows Extra 1 PRIMARY t1 ALL NULL NULL NULL NULL 32 Using where -1 PRIMARY a ref t1_c2 t1_c2 5 test.t1.c1 5 Using index; FirstMatch(t1) +1 PRIMARY a ref t1_c2 t1_c2 5 test.t1.c1 4 Using index; FirstMatch(t1) analyze delete from t1 where c1 in (select a.c2 from t1 a where a.c2 = t1.c3) limit 1; id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra 1 PRIMARY t1 ALL NULL NULL NULL NULL 32 29.00 100.00 13.79 Using where -1 PRIMARY a ref t1_c2 t1_c2 5 test.t1.c1 5 0.25 18.75 100.00 Using index; FirstMatch(t1) +1 PRIMARY a ref t1_c2 t1_c2 5 test.t1.c1 4 0.25 25.00 100.00 Using index; FirstMatch(t1) select * from t1; c1 c2 c3 1 2 2 diff --git a/mysql-test/main/derived_cond_pushdown.result b/mysql-test/main/derived_cond_pushdown.result index 1fa1eea5dd307..0593d8a10d1f6 100644 --- a/mysql-test/main/derived_cond_pushdown.result +++ b/mysql-test/main/derived_cond_pushdown.result @@ -20147,7 +20147,7 @@ from t2, t3, (select c, max(b) max, min(b) min from t4 group by c) t where t2.b < 40 and t2.a=t3.a and t3.c=t.c; id select_type table type possible_keys key key_len ref rows filtered Extra 1 PRIMARY t2 ALL NULL NULL NULL NULL 90 60.00 Using where -1 PRIMARY t3 ref idx_a idx_a 5 test.t2.a 1 100.00 Using where +1 PRIMARY t3 ref idx_a idx_a 5 test.t2.a 0 100.00 Using where 1 PRIMARY ref key0 key0 128 test.t3.c 5 100.00 2 DERIVED t4 ALL idx_c NULL NULL NULL 160 100.00 Using temporary; Using filesort Warnings: @@ -20182,7 +20182,7 @@ EXPLAIN "used_key_parts": ["a"], "ref": ["test.t2.a"], "loops": 53.99999991, - "rows": 1, + "rows": 0, "cost": "COST_REPLACED", "filtered": 100, "attached_condition": "t3.c is not null" @@ -20197,7 +20197,7 @@ EXPLAIN "key_length": "128", "used_key_parts": ["c"], "ref": ["test.t3.c"], - "loops": 80.99999987, + "loops": 52.89695991, "rows": 5, "cost": "COST_REPLACED", "filtered": 100, @@ -20690,7 +20690,7 @@ from t2, t3, (select c, b, sum(b) over (partition by c) from t4 ) t where t2.b < 40 and t2.a=t3.a and t3.c=t.c; id select_type table type possible_keys key key_len ref rows filtered Extra 1 PRIMARY t2 ALL NULL NULL NULL NULL 90 60.00 Using where -1 PRIMARY t3 ref idx_a idx_a 5 test.t2.a 1 100.00 Using where +1 PRIMARY t3 ref idx_a idx_a 5 test.t2.a 0 100.00 Using where 1 PRIMARY ref key0 key0 128 test.t3.c 10 100.00 2 DERIVED t4 ALL idx_c NULL NULL NULL 160 100.00 Using temporary Warnings: @@ -20725,7 +20725,7 @@ EXPLAIN "used_key_parts": ["a"], "ref": ["test.t2.a"], "loops": 53.99999991, - "rows": 1, + "rows": 0, "cost": "COST_REPLACED", "filtered": 100, "attached_condition": "t3.c is not null" @@ -20740,7 +20740,7 @@ EXPLAIN "key_length": "128", "used_key_parts": ["c"], "ref": ["test.t3.c"], - "loops": 80.99999987, + "loops": 52.89695991, "rows": 10, "cost": "COST_REPLACED", "filtered": 100, diff --git a/mysql-test/main/join_cache.result b/mysql-test/main/join_cache.result index aa12ac8942d39..ddd82d5ce3933 100644 --- a/mysql-test/main/join_cache.result +++ b/mysql-test/main/join_cache.result @@ -6365,7 +6365,7 @@ LEFT JOIN t3 ON t2.d = t3.c LEFT 
JOIN t4 ON t3.c=1; id select_type table type possible_keys key key_len ref rows Extra 1 SIMPLE t1 index NULL PRIMARY 4 NULL 10000 Using index -1 SIMPLE t2 ref b b 4 test.t1.b 1 +1 SIMPLE t2 ref b b 4 test.t1.b 0 1 SIMPLE t3 eq_ref PRIMARY PRIMARY 4 test.t2.d 1 Using where; Using index 1 SIMPLE t4 index NULL PRIMARY 4 NULL 3000 Using where; Using index SELECT COUNT(*) @@ -6383,7 +6383,7 @@ LEFT JOIN t3 ON t2.d = t3.c LEFT JOIN t4 ON t3.c=1; id select_type table type possible_keys key key_len ref rows Extra 1 SIMPLE t1 index NULL PRIMARY 4 NULL 10000 Using index -1 SIMPLE t2 ref b b 4 test.t1.b 1 +1 SIMPLE t2 ref b b 4 test.t1.b 0 1 SIMPLE t3 eq_ref PRIMARY PRIMARY 4 test.t2.d 1 Using where; Using index 1 SIMPLE t4 index NULL PRIMARY 4 NULL 3000 Using where; Using index; Using join buffer (flat, BNL join) SELECT COUNT(*) diff --git a/mysql-test/main/match_probability.result b/mysql-test/main/match_probability.result new file mode 100644 index 0000000000000..f2d2bea1286cd --- /dev/null +++ b/mysql-test/main/match_probability.result @@ -0,0 +1,148 @@ +# Setup: Two tables with skewed foreign key distribution +CREATE TABLE t_ndv100 (a INT); +CREATE TABLE t_ndv3 (a INT, b VARCHAR(50), KEY idx_a (a)); +# All values in t_ndv100.a are different (100 distinct values) +INSERT INTO t_ndv100 (a) SELECT seq FROM seq_1_to_100; +# There are only 3 distinct values in t_ndv3.a +INSERT INTO t_ndv3 (a, b) SELECT seq/100, 'def' FROM seq_1_to_300; +set optimizer_trace=1; +# Inefficient plan until EITS is collected (full scan of t_ndv3) +EXPLAIN SELECT t_ndv100.a, t_ndv3.b +FROM t_ndv100 +STRAIGHT_JOIN t_ndv3 ON t_ndv100.a = t_ndv3.a; +id select_type table type possible_keys key key_len ref rows Extra +1 SIMPLE t_ndv100 ALL NULL NULL NULL NULL 100 +1 SIMPLE t_ndv3 ALL idx_a NULL NULL NULL 300 Using where; Using join buffer (flat, BNL join) +# Collect statistics only for the driving table (t_ndv100) +ANALYZE TABLE t_ndv100 PERSISTENT FOR ALL; +Table Op Msg_type Msg_text +test.t_ndv100 analyze status Engine-independent statistics collected +test.t_ndv100 analyze status OK +# It is not enough until there are statistics for the inner table (t_ndv3) +EXPLAIN SELECT t_ndv100.a, t_ndv3.b +FROM t_ndv100 +STRAIGHT_JOIN t_ndv3 ON t_ndv100.a = t_ndv3.a; +id select_type table type possible_keys key key_len ref rows Extra +1 SIMPLE t_ndv100 ALL NULL NULL NULL NULL 100 +1 SIMPLE t_ndv3 ALL idx_a NULL NULL NULL 300 Using where; Using join buffer (flat, BNL join) +ANALYZE TABLE t_ndv3 PERSISTENT FOR ALL; +Table Op Msg_type Msg_text +test.t_ndv3 analyze status Engine-independent statistics collected +test.t_ndv3 analyze status Table is already up to date +# After EITS is collected for both tables, the plan is efficient +# (index scan of t_ndv3). 
"rows" in the output are close to actual "r_rows" +ANALYZE SELECT t_ndv100.a, t_ndv3.b +FROM t_ndv100 +STRAIGHT_JOIN t_ndv3 ON t_ndv100.a = t_ndv3.a; +id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra +1 SIMPLE t_ndv100 ALL NULL NULL NULL NULL 100 100.00 100.00 100.00 Using where +1 SIMPLE t_ndv3 ref idx_a idx_a 5 test.t_ndv100.a 3 2.51 100.00 100.00 +# Optimization must be reflected in the trace +SELECT json_detailed(json_extract(trace, '$**.match_probability', +'$**.rows_before_adjustment')) +FROM information_schema.optimizer_trace; +json_detailed(json_extract(trace, '$**.match_probability', +'$**.rows_before_adjustment')) +[ + 0.04, + 75 +] +# Match probability is not applicable because the left part of the +# join condition is not a field but an expression (t_ndv100.a + 10). See the +# difference between "rows" and "r_rows" +ANALYZE SELECT t_ndv100.a, t_ndv3.b +FROM t_ndv100 +STRAIGHT_JOIN t_ndv3 ON t_ndv100.a + 10 = t_ndv3.a; +id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra +1 SIMPLE t_ndv100 ALL NULL NULL NULL NULL 100 100.00 100.00 100.00 +1 SIMPLE t_ndv3 ALL idx_a NULL NULL NULL 300 300.00 25.00 0.00 Using where; Using join buffer (flat, BNL join) +# Must be no records in the trace +SELECT json_detailed(json_extract(trace, '$**.match_probability', +'$**.rows_before_adjustment')) +FROM information_schema.optimizer_trace; +json_detailed(json_extract(trace, '$**.match_probability', +'$**.rows_before_adjustment')) +NULL +# If all values are NULL, match probability is not applicable +CREATE TABLE t_nulls (a INT, b VARCHAR(50), KEY idx_a (a)); +INSERT INTO t_nulls (a, b) SELECT NULL, 'def' FROM seq_1_to_300; +ANALYZE TABLE t_nulls PERSISTENT FOR ALL; +Table Op Msg_type Msg_text +test.t_nulls analyze status Engine-independent statistics collected +test.t_nulls analyze status Table is already up to date +EXPLAIN SELECT t_ndv100.a, t_nulls.b +FROM t_ndv100 +STRAIGHT_JOIN t_nulls ON t_ndv100.a = t_nulls.a; +id select_type table type possible_keys key key_len ref rows Extra +1 SIMPLE t_ndv100 ALL NULL NULL NULL NULL 100 Using where +1 SIMPLE t_nulls ref idx_a idx_a 5 test.t_ndv100.a 1 +SELECT json_detailed(json_extract(trace, '$**.match_probability', +'$**.rows_before_adjustment')) +FROM information_schema.optimizer_trace; +json_detailed(json_extract(trace, '$**.match_probability', +'$**.rows_before_adjustment')) +NULL +#------------------------------ +# Test multi-part index +#------------------------------ +CREATE TABLE t_ndv30 (a INT, b INT, c VARCHAR(50), KEY idx_ab (a, b)); +INSERT INTO t_ndv30 (a, b, c) SELECT seq/10, seq/10, 'def' FROM seq_1_to_300; +ANALYZE TABLE t_ndv30 PERSISTENT FOR ALL; +Table Op Msg_type Msg_text +test.t_ndv30 analyze status Engine-independent statistics collected +test.t_ndv30 analyze status Table is already up to date +# If more than one key part is used, match probability is not applicable. 
+# See the mismatch between "rows" and "r_rows" in ANALYZE output +ANALYZE SELECT t_ndv100.a, t_ndv30.b +FROM t_ndv100 +STRAIGHT_JOIN t_ndv30 ON t_ndv100.a = t_ndv30.a +AND t_ndv100.a = t_ndv30.b; +id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra +1 SIMPLE t_ndv100 ALL NULL NULL NULL NULL 100 100.00 100.00 100.00 Using where +1 SIMPLE t_ndv30 ref idx_ab idx_ab 10 test.t_ndv100.a,test.t_ndv100.a 9 2.96 100.00 100.00 Using index +# Must be no records in the trace +SELECT json_detailed(json_extract(trace, '$**.match_probability', +'$**.rows_before_adjustment')) +FROM information_schema.optimizer_trace; +json_detailed(json_extract(trace, '$**.match_probability', +'$**.rows_before_adjustment')) +NULL +# Only first key part of `idx_ab` is used, match probability is applicable. +# "rows" is now much closer to actual "r_rows". +ANALYZE SELECT t_ndv100.a, t_ndv30.b +FROM t_ndv100 +STRAIGHT_JOIN t_ndv30 ON t_ndv100.a = t_ndv30.a; +id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra +1 SIMPLE t_ndv100 ALL NULL NULL NULL NULL 100 100.00 100.00 100.00 Using where +1 SIMPLE t_ndv30 ref idx_ab idx_ab 5 test.t_ndv100.a 3 2.96 100.00 100.00 Using index +# Must be reflected in the trace +SELECT json_detailed(json_extract(trace, '$**.match_probability', +'$**.rows_before_adjustment')) +FROM information_schema.optimizer_trace; +json_detailed(json_extract(trace, '$**.match_probability', +'$**.rows_before_adjustment')) +[ + 0.31000062, + 9.6774 +] +# Three tables with equality propagation. +# min_driving_ndv = min(NDV(t_ndv30.a), NDV(t_ndv100.a)) = min(30, 100) = 30 +ANALYZE SELECT t_ndv100.a, t_ndv3.b +FROM t_ndv30 +STRAIGHT_JOIN t_ndv100 ON t_ndv30.a = t_ndv100.a +STRAIGHT_JOIN t_ndv3 ON t_ndv100.a = t_ndv3.a; +id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra +1 SIMPLE t_ndv30 index idx_ab idx_ab 10 NULL 300 300.00 100.00 100.00 Using where; Using index +1 SIMPLE t_ndv100 ALL NULL NULL NULL NULL 100 100.00 100.00 0.99 Using where; Using join buffer (flat, BNL join) +1 SIMPLE t_ndv3 ref idx_a idx_a 5 test.t_ndv30.a 9 8.48 100.00 100.00 +# Must be reflected in the trace (match_prob ~= 0.1) +SELECT json_detailed(json_extract(trace, '$**.match_probability', +'$**.rows_before_adjustment')) +FROM information_schema.optimizer_trace; +json_detailed(json_extract(trace, '$**.match_probability', +'$**.rows_before_adjustment')) +[ + 0.129032, + 75 +] +DROP TABLE t_ndv100, t_ndv3, t_ndv30, t_nulls; diff --git a/mysql-test/main/match_probability.test b/mysql-test/main/match_probability.test new file mode 100644 index 0000000000000..7890a5dc99eaf --- /dev/null +++ b/mysql-test/main/match_probability.test @@ -0,0 +1,113 @@ +--source include/have_sequence.inc + +--echo # Setup: Two tables with skewed foreign key distribution +CREATE TABLE t_ndv100 (a INT); + +CREATE TABLE t_ndv3 (a INT, b VARCHAR(50), KEY idx_a (a)); + +--echo # All values in t_ndv100.a are different (100 distinct values) +INSERT INTO t_ndv100 (a) SELECT seq FROM seq_1_to_100; + +--echo # There are only 3 distinct values in t_ndv3.a +INSERT INTO t_ndv3 (a, b) SELECT seq/100, 'def' FROM seq_1_to_300; + +set optimizer_trace=1; + +--echo # Inefficient plan until EITS is collected (full scan of t_ndv3) +EXPLAIN SELECT t_ndv100.a, t_ndv3.b +FROM t_ndv100 +STRAIGHT_JOIN t_ndv3 ON t_ndv100.a = t_ndv3.a; + +--echo # Collect statistics only for the driving table (t_ndv100) +ANALYZE TABLE t_ndv100 PERSISTENT FOR ALL; + +--echo # It is not 
enough until there are statistics for the inner table (t_ndv3) +EXPLAIN SELECT t_ndv100.a, t_ndv3.b +FROM t_ndv100 +STRAIGHT_JOIN t_ndv3 ON t_ndv100.a = t_ndv3.a; + +ANALYZE TABLE t_ndv3 PERSISTENT FOR ALL; + +--echo # After EITS is collected for both tables, the plan is efficient +--echo # (index scan of t_ndv3). "rows" in the output are close to actual "r_rows" +ANALYZE SELECT t_ndv100.a, t_ndv3.b +FROM t_ndv100 +STRAIGHT_JOIN t_ndv3 ON t_ndv100.a = t_ndv3.a; + +--echo # Optimization must be reflected in the trace +SELECT json_detailed(json_extract(trace, '$**.match_probability', + '$**.rows_before_adjustment')) +FROM information_schema.optimizer_trace; + +--echo # Match probability is not applicable because the left part of the +--echo # join condition is not a field but an expression (t_ndv100.a + 10). See the +--echo # difference between "rows" and "r_rows" +ANALYZE SELECT t_ndv100.a, t_ndv3.b +FROM t_ndv100 +STRAIGHT_JOIN t_ndv3 ON t_ndv100.a + 10 = t_ndv3.a; + +--echo # Must be no records in the trace +SELECT json_detailed(json_extract(trace, '$**.match_probability', + '$**.rows_before_adjustment')) +FROM information_schema.optimizer_trace; + +--echo # If all values are NULL, match probability is not applicable +CREATE TABLE t_nulls (a INT, b VARCHAR(50), KEY idx_a (a)); + +INSERT INTO t_nulls (a, b) SELECT NULL, 'def' FROM seq_1_to_300; + +ANALYZE TABLE t_nulls PERSISTENT FOR ALL; + +EXPLAIN SELECT t_ndv100.a, t_nulls.b +FROM t_ndv100 +STRAIGHT_JOIN t_nulls ON t_ndv100.a = t_nulls.a; + +SELECT json_detailed(json_extract(trace, '$**.match_probability', + '$**.rows_before_adjustment')) +FROM information_schema.optimizer_trace; + +--echo #------------------------------ +--echo # Test multi-part index +--echo #------------------------------ +CREATE TABLE t_ndv30 (a INT, b INT, c VARCHAR(50), KEY idx_ab (a, b)); + +INSERT INTO t_ndv30 (a, b, c) SELECT seq/10, seq/10, 'def' FROM seq_1_to_300; + +ANALYZE TABLE t_ndv30 PERSISTENT FOR ALL; + +--echo # If more than one key part is used, match probability is not applicable. +--echo # See the mismatch between "rows" and "r_rows" in ANALYZE output +ANALYZE SELECT t_ndv100.a, t_ndv30.b +FROM t_ndv100 +STRAIGHT_JOIN t_ndv30 ON t_ndv100.a = t_ndv30.a + AND t_ndv100.a = t_ndv30.b; + +--echo # Must be no records in the trace +SELECT json_detailed(json_extract(trace, '$**.match_probability', + '$**.rows_before_adjustment')) +FROM information_schema.optimizer_trace; + +--echo # Only first key part of `idx_ab` is used, match probability is applicable. +--echo # "rows" is now much closer to actual "r_rows". +ANALYZE SELECT t_ndv100.a, t_ndv30.b +FROM t_ndv100 +STRAIGHT_JOIN t_ndv30 ON t_ndv100.a = t_ndv30.a; + +--echo # Must be reflected in the trace +SELECT json_detailed(json_extract(trace, '$**.match_probability', + '$**.rows_before_adjustment')) +FROM information_schema.optimizer_trace; + +--echo # Three tables with equality propagation. 
+--echo # min_driving_ndv = min(NDV(t_ndv30.a), NDV(t_ndv100.a)) = min(30, 100) = 30 +ANALYZE SELECT t_ndv100.a, t_ndv3.b +FROM t_ndv30 +STRAIGHT_JOIN t_ndv100 ON t_ndv30.a = t_ndv100.a +STRAIGHT_JOIN t_ndv3 ON t_ndv100.a = t_ndv3.a; + +--echo # Must be reflected in the trace (match_prob ~= 0.1) +SELECT json_detailed(json_extract(trace, '$**.match_probability', + '$**.rows_before_adjustment')) +FROM information_schema.optimizer_trace; + +DROP TABLE t_ndv100, t_ndv3, t_ndv30, t_nulls; \ No newline at end of file diff --git a/mysql-test/main/null_key.result b/mysql-test/main/null_key.result index 0b1446fd72e1c..9bc20427487b1 100644 --- a/mysql-test/main/null_key.result +++ b/mysql-test/main/null_key.result @@ -421,7 +421,7 @@ EXPLAIN SELECT SQL_CALC_FOUND_ROWS * FROM t1 LEFT JOIN t2 ON t1.a=t2.a LEFT JOIN t3 ON t2.b=t3.b; id select_type table type possible_keys key key_len ref rows Extra 1 SIMPLE t1 ALL NULL NULL NULL NULL 4 -1 SIMPLE t2 ref idx idx 5 test.t1.a 1 Using where +1 SIMPLE t2 ref idx idx 5 test.t1.a 0 Using where 1 SIMPLE t3 ref idx idx 5 test.t2.b 1 Using where; Using index FLUSH STATUS ; SELECT SQL_CALC_FOUND_ROWS * FROM t1 LEFT JOIN t2 ON t1.a=t2.a diff --git a/mysql-test/main/selectivity.result b/mysql-test/main/selectivity.result index ef2d2e85b8681..b8f5da7537f1a 100644 --- a/mysql-test/main/selectivity.result +++ b/mysql-test/main/selectivity.result @@ -325,13 +325,13 @@ and o_orderkey = l_orderkey group by c_name, c_custkey, o_orderkey, o_orderdate, o_totalprice order by o_totalprice desc, o_orderdate; id select_type table type possible_keys key key_len ref rows filtered Extra -1 PRIMARY orders ALL PRIMARY,i_o_custkey NULL NULL NULL 1500 100.00 Using where; Using temporary; Using filesort -1 PRIMARY customer eq_ref PRIMARY PRIMARY 4 dbt3_s001.orders.o_custkey 1 100.00 +1 PRIMARY customer ALL PRIMARY NULL NULL NULL 150 100.00 Using temporary; Using filesort +1 PRIMARY orders ref PRIMARY,i_o_custkey i_o_custkey 5 dbt3_s001.customer.c_custkey 10 100.00 1 PRIMARY eq_ref distinct_key distinct_key 4 dbt3_s001.orders.o_orderkey 1 100.00 1 PRIMARY lineitem ref PRIMARY,i_l_orderkey,i_l_orderkey_quantity i_l_orderkey_quantity 4 dbt3_s001.orders.o_orderkey 4 100.00 Using index 2 MATERIALIZED lineitem index NULL i_l_orderkey_quantity 13 NULL 6005 100.00 Using index Warnings: -Note 1003 /* select#1 */ select `dbt3_s001`.`customer`.`c_name` AS `c_name`,`dbt3_s001`.`customer`.`c_custkey` AS `c_custkey`,`dbt3_s001`.`orders`.`o_orderkey` AS `o_orderkey`,`dbt3_s001`.`orders`.`o_orderDATE` AS `o_orderdate`,`dbt3_s001`.`orders`.`o_totalprice` AS `o_totalprice`,sum(`dbt3_s001`.`lineitem`.`l_quantity`) AS `sum(l_quantity)` from (/* select#2 */ select `dbt3_s001`.`lineitem`.`l_orderkey` from `dbt3_s001`.`lineitem` group by `dbt3_s001`.`lineitem`.`l_orderkey` having sum(`dbt3_s001`.`lineitem`.`l_quantity`) > 250) join `dbt3_s001`.`customer` join `dbt3_s001`.`orders` join `dbt3_s001`.`lineitem` where `dbt3_s001`.`customer`.`c_custkey` = `dbt3_s001`.`orders`.`o_custkey` and ``.`l_orderkey` = `dbt3_s001`.`orders`.`o_orderkey` and `dbt3_s001`.`lineitem`.`l_orderkey` = `dbt3_s001`.`orders`.`o_orderkey` group by `dbt3_s001`.`customer`.`c_name`,`dbt3_s001`.`customer`.`c_custkey`,`dbt3_s001`.`orders`.`o_orderkey`,`dbt3_s001`.`orders`.`o_orderDATE`,`dbt3_s001`.`orders`.`o_totalprice` order by `dbt3_s001`.`orders`.`o_totalprice` desc,`dbt3_s001`.`orders`.`o_orderDATE` +Note 1003 /* select#1 */ select `dbt3_s001`.`customer`.`c_name` AS `c_name`,`dbt3_s001`.`customer`.`c_custkey` AS 
`c_custkey`,`dbt3_s001`.`orders`.`o_orderkey` AS `o_orderkey`,`dbt3_s001`.`orders`.`o_orderDATE` AS `o_orderdate`,`dbt3_s001`.`orders`.`o_totalprice` AS `o_totalprice`,sum(`dbt3_s001`.`lineitem`.`l_quantity`) AS `sum(l_quantity)` from (/* select#2 */ select `dbt3_s001`.`lineitem`.`l_orderkey` from `dbt3_s001`.`lineitem` group by `dbt3_s001`.`lineitem`.`l_orderkey` having sum(`dbt3_s001`.`lineitem`.`l_quantity`) > 250) join `dbt3_s001`.`customer` join `dbt3_s001`.`orders` join `dbt3_s001`.`lineitem` where `dbt3_s001`.`orders`.`o_custkey` = `dbt3_s001`.`customer`.`c_custkey` and ``.`l_orderkey` = `dbt3_s001`.`orders`.`o_orderkey` and `dbt3_s001`.`lineitem`.`l_orderkey` = `dbt3_s001`.`orders`.`o_orderkey` group by `dbt3_s001`.`customer`.`c_name`,`dbt3_s001`.`customer`.`c_custkey`,`dbt3_s001`.`orders`.`o_orderkey`,`dbt3_s001`.`orders`.`o_orderDATE`,`dbt3_s001`.`orders`.`o_totalprice` order by `dbt3_s001`.`orders`.`o_totalprice` desc,`dbt3_s001`.`orders`.`o_orderDATE` select c_name, c_custkey, o_orderkey, o_orderdate, o_totalprice, sum(l_quantity) from customer, orders, lineitem @@ -359,13 +359,13 @@ and o_orderkey = l_orderkey group by c_name, c_custkey, o_orderkey, o_orderdate, o_totalprice order by o_totalprice desc, o_orderdate; id select_type table type possible_keys key key_len ref rows filtered Extra -1 PRIMARY orders ALL PRIMARY,i_o_custkey NULL NULL NULL 1500 100.00 Using where; Using temporary; Using filesort -1 PRIMARY customer eq_ref PRIMARY PRIMARY 4 dbt3_s001.orders.o_custkey 1 100.00 +1 PRIMARY customer ALL PRIMARY NULL NULL NULL 150 100.00 Using temporary; Using filesort +1 PRIMARY orders ref PRIMARY,i_o_custkey i_o_custkey 5 dbt3_s001.customer.c_custkey 10 100.00 1 PRIMARY eq_ref distinct_key distinct_key 4 dbt3_s001.orders.o_orderkey 1 100.00 1 PRIMARY lineitem ref PRIMARY,i_l_orderkey,i_l_orderkey_quantity i_l_orderkey_quantity 4 dbt3_s001.orders.o_orderkey 4 100.00 Using index 2 MATERIALIZED lineitem index NULL i_l_orderkey_quantity 13 NULL 6005 100.00 Using index Warnings: -Note 1003 /* select#1 */ select `dbt3_s001`.`customer`.`c_name` AS `c_name`,`dbt3_s001`.`customer`.`c_custkey` AS `c_custkey`,`dbt3_s001`.`orders`.`o_orderkey` AS `o_orderkey`,`dbt3_s001`.`orders`.`o_orderDATE` AS `o_orderdate`,`dbt3_s001`.`orders`.`o_totalprice` AS `o_totalprice`,sum(`dbt3_s001`.`lineitem`.`l_quantity`) AS `sum(l_quantity)` from (/* select#2 */ select `dbt3_s001`.`lineitem`.`l_orderkey` from `dbt3_s001`.`lineitem` group by `dbt3_s001`.`lineitem`.`l_orderkey` having sum(`dbt3_s001`.`lineitem`.`l_quantity`) > 250) join `dbt3_s001`.`customer` join `dbt3_s001`.`orders` join `dbt3_s001`.`lineitem` where `dbt3_s001`.`customer`.`c_custkey` = `dbt3_s001`.`orders`.`o_custkey` and ``.`l_orderkey` = `dbt3_s001`.`orders`.`o_orderkey` and `dbt3_s001`.`lineitem`.`l_orderkey` = `dbt3_s001`.`orders`.`o_orderkey` group by `dbt3_s001`.`customer`.`c_name`,`dbt3_s001`.`customer`.`c_custkey`,`dbt3_s001`.`orders`.`o_orderkey`,`dbt3_s001`.`orders`.`o_orderDATE`,`dbt3_s001`.`orders`.`o_totalprice` order by `dbt3_s001`.`orders`.`o_totalprice` desc,`dbt3_s001`.`orders`.`o_orderDATE` +Note 1003 /* select#1 */ select `dbt3_s001`.`customer`.`c_name` AS `c_name`,`dbt3_s001`.`customer`.`c_custkey` AS `c_custkey`,`dbt3_s001`.`orders`.`o_orderkey` AS `o_orderkey`,`dbt3_s001`.`orders`.`o_orderDATE` AS `o_orderdate`,`dbt3_s001`.`orders`.`o_totalprice` AS `o_totalprice`,sum(`dbt3_s001`.`lineitem`.`l_quantity`) AS `sum(l_quantity)` from (/* select#2 */ select `dbt3_s001`.`lineitem`.`l_orderkey` from `dbt3_s001`.`lineitem` 
group by `dbt3_s001`.`lineitem`.`l_orderkey` having sum(`dbt3_s001`.`lineitem`.`l_quantity`) > 250) join `dbt3_s001`.`customer` join `dbt3_s001`.`orders` join `dbt3_s001`.`lineitem` where `dbt3_s001`.`orders`.`o_custkey` = `dbt3_s001`.`customer`.`c_custkey` and ``.`l_orderkey` = `dbt3_s001`.`orders`.`o_orderkey` and `dbt3_s001`.`lineitem`.`l_orderkey` = `dbt3_s001`.`orders`.`o_orderkey` group by `dbt3_s001`.`customer`.`c_name`,`dbt3_s001`.`customer`.`c_custkey`,`dbt3_s001`.`orders`.`o_orderkey`,`dbt3_s001`.`orders`.`o_orderDATE`,`dbt3_s001`.`orders`.`o_totalprice` order by `dbt3_s001`.`orders`.`o_totalprice` desc,`dbt3_s001`.`orders`.`o_orderDATE` select c_name, c_custkey, o_orderkey, o_orderdate, o_totalprice, sum(l_quantity) from customer, orders, lineitem @@ -1294,7 +1294,7 @@ select * from t1, t2, t1 as t3 where t1.b=t2.c and t2.d=t3.a and t3.b<5 and t1.a < 2000; id select_type table type possible_keys key key_len ref rows filtered Extra 1 SIMPLE t1 ALL NULL NULL NULL NULL 262144 100.00 Using where -1 SIMPLE t2 ref c,d c 5 test.t1.b 5 100.00 +1 SIMPLE t2 ref c,d c 5 test.t1.b 2 100.00 1 SIMPLE t3 ALL NULL NULL NULL NULL 262144 100.00 Using where; Using join buffer (flat, BNL join) Warnings: Note 1003 select `test`.`t1`.`a` AS `a`,`test`.`t1`.`b` AS `b`,`test`.`t2`.`c` AS `c`,`test`.`t2`.`d` AS `d`,`test`.`t3`.`a` AS `a`,`test`.`t3`.`b` AS `b` from `test`.`t1` join `test`.`t2` join `test`.`t1` `t3` where `test`.`t2`.`c` = `test`.`t1`.`b` and `test`.`t3`.`a` = `test`.`t2`.`d` and `test`.`t3`.`b` < 5 and `test`.`t1`.`a` < 2000 diff --git a/sql/sql_select.cc b/sql/sql_select.cc index b986e59656e08..ba88584ac098d 100644 --- a/sql/sql_select.cc +++ b/sql/sql_select.cc @@ -7297,6 +7297,26 @@ add_keyuse(DYNAMIC_ARRAY *keyuse_array, KEY_FIELD *key_field, keyuse.cond_guard= key_field->cond_guard; keyuse.sj_pred_no= key_field->sj_pred_no; keyuse.validity_ref= 0; + + /* Compute driving_ndv from EITS statistics if val is a field */ + keyuse.driving_ndv= 0; + { + Item *driving_item= key_field->val->real_item(); + if (driving_item->type() == Item::FIELD_ITEM) + { + Field *driving_field= ((Item_field*) driving_item)->field; + if (driving_field->read_stats) + { + double avg_freq= driving_field->read_stats->get_avg_frequency(); + if (avg_freq > 0) + { + double rows= (double) driving_field->table->stat_records(); + keyuse.driving_ndv= MY_MIN(rows / avg_freq, rows); + } + } + } + } + return (insert_dynamic(keyuse_array,(uchar*) &keyuse)); } @@ -7450,6 +7470,7 @@ add_ft_keys(DYNAMIC_ARRAY *keyuse_array, keyuse.sj_pred_no= UINT_MAX; keyuse.validity_ref= 0; keyuse.null_rejecting= FALSE; + keyuse.driving_ndv= 0; /* Not applicable for fulltext keys */ return insert_dynamic(keyuse_array,(uchar*) &keyuse); } @@ -8222,6 +8243,52 @@ inline double use_found_constraint(double records) } +/* + Compute the probability that a value from the driving table will find + a match in the inner table's index, based on NDV (number of distinct values). + + @param inner_table The inner (looked-up) table + @param key The key being used for ref access + @param min_driving_ndv Minimum NDV across all usable driving columns + (precomputed during KEYUSE iteration) + + @return match_probability in range (0, 1.0] + 1.0 if statistics are not available + + The idea: if the driving table has more distinct values than the inner + table's key, only a fraction of driving rows will find matches. 
+ + match_probability = min(1.0, ndv(inner.key) / ndv(outer.col)) + + where ndv = table_rows / avg_frequency +*/ + +static double +get_ref_match_probability(TABLE *inner_table, uint key, double min_driving_ndv) +{ + /* Get NDV for inner table's key (first key part) */ + KEY *keyinfo= inner_table->key_info + key; + Field *inner_field= keyinfo->key_part[0].field; + if (!inner_field->read_stats) + return 1.0; + + double inner_avg_freq= inner_field->read_stats->get_avg_frequency(); + if (inner_avg_freq <= 0) + return 1.0; + + double inner_rows= (double) inner_table->stat_records(); + double inner_ndv= inner_rows / inner_avg_freq; + + /* + match_probability = min(1.0, ndv(inner) / ndv(outer)) + + If inner has fewer distinct values than outer, only a fraction + of outer rows will find a match. + */ + return MY_MIN(1.0, inner_ndv / min_driving_ndv); +} + + /* Calculate the cost of reading a set of rows trough an index @@ -8801,6 +8868,8 @@ best_access_path(JOIN *join, DBUG_PRINT("info", ("Considering ref access on key %s", keyuse->table->key_info[keyuse->key].name.str)); + double min_driving_ndv= DBL_MAX; /* Track min driving NDV for keypart 0 */ + do /* For each keypart */ { uint keypart= keyuse->keypart; @@ -8844,6 +8913,10 @@ best_access_path(JOIN *join, best_part_found_ref= (keyuse->used_tables & ~join->const_table_map); } + /* Track minimum driving NDV for keypart 0 */ + if (keypart == 0 && keyuse->driving_ndv > 0) + set_if_smaller(min_driving_ndv, keyuse->driving_ndv); + if (rec > keyuse->ref_table_rows) rec= keyuse->ref_table_rows; /* @@ -9059,6 +9132,36 @@ best_access_path(JOIN *join, } } } + + /* + Apply match_probability based on NDV to adjust records. + If the driving table has more distinct values than the inner + table's key, only a fraction of driving rows will find matches. + */ + if (key_parts == 1 && + min_driving_ndv > 0 && min_driving_ndv < DBL_MAX) + { + /* + Bound effective NDV by record_count - NDV can't exceed the + number of rows from driving tables after filtering. + */ + double effective_driving_ndv= min_driving_ndv; + set_if_smaller(effective_driving_ndv, record_count); + + double match_prob= get_ref_match_probability(table, key, + effective_driving_ndv); + if (match_prob < 1.0) + { + if (unlikely(trace_access_idx.trace_started())) + { + trace_access_idx. + add("match_probability", match_prob). + add("rows_before_adjustment", records); + } + records *= match_prob; + } + } + /* Calculate the cost of the index access */ tmp= cost_for_index_read(thd, table, key, (ha_rows) records, 0); @@ -9270,6 +9373,36 @@ best_access_path(JOIN *join, } set_if_smaller(records, (double) s->records); + + /* + Apply match_probability based on NDV to adjust records. + If the driving table has more distinct values than the inner + table's key, only a fraction of driving rows will find matches. + */ + if (max_key_part == 1 && + min_driving_ndv > 0 && min_driving_ndv < DBL_MAX) + { + /* + Bound effective NDV by record_count - NDV can't exceed the + number of rows from driving tables after filtering. + */ + double effective_driving_ndv= min_driving_ndv; + set_if_smaller(effective_driving_ndv, record_count); + + double match_prob= get_ref_match_probability(table, key, + effective_driving_ndv); + if (match_prob < 1.0) + { + if (unlikely(trace_access_idx.trace_started())) + { + trace_access_idx. + add("match_probability", match_prob). 
+ add("rows_before_adjustment", records); + } + records *= match_prob; + } + } + tmp= cost_for_index_read(thd, table, key, (ha_rows)records, 0); tmp.copy_cost+= extra_cost; } diff --git a/sql/sql_select.h b/sql/sql_select.h index 82e975407edda..06d92b76e0d07 100644 --- a/sql/sql_select.h +++ b/sql/sql_select.h @@ -81,6 +81,13 @@ typedef struct keyuse_t { */ bool *validity_ref; + /* + Number of distinct values in the driving column (val). + Computed from EITS statistics when KEYUSE is created. + 0 means statistics not available. + */ + double driving_ndv; + bool is_for_hash_join() { return is_hash_join_key_no(key); } } KEYUSE; diff --git a/sql/sql_test.cc b/sql/sql_test.cc index 24736007f083c..52d5bf97177d9 100644 --- a/sql/sql_test.cc +++ b/sql/sql_test.cc @@ -705,5 +705,7 @@ void print_keyuse_array_for_trace(THD *thd, DYNAMIC_ARRAY *keyuse_array) .field->field_name.str)). add("equals",keyuse->val). add("null_rejecting",keyuse->null_rejecting); + if (keyuse->driving_ndv > 0) + keyuse_elem.add("driving_ndv", keyuse->driving_ndv); } }