Properly prepare varinfos in estimate_multivariate_bucketsize()

author Alexander Korotkov <akorotkov@postgresql.org>

Wed, 23 Apr 2025 17:13:51 +0000 (20:13 +0300)

committer Alexander Korotkov <akorotkov@postgresql.org>

Wed, 23 Apr 2025 17:25:21 +0000 (20:25 +0300)
author Alexander Korotkov <akorotkov@postgresql.org>
Wed, 23 Apr 2025 17:13:51 +0000 (20:13 +0300)
committer Alexander Korotkov <akorotkov@postgresql.org>
Wed, 23 Apr 2025 17:25:21 +0000 (20:25 +0300)
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c

index 987f215445921c6a8aa2a331a9b8b6f82f46909b..a96b1b9c0bc69e30865221c5e24e37c594f16d21 100644 (file)
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -3850,6 +3850,8 @@ estimate_multivariate_bucketsize(PlannerInfo *root, RelOptInfo *inner,
             if (bms_get_singleton_member(relids, &relid) &&
                 root->simple_rel_array[relid]->statlist != NIL)
             {
+               bool        is_duplicate = false;
+
                 /*
                  * This inner-side expression references only one relation.
                  * Extended statistics on this clause can exist.
@@ -3880,11 +3882,61 @@ estimate_multivariate_bucketsize(PlannerInfo *root, RelOptInfo *inner,
                      */
                     continue;
  
-               varinfo = (GroupVarInfo *) palloc(sizeof(GroupVarInfo));
+               /*
+                * We're going to add the new clause to the varinfos list.  We
+                * could re-use add_unique_group_var(), but we don't do so for
+                * two reasons.
+                *
+                * 1) We must keep the origin_rinfos list ordered exactly the
+                * same way as varinfos.
+                *
+                * 2) add_unique_group_var() is designed for
+                * estimate_num_groups(), where a larger number of groups is
+                * worse.  When estimating the number of hash buckets, we
+                * have the opposite: a smaller number of groups is worse.
+                * Therefore, we don't have to remove "known equal" vars: a
+                * var that would be removed may usefully contribute to the
+                * multivariate statistics and increase the number of groups.
+                */
+
+               /*
+                * Clear nullingrels to correctly match hash keys.  See
+                * add_unique_group_var()'s comment for details.
+                */
+               expr = remove_nulling_relids(expr, root->outer_join_rels, NULL);
+
+               /*
+                * Detect and exclude exact duplicates from the list of hash
+                * keys (like add_unique_group_var does).
+                */
+               foreach(lc1, varinfos)
+               {
+                   varinfo = (GroupVarInfo *) lfirst(lc1);
+
+                   if (!equal(expr, varinfo->var))
+                       continue;
+
+                   is_duplicate = true;
+                   break;
+               }
+
+               if (is_duplicate)
+               {
+                   /*
+                    * Skip exact duplicates. Adding them to the otherclauses
+                    * list also doesn't make sense.
+                    */
+                   continue;
+               }
+
+               /*
+                * Initialize GroupVarInfo.  We only use it to call
+                * estimate_multivariate_ndistinct(), which doesn't care about
+                * ndistinct and isdefault fields.  Thus, skip these fields.
+                */
+               varinfo = (GroupVarInfo *) palloc0(sizeof(GroupVarInfo));
                 varinfo->var = expr;
                 varinfo->rel = root->simple_rel_array[relid];
-               varinfo->ndistinct = 0.0;
-               varinfo->isdefault = false;
                 varinfos = lappend(varinfos, varinfo);
  
                 /*
@@ -3894,8 +3946,10 @@ estimate_multivariate_bucketsize(PlannerInfo *root, RelOptInfo *inner,
                 origin_rinfos = lappend(origin_rinfos, rinfo);
             }
             else
+           {
                 /* This clause can't be estimated with extended statistics */
                 otherclauses = lappend(otherclauses, rinfo);
+           }
  
             clauses = foreach_delete_current(clauses, lc);
         }
diff --git a/src/test/regress/expected/stats_ext.out b/src/test/regress/expected/stats_ext.out

index 686d8c93aa87f57a2b2443aa6d9436d2711fa9e4..6359e5fb689cbca4fdbdb2a3d19d75c5af29bd52 100644 (file)
--- a/src/test/regress/expected/stats_ext.out
+++ b/src/test/regress/expected/stats_ext.out
@@ -3427,4 +3427,32 @@ SELECT * FROM sb_1 a, sb_2 b WHERE a.x = b.x AND a.y = b.y AND a.z = b.z;
           ->  Seq Scan on sb_2 b
  (5 rows)
  
+-- Check that the Hash Join bucket size estimator detects equal clauses correctly.
+SET enable_nestloop = 'off';
+SET enable_mergejoin = 'off';
+EXPLAIN (COSTS OFF)
+SELECT FROM sb_1 LEFT JOIN sb_2 ON (sb_2.x=sb_1.x) AND (sb_1.x=sb_2.x);
+                       QUERY PLAN                       
+--------------------------------------------------------
+ Hash Left Join
+   Hash Cond: ((sb_1.x = sb_2.x) AND (sb_1.x = sb_2.x))
+   ->  Seq Scan on sb_1
+   ->  Hash
+         ->  Seq Scan on sb_2
+(5 rows)
+
+EXPLAIN (COSTS OFF)
+SELECT FROM sb_1 LEFT JOIN sb_2
+   ON (sb_2.x=sb_1.x) AND (sb_1.x=sb_2.x) AND (sb_1.y=sb_2.y);
+                                  QUERY PLAN                                  
+------------------------------------------------------------------------------
+ Hash Left Join
+   Hash Cond: ((sb_1.x = sb_2.x) AND (sb_1.y = sb_2.y) AND (sb_1.x = sb_2.x))
+   ->  Seq Scan on sb_1
+   ->  Hash
+         ->  Seq Scan on sb_2
+(5 rows)
+
+RESET enable_nestloop;
+RESET enable_mergejoin;
  DROP TABLE sb_1, sb_2 CASCADE;
diff --git a/src/test/regress/sql/stats_ext.sql b/src/test/regress/sql/stats_ext.sql

index b71a6cd089fc84200f36587648e1973bca0afdc2..da4f2fe9c938f169d9012cbec4d23f7283da343c 100644 (file)
--- a/src/test/regress/sql/stats_ext.sql
+++ b/src/test/regress/sql/stats_ext.sql
@@ -1747,4 +1747,15 @@ ANALYZE sb_2;
  EXPLAIN (COSTS OFF) -- Choose hash join
  SELECT * FROM sb_1 a, sb_2 b WHERE a.x = b.x AND a.y = b.y AND a.z = b.z;
  
+-- Check that the Hash Join bucket size estimator detects equal clauses correctly.
+SET enable_nestloop = 'off';
+SET enable_mergejoin = 'off';
+EXPLAIN (COSTS OFF)
+SELECT FROM sb_1 LEFT JOIN sb_2 ON (sb_2.x=sb_1.x) AND (sb_1.x=sb_2.x);
+EXPLAIN (COSTS OFF)
+SELECT FROM sb_1 LEFT JOIN sb_2
+   ON (sb_2.x=sb_1.x) AND (sb_1.x=sb_2.x) AND (sb_1.y=sb_2.y);
+RESET enable_nestloop;
+RESET enable_mergejoin;
+
  DROP TABLE sb_1, sb_2 CASCADE;
author	Alexander Korotkov <akorotkov@postgresql.org>
	Wed, 23 Apr 2025 17:13:51 +0000 (20:13 +0300)
committer	Alexander Korotkov <akorotkov@postgresql.org>
	Wed, 23 Apr 2025 17:25:21 +0000 (20:25 +0300)
src/backend/utils/adt/selfuncs.c		patch \| blob \| blame \| history
src/test/regress/expected/stats_ext.out		patch \| blob \| blame \| history
src/test/regress/sql/stats_ext.sql		patch \| blob \| blame \| history