Fix parameterized paths

This patch fixes two issues in the parameterized path logic on master.

1. When generating a unique row ID on the outer/inner side for a JOIN_DEDUP_SEMI/JOIN_DEDUP_SEMI_REVERSE join, we need to pass the param info of the outerpath/innerpath to the projection path. Otherwise we run into problems when deciding whether a joinclause is movable to this join rel.

2. We should not pick a parameterized path when its required outer is beyond a Motion, since we cannot pass a param through a Motion.

Fixes issue #10012

Reviewed-by: Heikki Linnakangas <hlinnakangas@pivotal.io>
Reviewed-by: Jinbao Chen <jinchen@pivotal.io>

Fix parameterized paths
This patch fixes two issues in the parameterized path logic on master.

1. When generating a unique row ID on the outer/inner side for a JOIN_DEDUP_SEMI/JOIN_DEDUP_SEMI_REVERSE join, we need to pass the param info of the outerpath/innerpath to the projection path. Otherwise we run into problems when deciding whether a joinclause is movable to this join rel.

2. We should not pick a parameterized path when its required outer is beyond a Motion, since we cannot pass a param through a Motion.

Fixes issue #10012

Reviewed-by: Heikki Linnakangas <hlinnakangas@pivotal.io>
Reviewed-by: Jinbao Chen <jinchen@pivotal.io>
9cc1da61 · Richard Guo · GitHub · f860ff0c · 9cc1da61 · 9cc1da61
7 changed files
--- a/src/backend/cdb/cdbpath.c
+++ b/src/backend/cdb/cdbpath.c
@@ -191,6 +191,12 @@ cdbpath_create_motion_path(PlannerInfo *root,
 		/* singleQE-->entry?  Don't move.  Slice's QE will run on entry db. */
 		if (CdbPathLocus_IsSingleQE(subpath->locus))
 		{
+			/*
+			 * A param cannot pass through a Motion, so reject parameterized subpaths.
+			 */
+			if (!bms_is_empty(PATH_REQ_OUTER(subpath)))
+				return NULL;
+
 			/*
 			 * Create CdbMotionPath node to indicate that the slice must be
 			 * dispatched to a singleton gang running on the entry db.  We
@@ -233,6 +239,12 @@ cdbpath_create_motion_path(PlannerInfo *root,
 		if (CdbPathLocus_IsSegmentGeneral(subpath->locus) ||
 			CdbPathLocus_IsReplicated(subpath->locus))
 		{
+			/*
+			 * A param cannot pass through a Motion, so reject parameterized subpaths.
+			 */
+			if (!bms_is_empty(PATH_REQ_OUTER(subpath)))
+				return NULL;
+
 			/*
 			 * Data is only available on segments, to distingush it with
 			 * CdbLocusType_General, adding a motion to indicated this
@@ -483,6 +495,12 @@ cdbpath_create_motion_path(PlannerInfo *root,
        return (Path *) newSubqueryScanPath;
    }

+	/*
+	 * A param cannot pass through a Motion, so reject parameterized subpaths.
+	 */
+	if (!bms_is_empty(PATH_REQ_OUTER(subpath)))
+		return NULL;
+
 	/* Create CdbMotionPath node. */
 	pathnode = makeNode(CdbMotionPath);
 	pathnode->path.pathtype = T_Motion;
@@ -1166,7 +1184,9 @@ add_rowid_to_path(PlannerInfo *root, Path *path, int *rowidexpr_id)
 	newpathtarget = copy_pathtarget(path->pathtarget);
 	add_column_to_pathtarget(newpathtarget, (Expr *) rowidexpr, 0);

-	return (Path *) create_projection_path(root, path->parent, path, newpathtarget);
+	return (Path *) create_projection_path_with_quals(root, path->parent,
+													  path, newpathtarget,
+													  NIL, true);
 }

 /*

--- a/src/backend/optimizer/path/allpaths.c
+++ b/src/backend/optimizer/path/allpaths.c
@@ -486,7 +486,8 @@ bring_to_outer_query(PlannerInfo *root, RelOptInfo *rel, List *outer_quals)
 															  rel,
 															  path,
 															  path->parent->reltarget,
-															  outer_quals);
+															  outer_quals,
+															  false);
 		add_path(rel, path);
 	}
 	set_cheapest(rel);

--- a/src/backend/optimizer/util/pathnode.c
+++ b/src/backend/optimizer/util/pathnode.c
@@ -1910,7 +1910,8 @@ set_append_path_locus(PlannerInfo *root, Path *pathnode, RelOptInfo *rel,
 					subpath->parent,
 					subpath,
 					subpath->pathtarget,
-					list_make1(restrict_info));
+					list_make1(restrict_info),
+					false);

 				/*
 				 * We use the skill of Result plannode with one time filter
@@ -2440,6 +2441,8 @@ create_unique_rowid_path(PlannerInfo *root,
 										list_make1_int(0),
 										numsegments);
 		subpath = cdbpath_create_motion_path(root, subpath, NIL, false, locus);
+		if (!subpath)
+			return NULL;

 		/*
 		 * The motion path has been created correctly, but there's a little
@@ -3728,15 +3731,18 @@ create_projection_path(PlannerInfo *root,
 					   Path *subpath,
 					   PathTarget *target)
 {
-	return create_projection_path_with_quals(root, rel, subpath, target, NIL);
+	return create_projection_path_with_quals(root, rel,
+											 subpath, target,
+											 NIL, false);
 }

 ProjectionPath *
 create_projection_path_with_quals(PlannerInfo *root,
-					   RelOptInfo *rel,
-					   Path *subpath,
-					   PathTarget *target,
-					   List *restrict_clauses)
+								  RelOptInfo *rel,
+								  Path *subpath,
+								  PathTarget *target,
+								  List *restrict_clauses,
+								  bool need_param)
 {
 	ProjectionPath *pathnode = makeNode(ProjectionPath);
 	PathTarget *oldtarget = subpath->pathtarget;
@@ -3744,8 +3750,7 @@ create_projection_path_with_quals(PlannerInfo *root,
 	pathnode->path.pathtype = T_Result;
 	pathnode->path.parent = rel;
 	pathnode->path.pathtarget = target;
-	/* For now, assume we are above any joins, so no parameterization */
-	pathnode->path.param_info = NULL;
+	pathnode->path.param_info = need_param ? subpath->param_info : NULL;
 	pathnode->path.parallel_aware = false;
 	pathnode->path.parallel_safe = rel->consider_parallel &&
 		subpath->parallel_safe &&

--- a/src/include/optimizer/pathnode.h
+++ b/src/include/optimizer/pathnode.h
@@ -177,7 +177,8 @@ extern ProjectionPath *create_projection_path_with_quals(PlannerInfo *root,
 					   RelOptInfo *rel,
 					   Path *subpath,
 					   PathTarget *target,
-					   List *restrict_clauses);
+					   List *restrict_clauses,
+					   bool need_param);
 extern Path *apply_projection_to_path(PlannerInfo *root,
 						 RelOptInfo *rel,
 						 Path *path,

--- a/src/test/regress/expected/join_gp.out
+++ b/src/test/regress/expected/join_gp.out
@@ -1536,3 +1536,68 @@ select * from foo where exists (select 1 from bar where foo.a = bar.b);
 reset enable_hashagg;
 drop table foo;
 drop table bar;
+-- Fix github issue 10012
+create table fix_param_a (i int, j int);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'i' as the Greenplum Database data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+create table fix_param_b (i int UNIQUE, j int);
+create table fix_param_c (i int, j int);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'i' as the Greenplum Database data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+insert into fix_param_a select i, i from generate_series(1,20)i;
+insert into fix_param_b select i, i from generate_series(1,2000)i;
+insert into fix_param_c select i, i from generate_series(1,2000)i;
+analyze fix_param_a;
+analyze fix_param_b;
+analyze fix_param_c;
+explain (costs off)
+select * from fix_param_a left join fix_param_b on
+	fix_param_a.i = fix_param_b.i and fix_param_b.j in
+		(select j from fix_param_c where fix_param_b.i = fix_param_c.i)
+	order by 1;
+                                              QUERY PLAN                                              
+------------------------------------------------------------------------------------------------------
+ Gather Motion 3:1  (slice1; segments: 3)
+   Merge Key: fix_param_a.i
+   ->  Sort
+         Sort Key: fix_param_a.i
+         ->  Hash Right Join
+               Hash Cond: (fix_param_b.i = fix_param_a.i)
+               ->  Hash Semi Join
+                     Hash Cond: ((fix_param_b.i = fix_param_c.i) AND (fix_param_b.j = fix_param_c.j))
+                     ->  Seq Scan on fix_param_b
+                     ->  Hash
+                           ->  Seq Scan on fix_param_c
+               ->  Hash
+                     ->  Seq Scan on fix_param_a
+ Optimizer: Postgres query optimizer
+(14 rows)
+
+select * from fix_param_a left join fix_param_b on
+	fix_param_a.i = fix_param_b.i and fix_param_b.j in
+		(select j from fix_param_c where fix_param_b.i = fix_param_c.i)
+	order by 1;
+ i  | j  | i  | j  
+----+----+----+----
+  1 |  1 |  1 |  1
+  2 |  2 |  2 |  2
+  3 |  3 |  3 |  3
+  4 |  4 |  4 |  4
+  5 |  5 |  5 |  5
+  6 |  6 |  6 |  6
+  7 |  7 |  7 |  7
+  8 |  8 |  8 |  8
+  9 |  9 |  9 |  9
+ 10 | 10 | 10 | 10
+ 11 | 11 | 11 | 11
+ 12 | 12 | 12 | 12
+ 13 | 13 | 13 | 13
+ 14 | 14 | 14 | 14
+ 15 | 15 | 15 | 15
+ 16 | 16 | 16 | 16
+ 17 | 17 | 17 | 17
+ 18 | 18 | 18 | 18
+ 19 | 19 | 19 | 19
+ 20 | 20 | 20 | 20
+(20 rows)
+
--- a/src/test/regress/expected/join_gp_optimizer.out
+++ b/src/test/regress/expected/join_gp_optimizer.out
@@ -1523,3 +1523,68 @@ select * from foo where exists (select 1 from bar where foo.a = bar.b);
 reset enable_hashagg;
 drop table foo;
 drop table bar;
+-- Fix github issue 10012
+create table fix_param_a (i int, j int);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'i' as the Greenplum Database data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+create table fix_param_b (i int UNIQUE, j int);
+create table fix_param_c (i int, j int);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'i' as the Greenplum Database data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+insert into fix_param_a select i, i from generate_series(1,20)i;
+insert into fix_param_b select i, i from generate_series(1,2000)i;
+insert into fix_param_c select i, i from generate_series(1,2000)i;
+analyze fix_param_a;
+analyze fix_param_b;
+analyze fix_param_c;
+explain (costs off)
+select * from fix_param_a left join fix_param_b on
+	fix_param_a.i = fix_param_b.i and fix_param_b.j in
+		(select j from fix_param_c where fix_param_b.i = fix_param_c.i)
+	order by 1;
+                                                 QUERY PLAN                                                 
+------------------------------------------------------------------------------------------------------------
+ Gather Motion 3:1  (slice1; segments: 3)
+   Merge Key: fix_param_a.i
+   ->  Sort
+         Sort Key: fix_param_a.i
+         ->  Hash Left Join
+               Hash Cond: (fix_param_a.i = fix_param_b.i)
+               ->  Seq Scan on fix_param_a
+               ->  Hash
+                     ->  Hash Semi Join
+                           Hash Cond: ((fix_param_b.i = fix_param_c.i) AND (fix_param_b.j = fix_param_c.j))
+                           ->  Seq Scan on fix_param_b
+                           ->  Hash
+                                 ->  Seq Scan on fix_param_c
+ Optimizer: Pivotal Optimizer (GPORCA)
+(14 rows)
+
+select * from fix_param_a left join fix_param_b on
+	fix_param_a.i = fix_param_b.i and fix_param_b.j in
+		(select j from fix_param_c where fix_param_b.i = fix_param_c.i)
+	order by 1;
+ i  | j  | i  | j  
+----+----+----+----
+  1 |  1 |  1 |  1
+  2 |  2 |  2 |  2
+  3 |  3 |  3 |  3
+  4 |  4 |  4 |  4
+  5 |  5 |  5 |  5
+  6 |  6 |  6 |  6
+  7 |  7 |  7 |  7
+  8 |  8 |  8 |  8
+  9 |  9 |  9 |  9
+ 10 | 10 | 10 | 10
+ 11 | 11 | 11 | 11
+ 12 | 12 | 12 | 12
+ 13 | 13 | 13 | 13
+ 14 | 14 | 14 | 14
+ 15 | 15 | 15 | 15
+ 16 | 16 | 16 | 16
+ 17 | 17 | 17 | 17
+ 18 | 18 | 18 | 18
+ 19 | 19 | 19 | 19
+ 20 | 20 | 20 | 20
+(20 rows)
+
--- a/src/test/regress/sql/join_gp.sql
+++ b/src/test/regress/sql/join_gp.sql
@@ -719,3 +719,26 @@ select * from foo where exists (select 1 from bar where foo.a = bar.b);
 reset enable_hashagg;
 drop table foo;
 drop table bar;
+
+-- Fix github issue 10012
+create table fix_param_a (i int, j int);
+create table fix_param_b (i int UNIQUE, j int);
+create table fix_param_c (i int, j int);
+
+insert into fix_param_a select i, i from generate_series(1,20)i;
+insert into fix_param_b select i, i from generate_series(1,2000)i;
+insert into fix_param_c select i, i from generate_series(1,2000)i;
+
+analyze fix_param_a;
+analyze fix_param_b;
+analyze fix_param_c;
+
+explain (costs off)
+select * from fix_param_a left join fix_param_b on
+	fix_param_a.i = fix_param_b.i and fix_param_b.j in
+		(select j from fix_param_c where fix_param_b.i = fix_param_c.i)
+	order by 1;
+select * from fix_param_a left join fix_param_b on
+	fix_param_a.i = fix_param_b.i and fix_param_b.j in
+		(select j from fix_param_c where fix_param_b.i = fix_param_c.i)
+	order by 1;