提交 3b7ca45c 编写于 作者: Heikki Linnakangas

Fix crashes when a Values Scan needs to create "fake ctids".

When converting semi-join to inner-join, a distinct agg on ctid is added
above the hash-join node. But the fake ctids generated in Values Scan
were invalid, with offset number 0, which caused an assertion failure.

This patch is based on commit d8886cf9, which fixed the same issue for
Function Scans.
Co-authored-by: dh-cloud <60729713+dh-cloud@users.noreply.github.com>
Co-authored-by: Jesse Zhang <sbjesse@gmail.com>
上级 ba8f2fe0
...@@ -166,13 +166,19 @@ ValuesNext(ValuesScanState *node) ...@@ -166,13 +166,19 @@ ValuesNext(ValuesScanState *node)
*/ */
ExecStoreVirtualTuple(slot); ExecStoreVirtualTuple(slot);
/* CDB: Label each row with a synthetic ctid for subquery dedup. */ /*
if (node->cdb_want_ctid) * CDB: Label each row with a synthetic ctid for subquery dedup.
*
* Values Scan supports backward scans too, so we can't use
* slot_set_ctid_from_fake() like most scan types do.
*/
if (node->cdb_want_ctid)
{ {
HeapTuple tuple = ExecFetchSlotHeapTuple(slot); HeapTuple tuple = ExecFetchSlotHeapTuple(slot);
ItemPointerSet(&tuple->t_self, node->curr_idx >> 16, ItemPointerSet(&tuple->t_self,
(OffsetNumber)node->curr_idx); (BlockNumber) (node->curr_idx / 1024),
(OffsetNumber) ((node->curr_idx % 1024) + 1));
} }
} }
......
...@@ -500,15 +500,67 @@ EXPLAIN SELECT (EXISTS (SELECT UNNEST(X))) AS B FROM A; ...@@ -500,15 +500,67 @@ EXPLAIN SELECT (EXISTS (SELECT UNNEST(X))) AS B FROM A;
DROP TABLE A; DROP TABLE A;
-- --
-- Test the ctid in function scan -- Test the ctid in Function and Values Scans
-- --
create table t1(a int) ; create table t1(a int) ;
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Greenplum Database data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
insert into t1 select i from generate_series(1, 100000) i; insert into t1 select i from generate_series(1, 100000) i;
analyze t1; analyze t1;
-- Function Scan
explain
select count(*) from pg_backend_pid() b(a) where b.a % 100000 in (select a from t1);
QUERY PLAN
-------------------------------------------------------------------------------------------------------------
Aggregate (cost=1611.83..1611.84 rows=1 width=8)
-> Gather Motion 3:1 (slice2; segments: 3) (cost=1611.76..1611.81 rows=1 width=8)
-> Aggregate (cost=1611.76..1611.77 rows=1 width=8)
-> HashAggregate (cost=1611.56..1611.72 rows=6 width=6)
Group Key: b.ctid
-> Redistribute Motion 3:3 (slice1; segments: 3) (cost=0.02..1611.52 rows=6 width=6)
Hash Key: b.ctid
-> Hash Join (cost=0.02..1611.19 rows=6 width=6)
Hash Cond: (t1.a = (b.a % 100000))
-> Seq Scan on t1 (cost=0.00..1111.00 rows=33334 width=4)
-> Hash (cost=0.01..0.01 rows=1 width=10)
-> Function Scan on b (cost=0.00..0.01 rows=1 width=10)
Optimizer: Postgres query optimizer
(13 rows)
select count(*) from pg_backend_pid() b(a) where b.a % 100000 in (select a from t1); select count(*) from pg_backend_pid() b(a) where b.a % 100000 in (select a from t1);
count count
------- -------
1 1
(1 row) (1 row)
-- Values Scan
-- We use a large number of entries, to make sure the fake ctids are generated
-- correctly even when the offset number in the TID wraps around.
select string_agg('(' || g || ')', ', ') as lots_of_values from generate_series(1, 66000) g
\gset
explain
select count(*) from ( values :lots_of_values ) as b(a) where b.a % 100000 in (select a from t1);
QUERY PLAN
--------------------------------------------------------------------------------------------------------------------
Aggregate (cost=5001.06..5001.07 rows=1 width=8)
-> Gather Motion 3:1 (slice2; segments: 3) (cost=5001.00..5001.05 rows=1 width=8)
-> Aggregate (cost=5001.00..5001.01 rows=1 width=8)
-> HashAggregate (cost=4588.50..4918.50 rows=11000 width=6)
Group Key: "*VALUES*".ctid
-> Redistribute Motion 3:3 (slice1; segments: 3) (cost=2361.00..4506.00 rows=11000 width=6)
Hash Key: "*VALUES*".ctid
-> Hash Join (cost=2361.00..3846.00 rows=11000 width=6)
Hash Cond: (("*VALUES*".column1 % 100000) = t1.a)
-> Values Scan on "*VALUES*" (cost=0.00..825.00 rows=22000 width=10)
-> Hash (cost=1111.00..1111.00 rows=33334 width=4)
-> Seq Scan on t1 (cost=0.00..1111.00 rows=33334 width=4)
Optimizer: Postgres query optimizer
(13 rows)
select count(*) from ( values :lots_of_values ) as b(a) where b.a % 100000 in (select a from t1);
count
-------
66000
(1 row)
drop table t1; drop table t1;
...@@ -497,15 +497,63 @@ EXPLAIN SELECT (EXISTS (SELECT UNNEST(X))) AS B FROM A; ...@@ -497,15 +497,63 @@ EXPLAIN SELECT (EXISTS (SELECT UNNEST(X))) AS B FROM A;
DROP TABLE A; DROP TABLE A;
-- --
-- Test the ctid in function scan -- Test the ctid in Function and Values Scans
-- --
create table t1(a int) ; create table t1(a int) ;
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Greenplum Database data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
insert into t1 select i from generate_series(1, 100000) i; insert into t1 select i from generate_series(1, 100000) i;
analyze t1; analyze t1;
-- Function Scan
explain
select count(*) from pg_backend_pid() b(a) where b.a % 100000 in (select a from t1);
QUERY PLAN
------------------------------------------------------------------------------------
Aggregate (cost=0.00..450.95 rows=1 width=8)
-> Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..450.95 rows=1 width=1)
-> Hash Semi Join (cost=0.00..450.95 rows=1 width=1)
Hash Cond: ((b % 100000) = a)
-> Result (cost=0.00..0.00 rows=1 width=4)
-> Result (cost=0.00..0.00 rows=1 width=4)
-> Result (cost=0.00..0.00 rows=1 width=4)
-> Result (cost=0.00..0.00 rows=1 width=1)
-> Hash (cost=431.62..431.62 rows=33334 width=4)
-> Seq Scan on t1 (cost=0.00..431.62 rows=33334 width=4)
Optimizer: Pivotal Optimizer (GPORCA)
(11 rows)
select count(*) from pg_backend_pid() b(a) where b.a % 100000 in (select a from t1); select count(*) from pg_backend_pid() b(a) where b.a % 100000 in (select a from t1);
count count
------- -------
1 1
(1 row) (1 row)
-- Values Scan
-- We use a large number of entries, to make sure the fake ctids are generated
-- correctly even when the offset number in the TID wraps around.
select string_agg('(' || g || ')', ', ') as lots_of_values from generate_series(1, 66000) g
\gset
explain
select count(*) from ( values :lots_of_values ) as b(a) where b.a % 100000 in (select a from t1);
QUERY PLAN
---------------------------------------------------------------------------------------------------
Aggregate (cost=0.00..443.14 rows=1 width=8)
-> Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..443.14 rows=1 width=8)
-> Aggregate (cost=0.00..443.14 rows=1 width=8)
-> Hash Semi Join (cost=0.00..443.14 rows=22000 width=1)
Hash Cond: (("Values".column1 % 100000) = t1.a)
-> Result (cost=0.00..0.95 rows=22000 width=4)
-> Result (cost=0.00..0.95 rows=22000 width=4)
-> Values Scan on "Values" (cost=0.00..0.26 rows=22000 width=4)
-> Hash (cost=431.62..431.62 rows=33334 width=4)
-> Seq Scan on t1 (cost=0.00..431.62 rows=33334 width=4)
Optimizer: Pivotal Optimizer (GPORCA)
(11 rows)
select count(*) from ( values :lots_of_values ) as b(a) where b.a % 100000 in (select a from t1);
count
-------
66000
(1 row)
drop table t1; drop table t1;
...@@ -293,11 +293,25 @@ EXPLAIN SELECT (EXISTS (SELECT UNNEST(X))) AS B FROM A; ...@@ -293,11 +293,25 @@ EXPLAIN SELECT (EXISTS (SELECT UNNEST(X))) AS B FROM A;
DROP TABLE A; DROP TABLE A;
-- --
-- Test the ctid in function scan -- Test the ctid in Function and Values Scans
-- --
create table t1(a int) ; create table t1(a int) ;
insert into t1 select i from generate_series(1, 100000) i; insert into t1 select i from generate_series(1, 100000) i;
analyze t1; analyze t1;
-- Function Scan
explain
select count(*) from pg_backend_pid() b(a) where b.a % 100000 in (select a from t1);
select count(*) from pg_backend_pid() b(a) where b.a % 100000 in (select a from t1); select count(*) from pg_backend_pid() b(a) where b.a % 100000 in (select a from t1);
-- Values Scan
-- We use a large number of entries, to make sure the fake ctids are generated
-- correctly even when the offset number in the TID wraps around.
select string_agg('(' || g || ')', ', ') as lots_of_values from generate_series(1, 66000) g
\gset
explain
select count(*) from ( values :lots_of_values ) as b(a) where b.a % 100000 in (select a from t1);
select count(*) from ( values :lots_of_values ) as b(a) where b.a % 100000 in (select a from t1);
drop table t1; drop table t1;
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册