Commit 5b56f58e authored by Sambitesh Dash

Introduce optimizer_enable_gather_on_segment_for_DML GUC

When ON, ORCA optimizes DML queries by enforcing a non-master gather
whenever possible. When OFF, a gather on the master is enforced
instead.

The default value is ON.

Also add new tests to ensure sane behavior when this optimization is
turned on and fix the existing tests.
Signed-off-by: Sambitesh Dash <sdash@pivotal.io>
Signed-off-by: Dhanashree Kashid <dkashid@pivotal.io>
Parent 49e53546
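A minimal usage sketch of the new GUC (assumes ORCA is the active optimizer; target_tab and source_tab are hypothetical names, used only for illustration):

    set optimizer = on;
    -- default (ON): ORCA may gather the DML result on a segment
    explain insert into target_tab select * from source_tab;
    -- OFF: fall back to the previous behavior, a gather on the master
    set optimizer_enable_gather_on_segment_for_dml = off;
    explain insert into target_tab select * from source_tab;
    reset optimizer_enable_gather_on_segment_for_dml;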
......@@ -313,6 +313,13 @@ CConfigParamMapping::SConfigMappingElem CConfigParamMapping::m_elements[] =
GPOS_WSZ_LIT("Enable motion hazard handling during NLJ optimization and generate streaming material when appropriate")
},
{
EopttraceDisableNonMasterGatherForDML,
&optimizer_enable_gather_on_segment_for_dml,
true, // m_fNegate
GPOS_WSZ_LIT("Enable DML optimization by enforcing a non-master gather when appropriate")
},
{
EopttraceEnforceCorrelatedExecution,
&optimizer_enforce_subplans,
......
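Note that m_fNegate is true for this mapping, so the GUC value is inverted before it reaches ORCA: the traceflag EopttraceDisableNonMasterGatherForDML is set only when the GUC is turned off. A sketch of the correspondence (illustrative only):

    -- GUC on (default): EopttraceDisableNonMasterGatherForDML stays unset, a segment gather is allowed
    set optimizer_enable_gather_on_segment_for_dml = on;
    -- GUC off: the traceflag is set and ORCA enforces a gather on the master instead
    set optimizer_enable_gather_on_segment_for_dml = off;
    reset optimizer_enable_gather_on_segment_for_dml;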
......@@ -520,6 +520,7 @@ bool optimizer_enable_direct_dispatch;
bool optimizer_enable_hashjoin_redistribute_broadcast_children;
bool optimizer_enable_broadcast_nestloop_outer_child;
bool optimizer_enable_streaming_material;
bool optimizer_enable_gather_on_segment_for_dml;
bool optimizer_enable_assert_maxonerow;
bool optimizer_enable_constant_expression_evaluation;
bool optimizer_enable_bitmapscan;
......@@ -2996,6 +2997,16 @@ struct config_bool ConfigureNamesBool_gp[] =
true,
NULL, NULL, NULL
},
{
{"optimizer_enable_gather_on_segment_for_dml", PGC_USERSET, DEVELOPER_OPTIONS,
gettext_noop("Enable DML optimization by enforcing a non-master gather in the optimizer."),
NULL,
GUC_NO_SHOW_ALL | GUC_NOT_IN_SAMPLE
},
&optimizer_enable_gather_on_segment_for_dml,
true,
NULL, NULL, NULL
},
{
{"optimizer_enforce_subplans", PGC_USERSET, DEVELOPER_OPTIONS,
gettext_noop("Enforce correlated execution in the optimizer"),
......
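Since the GUC is registered with GUC_NO_SHOW_ALL, it does not appear in SHOW ALL, but it can still be inspected by name; a quick check of the registered default (sketch):

    show optimizer_enable_gather_on_segment_for_dml;
    -- expected: on, matching the boot default in the entry above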
......@@ -434,6 +434,7 @@ extern bool optimizer_enable_multiple_distinct_aggs;
extern bool optimizer_enable_hashjoin_redistribute_broadcast_children;
extern bool optimizer_enable_broadcast_nestloop_outer_child;
extern bool optimizer_enable_streaming_material;
extern bool optimizer_enable_gather_on_segment_for_dml;
extern bool optimizer_enable_assert_maxonerow;
extern bool optimizer_enable_constant_expression_evaluation;
extern bool optimizer_enable_bitmapscan;
......
......@@ -10245,6 +10245,106 @@ select c1 from t_outer where not c1 =all (select c2 from t_inner);
(10 rows)
reset optimizer_enable_streaming_material;
--
-- Test to ensure sane behavior when DML queries are optimized by ORCA by
-- enforcing a non-master gather motion, controlled by
-- optimizer_enable_gather_on_segment_for_DML GUC
--
--
-- CTAS with global-local aggregation
--
-- start_ignore
create table test1 (a int, b int);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Greenplum Database data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
insert into test1 select generate_series(1,100),generate_series(1,100);
-- end_ignore
create table t_new as select avg(a) from test1 join (select i from unnest(array[1,2,3]) i) t on (test1.a = t.i);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column(s) named 'avg' as the Greenplum Database data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
select * from t_new;
avg
--------------------
2.0000000000000000
(1 row)
-- start_ignore
drop table t_new;
set optimizer_enable_gather_on_segment_for_DML=off;
-- end_ignore
create table t_new as select avg(a) from test1 join (select i from unnest(array[1,2,3]) i) t on (test1.a = t.i);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column(s) named 'avg' as the Greenplum Database data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
select * from t_new;
avg
--------------------
2.0000000000000000
(1 row)
-- start_ignore
reset optimizer_enable_gather_on_segment_for_DML;
-- end_ignore
--
-- Insert with outer references in the subquery
--
-- start_ignore
create table x_tab(a int);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Greenplum Database data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
create table y_tab(a int);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Greenplum Database data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
create table z_tab(a int);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Greenplum Database data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
insert into x_tab values(1);
insert into y_tab values(0);
insert into z_tab values(1);
-- end_ignore
insert into x_tab select * from x_tab where exists (select * from x_tab where x_tab.a = (select x_tab.a + y_tab.a from y_tab));
select * from x_tab;
a
---
1
1
(2 rows)
--
-- Insert with Union All with a universal child
--
insert into y_tab select 1 union all select a from x_tab limit 10;
select * from y_tab;
a
---
1
1
1
0
(4 rows)
--
-- Insert with a function containing SQL
--
create or replace function test_func_pg_stats()
returns integer
as $$ declare cnt int; begin execute 'select count(*) from pg_statistic' into cnt; return cnt; end $$
language plpgsql volatile READS SQL DATA;
insert into y_tab select test_func_pg_stats() from x_tab limit 2;
select count(*) from y_tab;
count
-------
6
(1 row)
--
-- Delete with Hash Join with a universal child
--
delete from x_tab where exists (select z_tab.a from z_tab join (select 1 as g) as tab on z_tab.a = tab.g);
select * from x_tab;
a
---
(0 rows)
-- start_ignore
drop table bar;
-- end_ignore
......
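To see where the final gather lands for the CTAS case above, the same statement can be explained with the GUC on and off (a sketch; t_new_plan is a hypothetical table name, and the exact plan shape depends on the data and the ORCA version):

    explain create table t_new_plan as
      select avg(a) from test1 join (select i from unnest(array[1,2,3]) i) t on (test1.a = t.i);
    -- with the GUC on, ORCA may place the gather on a segment rather than the master
    set optimizer_enable_gather_on_segment_for_DML = off;
    explain create table t_new_plan as
      select avg(a) from test1 join (select i from unnest(array[1,2,3]) i) t on (test1.a = t.i);
    -- with the GUC off, the plan gathers on the master as before
    reset optimizer_enable_gather_on_segment_for_DML;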
......@@ -10306,6 +10306,104 @@ select c1 from t_outer where not c1 =all (select c2 from t_inner);
(10 rows)
reset optimizer_enable_streaming_material;
--
-- Test to ensure sane behavior when DML queries are optimized by ORCA by
-- enforcing a non-master gather motion, controlled by
-- optimizer_enable_gather_on_segment_for_DML GUC
--
--
-- CTAS with global-local aggregation
--
-- start_ignore
create table test1 (a int, b int);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Greenplum Database data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
insert into test1 select generate_series(1,100),generate_series(1,100);
-- end_ignore
create table t_new as select avg(a) from test1 join (select i from unnest(array[1,2,3]) i) t on (test1.a = t.i);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause. Creating a NULL policy entry.
select * from t_new;
avg
--------------------
2.0000000000000000
(1 row)
-- start_ignore
drop table t_new;
set optimizer_enable_gather_on_segment_for_DML=off;
-- end_ignore
create table t_new as select avg(a) from test1 join (select i from unnest(array[1,2,3]) i) t on (test1.a = t.i);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause. Creating a NULL policy entry.
select * from t_new;
avg
--------------------
2.0000000000000000
(1 row)
-- start_ignore
reset optimizer_enable_gather_on_segment_for_DML;
-- end_ignore
--
-- Insert with outer references in the subquery
--
-- start_ignore
create table x_tab(a int);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Greenplum Database data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
create table y_tab(a int);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Greenplum Database data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
create table z_tab(a int);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Greenplum Database data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
insert into x_tab values(1);
insert into y_tab values(0);
insert into z_tab values(1);
-- end_ignore
insert into x_tab select * from x_tab where exists (select * from x_tab where x_tab.a = (select x_tab.a + y_tab.a from y_tab));
select * from x_tab;
a
---
1
1
(2 rows)
--
-- Insert with Union All with a universal child
--
insert into y_tab select 1 union all select a from x_tab limit 10;
select * from y_tab;
a
---
0
1
1
1
(4 rows)
--
-- Insert with a function containing SQL
--
create or replace function test_func_pg_stats()
returns integer
as $$ declare cnt int; begin execute 'select count(*) from pg_statistic' into cnt; return cnt; end $$
language plpgsql volatile READS SQL DATA;
insert into y_tab select test_func_pg_stats() from x_tab limit 2;
select count(*) from y_tab;
count
-------
6
(1 row)
--
-- Delete with Hash Join with a universal child
--
delete from x_tab where exists (select z_tab.a from z_tab join (select 1 as g) as tab on z_tab.a = tab.g);
select * from x_tab;
a
---
(0 rows)
-- start_ignore
drop table bar;
ERROR: table "bar" does not exist
......
......@@ -108,8 +108,8 @@ WHERE t1.user_vie_project_code_pk = keo1.user_vie_project_code_pk;
-> Broadcast Motion 3:3 (slice7; segments: 3) (cost=0.00..2155.00 rows=1 width=8)
-> Hash Join (cost=0.00..2155.00 rows=1 width=8)
Hash Cond: public.keo1.user_vie_project_code_pk::text = keo2.projects_pk::text
-> Redistribute Motion 1:3 (slice5) (cost=0.00..1724.00 rows=1 width=8)
-> Hash Join (cost=0.00..1724.00 rows=1 width=8)
-> Redistribute Motion 1:3 (slice5; segments: 1)
-> Hash Join
Hash Cond: public.keo1.user_vie_fiscal_year_period_sk::text = (max(keo3.sky_per::text))
-> Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..431.00 rows=1 width=16)
-> Table Scan on keo1 (cost=0.00..431.00 rows=1 width=16)
......@@ -167,12 +167,21 @@ EXPLAIN DELETE FROM keo5 WHERE x IN (SELECT x FROM keo5 WHERE EXISTS (SELECT x F
-> Hash (cost=1324032.17..1324032.17 rows=1 width=4)
-> Nested Loop EXISTS Join (cost=0.00..1324032.17 rows=1 width=4)
Join Filter: true
<<<<<<< HEAD
-> Table Scan on keo5 (cost=0.00..431.00 rows=1 width=4)
-> Materialize (cost=0.00..431.00 rows=1 width=1)
-> Broadcast Motion 1:3 (slice2) (cost=0.00..431.00 rows=3 width=1)
-> Limit (cost=0.00..431.00 rows=1 width=1)
-> Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..431.00 rows=1 width=1)
-> Table Scan on keo5 (cost=0.00..431.00 rows=1 width=1)
=======
-> Table Scan on keo5
-> Materialize
-> Broadcast Motion 1:3 (slice2; segments: 1)
-> Limit
-> Gather Motion 3:1 (slice1; segments: 3)
-> Table Scan on keo5
>>>>>>> a341621d64... Introduce optimizer_enable_gather_on_segment_for_DML GUC
Filter: x < 2
Settings: optimizer=on
Optimizer status: PQO version 2.42.3
......
......@@ -1486,6 +1486,72 @@ set optimizer_enable_streaming_material = off;
select c1 from t_outer where not c1 =all (select c2 from t_inner);
reset optimizer_enable_streaming_material;
--
-- Test to ensure sane behavior when DML queries are optimized by ORCA by
-- enforcing a non-master gather motion, controlled by
-- optimizer_enable_gather_on_segment_for_DML GUC
--
--
-- CTAS with global-local aggregation
--
-- start_ignore
create table test1 (a int, b int);
insert into test1 select generate_series(1,100),generate_series(1,100);
-- end_ignore
create table t_new as select avg(a) from test1 join (select i from unnest(array[1,2,3]) i) t on (test1.a = t.i);
select * from t_new;
-- start_ignore
drop table t_new;
set optimizer_enable_gather_on_segment_for_DML=off;
-- end_ignore
create table t_new as select avg(a) from test1 join (select i from unnest(array[1,2,3]) i) t on (test1.a = t.i);
select * from t_new;
-- start_ignore
reset optimizer_enable_gather_on_segment_for_DML;
-- end_ignore
--
-- Insert with outer references in the subquery
--
-- start_ignore
create table x_tab(a int);
create table y_tab(a int);
create table z_tab(a int);
insert into x_tab values(1);
insert into y_tab values(0);
insert into z_tab values(1);
-- end_ignore
insert into x_tab select * from x_tab where exists (select * from x_tab where x_tab.a = (select x_tab.a + y_tab.a from y_tab));
select * from x_tab;
--
-- Insert with Union All with a universal child
--
insert into y_tab select 1 union all select a from x_tab limit 10;
select * from y_tab;
--
-- Insert with a function containing SQL
--
create or replace function test_func_pg_stats()
returns integer
as $$ declare cnt int; begin execute 'select count(*) from pg_statistic' into cnt; return cnt; end $$
language plpgsql volatile READS SQL DATA;
insert into y_tab select test_func_pg_stats() from x_tab limit 2;
select count(*) from y_tab;
--
-- Delete with Hash Join with a universal child
--
delete from x_tab where exists (select z_tab.a from z_tab join (select 1 as g) as tab on z_tab.a = tab.g);
select * from x_tab;
-- start_ignore
drop table bar;
-- end_ignore
......