Commit 2a961e65 authored by Paul Guo

Limit gxact number on master with MaxBackends.

Previously we set it to max_prepared_xacts. The value is used to size some
2PC-related shared memory; for example, the array shmCommittedGxactArray is
created with this length and is used to collect not-yet "forgotten"
distributed transactions during master/standby recovery. That array length
can be too small because:

1. Master max_prepared_xacts is usually equal to segment max_prepared_xacts.
Because some distributed transactions use only a partial gang, the total
number of distributed transactions can be larger (even much larger) than
max_prepared_xacts. The documentation says max_prepared_xacts should be
greater than max_connections, but there is no code to enforce that.

2. It is also possible that master max_prepared_xacts differs from segment
max_prepared_xacts (the documentation does not suggest this, but there is no
code to prevent it).

To fix this we use MaxBackends for the gxact number on master. We could just
use the GUC max_connections (MaxBackends additionally covers autovacuum
workers and background workers on top of max_connections), but I'm
conservatively using MaxBackends, since this issue is annoying: the standby
cannot recover, failing with the FATAL message below even after a postgres
reboot, unless we temporarily increase the GUC max_prepared_transactions
value.

2020-07-17 16:48:19.178667
CST,,,p33652,th1972721600,,,,0,,,seg-1,,,,,"FATAL","XX000","the limit of 3
distributed transactions has been reached","It should not happen. Temporarily
increase max_connections (need postmaster reboot) on the postgres (master or
standby) to work around this issue and then report a bug",,,,"xlog redo at
0/C339BA0 for Transaction/DISTRIBUTED_COMMIT: distributed commit 2020-07-17
16:48:19.101832+08 gid = 1594975696-0000000009, gxid =
9",,0,,"cdbdtxrecovery.c",571,"Stack trace:

1    0xb3a30f postgres errstart (elog.c:558)
2    0xc3da4d postgres redoDistributedCommitRecord (cdbdtxrecovery.c:565)
3    0x564227 postgres <symbol not found> (xact.c:6942)
4    0x564671 postgres xact_redo (xact.c:7080)
5    0x56fee5 postgres StartupXLOG (xlog.c:7207)
Reviewed-by: xiong-gang <gxiong@pivotal.io>
Parent af942980
......@@ -565,8 +565,10 @@ redoDistributedCommitRecord(TMGXACT_LOG *gxact_log)
 		ereport(FATAL,
 				(errmsg("the limit of %d distributed transactions has been reached",
 						max_tm_gxacts),
-				 errdetail("The global user configuration (GUC) server "
-						   "parameter max_prepared_transactions controls this limit.")));
+				 errdetail("It should not happen. Temporarily increase "
+						   "max_connections (need postmaster reboot) on "
+						   "the postgres (master or standby) to work "
+						   "around this issue and then report a bug")));
 	shmCommittedGxactArray[(*shmNumCommittedGxacts)++] = *gxact_log;
 	elog((Debug_print_full_dtm ? LOG : DEBUG5),
......
......@@ -610,6 +610,8 @@ doNotifyingCommitPrepared(void)
 			(errmsg("the distributed transaction 'Commit Prepared' broadcast succeeded to all the segments"),
 			 TM_ERRDETAIL));
 
+	SIMPLE_FAULT_INJECTOR("dtm_before_insert_forget_comitted");
+
 	doInsertForgetCommitted();
 
 	/*
......@@ -999,13 +1001,36 @@ tmShmemInit(void)
 	bool		found;
 	TmControlBlock *shared;
 
-	if (Gp_role == GP_ROLE_DISPATCH && max_prepared_xacts < MaxConnections)
-		elog(WARNING, "Better set max_prepared_transactions greater than max_connections");
-
 	/*
-	 * max_prepared_xacts is a guc which is postmaster-startup setable -- it
-	 * can only be updated by restarting the system. Global transactions will
-	 * all use two-phase commit, so the number of global transactions is bound
-	 * to the number of prepared.
+	 * max_prepared_transactions is a guc which is postmaster-startup setable
+	 * -- it can only be updated by restarting the system. Global transactions
+	 * will all use two-phase commit, so the number of global transactions is
+	 * bound to the number of prepared.
+	 *
+	 * Note on master, it is possible that some prepared xacts just use partial
+	 * gang so on QD the total prepared xacts might be quite large but it is
+	 * limited by max_connections since one QD should only have one 2pc one
+	 * time, so if we set max_tm_gxacts as max_prepared_transactions as before,
+	 * shmCommittedGxactArray might not be able to accommodate committed but
+	 * not forgotten transactions (standby recovery will fail if encountering
+	 * this issue) if max_prepared_transactions is smaller than max_connections
+	 * (though this is not suggested). Not to mention that
+	 * max_prepared_transactions might be inconsistent between master/standby
+	 * and segments (though this is not suggested).
+	 *
+	 * We can assign MaxBackends (MaxConnections should be fine also but let's
+	 * be conservative) to max_tm_gxacts on master/standby to tolerate various
+	 * configuration combinations of max_prepared_transactions and
+	 * max_connections. For segments or utility mode, max_tm_gxacts is useless
+	 * so let's set it as zero to save memory.
 	 */
-	max_tm_gxacts = max_prepared_xacts;
+	if (Gp_role == GP_ROLE_DISPATCH)
+		max_tm_gxacts = MaxBackends;
+	else
+		max_tm_gxacts = 0;
 
 	shared = (TmControlBlock *) ShmemInitStruct("Transaction manager", tmShmemSize(), &found);
 	if (!shared)
......
-- test to verify a bug that causes standby startup fatal with message like
-- "the limit of xxx distributed transactions has been reached".
-- Refer comment in https://github.com/greenplum-db/gpdb/issues/9207 for the
-- context.
include: helpers/server_helpers.sql;
CREATE
-- We will reset the value to 250 finally so sanity check the current value here.
6: show max_prepared_transactions;
max_prepared_transactions
---------------------------
250
(1 row)
!\retcode gpconfig -c max_prepared_transactions -v 3 --skipvalidation;
(exited with code 0)
!\retcode gpstop -ari;
(exited with code 0)
5: create table prepare_limit1 (a int);
CREATE
5: create table prepare_limit2 (a int);
CREATE
5: create table prepare_limit3 (a int);
CREATE
5: create table prepare_limit4 (a int);
CREATE
5: select gp_inject_fault_infinite('dtm_before_insert_forget_comitted', 'suspend', 1);
gp_inject_fault_infinite
--------------------------
Success:
(1 row)
-- Note first insert after table create triggers auto_stats and leads to 2pc
-- transaction.
-- (2) is on seg0
1&: insert into prepare_limit1 values(2); <waiting ...>
2&: insert into prepare_limit2 values(2); <waiting ...>
-- (1) is on seg1
3&: insert into prepare_limit3 values(1); <waiting ...>
4&: insert into prepare_limit4 values(1); <waiting ...>
-- wait until these 2pc reach before inserting forget commit.
5: SELECT gp_wait_until_triggered_fault('dtm_before_insert_forget_comitted', 4, 1);
gp_wait_until_triggered_fault
-------------------------------
Success:
(1 row)
-- wait until standby catches up and replays all xlogs.
5: select wait_for_replication_replay (-1, 5000);
wait_for_replication_replay
-----------------------------
t
(1 row)
-- reset to make testing continue
5: select gp_inject_fault('dtm_before_insert_forget_comitted', 'reset', 1);
gp_inject_fault
-----------------
Success:
(1 row)
1<: <... completed>
INSERT 1
2<: <... completed>
INSERT 1
3<: <... completed>
INSERT 1
4<: <... completed>
INSERT 1
-- verify that standby is correctly wal streaming.
5: select state from pg_stat_replication;
state
-----------
streaming
(1 row)
-- verify the tuples are on correct segments so the test assumption is
-- correct. (i.e. tuple 2, 1 are on different segments).
5: select gp_segment_id, * from prepare_limit1;
gp_segment_id | a
---------------+---
0 | 2
(1 row)
5: select gp_segment_id, * from prepare_limit2;
gp_segment_id | a
---------------+---
0 | 2
(1 row)
5: select gp_segment_id, * from prepare_limit3;
gp_segment_id | a
---------------+---
1 | 1
(1 row)
5: select gp_segment_id, * from prepare_limit4;
gp_segment_id | a
---------------+---
1 | 1
(1 row)
-- cleanup
5: drop table prepare_limit1;
DROP
5: drop table prepare_limit2;
DROP
5: drop table prepare_limit3;
DROP
5: drop table prepare_limit4;
DROP
-- Not using gpconfig -r, else it makes max_prepared_transactions be default
-- (50) and some isolation2 tests will fail due to "too many clients". Hardcode
-- to 250 which is the default value when demo cluster is created.
!\retcode gpconfig -c max_prepared_transactions -v 250 --skipvalidation;
(exited with code 0)
!\retcode gpstop -ari;
(exited with code 0)
test: setup
test: lockmodes
# Put test prepare_limit near to test lockmodes since both of them reboot the
# cluster during testing. Usually the 2nd reboot should be faster.
test: prepare_limit
test: ao_partition_lock
test: dml_on_root_locks_all_parts
......
-- test to verify a bug that causes standby startup fatal with message like
-- "the limit of xxx distributed transactions has been reached".
-- Refer comment in https://github.com/greenplum-db/gpdb/issues/9207 for the
-- context.
include: helpers/server_helpers.sql;
-- We will reset the value to 250 finally so sanity check the current value here.
6: show max_prepared_transactions;
!\retcode gpconfig -c max_prepared_transactions -v 3 --skipvalidation;
!\retcode gpstop -ari;
5: create table prepare_limit1 (a int);
5: create table prepare_limit2 (a int);
5: create table prepare_limit3 (a int);
5: create table prepare_limit4 (a int);
5: select gp_inject_fault_infinite('dtm_before_insert_forget_comitted', 'suspend', 1);
-- Note first insert after table create triggers auto_stats and leads to 2pc
-- transaction.
-- (2) is on seg0
1&: insert into prepare_limit1 values(2);
2&: insert into prepare_limit2 values(2);
-- (1) is on seg1
3&: insert into prepare_limit3 values(1);
4&: insert into prepare_limit4 values(1);
-- wait until these 2pc reach before inserting forget commit.
5: SELECT gp_wait_until_triggered_fault('dtm_before_insert_forget_comitted', 4, 1);
-- wait until standby catches up and replays all xlogs.
5: select wait_for_replication_replay (-1, 5000);
-- reset to make testing continue
5: select gp_inject_fault('dtm_before_insert_forget_comitted', 'reset', 1);
1<:
2<:
3<:
4<:
-- verify that standby is correctly wal streaming.
5: select state from pg_stat_replication;
-- verify the tuples are on correct segments so the test assumption is
-- correct. (i.e. tuple 2, 1 are on different segments).
5: select gp_segment_id, * from prepare_limit1;
5: select gp_segment_id, * from prepare_limit2;
5: select gp_segment_id, * from prepare_limit3;
5: select gp_segment_id, * from prepare_limit4;
-- cleanup
5: drop table prepare_limit1;
5: drop table prepare_limit2;
5: drop table prepare_limit3;
5: drop table prepare_limit4;
-- Not using gpconfig -r, else it makes max_prepared_transactions be default
-- (50) and some isolation2 tests will fail due to "too many clients". Hardcode
-- to 250 which is the default value when demo cluster is created.
!\retcode gpconfig -c max_prepared_transactions -v 250 --skipvalidation;
!\retcode gpstop -ari;