From f93f6f4251f6490ff7d0fd85f0bdf029c0504e8d Mon Sep 17 00:00:00 2001 From: Pengzhou Tang Date: Tue, 11 Sep 2018 01:01:15 -0400 Subject: [PATCH] Fix intermittent failures in dispatch test cases In the dispatch test cases, we need a way to put a segment into in-recovery status to test the gang-recreation logic of the dispatcher. We used to trigger a panic fault on a segment and suspend quickdie() to simulate in-recovery status. To avoid the segment staying in recovery mode for a long time, we used a "sleep" fault instead of 'suspend' in quickdie(), so the segment could accept new connections after 5 seconds. 5 seconds works fine most of the time, but is still not stable enough, so we decided to use a more straightforward means of simulating in-recovery mode: reporting POSTMASTER_IN_RECOVERY_MSG directly in ProcessStartupPacket(). To avoid affecting other backends, we create a new database so that the fault injectors only affect the dispatch test cases. --- src/backend/postmaster/postmaster.c | 14 +++- src/backend/tcop/postgres.c | 1 - src/backend/utils/misc/faultinjector.c | 3 +- src/include/utils/faultinjector.h | 1 - src/test/regress/input/dispatch.source | 38 ++++++----- src/test/regress/output/dispatch.source | 87 ++++++++++++------------- 6 files changed, 75 insertions(+), 69 deletions(-) diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index 0b129cb147..6c17d0a78e 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -2954,7 +2954,19 @@ retry1: break; } - SIMPLE_FAULT_INJECTOR(ProcessStartupPacketFault); +#ifdef FAULT_INJECTOR + if (FaultInjector_InjectFaultIfSet(ProcessStartupPacketFault, + DDLNotSpecified, + port->database_name /* databaseName */, + "" /* tableName */) == FaultInjectorTypeSkip) + { + ereport(FATAL, + (errcode(ERRCODE_CANNOT_CONNECT_NOW), + errSendAlert(true), + errmsg(POSTMASTER_IN_RECOVERY_MSG), + errdetail("dummy location"))); + } +#endif return STATUS_OK; } diff --git a/src/backend/tcop/postgres.c 
b/src/backend/tcop/postgres.c index f4c786dcd3..50a75f08bc 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -3352,7 +3352,6 @@ drop_unnamed_stmt(void) void quickdie(SIGNAL_ARGS) { - SIMPLE_FAULT_INJECTOR(QuickDie); quickdie_impl(); } diff --git a/src/backend/utils/misc/faultinjector.c b/src/backend/utils/misc/faultinjector.c index d8bc1b10f6..ca472a19a8 100644 --- a/src/backend/utils/misc/faultinjector.c +++ b/src/backend/utils/misc/faultinjector.c @@ -329,8 +329,6 @@ FaultInjectorIdentifierEnumToString[] = { /* inject fault before sending QE details during backend initialization */ _("process_startup_packet"), /* inject fault in ProcessStartupPacket() */ - _("quickdie"), - /* inject fault in quickdie*/ _("after_one_slice_dispatched"), /* inject fault in cdbdisp_dispatchX*/ _("interconnect_stop_ack_is_lost"), @@ -1070,6 +1068,7 @@ FaultInjector_NewHashEntry( case CreateGangInProgress: case DecreaseToastMaxChunkSize: + case ProcessStartupPacketFault: break; default: diff --git a/src/include/utils/faultinjector.h b/src/include/utils/faultinjector.h index 6d889b6421..2d04380ce8 100644 --- a/src/include/utils/faultinjector.h +++ b/src/include/utils/faultinjector.h @@ -219,7 +219,6 @@ typedef enum FaultInjectorIdentifier_e { SendQEDetailsInitBackend, ProcessStartupPacketFault, - QuickDie, AfterOneSliceDispatched, InterconnectStopAckIsLost, diff --git a/src/test/regress/input/dispatch.source b/src/test/regress/input/dispatch.source index 42e211d295..c6264a7efd 100644 --- a/src/test/regress/input/dispatch.source +++ b/src/test/regress/input/dispatch.source @@ -1,4 +1,6 @@ -- Misc tests related to dispatching queries to segments. 
+CREATE DATABASE dispatch_test_db; +\c dispatch_test_db; CREATE EXTENSION IF NOT EXISTS gp_inject_fault; @@ -43,7 +45,7 @@ CREATE TABLE "my table" (id integer); DROP TABLE "my table"; -- Clean up -\c regression +\c dispatch_test_db DROP DATABASE "dispatch test db"; -- Test gp_max_plan_size limit @@ -61,13 +63,14 @@ select gp_inject_fault('send_qe_details_init_backend', 'reset', 2); select gp_inject_fault('send_qe_details_init_backend', 'skip', 2); -- terminate exiting QEs first -\c +\c dispatch_test_db -- verify failure will be reported SELECT 1 FROM gp_dist_random('gp_id'); -- reset fault injector select gp_inject_fault('send_qe_details_init_backend', 'reset', 2); + -- -- Test suit : test gang creation and commands dispatching -- @@ -113,11 +116,9 @@ set gp_gang_creation_retry_timer to 1000; select cleanupAllGangs(); --- trigger fault and put segment 0 into recovery mode -select gp_inject_fault('process_startup_packet', 'segv', 2); ---start_ignore -select 'trigger fault' from gp_dist_random('gp_id'); ---end_ignore +-- trigger fault and report segment 0 in recovery for 5 times +select gp_inject_fault_new('process_startup_packet', 'skip', '', 'dispatch_test_db', '', 1, 5, 0, 2::smallint); +select cleanupAllGangs(); -- should success after retry select * from dispatch_test_t1, dispatch_test_t2, dispatch_test_t3 @@ -134,19 +135,19 @@ set gp_gang_creation_retry_timer to 200; select cleanupAllGangs(); -- trigger fault and put segment 0 into recovery mode -select gp_inject_fault('process_startup_packet', 'segv', 2); -select gp_inject_fault('quickdie', 'suspend', 2); ---start_ignore -select 'trigger fault' from gp_dist_random('gp_id'); ---end_ignore +select gp_inject_fault_new('process_startup_packet', 'skip', '', 'dispatch_test_db', '', 1, 5, 0, 2::smallint); +select cleanupAllGangs(); -- should failed after 2 times select * from dispatch_test_t1, dispatch_test_t2, dispatch_test_t3 where dispatch_test_t1.c2 = dispatch_test_t2.c2 and dispatch_test_t2.c3 = 
dispatch_test_t3.c3; -select gp_inject_fault('quickdie', 'resume', 2); +set gp_gang_creation_retry_count to 10; +-- should success and process_startup_packet will be invalid after this query +select * from dispatch_test_t1, dispatch_test_t2, dispatch_test_t3 +where dispatch_test_t1.c2 = dispatch_test_t2.c2 and dispatch_test_t2.c3 = dispatch_test_t3.c3; + select gp_inject_fault('process_startup_packet', 'reset', 2); -select gp_inject_fault('quickdie', 'reset', 2); --start_ignore -- enlarge the retry count @@ -210,9 +211,9 @@ select gp_inject_fault('send_qe_details_init_backend', 'reset', 2); -- gp_segment_connect_timeout = 0 : wait forever -- gp_segment_connect_timeout = 1 : wait 1 second set gp_segment_connect_timeout to 1; -select cleanupAllGangs(); +select gp_inject_fault_new('process_startup_packet', 'suspend', '', 'dispatch_test_db', '', 1, 1, 0, 2::smallint); -select gp_inject_fault('process_startup_packet', 'suspend', 2); +select cleanupAllGangs(); -- expect timeout failure select * from dispatch_test_t1, dispatch_test_t2, dispatch_test_t3 @@ -302,7 +303,7 @@ DROP TABLE foo_test; -- -- Test dangling Gang would be destroyed if interrupted during the creation -- -\c +select cleanupAllGangs(); select gp_inject_fault('gang_created', 'reset', 1); -- The _new() API ensures that the fault is triggered exactly once. select gp_inject_fault_new('gang_created', 'error', 1); @@ -320,3 +321,6 @@ select * from gp_dist_random('gp_id') select gp_inject_fault('after_one_slice_dispatched', 'reset', 1); select * from gp_dist_random('gp_id') where gpname > (select * from repeat('sssss', 10000000)); + +\c regression +DROP DATABASE dispatch_test_db; diff --git a/src/test/regress/output/dispatch.source b/src/test/regress/output/dispatch.source index 454ffc722a..03598954de 100644 --- a/src/test/regress/output/dispatch.source +++ b/src/test/regress/output/dispatch.source @@ -1,4 +1,6 @@ -- Misc tests related to dispatching queries to segments. 
+CREATE DATABASE dispatch_test_db; +\c dispatch_test_db; CREATE EXTENSION IF NOT EXISTS gp_inject_fault; -- Mask out the whoami message -- start_matchsubs @@ -53,7 +55,7 @@ NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'id' a HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. DROP TABLE "my table"; -- Clean up -\c regression +\c dispatch_test_db DROP DATABASE "dispatch test db"; -- Test gp_max_plan_size limit set gp_max_plan_size='10 kB'; @@ -83,7 +85,7 @@ NOTICE: Success: (1 row) -- terminate exiting QEs first -\c +\c dispatch_test_db -- verify failure will be reported SELECT 1 FROM gp_dist_random('gp_id'); ERROR: failed to acquire resources on one or more segments @@ -149,22 +151,20 @@ select cleanupAllGangs(); t (1 row) --- trigger fault and put segment 0 into recovery mode -select gp_inject_fault('process_startup_packet', 'segv', 2); +-- trigger fault and report segment 0 in recovery for 5 times +select gp_inject_fault_new('process_startup_packet', 'skip', '', 'dispatch_test_db', '', 1, 5, 0, 2::smallint); NOTICE: Success: - gp_inject_fault + gp_inject_fault_new +--------------------- + t +(1 row) + +select cleanupAllGangs(); + cleanupallgangs ----------------- t (1 row) ---start_ignore -select 'trigger fault' from gp_dist_random('gp_id'); -ERROR: failed to acquire resources on one or more segments -DETAIL: server closed the connection unexpectedly - This probably means the server terminated abnormally - before or while processing the request. 
- (seg0 127.0.0.1:25432) ---end_ignore -- should success after retry select * from dispatch_test_t1, dispatch_test_t2, dispatch_test_t3 where dispatch_test_t1.c2 = dispatch_test_t2.c2 and dispatch_test_t2.c3 = dispatch_test_t3.c3; @@ -193,49 +193,35 @@ select cleanupAllGangs(); (1 row) -- trigger fault and put segment 0 into recovery mode -select gp_inject_fault('process_startup_packet', 'segv', 2); +select gp_inject_fault_new('process_startup_packet', 'skip', '', 'dispatch_test_db', '', 1, 5, 0, 2::smallint); NOTICE: Success: - gp_inject_fault ------------------ + gp_inject_fault_new +--------------------- t (1 row) -select gp_inject_fault('quickdie', 'suspend', 2); -NOTICE: Success: - gp_inject_fault +select cleanupAllGangs(); + cleanupallgangs ----------------- t (1 row) ---start_ignore -select 'trigger fault' from gp_dist_random('gp_id'); -ERROR: failed to acquire resources on one or more segments -DETAIL: server closed the connection unexpectedly - This probably means the server terminated abnormally - before or while processing the request. 
- (seg0 127.0.0.1:25432) ---end_ignore -- should failed after 2 times select * from dispatch_test_t1, dispatch_test_t2, dispatch_test_t3 where dispatch_test_t1.c2 = dispatch_test_t2.c2 and dispatch_test_t2.c3 = dispatch_test_t3.c3; ERROR: failed to acquire resources on one or more segments DETAIL: segments is in recovery mode -select gp_inject_fault('quickdie', 'resume', 2); -NOTICE: Success: - gp_inject_fault ------------------ - t +set gp_gang_creation_retry_count to 10; +-- should success and process_startup_packet will be invalid after this query +select * from dispatch_test_t1, dispatch_test_t2, dispatch_test_t3 +where dispatch_test_t1.c2 = dispatch_test_t2.c2 and dispatch_test_t2.c3 = dispatch_test_t3.c3; + c1 | c2 | c3 | c1 | c2 | c3 | c1 | c2 | c3 +----+----+----+----+----+----+----+----+---- + 1 | 1 | 2 | 2 | 1 | 2 | 3 | 1 | 2 (1 row) select gp_inject_fault('process_startup_packet', 'reset', 2); -NOTICE: Success: - gp_inject_fault ------------------ - t -(1 row) - -select gp_inject_fault('quickdie', 'reset', 2); -NOTICE: Success: +NOTICE: Success: (seg0 10.153.101.106:25432 pid=373925) gp_inject_fault ----------------- t @@ -370,15 +356,15 @@ NOTICE: Success: -- gp_segment_connect_timeout = 0 : wait forever -- gp_segment_connect_timeout = 1 : wait 1 second set gp_segment_connect_timeout to 1; -select cleanupAllGangs(); - cleanupallgangs ------------------ +select gp_inject_fault_new('process_startup_packet', 'suspend', '', 'dispatch_test_db', '', 1, 1, 0, 2::smallint); +NOTICE: Success: + gp_inject_fault_new +--------------------- t (1 row) -select gp_inject_fault('process_startup_packet', 'suspend', 2); -NOTICE: Success: - gp_inject_fault +select cleanupAllGangs(); + cleanupallgangs ----------------- t (1 row) @@ -524,7 +510,12 @@ DROP TABLE foo_test; -- -- Test dangling Gang would be destroyed if interrupted during the creation -- -\c +select cleanupAllGangs(); + cleanupallgangs +----------------- + t +(1 row) + select gp_inject_fault('gang_created', 
'reset', 1); NOTICE: Success: gp_inject_fault @@ -582,3 +573,5 @@ select * from gp_dist_random('gp_id') --------+-------------+------+--------- (0 rows) +\c regression +DROP DATABASE dispatch_test_db; -- GitLab