提交 f93f6f42 编写于 作者: P Pengzhou Tang 提交者: Tang Pengzhou

Fix intermittent failures in dispatch test cases

In the dispatch test cases, we need a way to put a segment into in-recovery
status in order to test the gang re-creation logic of the dispatcher.

We used to trigger a panic fault on a segment and suspend quickdie()
to simulate in-recovery status. To avoid the segment staying in recovery mode
for a long time, we used a "sleep" fault instead of "suspend" in quickdie(),
so the segment could accept new connections after 5 seconds. 5 seconds works
fine most of the time, but it is still not stable enough, so we decided to use
a more straightforward means of simulating in-recovery mode: reporting
POSTMASTER_IN_RECOVERY_MSG directly in ProcessStartupPacket(). To avoid
affecting other backends, we create a new database so that the fault injectors
only affect the dispatch test cases.
上级 2d5e8c2c
......@@ -2954,7 +2954,19 @@ retry1:
break;
}
SIMPLE_FAULT_INJECTOR(ProcessStartupPacketFault);
#ifdef FAULT_INJECTOR
if (FaultInjector_InjectFaultIfSet(ProcessStartupPacketFault,
DDLNotSpecified,
port->database_name /* databaseName */,
"" /* tableName */) == FaultInjectorTypeSkip)
{
ereport(FATAL,
(errcode(ERRCODE_CANNOT_CONNECT_NOW),
errSendAlert(true),
errmsg(POSTMASTER_IN_RECOVERY_MSG),
errdetail("dummy location")));
}
#endif
return STATUS_OK;
}
......
......@@ -3352,7 +3352,6 @@ drop_unnamed_stmt(void)
void
quickdie(SIGNAL_ARGS)
{
SIMPLE_FAULT_INJECTOR(QuickDie);
quickdie_impl();
}
......
......@@ -329,8 +329,6 @@ FaultInjectorIdentifierEnumToString[] = {
/* inject fault before sending QE details during backend initialization */
_("process_startup_packet"),
/* inject fault in ProcessStartupPacket() */
_("quickdie"),
/* inject fault in quickdie*/
_("after_one_slice_dispatched"),
/* inject fault in cdbdisp_dispatchX*/
_("interconnect_stop_ack_is_lost"),
......@@ -1070,6 +1068,7 @@ FaultInjector_NewHashEntry(
case CreateGangInProgress:
case DecreaseToastMaxChunkSize:
case ProcessStartupPacketFault:
break;
default:
......
......@@ -219,7 +219,6 @@ typedef enum FaultInjectorIdentifier_e {
SendQEDetailsInitBackend,
ProcessStartupPacketFault,
QuickDie,
AfterOneSliceDispatched,
InterconnectStopAckIsLost,
......
-- Misc tests related to dispatching queries to segments.
CREATE DATABASE dispatch_test_db;
\c dispatch_test_db;
CREATE EXTENSION IF NOT EXISTS gp_inject_fault;
......@@ -43,7 +45,7 @@ CREATE TABLE "my table" (id integer);
DROP TABLE "my table";
-- Clean up
\c regression
\c dispatch_test_db
DROP DATABASE "dispatch test db";
-- Test gp_max_plan_size limit
......@@ -61,13 +63,14 @@ select gp_inject_fault('send_qe_details_init_backend', 'reset', 2);
select gp_inject_fault('send_qe_details_init_backend', 'skip', 2);
-- terminate exiting QEs first
\c
\c dispatch_test_db
-- verify failure will be reported
SELECT 1 FROM gp_dist_random('gp_id');
-- reset fault injector
select gp_inject_fault('send_qe_details_init_backend', 'reset', 2);
--
-- Test suit : test gang creation and commands dispatching
--
......@@ -113,11 +116,9 @@ set gp_gang_creation_retry_timer to 1000;
select cleanupAllGangs();
-- trigger fault and put segment 0 into recovery mode
select gp_inject_fault('process_startup_packet', 'segv', 2);
--start_ignore
select 'trigger fault' from gp_dist_random('gp_id');
--end_ignore
-- trigger fault and report segment 0 in recovery for 5 times
select gp_inject_fault_new('process_startup_packet', 'skip', '', 'dispatch_test_db', '', 1, 5, 0, 2::smallint);
select cleanupAllGangs();
-- should success after retry
select * from dispatch_test_t1, dispatch_test_t2, dispatch_test_t3
......@@ -134,19 +135,19 @@ set gp_gang_creation_retry_timer to 200;
select cleanupAllGangs();
-- trigger fault and put segment 0 into recovery mode
select gp_inject_fault('process_startup_packet', 'segv', 2);
select gp_inject_fault('quickdie', 'suspend', 2);
--start_ignore
select 'trigger fault' from gp_dist_random('gp_id');
--end_ignore
select gp_inject_fault_new('process_startup_packet', 'skip', '', 'dispatch_test_db', '', 1, 5, 0, 2::smallint);
select cleanupAllGangs();
-- should failed after 2 times
select * from dispatch_test_t1, dispatch_test_t2, dispatch_test_t3
where dispatch_test_t1.c2 = dispatch_test_t2.c2 and dispatch_test_t2.c3 = dispatch_test_t3.c3;
select gp_inject_fault('quickdie', 'resume', 2);
set gp_gang_creation_retry_count to 10;
-- should success and process_startup_packet will be invalid after this query
select * from dispatch_test_t1, dispatch_test_t2, dispatch_test_t3
where dispatch_test_t1.c2 = dispatch_test_t2.c2 and dispatch_test_t2.c3 = dispatch_test_t3.c3;
select gp_inject_fault('process_startup_packet', 'reset', 2);
select gp_inject_fault('quickdie', 'reset', 2);
--start_ignore
-- enlarge the retry count
......@@ -210,9 +211,9 @@ select gp_inject_fault('send_qe_details_init_backend', 'reset', 2);
-- gp_segment_connect_timeout = 0 : wait forever
-- gp_segment_connect_timeout = 1 : wait 1 second
set gp_segment_connect_timeout to 1;
select cleanupAllGangs();
select gp_inject_fault_new('process_startup_packet', 'suspend', '', 'dispatch_test_db', '', 1, 1, 0, 2::smallint);
select gp_inject_fault('process_startup_packet', 'suspend', 2);
select cleanupAllGangs();
-- expect timeout failure
select * from dispatch_test_t1, dispatch_test_t2, dispatch_test_t3
......@@ -302,7 +303,7 @@ DROP TABLE foo_test;
--
-- Test dangling Gang would be destroyed if interrupted during the creation
--
\c
select cleanupAllGangs();
select gp_inject_fault('gang_created', 'reset', 1);
-- The _new() API ensures that the fault is triggered exactly once.
select gp_inject_fault_new('gang_created', 'error', 1);
......@@ -320,3 +321,6 @@ select * from gp_dist_random('gp_id')
select gp_inject_fault('after_one_slice_dispatched', 'reset', 1);
select * from gp_dist_random('gp_id')
where gpname > (select * from repeat('sssss', 10000000));
\c regression
DROP DATABASE dispatch_test_db;
-- Misc tests related to dispatching queries to segments.
CREATE DATABASE dispatch_test_db;
\c dispatch_test_db;
CREATE EXTENSION IF NOT EXISTS gp_inject_fault;
-- Mask out the whoami message
-- start_matchsubs
......@@ -53,7 +55,7 @@ NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'id' a
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
DROP TABLE "my table";
-- Clean up
\c regression
\c dispatch_test_db
DROP DATABASE "dispatch test db";
-- Test gp_max_plan_size limit
set gp_max_plan_size='10 kB';
......@@ -83,7 +85,7 @@ NOTICE: Success:
(1 row)
-- terminate exiting QEs first
\c
\c dispatch_test_db
-- verify failure will be reported
SELECT 1 FROM gp_dist_random('gp_id');
ERROR: failed to acquire resources on one or more segments
......@@ -149,22 +151,20 @@ select cleanupAllGangs();
t
(1 row)
-- trigger fault and put segment 0 into recovery mode
select gp_inject_fault('process_startup_packet', 'segv', 2);
-- trigger fault and report segment 0 in recovery for 5 times
select gp_inject_fault_new('process_startup_packet', 'skip', '', 'dispatch_test_db', '', 1, 5, 0, 2::smallint);
NOTICE: Success:
gp_inject_fault
gp_inject_fault_new
---------------------
t
(1 row)
select cleanupAllGangs();
cleanupallgangs
-----------------
t
(1 row)
--start_ignore
select 'trigger fault' from gp_dist_random('gp_id');
ERROR: failed to acquire resources on one or more segments
DETAIL: server closed the connection unexpectedly
This probably means the server terminated abnormally
before or while processing the request.
(seg0 127.0.0.1:25432)
--end_ignore
-- should success after retry
select * from dispatch_test_t1, dispatch_test_t2, dispatch_test_t3
where dispatch_test_t1.c2 = dispatch_test_t2.c2 and dispatch_test_t2.c3 = dispatch_test_t3.c3;
......@@ -193,49 +193,35 @@ select cleanupAllGangs();
(1 row)
-- trigger fault and put segment 0 into recovery mode
select gp_inject_fault('process_startup_packet', 'segv', 2);
select gp_inject_fault_new('process_startup_packet', 'skip', '', 'dispatch_test_db', '', 1, 5, 0, 2::smallint);
NOTICE: Success:
gp_inject_fault
-----------------
gp_inject_fault_new
---------------------
t
(1 row)
select gp_inject_fault('quickdie', 'suspend', 2);
NOTICE: Success:
gp_inject_fault
select cleanupAllGangs();
cleanupallgangs
-----------------
t
(1 row)
--start_ignore
select 'trigger fault' from gp_dist_random('gp_id');
ERROR: failed to acquire resources on one or more segments
DETAIL: server closed the connection unexpectedly
This probably means the server terminated abnormally
before or while processing the request.
(seg0 127.0.0.1:25432)
--end_ignore
-- should failed after 2 times
select * from dispatch_test_t1, dispatch_test_t2, dispatch_test_t3
where dispatch_test_t1.c2 = dispatch_test_t2.c2 and dispatch_test_t2.c3 = dispatch_test_t3.c3;
ERROR: failed to acquire resources on one or more segments
DETAIL: segments is in recovery mode
select gp_inject_fault('quickdie', 'resume', 2);
NOTICE: Success:
gp_inject_fault
-----------------
t
set gp_gang_creation_retry_count to 10;
-- should success and process_startup_packet will be invalid after this query
select * from dispatch_test_t1, dispatch_test_t2, dispatch_test_t3
where dispatch_test_t1.c2 = dispatch_test_t2.c2 and dispatch_test_t2.c3 = dispatch_test_t3.c3;
c1 | c2 | c3 | c1 | c2 | c3 | c1 | c2 | c3
----+----+----+----+----+----+----+----+----
1 | 1 | 2 | 2 | 1 | 2 | 3 | 1 | 2
(1 row)
select gp_inject_fault('process_startup_packet', 'reset', 2);
NOTICE: Success:
gp_inject_fault
-----------------
t
(1 row)
select gp_inject_fault('quickdie', 'reset', 2);
NOTICE: Success:
NOTICE: Success: (seg0 10.153.101.106:25432 pid=373925)
gp_inject_fault
-----------------
t
......@@ -370,15 +356,15 @@ NOTICE: Success:
-- gp_segment_connect_timeout = 0 : wait forever
-- gp_segment_connect_timeout = 1 : wait 1 second
set gp_segment_connect_timeout to 1;
select cleanupAllGangs();
cleanupallgangs
-----------------
select gp_inject_fault_new('process_startup_packet', 'suspend', '', 'dispatch_test_db', '', 1, 1, 0, 2::smallint);
NOTICE: Success:
gp_inject_fault_new
---------------------
t
(1 row)
select gp_inject_fault('process_startup_packet', 'suspend', 2);
NOTICE: Success:
gp_inject_fault
select cleanupAllGangs();
cleanupallgangs
-----------------
t
(1 row)
......@@ -524,7 +510,12 @@ DROP TABLE foo_test;
--
-- Test dangling Gang would be destroyed if interrupted during the creation
--
\c
select cleanupAllGangs();
cleanupallgangs
-----------------
t
(1 row)
select gp_inject_fault('gang_created', 'reset', 1);
NOTICE: Success:
gp_inject_fault
......@@ -582,3 +573,5 @@ select * from gp_dist_random('gp_id')
--------+-------------+------+---------
(0 rows)
\c regression
DROP DATABASE dispatch_test_db;
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册