提交 ec5f45a5 编写于 作者: B Bhuvnesh Chaudhary 提交者: Kalen Krempely

gpstart: when standby is unreachable don't start it

When the standby is unreachable and the user chooses to proceed with startup,
gpstart would still attempt to start the standby, resulting in a stack trace.
Detect when the standby is unreachable and set start_standby to False to
prevent starting it later in the startup process.
Co-authored-by: Kalen Krempely <kkrempely@vmware.com>
上级 55dfa785
import imp
import logging
import os
import sys
......@@ -10,8 +9,7 @@ from gppylib.operations.startSegments import StartSegmentsResult
from gppylib.test.unit.gp_unittest import GpTestCase, run_tests
from gppylib.commands import gp
from gppylib.commands.base import ExecutionError
from gppylib.commands.pg import PgControlData
from gppylib.mainUtils import UserAbortedException
from gppylib.mainUtils import ExceptionNoStackTraceNeeded, UserAbortedException
class GpStart(GpTestCase):
......@@ -206,50 +204,126 @@ class GpStart(GpTestCase):
self.assertEqual(gpstart.fetch_tli("", "foo"), 2)
@patch("gpstart.GpStart.shutdown_master_only")
@patch("gppylib.commands.pg.PgControlData.run")
@patch("gppylib.commands.pg.PgControlData.get_value", side_effect=ExecutionError("foobar", Mock()))
def test_fetch_tli_returns_0_when_standby_is_not_accessible_and_user_proceeds(self, mock_value, mock_run, mock_shutdown):
@patch("gpstart.GpStart.fetch_tli", autospec=True)
def test_standby_activated_returns_false_when_primary_tli_is_before_standby_tli(self, mock_fetch_tli):
def mock_fetch_tli_func(self, data_dir, remote_host=None):
if "master" in data_dir:
return 3
if "standby" in data_dir:
return 2
return 1
mock_fetch_tli.side_effect = mock_fetch_tli_func
gpstart = self.setup_gpstart()
self.mock_userinput.ask_yesno.return_value = True
gpstart.master_datadir = "/data/master"
master = Segment.initFromString("1|-1|p|p|n|u|mdw|mdw|5432|/data/master")
standby = Segment.initFromString("6|-1|m|m|n|d|sdw3|sdw3|5433|/data/standby")
gpstart.gparray = GpArray([master, standby])
self.assertFalse(gpstart._standby_activated())
@patch("gpstart.GpStart.fetch_tli", autospec=True)
def test_standby_activated_returns_true_when_standby_tli_is_before_primary_tli(self, mock_fetch_tli):
def mock_fetch_tli_func(self, data_dir, remote_host=None):
if "master" in data_dir:
return 1
if "standby" in data_dir:
return 2
return 3
self.assertEqual(gpstart.fetch_tli("", "foo"), 0)
self.assertFalse(mock_shutdown.called)
mock_fetch_tli.side_effect = mock_fetch_tli_func
gpstart = self.setup_gpstart()
gpstart.master_datadir = "/data/master"
master = Segment.initFromString("1|-1|p|p|n|u|mdw|mdw|5432|/data/master")
standby = Segment.initFromString("6|-1|m|m|n|d|sdw3|sdw3|5433|/data/standby")
gpstart.gparray = GpArray([master, standby])
self.assertTrue(gpstart._standby_activated())
@patch("gpstart.GpStart.fetch_tli", autospec=True)
def test_standby_activated_raises_StandbyUnreachable_exception_when_fetching_standby_tli_fails(self, mock_fetch_tli):
    """
    _standby_activated() must surface StandbyUnreachable when fetch_tli
    raises ExecutionError for the standby's data directory.
    """
    def fail_for_standby(self, data_dir, remote_host=None):
        # Only the standby lookup fails; any other data dir yields a TLI of 10.
        if "standby" not in data_dir:
            return 10
        raise ExecutionError("oops", None)

    mock_fetch_tli.side_effect = fail_for_standby

    starter = self.setup_gpstart()
    starter.master_datadir = "/data/master"
    # Array made of a master plus a standby entry on another host.
    starter.gparray = GpArray([
        Segment.initFromString("1|-1|p|p|n|u|mdw|mdw|5432|/data/master"),
        Segment.initFromString("6|-1|m|m|n|d|sdw3|sdw3|5433|/data/standby"),
    ])

    self.assertRaises(starter.StandbyUnreachable, starter._standby_activated)
@patch("gpstart.gp.GpStop")
@patch("gpstart.GpStart._standby_activated", return_value=False)
def test_check_standby_returns_when_standby_is_not_activated(self, mock_standby_activated, mock_gp_stop):
    """
    With _standby_activated() patched to return False, check_standby()
    must return without ever constructing a GpStop command.
    """
    self.setup_gpstart().check_standby()

    master_stop_requested = mock_gp_stop.called
    self.assertFalse(master_stop_requested)
@patch("gpstart.gp.GpStop")
@patch("gpstart.GpStart._standby_activated", return_value=True)
def test_check_standby_stops_master_and_raises_an_exception_when_standby_is_activated(self, mock_standby_activated, mock_gp_stop):
    """
    With _standby_activated() patched to return True, check_standby() must
    run the GpStop command and raise ExceptionNoStackTraceNeeded.
    """
    starter = self.setup_gpstart()

    self.assertRaises(ExceptionNoStackTraceNeeded, starter.check_standby)

    # GpStop is patched at class level, so the command instance that
    # check_standby() built is the mock's return_value.
    stop_command = mock_gp_stop.return_value
    self.assertTrue(stop_command.run.called)
@patch("gpstart.GpStart.shutdown_master_only")
@patch("gppylib.commands.pg.PgControlData.run")
@patch("gppylib.commands.pg.PgControlData.get_value", side_effect=ExecutionError("foobar", Mock()))
def test_fetch_tli_raises_exception_when_standby_is_not_accessible_and_user_aborts(self, mock_value, mock_run, mock_shutdown):
@patch("gpstart.gp.GpStop")
@patch("gpstart.GpStart._standby_activated")
def test_check_standby_logs_warning_and_returns_when_standby_is_unreachable_and_user_proceeds(self, mock_standby_activated, mock_gp_stop, mock_shutdown_master):
gpstart = self.setup_gpstart()
self.mock_userinput.ask_yesno.return_value = False
with self.assertRaises(UserAbortedException):
gpstart.fetch_tli("", "foo")
self.assertTrue(mock_shutdown.called)
mock_standby_activated.side_effect = gpstart.StandbyUnreachable()
gpstart.interactive = True
self.mock_userinput.ask_yesno.return_value = True
gpstart.check_standby()
self.subject.logger.warning.assert_any_call(StringContains("Standby host is unreachable, cannot determine whether the standby is currently acting as the master"))
self.assertFalse(mock_shutdown_master.called)
self.assertFalse(mock_gp_stop.called)
@patch("gpstart.GpStart.shutdown_master_only")
@patch("gppylib.commands.pg.PgControlData.run")
@patch("gppylib.commands.pg.PgControlData.get_value", side_effect=ExecutionError("cmd foobar failed", Mock()))
def test_fetch_tli_logs_warning_when_standby_is_not_accessible(self, mock_value, mock_run, mock_shutdown):
@patch("gpstart.gp.GpStop")
@patch("gpstart.GpStart._standby_activated")
def test_check_standby_logs_warning_and_stops_master_and_raises_exception_when_standby_is_unreachable_and_user_does_not_proceeed(self, mock_standby_activated, mock_gp_stop, mock_shutdown_master):
gpstart = self.setup_gpstart()
mock_standby_activated.side_effect = gpstart.StandbyUnreachable()
gpstart.interactive = True
self.mock_userinput.ask_yesno.return_value = False
with self.assertRaises(UserAbortedException):
gpstart.fetch_tli("", "foo")
self.subject.logger.warning.assert_any_call(StringContains("Received error: ExecutionError: 'cmd foobar failed' occurred."))
self.subject.logger.warning.assert_any_call("Continue only if you are certain that the standby is not acting as the master.")
gpstart.check_standby()
self.subject.logger.warning.assert_any_call(StringContains("Standby host is unreachable, cannot determine whether the standby is currently acting as the master"))
self.assertTrue(mock_shutdown_master.called)
self.assertFalse(mock_gp_stop.called)
@patch("gpstart.GpStart.shutdown_master_only")
@patch("gppylib.commands.pg.PgControlData.run")
@patch("gppylib.commands.pg.PgControlData.get_value", side_effect=ExecutionError("foobar", Mock()))
def test_fetch_tli_logs_non_interactive_warning_when_standby_is_not_accessible(self, mock_value, mock_run, mock_shutdown):
@patch("gpstart.gp.GpStop")
@patch("gpstart.GpStart._standby_activated")
def test_check_standby_logs_warning_and_stops_master_and_raises_exception_in_non_interactive_mode_and_standby_is_unreachable(self, mock_standby_activated, mock_gp_stop, mock_shutdown_master):
gpstart = self.setup_gpstart()
mock_standby_activated.side_effect = gpstart.StandbyUnreachable()
gpstart.interactive = False
with self.assertRaises(UserAbortedException):
gpstart.fetch_tli("", "foo")
self.assertTrue(mock_shutdown.called)
gpstart.check_standby()
self.subject.logger.warning.assert_any_call(StringContains("Standby host is unreachable, cannot determine whether the standby is currently acting as the master"))
self.subject.logger.warning.assert_any_call("Non interactive mode detected. Not starting the cluster. Start the cluster in interactive mode.")
self.assertTrue(mock_shutdown_master.called)
self.assertFalse(mock_gp_stop.called)
def _createGpArrayWith2Primary2Mirrors(self):
self.master = Segment.initFromString(
......
......@@ -123,7 +123,7 @@ class GpStart:
logger=logger).get_master_value()
if not self.skip_standby_check:
self._check_standby_activated()
self.check_standby()
else:
logger.info("Skipping Standby activation status checking.")
......@@ -234,10 +234,34 @@ class GpStart:
else:
controldata = PgControlData("fetching pg_controldata remotely", data_dir_path, REMOTE, remoteHost)
controldata.run(validateAfter=True)
return int(controldata.get_value("Latest checkpoint's TimeLineID"))
class StandbyUnreachable(Exception):
    """
    Signals that the standby's timeline ID could not be fetched.

    Raised by _standby_activated() when fetching the standby TLI fails with
    an ExecutionError, and handled by check_standby().
    """
def _standby_activated(self):
logger.debug("Checking if standby has been activated...")
if not self.gparray.standbyMaster:
return False
# fetch timelineids for both primary and standby (post-promote)
primary_tli = self.fetch_tli(self.master_datadir)
try:
controldata.run(validateAfter=True)
return int(controldata.get_value("Latest checkpoint's TimeLineID"))
standby_tli = self.fetch_tli(self.gparray.standbyMaster.getSegmentDataDirectory(),
self.gparray.standbyMaster.getSegmentHostName())
except base.ExecutionError as err:
raise GpStart.StandbyUnreachable(err)
logger.debug("Primary TLI = %d" % primary_tli)
logger.debug("Standby TLI = %d" % standby_tli)
return primary_tli < standby_tli
def check_standby(self):
try:
standby_activated = self._standby_activated()
except GpStart.StandbyUnreachable as err:
logger.warning("Standby host is unreachable, cannot determine whether the standby is currently acting as the master. Received error: %s" % err)
logger.warning("Continue only if you are certain that the standby is not acting as the master.")
if not self.interactive or not userinput.ask_yesno(None, "\nContinue with startup", 'N'):
......@@ -245,31 +269,25 @@ class GpStart:
logger.warning("Non interactive mode detected. Not starting the cluster. Start the cluster in interactive mode.")
self.shutdown_master_only()
raise UserAbortedException()
return 0 # a 0 won't lead to standby promotion, as TimeLineIDs start at 1
def _check_standby_activated(self):
logger.debug("Checking if standby has been activated...")
# If the user wants to continue when the standby is unreachable,
# set start_standby to False to prevent starting the unreachable
# standby later in the startup process.
self.start_standby = False
return
if self.gparray.standbyMaster:
# fetch timelineids for both primary and standby (post-promote)
primary_tli = self.fetch_tli(self.master_datadir)
standby_tli = self.fetch_tli(self.gparray.standbyMaster.getSegmentDataDirectory(),
self.gparray.standbyMaster.getSegmentHostName())
if not standby_activated:
return
logger.debug("Primary TLI = %d" % primary_tli)
logger.debug("Standby TLI = %d" % standby_tli)
if primary_tli < standby_tli:
# stop the master we've started up.
cmd = gp.GpStop("Shutting down master", masterOnly=True,
fast=True, quiet=logging_is_quiet(),
verbose=logging_is_verbose(),
parallel=self.parallel,
datadir=self.master_datadir)
cmd.run(validateAfter=True)
logger.info("Master Stopped...")
raise ExceptionNoStackTraceNeeded("Standby activated, this node no more can act as master.")
# stop the master we've started up.
cmd = gp.GpStop("Shutting down master", masterOnly=True,
fast=True, quiet=logging_is_quiet(),
verbose=logging_is_verbose(),
datadir=self.master_datadir,
parallel=self.parallel)
cmd.run(validateAfter=True)
logger.info("Master Stopped...")
raise ExceptionNoStackTraceNeeded("Standby activated, this node no more can act as master.")
######
def _recovery_startup(self):
......
......@@ -13,3 +13,16 @@ Feature: gpstart behave tests
And gpstart should print "Skipped segment starts \(segments are marked down in configuration\) += 1" to stdout
And gpstart should print "Successfully started [0-9]+ of [0-9]+ segment instances, skipped 1 other segments" to stdout
And gpstart should print "Number of segments not attempted to start: 1" to stdout
Scenario: gpstart starts even if the standby host is unreachable
Given the database is running
And the catalog has a standby master entry
When the database is not running
And the standby host goes down
And gpstart is run with prompts accepted
Then gpstart should print "Continue only if you are certain that the standby is not acting as the master." to stdout
And gpstart should print "No standby master configured" to stdout
And gpstart should return a return code of 0
And all the segments are running
\ No newline at end of file
import os
import signal
import subprocess
from behave import given, when, then
from test.behave_utils import utils
def _run_sql(sql, opts=None):
    """
    Run one SQL statement against the ``postgres`` database through psql.

    sql:  statement text handed to ``psql -c``.
    opts: optional mapping of setting name to value. When given, every pair
          is rendered as ``-c name=value `` and the concatenation is placed
          in PGOPTIONS in a copy of the current environment, overwriting any
          value already there. When None, ``env=None`` is passed through.

    Raises subprocess.CalledProcessError if psql exits with a non-zero status.
    """
    if opts is None:
        child_env = None
    else:
        child_env = os.environ.copy()
        # Each rendered pair keeps its trailing space, so the final value
        # also ends with one.
        child_env['PGOPTIONS'] = ''.join(
            "-c {}={} ".format(name, setting) for name, setting in opts.items()
        )

    command = ["psql", "postgres", "-c", sql]
    subprocess.check_call(command, env=child_env)
@when('the standby host goes down')
def impl(context):
    """
    Simulate a standby host failure.

    The master is started on its own, the standby's catalog row is rewritten
    to point at an invalid hostname and address, and the master is stopped
    again. A cleanup that restores the row is registered on the behave
    context only after those three steps have succeeded.
    """
    utility_opts = {
        'gp_session_role': 'utility',
        'allow_system_table_mods': 'on',
    }
    start_master_only = ['gpstart', '-am']
    stop_master_only = ['gpstop', '-am']

    def cleanup(context):
        """
        Undo the catalog change by starting up in master-only utility mode
        and copying the master's hostname and address onto the standby row.
        A regular gpstart call won't work here because the standby host is
        incorrect.
        """
        utils.stop_database_if_started(context)

        subprocess.check_call(start_master_only)
        # The SQL text is kept flush-left so the literal stays unchanged.
        _run_sql("""
UPDATE gp_segment_configuration
SET hostname = master.hostname,
address = master.address
FROM (
SELECT hostname, address
FROM gp_segment_configuration
WHERE content = -1 and role = 'p'
) master
WHERE content = -1 AND role = 'm'
""", opts=utility_opts)
        subprocess.check_call(stop_master_only)

    subprocess.check_call(start_master_only)
    # The SQL text is kept flush-left so the literal stays unchanged.
    _run_sql("""
UPDATE gp_segment_configuration
SET hostname = 'standby.invalid',
address = 'standby.invalid'
WHERE content = -1 AND role = 'm'
""", opts=utility_opts)
    subprocess.check_call(stop_master_only)

    context.add_cleanup(cleanup, context)
def _handle_sigpipe():
    """
    Put SIGPIPE back to its default disposition.

    This works around https://bugs.python.org/issue1615376, which is not
    fixed until Python 3.2 and interferes with Bash pipelines that rely on
    SIGPIPE to exit cleanly. It is passed as ``preexec_fn`` when spawning
    such a pipeline.
    """
    default_action = signal.SIG_DFL
    signal.signal(signal.SIGPIPE, default_action)
@when('gpstart is run with prompts accepted')
def impl(context):
    """
    Run ``yes | gpstart`` under bash and record the outcome on the context.

    Captured stdout and stderr are stored as ``context.stdout_message`` and
    ``context.stderr_message``; the exit status of the bash process is stored
    as ``context.ret_code``.
    """
    pipeline = ["bash", "-c", "yes | gpstart"]
    proc = subprocess.Popen(
        pipeline,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        # Restore default SIGPIPE handling in the child before bash starts.
        preexec_fn=_handle_sigpipe,
    )

    captured_out, captured_err = proc.communicate()
    context.stdout_message = captured_out
    context.stderr_message = captured_err
    context.ret_code = proc.returncode
......@@ -1234,6 +1234,7 @@ def impl(context):
context.standby_host = standby
run_gpcommand(context, 'gpinitstandby -ra')
@given('the catalog has a standby master entry')
@then('verify the standby master entries in catalog')
def impl(context):
check_segment_config_query = "SELECT * FROM gp_segment_configuration WHERE content = -1 AND role = 'm'"
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册