Commit be5d11e2 authored by Jamie McAtamney

gpstart: improve handling of down segment hosts

Currently, if a host is unreachable when gpstart is run, it will not report this
and will instead fail with an error that is both inaccurate and unhelpful to the
user, such as claiming that checksums are invalid for segments on a given host
when it simply can't reach that host to verify the checksums.

This commit adds a check that verifies all hosts are reachable before the startup
process begins. If one or more hosts are unreachable, their segments are marked
down (in gparray, not in the cluster) so that gpstart does not run any further
checks against those hosts, and so that the cluster can still be started in this
state as long as enough valid segments remain to start it.
Parent: a3049894
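For orientation before reading the diff: the sketch below illustrates, in isolation, the reachability probe the commit message describes, i.e. ssh to each segment host and treat any failure to respond as "unreachable". It is only a simplified illustration, not the gpstart implementation (which dispatches the probes through a WorkerPool, as shown in the diff); the host names, ssh options, and timeout here are assumptions.

```python
# Simplified sketch of an SSH reachability probe (illustration only; not gpstart code).
import subprocess


def get_unreachable_hosts(hosts):
    """Return the subset of `hosts` that cannot be reached over ssh."""
    unreachable = []
    for host in hosts:
        try:
            # Ask the remote host to echo its own name; a non-zero exit code
            # or a timeout means the host could not be reached.
            result = subprocess.run(
                ["ssh", "-o", "BatchMode=yes", host, "echo %s" % host],
                capture_output=True, timeout=10)
            if result.returncode != 0:
                unreachable.append(host)
        except subprocess.TimeoutExpired:
            unreachable.append(host)
    return unreachable


if __name__ == "__main__":
    # Hypothetical segment host names, for illustration only.
    for host in get_unreachable_hosts(["sdw1", "sdw2", "sdw3"]):
        print("Host %s is unreachable" % host)
```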
@@ -111,12 +111,19 @@ class GpStart:
         if self.masteronly:
             return 0
 
+        num_workers = min(len(self.gparray.get_hostlist()), self.parallel)
+
+        # We check for unreachable segment hosts first thing, because if a host is down but its segments
+        # are marked up, later checks can return invalid or misleading results and the cluster may not
+        # start in a good state.
+        unreachable_hosts = self.get_unreachable_segment_hosts(num_workers)
+        if unreachable_hosts:
+            self.mark_segments_down_for_unreachable_hosts(unreachable_hosts)
+
         if self.skip_heap_checksum_validation:
             self.master_checksum_value = None
             logger.warning("Because of --skip-heap-checksum-validation, the GUC for data_checksums "
                            "will not be checked between master and segments")
         else:
-            num_workers = min(len(self.gparray.get_hostlist()), self.parallel)
             self.master_checksum_value = HeapChecksum(gparray=self.gparray, num_workers=num_workers,
                                                       logger=logger).get_master_value()
@@ -287,6 +294,46 @@ class GpStart:
             logger.info("Master Stopped...")
         raise ExceptionNoStackTraceNeeded("Standby activated, this node no more can act as master.")
 
+    def get_unreachable_segment_hosts(self, num_workers):
+        hostlist = set(self.gparray.get_hostlist(includeMaster=False))
+
+        pool = base.WorkerPool(numWorkers=num_workers)
+        try:
+            for host in hostlist:
+                cmd = Command(name='check %s is up' % host, cmdStr="ssh %s 'echo %s'" % (host, host))
+                pool.addCommand(cmd)
+            pool.join()
+        finally:
+            pool.haltWork()
+            pool.joinWorkers()
+
+        # There's no good way to map a CommandResult back to its originating Command so instead
+        # of looping through and finding the hosts that errored out, we remove any hosts that
+        # succeeded from the hostlist and any remaining hosts will be ones that were unreachable.
+        for item in pool.getCompletedItems():
+            result = item.get_results()
+            if result.rc == 0:
+                host = result.stdout.strip()
+                hostlist.remove(host)
+
+        if len(hostlist) > 0:
+            logger.warning("One or more hosts are not reachable via SSH. Any segments on those hosts will be marked down")
+            for host in sorted(hostlist):
+                logger.warning("Host %s is unreachable" % host)
+            return hostlist
+        return None
+
+    def mark_segments_down_for_unreachable_hosts(self, unreachable_hosts):
+        # We only mark the segment down in gparray for use by later checks, as
+        # setting the actual segment down in gp_segment_configuration leads to
+        # an inconsistent state and may prevent the database from starting.
+        for segmentPair in self.gparray.segmentPairs:
+            for seg in [segmentPair.primaryDB, segmentPair.mirrorDB]:
+                host = seg.getSegmentHostName()
+                if host in unreachable_hosts:
+                    logger.warning("Marking segment %d down because %s is unreachable" % (seg.dbid, host))
+                    seg.setSegmentStatus(STATUS_DOWN)
+
     ######
     def _recovery_startup(self):
         logger.info("Commencing recovery startup checks")
...
@@ -18,11 +18,29 @@ Feature: gpstart behave tests
     Given the database is running
     And the catalog has a standby master entry
-    When the database is not running
-    And the standby host goes down
+    When the standby host goes down
+    And the user runs command "pkill -9 postgres"
     And gpstart is run with prompts accepted
     Then gpstart should print "Continue only if you are certain that the standby is not acting as the master." to stdout
     And gpstart should print "No standby master configured" to stdout
     And gpstart should return a return code of 0
     And all the segments are running
\ No newline at end of file
+
+  @concourse_cluster
+  @demo_cluster
+  Scenario: gpstart starts even if a segment host is unreachable
+    Given the database is running
+    And segment 2 goes down
+    And segment 3 goes down
+    And the user runs command "pkill -9 postgres"
+    When gpstart is run with prompts accepted
+    Then gpstart should print "Host invalid_host is unreachable" to stdout
+    And gpstart should print "Marking segment 2 down because invalid_host is unreachable" to stdout
+    And gpstart should print "Marking segment 3 down because invalid_host is unreachable" to stdout
+    And the status of segment 2 should be "d"
+    And the status of segment 3 should be "d"
+    And the cluster is returned to a good state
@@ -3,7 +3,8 @@ import signal
 import subprocess
 
 from behave import given, when, then
 from test.behave_utils import utils
+from gppylib.commands.base import Command
 
 def _run_sql(sql, opts=None):
     env = None
@@ -23,32 +24,29 @@ def _run_sql(sql, opts=None):
         "-c", sql,
     ], env=env)
 
+def do_catalog_query(query):
+    cmd = '''PGOPTIONS='-c gp_role=utility' psql -t -d template1 -c "SET allow_system_table_mods='true'; %s"''' % query
+    cmd = Command(name="catalog query", cmdStr=cmd)
+    cmd.run(validateAfter=True)
+    return cmd
+
+def change_hostname(dbid, hostname):
+    do_catalog_query("UPDATE gp_segment_configuration SET hostname = '{0}', address = '{0}' WHERE dbid = {1}".format(hostname, dbid))
+
+def change_status(dbid, status):
+    do_catalog_query("UPDATE gp_segment_configuration SET status = '%s' WHERE dbid = %s" % (status, dbid))
+
 @when('the standby host goes down')
 def impl(context):
-    """
-    Fakes a host failure by updating the standby segment entry to point at an
-    invalid hostname and address.
-    """
-    opts = {
-        'gp_role': 'utility',
-        'allow_system_table_mods': 'on',
-    }
-
-    subprocess.check_call(['gpstart', '-am'])
-    _run_sql("""
-        UPDATE gp_segment_configuration
-        SET hostname = 'standby.invalid',
-            address = 'standby.invalid'
-        WHERE content = -1 AND role = 'm'
-    """, opts=opts)
-    subprocess.check_call(['gpstop', '-am'])
+    result = do_catalog_query("SELECT dbid FROM gp_segment_configuration WHERE content = -1 AND role = 'm'")
+    dbid = result.get_stdout().strip()
+    change_hostname(dbid, 'invalid_host')
 
     def cleanup(context):
         """
         Reverses the above SQL by starting up in master-only utility mode. Since
         the standby host is incorrect, a regular gpstart call won't work.
         """
         utils.stop_database_if_started(context)
 
         subprocess.check_call(['gpstart', '-am'])
@@ -65,7 +63,7 @@ def impl(context):
         """, opts=opts)
         subprocess.check_call(['gpstop', '-am'])
 
     context.add_cleanup(cleanup, context)
 
 def _handle_sigpipe():
     """
@@ -90,3 +88,41 @@ def impl(context):
     context.stdout_message, context.stderr_message = p.communicate()
     context.ret_code = p.returncode
 
+@given('segment {dbid} goes down')
+def impl(context, dbid):
+    result = do_catalog_query("SELECT hostname FROM gp_segment_configuration WHERE dbid = %s" % dbid)
+    if not hasattr(context, 'old_hostnames'):
+        context.old_hostnames = {}
+    context.old_hostnames[dbid] = result.get_stdout().strip()
+
+    change_hostname(dbid, 'invalid_host')
+
+@then('the status of segment {dbid} should be "{expected_status}"')
+def impl(context, dbid, expected_status):
+    result = do_catalog_query("SELECT status FROM gp_segment_configuration WHERE dbid = %s" % dbid)
+    status = result.get_stdout().strip()
+    if status != expected_status:
+        raise Exception("Expected status to be %s, but it is %s" % (expected_status, status))
+
+@then('the status of segment {dbid} is changed to "{status}"')
+def impl(context, dbid, status):
+    do_catalog_query("UPDATE gp_segment_configuration SET status = '%s' WHERE dbid = %s" % (status, dbid))
+
+@then('the cluster is returned to a good state')
+def impl(context):
+    if not hasattr(context, 'old_hostnames'):
+        raise Exception("Cannot reset segment hostnames: no hostnames are saved")
+    for dbid, hostname in context.old_hostnames.items():
+        change_hostname(dbid, hostname)
+
+    context.execute_steps("""
+        When the user runs "gprecoverseg -a"
+        Then gprecoverseg should return a return code of 0
+        And all the segments are running
+        And the segments are synchronized
+        When the user runs "gprecoverseg -a -r"
+        Then gprecoverseg should return a return code of 0
+        And all the segments are running
+        And the segments are synchronized
+        """)
@@ -1245,14 +1245,14 @@ def impl(context):
     statrep = dbconn.query(conn, check_stat_replication_query).fetchall()
     conn.close()
 
-    context.standby_dbid = segconfig[0][0]
-
     if len(segconfig) != 1:
         raise Exception("gp_segment_configuration did not have standby master")
 
     if len(statrep) != 1:
         raise Exception("pg_stat_replication did not have standby master")
 
+    context.standby_dbid = segconfig[0][0]
+
 @then('verify the standby master is now acting as master')
 def impl(context):
     check_segment_config_query = "SELECT * FROM gp_segment_configuration WHERE content = -1 AND role = 'p' AND preferred_role = 'p' AND dbid = %s" % context.standby_dbid
...