diff --git a/gpMgmt/bin/gpstart b/gpMgmt/bin/gpstart
index 4d6a502a165ac86a0cab1eef5d128e849b778ad0..6e4e623c85d1245bd255f32b3fe861e097eca590 100755
--- a/gpMgmt/bin/gpstart
+++ b/gpMgmt/bin/gpstart
@@ -111,12 +111,19 @@ class GpStart:
         if self.masteronly:
             return 0
 
+        num_workers = min(len(self.gparray.get_hostlist()), self.parallel)
+
+        # We check for unreachable segment hosts first thing, because if a host is down but its segments
+        # are marked up, later checks can return invalid or misleading results and the cluster may not
+        # start in a good state.
+        unreachable_hosts = self.get_unreachable_segment_hosts(num_workers)
+        if unreachable_hosts:
+            self.mark_segments_down_for_unreachable_hosts(unreachable_hosts)
+
         if self.skip_heap_checksum_validation:
             self.master_checksum_value = None
             logger.warning("Because of --skip-heap-checksum-validation, the GUC for data_checksums "
                            "will not be checked between master and segments")
         else:
-            num_workers = min(len(self.gparray.get_hostlist()), self.parallel)
             self.master_checksum_value = HeapChecksum(gparray=self.gparray, num_workers=num_workers,
                                                       logger=logger).get_master_value()
 
@@ -287,6 +294,46 @@ class GpStart:
             logger.info("Master Stopped...")
             raise ExceptionNoStackTraceNeeded("Standby activated, this node no more can act as master.")
 
+    def get_unreachable_segment_hosts(self, num_workers):
+        hostlist = set(self.gparray.get_hostlist(includeMaster=False))
+
+        pool = base.WorkerPool(numWorkers=num_workers)
+        try:
+            for host in hostlist:
+                cmd = Command(name='check %s is up' % host, cmdStr="ssh %s 'echo %s'" % (host, host))
+                pool.addCommand(cmd)
+            pool.join()
+        finally:
+            pool.haltWork()
+            pool.joinWorkers()
+
+        # There's no good way to map a CommandResult back to its originating Command so instead
+        # of looping through and finding the hosts that errored out, we remove any hosts that
+        # succeeded from the hostlist and any remaining hosts will be ones that were unreachable.
+        for item in pool.getCompletedItems():
+            result = item.get_results()
+            if result.rc == 0:
+                host = result.stdout.strip()
+                hostlist.remove(host)
+
+        if len(hostlist) > 0:
+            logger.warning("One or more hosts are not reachable via SSH. Any segments on those hosts will be marked down")
+            for host in sorted(hostlist):
+                logger.warning("Host %s is unreachable" % host)
+            return hostlist
+        return None
+
+    def mark_segments_down_for_unreachable_hosts(self, unreachable_hosts):
+        # We only mark the segment down in gparray for use by later checks, as
+        # setting the actual segment down in gp_segment_configuration leads to
+        # an inconsistent state and may prevent the database from starting.
+        for segmentPair in self.gparray.segmentPairs:
+            for seg in [segmentPair.primaryDB, segmentPair.mirrorDB]:
+                host = seg.getSegmentHostName()
+                if host in unreachable_hosts:
+                    logger.warning("Marking segment %d down because %s is unreachable" % (seg.dbid, host))
+                    seg.setSegmentStatus(STATUS_DOWN)
+
     ######
     def _recovery_startup(self):
         logger.info("Commencing recovery startup checks")
diff --git a/gpMgmt/test/behave/mgmt_utils/gpstart.feature b/gpMgmt/test/behave/mgmt_utils/gpstart.feature
index 8fcdd41bd5d68294fc7d3d07d25d2b6d49d7ac60..6ef81b1e360e2e1b9a5cbcb73980b273a1f5b36d 100644
--- a/gpMgmt/test/behave/mgmt_utils/gpstart.feature
+++ b/gpMgmt/test/behave/mgmt_utils/gpstart.feature
@@ -18,11 +18,29 @@ Feature: gpstart behave tests
         Given the database is running
           And the catalog has a standby master entry
 
-        When the database is not running
-         And the standby host goes down
+        When the standby host goes down
+         And the user runs command "pkill -9 postgres"
          And gpstart is run with prompts accepted
 
         Then gpstart should print "Continue only if you are certain that the standby is not acting as the master." to stdout
          And gpstart should print "No standby master configured" to stdout
          And gpstart should return a return code of 0
-         And all the segments are running
\ No newline at end of file
+         And all the segments are running
+
+    @concourse_cluster
+    @demo_cluster
+    Scenario: gpstart starts even if a segment host is unreachable
+        Given the database is running
+          And segment 2 goes down
+          And segment 3 goes down
+          And the user runs command "pkill -9 postgres"
+
+        When gpstart is run with prompts accepted
+
+        Then gpstart should print "Host invalid_host is unreachable" to stdout
+         And gpstart should print "Marking segment 2 down because invalid_host is unreachable" to stdout
+         And gpstart should print "Marking segment 3 down because invalid_host is unreachable" to stdout
+         And the status of segment 2 should be "d"
+         And the status of segment 3 should be "d"
+
+         And the cluster is returned to a good state
diff --git a/gpMgmt/test/behave/mgmt_utils/steps/gpstart.py b/gpMgmt/test/behave/mgmt_utils/steps/gpstart.py
index 47627411dd25bfe5da8f4d149a9e4b5471aa12bb..ed5a060b67a563e2051f38283d8dc09ac0121b3f 100644
--- a/gpMgmt/test/behave/mgmt_utils/steps/gpstart.py
+++ b/gpMgmt/test/behave/mgmt_utils/steps/gpstart.py
@@ -3,7 +3,8 @@ import signal
 import subprocess
 
 from behave import given, when, then
-from test.behave_utils import utils
+from test.behave_utils import utils
+from gppylib.commands.base import Command
 
 def _run_sql(sql, opts=None):
     env = None
@@ -23,32 +24,29 @@ def _run_sql(sql, opts=None):
         "-c", sql,
     ], env=env)
 
+def do_catalog_query(query):
+    cmd = '''PGOPTIONS='-c gp_role=utility' psql -t -d template1 -c "SET allow_system_table_mods='true'; %s"''' % query
+    cmd = Command(name="catalog query", cmdStr=cmd)
+    cmd.run(validateAfter=True)
+    return cmd
+
+def change_hostname(dbid, hostname):
+    do_catalog_query("UPDATE gp_segment_configuration SET hostname = '{0}', address = '{0}' WHERE dbid = {1}".format(hostname, dbid))
+
+def change_status(dbid, status):
+    do_catalog_query("UPDATE gp_segment_configuration SET status = '%s' WHERE dbid = %s" % (status, dbid))
+
 @when('the standby host goes down')
 def impl(context):
-    """
-    Fakes a host failure by updating the standby segment entry to point at an
-    invalid hostname and address.
- """ - opts = { - 'gp_role': 'utility', - 'allow_system_table_mods': 'on', - } - - subprocess.check_call(['gpstart', '-am']) - _run_sql(""" - UPDATE gp_segment_configuration - SET hostname = 'standby.invalid', - address = 'standby.invalid' - WHERE content = -1 AND role = 'm' - """, opts=opts) - subprocess.check_call(['gpstop', '-am']) + result = do_catalog_query("SELECT dbid FROM gp_segment_configuration WHERE content = -1 AND role = 'm'") + dbid = result.get_stdout().strip() + change_hostname(dbid, 'invalid_host') def cleanup(context): """ Reverses the above SQL by starting up in master-only utility mode. Since the standby host is incorrect, a regular gpstart call won't work. """ - utils.stop_database_if_started(context) subprocess.check_call(['gpstart', '-am']) @@ -65,7 +63,7 @@ def impl(context): """, opts=opts) subprocess.check_call(['gpstop', '-am']) - context.add_cleanup(cleanup, context) + context.add_cleanup(cleanup, context) def _handle_sigpipe(): """ @@ -90,3 +88,41 @@ def impl(context): context.stdout_message, context.stderr_message = p.communicate() context.ret_code = p.returncode + +@given('segment {dbid} goes down' ) +def impl(context, dbid): + result = do_catalog_query("SELECT hostname FROM gp_segment_configuration WHERE dbid = %s" % dbid) + if not hasattr(context, 'old_hostnames'): + context.old_hostnames = {} + context.old_hostnames[dbid] = result.get_stdout().strip() + change_hostname(dbid, 'invalid_host') + +@then('the status of segment {dbid} should be "{expected_status}"' ) +def impl(context, dbid, expected_status): + result = do_catalog_query("SELECT status FROM gp_segment_configuration WHERE dbid = %s" % dbid) + + status = result .get_stdout().strip() + if status != expected_status: + raise Exception("Expected status to be %s, but it is %s" % (expected_status, status)) + +@then('the status of segment {dbid} is changed to "{status}"' ) +def impl(context, dbid, status): + do_catalog_query("UPDATE gp_segment_configuration SET status = '%s' WHERE dbid = %s" % (status, dbid)) + +@then('the cluster is returned to a good state' ) +def impl(context): + if not hasattr(context, 'old_hostnames'): + raise Exception("Cannot reset segment hostnames: no hostnames are saved") + for dbid, hostname in context.old_hostnames.items(): + change_hostname(dbid, hostname) + + context.execute_steps(""" + When the user runs "gprecoverseg -a" + Then gprecoverseg should return a return code of 0 + And all the segments are running + And the segments are synchronized + When the user runs "gprecoverseg -a -r" + Then gprecoverseg should return a return code of 0 + And all the segments are running + And the segments are synchronized + """) diff --git a/gpMgmt/test/behave/mgmt_utils/steps/mgmt_utils.py b/gpMgmt/test/behave/mgmt_utils/steps/mgmt_utils.py index 57e55caee236c4c74dc4cd3adcea62ad13404fe2..0853d4186fd6b63f89c07aa2988c050fbbb4d05f 100644 --- a/gpMgmt/test/behave/mgmt_utils/steps/mgmt_utils.py +++ b/gpMgmt/test/behave/mgmt_utils/steps/mgmt_utils.py @@ -1245,14 +1245,14 @@ def impl(context): statrep = dbconn.query(conn, check_stat_replication_query).fetchall() conn.close() - context.standby_dbid = segconfig[0][0] - if len(segconfig) != 1: raise Exception("gp_segment_configuration did not have standby master") if len(statrep) != 1: raise Exception("pg_stat_replication did not have standby master") + context.standby_dbid = segconfig[0][0] + @then('verify the standby master is now acting as master') def impl(context): check_segment_config_query = "SELECT * FROM gp_segment_configuration WHERE 
content = -1 AND role = 'p' AND preferred_role = 'p' AND dbid = %s" % context.standby_dbid