Commit be5d11e2 authored by Jamie McAtamney

gpstart: improve handling of down segment hosts

Currently, if a host is unreachable when gpstart is run, it will not report this
and will instead fail with an error that is both inaccurate and unhelpful to the
user, such as claiming that checksums are invalid for segments on a given host
when it simply can't reach that host to verify the checksums.

This commit adds a check that verifies all hosts are reachable before the startup
process begins. If one or more hosts are unreachable, their segments are marked
down (in gparray, not in the cluster) so that gpstart does not run any further
checks against those hosts, and so that the cluster can still be started in this
state as long as enough valid segments remain to start it.
Parent: a3049894
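For orientation before reading the diff: the sketch below illustrates, in isolation, the reachability probe the commit message describes, i.e. ssh to each segment host and treat any failure to respond as "unreachable". It is only a simplified illustration, not the gpstart implementation (which dispatches the probes through a WorkerPool, as shown in the diff); the host names, ssh options, and timeout here are assumptions.

```python
# Simplified sketch of an SSH reachability probe (illustration only; not gpstart code).
import subprocess


def get_unreachable_hosts(hosts):
    """Return the subset of `hosts` that cannot be reached over ssh."""
    unreachable = []
    for host in hosts:
        try:
            # Ask the remote host to echo its own name; a non-zero exit code
            # or a timeout means the host could not be reached.
            result = subprocess.run(
                ["ssh", "-o", "BatchMode=yes", host, "echo %s" % host],
                capture_output=True, timeout=10)
            if result.returncode != 0:
                unreachable.append(host)
        except subprocess.TimeoutExpired:
            unreachable.append(host)
    return unreachable


if __name__ == "__main__":
    # Hypothetical segment host names, for illustration only.
    for host in get_unreachable_hosts(["sdw1", "sdw2", "sdw3"]):
        print("Host %s is unreachable" % host)
```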
@@ -111,12 +111,19 @@ class GpStart:
         if self.masteronly:
             return 0
 
+        num_workers = min(len(self.gparray.get_hostlist()), self.parallel)
+
+        # We check for unreachable segment hosts first thing, because if a host is down but its segments
+        # are marked up, later checks can return invalid or misleading results and the cluster may not
+        # start in a good state.
+        unreachable_hosts = self.get_unreachable_segment_hosts(num_workers)
+        if unreachable_hosts:
+            self.mark_segments_down_for_unreachable_hosts(unreachable_hosts)
+
         if self.skip_heap_checksum_validation:
             self.master_checksum_value = None
             logger.warning("Because of --skip-heap-checksum-validation, the GUC for data_checksums "
                            "will not be checked between master and segments")
         else:
-            num_workers = min(len(self.gparray.get_hostlist()), self.parallel)
             self.master_checksum_value = HeapChecksum(gparray=self.gparray, num_workers=num_workers,
                                                       logger=logger).get_master_value()
@@ -287,6 +294,46 @@ class GpStart:
             logger.info("Master Stopped...")
         raise ExceptionNoStackTraceNeeded("Standby activated, this node no more can act as master.")
 
+    def get_unreachable_segment_hosts(self, num_workers):
+        hostlist = set(self.gparray.get_hostlist(includeMaster=False))
+
+        pool = base.WorkerPool(numWorkers=num_workers)
+        try:
+            for host in hostlist:
+                cmd = Command(name='check %s is up' % host, cmdStr="ssh %s 'echo %s'" % (host, host))
+                pool.addCommand(cmd)
+            pool.join()
+        finally:
+            pool.haltWork()
+            pool.joinWorkers()
+
+        # There's no good way to map a CommandResult back to its originating Command so instead
+        # of looping through and finding the hosts that errored out, we remove any hosts that
+        # succeeded from the hostlist and any remaining hosts will be ones that were unreachable.
+        for item in pool.getCompletedItems():
+            result = item.get_results()
+            if result.rc == 0:
+                host = result.stdout.strip()
+                hostlist.remove(host)
+
+        if len(hostlist) > 0:
+            logger.warning("One or more hosts are not reachable via SSH. Any segments on those hosts will be marked down")
+            for host in sorted(hostlist):
+                logger.warning("Host %s is unreachable" % host)
+            return hostlist
+        return None
+
+    def mark_segments_down_for_unreachable_hosts(self, unreachable_hosts):
+        # We only mark the segment down in gparray for use by later checks, as
+        # setting the actual segment down in gp_segment_configuration leads to
+        # an inconsistent state and may prevent the database from starting.
+        for segmentPair in self.gparray.segmentPairs:
+            for seg in [segmentPair.primaryDB, segmentPair.mirrorDB]:
+                host = seg.getSegmentHostName()
+                if host in unreachable_hosts:
+                    logger.warning("Marking segment %d down because %s is unreachable" % (seg.dbid, host))
+                    seg.setSegmentStatus(STATUS_DOWN)
+
     ######
     def _recovery_startup(self):
         logger.info("Commencing recovery startup checks")
...
@@ -18,11 +18,29 @@ Feature: gpstart behave tests
     Given the database is running
     And the catalog has a standby master entry
-    When the database is not running
-    And the standby host goes down
+    When the standby host goes down
+    And the user runs command "pkill -9 postgres"
     And gpstart is run with prompts accepted
     Then gpstart should print "Continue only if you are certain that the standby is not acting as the master." to stdout
     And gpstart should print "No standby master configured" to stdout
     And gpstart should return a return code of 0
     And all the segments are running
\ No newline at end of file
+
+  @concourse_cluster
+  @demo_cluster
+  Scenario: gpstart starts even if a segment host is unreachable
+    Given the database is running
+    And segment 2 goes down
+    And segment 3 goes down
+    And the user runs command "pkill -9 postgres"
+    When gpstart is run with prompts accepted
+    Then gpstart should print "Host invalid_host is unreachable" to stdout
+    And gpstart should print "Marking segment 2 down because invalid_host is unreachable" to stdout
+    And gpstart should print "Marking segment 3 down because invalid_host is unreachable" to stdout
+    And the status of segment 2 should be "d"
+    And the status of segment 3 should be "d"
+    And the cluster is returned to a good state
@@ -3,7 +3,8 @@ import signal
 import subprocess
 
 from behave import given, when, then
 from test.behave_utils import utils
+from gppylib.commands.base import Command
 
 def _run_sql(sql, opts=None):
     env = None
@@ -23,32 +24,29 @@ def _run_sql(sql, opts=None):
         "-c", sql,
     ], env=env)
 
+def do_catalog_query(query):
+    cmd = '''PGOPTIONS='-c gp_role=utility' psql -t -d template1 -c "SET allow_system_table_mods='true'; %s"''' % query
+    cmd = Command(name="catalog query", cmdStr=cmd)
+    cmd.run(validateAfter=True)
+    return cmd
+
+def change_hostname(dbid, hostname):
+    do_catalog_query("UPDATE gp_segment_configuration SET hostname = '{0}', address = '{0}' WHERE dbid = {1}".format(hostname, dbid))
+
+def change_status(dbid, status):
+    do_catalog_query("UPDATE gp_segment_configuration SET status = '%s' WHERE dbid = %s" % (status, dbid))
+
 @when('the standby host goes down')
 def impl(context):
-    """
-    Fakes a host failure by updating the standby segment entry to point at an
-    invalid hostname and address.
-    """
-    opts = {
-        'gp_role': 'utility',
-        'allow_system_table_mods': 'on',
-    }
-
-    subprocess.check_call(['gpstart', '-am'])
-    _run_sql("""
-        UPDATE gp_segment_configuration
-        SET hostname = 'standby.invalid',
-            address = 'standby.invalid'
-        WHERE content = -1 AND role = 'm'
-    """, opts=opts)
-    subprocess.check_call(['gpstop', '-am'])
+    result = do_catalog_query("SELECT dbid FROM gp_segment_configuration WHERE content = -1 AND role = 'm'")
+    dbid = result.get_stdout().strip()
+    change_hostname(dbid, 'invalid_host')
 
     def cleanup(context):
         """
         Reverses the above SQL by starting up in master-only utility mode. Since
         the standby host is incorrect, a regular gpstart call won't work.
         """
         utils.stop_database_if_started(context)
 
         subprocess.check_call(['gpstart', '-am'])
@@ -65,7 +63,7 @@ def impl(context):
         """, opts=opts)
         subprocess.check_call(['gpstop', '-am'])
 
     context.add_cleanup(cleanup, context)
 
 def _handle_sigpipe():
     """
@@ -90,3 +88,41 @@ def impl(context):
     context.stdout_message, context.stderr_message = p.communicate()
     context.ret_code = p.returncode
 
+@given('segment {dbid} goes down')
+def impl(context, dbid):
+    result = do_catalog_query("SELECT hostname FROM gp_segment_configuration WHERE dbid = %s" % dbid)
+    if not hasattr(context, 'old_hostnames'):
+        context.old_hostnames = {}
+    context.old_hostnames[dbid] = result.get_stdout().strip()
+
+    change_hostname(dbid, 'invalid_host')
+
+@then('the status of segment {dbid} should be "{expected_status}"')
+def impl(context, dbid, expected_status):
+    result = do_catalog_query("SELECT status FROM gp_segment_configuration WHERE dbid = %s" % dbid)
+    status = result.get_stdout().strip()
+    if status != expected_status:
+        raise Exception("Expected status to be %s, but it is %s" % (expected_status, status))
+
+@then('the status of segment {dbid} is changed to "{status}"')
+def impl(context, dbid, status):
+    do_catalog_query("UPDATE gp_segment_configuration SET status = '%s' WHERE dbid = %s" % (status, dbid))
+
+@then('the cluster is returned to a good state')
+def impl(context):
+    if not hasattr(context, 'old_hostnames'):
+        raise Exception("Cannot reset segment hostnames: no hostnames are saved")
+    for dbid, hostname in context.old_hostnames.items():
+        change_hostname(dbid, hostname)
+
+    context.execute_steps("""
+        When the user runs "gprecoverseg -a"
+        Then gprecoverseg should return a return code of 0
+        And all the segments are running
+        And the segments are synchronized
+        When the user runs "gprecoverseg -a -r"
+        Then gprecoverseg should return a return code of 0
+        And all the segments are running
+        And the segments are synchronized
+        """)
@@ -1245,14 +1245,14 @@ def impl(context):
     statrep = dbconn.query(conn, check_stat_replication_query).fetchall()
     conn.close()
 
-    context.standby_dbid = segconfig[0][0]
-
     if len(segconfig) != 1:
         raise Exception("gp_segment_configuration did not have standby master")
 
     if len(statrep) != 1:
         raise Exception("pg_stat_replication did not have standby master")
 
+    context.standby_dbid = segconfig[0][0]
+
 @then('verify the standby master is now acting as master')
 def impl(context):
     check_segment_config_query = "SELECT * FROM gp_segment_configuration WHERE content = -1 AND role = 'p' AND preferred_role = 'p' AND dbid = %s" % context.standby_dbid
...