未验证 提交 7b0f1758 编写于 作者: H Hubert Zhang 提交者: GitHub

[Backport 6X] Using gp_add_segment to register mirror in catalog

When introducing a new mirror, we need two steps:
1. start mirror segment
2. update gp_segment_configuration catalog

Previously, gp_add_segment_mirror() was called to update
the catalog, but the dbid was chosen by get_availableDbId(), which
is not guaranteed to match the dbid recorded in internal.auto.conf.
Reported in issue #9837.
Reviewed-by: NPaul Guo <pguo@pivotal.io>
Reviewed-by: NBhuvnesh Chaudhary <bhuvnesh2703@gmail.com>

cherry-pick from commit: f7965d and 1ee999
上级 1d37e0ff
......@@ -126,11 +126,6 @@ class GpConfigurationProviderUsingGpdbCatalog(GpConfigurationProvider) :
for seg in update.mirror_to_add:
mirror_map[ seg.getSegmentContentId() ] = seg
# reset dbId of new mirror segments to -1
# before invoking the operations which will assign them new ids
for seg in update.mirror_to_add:
seg.setSegmentDbId(-1)
# remove mirror segments (e.g. for gpexpand rollback)
for seg in update.mirror_to_remove:
self.__updateSystemConfigRemoveMirror(conn, seg, textForConfigTable)
......@@ -298,17 +293,20 @@ class GpConfigurationProviderUsingGpdbCatalog(GpConfigurationProvider) :
def __callSegmentAddMirror(self, conn, gpArray, seg):
"""
Call gp_add_segment_mirror() to add the mirror.
Return the new segment's dbid.
Similar to __callSegmentAdd, ideally we should call gp_add_segment_mirror() to add the mirror.
But chicken-egg problem also exists in mirror case. If we use gp_add_segment_mirror(),
new dbid will be chosen by `get_availableDbId()`, which cannot ensure to be same as dbid
in internal.auto.conf(see issue-9837). Refer to __callSegmentAdd for details.
"""
logger.debug('callSegmentAddMirror %s' % repr(seg))
sql = "SELECT gp_add_segment_mirror(%s::int2, %s, %s, %s, %s)" \
sql = "SELECT gp_add_segment(%s::int2, %s::int2, 'm', 'm', 'n', 'd', %s, %s, %s, %s)" \
% (
self.__toSqlIntValue(seg.getSegmentDbId()),
self.__toSqlIntValue(seg.getSegmentContentId()),
self.__toSqlIntValue(seg.getSegmentPort()),
self.__toSqlTextValue(seg.getSegmentHostName()),
self.__toSqlTextValue(seg.getSegmentAddress()),
self.__toSqlIntValue(seg.getSegmentPort()),
self.__toSqlTextValue(seg.getSegmentDataDirectory()),
)
......
-- Test gprecoverseg from config file uses the correct dbid.
--
-- In github issue 9837 dbid in gp_segment_configuration is not
-- consistent with dbid in file internal.auto.conf.
-- This is caused by gprecoverseg fetching the smallest dbid in
-- gp_segment_configuration that is not occupied by others when
-- adding a new mirror. When the dbids in gp_segment_configuration are
-- not continuous, the inconsistency will occur.
include: helpers/server_helpers.sql;
CREATE
--
-- generate_recover_config_file:
-- generate config file used by recoverseg -i
--
create or replace function generate_recover_config_file(datadir text, port text) returns void as $$ import io import os myhost = os.uname()[1] inplaceConfig = myhost + '|' + port + '|' + datadir configStr = inplaceConfig + ' ' + inplaceConfig f = open("/tmp/recover_config_file", "w") f.write(configStr) f.close() $$ language plpythonu;
CREATE
SELECT dbid, role, preferred_role, content, mode, status FROM gp_segment_configuration order by dbid;
dbid | role | preferred_role | content | mode | status
------+------+----------------+---------+------+--------
1 | p | p | -1 | n | u
2 | p | p | 0 | s | u
3 | p | p | 1 | s | u
4 | p | p | 2 | s | u
5 | m | m | 0 | s | u
6 | m | m | 1 | s | u
7 | m | m | 2 | s | u
8 | m | m | -1 | s | u
(8 rows)
-- stop a primary in order to trigger a mirror promotion
select pg_ctl((select datadir from gp_segment_configuration c where c.role='p' and c.content=1), 'stop');
pg_ctl
--------
OK
(1 row)
-- trigger failover
select gp_request_fts_probe_scan();
gp_request_fts_probe_scan
---------------------------
t
(1 row)
-- wait for content 1 (earlier mirror, now primary) to finish the promotion
1U: select 1;
?column?
----------
1
(1 row)
-- Quit this utility mode session, as need to start fresh one below
1Uq: ... <quitting>
-- make the dbid in gp_segment_configuration not continuous
-- dbid=2 corresponds to content 0 and role p, change it to dbid=9
set allow_system_table_mods to true;
SET
update gp_segment_configuration set dbid=9 where content=0 and role='p';
UPDATE 1
-- trigger failover
select gp_request_fts_probe_scan();
gp_request_fts_probe_scan
---------------------------
t
(1 row)
-- wait for content 0 (earlier mirror, now primary) to finish the promotion
0U: select 1;
?column?
----------
1
(1 row)
-- Quit this utility mode session, as need to start fresh one below
0Uq: ... <quitting>
-- generate recover config file
select generate_recover_config_file( (select datadir from gp_segment_configuration c where c.role='m' and c.content=1), (select port from gp_segment_configuration c where c.role='m' and c.content=1)::text);
generate_recover_config_file
------------------------------
(1 row)
-- recover from config file, only seg with content=1 will be recovered
!\retcode gprecoverseg -a -i /tmp/recover_config_file;
-- start_ignore
-- end_ignore
(exited with code 0)
-- after gprecoverseg -i, the down segment should be up
-- in mirror mode
select status from gp_segment_configuration where role='m' and content=1;
status
--------
u
(1 row)
-- recover should reuse the old dbid and not occupy dbid=2
select dbid from gp_segment_configuration where dbid=2;
dbid
------
(0 rows)
update gp_segment_configuration set dbid=2 where dbid=9;
UPDATE 1
set allow_system_table_mods to false;
SET
-- we manually changed dbid from 2 to 9, which caused the
-- corresponding segment to go down as well, so perform a full
-- recovery here
!\retcode gprecoverseg -a;
-- start_ignore
-- end_ignore
(exited with code 0)
-- rebalance the cluster
!\retcode gprecoverseg -ar;
-- start_ignore
-- end_ignore
(exited with code 0)
-- recheck gp_segment_configuration after rebalance
SELECT dbid, role, preferred_role, content, mode, status FROM gp_segment_configuration order by dbid;
dbid | role | preferred_role | content | mode | status
------+------+----------------+---------+------+--------
1 | p | p | -1 | n | u
2 | p | p | 0 | s | u
3 | p | p | 1 | s | u
4 | p | p | 2 | s | u
5 | m | m | 0 | s | u
6 | m | m | 1 | s | u
7 | m | m | 2 | s | u
8 | m | m | -1 | s | u
(8 rows)
-- remove the config file
!\retcode rm /tmp/recover_config_file
......@@ -186,6 +186,7 @@ test: segwalrep/die_commit_pending_replication
test: fts_errors
test: segwalrep/commit_blocking
test: segwalrep/fts_unblock_primary
test: segwalrep/recoverseg_from_file
test: segwalrep/mirror_promotion
test: segwalrep/cancel_commit_pending_replication
test: segwalrep/twophase_tolerance_with_mirror_promotion
......
-- Test that gprecoverseg from a config file uses the correct dbid.
--
-- In github issue 9837 the dbid in gp_segment_configuration is not
-- consistent with the dbid in the file internal.auto.conf.
-- This is caused by gprecoverseg fetching the smallest dbid in
-- gp_segment_configuration that is not occupied by others when
-- adding a new mirror. When the dbids in gp_segment_configuration are
-- not continuous, the inconsistency will occur.
include: helpers/server_helpers.sql;
--
-- generate_recover_config_file:
-- Generate the config file consumed by "gprecoverseg -i".
-- datadir/port identify the failed segment; the same host|port|datadir
-- triple is repeated for the recovery target, i.e. a single in-place
-- recovery line of the form
--   <host>|<port>|<datadir> <host>|<port>|<datadir>
-- is written to /tmp/recover_config_file.
--
create or replace function generate_recover_config_file(datadir text, port text)
returns void as $$
import io
import os
myhost = os.uname()[1]
inplaceConfig = myhost + '|' + port + '|' + datadir
configStr = inplaceConfig + ' ' + inplaceConfig
f = open("/tmp/recover_config_file", "w")
f.write(configStr)
f.close()
$$ language plpythonu;
-- Record the initial cluster layout so it can be compared after rebalance.
SELECT dbid, role, preferred_role, content, mode, status FROM gp_segment_configuration order by dbid;
-- stop a primary in order to trigger a mirror promotion
select pg_ctl((select datadir from gp_segment_configuration c
where c.role='p' and c.content=1), 'stop');
-- trigger failover
select gp_request_fts_probe_scan();
-- wait for content 1 (earlier mirror, now primary) to finish the promotion
1U: select 1;
-- Quit this utility mode session, as need to start fresh one below
1Uq:
-- make the dbid in gp_segment_configuration not continuous
-- dbid=2 corresponds to content 0 and role p, change it to dbid=9
-- (requires allow_system_table_mods to edit the catalog directly)
set allow_system_table_mods to true;
update gp_segment_configuration set dbid=9 where content=0 and role='p';
-- trigger failover
select gp_request_fts_probe_scan();
-- wait for content 0 (earlier mirror, now primary) to finish the promotion
0U: select 1;
-- Quit this utility mode session, as need to start fresh one below
0Uq:
-- generate recover config file for the down mirror of content 1
select generate_recover_config_file(
(select datadir from gp_segment_configuration c where c.role='m' and c.content=1),
(select port from gp_segment_configuration c where c.role='m' and c.content=1)::text);
-- recover from config file, only seg with content=1 will be recovered
!\retcode gprecoverseg -a -i /tmp/recover_config_file;
-- after gprecoverseg -i, the down segment should be up
-- in mirror mode
select status from gp_segment_configuration
where role='m' and content=1;
-- recover should reuse the old dbid and not occupy dbid=2
-- (expect zero rows: dbid=2 must remain free, proving the fix for issue 9837)
select dbid from gp_segment_configuration where dbid=2;
update gp_segment_configuration set dbid=2 where dbid=9;
set allow_system_table_mods to false;
-- we manually changed dbid from 2 to 9, which caused the
-- corresponding segment to go down as well, so perform a full
-- recovery here
!\retcode gprecoverseg -a;
-- rebalance the cluster
!\retcode gprecoverseg -ar;
-- recheck gp_segment_configuration after rebalance
SELECT dbid, role, preferred_role, content, mode, status FROM gp_segment_configuration order by dbid;
-- remove the config file
!\retcode rm /tmp/recover_config_file
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册