Commit 6e1281db authored by Q qinzuoyan and committed by WeijieSun

improve onebox layout; improve pegasus_rolling_update.sh; add pegasus_offline_node.sh

Summary: Ref T10570

Test Plan: N/A

Reviewers: sunweijie, cailiuyang

Reviewed By: sunweijie

Subscribers: #pegasus

Maniphest Tasks: T10570

Differential Revision: https://phabricator.d.xiaomi.net/D83607

Conflicts:
	rdsn
Parent 456200bb
-Subproject commit 3c46af2ec013abeb20e6a0d2ec6b16bc6d0e4758
+Subproject commit 2b3cbff75c10fd04998d59ea89b560876ce5567c
@@ -414,39 +414,41 @@ function run_start_onebox()
echo "ERROR: file ${DSN_ROOT}/bin/pegasus_server/pegasus_server not exist"
exit -1
fi
if ps -ef | grep ' \./pegasus_server config.ini' | grep -E 'app_list meta@|app_list replica@'; then
if ps -ef | grep ' /pegasus_server config.ini' | grep -E 'app_list meta|app_list replica'; then
echo "ERROR: some onebox processes are running, start failed"
exit -1
fi
ln -s -f ${DSN_ROOT}/bin/pegasus_server/pegasus_server
run_start_zk
sed "s/@LOCAL_IP@/`hostname -i`/g;s/@META_COUNT@/${META_COUNT}/g;s/@REPLICA_COUNT@/${REPLICA_COUNT}/g;s/@APP_NAME@/${APP_NAME}/g;s/@PARTITION_COUNT@/${PARTITION_COUNT}/g" \
sed "s/@LOCAL_IP@/`hostname -i`/g;s/@APP_NAME@/${APP_NAME}/g;s/@PARTITION_COUNT@/${PARTITION_COUNT}/g" \
${ROOT}/src/server/config-server.ini >${ROOT}/config-server.ini
echo "starting server"
mkdir -p onebox
cd onebox
for i in $(seq ${META_COUNT})
do
meta_port=$((34600+i))
mkdir -p meta$i;
cd meta$i
ln -s -f ${DSN_ROOT}/bin/pegasus_server/pegasus_server pegasus_server
ln -s -f ${ROOT}/config-server.ini config.ini
echo "cd `pwd` && ./pegasus_server config.ini -app_list meta@$i &>result &"
./pegasus_server config.ini -app_list meta@$i &>result &
sed "s/@META_PORT@/$meta_port/;s/@REPLICA_PORT@/34800/" ${ROOT}/config-server.ini >config.ini
echo "cd `pwd` && ../meta$i/pegasus_server config.ini -app_list meta &>result &"
../meta$i/pegasus_server config.ini -app_list meta &>result &
PID=$!
ps -ef | grep ' \./pegasus_server config.ini' | grep "\<$PID\>"
ps -ef | grep '/pegasus_server config.ini' | grep "\<$PID\>"
cd ..
done
for j in $(seq ${REPLICA_COUNT})
do
replica_port=$((34800+j))
mkdir -p replica$j
cd replica$j
ln -s -f ${DSN_ROOT}/bin/pegasus_server/pegasus_server pegasus_server
ln -s -f ${ROOT}/config-server.ini config.ini
echo "cd `pwd` && ./pegasus_server config.ini -app_list replica@$j &>result &"
./pegasus_server config.ini -app_list replica@$j &>result &
sed "s/@META_PORT@/34600/;s/@REPLICA_PORT@/$replica_port/" ${ROOT}/config-server.ini >config.ini
echo "cd `pwd` && ../replica$j/pegasus_server config.ini -app_list replica &>result &"
../replica$j/pegasus_server config.ini -app_list replica &>result &
PID=$!
ps -ef | grep ' \./pegasus_server config.ini' | grep "\<$PID\>"
ps -ef | grep '/pegasus_server config.ini' | grep "\<$PID\>"
cd ..
done
}
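The new layout renders a separate config.ini per instance, with its own ports substituted, instead of symlinking one shared file; this is what lets each meta and replica bind a distinct port. A minimal sketch of that templating step, assuming the @META_PORT@/@REPLICA_PORT@ placeholders from config-server.ini (the three-meta loop is illustrative):

    # Sketch: render one config per meta instance from the shared template.
    # Meta i listens on 34600+i and reaches replicas on 34800, as above.
    for i in 1 2 3; do
        mkdir -p onebox/meta$i
        sed "s/@META_PORT@/$((34600+i))/;s/@REPLICA_PORT@/34800/" \
            config-server.ini >onebox/meta$i/config.ini
    done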
@@ -478,7 +480,7 @@ function run_stop_onebox()
esac
shift
done
ps -ef | grep ' \./pegasus_server config.ini' | grep -E 'app_list meta@|app_list replica@' | awk '{print $2}' | xargs kill &>/dev/null
ps -ef | grep '/pegasus_server config.ini' | grep -E 'app_list meta|app_list replica' | awk '{print $2}' | xargs kill &>/dev/null
}
#####################
@@ -508,7 +510,7 @@ function run_list_onebox()
esac
shift
done
ps -ef | grep ' \./pegasus_server config.ini' | grep -E 'app_list meta@|app_list replica@' | sort -k11
ps -ef | grep '/pegasus_server config.ini' | grep -E 'app_list meta|app_list replica' | sort -k11
}
#####################
@@ -598,17 +600,17 @@ function run_start_onebox_instance()
echo "ERROR: invalid meta_id"
exit -1
fi
if ps -ef | grep ' \./pegasus_server config.ini' | grep "app_list meta@$META_ID\>" ; then
if ps -ef | grep "/meta$META_ID/pegasus_server config.ini" | grep "app_list meta" ; then
echo "INFO: meta@$META_ID already running"
exit -1
fi
cd $dir
echo "cd `pwd` && ./pegasus_server config.ini -app_list meta@$META_ID &>result &"
./pegasus_server config.ini -app_list meta@$META_ID &>result &
echo "cd `pwd` && ../meta$META_ID/pegasus_server config.ini -app_list meta &>result &"
../meta$META_ID/pegasus_server config.ini -app_list meta &>result &
PID=$!
ps -ef | grep ' \./pegasus_server config.ini' | grep "\<$PID\>"
ps -ef | grep '/pegasus_server config.ini' | grep "\<$PID\>"
cd ..
echo "INFO: meta@$META started"
echo "INFO: meta@$META_ID started"
fi
if [ $REPLICA_ID != "0" ]; then
dir=onebox/replica$REPLICA_ID
@@ -616,15 +618,15 @@ function run_start_onebox_instance()
echo "ERROR: invalid replica_id"
exit -1
fi
if ps -ef | grep ' \./pegasus_server config.ini' | grep "app_list replica@$REPLICA_ID\>" ; then
if ps -ef | grep "/replica$REPLICA_ID/pegasus_server config.ini" | grep "app_list replica" ; then
echo "INFO: replica@$REPLICA_ID already running"
exit -1
fi
cd $dir
echo "cd `pwd` && ./pegasus_server config.ini -app_list replica@$REPLICA_ID &>result &"
./pegasus_server config.ini -app_list replica@$REPLICA_ID &>result &
echo "cd `pwd` && ../replica$REPLICA_ID/pegasus_server config.ini -app_list replica &>result &"
../replica$REPLICA_ID/pegasus_server config.ini -app_list replica &>result &
PID=$!
ps -ef | grep ' \./pegasus_server config.ini' | grep "\<$PID\>"
ps -ef | grep '/pegasus_server config.ini' | grep "\<$PID\>"
cd ..
echo "INFO: replica@$REPLICA_ID started"
fi
@@ -685,11 +687,11 @@ function run_stop_onebox_instance()
echo "ERROR: invalid meta_id"
exit -1
fi
if ! ps -ef | grep ' \./pegasus_server config.ini' | grep "app_list meta@$META_ID\>" ; then
if ! ps -ef | grep "/meta$META_ID/pegasus_server config.ini" | grep "app_list meta" ; then
echo "INFO: meta@$META_ID is not running"
exit -1
fi
ps -ef | grep ' \./pegasus_server config.ini' | grep "app_list meta@$META_ID\>" | awk '{print $2}' | xargs kill &>/dev/null
ps -ef | grep "/meta$META_ID/pegasus_server config.ini" | grep "app_list meta" | awk '{print $2}' | xargs kill &>/dev/null
echo "INFO: meta@$META_ID stopped"
fi
if [ $REPLICA_ID != "0" ]; then
@@ -698,11 +700,11 @@ function run_stop_onebox_instance()
echo "ERROR: invalid replica_id"
exit -1
fi
if ! ps -ef | grep ' \./pegasus_server config.ini' | grep "app_list replica@$REPLICA_ID\>" ; then
if ! ps -ef | grep "/replica$REPLICA_ID/pegasus_server config.ini" | grep "app_list replica" ; then
echo "INFO: replica@$REPLICA_ID is not running"
exit -1
fi
ps -ef | grep ' \./pegasus_server config.ini' | grep "app_list replica@$REPLICA_ID\>" | awk '{print $2}' | xargs kill &>/dev/null
ps -ef | grep "/replica$REPLICA_ID/pegasus_server config.ini" | grep "app_list replica" | awk '{print $2}' | xargs kill &>/dev/null
echo "INFO: replica@$REPLICA_ID stopped"
fi
}
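Since the -app_list meta@$i suffix no longer appears on the command line, these functions now tell instances apart by the per-directory binary path that ps reports (../meta$i/pegasus_server). A hedged one-liner showing the idea, with meta2 as an arbitrary example:

    # Sketch: the instance's cwd-specific path identifies it; grep -v grep
    # drops the grep process itself from the listing.
    ps -ef | grep '/meta2/pegasus_server config.ini' | grep -v grep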
......
cluster_info
server_info
ls -d
nodes -d
app_stat
query_backup_policy -p every_day
@@ -70,6 +70,7 @@ do
if [ "$type" = "run" ]
then
cat /tmp/pegasus.cmd.$app
cat /tmp/pegasus.cmd.$app | ./run.sh shell --cluster $cluster 2>/dev/null
echo
echo
......
#!/bin/bash
#
# Offline replica server using minos.
#
if [ $# -le 2 ]; then
echo "USAGE: $0 <cluster-name> <cluster-meta-list> <start_task_id>"
echo
echo "For example:"
echo " $0 onebox 127.0.0.1:34601,127.0.0.1:34602 0"
echo
exit -1
fi
cluster=$1
meta_list=$2
start_task_id=$3
pwd="$( cd "$( dirname "$0" )" && pwd )"
shell_dir="$( cd $pwd/.. && pwd )"
minos_config_dir=$(dirname $MINOS_CONFIG_FILE)/xiaomi-config/conf/pegasus
minos_client_dir=/home/work/pegasus/infra/minos/client
cd $shell_dir
minos_config=$minos_config_dir/pegasus-${cluster}.cfg
if [ ! -f $minos_config ]; then
echo "ERROR: minos config \"$minos_config\" not found"
exit -1
fi
minos_client=$minos_client_dir/deploy
if [ ! -f $minos_client ]; then
echo "ERROR: minos client \"$minos_client\" not found"
exit -1
fi
echo "Start time: `date`"
all_start_time=$((`date +%s`))
echo
echo "Generating /tmp/pegasus.offline_node.minos.show..."
cd $minos_client_dir
./deploy show pegasus $cluster &>/tmp/pegasus.offline_node.minos.show
echo "Generating /tmp/pegasus.offline_node.rs.list..."
grep 'Showing task [0-9][0-9]* of replica' /tmp/pegasus.offline_node.minos.show | awk '{print $5,$9}' | sed 's/(.*)$//' >/tmp/pegasus.offline_node.rs.list
replica_server_count=`cat /tmp/pegasus.offline_node.rs.list | wc -l`
if [ $replica_server_count -eq 0 ]; then
echo "ERROR: replica server count is 0 by minos show"
exit -1
fi
cd $shell_dir
echo "Generating /tmp/pegasus.offline_node.cluster_info..."
echo cluster_info | ./run.sh shell --cluster $meta_list &>/tmp/pegasus.offline_node.cluster_info
cname=`grep zookeeper_root /tmp/pegasus.offline_node.cluster_info | grep -o '/[^/]*$' | grep -o '[^/]*$'`
if [ "$cname" != "$cluster" ]; then
echo "ERROR: cluster name and meta list not matched"
exit -1
fi
pmeta=`grep primary_meta_server /tmp/pegasus.offline_node.cluster_info | grep -o '[0-9.:]*$'`
if [ "$pmeta" == ""]; then
echo "ERROR: extract primary_meta_server by shell failed"
exit -1
fi
echo "Generating /tmp/pegasus.offline_node.nodes..."
echo nodes | ./run.sh shell --cluster $meta_list &>/tmp/pegasus.offline_node.nodes
rs_port=`grep '^[0-9.]*:' /tmp/pegasus.offline_node.nodes | head -n 1 | grep -o ':[0-9]*' | grep -o '[0-9]*'`
if [ "$rs_port" == "" ]; then
echo "ERROR: extract replica server port by shell failed"
exit -1
fi
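The extractions above all chain grep -o passes, each narrowing the previous match. A small illustration; the input line is fabricated to mirror the shape of the nodes output:

    # Sketch: first pass keeps ':34801', second pass strips the colon.
    echo '10.0.0.1:34801 ALIVE' | grep -o ':[0-9]*' | grep -o '[0-9]*'
    # prints 34801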
echo "Set meta level to steady..."
echo "set_meta_level steady" | ./run.sh shell --cluster $meta_list &>/tmp/pegasus.offline_node.set_meta_level
set_ok=`grep 'control meta level ok' /tmp/pegasus.offline_node.set_meta_level | wc -l`
if [ $set_ok -ne 1 ]; then
echo "ERROR: set meta level to steady failed"
exit -1
fi
echo "Set lb.assign_delay_ms to 10..."
echo "remote_command -l $pmeta meta.lb.assign_delay_ms 10" | ./run.sh shell --cluster $meta_list &>/tmp/pegasus.offline_node.assign_delay_ms
set_ok=`grep OK /tmp/pegasus.offline_node.assign_delay_ms | wc -l`
if [ $set_ok -ne 1 ]; then
echo "ERROR: set lb.assign_delay_ms to 10 failed"
exit -1
fi
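Both remote_command settings follow the same set-and-verify shape: run the command through the shell, capture the output, and require exactly one acknowledgement line. A sketch of that pattern as a helper; the function name is invented for illustration:

    # Hypothetical helper wrapping the set-and-verify pattern used above.
    set_lb_option() {
        local opt=$1 val=$2 log=/tmp/pegasus.offline_node.$opt
        echo "remote_command -l $pmeta meta.lb.$opt $val" \
            | ./run.sh shell --cluster $meta_list &>$log
        if [ `grep -c OK $log` -ne 1 ]; then
            echo "ERROR: set lb.$opt to $val failed"
            exit -1
        fi
    }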
echo
while read line
do
task_id=`echo $line | awk '{print $1}'`
if [ $task_id -ne $start_task_id ]; then
continue
fi
node_str=`echo $line | awk '{print $2}'`
node_ip=`getent hosts $node_str | awk '{print $1}'`
node_name=`getent hosts $node_str | awk '{print $2}'`
node=${node_ip}:${rs_port}
echo "=================================================================="
echo "=================================================================="
echo "Offline replica server task $task_id of [$node_name] [$node]..."
echo
echo "Getting serving replica count..."
serving_replica_count=`echo 'nodes -d' | ./run.sh shell --cluster $meta_list | grep $node | awk '{print $3}'`
echo "servicing_replica_count=$serving_replica_count"
echo
echo "Migrating primary replicas out of node..."
./run.sh migrate_node -c $meta_list -n $node -t run &>/tmp/pegasus.offline_node.migrate_node
echo "Wait [$node] to migrate done..."
echo "Refer to /tmp/pegasus.offline_node.migrate_node for details"
while true
do
pri_count=`echo 'nodes -d' | ./run.sh shell --cluster $meta_list | grep $node | awk '{print $4}'`
if [ $pri_count -eq 0 ]; then
echo "Migrate done."
break
else
echo "Still $pri_count primary replicas left on $node"
sleep 1
fi
done
echo
sleep 1
echo "Downgrading replicas on node..."
./run.sh downgrade_node -c $meta_list -n $node -t run &>/tmp/pegasus.offline_node.downgrade_node
echo "Wait [$node] to downgrade done..."
echo "Refer to /tmp/pegasus.offline_node.downgrade_node for details"
while true
do
rep_count=`echo 'nodes -d' | ./run.sh shell --cluster $meta_list | grep $node | awk '{print $3}'`
if [ $rep_count -eq 0 ]; then
echo "Downgrade done."
break
else
echo "Still $rep_count replicas left on $node"
sleep 1
fi
done
echo
sleep 1
echo "Send kill_partition to node..."
grep '^propose ' /tmp/pegasus.offline_node.downgrade_node >/tmp/pegasus.offline_node.downgrade_node.propose
while read line2
do
gpid=`echo $line2 | awk '{print $3}' | sed 's/\./ /'`
echo "remote_command -l $node replica.kill_partition $gpid" | ./run.sh shell --cluster $meta_list &>/tmp/pegasus.offline_node.kill_partition
done </tmp/pegasus.offline_node.downgrade_node.propose
echo "Sent kill_partition to `cat /tmp/pegasus.offline_node.downgrade_node.propose | wc -l` partitions"
echo
sleep 1
echo "Stop node by minos..."
cd $minos_client_dir
./deploy stop pegasus $cluster --skip_confirm --job replica --task $task_id
cd $shell_dir
echo "Stop node by minos done."
echo
sleep 1
echo "Wait cluster to become healthy..."
while true
do
unhealthy_count=`echo "ls -d" | ./run.sh shell --cluster $meta_list | awk 'BEGIN{s=0} f{ if($NF<7){f=0} else if($3!=$4){s=s+$5+$6} } /fully_healthy_num/{f=1} END{print s}'`
if [ $unhealthy_count -eq 0 ]; then
echo "Cluster becomes healthy"
break
else
echo "Cluster not healthy, unhealthy_partition_count = $unhealthy_count"
sleep 10
fi
done
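The awk program starts counting only after the fully_healthy_num line and sums the unhealthy columns of every app row whose fully-healthy count differs from its partition count. A schematic check, assuming a column layout in which $3/$4 are the partition/fully-healthy counts and $5/$6 the unhealthy buckets (the sample rows are fabricated):

    printf 'fully_healthy_num : 1\napp1 AVAILABLE 8 8 0 0 7\napp2 AVAILABLE 8 6 1 1 7\n' \
        | awk 'BEGIN{s=0} f{ if($NF<7){f=0} else if($3!=$4){s=s+$5+$6} } /fully_healthy_num/{f=1} END{print s}'
    # prints 2: app2 has 1+1 unhealthy partitions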
echo
sleep 1
done </tmp/pegasus.offline_node.rs.list
echo "Set lb.assign_delay_ms to DEFAULT..."
echo "remote_command -l $pmeta meta.lb.assign_delay_ms DEFAULT" | ./run.sh shell --cluster $meta_list &>/tmp/pegasus.offline_node.assign_delay_ms
set_ok=`grep OK /tmp/pegasus.offline_node.assign_delay_ms | wc -l`
if [ $set_ok -ne 1 ]; then
echo "ERROR: set lb.assign_delay_ms to DEFAULT failed"
exit -1
fi
echo
all_finish_time=$((`date +%s`))
echo "Offline replica server task $start_task_id done."
echo "Elapsed time is $((all_finish_time - all_start_time)) seconds."
@@ -69,6 +69,11 @@ if [ "$cname" != "$cluster" ]; then
echo "ERROR: cluster name and meta list not matched"
exit -1
fi
pmeta=`grep primary_meta_server /tmp/pegasus.rolling_update.cluster_info | grep -o '[0-9.:]*$'`
if [ "$pmeta" == "" ]; then
echo "ERROR: extract primary_meta_server by shell failed"
exit -1
fi
echo "Generating /tmp/pegasus.rolling_update.nodes..."
echo nodes | ./run.sh shell --cluster $meta_list &>/tmp/pegasus.rolling_update.nodes
@@ -108,6 +113,14 @@ do
echo "servicing_replica_count=$serving_replica_count"
echo
echo "Set lb.add_secondary_max_count_for_one_node to 0..."
echo "remote_command -l $pmeta meta.lb.add_secondary_max_count_for_one_node 0" | ./run.sh shell --cluster $meta_list &>/tmp/pegasus.rolling_update.add_secondary_max_count_for_one_node
set_ok=`grep OK /tmp/pegasus.rolling_update.add_secondary_max_count_for_one_node | wc -l`
if [ $set_ok -ne 1 ]; then
echo "ERROR: set lb.add_secondary_max_count_for_one_node to 0 failed"
exit -1
fi
echo "Migrating primary replicas out of node..."
./run.sh migrate_node -c $meta_list -n $node -t run &>/tmp/pegasus.rolling_update.migrate_node
echo "Wait [$node] to migrate done..."
@@ -124,7 +137,7 @@ do
fi
done
echo
sleep 3
sleep 1
echo "Downgrading replicas on node..."
./run.sh downgrade_node -c $meta_list -n $node -t run &>/tmp/pegasus.rolling_update.downgrade_node
@@ -142,7 +155,52 @@ do
fi
done
echo
sleep 3
sleep 1
echo "Send kill_partition to node..."
grep '^propose ' /tmp/pegasus.rolling_update.downgrade_node >/tmp/pegasus.rolling_update.downgrade_node.propose
while read line2
do
gpid=`echo $line2 | awk '{print $3}' | sed 's/\./ /'`
echo "remote_command -l $node replica.kill_partition $gpid" | ./run.sh shell --cluster $meta_list &>/tmp/pegasus.rolling_update.kill_partition
done </tmp/pegasus.rolling_update.downgrade_node.propose
echo "Sent kill_partition to `cat /tmp/pegasus.rolling_update.downgrade_node.propose | wc -l` partitions"
echo
sleep 1
echo "Checking replicas closed on node..."
sleeped=0
while true
do
echo "remote_command -l $node perf-counters '.*replica(Count)'" | ./run.sh shell --cluster $meta_list &>/tmp/pegasus.rolling_update.replica_count_perf_counters
serving_count=`grep -o 'replica_stub.replica(Count)","type":"NUMBER","value":[0-9]*' /tmp/pegasus.rolling_update.replica_count_perf_counters | grep -o '[0-9]*$'`
opening_count=`grep -o 'replica_stub.opening.replica(Count)","type":"NUMBER","value":[0-9]*' /tmp/pegasus.rolling_update.replica_count_perf_counters | grep -o '[0-9]*$'`
closing_count=`grep -o 'replica_stub.closing.replica(Count)","type":"NUMBER","value":[0-9]*' /tmp/pegasus.rolling_update.replica_count_perf_counters | grep -o '[0-9]*$'`
if [ "$serving_count" = "" -o "$opening_count" = "" -o "$closing_count" = "" ]; then
echo "ERROR: extract replica count from perf counters failed"
exit -1
fi
rep_count=$((serving_count + opening_count + closing_count))
if [ $rep_count -eq 0 -o $sleeped -gt 20 ]; then
break
else
echo "Still $rep_count replicas not closed on $node"
sleep 1
sleeped=$((sleeped+1))
fi
done
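Each counter value is pulled out of the perf-counters response with two grep -o passes: the first matches the counter name plus its value field, the second keeps only the trailing digits. An isolated check with a fabricated JSON fragment mirroring that pattern:

    echo '{"name":"replica_stub.replica(Count)","type":"NUMBER","value":42}' \
        | grep -o 'replica_stub.replica(Count)","type":"NUMBER","value":[0-9]*' \
        | grep -o '[0-9]*$'
    # prints 42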
echo
sleep 1
echo "remote_command -l $node flush-log" | ./run.sh shell --cluster $meta_list &>/dev/null
echo "Set lb.add_secondary_max_count_for_one_node to 100..."
echo "remote_command -l $pmeta meta.lb.add_secondary_max_count_for_one_node 100" | ./run.sh shell --cluster $meta_list &>/tmp/pegasus.rolling_update.add_secondary_max_count_for_one_node
set_ok=`grep OK /tmp/pegasus.rolling_update.add_secondary_max_count_for_one_node | wc -l`
if [ $set_ok -ne 1 ]; then
echo "ERROR: set lb.add_secondary_max_count_for_one_node to 100 failed"
exit -1
fi
echo "Rolling update by minos..."
cd $minos_client_dir
@@ -150,11 +208,7 @@ do
cd $shell_dir
echo "Rolling update by minos done."
echo
echo "Sleep 20 seconds for server restarting..."
sleep 20
echo "Sleep done."
echo
sleep 1
echo "Wait [$node] to become alive..."
while true
@@ -168,21 +222,21 @@ do
fi
done
echo
sleep 1
echo "Wait cluster to become healthy..."
while true
do
unhealthy_count=`echo "ls -d" | ./run.sh shell --cluster $meta_list | awk 'f{ if($NF<7){f=0} else if($3!=$4){print} } /fully_healthy_num/{f=1}' | wc -l`
if [ $unhealthy_count -eq 0 ]; then
echo "Cluster becomes healthy, sleep 10 seconds before stepping next..."
sleep 10
echo "Cluster becomes healthy."
break
else
sleep 1
fi
done
echo "Sleep done."
echo
sleep 1
finish_time=$((`date +%s`))
echo "Rolling update replica server task $task_id of [$node_name] [$node] done."
@@ -190,21 +244,28 @@ do
echo
if [ "$type" = "one" ]; then
echo "Finish time: `date`"
all_finish_time=$((`date +%s`))
echo "Rolling update one done, elasped time is $((all_finish_time - all_start_time)) seconds."
exit 0
break
fi
done </tmp/pegasus.rolling_update.rs.list
echo "=================================================================="
echo "=================================================================="
echo "Rolling update meta servers and collectors..."
cd $minos_client_dir
./deploy rolling_update pegasus $cluster --skip_confirm --time_interval 10 $update_options --job meta collector
cd $shell_dir
echo
echo "Set lb.add_secondary_max_count_for_one_node to DEFAULT..."
echo "remote_command -l $pmeta meta.lb.add_secondary_max_count_for_one_node DEFAULT" | ./run.sh shell --cluster $meta_list &>/tmp/pegasus.rolling_update.add_secondary_max_count_for_one_node
set_ok=`grep OK /tmp/pegasus.rolling_update.add_secondary_max_count_for_one_node | wc -l`
if [ $set_ok -ne 1 ]; then
echo "ERROR: set lb.add_secondary_max_count_for_one_node to DEFAULT failed"
exit -1
fi
if [ "$type" = "all" ]; then
echo "=================================================================="
echo "=================================================================="
echo "Rolling update meta servers and collectors..."
cd $minos_client_dir
./deploy rolling_update pegasus $cluster --skip_confirm --time_interval 10 $update_options --job meta collector
cd $shell_dir
echo
fi
echo "Finish time: `date`"
all_finish_time=$((`date +%s`))
echo "Rolling update all done, elasped time is $((all_finish_time - all_start_time)) seconds."
echo "Rolling update $type done, elasped time is $((all_finish_time - all_start_time)) seconds."
@@ -9,19 +9,19 @@ count = 1
type = meta
name = meta
arguments =
ports = 34601
ports = @META_PORT@
pools = THREAD_POOL_DEFAULT,THREAD_POOL_META_SERVER,THREAD_POOL_META_STATE,THREAD_POOL_FD,THREAD_POOL_DLOCK,THREAD_POOL_LOCAL_SERVICE,THREAD_POOL_FDS_SERVICE
run = true
count = @META_COUNT@
count = 1
[apps.replica]
type = replica
name = replica
arguments =
ports = 34801
ports = @REPLICA_PORT@
pools = THREAD_POOL_DEFAULT,THREAD_POOL_REPLICATION_LONG,THREAD_POOL_REPLICATION,THREAD_POOL_FD,THREAD_POOL_LOCAL_APP,THREAD_POOL_LOCAL_SERVICE,THREAD_POOL_FDS_SERVICE
run = true
count = @REPLICA_COUNT@
count = 1
[core]
;tool = simulator
......
@@ -36,7 +36,9 @@ static const char *s_brief_stat_mapper[] = {
"read_qps", "zion*profiler*RPC_L2_CLIENT_READ.qps",
"GET_P99(ns)", "zion*profiler*RPC_RRDB_RRDB_GET.latency.server",
"MULTI_GET_P99(ns)", "zion*profiler*RPC_RRDB_RRDB_MULTI_GET.latency.server",
"replica_count", "replica*eon.replica_stub*replica(Count)",
"serving_replica_count", "replica*eon.replica_stub*replica(Count)",
"opening_replica_count", "replica*eon.replica_stub*opening.replica(Count)",
"closing_replica_count", "replica*eon.replica_stub*closing.replica(Count)",
"commit_throughput", "replica*eon.replica_stub*replicas.commit.qps",
"learning_count", "replica*eon.replica_stub*replicas.learning.count",
"shared_log_size(MB)", "replica*eon.replica_stub*shared.log.size(MB)",
......
@@ -1558,6 +1558,16 @@ DEFINE_TASK_CODE(UPDATING_ROCKSDB_SSTSIZE, TASK_PRIORITY_COMMON, THREAD_POOL_REP
return ::dsn::ERR_OK;
}
if (!clear_state) {
rocksdb::FlushOptions options;
options.wait = true;
auto status = _db->Flush(options);
if (!status.ok()) {
derror("%s: flush memtable failed: %s",
replica_name(), status.ToString().c_str());
}
}
_context_cache.clear();
// when stopping the replica, the timer_task should be stopped as well.
@@ -1584,6 +1594,8 @@ DEFINE_TASK_CODE(UPDATING_ROCKSDB_SSTSIZE, TASK_PRIORITY_COMMON, THREAD_POOL_REP
_pfc_sst_size->set(0);
}
ddebug("%s: close app succeed, clear_state = %s",
replica_name(), clear_state ? "true" : "false");
return ::dsn::ERR_OK;
}
......
@@ -13,26 +13,37 @@ global_env global_env::inst;
global_env::global_env()
{
std::cout << "============" << std::endl << "start global_env()" << std::endl;
get_dirs();
get_hostip();
}
void global_env::get_dirs()
{
const char *cmd = "readlink /proc/`ps aux | grep pegasus_server | grep -v grep | grep @ | sed "
"-n \"1p\" | awk '{print $2}'`/cwd";
std::stringstream ss;
pipe_execute(cmd, ss);
const char *cmd1 = "ps aux | grep pegasus_server | grep meta1 | awk '{print $2}'";
std::stringstream ss1;
pipe_execute(cmd1, ss1);
int meta1_pid;
ss1 >> meta1_pid;
std::cout << "meta1 pid: " << meta1_pid << std::endl;
// get the dir of a process in onebox, say: $PEGASUS/onebox/meta1
char task_target[512];
ss >> task_target;
char cmd2[512];
sprintf(cmd2, "readlink /proc/%d/cwd", meta1_pid);
std::stringstream ss2;
pipe_execute(cmd2, ss2);
std::string meta1_dir;
ss2 >> meta1_dir;
std::cout << "meta1 dir: " << meta1_dir << std::endl;
_pegasus_root = dirname(dirname(task_target));
std::cout << "get project root: " << _pegasus_root << std::endl;
// dirname() may modify its argument, so copy into a mutable buffer first
// rather than casting away const on the string's internal data.
char root_buf[512];
snprintf(root_buf, sizeof(root_buf), "%s", meta1_dir.c_str());
_pegasus_root = dirname(dirname(root_buf));
std::cout << "project root: " << _pegasus_root << std::endl;
assert(_pegasus_root != ".");
char task_target[512];
assert(getcwd(task_target, sizeof(task_target)) != nullptr);
_working_dir = task_target;
std::cout << "working dir: " << _working_dir << std::endl;
}
void global_env::get_hostip()
@@ -40,7 +51,7 @@ void global_env::get_hostip()
std::stringstream output;
pipe_execute("hostname -i", output);
output >> _host_ip;
std::cout << "get host ip: " << _host_ip << std::endl;
std::cout << "host ip: " << _host_ip << std::endl;
}
/*static*/
......
@@ -165,7 +165,7 @@ public:
snprintf(command,
512,
"cd %s && sed -i \"/^recover_from_replica_server/c recover_from_replica_server = "
"true\" config-server.ini",
"true\" onebox/meta1/config.ini",
global_env::instance()._pegasus_root.c_str());
system(command);
}
@@ -175,10 +175,9 @@ public:
char command[512];
snprintf(command,
512,
"cd %s/onebox/replica%d/data/replica%d/reps && rm -rf %d.%d.pegasus",
"cd %s/onebox/replica%d/data/replica/reps && rm -rf %d.%d.pegasus",
global_env::instance()._pegasus_root.c_str(),
replica_id,
replica_id,
app_id,
partition_id);
std::cout << command << std::endl;
@@ -190,10 +189,9 @@ public:
char command[512];
snprintf(command,
512,
"cd %s/onebox/replica%d/data/replica%d/reps && rm -rf %d.*.pegasus",
"cd %s/onebox/replica%d/data/replica/reps && rm -rf %d.*.pegasus",
global_env::instance()._pegasus_root.c_str(),
replica_id,
replica_id,
app_id);
std::cout << command << std::endl;
system(command);
......
@@ -75,18 +75,16 @@ public:
virtual void TearDown() override
{
chdir(global_env::instance()._pegasus_root.c_str());
system("./run.sh clear_onebox");
std::this_thread::sleep_for(std::chrono::seconds(3));
// TODO: teardown must restore config-server.ini
system("git checkout -- src/server/config-server.ini");
system("./run.sh start_onebox");
std::cout << "sleep 10s to restart onebox" << std::endl;
std::this_thread::sleep_for(std::chrono::seconds(10));
std::string cmd = "rm -rf " + backup_data_dir;
system(cmd.c_str());
// go back to working dir
chdir(working_root_dir.c_str());
chdir(global_env::instance()._working_dir.c_str());
}
void write_data()
......
@@ -38,11 +38,22 @@ static void truncate_recent_file(const std::string &path)
std::string file_name;
ss >> file_length >> file_name;
std::cout << "get file with size: (" << file_name << ", " << file_length << ")" << std::endl;
std::cout << "truncate file with size: (" << file_name << ", " << file_length << ")"
<< std::endl;
snprintf(
command, 512, "truncate -s %lu %s/%s", file_length / 3, path.c_str(), file_name.c_str());
std::cout << command << std::endl;
system(command);
snprintf(command, 512, "ls -l %s/%s | awk '{print $5}'", path.c_str(), file_name.c_str());
std::stringstream ss2;
global_env::pipe_execute(command, ss2);
size_t new_file_length;
ss2 >> new_file_length;
ASSERT_LT(new_file_length, file_length);
std::cout << "after truncated file size: " << new_file_length << std::endl;
}
TEST(lost_log, slog)
@@ -84,7 +95,7 @@ TEST(lost_log, slog)
system("./run.sh stop_onebox");
std::cout << "truncate slog for replica1" << std::endl;
truncate_recent_file("onebox/replica1/data/replica1/slog");
truncate_recent_file("onebox/replica1/data/replica/slog");
std::cout << "restart onebox again" << std::endl;
system("./run.sh start_onebox");
......