Skip to content

Commit d31ffb1

Browse files
authored
Chaos Test: Fix Re-Incarnation (#77)
1 parent 403a65d commit d31ffb1

File tree

1 file changed

+89
-37
lines changed

1 file changed

+89
-37
lines changed

scripts/run.sh

Lines changed: 89 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,10 @@ function retry {
6363
return $exit
6464
fi
6565
count=$(($count + 1))
66+
retryfile="/scripts/retry-stop"
67+
if [ -e "$retryfile" ]; then
68+
return 0
69+
fi
6670
done
6771
return 0
6872
}
@@ -78,8 +82,8 @@ IFS=', ' read -r -a peers <<<"$hosts"
7882
echo "${peers[@]}"
7983
log "INFO" "hosts are ${peers[@]}"
8084

81-
report_host="$HOSTNAME.$GOV_SVC.$POD_NAMESPACE.svc"
82-
echo "report_host = $report_host "
85+
report_host="$HOSTNAME.$GOV_SVC.$POD_NAMESPACE"
86+
echo "report_host = $report_host"
8387

8488
# comma separated host names
8589
export hosts=$(echo -n ${peers[*]} | sed -e "s/ /,/g")
@@ -122,6 +126,8 @@ echo "!includedir /etc/mysql/conf.d/" >>/etc/mysql/my.cnf
122126
cat >>/etc/mysql/group-replication.conf.d/group.cnf <<EOL
123127
[mysqld]
124128
default-authentication-plugin=mysql_native_password
129+
log_error_suppression_list = 'MY-013360' #remove this message "Plugin mysql_native_password reported: ''mysql_native_password' is deprecated and will be removed in a future release. Please use caching_sha2_password instead'"
130+
#log_error_suppression_list = 'MY-013360,MY-011873,MY-011879' # NUMA related warnings
125131
disabled_storage_engines="MyISAM,BLACKHOLE,FEDERATED,ARCHIVE,MEMORY"
126132
127133
# General replication settings
@@ -136,6 +142,7 @@ binlog_format = ROW
136142
transaction_write_set_extraction = XXHASH64
137143
loose-group_replication_bootstrap_group = OFF
138144
loose-group_replication_start_on_boot = OFF
145+
loose_group_replication_unreachable_majority_timeout = 20
139146
140147
# recommended config
141148
innodb_buffer_pool_size = "$INNODB_BUFFER_POOL_SIZE"
@@ -192,6 +199,12 @@ function wait_for_mysqld_running() {
192199
exit 1
193200
fi
194201
log "INFO" "mysql daemon is ready to use......."
202+
203+
# Set read-only immediately after MySQL starts to prevent any external
204+
# process (e.g. KubeDB health checker) from writing local GTIDs before
205+
# the node joins GR. Cannot be set in my.cnf because it blocks --initialize.
206+
${mysql} -N -e "SET GLOBAL read_only=ON; SET GLOBAL super_read_only=ON;" 2>/dev/null
207+
log "INFO" "Set super_read_only=ON to prevent errant GTIDs"
195208
}
196209

197210
function create_replication_user() {
@@ -204,28 +217,40 @@ function create_replication_user() {
204217
local mysql="$mysql_header --host=$localhost"
205218

206219
# At first, ensure that the command executes without any error. Then, run the command again and extract the output.
207-
retry 120 ${mysql} -N -e "select count(host) from mysql.user where mysql.user.user='repl';" | awk '{print$1}'
220+
retry 60 ${mysql} -N -e "select count(host) from mysql.user where mysql.user.user='repl';" | awk '{print$1}'
208221
out=$(${mysql} -N -e "select count(host) from mysql.user where mysql.user.user='repl';" | awk '{print$1}')
209-
# if the user doesn't exist, crete new one.
222+
# if the user doesn't exist, create new one.
223+
# All operations run in a SINGLE session with SQL_LOG_BIN=0 to prevent
224+
# writing local GTIDs that would create errant transactions on rejoin.
210225
if [[ "$out" -eq "0" ]]; then
211226
log "INFO" "Replication user not found. Creating new replication user........"
212-
retry 120 ${mysql} -N -e "SET SQL_LOG_BIN=0;"
213-
retry 120 ${mysql} -N -e "CREATE USER 'repl'@'%' IDENTIFIED BY '$MYSQL_ROOT_PASSWORD' REQUIRE SSL;"
214-
retry 120 ${mysql} -N -e "GRANT REPLICATION SLAVE ON *.* TO 'repl'@'%';"
215-
# You must therefore give the `BACKUP_ADMIN` and `CLONE_ADMIN` privilege to this replication user on all group members that support cloning process
216-
# https://dev.mysql.com/doc/refman/8.0/en/group-replication-cloning.html
217-
# https://dev.mysql.com/doc/refman/8.0/en/clone-plugin-remote.html
218-
retry 120 ${mysql} -N -e "GRANT BACKUP_ADMIN ON *.* TO 'repl'@'%';"
219-
retry 120 ${mysql} -N -e "GRANT CLONE_ADMIN ON *.* TO 'repl'@'%';"
220-
retry 120 ${mysql} -N -e "FLUSH PRIVILEGES;"
221-
retry 120 ${mysql} -N -e "SET SQL_LOG_BIN=1;"
222-
223-
retry 120 ${mysql} -N -e "CHANGE MASTER TO MASTER_USER='repl', MASTER_PASSWORD='$MYSQL_ROOT_PASSWORD' FOR CHANNEL 'group_replication_recovery';"
224-
retry 120 ${mysql} -N -e "RESET MASTER;"
227+
retry 60 ${mysql} -N -e "
228+
SET SQL_LOG_BIN=0;
229+
SET GLOBAL super_read_only=OFF;
230+
SET GLOBAL read_only=OFF;
231+
CREATE USER 'repl'@'%' IDENTIFIED BY '$MYSQL_ROOT_PASSWORD' REQUIRE SSL;
232+
GRANT REPLICATION SLAVE ON *.* TO 'repl'@'%';
233+
GRANT BACKUP_ADMIN ON *.* TO 'repl'@'%';
234+
GRANT CLONE_ADMIN ON *.* TO 'repl'@'%';
235+
FLUSH PRIVILEGES;
236+
CHANGE MASTER TO MASTER_USER='repl', MASTER_PASSWORD='$MYSQL_ROOT_PASSWORD' FOR CHANNEL 'group_replication_recovery';
237+
RESET MASTER;
238+
SET GLOBAL read_only=ON;
239+
SET GLOBAL super_read_only=ON;
240+
SET SQL_LOG_BIN=1;
241+
"
225242
else
226243
log "INFO" "Replication user exists. Skipping creating new one......."
227244
# Update replication channel password if it has been changed via RotateAuth
228-
retry 120 ${mysql} -N -e "CHANGE MASTER TO MASTER_USER='repl', MASTER_PASSWORD='$MYSQL_ROOT_PASSWORD' FOR CHANNEL 'group_replication_recovery';"
245+
retry 60 ${mysql} -N -e "
246+
SET SQL_LOG_BIN=0;
247+
SET GLOBAL super_read_only=OFF;
248+
SET GLOBAL read_only=OFF;
249+
CHANGE MASTER TO MASTER_USER='repl', MASTER_PASSWORD='$MYSQL_ROOT_PASSWORD' FOR CHANNEL 'group_replication_recovery';
250+
SET GLOBAL read_only=ON;
251+
SET GLOBAL super_read_only=ON;
252+
SET SQL_LOG_BIN=1;
253+
"
229254
fi
230255
touch /scripts/ready.txt
231256
}
@@ -235,14 +260,14 @@ function install_group_replication_plugin() {
235260
local mysql="$mysql_header --host=$localhost"
236261

237262
# At first, ensure that the command executes without any error. Then, run the command again and extract the output.
238-
retry 120 ${mysql} -N -e 'SHOW PLUGINS;' | grep group_replication
263+
retry 60 ${mysql} -N -e 'SHOW PLUGINS;' | grep group_replication
239264
out=$(${mysql} -N -e 'SHOW PLUGINS;' | grep group_replication)
240265
if [[ -z "$out" ]]; then
241266
log "INFO" "Group replication plugin is not installed. Installing the plugin...."
242267
# The replication plugin is installed when the member is bootstrapped or joins the group for the first time.
243268
# That's why the `joining_for_first_time` variable is set to 1, so that the later reset steps are performed.
244269
joining_for_first_time=1
245-
retry 120 ${mysql} -e "INSTALL PLUGIN group_replication SONAME 'group_replication.so';"
270+
retry 60 ${mysql} -e "SET SQL_LOG_BIN=0; SET GLOBAL super_read_only=OFF; SET GLOBAL read_only=OFF; INSTALL PLUGIN group_replication SONAME 'group_replication.so'; SET GLOBAL read_only=ON; SET GLOBAL super_read_only=ON; SET SQL_LOG_BIN=1;"
246271
log "INFO" "Group replication plugin successfully installed"
247272
else
248273
log "INFO" "Already group replication plugin is installed"
@@ -254,24 +279,30 @@ function install_clone_plugin() {
254279
local mysql="$mysql_header --host=$localhost"
255280

256281
# At first, ensure that the command executes without any error. Then, run the command again and extract the output.
257-
retry 120 ${mysql} -N -e 'SHOW PLUGINS;' | grep clone
282+
retry 60 ${mysql} -N -e 'SHOW PLUGINS;' | grep clone
258283
out=$(${mysql} -N -e 'SHOW PLUGINS;' | grep clone)
259284
if [[ -z "$out" ]]; then
260285
log "INFO" "Clone plugin is not installed. Installing the plugin..."
261-
retry 120 ${mysql} -e "INSTALL PLUGIN clone SONAME 'mysql_clone.so';"
286+
retry 60 ${mysql} -e "SET SQL_LOG_BIN=0; SET GLOBAL super_read_only=OFF; SET GLOBAL read_only=OFF; INSTALL PLUGIN clone SONAME 'mysql_clone.so'; SET GLOBAL read_only=ON; SET GLOBAL super_read_only=ON; SET SQL_LOG_BIN=1;"
262287
log "INFO" "Clone plugin successfully installed"
263288
else
264289
log "INFO" "Already clone plugin is installed"
265290
fi
266291
}
267292

268293
function check_member_list_updated() {
294+
269295
for host in $@; do
270296
local mysql="$mysql_header --host=$host"
271297
if [[ "$report_host" == "$host" ]]; then
272298
continue
273299
fi
274300
for i in {60..0}; do
301+
kill -0 $pid
302+
exit="$?"
303+
if [[ "$exit" != "0" ]]; then
304+
break
305+
fi
275306
alive_members_id=($(${mysql} -N -e "SELECT MEMBER_ID FROM performance_schema.replication_group_members WHERE MEMBER_STATE = 'ONLINE';"))
276307
alive_cluster_size=${#alive_members_id[@]}
277308
listed_members_id=($(${mysql} -N -e "SELECT MEMBER_ID FROM performance_schema.replication_group_members;"))
@@ -299,8 +330,13 @@ function wait_for_primary() {
299330
local is_primary_found=0
300331
for member_id in ${members_id[*]}; do
301332
for i in {60..0}; do
333+
kill -0 $pid
334+
exit="$?"
335+
if [[ "$exit" != "0" ]]; then
336+
break
337+
fi
302338
primary_member_id=$(${mysql} -N -e "SHOW STATUS WHERE Variable_name = 'group_replication_primary_member';" | awk '{print $2}')
303-
log "INFO" "Attempt $i: Trying to find primary member........................"
339+
log "INFO" "Attempt $i: Trying to find primary member, from ${host}........................"
304340
if [[ -n "$primary_member_id" ]]; then
305341
is_primary_found=1
306342
primary_host=$(${mysql} -N -e "SELECT MEMBER_HOST FROM performance_schema.replication_group_members WHERE MEMBER_ID = '${primary_member_id}';" | awk '{print $1}')
@@ -330,14 +366,19 @@ function wait_for_primary() {
330366
# declare donors array for further use
331367
declare -a donors
332368
function set_valid_donors() {
369+
kill -0 $pid
370+
exit="$?"
371+
if [[ "$exit" != "0" ]]; then
372+
return
373+
fi
333374
log "INFO" "Checking whether valid donor is found or not. If found, set this to 'clone_valid_donor_list'"
334375
local mysql="$mysql_header --host=$localhost"
335376
# The clone process requires that the donor and recipient run the same MySQL server version (and, per the reference below, the same operating system and platform):
336377
# https://dev.mysql.com/doc/refman/8.0/en/clone-plugin-remote.html#:~:text=The%20clone%20plugin%20is%20supported,17%20and%20higher.&text=The%20donor%20and%20recipient%20MySQL%20server%20instances%20must%20run,same%20operating%20system%20and%20platform.
337378
report_host_version=$(${mysql} -N -e "SHOW VARIABLES LIKE 'version';" | awk '{print $2}')
338379

339380
# At first, ensure that the command executes without any error. Then, run the command again and extract the output.
340-
retry 120 ${mysql_header} --host=$primary_host -N -e "SELECT * FROM performance_schema.replication_group_members;"
381+
retry 60 ${mysql_header} --host=$primary_host -N -e "SELECT * FROM performance_schema.replication_group_members;"
341382

342383
donor_list=$(${mysql_header} --host=$primary_host -N -e "SELECT MEMBER_HOST FROM performance_schema.replication_group_members WHERE MEMBER_STATE = 'ONLINE';")
343384

@@ -367,7 +408,7 @@ function set_valid_donors() {
367408
valid_donors=$(echo -n ${donors[*]} | sed -e "s/ /:3306,/g" && echo -n ":3306")
368409
log "INFO" "Valid donors found. The list of valid donor are: ${valid_donors}"
369410
# https://dev.mysql.com/doc/refman/8.0/en/clone-plugin-options-variables.html#sysvar_clone_valid_donor_list
370-
retry 120 ${mysql} -N -e "SET GLOBAL clone_valid_donor_list='${valid_donors}';"
411+
retry 60 ${mysql} -N -e "SET GLOBAL clone_valid_donor_list='${valid_donors}';"
371412
fi
372413
}
373414

@@ -381,26 +422,33 @@ function bootstrap_cluster() {
381422
# ref: https://dev.mysql.com/doc/refman/8.0/en/group-replication-bootstrap.html
382423
local mysql="$mysql_header --host=$localhost"
383424
log "INFO" "bootstrapping cluster with host $report_host..."
425+
# Temporarily disable read-only for bootstrap operations.
426+
# GR will manage read-only after START GROUP_REPLICATION.
427+
retry 60 ${mysql} -N -e "SET GLOBAL super_read_only=OFF; SET GLOBAL read_only=OFF;"
384428
if [[ "$joining_for_first_time" == "1" ]]; then
385-
retry 120 ${mysql} -N -e "RESET MASTER;"
429+
retry 60 ${mysql} -N -e "RESET MASTER;"
386430
fi
387-
retry 120 ${mysql} -N -e "SET GLOBAL group_replication_bootstrap_group=ON;"
388-
retry 120 ${mysql} -N -e "START GROUP_REPLICATION;"
389-
retry 120 ${mysql} -N -e "SET GLOBAL group_replication_bootstrap_group=OFF;"
431+
retry 60 ${mysql} -N -e "SET GLOBAL group_replication_bootstrap_group=ON;"
432+
retry 60 ${mysql} -N -e "START GROUP_REPLICATION;"
433+
retry 60 ${mysql} -N -e "SET GLOBAL group_replication_bootstrap_group=OFF;"
390434
}
391435

392436
function join_into_cluster() {
393437
# member try to join into the existing group
394438
log "INFO" "The replica, ${report_host} is joining into the existing group..."
395439
local mysql="$mysql_header --host=$localhost"
396440

441+
# Temporarily disable read-only for join operations.
442+
# GR will manage read-only after START GROUP_REPLICATION.
443+
retry 60 ${mysql} -N -e "SET GLOBAL super_read_only=OFF; SET GLOBAL read_only=OFF;"
444+
397445
# For a first-time join, `RESET MASTER` must be run to set the binlog and GTID state to their initial position.
398446
# Then the clone process is run to copy data directly from a valid donor. That's why the pod will be restarted when joining the group replication for the first time.
399447
# https://dev.mysql.com/doc/refman/8.0/en/clone-plugin-remote.html
400448
export mysqld_alive=1
401449
if [[ "$joining_for_first_time" == "1" ]]; then
402450
log "INFO" "Resetting binlog & gtid to initial state as $report_host is joining for first time.."
403-
retry 120 ${mysql} -N -e "RESET MASTER;"
451+
retry 60 ${mysql} -N -e "RESET MASTER;"
404452
# The clone process runs when the joiner has a valid donor and the primary member's data size is greater than or equal to 128MB
405453
if [[ $valid_donor_found == 1 ]] && [[ $primary_db_size -ge 128 ]]; then
406454
for donor in ${donors[*]}; do
@@ -418,7 +466,7 @@ function join_into_cluster() {
418466
fi
419467

420468
# wait until the background `mysqld` process has been killed
421-
for i in {120..0}; do
469+
for i in {60..0}; do
422470
kill -0 $pid
423471
exit="$?"
424472
log "INFO" "Attempt $i: Checking mysqld(process id=$pid) is alive or not, exit code: $exit"
@@ -439,13 +487,13 @@ function join_into_cluster() {
439487
fi
440488
# If the host is still alive, it will join the cluster directly.
441489
if [[ $mysqld_alive == 1 ]]; then
442-
retry 120 ${mysql} -N -e "START GROUP_REPLICATION;"
490+
retry 60 ${mysql} -N -e "START GROUP_REPLICATION;"
443491
log "INFO" "Host (${report_host}) has joined to the group......."
444492
else
445493
#run mysqld in background since mysqld can't restart after a clone process
446494
start_mysqld_in_background
447495
wait_for_mysqld_running
448-
retry 120 ${mysql} -N -e "START GROUP_REPLICATION;"
496+
retry 60 ${mysql} -N -e "START GROUP_REPLICATION;"
449497
log "INFO" "Host (${report_host}) has joined to the group......."
450498
#
451499
fi
@@ -458,12 +506,16 @@ function join_by_clone() {
458506
log "INFO" "The replica, ${report_host} is joining into the existing group..."
459507
local mysql="$mysql_header --host=$localhost"
460508

509+
# Temporarily disable read-only for clone operations.
510+
# GR will manage read-only after START GROUP_REPLICATION.
511+
retry 60 ${mysql} -N -e "SET GLOBAL super_read_only=OFF; SET GLOBAL read_only=OFF;"
512+
461513
# For a first-time join, `RESET MASTER` must be run to set the binlog and GTID state to their initial position.
462514
# Then the clone process is run to copy data directly from a valid donor. That's why the pod will be restarted when joining the group replication for the first time.
463515
# https://dev.mysql.com/doc/refman/8.0/en/clone-plugin-remote.html
464516
export mysqld_alive=1
465517
log "INFO" "Resetting binlog & gtid to initial state as $report_host is joining for first time.."
466-
retry 120 ${mysql} -N -e "RESET MASTER;"
518+
retry 60 ${mysql} -N -e "RESET MASTER;"
467519
if [[ $valid_donor_found == 1 ]]; then
468520
for donor in ${donors[*]}; do
469521
log "INFO" "Cloning data from $donor to $report_host....."
@@ -480,7 +532,7 @@ function join_by_clone() {
480532
fi
481533

482534
# wait until the background `mysqld` process has been killed
483-
for i in {120..0}; do
535+
for i in {60..0}; do
484536
kill -0 $pid
485537
exit="$?"
486538
log "INFO" "Attempt $i: Checking mysqld(process id=$pid) is alive or not, exit code: $exit"
@@ -500,13 +552,13 @@ function join_by_clone() {
500552
fi
501553
# If the host is still alive, it will join the cluster directly.
502554
if [[ $mysqld_alive == 1 ]]; then
503-
retry 120 ${mysql} -N -e "START GROUP_REPLICATION;"
555+
retry 60 ${mysql} -N -e "START GROUP_REPLICATION;"
504556
log "INFO" "Host (${report_host}) has joined to the group......."
505557
else
506558
#run mysqld in background since mysqld can't restart after a clone process
507559
start_mysqld_in_background
508560
wait_for_mysqld_running
509-
retry 120 ${mysql} -N -e "START GROUP_REPLICATION;"
561+
retry 60 ${mysql} -N -e "START GROUP_REPLICATION;"
510562
log "INFO" "Host (${report_host}) has joined to the group......."
511563
#
512564
fi

0 commit comments

Comments
 (0)