
Commit aad52c4

cleanup, & do not include unreachable nodes
1 parent 96b5e82 commit aad52c4
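
The gist of the change: instead of deriving the node list from wsrep_incoming_addresses on the local node, the drain script now takes the cluster addresses from BOSH links and probes each one with nc before querying it, so nodes that have been deleted (or whose mysql port has changed) no longer break drain. A minimal sketch of that filtering step; the addresses and port below are hypothetical stand-ins for the values the ERB template renders:

#!/usr/bin/env bash
# Sketch of the reachability filter introduced by this commit.
# CLUSTER_NODES and MYSQL_PORT stand in for the values rendered from the job spec.
CLUSTER_NODES=(10.0.0.10 10.0.0.11 10.0.0.12)
MYSQL_PORT=3306

CLUSTER_TEST_NODES=()
for NODE in "${CLUSTER_NODES[@]}"; do
  # Probe the MySQL port with a 5-second timeout; skip nodes that do not answer
  # instead of failing the whole drain on them.
  if nc -z -w 5 "$NODE" "$MYSQL_PORT"; then
    CLUSTER_TEST_NODES+=("$NODE")
  fi
done

echo "reachable nodes: ${CLUSTER_TEST_NODES[*]}"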

File tree

jobs/mysql/templates/drain.sh

1 file changed: +56 -33 lines changed


jobs/mysql/templates/drain.sh

Lines changed: 56 additions & 33 deletions
@@ -1,41 +1,66 @@
-#!/bin/bash -eu
+#!/usr/bin/env bash
 
-NODE_IP=<%= spec.ip %>
-MYSQL_PORT=<%= p("cf_mysql.mysql.port") %>
+set -e -o pipefail
 
-LOG_DIR="/var/vcap/sys/log/mysql/"
+<%
+  require "shellwords"
 
-# if the node ain't running, ain't got nothin' to drain
-if ! ps -p $(</var/vcap/sys/run/mysql/mysql.pid) >/dev/null; then
-  echo "mysql is not running: drain OK" &>> "$LOG_DIR/drain.log"
-  echo 0; exit 0 # drain success
-fi
+  cluster_ips = link('mysql').instances.map(&:address)
+  if_link('arbitrator') do
+    cluster_ips += link('arbitrator').instances.map(&:address)
+  end
+%>
+
+CLUSTER_NODES=(<%= cluster_ips.map{|e| Shellwords.escape e}.join(' ') %>)
+MYSQL_PORT=<%= Shellwords.escape p("cf_mysql.mysql.port") %>
+
+function prepend_datetime() {
+  awk -W interactive '{ system("echo -n [$(date +%FT%T%z)]"); print " " $0 }'
+}
 
 function wsrep_var() {
-  local var_name=$1
-  local host=$2
-  local port=$3
+  local var_name="$1"
+  local host="$2"
   if [[ $var_name =~ ^wsrep_[a-z_]+$ ]]; then
     timeout 5 \
-      /usr/local/bin/mysql --defaults-file=/var/vcap/jobs/mysql/config/drain.cnf -h "$host" -P "$port" \
-      --execute="SHOW STATUS LIKE '$var_name'" -N |\
-      awk '{print $2}' | tr -d '\n'
+      /usr/local/bin/mysql --defaults-file=/var/vcap/jobs/mysql/config/drain.cnf -h "$host" -P "$MYSQL_PORT" \
+      --execute="SHOW STATUS LIKE '$var_name'" -N \
+      | awk '{print $2}' \
+      | tr -d '\n'
   fi
 }
 
-CLUSTER_NODES=(`wsrep_var wsrep_incoming_addresses $NODE_IP $MYSQL_PORT | sed -e 's/,/ /g'`)
+LOG_DIR="/var/vcap/sys/log/mysql"
+
+exec 3>&1
+exec \
+  1> >(prepend_datetime >> $LOG_DIR/drain.out.log) \
+  2> >(prepend_datetime >> $LOG_DIR/drain.err.log)
+
+# if the node ain't running, ain't got nothin' to drain
+if ! ps -p $(</var/vcap/sys/run/mysql/mysql.pid) >/dev/null; then
+  echo "mysql is not running: drain OK"
+  echo 0 >&3; exit 0 # drain success
+fi
+
+# Check each cluster node's availability.
+# Jump to next node if unreachable (timeout 5 sec), then do not add it as test component.
+# Node may have been deleted or mysql port has been updated.
+for NODE in "${CLUSTER_NODES[@]}"; do
+  { nc -zv -w 5 $NODE $MYSQL_PORT \
+    && CLUSTER_TEST_NODES=(${CLUSTER_TEST_NODES[@]} $NODE); } \
+    || continue
+done
 
-# check if all nodes are part of the PRIMARY component; if not then
+# Check if all nodes are part of the PRIMARY component; if not then
 # something is terribly wrong (loss of quorum or split-brain) and doing a
 # rolling restart can actually cause data loss (e.g. if a node that is out
 # of sync is used to bootstrap the cluster): in this case we fail immediately.
-for NODE in "${CLUSTER_NODES[@]}"; do
-  NODE_IP=`echo $NODE | cut -d ":" -f 1`
-  NODE_PORT=`echo $NODE | cut -d ":" -f 2`
-  cluster_status=`wsrep_var wsrep_cluster_status $NODE_IP $NODE_PORT`
-  if [ "$cluster_status" != "Primary" ]; then
-    echo "wsrep_cluster_status of node '$NODE_IP' is '$cluster_status' (expected 'Primary'): drain failed" &>> "$LOG_DIR/drain.log"
-    exit 1 # drain failed
+for TEST_NODE in "${CLUSTER_TEST_NODES[@]}"; do
+  cluster_status=$(wsrep_var wsrep_cluster_status "$TEST_NODE")
+  if [ "$cluster_status" != Primary ]; then
+    echo "wsrep_cluster_status of node '$TEST_NODE' is '$cluster_status' (expected 'Primary'): drain failed"
+    exit -1 # drain failed
   fi
 done
 
@@ -44,16 +69,14 @@ done
 # Consider a 3 node cluster: if node1 is donor for node2 and we shut down node3
 # -that is synced- then node1 is joining, node2 is donor and node3 is down: as
 # a result the cluster lose quorum until node1/node2 complete the transfer!)
-for NODE in "${CLUSTER_NODES[@]}"; do
-  NODE_IP=`echo $NODE | cut -d ":" -f 1`
-  NODE_PORT=`echo $NODE | cut -d ":" -f 2`
-  state=`wsrep_var wsrep_local_state_comment $NODE_IP $NODE_PORT`
-  if [ "$state" != "Synced" ]; then
-    echo "wsrep_local_state_comment of node '$NODE_IP' is '$state' (expected 'Synced'): retry drain in 5 seconds" &>> "$LOG_DIR/drain.log"
+for TEST_NODE in "${CLUSTER_TEST_NODES[@]}"; do
+  state=$(wsrep_var wsrep_local_state_comment "$TEST_NODE")
+  if [ "$state" != Synced ]; then
+    echo "wsrep_local_state_comment of node '$TEST_NODE' is '$state' (expected 'Synced'): retry drain in 5 seconds"
     # TODO: rewrite to avoid using dynamic drain (soon to be deprecated)
-    echo -5 # retry in 5 seconds
+    echo -5 >&3; exit 0 # retry in 5 seconds
   fi
 done
 
-echo "Drain Success" &>> "$LOG_DIR/drain.log"
-echo 0; exit 0 # drain success
+echo "Drain Success"
+echo 0 >&3; exit 0 # drain success
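
A related detail in the rewritten script is how it keeps the dynamic-drain protocol (printing 0 for success, or -5 for "retry in 5 seconds", on stdout) separate from its logging: file descriptor 3 is saved as the original stdout before stdout and stderr are redirected into timestamped log files, and only the drain result is written to fd 3. A minimal, self-contained sketch of that pattern; the log directory here is hypothetical, while the real script logs under /var/vcap/sys/log/mysql:

#!/usr/bin/env bash
# Sketch of the fd-3 logging pattern used by the new drain.sh.
LOG_DIR=/tmp/drain-sketch
mkdir -p "$LOG_DIR"

# Prefix every line with an ISO-8601 timestamp (the real script also passes
# -W interactive to awk for unbuffered output under mawk).
prepend_datetime() {
  awk '{ system("echo -n [$(date +%FT%T%z)]"); print " " $0 }'
}

exec 3>&1   # keep the original stdout for the drain result
exec 1> >(prepend_datetime >> "$LOG_DIR/drain.out.log") \
     2> >(prepend_datetime >> "$LOG_DIR/drain.err.log")

echo "diagnostics go to the timestamped log, not to the drain caller"
echo 0 >&3   # only the drain result reaches the caller: success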

0 commit comments
