
Commit aad52c4

cleanup, & do not include unreachable nodes
1 parent 96b5e82 commit aad52c4
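
The gist of the change: instead of deriving the node list from wsrep_incoming_addresses on the local node, the drain script now takes the cluster addresses from BOSH links and probes each one with nc before querying it, so nodes that have been deleted (or whose mysql port has changed) no longer break drain. A minimal sketch of that filtering step; the addresses and port below are hypothetical stand-ins for the values the ERB template renders:

#!/usr/bin/env bash
# Sketch of the reachability filter introduced by this commit.
# CLUSTER_NODES and MYSQL_PORT stand in for the values rendered from the job spec.
CLUSTER_NODES=(10.0.0.10 10.0.0.11 10.0.0.12)
MYSQL_PORT=3306

CLUSTER_TEST_NODES=()
for NODE in "${CLUSTER_NODES[@]}"; do
  # Probe the MySQL port with a 5-second timeout; skip nodes that do not answer
  # instead of failing the whole drain on them.
  if nc -z -w 5 "$NODE" "$MYSQL_PORT"; then
    CLUSTER_TEST_NODES+=("$NODE")
  fi
done

echo "reachable nodes: ${CLUSTER_TEST_NODES[*]}"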

File tree

jobs/mysql/templates/drain.sh

1 file changed: +56 -33 lines changed


jobs/mysql/templates/drain.sh

Lines changed: 56 additions & 33 deletions
@@ -1,41 +1,66 @@
-#!/bin/bash -eu
+#!/usr/bin/env bash
 
-NODE_IP=<%= spec.ip %>
-MYSQL_PORT=<%= p("cf_mysql.mysql.port") %>
+set -e -o pipefail
 
-LOG_DIR="/var/vcap/sys/log/mysql/"
+<%
+  require "shellwords"
 
-# if the node ain't running, ain't got nothin' to drain
-if ! ps -p $(</var/vcap/sys/run/mysql/mysql.pid) >/dev/null; then
-  echo "mysql is not running: drain OK" &>> "$LOG_DIR/drain.log"
-  echo 0; exit 0 # drain success
-fi
+  cluster_ips = link('mysql').instances.map(&:address)
+  if_link('arbitrator') do
+    cluster_ips += link('arbitrator').instances.map(&:address)
+  end
+%>
+
+CLUSTER_NODES=(<%= cluster_ips.map{|e| Shellwords.escape e}.join(' ') %>)
+MYSQL_PORT=<%= Shellwords.escape p("cf_mysql.mysql.port") %>
+
+function prepend_datetime() {
+  awk -W interactive '{ system("echo -n [$(date +%FT%T%z)]"); print " " $0 }'
+}
 
 function wsrep_var() {
-  local var_name=$1
-  local host=$2
-  local port=$3
+  local var_name="$1"
+  local host="$2"
   if [[ $var_name =~ ^wsrep_[a-z_]+$ ]]; then
     timeout 5 \
-      /usr/local/bin/mysql --defaults-file=/var/vcap/jobs/mysql/config/drain.cnf -h "$host" -P "$port" \
-      --execute="SHOW STATUS LIKE '$var_name'" -N |\
-      awk '{print $2}' | tr -d '\n'
+      /usr/local/bin/mysql --defaults-file=/var/vcap/jobs/mysql/config/drain.cnf -h "$host" -P "$MYSQL_PORT" \
+      --execute="SHOW STATUS LIKE '$var_name'" -N \
+      | awk '{print $2}' \
+      | tr -d '\n'
   fi
 }
 
-CLUSTER_NODES=(`wsrep_var wsrep_incoming_addresses $NODE_IP $MYSQL_PORT | sed -e 's/,/ /g'`)
+LOG_DIR="/var/vcap/sys/log/mysql"
+
+exec 3>&1
+exec \
+  1> >(prepend_datetime >> $LOG_DIR/drain.out.log) \
+  2> >(prepend_datetime >> $LOG_DIR/drain.err.log)
+
+# if the node ain't running, ain't got nothin' to drain
+if ! ps -p $(</var/vcap/sys/run/mysql/mysql.pid) >/dev/null; then
+  echo "mysql is not running: drain OK"
+  echo 0 >&3; exit 0 # drain success
+fi
+
+# Check each cluster node's availability.
+# Jump to next node if unreachable (timeout 5 sec), then do not add it as test component.
+# Node may have been deleted or mysql port has been updated.
+for NODE in "${CLUSTER_NODES[@]}"; do
+  { nc -zv -w 5 $NODE $MYSQL_PORT \
+    && CLUSTER_TEST_NODES=(${CLUSTER_TEST_NODES[@]} $NODE); } \
+    || continue
+done
 
-# check if all nodes are part of the PRIMARY component; if not then
+# Check if all nodes are part of the PRIMARY component; if not then
 # something is terribly wrong (loss of quorum or split-brain) and doing a
 # rolling restart can actually cause data loss (e.g. if a node that is out
 # of sync is used to bootstrap the cluster): in this case we fail immediately.
-for NODE in "${CLUSTER_NODES[@]}"; do
-  NODE_IP=`echo $NODE | cut -d ":" -f 1`
-  NODE_PORT=`echo $NODE | cut -d ":" -f 2`
-  cluster_status=`wsrep_var wsrep_cluster_status $NODE_IP $NODE_PORT`
-  if [ "$cluster_status" != "Primary" ]; then
-    echo "wsrep_cluster_status of node '$NODE_IP' is '$cluster_status' (expected 'Primary'): drain failed" &>> "$LOG_DIR/drain.log"
-    exit 1 # drain failed
+for TEST_NODE in "${CLUSTER_TEST_NODES[@]}"; do
+  cluster_status=$(wsrep_var wsrep_cluster_status "$TEST_NODE")
+  if [ "$cluster_status" != Primary ]; then
+    echo "wsrep_cluster_status of node '$TEST_NODE' is '$cluster_status' (expected 'Primary'): drain failed"
+    exit -1 # drain failed
   fi
 done
 
@@ -44,16 +69,14 @@ done
 # Consider a 3 node cluster: if node1 is donor for node2 and we shut down node3
 # -that is synced- then node1 is joining, node2 is donor and node3 is down: as
 # a result the cluster lose quorum until node1/node2 complete the transfer!)
-for NODE in "${CLUSTER_NODES[@]}"; do
-  NODE_IP=`echo $NODE | cut -d ":" -f 1`
-  NODE_PORT=`echo $NODE | cut -d ":" -f 2`
-  state=`wsrep_var wsrep_local_state_comment $NODE_IP $NODE_PORT`
-  if [ "$state" != "Synced" ]; then
-    echo "wsrep_local_state_comment of node '$NODE_IP' is '$state' (expected 'Synced'): retry drain in 5 seconds" &>> "$LOG_DIR/drain.log"
+for TEST_NODE in "${CLUSTER_TEST_NODES[@]}"; do
+  state=$(wsrep_var wsrep_local_state_comment "$TEST_NODE")
+  if [ "$state" != Synced ]; then
+    echo "wsrep_local_state_comment of node '$TEST_NODE' is '$state' (expected 'Synced'): retry drain in 5 seconds"
     # TODO: rewrite to avoid using dynamic drain (soon to be deprecated)
-    echo -5 # retry in 5 seconds
+    echo -5 >&3; exit 0 # retry in 5 seconds
   fi
 done
 
-echo "Drain Success" &>> "$LOG_DIR/drain.log"
-echo 0; exit 0 # drain success
+echo "Drain Success"
+echo 0 >&3; exit 0 # drain success
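
A related detail in the rewritten script is how it keeps the dynamic-drain protocol (printing 0 for success, or -5 for "retry in 5 seconds", on stdout) separate from its logging: file descriptor 3 is saved as the original stdout before stdout and stderr are redirected into timestamped log files, and only the drain result is written to fd 3. A minimal, self-contained sketch of that pattern; the log directory here is hypothetical, while the real script logs under /var/vcap/sys/log/mysql:

#!/usr/bin/env bash
# Sketch of the fd-3 logging pattern used by the new drain.sh.
LOG_DIR=/tmp/drain-sketch
mkdir -p "$LOG_DIR"

# Prefix every line with an ISO-8601 timestamp (the real script also passes
# -W interactive to awk for unbuffered output under mawk).
prepend_datetime() {
  awk '{ system("echo -n [$(date +%FT%T%z)]"); print " " $0 }'
}

exec 3>&1   # keep the original stdout for the drain result
exec 1> >(prepend_datetime >> "$LOG_DIR/drain.out.log") \
     2> >(prepend_datetime >> "$LOG_DIR/drain.err.log")

echo "diagnostics go to the timestamped log, not to the drain caller"
echo 0 >&3   # only the drain result reaches the caller: success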

0 commit comments
