Commit 0c27e1f

Prefer ready and reachable control plane nodes
1 parent f7e4e7c commit 0c27e1f

2 files changed (+317, −113 lines)

pkg/cli/admin/mustgather/mustgather.go

Lines changed: 127 additions & 113 deletions
@@ -10,6 +10,7 @@ import (
 	"os"
 	"path"
 	"regexp"
+	"sort"
 	"strconv"
 	"strings"
 	"sync"
@@ -39,6 +40,7 @@ import (
 	"k8s.io/kubectl/pkg/util/templates"
 	admissionapi "k8s.io/pod-security-admission/api"
 	"k8s.io/utils/exec"
+	utilptr "k8s.io/utils/ptr"

 	configclient "github.com/openshift/client-go/config/clientset/versioned"
 	imagev1client "github.com/openshift/client-go/image/clientset/versioned/typed/image/v1"
@@ -51,9 +53,10 @@ import (
 )

 const (
-	gatherContainerName = "gather"
-	unreachableTaintKey = "node.kubernetes.io/unreachable"
-	notReadyTaintKey    = "node.kubernetes.io/not-ready"
+	gatherContainerName       = "gather"
+	unreachableTaintKey       = "node.kubernetes.io/unreachable"
+	notReadyTaintKey          = "node.kubernetes.io/not-ready"
+	controlPlaneNodeRoleLabel = "node-role.kubernetes.io/control-plane"
 )

 var (
@@ -101,23 +104,9 @@ sleep 5
 done`
 )

-var (
-	tolerationNotReady = corev1.Toleration{
-		Key:      "node.kubernetes.io/not-ready",
-		Operator: corev1.TolerationOpExists,
-		Effect:   corev1.TaintEffectNoSchedule,
-	}
-
-	tolerationMasterNoSchedule = corev1.Toleration{
-		Key:      "node-role.kubernetes.io/master",
-		Operator: corev1.TolerationOpExists,
-		Effect:   corev1.TaintEffectNoSchedule,
-	}
-)
-
-// buildNodeSelector builds a node selector from the provided node names.
-func buildNodeAffinity(nodeNames []string) *corev1.Affinity {
-	if len(nodeNames) == 0 {
+// buildNodeAffinity builds a node affinity from the provided node hostnames.
+func buildNodeAffinity(nodeHostnames []string) *corev1.Affinity {
+	if len(nodeHostnames) == 0 {
 		return nil
 	}
 	return &corev1.Affinity{
@@ -129,7 +118,7 @@ func buildNodeAffinity(nodeNames []string) *corev1.Affinity {
 						{
 							Key:      "kubernetes.io/hostname",
 							Operator: corev1.NodeSelectorOpIn,
-							Values:   nodeNames,
+							Values:   nodeHostnames,
 						},
 					},
 				},
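For orientation, the sketch below shows the affinity object buildNodeAffinity is expected to produce for two candidate hostnames. Only the innermost match expression appears in the hunks above, so the surrounding field nesting is an assumption, and node-a/node-b are made-up names.

package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
)

func main() {
	// Assumed shape of the result of buildNodeAffinity([]string{"node-a", "node-b"}):
	// a required node affinity that pins scheduling to the listed hostnames.
	affinity := &corev1.Affinity{
		NodeAffinity: &corev1.NodeAffinity{
			RequiredDuringSchedulingIgnoredDuringExecution: &corev1.NodeSelector{
				NodeSelectorTerms: []corev1.NodeSelectorTerm{
					{
						MatchExpressions: []corev1.NodeSelectorRequirement{
							{
								Key:      "kubernetes.io/hostname",
								Operator: corev1.NodeSelectorOpIn,
								Values:   []string{"node-a", "node-b"},
							},
						},
					},
				},
			},
		},
	}
	fmt.Printf("%+v\n", affinity)
}

Assuming the selector really is wired under RequiredDuringSchedulingIgnoredDuringExecution, the scheduler can only place the must-gather pod on one of the listed nodes, which is why the candidate list built below is worth curating.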
@@ -444,40 +433,124 @@ func (o *MustGatherOptions) Validate() error {
 	return nil
 }

-// prioritizeHealthyNodes returns a preferred node to run the must-gather pod on, and a fallback node if no preferred node is found.
-func prioritizeHealthyNodes(nodes *corev1.NodeList) (preferred *corev1.Node, fallback *corev1.Node) {
-	var fallbackNode *corev1.Node
-	var latestHeartbeat time.Time
+func getNodeLastHeartbeatTime(node corev1.Node) *metav1.Time {
+	for _, cond := range node.Status.Conditions {
+		if cond.Type == corev1.NodeReady {
+			if !cond.LastHeartbeatTime.IsZero() {
+				return utilptr.To[metav1.Time](cond.LastHeartbeatTime)
+			}
+			return nil
+		}
+	}
+	return nil
+}
+
+func isNodeReadyByCondition(node corev1.Node) bool {
+	for _, cond := range node.Status.Conditions {
+		if cond.Type == corev1.NodeReady && cond.Status == corev1.ConditionTrue {
+			return true
+		}
+	}
+	return false
+}
+
+func isNodeReadyAndReachableByTaint(node corev1.Node) bool {
+	for _, taint := range node.Spec.Taints {
+		if taint.Key == unreachableTaintKey || taint.Key == notReadyTaintKey {
+			return false
+		}
+	}
+	return true
+}

+// getCandidateNodeNames identifies suitable nodes for constructing a list of
+// node names for a node affinity expression.
+func getCandidateNodeNames(nodes *corev1.NodeList, hasMaster bool) []string {
+	// Identify ready and reachable nodes
+	var controlPlaneNodes, allControlPlaneNodes, workerNodes, unschedulableNodes, remainingNodes, selectedNodes []corev1.Node
 	for _, node := range nodes.Items {
-		var hasUnhealthyTaint bool
-		for _, taint := range node.Spec.Taints {
-			if taint.Key == unreachableTaintKey || taint.Key == notReadyTaintKey {
-				hasUnhealthyTaint = true
-				break
-			}
+		if _, ok := node.Labels[controlPlaneNodeRoleLabel]; ok {
+			allControlPlaneNodes = append(allControlPlaneNodes, node)
+		}
+		if !isNodeReadyByCondition(node) || !isNodeReadyAndReachableByTaint(node) {
+			remainingNodes = append(remainingNodes, node)
+			continue
+		}
+		if node.Spec.Unschedulable {
+			unschedulableNodes = append(unschedulableNodes, node)
+			continue
+		}
+		if _, ok := node.Labels[controlPlaneNodeRoleLabel]; ok {
+			controlPlaneNodes = append(controlPlaneNodes, node)
+		} else {
+			workerNodes = append(workerNodes, node)
 		}
+	}

-		// Look at heartbeat time for readiness hint
-		for _, cond := range node.Status.Conditions {
-			if cond.Type == corev1.NodeReady && cond.Status == corev1.ConditionTrue {
-				// Check if heartbeat is recent (less than 2m old)
-				if time.Since(cond.LastHeartbeatTime.Time) < 2*time.Minute {
-					if !hasUnhealthyTaint {
-						return &node, nil // return immediately on good candidate
-					}
-				}
-				break
-			}
+	// INFO(ingvagabund): a single target node should be enough. Yet,
+	// it might help to provide more to allow the scheduler to choose a more
+	// suitable node based on the cluster scheduling capabilities.
+
+	// hasMaster causes the must-gather pod to set a node selector targeting all
+	// control plane nodes, so there is no point in processing anything other
+	// than control plane nodes.
+	if hasMaster {
+		if len(controlPlaneNodes) > 0 {
+			selectedNodes = controlPlaneNodes
+		} else {
+			selectedNodes = allControlPlaneNodes
+		}
+	} else {
+		// For the hypershift case.
+
+		// Order of preference:
+		// - ready and reachable control plane nodes first
+		// - then ready and reachable worker nodes
+		// - then ready and reachable unschedulable nodes
+		// - then any other node
+		selectedNodes = controlPlaneNodes // this will very likely be empty for hypershift
+		if len(selectedNodes) == 0 {
+			selectedNodes = workerNodes
+		}
+		if len(selectedNodes) == 0 {
+			// unschedulable nodes might still be better candidates than the remaining
+			// not ready and not reachable nodes
+			selectedNodes = unschedulableNodes
+		}
+		if len(selectedNodes) == 0 {
+			// whatever is left, as a last resort
+			selectedNodes = remainingNodes
+		}
+	}
+
+	// Sort nodes by cond.LastHeartbeatTime.Time to prefer nodes that
+	// reported their status most recently.
+	sort.SliceStable(selectedNodes, func(i, j int) bool {
+		iTime := getNodeLastHeartbeatTime(selectedNodes[i])
+		jTime := getNodeLastHeartbeatTime(selectedNodes[j])
+		// nodes without a heartbeat sort last
+		if jTime == nil {
+			return true
+		}
+		if iTime == nil {
+			return false
 		}
+		return jTime.Before(iTime)
+	})

-		if fallbackNode == nil || node.CreationTimestamp.Time.After(latestHeartbeat) {
-			fallbackNode = &node
-			latestHeartbeat = node.CreationTimestamp.Time
+	// Limit the number of nodes to at most 10 (a limit chosen arbitrarily) to
+	// keep the list of node names sane.
+	nodeNames := []string{}
+	var nodeNamesSize = 0
+	for _, n := range selectedNodes {
+		nodeNames = append(nodeNames, n.Name)
+		nodeNamesSize++
+		if nodeNamesSize >= 10 {
+			break
 		}
 	}

-	return nil, fallbackNode
+	return nodeNames
 }

 // Run creates and runs a must-gather pod
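To make the selection order concrete, here is a rough test-style sketch (not the second file of this commit) exercising getCandidateNodeNames in the no-master case: a control plane node that is not ready and carries the not-ready taint is passed over in favor of a ready worker. It assumes the code sits in the same mustgather package; the node names are invented.

package mustgather

import (
	"testing"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func TestGetCandidateNodeNamesSketch(t *testing.T) {
	nodes := &corev1.NodeList{Items: []corev1.Node{
		{
			// Control plane node that is neither ready nor reachable.
			ObjectMeta: metav1.ObjectMeta{
				Name:   "cp-0",
				Labels: map[string]string{controlPlaneNodeRoleLabel: ""},
			},
			Spec: corev1.NodeSpec{Taints: []corev1.Taint{
				{Key: notReadyTaintKey, Effect: corev1.TaintEffectNoSchedule},
			}},
			Status: corev1.NodeStatus{Conditions: []corev1.NodeCondition{
				{Type: corev1.NodeReady, Status: corev1.ConditionFalse},
			}},
		},
		{
			// Ready, schedulable worker with a recent heartbeat.
			ObjectMeta: metav1.ObjectMeta{Name: "worker-0"},
			Status: corev1.NodeStatus{Conditions: []corev1.NodeCondition{
				{Type: corev1.NodeReady, Status: corev1.ConditionTrue, LastHeartbeatTime: metav1.Now()},
			}},
		},
	}}

	got := getCandidateNodeNames(nodes, false)
	if len(got) != 1 || got[0] != "worker-0" {
		t.Fatalf("expected [worker-0], got %v", got)
	}
}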
@@ -536,59 +609,25 @@ func (o *MustGatherOptions) Run() error {
 		defer cleanupNamespace()
 	}

-	nodes, err := o.Client.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{})
+	// Prefer to run on a master node if there is any, but don't be explicit otherwise.
+	// This enables the command to run by default in hypershift, where there are no masters.
+	nodes, err := o.Client.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{
+		LabelSelector: o.NodeSelector,
+	})
 	if err != nil {
 		return err
 	}

-	var cpNodes, workerNodes, unschedulableNodes []corev1.Node
-	for _, node := range nodes.Items {
-		isReady := false
-		for _, cond := range node.Status.Conditions {
-			if cond.Type == corev1.NodeReady && cond.Status == corev1.ConditionTrue {
-				isReady = true
-				break
-			}
-		}
-		if !isReady {
-			continue
-		}
-		if node.Spec.Unschedulable {
-			unschedulableNodes = append(unschedulableNodes, node)
-			continue
-		}
-		if _, ok := node.Labels["node-role.kubernetes.io/control-plane"]; ok {
-			cpNodes = append(cpNodes, node)
-		} else {
-			workerNodes = append(workerNodes, node)
-		}
-	}
-
-	selectedNodes := cpNodes
-	if len(selectedNodes) == 0 {
-		selectedNodes = workerNodes
-	}
-	if len(selectedNodes) == 0 {
-		selectedNodes = unschedulableNodes
-	}
-
-	var nodeNames []string
-	for _, n := range selectedNodes {
-		nodeNames = append(nodeNames, n.Name)
-	}
-
-	affinity := buildNodeAffinity(nodeNames)
-	// If we have no nodes, we cannot run the must-gather pod.
-
-	// Determine if there is a master node
-	hasMaster := false
+	var hasMaster bool
 	for _, node := range nodes.Items {
 		if _, ok := node.Labels["node-role.kubernetes.io/master"]; ok {
			hasMaster = true
 			break
 		}
 	}

+	affinity := buildNodeAffinity(getCandidateNodeNames(nodes, hasMaster))
+
 	// ... and create must-gather pod(s)
 	var pods []*corev1.Pod
 	for _, image := range o.Images {
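The listing now passes o.NodeSelector straight through as the LabelSelector, and hasMaster is derived from whatever that filtered list returns. Below is a hedged, self-contained sketch of that interaction using client-go's fake clientset, which, as far as I can tell, filters List results by label selector; the node names and the selector string are made up.

package main

import (
	"context"
	"fmt"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes/fake"
)

func main() {
	client := fake.NewSimpleClientset(
		&corev1.Node{ObjectMeta: metav1.ObjectMeta{
			Name:   "master-0",
			Labels: map[string]string{"node-role.kubernetes.io/master": ""},
		}},
		&corev1.Node{ObjectMeta: metav1.ObjectMeta{Name: "worker-0"}},
	)

	// Mirror of the Run() snippet above: list with a label selector,
	// then scan the result for the master role label.
	nodes, err := client.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{
		LabelSelector: "node-role.kubernetes.io/master",
	})
	if err != nil {
		panic(err)
	}

	hasMaster := false
	for _, node := range nodes.Items {
		if _, ok := node.Labels["node-role.kubernetes.io/master"]; ok {
			hasMaster = true
			break
		}
	}
	fmt.Println("matched nodes:", len(nodes.Items), "hasMaster:", hasMaster)
}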
@@ -1057,31 +1096,6 @@ func (o *MustGatherOptions) newPod(node, image string, hasMaster bool, affinity
 	cleanedSourceDir := path.Clean(o.SourceDir)
 	volumeUsageChecker := fmt.Sprintf(volumeUsageCheckerScript, cleanedSourceDir, cleanedSourceDir, o.VolumePercentage, o.VolumePercentage, executedCommand)

-	// Define taints we do not want to tolerate (can be extended with user input in the future)
-	excludedTaints := []corev1.Taint{
-		{Key: unreachableTaintKey, Effect: corev1.TaintEffectNoExecute},
-		{Key: notReadyTaintKey, Effect: corev1.TaintEffectNoSchedule},
-	}
-
-	// Define candidate tolerations we may want to apply
-	candidateTolerations := []corev1.Toleration{tolerationNotReady}
-	if node == "" && hasMaster {
-		candidateTolerations = append(candidateTolerations, tolerationMasterNoSchedule)
-	}
-
-	// Build the toleration list by only adding tolerations that do NOT tolerate excluded taints
-	filteredTolerations := make([]corev1.Toleration, 0, len(candidateTolerations))
-TolerationLoop:
-	for _, tol := range candidateTolerations {
-		for _, excluded := range excludedTaints {
-			if tol.ToleratesTaint(&excluded) {
-				// Skip this toleration if it tolerates an excluded taint
-				continue TolerationLoop
-			}
-		}
-		filteredTolerations = append(filteredTolerations, tol)
-	}
-
 	ret := &corev1.Pod{
 		ObjectMeta: metav1.ObjectMeta{
 			GenerateName: "must-gather-",
