99 "os"
1010 "path"
1111 "regexp"
12+ "sort"
1213 "strconv"
1314 "strings"
1415 "sync"
@@ -38,6 +39,7 @@ import (
3839 "k8s.io/kubectl/pkg/util/templates"
3940 admissionapi "k8s.io/pod-security-admission/api"
4041 "k8s.io/utils/exec"
42+ utilptr "k8s.io/utils/ptr"
4143
4244 configclient "github.com/openshift/client-go/config/clientset/versioned"
4345 imagev1client "github.com/openshift/client-go/image/clientset/versioned/typed/image/v1"
@@ -51,6 +53,10 @@ import (
 
 const (
 	gatherContainerName = "gather"
+
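+	// Taint keys and node-role label used to filter candidate nodes for the must-gather pod.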
+	notReadyTaintKey          = "node.kubernetes.io/not-ready"
+	unreachableTaintKey       = "node.kubernetes.io/unreachable"
+	controlPlaneNodeRoleLabel = "node-role.kubernetes.io/control-plane"
 )
 
 var (
@@ -475,6 +481,9 @@ func (o *MustGatherOptions) Run() error {
 		}
 	}
 
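+	// Constrain must-gather pods to recently healthy nodes (see getCandidateNodeNames).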
+	candidateNames := getCandidateNodeNames(nodes, hasMaster)
+	affinity := buildNodeAffinity(candidateNames)
+
 	// ... and create must-gather pod(s)
 	var pods []*corev1.Pod
 	for _, image := range o.Images {
@@ -496,7 +505,7 @@ func (o *MustGatherOptions) Run() error {
 				return err
 			}
 			for _, node := range nodes.Items {
-				pods = append(pods, o.newPod(node.Name, image, hasMaster))
+				pods = append(pods, o.newPod(node.Name, image, hasMaster, affinity))
 			}
 		} else {
 			if o.NodeName != "" {
@@ -506,7 +515,7 @@ func (o *MustGatherOptions) Run() error {
 					return err
 				}
 			}
-			pods = append(pods, o.newPod(o.NodeName, image, hasMaster))
+			pods = append(pods, o.newPod(o.NodeName, image, hasMaster, affinity))
 		}
 	}
 
@@ -924,7 +933,7 @@ func newClusterRoleBinding(ns *corev1.Namespace) *rbacv1.ClusterRoleBinding {
 // newPod creates a pod with 2 containers with a shared volume mount:
 // - gather: init containers that run gather command
 // - copy: no-op container we can exec into
-func (o *MustGatherOptions) newPod(node, image string, hasMaster bool) *corev1.Pod {
+func (o *MustGatherOptions) newPod(node, image string, hasMaster bool, affinity *corev1.Affinity) *corev1.Pod {
 	zero := int64(0)
 
 	nodeSelector := map[string]string{
@@ -956,6 +965,7 @@ func (o *MustGatherOptions) newPod(node, image string, hasMaster bool) *corev1.P
 			// so setting priority class to system-cluster-critical
 			PriorityClassName: "system-cluster-critical",
 			RestartPolicy:     corev1.RestartPolicyNever,
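+			// Prefer the candidate nodes selected in Run; a nil affinity leaves scheduling unconstrained.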
+			Affinity:          affinity,
 			Volumes: []corev1.Volume{
 				{
 					Name: "must-gather-output",
@@ -1058,6 +1068,121 @@ func (o *MustGatherOptions) newPod(node, image string, hasMaster bool) *corev1.P
 	return ret
 }
 
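+// getNodeLastHeartbeatTime returns the last heartbeat time of the node's Ready
+// condition, or nil if the condition is missing or carries no heartbeat.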
+func getNodeLastHeartbeatTime(node corev1.Node) *metav1.Time {
+	for _, cond := range node.Status.Conditions {
+		if cond.Type == corev1.NodeReady {
+			if !cond.LastHeartbeatTime.IsZero() {
+				return utilptr.To[metav1.Time](cond.LastHeartbeatTime)
+			}
+			return nil
+		}
+	}
+	return nil
+}
+
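+// isNodeReadyByCondition reports whether the node's Ready condition is True.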
+func isNodeReadyByCondition(node corev1.Node) bool {
+	for _, cond := range node.Status.Conditions {
+		if cond.Type == corev1.NodeReady && cond.Status == corev1.ConditionTrue {
+			return true
+		}
+	}
+	return false
+}
+
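+// isNodeReadyAndReachableByTaint reports whether the node carries neither the
+// not-ready nor the unreachable taint.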
+func isNodeReadyAndReachableByTaint(node corev1.Node) bool {
+	for _, taint := range node.Spec.Taints {
+		if taint.Key == unreachableTaintKey || taint.Key == notReadyTaintKey {
+			return false
+		}
+	}
+	return true
+}
+
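+// getCandidateNodeNames buckets nodes by health and role and returns the names
+// of up to 10 nodes that are the best candidates to run the must-gather pod.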
+func getCandidateNodeNames(nodes *corev1.NodeList, hasMaster bool) []string {
+	var controlPlaneNodes, allControlPlaneNodes, workerNodes, unschedulableNodes, remainingNodes, selectedNodes []corev1.Node
+	for _, node := range nodes.Items {
+		if _, ok := node.Labels[controlPlaneNodeRoleLabel]; ok {
+			allControlPlaneNodes = append(allControlPlaneNodes, node)
+		}
+		if !isNodeReadyByCondition(node) || !isNodeReadyAndReachableByTaint(node) {
+			remainingNodes = append(remainingNodes, node)
+			continue
+		}
+		if node.Spec.Unschedulable {
+			unschedulableNodes = append(unschedulableNodes, node)
+			continue
+		}
+		if _, ok := node.Labels[controlPlaneNodeRoleLabel]; ok {
+			controlPlaneNodes = append(controlPlaneNodes, node)
+		} else {
+			workerNodes = append(workerNodes, node)
+		}
+	}
+
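+	// On clusters with masters, prefer healthy control-plane nodes and fall
+	// back to any control-plane node. Otherwise fall back from healthy
+	// control-plane nodes to workers, then unschedulable, then not-ready or
+	// unreachable nodes.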
+	if hasMaster {
+		if len(controlPlaneNodes) > 0 {
+			selectedNodes = controlPlaneNodes
+		} else {
+			selectedNodes = allControlPlaneNodes
+		}
+	} else {
+		selectedNodes = controlPlaneNodes
+		if len(selectedNodes) == 0 {
+			selectedNodes = workerNodes
+		}
+		if len(selectedNodes) == 0 {
+			selectedNodes = unschedulableNodes
+		}
+		if len(selectedNodes) == 0 {
+			selectedNodes = remainingNodes
+		}
+	}
+
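+	// Sort candidates by the Ready condition's last heartbeat, newest first;
+	// nodes without a recorded heartbeat sort last.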
+	sort.SliceStable(selectedNodes, func(i, j int) bool {
+		iTime := getNodeLastHeartbeatTime(selectedNodes[i])
+		jTime := getNodeLastHeartbeatTime(selectedNodes[j])
+		if iTime == nil {
+			return false
+		}
+		if jTime == nil {
+			return true
+		}
+		return jTime.Before(iTime)
+	})
+
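+	// Cap the affinity expression at the ten best candidates.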
+	nodeNames := []string{}
+	for idx, n := range selectedNodes {
+		if idx >= 10 {
+			break
+		}
+		nodeNames = append(nodeNames, n.Name)
+	}
+	return nodeNames
+}
+
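+// buildNodeAffinity returns a required node affinity pinning the pod to the
+// given hostnames, or nil when no candidates are known.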
+func buildNodeAffinity(nodeHostnames []string) *corev1.Affinity {
+	if len(nodeHostnames) == 0 {
+		return nil
+	}
+	return &corev1.Affinity{
+		NodeAffinity: &corev1.NodeAffinity{
+			RequiredDuringSchedulingIgnoredDuringExecution: &corev1.NodeSelector{
+				NodeSelectorTerms: []corev1.NodeSelectorTerm{
+					{
+						MatchExpressions: []corev1.NodeSelectorRequirement{
+							{
+								Key:      "kubernetes.io/hostname",
+								Operator: corev1.NodeSelectorOpIn,
+								Values:   nodeHostnames,
+							},
+						},
+					},
+				},
+			},
+		},
+	}
+}
+
 // BackupGathering is called if the full must-gather has an error. This is useful for making sure we get *something*
 // no matter what has failed. It should be focused on universal openshift failures.
 func (o *MustGatherOptions) BackupGathering(ctx context.Context, errs []error) {