
Commit c898bdb

backport of openshift#2060 to release-4.18
1 parent 4fcb2d0 commit c898bdb

File tree: 1 file changed (+128, -3)

pkg/cli/admin/mustgather/mustgather.go (128 additions, 3 deletions)
@@ -9,6 +9,7 @@ import (
 	"os"
 	"path"
 	"regexp"
+	"sort"
 	"strconv"
 	"strings"
 	"sync"
@@ -38,6 +39,7 @@ import (
 	"k8s.io/kubectl/pkg/util/templates"
 	admissionapi "k8s.io/pod-security-admission/api"
 	"k8s.io/utils/exec"
+	utilptr "k8s.io/utils/ptr"

 	configclient "github.com/openshift/client-go/config/clientset/versioned"
 	imagev1client "github.com/openshift/client-go/image/clientset/versioned/typed/image/v1"
@@ -51,6 +53,10 @@ import (

 const (
 	gatherContainerName = "gather"
+
+	notReadyTaintKey          = "node.kubernetes.io/not-ready"
+	unreachableTaintKey       = "node.kubernetes.io/unreachable"
+	controlPlaneNodeRoleLabel = "node-role.kubernetes.io/control-plane"
 )

 var (
@@ -475,6 +481,9 @@ func (o *MustGatherOptions) Run() error {
 		}
 	}

+	candidateNames := getCandidateNodeNames(nodes, hasMaster)
+	affinity := buildNodeAffinity(candidateNames)
+
 	// ... and create must-gather pod(s)
 	var pods []*corev1.Pod
 	for _, image := range o.Images {
@@ -496,7 +505,7 @@ func (o *MustGatherOptions) Run() error {
 				return err
 			}
 			for _, node := range nodes.Items {
-				pods = append(pods, o.newPod(node.Name, image, hasMaster))
+				pods = append(pods, o.newPod(node.Name, image, hasMaster, affinity))
 			}
 		} else {
 			if o.NodeName != "" {
@@ -506,7 +515,7 @@ func (o *MustGatherOptions) Run() error {
 					return err
 				}
 			}
-			pods = append(pods, o.newPod(o.NodeName, image, hasMaster))
+			pods = append(pods, o.newPod(o.NodeName, image, hasMaster, affinity))
 		}
 	}

@@ -924,7 +933,7 @@ func newClusterRoleBinding(ns *corev1.Namespace) *rbacv1.ClusterRoleBinding {
 // newPod creates a pod with 2 containers with a shared volume mount:
 // - gather: init containers that run gather command
 // - copy: no-op container we can exec into
-func (o *MustGatherOptions) newPod(node, image string, hasMaster bool) *corev1.Pod {
+func (o *MustGatherOptions) newPod(node, image string, hasMaster bool, affinity *corev1.Affinity) *corev1.Pod {
 	zero := int64(0)

 	nodeSelector := map[string]string{
@@ -956,6 +965,7 @@ func (o *MustGatherOptions) newPod(node, image string, hasMaster bool) *corev1.P
 			// so setting priority class to system-cluster-critical
 			PriorityClassName: "system-cluster-critical",
 			RestartPolicy: corev1.RestartPolicyNever,
+			Affinity: affinity,
 			Volumes: []corev1.Volume{
 				{
 					Name: "must-gather-output",
@@ -1058,6 +1068,121 @@ func (o *MustGatherOptions) newPod(node, image string, hasMaster bool) *corev1.P
 	return ret
 }

+func getNodeLastHeartbeatTime(node corev1.Node) *metav1.Time {
+	for _, cond := range node.Status.Conditions {
+		if cond.Type == corev1.NodeReady {
+			if !cond.LastHeartbeatTime.IsZero() {
+				return utilptr.To[metav1.Time](cond.LastHeartbeatTime)
+			}
+			return nil
+		}
+	}
+	return nil
+}
+
+func isNodeReadyByCondition(node corev1.Node) bool {
+	for _, cond := range node.Status.Conditions {
+		if cond.Type == corev1.NodeReady && cond.Status == corev1.ConditionTrue {
+			return true
+		}
+	}
+	return false
+}
+
+func isNodeReadyAndReachableByTaint(node corev1.Node) bool {
+	for _, taint := range node.Spec.Taints {
+		if taint.Key == unreachableTaintKey || taint.Key == notReadyTaintKey {
+			return false
+		}
+	}
+	return true
+}
+
+func getCandidateNodeNames(nodes *corev1.NodeList, hasMaster bool) []string {
+	var controlPlaneNodes, allControlPlaneNodes, workerNodes, unschedulableNodes, remainingNodes, selectedNodes []corev1.Node
+	for _, node := range nodes.Items {
+		if _, ok := node.Labels[controlPlaneNodeRoleLabel]; ok {
+			allControlPlaneNodes = append(allControlPlaneNodes, node)
+		}
+		if !isNodeReadyByCondition(node) || !isNodeReadyAndReachableByTaint(node) {
+			remainingNodes = append(remainingNodes, node)
+			continue
+		}
+		if node.Spec.Unschedulable {
+			unschedulableNodes = append(unschedulableNodes, node)
+			continue
+		}
+		if _, ok := node.Labels[controlPlaneNodeRoleLabel]; ok {
+			controlPlaneNodes = append(controlPlaneNodes, node)
+		} else {
+			workerNodes = append(workerNodes, node)
+		}
+	}
+
+	if hasMaster {
+		if len(controlPlaneNodes) > 0 {
+			selectedNodes = controlPlaneNodes
+		} else {
+			selectedNodes = allControlPlaneNodes
+		}
+	} else {
+		selectedNodes = controlPlaneNodes
+		if len(selectedNodes) == 0 {
+			selectedNodes = workerNodes
+		}
+		if len(selectedNodes) == 0 {
+			selectedNodes = unschedulableNodes
+		}
+		if len(selectedNodes) == 0 {
+			selectedNodes = remainingNodes
+		}
+	}
+
+	sort.SliceStable(selectedNodes, func(i, j int) bool {
+		iTime := getNodeLastHeartbeatTime(selectedNodes[i])
+		jTime := getNodeLastHeartbeatTime(selectedNodes[j])
+		if jTime == nil {
+			return true
+		}
+		if iTime == nil {
+			return false
+		}
+		return jTime.Before(iTime)
+	})
+
+	nodeNames := []string{}
+	for idx, n := range selectedNodes {
+		if idx >= 10 {
+			break
+		}
+		nodeNames = append(nodeNames, n.Name)
+	}
+	return nodeNames
+}
+
+func buildNodeAffinity(nodeHostnames []string) *corev1.Affinity {
+	if len(nodeHostnames) == 0 {
+		return nil
+	}
+	return &corev1.Affinity{
+		NodeAffinity: &corev1.NodeAffinity{
+			RequiredDuringSchedulingIgnoredDuringExecution: &corev1.NodeSelector{
+				NodeSelectorTerms: []corev1.NodeSelectorTerm{
+					{
+						MatchExpressions: []corev1.NodeSelectorRequirement{
+							{
+								Key: "kubernetes.io/hostname",
+								Operator: corev1.NodeSelectorOpIn,
+								Values: nodeHostnames,
+							},
+						},
+					},
+				},
+			},
+		},
+	}
+}
+
 // BackupGathering is called if the full must-gather has an error. This is useful for making sure we get *something*
 // no matter what has failed. It should be focused on universal openshift failures.
 func (o *MustGatherOptions) BackupGathering(ctx context.Context, errs []error) {
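Not part of the commit: a minimal sketch of how the new helpers behave, written as a hypothetical unit test. It assumes it sits next to mustgather.go in the same package (the helpers are unexported); the node names and test name are invented. With hasMaster set to false and no ready control-plane nodes, the ready, schedulable workers are chosen, ordered by the most recent Ready heartbeat (capped at ten names), and buildNodeAffinity pins the gather pods to exactly those hostnames.

// Hypothetical sketch, not from the commit: assumes it lives in
// pkg/cli/admin/mustgather alongside mustgather.go so it can reach the
// unexported helpers; node names are invented.
package mustgather

import (
	"testing"
	"time"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func TestCandidateNodeSelectionSketch(t *testing.T) {
	readyCond := func(heartbeat time.Time) corev1.NodeCondition {
		return corev1.NodeCondition{
			Type:              corev1.NodeReady,
			Status:            corev1.ConditionTrue,
			LastHeartbeatTime: metav1.NewTime(heartbeat),
		}
	}
	now := time.Now()
	nodes := &corev1.NodeList{Items: []corev1.Node{
		{
			// Ready worker with an older heartbeat.
			ObjectMeta: metav1.ObjectMeta{Name: "worker-old"},
			Status:     corev1.NodeStatus{Conditions: []corev1.NodeCondition{readyCond(now.Add(-10 * time.Minute))}},
		},
		{
			// Ready worker with the newest heartbeat; should sort first.
			ObjectMeta: metav1.ObjectMeta{Name: "worker-new"},
			Status:     corev1.NodeStatus{Conditions: []corev1.NodeCondition{readyCond(now)}},
		},
		{
			// Unreachable control-plane node; filtered out by its taint and condition.
			ObjectMeta: metav1.ObjectMeta{Name: "master-unreachable", Labels: map[string]string{controlPlaneNodeRoleLabel: ""}},
			Spec:       corev1.NodeSpec{Taints: []corev1.Taint{{Key: unreachableTaintKey, Effect: corev1.TaintEffectNoExecute}}},
			Status:     corev1.NodeStatus{Conditions: []corev1.NodeCondition{{Type: corev1.NodeReady, Status: corev1.ConditionUnknown}}},
		},
	}}

	// hasMaster=false and no ready control-plane nodes: the ready workers are
	// selected, newest heartbeat first, capped at 10 names.
	names := getCandidateNodeNames(nodes, false)
	if len(names) != 2 || names[0] != "worker-new" {
		t.Fatalf("unexpected candidates: %v", names)
	}

	// The affinity restricts scheduling to exactly those hostnames.
	affinity := buildNodeAffinity(names)
	values := affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms[0].MatchExpressions[0].Values
	if len(values) != 2 {
		t.Fatalf("unexpected affinity values: %v", values)
	}
}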
