
Commit 0046f48

TopologyManager Scope support

This policy accounts for resources on a per-pod basis rather than per container.

Signed-off-by: Alexey Perevalov <[email protected]>
1 parent 87bfe51 commit 0046f48
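For orientation before the diff: the new pod scope sums the requests of every container (init and regular) into one pod-level ResourceList and only then checks NUMA alignment, instead of checking each container on its own as the existing single-numa-node container scope does. Below is a minimal, self-contained sketch of just that aggregation step; the helper name podRequests is made up for illustration, only v1.ResourceList and resource.Quantity are the real upstream types.

package main

import (
    "fmt"

    v1 "k8s.io/api/core/v1"
    "k8s.io/apimachinery/pkg/api/resource"
)

// podRequests mirrors the aggregation done in PodLevelResourceHandler.PolicyFilter:
// fold every container's requests into a single pod-level ResourceList.
func podRequests(containers []v1.Container) v1.ResourceList {
    total := make(v1.ResourceList)
    for _, c := range containers {
        for name, quantity := range c.Resources.Requests {
            q := quantity.DeepCopy()
            if acc, ok := total[name]; ok {
                q.Add(acc) // accumulate with what previous containers requested
            }
            total[name] = q
        }
    }
    return total
}

func main() {
    containers := []v1.Container{
        {Resources: v1.ResourceRequirements{Requests: v1.ResourceList{
            v1.ResourceCPU: resource.MustParse("2"),
        }}},
        {Resources: v1.ResourceRequirements{Requests: v1.ResourceList{
            v1.ResourceCPU: resource.MustParse("1"),
        }}},
    }
    fmt.Println(podRequests(containers).Cpu()) // prints 3
}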

File tree

3 files changed (+206, -56 lines):
- pkg/apis/config/types.go
- pkg/noderesourcetopology/match.go
- pkg/noderesourcetopology/match_test.go

pkg/apis/config/types.go

Lines changed: 2 additions & 0 deletions

@@ -51,6 +51,8 @@ const (
 	// to preserve consistency keep it in pkg/apis/core/types.go"
 	SingleNUMANodeTopologyManagerPolicy TopologyManagerPolicy = "SingleNUMANode"
+
+	PodTopologyScope TopologyManagerPolicy = "PodTopologyScope"
 )

 // +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object
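The new constant is the value a node is expected to advertise in the TopologyPolicies field of its NodeResourceTopology object, which is exactly what the match_test.go changes below do. A short sketch of constructing such an object follows; the node name worker-0 is invented, but the fields and imports match the test code in this commit.

package main

import (
    "fmt"

    topologyv1alpha1 "github.com/k8stopologyawareschedwg/noderesourcetopology-api/pkg/apis/topology/v1alpha1"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

    apiconfig "sigs.k8s.io/scheduler-plugins/pkg/apis/config"
)

func main() {
    // A node advertises the pod-scope policy through the NodeResourceTopology
    // CRD; the plugin's Filter then dispatches to PodLevelResourceHandler.
    nrt := topologyv1alpha1.NodeResourceTopology{
        ObjectMeta:       metav1.ObjectMeta{Name: "worker-0"},
        TopologyPolicies: []string{string(apiconfig.PodTopologyScope)},
    }
    fmt.Println(nrt.TopologyPolicies[0]) // "PodTopologyScope"
}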

pkg/noderesourcetopology/match.go

Lines changed: 84 additions & 38 deletions

@@ -29,17 +29,16 @@ import (
 	"k8s.io/client-go/tools/clientcmd"
 	clientcmdapi "k8s.io/client-go/tools/clientcmd/api"
 	"k8s.io/klog/v2"
-
 	v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
 	bm "k8s.io/kubernetes/pkg/kubelet/cm/topologymanager/bitmask"
 	framework "k8s.io/kubernetes/pkg/scheduler/framework/v1alpha1"
+	apiconfig "sigs.k8s.io/scheduler-plugins/pkg/apis/config"
+
 	topologyv1alpha1 "github.com/k8stopologyawareschedwg/noderesourcetopology-api/pkg/apis/topology/v1alpha1"
 	topoclientset "github.com/k8stopologyawareschedwg/noderesourcetopology-api/pkg/generated/clientset/versioned"
 	topoinformerexternal "github.com/k8stopologyawareschedwg/noderesourcetopology-api/pkg/generated/informers/externalversions"
 	topologyinformers "github.com/k8stopologyawareschedwg/noderesourcetopology-api/pkg/generated/informers/externalversions"
 	topoinformerv1alpha1 "github.com/k8stopologyawareschedwg/noderesourcetopology-api/pkg/generated/informers/externalversions/topology/v1alpha1"
-
-	apiconfig "sigs.k8s.io/scheduler-plugins/pkg/apis/config"
 )

 const (
@@ -69,6 +68,14 @@ type NodeResourceTopologyMatch struct {
 	topologyPolicyHandlers PolicyHandlerMap
 }

+type SingleNUMANodeHandler struct {
+	match *NodeResourceTopologyMatch
+}
+
+type PodLevelResourceHandler struct {
+	match *NodeResourceTopologyMatch
+}
+
 type NUMANode struct {
 	NUMAID    int
 	Resources v1.ResourceList
@@ -82,39 +89,11 @@ func (tm *NodeResourceTopologyMatch) Name() string {
 }

 func filter(containers []v1.Container, nodes NUMANodeList, qos v1.PodQOSClass) *framework.Status {
-	if qos == v1.PodQOSBestEffort {
-		return nil
-	}
-
-	zeroQuantity := resource.MustParse("0")
 	for _, container := range containers {
 		bitmask := bm.NewEmptyBitMask()
 		bitmask.Fill()
-		for resource, quantity := range container.Resources.Requests {
-			resourceBitmask := bm.NewEmptyBitMask()
-			for _, numaNode := range nodes {
-				numaQuantity, ok := numaNode.Resources[resource]
-				// if we can't find the requested resource on the node - skip it (don't mark the NUMA node as available)
-				// if the unfound resource was requested with zero quantity, this NUMA node can still be considered
-				if !ok && quantity.Cmp(zeroQuantity) != 0 {
-					continue
-				}
-				// Check for the following:
-				// 1. mark the NUMA node as a possible node if the resource is memory or hugepages (until the
-				//    memory manager is merged and memory is provided in the CRD)
-				// 2. mark the NUMA node as a possible node if the resource is cpu and the pod is not guaranteed QoS, since cpu will flow
-				// 3. mark the NUMA node as a possible node if a zero quantity of a non-existing resource was requested (TODO check topology manager behaviour)
-				// 4. otherwise check the amount of resources
-				if resource == v1.ResourceMemory ||
-					strings.HasPrefix(string(resource), string(v1.ResourceHugePagesPrefix)) ||
-					resource == v1.ResourceCPU && qos != v1.PodQOSGuaranteed ||
-					quantity.Cmp(zeroQuantity) == 0 ||
-					numaQuantity.Cmp(quantity) >= 0 {
-					resourceBitmask.Add(numaNode.NUMAID)
-				}
-			}
-			bitmask.And(resourceBitmask)
-		}
+
+		checkResourcesForNUMANodes(bitmask, nodes, container.Resources.Requests, qos)
 		if bitmask.IsEmpty() {
 			// definitely we can't align the container, so we can't align the pod
 			return framework.NewStatus(framework.Unschedulable, fmt.Sprintf("Can't align container: %s", container.Name))
@@ -147,14 +126,76 @@ func extractResources(zone topologyv1alpha1.Zone) v1.ResourceList {
 	return res
 }

-func (tm NodeResourceTopologyMatch) PolicyFilter(pod *v1.Pod, zones topologyv1alpha1.ZoneList) *framework.Status {
+func (sh SingleNUMANodeHandler) PolicyFilter(pod *v1.Pod, zones topologyv1alpha1.ZoneList) *framework.Status {
 	containers := []v1.Container{}
 	containers = append(pod.Spec.InitContainers, pod.Spec.Containers...)

-	tm.nodeTopologyGuard.RLock()
-	defer tm.nodeTopologyGuard.RUnlock()
+	sh.match.nodeTopologyGuard.RLock()
+	defer sh.match.nodeTopologyGuard.RUnlock()
 	// prepare NUMANodes list from zoneMap

+	nodes := createNUMANodeList(zones)
+	return filter(containers, nodes, v1qos.GetPodQOS(pod))
+}
+
+func checkResourcesForNUMANodes(bitmask bm.BitMask, nodes NUMANodeList, resources v1.ResourceList, qos v1.PodQOSClass) {
+	zeroQuantity := resource.MustParse("0")
+	for resource, quantity := range resources {
+		resourceBitmask := bm.NewEmptyBitMask()
+		for _, numaNode := range nodes {
+			numaQuantity, ok := numaNode.Resources[resource]
+			// if we can't find the requested resource on the node - skip it (don't mark the NUMA node as available)
+			// if the unfound resource was requested with zero quantity, this NUMA node can still be considered
+			if !ok && quantity.Cmp(zeroQuantity) != 0 {
+				continue
+			}
+			// Check for the following:
+			// 1. mark the NUMA node as a possible node if the resource is memory or hugepages (until the
+			//    memory manager is merged and memory is provided in the CRD)
+			// 2. mark the NUMA node as a possible node if the resource is cpu and the pod is not guaranteed QoS, since cpu will flow
+			// 3. mark the NUMA node as a possible node if a zero quantity of a non-existing resource was requested (TODO check topology manager behaviour)
+			// 4. otherwise check the amount of resources
+			if resource == v1.ResourceMemory ||
+				strings.HasPrefix(string(resource), string(v1.ResourceHugePagesPrefix)) ||
+				resource == v1.ResourceCPU && qos != v1.PodQOSGuaranteed ||
+				quantity.Cmp(zeroQuantity) == 0 ||
+				numaQuantity.Cmp(quantity) >= 0 {
+				resourceBitmask.Add(numaNode.NUMAID)
+			}
+		}
+		bitmask.And(resourceBitmask)
+	}
+}
+
+func (ph PodLevelResourceHandler) PolicyFilter(pod *v1.Pod, zones topologyv1alpha1.ZoneList) *framework.Status {
+	containers := []v1.Container{}
+	containers = append(pod.Spec.InitContainers, pod.Spec.Containers...)
+
+	resources := make(v1.ResourceList)
+
+	for _, container := range containers {
+		for resource, quantity := range container.Resources.Requests {
+			if quan, ok := resources[resource]; ok {
+				quantity.Add(quan)
+			}
+			resources[resource] = quantity
+		}
+	}
+
+	nodes := createNUMANodeList(zones)
+	bitmask := bm.NewEmptyBitMask()
+	bitmask.Fill()
+	checkResourcesForNUMANodes(bitmask, nodes, resources, v1qos.GetPodQOS(pod))
+
+	if bitmask.IsEmpty() {
+		// definitely we can't align the pod
+		return framework.NewStatus(framework.Unschedulable, fmt.Sprintf("Can't align pod: %s", pod.Name))
+	}
+	return nil
+}
+
+func createNUMANodeList(zones topologyv1alpha1.ZoneList) NUMANodeList {
 	nodes := make(NUMANodeList, 0)
 	for _, zone := range zones {
 		if zone.Type == "Node" {
@@ -164,14 +205,18 @@ func (tm *NodeResourceTopologyMatch) PolicyFilter(pod *v1.Pod, zones topologyv1al
 			nodes = append(nodes, NUMANode{NUMAID: numaID, Resources: resources})
 		}
 	}
-	return filter(containers, nodes, v1qos.GetPodQOS(pod))
+	return nodes
 }

 // Filter Now only single-numa-node supported
 func (tm *NodeResourceTopologyMatch) Filter(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status {
 	if nodeInfo.Node() == nil {
 		return framework.NewStatus(framework.Error, fmt.Sprintf("Node is nil %s", nodeInfo.Node().Name))
 	}
+	if v1qos.GetPodQOS(pod) == v1.PodQOSBestEffort {
+		return nil
+	}
+
 	nodeName := nodeInfo.Node().Name

 	topologyPolicies := getTopologyPolicies(tm.nodeTopologies, nodeName)
@@ -305,7 +350,8 @@ func NewNodeResourceTopologyMatch(args runtime.Object, handle framework.Framewor
 	topologyMatch.handle = handle

 	topologyMatch.topologyPolicyHandlers = make(PolicyHandlerMap)
-	topologyMatch.topologyPolicyHandlers[apiconfig.SingleNUMANodeTopologyManagerPolicy] = topologyMatch
+	topologyMatch.topologyPolicyHandlers[apiconfig.SingleNUMANodeTopologyManagerPolicy] = SingleNUMANodeHandler{match: topologyMatch}
+	topologyMatch.topologyPolicyHandlers[apiconfig.PodTopologyScope] = PodLevelResourceHandler{match: topologyMatch}
 	topologyMatch.nodeTopologies = nodeTopologyMap{}

 	return topologyMatch, nil
pkg/noderesourcetopology/match_test.go

Lines changed: 120 additions & 18 deletions

@@ -28,6 +28,7 @@ import (
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	intstr "k8s.io/apimachinery/pkg/util/intstr"
 	framework "k8s.io/kubernetes/pkg/scheduler/framework/v1alpha1"
+	"k8s.io/klog/v2"

 	apiconfig "sigs.k8s.io/scheduler-plugins/pkg/apis/config"
 )
@@ -40,11 +41,45 @@ func makePodByResourceList(resources *v1.ResourceList) *v1.Pod {
 	return &v1.Pod{Spec: v1.PodSpec{Containers: []v1.Container{{
 		Resources: v1.ResourceRequirements{
 			Requests: *resources,
+			Limits:   *resources,
 		},
 	}},
 	}}
 }

+func makeResourceListFromZones(zones topologyv1alpha1.ZoneList) v1.ResourceList {
+	result := make(v1.ResourceList)
+	for _, zone := range zones {
+		for _, resInfo := range zone.Resources {
+			resQuantity, err := resource.ParseQuantity(resInfo.Allocatable.String())
+			if err != nil {
+				klog.Errorf("Failed to parse %s", resInfo.Allocatable.String())
+				continue
+			}
+			if quantity, ok := result[v1.ResourceName(resInfo.Name)]; ok {
+				resQuantity.Add(quantity)
+			}
+			result[v1.ResourceName(resInfo.Name)] = resQuantity
+		}
+	}
+	return result
+}
+
+func makePodByResourceListWithManyContainers(resources *v1.ResourceList, containerCount int) *v1.Pod {
+	containers := []v1.Container{}
+
+	for i := 0; i < containerCount; i++ {
+		containers = append(containers, v1.Container{
+			Resources: v1.ResourceRequirements{
+				Requests: *resources,
+				Limits:   *resources,
+			},
+		})
+	}
+	return &v1.Pod{Spec: v1.PodSpec{Containers: containers}}
+}
+
 func TestTopologyRequests(t *testing.T) {
 	nodes := nodeTopologyMap{}
 	nodes["node1"] = topologyv1alpha1.NodeResourceTopology{
@@ -92,7 +127,7 @@ func TestTopologyRequests(t *testing.T) {
 	}

 	nodes["node2"] = topologyv1alpha1.NodeResourceTopology{
-		ObjectMeta:       metav1.ObjectMeta{Name: "node1"},
+		ObjectMeta:       metav1.ObjectMeta{Name: "node2"},
 		TopologyPolicies: []string{string(apiconfig.SingleNUMANodeTopologyManagerPolicy)},
 		Zones: topologyv1alpha1.ZoneList{
 			topologyv1alpha1.Zone{
@@ -134,12 +169,52 @@ func TestTopologyRequests(t *testing.T) {
 			},
 		},
 	}
-	node1Resources := v1.ResourceList{
-		v1.ResourceCPU:    *resource.NewQuantity(12, resource.DecimalSI),
-		v1.ResourceMemory: resource.MustParse("16Gi"),
-		v1.ResourcePods:   *resource.NewQuantity(20, resource.DecimalSI),
-		nicResourceName:   *resource.NewQuantity(14, resource.DecimalSI),
+
+	nodes["node3"] = topologyv1alpha1.NodeResourceTopology{
+		ObjectMeta:       metav1.ObjectMeta{Name: "node3"},
+		TopologyPolicies: []string{string(apiconfig.PodTopologyScope)},
+		Zones: topologyv1alpha1.ZoneList{
+			topologyv1alpha1.Zone{
+				Name: "node-0",
+				Type: "Node",
+				Resources: topologyv1alpha1.ResourceInfoList{
+					topologyv1alpha1.ResourceInfo{
+						Name:        "cpu",
+						Capacity:    intstr.Parse("20"),
+						Allocatable: intstr.Parse("2"),
+					}, topologyv1alpha1.ResourceInfo{
+						Name:        "memory",
+						Capacity:    intstr.Parse("8Gi"),
+						Allocatable: intstr.Parse("4Gi"),
+					}, topologyv1alpha1.ResourceInfo{
+						Name:        nicResourceName,
+						Capacity:    intstr.Parse("30"),
+						Allocatable: intstr.Parse("5"),
+					},
+				},
+			}, topologyv1alpha1.Zone{
+				Name: "node-1",
+				Type: "Node",
+				Resources: topologyv1alpha1.ResourceInfoList{
+					topologyv1alpha1.ResourceInfo{
+						Name:        "cpu",
+						Capacity:    intstr.Parse("30"),
+						Allocatable: intstr.Parse("4"),
+					}, topologyv1alpha1.ResourceInfo{
+						Name:        "memory",
+						Capacity:    intstr.Parse("8Gi"),
+						Allocatable: intstr.Parse("4Gi"),
+					}, topologyv1alpha1.ResourceInfo{
+						Name:        nicResourceName,
+						Capacity:    intstr.Parse("30"),
+						Allocatable: intstr.Parse("2"),
+					},
+				},
+			},
+		},
 	}
+	node1Resources := makeResourceListFromZones(nodes["node1"].Zones)
 	node1 := v1.Node{
 		ObjectMeta: metav1.ObjectMeta{Name: "node1"},
 		Status: v1.NodeStatus{
@@ -148,18 +223,21 @@ func TestTopologyRequests(t *testing.T) {
 		},
 	}

-	node2Resources := v1.ResourceList{
-		v1.ResourceCPU:    *resource.NewQuantity(6, resource.DecimalSI),
-		v1.ResourceMemory: resource.MustParse("8Gi"),
-		v1.ResourcePods:   *resource.NewQuantity(20, resource.DecimalSI),
-		nicResourceName:   *resource.NewQuantity(7, resource.DecimalSI),
-	}
-	node2 := v1.Node{ObjectMeta: metav1.ObjectMeta{Name: "node1"}, Status: v1.NodeStatus{
+	node2Resources := makeResourceListFromZones(nodes["node2"].Zones)
+	node2 := v1.Node{ObjectMeta: metav1.ObjectMeta{Name: "node2"}, Status: v1.NodeStatus{
 		Capacity:    node2Resources,
 		Allocatable: node2Resources,
 	},
 	}

+	node3Resources := makeResourceListFromZones(nodes["node3"].Zones)
+
+	node3 := v1.Node{ObjectMeta: metav1.ObjectMeta{Name: "node3"}, Status: v1.NodeStatus{
+		Capacity:    node3Resources,
+		Allocatable: node3Resources,
+	},
+	}
+
 	// Test different QoS Guaranteed/Burstable/BestEffort
 	topologyTests := []struct {
 		pod            *v1.Pod
@@ -230,23 +308,47 @@ func TestTopologyRequests(t *testing.T) {
 			nodeTopologies: &nodes,
 			name:           "Guaranteed QoS, pod fit",
 			node:           node1,
-			wantStatus:     nil, //topology_match has to skip request of 0 resources
+			wantStatus:     nil,
+		},
+		{
+			pod: makePodByResourceListWithManyContainers(&v1.ResourceList{
+				v1.ResourceCPU:             *resource.NewQuantity(3, resource.DecimalSI),
+				v1.ResourceMemory:          resource.MustParse("1Gi"),
+				notExistingNICResourceName: *resource.NewQuantity(0, resource.DecimalSI)}, 3),
+			nodeTopologies: &nodes,
+			name:           "Guaranteed QoS Topology Scope, pod doesn't fit",
+			node:           node3,
+			wantStatus:     framework.NewStatus(framework.Unschedulable, "Can't align pod: "),
+		},
+		{
+			pod: makePodByResourceListWithManyContainers(&v1.ResourceList{
+				v1.ResourceCPU:             *resource.NewQuantity(1, resource.DecimalSI),
+				v1.ResourceMemory:          resource.MustParse("1Gi"),
+				notExistingNICResourceName: *resource.NewQuantity(0, resource.DecimalSI)}, 3),
+			nodeTopologies: &nodes,
+			name:           "Guaranteed QoS Topology Scope, pod fit",
+			node:           node3,
+			wantStatus:     nil,
 		},
 	}

 	nodeInfo := framework.NewNodeInfo()
 	for _, test := range topologyTests {
 		t.Run(test.name, func(t *testing.T) {
 			tm := NodeResourceTopologyMatch{}
-			tm.nodeTopologies = nodes
+			tm.nodeTopologies = *test.nodeTopologies
 			tm.topologyPolicyHandlers = make(PolicyHandlerMap)
-			tm.topologyPolicyHandlers[apiconfig.SingleNUMANodeTopologyManagerPolicy] = tm
+			tm.topologyPolicyHandlers[apiconfig.SingleNUMANodeTopologyManagerPolicy] = SingleNUMANodeHandler{match: &tm}
+			tm.topologyPolicyHandlers[apiconfig.PodTopologyScope] = PodLevelResourceHandler{match: &tm}
 			nodeInfo.SetNode(&test.node)
 			test.pod.Spec.Containers[0].Name = containerName
-			test.pod.Spec.Containers[0].Resources.Limits = test.pod.Spec.Containers[0].Resources.Requests
+			// this was done to make the pod's QoS Guaranteed
+			//test.pod.Spec.Containers[0].Resources.Limits = test.pod.Spec.Containers[0].Resources.Requests
 			gotStatus := tm.Filter(context.Background(), framework.NewCycleState(), test.pod, nodeInfo)
+
+			fmt.Printf("test.Name: %v; status: %v\n", test.name, gotStatus)
 			if !reflect.DeepEqual(gotStatus, test.wantStatus) {
-				t.Errorf("status does not match: %v, want: %v", gotStatus, test.wantStatus)
+				t.Errorf("status does not match: %v, want: %v\n", gotStatus, test.wantStatus)
 			}
 		})
 	}