Skip to content

Commit f169cb4

Browse files
Tekton tasks for running AI/ML workloads
1 parent 5cf3f10 commit f169cb4

File tree

6 files changed

+1099
-0
lines changed

6 files changed

+1099
-0
lines changed
Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
# AI/ML workload test configuration (StatefulSet variant).
# Creates STATEFULSETS_COUNT StatefulSets with BATCH_SIZE replicas each, then
# raises the replica count in BATCH_SIZE increments towards
# REPLICAS_PER_STATEFULSET.

# Where the workload runs.
{{$NAMESPACE := DefaultParam .NAMESPACE "aiml-test2"}}
{{$CLUSTER_NAME := DefaultParam .CLUSTER_NAME "perflab-titan-1"}}

# Workload shape: number of StatefulSets, target replicas, scaling increment.
{{$STATEFULSETS_COUNT := DefaultParam .STATEFULSETS_COUNT 1}}
{{$REPLICAS_PER_STATEFULSET := DefaultParam .REPLICAS_PER_STATEFULSET 100000}}
{{$BATCH_SIZE := DefaultParam .BATCH_SIZE 500}}

# Request rates consumed by the tuning sets.
{{$STATEFULSET_CREATION_QPS := DefaultParam .STATEFULSET_CREATION_QPS 5}}
{{$SCALING_QPS := DefaultParam .SCALING_QPS 10}}
{{$defaultQps := DefaultParam .CL2_DEFAULT_QPS 1000}}
{{$defaultBurst := DefaultParam .CL2_DEFAULT_BURST 1000}}

# Timing: WaitForControlledPodsRunning operation timeout, PodStartupLatency
# threshold, and the pause (in seconds) inserted after every scaling batch.
{{$CREATION_TIMEOUT := DefaultParam .CREATION_TIMEOUT "15m"}}
{{$POD_STARTUP_THRESHOLD := DefaultParam .POD_STARTUP_THRESHOLD "10s"}}
{{$SLEEP_DURATION := DefaultParam .SLEEP_DURATION 1}}

# Container resources passed to sts.yaml through templateFillMap.
{{$REQUEST_CPU := DefaultParam .REQUEST_CPU "100m"}}
{{$REQUEST_MEMORY := DefaultParam .REQUEST_MEMORY "128Mi"}}
{{$LIMIT_CPU := DefaultParam .LIMIT_CPU "1000m"}}
{{$LIMIT_MEMORY := DefaultParam .LIMIT_MEMORY "512Mi"}}
21+
22+
# Test name.
name: aws-mock-aiml-application-test

# Rate limiters; phases select one of them by name via `tuningSet`.
tuningSets:
  # Rate for the initial StatefulSet creation phase.
  - name: StatefulSetCreationQPS
    qpsLoad:
      qps: {{$STATEFULSET_CREATION_QPS}}
  # Rate for the per-batch scaling phases.
  - name: ScalingQPS
    qpsLoad:
      qps: {{$SCALING_QPS}}
  # Global QPS/burst taken from CL2_DEFAULT_QPS / CL2_DEFAULT_BURST.
  - name: default
    globalQPSLoad:
      qps: {{$defaultQps}}
      burst: {{$defaultBurst}}
35+
36+
# Test flow: start measurements, create the StatefulSets at BATCH_SIZE
# replicas, scale them up batch by batch, then gather the measurements.
# Templated string values are quoted (as the templateFillMap entries already
# are) so that an override that is empty or looks like a number/boolean is not
# retyped by the YAML parser.
steps:
  # Start both measurements before the StatefulSets are created.
  - name: Starting measurements
    measurements:
      - Identifier: PodStartupLatency
        Method: PodStartupLatency
        Params:
          action: start
          labelSelector: app = aiml-training-job
          threshold: "{{$POD_STARTUP_THRESHOLD}}"
      - Identifier: WaitForRunningStatefulSets
        Method: WaitForControlledPodsRunning
        Params:
          action: start
          apiVersion: apps/v1
          kind: StatefulSet
          labelSelector: app = aiml-training-job
          operationTimeout: "{{$CREATION_TIMEOUT}}"

  # Initial creation: each StatefulSet starts with BATCH_SIZE replicas (batch 1);
  # the scaling steps below raise the count from there.
  - name: Creating StatefulSets with full replica count
    phases:
      - namespaceRange:
          min: 1
          max: 1
          basename: "{{$NAMESPACE}}"
        replicasPerNamespace: {{$STATEFULSETS_COUNT}}
        tuningSet: StatefulSetCreationQPS
        objectBundle:
          - basename: aiml-training-job
            objectTemplatePath: sts.yaml
            templateFillMap:
              Group: aiml-training-job
              Replicas: {{$BATCH_SIZE}}
              RequestCPU: "{{$REQUEST_CPU}}"
              RequestMemory: "{{$REQUEST_MEMORY}}"
              LimitCPU: "{{$LIMIT_CPU}}"
              LimitMemory: "{{$LIMIT_MEMORY}}"
              ClusterName: "{{$CLUSTER_NAME}}"

  # Scaling loop. Batch 1 was created above, so the loop runs
  # (REPLICAS_PER_STATEFULSET / BATCH_SIZE) - 1 times. Each iteration re-applies
  # sts.yaml with Replicas = BATCH_SIZE * ($batch + 2) and then sleeps
  # SLEEP_DURATION seconds. Step numbering starts at 2, which assumes Loop
  # yields 0-based indices - TODO confirm.
{{range $batch := Loop (SubtractInt (DivideInt $REPLICAS_PER_STATEFULSET $BATCH_SIZE) 1)}}
  - name: Scaling StatefulSets batch {{AddInt $batch 2}}
    phases:
      - namespaceRange:
          min: 1
          max: 1
          basename: "{{$NAMESPACE}}"
        replicasPerNamespace: {{$STATEFULSETS_COUNT}}
        tuningSet: ScalingQPS
        objectBundle:
          - basename: aiml-training-job
            objectTemplatePath: sts.yaml
            templateFillMap:
              Group: aiml-training-job
              Replicas: {{MultiplyInt $BATCH_SIZE (AddInt $batch 2)}}
              RequestCPU: "{{$REQUEST_CPU}}"
              RequestMemory: "{{$REQUEST_MEMORY}}"
              LimitCPU: "{{$LIMIT_CPU}}"
              LimitMemory: "{{$LIMIT_MEMORY}}"
              ClusterName: "{{$CLUSTER_NAME}}"
  - name: Sleep after scaling batch {{AddInt $batch 2}}
    measurements:
      - Identifier: Wait
        Method: Sleep
        Params:
          duration: "{{$SLEEP_DURATION}}s"
{{end}}

  # Gather the WaitForControlledPodsRunning measurement started in the first step.
  - name: Waiting for StatefulSets to be ready
    measurements:
      - Identifier: WaitForRunningStatefulSets
        Method: WaitForControlledPodsRunning
        Params:
          action: gather

  # Gather the pod startup latency measurement.
  - name: Gathering measurements
    measurements:
      - Identifier: PodStartupLatency
        Method: PodStartupLatency
        Params:
          action: gather
          labelSelector: app = aiml-training-job
          threshold: "{{$POD_STARTUP_THRESHOLD}}"
Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
# StatefulSet template for the mock AI/ML workload.
# Placeholders filled by the caller's templateFillMap: Group, Replicas,
# RequestCPU, RequestMemory, LimitCPU, LimitMemory, ClusterName.
# Each pod fetches credentials from the pod-identity endpoint, waits until
# `aws s3 ls` succeeds, publishes the elapsed time as a CloudWatch metric and
# then idles forever.
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: aiml-training-job-{{.Group}}
  labels:
    app: aiml-training-job
    group: "{{.Group}}"
spec:
  serviceName: aiml-training-service
  replicas: {{.Replicas}}
  # All pods are started in parallel rather than one ordinal at a time.
  podManagementPolicy: Parallel
  selector:
    matchLabels:
      app: aiml-training-job
      group: "{{.Group}}"
  template:
    metadata:
      labels:
        app: aiml-training-job
        group: "{{.Group}}"
    spec:
      nodeSelector:
        purpose: ml-large
      containers:
        - name: app-with-awsapi
          image: 953421922360.dkr.ecr.us-west-2.amazonaws.com/aws-cli:2.27.49
          imagePullPolicy: IfNotPresent
          resources:
            requests:
              cpu: "{{.RequestCPU}}"
              memory: "{{.RequestMemory}}"
            limits:
              cpu: "{{.LimitCPU}}"
              memory: "{{.LimitMemory}}"
          env:
            # Read by the script below as the CloudWatch dimension value.
            - name: CLUSTER_NAME
              value: "{{.ClusterName}}"
          # The script is run with plain `sh`, so it sticks to POSIX shell
          # constructs (no bash-only `for ((...))` loops).
          command:
            - sh
            - -c
            - |
              #ToDo remove this once PIA is compliant with exit criteria
              sleep 3600
              AUTH_TOKEN=$(cat "$AWS_CONTAINER_AUTHORIZATION_TOKEN_FILE")
              MAX_ATTEMPTS=7
              INITIAL_DELAY=0.2 # 200ms
              # Latency clock starts here, i.e. after the initial sleep above.
              start_epoch=$(date +%s%3N)
              METRIC_MAX_RETRIES=3
              METRIC_RETRY_DELAY=1
              NAMESPACE=TitanApplicationLatencyForLargeSTS
              DIMENSION_NAME=ClusterName
              # Taken from the container env (same value as the ClusterName
              # placeholder) instead of interpolating the template into the
              # script unquoted.
              DIMENSION_VALUE="$CLUSTER_NAME"
              METRIC_LATENCY_NAME=TitanApplicationLatencyForLargeSTS

              echo "Starting credential fetch and S3 verification process..."

              # Fetch credentials from EKS Pod Identity agent with exponential backoff
              for i in $(seq 0 $((MAX_ATTEMPTS - 1))); do
                status_code=$(curl -s -o /dev/null -w "%{http_code}" --max-time 2 -H "Authorization: $AUTH_TOKEN" http://169.254.170.23/v1/credentials)
                if [ "$status_code" -eq 200 ]; then
                  echo "Successfully fetched credentials at attempt $i"
                  break
                fi

                if [ "$i" -eq $((MAX_ATTEMPTS - 1)) ]; then
                  echo "Failed to fetch credentials after $MAX_ATTEMPTS attempts. Exiting."
                  exit 1
                fi

                # Delay doubles every attempt: INITIAL_DELAY * 2^i. Computed
                # with awk, which the script already needs further down, so no
                # separate `bc` binary is required in the image.
                SLEEP_TIME=$(awk "BEGIN { print $INITIAL_DELAY * (2 ^ $i) }")
                echo "Credential fetch failed. Sleeping $SLEEP_TIME seconds before retry..."
                sleep "$SLEEP_TIME"
              done

              # Verify S3 access (retries every 5 seconds, without an upper bound)
              echo "Verifying S3 access..."
              while ! aws s3 ls; do
                echo "Waiting for S3 bucket access..."
                sleep 5
              done
              echo "S3 bucket is accessible, proceeding."

              # Calculate total latency for credential fetch + S3 verification
              end_epoch=$(date +%s%3N)
              latency_ms=$((end_epoch - start_epoch))
              latency_sec=$(awk "BEGIN { print $latency_ms / 1000 }")

              echo "Total operation latency: ${latency_sec} seconds (credential fetch + S3 verification)"

              # Send combined operation latency metric, retrying up to
              # METRIC_MAX_RETRIES times with a doubling delay.
              j=1
              while [ "$j" -le "$METRIC_MAX_RETRIES" ]; do
                if aws cloudwatch put-metric-data \
                  --namespace "$NAMESPACE" \
                  --metric-name "$METRIC_LATENCY_NAME" \
                  --dimensions "$DIMENSION_NAME=$DIMENSION_VALUE" \
                  --value "$latency_sec" \
                  --unit Seconds; then
                  echo "Metric $METRIC_LATENCY_NAME sent successfully with value: ${latency_sec}s"
                  break
                fi

                if [ "$j" -lt "$METRIC_MAX_RETRIES" ]; then
                  echo "Attempt $j failed. Retrying in $METRIC_RETRY_DELAY seconds..." >&2
                  sleep "$METRIC_RETRY_DELAY"
                  METRIC_RETRY_DELAY=$((METRIC_RETRY_DELAY * 2)) # exponential backoff
                else
                  echo "Failed to send metric $METRIC_LATENCY_NAME after $METRIC_MAX_RETRIES attempts." >&2
                  exit 1
                fi
                j=$((j + 1))
              done

              echo "Operation completed successfully. Keeping pod alive..."
              # Keep pod alive
              while true; do
                echo "Sleeping for 1 hour..."
                sleep 3600
              done
Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
# AI/ML workload test configuration (Kubernetes Jobs variant).
# Creates JOBS_COUNT Jobs with BATCH_SIZE parallelism each, then raises the
# parallelism in BATCH_SIZE increments by re-applying the Job template.

# Where the workload runs.
{{$NAMESPACE := DefaultParam .NAMESPACE "aiml-test"}}
{{$CLUSTER_NAME := DefaultParam .CLUSTER_NAME "perflab-titan-1"}}

# Workload shape: number of Jobs, completions per Job, parallelism increment.
{{$JOBS_COUNT := DefaultParam .JOBS_COUNT 10}}
{{$COMPLETIONS_PER_JOB := DefaultParam .COMPLETIONS_PER_JOB 10100}}
{{$BATCH_SIZE := DefaultParam .BATCH_SIZE 200}}

# Request rates consumed by the tuning sets.
{{$JOB_CREATION_QPS := DefaultParam .JOB_CREATION_QPS 100}}
{{$SCALING_QPS := DefaultParam .SCALING_QPS 100}}
{{$defaultQps := DefaultParam .CL2_DEFAULT_QPS 500}}
{{$defaultBurst := DefaultParam .CL2_DEFAULT_BURST 500}}

# Timing: WaitForControlledPodsRunning operation timeout and the pause (in
# seconds) inserted after every scaling batch.
# NOTE(review): JOB_COMPLETION_TIMEOUT is not referenced by the steps visible
# in this file - confirm whether it is still needed.
{{$JOB_COMPLETION_TIMEOUT := DefaultParam .JOB_COMPLETION_TIMEOUT "45m"}}
{{$CREATION_TIMEOUT := DefaultParam .CREATION_TIMEOUT "51m"}}
{{$SLEEP_DURATION := DefaultParam .SLEEP_DURATION 2}}

# Container resources passed to job-with-fsx.yaml through templateFillMap.
{{$REQUEST_CPU := DefaultParam .REQUEST_CPU "1000m"}}
{{$REQUEST_MEMORY := DefaultParam .REQUEST_MEMORY "128Mi"}}
{{$LIMIT_CPU := DefaultParam .LIMIT_CPU "1500m"}}
{{$LIMIT_MEMORY := DefaultParam .LIMIT_MEMORY "512Mi"}}
20+
21+
# Test name.
name: aws-mock-aiml-application-large-jobs

# Rate limiters; phases select one of them by name via `tuningSet`.
tuningSets:
  # Rate for the initial Job creation phase.
  - name: JobCreationQPS
    qpsLoad:
      qps: {{$JOB_CREATION_QPS}}
  # Rate for the per-batch parallelism updates.
  - name: ScalingQPS
    qpsLoad:
      qps: {{$SCALING_QPS}}
  # Global QPS/burst taken from CL2_DEFAULT_QPS / CL2_DEFAULT_BURST.
  - name: default
    globalQPSLoad:
      qps: {{$defaultQps}}
      burst: {{$defaultBurst}}
34+
35+
# Test flow: start measurements, create the Jobs at BATCH_SIZE parallelism,
# raise the parallelism batch by batch, then gather the measurements.
# Templated string values are quoted (as the templateFillMap entries already
# are) so that an override that is empty or looks like a number/boolean is not
# retyped by the YAML parser.
steps:
  # Start both measurements before the Jobs are created.
  - name: Starting measurements
    measurements:
      - Identifier: PodStartupLatency
        Method: PodStartupLatency
        Params:
          action: start
          labelSelector: app = aiml-training-job
          threshold: 5s
      - Identifier: WaitForRunningJobs
        Method: WaitForControlledPodsRunning
        Params:
          action: start
          apiVersion: batch/v1
          kind: Job
          labelSelector: app = aiml-training-job
          operationTimeout: "{{$CREATION_TIMEOUT}}"

  # Initial creation: every Job starts with Parallelism = BATCH_SIZE (batch 1).
  - name: Creating Jobs with initial parallelism
    phases:
      - namespaceRange:
          min: 1
          max: 1
          basename: "{{$NAMESPACE}}"
        replicasPerNamespace: {{$JOBS_COUNT}}
        tuningSet: JobCreationQPS
        objectBundle:
          - basename: aiml-training-job
            objectTemplatePath: job-with-fsx.yaml
            templateFillMap:
              Group: aiml-training-job
              Completions: {{$COMPLETIONS_PER_JOB}}
              Parallelism: {{$BATCH_SIZE}}
              RequestCPU: "{{$REQUEST_CPU}}"
              RequestMemory: "{{$REQUEST_MEMORY}}"
              LimitCPU: "{{$LIMIT_CPU}}"
              LimitMemory: "{{$LIMIT_MEMORY}}"
              ClusterName: "{{$CLUSTER_NAME}}"

  # Scaling loop. Batch 1 was created above, so the loop runs
  # (COMPLETIONS_PER_JOB / BATCH_SIZE) - 1 times. Each iteration re-applies
  # job-with-fsx.yaml (updateFromTemplate: true) with
  # Parallelism = BATCH_SIZE * ($batch + 2) and then sleeps SLEEP_DURATION
  # seconds. Step numbering starts at 2, which assumes Loop yields 0-based
  # indices - TODO confirm.
  # NOTE(review): assuming DivideInt truncates, the defaults (10100 / 200) give
  # 50 batches, so the last Parallelism is 10000, below COMPLETIONS_PER_JOB -
  # confirm this is intended.
{{ $numBatches := SubtractInt (DivideInt $COMPLETIONS_PER_JOB $BATCH_SIZE) 1 }}
{{range $batch := Loop $numBatches}}
  - name: Scaling Jobs batch {{AddInt $batch 2}}
    phases:
      - namespaceRange:
          min: 1
          max: 1
          basename: "{{$NAMESPACE}}"
        replicasPerNamespace: {{$JOBS_COUNT}}
        tuningSet: ScalingQPS
        objectBundle:
          - basename: aiml-training-job
            objectTemplatePath: job-with-fsx.yaml
            updateFromTemplate: true
            templateFillMap:
              Group: aiml-training-job
              Completions: {{$COMPLETIONS_PER_JOB}}
              Parallelism: {{MultiplyInt $BATCH_SIZE (AddInt $batch 2)}}
              RequestCPU: "{{$REQUEST_CPU}}"
              RequestMemory: "{{$REQUEST_MEMORY}}"
              LimitCPU: "{{$LIMIT_CPU}}"
              LimitMemory: "{{$LIMIT_MEMORY}}"
              ClusterName: "{{$CLUSTER_NAME}}"
  - name: Sleep after scaling batch {{AddInt $batch 2}}
    measurements:
      - Identifier: Wait
        Method: Sleep
        Params:
          duration: "{{$SLEEP_DURATION}}s"
{{end}}

  # Gather both measurements started in the first step.
  - name: Waiting for Jobs to be ready
    measurements:
      - Identifier: WaitForRunningJobs
        Method: WaitForControlledPodsRunning
        Params:
          action: gather
      - Identifier: PodStartupLatency
        Method: PodStartupLatency
        Params:
          action: gather
          labelSelector: app = aiml-training-job

0 commit comments

Comments
 (0)