83 changes: 83 additions & 0 deletions tests/assets/eks-networking/config-eks-networking.yaml
@@ -0,0 +1,83 @@
# Constants
{{$NODES_PER_NAMESPACE := MinInt .Nodes (DefaultParam .NODES_PER_NAMESPACE 100)}}
{{$IS_SMALL_CLUSTER := lt .Nodes 100}}
{{$PODS_PER_NODE := DefaultParam .PODS_PER_NODE 30}}
{{$ENABLE_NETWORK_POLICY_ENFORCEMENT_LATENCY_TEST := DefaultParam .CL2_ENABLE_NETWORK_POLICY_ENFORCEMENT_LATENCY_TEST false}}
# Variables
{{$namespaces := DivideInt .Nodes $NODES_PER_NAMESPACE}}
{{$totalPods := MultiplyInt $namespaces $NODES_PER_NAMESPACE $PODS_PER_NODE}}
{{$podsPerNamespace := DivideInt $totalPods $namespaces}}
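# Example (illustrative only): with .Nodes=1000 and the defaults above,
# $namespaces = 1000/100 = 10, $totalPods = 10*100*30 = 30000, $podsPerNamespace = 30000/10 = 3000.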
# Command to be executed
{{$EXEC_COMMAND := DefaultParam .CL2_EXEC_COMMAND nil}}
{{$EXIT_AFTER_EXEC := DefaultParam .CL2_EXIT_AFTER_EXEC false}}
{{$EXEC_TIMEOUT := DefaultParam .CL2_EXEC_TIMEOUT "3600s"}}
{{$SLEEP_AFTER_EXEC_DURATION := DefaultParam .CL2_SLEEP_AFTER_EXEC_DURATION "0s"}}

{{$registry := DefaultParam .CL2_LATENCY_POD_REGISTRY "registry.k8s.io"}}
{{$latencyPodImage := DefaultParam .CL2_LATENCY_POD_IMAGE (Concat $registry "/pause:3.9")}}
{{$defaultQps := DefaultParam .CL2_DEFAULT_QPS (IfThenElse (le .Nodes 500) 10 100)}}
{{$uniformQps := DefaultParam .CL2_UNIFORM_QPS 500}}

name: load-eks-networking
namespace:
  number: {{$namespaces}}
tuningSets:
- name: Sequence
  parallelismLimitedLoad:
    parallelismLimit: 1
- name: UniformQPS
  qpsLoad:
    qps: {{$uniformQps}}
- name: default
  globalQPSLoad:
    qps: {{$defaultQps}}
    burst: 1
steps:
- module:
    path: /modules/measurements.yaml
    params:
      action: start
{{if $ENABLE_NETWORK_POLICY_ENFORCEMENT_LATENCY_TEST}}
- module:
    path: modules/network-policy/net-policy-enforcement-latency.yaml
Contributor:
where are these modules ?

Contributor:
I see that you are copying this config into the cl2 load test dir in this task - https://github.com/awslabs/kubernetes-iteration-toolkit/pull/536/files#diff-fc65d141840f1a98569538f9a751871bc06280084dec75e8e1777f47f429814dR175.

Could you add an explicit comment here saying that this file is copied to a relative path under the cl2 load test dir? By default this won't work if the config file is sitting in some other directory.

Contributor Author:
Added a comment. The test needs to access modules under the cl2 folder, so copying the config into the cl2 directory is cleaner.

    params:
      setup: true
      run: true
      testType: "pod-creation"
{{end}}
- module:
    path: modules/dns-k8s-hostnames.yaml
- name: Sleep
  measurements:
  - Identifier: WaitAfterExec
    Method: Sleep
    Params:
      duration: {{$SLEEP_AFTER_EXEC_DURATION}}
{{if $ENABLE_NETWORK_POLICY_ENFORCEMENT_LATENCY_TEST}}
- module:
    path: modules/network-policy/net-policy-metrics.yaml
    params:
      action: gather
      usePolicyCreationMetrics: false
- module:
    path: modules/network-policy/net-policy-enforcement-latency.yaml
    params:
      complete: true
      testType: "pod-creation"
- module:
    path: modules/network-policy/net-policy-enforcement-latency.yaml
    params:
      run: true
      testType: "policy-creation"
{{end}}
- module:
    path: /modules/measurements.yaml
    params:
      action: gather
{{if $ENABLE_NETWORK_POLICY_ENFORCEMENT_LATENCY_TEST}}
- module:
    path: modules/network-policy/net-policy-enforcement-latency.yaml
    params:
      complete: true
      testType: "policy-creation"
{{end}}
40 changes: 40 additions & 0 deletions tests/assets/eks-networking/test-svc.yaml
@@ -0,0 +1,40 @@
apiVersion: v1
kind: Namespace
metadata:
  name: test-svc
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: test-svc-deployment
  namespace: test-svc
spec:
  replicas: 5000
Contributor:
do you want to make this configurable ?

Contributor (@hakuna-matatah, Jul 25, 2025):
We don't have to block on this for this PR if you want, but you may want to keep this configurable to test at different scales.

Contributor Author:
Yeah, I would prefer to take this as a TODO. I'm thinking of 2 options:

  1. Migrate from a Deployment to a DaemonSet, enabling us to test endpoints across all nodes (where the number of endpoints = node count); see the rough sketch below.
  2. Implement a configurable setup with m services, each containing k endpoints, where both parameters are configurable.

I'd like to evaluate these 2 options further with an actual cluster that we will use for all the testing, and decide on the better option (less time costly, but still stress-tests kube-proxy).
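For reference, a rough, untested sketch of option 1 (not part of this PR; the name is illustrative): a DaemonSet of pause pods carrying the same app: test-svc label, so the endpoint count tracks the node count.

apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: test-svc-ds
  namespace: test-svc
spec:
  selector:
    matchLabels:
      app: test-svc
  template:
    metadata:
      labels:
        app: test-svc
    spec:
      containers:
      - name: pause
        image: public.ecr.aws/eks-distro/kubernetes/pause:3.9
        ports:
        - containerPort: 8080
          name: http

With this, the existing test-svc Service would pick up one endpoint per node instead of a fixed 5000 replicas.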

  selector:
    matchLabels:
      app: test-svc
  template:
    metadata:
      labels:
        app: test-svc
    spec:
      containers:
      - name: pause
        image: public.ecr.aws/eks-distro/kubernetes/pause:3.9
        ports:
        - containerPort: 8080
          name: http
---
apiVersion: v1
kind: Service
metadata:
  name: test-svc
  namespace: test-svc
spec:
  type: ClusterIP
  selector:
    app: test-svc
  ports:
  - name: http
    port: 80
    targetPort: http
@@ -0,0 +1,205 @@
---
apiVersion: tekton.dev/v1beta1
kind: Task
metadata:
  name: load-networking
  namespace: scalability
spec:
  description: "clusterloader2 task to run various types of cl2 tests on a given cluster."
  params:
    - name: giturl
      description: "git url to clone the package"
      default: https://github.com/mengqiy/perf-tests.git
    - name: cl2-branch
      description: "The branch of clusterloader2 you want to use"
      default: "master"
    - name: nodes-per-namespace
      description: "nodes per namespace to get created for load test"
      default: "100"
    - name: cl2-load-test-throughput
      description: "throughput used for mutate operations"
      default: "15"
    - name: pods-per-node
      description: "pod density"
      default: "10"
    - name: nodes
      description: "number of dataplane nodes to run the load test against"
      default: "1000"
    - name: results-bucket
      description: "Results bucket with path of s3 to upload results"
    - name: region
      default: "us-west-2"
      description: The region where the cluster is in.
    - name: cluster-name
      description: The name of the EKS cluster you want to spin.
    - name: amp-workspace-id
      description: The AMP workspace ID where remote write needs to happen.
      default: ""
    - name: networking-test-config-url
      default: "https://raw.githubusercontent.com/oliviassss/kubernetes-iteration-toolkit/refs/heads/sonyingy-dev/tests/assets/eks-networking/config-eks-networking.yaml"
    - name: networking-svc-test-config-url
      default: "https://raw.githubusercontent.com/oliviassss/kubernetes-iteration-toolkit/refs/heads/sonyingy-dev/tests/assets/eks-networking/test-svc.yaml"
  results:
    - name: datapoint
      description: Stores the CL2 result that can be consumed by other tasks (e.g. cloudwatch)
    - name: s3_result
      description: Stores the S3 result path after compute
  workspaces:
    - name: source
      mountPath: /src/k8s.io/
    - name: results
    - name: config
      mountPath: /config/
  stepTemplate:
    env:
      - name: KUBECONFIG
        value: /config/kubeconfig
  steps:
    - name: git-clone
      image: alpine/git
      workingDir: $(workspaces.source.path)
      script: |
        git clone $(params.giturl)
        cd $(workspaces.source.path)/perf-tests/
        git fetch origin --verbose --tags
        git checkout $(params.cl2-branch)
        git branch
    - name: prepare-loadtest
      image: golang:1.24
      workingDir: $(workspaces.source.path)
      script: |
        S3_RESULT_PATH=$(params.results-bucket)
        echo $S3_RESULT_PATH > $(results.s3_result.path)
        echo "S3 Path: $S3_RESULT_PATH"
        cat > "$(workspaces.source.path)/overrides.yaml" <<EOL
        NODES_PER_NAMESPACE: $(params.nodes-per-namespace)
        PODS_PER_NODE: $(params.pods-per-node)
        NODE_MODE: master
        # DNS test settings
        CL2_ENABLE_DNSTESTS: true
        CL2_USE_ADVANCED_DNSTEST: true

        # Note: default settings for the DNS client pods.
        # By default the tester creates 5 DNS client pods;
        # additionally, 1 extra DNS client pod is created for every 100 nodes in the cluster.
        CL2_DNS_K8S_HOSTNAMES_PER_CLIENT_QPS: 10
        CL2_DNS_K8S_HOSTNAMES_CLIENT_PODS_FACTOR: 1
        CL2_DNS_K8S_HOSTNAMES_TEST_MINUTES: 1
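        # Example (illustrative, based on the note above): a 1000-node cluster would run
        # roughly 5 + 1000/100 = 15 DNS client pods, each issuing 10 QPS for 1 minute.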

        # DNS SLOs; ignore DNS errors for now since dnsperfgo queries non-existent FQDNs
        CL2_DNS_LOOKUP_LATENCY_99_THRESHOLD: 60
        DNS_ERROR_PERC_THRESHOLD: 100

        # KubeProxy SLOs
        CL2_ENABLE_VIOLATIONS_FOR_KUBEPROXY_PROGRAMMING_LATENCIES: true
        CL2_NETWORK_LATENCY_THRESHOLD: 20s
        CL2_NETWORK_PROGRAMMING_LATENCY_THRESHOLD: 300s

        CL2_PROMETHEUS_NODE_SELECTOR: "eks.amazonaws.com/nodegroup: monitoring-$(params.cluster-name)-nodes-1"
        CL2_PROMETHEUS_MEMORY_SCALE_FACTOR: 4
        EOL
        cat $(workspaces.source.path)/overrides.yaml
        cp $(workspaces.source.path)/overrides.yaml $(workspaces.results.path)/overrides.yaml

        # Enable Prometheus remote write if the AMP workspace id is provided
        if [ -n "$(params.amp-workspace-id)" ]; then
        cat << EOF >> $(workspaces.source.path)/perf-tests/clusterloader2/pkg/prometheus/manifests/prometheus-prometheus.yaml
          containers:
          - name: aws-sigv4-proxy-sidecar
            image: public.ecr.aws/aws-observability/aws-sigv4-proxy:1.0
            args:
            - --name
            - aps
            - --region
            - $(params.region)
            - --host
            - aps-workspaces.$(params.region).amazonaws.com
            - --port
            - :8005
            ports:
            - name: aws-sigv4-proxy
              containerPort: 8005
          remoteWrite:
          - url: http://localhost:8005/workspaces/$(params.amp-workspace-id)/api/v1/remote_write
            queueConfig:
              capacity: 2500
              maxSamplesPerSend: 1000
              maxShards: 200
          externalLabels:
            cluster_name: $(params.cluster-name)
            s3_path: $S3_RESULT_PATH
        EOF
        cat $(workspaces.source.path)/perf-tests/clusterloader2/pkg/prometheus/manifests/prometheus-prometheus.yaml
        cat << EOF >> $(workspaces.source.path)/perf-tests/clusterloader2/pkg/prometheus/manifests/0prometheus-operator-deployment.yaml
              tolerations:
                - key: monitoring
                  operator: Exists
                  effect: NoSchedule
        EOF
        cat $(workspaces.source.path)/perf-tests/clusterloader2/pkg/prometheus/manifests/0prometheus-operator-deployment.yaml
        # schedule the kube-state-metrics pod onto the same node as prometheus
        cat $(workspaces.source.path)/perf-tests/clusterloader2/pkg/prometheus/manifests/exporters/kube-state-metrics/deployment.yaml
        cat << EOF >> $(workspaces.source.path)/perf-tests/clusterloader2/pkg/prometheus/manifests/exporters/kube-state-metrics/deployment.yaml
              tolerations:
                - key: monitoring
                  operator: Exists
                  effect: NoSchedule
        EOF
        cat $(workspaces.source.path)/perf-tests/clusterloader2/pkg/prometheus/manifests/exporters/kube-state-metrics/deployment.yaml

        # TODO: Remove this once we fix https://github.com/kubernetes/kubernetes/issues/126578 or find a better way to work around it.
Contributor:
not sure i get this ?

Contributor Author:
I'm not sure if we still need this; it looks like the issue with the endpoint controller has been fixed upstream. Removing the CoreDNS service monitor would cause Prometheus to stop scraping CoreDNS metrics, so I commented it out.

        # rm $(workspaces.source.path)/perf-tests/clusterloader2/pkg/prometheus/manifests/default/prometheus-serviceMonitorCoreDNS.yaml
        # rm $(workspaces.source.path)/perf-tests/clusterloader2/pkg/prometheus/manifests/default/prometheus-serviceMonitorLegacyKubeDNS.yaml

        fi
        # Building clusterloader2 binary
        cd $(workspaces.source.path)/perf-tests/clusterloader2/
        GOOS=linux CGO_ENABLED=0 go build -v -o ./clusterloader ./cmd
    - name: run-loadtest
      image: alpine/k8s:1.30.2
      onError: continue
      script: |
        #!/bin/bash
        if [ -n "$(params.amp-workspace-id)" ]; then
          # Enable prometheus flags
          export ENABLE_PROMETHEUS_SERVER=true
          export PROMETHEUS_PVC_STORAGE_CLASS=gp2
          export PROMETHEUS_SCRAPE_KUBE_PROXY=true
          export PROMETHEUS_SCRAPE_APISERVER_ONLY=true
          export PROMETHEUS_SCRAPE_KUBE_STATE_METRICS=false
          export PROMETHEUS_KUBE_PROXY_SELECTOR_KEY=k8s-app
          export PROMETHEUS_MEMORY_REQUEST=16Gi
        fi

        # prepare eks networking load test config
        curl -s $(params.networking-test-config-url) -o $(workspaces.source.path)/perf-tests/clusterloader2/testing/load/config-eks-networking.yaml
        curl -s $(params.networking-svc-test-config-url) -o $(workspaces.source.path)/perf-tests/clusterloader2/testing/load/test-svc.yaml
        cat $(workspaces.source.path)/perf-tests/clusterloader2/testing/load/config-eks-networking.yaml
        cat $(workspaces.source.path)/perf-tests/clusterloader2/testing/load/test-svc.yaml
        cd $(workspaces.source.path)/perf-tests/clusterloader2/

        # create the service backed by 5k pods to test kubeproxy network programming performance
        # we can tune the scale of pods later
        kubectl apply -f $(workspaces.source.path)/perf-tests/clusterloader2/testing/load/test-svc.yaml
Contributor:
why are you creating the workload even before test is kicked off ?

Contributor Author (@oliviassss, Jul 28, 2025):
It's better to create the service with endpoints before the clusterloader binary runs, since clusterloader only collects kube-proxy metrics.

The svc creation itself will trigger kube-proxy to sync the network programming rules and generate the latency metrics (the metric measures the time gap between the endpoint creation timestamp and the time kube-proxy finishes its work, so it's better to create it before clusterloader collects the metrics).

        kubectl rollout status deployment/test-svc-deployment -n test-svc --timeout=300s
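        # Creating the Service and waiting for the rollout before clusterloader starts matters:
        # the Service/endpoint creation is what triggers kube-proxy to sync its rules and emit
        # the network programming latency samples (time between endpoint creation and kube-proxy
        # finishing), while clusterloader below only gathers those kube-proxy metrics.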

        # run the CL2 test suite for dns performance test
        ENABLE_EXEC_SERVICE=false ./clusterloader --kubeconfig=$KUBECONFIG --testconfig=$(workspaces.source.path)/perf-tests/clusterloader2/testing/load/config-eks-networking.yaml --testoverrides=$(workspaces.source.path)/overrides.yaml --nodes=$(params.nodes) --provider=eks --report-dir=$(workspaces.results.path) --alsologtostderr --v=2
        exit_code=$?
        if [ $exit_code -eq 0 ]; then
          echo "1" | tee $(results.datapoint.path)
        else
          echo "0" | tee $(results.datapoint.path)
        fi
        exit $exit_code
      timeout: 30000s
    - name: upload-results
      image: amazon/aws-cli
      workingDir: $(workspaces.results.path)
      script: |
        S3_RESULT_PATH=$(cat $(results.s3_result.path))
        echo "S3 Path: $S3_RESULT_PATH"
        aws sts get-caller-identity
        # we expect to see all files from loadtest that clusterloader2 outputs here in this dir
        ls -larth
        aws s3 cp . s3://$S3_RESULT_PATH/ --recursive