Configure Go runtime defaults (GOMAXPROCS and GOMEMLIMIT) for every Cortex
component through a per-component *_env_map field, remove the mem-ballast
flags from the distributor and querier, and raise the querier CPU request to 2.
Also removes ingester_statefulset_args and ingester_statefulset_container from
cortex/ingester.libsonnet.

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d106e37..c84974e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,9 @@
 # Changelog
 
 ## master / unreleased
+* [CHANGE] Remove mem-ballast from distributor and querier.
+* [CHANGE] Increase cpu requests for querier to 2.
+* [CHANGE] Configure default GOMAXPROCS and GOMEMLIMIT for all cortex modules
 * [CHANGE] Add default tenant shard sizes
 * [CHANGE] Use cortex v1.15.3
 * [CHANGE] Azure storage endpoint suffix is set to `blob.core.windows.net` for backward compatibility
diff --git a/cortex/alertmanager.libsonnet b/cortex/alertmanager.libsonnet
index 480112d..4df2e77 100644
--- a/cortex/alertmanager.libsonnet
+++ b/cortex/alertmanager.libsonnet
@@ -96,6 +96,7 @@
     if $._config.alertmanager_enabled then
       container.new('alertmanager', $._images.alertmanager) +
       container.withPorts($.util.defaultPorts + mode.ports) +
+      container.withEnvMap($.alertmanager_env_map) +
       container.withEnvMixin([container.envType.fromFieldPath('POD_IP', 'status.podIP')]) +
       container.withArgsMixin(
         $.util.mapToFlags($.alertmanager_args) +
@@ -112,6 +113,11 @@
       $.jaeger_mixin
     else {},
 
+  alertmanager_env_map:: {
+    GOMAXPROCS: '1',
+    GOMEMLIMIT: '1GiB',
+  },
+
   alertmanager_statefulset:
     if $._config.alertmanager_enabled then
       statefulSet.new('alertmanager', $._config.alertmanager.replicas, [$.alertmanager_container], $.alertmanager_pvc) +
diff --git a/cortex/compactor.libsonnet b/cortex/compactor.libsonnet
index 03df1ab..9edfcdc 100644
--- a/cortex/compactor.libsonnet
+++ b/cortex/compactor.libsonnet
@@ -43,6 +43,7 @@
     container.new('compactor', $._images.compactor) +
     container.withPorts($.compactor_ports) +
     container.withArgsMixin($.util.mapToFlags($.compactor_args)) +
+    container.withEnvMap($.compactor_env_map) +
     container.withVolumeMountsMixin([volumeMount.new('compactor-data', '/data')]) +
     // Do not limit compactor CPU and request enough cores to honor configured max concurrency.
     $.util.resourcesRequests($._config.cortex_compactor_max_concurrency, '6Gi') +
@@ -50,6 +51,11 @@
     $.util.readinessProbe +
     $.jaeger_mixin,
 
+  compactor_env_map:: {
+    GOMAXPROCS: std.toString($._config.cortex_compactor_max_concurrency),
+    GOMEMLIMIT: '6GiB',
+  },
+
   newCompactorStatefulSet(name, container)::
     statefulSet.new(name, 1, [container], compactor_data_pvc) +
     statefulSet.mixin.spec.withServiceName(name) +
diff --git a/cortex/distributor.libsonnet b/cortex/distributor.libsonnet
index c2bcfe0..86a17e2 100644
--- a/cortex/distributor.libsonnet
+++ b/cortex/distributor.libsonnet
@@ -20,11 +20,6 @@
     'distributor.ha-tracker.etcd.endpoints': 'etcd-client.%s.svc.cluster.local.:2379' % $._config.namespace,
     'distributor.ha-tracker.prefix': 'prom_ha/',
 
-    // The memory requests are 2G, and we barely use 100M.
-    // By adding a ballast of 1G, we can drastically reduce GC, but also keep the usage at
-    // around 1.25G, reducing the 99%ile.
-    'mem-ballast-size-bytes': 1 << 30,  // 1GB
-
     'server.grpc.keepalive.max-connection-age': '2m',
     'server.grpc.keepalive.max-connection-age-grace': '5m',
     'server.grpc.keepalive.max-connection-idle': '1m',
@@ -38,12 +33,18 @@
     'distributor.extend-writes': $._config.unregister_ingesters_on_shutdown,
   },
 
+  distributor_env_map:: {
+    GOMAXPROCS: '2',
+    GOMEMLIMIT: '2GiB',
+  },
+
   distributor_ports:: $.util.defaultPorts,
 
   distributor_container::
     container.new('distributor', $._images.distributor) +
     container.withPorts($.distributor_ports) +
     container.withArgsMixin($.util.mapToFlags($.distributor_args)) +
+    container.withEnvMap($.distributor_env_map) +
     $.util.resourcesRequests('2', '2Gi') +
     $.util.resourcesLimits(null, '4Gi') +
     $.util.readinessProbe +
diff --git a/cortex/flusher-job-blocks.libsonnet b/cortex/flusher-job-blocks.libsonnet
index 1e6266c..6917a86 100644
--- a/cortex/flusher-job-blocks.libsonnet
+++ b/cortex/flusher-job-blocks.libsonnet
@@ -21,11 +21,17 @@
       target: 'flusher',
       'blocks-storage.tsdb.retention-period': '10000h',  // don't delete old blocks too soon.
     })) +
+    container.withEnvMap($.flusher_env_map) +
     $.util.resourcesRequests('4', '15Gi') +
     $.util.resourcesLimits(null, '25Gi') +
     $.util.readinessProbe +
     $.jaeger_mixin,
 
+  flusher_env_map:: {
+    GOMAXPROCS: '4',
+    GOMEMLIMIT: '15GiB',
+  },
+
   flusher_job_func(jobName, pvcName)::
     job.new() +
     job.mixin.spec.template.spec.withContainers([
diff --git a/cortex/ingester.libsonnet b/cortex/ingester.libsonnet
index 818716e..7994589 100644
--- a/cortex/ingester.libsonnet
+++ b/cortex/ingester.libsonnet
@@ -3,6 +3,7 @@
   local pvc = $.core.v1.persistentVolumeClaim,
   local statefulSet = $.apps.v1.statefulSet,
   local volume = $.core.v1.volume,
+  local volumeMount = $.core.v1.volumeMount,
 
   // The ingesters should persist TSDB blocks and WAL on a persistent
   // volume in order to be crash resilient.
@@ -44,18 +45,6 @@
     'ingester.tokens-file-path': '/data/tokens',
   },
 
-  ingester_statefulset_args::
-    $._config.grpcConfig
-    {
-      'ingester.wal-enabled': true,
-      'ingester.checkpoint-enabled': true,
-      'ingester.recover-from-wal': true,
-      'ingester.wal-dir': $._config.ingester.wal_dir,
-      'ingester.checkpoint-duration': '15m',
-      '-log.level': 'info',
-      'ingester.tokens-file-path': $._config.ingester.wal_dir + '/tokens',
-    },
-
   ingester_ports:: $.util.defaultPorts,
 
   local name = 'ingester',
@@ -65,22 +54,19 @@
     container.new(name, $._images.ingester) +
     container.withPorts($.ingester_ports) +
     container.withArgsMixin($.util.mapToFlags($.ingester_args)) +
+    container.withEnvMap($.ingester_env_map) +
     $.util.resourcesRequests('4', '15Gi') +
     $.util.resourcesLimits(null, '25Gi') +
     $.util.readinessProbe +
     $.jaeger_mixin,
 
-  local volumeMount = $.core.v1.volumeMount,
-
-  ingester_statefulset_container::
-    $.ingester_container +
-    container.withArgsMixin($.util.mapToFlags($.ingester_statefulset_args)) +
-    container.withVolumeMountsMixin([
-      volumeMount.new('ingester-pvc', $._config.ingester.wal_dir),
-    ]),
-
   ingester_deployment_labels:: {},
 
+  ingester_env_map:: {
+    GOMAXPROCS: '4',
+    GOMEMLIMIT: '15GiB',
+  },
+
   local ingester_pvc =
     pvc.new('ingester-pvc') +
     pvc.mixin.spec.resources.withRequests({ storage: $._config.ingester.statefulset_disk }) +
diff --git a/cortex/querier.libsonnet b/cortex/querier.libsonnet
index 6ebe85f..e5cb82e 100644
--- a/cortex/querier.libsonnet
+++ b/cortex/querier.libsonnet
@@ -26,16 +26,14 @@
     'querier.frontend-address': 'query-frontend-discovery.%(namespace)s.svc.cluster.local:9095' % $._config,
     'querier.frontend-client.grpc-max-send-msg-size': 100 << 20,
 
-    // We request high memory but the Go heap is typically very low (< 100MB) and this causes
-    // the GC to trigger continuously. Setting a ballast of 256MB reduces GC.
-    'mem-ballast-size-bytes': 1 << 28,  // 256M
-
     'log.level': 'debug',
   },
 
   querier_ports:: $.util.defaultPorts,
 
   querier_env_map:: {
+    GOMAXPROCS: '2',
+    GOMEMLIMIT: '12GiB',  // Go only accepts the B/KiB/MiB/GiB/TiB suffixes; a bare 'Gi' is rejected as malformed.
     JAEGER_REPORTER_MAX_QUEUE_SIZE: '1024',  // Default is 100.
   },
 
@@ -46,7 +44,7 @@
     $.jaeger_mixin +
     $.util.readinessProbe +
     container.withEnvMap($.querier_env_map) +
-    $.util.resourcesRequests('1', '12Gi') +
+    $.util.resourcesRequests('2', '12Gi') +
     $.util.resourcesLimits(null, '24Gi'),
 
   local deployment = $.apps.v1.deployment,
diff --git a/cortex/query-frontend.libsonnet b/cortex/query-frontend.libsonnet
index 80f36d0..39d4f6d 100644
--- a/cortex/query-frontend.libsonnet
+++ b/cortex/query-frontend.libsonnet
@@ -42,11 +42,17 @@
     container.new('query-frontend', $._images.query_frontend) +
     container.withPorts($.util.defaultPorts) +
     container.withArgsMixin($.util.mapToFlags($.query_frontend_args)) +
+    container.withEnvMap($.query_frontend_env_map) +
     $.jaeger_mixin +
     $.util.readinessProbe +
     $.util.resourcesRequests('2', '600Mi') +
     $.util.resourcesLimits(null, '1200Mi'),
 
+  query_frontend_env_map:: {
+    GOMAXPROCS: '2',
+    GOMEMLIMIT: '600MiB',
+  },
+
   local deployment = $.apps.v1.deployment,
 
   newQueryFrontendDeployment(name, container)::
diff --git a/cortex/query-scheduler.libsonnet b/cortex/query-scheduler.libsonnet
index 604d258..b0a60a5 100644
--- a/cortex/query-scheduler.libsonnet
+++ b/cortex/query-scheduler.libsonnet
@@ -17,6 +17,7 @@
     container.new('query-scheduler', $._images.query_scheduler) +
     container.withPorts($.util.defaultPorts) +
     container.withArgsMixin($.util.mapToFlags($.query_scheduler_args)) +
+    container.withEnvMap($.query_scheduler_env_map) +
     $.jaeger_mixin +
     $.util.readinessProbe +
     $.util.resourcesRequests('2', '1Gi') +
@@ -30,6 +31,11 @@
     deployment.mixin.spec.strategy.rollingUpdate.withMaxSurge(0) +
     deployment.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1),
 
+  query_scheduler_env_map:: {
+    GOMAXPROCS: '2',
+    GOMEMLIMIT: '1GiB',
+  },
+
   query_scheduler_deployment: if !$._config.query_scheduler_enabled then {} else
     self.newQuerySchedulerDeployment('query-scheduler', $.query_scheduler_container),
 
diff --git a/cortex/query-tee.libsonnet b/cortex/query-tee.libsonnet
index 4ac3b0a..0e1250c 100644
--- a/cortex/query-tee.libsonnet
+++ b/cortex/query-tee.libsonnet
@@ -18,9 +18,15 @@
       containerPort.newNamed(name='http-metrics', containerPort=9900),
     ]) +
     container.withArgsMixin($.util.mapToFlags($.query_tee_args)) +
+    container.withEnvMap($.query_tee_env_map) +
     $.util.resourcesRequests('1', '512Mi') +
     $.jaeger_mixin,
 
+  query_tee_env_map:: {
+    GOMAXPROCS: '1',
+    GOMEMLIMIT: '512MiB',
+  },
+
   query_tee_deployment: if !($._config.query_tee_enabled) then {} else
     deployment.new('query-tee', 2, [$.query_tee_container]),
 
diff --git a/cortex/ruler.libsonnet b/cortex/ruler.libsonnet
index cfb0252..1688ca6 100644
--- a/cortex/ruler.libsonnet
+++ b/cortex/ruler.libsonnet
@@ -38,6 +38,7 @@
     container.new('ruler', $._images.ruler) +
     container.withPorts($.util.defaultPorts) +
     container.withArgsMixin($.util.mapToFlags($.ruler_args)) +
+    container.withEnvMap($.ruler_env_map) +
     $.util.resourcesRequests('1', '6Gi') +
     $.util.resourcesLimits('16', '16Gi') +
     $.util.readinessProbe +
@@ -56,6 +57,11 @@
       $.util.configVolumeMount($._config.overrides_configmap, '/etc/cortex')
     else {},
 
+  ruler_env_map:: {
+    GOMAXPROCS: '2',
+    GOMEMLIMIT: '6GiB',
+  },
+
   local service = $.core.v1.service,
 
   ruler_service:
diff --git a/cortex/store-gateway.libsonnet b/cortex/store-gateway.libsonnet
index cea6308..757d9b6 100644
--- a/cortex/store-gateway.libsonnet
+++ b/cortex/store-gateway.libsonnet
@@ -40,12 +40,18 @@
     container.new('store-gateway', $._images.store_gateway) +
     container.withPorts($.store_gateway_ports) +
     container.withArgsMixin($.util.mapToFlags($.store_gateway_args)) +
+    container.withEnvMap($.store_gateway_env_map) +
     container.withVolumeMountsMixin([volumeMount.new('store-gateway-data', '/data')]) +
     $.util.resourcesRequests('1', '12Gi') +
     $.util.resourcesLimits(null, '18Gi') +
     $.util.readinessProbe +
     $.jaeger_mixin,
 
+  store_gateway_env_map:: {
+    GOMAXPROCS: '2',
+    GOMEMLIMIT: '12GiB',
+  },
+
   newStoreGatewayStatefulSet(name, container)::
     statefulSet.new(name, 3, [container], store_gateway_data_pvc) +
     statefulSet.mixin.spec.withServiceName(name) +
diff --git a/cortex/test-exporter.libsonnet b/cortex/test-exporter.libsonnet
index 9d69abe..036d6fe 100644
--- a/cortex/test-exporter.libsonnet
+++ b/cortex/test-exporter.libsonnet
@@ -18,10 +18,16 @@
     container.new('test-exporter', $._images.testExporter) +
     container.withPorts($.util.defaultPorts) +
     container.withArgsMixin($.util.mapToFlags($.test_exporter_args)) +
+    container.withEnvMap($.test_exporter_env_map) +
     $.util.resourcesRequests('100m', '100Mi') +
     $.util.resourcesLimits('100m', '100Mi') +
     $.jaeger_mixin,
 
+  test_exporter_env_map:: {
+    GOMAXPROCS: '1',
+    GOMEMLIMIT: '100MiB',
+  },
+
   local deployment = $.apps.v1.deployment,
 
   test_exporter_deployment: