From 6462e9de6752a729518b3fe63b340cbcd14adb19 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Mon, 29 Aug 2022 12:16:46 -0700 Subject: [PATCH] runc update: implement memory.checkBeforeUpdate This is aimed at solving the problem of cgroup v2 memory controller behavior which is not compatible with that of cgroup v1. In cgroup v1, if the new memory limit being set is lower than the current usage, setting the new limit fails. In cgroup v2, same operation succeeds, and the container is OOM killed. Introduce a new setting, memory.checkBeforeUpdate, and use it to mimic cgroup v1 behavior. Note that this is not 100% reliable because of TOCTOU, but this is the best we can do. Add some test cases. Signed-off-by: Kir Kolyshkin --- go.mod | 2 +- go.sum | 4 +- libcontainer/cgroups/fs2/fs2.go | 32 +++++++++++++++ libcontainer/cgroups/fs2/memory.go | 5 +++ libcontainer/cgroups/systemd/v2.go | 11 ++++- libcontainer/configs/cgroup_linux.go | 5 +++ libcontainer/specconv/spec_linux.go | 3 ++ tests/integration/update.bats | 40 +++++++++++++++++++ update.go | 16 +++++--- .../runtime-spec/specs-go/config.go | 10 +++++ vendor/modules.txt | 2 +- 11 files changed, 118 insertions(+), 12 deletions(-) diff --git a/go.mod b/go.mod index 08c1dda6bf9..6aba3012de6 100644 --- a/go.mod +++ b/go.mod @@ -12,7 +12,7 @@ require ( github.com/godbus/dbus/v5 v5.1.0 github.com/moby/sys/mountinfo v0.6.2 github.com/mrunalp/fileutils v0.5.0 - github.com/opencontainers/runtime-spec v1.0.3-0.20220718201635-a8106e99982b + github.com/opencontainers/runtime-spec v1.0.3-0.20220909204839-494a5a6aca78 github.com/opencontainers/selinux v1.10.2 github.com/seccomp/libseccomp-golang v0.10.0 github.com/sirupsen/logrus v1.9.0 diff --git a/go.sum b/go.sum index ca21b0937f0..8a399d74f7d 100644 --- a/go.sum +++ b/go.sum @@ -31,8 +31,8 @@ github.com/moby/sys/mountinfo v0.6.2 h1:BzJjoreD5BMFNmD9Rus6gdd1pLuecOFPt8wC+Vyg github.com/moby/sys/mountinfo v0.6.2/go.mod h1:IJb6JQeOklcdMU9F5xQ8ZALD+CUr5VlGpwtX+VE0rpI= github.com/mrunalp/fileutils v0.5.0 h1:NKzVxiH7eSk+OQ4M+ZYW1K6h27RUV3MI6NUTsHhU6Z4= github.com/mrunalp/fileutils v0.5.0/go.mod h1:M1WthSahJixYnrXQl/DFQuteStB1weuxD2QJNHXfbSQ= -github.com/opencontainers/runtime-spec v1.0.3-0.20220718201635-a8106e99982b h1:udwtfS44rxYE/ViMLchHQBjfE60GZSB1arY7BFbyxLs= -github.com/opencontainers/runtime-spec v1.0.3-0.20220718201635-a8106e99982b/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0= +github.com/opencontainers/runtime-spec v1.0.3-0.20220909204839-494a5a6aca78 h1:R5M2qXZiK/mWPMT4VldCOiSL9HIAMuxQZWdG0CSM5+4= +github.com/opencontainers/runtime-spec v1.0.3-0.20220909204839-494a5a6aca78/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0= github.com/opencontainers/selinux v1.10.2 h1:NFy2xCsjn7+WspbfZkUd5zyVeisV7VFbPSP96+8/ha4= github.com/opencontainers/selinux v1.10.2/go.mod h1:cARutUbaUrlRClyvxOICCgKixCs6L05aUsohzA3EkHQ= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= diff --git a/libcontainer/cgroups/fs2/fs2.go b/libcontainer/cgroups/fs2/fs2.go index 7d366d0f3b9..42b5bcb60cf 100644 --- a/libcontainer/cgroups/fs2/fs2.go +++ b/libcontainer/cgroups/fs2/fs2.go @@ -269,3 +269,35 @@ func (m *Manager) OOMKillCount() (uint64, error) { return c, err } + +func CheckMemoryUsage(dirPath string, r *configs.Resources) error { + if !r.MemoryCheckBeforeUpdate { + return nil + } + + if r.Memory <= 0 && r.MemorySwap <= 0 { + return nil + } + + usage, err := fscommon.GetCgroupParamUint(dirPath, "memory.current") + if err != nil { + // This check is on best-effort basis, so if we can't read the + // current usage (cgroup not yet created, or any other error), + // we should not fail. + return nil + } + + if r.MemorySwap > 0 { + if uint64(r.MemorySwap) <= usage { + return fmt.Errorf("rejecting memory+swap limit %d <= usage %d", r.MemorySwap, usage) + } + } + + if r.Memory > 0 { + if uint64(r.Memory) <= usage { + return fmt.Errorf("rejecting memory limit %d <= usage %d", r.Memory, usage) + } + } + + return nil +} diff --git a/libcontainer/cgroups/fs2/memory.go b/libcontainer/cgroups/fs2/memory.go index adbc4b23086..e3b857dc1db 100644 --- a/libcontainer/cgroups/fs2/memory.go +++ b/libcontainer/cgroups/fs2/memory.go @@ -40,6 +40,11 @@ func setMemory(dirPath string, r *configs.Resources) error { if !isMemorySet(r) { return nil } + + if err := CheckMemoryUsage(dirPath, r); err != nil { + return err + } + swap, err := cgroups.ConvertMemorySwapToCgroupV2Value(r.MemorySwap, r.Memory) if err != nil { return err diff --git a/libcontainer/cgroups/systemd/v2.go b/libcontainer/cgroups/systemd/v2.go index 48e9750cb87..823bf38fc17 100644 --- a/libcontainer/cgroups/systemd/v2.go +++ b/libcontainer/cgroups/systemd/v2.go @@ -174,7 +174,14 @@ func unifiedResToSystemdProps(cm *dbusConnManager, res map[string]string) (props return props, nil } -func genV2ResourcesProperties(r *configs.Resources, cm *dbusConnManager) ([]systemdDbus.Property, error) { +func genV2ResourcesProperties(dirPath string, r *configs.Resources, cm *dbusConnManager) ([]systemdDbus.Property, error) { + // We need this check before setting systemd properties, otherwise + // the container is OOM-killed and the systemd unit is removed + // before we get to fsMgr.Set(). + if err := fs2.CheckMemoryUsage(dirPath, r); err != nil { + return nil, err + } + var properties []systemdDbus.Property // NOTE: This is of questionable correctness because we insert our own @@ -437,7 +444,7 @@ func (m *UnifiedManager) Set(r *configs.Resources) error { if r == nil { return nil } - properties, err := genV2ResourcesProperties(r, m.dbus) + properties, err := genV2ResourcesProperties(m.fsMgr.Path(""), r, m.dbus) if err != nil { return err } diff --git a/libcontainer/configs/cgroup_linux.go b/libcontainer/configs/cgroup_linux.go index 2d4a8987109..b5a1ebb9169 100644 --- a/libcontainer/configs/cgroup_linux.go +++ b/libcontainer/configs/cgroup_linux.go @@ -155,4 +155,9 @@ type Resources struct { // during Set() to figure out whether the freeze is required. Those // methods may be relatively slow, thus this flag. SkipFreezeOnSet bool `json:"-"` + + // MemoryCheckBeforeUpdate is a flag for cgroup v2 managers to check + // if the new memory limits (Memory and MemorySwap) being set are lower + // than the current memory usage, and reject if so. + MemoryCheckBeforeUpdate bool `json:"memory_check_before_update"` } diff --git a/libcontainer/specconv/spec_linux.go b/libcontainer/specconv/spec_linux.go index d62e34be713..4b32f286e44 100644 --- a/libcontainer/specconv/spec_linux.go +++ b/libcontainer/specconv/spec_linux.go @@ -723,6 +723,9 @@ func CreateCgroupConfig(opts *CreateOpts, defaultDevs []*devices.Device) (*confi if r.Memory.DisableOOMKiller != nil { c.Resources.OomKillDisable = *r.Memory.DisableOOMKiller } + if r.Memory.CheckBeforeUpdate != nil { + c.Resources.MemoryCheckBeforeUpdate = *r.Memory.CheckBeforeUpdate + } } if r.CPU != nil { if r.CPU.Shares != nil { diff --git a/tests/integration/update.bats b/tests/integration/update.bats index d2489009304..1407118c42b 100644 --- a/tests/integration/update.bats +++ b/tests/integration/update.bats @@ -726,3 +726,43 @@ EOF runc resume test_update [ "$status" -eq 0 ] } + +@test "update memory vs CheckBeforeUpdate" { + requires cgroups_v2 + [ $EUID -ne 0 ] && requires rootless_cgroup + + runc run -d --console-socket "$CONSOLE_SOCKET" test_update + [ "$status" -eq 0 ] + + # Setting memory to low value with checkBeforeUpdate=true should fail. + runc update -r - test_update <