Skip to content

Commit a9f95b3

Browse files
chown cgroup to process uid in container namespace
Delegating cgroups to the container enables more complex workloads, including systemd-based workloads. The OCI runtime-spec was recently updated to explicitly admit such delegation, through specification of cgroup ownership semantics: opencontainers/runtime-spec#1123 Pursuant to the updated OCI runtime-spec, change the ownership of the container's cgroup directory and particular files therein, when using cgroups v2 and when the cgroupfs is to be mounted read/write. As a result of this change, systemd workloads can run in isolated user namespaces on OpenShift when the sandbox's cgroupfs is mounted read/write. It might be possible to implement this feature in other cgroup managers, but that work is deferred. Signed-off-by: Fraser Tweedale <[email protected]>
1 parent 20feb5d commit a9f95b3

File tree

4 files changed

+147
-0
lines changed

4 files changed

+147
-0
lines changed

libcontainer/cgroups/systemd/v2.go

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
package systemd
22

33
import (
4+
"bufio"
45
"fmt"
56
"math"
7+
"os"
68
"path/filepath"
79
"strconv"
810
"strings"
@@ -288,6 +290,36 @@ func (m *unifiedManager) Apply(pid int) error {
288290
if err := fs2.CreateCgroupPath(m.path, m.cgroups); err != nil {
289291
return err
290292
}
293+
294+
if c.OwnerUID != nil {
295+
// The kernel exposes a list of files that should be chowned to
296+
// the delegate uid in /sys/kernel/cgroup/delegate. If the
297+
// file is not present (Linux < 4.15), use the initial
298+
// values mentioned in cgroups(7).
299+
filesToChown := []string{"."} // the directory itself must be chowned
300+
cgroupDelegateFile := "/sys/kernel/cgroup/delegate"
301+
f, err := os.Open(cgroupDelegateFile)
302+
if err == nil {
303+
defer f.Close()
304+
scanner := bufio.NewScanner(f)
305+
for scanner.Scan() {
306+
filesToChown = append(filesToChown, scanner.Text())
307+
}
308+
if err := scanner.Err(); err != nil {
309+
return fmt.Errorf("error reading %s: %w", cgroupDelegateFile, err)
310+
}
311+
} else {
312+
filesToChown = append(filesToChown, "cgroup.procs", "cgroup.subtree_control", "cgroup.threads")
313+
}
314+
315+
for _, v := range filesToChown {
316+
err := os.Chown(m.path+"/"+v, *c.OwnerUID, -1)
317+
if err != nil {
318+
return err
319+
}
320+
}
321+
}
322+
291323
return nil
292324
}
293325

libcontainer/configs/cgroup_linux.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,13 @@ type Cgroup struct {
4141

4242
// Rootless tells if rootless cgroups should be used.
4343
Rootless bool
44+
45+
// The host UID that should own the cgroup, or nil to accept
46+
// the default ownership. This should only be set when the
47+
// cgroupfs is to be mounted read/write.
48+
// Not all cgroup manager implementations support changing
49+
// the ownership.
50+
OwnerUID *int `json:"owner_uid,omitempty"`
4451
}
4552

4653
type Resources struct {

libcontainer/specconv/spec_linux.go

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -366,6 +366,49 @@ func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) {
366366
}
367367
}
368368
}
369+
370+
// Set the host UID that should own the container's cgroup.
371+
// This must be performed after setupUserNamespace, so that
372+
// config.HostRootUID() returns the correct result.
373+
//
374+
// Only set it if the container will have its own cgroup
375+
// namespace and the cgroupfs will be mounted read/write.
376+
//
377+
hasCgroupNS := config.Namespaces.Contains(configs.NEWCGROUP) && config.Namespaces.PathOf(configs.NEWCGROUP) == ""
378+
hasRwCgroupfs := false
379+
if hasCgroupNS {
380+
for _, m := range config.Mounts {
381+
if m.Source == "cgroup" && m.Destination == "/sys/fs/cgroup" && (m.Flags&unix.MS_RDONLY) == 0 {
382+
hasRwCgroupfs = true
383+
break
384+
}
385+
}
386+
}
387+
processUid := 0
388+
if spec.Process != nil {
389+
// Chown the cgroup to the UID running the process,
390+
// which is not necessarily UID 0 in the container
391+
// namespace (e.g., an unprivileged UID in the host
392+
// user namespace).
393+
processUid = int(spec.Process.User.UID)
394+
}
395+
if hasCgroupNS && hasRwCgroupfs {
396+
ownerUid, err := config.HostUID(processUid)
397+
// There are two error cases; we can ignore both.
398+
//
399+
// 1. uidMappings is unset. Either there is no user
400+
// namespace (fine), or it is an error (which is
401+
// checked elsewhere).
402+
//
403+
// 2. The user is unmapped in the user namespace. This is an
404+
// unusual configuration and might be an error. But it too
405+
// will be checked elsewhere, so we can ignore it here.
406+
//
407+
if err == nil {
408+
config.Cgroups.OwnerUID = &ownerUid
409+
}
410+
}
411+
369412
if spec.Process != nil {
370413
config.OomScoreAdj = spec.Process.OOMScoreAdj
371414
config.NoNewPrivileges = spec.Process.NoNewPrivileges
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
#!/usr/bin/env bats
2+
3+
load helpers
4+
5+
function teardown() {
6+
teardown_bundle
7+
}
8+
9+
function setup() {
10+
setup_busybox
11+
12+
set_cgroups_path
13+
14+
# configure a user namespace
15+
update_config ' .linux.namespaces += [{"type": "user"}]
16+
| .linux.uidMappings += [{"hostID": 100000, "containerID": 0, "size": 65536}]
17+
| .linux.gidMappings += [{"hostID": 100000, "containerID": 0, "size": 65536}]
18+
'
19+
20+
# chown test temp dir to allow host user to read it
21+
chown 100000 "$ROOT"
22+
23+
# chown rootfs to allow host user to mkdir mount points
24+
chown 100000 "$ROOT"/bundle/rootfs
25+
}
26+
27+
@test "runc exec (cgroup v2, ro cgroupfs, new cgroupns) does not chown cgroup" {
28+
requires root cgroups_v2 systemd
29+
30+
runc run -d --console-socket "$CONSOLE_SOCKET" test_cgroup_chown
31+
[ "$status" -eq 0 ]
32+
33+
runc exec test_cgroup_chown sh -c "stat -c %U /sys/fs/cgroup"
34+
[ "$status" -eq 0 ]
35+
[ "$output" = "nobody" ] # /sys/fs/cgroup owned by unmapped user
36+
}
37+
38+
@test "runc exec (cgroup v2, rw cgroupfs, inh cgroupns) does not chown cgroup" {
39+
requires root cgroups_v2 systemd
40+
41+
set_cgroup_mount_writable
42+
43+
# inherit cgroup namespace (remove cgroup from namespaces list)
44+
update_config '.linux.namespaces |= map(select(.type != "cgroup"))'
45+
46+
runc run -d --console-socket "$CONSOLE_SOCKET" test_cgroup_chown
47+
[ "$status" -eq 0 ]
48+
49+
runc exec test_cgroup_chown sh -c "stat -c %U /sys/fs/cgroup"
50+
[ "$status" -eq 0 ]
51+
[ "$output" = "nobody" ] # /sys/fs/cgroup owned by unmapped user
52+
}
53+
54+
@test "runc exec (cgroup v2, rw cgroupfs, new cgroupns) does chown cgroup" {
55+
requires root cgroups_v2 systemd
56+
57+
set_cgroup_mount_writable
58+
59+
runc run -d --console-socket "$CONSOLE_SOCKET" test_cgroup_chown
60+
[ "$status" -eq 0 ]
61+
62+
runc exec test_cgroup_chown sh -c "stat -c %U /sys/fs/cgroup"
63+
[ "$status" -eq 0 ]
64+
[ "$output" = "root" ] # /sys/fs/cgroup owned by root (of user namespace)
65+
}

0 commit comments

Comments
 (0)