-
Notifications
You must be signed in to change notification settings - Fork 2.2k
Support Id mapped mounts for shared volumes #3429
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -5,10 +5,13 @@ import ( | |
| "fmt" | ||
| "io" | ||
| "os" | ||
| "os/exec" | ||
| "path" | ||
| "path/filepath" | ||
| "strconv" | ||
| "strings" | ||
| "sync" | ||
| "syscall" | ||
| "time" | ||
|
|
||
| securejoin "github.com/cyphar/filepath-securejoin" | ||
|
|
@@ -18,6 +21,7 @@ import ( | |
| "github.com/opencontainers/runc/libcontainer/cgroups/fs2" | ||
| "github.com/opencontainers/runc/libcontainer/configs" | ||
| "github.com/opencontainers/runc/libcontainer/devices" | ||
| "github.com/opencontainers/runc/libcontainer/user" | ||
| "github.com/opencontainers/runc/libcontainer/userns" | ||
| "github.com/opencontainers/runc/libcontainer/utils" | ||
| "github.com/opencontainers/runtime-spec/specs-go" | ||
|
|
@@ -328,6 +332,53 @@ func mountCgroupV2(m *configs.Mount, c *mountConfig) error { | |
| }) | ||
| } | ||
|
|
||
| var ( | ||
| syscallOnce sync.Once | ||
| syscallErr error | ||
| ) | ||
|
|
||
| func mountByMove(m *configs.Mount, dest string) error { | ||
| // Check once for all mount point that syscall exists | ||
| syscallOnce.Do(func() { | ||
| _, err := unix.OpenTree(-int(unix.EBADF), "", uint(unix.OPEN_TREE_CLONE|unix.OPEN_TREE_CLOEXEC|unix.AT_EMPTY_PATH|unix.AT_RECURSIVE)) | ||
| if errors.Is(err, unix.ENOSYS) { | ||
| logrus.Debugf("no open_tree syscall on the system") | ||
| syscallErr = err | ||
| } | ||
|
|
||
| err = unix.MoveMount(-int(unix.EBADF), "", unix.AT_FDCWD, "", unix.MOVE_MOUNT_F_EMPTY_PATH) | ||
| if errors.Is(err, unix.ENOSYS) { | ||
| logrus.Debugf("no move_mount syscall on the system") | ||
| syscallErr = err | ||
| } | ||
|
|
||
| err = unix.MountSetattr(-int(unix.EBADF), "", unix.AT_EMPTY_PATH|unix.AT_RECURSIVE, nil) | ||
| if errors.Is(err, unix.ENOSYS) { | ||
| logrus.Debugf("no mount_setattr syscall on the system") | ||
| syscallErr = err | ||
| } | ||
| }) | ||
| if syscallErr != nil { | ||
| return syscallErr | ||
| } | ||
| // If syscalls exist we may fail somewhere else | ||
| treeFd, err := unix.OpenTree(-int(unix.EBADF), m.Source, uint(unix.OPEN_TREE_CLONE|unix.OPEN_TREE_CLOEXEC|unix.AT_EMPTY_PATH|unix.AT_RECURSIVE)) | ||
| if err != nil || treeFd < 0 { | ||
| return err | ||
| } | ||
|
|
||
| err = unix.MoveMount(treeFd, "", unix.AT_FDCWD, dest, unix.MOVE_MOUNT_F_EMPTY_PATH) | ||
| if err != nil { | ||
| return err | ||
| } | ||
|
|
||
| if err := mountIDMapMapped(m, treeFd); err != nil { | ||
| return err | ||
| } | ||
| logrus.Debugf("move_mount succeed") | ||
| return nil | ||
| } | ||
|
|
||
| func doTmpfsCopyUp(m *configs.Mount, rootfs, mountLabel string) (Err error) { | ||
| // Set up a scratch dir for the tmpfs on the host. | ||
| tmpdir, err := prepareTmp("/tmp") | ||
|
|
@@ -437,9 +488,13 @@ func mountToRootfs(m *configs.Mount, c *mountConfig) error { | |
| if err := prepareBindMount(m, rootfs, mountFd); err != nil { | ||
| return err | ||
| } | ||
| if err := mountPropagate(m, rootfs, mountLabel, mountFd); err != nil { | ||
| return err | ||
|
|
||
| if mountByMove(m, dest) != nil { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm testing this, and it seems this can't work. At this point we are inside the userns (I've added some Did you made this runc code work for you at some point? Or was this a c&p from working code in a standalone binary (that you tried from the host, so the initial userns) that never was tested in the runc binary (that IIUC this part runs inside the userns). Am I missing something? Or do we need a big rework to make this work? |
||
| if err := mountPropagate(m, rootfs, mountLabel, mountFd); err != nil { | ||
| return err | ||
|
Comment on lines
+492
to
+494
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This means, if we fail to mount with idmap mounts, it just does a bind-mount? I'm not sure what is the right behavior, because just mounting and ignoring the mapping for that mount will probably create issues to access the files (if the container running inside a userns). I'm not sure what the right behavior should be. What do you think? And other maintainers?
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm leaning towards in case of errors just failing the mount, rather than doing a bind-mount and ignoring the mapping. It will definitely cause access issues, or even worse: files might get written with completely different UIDs/GIDs and it will be painful to fix once that happens. @AlexeyPerevalov What do you think? @AkihiroSuda @kolyshkin What do you think?
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thinking a little bit how higher layers will use this, yes, I think we need to error out. At the k8s layer we want to start the container with an id mapped mount or not start it at all. If it is started without the id mapped mounts, the files created will have just random garbage as UID/GID, won't be able to read files... it will be a complete mess. I think if we just return an error if an id mapped mount was requested and we failed to create it is the right path here. Also, even if in the future we want to open this door (doubt about it), we can always open a door. But we won't be able to close it if we open it now. @AlexeyPerevalov Can we make this error out if the id map mount fails? |
||
| } | ||
| } | ||
|
|
||
| // bind mount won't change mount options, we need remount to make mount options effective. | ||
| // first check that we have non-default options required before attempting a remount | ||
| if m.Flags&^(unix.MS_REC|unix.MS_REMOUNT|unix.MS_BIND) != 0 { | ||
|
|
@@ -472,7 +527,7 @@ func mountToRootfs(m *configs.Mount, c *mountConfig) error { | |
| } | ||
| return mountPropagate(m, rootfs, mountLabel, mountFd) | ||
| } | ||
| if err := setRecAttr(m, rootfs); err != nil { | ||
| if err := setMountAttr(m, rootfs); err != nil { | ||
| return err | ||
| } | ||
| return nil | ||
|
|
@@ -1044,7 +1099,7 @@ func remount(m *configs.Mount, rootfs string, mountFd *int) error { | |
|
|
||
| return utils.WithProcfd(rootfs, m.Destination, func(procfd string) error { | ||
| flags := uintptr(m.Flags | unix.MS_REMOUNT) | ||
| err := mount(source, m.Destination, procfd, m.Device, flags, "") | ||
| err := mount(source, m.Destination, procfd, m.Device, flags, m.Data) | ||
| if err == nil { | ||
| return nil | ||
| } | ||
|
|
@@ -1058,7 +1113,8 @@ func remount(m *configs.Mount, rootfs string, mountFd *int) error { | |
| } | ||
| // ... and retry the mount with ro flag set. | ||
| flags |= unix.MS_RDONLY | ||
| return mount(source, m.Destination, procfd, m.Device, flags, "") | ||
|
|
||
| return mount(source, m.Destination, procfd, m.Device, flags, m.Data) | ||
| }) | ||
| } | ||
|
|
||
|
|
@@ -1107,11 +1163,87 @@ func mountPropagate(m *configs.Mount, rootfs string, mountLabel string, mountFd | |
| return nil | ||
| } | ||
|
|
||
| func setRecAttr(m *configs.Mount, rootfs string) error { | ||
| if m.RecAttr == nil { | ||
| func toLinuxIDMap(mappings []user.IDMap) (configMappings []syscall.SysProcIDMap) { | ||
| createIDMap := func(m user.IDMap) syscall.SysProcIDMap { | ||
| return syscall.SysProcIDMap{ | ||
| ContainerID: int(m.ID), | ||
| HostID: int(m.ParentID), | ||
| Size: int(m.Count), | ||
| } | ||
| } | ||
| for _, m := range mappings { | ||
| configMappings = append(configMappings, createIDMap(m)) | ||
| } | ||
| return | ||
| } | ||
|
|
||
| func mountIDMapMapped(m *configs.Mount, fsmFd int) (err error) { | ||
| if m.Device != "bind" || utils.CleanPath(m.Destination) == "/sys" || len(m.UIDMaps) == 0 || len(m.GIDMaps) == 0 { | ||
| return nil | ||
| } | ||
| return utils.WithProcfd(rootfs, m.Destination, func(procfd string) error { | ||
| return unix.MountSetattr(-1, procfd, unix.AT_RECURSIVE, m.RecAttr) | ||
| }) | ||
| const userNsHelperBinary = "/bin/true" | ||
| var attr unix.MountAttr | ||
|
|
||
| attr.Attr_set = unix.MOUNT_ATTR_IDMAP | ||
| attr.Attr_clr = 0 | ||
| attr.Propagation = 0 | ||
|
|
||
| // TODO: this was given from containerd/containerd | ||
| // https://github.com/containerd/containerd/pull/5890 | ||
| // this code requese generalization | ||
| // TODO: Avoid dependency on /bin/true or do in a completely different way | ||
| // Currently there is no way to pass idmapping directly to mount_setattr, | ||
| // this is not very convenient from the container runtime point of view. | ||
| // The id remapping procedure should be done in containerd, due to we have | ||
| // old approach that use recursive chown. | ||
| // Maybe it is necessary to think about moving of container rootfs ownership | ||
| // adjustment to runc due to runc has information about container user namespace. | ||
| // But personally I think that it would be better to add possibility to call | ||
| // mount_setattr with explicit id mappings and leave container runtime components | ||
| // responsibilities unchanged. | ||
| cmd := exec.Command(userNsHelperBinary) | ||
| cmd.SysProcAttr = &syscall.SysProcAttr{ | ||
| Cloneflags: syscall.CLONE_NEWUSER, | ||
| } | ||
|
|
||
| cmd.SysProcAttr.UidMappings = toLinuxIDMap(m.UIDMaps) | ||
| cmd.SysProcAttr.GidMappings = toLinuxIDMap(m.GIDMaps) | ||
|
|
||
| if err = cmd.Start(); err != nil { | ||
| return fmt.Errorf("Failed to run the %s helper binary, %w", userNsHelperBinary, err) | ||
| } | ||
|
|
||
| defer func() { | ||
| if waitErr := cmd.Wait(); waitErr != nil { | ||
| err = fmt.Errorf("Failed to run the %s helper binary, %w", userNsHelperBinary, waitErr) | ||
| } | ||
| }() | ||
|
|
||
| path := fmt.Sprintf("/proc/%d/ns/user", cmd.Process.Pid) | ||
| var userNsFile *os.File | ||
| if userNsFile, err = os.Open(path); err != nil { | ||
| return fmt.Errorf("Unable to get user ns file descriptor for - %s, %w", path, err) | ||
| } | ||
| defer userNsFile.Close() | ||
|
|
||
| attr.Userns_fd = uint64(userNsFile.Fd()) | ||
|
|
||
| err = unix.MountSetattr(fsmFd, "", unix.AT_EMPTY_PATH|unix.AT_RECURSIVE, &attr) | ||
| if err != nil { | ||
| return fmt.Errorf("MountSetAttr has failed idmapping - %s, %w", m.Destination, err) | ||
| } | ||
| return nil | ||
| } | ||
|
|
||
| func setMountAttr(m *configs.Mount, rootfs string) error { | ||
| if m.RecAttr != nil { | ||
| err := utils.WithProcfd(rootfs, m.Destination, func(procfd string) error { | ||
| return unix.MountSetattr(-1, procfd, unix.AT_RECURSIVE, m.RecAttr) | ||
| }) | ||
| if err != nil { | ||
| return fmt.Errorf("MountSetattr failed, AT_RECURSIVE, %w", err) | ||
| } | ||
| } | ||
|
|
||
| return nil | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could this be exported as an interface so other parts may query the syscall availability status? We face the same issue too and seems it could be reused.