Skip to content

Commit ee114af

Browse files
committed
libct/nsenter: namespace the bindfd shuffle
Processes can watch /proc/self/mounts or /proc/self/mountinfo, and the kernel will notify them whenever the namespace's mount table is modified. Once notified, the process still needs to read and parse the mountinfo to determine what changed. Many such processes, including udisksd and systemd < v248, make no attempt to rate-limit their mountinfo notifications. This tends not to be a problem on many systems, where mount tables are small and mounting and unmounting is uncommon.

Every runC exec which successfully uses the try_bindfd container-escape mitigation performs two mount()s and one umount() in the host's mount namespace, causing any mount-watching processes to wake up and parse the mountinfo file three times in a row. Consequently, using 'exec' health checks on containers has a larger-than-expected impact on system load when such mount-watching daemons are running. Furthermore, the size of the mount table in the host's mount namespace tends to be proportional to the number of OCI containers, as a unique mount is required for the rootfs of each container. Therefore, on systems with mount-watching processes, the system load increases *quadratically* with the number of running containers which use health checks!

Prevent runC from incidentally modifying the host's mount namespace for container-escape mitigations by setting up the mitigation in a temporary mount namespace.

Signed-off-by: Cory Snider <[email protected]>
1 parent cccb8fe commit ee114af

File tree

1 file changed

+165
-21
lines changed

1 file changed

+165
-21
lines changed

libcontainer/nsenter/cloned_binary.c

Lines changed: 165 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -49,14 +49,20 @@
4949
#include <fcntl.h>
5050
#include <errno.h>
5151

52+
#include <sched.h>
5253
#include <sys/types.h>
5354
#include <sys/stat.h>
5455
#include <sys/statfs.h>
5556
#include <sys/vfs.h>
5657
#include <sys/mman.h>
5758
#include <sys/mount.h>
5859
#include <sys/sendfile.h>
60+
#include <sys/socket.h>
5961
#include <sys/syscall.h>
62+
#include <sys/wait.h>
63+
64+
#include "ipc.h"
65+
#include "log.h"
6066

6167
/* Use our own wrapper for memfd_create. */
6268
#ifndef SYS_memfd_create
@@ -394,6 +400,123 @@ static int seal_execfd(int *fd, int fdtype)
394400
return -1;
395401
}
396402

403+
/*
 * Arguments handed to bindfd_in_subprocess() through clone()'s opaque
 * argument pointer.
 */
struct bindfd_child_args {
	/* Socket the child passes to send_fd() to hand back the resulting fd. */
	int sockfd;
	/* Path the child bind-mounts the binary onto, reopens, then detaches. */
	const char *mount_target;
};
407+
408+
static int bindfd_in_subprocess(void *arg)
409+
{
410+
/*
411+
* In the interests of efficiency (read: minimizing the syscall count)
412+
* and conciseness, no attempt is made to release resources which would
413+
* be cleaned up automatically on process exit, i.e. when this function
414+
* returns. This includes filesystem mounts, as this function is
415+
* executed in a dedicated mount namespace.
416+
*/
417+
418+
/*
419+
* For obvious reasons this won't work in rootless mode because we
420+
* haven't created a userns -- but getting that to work will be a bit
421+
* complicated and it's only worth doing if someone actually needs it.
422+
*/
423+
if (mount("none", "/", NULL, MS_SLAVE | MS_REC, NULL) < 0)
424+
return errno;
425+
/*
426+
* The kernel resolves the magic symlink /proc/self/exe to the real file
427+
* _in the original mount namespace_. Cross-namespace bind mounts are
428+
* not allowed, so we must locate the file inside the current mount
429+
* namespace to be able to bind-mount it. (The mount(8) command resolves
430+
* symlinks, which is why it appears to work at first glance.)
431+
*/
432+
char linkbuf[PATH_MAX + 1] = { 0 };
433+
ssize_t linkpathlen = readlink("/proc/self/exe", linkbuf, sizeof(linkbuf));
434+
if (linkpathlen < 0)
435+
return errno;
436+
if (linkpathlen == sizeof(linkbuf)) {
437+
/*
438+
* The link path is longer than PATH_MAX, and the contents of
439+
* linkbuf might have been truncated. A truncated path could
440+
* happen to be a valid path to a different file, which could
441+
* allow for local privilege escalation if we were to exec it.
442+
* The mount syscall doesn't accept paths longer than PATH_MAX,
443+
* anyway.
444+
*/
445+
return ENAMETOOLONG;
446+
}
447+
448+
int srcfd = open(linkbuf, O_PATH | O_CLOEXEC);
449+
if (srcfd < 0)
450+
return errno;
451+
/*
452+
* linkbuf holds the path to the binary which the parent process was
453+
* launched from. Someone could have moved a different file to that path
454+
* in the interim, in which case srcfd is not the file we want to
455+
* bind-mount. Guard against this situation by verifying srcfd is the
456+
* same file as /proc/self/exe.
457+
*/
458+
struct stat realexe = { 0 };
459+
if (stat("/proc/self/exe", &realexe) < 0)
460+
return errno;
461+
struct stat resolved = { 0 };
462+
if (fstat(srcfd, &resolved) < 0)
463+
return errno;
464+
if (resolved.st_dev != realexe.st_dev || resolved.st_ino != realexe.st_ino)
465+
return ENOENT;
466+
if (snprintf(linkbuf, sizeof(linkbuf), "/proc/self/fd/%d", srcfd) == sizeof(linkbuf))
467+
return ENAMETOOLONG;
468+
469+
const struct bindfd_child_args *args = arg;
470+
if (mount(linkbuf, args->mount_target, "", MS_BIND, "") < 0)
471+
return errno;
472+
if (mount("", args->mount_target, "", MS_REMOUNT | MS_BIND | MS_RDONLY, "") < 0)
473+
return errno;
474+
475+
int fd = open(args->mount_target, O_PATH | O_CLOEXEC);
476+
if (fd < 0)
477+
return errno;
478+
479+
/*
480+
* Make sure the MNT_DETACH works, otherwise we could get remounted
481+
* read-write and that would be quite bad.
482+
*/
483+
if (umount2(args->mount_target, MNT_DETACH) < 0)
484+
return errno;
485+
486+
if (send_fd(args->sockfd, fd) < 0)
487+
return errno;
488+
return 0;
489+
}
490+
491+
static int spawn_bindfd_child(const struct bindfd_child_args *args) __attribute__((noinline));
492+
static int spawn_bindfd_child(const struct bindfd_child_args *args)
493+
{
494+
/*
495+
* Carve out a chunk of our call stack for the child process to use as
496+
* we can be sure it is correctly mapped for use as stack. (Technically
497+
* only the libc clone() wrapper writes to this buffer. The child
498+
* process operates on a copy of the parent's virtual memory space and
499+
* so can safely overflow into the rest of the stack memory region
500+
* without consequence.)
501+
*/
502+
char stack[4 * 1024] __attribute__((aligned(16)));
503+
int tid = clone(bindfd_in_subprocess,
504+
/*
505+
* Assume stack grows down, as HP-PA, the only Linux
506+
* platform where stack grows up, is obsolete.
507+
*/
508+
stack + sizeof(stack),
509+
/*
510+
* Suspend the parent process until the child has exited to
511+
* save an unnecessary context switch as we'd just be
512+
* waiting for the child process to exit anyway.
513+
*/
514+
CLONE_NEWNS | CLONE_VFORK, (void *)args);
515+
if (tid < 0)
516+
return -errno;
517+
return tid;
518+
}
519+
397520
static int try_bindfd(void)
398521
{
399522
int fd, ret = -1;
@@ -415,32 +538,53 @@ static int try_bindfd(void)
415538
close(fd);
416539

417540
/*
418-
* For obvious reasons this won't work in rootless mode because we haven't
419-
* created a userns+mntns -- but getting that to work will be a bit
420-
* complicated and it's only worth doing if someone actually needs it.
541+
* Daemons such as systemd and udisks2 watch /proc/self/mountinfo and
542+
* re-parse it on every change, which gets expensive when the mount table
543+
* is large and/or changes frequently. Perform the mount operations in a
544+
* new, private mount namespace so as not to wake up those processes
545+
* every time we nsexec into a container. We clone a child process into
546+
* a new mount namespace to do the dirty work so the side effects of
547+
* unsharing the mount namespace do not leak into the current process.
421548
*/
422-
ret = -EPERM;
423-
if (mount("/proc/self/exe", template, "", MS_BIND, "") < 0)
424-
goto out;
425-
if (mount("", template, "", MS_REMOUNT | MS_BIND | MS_RDONLY, "") < 0)
426-
goto out_umount;
549+
int sock[2];
550+
if (socketpair(AF_LOCAL, SOCK_STREAM, 0, sock) < 0) {
551+
ret = -errno;
552+
goto cleanup_unlink;
553+
}
427554

428-
/* Get read-only handle that we're sure can't be made read-write. */
429-
ret = open(template, O_PATH | O_CLOEXEC);
555+
struct bindfd_child_args args = {
556+
.sockfd = sock[0],
557+
.mount_target = template,
558+
};
559+
int cpid = spawn_bindfd_child(&args);
560+
close(sock[0]);
561+
if (cpid < 0) {
562+
ret = cpid;
563+
goto cleanup_socketpair;
564+
}
430565

431-
out_umount:
432-
/*
433-
* Make sure the MNT_DETACH works, otherwise we could get remounted
434-
* read-write and that would be quite bad (the fd would be made read-write
435-
* too, invalidating the protection).
436-
*/
437-
if (umount2(template, MNT_DETACH) < 0) {
438-
if (ret >= 0)
439-
close(ret);
440-
ret = -ENOTRECOVERABLE;
566+
int wstatus = 0;
567+
if (waitpid(cpid, &wstatus, __WCLONE) < 0)
568+
bail("error waiting for bindfd child process to exit");
569+
if (WIFEXITED(wstatus)) {
570+
if (WEXITSTATUS(wstatus)) {
571+
ret = -WEXITSTATUS(wstatus);
572+
goto cleanup_socketpair;
573+
}
574+
} else if (WIFSIGNALED(wstatus)) {
575+
int sig = WTERMSIG(wstatus);
576+
bail("bindfd child process terminated by signal %d (%s)", sig, strsignal(sig));
577+
} else {
578+
/* Should never happen... */
579+
bail("unexpected waitpid() status for bindfd child process: 0x%x", wstatus);
441580
}
442581

443-
out:
582+
ret = receive_fd(sock[1]);
583+
584+
cleanup_socketpair:
585+
close(sock[1]);
586+
587+
cleanup_unlink:
444588
/*
445589
* We don't care about unlink errors, the worst that happens is that
446590
* there's an empty file left around in STATEDIR.

0 commit comments

Comments
 (0)