Skip to content

Commit 703a133

Browse files
committed
libct/nsenter: namespace the bindfd shuffle
Processes can watch /proc/self/mounts or /proc/self/mountinfo, and the kernel will notify them whenever the namespace's mount table is modified. The notified process still needs to read and parse the mountinfo to determine what changed. Many such processes, including udisksd and systemd < v248, make no attempt to rate-limit their handling of mountinfo notifications. This tends to not be a problem on many systems, where mount tables are small and mounting and unmounting is uncommon. Every runC exec which successfully uses the try_bindfd container-escape mitigation performs two mount()s and one umount() in the host's mount namespace, causing any mount-watching processes to wake up and parse the mountinfo file three times in a row. Consequently, using 'exec' health checks on containers has a larger-than-expected impact on system load when such mount-watching daemons are running. Furthermore, the size of the mount table in the host's mount namespace tends to be proportional to the number of OCI containers as a unique mount is required for the rootfs of each container. Therefore, on systems with mount-watching processes, the system load increases *quadratically* with the number of running containers which use health checks! Prevent runC from incidentally modifying the host's mount namespace for container-escape mitigations by setting up the mitigation in a private mount namespace. Signed-off-by: Cory Snider <[email protected]>
1 parent 1591202 commit 703a133

File tree

4 files changed

+151
-37
lines changed

4 files changed

+151
-37
lines changed

libcontainer/nsenter/cloned_binary.c

Lines changed: 139 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -49,14 +49,20 @@
4949
#include <fcntl.h>
5050
#include <errno.h>
5151

52+
#include <sched.h>
5253
#include <sys/types.h>
5354
#include <sys/stat.h>
5455
#include <sys/statfs.h>
5556
#include <sys/vfs.h>
5657
#include <sys/mman.h>
5758
#include <sys/mount.h>
5859
#include <sys/sendfile.h>
60+
#include <sys/socket.h>
5961
#include <sys/syscall.h>
62+
#include <sys/wait.h>
63+
64+
#include "ipc.h"
65+
#include "log.h"
6066

6167
/* Use our own wrapper for memfd_create. */
6268
#ifndef SYS_memfd_create
@@ -394,6 +400,98 @@ static int seal_execfd(int *fd, int fdtype)
394400
return -1;
395401
}
396402

403+
/*
 * Arguments handed to bindfd_in_subprocess() through the opaque pointer
 * that clone() forwards to the child's entry point.
 */
struct bindfd_child_args {
	int sockfd;               /* socket the child sends the opened fd over */
	const char *mount_target; /* path the executable is bind-mounted onto */
};
407+
408+
static int bindfd_in_subprocess(void *arg)
409+
{
410+
/*
411+
* In the interests of efficiency (read: minimizing the syscall count)
412+
* and conciseness, no attempt is made to release resources which would
413+
* be cleaned up automatically on process exit, i.e. when this function
414+
* returns. This includes filesystem mounts, as this function is
415+
* executed in a dedicated mount namespace.
416+
*/
417+
418+
/*
419+
* For obvious reasons this won't work in rootless mode because we
420+
* haven't created a userns -- but getting that to work will be a bit
421+
* complicated and it's only worth doing if someone actually needs it.
422+
*/
423+
if (mount("none", "/", NULL, MS_PRIVATE | MS_REC, NULL) < 0)
424+
return errno;
425+
/*
426+
* The kernel refuses to bind-mount from the magic symlink when the
427+
* process has been cloned (or unshared) into a new mount namespace,
428+
* at least on Linux 4.4.
429+
*/
430+
char linkbuf[PATH_MAX + 1] = { 0 };
431+
ssize_t linkpathlen = readlink("/proc/self/exe", linkbuf, sizeof(linkbuf));
432+
if (linkpathlen < 0)
433+
return errno;
434+
if (linkpathlen == sizeof(linkbuf)) {
435+
/*
436+
* The link path is longer than PATH_MAX, and the contents of
437+
* linkbuf might have been truncated. A truncated path could
438+
* happen to be a valid path to a different file, which could
439+
* allow for local privilege escalation if we were to exec it.
440+
* The mount syscall doesn't accept paths longer than PATH_MAX,
441+
* anyway.
442+
*/
443+
return ENAMETOOLONG;
444+
}
445+
446+
const struct bindfd_child_args *args = arg;
447+
if (mount(linkbuf, args->mount_target, "", MS_BIND, "") < 0)
448+
return errno;
449+
if (mount("", args->mount_target, "", MS_REMOUNT | MS_BIND | MS_RDONLY, "") < 0)
450+
return errno;
451+
452+
/* Get read-only handle that we're sure can't be made read-write. */
453+
int fd = open(args->mount_target, O_PATH | O_CLOEXEC);
454+
if (fd < 0)
455+
return errno;
456+
457+
/*
458+
* Make sure the MNT_DETACH works, otherwise we could get remounted
459+
* read-write and that would be quite bad (the fd would be made
460+
* read-write too, invalidating the protection).
461+
*/
462+
if (umount2(args->mount_target, MNT_DETACH) < 0)
463+
return errno;
464+
465+
if (send_fd(args->sockfd, fd) < 0)
466+
return errno;
467+
return 0;
468+
}
469+
470+
static int spawn_bindfd_child(const struct bindfd_child_args *args) __attribute__((noinline));
471+
static int spawn_bindfd_child(const struct bindfd_child_args *args)
472+
{
473+
/*
474+
* Carve out a chunk of our call stack for the child process to use as
475+
* we can be sure it is correctly mapped for use as stack. (Technically
476+
* only the libc clone() wrapper writes to it; the child process
477+
* operates on a copy of the parent's virtual memory space unless we use
478+
* the CLONE_VM flag.)
479+
*/
480+
char stack[128 * 1024] __attribute__((aligned(16)));
481+
return clone(bindfd_in_subprocess,
482+
/*
483+
* Assume stack grows down, as HP-PA, the only Linux
484+
* platform where stack grows up, is obsolete.
485+
*/
486+
stack + sizeof(stack),
487+
/*
488+
* Suspend the parent process until the child has exited to
489+
* save an unnecessary context switch as we'd just be
490+
* waiting for the child process to exit anyway.
491+
*/
492+
CLONE_NEWNS | CLONE_VFORK, (void *)args);
493+
}
494+
397495
static int try_bindfd(void)
398496
{
399497
int fd, ret = -1;
@@ -415,32 +513,51 @@ static int try_bindfd(void)
415513
close(fd);
416514

417515
/*
418-
* For obvious reasons this won't work in rootless mode because we haven't
419-
* created a userns+mntns -- but getting that to work will be a bit
420-
* complicated and it's only worth doing if someone actually needs it.
516+
* Daemons such as systemd and udisks2 watch /proc/self/mountinfo and
517+
* re-parse it on every change, which gets expensive when the mount table
518+
* is large and/or changes frequently. Set up the bind-mounts in a new,
519+
* private mount namespace so as not to wake up those processes every
520+
* time we nsexec into a container. We clone a child process into a new
521+
* mount namespace to do the dirty work so the side effects of unsharing
522+
* the mount namespace do not leak into the current process.
421523
*/
422-
ret = -EPERM;
423-
if (mount("/proc/self/exe", template, "", MS_BIND, "") < 0)
424-
goto out;
425-
if (mount("", template, "", MS_REMOUNT | MS_BIND | MS_RDONLY, "") < 0)
426-
goto out_umount;
427-
428-
/* Get read-only handle that we're sure can't be made read-write. */
429-
ret = open(template, O_PATH | O_CLOEXEC);
524+
int sock[2];
525+
if (socketpair(AF_LOCAL, SOCK_STREAM, 0, sock) < 0)
526+
goto cleanup_unlink;
527+
528+
struct bindfd_child_args args = {
529+
.sockfd = sock[0],
530+
.mount_target = template,
531+
};
532+
int cpid = spawn_bindfd_child(&args);
533+
close(sock[0]);
534+
if (cpid < 0)
535+
goto cleanup_socketpair;
536+
537+
int wstatus = 0;
538+
if (waitpid(cpid, &wstatus, __WCLONE) < 0)
539+
bail("error waiting for bindfd child process to exit");
540+
if (WIFEXITED(wstatus)) {
541+
if ((errno = WEXITSTATUS(wstatus)))
542+
goto cleanup_socketpair;
543+
} else if (WIFSIGNALED(wstatus)) {
544+
int sig = WTERMSIG(wstatus);
545+
bail("bindfd child process terminated by signal %d (%s)", sig, strsignal(sig));
546+
} else {
547+
/* Should never happen... */
548+
bail("unexpected waitpid() status for bindfd child process: 0x%x", wstatus);
549+
}
430550

431-
out_umount:
432-
/*
433-
* Make sure the MNT_DETACH works, otherwise we could get remounted
434-
* read-write and that would be quite bad (the fd would be made read-write
435-
* too, invalidating the protection).
436-
*/
437-
if (umount2(template, MNT_DETACH) < 0) {
438-
if (ret >= 0)
439-
close(ret);
440-
ret = -ENOTRECOVERABLE;
551+
ret = receive_fd(sock[1]);
552+
if (fcntl(ret, F_SETFD, O_PATH | O_CLOEXEC) < 0) {
553+
ret = -1;
554+
goto cleanup_socketpair;
441555
}
442556

443-
out:
557+
cleanup_socketpair:
558+
close(sock[1]);
559+
560+
cleanup_unlink:
444561
/*
445562
* We don't care about unlink errors, the worst that happens is that
446563
* there's an empty file left around in STATEDIR.

libcontainer/nsenter/ipc.c

Lines changed: 4 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
#define _GNU_SOURCE
2+
#include <alloca.h>
23
#include <stdlib.h>
34
#include <string.h>
45
#include <sys/socket.h>
@@ -54,9 +55,8 @@ int receive_fd(int sockfd)
5455
return ret;
5556
}
5657

57-
void send_fd(int sockfd, int fd)
58+
int send_fd(int sockfd, int fd)
5859
{
59-
int bytes_written;
6060
struct msghdr msg = { };
6161
struct cmsghdr *cmsg;
6262
struct iovec iov[1] = { };
@@ -71,11 +71,7 @@ void send_fd(int sockfd, int fd)
7171
/* We send only one fd as specified by cmsg->cmsg_len below, even
7272
* though msg.msg_controllen might have more space due to alignment. */
7373
msg.msg_controllen = CMSG_SPACE(sizeof(int));
74-
msg.msg_control = malloc(msg.msg_controllen);
75-
if (msg.msg_control == NULL) {
76-
bail("Can't allocate memory to send fd.");
77-
}
78-
74+
msg.msg_control = alloca(msg.msg_controllen);
7975
memset(msg.msg_control, 0, msg.msg_controllen);
8076

8177
cmsg = CMSG_FIRSTHDR(&msg);
@@ -84,10 +80,5 @@ void send_fd(int sockfd, int fd)
8480
cmsg->cmsg_len = CMSG_LEN(sizeof(int));
8581
memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
8682

87-
bytes_written = sendmsg(sockfd, &msg, 0);
88-
89-
free(msg.msg_control);
90-
91-
if (bytes_written != 1)
92-
bail("failed to send fd %d via unix socket %d", fd, sockfd);
83+
return sendmsg(sockfd, &msg, 0);
9384
}

libcontainer/nsenter/ipc.h

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,11 @@
22
#define NSENTER_IPC_H
33

44
int receive_fd(int sockfd);
5-
void send_fd(int sockfd, int fd);
5+
6+
/*
7+
* send_fd passes the open file descriptor fd to another process via the UNIX
8+
* domain socket sockfd. The return value of the sendmsg(2) call is returned.
9+
*/
10+
int send_fd(int sockfd, int fd);
611

712
#endif /* NSENTER_IPC_H */

libcontainer/nsenter/nsexec.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -600,7 +600,8 @@ void send_mountsources(int sockfd, pid_t child, char *mountsources, size_t mount
600600
if (fd < 0)
601601
bail("failed to open mount source %s", mountsources);
602602

603-
send_fd(sockfd, fd);
603+
if (send_fd(sockfd, fd) < 0)
604+
bail("failed to send fd %d via unix socket %d", fd, sockfd);
604605

605606
ret = close(fd);
606607
if (ret != 0)

0 commit comments

Comments
 (0)