4949#include <fcntl.h>
5050#include <errno.h>
5151
52+ #include <sched.h>
5253#include <sys/types.h>
5354#include <sys/stat.h>
5455#include <sys/statfs.h>
5556#include <sys/vfs.h>
5657#include <sys/mman.h>
5758#include <sys/mount.h>
5859#include <sys/sendfile.h>
60+ #include <sys/socket.h>
5961#include <sys/syscall.h>
62+ #include <sys/wait.h>
63+
64+ #include "ipc.h"
65+ #include "log.h"
6066
6167/* Use our own wrapper for memfd_create. */
6268#ifndef SYS_memfd_create
@@ -394,6 +400,123 @@ static int seal_execfd(int *fd, int fdtype)
394400 return -1 ;
395401}
396402
/*
 * Arguments handed to the bindfd helper process through clone()'s opaque
 * argument pointer.
 */
struct bindfd_child_args {
	/* Socket over which the helper sends the resulting fd to its parent. */
	int sockfd;
	/* Path the helper bind-mounts the executable onto. */
	const char *mount_target;
};
407+
408+ static int bindfd_in_subprocess (void * arg )
409+ {
410+ /*
411+ * In the interests of efficiency (read: minimizing the syscall count)
412+ * and conciseness, no attempt is made to release resources which would
413+ * be cleaned up automatically on process exit, i.e. when this function
414+ * returns. This includes filesystem mounts, as this function is
415+ * executed in a dedicated mount namespace.
416+ */
417+
418+ /*
419+ * For obvious reasons this won't work in rootless mode because we
420+ * haven't created a userns -- but getting that to work will be a bit
421+ * complicated and it's only worth doing if someone actually needs it.
422+ */
423+ if (mount ("none" , "/" , NULL , MS_SLAVE | MS_REC , NULL ) < 0 )
424+ return errno ;
425+ /*
426+ * The kernel resolves the magic symlink /proc/self/exe to the real file
427+ * _in the original mount namespace_. Cross-namespace bind mounts are
428+ * not allowed, so we must locate the file inside the current mount
429+ * namespace to be able to bind-mount it. (The mount(8) command resolves
430+ * symlinks, which is why it appears to work at first glance.)
431+ */
432+ char linkbuf [PATH_MAX + 1 ] = { 0 };
433+ ssize_t linkpathlen = readlink ("/proc/self/exe" , linkbuf , sizeof (linkbuf ));
434+ if (linkpathlen < 0 )
435+ return errno ;
436+ if (linkpathlen == sizeof (linkbuf )) {
437+ /*
438+ * The link path is longer than PATH_MAX, and the contents of
439+ * linkbuf might have been truncated. A truncated path could
440+ * happen to be a valid path to a different file, which could
441+ * allow for local privilege escalation if we were to exec it.
442+ * The mount syscall doesn't accept paths longer than PATH_MAX,
443+ * anyway.
444+ */
445+ return ENAMETOOLONG ;
446+ }
447+
448+ int srcfd = open (linkbuf , O_PATH | O_CLOEXEC );
449+ if (srcfd < 0 )
450+ return errno ;
451+ /*
452+ * linkbuf holds the path to the binary which the parent process was
453+ * launched from. Someone could have moved a different file to that path
454+ * in the interim, in which case srcfd is not the file we want to
455+ * bind-mount. Guard against this situation by verifying srcfd is the
456+ * same file as /proc/self/exe.
457+ */
458+ struct stat realexe = { 0 };
459+ if (stat ("/proc/self/exe" , & realexe ) < 0 )
460+ return errno ;
461+ struct stat resolved = { 0 };
462+ if (fstat (srcfd , & resolved ) < 0 )
463+ return errno ;
464+ if (resolved .st_dev != realexe .st_dev || resolved .st_ino != realexe .st_ino )
465+ return ENOENT ;
466+ if (snprintf (linkbuf , sizeof (linkbuf ), "/proc/self/fd/%d" , srcfd ) == sizeof (linkbuf ))
467+ return ENAMETOOLONG ;
468+
469+ const struct bindfd_child_args * args = arg ;
470+ if (mount (linkbuf , args -> mount_target , "" , MS_BIND , "" ) < 0 )
471+ return errno ;
472+ if (mount ("" , args -> mount_target , "" , MS_REMOUNT | MS_BIND | MS_RDONLY , "" ) < 0 )
473+ return errno ;
474+
475+ int fd = open (args -> mount_target , O_PATH | O_CLOEXEC );
476+ if (fd < 0 )
477+ return errno ;
478+
479+ /*
480+ * Make sure the MNT_DETACH works, otherwise we could get remounted
481+ * read-write and that would be quite bad.
482+ */
483+ if (umount2 (args -> mount_target , MNT_DETACH ) < 0 )
484+ return errno ;
485+
486+ if (send_fd (args -> sockfd , fd ) < 0 )
487+ return errno ;
488+ return 0 ;
489+ }
490+
491+ static int spawn_bindfd_child (const struct bindfd_child_args * args ) __attribute__((noinline ));
492+ static int spawn_bindfd_child (const struct bindfd_child_args * args )
493+ {
494+ /*
495+ * Carve out a chunk of our call stack for the child process to use as
496+ * we can be sure it is correctly mapped for use as stack. (Technically
497+ * only the libc clone() wrapper writes to this buffer. The child
498+ * process operates on a copy of the parent's virtual memory space and
499+ * so can safely overflow into the rest of the stack memory region
500+ * without consequence.)
501+ */
502+ char stack [4 * 1024 ] __attribute__((aligned (16 )));
503+ int tid = clone (bindfd_in_subprocess ,
504+ /*
505+ * Assume stack grows down, as HP-PA, the only Linux
506+ * platform where stack grows up, is obsolete.
507+ */
508+ stack + sizeof (stack ),
509+ /*
510+ * Suspend the parent process until the child has exited to
511+ * save an unnecessary context switch as we'd just be
512+ * waiting for the child process to exit anyway.
513+ */
514+ CLONE_NEWNS | CLONE_VFORK , (void * )args );
515+ if (tid < 0 )
516+ return - errno ;
517+ return tid ;
518+ }
519+
397520static int try_bindfd (void )
398521{
399522 int fd , ret = -1 ;
@@ -415,32 +538,53 @@ static int try_bindfd(void)
415538 close (fd );
416539
417540 /*
418- * For obvious reasons this won't work in rootless mode because we haven't
419- * created a userns+mntns -- but getting that to work will be a bit
420- * complicated and it's only worth doing if someone actually needs it.
541+ * Daemons such as systemd and udisks2 watch /proc/self/mountinfo and
542+ * re-parse it on every change, which gets expensive when the mount table
543+ * is large and/or changes frequently. Perform the mount operations in a
544+ * new, private mount namespace so as not to wake up those processes
545+ * every time we nsexec into a container. We clone a child process into
546+ * a new mount namespace to do the dirty work so the side effects of
547+ * unsharing the mount namespace do not leak into the current process.
421548 */
422- ret = - EPERM ;
423- if (mount ( "/proc/self/exe" , template , "" , MS_BIND , "" ) < 0 )
424- goto out ;
425- if ( mount ( "" , template , "" , MS_REMOUNT | MS_BIND | MS_RDONLY , "" ) < 0 )
426- goto out_umount ;
549+ int sock [ 2 ] ;
550+ if (socketpair ( AF_LOCAL , SOCK_STREAM , 0 , sock ) < 0 ) {
551+ ret = - errno ;
552+ goto cleanup_unlink ;
553+ }
427554
428- /* Get read-only handle that we're sure can't be made read-write. */
429- ret = open (template , O_PATH | O_CLOEXEC );
555+ struct bindfd_child_args args = {
556+ .sockfd = sock [0 ],
557+ .mount_target = template ,
558+ };
559+ int cpid = spawn_bindfd_child (& args );
560+ close (sock [0 ]);
561+ if (cpid < 0 ) {
562+ ret = cpid ;
563+ goto cleanup_socketpair ;
564+ }
430565
431- out_umount :
432- /*
433- * Make sure the MNT_DETACH works, otherwise we could get remounted
434- * read-write and that would be quite bad (the fd would be made read-write
435- * too, invalidating the protection).
436- */
437- if (umount2 (template , MNT_DETACH ) < 0 ) {
438- if (ret >= 0 )
439- close (ret );
440- ret = - ENOTRECOVERABLE ;
566+ int wstatus = 0 ;
567+ if (waitpid (cpid , & wstatus , __WCLONE ) < 0 )
568+ bail ("error waiting for bindfd child process to exit" );
569+ if (WIFEXITED (wstatus )) {
570+ if (WEXITSTATUS (wstatus )) {
571+ ret = - WEXITSTATUS (wstatus );
572+ goto cleanup_socketpair ;
573+ }
574+ } else if (WIFSIGNALED (wstatus )) {
575+ int sig = WTERMSIG (wstatus );
576+ bail ("bindfd child process terminated by signal %d (%s)" , sig , strsignal (sig ));
577+ } else {
578+ /* Should never happen... */
579+ bail ("unexpected waitpid() status for bindfd child process: 0x%x" , wstatus );
441580 }
442581
443- out :
582+ ret = receive_fd (sock [1 ]);
583+
584+ cleanup_socketpair :
585+ close (sock [1 ]);
586+
587+ cleanup_unlink :
444588 /*
445589 * We don't care about unlink errors, the worst that happens is that
446590 * there's an empty file left around in STATEDIR.
0 commit comments