diff --git a/Makefile b/Makefile index 0a15fd908ea..1ddaefaa29a 100644 --- a/Makefile +++ b/Makefile @@ -207,7 +207,7 @@ install-man: man .PHONY: cfmt cfmt: C_SRC=$(shell git ls-files '*.c' | grep -v '^vendor/') cfmt: - indent -linux -l120 -il0 -ppi2 -cp1 -T size_t -T jmp_buf $(C_SRC) + indent -linux -l120 -il0 -ppi2 -cp1 -sar -T size_t -T jmp_buf $(C_SRC) .PHONY: shellcheck shellcheck: diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c index 565b2ca2030..607d495a263 100644 --- a/libcontainer/nsenter/nsexec.c +++ b/libcontainer/nsenter/nsexec.c @@ -322,30 +322,6 @@ static int clone_parent(jmp_buf *env, int jmpval) return clone(child_func, ca.stack_ptr, CLONE_PARENT | SIGCHLD, &ca); } -/* Returns the clone(2) flag for a namespace, given the name of a namespace. */ -static int nsflag(char *name) -{ - if (!strcmp(name, "cgroup")) - return CLONE_NEWCGROUP; - else if (!strcmp(name, "ipc")) - return CLONE_NEWIPC; - else if (!strcmp(name, "mnt")) - return CLONE_NEWNS; - else if (!strcmp(name, "net")) - return CLONE_NEWNET; - else if (!strcmp(name, "pid")) - return CLONE_NEWPID; - else if (!strcmp(name, "user")) - return CLONE_NEWUSER; - else if (!strcmp(name, "uts")) - return CLONE_NEWUTS; - else if (!strcmp(name, "time")) - return CLONE_NEWTIME; - - /* If we don't recognise a name, fallback to 0. */ - return 0; -} - static uint32_t readint32(char *buf) { return *(uint32_t *) buf; @@ -444,35 +420,67 @@ void nl_free(struct nlconfig_t *config) free(config->data); } -void join_namespaces(char *nslist) -{ - int num = 0, i; - char *saveptr = NULL; - char *namespace = strtok_r(nslist, ",", &saveptr); - struct namespace_t { - int fd; - char type[PATH_MAX]; - char path[PATH_MAX]; - } *namespaces = NULL; +struct namespace_t { + int fd; + char type[PATH_MAX]; + char path[PATH_MAX]; +}; - if (!namespace || !strlen(namespace) || !strlen(nslist)) - bail("ns paths are empty"); +typedef int nsset_t; + +static struct nstype_t { + int type; + char *name; +} all_ns_types[] = { + { CLONE_NEWCGROUP, "cgroup" }, + { CLONE_NEWIPC, "ipc" }, + { CLONE_NEWNS, "mnt" }, + { CLONE_NEWNET, "net" }, + { CLONE_NEWPID, "pid" }, + { CLONE_NEWTIME, "time" }, + { CLONE_NEWUSER, "user" }, + { CLONE_NEWUTS, "uts" }, + { }, /* null terminator */ +}; +/* Returns the clone(2) flag for a namespace, given the name of a namespace. */ +static int nstype(char *name) +{ + for (struct nstype_t * ns = all_ns_types; ns->name != NULL; ns++) + if (!strcmp(name, ns->name)) + return ns->type; /* - * We have to open the file descriptors first, since after - * we join the mnt namespace we might no longer be able to - * access the paths. + * setns(2) lets us join namespaces without knowing the type, but + * namespaces usually require special handling of some kind (so joining + * a namespace without knowing its type or joining a new namespace type + * without corresponding handling could result in broken behaviour) and + * the rest of runc doesn't allow unknown namespace types anyway. */ + bail("unknown namespace type %s", name); +} + +static nsset_t __open_namespaces(char *nsspec, struct namespace_t **ns_list, size_t *ns_len) +{ + int len = 0; + nsset_t ns_to_join = 0; + char *namespace, *saveptr = NULL; + struct namespace_t *namespaces = NULL; + + namespace = strtok_r(nsspec, ",", &saveptr); + + if (!namespace || !strlen(namespace) || !strlen(nsspec)) + bail("ns paths are empty"); + do { int fd; char *path; struct namespace_t *ns; /* Resize the namespace array. */ - namespaces = realloc(namespaces, ++num * sizeof(struct namespace_t)); + namespaces = realloc(namespaces, ++len * sizeof(struct namespace_t)); if (!namespaces) bail("failed to reallocate namespace array"); - ns = &namespaces[num - 1]; + ns = &namespaces[len - 1]; /* Split 'ns:path'. */ path = strstr(namespace, ":"); @@ -488,22 +496,43 @@ void join_namespaces(char *nslist) strncpy(ns->type, namespace, PATH_MAX - 1); strncpy(ns->path, path, PATH_MAX - 1); ns->path[PATH_MAX - 1] = '\0'; - } while ((namespace = strtok_r(NULL, ",", &saveptr)) != NULL); - /* - * The ordering in which we join namespaces is important. We should - * always join the user namespace *first*. This is all guaranteed - * from the container_linux.go side of this, so we're just going to - * follow the order given to us. - */ + ns_to_join |= nstype(ns->type); + } while ((namespace = strtok_r(NULL, ",", &saveptr)) != NULL); - for (i = 0; i < num; i++) { - struct namespace_t *ns = &namespaces[i]; - int flag = nsflag(ns->type); + *ns_list = namespaces; + *ns_len = len; + return ns_to_join; +} - write_log(DEBUG, "setns(%#x) into %s namespace (with path %s)", flag, ns->type, ns->path); - if (setns(ns->fd, flag) < 0) +/* + * Try to join all namespaces that are in the "allow" nsset, and return the + * set we were able to successfully join. If a permission error is returned + * from nsset(2), the namespace is skipped (non-permission errors are fatal). + */ +static nsset_t __join_namespaces(nsset_t allow, struct namespace_t *ns_list, size_t ns_len) +{ + nsset_t joined = 0; + + for (size_t i = 0; i < ns_len; i++) { + struct namespace_t *ns = &ns_list[i]; + int type = nstype(ns->type); + int err, saved_errno; + + if (!(type & allow)) + continue; + + err = setns(ns->fd, type); + saved_errno = errno; + write_log(DEBUG, "setns(%#x) into %s namespace (with path %s): %s", + type, ns->type, ns->path, strerror(errno)); + if (err < 0) { + /* Skip permission errors. */ + if (saved_errno == EPERM) + continue; bail("failed to setns into %s namespace", ns->type); + } + joined |= type; /* * If we change user namespaces, make sure we switch to root in the @@ -511,15 +540,101 @@ void join_namespaces(char *nslist) * of things can break if we aren't the right user. See * for one example. */ - if (flag == CLONE_NEWUSER) { + if (type == CLONE_NEWUSER) { if (setresuid(0, 0, 0) < 0) bail("failed to become root in user namespace"); } close(ns->fd); + ns->fd = -1; + } + return joined; +} + +static char *strappend(char *dst, char *src) +{ + if (!dst) + return strdup(src); + + size_t len = strlen(dst) + strlen(src) + 1; + dst = realloc(dst, len); + strncat(dst, src, len); + return dst; +} + +static char *nsset_to_str(nsset_t nsset) +{ + char *str = NULL; + for (struct nstype_t * ns = all_ns_types; ns->name != NULL; ns++) { + if (ns->type & nsset) { + if (str) + str = strappend(str, ", "); + str = strappend(str, ns->name); + } + } + return str ? : strdup(""); +} + +static void __close_namespaces(nsset_t to_join, nsset_t joined, struct namespace_t *ns_list, size_t ns_len) +{ + /* We expect to have joined every namespace. */ + nsset_t failed_to_join = to_join & ~joined; + + /* Double-check that we used up (and thus joined) all of the nsfds. */ + for (size_t i = 0; i < ns_len; i++) { + struct namespace_t *ns = &ns_list[i]; + int type = nstype(ns->type); + + if (ns->fd < 0) + continue; + + failed_to_join |= type; + write_log(FATAL, "failed to setns(%#x) into %s namespace (with path %s): %s", + type, ns->type, ns->path, strerror(EPERM)); + close(ns->fd); + ns->fd = -1; } - free(namespaces); + /* Make sure we joined the namespaces we planned to. */ + if (failed_to_join) + bail("failed to join {%s} namespaces: %s", nsset_to_str(failed_to_join), strerror(EPERM)); + + free(ns_list); +} + +void join_namespaces(char *nsspec) +{ + nsset_t to_join = 0, joined = 0; + struct namespace_t *ns_list; + size_t ns_len; + + /* + * We have to open the file descriptors first, since after we join the + * mnt or user namespaces we might no longer be able to access the + * paths. + */ + to_join = __open_namespaces(nsspec, &ns_list, &ns_len); + + /* + * We first try to join all non-userns namespaces to join any namespaces + * that we might not be able to join once we switch credentials to the + * container's userns. We then join the user namespace, and then try to + * join any remaining namespaces (this last step is needed for rootless + * containers -- we don't get setns(2) permissions until we join the userns + * and get CAP_SYS_ADMIN). + * + * Splitting the joins this way is necessary for containers that are + * configured to join some externally-created namespace but are also + * configured to join an unrelated user namespace. + * + * This is similar to what nsenter(1) seems to do in practice. + */ + joined |= __join_namespaces(to_join & ~(joined | CLONE_NEWUSER), ns_list, ns_len); + joined |= __join_namespaces(CLONE_NEWUSER, ns_list, ns_len); + joined |= __join_namespaces(to_join & ~(joined | CLONE_NEWUSER), ns_list, ns_len); + + /* Verify that we joined all of the namespaces. */ + __close_namespaces(to_join, joined, ns_list, ns_len); } static inline int sane_kill(pid_t pid, int signum) diff --git a/tests/integration/userns.bats b/tests/integration/userns.bats index 78583ba4d45..67766401693 100644 --- a/tests/integration/userns.bats +++ b/tests/integration/userns.bats @@ -29,7 +29,7 @@ function teardown() { if [ -v to_umount_list ]; then while read -r mount_path; do umount -l "$mount_path" || : - rm -f "$mount_path" + rm -rf "$mount_path" done <"$to_umount_list" rm -f "$to_umount_list" unset to_umount_list @@ -184,3 +184,65 @@ function teardown() { grep -E '^\s+0\s+'$EUID'\s+1$' <<<"$output" fi } + +# +@test "userns join external namespaces [wrong userns owner]" { + requires root + + # Create an external user namespace for us to join. It seems on some + # operating systems (AlmaLinux in particular) "unshare -U" will + # automatically use an identity mapping (which breaks this test) so we need + # to use runc to create the userns. + update_config '.process.args = ["sleep", "infinity"]' + runc run -d --console-socket "$CONSOLE_SOCKET" target_userns + [ "$status" -eq 0 ] + + # Bind-mount the first containers userns nsfd to a different path, to + # exercise the non-fast-path (where runc has to join the userns to get the + # mappings). + userns_pid="$(__runc state target_userns | jq .pid)" + userns_path="$(mktemp "$BATS_RUN_TMPDIR/userns.XXXXXX")" + mount --bind "/proc/$userns_pid/ns/user" "$userns_path" + echo "$userns_path" >>"$to_umount_list" + + # Kill the container -- we have the userns bind-mounted. + runc delete -f target_userns + [ "$status" -eq 0 ] + + # Configure our container to attach to the external userns. + update_config '.linux.namespaces |= map(if .type == "user" then (.path = "'"$userns_path"'") else . end) + | del(.linux.uidMappings) + | del(.linux.gidMappings)' + + # Also create a network namespace that *is not owned* by the above userns. + # NOTE: Having no permissions in a namespaces makes it necessary to modify + # the config so that we don't get mount errors (for reference: no netns + # permissions == no sysfs mounts, no pidns permissoins == no procfs mounts, + # no utsns permissions == no sethostname(2), no ipc permissions == no + # mqueue mounts, etc). + netns_path="$(mktemp "$BATS_RUN_TMPDIR/netns.XXXXXX")" + unshare -i -- mount --bind "/proc/self/ns/net" "$netns_path" + echo "$netns_path" >>"$to_umount_list" + # Configure our container to attach to the external netns. + update_config '.linux.namespaces |= map(if .type == "network" then (.path = "'"$netns_path"'") else . end)' + + # Convert sysfs mounts to a bind-mount from the host, to avoid permission + # issues due to the netns setup we have. + update_config '.mounts |= map((select(.type == "sysfs") | { "source": "/sys", "destination": .destination, "type": "bind", "options": ["rbind"] }) // .)' + + # Create a detached container to verify the namespaces are correct. + update_config '.process.args = ["sleep", "infinity"]' + runc --debug run -d --console-socket "$CONSOLE_SOCKET" ctr + [ "$status" -eq 0 ] + + userns_id="user:[$(stat -c "%i" "$userns_path")]" + netns_id="net:[$(stat -c "%i" "$netns_path")]" + + runc exec ctr readlink /proc/self/ns/user + [ "$status" -eq 0 ] + [[ "$output" == "$userns_id" ]] + + runc exec ctr readlink /proc/self/ns/net + [ "$status" -eq 0 ] + [[ "$output" == "$netns_id" ]] +}