Skip to content

Commit db2e718

Browse files
hallyntorvalds
authored andcommitted
capabilities: require CAP_SETFCAP to map uid 0
cap_setfcap is required to create file capabilities. Since commit 8db6c34 ("Introduce v3 namespaced file capabilities"), a process running as uid 0 but without cap_setfcap is able to work around this as follows: unshare a new user namespace which maps parent uid 0 into the child namespace. While this task will not have new capabilities against the parent namespace, there is a loophole due to the way namespaced file capabilities are represented as xattrs. File capabilities valid in userns 1 are distinguished from file capabilities valid in userns 2 by the kuid which underlies uid 0. Therefore the restricted root process can unshare a new self-mapping namespace, add a namespaced file capability onto a file, then use that file capability in the parent namespace. To prevent that, do not allow mapping parent uid 0 if the process which opened the uid_map file does not have CAP_SETFCAP, which is the capability for setting file capabilities. As a further wrinkle: a task can unshare its user namespace, then open its uid_map file itself, and map (only) its own uid. In this case we do not have the credential from before unshare, which was potentially more restricted. So, when creating a user namespace, we record whether the creator had CAP_SETFCAP. Then we can use that during map_write(). With this patch: 1. Unprivileged user can still unshare -Ur ubuntu@caps:~$ unshare -Ur root@caps:~# logout 2. Root user can still unshare -Ur ubuntu@caps:~$ sudo bash root@caps:/home/ubuntu# unshare -Ur root@caps:/home/ubuntu# logout 3. Root user without CAP_SETFCAP cannot unshare -Ur: root@caps:/home/ubuntu# /sbin/capsh --drop=cap_setfcap -- root@caps:/home/ubuntu# /sbin/setcap cap_setfcap=p /sbin/setcap unable to set CAP_SETFCAP effective capability: Operation not permitted root@caps:/home/ubuntu# unshare -Ur unshare: write failed /proc/self/uid_map: Operation not permitted Note: an alternative solution would be to allow uid 0 mappings by processes without CAP_SETFCAP, but to prevent such a namespace from writing any file capabilities. This approach can be seen at [1]. Background history: commit 95ebabd ("capabilities: Don't allow writing ambiguous v3 file capabilities") tried to fix the issue by preventing v3 fscaps to be written to disk when the root uid would map to the same uid in nested user namespaces. This led to regressions for various workloads. For example, see [2]. Ultimately this is a valid use-case we have to support meaning we had to revert this change in 3b0c2d3 ("Revert 95ebabd ("capabilities: Don't allow writing ambiguous v3 file capabilities")"). Link: https://git.kernel.org/pub/scm/linux/kernel/git/sergeh/linux.git/log/?h=2021-04-15/setfcap-nsfscaps-v4 [1] Link: containers/buildah#3071 [2] Signed-off-by: Serge Hallyn <serge@hallyn.com> Reviewed-by: Andrew G. Morgan <morgan@kernel.org> Tested-by: Christian Brauner <christian.brauner@ubuntu.com> Reviewed-by: Christian Brauner <christian.brauner@ubuntu.com> Tested-by: Giuseppe Scrivano <gscrivan@redhat.com> Cc: Eric Biederman <ebiederm@xmission.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent 7af0814 commit db2e718

3 files changed

Lines changed: 67 additions & 4 deletions

File tree

include/linux/user_namespace.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,9 @@ struct user_namespace {
6363
kgid_t group;
6464
struct ns_common ns;
6565
unsigned long flags;
66+
/* parent_could_setfcap: true if the creator if this ns had CAP_SETFCAP
67+
* in its effective capability set at the child ns creation time. */
68+
bool parent_could_setfcap;
6669

6770
#ifdef CONFIG_KEYS
6871
/* List of joinable keyrings in this namespace. Modification access of

include/uapi/linux/capability.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -335,7 +335,8 @@ struct vfs_ns_cap_data {
335335

336336
#define CAP_AUDIT_CONTROL 30
337337

338-
/* Set or remove capabilities on files */
338+
/* Set or remove capabilities on files.
339+
Map uid=0 into a child user namespace. */
339340

340341
#define CAP_SETFCAP 31
341342

kernel/user_namespace.c

Lines changed: 62 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,7 @@ int create_user_ns(struct cred *new)
106106
if (!ns)
107107
goto fail_dec;
108108

109+
ns->parent_could_setfcap = cap_raised(new->cap_effective, CAP_SETFCAP);
109110
ret = ns_alloc_inum(&ns->ns);
110111
if (ret)
111112
goto fail_free;
@@ -841,14 +842,68 @@ static int sort_idmaps(struct uid_gid_map *map)
841842
return 0;
842843
}
843844

845+
/**
846+
* verify_root_map() - check the uid 0 mapping
847+
* @file: idmapping file
848+
* @map_ns: user namespace of the target process
849+
* @new_map: requested idmap
850+
*
851+
* If a process requests mapping parent uid 0 into the new ns, verify that the
852+
* process writing the map had the CAP_SETFCAP capability as the target process
853+
* will be able to write fscaps that are valid in ancestor user namespaces.
854+
*
855+
* Return: true if the mapping is allowed, false if not.
856+
*/
857+
static bool verify_root_map(const struct file *file,
858+
struct user_namespace *map_ns,
859+
struct uid_gid_map *new_map)
860+
{
861+
int idx;
862+
const struct user_namespace *file_ns = file->f_cred->user_ns;
863+
struct uid_gid_extent *extent0 = NULL;
864+
865+
for (idx = 0; idx < new_map->nr_extents; idx++) {
866+
if (new_map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
867+
extent0 = &new_map->extent[idx];
868+
else
869+
extent0 = &new_map->forward[idx];
870+
if (extent0->lower_first == 0)
871+
break;
872+
873+
extent0 = NULL;
874+
}
875+
876+
if (!extent0)
877+
return true;
878+
879+
if (map_ns == file_ns) {
880+
/* The process unshared its ns and is writing to its own
881+
* /proc/self/uid_map. User already has full capabilites in
882+
* the new namespace. Verify that the parent had CAP_SETFCAP
883+
* when it unshared.
884+
* */
885+
if (!file_ns->parent_could_setfcap)
886+
return false;
887+
} else {
888+
/* Process p1 is writing to uid_map of p2, who is in a child
889+
* user namespace to p1's. Verify that the opener of the map
890+
* file has CAP_SETFCAP against the parent of the new map
891+
* namespace */
892+
if (!file_ns_capable(file, map_ns->parent, CAP_SETFCAP))
893+
return false;
894+
}
895+
896+
return true;
897+
}
898+
844899
static ssize_t map_write(struct file *file, const char __user *buf,
845900
size_t count, loff_t *ppos,
846901
int cap_setid,
847902
struct uid_gid_map *map,
848903
struct uid_gid_map *parent_map)
849904
{
850905
struct seq_file *seq = file->private_data;
851-
struct user_namespace *ns = seq->private;
906+
struct user_namespace *map_ns = seq->private;
852907
struct uid_gid_map new_map;
853908
unsigned idx;
854909
struct uid_gid_extent extent;
@@ -895,7 +950,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
895950
/*
896951
* Adjusting namespace settings requires capabilities on the target.
897952
*/
898-
if (cap_valid(cap_setid) && !file_ns_capable(file, ns, CAP_SYS_ADMIN))
953+
if (cap_valid(cap_setid) && !file_ns_capable(file, map_ns, CAP_SYS_ADMIN))
899954
goto out;
900955

901956
/* Parse the user data */
@@ -965,7 +1020,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
9651020

9661021
ret = -EPERM;
9671022
/* Validate the user is allowed to use user id's mapped to. */
968-
if (!new_idmap_permitted(file, ns, cap_setid, &new_map))
1023+
if (!new_idmap_permitted(file, map_ns, cap_setid, &new_map))
9691024
goto out;
9701025

9711026
ret = -EPERM;
@@ -1086,6 +1141,10 @@ static bool new_idmap_permitted(const struct file *file,
10861141
struct uid_gid_map *new_map)
10871142
{
10881143
const struct cred *cred = file->f_cred;
1144+
1145+
if (cap_setid == CAP_SETUID && !verify_root_map(file, ns, new_map))
1146+
return false;
1147+
10891148
/* Don't allow mappings that would allow anything that wouldn't
10901149
* be allowed without the establishment of unprivileged mappings.
10911150
*/

0 commit comments

Comments
 (0)