Skip to content

Commit 9caccd4

Browse files
author
Christian Brauner
committed
fs: introduce MOUNT_ATTR_IDMAP
Introduce a new bind mount property to allow idmapping mounts. The MOUNT_ATTR_IDMAP flag can be set via the new mount_setattr() syscall together with a file descriptor referring to a user namespace. The user namespace referenced by the namespace file descriptor will be attached to the bind mount. All interactions with the filesystem going through that mount will be mapped according to the mapping specified in the user namespace attached to it. Using user namespaces to mark mounts means we can reuse all the infrastructure that already exists in the kernel to handle idmappings and can also use this for permission checking to allow unprivileged users to create idmapped mounts in the future. Idmapping a mount is decoupled from the caller's user and mount namespace. This means idmapped mounts can be created in the initial user namespace, which is an important use-case for systemd-homed, portable usb-sticks between systems, sharing data between the initial user namespace and unprivileged containers, and other use-cases that have been brought up. For example, assume a home directory where all files are owned by uid and gid 1000 and the home directory is brought to a new laptop where the user has id 12345. The system administrator can simply create a mount of this home directory with a mapping of 1000:12345:1 and other mappings to indicate the ids should be kept. (With this it is e.g. also possible to create idmapped mounts on the host with an identity mapping 1:1:100000 where the root user is not mapped. A user with root access that e.g. has been pivot rooted into such a mount on the host will not be able to execute, read, write, or create files as root.) Given that mapping a mount is decoupled from the caller's user namespace, a sufficiently privileged process such as a container manager can set up an idmapped mount for the container and the container can simply pivot root to it. There's no need for the container to do anything. 
The mount will appear correctly mapped independent of the user namespace the container uses. This means we don't need to mark a mount as idmappable. In order to create an idmapped mount, the caller must currently be privileged in the user namespace of the superblock the mount belongs to. Once a mount has been idmapped we don't allow it to change its mapping. This keeps permission checking and life-cycle management simple. Users wanting to change the idmapping can always create a new detached mount with a different idmapping. Link: https://lore.kernel.org/r/[email protected] Cc: Christoph Hellwig <[email protected]> Cc: David Howells <[email protected]> Cc: Mauricio Vásquez Bernal <[email protected]> Cc: Al Viro <[email protected]> Cc: [email protected] Reviewed-by: Christoph Hellwig <[email protected]> Signed-off-by: Christian Brauner <[email protected]>
1 parent 2a18672 commit 9caccd4

File tree

4 files changed

+121
-8
lines changed

4 files changed

+121
-8
lines changed

fs/namespace.c

Lines changed: 115 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
#include <linux/proc_ns.h>
2626
#include <linux/magic.h>
2727
#include <linux/memblock.h>
28+
#include <linux/proc_fs.h>
2829
#include <linux/task_work.h>
2930
#include <linux/sched/task.h>
3031
#include <uapi/linux/mount.h>
@@ -79,6 +80,7 @@ struct mount_kattr {
7980
unsigned int propagation;
8081
unsigned int lookup_flags;
8182
bool recurse;
83+
struct user_namespace *mnt_userns;
8284
};
8385

8486
/* /sys/fs */
@@ -3477,7 +3479,7 @@ SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
34773479
(MOUNT_ATTR_RDONLY | MOUNT_ATTR_NOSUID | MOUNT_ATTR_NODEV | \
34783480
MOUNT_ATTR_NOEXEC | MOUNT_ATTR__ATIME | MOUNT_ATTR_NODIRATIME)
34793481

3480-
#define MOUNT_SETATTR_VALID_FLAGS FSMOUNT_VALID_FLAGS
3482+
#define MOUNT_SETATTR_VALID_FLAGS (FSMOUNT_VALID_FLAGS | MOUNT_ATTR_IDMAP)
34813483

34823484
#define MOUNT_SETATTR_PROPAGATION_FLAGS \
34833485
(MS_UNBINDABLE | MS_PRIVATE | MS_SLAVE | MS_SHARED)
@@ -3845,6 +3847,36 @@ static unsigned int recalc_flags(struct mount_kattr *kattr, struct mount *mnt)
38453847
return flags;
38463848
}
38473849

3850+
static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
3851+
{
3852+
struct vfsmount *m = &mnt->mnt;
3853+
3854+
if (!kattr->mnt_userns)
3855+
return 0;
3856+
3857+
/*
3858+
* Once a mount has been idmapped we don't allow it to change its
3859+
* mapping. It makes things simpler and callers can just create
3860+
* another bind-mount they can idmap if they want to.
3861+
*/
3862+
if (mnt_user_ns(m) != &init_user_ns)
3863+
return -EPERM;
3864+
3865+
/* The underlying filesystem doesn't support idmapped mounts yet. */
3866+
if (!(m->mnt_sb->s_type->fs_flags & FS_ALLOW_IDMAP))
3867+
return -EINVAL;
3868+
3869+
/* We're not controlling the superblock. */
3870+
if (!ns_capable(m->mnt_sb->s_user_ns, CAP_SYS_ADMIN))
3871+
return -EPERM;
3872+
3873+
/* Mount has already been visible in the filesystem hierarchy. */
3874+
if (!is_anon_ns(mnt->mnt_ns))
3875+
return -EINVAL;
3876+
3877+
return 0;
3878+
}
3879+
38483880
static struct mount *mount_setattr_prepare(struct mount_kattr *kattr,
38493881
struct mount *mnt, int *err)
38503882
{
@@ -3869,6 +3901,10 @@ static struct mount *mount_setattr_prepare(struct mount_kattr *kattr,
38693901
goto out;
38703902
}
38713903

3904+
*err = can_idmap_mount(kattr, m);
3905+
if (*err)
3906+
goto out;
3907+
38723908
last = m;
38733909

38743910
if ((kattr->attr_set & MNT_READONLY) &&
@@ -3883,6 +3919,18 @@ static struct mount *mount_setattr_prepare(struct mount_kattr *kattr,
38833919
return last;
38843920
}
38853921

3922+
static void do_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
3923+
{
3924+
struct user_namespace *mnt_userns;
3925+
3926+
if (!kattr->mnt_userns)
3927+
return;
3928+
3929+
mnt_userns = get_user_ns(kattr->mnt_userns);
3930+
/* Pairs with smp_load_acquire() in mnt_user_ns(). */
3931+
smp_store_release(&mnt->mnt.mnt_userns, mnt_userns);
3932+
}
3933+
38863934
static void mount_setattr_commit(struct mount_kattr *kattr,
38873935
struct mount *mnt, struct mount *last,
38883936
int err)
@@ -3893,6 +3941,7 @@ static void mount_setattr_commit(struct mount_kattr *kattr,
38933941
if (!err) {
38943942
unsigned int flags;
38953943

3944+
do_idmap_mount(kattr, m);
38963945
flags = recalc_flags(kattr, m);
38973946
WRITE_ONCE(m->mnt.mnt_flags, flags);
38983947
}
@@ -3965,7 +4014,62 @@ static int do_mount_setattr(struct path *path, struct mount_kattr *kattr)
39654014
return err;
39664015
}
39674016

3968-
static int build_mount_kattr(const struct mount_attr *attr,
4017+
/*
 * build_mount_idmapped - resolve @attr->userns_fd into @kattr->mnt_userns
 * @attr:  userspace-supplied mount attributes
 * @usize: size of the userspace structure (not used by this function)
 * @kattr: kernel-side attribute request to fill in
 * @flags: syscall flags (not used by this function)
 *
 * Returns 0 without touching @kattr when MOUNT_ATTR_IDMAP appears in neither
 * @attr->attr_set nor @attr->attr_clr. On success, @kattr->mnt_userns holds a
 * reference obtained with get_user_ns(). Otherwise returns:
 *   -EINVAL if MOUNT_ATTR_IDMAP is in attr_clr, if userns_fd exceeds INT_MAX,
 *           if the file is not a proc namespace file, or if the namespace is
 *           not a user namespace;
 *   -EBADF  if userns_fd does not refer to an open file;
 *   -EPERM  if the namespace is init_user_ns.
 */
static int build_mount_idmapped(const struct mount_attr *attr, size_t usize,
				struct mount_kattr *kattr, unsigned int flags)
{
	struct user_namespace *userns;
	struct ns_common *nsc;
	struct file *nsfile;
	int ret;

	/* The request does not mention idmapping at all. */
	if (!(attr->attr_set & MOUNT_ATTR_IDMAP) &&
	    !(attr->attr_clr & MOUNT_ATTR_IDMAP))
		return 0;

	/*
	 * Clearing an idmapping is currently unsupported. This can be
	 * revisited if a use-case ever shows up; for now keep it simple.
	 */
	if (attr->attr_clr & MOUNT_ATTR_IDMAP)
		return -EINVAL;

	if (attr->userns_fd > INT_MAX)
		return -EINVAL;

	nsfile = fget(attr->userns_fd);
	if (!nsfile)
		return -EBADF;

	/* Both of the next two checks fail with -EINVAL. */
	ret = -EINVAL;
	if (!proc_ns_file(nsfile))
		goto out_fput;

	nsc = get_proc_ns(file_inode(nsfile));
	if (nsc->ops->type != CLONE_NEWUSER)
		goto out_fput;

	/*
	 * init_user_ns marks a vfsmount as "not idmapped", which is simpler
	 * than treating NULL as unmapped. Callers that want the effect of
	 * idmapping to init_user_ns can pass a namespace with an identity
	 * mapping instead.
	 */
	userns = container_of(nsc, struct user_namespace, ns);
	ret = -EPERM;
	if (userns == &init_user_ns)
		goto out_fput;

	kattr->mnt_userns = get_user_ns(userns);
	ret = 0;

out_fput:
	fput(nsfile);
	return ret;
}
4071+
4072+
static int build_mount_kattr(const struct mount_attr *attr, size_t usize,
39694073
struct mount_kattr *kattr, unsigned int flags)
39704074
{
39714075
unsigned int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW;
@@ -3991,9 +4095,6 @@ static int build_mount_kattr(const struct mount_attr *attr,
39914095
if ((attr->attr_set | attr->attr_clr) & ~MOUNT_SETATTR_VALID_FLAGS)
39924096
return -EINVAL;
39934097

3994-
if (attr->userns_fd)
3995-
return -EINVAL;
3996-
39974098
kattr->attr_set = attr_flags_to_mnt_flags(attr->attr_set);
39984099
kattr->attr_clr = attr_flags_to_mnt_flags(attr->attr_clr);
39994100

@@ -4032,7 +4133,13 @@ static int build_mount_kattr(const struct mount_attr *attr,
40324133
return -EINVAL;
40334134
}
40344135

4035-
return 0;
4136+
return build_mount_idmapped(attr, usize, kattr, flags);
4137+
}
4138+
4139+
static void finish_mount_kattr(struct mount_kattr *kattr)
4140+
{
4141+
put_user_ns(kattr->mnt_userns);
4142+
kattr->mnt_userns = NULL;
40364143
}
40374144

40384145
SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path,
@@ -4070,7 +4177,7 @@ SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path,
40704177
attr.propagation == 0)
40714178
return 0;
40724179

4073-
err = build_mount_kattr(&attr, &kattr, flags);
4180+
err = build_mount_kattr(&attr, usize, &kattr, flags);
40744181
if (err)
40754182
return err;
40764183

@@ -4079,6 +4186,7 @@ SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path,
40794186
return err;
40804187

40814188
err = do_mount_setattr(&target, &kattr);
4189+
finish_mount_kattr(&kattr);
40824190
path_put(&target);
40834191
return err;
40844192
}

fs/proc_namespace.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,9 @@ static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt)
7979
if (mnt->mnt_flags & fs_infop->flag)
8080
seq_puts(m, fs_infop->str);
8181
}
82+
83+
if (mnt_user_ns(mnt) != &init_user_ns)
84+
seq_puts(m, ",idmapped");
8285
}
8386

8487
static inline void mangle(struct seq_file *m, const char *s)

include/linux/mount.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,8 @@ struct vfsmount {
7777

7878
static inline struct user_namespace *mnt_user_ns(const struct vfsmount *mnt)
7979
{
80-
return mnt->mnt_userns;
80+
/* Pairs with smp_store_release() in do_idmap_mount(). */
81+
return smp_load_acquire(&mnt->mnt_userns);
8182
}
8283

8384
struct file; /* forward dec */

include/uapi/linux/mount.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,7 @@ enum fsconfig_command {
119119
#define MOUNT_ATTR_NOATIME 0x00000010 /* - Do not update access times. */
120120
#define MOUNT_ATTR_STRICTATIME 0x00000020 /* - Always perform atime updates */
121121
#define MOUNT_ATTR_NODIRATIME 0x00000080 /* Do not update directory access times */
122+
#define MOUNT_ATTR_IDMAP 0x00100000 /* Idmap mount to @userns_fd in struct mount_attr. */
122123

123124
/*
124125
* mount_setattr()

0 commit comments

Comments
 (0)