Skip to content

Commit 73b6fa8

Browse files
committed
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace
Pull user namespace updates from Eric Biederman: "This finishes up the changes to ensure proc and sysfs do not start implementing executable files, as there are applications today that are only secure because such files do not exist. It also fixes a long-standing misfeature of /proc/<pid>/mountinfo that did not show the proper source for files bind mounted from /proc/<pid>/ns/*. It also straightens out the handling of clone flags related to user namespaces, fixing an unnecessary failure of unshare(CLONE_NEWUSER) when files such as /proc/<pid>/environ are read while <pid> is calling unshare. This winds up fixing a minor bug in unshare flag handling that dates back to the first version of unshare in the kernel. Finally, this fixes a minor regression caused by the introduction of sysfs_create_mount_point, which broke someone's in-house application, by restoring the size of /sys/fs/cgroup to 0 bytes. Apparently that application uses the directory size to determine if a tmpfs is mounted on /sys/fs/cgroup. The bind mount escape fixes are present in Al Viro's for-next branch, and I expect them to come from there. The bind mount escape is the last of the user namespace related security bugs that I am aware of." * 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace: fs: Set the size of empty dirs to 0. userns,pidns: Force thread group sharing, not signal handler sharing. unshare: Unsharing a thread does not require unsharing a vm nsfs: Add a show_path method to fix mountinfo mnt: fs_fully_visible enforce noexec and nosuid if !SB_I_NOEXEC vfs: Commit to never having exectuables on proc and sysfs.
2 parents e713c80 + 4b75de8 commit 73b6fa8

File tree

14 files changed

+83
-34
lines changed

14 files changed

+83
-34
lines changed

fs/exec.c

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,12 @@ static inline void put_binfmt(struct linux_binfmt * fmt)
9898
module_put(fmt->module);
9999
}
100100

101+
bool path_noexec(const struct path *path)
102+
{
103+
return (path->mnt->mnt_flags & MNT_NOEXEC) ||
104+
(path->mnt->mnt_sb->s_iflags & SB_I_NOEXEC);
105+
}
106+
101107
#ifdef CONFIG_USELIB
102108
/*
103109
* Note that a shared library must be both readable and executable due to
@@ -132,7 +138,7 @@ SYSCALL_DEFINE1(uselib, const char __user *, library)
132138
goto exit;
133139

134140
error = -EACCES;
135-
if (file->f_path.mnt->mnt_flags & MNT_NOEXEC)
141+
if (path_noexec(&file->f_path))
136142
goto exit;
137143

138144
fsnotify_open(file);
@@ -777,7 +783,7 @@ static struct file *do_open_execat(int fd, struct filename *name, int flags)
777783
if (!S_ISREG(file_inode(file)->i_mode))
778784
goto exit;
779785

780-
if (file->f_path.mnt->mnt_flags & MNT_NOEXEC)
786+
if (path_noexec(&file->f_path))
781787
goto exit;
782788

783789
err = deny_write_access(file);

fs/libfs.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1185,7 +1185,7 @@ void make_empty_dir_inode(struct inode *inode)
11851185
inode->i_uid = GLOBAL_ROOT_UID;
11861186
inode->i_gid = GLOBAL_ROOT_GID;
11871187
inode->i_rdev = 0;
1188-
inode->i_size = 2;
1188+
inode->i_size = 0;
11891189
inode->i_blkbits = PAGE_SHIFT;
11901190
inode->i_blocks = 0;
11911191

fs/namespace.c

Lines changed: 25 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3218,6 +3218,8 @@ static bool fs_fully_visible(struct file_system_type *type, int *new_mnt_flags)
32183218
down_read(&namespace_sem);
32193219
list_for_each_entry(mnt, &ns->list, mnt_list) {
32203220
struct mount *child;
3221+
int mnt_flags;
3222+
32213223
if (mnt->mnt.mnt_sb->s_type != type)
32223224
continue;
32233225

@@ -3227,17 +3229,30 @@ static bool fs_fully_visible(struct file_system_type *type, int *new_mnt_flags)
32273229
if (mnt->mnt.mnt_root != mnt->mnt.mnt_sb->s_root)
32283230
continue;
32293231

3232+
/* Read the mount flags and filter out flags that
3233+
* may safely be ignored.
3234+
*/
3235+
mnt_flags = mnt->mnt.mnt_flags;
3236+
if (mnt->mnt.mnt_sb->s_iflags & SB_I_NOEXEC)
3237+
mnt_flags &= ~(MNT_LOCK_NOSUID | MNT_LOCK_NOEXEC);
3238+
32303239
/* Verify the mount flags are equal to or more permissive
32313240
* than the proposed new mount.
32323241
*/
3233-
if ((mnt->mnt.mnt_flags & MNT_LOCK_READONLY) &&
3242+
if ((mnt_flags & MNT_LOCK_READONLY) &&
32343243
!(new_flags & MNT_READONLY))
32353244
continue;
3236-
if ((mnt->mnt.mnt_flags & MNT_LOCK_NODEV) &&
3245+
if ((mnt_flags & MNT_LOCK_NODEV) &&
32373246
!(new_flags & MNT_NODEV))
32383247
continue;
3239-
if ((mnt->mnt.mnt_flags & MNT_LOCK_ATIME) &&
3240-
((mnt->mnt.mnt_flags & MNT_ATIME_MASK) != (new_flags & MNT_ATIME_MASK)))
3248+
if ((mnt_flags & MNT_LOCK_NOSUID) &&
3249+
!(new_flags & MNT_NOSUID))
3250+
continue;
3251+
if ((mnt_flags & MNT_LOCK_NOEXEC) &&
3252+
!(new_flags & MNT_NOEXEC))
3253+
continue;
3254+
if ((mnt_flags & MNT_LOCK_ATIME) &&
3255+
((mnt_flags & MNT_ATIME_MASK) != (new_flags & MNT_ATIME_MASK)))
32413256
continue;
32423257

32433258
/* This mount is not fully visible if there are any
@@ -3247,16 +3262,18 @@ static bool fs_fully_visible(struct file_system_type *type, int *new_mnt_flags)
32473262
list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
32483263
struct inode *inode = child->mnt_mountpoint->d_inode;
32493264
/* Only worry about locked mounts */
3250-
if (!(mnt->mnt.mnt_flags & MNT_LOCKED))
3265+
if (!(mnt_flags & MNT_LOCKED))
32513266
continue;
32523267
/* Is the directory permanently empty? */
32533268
if (!is_empty_dir_inode(inode))
32543269
goto next;
32553270
}
32563271
/* Preserve the locked attributes */
3257-
*new_mnt_flags |= mnt->mnt.mnt_flags & (MNT_LOCK_READONLY | \
3258-
MNT_LOCK_NODEV | \
3259-
MNT_LOCK_ATIME);
3272+
*new_mnt_flags |= mnt_flags & (MNT_LOCK_READONLY | \
3273+
MNT_LOCK_NODEV | \
3274+
MNT_LOCK_NOSUID | \
3275+
MNT_LOCK_NOEXEC | \
3276+
MNT_LOCK_ATIME);
32603277
visible = true;
32613278
goto found;
32623279
next: ;

fs/nsfs.c

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
#include <linux/proc_ns.h>
55
#include <linux/magic.h>
66
#include <linux/ktime.h>
7+
#include <linux/seq_file.h>
78

89
static struct vfsmount *nsfs_mnt;
910

@@ -136,9 +137,18 @@ struct file *proc_ns_fget(int fd)
136137
return ERR_PTR(-EINVAL);
137138
}
138139

140+
static int nsfs_show_path(struct seq_file *seq, struct dentry *dentry)
141+
{
142+
struct inode *inode = d_inode(dentry);
143+
const struct proc_ns_operations *ns_ops = dentry->d_fsdata;
144+
145+
return seq_printf(seq, "%s:[%lu]", ns_ops->name, inode->i_ino);
146+
}
147+
139148
static const struct super_operations nsfs_ops = {
140149
.statfs = simple_statfs,
141150
.evict_inode = nsfs_evict,
151+
.show_path = nsfs_show_path,
142152
};
143153
static struct dentry *nsfs_mount(struct file_system_type *fs_type,
144154
int flags, const char *dev_name, void *data)

fs/open.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -377,7 +377,7 @@ SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode)
377377
* with the "noexec" flag.
378378
*/
379379
res = -EACCES;
380-
if (path.mnt->mnt_flags & MNT_NOEXEC)
380+
if (path_noexec(&path))
381381
goto out_path_release;
382382
}
383383

fs/proc/root.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,8 @@ static struct dentry *proc_mount(struct file_system_type *fs_type,
134134
}
135135

136136
sb->s_flags |= MS_ACTIVE;
137+
/* User space would break if executables appear on proc */
138+
sb->s_iflags |= SB_I_NOEXEC;
137139
}
138140

139141
return dget(sb->s_root);

fs/sysfs/mount.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,10 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type,
4040
SYSFS_MAGIC, &new_sb, ns);
4141
if (IS_ERR(root) || !new_sb)
4242
kobj_ns_drop(KOBJ_NS_TYPE_NET, ns);
43+
else if (new_sb)
44+
/* Userspace would break if executables appear on sysfs */
45+
root->d_sb->s_iflags |= SB_I_NOEXEC;
46+
4347
return root;
4448
}
4549

include/linux/fs.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1260,6 +1260,7 @@ struct mm_struct;
12601260

12611261
/* sb->s_iflags */
12621262
#define SB_I_CGROUPWB 0x00000001 /* cgroup-aware writeback enabled */
1263+
#define SB_I_NOEXEC 0x00000002 /* Ignore executables on this fs */
12631264

12641265
/* Possible states of 'frozen' field */
12651266
enum {
@@ -3041,4 +3042,6 @@ static inline bool dir_relax(struct inode *inode)
30413042
return !IS_DEADDIR(inode);
30423043
}
30433044

3045+
extern bool path_noexec(const struct path *path);
3046+
30443047
#endif /* _LINUX_FS_H */

kernel/fork.c

Lines changed: 22 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1280,10 +1280,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
12801280

12811281
/*
12821282
* If the new process will be in a different pid or user namespace
1283-
* do not allow it to share a thread group or signal handlers or
1284-
* parent with the forking task.
1283+
* do not allow it to share a thread group with the forking task.
12851284
*/
1286-
if (clone_flags & CLONE_SIGHAND) {
1285+
if (clone_flags & CLONE_THREAD) {
12871286
if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) ||
12881287
(task_active_pid_ns(current) !=
12891288
current->nsproxy->pid_ns_for_children))
@@ -1872,13 +1871,21 @@ static int check_unshare_flags(unsigned long unshare_flags)
18721871
CLONE_NEWUSER|CLONE_NEWPID))
18731872
return -EINVAL;
18741873
/*
1875-
* Not implemented, but pretend it works if there is nothing to
1876-
* unshare. Note that unsharing CLONE_THREAD or CLONE_SIGHAND
1877-
* needs to unshare vm.
1874+
* Not implemented, but pretend it works if there is nothing
1875+
* to unshare. Note that unsharing the address space or the
1876+
* signal handlers also needs to unshare the signal queues (aka
1877+
* CLONE_THREAD).
18781878
*/
18791879
if (unshare_flags & (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM)) {
1880-
/* FIXME: get_task_mm() increments ->mm_users */
1881-
if (atomic_read(&current->mm->mm_users) > 1)
1880+
if (!thread_group_empty(current))
1881+
return -EINVAL;
1882+
}
1883+
if (unshare_flags & (CLONE_SIGHAND | CLONE_VM)) {
1884+
if (atomic_read(&current->sighand->count) > 1)
1885+
return -EINVAL;
1886+
}
1887+
if (unshare_flags & CLONE_VM) {
1888+
if (!current_is_single_threaded())
18821889
return -EINVAL;
18831890
}
18841891

@@ -1942,20 +1949,21 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
19421949
int err;
19431950

19441951
/*
1945-
* If unsharing a user namespace must also unshare the thread.
1952+
* If unsharing a user namespace must also unshare the thread group
1953+
* and unshare the filesystem root and working directories.
19461954
*/
19471955
if (unshare_flags & CLONE_NEWUSER)
19481956
unshare_flags |= CLONE_THREAD | CLONE_FS;
1949-
/*
1950-
* If unsharing a thread from a thread group, must also unshare vm.
1951-
*/
1952-
if (unshare_flags & CLONE_THREAD)
1953-
unshare_flags |= CLONE_VM;
19541957
/*
19551958
* If unsharing vm, must also unshare signal handlers.
19561959
*/
19571960
if (unshare_flags & CLONE_VM)
19581961
unshare_flags |= CLONE_SIGHAND;
1962+
/*
1963+
* If unsharing signal handlers, must also unshare the signal queues.
1964+
*/
1965+
if (unshare_flags & CLONE_SIGHAND)
1966+
unshare_flags |= CLONE_THREAD;
19591967
/*
19601968
* If unsharing namespace, must also unshare filesystem information.
19611969
*/

kernel/sys.c

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1668,8 +1668,7 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
16681668
* overall picture.
16691669
*/
16701670
err = -EACCES;
1671-
if (!S_ISREG(inode->i_mode) ||
1672-
exe.file->f_path.mnt->mnt_flags & MNT_NOEXEC)
1671+
if (!S_ISREG(inode->i_mode) || path_noexec(&exe.file->f_path))
16731672
goto exit;
16741673

16751674
err = inode_permission(inode, MAY_EXEC);

kernel/user_namespace.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -976,8 +976,8 @@ static int userns_install(struct nsproxy *nsproxy, struct ns_common *ns)
976976
if (user_ns == current_user_ns())
977977
return -EINVAL;
978978

979-
/* Threaded processes may not enter a different user namespace */
980-
if (atomic_read(&current->mm->mm_users) > 1)
979+
/* Tasks that share a thread group must share a user namespace */
980+
if (!thread_group_empty(current))
981981
return -EINVAL;
982982

983983
if (current->fs->users != 1)

mm/mmap.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1268,7 +1268,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
12681268
* mounted, in which case we dont add PROT_EXEC.)
12691269
*/
12701270
if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
1271-
if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC)))
1271+
if (!(file && path_noexec(&file->f_path)))
12721272
prot |= PROT_EXEC;
12731273

12741274
if (!(flags & MAP_FIXED))
@@ -1337,7 +1337,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
13371337
case MAP_PRIVATE:
13381338
if (!(file->f_mode & FMODE_READ))
13391339
return -EACCES;
1340-
if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) {
1340+
if (path_noexec(&file->f_path)) {
13411341
if (vm_flags & VM_EXEC)
13421342
return -EPERM;
13431343
vm_flags &= ~VM_MAYEXEC;

mm/nommu.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1035,7 +1035,7 @@ static int validate_mmap_request(struct file *file,
10351035

10361036
/* handle executable mappings and implied executable
10371037
* mappings */
1038-
if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) {
1038+
if (path_noexec(&file->f_path)) {
10391039
if (prot & PROT_EXEC)
10401040
return -EPERM;
10411041
} else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) {

security/security.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -776,7 +776,7 @@ static inline unsigned long mmap_prot(struct file *file, unsigned long prot)
776776
* ditto if it's not on noexec mount, except that on !MMU we need
777777
* NOMMU_MAP_EXEC (== VM_MAYEXEC) in this case
778778
*/
779-
if (!(file->f_path.mnt->mnt_flags & MNT_NOEXEC)) {
779+
if (!path_noexec(&file->f_path)) {
780780
#ifndef CONFIG_MMU
781781
if (file->f_op->mmap_capabilities) {
782782
unsigned caps = file->f_op->mmap_capabilities(file);

0 commit comments

Comments
 (0)