Skip to content

Commit a7ebb0f

Browse files
committed
Merge patch series "Support foreign mount namespace with statmount/listmount"
Josef Bacik <[email protected]> says: Currently the only way to iterate over mount entries in mount namespaces that aren't your own is to trawl through /proc in order to find /proc/$PID/mountinfo for the mount namespace that you want. This is hugely inefficient, so extend both statmount() and listmount() to allow specifying a mount namespace id in order to get to mounts in other mount namespaces. There are a few components to this: 1. Having a global index of the mount namespace based on the ->seq value in the mount namespace. This gives us a unique identifier that isn't re-used. 2. Support looking up mount namespaces based on that unique identifier, and validating the user has permission to access the given mount namespace. 3. Provide a new ioctl() on nsfs in order to extract the unique identifier we can use for statmount() and listmount(). The code is relatively straightforward, and there is a selftest provided to validate everything works properly. This is based on vfs.all as of last week, so must be applied onto a tree that has Christian's error handling rework in this area. If you wish you can pull the tree directly here: https://github.com/josefbacik/linux/tree/listmount.combined Christian and I collaborated on this series, which is why there are patches from both of us in this series. 
Christian Brauner (4): fs: relax permissions for listmount() fs: relax permissions for statmount() fs: Allow listmount() in foreign mount namespace fs: Allow statmount() in foreign mount namespace Josef Bacik (4): fs: keep an index of current mount namespaces fs: export the mount ns id via statmount fs: add an ioctl to get the mnt ns id from nsfs selftests: add a test for the foreign mnt ns extensions fs/mount.h | 2 + fs/namespace.c | 240 ++++++++++-- fs/nsfs.c | 14 + include/uapi/linux/mount.h | 6 +- include/uapi/linux/nsfs.h | 2 + .../selftests/filesystems/statmount/Makefile | 2 +- .../filesystems/statmount/statmount.h | 46 +++ .../filesystems/statmount/statmount_test.c | 53 +-- .../filesystems/statmount/statmount_test_ns.c | 360 ++++++++++++++++++ 9 files changed, 659 insertions(+), 66 deletions(-) create mode 100644 tools/testing/selftests/filesystems/statmount/statmount.h create mode 100644 tools/testing/selftests/filesystems/statmount/statmount_test_ns.c Link: https://lore.kernel.org/r/[email protected] Signed-off-by: Christian Brauner <[email protected]>
2 parents d04bccd + d896f71 commit a7ebb0f

File tree

9 files changed

+663
-66
lines changed

9 files changed

+663
-66
lines changed

fs/mount.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@ struct mnt_namespace {
1616
u64 event;
1717
unsigned int nr_mounts; /* # of mounts in the namespace */
1818
unsigned int pending_mounts;
19+
struct rb_node mnt_ns_tree_node; /* node in the mnt_ns_tree */
20+
refcount_t passive; /* number references not pinning @mounts */
1921
} __randomize_layout;
2022

2123
struct mnt_pcp {

fs/namespace.c

Lines changed: 216 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,8 @@ static struct kmem_cache *mnt_cache __ro_after_init;
7878
static DECLARE_RWSEM(namespace_sem);
7979
static HLIST_HEAD(unmounted); /* protected by namespace_sem */
8080
static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
81+
static DEFINE_RWLOCK(mnt_ns_tree_lock);
82+
static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */
8183

8284
struct mount_kattr {
8385
unsigned int attr_set;
@@ -103,6 +105,109 @@ EXPORT_SYMBOL_GPL(fs_kobj);
103105
*/
104106
__cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);
105107

108+
/*
 * Three-way comparison of a sequence number against a mount namespace.
 *
 * Returns -1 if @seq is below @ns->seq, 1 if it is above, 0 if they match.
 */
static int mnt_ns_cmp(u64 seq, const struct mnt_namespace *ns)
{
	const u64 ns_seq = ns->seq;

	/* Each comparison yields 0 or 1, so the difference is -1, 0 or 1. */
	return (seq > ns_seq) - (seq < ns_seq);
}
118+
119+
static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node)
120+
{
121+
if (!node)
122+
return NULL;
123+
return rb_entry(node, struct mnt_namespace, mnt_ns_tree_node);
124+
}
125+
126+
static bool mnt_ns_less(struct rb_node *a, const struct rb_node *b)
127+
{
128+
struct mnt_namespace *ns_a = node_to_mnt_ns(a);
129+
struct mnt_namespace *ns_b = node_to_mnt_ns(b);
130+
u64 seq_a = ns_a->seq;
131+
132+
return mnt_ns_cmp(seq_a, ns_b) < 0;
133+
}
134+
135+
static void mnt_ns_tree_add(struct mnt_namespace *ns)
136+
{
137+
guard(write_lock)(&mnt_ns_tree_lock);
138+
rb_add(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_less);
139+
}
140+
141+
static void mnt_ns_release(struct mnt_namespace *ns)
142+
{
143+
lockdep_assert_not_held(&mnt_ns_tree_lock);
144+
145+
/* keep alive for {list,stat}mount() */
146+
if (refcount_dec_and_test(&ns->passive)) {
147+
put_user_ns(ns->user_ns);
148+
kfree(ns);
149+
}
150+
}
151+
DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, if (_T) mnt_ns_release(_T))
152+
153+
static void mnt_ns_tree_remove(struct mnt_namespace *ns)
154+
{
155+
/* remove from global mount namespace list */
156+
if (!is_anon_ns(ns)) {
157+
guard(write_lock)(&mnt_ns_tree_lock);
158+
rb_erase(&ns->mnt_ns_tree_node, &mnt_ns_tree);
159+
}
160+
161+
mnt_ns_release(ns);
162+
}
163+
164+
/*
165+
* Returns the mount namespace which either has the specified id, or has the
166+
* next smallest id afer the specified one.
167+
*/
168+
static struct mnt_namespace *mnt_ns_find_id_at(u64 mnt_ns_id)
169+
{
170+
struct rb_node *node = mnt_ns_tree.rb_node;
171+
struct mnt_namespace *ret = NULL;
172+
173+
lockdep_assert_held(&mnt_ns_tree_lock);
174+
175+
while (node) {
176+
struct mnt_namespace *n = node_to_mnt_ns(node);
177+
178+
if (mnt_ns_id <= n->seq) {
179+
ret = node_to_mnt_ns(node);
180+
if (mnt_ns_id == n->seq)
181+
break;
182+
node = node->rb_left;
183+
} else {
184+
node = node->rb_right;
185+
}
186+
}
187+
return ret;
188+
}
189+
190+
/*
191+
* Lookup a mount namespace by id and take a passive reference count. Taking a
192+
* passive reference means the mount namespace can be emptied if e.g., the last
193+
* task holding an active reference exits. To access the mounts of the
194+
* namespace the @namespace_sem must first be acquired. If the namespace has
195+
* already shut down before acquiring @namespace_sem, {list,stat}mount() will
196+
* see that the mount rbtree of the namespace is empty.
197+
*/
198+
static struct mnt_namespace *lookup_mnt_ns(u64 mnt_ns_id)
199+
{
200+
struct mnt_namespace *ns;
201+
202+
guard(read_lock)(&mnt_ns_tree_lock);
203+
ns = mnt_ns_find_id_at(mnt_ns_id);
204+
if (!ns || ns->seq != mnt_ns_id)
205+
return NULL;
206+
207+
refcount_inc(&ns->passive);
208+
return ns;
209+
}
210+
106211
static inline void lock_mount_hash(void)
107212
{
108213
write_seqlock(&mount_lock);
@@ -3733,8 +3838,7 @@ static void free_mnt_ns(struct mnt_namespace *ns)
37333838
if (!is_anon_ns(ns))
37343839
ns_free_inum(&ns->ns);
37353840
dec_mnt_namespaces(ns->ucounts);
3736-
put_user_ns(ns->user_ns);
3737-
kfree(ns);
3841+
mnt_ns_tree_remove(ns);
37383842
}
37393843

37403844
/*
@@ -3773,7 +3877,9 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a
37733877
if (!anon)
37743878
new_ns->seq = atomic64_add_return(1, &mnt_ns_seq);
37753879
refcount_set(&new_ns->ns.count, 1);
3880+
refcount_set(&new_ns->passive, 1);
37763881
new_ns->mounts = RB_ROOT;
3882+
RB_CLEAR_NODE(&new_ns->mnt_ns_tree_node);
37773883
init_waitqueue_head(&new_ns->poll);
37783884
new_ns->user_ns = get_user_ns(user_ns);
37793885
new_ns->ucounts = ucounts;
@@ -3850,6 +3956,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
38503956
while (p->mnt.mnt_root != q->mnt.mnt_root)
38513957
p = next_mnt(skip_mnt_tree(p), old);
38523958
}
3959+
mnt_ns_tree_add(new_ns);
38533960
namespace_unlock();
38543961

38553962
if (rootmnt)
@@ -4867,6 +4974,12 @@ static int statmount_fs_type(struct kstatmount *s, struct seq_file *seq)
48674974
return 0;
48684975
}
48694976

4977+
static void statmount_mnt_ns_id(struct kstatmount *s, struct mnt_namespace *ns)
4978+
{
4979+
s->sm.mask |= STATMOUNT_MNT_NS_ID;
4980+
s->sm.mnt_ns_id = ns->seq;
4981+
}
4982+
48704983
static int statmount_string(struct kstatmount *s, u64 flag)
48714984
{
48724985
int ret;
@@ -4930,14 +5043,15 @@ static int copy_statmount_to_user(struct kstatmount *s)
49305043
static int do_statmount(struct kstatmount *s)
49315044
{
49325045
struct mount *m = real_mount(s->mnt);
5046+
struct mnt_namespace *ns = m->mnt_ns;
49335047
int err;
49345048

49355049
/*
49365050
* Don't trigger audit denials. We just want to determine what
49375051
* mounts to show users.
49385052
*/
49395053
if (!is_path_reachable(m, m->mnt.mnt_root, &s->root) &&
4940-
!ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN))
5054+
!ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
49415055
return -EPERM;
49425056

49435057
err = security_sb_statfs(s->mnt->mnt_root);
@@ -4962,6 +5076,9 @@ static int do_statmount(struct kstatmount *s)
49625076
if (!err && s->mask & STATMOUNT_MNT_POINT)
49635077
err = statmount_string(s, STATMOUNT_MNT_POINT);
49645078

5079+
if (!err && s->mask & STATMOUNT_MNT_NS_ID)
5080+
statmount_mnt_ns_id(s, ns);
5081+
49655082
if (err)
49665083
return err;
49675084

@@ -5003,7 +5120,7 @@ static int copy_mnt_id_req(const struct mnt_id_req __user *req,
50035120
int ret;
50045121
size_t usize;
50055122

5006-
BUILD_BUG_ON(sizeof(struct mnt_id_req) != MNT_ID_REQ_SIZE_VER0);
5123+
BUILD_BUG_ON(sizeof(struct mnt_id_req) != MNT_ID_REQ_SIZE_VER1);
50075124

50085125
ret = get_user(usize, &req->size);
50095126
if (ret)
@@ -5021,10 +5138,63 @@ static int copy_mnt_id_req(const struct mnt_id_req __user *req,
50215138
return 0;
50225139
}
50235140

5141+
/*
 * Step from @curr to the neighbouring mount in rbtree order: the previous
 * one when @reverse is set, the next one otherwise. The result is whatever
 * node_to_mount() yields for that neighbour node.
 */
static struct mount *listmnt_next(struct mount *curr, bool reverse)
{
	struct rb_node *adjacent = reverse ? rb_prev(&curr->mnt_node)
					   : rb_next(&curr->mnt_node);

	return node_to_mount(adjacent);
}
5152+
5153+
static int grab_requested_root(struct mnt_namespace *ns, struct path *root)
5154+
{
5155+
struct mount *first;
5156+
5157+
rwsem_assert_held(&namespace_sem);
5158+
5159+
/* We're looking at our own ns, just use get_fs_root. */
5160+
if (ns == current->nsproxy->mnt_ns) {
5161+
get_fs_root(current->fs, root);
5162+
return 0;
5163+
}
5164+
5165+
/*
5166+
* We have to find the first mount in our ns and use that, however it
5167+
* may not exist, so handle that properly.
5168+
*/
5169+
if (RB_EMPTY_ROOT(&ns->mounts))
5170+
return -ENOENT;
5171+
5172+
first = listmnt_next(ns->root, false);
5173+
if (!first)
5174+
return -ENOENT;
5175+
root->mnt = mntget(&first->mnt);
5176+
root->dentry = dget(root->mnt->mnt_root);
5177+
return 0;
5178+
}
5179+
5180+
/*
5181+
* If the user requested a specific mount namespace id, look that up and return
5182+
* that, or if not simply grab a passive reference on our mount namespace and
5183+
* return that.
5184+
*/
5185+
static struct mnt_namespace *grab_requested_mnt_ns(u64 mnt_ns_id)
5186+
{
5187+
if (mnt_ns_id)
5188+
return lookup_mnt_ns(mnt_ns_id);
5189+
refcount_inc(&current->nsproxy->mnt_ns->passive);
5190+
return current->nsproxy->mnt_ns;
5191+
}
5192+
50245193
SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req,
50255194
struct statmount __user *, buf, size_t, bufsize,
50265195
unsigned int, flags)
50275196
{
5197+
struct mnt_namespace *ns __free(mnt_ns_release) = NULL;
50285198
struct vfsmount *mnt;
50295199
struct mnt_id_req kreq;
50305200
struct kstatmount ks;
@@ -5039,21 +5209,41 @@ SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req,
50395209
if (ret)
50405210
return ret;
50415211

5212+
ns = grab_requested_mnt_ns(kreq.mnt_ns_id);
5213+
if (!ns)
5214+
return -ENOENT;
5215+
5216+
if (kreq.mnt_ns_id && (ns != current->nsproxy->mnt_ns) &&
5217+
!ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
5218+
return -ENOENT;
5219+
50425220
retry:
50435221
ret = prepare_kstatmount(&ks, &kreq, buf, bufsize, seq_size);
50445222
if (ret)
50455223
return ret;
50465224

50475225
down_read(&namespace_sem);
5048-
mnt = lookup_mnt_in_ns(kreq.mnt_id, current->nsproxy->mnt_ns);
5226+
/* Has the namespace already been emptied? */
5227+
if (kreq.mnt_ns_id && RB_EMPTY_ROOT(&ns->mounts)) {
5228+
up_read(&namespace_sem);
5229+
kvfree(ks.seq.buf);
5230+
return -ENOENT;
5231+
}
5232+
5233+
mnt = lookup_mnt_in_ns(kreq.mnt_id, ns);
50495234
if (!mnt) {
50505235
up_read(&namespace_sem);
50515236
kvfree(ks.seq.buf);
50525237
return -ENOENT;
50535238
}
50545239

50555240
ks.mnt = mnt;
5056-
get_fs_root(current->fs, &ks.root);
5241+
ret = grab_requested_root(ns, &ks.root);
5242+
if (ret) {
5243+
up_read(&namespace_sem);
5244+
kvfree(ks.seq.buf);
5245+
return ret;
5246+
}
50575247
ret = do_statmount(&ks);
50585248
path_put(&ks.root);
50595249
up_read(&namespace_sem);
@@ -5066,30 +5256,21 @@ SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req,
50665256
return ret;
50675257
}
50685258

5069-
static struct mount *listmnt_next(struct mount *curr, bool reverse)
5070-
{
5071-
struct rb_node *node;
5072-
5073-
if (reverse)
5074-
node = rb_prev(&curr->mnt_node);
5075-
else
5076-
node = rb_next(&curr->mnt_node);
5077-
5078-
return node_to_mount(node);
5079-
}
5080-
5081-
static ssize_t do_listmount(u64 mnt_parent_id, u64 last_mnt_id, u64 *mnt_ids,
5082-
size_t nr_mnt_ids, bool reverse)
5259+
static ssize_t do_listmount(struct mnt_namespace *ns, u64 mnt_parent_id,
5260+
u64 last_mnt_id, u64 *mnt_ids, size_t nr_mnt_ids,
5261+
bool reverse)
50835262
{
50845263
struct path root __free(path_put) = {};
5085-
struct mnt_namespace *ns = current->nsproxy->mnt_ns;
50865264
struct path orig;
50875265
struct mount *r, *first;
50885266
ssize_t ret;
50895267

50905268
rwsem_assert_held(&namespace_sem);
50915269

5092-
get_fs_root(current->fs, &root);
5270+
ret = grab_requested_root(ns, &root);
5271+
if (ret)
5272+
return ret;
5273+
50935274
if (mnt_parent_id == LSMT_ROOT) {
50945275
orig = root;
50955276
} else {
@@ -5104,7 +5285,7 @@ static ssize_t do_listmount(u64 mnt_parent_id, u64 last_mnt_id, u64 *mnt_ids,
51045285
* mounts to show users.
51055286
*/
51065287
if (!is_path_reachable(real_mount(orig.mnt), orig.dentry, &root) &&
5107-
!ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN))
5288+
!ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
51085289
return -EPERM;
51095290

51105291
ret = security_sb_statfs(orig.dentry);
@@ -5141,6 +5322,7 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req,
51415322
{
51425323
u64 *kmnt_ids __free(kvfree) = NULL;
51435324
const size_t maxcount = 1000000;
5325+
struct mnt_namespace *ns __free(mnt_ns_release) = NULL;
51445326
struct mnt_id_req kreq;
51455327
ssize_t ret;
51465328

@@ -5167,8 +5349,16 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req,
51675349
if (!kmnt_ids)
51685350
return -ENOMEM;
51695351

5352+
ns = grab_requested_mnt_ns(kreq.mnt_ns_id);
5353+
if (!ns)
5354+
return -ENOENT;
5355+
5356+
if (kreq.mnt_ns_id && (ns != current->nsproxy->mnt_ns) &&
5357+
!ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
5358+
return -ENOENT;
5359+
51705360
scoped_guard(rwsem_read, &namespace_sem)
5171-
ret = do_listmount(kreq.mnt_id, kreq.param, kmnt_ids,
5361+
ret = do_listmount(ns, kreq.mnt_id, kreq.param, kmnt_ids,
51725362
nr_mnt_ids, (flags & LISTMOUNT_REVERSE));
51735363

51745364
if (copy_to_user(mnt_ids, kmnt_ids, ret * sizeof(*mnt_ids)))
@@ -5204,6 +5394,8 @@ static void __init init_mount_tree(void)
52045394

52055395
set_fs_pwd(current->fs, &root);
52065396
set_fs_root(current->fs, &root);
5397+
5398+
mnt_ns_tree_add(ns);
52075399
}
52085400

52095401
void __init mnt_init(void)

0 commit comments

Comments
 (0)