Skip to content

Commit 5da3bfa

Browse files
inwardvessel authored and htejun committed
cgroup: use separate rstat trees for each subsystem
Different subsystems may call cgroup_rstat_updated() within the same cgroup, resulting in a tree of pending updates from multiple subsystems. When one of these subsystems is flushed via cgroup_rstat_flush(), all other subsystems with pending updates on the tree will also be flushed.

Change the paradigm of having a single rstat tree for all subsystems to having separate trees for each subsystem. This separation allows for subsystems to perform flushes without the side effects of other subsystems. As an example, flushing the cpu stats will no longer cause the memory stats to be flushed and vice versa.

In order to achieve subsystem-specific trees, change the tree node type from cgroup to cgroup_subsys_state pointer. Then remove those pointers from the cgroup and instead place them on the css. Finally, change update/flush functions to make use of the different node type (css). These changes allow a specific subsystem to be associated with an update or flush. Separate rstat trees will now exist for each unique subsystem.

Since updating/flushing will now be done at the subsystem level, there is no longer a need to keep track of updated css nodes at the cgroup level. The list management of these nodes done within the cgroup (rstat_css_list and related) has been removed accordingly.

Conditional guards for checking validity of a given css were placed within css_rstat_updated/flush() to prevent undefined behavior occurring from kfunc usage in bpf programs. Guards were also placed within css_rstat_init/exit() in order to help consolidate calls to them. At call sites for all four functions, the existing guards were removed.

Signed-off-by: JP Kobryn <[email protected]>
Signed-off-by: Tejun Heo <[email protected]>
1 parent 541a421 commit 5da3bfa

File tree

4 files changed

+162
-142
lines changed

4 files changed

+162
-142
lines changed

include/linux/cgroup-defs.h

Lines changed: 21 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,8 @@ struct cgroup_subsys_state {
169169
/* reference count - access via css_[try]get() and css_put() */
170170
struct percpu_ref refcnt;
171171

172+
struct css_rstat_cpu __percpu *rstat_cpu;
173+
172174
/*
173175
* siblings list anchored at the parent's ->children
174176
*
@@ -177,9 +179,6 @@ struct cgroup_subsys_state {
177179
struct list_head sibling;
178180
struct list_head children;
179181

180-
/* flush target list anchored at cgrp->rstat_css_list */
181-
struct list_head rstat_css_node;
182-
183182
/*
184183
* PI: Subsys-unique ID. 0 is unused and root is always 1. The
185184
* matching css can be looked up using css_from_id().
@@ -219,6 +218,13 @@ struct cgroup_subsys_state {
219218
* Protected by cgroup_mutex.
220219
*/
221220
int nr_descendants;
221+
222+
/*
223+
* A singly-linked list of css structures to be rstat flushed.
224+
* This is a scratch field to be used exclusively by
225+
* css_rstat_flush() and protected by cgroup_rstat_lock.
226+
*/
227+
struct cgroup_subsys_state *rstat_flush_next;
222228
};
223229

224230
/*
@@ -329,10 +335,10 @@ struct cgroup_base_stat {
329335

330336
/*
331337
* rstat - cgroup scalable recursive statistics. Accounting is done
332-
* per-cpu in cgroup_rstat_cpu which is then lazily propagated up the
338+
* per-cpu in css_rstat_cpu which is then lazily propagated up the
333339
* hierarchy on reads.
334340
*
335-
* When a stat gets updated, the cgroup_rstat_cpu and its ancestors are
341+
* When a stat gets updated, the css_rstat_cpu and its ancestors are
336342
* linked into the updated tree. On the following read, propagation only
337343
* considers and consumes the updated tree. This makes reading O(the
338344
* number of descendants which have been active since last read) instead of
@@ -346,20 +352,20 @@ struct cgroup_base_stat {
346352
* This struct hosts both the fields which implement the above -
347353
* updated_children and updated_next.
348354
*/
349-
struct cgroup_rstat_cpu {
355+
struct css_rstat_cpu {
350356
/*
351357
* Child cgroups with stat updates on this cpu since the last read
352358
* are linked on the parent's ->updated_children through
353-
* ->updated_next.
359+
* ->updated_next. updated_children is terminated by its container css.
354360
*
355-
* In addition to being more compact, singly-linked list pointing
356-
* to the cgroup makes it unnecessary for each per-cpu struct to
357-
* point back to the associated cgroup.
361+
* In addition to being more compact, singly-linked list pointing to
362+
* the css makes it unnecessary for each per-cpu struct to point back
363+
* to the associated css.
358364
*
359365
* Protected by per-cpu cgroup_rstat_cpu_lock.
360366
*/
361-
struct cgroup *updated_children; /* terminated by self cgroup */
362-
struct cgroup *updated_next; /* NULL iff not on the list */
367+
struct cgroup_subsys_state *updated_children;
368+
struct cgroup_subsys_state *updated_next; /* NULL if not on the list */
363369
};
364370

365371
/*
@@ -521,25 +527,15 @@ struct cgroup {
521527
struct cgroup *dom_cgrp;
522528
struct cgroup *old_dom_cgrp; /* used while enabling threaded */
523529

524-
/* per-cpu recursive resource statistics */
525-
struct cgroup_rstat_cpu __percpu *rstat_cpu;
526530
struct cgroup_rstat_base_cpu __percpu *rstat_base_cpu;
527-
struct list_head rstat_css_list;
528531

529532
/*
530-
* Add padding to separate the read mostly rstat_cpu and
531-
* rstat_css_list into a different cacheline from the following
532-
* rstat_flush_next and *bstat fields which can have frequent updates.
533+
* Add padding to keep the read mostly rstat per-cpu pointer on a
534+
* different cacheline than the following *bstat fields which can have
535+
* frequent updates.
533536
*/
534537
CACHELINE_PADDING(_pad_);
535538

536-
/*
537-
* A singly-linked list of cgroup structures to be rstat flushed.
538-
* This is a scratch field to be used exclusively by
539-
* css_rstat_flush_locked() and protected by cgroup_rstat_lock.
540-
*/
541-
struct cgroup *rstat_flush_next;
542-
543539
/* cgroup basic resource statistics */
544540
struct cgroup_base_stat last_bstat;
545541
struct cgroup_base_stat bstat;

kernel/cgroup/cgroup.c

Lines changed: 11 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -161,12 +161,12 @@ static struct static_key_true *cgroup_subsys_on_dfl_key[] = {
161161
};
162162
#undef SUBSYS
163163

164-
static DEFINE_PER_CPU(struct cgroup_rstat_cpu, root_rstat_cpu);
164+
static DEFINE_PER_CPU(struct css_rstat_cpu, root_rstat_cpu);
165165
static DEFINE_PER_CPU(struct cgroup_rstat_base_cpu, root_rstat_base_cpu);
166166

167167
/* the default hierarchy */
168168
struct cgroup_root cgrp_dfl_root = {
169-
.cgrp.rstat_cpu = &root_rstat_cpu,
169+
.cgrp.self.rstat_cpu = &root_rstat_cpu,
170170
.cgrp.rstat_base_cpu = &root_rstat_base_cpu,
171171
};
172172
EXPORT_SYMBOL_GPL(cgrp_dfl_root);
@@ -1362,7 +1362,6 @@ static void cgroup_destroy_root(struct cgroup_root *root)
13621362

13631363
cgroup_unlock();
13641364

1365-
css_rstat_exit(&cgrp->self);
13661365
kernfs_destroy_root(root->kf_root);
13671366
cgroup_free_root(root);
13681367
}
@@ -1867,13 +1866,6 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
18671866
}
18681867
spin_unlock_irq(&css_set_lock);
18691868

1870-
if (ss->css_rstat_flush) {
1871-
list_del_rcu(&css->rstat_css_node);
1872-
synchronize_rcu();
1873-
list_add_rcu(&css->rstat_css_node,
1874-
&dcgrp->rstat_css_list);
1875-
}
1876-
18771869
/* default hierarchy doesn't enable controllers by default */
18781870
dst_root->subsys_mask |= 1 << ssid;
18791871
if (dst_root == &cgrp_dfl_root) {
@@ -2056,7 +2048,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
20562048
cgrp->dom_cgrp = cgrp;
20572049
cgrp->max_descendants = INT_MAX;
20582050
cgrp->max_depth = INT_MAX;
2059-
INIT_LIST_HEAD(&cgrp->rstat_css_list);
20602051
prev_cputime_init(&cgrp->prev_cputime);
20612052

20622053
for_each_subsys(ss, ssid)
@@ -5405,6 +5396,7 @@ static void css_free_rwork_fn(struct work_struct *work)
54055396
struct cgroup *cgrp = css->cgroup;
54065397

54075398
percpu_ref_exit(&css->refcnt);
5399+
css_rstat_exit(css);
54085400

54095401
if (!css_is_self(css)) {
54105402
/* css free path */
@@ -5435,7 +5427,6 @@ static void css_free_rwork_fn(struct work_struct *work)
54355427
cgroup_put(cgroup_parent(cgrp));
54365428
kernfs_put(cgrp->kn);
54375429
psi_cgroup_free(cgrp);
5438-
css_rstat_exit(css);
54395430
kfree(cgrp);
54405431
} else {
54415432
/*
@@ -5463,11 +5454,7 @@ static void css_release_work_fn(struct work_struct *work)
54635454
if (!css_is_self(css)) {
54645455
struct cgroup *parent_cgrp;
54655456

5466-
/* css release path */
5467-
if (!list_empty(&css->rstat_css_node)) {
5468-
css_rstat_flush(css);
5469-
list_del_rcu(&css->rstat_css_node);
5470-
}
5457+
css_rstat_flush(css);
54715458

54725459
cgroup_idr_replace(&ss->css_idr, NULL, css->id);
54735460
if (ss->css_released)
@@ -5493,7 +5480,7 @@ static void css_release_work_fn(struct work_struct *work)
54935480
/* cgroup release path */
54945481
TRACE_CGROUP_PATH(release, cgrp);
54955482

5496-
css_rstat_flush(css);
5483+
css_rstat_flush(&cgrp->self);
54975484

54985485
spin_lock_irq(&css_set_lock);
54995486
for (tcgrp = cgroup_parent(cgrp); tcgrp;
@@ -5541,7 +5528,6 @@ static void init_and_link_css(struct cgroup_subsys_state *css,
55415528
css->id = -1;
55425529
INIT_LIST_HEAD(&css->sibling);
55435530
INIT_LIST_HEAD(&css->children);
5544-
INIT_LIST_HEAD(&css->rstat_css_node);
55455531
css->serial_nr = css_serial_nr_next++;
55465532
atomic_set(&css->online_cnt, 0);
55475533

@@ -5550,9 +5536,6 @@ static void init_and_link_css(struct cgroup_subsys_state *css,
55505536
css_get(css->parent);
55515537
}
55525538

5553-
if (ss->css_rstat_flush)
5554-
list_add_rcu(&css->rstat_css_node, &cgrp->rstat_css_list);
5555-
55565539
BUG_ON(cgroup_css(cgrp, ss));
55575540
}
55585541

@@ -5645,6 +5628,10 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
56455628
goto err_free_css;
56465629
css->id = err;
56475630

5631+
err = css_rstat_init(css);
5632+
if (err)
5633+
goto err_free_css;
5634+
56485635
/* @css is ready to be brought online now, make it visible */
56495636
list_add_tail_rcu(&css->sibling, &parent_css->children);
56505637
cgroup_idr_replace(&ss->css_idr, css, css->id);
@@ -5658,7 +5645,6 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
56585645
err_list_del:
56595646
list_del_rcu(&css->sibling);
56605647
err_free_css:
5661-
list_del_rcu(&css->rstat_css_node);
56625648
INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);
56635649
queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork);
56645650
return ERR_PTR(err);
@@ -6101,6 +6087,8 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
61016087
} else {
61026088
css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL);
61036089
BUG_ON(css->id < 0);
6090+
6091+
BUG_ON(css_rstat_init(css));
61046092
}
61056093

61066094
/* Update the init_css_set to contain a subsys

0 commit comments

Comments
 (0)