
Commit 5ac8fb3

hnaz authored and torvalds committed
mm: memcontrol: convert reclaim iterator to simple css refcounting
The memcg reclaim iterators use a complicated weak reference scheme to
prevent pinning cgroups indefinitely in the absence of memory pressure.

However, during the ongoing cgroup core rework, css lifetime has been
decoupled such that a pinned css no longer interferes with removal of
the user-visible cgroup, and all this complexity is now unnecessary.

[[email protected]: ensure that the cached reference is always released]

Signed-off-by: Johannes Weiner <[email protected]>
Cc: Vladimir Davydov <[email protected]>
Cc: David Rientjes <[email protected]>
Cc: Tejun Heo <[email protected]>
Signed-off-by: Michal Hocko <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
1 parent 5b1efc0 commit 5ac8fb3
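
The shape of the new scheme is easier to see outside the kernel. Below is a minimal, self-contained userspace sketch of the pattern the patch adopts in mem_cgroup_iter(): a shared iterator position pinned by an ordinary reference count, revalidated with a tryget loop, and republished with a single compare-and-swap. All names here (struct node, node_tryget, iter_next) are hypothetical stand-ins rather than kernel API, and both the validation of the next candidate and the reference handed back to the caller are elided:

#include <stdatomic.h>
#include <stddef.h>

/*
 * A refcounted node standing in for a memcg/css; next stands in for
 * the pre-order walk done by css_next_descendant_pre().
 */
struct node {
	atomic_int refcnt;
	struct node *next;
};

/* Like css_tryget(): fail rather than revive a node that hit zero. */
static int node_tryget(struct node *n)
{
	int old = atomic_load(&n->refcnt);

	while (old > 0)
		if (atomic_compare_exchange_weak(&n->refcnt, &old, old + 1))
			return 1;
	return 0;
}

static void node_get(struct node *n)
{
	atomic_fetch_add(&n->refcnt, 1);
}

static void node_put(struct node *n)
{
	/* A real implementation would free the node when this hits zero. */
	atomic_fetch_sub(&n->refcnt, 1);
}

/* The shared iterator: one pinned position, no sequence counts. */
struct iter {
	_Atomic(struct node *) position;
};

static struct node *iter_next(struct iter *it)
{
	struct node *pos, *expected, *next;

	/*
	 * A racing walker may swap the position and drop its reference
	 * underneath us, so reload until tryget succeeds (or pos is NULL).
	 */
	do {
		pos = atomic_load(&it->position);
	} while (pos && !node_tryget(pos));

	/*
	 * Advance the walk.  The kernel code additionally validates the
	 * candidate with css_tryget_online(); elided here.
	 */
	next = pos ? pos->next : NULL;

	/*
	 * Publish the new position.  Only the walker whose CAS succeeds
	 * moves the iterator's own reference from pos to next; losers
	 * leave the cache's references alone.
	 */
	expected = pos;
	if (atomic_compare_exchange_strong(&it->position, &expected, next)) {
		if (next)
			node_get(next);
		if (pos)
			node_put(pos);	/* the cache's old reference */
	}
	if (pos)
		node_put(pos);		/* pairs with node_tryget() above */

	return next;
}

The point of the compare-and-swap is that many reclaimers can walk concurrently, yet exactly one of them transfers the cache's reference from the old position to the new one; everyone else only ever balances the reference it took itself.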

File tree

1 file changed: +84 −174 lines changed


mm/memcontrol.c

Lines changed: 84 additions & 174 deletions
@@ -143,14 +143,8 @@ struct mem_cgroup_stat_cpu {
 	unsigned long targets[MEM_CGROUP_NTARGETS];
 };
 
-struct mem_cgroup_reclaim_iter {
-	/*
-	 * last scanned hierarchy member. Valid only if last_dead_count
-	 * matches memcg->dead_count of the hierarchy root group.
-	 */
-	struct mem_cgroup *last_visited;
-	int last_dead_count;
+struct reclaim_iter {
+	struct mem_cgroup *position;
 	/* scan generation, increased every round-trip */
 	unsigned int generation;
 };
@@ -162,7 +156,7 @@ struct mem_cgroup_per_zone {
 	struct lruvec lruvec;
 	unsigned long lru_size[NR_LRU_LISTS];
 
-	struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
+	struct reclaim_iter iter[DEF_PRIORITY + 1];
 
 	struct rb_node tree_node;	/* RB tree node */
 	unsigned long usage_in_excess;	/* Set to the value by which */
@@ -346,7 +340,6 @@ struct mem_cgroup {
 	struct mem_cgroup_stat_cpu nocpu_base;
 	spinlock_t pcp_counter_lock;
 
-	atomic_t dead_count;
 #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
 	struct cg_proto tcp_mem;
 #endif
@@ -1067,122 +1060,6 @@ static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
 	return memcg;
 }
 
-/*
- * Returns a next (in a pre-order walk) alive memcg (with elevated css
- * ref. count) or NULL if the whole root's subtree has been visited.
- *
- * helper function to be used by mem_cgroup_iter
- */
-static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root,
-		struct mem_cgroup *last_visited)
-{
-	struct cgroup_subsys_state *prev_css, *next_css;
-
-	prev_css = last_visited ? &last_visited->css : NULL;
-skip_node:
-	next_css = css_next_descendant_pre(prev_css, &root->css);
-
-	/*
-	 * Even if we found a group we have to make sure it is
-	 * alive. css && !memcg means that the groups should be
-	 * skipped and we should continue the tree walk.
-	 * last_visited css is safe to use because it is
-	 * protected by css_get and the tree walk is rcu safe.
-	 *
-	 * We do not take a reference on the root of the tree walk
-	 * because we might race with the root removal when it would
-	 * be the only node in the iterated hierarchy and mem_cgroup_iter
-	 * would end up in an endless loop because it expects that at
-	 * least one valid node will be returned. Root cannot disappear
-	 * because caller of the iterator should hold it already so
-	 * skipping css reference should be safe.
-	 */
-	if (next_css) {
-		struct mem_cgroup *memcg = mem_cgroup_from_css(next_css);
-
-		if (next_css == &root->css)
-			return memcg;
-
-		if (css_tryget_online(next_css)) {
-			/*
-			 * Make sure the memcg is initialized:
-			 * mem_cgroup_css_online() orders the the
-			 * initialization against setting the flag.
-			 */
-			if (smp_load_acquire(&memcg->initialized))
-				return memcg;
-			css_put(next_css);
-		}
-
-		prev_css = next_css;
-		goto skip_node;
-	}
-
-	return NULL;
-}
-
-static void mem_cgroup_iter_invalidate(struct mem_cgroup *root)
-{
-	/*
-	 * When a group in the hierarchy below root is destroyed, the
-	 * hierarchy iterator can no longer be trusted since it might
-	 * have pointed to the destroyed group. Invalidate it.
-	 */
-	atomic_inc(&root->dead_count);
-}
-
-static struct mem_cgroup *
-mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
-		     struct mem_cgroup *root,
-		     int *sequence)
-{
-	struct mem_cgroup *position = NULL;
-	/*
-	 * A cgroup destruction happens in two stages: offlining and
-	 * release.  They are separated by a RCU grace period.
-	 *
-	 * If the iterator is valid, we may still race with an
-	 * offlining.  The RCU lock ensures the object won't be
-	 * released, tryget will fail if we lost the race.
-	 */
-	*sequence = atomic_read(&root->dead_count);
-	if (iter->last_dead_count == *sequence) {
-		smp_rmb();
-		position = iter->last_visited;
-
-		/*
-		 * We cannot take a reference to root because we might race
-		 * with root removal and returning NULL would end up in
-		 * an endless loop on the iterator user level when root
-		 * would be returned all the time.
-		 */
-		if (position && position != root &&
-		    !css_tryget_online(&position->css))
-			position = NULL;
-	}
-	return position;
-}
-
-static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
-				   struct mem_cgroup *last_visited,
-				   struct mem_cgroup *new_position,
-				   struct mem_cgroup *root,
-				   int sequence)
-{
-	/* root reference counting symmetric to mem_cgroup_iter_load */
-	if (last_visited && last_visited != root)
-		css_put(&last_visited->css);
-	/*
-	 * We store the sequence count from the time @last_visited was
-	 * loaded successfully instead of rereading it here so that we
-	 * don't lose destruction events in between.  We could have
-	 * raced with the destruction of @new_position after all.
-	 */
-	iter->last_visited = new_position;
-	smp_wmb();
-	iter->last_dead_count = sequence;
-}
-
 /**
  * mem_cgroup_iter - iterate over memory cgroup hierarchy
  * @root: hierarchy root
@@ -1204,8 +1081,10 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
 				   struct mem_cgroup *prev,
 				   struct mem_cgroup_reclaim_cookie *reclaim)
 {
+	struct reclaim_iter *uninitialized_var(iter);
+	struct cgroup_subsys_state *css = NULL;
 	struct mem_cgroup *memcg = NULL;
-	struct mem_cgroup *last_visited = NULL;
+	struct mem_cgroup *pos = NULL;
 
 	if (mem_cgroup_disabled())
 		return NULL;
@@ -1214,50 +1093,101 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
 		root = root_mem_cgroup;
 
 	if (prev && !reclaim)
-		last_visited = prev;
+		pos = prev;
 
 	if (!root->use_hierarchy && root != root_mem_cgroup) {
 		if (prev)
-			goto out_css_put;
+			goto out;
 		return root;
 	}
 
 	rcu_read_lock();
-	while (!memcg) {
-		struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
-		int uninitialized_var(seq);
-
-		if (reclaim) {
-			struct mem_cgroup_per_zone *mz;
-
-			mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone);
-			iter = &mz->reclaim_iter[reclaim->priority];
-			if (prev && reclaim->generation != iter->generation) {
-				iter->last_visited = NULL;
-				goto out_unlock;
-			}
 
-		last_visited = mem_cgroup_iter_load(iter, root, &seq);
+	if (reclaim) {
+		struct mem_cgroup_per_zone *mz;
+
+		mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone);
+		iter = &mz->iter[reclaim->priority];
+
+		if (prev && reclaim->generation != iter->generation)
+			goto out_unlock;
+
+		do {
+			pos = ACCESS_ONCE(iter->position);
+			/*
+			 * A racing update may change the position and
+			 * put the last reference, hence css_tryget(),
+			 * or retry to see the updated position.
+			 */
+		} while (pos && !css_tryget(&pos->css));
+	}
+
+	if (pos)
+		css = &pos->css;
+
+	for (;;) {
+		css = css_next_descendant_pre(css, &root->css);
+		if (!css) {
+			/*
+			 * Reclaimers share the hierarchy walk, and a
+			 * new one might jump in right at the end of
+			 * the hierarchy - make sure they see at least
+			 * one group and restart from the beginning.
+			 */
+			if (!prev)
+				continue;
+			break;
		}
 
-		memcg = __mem_cgroup_iter_next(root, last_visited);
+		/*
+		 * Verify the css and acquire a reference.  The root
+		 * is provided by the caller, so we know it's alive
+		 * and kicking, and don't take an extra reference.
+		 */
+		memcg = mem_cgroup_from_css(css);
 
-		if (reclaim) {
-			mem_cgroup_iter_update(iter, last_visited, memcg, root,
-					seq);
+		if (css == &root->css)
+			break;
 
-			if (!memcg)
-				iter->generation++;
-			else if (!prev && memcg)
-				reclaim->generation = iter->generation;
+		if (css_tryget_online(css)) {
+			/*
+			 * Make sure the memcg is initialized:
+			 * mem_cgroup_css_online() orders the the
+			 * initialization against setting the flag.
+			 */
+			if (smp_load_acquire(&memcg->initialized))
+				break;
+
+			css_put(css);
 		}
 
-		if (prev && !memcg)
-			goto out_unlock;
+		memcg = NULL;
+	}
+
+	if (reclaim) {
+		if (cmpxchg(&iter->position, pos, memcg) == pos) {
+			if (memcg)
+				css_get(&memcg->css);
+			if (pos)
+				css_put(&pos->css);
+		}
+
+		/*
+		 * pairs with css_tryget when dereferencing iter->position
+		 * above.
+		 */
+		if (pos)
+			css_put(&pos->css);
+
+		if (!memcg)
+			iter->generation++;
+		else if (!prev)
+			reclaim->generation = iter->generation;
 	}
+
 out_unlock:
 	rcu_read_unlock();
-out_css_put:
+out:
 	if (prev && prev != root)
 		css_put(&prev->css);
 
@@ -5447,24 +5377,6 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
 	return 0;
 }
 
-/*
- * Announce all parents that a group from their hierarchy is gone.
- */
-static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg)
-{
-	struct mem_cgroup *parent = memcg;
-
-	while ((parent = parent_mem_cgroup(parent)))
-		mem_cgroup_iter_invalidate(parent);
-
-	/*
-	 * if the root memcg is not hierarchical we have to check it
-	 * explicitely.
-	 */
-	if (!root_mem_cgroup->use_hierarchy)
-		mem_cgroup_iter_invalidate(root_mem_cgroup);
-}
-
 static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
@@ -5485,8 +5397,6 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 
 	kmem_cgroup_css_offline(memcg);
 
-	mem_cgroup_invalidate_reclaim_iterators(memcg);
-
 	/*
 	 * This requires that offlining is serialized. Right now that is
 	 * guaranteed because css_killed_work_fn() holds the cgroup_mutex.
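
A note on the reference choreography in the rewritten mem_cgroup_iter() above, since the two css_put() calls on pos are easy to misread: each reclaimer's css_tryget() is paired with the unconditional put at the end, while the cached position's own reference only changes hands inside the cmpxchg() winner. A hypothetical interleaving of two reclaimers A and B sharing one iterator (names from the patch; illustrative only):

	A: pos = iter->position; css_tryget(&pos->css)	/* A: +1 on pos */
	B: pos = iter->position; css_tryget(&pos->css)	/* B: +1 on pos */
	A: walks to memcg; cmpxchg(&iter->position, pos, memcg) succeeds
	   css_get(&memcg->css)				/* cache ref moves to memcg */
	   css_put(&pos->css)				/* cache's old ref dropped */
	B: cmpxchg() fails (position changed); cache refs untouched
	A: css_put(&pos->css)				/* pairs with A's tryget */
	B: css_put(&pos->css)				/* pairs with B's tryget */

The cache's reference is thus dropped exactly once, by whichever walker replaces the position, which is presumably what the changelog's fixlet ("ensure that the cached reference is always released") refers to.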
