@@ -143,14 +143,8 @@ struct mem_cgroup_stat_cpu {
 	unsigned long targets[MEM_CGROUP_NTARGETS];
 };
 
-struct mem_cgroup_reclaim_iter {
-	/*
-	 * last scanned hierarchy member. Valid only if last_dead_count
-	 * matches memcg->dead_count of the hierarchy root group.
-	 */
-	struct mem_cgroup *last_visited;
-	int last_dead_count;
-
+struct reclaim_iter {
+	struct mem_cgroup *position;
 	/* scan generation, increased every round-trip */
 	unsigned int generation;
 };
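The new struct reclaim_iter drops the last_visited/last_dead_count pair and keeps only a pinned position plus the scan generation. The generation counter is what lets concurrent reclaimers share one walk per zone and per priority: whoever wraps around the hierarchy bumps it, and a reclaimer that started in an older generation stops instead of rescanning groups its round already covered. The following standalone sketch models just that round-trip accounting; it is plain userspace C with invented names (iter_state, walk_cookie, walk_next), not kernel code.

/*
 * Minimal userspace model of the per-zone/per-priority "generation"
 * counter kept in struct reclaim_iter.  Illustrative only: iter_state,
 * walk_cookie and walk_next are invented names, not kernel interfaces.
 */
#include <stdbool.h>
#include <stdio.h>

struct iter_state {
	int position;			/* stands in for the cached memcg */
	unsigned int generation;	/* bumped on every full round-trip */
};

struct walk_cookie {
	unsigned int generation;	/* snapshot taken on the first call */
};

/* Return the next index to scan, or -1 when this walker's round is over. */
static int walk_next(struct iter_state *iter, struct walk_cookie *cookie,
		     bool first_call, int nr_groups)
{
	int pos;

	if (first_call)
		cookie->generation = iter->generation;
	else if (cookie->generation != iter->generation)
		return -1;	/* another walker finished the round already */

	pos = iter->position;
	if (pos == nr_groups) {		/* wrapped around: round-trip done */
		iter->position = 0;
		iter->generation++;
		return -1;
	}
	iter->position = pos + 1;
	return pos;
}

int main(void)
{
	struct iter_state iter = { .position = 0, .generation = 0 };
	struct walk_cookie cookie;
	int pos, visited = 0;

	for (pos = walk_next(&iter, &cookie, true, 4); pos >= 0;
	     pos = walk_next(&iter, &cookie, false, 4)) {
		printf("scanning group %d\n", pos);
		visited++;
	}
	printf("visited %d groups, iterator now in generation %u\n",
	       visited, iter.generation);
	return 0;
}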
@@ -162,7 +156,7 @@ struct mem_cgroup_per_zone {
 	struct lruvec		lruvec;
 	unsigned long		lru_size[NR_LRU_LISTS];
 
-	struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
+	struct reclaim_iter	iter[DEF_PRIORITY + 1];
 
 	struct rb_node		tree_node;	/* RB tree node */
 	unsigned long		usage_in_excess;/* Set to the value by which */
@@ -346,7 +340,6 @@ struct mem_cgroup {
 	struct mem_cgroup_stat_cpu nocpu_base;
 	spinlock_t pcp_counter_lock;
 
-	atomic_t	dead_count;
 #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
 	struct cg_proto tcp_mem;
 #endif
@@ -1067,122 +1060,6 @@ static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
 	return memcg;
 }
 
-/*
- * Returns a next (in a pre-order walk) alive memcg (with elevated css
- * ref. count) or NULL if the whole root's subtree has been visited.
- *
- * helper function to be used by mem_cgroup_iter
- */
-static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root,
-		struct mem_cgroup *last_visited)
-{
-	struct cgroup_subsys_state *prev_css, *next_css;
-
-	prev_css = last_visited ? &last_visited->css : NULL;
-skip_node:
-	next_css = css_next_descendant_pre(prev_css, &root->css);
-
-	/*
-	 * Even if we found a group we have to make sure it is
-	 * alive. css && !memcg means that the groups should be
-	 * skipped and we should continue the tree walk.
-	 * last_visited css is safe to use because it is
-	 * protected by css_get and the tree walk is rcu safe.
-	 *
-	 * We do not take a reference on the root of the tree walk
-	 * because we might race with the root removal when it would
-	 * be the only node in the iterated hierarchy and mem_cgroup_iter
-	 * would end up in an endless loop because it expects that at
-	 * least one valid node will be returned. Root cannot disappear
-	 * because caller of the iterator should hold it already so
-	 * skipping css reference should be safe.
-	 */
-	if (next_css) {
-		struct mem_cgroup *memcg = mem_cgroup_from_css(next_css);
-
-		if (next_css == &root->css)
-			return memcg;
-
-		if (css_tryget_online(next_css)) {
-			/*
-			 * Make sure the memcg is initialized:
-			 * mem_cgroup_css_online() orders the the
-			 * initialization against setting the flag.
-			 */
-			if (smp_load_acquire(&memcg->initialized))
-				return memcg;
-			css_put(next_css);
-		}
-
-		prev_css = next_css;
-		goto skip_node;
-	}
-
-	return NULL;
-}
-
-static void mem_cgroup_iter_invalidate(struct mem_cgroup *root)
-{
-	/*
-	 * When a group in the hierarchy below root is destroyed, the
-	 * hierarchy iterator can no longer be trusted since it might
-	 * have pointed to the destroyed group. Invalidate it.
-	 */
-	atomic_inc(&root->dead_count);
-}
-
-static struct mem_cgroup *
-mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
-		     struct mem_cgroup *root,
-		     int *sequence)
-{
-	struct mem_cgroup *position = NULL;
-	/*
-	 * A cgroup destruction happens in two stages: offlining and
-	 * release.  They are separated by a RCU grace period.
-	 *
-	 * If the iterator is valid, we may still race with an
-	 * offlining.  The RCU lock ensures the object won't be
-	 * released, tryget will fail if we lost the race.
-	 */
-	*sequence = atomic_read(&root->dead_count);
-	if (iter->last_dead_count == *sequence) {
-		smp_rmb();
-		position = iter->last_visited;
-
-		/*
-		 * We cannot take a reference to root because we might race
-		 * with root removal and returning NULL would end up in
-		 * an endless loop on the iterator user level when root
-		 * would be returned all the time.
-		 */
-		if (position && position != root &&
-		    !css_tryget_online(&position->css))
-			position = NULL;
-	}
-	return position;
-}
-
-static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
-				   struct mem_cgroup *last_visited,
-				   struct mem_cgroup *new_position,
-				   struct mem_cgroup *root,
-				   int sequence)
-{
-	/* root reference counting symmetric to mem_cgroup_iter_load */
-	if (last_visited && last_visited != root)
-		css_put(&last_visited->css);
-	/*
-	 * We store the sequence count from the time @last_visited was
-	 * loaded successfully instead of rereading it here so that we
-	 * don't lose destruction events in between.  We could have
-	 * raced with the destruction of @new_position after all.
-	 */
-	iter->last_visited = new_position;
-	smp_wmb();
-	iter->last_dead_count = sequence;
-}
-
 /**
  * mem_cgroup_iter - iterate over memory cgroup hierarchy
  * @root: hierarchy root
@@ -1204,8 +1081,10 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
 				   struct mem_cgroup *prev,
 				   struct mem_cgroup_reclaim_cookie *reclaim)
 {
+	struct reclaim_iter *uninitialized_var(iter);
+	struct cgroup_subsys_state *css = NULL;
 	struct mem_cgroup *memcg = NULL;
-	struct mem_cgroup *last_visited = NULL;
+	struct mem_cgroup *pos = NULL;
 
 	if (mem_cgroup_disabled())
 		return NULL;
@@ -1214,50 +1093,101 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
 		root = root_mem_cgroup;
 
 	if (prev && !reclaim)
-		last_visited = prev;
+		pos = prev;
 
 	if (!root->use_hierarchy && root != root_mem_cgroup) {
 		if (prev)
-			goto out_css_put;
+			goto out;
 		return root;
 	}
 
 	rcu_read_lock();
-	while (!memcg) {
-		struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
-		int uninitialized_var(seq);
-
-		if (reclaim) {
-			struct mem_cgroup_per_zone *mz;
-
-			mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone);
-			iter = &mz->reclaim_iter[reclaim->priority];
-			if (prev && reclaim->generation != iter->generation) {
-				iter->last_visited = NULL;
-				goto out_unlock;
-			}
 
-			last_visited = mem_cgroup_iter_load(iter, root, &seq);
+	if (reclaim) {
+		struct mem_cgroup_per_zone *mz;
+
+		mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone);
+		iter = &mz->iter[reclaim->priority];
+
+		if (prev && reclaim->generation != iter->generation)
+			goto out_unlock;
+
+		do {
+			pos = ACCESS_ONCE(iter->position);
+			/*
+			 * A racing update may change the position and
+			 * put the last reference, hence css_tryget(),
+			 * or retry to see the updated position.
+			 */
+		} while (pos && !css_tryget(&pos->css));
+	}
+
+	if (pos)
+		css = &pos->css;
+
+	for (;;) {
+		css = css_next_descendant_pre(css, &root->css);
+		if (!css) {
+			/*
+			 * Reclaimers share the hierarchy walk, and a
+			 * new one might jump in right at the end of
+			 * the hierarchy - make sure they see at least
+			 * one group and restart from the beginning.
+			 */
+			if (!prev)
+				continue;
+			break;
 		}
 
-		memcg = __mem_cgroup_iter_next(root, last_visited);
+		/*
+		 * Verify the css and acquire a reference. The root
+		 * is provided by the caller, so we know it's alive
+		 * and kicking, and don't take an extra reference.
+		 */
+		memcg = mem_cgroup_from_css(css);
 
-		if (reclaim) {
-			mem_cgroup_iter_update(iter, last_visited, memcg, root,
-					seq);
+		if (css == &root->css)
+			break;
 
-			if (!memcg)
-				iter->generation++;
-			else if (!prev && memcg)
-				reclaim->generation = iter->generation;
+		if (css_tryget_online(css)) {
+			/*
+			 * Make sure the memcg is initialized:
+			 * mem_cgroup_css_online() orders the the
+			 * initialization against setting the flag.
+			 */
+			if (smp_load_acquire(&memcg->initialized))
+				break;
+
+			css_put(css);
 		}
 
-		if (prev && !memcg)
-			goto out_unlock;
+		memcg = NULL;
+	}
+
+	if (reclaim) {
+		if (cmpxchg(&iter->position, pos, memcg) == pos) {
+			if (memcg)
+				css_get(&memcg->css);
+			if (pos)
+				css_put(&pos->css);
+		}
+
+		/*
+		 * pairs with css_tryget when dereferencing iter->position
+		 * above.
+		 */
+		if (pos)
+			css_put(&pos->css);
+
+		if (!memcg)
+			iter->generation++;
+		else if (!prev)
+			reclaim->generation = iter->generation;
 	}
+
 out_unlock:
 	rcu_read_unlock();
-out_css_put:
+out:
 	if (prev && prev != root)
 		css_put(&prev->css);
 
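The rewritten iterator above replaces the dead_count invalidation scheme with plain reference counting on the shared cursor: readers pin iter->position with css_tryget() in a retry loop, walk the hierarchy, and publish the next position with a single cmpxchg(), taking a reference for the cursor and dropping both the cursor's old reference and the reader's pin. The sketch below models that handoff with C11 atomics in userspace; node, cursor_load() and cursor_update() are invented names, and a simple integer refcount stands in for the kernel's percpu-ref based css reference counting.

/*
 * Userspace model of the cursor handoff in the new mem_cgroup_iter().
 * Not kernel code: node, cursor_load and cursor_update are illustrative,
 * and an int refcount stands in for css_tryget()/css_get()/css_put().
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct node {
	_Atomic int refcount;	/* 0 means the node is already on its way out */
	int id;
};

static _Atomic(struct node *) cursor;	/* the shared iterator position */

static void node_get(struct node *n)
{
	atomic_fetch_add(&n->refcount, 1);
}

static void node_put(struct node *n)
{
	if (atomic_fetch_sub(&n->refcount, 1) == 1)
		free(n);	/* last reference gone */
}

/* Take a reference unless the count already hit zero (a la css_tryget). */
static bool node_tryget(struct node *n)
{
	int old = atomic_load(&n->refcount);

	while (old > 0) {
		if (atomic_compare_exchange_weak(&n->refcount, &old, old + 1))
			return true;
	}
	return false;
}

/* Load and pin the shared cursor, retrying if a racing update drops it. */
static struct node *cursor_load(void)
{
	struct node *pos;

	do {
		pos = atomic_load(&cursor);
	} while (pos && !node_tryget(pos));
	return pos;
}

/* Publish @next if the cursor still points at @pos; hand over references. */
static void cursor_update(struct node *pos, struct node *next)
{
	struct node *expected = pos;

	if (atomic_compare_exchange_strong(&cursor, &expected, next)) {
		if (next)
			node_get(next);	/* reference now held by the cursor */
		if (pos)
			node_put(pos);	/* cursor's old reference */
	}
	if (pos)
		node_put(pos);	/* drop the pin taken in cursor_load() */
}

int main(void)
{
	struct node *a = calloc(1, sizeof(*a));
	struct node *b = calloc(1, sizeof(*b));
	struct node *pinned;

	a->id = 1;
	b->id = 2;
	atomic_init(&a->refcount, 1);	/* our own references */
	atomic_init(&b->refcount, 1);

	cursor_update(NULL, a);		/* first walker publishes a */

	pinned = cursor_load();		/* second walker resumes at a ... */
	printf("resuming at node %d\n", pinned->id);
	cursor_update(pinned, b);	/* ... and advances the cursor to b */

	cursor_update(cursor_load(), NULL);	/* clear the cursor */
	node_put(a);
	node_put(b);
	return 0;
}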
@@ -5447,24 +5377,6 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
 	return 0;
 }
 
-/*
- * Announce all parents that a group from their hierarchy is gone.
- */
-static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg)
-{
-	struct mem_cgroup *parent = memcg;
-
-	while ((parent = parent_mem_cgroup(parent)))
-		mem_cgroup_iter_invalidate(parent);
-
-	/*
-	 * if the root memcg is not hierarchical we have to check it
-	 * explicitely.
-	 */
-	if (!root_mem_cgroup->use_hierarchy)
-		mem_cgroup_iter_invalidate(root_mem_cgroup);
-}
-
 static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
@@ -5485,8 +5397,6 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 
 	kmem_cgroup_css_offline(memcg);
 
-	mem_cgroup_invalidate_reclaim_iterators(memcg);
-
 	/*
 	 * This requires that offlining is serialized. Right now that is
 	 * guaranteed because css_killed_work_fn() holds the cgroup_mutex.