Skip to content

Commit e4dde56

Browse files
yuzhaogoogle authored and akpm00 committed
mm: multi-gen LRU: per-node lru_gen_folio lists
For each node, memcgs are divided into two generations: the old and the young. For each generation, memcgs are randomly sharded into multiple bins to improve scalability. For each bin, an RCU hlist_nulls is virtually divided into three segments: the head, the tail and the default. An onlining memcg is added to the tail of a random bin in the old generation. The eviction starts at the head of a random bin in the old generation. The per-node memcg generation counter, whose remainder (mod 2) indexes the old generation, is incremented when all its bins become empty. There are four operations: 1. MEMCG_LRU_HEAD, which moves an memcg to the head of a random bin in its current generation (old or young) and updates its "seg" to "head"; 2. MEMCG_LRU_TAIL, which moves an memcg to the tail of a random bin in its current generation (old or young) and updates its "seg" to "tail"; 3. MEMCG_LRU_OLD, which moves an memcg to the head of a random bin in the old generation, updates its "gen" to "old" and resets its "seg" to "default"; 4. MEMCG_LRU_YOUNG, which moves an memcg to the tail of a random bin in the young generation, updates its "gen" to "young" and resets its "seg" to "default". The events that trigger the above operations are: 1. Exceeding the soft limit, which triggers MEMCG_LRU_HEAD; 2. The first attempt to reclaim an memcg below low, which triggers MEMCG_LRU_TAIL; 3. The first attempt to reclaim an memcg below reclaimable size threshold, which triggers MEMCG_LRU_TAIL; 4. The second attempt to reclaim an memcg below reclaimable size threshold, which triggers MEMCG_LRU_YOUNG; 5. Attempting to reclaim an memcg below min, which triggers MEMCG_LRU_YOUNG; 6. Finishing the aging on the eviction path, which triggers MEMCG_LRU_YOUNG; 7. Offlining an memcg, which triggers MEMCG_LRU_OLD. Note that memcg LRU only applies to global reclaim, and the round-robin incrementing of their max_seq counters ensures the eventual fairness to all eligible memcgs. 
For memcg reclaim, it still relies on mem_cgroup_iter(). Link: https://lkml.kernel.org/r/[email protected] Signed-off-by: Yu Zhao <[email protected]> Cc: Johannes Weiner <[email protected]> Cc: Jonathan Corbet <[email protected]> Cc: Michael Larabel <[email protected]> Cc: Michal Hocko <[email protected]> Cc: Mike Rapoport <[email protected]> Cc: Roman Gushchin <[email protected]> Cc: Suren Baghdasaryan <[email protected]> Signed-off-by: Andrew Morton <[email protected]>
1 parent 77d4459 commit e4dde56

File tree

6 files changed

+500
-35
lines changed

6 files changed

+500
-35
lines changed

include/linux/memcontrol.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -794,6 +794,11 @@ static inline void obj_cgroup_put(struct obj_cgroup *objcg)
794794
percpu_ref_put(&objcg->refcnt);
795795
}
796796

797+
static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg)
798+
{
799+
return !memcg || css_tryget(&memcg->css);
800+
}
801+
797802
static inline void mem_cgroup_put(struct mem_cgroup *memcg)
798803
{
799804
if (memcg)
@@ -1301,6 +1306,11 @@ static inline void obj_cgroup_put(struct obj_cgroup *objcg)
13011306
{
13021307
}
13031308

1309+
static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg)
1310+
{
1311+
return true;
1312+
}
1313+
13041314
static inline void mem_cgroup_put(struct mem_cgroup *memcg)
13051315
{
13061316
}

include/linux/mm_inline.h

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,18 @@ static inline bool lru_gen_in_fault(void)
122122
return current->in_lru_fault;
123123
}
124124

125+
#ifdef CONFIG_MEMCG
126+
static inline int lru_gen_memcg_seg(struct lruvec *lruvec)
127+
{
128+
return READ_ONCE(lruvec->lrugen.seg);
129+
}
130+
#else
131+
static inline int lru_gen_memcg_seg(struct lruvec *lruvec)
132+
{
133+
return 0;
134+
}
135+
#endif
136+
125137
static inline int lru_gen_from_seq(unsigned long seq)
126138
{
127139
return seq % MAX_NR_GENS;
@@ -297,6 +309,11 @@ static inline bool lru_gen_in_fault(void)
297309
return false;
298310
}
299311

312+
static inline int lru_gen_memcg_seg(struct lruvec *lruvec)
313+
{
314+
return 0;
315+
}
316+
300317
static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
301318
{
302319
return false;

include/linux/mmzone.h

Lines changed: 115 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
#include <linux/spinlock.h>
99
#include <linux/list.h>
10+
#include <linux/list_nulls.h>
1011
#include <linux/wait.h>
1112
#include <linux/bitops.h>
1213
#include <linux/cache.h>
@@ -367,6 +368,15 @@ struct page_vma_mapped_walk;
367368
#define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
368369
#define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)
369370

371+
/* see the comment on MEMCG_NR_GENS */
372+
enum {
373+
MEMCG_LRU_NOP,
374+
MEMCG_LRU_HEAD,
375+
MEMCG_LRU_TAIL,
376+
MEMCG_LRU_OLD,
377+
MEMCG_LRU_YOUNG,
378+
};
379+
370380
#ifdef CONFIG_LRU_GEN
371381

372382
enum {
@@ -426,6 +436,14 @@ struct lru_gen_folio {
426436
atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
427437
/* whether the multi-gen LRU is enabled */
428438
bool enabled;
439+
#ifdef CONFIG_MEMCG
440+
/* the memcg generation this lru_gen_folio belongs to */
441+
u8 gen;
442+
/* the list segment this lru_gen_folio belongs to */
443+
u8 seg;
444+
/* per-node lru_gen_folio list for global reclaim */
445+
struct hlist_nulls_node list;
446+
#endif
429447
};
430448

431449
enum {
@@ -479,12 +497,87 @@ void lru_gen_init_lruvec(struct lruvec *lruvec);
479497
void lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
480498

481499
#ifdef CONFIG_MEMCG
500+
501+
/*
502+
* For each node, memcgs are divided into two generations: the old and the
503+
* young. For each generation, memcgs are randomly sharded into multiple bins
504+
* to improve scalability. For each bin, the hlist_nulls is virtually divided
505+
* into three segments: the head, the tail and the default.
506+
*
507+
* An onlining memcg is added to the tail of a random bin in the old generation.
508+
* The eviction starts at the head of a random bin in the old generation. The
509+
* per-node memcg generation counter, whose reminder (mod MEMCG_NR_GENS) indexes
510+
* the old generation, is incremented when all its bins become empty.
511+
*
512+
* There are four operations:
513+
* 1. MEMCG_LRU_HEAD, which moves an memcg to the head of a random bin in its
514+
* current generation (old or young) and updates its "seg" to "head";
515+
* 2. MEMCG_LRU_TAIL, which moves an memcg to the tail of a random bin in its
516+
* current generation (old or young) and updates its "seg" to "tail";
517+
* 3. MEMCG_LRU_OLD, which moves an memcg to the head of a random bin in the old
518+
* generation, updates its "gen" to "old" and resets its "seg" to "default";
519+
* 4. MEMCG_LRU_YOUNG, which moves an memcg to the tail of a random bin in the
520+
* young generation, updates its "gen" to "young" and resets its "seg" to
521+
* "default".
522+
*
523+
* The events that trigger the above operations are:
524+
* 1. Exceeding the soft limit, which triggers MEMCG_LRU_HEAD;
525+
* 2. The first attempt to reclaim an memcg below low, which triggers
526+
* MEMCG_LRU_TAIL;
527+
* 3. The first attempt to reclaim an memcg below reclaimable size threshold,
528+
* which triggers MEMCG_LRU_TAIL;
529+
* 4. The second attempt to reclaim an memcg below reclaimable size threshold,
530+
* which triggers MEMCG_LRU_YOUNG;
531+
* 5. Attempting to reclaim an memcg below min, which triggers MEMCG_LRU_YOUNG;
532+
* 6. Finishing the aging on the eviction path, which triggers MEMCG_LRU_YOUNG;
533+
* 7. Offlining an memcg, which triggers MEMCG_LRU_OLD.
534+
*
535+
* Note that memcg LRU only applies to global reclaim, and the round-robin
536+
* incrementing of their max_seq counters ensures the eventual fairness to all
537+
* eligible memcgs. For memcg reclaim, it still relies on mem_cgroup_iter().
538+
*/
539+
#define MEMCG_NR_GENS 2
540+
#define MEMCG_NR_BINS 8
541+
542+
struct lru_gen_memcg {
543+
/* the per-node memcg generation counter */
544+
unsigned long seq;
545+
/* each memcg has one lru_gen_folio per node */
546+
unsigned long nr_memcgs[MEMCG_NR_GENS];
547+
/* per-node lru_gen_folio list for global reclaim */
548+
struct hlist_nulls_head fifo[MEMCG_NR_GENS][MEMCG_NR_BINS];
549+
/* protects the above */
550+
spinlock_t lock;
551+
};
552+
553+
void lru_gen_init_pgdat(struct pglist_data *pgdat);
554+
482555
void lru_gen_init_memcg(struct mem_cgroup *memcg);
483556
void lru_gen_exit_memcg(struct mem_cgroup *memcg);
484-
#endif
557+
void lru_gen_online_memcg(struct mem_cgroup *memcg);
558+
void lru_gen_offline_memcg(struct mem_cgroup *memcg);
559+
void lru_gen_release_memcg(struct mem_cgroup *memcg);
560+
void lru_gen_rotate_memcg(struct lruvec *lruvec, int op);
561+
562+
#else /* !CONFIG_MEMCG */
563+
564+
#define MEMCG_NR_GENS 1
565+
566+
struct lru_gen_memcg {
567+
};
568+
569+
static inline void lru_gen_init_pgdat(struct pglist_data *pgdat)
570+
{
571+
}
572+
573+
#endif /* CONFIG_MEMCG */
485574

486575
#else /* !CONFIG_LRU_GEN */
487576

577+
static inline void lru_gen_init_pgdat(struct pglist_data *pgdat)
578+
{
579+
}
580+
488581
static inline void lru_gen_init_lruvec(struct lruvec *lruvec)
489582
{
490583
}
@@ -494,14 +587,32 @@ static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
494587
}
495588

496589
#ifdef CONFIG_MEMCG
590+
497591
static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
498592
{
499593
}
500594

501595
static inline void lru_gen_exit_memcg(struct mem_cgroup *memcg)
502596
{
503597
}
504-
#endif
598+
599+
static inline void lru_gen_online_memcg(struct mem_cgroup *memcg)
600+
{
601+
}
602+
603+
static inline void lru_gen_offline_memcg(struct mem_cgroup *memcg)
604+
{
605+
}
606+
607+
static inline void lru_gen_release_memcg(struct mem_cgroup *memcg)
608+
{
609+
}
610+
611+
static inline void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
612+
{
613+
}
614+
615+
#endif /* CONFIG_MEMCG */
505616

506617
#endif /* CONFIG_LRU_GEN */
507618

@@ -1243,6 +1354,8 @@ typedef struct pglist_data {
12431354
#ifdef CONFIG_LRU_GEN
12441355
/* kswap mm walk data */
12451356
struct lru_gen_mm_walk mm_walk;
1357+
/* lru_gen_folio list */
1358+
struct lru_gen_memcg memcg_lru;
12461359
#endif
12471360

12481361
CACHELINE_PADDING(_pad2_);

mm/memcontrol.c

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -478,6 +478,16 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid)
478478
struct mem_cgroup_per_node *mz;
479479
struct mem_cgroup_tree_per_node *mctz;
480480

481+
if (lru_gen_enabled()) {
482+
struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec;
483+
484+
/* see the comment on MEMCG_NR_GENS */
485+
if (soft_limit_excess(memcg) && lru_gen_memcg_seg(lruvec) != MEMCG_LRU_HEAD)
486+
lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD);
487+
488+
return;
489+
}
490+
481491
mctz = soft_limit_tree.rb_tree_per_node[nid];
482492
if (!mctz)
483493
return;
@@ -3530,6 +3540,9 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
35303540
struct mem_cgroup_tree_per_node *mctz;
35313541
unsigned long excess;
35323542

3543+
if (lru_gen_enabled())
3544+
return 0;
3545+
35333546
if (order > 0)
35343547
return 0;
35353548

@@ -5391,6 +5404,7 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
53915404
if (unlikely(mem_cgroup_is_root(memcg)))
53925405
queue_delayed_work(system_unbound_wq, &stats_flush_dwork,
53935406
2UL*HZ);
5407+
lru_gen_online_memcg(memcg);
53945408
return 0;
53955409
offline_kmem:
53965410
memcg_offline_kmem(memcg);
@@ -5422,6 +5436,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
54225436
memcg_offline_kmem(memcg);
54235437
reparent_shrinker_deferred(memcg);
54245438
wb_memcg_offline(memcg);
5439+
lru_gen_offline_memcg(memcg);
54255440

54265441
drain_all_stock(memcg);
54275442

@@ -5433,6 +5448,7 @@ static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
54335448
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
54345449

54355450
invalidate_reclaim_iterators(memcg);
5451+
lru_gen_release_memcg(memcg);
54365452
}
54375453

54385454
static void mem_cgroup_css_free(struct cgroup_subsys_state *css)

mm/page_alloc.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7941,6 +7941,7 @@ static void __init free_area_init_node(int nid)
79417941
pgdat_set_deferred_range(pgdat);
79427942

79437943
free_area_init_core(pgdat);
7944+
lru_gen_init_pgdat(pgdat);
79447945
}
79457946

79467947
static void __init free_area_init_memoryless_node(int nid)

0 commit comments

Comments
 (0)