Skip to content

Commit 5ecd9d4

Browse files
rientjes authored and
torvalds committed
mm, page_alloc: wakeup kcompactd even if kswapd cannot free more memory
Kswapd will not wake up if per-zone watermarks are not failing or if too many previous attempts at background reclaim have failed. This can be true if there is a lot of free memory available. For high-order allocations, kswapd is responsible for waking up kcompactd for background compaction. If the zone is not below its watermarks or reclaim has recently failed (lots of free memory, nothing left to reclaim), kcompactd does not get woken up. When __GFP_DIRECT_RECLAIM is not allowed, allow kcompactd to still be woken up even if kswapd will not reclaim. This allows high-order allocations, such as thp, to still trigger background compaction even when the zone has an abundance of free memory. Link: http://lkml.kernel.org/r/[email protected] Signed-off-by: David Rientjes <[email protected]> Acked-by: Vlastimil Babka <[email protected]> Cc: Mel Gorman <[email protected]> Signed-off-by: Andrew Morton <[email protected]> Signed-off-by: Linus Torvalds <[email protected]>
1 parent 3eda69c commit 5ecd9d4

File tree

5 files changed

+45
-25
lines changed

5 files changed

+45
-25
lines changed

Documentation/trace/postprocess/trace-vmscan-postprocess.pl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,7 @@ sub sigint_handler {
111111
my $regex_direct_end_default = 'nr_reclaimed=([0-9]*)';
112112
my $regex_kswapd_wake_default = 'nid=([0-9]*) order=([0-9]*)';
113113
my $regex_kswapd_sleep_default = 'nid=([0-9]*)';
114-
my $regex_wakeup_kswapd_default = 'nid=([0-9]*) zid=([0-9]*) order=([0-9]*)';
114+
my $regex_wakeup_kswapd_default = 'nid=([0-9]*) zid=([0-9]*) order=([0-9]*) gfp_flags=([A-Z_|]*)';
115115
my $regex_lru_isolate_default = 'isolate_mode=([0-9]*) classzone_idx=([0-9]*) order=([0-9]*) nr_requested=([0-9]*) nr_scanned=([0-9]*) nr_skipped=([0-9]*) nr_taken=([0-9]*) lru=([a-z_]*)';
116116
my $regex_lru_shrink_inactive_default = 'nid=([0-9]*) nr_scanned=([0-9]*) nr_reclaimed=([0-9]*) nr_dirty=([0-9]*) nr_writeback=([0-9]*) nr_congested=([0-9]*) nr_immediate=([0-9]*) nr_activate=([0-9]*) nr_ref_keep=([0-9]*) nr_unmap_fail=([0-9]*) priority=([0-9]*) flags=([A-Z_|]*)';
117117
my $regex_lru_shrink_active_default = 'lru=([A-Z_]*) nr_scanned=([0-9]*) nr_rotated=([0-9]*) priority=([0-9]*)';
@@ -201,7 +201,7 @@ sub generate_traceevent_regex {
201201
$regex_wakeup_kswapd = generate_traceevent_regex(
202202
"vmscan/mm_vmscan_wakeup_kswapd",
203203
$regex_wakeup_kswapd_default,
204-
"nid", "zid", "order");
204+
"nid", "zid", "order", "gfp_flags");
205205
$regex_lru_isolate = generate_traceevent_regex(
206206
"vmscan/mm_vmscan_lru_isolate",
207207
$regex_lru_isolate_default,

include/linux/mmzone.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -776,7 +776,8 @@ static inline bool is_dev_zone(const struct zone *zone)
776776
#include <linux/memory_hotplug.h>
777777

778778
void build_all_zonelists(pg_data_t *pgdat);
779-
void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx);
779+
void wakeup_kswapd(struct zone *zone, gfp_t gfp_mask, int order,
780+
enum zone_type classzone_idx);
780781
bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
781782
int classzone_idx, unsigned int alloc_flags,
782783
long free_pages);

include/trace/events/vmscan.h

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -78,26 +78,29 @@ TRACE_EVENT(mm_vmscan_kswapd_wake,
7878

7979
TRACE_EVENT(mm_vmscan_wakeup_kswapd,
8080

81-
TP_PROTO(int nid, int zid, int order),
81+
TP_PROTO(int nid, int zid, int order, gfp_t gfp_flags),
8282

83-
TP_ARGS(nid, zid, order),
83+
TP_ARGS(nid, zid, order, gfp_flags),
8484

8585
TP_STRUCT__entry(
86-
__field( int, nid )
87-
__field( int, zid )
88-
__field( int, order )
86+
__field( int, nid )
87+
__field( int, zid )
88+
__field( int, order )
89+
__field( gfp_t, gfp_flags )
8990
),
9091

9192
TP_fast_assign(
9293
__entry->nid = nid;
9394
__entry->zid = zid;
9495
__entry->order = order;
96+
__entry->gfp_flags = gfp_flags;
9597
),
9698

97-
TP_printk("nid=%d zid=%d order=%d",
99+
TP_printk("nid=%d zid=%d order=%d gfp_flags=%s",
98100
__entry->nid,
99101
__entry->zid,
100-
__entry->order)
102+
__entry->order,
103+
show_gfp_flags(__entry->gfp_flags))
101104
);
102105

103106
DECLARE_EVENT_CLASS(mm_vmscan_direct_reclaim_begin_template,

mm/page_alloc.c

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3805,16 +3805,18 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
38053805
return page;
38063806
}
38073807

3808-
static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac)
3808+
static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask,
3809+
const struct alloc_context *ac)
38093810
{
38103811
struct zoneref *z;
38113812
struct zone *zone;
38123813
pg_data_t *last_pgdat = NULL;
3814+
enum zone_type high_zoneidx = ac->high_zoneidx;
38133815

3814-
for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
3815-
ac->high_zoneidx, ac->nodemask) {
3816+
for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, high_zoneidx,
3817+
ac->nodemask) {
38163818
if (last_pgdat != zone->zone_pgdat)
3817-
wakeup_kswapd(zone, order, ac->high_zoneidx);
3819+
wakeup_kswapd(zone, gfp_mask, order, high_zoneidx);
38183820
last_pgdat = zone->zone_pgdat;
38193821
}
38203822
}
@@ -4093,7 +4095,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
40934095
goto nopage;
40944096

40954097
if (gfp_mask & __GFP_KSWAPD_RECLAIM)
4096-
wake_all_kswapds(order, ac);
4098+
wake_all_kswapds(order, gfp_mask, ac);
40974099

40984100
/*
40994101
* The adjusted alloc_flags might result in immediate success, so try
@@ -4151,7 +4153,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
41514153
retry:
41524154
/* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
41534155
if (gfp_mask & __GFP_KSWAPD_RECLAIM)
4154-
wake_all_kswapds(order, ac);
4156+
wake_all_kswapds(order, gfp_mask, ac);
41554157

41564158
reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
41574159
if (reserve_flags)

mm/vmscan.c

Lines changed: 23 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3539,16 +3539,21 @@ static int kswapd(void *p)
35393539
}
35403540

35413541
/*
3542-
* A zone is low on free memory, so wake its kswapd task to service it.
3542+
* A zone is low on free memory or too fragmented for high-order memory. If
3543+
* kswapd should reclaim (direct reclaim is deferred), wake it up for the zone's
3544+
* pgdat. It will wake up kcompactd after reclaiming memory. If kswapd reclaim
3545+
* has failed or is not needed, still wake up kcompactd if only compaction is
3546+
* needed.
35433547
*/
3544-
void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
3548+
void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
3549+
enum zone_type classzone_idx)
35453550
{
35463551
pg_data_t *pgdat;
35473552

35483553
if (!managed_zone(zone))
35493554
return;
35503555

3551-
if (!cpuset_zone_allowed(zone, GFP_KERNEL | __GFP_HARDWALL))
3556+
if (!cpuset_zone_allowed(zone, gfp_flags))
35523557
return;
35533558
pgdat = zone->zone_pgdat;
35543559
pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat,
@@ -3557,14 +3562,23 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
35573562
if (!waitqueue_active(&pgdat->kswapd_wait))
35583563
return;
35593564

3560-
/* Hopeless node, leave it to direct reclaim */
3561-
if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
3562-
return;
3563-
3564-
if (pgdat_balanced(pgdat, order, classzone_idx))
3565+
/* Hopeless node, leave it to direct reclaim if possible */
3566+
if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ||
3567+
pgdat_balanced(pgdat, order, classzone_idx)) {
3568+
/*
3569+
* There may be plenty of free memory available, but it's too
3570+
* fragmented for high-order allocations. Wake up kcompactd
3571+
* and rely on compaction_suitable() to determine if it's
3572+
* needed. If it fails, it will defer subsequent attempts to
3573+
* ratelimit its work.
3574+
*/
3575+
if (!(gfp_flags & __GFP_DIRECT_RECLAIM))
3576+
wakeup_kcompactd(pgdat, order, classzone_idx);
35653577
return;
3578+
}
35663579

3567-
trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, classzone_idx, order);
3580+
trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, classzone_idx, order,
3581+
gfp_flags);
35683582
wake_up_interruptible(&pgdat->kswapd_wait);
35693583
}
35703584

0 commit comments

Comments
 (0)