
Commit 700d2e9

tehcaster authored and akpm00 committed
mm, page_alloc: reduce page alloc/free sanity checks
Historically, we have performed sanity checks on all struct pages being allocated or freed, making sure they have no unexpected page flags or certain field values.  This can detect insufficient cleanup and some cases of use-after-free, although on its own it can't always identify the culprit.  The result is a warning and the "bad page" being leaked.

The checks do need some cpu cycles, so in 4.7 with commits 479f854 ("mm, page_alloc: defer debugging checks of pages allocated from the PCP") and 4db7548 ("mm, page_alloc: defer debugging checks of freed pages until a PCP drain") they were no longer performed in the hot paths when allocating and freeing from pcplists, but only when pcplists are bypassed, refilled or drained.  For debugging purposes, with CONFIG_DEBUG_VM enabled the checks were instead still done in the hot paths and not when refilling or draining pcplists.

With 4462b32 ("mm, page_alloc: more extensive free page checking with debug_pagealloc"), enabling debug_pagealloc also moved the sanity checks back to the hot paths.  When both debug_pagealloc and CONFIG_DEBUG_VM are enabled, the checks are done both in the hot paths and at pcplist refill/drain.

Even though the non-debug default today might seem to be a sensible tradeoff between overhead and the ability to detect bad pages, on closer look it's arguably not.  As most allocations go through the pcplists, catching any bad pages when refilling or draining pcplists has only a small chance, insufficient for debugging or serious hardening purposes.  On the other hand, the cost of the checks is concentrated in the already expensive drain/refill batching operations, and those are done under the often contended zone lock.  That was recently identified as an issue for page allocation, and the zone lock contention was reduced by moving the checks outside of the locked section with the patch "mm: reduce lock contention of pcp buffer refill", but the cost of the checks is still visible compared to removing them entirely [1].  In the pcplist draining path free_pcppages_bulk() the checks are still done under zone->lock.

Thus, remove the checks from the pcplist refill and drain paths completely.  Introduce a static key check_pages_enabled to control the checks during page allocation and freeing (whether the pcplists are used or bypassed).  The static key is enabled if any of the following is true:

- the kernel is built with CONFIG_DEBUG_VM=y (debugging)
- debug_pagealloc or page poisoning is boot-time enabled (debugging)
- init_on_alloc or init_on_free is boot-time enabled (hardening)

The resulting user-visible changes:

- no checks when draining/refilling pcplists - less overhead, with likely no practical reduction of the ability to catch bad pages
- no checks when bypassing pcplists in the default config (no debugging/hardening) - less overhead, as above
- on typical hardened kernels [2], checks are now performed on each page allocation/free (previously only when bypassing/draining/refilling pcplists) - having init_on_alloc/init_on_free enabled should be sufficient indication for preferring the more costly alloc/free operations for hardening purposes, and we shouldn't need to introduce another toggle
- code (various wrappers) removal and simplification

[1] https://lore.kernel.org/all/[email protected]/
[2] https://lore.kernel.org/all/[email protected]/

[[email protected]: coding-style cleanups]
[[email protected]: make check_pages_enabled static]
Link: https://lkml.kernel.org/r/[email protected]
Reported-by: Alexander Halbuer <[email protected]>
Reported-by: Andrew Morton <[email protected]>
Signed-off-by: Vlastimil Babka <[email protected]>
Cc: Kees Cook <[email protected]>
Cc: Mel Gorman <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
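As an aside for readers tracing the new policy: the enable conditions above reduce to a small predicate. The following is a minimal, standalone C sketch of that decision only; it is not kernel code, the function and parameter names are invented for illustration, and the kernel itself implements this with the check_pages_enabled static key shown in the diff below.

/*
 * Simplified userspace model of the enable logic described above --
 * NOT kernel code.  The function and parameter names are illustrative;
 * the kernel implements this with the check_pages_enabled static key.
 */
#include <stdbool.h>
#include <stdio.h>

static bool checks_enabled(bool config_debug_vm, bool debug_pagealloc,
			   bool page_poisoning, bool init_on_alloc,
			   bool init_on_free)
{
	if (config_debug_vm)			/* debugging: CONFIG_DEBUG_VM=y */
		return true;
	if (debug_pagealloc || page_poisoning)	/* debugging: boot-time options */
		return true;
	if (init_on_alloc || init_on_free)	/* hardening: boot-time options */
		return true;
	return false;				/* default config: no checks */
}

int main(void)
{
	/* typical hardened kernel [2]: init_on_alloc enabled, nothing else */
	printf("hardened: %d\n", checks_enabled(false, false, false, true, false));
	/* default config: checks now skipped on both the pcp and bypass paths */
	printf("default:  %d\n", checks_enabled(false, false, false, false, false));
	return 0;
}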
1 parent 2ede3c1 commit 700d2e9


mm/page_alloc.c

Lines changed: 52 additions & 135 deletions
@@ -253,6 +253,9 @@ EXPORT_SYMBOL(init_on_alloc);
 DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free);
 EXPORT_SYMBOL(init_on_free);
 
+/* perform sanity checks on struct pages being allocated or freed */
+static DEFINE_STATIC_KEY_MAYBE(CONFIG_DEBUG_VM, check_pages_enabled);
+
 static bool _init_on_alloc_enabled_early __read_mostly
 				= IS_ENABLED(CONFIG_INIT_ON_ALLOC_DEFAULT_ON);
 static int __init early_init_on_alloc(char *buf)
@@ -893,6 +896,7 @@ static inline void clear_page_guard(struct zone *zone, struct page *page,
 void __init init_mem_debugging_and_hardening(void)
 {
 	bool page_poisoning_requested = false;
+	bool want_check_pages = false;
 
 #ifdef CONFIG_PAGE_POISONING
 	/*
@@ -904,6 +908,7 @@ void __init init_mem_debugging_and_hardening(void)
 	      debug_pagealloc_enabled())) {
 		static_branch_enable(&_page_poisoning_enabled);
 		page_poisoning_requested = true;
+		want_check_pages = true;
 	}
 #endif
 
@@ -915,31 +920,41 @@ void __init init_mem_debugging_and_hardening(void)
 		_init_on_free_enabled_early = false;
 	}
 
-	if (_init_on_alloc_enabled_early)
+	if (_init_on_alloc_enabled_early) {
+		want_check_pages = true;
 		static_branch_enable(&init_on_alloc);
-	else
+	} else {
 		static_branch_disable(&init_on_alloc);
+	}
 
-	if (_init_on_free_enabled_early)
+	if (_init_on_free_enabled_early) {
+		want_check_pages = true;
 		static_branch_enable(&init_on_free);
-	else
+	} else {
 		static_branch_disable(&init_on_free);
+	}
 
 	if (IS_ENABLED(CONFIG_KMSAN) &&
 	    (_init_on_alloc_enabled_early || _init_on_free_enabled_early))
 		pr_info("mem auto-init: please make sure init_on_alloc and init_on_free are disabled when running KMSAN\n");
 
 #ifdef CONFIG_DEBUG_PAGEALLOC
-	if (!debug_pagealloc_enabled())
-		return;
-
-	static_branch_enable(&_debug_pagealloc_enabled);
-
-	if (!debug_guardpage_minorder())
-		return;
+	if (debug_pagealloc_enabled()) {
+		want_check_pages = true;
+		static_branch_enable(&_debug_pagealloc_enabled);
 
-	static_branch_enable(&_debug_guardpage_enabled);
+		if (debug_guardpage_minorder())
+			static_branch_enable(&_debug_guardpage_enabled);
+	}
 #endif
+
+	/*
+	 * Any page debugging or hardening option also enables sanity checking
+	 * of struct pages being allocated or freed. With CONFIG_DEBUG_VM it's
+	 * enabled already.
+	 */
+	if (!IS_ENABLED(CONFIG_DEBUG_VM) && want_check_pages)
+		static_branch_enable(&check_pages_enabled);
 }
 
 static inline void set_buddy_order(struct page *page, unsigned int order)
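Everywhere else in the diff the key is only consumed, never toggled. A hypothetical helper, sketched below with the same APIs the following hunks use (static_branch_unlikely() and free_page_is_bad()), shows the shape of that gating; when the key is off, the jump-label branch is patched out, which is what leaves the default-config hot path essentially check-free. The helper name page_state_needs_report() is made up for illustration and does not exist in the kernel.

/*
 * Illustrative sketch only -- this helper does not exist in the kernel.
 * It shows the consumer-side pattern used by the hunks below: the sanity
 * check runs only when the check_pages_enabled static key was turned on
 * by init_mem_debugging_and_hardening() above.
 */
static inline bool page_state_needs_report(struct page *page)
{
	if (static_branch_unlikely(&check_pages_enabled))
		return free_page_is_bad(page);

	return false;
}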
@@ -1395,7 +1410,7 @@ static void kernel_init_pages(struct page *page, int numpages)
 }
 
 static __always_inline bool free_pages_prepare(struct page *page,
-			unsigned int order, bool check_free, fpi_t fpi_flags)
+			unsigned int order, fpi_t fpi_flags)
 {
 	int bad = 0;
 	bool skip_kasan_poison = should_skip_kasan_poison(page, fpi_flags);
@@ -1433,9 +1448,11 @@ static __always_inline bool free_pages_prepare(struct page *page,
 		for (i = 1; i < (1 << order); i++) {
 			if (compound)
 				bad += free_tail_pages_check(page, page + i);
-			if (unlikely(free_page_is_bad(page + i))) {
-				bad++;
-				continue;
+			if (static_branch_unlikely(&check_pages_enabled)) {
+				if (unlikely(free_page_is_bad(page + i))) {
+					bad++;
+					continue;
+				}
 			}
 			(page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
 		}
@@ -1444,10 +1461,12 @@ static __always_inline bool free_pages_prepare(struct page *page,
 	page->mapping = NULL;
 	if (memcg_kmem_online() && PageMemcgKmem(page))
 		__memcg_kmem_uncharge_page(page, order);
-	if (check_free && free_page_is_bad(page))
-		bad++;
-	if (bad)
-		return false;
+	if (static_branch_unlikely(&check_pages_enabled)) {
+		if (free_page_is_bad(page))
+			bad++;
+		if (bad)
+			return false;
+	}
 
 	page_cpupid_reset_last(page);
 	page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
@@ -1493,46 +1512,6 @@ static __always_inline bool free_pages_prepare(struct page *page,
 	return true;
 }
 
-#ifdef CONFIG_DEBUG_VM
-/*
- * With DEBUG_VM enabled, order-0 pages are checked immediately when being freed
- * to pcp lists. With debug_pagealloc also enabled, they are also rechecked when
- * moved from pcp lists to free lists.
- */
-static bool free_pcp_prepare(struct page *page, unsigned int order)
-{
-	return free_pages_prepare(page, order, true, FPI_NONE);
-}
-
-/* return true if this page has an inappropriate state */
-static bool bulkfree_pcp_prepare(struct page *page)
-{
-	if (debug_pagealloc_enabled_static())
-		return free_page_is_bad(page);
-	else
-		return false;
-}
-#else
-/*
- * With DEBUG_VM disabled, order-0 pages being freed are checked only when
- * moving from pcp lists to free list in order to reduce overhead. With
- * debug_pagealloc enabled, they are checked also immediately when being freed
- * to the pcp lists.
- */
-static bool free_pcp_prepare(struct page *page, unsigned int order)
-{
-	if (debug_pagealloc_enabled_static())
-		return free_pages_prepare(page, order, true, FPI_NONE);
-	else
-		return free_pages_prepare(page, order, false, FPI_NONE);
-}
-
-static bool bulkfree_pcp_prepare(struct page *page)
-{
-	return free_page_is_bad(page);
-}
-#endif /* CONFIG_DEBUG_VM */
-
 /*
  * Frees a number of pages from the PCP lists
  * Assumes all pages on list are in same zone.
@@ -1592,9 +1571,6 @@ static void free_pcppages_bulk(struct zone *zone, int count,
 			count -= nr_pages;
 			pcp->count -= nr_pages;
 
-			if (bulkfree_pcp_prepare(page))
-				continue;
-
 			/* MIGRATE_ISOLATE page should not go to pcplists */
 			VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
 			/* Pageblock could have been isolated meanwhile */
@@ -1707,7 +1683,7 @@ static void __free_pages_ok(struct page *page, unsigned int order,
 	unsigned long pfn = page_to_pfn(page);
 	struct zone *zone = page_zone(page);
 
-	if (!free_pages_prepare(page, order, true, fpi_flags))
+	if (!free_pages_prepare(page, order, fpi_flags))
 		return;
 
 	/*
@@ -2383,7 +2359,7 @@ static void check_new_page_bad(struct page *page)
 /*
  * This page is about to be returned from the page allocator
  */
-static inline int check_new_page(struct page *page)
+static int check_new_page(struct page *page)
 {
 	if (likely(page_expected_state(page,
 				PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON)))
@@ -2393,56 +2369,20 @@ static inline int check_new_page(struct page *page)
 	return 1;
 }
 
-static bool check_new_pages(struct page *page, unsigned int order)
+static inline bool check_new_pages(struct page *page, unsigned int order)
 {
-	int i;
-	for (i = 0; i < (1 << order); i++) {
-		struct page *p = page + i;
+	if (static_branch_unlikely(&check_pages_enabled)) {
+		for (int i = 0; i < (1 << order); i++) {
+			struct page *p = page + i;
 
-		if (unlikely(check_new_page(p)))
-			return true;
+			if (unlikely(check_new_page(p)))
+				return true;
+		}
 	}
 
 	return false;
 }
 
-#ifdef CONFIG_DEBUG_VM
-/*
- * With DEBUG_VM enabled, order-0 pages are checked for expected state when
- * being allocated from pcp lists. With debug_pagealloc also enabled, they are
- * also checked when pcp lists are refilled from the free lists.
- */
-static inline bool check_pcp_refill(struct page *page, unsigned int order)
-{
-	if (debug_pagealloc_enabled_static())
-		return check_new_pages(page, order);
-	else
-		return false;
-}
-
-static inline bool check_new_pcp(struct page *page, unsigned int order)
-{
-	return check_new_pages(page, order);
-}
-#else
-/*
- * With DEBUG_VM disabled, free order-0 pages are checked for expected state
- * when pcp lists are being refilled from the free lists. With debug_pagealloc
- * enabled, they are also checked when being allocated from the pcp lists.
- */
-static inline bool check_pcp_refill(struct page *page, unsigned int order)
-{
-	return check_new_pages(page, order);
-}
-static inline bool check_new_pcp(struct page *page, unsigned int order)
-{
-	if (debug_pagealloc_enabled_static())
-		return check_new_pages(page, order);
-	else
-		return false;
-}
-#endif /* CONFIG_DEBUG_VM */
-
 static inline bool should_skip_kasan_unpoison(gfp_t flags)
 {
 	/* Don't skip if a software KASAN mode is enabled. */
@@ -3137,9 +3077,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 			int migratetype, unsigned int alloc_flags)
 {
 	unsigned long flags;
-	int i, allocated = 0;
-	struct list_head *prev_tail = list->prev;
-	struct page *pos, *n;
+	int i;
 
 	spin_lock_irqsave(&zone->lock, flags);
 	for (i = 0; i < count; ++i) {
@@ -3164,31 +3102,10 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 					-(1 << order));
 	}
 
-	/*
-	 * i pages were removed from the buddy list even if some leak due
-	 * to check_pcp_refill failing so adjust NR_FREE_PAGES based
-	 * on i. Do not confuse with 'allocated' which is the number of
-	 * pages added to the pcp list.
-	 */
 	__mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
 	spin_unlock_irqrestore(&zone->lock, flags);
 
-	/*
-	 * Pages are appended to the pcp list without checking to reduce the
-	 * time holding the zone lock. Checking the appended pages happens right
-	 * after the critical section while still holding the pcp lock.
-	 */
-	pos = list_first_entry(prev_tail, struct page, pcp_list);
-	list_for_each_entry_safe_from(pos, n, list, pcp_list) {
-		if (unlikely(check_pcp_refill(pos, order))) {
-			list_del(&pos->pcp_list);
-			continue;
-		}
-
-		allocated++;
-	}
-
-	return allocated;
+	return i;
 }
 
 #ifdef CONFIG_NUMA
@@ -3399,7 +3316,7 @@ static bool free_unref_page_prepare(struct page *page, unsigned long pfn,
 {
 	int migratetype;
 
-	if (!free_pcp_prepare(page, order))
+	if (!free_pages_prepare(page, order, FPI_NONE))
 		return false;
 
 	migratetype = get_pfnblock_migratetype(page, pfn);
@@ -3805,7 +3722,7 @@ struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
 		page = list_first_entry(list, struct page, pcp_list);
 		list_del(&page->pcp_list);
 		pcp->count -= 1 << order;
-	} while (check_new_pcp(page, order));
+	} while (check_new_pages(page, order));
 
 	return page;
 }
