
Commit c9e97a1

Pavel Tatashin authored and torvalds committed
mm: initialize pages on demand during boot
Deferred page initialization allows the boot cpu to initialize a small subset of the system's pages early in boot, with other cpus doing the rest later on.

It is, however, problematic to know how many pages the kernel needs during boot. Different modules and kernel parameters may change the requirement, so the boot cpu either initializes too many pages or runs out of memory.

To fix that, initialize early pages on demand. This ensures the kernel does the minimum amount of work to initialize pages during boot and leaves the rest to be divided in the multithreaded initialization path (deferred_init_memmap).

The on-demand code is permanently disabled using static branching once deferred pages are initialized. After the static branch is changed to false, the overhead is up to two branch-always instructions if the zone watermark check fails or if rmqueue fails.

Sergey Senozhatsky noticed that while deferred pages currently make sense only on NUMA machines (we start one thread per latency node), CONFIG_NUMA is not a requirement for CONFIG_DEFERRED_STRUCT_PAGE_INIT, so that also must be addressed in this patch.

[[email protected]: fix typo in comment, make deferred_pages static]
[[email protected]: fix min() type mismatch warning]
Link: http://lkml.kernel.org/r/[email protected]
[[email protected]: use zone_to_nid() in deferred_grow_zone()]
Link: http://lkml.kernel.org/r/[email protected]
[[email protected]: might_sleep warning]
Link: http://lkml.kernel.org/r/[email protected]
[[email protected]: s/spin_lock/spin_lock_irq/ in page_alloc_init_late()]
[[email protected]: v5]
Link: http://lkml.kernel.org/r/[email protected]
[[email protected]: tweak comments]
[[email protected]: v6]
Link: http://lkml.kernel.org/r/[email protected]
[[email protected]: coding-style fixes]
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Pavel Tatashin <[email protected]>
Reviewed-by: Daniel Jordan <[email protected]>
Reviewed-by: Steven Sistare <[email protected]>
Reviewed-by: Andrew Morton <[email protected]>
Tested-by: Masayoshi Mizuma <[email protected]>
Acked-by: Mel Gorman <[email protected]>
Cc: Michal Hocko <[email protected]>
Cc: Catalin Marinas <[email protected]>
Cc: AKASHI Takahiro <[email protected]>
Cc: Gioh Kim <[email protected]>
Cc: Heiko Carstens <[email protected]>
Cc: Yaowei Bai <[email protected]>
Cc: Wei Yang <[email protected]>
Cc: Paul Burton <[email protected]>
Cc: Miles Chen <[email protected]>
Cc: Vlastimil Babka <[email protected]>
Cc: Johannes Weiner <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
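Aside (not part of the commit): for readers unfamiliar with static branching, the pattern the patch relies on looks roughly like the sketch below. DEFINE_STATIC_KEY_TRUE, static_branch_unlikely and static_branch_disable are the real jump-label APIs used by the patch; the key and helper names here are made up for illustration only.

#include <linux/jump_label.h>

/* Default-true key, analogous to the patch's deferred_pages key. */
static DEFINE_STATIC_KEY_TRUE(boot_only_path);

static bool do_boot_only_work(void)
{
	/* Illustrative stand-in for the real on-demand work. */
	return true;
}

static bool maybe_do_boot_work(void)
{
	/*
	 * While the key is true, callers branch into the slow path; after the
	 * key is disabled, this check is patched into straight-line code.
	 */
	if (static_branch_unlikely(&boot_only_path))
		return do_boot_only_work();
	return false;
}

static void boot_finished(void)
{
	/* Flip the branch once; later calls skip the slow path for good. */
	static_branch_disable(&boot_only_path);
}

This is why, after page_alloc_init_late() disables the key, the remaining cost in the allocator fast path is at most the two patched branch sites mentioned in the commit message.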
1 parent 3a2d7fa commit c9e97a1

File tree

3 files changed: 144 additions (+), 72 deletions (-)


include/linux/memblock.h

Lines changed: 0 additions & 10 deletions

@@ -416,21 +416,11 @@ static inline void early_memtest(phys_addr_t start, phys_addr_t end)
 {
 }
 #endif
-
-extern unsigned long memblock_reserved_memory_within(phys_addr_t start_addr,
-						     phys_addr_t end_addr);
 #else
 static inline phys_addr_t memblock_alloc(phys_addr_t size, phys_addr_t align)
 {
 	return 0;
 }
-
-static inline unsigned long memblock_reserved_memory_within(phys_addr_t start_addr,
-							     phys_addr_t end_addr)
-{
-	return 0;
-}
-
 #endif /* CONFIG_HAVE_MEMBLOCK */
 
 #endif /* __KERNEL__ */

mm/memblock.c

Lines changed: 0 additions & 23 deletions

@@ -1750,29 +1750,6 @@ static void __init_memblock memblock_dump(struct memblock_type *type)
 	}
 }
 
-extern unsigned long __init_memblock
-memblock_reserved_memory_within(phys_addr_t start_addr, phys_addr_t end_addr)
-{
-	struct memblock_region *rgn;
-	unsigned long size = 0;
-	int idx;
-
-	for_each_memblock_type(idx, (&memblock.reserved), rgn) {
-		phys_addr_t start, end;
-
-		if (rgn->base + rgn->size < start_addr)
-			continue;
-		if (rgn->base > end_addr)
-			continue;
-
-		start = rgn->base;
-		end = start + rgn->size;
-		size += end - start;
-	}
-
-	return size;
-}
-
 void __init_memblock __memblock_dump_all(void)
 {
 	pr_info("MEMBLOCK configuration:\n");

mm/page_alloc.c

Lines changed: 144 additions & 39 deletions

@@ -292,40 +292,6 @@ EXPORT_SYMBOL(nr_online_nodes);
 int page_group_by_mobility_disabled __read_mostly;
 
 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
-
-/*
- * Determine how many pages need to be initialized during early boot
- * (non-deferred initialization).
- * The value of first_deferred_pfn will be set later, once non-deferred pages
- * are initialized, but for now set it ULONG_MAX.
- */
-static inline void reset_deferred_meminit(pg_data_t *pgdat)
-{
-	phys_addr_t start_addr, end_addr;
-	unsigned long max_pgcnt;
-	unsigned long reserved;
-
-	/*
-	 * Initialise at least 2G of a node but also take into account that
-	 * two large system hashes that can take up 1GB for 0.25TB/node.
-	 */
-	max_pgcnt = max(2UL << (30 - PAGE_SHIFT),
-			(pgdat->node_spanned_pages >> 8));
-
-	/*
-	 * Compensate the all the memblock reservations (e.g. crash kernel)
-	 * from the initial estimation to make sure we will initialize enough
-	 * memory to boot.
-	 */
-	start_addr = PFN_PHYS(pgdat->node_start_pfn);
-	end_addr = PFN_PHYS(pgdat->node_start_pfn + max_pgcnt);
-	reserved = memblock_reserved_memory_within(start_addr, end_addr);
-	max_pgcnt += PHYS_PFN(reserved);
-
-	pgdat->static_init_pgcnt = min(max_pgcnt, pgdat->node_spanned_pages);
-	pgdat->first_deferred_pfn = ULONG_MAX;
-}
-
 /* Returns true if the struct page for the pfn is uninitialised */
 static inline bool __meminit early_page_uninitialised(unsigned long pfn)
 {

@@ -361,10 +327,6 @@ static inline bool update_defer_init(pg_data_t *pgdat,
 	return true;
 }
 #else
-static inline void reset_deferred_meminit(pg_data_t *pgdat)
-{
-}
-
 static inline bool early_page_uninitialised(unsigned long pfn)
 {
 	return false;

@@ -1611,6 +1573,117 @@ static int __init deferred_init_memmap(void *data)
 	pgdat_init_report_one_done();
 	return 0;
 }
+
+/*
+ * During boot we initialize deferred pages on-demand, as needed, but once
+ * page_alloc_init_late() has finished, the deferred pages are all initialized,
+ * and we can permanently disable that path.
+ */
+static DEFINE_STATIC_KEY_TRUE(deferred_pages);
+
+/*
+ * If this zone has deferred pages, try to grow it by initializing enough
+ * deferred pages to satisfy the allocation specified by order, rounded up to
+ * the nearest PAGES_PER_SECTION boundary.  So we're adding memory in increments
+ * of SECTION_SIZE bytes by initializing struct pages in increments of
+ * PAGES_PER_SECTION * sizeof(struct page) bytes.
+ *
+ * Return true when zone was grown, otherwise return false. We return true even
+ * when we grow less than requested, to let the caller decide if there are
+ * enough pages to satisfy the allocation.
+ *
+ * Note: We use noinline because this function is needed only during boot, and
+ * it is called from a __ref function _deferred_grow_zone. This way we are
+ * making sure that it is not inlined into permanent text section.
+ */
+static noinline bool __init
+deferred_grow_zone(struct zone *zone, unsigned int order)
+{
+	int zid = zone_idx(zone);
+	int nid = zone_to_nid(zone);
+	pg_data_t *pgdat = NODE_DATA(nid);
+	unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION);
+	unsigned long nr_pages = 0;
+	unsigned long first_init_pfn, spfn, epfn, t, flags;
+	unsigned long first_deferred_pfn = pgdat->first_deferred_pfn;
+	phys_addr_t spa, epa;
+	u64 i;
+
+	/* Only the last zone may have deferred pages */
+	if (zone_end_pfn(zone) != pgdat_end_pfn(pgdat))
+		return false;
+
+	pgdat_resize_lock(pgdat, &flags);
+
+	/*
+	 * If deferred pages have been initialized while we were waiting for
+	 * the lock, return true, as the zone was grown.  The caller will retry
+	 * this zone.  We won't return to this function since the caller also
+	 * has this static branch.
+	 */
+	if (!static_branch_unlikely(&deferred_pages)) {
+		pgdat_resize_unlock(pgdat, &flags);
+		return true;
+	}
+
+	/*
+	 * If someone grew this zone while we were waiting for spinlock, return
+	 * true, as there might be enough pages already.
+	 */
+	if (first_deferred_pfn != pgdat->first_deferred_pfn) {
+		pgdat_resize_unlock(pgdat, &flags);
+		return true;
+	}
+
+	first_init_pfn = max(zone->zone_start_pfn, first_deferred_pfn);
+
+	if (first_init_pfn >= pgdat_end_pfn(pgdat)) {
+		pgdat_resize_unlock(pgdat, &flags);
+		return false;
+	}
+
+	for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
+		spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
+		epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
+
+		while (spfn < epfn && nr_pages < nr_pages_needed) {
+			t = ALIGN(spfn + PAGES_PER_SECTION, PAGES_PER_SECTION);
+			first_deferred_pfn = min(t, epfn);
+			nr_pages += deferred_init_pages(nid, zid, spfn,
+							first_deferred_pfn);
+			spfn = first_deferred_pfn;
+		}
+
+		if (nr_pages >= nr_pages_needed)
+			break;
+	}
+
+	for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
+		spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
+		epfn = min_t(unsigned long, first_deferred_pfn, PFN_DOWN(epa));
+		deferred_free_pages(nid, zid, spfn, epfn);
+
+		if (first_deferred_pfn == epfn)
+			break;
+	}
+	pgdat->first_deferred_pfn = first_deferred_pfn;
+	pgdat_resize_unlock(pgdat, &flags);
+
+	return nr_pages > 0;
+}
+
+/*
+ * deferred_grow_zone() is __init, but it is called from
+ * get_page_from_freelist() during early boot until deferred_pages permanently
+ * disables this call. This is why we have refdata wrapper to avoid warning,
+ * and to ensure that the function body gets unloaded.
+ */
+static bool __ref
+_deferred_grow_zone(struct zone *zone, unsigned int order)
+{
+	return deferred_grow_zone(zone, order);
+}
+
 #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
 
 void __init page_alloc_init_late(void)

@@ -1629,6 +1702,12 @@ void __init page_alloc_init_late(void)
 	/* Block until all are initialised */
 	wait_for_completion(&pgdat_init_all_done_comp);
 
+	/*
+	 * We initialized the rest of the deferred pages.  Permanently disable
+	 * on-demand struct page initialization.
+	 */
+	static_branch_disable(&deferred_pages);
+
 	/* Reinit limits that are based on free pages after the kernel is up */
 	files_maxfiles_init();
 #endif

@@ -3208,6 +3287,16 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
 				       ac_classzone_idx(ac), alloc_flags)) {
 			int ret;
 
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+			/*
+			 * Watermark failed for this zone, but see if we can
+			 * grow this zone if it contains deferred pages.
+			 */
+			if (static_branch_unlikely(&deferred_pages)) {
+				if (_deferred_grow_zone(zone, order))
+					goto try_this_zone;
+			}
+#endif
 			/* Checked here to keep the fast path fast */
 			BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
 			if (alloc_flags & ALLOC_NO_WATERMARKS)

@@ -3249,6 +3338,14 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
 				reserve_highatomic_pageblock(page, zone, order);
 
 			return page;
+		} else {
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+			/* Try again if zone has deferred pages */
+			if (static_branch_unlikely(&deferred_pages)) {
+				if (_deferred_grow_zone(zone, order))
+					goto try_this_zone;
+			}
+#endif
 		}
 	}
 

@@ -6244,7 +6341,15 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
 
 	alloc_node_mem_map(pgdat);
 
-	reset_deferred_meminit(pgdat);
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+	/*
+	 * We start only with one section of pages, more pages are added as
+	 * needed until the rest of deferred pages are initialized.
+	 */
+	pgdat->static_init_pgcnt = min_t(unsigned long, PAGES_PER_SECTION,
+					 pgdat->node_spanned_pages);
+	pgdat->first_deferred_pfn = ULONG_MAX;
+#endif
 	free_area_init_core(pgdat);
 }
 
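A rough worked example of the section rounding performed by deferred_grow_zone() above, assuming x86_64 defaults of 4 KiB pages and 128 MiB sparsemem sections (so PAGES_PER_SECTION = 32768); this is illustrative userspace arithmetic, not kernel code:

#include <stdio.h>

/* Assumed x86_64 defaults: 4 KiB pages, 128 MiB sections. */
#define PAGE_SHIFT		12
#define SECTION_SIZE_BITS	27
#define PAGES_PER_SECTION	(1UL << (SECTION_SIZE_BITS - PAGE_SHIFT))	/* 32768 */

/* Round x up to a multiple of the power-of-two a, as the kernel's ALIGN() does. */
#define ALIGN_UP(x, a)		(((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	unsigned int order = 9;		/* e.g. a 2 MiB huge-page request = 512 base pages */
	unsigned long nr_pages_needed = ALIGN_UP(1UL << order, PAGES_PER_SECTION);

	/* Prints: request 512 pages -> grow zone by 32768 pages (1 section(s)) */
	printf("request %lu pages -> grow zone by %lu pages (%lu section(s))\n",
	       1UL << order, nr_pages_needed,
	       nr_pages_needed / PAGES_PER_SECTION);
	return 0;
}

In other words, even a small allocation that misses the watermark grows the zone by at least one full section's worth of struct pages, which keeps the number of on-demand growth steps during early boot small.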