@@ -292,40 +292,6 @@ EXPORT_SYMBOL(nr_online_nodes);
292
292
int page_group_by_mobility_disabled __read_mostly ;
293
293
294
294
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
295
-
296
- /*
297
- * Determine how many pages need to be initialized during early boot
298
- * (non-deferred initialization).
299
- * The value of first_deferred_pfn will be set later, once non-deferred pages
300
- * are initialized, but for now set it ULONG_MAX.
301
- */
302
- static inline void reset_deferred_meminit (pg_data_t * pgdat )
303
- {
304
- phys_addr_t start_addr , end_addr ;
305
- unsigned long max_pgcnt ;
306
- unsigned long reserved ;
307
-
308
- /*
309
- * Initialise at least 2G of a node but also take into account that
310
- * two large system hashes that can take up 1GB for 0.25TB/node.
311
- */
312
- max_pgcnt = max (2UL << (30 - PAGE_SHIFT ),
313
- (pgdat -> node_spanned_pages >> 8 ));
314
-
315
- /*
316
- * Compensate the all the memblock reservations (e.g. crash kernel)
317
- * from the initial estimation to make sure we will initialize enough
318
- * memory to boot.
319
- */
320
- start_addr = PFN_PHYS (pgdat -> node_start_pfn );
321
- end_addr = PFN_PHYS (pgdat -> node_start_pfn + max_pgcnt );
322
- reserved = memblock_reserved_memory_within (start_addr , end_addr );
323
- max_pgcnt += PHYS_PFN (reserved );
324
-
325
- pgdat -> static_init_pgcnt = min (max_pgcnt , pgdat -> node_spanned_pages );
326
- pgdat -> first_deferred_pfn = ULONG_MAX ;
327
- }
328
-
329
295
/* Returns true if the struct page for the pfn is uninitialised */
330
296
static inline bool __meminit early_page_uninitialised (unsigned long pfn )
331
297
{
@@ -361,10 +327,6 @@ static inline bool update_defer_init(pg_data_t *pgdat,
361
327
return true;
362
328
}
363
329
#else
364
/*
 * !CONFIG_DEFERRED_STRUCT_PAGE_INIT stub: all struct pages are initialized
 * up front during boot, so there is no deferred state to reset.
 */
static inline void reset_deferred_meminit(pg_data_t *pgdat)
{
}
367
-
368
330
static inline bool early_page_uninitialised (unsigned long pfn )
369
331
{
370
332
return false;
@@ -1611,6 +1573,117 @@ static int __init deferred_init_memmap(void *data)
1611
1573
pgdat_init_report_one_done ();
1612
1574
return 0 ;
1613
1575
}
1576
+
1577
/*
 * During boot we initialize deferred pages on-demand, as needed, but once
 * page_alloc_init_late() has finished, the deferred pages are all initialized,
 * and we can permanently disable that path.
 *
 * The key starts out true (on-demand path live) and is flipped off exactly
 * once, via static_branch_disable() in page_alloc_init_late().
 */
static DEFINE_STATIC_KEY_TRUE(deferred_pages);
1583
+
1584
/*
 * If this zone has deferred pages, try to grow it by initializing enough
 * deferred pages to satisfy the allocation specified by order, rounded up to
 * the nearest PAGES_PER_SECTION boundary. So we're adding memory in increments
 * of SECTION_SIZE bytes by initializing struct pages in increments of
 * PAGES_PER_SECTION * sizeof(struct page) bytes.
 *
 * Return true when zone was grown, otherwise return false. We return true even
 * when we grow less than requested, to let the caller decide if there are
 * enough pages to satisfy the allocation.
 *
 * Note: We use noinline because this function is needed only during boot, and
 * it is called from a __ref function _deferred_grow_zone. This way we are
 * making sure that it is not inlined into permanent text section.
 */
static noinline bool __init
deferred_grow_zone(struct zone *zone, unsigned int order)
{
	int zid = zone_idx(zone);
	int nid = zone_to_nid(zone);
	pg_data_t *pgdat = NODE_DATA(nid);
	/* Grow in whole sections, even for a sub-section-sized request. */
	unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION);
	unsigned long nr_pages = 0;
	unsigned long first_init_pfn, spfn, epfn, t, flags;
	/* Snapshot taken before the lock; re-checked under it below. */
	unsigned long first_deferred_pfn = pgdat->first_deferred_pfn;
	phys_addr_t spa, epa;
	u64 i;

	/* Only the last zone may have deferred pages */
	if (zone_end_pfn(zone) != pgdat_end_pfn(pgdat))
		return false;

	pgdat_resize_lock(pgdat, &flags);

	/*
	 * If deferred pages have been initialized while we were waiting for
	 * the lock, return true, as the zone was grown.  The caller will retry
	 * this zone.  We won't return to this function since the caller also
	 * has this static branch.
	 */
	if (!static_branch_unlikely(&deferred_pages)) {
		pgdat_resize_unlock(pgdat, &flags);
		return true;
	}

	/*
	 * If someone grew this zone while we were waiting for spinlock, return
	 * true, as there might be enough pages already.
	 */
	if (first_deferred_pfn != pgdat->first_deferred_pfn) {
		pgdat_resize_unlock(pgdat, &flags);
		return true;
	}

	first_init_pfn = max(zone->zone_start_pfn, first_deferred_pfn);

	/* Nothing left in this node to initialize. */
	if (first_init_pfn >= pgdat_end_pfn(pgdat)) {
		pgdat_resize_unlock(pgdat, &flags);
		return false;
	}

	/*
	 * Pass 1: initialize struct pages, walking only free memblock ranges,
	 * until at least nr_pages_needed pages have been initialized.
	 */
	for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
		spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
		epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));

		while (spfn < epfn && nr_pages < nr_pages_needed) {
			/*
			 * t = next section boundary at least one full
			 * PAGES_PER_SECTION beyond spfn; clamp to this range.
			 */
			t = ALIGN(spfn + PAGES_PER_SECTION, PAGES_PER_SECTION);
			first_deferred_pfn = min(t, epfn);
			nr_pages += deferred_init_pages(nid, zid, spfn,
							first_deferred_pfn);
			spfn = first_deferred_pfn;
		}

		if (nr_pages >= nr_pages_needed)
			break;
	}

	/*
	 * Pass 2: release the freshly initialized pages, up to the advanced
	 * first_deferred_pfn, so they become available to the allocator.
	 */
	for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
		spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
		epfn = min_t(unsigned long, first_deferred_pfn, PFN_DOWN(epa));
		deferred_free_pages(nid, zid, spfn, epfn);

		if (first_deferred_pfn == epfn)
			break;
	}
	/* Publish the new deferred boundary before dropping the lock. */
	pgdat->first_deferred_pfn = first_deferred_pfn;
	pgdat_resize_unlock(pgdat, &flags);

	return nr_pages > 0;
}
1674
+
1675
/*
 * deferred_grow_zone() is __init, but it is called from
 * get_page_from_freelist() during early boot until deferred_pages permanently
 * disables this call. This is why we have refdata wrapper to avoid warning,
 * and to ensure that the function body gets unloaded.
 */
static bool __ref
_deferred_grow_zone(struct zone *zone, unsigned int order)
{
	/* Thin pass-through; __ref suppresses the section-mismatch warning. */
	return deferred_grow_zone(zone, order);
}
1686
+
1614
1687
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
1615
1688
1616
1689
void __init page_alloc_init_late (void )
@@ -1629,6 +1702,12 @@ void __init page_alloc_init_late(void)
1629
1702
/* Block until all are initialised */
1630
1703
wait_for_completion (& pgdat_init_all_done_comp );
1631
1704
1705
+ /*
1706
+ * We initialized the rest of the deferred pages. Permanently disable
1707
+ * on-demand struct page initialization.
1708
+ */
1709
+ static_branch_disable (& deferred_pages );
1710
+
1632
1711
/* Reinit limits that are based on free pages after the kernel is up */
1633
1712
files_maxfiles_init ();
1634
1713
#endif
@@ -3208,6 +3287,16 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
3208
3287
ac_classzone_idx (ac ), alloc_flags )) {
3209
3288
int ret ;
3210
3289
3290
+ #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
3291
+ /*
3292
+ * Watermark failed for this zone, but see if we can
3293
+ * grow this zone if it contains deferred pages.
3294
+ */
3295
+ if (static_branch_unlikely (& deferred_pages )) {
3296
+ if (_deferred_grow_zone (zone , order ))
3297
+ goto try_this_zone ;
3298
+ }
3299
+ #endif
3211
3300
/* Checked here to keep the fast path fast */
3212
3301
BUILD_BUG_ON (ALLOC_NO_WATERMARKS < NR_WMARK );
3213
3302
if (alloc_flags & ALLOC_NO_WATERMARKS )
@@ -3249,6 +3338,14 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
3249
3338
reserve_highatomic_pageblock (page , zone , order );
3250
3339
3251
3340
return page ;
3341
+ } else {
3342
+ #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
3343
+ /* Try again if zone has deferred pages */
3344
+ if (static_branch_unlikely (& deferred_pages )) {
3345
+ if (_deferred_grow_zone (zone , order ))
3346
+ goto try_this_zone ;
3347
+ }
3348
+ #endif
3252
3349
}
3253
3350
}
3254
3351
@@ -6244,7 +6341,15 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
6244
6341
6245
6342
alloc_node_mem_map (pgdat );
6246
6343
6247
- reset_deferred_meminit (pgdat );
6344
+ #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
6345
+ /*
6346
+ * We start only with one section of pages, more pages are added as
6347
+ * needed until the rest of deferred pages are initialized.
6348
+ */
6349
+ pgdat -> static_init_pgcnt = min_t (unsigned long , PAGES_PER_SECTION ,
6350
+ pgdat -> node_spanned_pages );
6351
+ pgdat -> first_deferred_pfn = ULONG_MAX ;
6352
+ #endif
6248
6353
free_area_init_core (pgdat );
6249
6354
}
6250
6355
0 commit comments