
Commit dcdfdd4

kirylbp3tk0v authored and committed
mm: Add support for unaccepted memory
UEFI Specification version 2.9 introduces the concept of memory acceptance. Some Virtual Machine platforms, such as Intel TDX or AMD SEV-SNP, require memory to be accepted before it can be used by the guest. Acceptance happens via a protocol specific to the Virtual Machine platform.

There are several ways the kernel can deal with unaccepted memory:

1. Accept all the memory during boot. This is easy to implement and has no runtime cost once the system is booted. The downside is a very long boot time. Acceptance can be parallelized across multiple CPUs to keep it manageable (i.e. via DEFERRED_STRUCT_PAGE_INIT), but it tends to saturate memory bandwidth and does not scale beyond that point.

2. Accept a block of memory on first use. This requires more infrastructure and changes in the page allocator, but it provides good boot time. On-demand acceptance means latency spikes every time the kernel steps onto a new memory block. The spikes go away once the workload's data set size stabilizes or all memory has been accepted.

3. Accept all memory in the background. Introduce one or more threads that accept memory proactively. This minimizes the time the system experiences latency spikes on memory allocation while keeping boot time low. This approach cannot function on its own: it is an extension of #2, since background memory acceptance requires a functional scheduler, but the page allocator may need to tap into unaccepted memory before that. The downside is that these threads also steal CPU cycles and memory bandwidth from the user's workload and may hurt the user experience.

Implement #1 and #2 for now. #2 is the default. Workloads that want #1 can pass accept_memory=eager on the kernel command line. #3 can be implemented later based on user demand.

Support for unaccepted memory requires a few changes in core-mm code:

- memblock accepts memory on allocation. It serves early boot memory allocations and doesn't limit them to a pre-accepted pool of memory.

- The page allocator accepts memory on the first allocation of a page. When the kernel runs out of accepted memory, it accepts memory until the high watermark is reached. This helps minimize fragmentation.

EFI code will provide two helpers if the platform supports unaccepted memory:

- accept_memory() makes a range of physical addresses accepted.

- range_contains_unaccepted_memory() checks whether anything within the range of physical addresses requires acceptance.

Signed-off-by: Kirill A. Shutemov <[email protected]>
Signed-off-by: Borislav Petkov (AMD) <[email protected]>
Reviewed-by: Vlastimil Babka <[email protected]>
Acked-by: Mike Rapoport <[email protected]> # memblock
Link: https://lore.kernel.org/r/[email protected]
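
For reference, the boot-time policy is selected with the accept_memory= early parameter introduced by this patch (the two values come from accept_memory_parse() in mm/page_alloc.c below):

    accept_memory=lazy     (default) accept memory on first use
    accept_memory=eager    accept all memory during boot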
1 parent 9561de3 commit dcdfdd4

8 files changed, +231 −0 lines changed

drivers/base/node.c

Lines changed: 7 additions & 0 deletions

@@ -448,6 +448,9 @@ static ssize_t node_read_meminfo(struct device *dev,
 		       "Node %d ShmemPmdMapped: %8lu kB\n"
 		       "Node %d FileHugePages: %8lu kB\n"
 		       "Node %d FilePmdMapped: %8lu kB\n"
+#endif
+#ifdef CONFIG_UNACCEPTED_MEMORY
+		       "Node %d Unaccepted: %8lu kB\n"
 #endif
 		       ,
 		       nid, K(node_page_state(pgdat, NR_FILE_DIRTY)),
@@ -477,6 +480,10 @@ static ssize_t node_read_meminfo(struct device *dev,
 		       nid, K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)),
 		       nid, K(node_page_state(pgdat, NR_FILE_THPS)),
 		       nid, K(node_page_state(pgdat, NR_FILE_PMDMAPPED))
+#endif
+#ifdef CONFIG_UNACCEPTED_MEMORY
+		       ,
+		       nid, K(sum_zone_node_page_state(nid, NR_UNACCEPTED))
 #endif
 		       );
 	len += hugetlb_report_node_meminfo(buf, len, nid);
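
With CONFIG_UNACCEPTED_MEMORY enabled, the new counter appears in the per-node meminfo file. An illustrative reading (the value is made up):

$ grep Unaccepted /sys/devices/system/node/node0/meminfo
Node 0 Unaccepted:       4194304 kB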

fs/proc/meminfo.c

Lines changed: 5 additions & 0 deletions

@@ -168,6 +168,11 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 		   global_zone_page_state(NR_FREE_CMA_PAGES));
 #endif
 
+#ifdef CONFIG_UNACCEPTED_MEMORY
+	show_val_kb(m, "Unaccepted:     ",
+		    global_zone_page_state(NR_UNACCEPTED));
+#endif
+
 	hugetlb_report_meminfo(m);
 
 	arch_report_meminfo(m);
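
The global counterpart shows up in /proc/meminfo. Illustrative output (the value is made up):

$ grep Unaccepted /proc/meminfo
Unaccepted:       4194304 kB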

include/linux/mm.h

Lines changed: 19 additions & 0 deletions

@@ -3816,4 +3816,23 @@ madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
 }
 #endif
 
+#ifdef CONFIG_UNACCEPTED_MEMORY
+
+bool range_contains_unaccepted_memory(phys_addr_t start, phys_addr_t end);
+void accept_memory(phys_addr_t start, phys_addr_t end);
+
+#else
+
+static inline bool range_contains_unaccepted_memory(phys_addr_t start,
+						    phys_addr_t end)
+{
+	return false;
+}
+
+static inline void accept_memory(phys_addr_t start, phys_addr_t end)
+{
+}
+
+#endif
+
 #endif /* _LINUX_MM_H */
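
The stubs let callers use both helpers unconditionally: with CONFIG_UNACCEPTED_MEMORY disabled, the checks compile away. A minimal hypothetical caller pattern, mirroring how page_contains_unaccepted() and accept_page() pair up in mm/page_alloc.c below (the example_* name is not part of this commit):

/* Hypothetical sketch: accept a physical range only when required. */
static void example_accept_if_needed(phys_addr_t start, phys_addr_t end)
{
	if (range_contains_unaccepted_memory(start, end))
		accept_memory(start, end);
}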

include/linux/mmzone.h

Lines changed: 8 additions & 0 deletions

@@ -143,6 +143,9 @@ enum zone_stat_item {
 	NR_ZSPAGES,		/* allocated in zsmalloc */
 #endif
 	NR_FREE_CMA_PAGES,
+#ifdef CONFIG_UNACCEPTED_MEMORY
+	NR_UNACCEPTED,
+#endif
 	NR_VM_ZONE_STAT_ITEMS };
 
 enum node_stat_item {
@@ -910,6 +913,11 @@ struct zone {
 	/* free areas of different sizes */
 	struct free_area	free_area[MAX_ORDER + 1];
 
+#ifdef CONFIG_UNACCEPTED_MEMORY
+	/* Pages to be accepted. All pages on the list are MAX_ORDER */
+	struct list_head	unaccepted_pages;
+#endif
+
 	/* zone flags, see below */
 	unsigned long		flags;
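
For scale, a back-of-the-envelope calculation (assuming x86-64 defaults, not stated in the commit): with 4 KiB pages and the inclusive MAX_ORDER = 10 used by this kernel, each entry on zone->unaccepted_pages covers

	MAX_ORDER_NR_PAGES = 2^10 = 1024 pages = 4 MiB

of physical memory.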

mm/memblock.c

Lines changed: 9 additions & 0 deletions

@@ -1436,6 +1436,15 @@ phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size,
 	 */
 	kmemleak_alloc_phys(found, size, 0);
 
+	/*
+	 * Some Virtual Machine platforms, such as Intel TDX or AMD SEV-SNP,
+	 * require memory to be accepted before it can be used by the
+	 * guest.
+	 *
+	 * Accept the memory of the allocated buffer.
+	 */
+	accept_memory(found, found + size);
+
 	return found;
 }
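
Since memblock_alloc_range_nid() sits under the memblock allocation interfaces, early-boot callers now receive accepted memory with no changes on their side. An illustrative, hypothetical caller using the standard memblock API:

/* Early boot: the returned buffer is already accepted. */
void *buf = memblock_alloc(SZ_1M, SMP_CACHE_BYTES);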

mm/mm_init.c

Lines changed: 7 additions & 0 deletions

@@ -1375,6 +1375,10 @@ static void __meminit zone_init_free_lists(struct zone *zone)
 		INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
 		zone->free_area[order].nr_free = 0;
 	}
+
+#ifdef CONFIG_UNACCEPTED_MEMORY
+	INIT_LIST_HEAD(&zone->unaccepted_pages);
+#endif
 }
 
 void __meminit init_currently_empty_zone(struct zone *zone,
@@ -1960,6 +1964,9 @@ static void __init deferred_free_range(unsigned long pfn,
 		return;
 	}
 
+	/* Accept chunks smaller than MAX_ORDER upfront */
+	accept_memory(PFN_PHYS(pfn), PFN_PHYS(pfn + nr_pages));
+
 	for (i = 0; i < nr_pages; i++, page++, pfn++) {
 		if (pageblock_aligned(pfn))
 			set_pageblock_migratetype(page, MIGRATE_MOVABLE);
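
Why small chunks are accepted upfront: the lazy path in __free_pages_core() (see mm/page_alloc.c below) defers acceptance only for whole MAX_ORDER pages, which are what zone->unaccepted_pages holds; ranges smaller than MAX_ORDER have nowhere to be queued, so deferred_free_range() accepts them immediately.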

mm/page_alloc.c

Lines changed: 173 additions & 0 deletions

@@ -387,6 +387,12 @@ EXPORT_SYMBOL(nr_node_ids);
 EXPORT_SYMBOL(nr_online_nodes);
 #endif
 
+static bool page_contains_unaccepted(struct page *page, unsigned int order);
+static void accept_page(struct page *page, unsigned int order);
+static bool try_to_accept_memory(struct zone *zone, unsigned int order);
+static inline bool has_unaccepted_memory(void);
+static bool __free_unaccepted(struct page *page);
+
 int page_group_by_mobility_disabled __read_mostly;
 
 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
@@ -1481,6 +1487,13 @@ void __free_pages_core(struct page *page, unsigned int order)
 
 	atomic_long_add(nr_pages, &page_zone(page)->managed_pages);
 
+	if (page_contains_unaccepted(page, order)) {
+		if (order == MAX_ORDER && __free_unaccepted(page))
+			return;
+
+		accept_page(page, order);
+	}
+
 	/*
 	 * Bypass PCP and place fresh pages right to the tail, primarily
 	 * relevant for memory onlining.
@@ -3159,6 +3172,9 @@ static inline long __zone_watermark_unusable_free(struct zone *z,
 	if (!(alloc_flags & ALLOC_CMA))
 		unusable_free += zone_page_state(z, NR_FREE_CMA_PAGES);
 #endif
+#ifdef CONFIG_UNACCEPTED_MEMORY
+	unusable_free += zone_page_state(z, NR_UNACCEPTED);
+#endif
 
 	return unusable_free;
 }
@@ -3458,6 +3474,11 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
 				       gfp_mask)) {
 			int ret;
 
+			if (has_unaccepted_memory()) {
+				if (try_to_accept_memory(zone, order))
+					goto try_this_zone;
+			}
+
 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
 			/*
 			 * Watermark failed for this zone, but see if we can
@@ -3510,6 +3531,11 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
 
 			return page;
 		} else {
+			if (has_unaccepted_memory()) {
+				if (try_to_accept_memory(zone, order))
+					goto try_this_zone;
+			}
+
 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
 			/* Try again if zone has deferred pages */
 			if (deferred_pages_enabled()) {
@@ -7215,3 +7241,150 @@ bool has_managed_dma(void)
 	return false;
 }
 #endif /* CONFIG_ZONE_DMA */
+
+#ifdef CONFIG_UNACCEPTED_MEMORY
+
+/* Counts number of zones with unaccepted pages. */
+static DEFINE_STATIC_KEY_FALSE(zones_with_unaccepted_pages);
+
+static bool lazy_accept = true;
+
+static int __init accept_memory_parse(char *p)
+{
+	if (!strcmp(p, "lazy")) {
+		lazy_accept = true;
+		return 0;
+	} else if (!strcmp(p, "eager")) {
+		lazy_accept = false;
+		return 0;
+	} else {
+		return -EINVAL;
+	}
+}
+early_param("accept_memory", accept_memory_parse);
+
+static bool page_contains_unaccepted(struct page *page, unsigned int order)
+{
+	phys_addr_t start = page_to_phys(page);
+	phys_addr_t end = start + (PAGE_SIZE << order);
+
+	return range_contains_unaccepted_memory(start, end);
+}
+
+static void accept_page(struct page *page, unsigned int order)
+{
+	phys_addr_t start = page_to_phys(page);
+
+	accept_memory(start, start + (PAGE_SIZE << order));
+}
+
+static bool try_to_accept_memory_one(struct zone *zone)
+{
+	unsigned long flags;
+	struct page *page;
+	bool last;
+
+	if (list_empty(&zone->unaccepted_pages))
+		return false;
+
+	spin_lock_irqsave(&zone->lock, flags);
+	page = list_first_entry_or_null(&zone->unaccepted_pages,
+					struct page, lru);
+	if (!page) {
+		spin_unlock_irqrestore(&zone->lock, flags);
+		return false;
+	}
+
+	list_del(&page->lru);
+	last = list_empty(&zone->unaccepted_pages);
+
+	__mod_zone_freepage_state(zone, -MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE);
+	__mod_zone_page_state(zone, NR_UNACCEPTED, -MAX_ORDER_NR_PAGES);
+	spin_unlock_irqrestore(&zone->lock, flags);
+
+	accept_page(page, MAX_ORDER);
+
+	__free_pages_ok(page, MAX_ORDER, FPI_TO_TAIL);
+
+	if (last)
+		static_branch_dec(&zones_with_unaccepted_pages);
+
+	return true;
+}
+
+static bool try_to_accept_memory(struct zone *zone, unsigned int order)
+{
+	long to_accept;
+	int ret = false;
+
+	/* How much to accept to get to high watermark? */
+	to_accept = high_wmark_pages(zone) -
+		    (zone_page_state(zone, NR_FREE_PAGES) -
+		    __zone_watermark_unusable_free(zone, order, 0));
+
+	/* Accept at least one page */
+	do {
+		if (!try_to_accept_memory_one(zone))
+			break;
+		ret = true;
+		to_accept -= MAX_ORDER_NR_PAGES;
+	} while (to_accept > 0);
+
+	return ret;
+}
+
+static inline bool has_unaccepted_memory(void)
+{
+	return static_branch_unlikely(&zones_with_unaccepted_pages);
+}
+
+static bool __free_unaccepted(struct page *page)
+{
+	struct zone *zone = page_zone(page);
+	unsigned long flags;
+	bool first = false;
+
+	if (!lazy_accept)
+		return false;
+
+	spin_lock_irqsave(&zone->lock, flags);
+	first = list_empty(&zone->unaccepted_pages);
+	list_add_tail(&page->lru, &zone->unaccepted_pages);
+	__mod_zone_freepage_state(zone, MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE);
+	__mod_zone_page_state(zone, NR_UNACCEPTED, MAX_ORDER_NR_PAGES);
+	spin_unlock_irqrestore(&zone->lock, flags);
+
+	if (first)
+		static_branch_inc(&zones_with_unaccepted_pages);
+
+	return true;
+}
+
+#else
+
+static bool page_contains_unaccepted(struct page *page, unsigned int order)
+{
+	return false;
+}
+
+static void accept_page(struct page *page, unsigned int order)
+{
+}
+
+static bool try_to_accept_memory(struct zone *zone, unsigned int order)
+{
+	return false;
+}
+
+static inline bool has_unaccepted_memory(void)
+{
+	return false;
+}
+
+static bool __free_unaccepted(struct page *page)
+{
+	BUILD_BUG();
+	return false;
+}
+
+#endif /* CONFIG_UNACCEPTED_MEMORY */
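
A worked example of the watermark arithmetic in try_to_accept_memory(), with made-up numbers:

/*
 * Hypothetical zone state, MAX_ORDER_NR_PAGES = 1024:
 *   high_wmark_pages(zone)                    =  8192 pages
 *   zone_page_state(zone, NR_FREE_PAGES)      = 10240 pages
 *   __zone_watermark_unusable_free(zone, ...) =  4096 pages (counts NR_UNACCEPTED)
 *
 * to_accept = 8192 - (10240 - 4096) = 2048 pages, so the loop
 * accepts two MAX_ORDER chunks (2048 / 1024) and then stops.
 */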

mm/vmstat.c

Lines changed: 3 additions & 0 deletions

@@ -1180,6 +1180,9 @@ const char * const vmstat_text[] = {
 	"nr_zspages",
 #endif
 	"nr_free_cma",
+#ifdef CONFIG_UNACCEPTED_MEMORY
+	"nr_unaccepted",
+#endif
 
 	/* enum numa_stat_item counters */
 #ifdef CONFIG_NUMA
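
The raw counter is also exported through /proc/vmstat. Illustrative reading (the value is made up):

$ grep nr_unaccepted /proc/vmstat
nr_unaccepted 1048576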
