
Commit 9f1b868

aet00 authored and torvalds committed
mm: thp: khugepaged: add policy for finding target node
Khugepaged scans/frees HPAGE_PMD_NR normal pages and replaces them with a hugepage allocated from the node of the first scanned normal page. This policy is too coarse and can produce unexpected results for upper-layer users: the original page balance among all nodes is broken once khugepaged starts. Consider the case where the first scanned normal page was allocated from node A but most of the other scanned normal pages were allocated from node B or C. Khugepaged will always allocate the hugepage from node A, which puts extra memory pressure on node A that did not exist before khugepaged started.

This patch tries to fix the problem by making khugepaged allocate the hugepage from the node with the highest count of scanned normal pages, so that the effect on the original page balance is minimized. (The remaining problem is that if the scanned normal pages are distributed equally across nodes A, B and C, node A will still suffer extra memory pressure once khugepaged starts.)

Andrew Davidoff reported a related issue several days ago. He wanted his application to interleave among all nodes, and "numactl --interleave=all ./test" was used to run the testcase, but the result was not as expected.

cat /proc/2814/numa_maps:
7f50bd440000 interleave:0-3 anon=51403 dirty=51403 N0=435 N1=435 N2=435 N3=50098

The result shows that most pages came from node 3 instead of being interleaved among nodes 0-3, which is unreasonable. This patch also fixes that issue by allocating the hugepage round-robin among the nodes that share the same maximum count. After this patch the result is as expected:

7f78399c0000 interleave:0-3 anon=51403 dirty=51403 N0=12723 N1=12723 N2=13235 N3=12722

The simple testcase is:

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main()
{
	char *p;
	int i;
	int j;

	for (i = 0; i < 200; i++) {
		p = (char *)malloc(1048576);
		printf("malloc done\n");
		if (p == 0) {
			printf("Out of memory\n");
			return 1;
		}
		for (j = 0; j < 1048576; j++) {
			p[j] = 'A';
		}
		printf("touched memory\n");
		sleep(1);
	}
	printf("enter sleep\n");
	while (1) {
		sleep(100);
	}
}

[[email protected]: make last_khugepaged_target_node local to khugepaged_find_target_node()]
Reported-by: Andrew Davidoff <[email protected]>
Tested-by: Andrew Davidoff <[email protected]>
Signed-off-by: Bob Liu <[email protected]>
Cc: Andrea Arcangeli <[email protected]>
Cc: Kirill A. Shutemov <[email protected]>
Cc: Mel Gorman <[email protected]>
Cc: Yasuaki Ishimatsu <[email protected]>
Cc: Wanpeng Li <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
1 parent 10dc415 commit 9f1b868
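
A minimal user-space sketch of the node-selection policy described in the commit message above. This is not kernel code: the 4-node setup, the hit counts, and the helper names are made up for illustration, and -1 stands in for NUMA_NO_NODE. It only shows how the "pick the node with the most hits, round-robin among ties" rule behaves:

#include <stdio.h>

#define SIM_MAX_NODES 4	/* assumed 4-node machine, illustration only */

/* Pick the first node with the maximum hit count; when several nodes tie,
 * rotate past the node chosen last time so ties are served round-robin. */
static int find_target_node(const int node_load[], int *last_target)
{
	int nid, target_node = 0, max_value = 0;

	/* find the first node with the maximum number of hits */
	for (nid = 0; nid < SIM_MAX_NODES; nid++)
		if (node_load[nid] > max_value) {
			max_value = node_load[nid];
			target_node = nid;
		}

	/* balance among nodes that share the same hit count */
	if (target_node <= *last_target)
		for (nid = *last_target + 1; nid < SIM_MAX_NODES; nid++)
			if (node_load[nid] == max_value) {
				target_node = nid;
				break;
			}

	*last_target = target_node;
	return target_node;
}

int main(void)
{
	/* all four nodes contributed the same number of scanned pages */
	int node_load[SIM_MAX_NODES] = { 128, 128, 128, 128 };
	int last_target = -1;	/* stands in for NUMA_NO_NODE */
	int i;

	/* successive scans rotate 0, 1, 2, 3, 0, ... instead of always 0 */
	for (i = 0; i < 8; i++)
		printf("scan %d -> node %d\n",
		       i, find_target_node(node_load, &last_target));
	return 0;
}

With equal hit counts the chosen node cycles through 0, 1, 2, 3, which is what restores the interleaving seen in the numa_maps output above; with unequal counts the loop in the first half always wins and the node with the most scanned pages is picked.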

File tree: 1 file changed, +44 -9 lines


mm/huge_memory.c

Lines changed: 44 additions & 9 deletions
@@ -2191,7 +2191,34 @@ static void khugepaged_alloc_sleep(void)
 			msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
 }
 
+static int khugepaged_node_load[MAX_NUMNODES];
+
 #ifdef CONFIG_NUMA
+static int khugepaged_find_target_node(void)
+{
+	static int last_khugepaged_target_node = NUMA_NO_NODE;
+	int nid, target_node = 0, max_value = 0;
+
+	/* find first node with max normal pages hit */
+	for (nid = 0; nid < MAX_NUMNODES; nid++)
+		if (khugepaged_node_load[nid] > max_value) {
+			max_value = khugepaged_node_load[nid];
+			target_node = nid;
+		}
+
+	/* do some balance if several nodes have the same hit record */
+	if (target_node <= last_khugepaged_target_node)
+		for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES;
+				nid++)
+			if (max_value == khugepaged_node_load[nid]) {
+				target_node = nid;
+				break;
+			}
+
+	last_khugepaged_target_node = target_node;
+	return target_node;
+}
+
 static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
 {
 	if (IS_ERR(*hpage)) {
@@ -2225,9 +2252,8 @@ static struct page
 	 * mmap_sem in read mode is good idea also to allow greater
 	 * scalability.
 	 */
-	*hpage = alloc_hugepage_vma(khugepaged_defrag(), vma, address,
-				      node, __GFP_OTHER_NODE);
-
+	*hpage = alloc_pages_exact_node(node, alloc_hugepage_gfpmask(
+		khugepaged_defrag(), __GFP_OTHER_NODE), HPAGE_PMD_ORDER);
 	/*
 	 * After allocating the hugepage, release the mmap_sem read lock in
 	 * preparation for taking it in write mode.
@@ -2243,6 +2269,11 @@ static struct page
 	return *hpage;
 }
 #else
+static int khugepaged_find_target_node(void)
+{
+	return 0;
+}
+
 static inline struct page *alloc_hugepage(int defrag)
 {
 	return alloc_pages(alloc_hugepage_gfpmask(defrag, 0),
@@ -2455,6 +2486,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 	if (pmd_trans_huge(*pmd))
 		goto out;
 
+	memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
 	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
 	for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
 	     _pte++, _address += PAGE_SIZE) {
@@ -2471,12 +2503,13 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 		if (unlikely(!page))
 			goto out_unmap;
 		/*
-		 * Chose the node of the first page. This could
-		 * be more sophisticated and look at more pages,
-		 * but isn't for now.
+		 * Record which node the original page is from and save this
+		 * information to khugepaged_node_load[].
+		 * Khupaged will allocate hugepage from the node has the max
+		 * hit record.
 		 */
-		if (node == NUMA_NO_NODE)
-			node = page_to_nid(page);
+		node = page_to_nid(page);
+		khugepaged_node_load[node]++;
 		VM_BUG_ON(PageCompound(page));
 		if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
 			goto out_unmap;
@@ -2491,9 +2524,11 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 		ret = 1;
 out_unmap:
 	pte_unmap_unlock(pte, ptl);
-	if (ret)
+	if (ret) {
+		node = khugepaged_find_target_node();
 		/* collapse_huge_page will return with the mmap_sem released */
 		collapse_huge_page(mm, address, hpage, vma, node);
+	}
 out:
 	return ret;
 }
