
Commit 89c83fb

Michal Hocko authored and torvalds committed
mm, thp: consolidate THP gfp handling into alloc_hugepage_direct_gfpmask
THP allocation mode is quite complex and it depends on the defrag mode. This complexity is currently largely hidden inside alloc_hugepage_direct_gfpmask. The NUMA special casing (namely __GFP_THISNODE) is, however, independent and currently lives in alloc_pages_vma. This adds an unnecessary branch to every vma-based page allocation request and makes the code needlessly complex. Not to mention that, e.g., shmem THP used to do the node reclaiming unconditionally regardless of the defrag mode until recently. That was not only unexpected behavior but also hardly a good default, and I strongly suspect it was just a side effect of the code sharing rather than a deliberate decision, which suggests that such a layering is wrong.

Get rid of the THP special casing in alloc_pages_vma and move the logic to alloc_hugepage_direct_gfpmask. __GFP_THISNODE is applied to the resulting gfp mask only when direct reclaim is not requested and when there is no explicit numa binding, to preserve the current logic.

Please note that there is also a slight difference wrt MPOL_BIND now. The previous code would avoid using __GFP_THISNODE if the local node was outside of policy_nodemask(). After this patch __GFP_THISNODE is avoided for all MPOL_BIND policies. So, if the local node is actually allowed by the bind policy's nodemask, previously __GFP_THISNODE would have been added, but now it won't be. From the behavior POV this is still correct because the policy nodemask is used.

Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Michal Hocko <[email protected]>
Acked-by: Vlastimil Babka <[email protected]>
Cc: Alex Williamson <[email protected]>
Cc: Andrea Arcangeli <[email protected]>
Cc: David Rientjes <[email protected]>
Cc: "Kirill A. Shutemov" <[email protected]>
Cc: Mel Gorman <[email protected]>
Cc: Stefan Priebe - Profihost AG <[email protected]>
Cc: Zi Yan <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
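To make the consolidated behavior easier to follow, here is a minimal userspace model of the decision that this patch moves into alloc_hugepage_direct_gfpmask. This is NOT kernel code: the flag values are illustrative placeholders rather than the real gfp bits, and the defrag enum stands in for the sysfs defrag knob. The point it demonstrates is the one stated above: __GFP_THISNODE is ORed in only on paths that do not imply direct reclaim, and only when the VMA has no MPOL_BIND policy.

/*
 * Userspace model of the patched alloc_hugepage_direct_gfpmask() decision.
 * Flag values and the defrag enum are illustrative stand-ins, not kernel bits.
 */
#include <stdbool.h>
#include <stdio.h>

#define GFP_TRANSHUGE_LIGHT	0x01u
#define __GFP_DIRECT_RECLAIM	0x02u
#define __GFP_KSWAPD_RECLAIM	0x04u
#define __GFP_NORETRY		0x08u
#define __GFP_THISNODE		0x10u
#define GFP_TRANSHUGE		(GFP_TRANSHUGE_LIGHT | __GFP_DIRECT_RECLAIM)

enum defrag { ALWAYS, DEFER, DEFER_MADVISE, MADVISE, NEVER };

static unsigned int thp_gfpmask(enum defrag mode, bool vma_madvised, bool mpol_bind)
{
	/* The local-node bias is only used when there is no explicit NUMA binding. */
	unsigned int this_node = mpol_bind ? 0 : __GFP_THISNODE;

	switch (mode) {
	case ALWAYS:		/* direct reclaim requested: no __GFP_THISNODE */
		return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
	case DEFER:
		return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM | this_node;
	case DEFER_MADVISE:
		return GFP_TRANSHUGE_LIGHT | (vma_madvised ?
			__GFP_DIRECT_RECLAIM : __GFP_KSWAPD_RECLAIM | this_node);
	case MADVISE:
		return GFP_TRANSHUGE_LIGHT | (vma_madvised ?
			__GFP_DIRECT_RECLAIM : this_node);
	default:		/* NEVER */
		return GFP_TRANSHUGE_LIGHT | this_node;
	}
}

int main(void)
{
	/* defrag=defer, no MADV_HUGEPAGE, no MPOL_BIND: kswapd reclaim + local-node bias */
	printf("%#x\n", thp_gfpmask(DEFER, false, false));
	/* same VMA but bound with MPOL_BIND: the local-node bias is dropped */
	printf("%#x\n", thp_gfpmask(DEFER, false, true));
	return 0;
}

Under the old layering the equivalent check lived in alloc_pages_vma and compared the local node against policy_nodemask(); after this patch the decision is made once as above, while the bind nodemask still constrains the actual allocation in alloc_pages_vma.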
1 parent 6194ae4 · commit 89c83fb

5 files changed: +40 −77 lines changed


include/linux/gfp.h

Lines changed: 4 additions & 8 deletions
@@ -510,22 +510,18 @@ alloc_pages(gfp_t gfp_mask, unsigned int order)
 }
 extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order,
 			struct vm_area_struct *vma, unsigned long addr,
-			int node, bool hugepage);
-#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \
-	alloc_pages_vma(gfp_mask, order, vma, addr, numa_node_id(), true)
+			int node);
 #else
 #define alloc_pages(gfp_mask, order) \
 		alloc_pages_node(numa_node_id(), gfp_mask, order)
-#define alloc_pages_vma(gfp_mask, order, vma, addr, node, false)\
-	alloc_pages(gfp_mask, order)
-#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \
+#define alloc_pages_vma(gfp_mask, order, vma, addr, node)\
 	alloc_pages(gfp_mask, order)
 #endif
 #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
 #define alloc_page_vma(gfp_mask, vma, addr) \
-	alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id(), false)
+	alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id())
 #define alloc_page_vma_node(gfp_mask, vma, addr, node) \
-	alloc_pages_vma(gfp_mask, 0, vma, addr, node, false)
+	alloc_pages_vma(gfp_mask, 0, vma, addr, node)
 
 extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order);
 extern unsigned long get_zeroed_page(gfp_t gfp_mask);

include/linux/mempolicy.h

Lines changed: 2 additions & 0 deletions
@@ -139,6 +139,8 @@ struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
 struct mempolicy *get_task_policy(struct task_struct *p);
 struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
 		unsigned long addr);
+struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
+		unsigned long addr);
 bool vma_policy_mof(struct vm_area_struct *vma);
 
 extern void numa_default_policy(void);

mm/huge_memory.c

Lines changed: 29 additions & 9 deletions
@@ -629,21 +629,40 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
  *	    available
  * never: never stall for any thp allocation
  */
-static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
+static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma, unsigned long addr)
 {
 	const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
+	gfp_t this_node = 0;
+
+#ifdef CONFIG_NUMA
+	struct mempolicy *pol;
+	/*
+	 * __GFP_THISNODE is used only when __GFP_DIRECT_RECLAIM is not
+	 * specified, to express a general desire to stay on the current
+	 * node for optimistic allocation attempts. If the defrag mode
+	 * and/or madvise hint requires the direct reclaim then we prefer
+	 * to fallback to other node rather than node reclaim because that
+	 * can lead to excessive reclaim even though there is free memory
+	 * on other nodes. We expect that NUMA preferences are specified
+	 * by memory policies.
+	 */
+	pol = get_vma_policy(vma, addr);
+	if (pol->mode != MPOL_BIND)
+		this_node = __GFP_THISNODE;
+	mpol_cond_put(pol);
+#endif
 
 	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
 		return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
 	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
-		return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
+		return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM | this_node;
 	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
 		return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM :
-							     __GFP_KSWAPD_RECLAIM);
+							     __GFP_KSWAPD_RECLAIM | this_node);
 	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
 		return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM :
-							     0);
-	return GFP_TRANSHUGE_LIGHT;
+							     this_node);
+	return GFP_TRANSHUGE_LIGHT | this_node;
 }
 
 /* Caller must hold page table lock. */
@@ -715,8 +734,8 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
 		pte_free(vma->vm_mm, pgtable);
 		return ret;
 	}
-	gfp = alloc_hugepage_direct_gfpmask(vma);
-	page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
+	gfp = alloc_hugepage_direct_gfpmask(vma, haddr);
+	page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, vma, haddr, numa_node_id());
 	if (unlikely(!page)) {
 		count_vm_event(THP_FAULT_FALLBACK);
 		return VM_FAULT_FALLBACK;
@@ -1286,8 +1305,9 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
 alloc:
 	if (transparent_hugepage_enabled(vma) &&
 	    !transparent_hugepage_debug_cow()) {
-		huge_gfp = alloc_hugepage_direct_gfpmask(vma);
-		new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
+		huge_gfp = alloc_hugepage_direct_gfpmask(vma, haddr);
+		new_page = alloc_pages_vma(huge_gfp, HPAGE_PMD_ORDER, vma,
+					   haddr, numa_node_id());
 	} else
 		new_page = NULL;
 
mm/mempolicy.c

Lines changed: 4 additions & 59 deletions
@@ -1116,8 +1116,8 @@ static struct page *new_page(struct page *page, unsigned long start)
 	} else if (PageTransHuge(page)) {
 		struct page *thp;
 
-		thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address,
-					 HPAGE_PMD_ORDER);
+		thp = alloc_pages_vma(GFP_TRANSHUGE, HPAGE_PMD_ORDER, vma,
+				address, numa_node_id());
 		if (!thp)
 			return NULL;
 		prep_transhuge_page(thp);
@@ -1662,7 +1662,7 @@ struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
  * freeing by another task. It is the caller's responsibility to free the
  * extra reference for shared policies.
  */
-static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
+struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
 						unsigned long addr)
 {
 	struct mempolicy *pol = __get_vma_policy(vma, addr);
@@ -2011,7 +2011,6 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
  * @vma:  Pointer to VMA or NULL if not available.
  * @addr: Virtual Address of the allocation. Must be inside the VMA.
  * @node: Which node to prefer for allocation (modulo policy).
- * @hugepage: for hugepages try only the preferred node if possible
  *
  * This function allocates a page from the kernel page pool and applies
  * a NUMA policy associated with the VMA or the current process.
@@ -2022,7 +2021,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
  */
 struct page *
 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
-		unsigned long addr, int node, bool hugepage)
+		unsigned long addr, int node)
 {
 	struct mempolicy *pol;
 	struct page *page;
@@ -2040,60 +2039,6 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
 		goto out;
 	}
 
-	if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
-		int hpage_node = node;
-
-		/*
-		 * For hugepage allocation and non-interleave policy which
-		 * allows the current node (or other explicitly preferred
-		 * node) we only try to allocate from the current/preferred
-		 * node and don't fall back to other nodes, as the cost of
-		 * remote accesses would likely offset THP benefits.
-		 *
-		 * If the policy is interleave, or does not allow the current
-		 * node in its nodemask, we allocate the standard way.
-		 */
-		if (pol->mode == MPOL_PREFERRED &&
-						!(pol->flags & MPOL_F_LOCAL))
-			hpage_node = pol->v.preferred_node;
-
-		nmask = policy_nodemask(gfp, pol);
-		if (!nmask || node_isset(hpage_node, *nmask)) {
-			mpol_cond_put(pol);
-			/*
-			 * We cannot invoke reclaim if __GFP_THISNODE
-			 * is set. Invoking reclaim with
-			 * __GFP_THISNODE set, would cause THP
-			 * allocations to trigger heavy swapping
-			 * despite there may be tons of free memory
-			 * (including potentially plenty of THP
-			 * already available in the buddy) on all the
-			 * other NUMA nodes.
-			 *
-			 * At most we could invoke compaction when
-			 * __GFP_THISNODE is set (but we would need to
-			 * refrain from invoking reclaim even if
-			 * compaction returned COMPACT_SKIPPED because
-			 * there wasn't not enough memory to succeed
-			 * compaction). For now just avoid
-			 * __GFP_THISNODE instead of limiting the
-			 * allocation path to a strict and single
-			 * compaction invocation.
-			 *
-			 * Supposedly if direct reclaim was enabled by
-			 * the caller, the app prefers THP regardless
-			 * of the node it comes from so this would be
-			 * more desiderable behavior than only
-			 * providing THP originated from the local
-			 * node in such case.
-			 */
-			if (!(gfp & __GFP_DIRECT_RECLAIM))
-				gfp |= __GFP_THISNODE;
-			page = __alloc_pages_node(hpage_node, gfp, order);
-			goto out;
-		}
-	}
-
 	nmask = policy_nodemask(gfp, pol);
 	preferred_nid = policy_node(gfp, pol, node);
 	page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask);

mm/shmem.c

Lines changed: 1 addition & 1 deletion
@@ -1435,7 +1435,7 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp,
 
 	shmem_pseudo_vma_init(&pvma, info, hindex);
 	page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN,
-			HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), true);
+			HPAGE_PMD_ORDER, &pvma, 0, numa_node_id());
 	shmem_pseudo_vma_destroy(&pvma);
 	if (page)
 		prep_transhuge_page(page);
