Skip to content

Commit 2f0799a

Browse files
rientjes authored and torvalds committed
mm, thp: restore node-local hugepage allocations
This is a full revert of ac5b2c1 ("mm: thp: relax __GFP_THISNODE for MADV_HUGEPAGE mappings") and a partial revert of 89c83fb ("mm, thp: consolidate THP gfp handling into alloc_hugepage_direct_gfpmask"). By not setting __GFP_THISNODE, applications can allocate remote hugepages when the local node is fragmented or low on memory when either the thp defrag setting is "always" or the vma has been madvised with MADV_HUGEPAGE. Remote access to hugepages often has much higher latency than local pages of the native page size. On Haswell, ac5b2c1 was shown to have a 13.9% access regression after this commit for binaries that remap their text segment to be backed by transparent hugepages. The intent of ac5b2c1 is to address an issue where a local node is low on memory or fragmented such that a hugepage cannot be allocated. In every scenario where this was described as a fix, there is abundant and unfragmented remote memory available to allocate from, even with a greater access latency. If remote memory is also low or fragmented, not setting __GFP_THISNODE was also measured on Haswell to have a 40% regression in allocation latency. Restore __GFP_THISNODE for thp allocations. Fixes: ac5b2c1 ("mm: thp: relax __GFP_THISNODE for MADV_HUGEPAGE mappings") Fixes: 89c83fb ("mm, thp: consolidate THP gfp handling into alloc_hugepage_direct_gfpmask") Cc: Andrea Arcangeli <[email protected]> Cc: Mel Gorman <[email protected]> Cc: Vlastimil Babka <[email protected]> Cc: Michal Hocko <[email protected]> Cc: Andrew Morton <[email protected]> Signed-off-by: David Rientjes <[email protected]> Signed-off-by: Linus Torvalds <[email protected]>
1 parent 4eaaa2b commit 2f0799a

File tree

3 files changed

+17
-29
lines changed

3 files changed

+17
-29
lines changed

include/linux/mempolicy.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -139,8 +139,6 @@ struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
139139
struct mempolicy *get_task_policy(struct task_struct *p);
140140
struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
141141
unsigned long addr);
142-
struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
143-
unsigned long addr);
144142
bool vma_policy_mof(struct vm_area_struct *vma);
145143

146144
extern void numa_default_policy(void);

mm/huge_memory.c

Lines changed: 16 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -632,37 +632,27 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
632632
static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma, unsigned long addr)
633633
{
634634
const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
635-
gfp_t this_node = 0;
636-
637-
#ifdef CONFIG_NUMA
638-
struct mempolicy *pol;
639-
/*
640-
* __GFP_THISNODE is used only when __GFP_DIRECT_RECLAIM is not
641-
* specified, to express a general desire to stay on the current
642-
* node for optimistic allocation attempts. If the defrag mode
643-
* and/or madvise hint requires the direct reclaim then we prefer
644-
* to fallback to other node rather than node reclaim because that
645-
* can lead to excessive reclaim even though there is free memory
646-
* on other nodes. We expect that NUMA preferences are specified
647-
* by memory policies.
648-
*/
649-
pol = get_vma_policy(vma, addr);
650-
if (pol->mode != MPOL_BIND)
651-
this_node = __GFP_THISNODE;
652-
mpol_cond_put(pol);
653-
#endif
635+
const gfp_t gfp_mask = GFP_TRANSHUGE_LIGHT | __GFP_THISNODE;
654636

637+
/* Always do synchronous compaction */
655638
if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
656-
return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
639+
return GFP_TRANSHUGE | __GFP_THISNODE |
640+
(vma_madvised ? 0 : __GFP_NORETRY);
641+
642+
/* Kick kcompactd and fail quickly */
657643
if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
658-
return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM | this_node;
644+
return gfp_mask | __GFP_KSWAPD_RECLAIM;
645+
646+
/* Synchronous compaction if madvised, otherwise kick kcompactd */
659647
if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
660-
return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM :
661-
__GFP_KSWAPD_RECLAIM | this_node);
648+
return gfp_mask | (vma_madvised ? __GFP_DIRECT_RECLAIM :
649+
__GFP_KSWAPD_RECLAIM);
650+
651+
/* Only do synchronous compaction if madvised */
662652
if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
663-
return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM :
664-
this_node);
665-
return GFP_TRANSHUGE_LIGHT | this_node;
653+
return gfp_mask | (vma_madvised ? __GFP_DIRECT_RECLAIM : 0);
654+
655+
return gfp_mask;
666656
}
667657

668658
/* Caller must hold page table lock. */

mm/mempolicy.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1662,7 +1662,7 @@ struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
16621662
* freeing by another task. It is the caller's responsibility to free the
16631663
* extra reference for shared policies.
16641664
*/
1665-
struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1665+
static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
16661666
unsigned long addr)
16671667
{
16681668
struct mempolicy *pol = __get_vma_policy(vma, addr);

0 commit comments

Comments (0)