Commit 97ae174

kiryl authored and torvalds committed
thp: implement refcounting for huge zero page
H. Peter Anvin doesn't like the huge zero page sticking in memory forever after the first allocation. Here's an implementation of lockless refcounting for the huge zero page.

We have two basic primitives: {get,put}_huge_zero_page(). They manipulate a reference counter. If the counter is 0, get_huge_zero_page() allocates a new huge page and takes two references: one for the caller and one for the shrinker. We free the page only in the shrinker callback, and only if the counter is 1 (meaning only the shrinker holds a reference).

put_huge_zero_page() only decrements the counter. The counter never reaches zero in put_huge_zero_page(), since the shrinker holds its own reference. Freeing the huge zero page from the shrinker callback avoids frequent allocate-free cycles.

Refcounting has a cost. On a 4-socket machine I observe a ~1% slowdown on parallel (40 processes) read page faulting compared to lazy huge page allocation. I think that's pretty reasonable for a synthetic benchmark.

[[email protected]: fix mismerge]
Signed-off-by: Kirill A. Shutemov <[email protected]>
Cc: Andrea Arcangeli <[email protected]>
Cc: Andi Kleen <[email protected]>
Cc: "H. Peter Anvin" <[email protected]>
Cc: Mel Gorman <[email protected]>
Cc: David Rientjes <[email protected]>
Signed-off-by: Bob Liu <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
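To make the refcounting protocol above easier to follow outside the kernel, here is a minimal userspace sketch of the same lockless scheme using C11 atomics. The names zero_buf, get_zero_buf(), put_zero_buf() and shrink_zero_buf() are illustrative stand-ins, not kernel APIs; the authoritative implementation is the mm/huge_memory.c diff below.

/*
 * Userspace sketch of the lockless refcounting protocol (illustration only).
 * zero_buf stands in for the huge zero page; shrink_zero_buf() plays the role
 * of the shrinker callback that would run under memory pressure.
 */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

#define ZERO_BUF_SIZE (2u * 1024 * 1024)	/* stand-in for a 2M huge page */

static atomic_uint zero_refcount;		/* 0 = no buffer allocated */
static _Atomic(void *) zero_buf;

static void *get_zero_buf(void)
{
	void *buf, *expected;
	unsigned int cnt;

retry:
	/* Fast path: take a reference only if the count is already non-zero. */
	cnt = atomic_load(&zero_refcount);
	while (cnt != 0) {
		if (atomic_compare_exchange_weak(&zero_refcount, &cnt, cnt + 1))
			return atomic_load(&zero_buf);
	}

	/* Slow path: allocate a zeroed buffer and try to publish it. */
	buf = calloc(1, ZERO_BUF_SIZE);
	if (!buf)
		return NULL;
	expected = NULL;
	if (!atomic_compare_exchange_strong(&zero_buf, &expected, buf)) {
		free(buf);	/* another thread won the race; retry like the kernel does */
		goto retry;
	}
	/* Two references: one for the caller, one kept back for the "shrinker". */
	atomic_store(&zero_refcount, 2);
	return buf;
}

static void put_zero_buf(void)
{
	/* Never drops to zero here: the "shrinker" holds the last reference. */
	atomic_fetch_sub(&zero_refcount, 1);
}

/* Free the buffer only if the last (shrinker-held) reference remains. */
static void shrink_zero_buf(void)
{
	unsigned int one = 1;

	if (atomic_compare_exchange_strong(&zero_refcount, &one, 0))
		free(atomic_exchange(&zero_buf, NULL));
}

int main(void)
{
	void *p = get_zero_buf();	/* count becomes 2 */

	if (!p)
		return 1;
	printf("zero buf at %p, first byte is %d\n", p, ((unsigned char *)p)[0]);
	put_zero_buf();			/* count back to 1 */
	shrink_zero_buf();		/* only the "shrinker" reference left: freed */
	return 0;
}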
1 parent 78ca0e6

File tree: 1 file changed (+88, -25)


mm/huge_memory.c

Lines changed: 88 additions & 25 deletions
@@ -12,12 +12,14 @@
 #include <linux/mmu_notifier.h>
 #include <linux/rmap.h>
 #include <linux/swap.h>
+#include <linux/shrinker.h>
 #include <linux/mm_inline.h>
 #include <linux/kthread.h>
 #include <linux/khugepaged.h>
 #include <linux/freezer.h>
 #include <linux/mman.h>
 #include <linux/pagemap.h>
+
 #include <asm/tlb.h>
 #include <asm/pgalloc.h>
 #include "internal.h"
@@ -47,7 +49,6 @@ static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
 /* during fragmentation poll the hugepage allocator once every minute */
 static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
 static struct task_struct *khugepaged_thread __read_mostly;
-static unsigned long huge_zero_pfn __read_mostly;
 static DEFINE_MUTEX(khugepaged_mutex);
 static DEFINE_SPINLOCK(khugepaged_mm_lock);
 static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
@@ -160,31 +161,74 @@ static int start_khugepaged(void)
 	return err;
 }
 
-static int init_huge_zero_pfn(void)
+static atomic_t huge_zero_refcount;
+static unsigned long huge_zero_pfn __read_mostly;
+
+static inline bool is_huge_zero_pfn(unsigned long pfn)
 {
-	struct page *hpage;
-	unsigned long pfn;
+	unsigned long zero_pfn = ACCESS_ONCE(huge_zero_pfn);
+	return zero_pfn && pfn == zero_pfn;
+}
 
-	hpage = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
+static inline bool is_huge_zero_pmd(pmd_t pmd)
+{
+	return is_huge_zero_pfn(pmd_pfn(pmd));
+}
+
+static unsigned long get_huge_zero_page(void)
+{
+	struct page *zero_page;
+retry:
+	if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
+		return ACCESS_ONCE(huge_zero_pfn);
+
+	zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
 			HPAGE_PMD_ORDER);
-	if (!hpage)
-		return -ENOMEM;
-	pfn = page_to_pfn(hpage);
-	if (cmpxchg(&huge_zero_pfn, 0, pfn))
-		__free_page(hpage);
-	return 0;
+	if (!zero_page)
+		return 0;
+	preempt_disable();
+	if (cmpxchg(&huge_zero_pfn, 0, page_to_pfn(zero_page))) {
+		preempt_enable();
+		__free_page(zero_page);
+		goto retry;
+	}
+
+	/* We take additional reference here. It will be put back by shrinker */
+	atomic_set(&huge_zero_refcount, 2);
+	preempt_enable();
+	return ACCESS_ONCE(huge_zero_pfn);
 }
 
-static inline bool is_huge_zero_pfn(unsigned long pfn)
+static void put_huge_zero_page(void)
 {
-	return huge_zero_pfn && pfn == huge_zero_pfn;
+	/*
+	 * Counter should never go to zero here. Only shrinker can put
+	 * last reference.
+	 */
+	BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
 }
 
-static inline bool is_huge_zero_pmd(pmd_t pmd)
+static int shrink_huge_zero_page(struct shrinker *shrink,
+		struct shrink_control *sc)
 {
-	return is_huge_zero_pfn(pmd_pfn(pmd));
+	if (!sc->nr_to_scan)
+		/* we can free zero page only if last reference remains */
+		return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
+
+	if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
+		unsigned long zero_pfn = xchg(&huge_zero_pfn, 0);
+		BUG_ON(zero_pfn == 0);
+		__free_page(__pfn_to_page(zero_pfn));
+	}
+
+	return 0;
 }
 
+static struct shrinker huge_zero_page_shrinker = {
+	.shrink = shrink_huge_zero_page,
+	.seeks = DEFAULT_SEEKS,
+};
+
 #ifdef CONFIG_SYSFS
 
 static ssize_t double_flag_show(struct kobject *kobj,
@@ -576,6 +620,8 @@ static int __init hugepage_init(void)
 		goto out;
 	}
 
+	register_shrinker(&huge_zero_page_shrinker);
+
 	/*
 	 * By default disable transparent hugepages on smaller systems,
 	 * where the extra memory used could hurt more than TLB overhead
@@ -705,10 +751,11 @@ static inline struct page *alloc_hugepage(int defrag)
 #endif
 
 static void set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
-		struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd)
+		struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
+		unsigned long zero_pfn)
 {
 	pmd_t entry;
-	entry = pfn_pmd(huge_zero_pfn, vma->vm_page_prot);
+	entry = pfn_pmd(zero_pfn, vma->vm_page_prot);
 	entry = pmd_wrprotect(entry);
 	entry = pmd_mkhuge(entry);
 	set_pmd_at(mm, haddr, pmd, entry);
@@ -731,15 +778,19 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		return VM_FAULT_OOM;
 	if (!(flags & FAULT_FLAG_WRITE)) {
 		pgtable_t pgtable;
-		if (unlikely(!huge_zero_pfn && init_huge_zero_pfn())) {
-			count_vm_event(THP_FAULT_FALLBACK);
-			goto out;
-		}
+		unsigned long zero_pfn;
 		pgtable = pte_alloc_one(mm, haddr);
 		if (unlikely(!pgtable))
 			return VM_FAULT_OOM;
+		zero_pfn = get_huge_zero_page();
+		if (unlikely(!zero_pfn)) {
+			pte_free(mm, pgtable);
+			count_vm_event(THP_FAULT_FALLBACK);
+			goto out;
+		}
 		spin_lock(&mm->page_table_lock);
-		set_huge_zero_page(pgtable, mm, vma, haddr, pmd);
+		set_huge_zero_page(pgtable, mm, vma, haddr, pmd,
+				zero_pfn);
 		spin_unlock(&mm->page_table_lock);
 		return 0;
 	}
@@ -813,7 +864,15 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	 * a page table.
 	 */
 	if (is_huge_zero_pmd(pmd)) {
-		set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd);
+		unsigned long zero_pfn;
+		/*
+		 * get_huge_zero_page() will never allocate a new page here,
+		 * since we already have a zero page to copy. It just takes a
+		 * reference.
+		 */
+		zero_pfn = get_huge_zero_page();
+		set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
+				zero_pfn);
 		ret = 0;
 		goto out_unlock;
 	}
@@ -923,6 +982,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
 	smp_wmb(); /* make pte visible before pmd */
 	pmd_populate(mm, pmd, pgtable);
 	spin_unlock(&mm->page_table_lock);
+	put_huge_zero_page();
 	inc_mm_counter(mm, MM_ANONPAGES);
 
 	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
@@ -1123,9 +1183,10 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	page_add_new_anon_rmap(new_page, vma, haddr);
 	set_pmd_at(mm, haddr, pmd, entry);
 	update_mmu_cache_pmd(vma, address, pmd);
-	if (is_huge_zero_pmd(orig_pmd))
+	if (is_huge_zero_pmd(orig_pmd)) {
 		add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
-	else {
+		put_huge_zero_page();
+	} else {
 		VM_BUG_ON(!PageHead(page));
 		page_remove_rmap(page);
 		put_page(page);
@@ -1202,6 +1263,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 	if (is_huge_zero_pmd(orig_pmd)) {
 		tlb->mm->nr_ptes--;
 		spin_unlock(&tlb->mm->page_table_lock);
+		put_huge_zero_page();
 	} else {
 		page = pmd_page(orig_pmd);
 		page_remove_rmap(page);
@@ -2511,6 +2573,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
 	}
 	smp_wmb(); /* make pte visible before pmd */
 	pmd_populate(mm, pmd, pgtable);
+	put_huge_zero_page();
 }
 
 void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
