
Commit 953c66c

kvaneesh authored and torvalds committed
mm: THP page cache support for ppc64
Add arch specific callback in the generic THP page cache code that will deposit and withdraw preallocated page table. Archs like ppc64 use this preallocated table to store the hash pte slot information.

Testing: kernel build of the patch series on tmpfs mounted with option huge=always.

The related thp stat:
thp_fault_alloc 72939
thp_fault_fallback 60547
thp_collapse_alloc 603
thp_collapse_alloc_failed 0
thp_file_alloc 253763
thp_file_mapped 4251
thp_split_page 51518
thp_split_page_failed 1
thp_deferred_split_page 73566
thp_split_pmd 665
thp_zero_page_alloc 3
thp_zero_page_alloc_failed 0

[[email protected]: remove unneeded parentheses, per Kirill]
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Aneesh Kumar K.V <[email protected]>
Acked-by: Kirill A. Shutemov <[email protected]>
Cc: Michael Ellerman <[email protected]>
Cc: Benjamin Herrenschmidt <[email protected]>
Cc: Michael Neuling <[email protected]>
Cc: Paul Mackerras <[email protected]>
Cc: Balbir Singh <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
1 parent 1dd38b6 commit 953c66c
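
The mechanism is deliberately small: include/asm-generic/pgtable.h gains a default stating that no deposit is needed, the architecture may override it, and the generic THP page cache paths (do_set_pmd(), zap_huge_pmd(), __split_huge_pmd_locked(), retract_page_tables()) consult the hook before depositing or withdrawing the preallocated page table. The following is a condensed sketch of that contract, assembled from the hunks below rather than copied verbatim:

/* include/asm-generic/pgtable.h: by default no architecture needs a
 * page table deposited behind a file-backed huge PMD.
 */
#ifndef arch_needs_pgtable_deposit
#define arch_needs_pgtable_deposit() (false)
#endif

/* arch/powerpc/include/asm/book3s/64/pgtable.h: the hash MMU stores
 * hash PTE slot information in the deposited table, so ppc64 asks for
 * the deposit whenever the radix MMU is not in use.
 */
#define arch_needs_pgtable_deposit arch_needs_pgtable_deposit
static inline bool arch_needs_pgtable_deposit(void)
{
        if (radix_enabled())
                return false;
        return true;
}

With the hook available to generic code, the THP page cache paths deposit a preallocated page table wherever they establish a file-backed huge PMD and withdraw or zap it wherever they tear one down, which is what allows the !PPC restriction on TRANSPARENT_HUGE_PAGECACHE to be dropped in the mm/Kconfig hunk below.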

6 files changed: 100 additions (+), 17 deletions (-)

arch/powerpc/include/asm/book3s/64/pgtable.h

Lines changed: 10 additions & 0 deletions
@@ -1021,6 +1021,16 @@ static inline int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl,
          */
         return true;
 }
+
+
+#define arch_needs_pgtable_deposit arch_needs_pgtable_deposit
+static inline bool arch_needs_pgtable_deposit(void)
+{
+        if (radix_enabled())
+                return false;
+        return true;
+}
+
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 #endif /* __ASSEMBLY__ */
 #endif /* _ASM_POWERPC_BOOK3S_64_PGTABLE_H_ */

include/asm-generic/pgtable.h

Lines changed: 3 additions & 0 deletions
@@ -652,6 +652,9 @@ static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
 }
 #endif
 
+#ifndef arch_needs_pgtable_deposit
+#define arch_needs_pgtable_deposit() (false)
+#endif
 /*
  * This function is meant to be used by sites walking pagetables with
  * the mmap_sem hold in read mode to protect against MADV_DONTNEED and

mm/Kconfig

Lines changed: 1 addition & 5 deletions
@@ -447,13 +447,9 @@ choice
           benefit.
 endchoice
 
-#
-# We don't deposit page tables on file THP mapping,
-# but Power makes use of them to address MMU quirk.
-#
 config TRANSPARENT_HUGE_PAGECACHE
         def_bool y
-        depends on TRANSPARENT_HUGEPAGE && !PPC
+        depends on TRANSPARENT_HUGEPAGE
 
 #
 # UP and nommu archs use km based percpu allocator

mm/huge_memory.c

Lines changed: 17 additions & 0 deletions
@@ -1380,6 +1380,15 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
         return ret;
 }
 
+static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
+{
+        pgtable_t pgtable;
+
+        pgtable = pgtable_trans_huge_withdraw(mm, pmd);
+        pte_free(mm, pgtable);
+        atomic_long_dec(&mm->nr_ptes);
+}
+
 int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
                  pmd_t *pmd, unsigned long addr)
 {
@@ -1421,6 +1430,8 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
                         atomic_long_dec(&tlb->mm->nr_ptes);
                         add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
                 } else {
+                        if (arch_needs_pgtable_deposit())
+                                zap_deposited_table(tlb->mm, pmd);
                         add_mm_counter(tlb->mm, MM_FILEPAGES, -HPAGE_PMD_NR);
                 }
                 spin_unlock(ptl);
@@ -1607,6 +1618,12 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 
         if (!vma_is_anonymous(vma)) {
                 _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
+                /*
+                 * We are going to unmap this huge page. So
+                 * just go ahead and zap it
+                 */
+                if (arch_needs_pgtable_deposit())
+                        zap_deposited_table(mm, pmd);
                 if (vma_is_dax(vma))
                         return;
                 page = pmd_page(_pmd);

mm/khugepaged.c

Lines changed: 19 additions & 2 deletions
@@ -1242,6 +1242,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
         struct vm_area_struct *vma;
         unsigned long addr;
         pmd_t *pmd, _pmd;
+        bool deposited = false;
 
         i_mmap_lock_write(mapping);
         vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
@@ -1266,10 +1267,26 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
                         spinlock_t *ptl = pmd_lock(vma->vm_mm, pmd);
                         /* assume page table is clear */
                         _pmd = pmdp_collapse_flush(vma, addr, pmd);
+                        /*
+                         * now deposit the pgtable for arch that need it
+                         * otherwise free it.
+                         */
+                        if (arch_needs_pgtable_deposit()) {
+                                /*
+                                 * The deposit should be visibile only after
+                                 * collapse is seen by others.
+                                 */
+                                smp_wmb();
+                                pgtable_trans_huge_deposit(vma->vm_mm, pmd,
+                                                           pmd_pgtable(_pmd));
+                                deposited = true;
+                        }
                         spin_unlock(ptl);
                         up_write(&vma->vm_mm->mmap_sem);
-                        atomic_long_dec(&vma->vm_mm->nr_ptes);
-                        pte_free(vma->vm_mm, pmd_pgtable(_pmd));
+                        if (!deposited) {
+                                atomic_long_dec(&vma->vm_mm->nr_ptes);
+                                pte_free(vma->vm_mm, pmd_pgtable(_pmd));
+                        }
                 }
         }
         i_mmap_unlock_write(mapping);

mm/memory.c

Lines changed: 50 additions & 10 deletions
@@ -2935,6 +2935,19 @@ static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
         return true;
 }
 
+static void deposit_prealloc_pte(struct fault_env *fe)
+{
+        struct vm_area_struct *vma = fe->vma;
+
+        pgtable_trans_huge_deposit(vma->vm_mm, fe->pmd, fe->prealloc_pte);
+        /*
+         * We are going to consume the prealloc table,
+         * count that as nr_ptes.
+         */
+        atomic_long_inc(&vma->vm_mm->nr_ptes);
+        fe->prealloc_pte = 0;
+}
+
 static int do_set_pmd(struct fault_env *fe, struct page *page)
 {
         struct vm_area_struct *vma = fe->vma;
@@ -2949,6 +2962,17 @@ static int do_set_pmd(struct fault_env *fe, struct page *page)
         ret = VM_FAULT_FALLBACK;
         page = compound_head(page);
 
+        /*
+         * Archs like ppc64 need additonal space to store information
+         * related to pte entry. Use the preallocated table for that.
+         */
+        if (arch_needs_pgtable_deposit() && !fe->prealloc_pte) {
+                fe->prealloc_pte = pte_alloc_one(vma->vm_mm, fe->address);
+                if (!fe->prealloc_pte)
+                        return VM_FAULT_OOM;
+                smp_wmb(); /* See comment in __pte_alloc() */
+        }
+
         fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
         if (unlikely(!pmd_none(*fe->pmd)))
                 goto out;
@@ -2962,6 +2986,11 @@ static int do_set_pmd(struct fault_env *fe, struct page *page)
 
         add_mm_counter(vma->vm_mm, MM_FILEPAGES, HPAGE_PMD_NR);
         page_add_file_rmap(page, true);
+        /*
+         * deposit and withdraw with pmd lock held
+         */
+        if (arch_needs_pgtable_deposit())
+                deposit_prealloc_pte(fe);
 
         set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry);
 
@@ -2971,6 +3000,13 @@ static int do_set_pmd(struct fault_env *fe, struct page *page)
         ret = 0;
         count_vm_event(THP_FILE_MAPPED);
 out:
+        /*
+         * If we are going to fallback to pte mapping, do a
+         * withdraw with pmd lock held.
+         */
+        if (arch_needs_pgtable_deposit() && ret == VM_FAULT_FALLBACK)
+                fe->prealloc_pte = pgtable_trans_huge_withdraw(vma->vm_mm,
+                                                               fe->pmd);
         spin_unlock(fe->ptl);
         return ret;
 }
@@ -3010,18 +3046,20 @@ int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg,
 
                 ret = do_set_pmd(fe, page);
                 if (ret != VM_FAULT_FALLBACK)
-                        return ret;
+                        goto fault_handled;
         }
 
         if (!fe->pte) {
                 ret = pte_alloc_one_map(fe);
                 if (ret)
-                        return ret;
+                        goto fault_handled;
         }
 
         /* Re-check under ptl */
-        if (unlikely(!pte_none(*fe->pte)))
-                return VM_FAULT_NOPAGE;
+        if (unlikely(!pte_none(*fe->pte))) {
+                ret = VM_FAULT_NOPAGE;
+                goto fault_handled;
+        }
 
         flush_icache_page(vma, page);
         entry = mk_pte(page, vma->vm_page_prot);
@@ -3041,8 +3079,15 @@ int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg,
 
         /* no need to invalidate: a not-present page won't be cached */
         update_mmu_cache(vma, fe->address, fe->pte);
+        ret = 0;
 
-        return 0;
+fault_handled:
+        /* preallocated pagetable is unused: free it */
+        if (fe->prealloc_pte) {
+                pte_free(fe->vma->vm_mm, fe->prealloc_pte);
+                fe->prealloc_pte = 0;
+        }
+        return ret;
 }
 
 static unsigned long fault_around_bytes __read_mostly =
@@ -3141,11 +3186,6 @@ static int do_fault_around(struct fault_env *fe, pgoff_t start_pgoff)
 
         fe->vma->vm_ops->map_pages(fe, start_pgoff, end_pgoff);
 
-        /* preallocated pagetable is unused: free it */
-        if (fe->prealloc_pte) {
-                pte_free(fe->vma->vm_mm, fe->prealloc_pte);
-                fe->prealloc_pte = 0;
-        }
         /* Huge page is mapped? Page fault is solved */
         if (pmd_trans_huge(*fe->pmd)) {
                 ret = VM_FAULT_NOPAGE;
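
Taken together, the mm/memory.c hunks give fe->prealloc_pte a well-defined lifecycle on the file-THP fault path. The following reading aid condenses that lifecycle out of the do_set_pmd() and alloc_set_pte() changes above; it is not kernel code, and the locking, rmap and counter updates shown in the hunks are elided:

/* 1. do_set_pmd(): preallocate a page table before taking the PMD lock. */
if (arch_needs_pgtable_deposit() && !fe->prealloc_pte) {
        fe->prealloc_pte = pte_alloc_one(vma->vm_mm, fe->address);
        if (!fe->prealloc_pte)
                return VM_FAULT_OOM;
        smp_wmb();      /* see comment in __pte_alloc() */
}

/* 2. do_set_pmd(), PMD lock held: deposit the table next to the huge PMD
 * and account it in nr_ptes; deposit_prealloc_pte() above clears
 * fe->prealloc_pte so the caller will not free it.
 */
if (arch_needs_pgtable_deposit())
        deposit_prealloc_pte(fe);

/* 3. do_set_pmd(), falling back to a PTE mapping: withdraw the table
 * again, still under the PMD lock, so the caller once more owns a table
 * it can either map with or release.
 */
if (arch_needs_pgtable_deposit() && ret == VM_FAULT_FALLBACK)
        fe->prealloc_pte = pgtable_trans_huge_withdraw(vma->vm_mm, fe->pmd);

/* 4. alloc_set_pte(), at the fault_handled label: a table still hanging
 * off fe->prealloc_pte was never consumed, so free it here rather than
 * in do_fault_around() as before.
 */
if (fe->prealloc_pte) {
        pte_free(fe->vma->vm_mm, fe->prealloc_pte);
        fe->prealloc_pte = 0;
}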
