 #include <linux/mmu_notifier.h>
 #include <linux/rmap.h>
 #include <linux/swap.h>
+#include <linux/shrinker.h>
 #include <linux/mm_inline.h>
 #include <linux/kthread.h>
 #include <linux/khugepaged.h>
 #include <linux/freezer.h>
 #include <linux/mman.h>
 #include <linux/pagemap.h>
+
 #include <asm/tlb.h>
 #include <asm/pgalloc.h>
 #include "internal.h"
@@ -47,7 +49,6 @@ static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
 /* during fragmentation poll the hugepage allocator once every minute */
 static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
 static struct task_struct *khugepaged_thread __read_mostly;
-static unsigned long huge_zero_pfn __read_mostly;
 static DEFINE_MUTEX(khugepaged_mutex);
 static DEFINE_SPINLOCK(khugepaged_mm_lock);
 static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
@@ -160,31 +161,74 @@ static int start_khugepaged(void)
         return err;
 }
 
-static int init_huge_zero_pfn(void)
+static atomic_t huge_zero_refcount;
+static unsigned long huge_zero_pfn __read_mostly;
+
+static inline bool is_huge_zero_pfn(unsigned long pfn)
 {
-        struct page *hpage;
-        unsigned long pfn;
+        unsigned long zero_pfn = ACCESS_ONCE(huge_zero_pfn);
+        return zero_pfn && pfn == zero_pfn;
+}
 
-        hpage = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
+static inline bool is_huge_zero_pmd(pmd_t pmd)
+{
+        return is_huge_zero_pfn(pmd_pfn(pmd));
+}
+
+static unsigned long get_huge_zero_page(void)
+{
+        struct page *zero_page;
+retry:
+        if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
+                return ACCESS_ONCE(huge_zero_pfn);
+
+        zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
                         HPAGE_PMD_ORDER);
-        if (!hpage)
-                return -ENOMEM;
-        pfn = page_to_pfn(hpage);
-        if (cmpxchg(&huge_zero_pfn, 0, pfn))
-                __free_page(hpage);
-        return 0;
+        if (!zero_page)
+                return 0;
+        preempt_disable();
+        if (cmpxchg(&huge_zero_pfn, 0, page_to_pfn(zero_page))) {
+                preempt_enable();
+                __free_page(zero_page);
+                goto retry;
+        }
+
+        /* We take additional reference here. It will be put back by shrinker */
+        atomic_set(&huge_zero_refcount, 2);
+        preempt_enable();
+        return ACCESS_ONCE(huge_zero_pfn);
 }
 
-static inline bool is_huge_zero_pfn(unsigned long pfn)
+static void put_huge_zero_page(void)
 {
-        return huge_zero_pfn && pfn == huge_zero_pfn;
+        /*
+         * Counter should never go to zero here. Only shrinker can put
+         * last reference.
+         */
+        BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
 }
 
-static inline bool is_huge_zero_pmd(pmd_t pmd)
+static int shrink_huge_zero_page(struct shrinker *shrink,
+                struct shrink_control *sc)
 {
-        return is_huge_zero_pfn(pmd_pfn(pmd));
+        if (!sc->nr_to_scan)
+                /* we can free zero page only if last reference remains */
+                return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
+
+        if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
+                unsigned long zero_pfn = xchg(&huge_zero_pfn, 0);
+                BUG_ON(zero_pfn == 0);
+                __free_page(__pfn_to_page(zero_pfn));
+        }
+
+        return 0;
 }
 
+static struct shrinker huge_zero_page_shrinker = {
+        .shrink = shrink_huge_zero_page,
+        .seeks = DEFAULT_SEEKS,
+};
+
 #ifdef CONFIG_SYSFS
 
 static ssize_t double_flag_show(struct kobject *kobj,
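
The hunk above is the core of the change: the huge zero page is now allocated lazily on the first read fault, reference-counted by every pmd that maps it, and freed again by a shrinker once only its own reference remains. The following standalone userspace sketch models that protocol with C11 atomics; every name in it (zero_ref, zero_pfn, get_zero, put_zero, shrink_zero) and the calloc/free stand-ins are invented for illustration and are not part of the patch or of any kernel API.

/* Userspace model of the lazy-allocation + refcount protocol (illustrative only). */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

static atomic_uint  zero_ref;   /* plays the role of huge_zero_refcount */
static atomic_ulong zero_pfn;   /* plays the role of huge_zero_pfn; 0 = not allocated */

/* Equivalent of atomic_inc_not_zero(): take a reference only if one already exists. */
static int inc_not_zero(atomic_uint *v)
{
        unsigned int old = atomic_load(v);
        while (old != 0) {
                if (atomic_compare_exchange_weak(v, &old, old + 1))
                        return 1;
        }
        return 0;
}

static unsigned long get_zero(void)
{
        void *page;
        unsigned long expected = 0;

retry:
        if (inc_not_zero(&zero_ref))            /* fast path: page already exists */
                return atomic_load(&zero_pfn);

        page = calloc(1, 4096);                 /* stand-in for alloc_pages(__GFP_ZERO) */
        if (!page)
                return 0;
        if (!atomic_compare_exchange_strong(&zero_pfn, &expected, (unsigned long)page)) {
                free(page);                     /* lost the race: use the winner's page */
                expected = 0;
                goto retry;
        }
        atomic_store(&zero_ref, 2);             /* one ref for the caller, one for the "shrinker" */
        return (unsigned long)page;
}

static void put_zero(void)
{
        /* Users never drop the last reference; only the shrinker does. */
        if (atomic_fetch_sub(&zero_ref, 1) <= 1)
                abort();                        /* models BUG_ON() */
}

static long shrink_zero(void)
{
        unsigned int last = 1;

        /* Free the page only when the shrinker's own reference is the last one. */
        if (atomic_compare_exchange_strong(&zero_ref, &last, 0)) {
                free((void *)atomic_exchange(&zero_pfn, 0));
                return 1;
        }
        return 0;
}

int main(void)
{
        unsigned long a = get_zero();           /* allocates; refcount becomes 2 */
        unsigned long b = get_zero();           /* fast path; refcount becomes 3 */

        printf("same page: %d\n", a == b);
        put_zero();
        put_zero();                             /* back to 1: only the shrinker ref is left */
        printf("page freed by shrinker: %ld\n", shrink_zero());
        return 0;
}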
@@ -576,6 +620,8 @@ static int __init hugepage_init(void)
                 goto out;
         }
 
+        register_shrinker(&huge_zero_page_shrinker);
+
         /*
          * By default disable transparent hugepages on smaller systems,
          * where the extra memory used could hurt more than TLB overhead
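
register_shrinker() hooks the zero page into memory reclaim. With the shrinker API used by this patch (the single .shrink callback, before the later count_objects/scan_objects split), the VM first calls the callback with nr_to_scan == 0 to ask how many objects could be freed, then calls it again with a batch size to do the freeing; that is why shrink_huge_zero_page() above reports HPAGE_PMD_NR in the query pass and drops the page in the scan pass. A minimal userspace model of that two-phase contract follows; model_shrink_control, model_shrink, drive_reclaim and the constant 512 (standing in for HPAGE_PMD_NR) are invented for the sketch.

/* Userspace model of the two-phase shrinker contract (illustrative only). */
#include <stdio.h>

struct model_shrink_control {
        unsigned long nr_to_scan;       /* 0 means "just report how much is freeable" */
};

static int zero_page_present = 1;       /* toy state: one freeable huge page */

static int model_shrink(struct model_shrink_control *sc)
{
        if (!sc->nr_to_scan)
                return zero_page_present ? 512 : 0;     /* query pass: freeable count */

        if (zero_page_present) {                        /* scan pass: actually free */
                zero_page_present = 0;
                printf("freed the zero page\n");
        }
        return 0;
}

/* Rough sketch of how reclaim drives a registered shrinker. */
static void drive_reclaim(void)
{
        struct model_shrink_control sc = { .nr_to_scan = 0 };

        if (model_shrink(&sc) > 0) {    /* anything worth scanning? */
                sc.nr_to_scan = 512;
                model_shrink(&sc);
        }
}

int main(void)
{
        drive_reclaim();
        return 0;
}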
@@ -705,10 +751,11 @@ static inline struct page *alloc_hugepage(int defrag)
 #endif
 
 static void set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
-                struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd)
+                struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
+                unsigned long zero_pfn)
 {
         pmd_t entry;
-        entry = pfn_pmd(huge_zero_pfn, vma->vm_page_prot);
+        entry = pfn_pmd(zero_pfn, vma->vm_page_prot);
         entry = pmd_wrprotect(entry);
         entry = pmd_mkhuge(entry);
         set_pmd_at(mm, haddr, pmd, entry);
@@ -731,15 +778,19 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
                 return VM_FAULT_OOM;
         if (!(flags & FAULT_FLAG_WRITE)) {
                 pgtable_t pgtable;
-                if (unlikely(!huge_zero_pfn && init_huge_zero_pfn())) {
-                        count_vm_event(THP_FAULT_FALLBACK);
-                        goto out;
-                }
+                unsigned long zero_pfn;
                 pgtable = pte_alloc_one(mm, haddr);
                 if (unlikely(!pgtable))
                         return VM_FAULT_OOM;
+                zero_pfn = get_huge_zero_page();
+                if (unlikely(!zero_pfn)) {
+                        pte_free(mm, pgtable);
+                        count_vm_event(THP_FAULT_FALLBACK);
+                        goto out;
+                }
                 spin_lock(&mm->page_table_lock);
-                set_huge_zero_page(pgtable, mm, vma, haddr, pmd);
+                set_huge_zero_page(pgtable, mm, vma, haddr, pmd,
+                                zero_pfn);
                 spin_unlock(&mm->page_table_lock);
                 return 0;
         }
@@ -813,7 +864,15 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
          * a page table.
          */
         if (is_huge_zero_pmd(pmd)) {
-                set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd);
+                unsigned long zero_pfn;
+                /*
+                 * get_huge_zero_page() will never allocate a new page here,
+                 * since we already have a zero page to copy. It just takes a
+                 * reference.
+                 */
+                zero_pfn = get_huge_zero_page();
+                set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
+                                zero_pfn);
                 ret = 0;
                 goto out_unlock;
         }
@@ -923,6 +982,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
         smp_wmb(); /* make pte visible before pmd */
         pmd_populate(mm, pmd, pgtable);
         spin_unlock(&mm->page_table_lock);
+        put_huge_zero_page();
         inc_mm_counter(mm, MM_ANONPAGES);
 
         mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
@@ -1123,9 +1183,10 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
         page_add_new_anon_rmap(new_page, vma, haddr);
         set_pmd_at(mm, haddr, pmd, entry);
         update_mmu_cache_pmd(vma, address, pmd);
-        if (is_huge_zero_pmd(orig_pmd))
+        if (is_huge_zero_pmd(orig_pmd)) {
                 add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
-        else {
+                put_huge_zero_page();
+        } else {
                 VM_BUG_ON(!PageHead(page));
                 page_remove_rmap(page);
                 put_page(page);
@@ -1202,6 +1263,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
         if (is_huge_zero_pmd(orig_pmd)) {
                 tlb->mm->nr_ptes--;
                 spin_unlock(&tlb->mm->page_table_lock);
+                put_huge_zero_page();
         } else {
                 page = pmd_page(orig_pmd);
                 page_remove_rmap(page);
@@ -2511,6 +2573,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
         }
         smp_wmb(); /* make pte visible before pmd */
         pmd_populate(mm, pmd, pgtable);
+        put_huge_zero_page();
 }
 
 void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,