@@ -135,6 +135,10 @@ static struct khugepaged_scan khugepaged_scan = {
 	.mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
 };
 
+static DEFINE_SPINLOCK(split_queue_lock);
+static LIST_HEAD(split_queue);
+static unsigned long split_queue_len;
+static struct shrinker deferred_split_shrinker;
 
 static void set_recommended_min_free_kbytes(void)
 {
@@ -667,6 +671,9 @@ static int __init hugepage_init(void)
 	err = register_shrinker(&huge_zero_page_shrinker);
 	if (err)
 		goto err_hzp_shrinker;
+	err = register_shrinker(&deferred_split_shrinker);
+	if (err)
+		goto err_split_shrinker;
 
 	/*
 	 * By default disable transparent hugepages on smaller systems,
@@ -684,6 +691,8 @@ static int __init hugepage_init(void)
 
 	return 0;
 err_khugepaged:
+	unregister_shrinker(&deferred_split_shrinker);
+err_split_shrinker:
 	unregister_shrinker(&huge_zero_page_shrinker);
 err_hzp_shrinker:
 	khugepaged_slab_exit();
@@ -740,6 +749,27 @@ static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot)
 	return entry;
 }
 
+static inline struct list_head *page_deferred_list(struct page *page)
+{
+	/*
+	 * ->lru in the tail pages is occupied by compound_head.
+	 * Let's use ->mapping + ->index in the second tail page as list_head.
+	 */
+	return (struct list_head *)&page[2].mapping;
+}
+
+void prep_transhuge_page(struct page *page)
+{
+	/*
+	 * we use page->mapping and page->index in second tail page
+	 * as list_head: assuming THP order >= 2
+	 */
+	BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);
+
+	INIT_LIST_HEAD(page_deferred_list(page));
+	set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
+}
+
 static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
 					struct vm_area_struct *vma,
 					unsigned long address, pmd_t *pmd,
@@ -896,6 +926,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		count_vm_event(THP_FAULT_FALLBACK);
 		return VM_FAULT_FALLBACK;
 	}
+	prep_transhuge_page(page);
 	return __do_huge_pmd_anonymous_page(mm, vma, address, pmd, page, gfp,
 					    flags);
 }
@@ -1192,7 +1223,9 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	} else
 		new_page = NULL;
 
-	if (unlikely(!new_page)) {
+	if (likely(new_page)) {
+		prep_transhuge_page(new_page);
+	} else {
 		if (!page) {
 			split_huge_pmd(vma, pmd, address);
 			ret |= VM_FAULT_FALLBACK;
@@ -2109,6 +2142,7 @@ khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm,
 		return NULL;
 	}
 
+	prep_transhuge_page(*hpage);
 	count_vm_event(THP_COLLAPSE_ALLOC);
 	return *hpage;
 }
@@ -2120,8 +2154,12 @@ static int khugepaged_find_target_node(void)
 
 static inline struct page *alloc_hugepage(int defrag)
 {
-	return alloc_pages(alloc_hugepage_gfpmask(defrag, 0),
-			   HPAGE_PMD_ORDER);
+	struct page *page;
+
+	page = alloc_pages(alloc_hugepage_gfpmask(defrag, 0), HPAGE_PMD_ORDER);
+	if (page)
+		prep_transhuge_page(page);
+	return page;
 }
 
 static struct page *khugepaged_alloc_hugepage(bool *wait)
@@ -3098,7 +3136,7 @@ static int __split_huge_page_tail(struct page *head, int tail,
 	set_page_idle(page_tail);
 
 	/* ->mapping in first tail page is compound_mapcount */
-	VM_BUG_ON_PAGE(tail != 1 && page_tail->mapping != TAIL_MAPPING,
+	VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
 			page_tail);
 	page_tail->mapping = head->mapping;
 
@@ -3207,19 +3245,28 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 	freeze_page(anon_vma, head);
 	VM_BUG_ON_PAGE(compound_mapcount(head), head);
 
+	/* Prevent deferred_split_scan() touching ->_count */
+	spin_lock(&split_queue_lock);
 	count = page_count(head);
 	mapcount = total_mapcount(head);
 	if (mapcount == count - 1) {
+		if (!list_empty(page_deferred_list(head))) {
+			split_queue_len--;
+			list_del(page_deferred_list(head));
+		}
+		spin_unlock(&split_queue_lock);
 		__split_huge_page(page, list);
 		ret = 0;
 	} else if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount > count - 1) {
+		spin_unlock(&split_queue_lock);
 		pr_alert("total_mapcount: %u, page_count(): %u\n",
 				mapcount, count);
 		if (PageTail(page))
 			dump_page(head, NULL);
 		dump_page(page, "total_mapcount(head) > page_count(head) - 1");
 		BUG();
 	} else {
+		spin_unlock(&split_queue_lock);
 		unfreeze_page(anon_vma, head);
 		ret = -EBUSY;
 	}
@@ -3231,3 +3278,87 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 	count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
 	return ret;
 }
+
+void free_transhuge_page(struct page *page)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&split_queue_lock, flags);
+	if (!list_empty(page_deferred_list(page))) {
+		split_queue_len--;
+		list_del(page_deferred_list(page));
+	}
+	spin_unlock_irqrestore(&split_queue_lock, flags);
+	free_compound_page(page);
+}
+
+void deferred_split_huge_page(struct page *page)
+{
+	unsigned long flags;
+
+	VM_BUG_ON_PAGE(!PageTransHuge(page), page);
+
+	spin_lock_irqsave(&split_queue_lock, flags);
+	if (list_empty(page_deferred_list(page))) {
+		list_add_tail(page_deferred_list(page), &split_queue);
+		split_queue_len++;
+	}
+	spin_unlock_irqrestore(&split_queue_lock, flags);
+}
+
+static unsigned long deferred_split_count(struct shrinker *shrink,
+		struct shrink_control *sc)
+{
+	/*
+	 * Split a page from split_queue will free up at least one page,
+	 * at most HPAGE_PMD_NR - 1. We don't track exact number.
+	 * Let's use HPAGE_PMD_NR / 2 as ballpark.
+	 */
+	return ACCESS_ONCE(split_queue_len) * HPAGE_PMD_NR / 2;
+}
+
+static unsigned long deferred_split_scan(struct shrinker *shrink,
+		struct shrink_control *sc)
+{
+	unsigned long flags;
+	LIST_HEAD(list), *pos, *next;
+	struct page *page;
+	int split = 0;
+
+	spin_lock_irqsave(&split_queue_lock, flags);
+	list_splice_init(&split_queue, &list);
+
+	/* Take pin on all head pages to avoid freeing them under us */
+	list_for_each_safe(pos, next, &list) {
+		page = list_entry((void *)pos, struct page, mapping);
+		page = compound_head(page);
+		/* race with put_compound_page() */
+		if (!get_page_unless_zero(page)) {
+			list_del_init(page_deferred_list(page));
+			split_queue_len--;
+		}
+	}
+	spin_unlock_irqrestore(&split_queue_lock, flags);
+
+	list_for_each_safe(pos, next, &list) {
+		page = list_entry((void *)pos, struct page, mapping);
+		lock_page(page);
+		/* split_huge_page() removes page from list on success */
+		if (!split_huge_page(page))
+			split++;
+		unlock_page(page);
+		put_page(page);
+	}
+
+	spin_lock_irqsave(&split_queue_lock, flags);
+	list_splice_tail(&list, &split_queue);
+	spin_unlock_irqrestore(&split_queue_lock, flags);
+
+	return split * HPAGE_PMD_NR / 2;
+}
+
+static struct shrinker deferred_split_shrinker = {
+	.count_objects = deferred_split_count,
+	.scan_objects = deferred_split_scan,
+	.seeks = DEFAULT_SEEKS,
+};
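
Note on the layout trick used above: page_deferred_list() works because ->mapping and ->index in the second tail page are adjacent, pointer-sized fields with no other use there, so together they can be overlaid with a struct list_head. That is also why deferred_split_scan() can recover the page with list_entry((void *)pos, struct page, mapping) followed by compound_head(). The user-space sketch below only illustrates that overlay under the same layout assumptions; fake_page, fake_deferred_list() and the hand-rolled list_head/fake_list_add_tail() are hypothetical stand-ins, not the kernel's definitions.

/*
 * User-space sketch (not kernel code): two adjacent pointer-sized fields
 * of a struct are reinterpreted as a struct list_head, so the object can
 * sit on a linked list without a dedicated list field.
 */
#include <stddef.h>
#include <stdio.h>

struct list_head {
	struct list_head *next, *prev;
};

struct fake_page {
	void *mapping;		/* overlaid as list_head.next */
	unsigned long index;	/* overlaid as list_head.prev */
	int id;
};

/* Mirror of the kernel's layout assumption: the two fields must be
 * adjacent and pointer-sized, otherwise the cast below is bogus. */
_Static_assert(offsetof(struct fake_page, index) ==
	       offsetof(struct fake_page, mapping) + sizeof(void *),
	       "mapping and index must be adjacent");
_Static_assert(sizeof(unsigned long) == sizeof(struct list_head *),
	       "index must be pointer-sized");

static struct list_head *fake_deferred_list(struct fake_page *p)
{
	/* same idea as (struct list_head *)&page[2].mapping in the patch */
	return (struct list_head *)&p->mapping;
}

static void fake_list_add_tail(struct list_head *new, struct list_head *head)
{
	new->prev = head->prev;
	new->next = head;
	head->prev->next = new;
	head->prev = new;
}

int main(void)
{
	struct list_head queue = { &queue, &queue };	/* empty circular list */
	struct fake_page pages[3] = { { .id = 0 }, { .id = 1 }, { .id = 2 } };
	struct list_head *pos;

	/* queue the objects without any dedicated list member */
	for (int i = 0; i < 3; i++)
		fake_list_add_tail(fake_deferred_list(&pages[i]), &queue);

	/* walk the queue and recover each object from its embedded node,
	 * as deferred_split_scan() does with list_entry(..., mapping) */
	for (pos = queue.next; pos != &queue; pos = pos->next) {
		struct fake_page *p = (struct fake_page *)
			((char *)pos - offsetof(struct fake_page, mapping));
		printf("queued fake page id=%d\n", p->id);
	}
	return 0;
}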