@@ -47,6 +47,20 @@
 #include <asm/kvm_page_track.h>
 #include "trace.h"
 
+extern bool itlb_multihit_kvm_mitigation;
+
+static int __read_mostly nx_huge_pages = -1;
+
+static int set_nx_huge_pages(const char *val, const struct kernel_param *kp);
+
+static struct kernel_param_ops nx_huge_pages_ops = {
+	.set = set_nx_huge_pages,
+	.get = param_get_bool,
+};
+
+module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644);
+__MODULE_PARM_TYPE(nx_huge_pages, "bool");
+
 /*
  * When setting this variable to true it enables Two-Dimensional-Paging
  * where the hardware walks 2 page tables:
@@ -352,6 +366,11 @@ static inline bool spte_ad_need_write_protect(u64 spte)
 	return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_ENABLED_MASK;
 }
 
+static bool is_nx_huge_page_enabled(void)
+{
+	return READ_ONCE(nx_huge_pages);
+}
+
 static inline u64 spte_shadow_accessed_mask(u64 spte)
 {
 	MMU_WARN_ON(is_mmio_spte(spte));
@@ -1190,6 +1209,15 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
 	kvm_mmu_gfn_disallow_lpage(slot, gfn);
 }
 
+static void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+	if (sp->lpage_disallowed)
+		return;
+
+	++kvm->stat.nx_lpage_splits;
+	sp->lpage_disallowed = true;
+}
+
 static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
 	struct kvm_memslots *slots;
@@ -1207,6 +1235,12 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
 	kvm_mmu_gfn_allow_lpage(slot, gfn);
 }
 
+static void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+	--kvm->stat.nx_lpage_splits;
+	sp->lpage_disallowed = false;
+}
+
 static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level,
 					  struct kvm_memory_slot *slot)
 {
@@ -2792,6 +2826,9 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
 		kvm_reload_remote_mmus(kvm);
 	}
 
+	if (sp->lpage_disallowed)
+		unaccount_huge_nx_page(kvm, sp);
+
 	sp->role.invalid = 1;
 	return list_unstable;
 }
@@ -3013,6 +3050,11 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 	if (!speculative)
 		spte |= spte_shadow_accessed_mask(spte);
 
+	if (level > PT_PAGE_TABLE_LEVEL && (pte_access & ACC_EXEC_MASK) &&
+	    is_nx_huge_page_enabled()) {
+		pte_access &= ~ACC_EXEC_MASK;
+	}
+
 	if (pte_access & ACC_EXEC_MASK)
 		spte |= shadow_x_mask;
 	else
@@ -3233,9 +3275,32 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
 	__direct_pte_prefetch(vcpu, sp, sptep);
 }
 
+static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it,
+				       gfn_t gfn, kvm_pfn_t *pfnp, int *levelp)
+{
+	int level = *levelp;
+	u64 spte = *it.sptep;
+
+	if (it.level == level && level > PT_PAGE_TABLE_LEVEL &&
+	    is_nx_huge_page_enabled() &&
+	    is_shadow_present_pte(spte) &&
+	    !is_large_pte(spte)) {
+		/*
+		 * A small SPTE exists for this pfn, but FNAME(fetch)
+		 * and __direct_map would like to create a large PTE
+		 * instead: just force them to go down another level,
+		 * patching back for them into pfn the next 9 bits of
+		 * the address.
+		 */
+		u64 page_mask = KVM_PAGES_PER_HPAGE(level) - KVM_PAGES_PER_HPAGE(level - 1);
+		*pfnp |= gfn & page_mask;
+		(*levelp)--;
+	}
+}
+
 static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
 			int map_writable, int level, kvm_pfn_t pfn,
-			bool prefault)
+			bool prefault, bool lpage_disallowed)
 {
 	struct kvm_shadow_walk_iterator it;
 	struct kvm_mmu_page *sp;
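The pfn patching in disallowed_hugepage_adjust() above relies on KVM_PAGES_PER_HPAGE(level) - KVM_PAGES_PER_HPAGE(level - 1) being a mask of exactly the 9 gfn bits that distinguish the smaller pages inside one huge page. The following is a standalone sketch, not part of the patch; PAGES_PER_HPAGE and PT_PAGE_TABLE_LEVEL below are local stand-ins assuming x86's 512-entries-per-table layout:

/*
 * Illustration of the pfn adjustment: when the walk is forced down one
 * level, the next 9 bits of the gfn are folded back into the pfn.
 */
#include <stdint.h>
#include <stdio.h>

#define PT_PAGE_TABLE_LEVEL	1
/* 4 KiB base pages, 9 address bits consumed per paging level (assumption) */
#define PAGES_PER_HPAGE(level)	(1ULL << (((level) - 1) * 9))

int main(void)
{
	uint64_t gfn = 0x12345;			/* faulting guest frame number */
	uint64_t pfn = gfn & ~0x1ffULL;		/* host pfn aligned for a 2M mapping */
	int level = PT_PAGE_TABLE_LEVEL + 1;	/* 2M level */

	/*
	 * Same arithmetic as the patch: the difference of the two page counts
	 * is a mask covering exactly the 9 bits that select a 4K page inside
	 * one 2M region (512 - 1 == 0x1ff).
	 */
	uint64_t page_mask = PAGES_PER_HPAGE(level) - PAGES_PER_HPAGE(level - 1);

	pfn |= gfn & page_mask;		/* restore the low 9 gfn bits */
	level--;			/* continue the walk one level down */

	printf("level=%d pfn=0x%llx (page_mask=0x%llx)\n",
	       level, (unsigned long long)pfn, (unsigned long long)page_mask);
	return 0;
}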
@@ -3248,6 +3313,12 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
 
 	trace_kvm_mmu_spte_requested(gpa, level, pfn);
 	for_each_shadow_entry(vcpu, gpa, it) {
+		/*
+		 * We cannot overwrite existing page tables with an NX
+		 * large page, as the leaf could be executable.
+		 */
+		disallowed_hugepage_adjust(it, gfn, &pfn, &level);
+
 		base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
 		if (it.level == level)
 			break;
@@ -3258,6 +3329,8 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
 					      it.level - 1, true, ACC_ALL);
 
 			link_shadow_page(vcpu, it.sptep, sp);
+			if (lpage_disallowed)
+				account_huge_nx_page(vcpu->kvm, sp);
 		}
 	}
 
@@ -3550,11 +3623,14 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
 {
 	int r;
 	int level;
-	bool force_pt_level = false;
+	bool force_pt_level;
 	kvm_pfn_t pfn;
 	unsigned long mmu_seq;
 	bool map_writable, write = error_code & PFERR_WRITE_MASK;
+	bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
+				is_nx_huge_page_enabled();
 
+	force_pt_level = lpage_disallowed;
 	level = mapping_level(vcpu, gfn, &force_pt_level);
 	if (likely(!force_pt_level)) {
 		/*
@@ -3588,7 +3664,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
 		goto out_unlock;
 	if (likely(!force_pt_level))
 		transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
-	r = __direct_map(vcpu, v, write, map_writable, level, pfn, prefault);
+	r = __direct_map(vcpu, v, write, map_writable, level, pfn,
+			 prefault, false);
 out_unlock:
 	spin_unlock(&vcpu->kvm->mmu_lock);
 	kvm_release_pfn_clean(pfn);
@@ -4174,6 +4251,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 	unsigned long mmu_seq;
 	int write = error_code & PFERR_WRITE_MASK;
 	bool map_writable;
+	bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
+				is_nx_huge_page_enabled();
 
 	MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa));
 
@@ -4184,8 +4263,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 	if (r)
 		return r;
 
-	force_pt_level = !check_hugepage_cache_consistency(vcpu, gfn,
-							   PT_DIRECTORY_LEVEL);
+	force_pt_level =
+		lpage_disallowed ||
+		!check_hugepage_cache_consistency(vcpu, gfn, PT_DIRECTORY_LEVEL);
 	level = mapping_level(vcpu, gfn, &force_pt_level);
 	if (likely(!force_pt_level)) {
 		if (level > PT_DIRECTORY_LEVEL &&
@@ -4214,7 +4294,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 		goto out_unlock;
 	if (likely(!force_pt_level))
 		transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
-	r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, prefault);
+	r = __direct_map(vcpu, gpa, write, map_writable, level, pfn,
+			 prefault, lpage_disallowed);
 out_unlock:
 	spin_unlock(&vcpu->kvm->mmu_lock);
 	kvm_release_pfn_clean(pfn);
@@ -6155,10 +6236,58 @@ static void kvm_set_mmio_spte_mask(void)
 	kvm_mmu_set_mmio_spte_mask(mask, mask, ACC_WRITE_MASK | ACC_USER_MASK);
 }
 
+static bool get_nx_auto_mode(void)
+{
+	/* Return true when CPU has the bug, and mitigations are ON */
+	return boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT) && !cpu_mitigations_off();
+}
+
+static void __set_nx_huge_pages(bool val)
+{
+	nx_huge_pages = itlb_multihit_kvm_mitigation = val;
+}
+
+static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
+{
+	bool old_val = nx_huge_pages;
+	bool new_val;
+
+	/* In "auto" mode deploy workaround only if CPU has the bug. */
+	if (sysfs_streq(val, "off"))
+		new_val = 0;
+	else if (sysfs_streq(val, "force"))
+		new_val = 1;
+	else if (sysfs_streq(val, "auto"))
+		new_val = get_nx_auto_mode();
+	else if (strtobool(val, &new_val) < 0)
+		return -EINVAL;
+
+	__set_nx_huge_pages(new_val);
+
+	if (new_val != old_val) {
+		struct kvm *kvm;
+		int idx;
+
+		mutex_lock(&kvm_lock);
+
+		list_for_each_entry(kvm, &vm_list, vm_list) {
+			idx = srcu_read_lock(&kvm->srcu);
+			kvm_mmu_zap_all_fast(kvm);
+			srcu_read_unlock(&kvm->srcu, idx);
+		}
+		mutex_unlock(&kvm_lock);
+	}
+
+	return 0;
+}
+
 int kvm_mmu_module_init(void)
 {
 	int ret = -ENOMEM;
 
+	if (nx_huge_pages == -1)
+		__set_nx_huge_pages(get_nx_auto_mode());
+
 	/*
 	 * MMU roles use union aliasing which is, generally speaking, an
 	 * undefined behavior. However, we supposedly know how compilers behave
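Because module_param_cb() in the first hunk registers the knob with mode 0644, the setting should be reachable from userspace as a kvm module parameter, presumably at /sys/module/kvm/parameters/nx_huge_pages, accepting "off", "force", "auto" or a plain boolean as handled by set_nx_huge_pages() above. A minimal, hypothetical userspace helper (the sysfs path and accepted values are assumptions inferred from the patch, not verified here):

/* Read the current nx_huge_pages setting and optionally request a new one. */
#include <stdio.h>

/* Assumed sysfs location of the parameter added by this patch. */
#define NX_HUGE_PAGES_PARAM "/sys/module/kvm/parameters/nx_huge_pages"

int main(int argc, char **argv)
{
	char cur[16] = "";
	FILE *f = fopen(NX_HUGE_PAGES_PARAM, "r");

	if (!f) {
		perror("fopen " NX_HUGE_PAGES_PARAM);
		return 1;
	}
	if (fgets(cur, sizeof(cur), f))
		printf("nx_huge_pages: %s", cur);
	fclose(f);

	if (argc > 1) {		/* writing requires root */
		f = fopen(NX_HUGE_PAGES_PARAM, "w");
		if (!f) {
			perror("fopen for write");
			return 1;
		}
		if (fputs(argv[1], f) == EOF)
			perror("write nx_huge_pages");
		fclose(f);
	}
	return 0;
}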