
Commit b8e8c83

bonzini authored and KAGA-KOKO committed
kvm: mmu: ITLB_MULTIHIT mitigation
With some Intel processors, putting the same virtual address in the TLB as both a 4 KiB and 2 MiB page can confuse the instruction fetch unit and cause the processor to issue a machine check resulting in a CPU lockup. Unfortunately when EPT page tables use huge pages, it is possible for a malicious guest to cause this situation.

Add a knob to mark huge pages as non-executable. When the nx_huge_pages parameter is enabled (and we are using EPT), all huge pages are marked as NX. If the guest attempts to execute in one of those pages, the page is broken down into 4K pages, which are then marked executable.

This is not an issue for shadow paging (except nested EPT), because then the host is in control of TLB flushes and the problematic situation cannot happen. With nested EPT, again the nested guest can cause problems, so shadow and direct EPT are treated in the same way.

[ tglx: Fixup default to auto and massage wording a bit ]

Originally-by: Junaid Shahid <[email protected]>
Signed-off-by: Paolo Bonzini <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
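As a rough illustration of the policy described above (plain userspace C, not KVM code; the structure and helper names are invented for the example): a huge mapping is installed without execute permission while the knob is on, and an instruction fetch that hits it is satisfied by falling back to 4K pages, which may then be executable.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative only: models the policy from the commit message,
 * not KVM's real data structures. */
struct fake_mapping {
	bool huge;	/* 2 MiB mapping?		*/
	bool exec;	/* execute permission granted	*/
};

static bool nx_huge_pages = true;	/* the new knob, assumed enabled */

/* Install a mapping: huge pages lose X while the workaround is on. */
static struct fake_mapping map_page(bool want_huge, bool want_exec)
{
	struct fake_mapping m = { .huge = want_huge, .exec = want_exec };

	if (nx_huge_pages && m.huge)
		m.exec = false;		/* mark the huge page NX */
	return m;
}

/* An instruction fetch on an NX huge page is retried with 4K pages,
 * which may then be executable. */
static struct fake_mapping handle_exec_fault(struct fake_mapping m)
{
	if (m.huge && !m.exec)
		return map_page(false, true);	/* split to 4K, executable */
	return m;
}

int main(void)
{
	struct fake_mapping m = map_page(true, true);

	printf("initial: huge=%d exec=%d\n", m.huge, m.exec);
	m = handle_exec_fault(m);
	printf("after fetch fault: huge=%d exec=%d\n", m.huge, m.exec);
	return 0;
}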
1 parent 731dc9d commit b8e8c83

File tree

6 files changed: +200 -13 lines changed


Documentation/admin-guide/kernel-parameters.txt
Lines changed: 19 additions & 0 deletions

@@ -2055,6 +2055,19 @@
 			KVM MMU at runtime.
 			Default is 0 (off)
 
+	kvm.nx_huge_pages=
+			[KVM] Controls the software workaround for the
+			X86_BUG_ITLB_MULTIHIT bug.
+			force	: Always deploy workaround.
+			off	: Never deploy workaround.
+			auto	: Deploy workaround based on the presence of
+				  X86_BUG_ITLB_MULTIHIT.
+
+			Default is 'auto'.
+
+			If the software workaround is enabled for the host,
+			guests need not enable it for nested guests.
+
 	kvm-amd.nested=	[KVM,AMD] Allow nested virtualization in KVM/SVM.
 			Default is 1 (enabled)
 
@@ -2637,6 +2650,12 @@
 				l1tf=off [X86]
 				mds=off [X86]
 				tsx_async_abort=off [X86]
+				kvm.nx_huge_pages=off [X86]
+
+				Exceptions:
+					This does not have any effect on
+					kvm.nx_huge_pages when
+					kvm.nx_huge_pages=force.
 
 			auto (default)
 				Mitigate all CPU vulnerabilities, but leave SMT
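For reference, the three documented values resolve to an effective on/off setting as sketched below; the "auto" branch mirrors get_nx_auto_mode() added to arch/x86/kvm/mmu.c later in this commit, and the two globals stand in for boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT) and cpu_mitigations_off(). Note how mitigations=off only influences the "auto" case, matching the exception documented above for kvm.nx_huge_pages=force.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* Stand-ins for the kernel helpers used by get_nx_auto_mode(). */
static bool cpu_has_itlb_multihit_bug = true;	/* X86_BUG_ITLB_MULTIHIT set? */
static bool mitigations_off;			/* mitigations=off on cmdline? */

/* Resolve kvm.nx_huge_pages= to an effective on/off value. */
static bool resolve_nx_huge_pages(const char *val)
{
	if (strcmp(val, "force") == 0)
		return true;		/* always deploy the workaround */
	if (strcmp(val, "off") == 0)
		return false;		/* never deploy it */
	/* "auto" (the default): only if the CPU is affected and
	 * mitigations are not globally disabled. */
	return cpu_has_itlb_multihit_bug && !mitigations_off;
}

int main(void)
{
	const char *modes[] = { "force", "off", "auto" };

	for (int i = 0; i < 3; i++)
		printf("%-5s -> %s\n", modes[i],
		       resolve_nx_huge_pages(modes[i]) ? "on" : "off");
	return 0;
}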

arch/x86/include/asm/kvm_host.h
Lines changed: 2 additions & 0 deletions

@@ -315,6 +315,7 @@ struct kvm_mmu_page {
 	bool unsync;
 	u8 mmu_valid_gen;
 	bool mmio_cached;
+	bool lpage_disallowed; /* Can't be replaced by an equiv large page */
 
 	/*
 	 * The following two entries are used to key the shadow page in the
@@ -946,6 +947,7 @@ struct kvm_vm_stat {
 	ulong mmu_unsync;
 	ulong remote_tlb_flush;
 	ulong lpages;
+	ulong nx_lpage_splits;
 	ulong max_mmu_page_hash_collisions;
 };
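The two new fields are maintained together by account_huge_nx_page()/unaccount_huge_nx_page() in the mmu.c hunks further down: lpage_disallowed marks a shadow page that exists because a huge mapping had to be refused, and nx_lpage_splits counts how many such pages each VM currently has. A simplified standalone model of that bookkeeping (the struct definitions here are reduced to just these fields and are not the real KVM types):

#include <stdbool.h>
#include <stdio.h>

/* Simplified stand-ins for the two structures extended by this commit. */
struct kvm_mmu_page { bool lpage_disallowed; };
struct kvm          { unsigned long nx_lpage_splits; };

/* Mirrors account_huge_nx_page(): idempotent, counts each page once. */
static void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	if (sp->lpage_disallowed)
		return;
	++kvm->nx_lpage_splits;
	sp->lpage_disallowed = true;
}

/* Mirrors unaccount_huge_nx_page(): run when such a page is zapped. */
static void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	--kvm->nx_lpage_splits;
	sp->lpage_disallowed = false;
}

int main(void)
{
	struct kvm vm = { 0 };
	struct kvm_mmu_page sp = { 0 };

	account_huge_nx_page(&vm, &sp);
	account_huge_nx_page(&vm, &sp);	/* second call is a no-op */
	printf("splits after accounting twice: %lu\n", vm.nx_lpage_splits);
	unaccount_huge_nx_page(&vm, &sp);
	printf("splits after zap: %lu\n", vm.nx_lpage_splits);
	return 0;
}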

arch/x86/kernel/cpu/bugs.c
Lines changed: 12 additions & 1 deletion

@@ -1257,6 +1257,9 @@ void x86_spec_ctrl_setup_ap(void)
 		x86_amd_ssb_disable();
 }
 
+bool itlb_multihit_kvm_mitigation;
+EXPORT_SYMBOL_GPL(itlb_multihit_kvm_mitigation);
+
 #undef pr_fmt
 #define pr_fmt(fmt) "L1TF: " fmt
 
@@ -1412,17 +1415,25 @@ static ssize_t l1tf_show_state(char *buf)
 		       l1tf_vmx_states[l1tf_vmx_mitigation],
 		       sched_smt_active() ? "vulnerable" : "disabled");
 }
+
+static ssize_t itlb_multihit_show_state(char *buf)
+{
+	if (itlb_multihit_kvm_mitigation)
+		return sprintf(buf, "KVM: Mitigation: Split huge pages\n");
+	else
+		return sprintf(buf, "KVM: Vulnerable\n");
+}
 #else
 static ssize_t l1tf_show_state(char *buf)
 {
 	return sprintf(buf, "%s\n", L1TF_DEFAULT_MSG);
 }
-#endif
 
 static ssize_t itlb_multihit_show_state(char *buf)
 {
 	return sprintf(buf, "Processor vulnerable\n");
 }
+#endif
 
 static ssize_t mds_show_state(char *buf)
 {
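On kernels carrying this series, itlb_multihit_show_state() ultimately backs the itlb_multihit entry under the CPU vulnerabilities directory in sysfs (the wiring through the generic cpu_show_* reporting is not part of this hunk). A minimal userspace read of that file, assuming the conventional x86 path; adjust if your kernel lays it out differently:

#include <stdio.h>

/* Read the ITLB multihit status string produced by bugs.c.
 * Path assumed from the standard x86 vulnerabilities directory. */
int main(void)
{
	const char *path =
		"/sys/devices/system/cpu/vulnerabilities/itlb_multihit";
	char line[128];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return 1;
	}
	if (fgets(line, sizeof(line), f))
		printf("%s", line);	/* e.g. "KVM: Mitigation: Split huge pages" */
	fclose(f);
	return 0;
}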

arch/x86/kvm/mmu.c
Lines changed: 135 additions & 6 deletions

@@ -47,6 +47,20 @@
 #include <asm/kvm_page_track.h>
 #include "trace.h"
 
+extern bool itlb_multihit_kvm_mitigation;
+
+static int __read_mostly nx_huge_pages = -1;
+
+static int set_nx_huge_pages(const char *val, const struct kernel_param *kp);
+
+static struct kernel_param_ops nx_huge_pages_ops = {
+	.set = set_nx_huge_pages,
+	.get = param_get_bool,
+};
+
+module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644);
+__MODULE_PARM_TYPE(nx_huge_pages, "bool");
+
 /*
  * When setting this variable to true it enables Two-Dimensional-Paging
  * where the hardware walks 2 page tables:
@@ -352,6 +366,11 @@ static inline bool spte_ad_need_write_protect(u64 spte)
 	return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_ENABLED_MASK;
 }
 
+static bool is_nx_huge_page_enabled(void)
+{
+	return READ_ONCE(nx_huge_pages);
+}
+
 static inline u64 spte_shadow_accessed_mask(u64 spte)
 {
 	MMU_WARN_ON(is_mmio_spte(spte));
@@ -1190,6 +1209,15 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
 	kvm_mmu_gfn_disallow_lpage(slot, gfn);
 }
 
+static void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+	if (sp->lpage_disallowed)
+		return;
+
+	++kvm->stat.nx_lpage_splits;
+	sp->lpage_disallowed = true;
+}
+
 static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
 	struct kvm_memslots *slots;
@@ -1207,6 +1235,12 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
 	kvm_mmu_gfn_allow_lpage(slot, gfn);
 }
 
+static void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+	--kvm->stat.nx_lpage_splits;
+	sp->lpage_disallowed = false;
+}
+
 static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level,
 					  struct kvm_memory_slot *slot)
 {
@@ -2792,6 +2826,9 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
 		kvm_reload_remote_mmus(kvm);
 	}
 
+	if (sp->lpage_disallowed)
+		unaccount_huge_nx_page(kvm, sp);
+
 	sp->role.invalid = 1;
 	return list_unstable;
 }
@@ -3013,6 +3050,11 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 	if (!speculative)
 		spte |= spte_shadow_accessed_mask(spte);
 
+	if (level > PT_PAGE_TABLE_LEVEL && (pte_access & ACC_EXEC_MASK) &&
+	    is_nx_huge_page_enabled()) {
+		pte_access &= ~ACC_EXEC_MASK;
+	}
+
 	if (pte_access & ACC_EXEC_MASK)
 		spte |= shadow_x_mask;
 	else
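The set_spte() hunk above strips ACC_EXEC_MASK before the execute bit is applied, so any SPTE above 4K granularity is created non-executable while 4K SPTEs keep execute permission; the guest's next instruction fetch in such a range then faults and is handled by the fault paths changed below. A standalone model of just that decision (the mask values and helper here are illustrative, not KVM's real definitions):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative constants; real KVM masks differ. */
#define PT_PAGE_TABLE_LEVEL	1		/* 4K */
#define ACC_EXEC_MASK		0x1u
#define SHADOW_X_BIT		(1ull << 2)

static bool nx_huge_pages = true;

/* Model of the hunk above: strip exec from anything larger than 4K. */
static uint64_t make_spte(int level, unsigned int pte_access)
{
	uint64_t spte = 0;

	if (level > PT_PAGE_TABLE_LEVEL && (pte_access & ACC_EXEC_MASK) &&
	    nx_huge_pages)
		pte_access &= ~ACC_EXEC_MASK;

	if (pte_access & ACC_EXEC_MASK)
		spte |= SHADOW_X_BIT;
	return spte;
}

int main(void)
{
	printf("4K   spte executable: %d\n",
	       !!(make_spte(1, ACC_EXEC_MASK) & SHADOW_X_BIT));
	printf("2MiB spte executable: %d\n",
	       !!(make_spte(2, ACC_EXEC_MASK) & SHADOW_X_BIT));
	return 0;
}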
@@ -3233,9 +3275,32 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
 	__direct_pte_prefetch(vcpu, sp, sptep);
 }
 
+static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it,
+				       gfn_t gfn, kvm_pfn_t *pfnp, int *levelp)
+{
+	int level = *levelp;
+	u64 spte = *it.sptep;
+
+	if (it.level == level && level > PT_PAGE_TABLE_LEVEL &&
+	    is_nx_huge_page_enabled() &&
+	    is_shadow_present_pte(spte) &&
+	    !is_large_pte(spte)) {
+		/*
+		 * A small SPTE exists for this pfn, but FNAME(fetch)
+		 * and __direct_map would like to create a large PTE
+		 * instead: just force them to go down another level,
+		 * patching back for them into pfn the next 9 bits of
+		 * the address.
+		 */
+		u64 page_mask = KVM_PAGES_PER_HPAGE(level) - KVM_PAGES_PER_HPAGE(level - 1);
+		*pfnp |= gfn & page_mask;
+		(*levelp)--;
+	}
+}
+
 static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
 			int map_writable, int level, kvm_pfn_t pfn,
-			bool prefault)
+			bool prefault, bool lpage_disallowed)
 {
 	struct kvm_shadow_walk_iterator it;
 	struct kvm_mmu_page *sp;
@@ -3248,6 +3313,12 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
 
 	trace_kvm_mmu_spte_requested(gpa, level, pfn);
 	for_each_shadow_entry(vcpu, gpa, it) {
+		/*
+		 * We cannot overwrite existing page tables with an NX
+		 * large page, as the leaf could be executable.
+		 */
+		disallowed_hugepage_adjust(it, gfn, &pfn, &level);
+
 		base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
 		if (it.level == level)
 			break;
@@ -3258,6 +3329,8 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
 					      it.level - 1, true, ACC_ALL);
 
 			link_shadow_page(vcpu, it.sptep, sp);
+			if (lpage_disallowed)
+				account_huge_nx_page(vcpu->kvm, sp);
 		}
 	}
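To make the pfn arithmetic in disallowed_hugepage_adjust() concrete: on x86, KVM_PAGES_PER_HPAGE(level) is 512^(level - 1) 4K pages, so page_mask selects the next nine bits of the gfn, which are folded into the huge-page-aligned pfn before dropping one level. A self-contained illustration (the macro is redefined locally with the same formula; the gfn/pfn values are made up):

#include <stdint.h>
#include <stdio.h>

/* Same formula KVM uses on x86: 512^(level - 1) 4K pages per huge page. */
#define KVM_PAGES_PER_HPAGE(level)	(1ull << (((level) - 1) * 9))
#define PT_PAGE_TABLE_LEVEL		1

/* Core of disallowed_hugepage_adjust(): step down one level and fold the
 * next 9 bits of the gfn into the pfn so the 4K mapping stays correct. */
static void adjust(uint64_t gfn, uint64_t *pfn, int *level)
{
	if (*level > PT_PAGE_TABLE_LEVEL) {
		uint64_t page_mask = KVM_PAGES_PER_HPAGE(*level) -
				     KVM_PAGES_PER_HPAGE(*level - 1);
		*pfn |= gfn & page_mask;
		(*level)--;
	}
}

int main(void)
{
	/* A fault at gfn 0x12345 that would have used a 2 MiB (level 2)
	 * mapping whose huge-page-aligned pfn is 0x12200. */
	uint64_t gfn = 0x12345, pfn = 0x12200;
	int level = 2;

	adjust(gfn, &pfn, &level);
	/* prints: level=1 pfn=0x12345 */
	printf("level=%d pfn=%#llx\n", level, (unsigned long long)pfn);
	return 0;
}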

@@ -3550,11 +3623,14 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
 {
 	int r;
 	int level;
-	bool force_pt_level = false;
+	bool force_pt_level;
 	kvm_pfn_t pfn;
 	unsigned long mmu_seq;
 	bool map_writable, write = error_code & PFERR_WRITE_MASK;
+	bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
+				is_nx_huge_page_enabled();
 
+	force_pt_level = lpage_disallowed;
 	level = mapping_level(vcpu, gfn, &force_pt_level);
 	if (likely(!force_pt_level)) {
 		/*
@@ -3588,7 +3664,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
 		goto out_unlock;
 	if (likely(!force_pt_level))
 		transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
-	r = __direct_map(vcpu, v, write, map_writable, level, pfn, prefault);
+	r = __direct_map(vcpu, v, write, map_writable, level, pfn,
+			 prefault, false);
 out_unlock:
 	spin_unlock(&vcpu->kvm->mmu_lock);
 	kvm_release_pfn_clean(pfn);
@@ -4174,6 +4251,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 	unsigned long mmu_seq;
 	int write = error_code & PFERR_WRITE_MASK;
 	bool map_writable;
+	bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
+				is_nx_huge_page_enabled();
 
 	MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa));
 
@@ -4184,8 +4263,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 	if (r)
 		return r;
 
-	force_pt_level = !check_hugepage_cache_consistency(vcpu, gfn,
-							   PT_DIRECTORY_LEVEL);
+	force_pt_level =
+		lpage_disallowed ||
+		!check_hugepage_cache_consistency(vcpu, gfn, PT_DIRECTORY_LEVEL);
 	level = mapping_level(vcpu, gfn, &force_pt_level);
 	if (likely(!force_pt_level)) {
 		if (level > PT_DIRECTORY_LEVEL &&
@@ -4214,7 +4294,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 		goto out_unlock;
 	if (likely(!force_pt_level))
 		transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
-	r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, prefault);
+	r = __direct_map(vcpu, gpa, write, map_writable, level, pfn,
+			 prefault, lpage_disallowed);
 out_unlock:
 	spin_unlock(&vcpu->kvm->mmu_lock);
 	kvm_release_pfn_clean(pfn);
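Both fault paths above compute lpage_disallowed from PFERR_FETCH_MASK, so only instruction-fetch faults force 4K mappings; data reads and writes can still be mapped with huge pages. A tiny standalone illustration (the PFERR_* values below follow the x86 page-fault error-code layout, with bit 4 as the instruction-fetch bit):

#include <stdbool.h>
#include <stdio.h>

/* x86 page-fault error-code bits as used in the fault paths above. */
#define PFERR_WRITE_MASK	(1u << 1)
#define PFERR_FETCH_MASK	(1u << 4)

static bool nx_huge_pages = true;	/* workaround assumed enabled */

/* Mirrors the lpage_disallowed computation in the fault handlers. */
static bool lpage_disallowed(unsigned int error_code)
{
	return (error_code & PFERR_FETCH_MASK) && nx_huge_pages;
}

int main(void)
{
	printf("write fault forces 4K: %d\n", lpage_disallowed(PFERR_WRITE_MASK));
	printf("fetch fault forces 4K: %d\n", lpage_disallowed(PFERR_FETCH_MASK));
	return 0;
}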
@@ -6155,10 +6236,58 @@ static void kvm_set_mmio_spte_mask(void)
 	kvm_mmu_set_mmio_spte_mask(mask, mask, ACC_WRITE_MASK | ACC_USER_MASK);
 }
 
+static bool get_nx_auto_mode(void)
+{
+	/* Return true when CPU has the bug, and mitigations are ON */
+	return boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT) && !cpu_mitigations_off();
+}
+
+static void __set_nx_huge_pages(bool val)
+{
+	nx_huge_pages = itlb_multihit_kvm_mitigation = val;
+}
+
+static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
+{
+	bool old_val = nx_huge_pages;
+	bool new_val;
+
+	/* In "auto" mode deploy workaround only if CPU has the bug. */
+	if (sysfs_streq(val, "off"))
+		new_val = 0;
+	else if (sysfs_streq(val, "force"))
+		new_val = 1;
+	else if (sysfs_streq(val, "auto"))
+		new_val = get_nx_auto_mode();
+	else if (strtobool(val, &new_val) < 0)
+		return -EINVAL;
+
+	__set_nx_huge_pages(new_val);
+
+	if (new_val != old_val) {
+		struct kvm *kvm;
+		int idx;
+
+		mutex_lock(&kvm_lock);
+
+		list_for_each_entry(kvm, &vm_list, vm_list) {
+			idx = srcu_read_lock(&kvm->srcu);
+			kvm_mmu_zap_all_fast(kvm);
+			srcu_read_unlock(&kvm->srcu, idx);
+		}
+		mutex_unlock(&kvm_lock);
+	}
+
+	return 0;
+}
+
 int kvm_mmu_module_init(void)
 {
 	int ret = -ENOMEM;
 
+	if (nx_huge_pages == -1)
+		__set_nx_huge_pages(get_nx_auto_mode());
+
 	/*
 	 * MMU roles use union aliasing which is, generally speaking, an
 	 * undefined behavior. However, we supposedly know how compilers behave
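Since the parameter is registered read-write (0644) with a custom .set handler, the workaround can be toggled at runtime; when the value actually changes, set_nx_huge_pages() zaps every VM's MMU (kvm_mmu_zap_all_fast() under kvm_lock and SRCU) so existing SPTEs are rebuilt under the new policy. A hedged userspace sketch of flipping the knob; the sysfs path follows from module_param_cb() in the kvm module, root privileges are assumed, and the accepted strings are whatever set_nx_huge_pages() parses ("off", "force", "auto", or a boolean):

#include <stdio.h>

/* Write a new value to the kvm.nx_huge_pages runtime knob exposed by
 * module_param_cb() in the hunk above. */
int main(int argc, char **argv)
{
	const char *path = "/sys/module/kvm/parameters/nx_huge_pages";
	const char *val = argc > 1 ? argv[1] : "auto";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return 1;
	}
	if (fputs(val, f) == EOF)
		perror("write");
	fclose(f);

	/* Reading the file back goes through param_get_bool(), so it
	 * reports Y or N rather than the string that was written. */
	return 0;
}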
