
Commit 15d4507

ashok-raj authored and KAGA-KOKO committed
KVM/x86: Add IBPB support
The Indirect Branch Predictor Barrier (IBPB) is an indirect branch control mechanism: it keeps earlier branches from influencing later ones.

Unlike IBRS and STIBP, IBPB does not define a new mode of operation. It is a command that ensures predicted branch targets are not used after the barrier. Although IBRS and IBPB are enumerated in the same CPUID leaf, IBPB is very different.

IBPB helps mitigate three potential attacks:

* One guest attacking another guest.
  - This is addressed by issuing an IBPB when we switch guests.

* Attacks from guest/ring3 -> host/ring3. These would require an IBPB during
  context switch in the host, or after VMEXIT. The host process has two ways
  to mitigate:
  - It can be compiled with retpoline.
  - If it goes through a context switch and has set !dumpable, then there is
    an IBPB in that path.
    (Tim's patch: https://patchwork.kernel.org/patch/10192871)
  - Returning to Qemu after a VMEXIT could still leave Qemu attackable from
    the guest when Qemu is not compiled with retpoline. Doing an IBPB on
    every VMEXIT has been reported to cause TSC calibration problems in the
    guest.

* Attacks from guest/ring0 -> host/ring0. When the host kernel is using
  retpoline it is safe against these attacks. If the host kernel is not using
  retpoline we might need an IBPB flush on every VMEXIT. Even when using
  retpoline for indirect calls, under certain conditions 'ret' can use the
  BTB on Skylake-era CPUs. Other mitigations, like RSB stuffing/clearing, are
  available for that case.

* IBPB is issued only for SVM during svm_free_vcpu(). VMX has a vmclear and
  SVM doesn't. Follow the discussion here:
  https://lkml.org/lkml/2018/1/15/146

Please refer to the following page for more details on the enumeration and
control, and for documentation about the mitigations:
https://software.intel.com/en-us/side-channel-security-support

[peterz: rebase and changelog rewrite]
[karahmed: - rebase
           - vmx: expose PRED_CMD if guest has it in CPUID
           - svm: only pass through IBPB if guest has it in CPUID
           - vmx: support !cpu_has_vmx_msr_bitmap()
           - vmx: support nested]
[dwmw2: Expose CPUID bit too (AMD IBPB only for now as we lack IBRS)
        PRED_CMD is a write-only MSR]

Signed-off-by: Ashok Raj <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Signed-off-by: David Woodhouse <[email protected]>
Signed-off-by: KarimAllah Ahmed <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
Reviewed-by: Konrad Rzeszutek Wilk <[email protected]>
Cc: Andrea Arcangeli <[email protected]>
Cc: Andi Kleen <[email protected]>
Cc: [email protected]
Cc: Asit Mallick <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Andy Lutomirski <[email protected]>
Cc: Dave Hansen <[email protected]>
Cc: Arjan Van De Ven <[email protected]>
Cc: Greg KH <[email protected]>
Cc: Jun Nakajima <[email protected]>
Cc: Paolo Bonzini <[email protected]>
Cc: Dan Williams <[email protected]>
Cc: Tim Chen <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Link: https://lkml.kernel.org/r/[email protected]
1 parent b7b27aa commit 15d4507
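
The changelog above treats IBPB as a command rather than a mode: the barrier is issued by a single privileged write of the IBPB command bit to the write-only IA32_PRED_CMD MSR (0x49). Below is a minimal ring-0 sketch of that write; it is illustrative only and is not the kernel's indirect_branch_prediction_barrier() helper, which additionally gates the write on the detected CPU features.

    #include <stdint.h>

    #define MSR_IA32_PRED_CMD  0x00000049U   /* write-only command MSR */
    #define PRED_CMD_IBPB      (1ULL << 0)   /* bit 0: issue an IBPB */

    /*
     * Must run at CPL0. IA32_PRED_CMD holds no readable state; the WRMSR
     * itself is the barrier command, so there is nothing to read back.
     */
    static inline void ibpb_sketch(void)
    {
        uint32_t lo = (uint32_t)PRED_CMD_IBPB;
        uint32_t hi = (uint32_t)(PRED_CMD_IBPB >> 32);

        __asm__ volatile("wrmsr"
                         : /* no outputs */
                         : "c"(MSR_IA32_PRED_CMD), "a"(lo), "d"(hi)
                         : "memory");
    }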

File tree: 3 files changed (+116, −3 lines)

  arch/x86/kvm/cpuid.c
  arch/x86/kvm/svm.c
  arch/x86/kvm/vmx.c


arch/x86/kvm/cpuid.c

Lines changed: 10 additions & 1 deletion

@@ -365,6 +365,10 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) |
 		0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM);
 
+	/* cpuid 0x80000008.ebx */
+	const u32 kvm_cpuid_8000_0008_ebx_x86_features =
+		F(IBPB);
+
 	/* cpuid 0xC0000001.edx */
 	const u32 kvm_cpuid_C000_0001_edx_x86_features =
 		F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) |
@@ -625,7 +629,12 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		if (!g_phys_as)
 			g_phys_as = phys_as;
 		entry->eax = g_phys_as | (virt_as << 8);
-		entry->ebx = entry->edx = 0;
+		entry->edx = 0;
+		/* IBPB isn't necessarily present in hardware cpuid */
+		if (boot_cpu_has(X86_FEATURE_IBPB))
+			entry->ebx |= F(IBPB);
+		entry->ebx &= kvm_cpuid_8000_0008_ebx_x86_features;
+		cpuid_mask(&entry->ebx, CPUID_8000_0008_EBX);
 		break;
 	}
 	case 0x80000019:
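
With the hunk above, KVM reports IBPB in CPUID leaf 0x80000008 EBX whenever the host kernel has X86_FEATURE_IBPB, because (as the in-diff comment notes) the bit isn't necessarily present in the hardware CPUID the host reads. From the guest side the bit can be probed as in this hedged sketch; AMD enumerates IBPB as CPUID.80000008H:EBX[12], and guest_has_ibpb() is an illustrative name, not a kernel or KVM function.

    #include <stdbool.h>
    #include <cpuid.h>      /* GCC/clang helper for the CPUID instruction */

    bool guest_has_ibpb(void)
    {
        unsigned int eax, ebx, ecx, edx;

        /* Leaf 0x80000008, EBX bit 12 is the (AMD-defined) IBPB bit. */
        if (!__get_cpuid(0x80000008, &eax, &ebx, &ecx, &edx))
            return false;   /* extended leaf not available */

        return ebx & (1u << 12);
    }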

arch/x86/kvm/svm.c

Lines changed: 28 additions & 0 deletions

@@ -249,6 +249,7 @@ static const struct svm_direct_access_msrs {
 	{ .index = MSR_CSTAR,				.always = true },
 	{ .index = MSR_SYSCALL_MASK,			.always = true },
 #endif
+	{ .index = MSR_IA32_PRED_CMD,			.always = false },
 	{ .index = MSR_IA32_LASTBRANCHFROMIP,		.always = false },
 	{ .index = MSR_IA32_LASTBRANCHTOIP,		.always = false },
 	{ .index = MSR_IA32_LASTINTFROMIP,		.always = false },
@@ -529,6 +530,7 @@ struct svm_cpu_data {
 	struct kvm_ldttss_desc *tss_desc;
 
 	struct page *save_area;
+	struct vmcb *current_vmcb;
 };
 
 static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
@@ -1703,11 +1705,17 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu)
 	__free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER);
 	kvm_vcpu_uninit(vcpu);
 	kmem_cache_free(kvm_vcpu_cache, svm);
+	/*
+	 * The vmcb page can be recycled, causing a false negative in
+	 * svm_vcpu_load(). So do a full IBPB now.
+	 */
+	indirect_branch_prediction_barrier();
 }
 
 static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
+	struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
 	int i;
 
 	if (unlikely(cpu != vcpu->cpu)) {
@@ -1736,6 +1744,10 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 	if (static_cpu_has(X86_FEATURE_RDTSCP))
 		wrmsrl(MSR_TSC_AUX, svm->tsc_aux);
 
+	if (sd->current_vmcb != svm->vmcb) {
+		sd->current_vmcb = svm->vmcb;
+		indirect_branch_prediction_barrier();
+	}
 	avic_vcpu_load(vcpu, cpu);
 }
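
The two svm.c hunks above work as a pair: svm_vcpu_load() issues an IBPB only when a different VMCB is being loaded on the CPU, while svm_free_vcpu() issues one unconditionally because the freed VMCB page can be recycled at the same address for another guest, which would turn the pointer comparison into a false negative. A hedged, self-contained sketch of that logic (types and helpers are illustrative, not KVM's):

    #include <stddef.h>

    struct vmcb_sketch { char page[4096]; };

    static struct vmcb_sketch *current_vmcb_sketch;  /* per-CPU in the real code */

    static void ibpb_sketch(void)
    {
        /* stands in for indirect_branch_prediction_barrier() */
    }

    /*
     * Flush branch predictions only when a *different* guest's VMCB is being
     * loaded on this CPU; re-loading the same VMCB needs no barrier.
     */
    void vcpu_load_sketch(struct vmcb_sketch *vmcb)
    {
        if (current_vmcb_sketch != vmcb) {
            current_vmcb_sketch = vmcb;
            ibpb_sketch();
        }
    }

    /*
     * On free, the page behind the VMCB may be reused for another guest at the
     * same address, so the pointer check above cannot be trusted: flush now.
     */
    void vcpu_free_sketch(struct vmcb_sketch *vmcb)
    {
        (void)vmcb;
        ibpb_sketch();
    }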

@@ -3684,6 +3696,22 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
 	case MSR_IA32_TSC:
 		kvm_write_tsc(vcpu, msr);
 		break;
+	case MSR_IA32_PRED_CMD:
+		if (!msr->host_initiated &&
+		    !guest_cpuid_has(vcpu, X86_FEATURE_IBPB))
+			return 1;
+
+		if (data & ~PRED_CMD_IBPB)
+			return 1;
+
+		if (!data)
+			break;
+
+		wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
+		if (is_guest_mode(vcpu))
+			break;
+		set_msr_interception(svm->msrpm, MSR_IA32_PRED_CMD, 0, 1);
+		break;
 	case MSR_STAR:
 		svm->vmcb->save.star = data;
 		break;
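
The MSR_IA32_PRED_CMD case above is a lazy pass-through: writes trap until a guest that is allowed IBPB performs its first valid, non-zero write; KVM issues the barrier on the guest's behalf and then clears the write intercept in the MSR permission map, so subsequent writes reach hardware without a VMEXIT. A self-contained sketch of the pattern follows; every name in it (vcpu_sketch, host_issue_ibpb, and so on) is illustrative rather than a KVM API.

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    struct vcpu_sketch {
        bool pred_cmd_passthrough;      /* has interception been dropped yet? */
    };

    static void host_issue_ibpb(void)
    {
        /* stands in for wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB) */
        puts("IBPB issued on behalf of the guest");
    }

    /* Returns 0 on success, 1 to make the caller inject #GP (KVM's convention). */
    int pred_cmd_write_sketch(struct vcpu_sketch *v, uint64_t data)
    {
        if (data & ~1ULL)               /* only bit 0 (IBPB) is defined */
            return 1;
        if (!data)                      /* writing zero is a valid no-op */
            return 0;

        host_issue_ibpb();
        if (!v->pred_cmd_passthrough) {
            /*
             * The real code updates the MSR permission map here; once it
             * has, guest writes no longer trap, so this path effectively
             * runs only for the first write.
             */
            v->pred_cmd_passthrough = true;
        }
        return 0;
    }

    int main(void)
    {
        struct vcpu_sketch v = { .pred_cmd_passthrough = false };

        return pred_cmd_write_sketch(&v, 1);    /* first write: emulate + open map */
    }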

arch/x86/kvm/vmx.c

Lines changed: 78 additions & 2 deletions

@@ -593,6 +593,7 @@ struct vcpu_vmx {
 	u64		      msr_host_kernel_gs_base;
 	u64		      msr_guest_kernel_gs_base;
 #endif
+
 	u32 vm_entry_controls_shadow;
 	u32 vm_exit_controls_shadow;
 	u32 secondary_exec_control;
@@ -934,6 +935,8 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
 static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
 					    u16 error_code);
 static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu);
+static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
+							  u32 msr, int type);
 
 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
 static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -1905,6 +1908,29 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
 	vmcs_write32(EXCEPTION_BITMAP, eb);
 }
 
+/*
+ * Check if MSR is intercepted for L01 MSR bitmap.
+ */
+static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr)
+{
+	unsigned long *msr_bitmap;
+	int f = sizeof(unsigned long);
+
+	if (!cpu_has_vmx_msr_bitmap())
+		return true;
+
+	msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap;
+
+	if (msr <= 0x1fff) {
+		return !!test_bit(msr, msr_bitmap + 0x800 / f);
+	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
+		msr &= 0x1fff;
+		return !!test_bit(msr, msr_bitmap + 0xc00 / f);
+	}
+
+	return true;
+}
+
 static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
 		unsigned long entry, unsigned long exit)
 {
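
msr_write_intercepted_l01() above walks the vmcs01 MSR bitmap by hand. That bitmap is one 4 KiB page: read-intercept bits for MSRs 0x0..0x1fff start at byte offset 0x000 and for 0xc0000000..0xc0001fff at 0x400, with the matching write-intercept bits at 0x800 and 0xc00; a set bit means the access causes a VM exit. A hedged, byte-addressed user-space sketch of the same write lookup (equivalent to the test_bit() arithmetic above on little-endian x86):

    #include <stdbool.h>
    #include <stdint.h>

    bool write_intercepted_sketch(const uint8_t bitmap[4096], uint32_t msr)
    {
        uint32_t base;

        if (msr <= 0x1fff) {
            base = 0x800;                   /* write bits, low MSR range */
        } else if (msr >= 0xc0000000 && msr <= 0xc0001fff) {
            msr &= 0x1fff;
            base = 0xc00;                   /* write bits, high MSR range */
        } else {
            return true;                    /* not covered: treated as intercepted */
        }

        return bitmap[base + msr / 8] & (1u << (msr % 8));
    }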
@@ -2283,6 +2309,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 	if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
 		per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
 		vmcs_load(vmx->loaded_vmcs->vmcs);
+		indirect_branch_prediction_barrier();
 	}
 
 	if (!already_loaded) {
@@ -3340,6 +3367,34 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	case MSR_IA32_TSC:
 		kvm_write_tsc(vcpu, msr_info);
 		break;
+	case MSR_IA32_PRED_CMD:
+		if (!msr_info->host_initiated &&
+		    !guest_cpuid_has(vcpu, X86_FEATURE_IBPB) &&
+		    !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
+			return 1;
+
+		if (data & ~PRED_CMD_IBPB)
+			return 1;
+
+		if (!data)
+			break;
+
+		wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
+
+		/*
+		 * For non-nested:
+		 * When it's written (to non-zero) for the first time, pass
+		 * it through.
+		 *
+		 * For nested:
+		 * The handling of the MSR bitmap for L2 guests is done in
+		 * nested_vmx_merge_msr_bitmap. We should not touch the
+		 * vmcs02.msr_bitmap here since it gets completely overwritten
+		 * in the merging.
+		 */
+		vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD,
+					      MSR_TYPE_W);
+		break;
 	case MSR_IA32_CR_PAT:
 		if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
 			if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
@@ -10042,9 +10097,23 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
 	struct page *page;
 	unsigned long *msr_bitmap_l1;
 	unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap;
+	/*
+	 * pred_cmd is trying to verify two things:
+	 *
+	 * 1. L0 gave a permission to L1 to actually passthrough the MSR. This
+	 *    ensures that we do not accidentally generate an L02 MSR bitmap
+	 *    from the L12 MSR bitmap that is too permissive.
+	 * 2. That L1 or L2s have actually used the MSR. This avoids
+	 *    unnecessarily merging of the bitmap if the MSR is unused. This
+	 *    works properly because we only update the L01 MSR bitmap lazily.
+	 *    So even if L0 should pass L1 these MSRs, the L01 bitmap is only
+	 *    updated to reflect this when L1 (or its L2s) actually write to
+	 *    the MSR.
+	 */
+	bool pred_cmd = !msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD);
 
-	/* This shortcut is ok because we support only x2APIC MSRs so far. */
-	if (!nested_cpu_has_virt_x2apic_mode(vmcs12))
+	if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
+	    !pred_cmd)
 		return false;
 
 	page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->msr_bitmap);
@@ -10077,6 +10146,13 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
 					MSR_TYPE_W);
 		}
 	}
+
+	if (pred_cmd)
+		nested_vmx_disable_intercept_for_msr(
+				msr_bitmap_l1, msr_bitmap_l0,
+				MSR_IA32_PRED_CMD,
+				MSR_TYPE_W);
+
 	kunmap(page);
 	kvm_release_page_clean(page);
