Skip to content

Commit 660a5d5

Browse files
committed
KVM: x86: save/load state on SMM switch
The big ugly one. This patch adds support for switching in and out of system management mode, respectively upon receiving KVM_REQ_SMI and upon executing a RSM instruction. Both 32- and 64-bit formats are supported for the SMM state save area. Reviewed-by: Radim Krčmář <[email protected]> Signed-off-by: Paolo Bonzini <[email protected]>
1 parent cd7764f commit 660a5d5

File tree

4 files changed

+498
-2
lines changed

4 files changed

+498
-2
lines changed

arch/x86/kvm/cpuid.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,14 @@ static inline bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu)
7070
return best && (best->ebx & bit(X86_FEATURE_FSGSBASE));
7171
}
7272

73+
static inline bool guest_cpuid_has_longmode(struct kvm_vcpu *vcpu)
74+
{
75+
struct kvm_cpuid_entry2 *best;
76+
77+
best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
78+
return best && (best->edx & bit(X86_FEATURE_LM));
79+
}
80+
7381
static inline bool guest_cpuid_has_osvw(struct kvm_vcpu *vcpu)
7482
{
7583
struct kvm_cpuid_entry2 *best;

arch/x86/kvm/emulate.c

Lines changed: 247 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2259,12 +2259,258 @@ static int em_lseg(struct x86_emulate_ctxt *ctxt)
22592259
return rc;
22602260
}
22612261

2262+
static int emulator_has_longmode(struct x86_emulate_ctxt *ctxt)
2263+
{
2264+
u32 eax, ebx, ecx, edx;
2265+
2266+
eax = 0x80000001;
2267+
ecx = 0;
2268+
ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
2269+
return edx & bit(X86_FEATURE_LM);
2270+
}
2271+
2272+
/*
 * GET_SMSTATE - read one field of the SMM state save area.
 *
 * @type:   C type of the field to read.
 * @smbase: base of the state save image (callers pass smbase + 0x8000).
 * @offset: offset of the field within the image.
 *
 * NOTE: this is a statement expression that does a *non-local return*:
 * if the read_std callback fails, it returns X86EMUL_UNHANDLEABLE from
 * the enclosing function.  It may therefore only be used inside
 * functions that return X86EMUL_* status codes.
 */
#define GET_SMSTATE(type, smbase, offset)				  \
	({								  \
	 type __val;							  \
	 int r = ctxt->ops->read_std(ctxt, smbase + offset, &__val,       \
				     sizeof(__val), NULL);		  \
	 if (r != X86EMUL_CONTINUE)					  \
		 return X86EMUL_UNHANDLEABLE;				  \
	 __val;								  \
	})
2281+
2282+
/*
 * Unpack a 32-bit attribute word, as stored in the SMM state save area
 * (descriptor attribute bits in positions 8-23), into the individual
 * bitfields of a desc_struct.
 */
static void rsm_set_desc_flags(struct desc_struct *desc, u32 flags)
{
	desc->type = (flags >> 8)  & 15;
	desc->s    = (flags >> 12) & 1;
	desc->dpl  = (flags >> 13) & 3;
	desc->p    = (flags >> 15) & 1;
	desc->avl  = (flags >> 20) & 1;
	desc->l    = (flags >> 21) & 1;
	desc->d    = (flags >> 22) & 1;
	desc->g    = (flags >> 23) & 1;
}
2293+
2294+
/*
 * Load segment register @n from the 32-bit SMM state save area.
 *
 * Selectors live in a u32 array at offset 0x7fa8; the cached descriptors
 * are split across two tables (segments 0-2 at 0x7f84, segments 3-5 at
 * 0x7f2c), 12 bytes per entry laid out as flags, limit, base.
 *
 * Returns X86EMUL_CONTINUE on success; a failed SMRAM read makes
 * GET_SMSTATE return X86EMUL_UNHANDLEABLE from this function directly.
 */
static int rsm_load_seg_32(struct x86_emulate_ctxt *ctxt, u64 smbase, int n)
{
	struct desc_struct desc;
	int offset;
	u16 selector;

	selector = GET_SMSTATE(u32, smbase, 0x7fa8 + n * 4);

	if (n < 3)
		offset = 0x7f84 + n * 12;
	else
		offset = 0x7f2c + (n - 3) * 12;

	set_desc_base(&desc,  GET_SMSTATE(u32, smbase, offset + 8));
	set_desc_limit(&desc, GET_SMSTATE(u32, smbase, offset + 4));
	rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, offset));
	/* base3 is 0: 32-bit save area has no high base bits. */
	ctxt->ops->set_segment(ctxt, selector, &desc, 0, n);
	return X86EMUL_CONTINUE;
}
2313+
2314+
/*
 * Load segment register @n from the 64-bit SMM state save area.
 *
 * Each segment occupies 16 bytes starting at 0x7e00: selector (u16),
 * attributes (u16, shifted up by 8 to match rsm_set_desc_flags' layout),
 * limit (u32), base low (u32), base high (u32, passed as base3 for
 * 64-bit bases).
 *
 * Returns X86EMUL_CONTINUE on success; a failed SMRAM read makes
 * GET_SMSTATE return X86EMUL_UNHANDLEABLE from this function directly.
 */
static int rsm_load_seg_64(struct x86_emulate_ctxt *ctxt, u64 smbase, int n)
{
	struct desc_struct desc;
	int offset;
	u16 selector;
	u32 base3;

	offset = 0x7e00 + n * 16;

	selector = GET_SMSTATE(u16, smbase, offset);
	rsm_set_desc_flags(&desc, GET_SMSTATE(u16, smbase, offset + 2) << 8);
	set_desc_limit(&desc, GET_SMSTATE(u32, smbase, offset + 4));
	set_desc_base(&desc,  GET_SMSTATE(u32, smbase, offset + 8));
	base3 = GET_SMSTATE(u32, smbase, offset + 12);

	ctxt->ops->set_segment(ctxt, selector, &desc, base3, n);
	return X86EMUL_CONTINUE;
}
2332+
2333+
/*
 * Restore CR0 and CR4 in an order the hardware accepts when coming back
 * from SMM.  Any rejected control-register write aborts the RSM with
 * X86EMUL_UNHANDLEABLE.
 */
static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt,
				    u64 cr0, u64 cr4)
{
	int bad;

	/*
	 * First enable PAE, long mode needs it before CR0.PG = 1 is set.
	 * Then enable protected mode.	However, PCID cannot be enabled
	 * if EFER.LMA=0, so set it separately.
	 */
	bad = ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE);
	if (bad)
		return X86EMUL_UNHANDLEABLE;

	bad = ctxt->ops->set_cr(ctxt, 0, cr0);
	if (bad)
		return X86EMUL_UNHANDLEABLE;

	/* Now that paging/long mode is up, PCIDE may be turned on. */
	if (cr4 & X86_CR4_PCIDE) {
		bad = ctxt->ops->set_cr(ctxt, 4, cr4);
		if (bad)
			return X86EMUL_UNHANDLEABLE;
	}

	return X86EMUL_CONTINUE;
}
2359+
2360+
/*
 * Restore vCPU state from the 32-bit SMM state save map.
 *
 * @smbase here is already SMBASE + 0x8000, i.e. the base of the state
 * save image; all GET_SMSTATE offsets are relative to it.  Any failed
 * SMRAM read returns X86EMUL_UNHANDLEABLE via GET_SMSTATE.
 */
static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, u64 smbase)
{
	struct desc_struct desc;
	struct desc_ptr dt;
	u16 selector;
	u32 val, cr0, cr4;
	int i;

	/* CR0/CR4 are only captured here; they are applied last, below. */
	cr0 =			   GET_SMSTATE(u32, smbase, 0x7ffc);
	ctxt->ops->set_cr(ctxt, 3, GET_SMSTATE(u32, smbase, 0x7ff8));
	ctxt->eflags =		   GET_SMSTATE(u32, smbase, 0x7ff4) | X86_EFLAGS_FIXED;
	ctxt->_eip =		   GET_SMSTATE(u32, smbase, 0x7ff0);

	/* General-purpose registers EAX..EDI. */
	for (i = 0; i < 8; i++)
		*reg_write(ctxt, i) = GET_SMSTATE(u32, smbase, 0x7fd0 + i * 4);

	/* DR6/DR7: keep only the architecturally volatile bits. */
	val = GET_SMSTATE(u32, smbase, 0x7fcc);
	ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1);
	val = GET_SMSTATE(u32, smbase, 0x7fc8);
	ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1);

	/* Task register. */
	selector =		   GET_SMSTATE(u32, smbase, 0x7fc4);
	set_desc_base(&desc,	   GET_SMSTATE(u32, smbase, 0x7f64));
	set_desc_limit(&desc,	   GET_SMSTATE(u32, smbase, 0x7f60));
	rsm_set_desc_flags(&desc,  GET_SMSTATE(u32, smbase, 0x7f5c));
	ctxt->ops->set_segment(ctxt, selector, &desc, 0, VCPU_SREG_TR);

	/* LDTR. */
	selector =		   GET_SMSTATE(u32, smbase, 0x7fc0);
	set_desc_base(&desc,	   GET_SMSTATE(u32, smbase, 0x7f80));
	set_desc_limit(&desc,	   GET_SMSTATE(u32, smbase, 0x7f7c));
	rsm_set_desc_flags(&desc,  GET_SMSTATE(u32, smbase, 0x7f78));
	ctxt->ops->set_segment(ctxt, selector, &desc, 0, VCPU_SREG_LDTR);

	/* GDTR and IDTR. */
	dt.address =		   GET_SMSTATE(u32, smbase, 0x7f74);
	dt.size =		   GET_SMSTATE(u32, smbase, 0x7f70);
	ctxt->ops->set_gdt(ctxt, &dt);

	dt.address =		   GET_SMSTATE(u32, smbase, 0x7f58);
	dt.size =		   GET_SMSTATE(u32, smbase, 0x7f54);
	ctxt->ops->set_idt(ctxt, &dt);

	/* The six ordinary segment registers. */
	for (i = 0; i < 6; i++) {
		int r = rsm_load_seg_32(ctxt, smbase, i);
		if (r != X86EMUL_CONTINUE)
			return r;
	}

	cr4 = GET_SMSTATE(u32, smbase, 0x7f14);

	/* New SMBASE takes effect on the next SMM entry. */
	ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smbase, 0x7ef8));

	/* Finally switch CR0/CR4 in a hardware-acceptable order. */
	return rsm_enter_protected_mode(ctxt, cr0, cr4);
}
2413+
2414+
/*
 * Restore vCPU state from the 64-bit SMM state save map.
 *
 * @smbase here is already SMBASE + 0x8000, i.e. the base of the state
 * save image; all GET_SMSTATE offsets are relative to it.  Any failed
 * SMRAM read returns X86EMUL_UNHANDLEABLE via GET_SMSTATE.
 */
static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase)
{
	struct desc_struct desc;
	struct desc_ptr dt;
	u64 val, cr0, cr4;
	u32 base3;
	u16 selector;
	int i;

	/* GPRs RAX..R15, stored downward from 0x7ff8. */
	for (i = 0; i < 16; i++)
		*reg_write(ctxt, i) = GET_SMSTATE(u64, smbase, 0x7ff8 - i * 8);

	ctxt->_eip   = GET_SMSTATE(u64, smbase, 0x7f78);
	ctxt->eflags = GET_SMSTATE(u32, smbase, 0x7f70) | X86_EFLAGS_FIXED;

	/* DR6/DR7: keep only the architecturally volatile bits. */
	val = GET_SMSTATE(u32, smbase, 0x7f68);
	ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1);
	val = GET_SMSTATE(u32, smbase, 0x7f60);
	ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1);

	/* CR0/CR4 are captured here and applied last, below. */
	cr0 =			   GET_SMSTATE(u64, smbase, 0x7f58);
	ctxt->ops->set_cr(ctxt, 3, GET_SMSTATE(u64, smbase, 0x7f50));
	cr4 =			   GET_SMSTATE(u64, smbase, 0x7f48);
	/* New SMBASE takes effect on the next SMM entry. */
	ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smbase, 0x7f00));
	/* EFER.LMA is cleared; it is re-derived when CR0 is written. */
	val = GET_SMSTATE(u64, smbase, 0x7ed0);
	ctxt->ops->set_msr(ctxt, MSR_EFER, val & ~EFER_LMA);

	/* Task register (selector, attributes<<8, limit, base, base3). */
	selector =		  GET_SMSTATE(u32, smbase, 0x7e90);
	rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, 0x7e92) << 8);
	set_desc_limit(&desc,	  GET_SMSTATE(u32, smbase, 0x7e94));
	set_desc_base(&desc,	  GET_SMSTATE(u32, smbase, 0x7e98));
	base3 =			  GET_SMSTATE(u32, smbase, 0x7e9c);
	ctxt->ops->set_segment(ctxt, selector, &desc, base3, VCPU_SREG_TR);

	/* IDTR. */
	dt.size =    GET_SMSTATE(u32, smbase, 0x7e84);
	dt.address = GET_SMSTATE(u64, smbase, 0x7e88);
	ctxt->ops->set_idt(ctxt, &dt);

	/* LDTR, same layout as TR. */
	selector =		  GET_SMSTATE(u32, smbase, 0x7e70);
	rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, 0x7e72) << 8);
	set_desc_limit(&desc,	  GET_SMSTATE(u32, smbase, 0x7e74));
	set_desc_base(&desc,	  GET_SMSTATE(u32, smbase, 0x7e78));
	base3 =			  GET_SMSTATE(u32, smbase, 0x7e7c);
	ctxt->ops->set_segment(ctxt, selector, &desc, base3, VCPU_SREG_LDTR);

	/* GDTR. */
	dt.size =    GET_SMSTATE(u32, smbase, 0x7e64);
	dt.address = GET_SMSTATE(u64, smbase, 0x7e68);
	ctxt->ops->set_gdt(ctxt, &dt);

	/* The six ordinary segment registers. */
	for (i = 0; i < 6; i++) {
		int r = rsm_load_seg_64(ctxt, smbase, i);
		if (r != X86EMUL_CONTINUE)
			return r;
	}

	/* Finally switch CR0/CR4 in a hardware-acceptable order. */
	return rsm_enter_protected_mode(ctxt, cr0, cr4);
}
2471+
22622472
/*
 * Emulate RSM: leave system management mode and reload the processor
 * state that was saved in SMRAM on SMM entry.  Supports both the 32-bit
 * and (when the guest has long mode) the 64-bit state save formats.
 */
static int em_rsm(struct x86_emulate_ctxt *ctxt)
{
	unsigned long cr0, cr4, efer;
	u64 smbase;
	int ret;

	/* RSM outside SMM is #UD. */
	if ((ctxt->emul_flags & X86EMUL_SMM_MASK) == 0)
		return emulate_ud(ctxt);

	/*
	 * Get back to real mode, to prepare a safe state in which to load
	 * CR0/CR3/CR4/EFER.  Also this will ensure that addresses passed
	 * to read_std/write_std are not virtual.
	 *
	 * CR4.PCIDE must be zero, because it is a 64-bit mode only feature.
	 */
	cr0 = ctxt->ops->get_cr(ctxt, 0);
	if (cr0 & X86_CR0_PE)
		ctxt->ops->set_cr(ctxt, 0, cr0 & ~(X86_CR0_PG | X86_CR0_PE));
	cr4 = ctxt->ops->get_cr(ctxt, 4);
	if (cr4 & X86_CR4_PAE)
		ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PAE);
	efer = 0;
	ctxt->ops->set_msr(ctxt, MSR_EFER, efer);

	/* The state save image lives at SMBASE + 0x8000. */
	smbase = ctxt->ops->get_smbase(ctxt);
	if (emulator_has_longmode(ctxt))
		ret = rsm_load_state_64(ctxt, smbase + 0x8000);
	else
		ret = rsm_load_state_32(ctxt, smbase + 0x8000);

	if (ret != X86EMUL_CONTINUE) {
		/* FIXME: should triple fault */
		return X86EMUL_UNHANDLEABLE;
	}

	/* Leaving SMM unmasks NMIs unless the SMI interrupted NMI handling. */
	if ((ctxt->emul_flags & X86EMUL_SMM_INSIDE_NMI_MASK) == 0)
		ctxt->ops->set_nmi_mask(ctxt, false);

	ctxt->emul_flags &= ~X86EMUL_SMM_INSIDE_NMI_MASK;
	ctxt->emul_flags &= ~X86EMUL_SMM_MASK;
	return X86EMUL_CONTINUE;
}
22692515

22702516
static void

arch/x86/kvm/trace.h

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -952,6 +952,28 @@ TRACE_EVENT(kvm_wait_lapic_expire,
952952
__entry->delta < 0 ? "early" : "late")
953953
);
954954

955+
/*
 * Tracepoint fired when a vCPU switches into or out of system management
 * mode; records the vcpu id, its SMBASE, and the direction of the switch.
 */
TRACE_EVENT(kvm_enter_smm,
	TP_PROTO(unsigned int vcpu_id, u64 smbase, bool entering),
	TP_ARGS(vcpu_id, smbase, entering),

	TP_STRUCT__entry(
		__field( unsigned int,	vcpu_id		)
		__field(	 u64,	smbase		)
		__field(	bool,	entering	)
	),

	TP_fast_assign(
		__entry->vcpu_id  = vcpu_id;
		__entry->smbase   = smbase;
		__entry->entering = entering;
	),

	TP_printk("vcpu %u: %s SMM, smbase 0x%llx",
		  __entry->vcpu_id,
		  __entry->entering ? "entering" : "leaving",
		  __entry->smbase)
);
976+
955977
#endif /* _TRACE_KVM_H */
956978

957979
#undef TRACE_INCLUDE_PATH

0 commit comments

Comments
 (0)