
Commit ce88dec

Xiao Guangrong authored and avikivity committed
KVM: MMU: mmio page fault support
The idea is from Avi:

| We could cache the result of a miss in an spte by using a reserved bit, and
| checking the page fault error code (or seeing if we get an ept violation or
| ept misconfiguration), so if we get repeated mmio on a page, we don't need to
| search the slot list/tree.
| (https://lkml.org/lkml/2011/2/22/221)

When the page fault is caused by mmio, we cache the info in the shadow page
table, and also set the reserved bits in the shadow page table, so if the mmio
is caused again, we can quickly identify it and emulate it directly.

Searching the mmio gfn in memslots is heavy since we need to walk all memslots;
it can be reduced by this feature, which also avoids walking the guest page
table for soft mmu.

[jan: fix operator precedence issue]

Signed-off-by: Xiao Guangrong <[email protected]>
Signed-off-by: Jan Kiszka <[email protected]>
Signed-off-by: Avi Kivity <[email protected]>
1 parent dd3bfd5 commit ce88dec
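The core of the patch is a small encoding trick: when a fault hits a gfn that has no memslot, the gfn and the relevant access bits are packed into the spte together with a dedicated pattern of reserved bits (shadow_mmio_mask), so the next fault on the same address is flagged by the hardware (reserved-bit page fault or EPT misconfiguration) and can be decoded straight from the spte instead of searching the memslots. Below is a minimal standalone sketch of that encoding, mirroring mark_mmio_spte()/get_mmio_spte_gfn()/get_mmio_spte_access() from the diff; the mask value and the example gfn are made up for illustration and are not what KVM actually programs.

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT	12
#define ACC_WRITE_MASK	(1ULL << 1)
#define ACC_USER_MASK	(1ULL << 2)
/* Hypothetical reserved-bit pattern; the real one is installed via kvm_mmu_set_mmio_spte_mask(). */
#define MMIO_SPTE_MASK	(3ULL << 52)

/* Pack gfn + access into an spte tagged with the mmio mask (cf. mark_mmio_spte()). */
static uint64_t mark_mmio(uint64_t gfn, uint64_t access)
{
	access &= ACC_WRITE_MASK | ACC_USER_MASK;
	return MMIO_SPTE_MASK | access | (gfn << PAGE_SHIFT);
}

static int is_mmio(uint64_t spte)
{
	return (spte & MMIO_SPTE_MASK) == MMIO_SPTE_MASK;
}

int main(void)
{
	/* Example only: pretend gfn 0xfee00 faulted with a write access. */
	uint64_t spte = mark_mmio(0xfee00ULL, ACC_WRITE_MASK);

	if (is_mmio(spte))
		printf("cached mmio fault: gfn=%#llx access=%#llx\n",
		       (unsigned long long)((spte & ~MMIO_SPTE_MASK) >> PAGE_SHIFT),
		       (unsigned long long)((spte & ~MMIO_SPTE_MASK) & 0xfff));
	return 0;
}

On the next fault the error code already says "reserved bits set", so decoding the spte as above replaces the memslot walk.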

6 files changed, 255 additions (+), 14 deletions (-)

arch/x86/kvm/mmu.c

Lines changed: 185 additions & 7 deletions
@@ -197,6 +197,47 @@ static u64 __read_mostly shadow_x_mask;	/* mutual exclusive with nx_mask */
 static u64 __read_mostly shadow_user_mask;
 static u64 __read_mostly shadow_accessed_mask;
 static u64 __read_mostly shadow_dirty_mask;
+static u64 __read_mostly shadow_mmio_mask;
+
+static void mmu_spte_set(u64 *sptep, u64 spte);
+
+void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask)
+{
+	shadow_mmio_mask = mmio_mask;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
+
+static void mark_mmio_spte(u64 *sptep, u64 gfn, unsigned access)
+{
+	access &= ACC_WRITE_MASK | ACC_USER_MASK;
+
+	mmu_spte_set(sptep, shadow_mmio_mask | access | gfn << PAGE_SHIFT);
+}
+
+static bool is_mmio_spte(u64 spte)
+{
+	return (spte & shadow_mmio_mask) == shadow_mmio_mask;
+}
+
+static gfn_t get_mmio_spte_gfn(u64 spte)
+{
+	return (spte & ~shadow_mmio_mask) >> PAGE_SHIFT;
+}
+
+static unsigned get_mmio_spte_access(u64 spte)
+{
+	return (spte & ~shadow_mmio_mask) & ~PAGE_MASK;
+}
+
+static bool set_mmio_spte(u64 *sptep, gfn_t gfn, pfn_t pfn, unsigned access)
+{
+	if (unlikely(is_noslot_pfn(pfn))) {
+		mark_mmio_spte(sptep, gfn, access);
+		return true;
+	}
+
+	return false;
+}
 
 static inline u64 rsvd_bits(int s, int e)
 {
@@ -226,7 +267,7 @@ static int is_nx(struct kvm_vcpu *vcpu)
 
 static int is_shadow_present_pte(u64 pte)
 {
-	return pte & PT_PRESENT_MASK;
+	return pte & PT_PRESENT_MASK && !is_mmio_spte(pte);
 }
 
 static int is_large_pte(u64 pte)
@@ -285,6 +326,12 @@ static u64 __get_spte_lockless(u64 *sptep)
 {
 	return ACCESS_ONCE(*sptep);
 }
+
+static bool __check_direct_spte_mmio_pf(u64 spte)
+{
+	/* It is valid if the spte is zapped. */
+	return spte == 0ull;
+}
 #else
 union split_spte {
 	struct {
@@ -388,6 +435,23 @@ static u64 __get_spte_lockless(u64 *sptep)
 
 	return spte.spte;
 }
+
+static bool __check_direct_spte_mmio_pf(u64 spte)
+{
+	union split_spte sspte = (union split_spte)spte;
+	u32 high_mmio_mask = shadow_mmio_mask >> 32;
+
+	/* It is valid if the spte is zapped. */
+	if (spte == 0ull)
+		return true;
+
+	/* It is valid if the spte is being zapped. */
+	if (sspte.spte_low == 0ull &&
+	    (sspte.spte_high & high_mmio_mask) == high_mmio_mask)
+		return true;
+
+	return false;
+}
 #endif
 
 static bool spte_has_volatile_bits(u64 spte)
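On 32-bit PAE hosts the spte is stored and cleared as two 32-bit halves, so a lockless reader can observe a zap half-done: the low word already 0 while the high word still carries the mmio mask bits. The 32-bit __check_direct_spte_mmio_pf() above treats exactly that intermediate state as benign. A hedged standalone illustration of the check follows; the union mirrors the kernel's split_spte layout on little-endian x86, and the mask value is invented for the example.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical mask of reserved bits living in the high 32 bits of the spte. */
#define MMIO_MASK	(3ULL << 52)

union split_spte {			/* mirrors the kernel's split_spte */
	struct {
		uint32_t spte_low;
		uint32_t spte_high;
	};
	uint64_t spte;
};

/* Same test as __check_direct_spte_mmio_pf() in the patch. */
static int direct_mmio_pf_ok(uint64_t spte)
{
	union split_spte sspte = { .spte = spte };
	uint32_t high_mmio_mask = MMIO_MASK >> 32;

	if (spte == 0)				/* fully zapped */
		return 1;
	if (sspte.spte_low == 0 &&		/* low half cleared first ... */
	    (sspte.spte_high & high_mmio_mask) == high_mmio_mask)
		return 1;			/* ... mmio spte being zapped */
	return 0;
}

int main(void)
{
	uint64_t mmio_spte = MMIO_MASK | (0xfee00ULL << 12);
	uint64_t half_zapped = mmio_spte & ~0xffffffffULL;	/* low word cleared first */

	printf("zapped: %d  being zapped: %d  anything else: %d\n",
	       direct_mmio_pf_ok(0), direct_mmio_pf_ok(half_zapped),
	       direct_mmio_pf_ok(0x1234));
	return 0;
}

Any other non-present, non-mmio value seen on the direct path means something went wrong, which is why the caller turns a false result into -1.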
@@ -1745,7 +1809,8 @@ static void mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
 			child = page_header(pte & PT64_BASE_ADDR_MASK);
 			drop_parent_pte(child, spte);
 		}
-	}
+	} else if (is_mmio_spte(pte))
+		mmu_spte_clear_no_track(spte);
 
 	if (is_large_pte(pte))
 		--kvm->stat.lpages;
@@ -2120,6 +2185,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 	u64 spte, entry = *sptep;
 	int ret = 0;
 
+	if (set_mmio_spte(sptep, gfn, pfn, pte_access))
+		return 0;
+
 	/*
 	 * We don't set the accessed bit, since we sometimes want to see
 	 * whether the guest actually used the pte (in order to detect
@@ -2255,6 +2323,9 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		kvm_mmu_flush_tlb(vcpu);
 	}
 
+	if (unlikely(is_mmio_spte(*sptep) && emulate))
+		*emulate = 1;
+
 	pgprintk("%s: setting spte %llx\n", __func__, *sptep);
 	pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n",
 		 is_large_pte(*sptep)? "2MB" : "4kB",
@@ -2481,7 +2552,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
 
 static bool mmu_invalid_pfn(pfn_t pfn)
 {
-	return unlikely(is_invalid_pfn(pfn) || is_noslot_pfn(pfn));
+	return unlikely(is_invalid_pfn(pfn));
 }
 
 static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
@@ -2495,11 +2566,8 @@ static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
 		goto exit;
 	}
 
-	if (unlikely(is_noslot_pfn(pfn))) {
+	if (unlikely(is_noslot_pfn(pfn)))
 		vcpu_cache_mmio_info(vcpu, gva, gfn, access);
-		*ret_val = 1;
-		goto exit;
-	}
 
 	ret = false;
 exit:
@@ -2813,13 +2881,103 @@ static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
 	return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access);
 }
 
+static bool quickly_check_mmio_pf(struct kvm_vcpu *vcpu, u64 addr, bool direct)
+{
+	if (direct)
+		return vcpu_match_mmio_gpa(vcpu, addr);
+
+	return vcpu_match_mmio_gva(vcpu, addr);
+}
+
+
+/*
+ * On direct hosts, the last spte only allows two states
+ * for mmio page fault:
+ * - It is the mmio spte
+ * - It is zapped or it is being zapped.
+ *
+ * This function completely checks the spte when the last spte
+ * is not the mmio spte.
+ */
+static bool check_direct_spte_mmio_pf(u64 spte)
+{
+	return __check_direct_spte_mmio_pf(spte);
+}
+
+static u64 walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr)
+{
+	struct kvm_shadow_walk_iterator iterator;
+	u64 spte = 0ull;
+
+	walk_shadow_page_lockless_begin(vcpu);
+	for_each_shadow_entry_lockless(vcpu, addr, iterator, spte)
+		if (!is_shadow_present_pte(spte))
+			break;
+	walk_shadow_page_lockless_end(vcpu);
+
+	return spte;
+}
+
+/*
+ * If it is a real mmio page fault, return 1 and emulate the instruction
+ * directly, return 0 to let CPU fault again on the address, -1 is
+ * returned if bug is detected.
+ */
+int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct)
+{
+	u64 spte;
+
+	if (quickly_check_mmio_pf(vcpu, addr, direct))
+		return 1;
+
+	spte = walk_shadow_page_get_mmio_spte(vcpu, addr);
+
+	if (is_mmio_spte(spte)) {
+		gfn_t gfn = get_mmio_spte_gfn(spte);
+		unsigned access = get_mmio_spte_access(spte);
+
+		if (direct)
+			addr = 0;
+		vcpu_cache_mmio_info(vcpu, addr, gfn, access);
+		return 1;
+	}
+
+	/*
+	 * It's ok if the gva is remapped by other cpus on shadow guest,
+	 * it's a BUG if the gfn is not a mmio page.
+	 */
+	if (direct && !check_direct_spte_mmio_pf(spte))
+		return -1;
+
+	/*
+	 * If the page table is zapped by other cpus, let CPU fault again on
+	 * the address.
+	 */
+	return 0;
+}
+EXPORT_SYMBOL_GPL(handle_mmio_page_fault_common);
+
+static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr,
+				  u32 error_code, bool direct)
+{
+	int ret;
+
+	ret = handle_mmio_page_fault_common(vcpu, addr, direct);
+	WARN_ON(ret < 0);
+	return ret;
+}
+
 static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
 				u32 error_code, bool prefault)
 {
 	gfn_t gfn;
 	int r;
 
 	pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
+
+	if (unlikely(error_code & PFERR_RSVD_MASK))
+		return handle_mmio_page_fault(vcpu, gva, error_code, true);
+
 	r = mmu_topup_memory_caches(vcpu);
 	if (r)
 		return r;
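handle_mmio_page_fault_common() establishes a three-way contract for its callers: 1 means the fault really was mmio and the access should be emulated, 0 means the spte changed under us and the guest should simply fault again, and a negative value flags a bug (hence the WARN_ON in the wrapper above). A hedged sketch of acting on that contract follows; everything in it is a placeholder stub for illustration, not the code the rest of this commit (the files not shown in this excerpt) actually adds.

#include <stdio.h>

/*
 * Placeholder for handle_mmio_page_fault_common(): the real function walks
 * the shadow page table; this stub just returns a canned value so the
 * dispatch below runs as an ordinary program.
 */
static int fake_handle_mmio_page_fault_common(unsigned long long addr, int direct)
{
	(void)addr;
	(void)direct;
	return 1;		/* pretend a cached mmio spte was found */
}

static const char *dispatch(unsigned long long addr, int direct)
{
	int ret = fake_handle_mmio_page_fault_common(addr, direct);

	if (ret == 1)
		return "emulate the mmio access";
	if (ret == 0)
		return "let the guest retry the access";
	return "bug: unexpected last-spte state";	/* ret < 0 */
}

int main(void)
{
	printf("%s\n", dispatch(0xfee00000ull, 1));
	return 0;
}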
@@ -2896,6 +3054,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 	ASSERT(vcpu);
 	ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
 
+	if (unlikely(error_code & PFERR_RSVD_MASK))
+		return handle_mmio_page_fault(vcpu, gpa, error_code, true);
+
 	r = mmu_topup_memory_caches(vcpu);
 	if (r)
 		return r;
@@ -2993,6 +3154,23 @@ static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
 	return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
 }
 
+static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access,
+			   int *nr_present)
+{
+	if (unlikely(is_mmio_spte(*sptep))) {
+		if (gfn != get_mmio_spte_gfn(*sptep)) {
+			mmu_spte_clear_no_track(sptep);
+			return true;
+		}
+
+		(*nr_present)++;
+		mark_mmio_spte(sptep, gfn, access);
+		return true;
+	}
+
+	return false;
+}
+
 #define PTTYPE 64
 #include "paging_tmpl.h"
 #undef PTTYPE

arch/x86/kvm/mmu.h

Lines changed: 2 additions & 0 deletions
@@ -49,6 +49,8 @@
 #define PFERR_FETCH_MASK (1U << 4)
 
 int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]);
+void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask);
+int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct);
 int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
 
 static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
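These declarations let the rest of the tree install and consume the tag: kvm_mmu_set_mmio_spte_mask() stores whatever reserved-bit pattern the caller chooses, and handle_mmio_page_fault_common() decodes sptes tagged with it. The practical constraint is that the pattern must never occur in a legitimate spte. Below is a small hedged example of building such a mask with the same bit-range helper shape as rsvd_bits() in mmu.c; the 40-bit physical-address width and the 40..51 bit range are assumptions made up for the example, not the values KVM uses.

#include <stdint.h>
#include <stdio.h>
#include <assert.h>

/* Same shape as rsvd_bits() in mmu.c: bits s..e set, inclusive. */
static uint64_t rsvd_bits(int s, int e)
{
	return ((1ULL << (e - s + 1)) - 1) << s;
}

int main(void)
{
	/* Assumed geometry for the example: 40 bits of guest physical address. */
	int maxphyaddr = 40;

	/* Pick reserved bits above the address range to tag mmio sptes. */
	uint64_t mmio_mask = rsvd_bits(maxphyaddr, 51);

	/* The tag must not overlap the pfn bits an ordinary spte may carry. */
	uint64_t pfn_bits = rsvd_bits(12, maxphyaddr - 1);
	assert((mmio_mask & pfn_bits) == 0);

	printf("mmio mask: %#llx\n", (unsigned long long)mmio_mask);
	/* A caller would now hand this to kvm_mmu_set_mmio_spte_mask(mmio_mask). */
	return 0;
}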

arch/x86/kvm/paging_tmpl.h

Lines changed: 15 additions & 6 deletions
@@ -577,6 +577,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 
 	pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
 
+	if (unlikely(error_code & PFERR_RSVD_MASK))
+		return handle_mmio_page_fault(vcpu, addr, error_code,
+					      mmu_is_nested(vcpu));
+
 	r = mmu_topup_memory_caches(vcpu);
 	if (r)
 		return r;
@@ -684,7 +688,8 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
 					--vcpu->kvm->stat.lpages;
 				drop_spte(vcpu->kvm, sptep);
 				need_flush = 1;
-			}
+			} else if (is_mmio_spte(*sptep))
+				mmu_spte_clear_no_track(sptep);
 
 			break;
 		}
@@ -780,7 +785,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 		gpa_t pte_gpa;
 		gfn_t gfn;
 
-		if (!is_shadow_present_pte(sp->spt[i]))
+		if (!sp->spt[i])
 			continue;
 
 		pte_gpa = first_pte_gpa + i * sizeof(pt_element_t);
@@ -789,22 +794,26 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 					  sizeof(pt_element_t)))
 			return -EINVAL;
 
-		gfn = gpte_to_gfn(gpte);
-
 		if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
 			vcpu->kvm->tlbs_dirty++;
 			continue;
 		}
 
+		gfn = gpte_to_gfn(gpte);
+		pte_access = sp->role.access;
+		pte_access &= FNAME(gpte_access)(vcpu, gpte, true);
+
+		if (sync_mmio_spte(&sp->spt[i], gfn, pte_access, &nr_present))
+			continue;
+
 		if (gfn != sp->gfns[i]) {
 			drop_spte(vcpu->kvm, &sp->spt[i]);
 			vcpu->kvm->tlbs_dirty++;
 			continue;
 		}
 
 		nr_present++;
-		pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte,
-								  true);
+
 		host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE;
 
 		set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
