
Commit c2a2ac2

Xiao Guangrong authored and avikivity committed

KVM: MMU: lockless walking shadow page table

Use RCU to protect the shadow page tables being freed, so that they can be walked safely without holding mmu_lock. The lockless walk should run fast and is needed by the MMIO page fault path.

Signed-off-by: Xiao Guangrong <[email protected]>
Signed-off-by: Avi Kivity <[email protected]>

1 parent 603e065 commit c2a2ac2
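Taken together, the patch adds a reader-side bracket (walk_shadow_page_lockless_begin/end), a lockless spte read (mmu_spte_get_lockless) with a walk macro, and an RCU-deferred free path for zapped shadow pages. Below is a minimal sketch of how a caller is expected to combine the new pieces, mirroring the kvm_mmu_get_spte_hierarchy() change in the diff; it assumes the kernel MMU context, and count_present_levels() is a hypothetical caller, not part of the patch.

/*
 * Sketch only: restates the usage pattern this commit introduces, using the
 * helpers and the for_each_shadow_entry_lockless() macro added below.
 * count_present_levels() is hypothetical and not part of the patch.
 */
static int count_present_levels(struct kvm_vcpu *vcpu, u64 addr)
{
        struct kvm_shadow_walk_iterator iterator;
        u64 spte;
        int levels = 0;

        /* rcu_read_lock() + bump kvm->arch.reader_counter */
        walk_shadow_page_lockless_begin(vcpu);

        /* Each iteration snapshots the current spte once, locklessly. */
        for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) {
                if (!is_shadow_present_pte(spte))
                        break;
                levels++;
        }

        /* Drop the counter and leave the RCU read-side critical section. */
        walk_shadow_page_lockless_end(vcpu);

        return levels;
}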

File tree

2 files changed: +132 −8 lines changed

arch/x86/include/asm/kvm_host.h

Lines changed: 8 additions & 0 deletions

@@ -233,6 +233,12 @@ struct kvm_mmu_page {
         unsigned int unsync_children;
         unsigned long parent_ptes;      /* Reverse mapping for parent_pte */
         DECLARE_BITMAP(unsync_child_bitmap, 512);
+
+#ifdef CONFIG_X86_32
+        int clear_spte_count;
+#endif
+
+        struct rcu_head rcu;
 };
 
 struct kvm_pv_mmu_op_buffer {
@@ -486,6 +492,8 @@ struct kvm_arch {
         u64 hv_guest_os_id;
         u64 hv_hypercall;
 
+        atomic_t reader_counter;
+
 #ifdef CONFIG_KVM_MMU_AUDIT
         int audit_point;
 #endif

arch/x86/kvm/mmu.c

Lines changed: 124 additions & 8 deletions
@@ -182,6 +182,12 @@ struct kvm_shadow_walk_iterator {
              shadow_walk_okay(&(_walker));                      \
              shadow_walk_next(&(_walker)))
 
+#define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte)    \
+        for (shadow_walk_init(&(_walker), _vcpu, _addr);               \
+             shadow_walk_okay(&(_walker)) &&                           \
+                ({ spte = mmu_spte_get_lockless(_walker.sptep); 1; }); \
+             __shadow_walk_next(&(_walker), spte))
+
 static struct kmem_cache *pte_list_desc_cache;
 static struct kmem_cache *mmu_page_header_cache;
 static struct percpu_counter kvm_total_used_mmu_pages;
@@ -274,6 +280,11 @@ static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
 {
         return xchg(sptep, spte);
 }
+
+static u64 __get_spte_lockless(u64 *sptep)
+{
+        return ACCESS_ONCE(*sptep);
+}
 #else
 union split_spte {
         struct {
@@ -283,6 +294,18 @@ union split_spte {
         u64 spte;
 };
 
+static void count_spte_clear(u64 *sptep, u64 spte)
+{
+        struct kvm_mmu_page *sp = page_header(__pa(sptep));
+
+        if (is_shadow_present_pte(spte))
+                return;
+
+        /* Ensure the spte is completely set before we increase the count */
+        smp_wmb();
+        sp->clear_spte_count++;
+}
+
 static void __set_spte(u64 *sptep, u64 spte)
 {
         union split_spte *ssptep, sspte;
@@ -318,6 +341,7 @@ static void __update_clear_spte_fast(u64 *sptep, u64 spte)
         smp_wmb();
 
         ssptep->spte_high = sspte.spte_high;
+        count_spte_clear(sptep, spte);
 }
 
 static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
@@ -330,9 +354,40 @@ static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
         /* xchg acts as a barrier before the setting of the high bits */
         orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low);
         orig.spte_high = ssptep->spte_high = sspte.spte_high;
+        count_spte_clear(sptep, spte);
 
         return orig.spte;
 }
+
+/*
+ * The idea using the light way get the spte on x86_32 guest is from
+ * gup_get_pte(arch/x86/mm/gup.c).
+ * The difference is we can not catch the spte tlb flush if we leave
+ * guest mode, so we emulate it by increase clear_spte_count when spte
+ * is cleared.
+ */
+static u64 __get_spte_lockless(u64 *sptep)
+{
+        struct kvm_mmu_page *sp = page_header(__pa(sptep));
+        union split_spte spte, *orig = (union split_spte *)sptep;
+        int count;
+
+retry:
+        count = sp->clear_spte_count;
+        smp_rmb();
+
+        spte.spte_low = orig->spte_low;
+        smp_rmb();
+
+        spte.spte_high = orig->spte_high;
+        smp_rmb();
+
+        if (unlikely(spte.spte_low != orig->spte_low ||
+              count != sp->clear_spte_count))
+                goto retry;
+
+        return spte.spte;
+}
 #endif
 
 static bool spte_has_volatile_bits(u64 spte)
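The retry loop in the hunk above is a seqcount-like technique borrowed from gup_get_pte(): on 32-bit hosts a 64-bit spte cannot be read atomically, so the reader samples the two halves and retries if a concurrent clear changed the low half or bumped clear_spte_count. Below is a small, self-contained userspace analogue of that read side, offered only as an illustration; the struct, field names, and barrier() macro are stand-ins and not the kernel's, and the kernel version uses smp_rmb() where this demo relies on a compiler barrier.

/* Userspace analogue of the retry read above; compile with: gcc -O2 demo.c */
#include <stdint.h>
#include <stdio.h>

struct split64 {
        volatile uint32_t lo;
        volatile uint32_t hi;
        volatile int clear_count;       /* bumped by the "writer" after a clear */
};

#define barrier() __asm__ __volatile__("" ::: "memory")

static uint64_t get_lockless(struct split64 *v)
{
        uint32_t lo, hi;
        int count;

retry:
        count = v->clear_count;
        barrier();                      /* the kernel uses smp_rmb() here */

        lo = v->lo;
        barrier();

        hi = v->hi;
        barrier();

        /* A concurrent clear moved the low half or bumped the count: retry. */
        if (lo != v->lo || count != v->clear_count)
                goto retry;

        return ((uint64_t)hi << 32) | lo;
}

int main(void)
{
        struct split64 v = { .lo = 0x11223344u, .hi = 0x55667788u, .clear_count = 0 };

        printf("value = %#llx\n", (unsigned long long)get_lockless(&v));
        return 0;
}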
@@ -435,6 +490,28 @@ static void mmu_spte_clear_no_track(u64 *sptep)
         __update_clear_spte_fast(sptep, 0ull);
 }
 
+static u64 mmu_spte_get_lockless(u64 *sptep)
+{
+        return __get_spte_lockless(sptep);
+}
+
+static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
+{
+        rcu_read_lock();
+        atomic_inc(&vcpu->kvm->arch.reader_counter);
+
+        /* Increase the counter before walking shadow page table */
+        smp_mb__after_atomic_inc();
+}
+
+static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
+{
+        /* Decrease the counter after walking shadow page table finished */
+        smp_mb__before_atomic_dec();
+        atomic_dec(&vcpu->kvm->arch.reader_counter);
+        rcu_read_unlock();
+}
+
 static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
                                   struct kmem_cache *base_cache, int min)
 {
@@ -1597,17 +1674,23 @@ static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
         return true;
 }
 
-static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
+static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator,
+                               u64 spte)
 {
-        if (is_last_spte(*iterator->sptep, iterator->level)) {
+        if (is_last_spte(spte, iterator->level)) {
                 iterator->level = 0;
                 return;
         }
 
-        iterator->shadow_addr = *iterator->sptep & PT64_BASE_ADDR_MASK;
+        iterator->shadow_addr = spte & PT64_BASE_ADDR_MASK;
         --iterator->level;
 }
 
+static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
+{
+        return __shadow_walk_next(iterator, *iterator->sptep);
+}
+
 static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
 {
         u64 spte;
@@ -1754,6 +1837,30 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
         return ret;
 }
 
+static void kvm_mmu_isolate_pages(struct list_head *invalid_list)
+{
+        struct kvm_mmu_page *sp;
+
+        list_for_each_entry(sp, invalid_list, link)
+                kvm_mmu_isolate_page(sp);
+}
+
+static void free_pages_rcu(struct rcu_head *head)
+{
+        struct kvm_mmu_page *next, *sp;
+
+        sp = container_of(head, struct kvm_mmu_page, rcu);
+        while (sp) {
+                if (!list_empty(&sp->link))
+                        next = list_first_entry(&sp->link,
+                                      struct kvm_mmu_page, link);
+                else
+                        next = NULL;
+                kvm_mmu_free_page(sp);
+                sp = next;
+        }
+}
+
 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
                                     struct list_head *invalid_list)
 {
@@ -1764,6 +1871,14 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
 
         kvm_flush_remote_tlbs(kvm);
 
+        if (atomic_read(&kvm->arch.reader_counter)) {
+                kvm_mmu_isolate_pages(invalid_list);
+                sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
+                list_del_init(invalid_list);
+                call_rcu(&sp->rcu, free_pages_rcu);
+                return;
+        }
+
         do {
                 sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
                 WARN_ON(!sp->role.invalid || sp->root_count);
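In the hunk above, kvm_mmu_commit_zap_page() now checks reader_counter: if any vcpu may be inside a lockless walk, the invalid pages are isolated and handed to call_rcu(), and free_pages_rcu() later recovers each kvm_mmu_page from its embedded rcu_head. The following tiny standalone illustration shows only that container_of() recovery step; the types are stand-ins, not the kernel's, and no real RCU machinery is involved.

/* Standalone illustration of the container_of() recovery used by free_pages_rcu(). */
#include <stddef.h>
#include <stdio.h>

struct rcu_head { void *next; };        /* stand-in for the kernel's struct rcu_head */

struct mmu_page_stub {
        int id;
        struct rcu_head rcu;            /* embedded, as in struct kvm_mmu_page */
};

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

/* Plays the role of free_pages_rcu(): called with only the rcu_head pointer. */
static void free_cb(struct rcu_head *head)
{
        struct mmu_page_stub *sp = container_of(head, struct mmu_page_stub, rcu);

        printf("freeing page %d after the grace period\n", sp->id);
}

int main(void)
{
        struct mmu_page_stub sp = { .id = 42 };

        /* In the patch the callback is scheduled via call_rcu(&sp->rcu, free_pages_rcu). */
        free_cb(&sp.rcu);
        return 0;
}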
@@ -3784,16 +3899,17 @@ int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
 int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
 {
         struct kvm_shadow_walk_iterator iterator;
+        u64 spte;
         int nr_sptes = 0;
 
-        spin_lock(&vcpu->kvm->mmu_lock);
-        for_each_shadow_entry(vcpu, addr, iterator) {
-                sptes[iterator.level-1] = *iterator.sptep;
+        walk_shadow_page_lockless_begin(vcpu);
+        for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) {
+                sptes[iterator.level-1] = spte;
                 nr_sptes++;
-                if (!is_shadow_present_pte(*iterator.sptep))
+                if (!is_shadow_present_pte(spte))
                         break;
         }
-        spin_unlock(&vcpu->kvm->mmu_lock);
+        walk_shadow_page_lockless_end(vcpu);
 
         return nr_sptes;
 }
