
Commit 002c5f7

Sean Christopherson authored and bonzini committed
KVM: x86/mmu: Reintroduce fast invalidate/zap for flushing memslot
James Harvey reported a livelock that was introduced by commit d012a06 ("Revert "KVM: x86/mmu: Zap only the relevant pages when removing a memslot"").

The livelock occurs because kvm_mmu_zap_all() as it exists today will voluntarily reschedule and drop KVM's mmu_lock, which allows other vCPUs to add shadow pages. With enough vCPUs, kvm_mmu_zap_all() can get stuck in an infinite loop as it can never zap all pages before observing lock contention or the need to reschedule. The equivalent of kvm_mmu_zap_all() that was in use at the time of the reverted commit (4e10313, "KVM: x86/mmu: Zap only the relevant pages when removing a memslot") employed a fast invalidate mechanism and was not susceptible to the above livelock.

There are three ways to fix the livelock:

- Reverting the revert (commit d012a06) is not a viable option, as the revert is needed to fix a regression that occurs when the guest has one or more assigned devices. It's unlikely we'll root cause the device assignment regression soon enough to fix the regression in a timely manner.

- Remove the conditional reschedule from kvm_mmu_zap_all(). Although removing the reschedule would be a smaller code change, it's less safe in the sense that the resulting kvm_mmu_zap_all() hasn't been used in the wild for flushing memslots since the fast invalidate mechanism was introduced by commit 6ca18b6 ("KVM: x86: use the fast way to invalidate all pages"), back in 2013.

- Reintroduce the fast invalidate mechanism and use it when zapping shadow pages in response to a memslot being deleted/moved, which is what this patch does.

For all intents and purposes, this is a revert of commit ea145aa ("Revert "KVM: MMU: fast invalidate all pages"") and a partial revert of commit 7390de1 ("Revert "KVM: x86: use the fast way to invalidate all pages""), i.e. it restores the behavior of commit 5304b8d ("KVM: MMU: fast invalidate all pages") and commit 6ca18b6 ("KVM: x86: use the fast way to invalidate all pages"), respectively.

Fixes: d012a06 ("Revert "KVM: x86/mmu: Zap only the relevant pages when removing a memslot"")
Reported-by: James Harvey <[email protected]>
Cc: Alex Williamson <[email protected]>
Cc: Paolo Bonzini <[email protected]>
Cc: [email protected]
Signed-off-by: Sean Christopherson <[email protected]>
Signed-off-by: Paolo Bonzini <[email protected]>
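The mechanism being reintroduced is easier to see in isolation: each shadow page records the generation it was created in, so bumping a single per-VM counter marks every existing page obsolete in O(1), and the actual freeing happens lazily. Below is a minimal standalone sketch of that idea, assuming hypothetical vm/shadow_page types that stand in for struct kvm_arch and struct kvm_mmu_page; it illustrates the technique and is not KVM's actual code.

#include <stdbool.h>

/* Hypothetical stand-ins for struct kvm_arch and struct kvm_mmu_page. */
struct shadow_page {
        unsigned long mmu_valid_gen;    /* generation this page was created in */
};

struct vm {
        unsigned long mmu_valid_gen;    /* current generation */
};

/* Mirrors is_obsolete_sp(): stale means "created in an older generation". */
static bool is_obsolete(const struct vm *vm, const struct shadow_page *sp)
{
        return sp->mmu_valid_gen != vm->mmu_valid_gen;
}

/* Pages are stamped with the current generation at creation time. */
static void init_page(const struct vm *vm, struct shadow_page *sp)
{
        sp->mmu_valid_gen = vm->mmu_valid_gen;
}

/*
 * O(1) "fast invalidate": a single increment makes every existing page
 * obsolete. Freeing is deferred to a lock-break walk, which can stop at
 * the first non-obsolete page it finds because new (valid) pages are
 * always added to the head of a FIFO list.
 */
static void invalidate_all(struct vm *vm)
{
        vm->mmu_valid_gen++;
}

The patch below implements exactly this shape: the struct fields added in kvm_host.h are the two generation counters, and kvm_mmu_zap_all_fast()/kvm_zap_obsolete_pages() are the increment and the deferred walk.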
1 parent 541ab2a commit 002c5f7

File tree

2 files changed: +101, -2 lines changed

arch/x86/include/asm/kvm_host.h

Lines changed: 2 additions & 0 deletions
@@ -335,6 +335,7 @@ struct kvm_mmu_page {
        int root_count;          /* Currently serving as active root */
        unsigned int unsync_children;
        struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */
+       unsigned long mmu_valid_gen;
        DECLARE_BITMAP(unsync_child_bitmap, 512);

 #ifdef CONFIG_X86_32
@@ -856,6 +857,7 @@ struct kvm_arch {
        unsigned long n_requested_mmu_pages;
        unsigned long n_max_mmu_pages;
        unsigned int indirect_shadow_pages;
+       unsigned long mmu_valid_gen;
        struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
        /*
         * Hash table of struct kvm_mmu_page.

arch/x86/kvm/mmu.c

Lines changed: 99 additions & 2 deletions
@@ -2095,6 +2095,12 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct
        if (!direct)
                sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
        set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
+
+       /*
+        * active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages()
+        * depends on valid pages being added to the head of the list. See
+        * comments in kvm_zap_obsolete_pages().
+        */
        list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
        kvm_mod_used_mmu_pages(vcpu->kvm, +1);
        return sp;
@@ -2244,7 +2250,7 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
 #define for_each_valid_sp(_kvm, _sp, _gfn)                             \
        hlist_for_each_entry(_sp,                                       \
          &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
-               if ((_sp)->role.invalid) {                              \
+               if (is_obsolete_sp((_kvm), (_sp)) || (_sp)->role.invalid) { \
                } else

 #define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn)                \
@@ -2301,6 +2307,11 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { }
 static void mmu_audit_disable(void) { }
 #endif

+static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+       return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
+}
+
 static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
                          struct list_head *invalid_list)
 {
@@ -2525,6 +2536,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
                if (level > PT_PAGE_TABLE_LEVEL && need_sync)
                        flush |= kvm_sync_pages(vcpu, gfn, &invalid_list);
        }
+       sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
        clear_page(sp->spt);
        trace_kvm_mmu_get_page(sp, true);

@@ -4233,6 +4245,13 @@ static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3,
                return false;

        if (cached_root_available(vcpu, new_cr3, new_role)) {
+               /*
+                * It is possible that the cached previous root page is
+                * obsolete because of a change in the MMU generation
+                * number. However, changing the generation number is
+                * accompanied by KVM_REQ_MMU_RELOAD, which will free
+                * the root set here and allocate a new one.
+                */
                kvm_make_request(KVM_REQ_LOAD_CR3, vcpu);
                if (!skip_tlb_flush) {
                        kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
@@ -5649,11 +5668,89 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
        return alloc_mmu_pages(vcpu);
 }

+
+static void kvm_zap_obsolete_pages(struct kvm *kvm)
+{
+       struct kvm_mmu_page *sp, *node;
+       LIST_HEAD(invalid_list);
+       int ign;
+
+restart:
+       list_for_each_entry_safe_reverse(sp, node,
+             &kvm->arch.active_mmu_pages, link) {
+               /*
+                * No obsolete valid page exists before a newly created page
+                * since active_mmu_pages is a FIFO list.
+                */
+               if (!is_obsolete_sp(kvm, sp))
+                       break;
+
+               /*
+                * Do not repeatedly zap a root page to avoid unnecessary
+                * KVM_REQ_MMU_RELOAD; otherwise we may not be able to make
+                * progress:
+                *    vcpu 0                        vcpu 1
+                *                          call vcpu_enter_guest():
+                *                             1): handle KVM_REQ_MMU_RELOAD
+                *                                 and require mmu-lock to
+                *                                 load mmu
+                * repeat:
+                *    1): zap root page and
+                *        send KVM_REQ_MMU_RELOAD
+                *
+                *    2): if (cond_resched_lock(mmu-lock))
+                *
+                *                             2): hold mmu-lock and load mmu
+                *
+                *                             3): see KVM_REQ_MMU_RELOAD bit
+                *                                 on vcpu->requests is set
+                *                                 then return 1 to call
+                *                                 vcpu_enter_guest() again.
+                *            goto repeat;
+                *
+                * Since we are walking the list in reverse and invalid
+                * pages are moved to the head, skipping invalid pages
+                * helps us avoid walking the list forever.
+                */
+               if (sp->role.invalid)
+                       continue;
+
+               if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
+                       kvm_mmu_commit_zap_page(kvm, &invalid_list);
+                       cond_resched_lock(&kvm->mmu_lock);
+                       goto restart;
+               }
+
+               if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign))
+                       goto restart;
+       }
+
+       kvm_mmu_commit_zap_page(kvm, &invalid_list);
+}
+
+/*
+ * Fast invalidate all shadow pages and use the lock-break technique
+ * to zap obsolete pages.
+ *
+ * It's required when a memslot is being deleted or the VM is being
+ * destroyed; in these cases, we must ensure that the KVM MMU no longer
+ * uses any resource of the slot being deleted, or of any slot, after
+ * this function returns.
+ */
+static void kvm_mmu_zap_all_fast(struct kvm *kvm)
+{
+       spin_lock(&kvm->mmu_lock);
+       kvm->arch.mmu_valid_gen++;
+
+       kvm_zap_obsolete_pages(kvm);
+       spin_unlock(&kvm->mmu_lock);
+}
+
 static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
                        struct kvm_memory_slot *slot,
                        struct kvm_page_track_notifier_node *node)
 {
-       kvm_mmu_zap_all(kvm);
+       kvm_mmu_zap_all_fast(kvm);
 }

 void kvm_mmu_init_vm(struct kvm *kvm)
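The "lock-break technique" named in the comment above generalizes beyond KVM: do a bounded amount of work per lock hold, and when another thread needs the lock (or a reschedule is pending), commit any batched state, drop the lock, yield, and restart from a consistent point. A rough userspace analogue is sketched below; the queue/item types and the drain()/process_one() names are hypothetical, with a pthread mutex standing in for mmu_lock.

#include <pthread.h>
#include <sched.h>
#include <stdbool.h>

struct item {
        struct item *next;
};

/* Hypothetical work queue guarded by a lock, mirroring mmu_lock. */
struct queue {
        pthread_mutex_t lock;
        struct item *head;
        volatile bool contended;        /* set by threads waiting on 'lock' */
};

static void process_one(struct item *it)
{
        (void)it;       /* placeholder for per-item work, e.g. zapping a page */
}

/*
 * Lock-break loop: when another thread is waiting, flush partial state,
 * drop the lock, yield, reacquire, and re-walk from a safe point. This
 * mirrors the roles of kvm_mmu_commit_zap_page(), spin_needbreak() and
 * cond_resched_lock() in kvm_zap_obsolete_pages() above.
 */
static void drain(struct queue *q)
{
        pthread_mutex_lock(&q->lock);
restart:
        while (q->head) {
                if (q->contended) {
                        pthread_mutex_unlock(&q->lock);
                        sched_yield();
                        pthread_mutex_lock(&q->lock);
                        goto restart;   /* list may have changed; re-walk */
                }
                struct item *it = q->head;
                q->head = it->next;
                process_one(it);
        }
        pthread_mutex_unlock(&q->lock);
}

Note the ordering that made the original livelock possible and that this patch sidesteps: dropping the lock lets other threads add work, so a loop that insists on finishing all work before returning may never terminate. The generation-number scheme avoids that because pages created after the bump are not obsolete, and the reverse FIFO walk stops as soon as it reaches them.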
