
Commit 137e0ec

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull kvm fixes from Paolo Bonzini:
 "KVM GUEST_MEMFD fixes for 6.8:

   - Make KVM_MEM_GUEST_MEMFD mutually exclusive with KVM_MEM_READONLY
     to avoid creating an inconsistent ABI (KVM_MEM_GUEST_MEMFD is not
     writable from userspace, so there would be no way to write to a
     read-only guest_memfd)

   - Update documentation for KVM_SW_PROTECTED_VM to make it abundantly
     clear that such VMs are purely for development and testing

   - Limit KVM_SW_PROTECTED_VM guests to the TDP MMU, as the long term
     plan is to support confidential VMs with deterministic private
     memory (SNP and TDX) only in the TDP MMU

   - Fix a bug in a GUEST_MEMFD dirty logging test that caused false
     passes

  x86 fixes:

   - Fix missing marking of a guest page as dirty when emulating an
     atomic access

   - Check for mmu_notifier invalidation events before faulting in the
     pfn, and before acquiring mmu_lock, to avoid unnecessary work and
     lock contention with preemptible kernels (including
     CONFIG_PREEMPT_DYNAMIC in non-preemptible mode)

   - Disable AMD DebugSwap by default, it breaks VMSA signing and will
     be re-enabled with a better VM creation API in 6.10

   - Do the cache flush of converted pages in svm_register_enc_region()
     before dropping kvm->lock, to avoid a race with unregistering of
     the same region and the consequent use-after-free issue"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm:
  SEV: disable SEV-ES DebugSwap by default
  KVM: x86/mmu: Retry fault before acquiring mmu_lock if mapping is changing
  KVM: SVM: Flush pages under kvm->lock to fix UAF in svm_register_enc_region()
  KVM: selftests: Add a testcase to verify GUEST_MEMFD and READONLY are exclusive
  KVM: selftests: Create GUEST_MEMFD for relevant invalid flags testcases
  KVM: x86/mmu: Restrict KVM_SW_PROTECTED_VM to the TDP MMU
  KVM: x86: Update KVM_SW_PROTECTED_VM docs to make it clear they're a WIP
  KVM: Make KVM_MEM_GUEST_MEMFD mutually exclusive with KVM_MEM_READONLY
  KVM: x86: Mark target gfn of emulated atomic instruction as dirty
2 parents 005f6f3 + 5abf6dc

8 files changed: +120, -15 lines

Documentation/virt/kvm/api.rst

5 additions, 0 deletions:

@@ -8791,6 +8791,11 @@ means the VM type with value @n is supported. Possible values of @n are::
   #define KVM_X86_DEFAULT_VM	0
   #define KVM_X86_SW_PROTECTED_VM	1
 
+Note, KVM_X86_SW_PROTECTED_VM is currently only for development and testing.
+Do not use KVM_X86_SW_PROTECTED_VM for "real" VMs, and especially not in
+production.  The behavior and effective ABI for software-protected VMs is
+unstable.
+
 9. Known KVM API problems
 =========================
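The section being amended documents KVM_CAP_VM_TYPES, whose KVM_CHECK_EXTENSION return value is a bitmask in which bit @n set means VM type @n is supported. As a minimal userspace sketch of probing for the (dev/test only) software-protected type — assuming 6.8+ uapi headers that define KVM_CAP_VM_TYPES and KVM_X86_SW_PROTECTED_VM, not official documentation:

/* Hedged sketch: probe /dev/kvm for KVM_X86_SW_PROTECTED_VM support. */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR);

	if (kvm < 0) {
		perror("open(/dev/kvm)");
		return 1;
	}

	/* Bit @n of the returned mask => VM type @n is supported. */
	int types = ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_VM_TYPES);

	if (types > 0 && (types & (1 << KVM_X86_SW_PROTECTED_VM)))
		printf("KVM_X86_SW_PROTECTED_VM supported (dev/test only)\n");
	else
		printf("software-protected VMs not supported here\n");
	return 0;
}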

arch/x86/kvm/Kconfig

4 additions, 3 deletions:

@@ -80,9 +80,10 @@ config KVM_SW_PROTECTED_VM
 	depends on KVM && X86_64
 	select KVM_GENERIC_PRIVATE_MEM
 	help
-	  Enable support for KVM software-protected VMs.  Currently "protected"
-	  means the VM can be backed with memory provided by
-	  KVM_CREATE_GUEST_MEMFD.
+	  Enable support for KVM software-protected VMs.  Currently, software-
+	  protected VMs are purely a development and testing vehicle for
+	  KVM_CREATE_GUEST_MEMFD.  Attempting to run a "real" VM workload as a
+	  software-protected VM will fail miserably.
 
 	  If unsure, say "N".
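For context on what the help text's "development and testing vehicle" looks like in practice, here is a minimal, hedged sketch of creating such a VM and backing it with guest_memfd. It assumes 6.8+ uapi headers (KVM_X86_SW_PROTECTED_VM, KVM_CREATE_GUEST_MEMFD, KVM_SET_USER_MEMORY_REGION2) and elides error handling; a real caller would also mmap() ordinary memory and set userspace_addr for the shared side of the slot.

/* Hedged sketch: a software-protected VM backed by guest_memfd. */
#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

#define GMEM_SIZE 0x100000UL

int create_sw_protected_vm(void)
{
	int kvm = open("/dev/kvm", O_RDWR);
	int vm = ioctl(kvm, KVM_CREATE_VM, KVM_X86_SW_PROTECTED_VM);

	struct kvm_create_guest_memfd gmem = { .size = GMEM_SIZE };
	int gmem_fd = ioctl(vm, KVM_CREATE_GUEST_MEMFD, &gmem);

	struct kvm_userspace_memory_region2 region = {
		.slot		    = 0,
		.flags		    = KVM_MEM_GUEST_MEMFD, /* private memory */
		.guest_phys_addr    = 0,
		.memory_size	    = GMEM_SIZE,
		.userspace_addr	    = 0, /* shared side elided in this sketch */
		.guest_memfd	    = gmem_fd,
		.guest_memfd_offset = 0,
	};
	ioctl(vm, KVM_SET_USER_MEMORY_REGION2, &region);

	return vm;
}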

arch/x86/kvm/mmu/mmu.c

42 additions, 0 deletions:

@@ -4405,6 +4405,31 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
 	fault->mmu_seq = vcpu->kvm->mmu_invalidate_seq;
 	smp_rmb();
 
+	/*
+	 * Check for a relevant mmu_notifier invalidation event before getting
+	 * the pfn from the primary MMU, and before acquiring mmu_lock.
+	 *
+	 * For mmu_lock, if there is an in-progress invalidation and the kernel
+	 * allows preemption, the invalidation task may drop mmu_lock and yield
+	 * in response to mmu_lock being contended, which is *very* counter-
+	 * productive as this vCPU can't actually make forward progress until
+	 * the invalidation completes.
+	 *
+	 * Retrying now can also avoid unnecessary lock contention in the
+	 * primary MMU, as the primary MMU doesn't necessarily hold a single
+	 * lock for the duration of the invalidation, i.e. faulting in a
+	 * conflicting pfn can cause the invalidation to take longer by holding
+	 * locks that are needed to complete the invalidation.
+	 *
+	 * Do the pre-check even for non-preemptible kernels, i.e. even if KVM
+	 * will never yield mmu_lock in response to contention, as this vCPU is
+	 * *guaranteed* to need to retry, i.e. waiting until mmu_lock is held
+	 * to detect retry guarantees the worst case latency for the vCPU.
+	 */
+	if (fault->slot &&
+	    mmu_invalidate_retry_gfn_unsafe(vcpu->kvm, fault->mmu_seq, fault->gfn))
+		return RET_PF_RETRY;
+
 	ret = __kvm_faultin_pfn(vcpu, fault);
 	if (ret != RET_PF_CONTINUE)
 		return ret;
@@ -4415,6 +4440,18 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
 	if (unlikely(!fault->slot))
 		return kvm_handle_noslot_fault(vcpu, fault, access);
 
+	/*
+	 * Check again for a relevant mmu_notifier invalidation event purely to
+	 * avoid contending mmu_lock.  Most invalidations will be detected by
+	 * the previous check, but checking is extremely cheap relative to the
+	 * overall cost of failing to detect the invalidation until after
+	 * mmu_lock is acquired.
+	 */
+	if (mmu_invalidate_retry_gfn_unsafe(vcpu->kvm, fault->mmu_seq, fault->gfn)) {
+		kvm_release_pfn_clean(fault->pfn);
+		return RET_PF_RETRY;
+	}
+
 	return RET_PF_CONTINUE;
 }
@@ -4442,6 +4479,11 @@ static bool is_page_fault_stale(struct kvm_vcpu *vcpu,
 	if (!sp && kvm_test_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu))
 		return true;
 
+	/*
+	 * Check for a relevant mmu_notifier invalidation event one last time
+	 * now that mmu_lock is held, as the "unsafe" checks performed without
+	 * holding mmu_lock can get false negatives.
+	 */
 	return fault->slot &&
 	       mmu_invalidate_retry_gfn(vcpu->kvm, fault->mmu_seq, fault->gfn);
 }
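Structurally, the fix is an unlocked pre-check paired with an authoritative re-check under mmu_lock (the is_page_fault_stale() path above). A compact user-space analogue of that pattern — hypothetical names, pthreads standing in for mmu_lock, not KVM code — looks like this:

/* Hedged analogue of the pre-check + locked re-check retry pattern. */
#include <errno.h>
#include <pthread.h>
#include <stdbool.h>

struct inv_state {
	pthread_mutex_t lock;	/* stands in for mmu_lock        */
	unsigned long seq;	/* bumped on every invalidation  */
	bool in_progress;	/* an invalidation is in flight  */
};

/* Lockless pre-check: may see stale values, so wrong answers are OK
 * here; the locked re-check below is the authoritative one. */
static bool retry_unsafe(struct inv_state *s, unsigned long snap)
{
	if (__atomic_load_n(&s->in_progress, __ATOMIC_RELAXED))
		return true;
	return __atomic_load_n(&s->seq, __ATOMIC_RELAXED) != snap;
}

static int handle_fault(struct inv_state *s, unsigned long snap)
{
	/* Bail before the expensive lookup and before taking the lock. */
	if (retry_unsafe(s, snap))
		return -EAGAIN;		/* caller simply retries */

	/* ... expensive pfn lookup would happen here ... */

	pthread_mutex_lock(&s->lock);
	if (s->in_progress || s->seq != snap) {	/* authoritative check */
		pthread_mutex_unlock(&s->lock);
		return -EAGAIN;
	}
	/* ... install the mapping ... */
	pthread_mutex_unlock(&s->lock);
	return 0;
}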

arch/x86/kvm/svm/sev.c

14 additions, 9 deletions:

@@ -57,7 +57,7 @@ static bool sev_es_enabled = true;
 module_param_named(sev_es, sev_es_enabled, bool, 0444);
 
 /* enable/disable SEV-ES DebugSwap support */
-static bool sev_es_debug_swap_enabled = true;
+static bool sev_es_debug_swap_enabled = false;
 module_param_named(debug_swap, sev_es_debug_swap_enabled, bool, 0444);
 #else
 #define sev_enabled false
@@ -612,8 +612,11 @@ static int sev_es_sync_vmsa(struct vcpu_svm *svm)
 	save->xss = svm->vcpu.arch.ia32_xss;
 	save->dr6 = svm->vcpu.arch.dr6;
 
-	if (sev_es_debug_swap_enabled)
+	if (sev_es_debug_swap_enabled) {
 		save->sev_features |= SVM_SEV_FEAT_DEBUG_SWAP;
+		pr_warn_once("Enabling DebugSwap with KVM_SEV_ES_INIT. "
+			     "This will not work starting with Linux 6.10\n");
+	}
 
 	pr_debug("Virtual Machine Save Area (VMSA):\n");
 	print_hex_dump_debug("", DUMP_PREFIX_NONE, 16, 1, save, sizeof(*save), false);
@@ -1975,20 +1978,22 @@ int sev_mem_enc_register_region(struct kvm *kvm,
 		goto e_free;
 	}
 
-	region->uaddr = range->addr;
-	region->size = range->size;
-
-	list_add_tail(&region->list, &sev->regions_list);
-	mutex_unlock(&kvm->lock);
-
 	/*
 	 * The guest may change the memory encryption attribute from C=0 -> C=1
 	 * or vice versa for this memory range. Let's make sure caches are
 	 * flushed to ensure that guest data gets written into memory with
-	 * correct C-bit.
+	 * correct C-bit.  Note, this must be done before dropping kvm->lock,
+	 * as region and its array of pages can be freed by a different task
+	 * once kvm->lock is released.
 	 */
 	sev_clflush_pages(region->pages, region->npages);
 
+	region->uaddr = range->addr;
+	region->size = range->size;
+
+	list_add_tail(&region->list, &sev->regions_list);
+	mutex_unlock(&kvm->lock);
+
 	return ret;
 
 e_free:
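The UAF fix boils down to a lifetime rule: every access to region must happen before it is published on sev->regions_list and kvm->lock is dropped, because from that point another task can unregister and free it. A generic sketch of the rule, with hypothetical names:

/* Hedged sketch: finish touching an object before publish + unlock. */
#include <pthread.h>

struct region {
	struct region *next;
	void *pages;
};

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct region *regions;	/* list head, protected by 'lock' */

static void flush_pages(struct region *r) { (void)r; /* cache flush */ }

static void register_region(struct region *r)
{
	pthread_mutex_lock(&lock);

	flush_pages(r);		/* must precede publication, because...   */

	r->next = regions;	/* ...once r is reachable from the list   */
	regions = r;		/* and the lock is dropped, another       */
	pthread_mutex_unlock(&lock); /* thread may unlink and free it.    */

	/* Touching r here would be the use-after-free being fixed. */
}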

arch/x86/kvm/x86.c

11 additions, 1 deletion:

@@ -4580,7 +4580,7 @@ static bool kvm_is_vm_type_supported(unsigned long type)
 {
 	return type == KVM_X86_DEFAULT_VM ||
 	       (type == KVM_X86_SW_PROTECTED_VM &&
-		IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) && tdp_enabled);
+		IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) && tdp_mmu_enabled);
 }
 
 int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
@@ -8007,6 +8007,16 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
 
 	if (r < 0)
 		return X86EMUL_UNHANDLEABLE;
+
+	/*
+	 * Mark the page dirty _before_ checking whether or not the CMPXCHG was
+	 * successful, as the old value is written back on failure.  Note, for
+	 * live migration, this is unnecessarily conservative as CMPXCHG writes
+	 * back the original value and the access is atomic, but KVM's ABI is
+	 * that all writes are dirty logged, regardless of the value written.
+	 */
+	kvm_vcpu_mark_page_dirty(vcpu, gpa_to_gfn(gpa));
+
 	if (r)
 		return X86EMUL_CMPXCHG_FAILED;
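The ordering matters because, as the new comment notes, x86's CMPXCHG stores the old value back to memory even when the compare fails, so the emulated access is a write either way. A tiny hedged sketch of that discipline, with a hypothetical mark_page_dirty() stub standing in for KVM's dirty bitmap update:

/* Hedged sketch: dirty-log an emulated CMPXCHG regardless of success. */
#include <stdbool.h>
#include <stdint.h>

static void mark_page_dirty(void *p) { (void)p; /* dirty bitmap update */ }

static bool emulated_cmpxchg8(uint8_t *dst, uint8_t old, uint8_t new)
{
	bool ok = __atomic_compare_exchange_n(dst, &old, new, false,
					      __ATOMIC_SEQ_CST,
					      __ATOMIC_SEQ_CST);
	/* Dirty the page _before_ branching on ok: the hardware access
	 * was a (locked) write to *dst either way. */
	mark_page_dirty(dst);
	return ok;
}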

include/linux/kvm_host.h

26 additions, 0 deletions:

@@ -2031,6 +2031,32 @@ static inline int mmu_invalidate_retry_gfn(struct kvm *kvm,
 		return 1;
 	return 0;
 }
+
+/*
+ * This lockless version of the range-based retry check *must* be paired with a
+ * call to the locked version after acquiring mmu_lock, i.e. this is safe to
+ * use only as a pre-check to avoid contending mmu_lock.  This version *will*
+ * get false negatives and false positives.
+ */
+static inline bool mmu_invalidate_retry_gfn_unsafe(struct kvm *kvm,
+						   unsigned long mmu_seq,
+						   gfn_t gfn)
+{
+	/*
+	 * Use READ_ONCE() to ensure the in-progress flag and sequence counter
+	 * are always read from memory, e.g. so that checking for retry in a
+	 * loop won't result in an infinite retry loop.  Don't force loads for
+	 * start+end, as the key to avoiding infinite retry loops is observing
+	 * the 1=>0 transition of in-progress, i.e. getting false negatives
+	 * due to stale start+end values is acceptable.
+	 */
+	if (unlikely(READ_ONCE(kvm->mmu_invalidate_in_progress)) &&
+	    gfn >= kvm->mmu_invalidate_range_start &&
+	    gfn < kvm->mmu_invalidate_range_end)
+		return true;
+
+	return READ_ONCE(kvm->mmu_invalidate_seq) != mmu_seq;
+}
 #endif
 
 #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
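The READ_ONCE() reasoning in the comment has a simple user-space analogue: without a volatile or atomic load, the compiler is free to hoist the read out of a retry loop and spin forever on a stale value. A hedged sketch with hypothetical names, C11 atomics standing in for READ_ONCE():

/* Hedged analogue of the READ_ONCE() rationale. */
#include <stdatomic.h>

extern _Atomic unsigned long inv_seq;	/* analogue of mmu_invalidate_seq */

static void wait_for_new_seq(unsigned long snap)
{
	/* atomic_load_explicit forces a fresh load on each iteration,
	 * mirroring READ_ONCE(); a plain load of a non-atomic variable
	 * could legally be hoisted out of the loop by the compiler,
	 * turning this into an infinite spin. */
	while (atomic_load_explicit(&inv_seq, memory_order_relaxed) == snap)
		;	/* retry / back off */
}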

tools/testing/selftests/kvm/set_memory_region_test.c

11 additions, 1 deletion:

@@ -367,11 +367,21 @@ static void test_invalid_memory_region_flags(void)
 	}
 
 	if (supported_flags & KVM_MEM_GUEST_MEMFD) {
+		int guest_memfd = vm_create_guest_memfd(vm, MEM_REGION_SIZE, 0);
+
 		r = __vm_set_user_memory_region2(vm, 0,
 						 KVM_MEM_LOG_DIRTY_PAGES | KVM_MEM_GUEST_MEMFD,
-						 0, MEM_REGION_SIZE, NULL, 0, 0);
+						 0, MEM_REGION_SIZE, NULL, guest_memfd, 0);
 		TEST_ASSERT(r && errno == EINVAL,
 			    "KVM_SET_USER_MEMORY_REGION2 should have failed, dirty logging private memory is unsupported");
+
+		r = __vm_set_user_memory_region2(vm, 0,
+						 KVM_MEM_READONLY | KVM_MEM_GUEST_MEMFD,
+						 0, MEM_REGION_SIZE, NULL, guest_memfd, 0);
+		TEST_ASSERT(r && errno == EINVAL,
+			    "KVM_SET_USER_MEMORY_REGION2 should have failed, read-only GUEST_MEMFD memslots are unsupported");
+
+		close(guest_memfd);
 	}
 }
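For reference, this test lives in tools/testing/selftests/kvm and follows the standard kselftest flow; in a configured kernel tree it can typically be built and run with something like make -C tools/testing/selftests TARGETS=kvm run_tests, though the exact invocation depends on the tree and toolchain.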

virt/kvm/kvm_main.c

7 additions, 1 deletion:

@@ -1615,7 +1615,13 @@ static int check_memory_region_flags(struct kvm *kvm,
 		valid_flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
 
 #ifdef __KVM_HAVE_READONLY_MEM
-	valid_flags |= KVM_MEM_READONLY;
+	/*
+	 * GUEST_MEMFD is incompatible with read-only memslots, as writes to
+	 * read-only memslots have emulated MMIO, not page fault, semantics,
+	 * and KVM doesn't allow emulated MMIO for private memory.
+	 */
+	if (!(mem->flags & KVM_MEM_GUEST_MEMFD))
+		valid_flags |= KVM_MEM_READONLY;
 #endif
 
 	if (mem->flags & ~valid_flags)
