Skip to content

Commit a5704e8

Browse files
npiggin and paulusmack
authored and committed
KVM: PPC: Book3S HV: Recursively unmap all page table entries when unmapping
When partition scope mappings are unmapped with kvm_unmap_radix, the pte is cleared, but the page table structure is left in place. If the next page fault requests a different page table geometry (e.g., due to THP promotion or split), kvmppc_create_pte is responsible for changing the page tables. When a page table entry is to be converted to a large pte, the page table entry is cleared, the PWC flushed, then the page table it points to freed. This will cause pte page tables to leak when a 1GB page is to replace a pud entry points to a pmd table with pte tables under it: The pmd table will be freed, but its pte tables will be missed. Fix this by replacing the simple clear and free code with one that walks down the page tables and frees children. Care must be taken to clear the root entry being unmapped then flushing the PWC before freeing any page tables, as explained in comments. This requires PWC flush to logically become a flush-all-PWC (which it already is in hardware, but the KVM API needs to be changed to avoid confusion). This code also checks that no unexpected pte entries exist in any page table being freed, and unmaps those and emits a WARN. This is an expensive operation for the pte page level, but partition scope changes are rare, so it's unconditional for now to iron out bugs. It can be put under a CONFIG option or removed after some time. Signed-off-by: Nicholas Piggin <[email protected]> Signed-off-by: Paul Mackerras <[email protected]>
1 parent a5fad1e commit a5704e8

File tree

1 file changed

+138
-54
lines changed

1 file changed

+138
-54
lines changed

arch/powerpc/kvm/book3s_64_mmu_radix.c

Lines changed: 138 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -165,7 +165,7 @@ static void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr,
165165
asm volatile("eieio ; tlbsync ; ptesync": : :"memory");
166166
}
167167

168-
static void kvmppc_radix_flush_pwc(struct kvm *kvm, unsigned long addr)
168+
static void kvmppc_radix_flush_pwc(struct kvm *kvm)
169169
{
170170
unsigned long rb = 0x2 << PPC_BITLSHIFT(53); /* IS = 2 */
171171

@@ -247,6 +247,139 @@ static void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte,
247247
}
248248
}
249249

250+
/*
251+
* kvmppc_free_p?d are used to free existing page tables, and recursively
252+
* descend and clear and free children.
253+
* Callers are responsible for flushing the PWC.
254+
*
255+
* When page tables are being unmapped/freed as part of page fault path
256+
* (full == false), ptes are not expected. There is code to unmap them
257+
* and emit a warning if encountered, but there may already be data
258+
* corruption due to the unexpected mappings.
259+
*/
260+
static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full)
261+
{
262+
if (full) {
263+
memset(pte, 0, sizeof(long) << PTE_INDEX_SIZE);
264+
} else {
265+
pte_t *p = pte;
266+
unsigned long it;
267+
268+
for (it = 0; it < PTRS_PER_PTE; ++it, ++p) {
269+
if (pte_val(*p) == 0)
270+
continue;
271+
WARN_ON_ONCE(1);
272+
kvmppc_unmap_pte(kvm, p,
273+
pte_pfn(*p) << PAGE_SHIFT,
274+
PAGE_SHIFT);
275+
}
276+
}
277+
278+
kvmppc_pte_free(pte);
279+
}
280+
281+
static void kvmppc_unmap_free_pmd(struct kvm *kvm, pmd_t *pmd, bool full)
282+
{
283+
unsigned long im;
284+
pmd_t *p = pmd;
285+
286+
for (im = 0; im < PTRS_PER_PMD; ++im, ++p) {
287+
if (!pmd_present(*p))
288+
continue;
289+
if (pmd_is_leaf(*p)) {
290+
if (full) {
291+
pmd_clear(p);
292+
} else {
293+
WARN_ON_ONCE(1);
294+
kvmppc_unmap_pte(kvm, (pte_t *)p,
295+
pte_pfn(*(pte_t *)p) << PAGE_SHIFT,
296+
PMD_SHIFT);
297+
}
298+
} else {
299+
pte_t *pte;
300+
301+
pte = pte_offset_map(p, 0);
302+
kvmppc_unmap_free_pte(kvm, pte, full);
303+
pmd_clear(p);
304+
}
305+
}
306+
kvmppc_pmd_free(pmd);
307+
}
308+
309+
static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud)
310+
{
311+
unsigned long iu;
312+
pud_t *p = pud;
313+
314+
for (iu = 0; iu < PTRS_PER_PUD; ++iu, ++p) {
315+
if (!pud_present(*p))
316+
continue;
317+
if (pud_huge(*p)) {
318+
pud_clear(p);
319+
} else {
320+
pmd_t *pmd;
321+
322+
pmd = pmd_offset(p, 0);
323+
kvmppc_unmap_free_pmd(kvm, pmd, true);
324+
pud_clear(p);
325+
}
326+
}
327+
pud_free(kvm->mm, pud);
328+
}
329+
330+
void kvmppc_free_radix(struct kvm *kvm)
331+
{
332+
unsigned long ig;
333+
pgd_t *pgd;
334+
335+
if (!kvm->arch.pgtable)
336+
return;
337+
pgd = kvm->arch.pgtable;
338+
for (ig = 0; ig < PTRS_PER_PGD; ++ig, ++pgd) {
339+
pud_t *pud;
340+
341+
if (!pgd_present(*pgd))
342+
continue;
343+
pud = pud_offset(pgd, 0);
344+
kvmppc_unmap_free_pud(kvm, pud);
345+
pgd_clear(pgd);
346+
}
347+
pgd_free(kvm->mm, kvm->arch.pgtable);
348+
kvm->arch.pgtable = NULL;
349+
}
350+
351+
static void kvmppc_unmap_free_pmd_entry_table(struct kvm *kvm, pmd_t *pmd,
352+
unsigned long gpa)
353+
{
354+
pte_t *pte = pte_offset_kernel(pmd, 0);
355+
356+
/*
357+
* Clearing the pmd entry then flushing the PWC ensures that the pte
358+
* page no longer be cached by the MMU, so can be freed without
359+
* flushing the PWC again.
360+
*/
361+
pmd_clear(pmd);
362+
kvmppc_radix_flush_pwc(kvm);
363+
364+
kvmppc_unmap_free_pte(kvm, pte, false);
365+
}
366+
367+
static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud,
368+
unsigned long gpa)
369+
{
370+
pmd_t *pmd = pmd_offset(pud, 0);
371+
372+
/*
373+
* Clearing the pud entry then flushing the PWC ensures that the pmd
374+
* page and any children pte pages will no longer be cached by the MMU,
375+
* so can be freed without flushing the PWC again.
376+
*/
377+
pud_clear(pud);
378+
kvmppc_radix_flush_pwc(kvm);
379+
380+
kvmppc_unmap_free_pmd(kvm, pmd, false);
381+
}
382+
250383
static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
251384
unsigned int level, unsigned long mmu_seq)
252385
{
@@ -312,11 +445,9 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
312445
/*
313446
* There's a page table page here, but we wanted to
314447
* install a large page, so remove and free the page
315-
* table page. new_pmd will be NULL since level == 2.
448+
* table page.
316449
*/
317-
new_pmd = pmd_offset(pud, 0);
318-
pud_clear(pud);
319-
kvmppc_radix_flush_pwc(kvm, gpa);
450+
kvmppc_unmap_free_pud_entry_table(kvm, pud, gpa);
320451
}
321452
kvmppc_radix_set_pte_at(kvm, gpa, (pte_t *)pud, pte);
322453
ret = 0;
@@ -353,11 +484,9 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
353484
/*
354485
* There's a page table page here, but we wanted to
355486
* install a large page, so remove and free the page
356-
* table page. new_ptep will be NULL since level == 1.
487+
* table page.
357488
*/
358-
new_ptep = pte_offset_kernel(pmd, 0);
359-
pmd_clear(pmd);
360-
kvmppc_radix_flush_pwc(kvm, gpa);
489+
kvmppc_unmap_free_pmd_entry_table(kvm, pmd, gpa);
361490
}
362491
kvmppc_radix_set_pte_at(kvm, gpa, pmdp_ptep(pmd), pte);
363492
ret = 0;
@@ -734,51 +863,6 @@ int kvmppc_init_vm_radix(struct kvm *kvm)
734863
return 0;
735864
}
736865

737-
void kvmppc_free_radix(struct kvm *kvm)
738-
{
739-
unsigned long ig, iu, im;
740-
pte_t *pte;
741-
pmd_t *pmd;
742-
pud_t *pud;
743-
pgd_t *pgd;
744-
745-
if (!kvm->arch.pgtable)
746-
return;
747-
pgd = kvm->arch.pgtable;
748-
for (ig = 0; ig < PTRS_PER_PGD; ++ig, ++pgd) {
749-
if (!pgd_present(*pgd))
750-
continue;
751-
pud = pud_offset(pgd, 0);
752-
for (iu = 0; iu < PTRS_PER_PUD; ++iu, ++pud) {
753-
if (!pud_present(*pud))
754-
continue;
755-
if (pud_huge(*pud)) {
756-
pud_clear(pud);
757-
continue;
758-
}
759-
pmd = pmd_offset(pud, 0);
760-
for (im = 0; im < PTRS_PER_PMD; ++im, ++pmd) {
761-
if (pmd_is_leaf(*pmd)) {
762-
pmd_clear(pmd);
763-
continue;
764-
}
765-
if (!pmd_present(*pmd))
766-
continue;
767-
pte = pte_offset_map(pmd, 0);
768-
memset(pte, 0, sizeof(long) << PTE_INDEX_SIZE);
769-
kvmppc_pte_free(pte);
770-
pmd_clear(pmd);
771-
}
772-
kvmppc_pmd_free(pmd_offset(pud, 0));
773-
pud_clear(pud);
774-
}
775-
pud_free(kvm->mm, pud_offset(pgd, 0));
776-
pgd_clear(pgd);
777-
}
778-
pgd_free(kvm->mm, kvm->arch.pgtable);
779-
kvm->arch.pgtable = NULL;
780-
}
781-
782866
static void pte_ctor(void *addr)
783867
{
784868
memset(addr, 0, RADIX_PTE_TABLE_SIZE);

0 commit comments

Comments
 (0)