Skip to content

Commit 03760d4

Browse files
author
Martin Schwidefsky
committed
Merge tag 'hlp_stage1' of git://git.kernel.org/pub/scm/linux/kernel/git/kvms390/linux into features
Pull hlp_stage1 from Christian Borntraeger with the following changes:

KVM: s390: initial host large page support
- must be enabled via module parameter hpage=1
- cannot be used together with nested
- does support migration
- does support hugetlbfs
- no THP yet
2 parents 6eedfaa + a449938 commit 03760d4

File tree

13 files changed

+757
-121
lines changed

13 files changed

+757
-121
lines changed

Documentation/virtual/kvm/api.txt

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4391,6 +4391,22 @@ all such vmexits.
43914391

43924392
Do not enable KVM_FEATURE_PV_UNHALT if you disable HLT exits.
43934393

4394+
7.14 KVM_CAP_S390_HPAGE_1M
4395+
4396+
Architectures: s390
4397+
Parameters: none
4398+
Returns: 0 on success, -EINVAL if hpage module parameter was not set
4399+
or cmma is enabled
4400+
4401+
With this capability the KVM support for memory backing with 1m pages
4402+
through hugetlbfs can be enabled for a VM. After the capability is
4403+
enabled, cmma can't be enabled anymore and pfmfi and the storage key
4404+
interpretation are disabled. If cmma has already been enabled or the
4405+
hpage module parameter is not set to 1, -EINVAL is returned.
4406+
4407+
While it is generally possible to create a huge page backed VM without
4408+
this capability, the VM will not be able to run.
4409+
43944410
8. Other capabilities.
43954411
----------------------
43964412

arch/s390/include/asm/gmap.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,14 @@
99
#ifndef _ASM_S390_GMAP_H
1010
#define _ASM_S390_GMAP_H
1111

12+
/* Generic bits for GMAP notification on DAT table entry changes. */
13+
#define GMAP_NOTIFY_SHADOW 0x2
14+
#define GMAP_NOTIFY_MPROT 0x1
15+
16+
/* Status bits only for huge segment entries */
17+
#define _SEGMENT_ENTRY_GMAP_IN 0x8000 /* invalidation notify bit */
18+
#define _SEGMENT_ENTRY_GMAP_UC 0x4000 /* dirty (migration) */
19+
1220
/**
1321
* struct gmap_struct - guest address space
1422
* @list: list head for the mm->context gmap list
@@ -132,4 +140,6 @@ void gmap_pte_notify(struct mm_struct *, unsigned long addr, pte_t *,
132140
int gmap_mprotect_notify(struct gmap *, unsigned long start,
133141
unsigned long len, int prot);
134142

143+
void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long dirty_bitmap[4],
144+
unsigned long gaddr, unsigned long vmaddr);
135145
#endif /* _ASM_S390_GMAP_H */

arch/s390/include/asm/hugetlb.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,10 @@ static inline int prepare_hugepage_range(struct file *file,
3737
return 0;
3838
}
3939

40-
#define arch_clear_hugepage_flags(page) do { } while (0)
40+
static inline void arch_clear_hugepage_flags(struct page *page)
41+
{
42+
clear_bit(PG_arch_1, &page->flags);
43+
}
4144

4245
static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
4346
pte_t *ptep, unsigned long sz)

arch/s390/include/asm/mmu.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@ typedef struct {
2424
unsigned int uses_skeys:1;
2525
/* The mmu context uses CMM. */
2626
unsigned int uses_cmm:1;
27+
/* The gmaps associated with this context are allowed to use huge pages. */
28+
unsigned int allow_gmap_hpage_1m:1;
2729
} mm_context_t;
2830

2931
#define INIT_MM_CONTEXT(name) \

arch/s390/include/asm/mmu_context.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ static inline int init_new_context(struct task_struct *tsk,
3232
mm->context.has_pgste = 0;
3333
mm->context.uses_skeys = 0;
3434
mm->context.uses_cmm = 0;
35+
mm->context.allow_gmap_hpage_1m = 0;
3536
#endif
3637
switch (mm->context.asce_limit) {
3738
case _REGION2_SIZE:

arch/s390/include/asm/pgtable.h

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -268,8 +268,10 @@ static inline int is_module_addr(void *addr)
268268
#define _REGION_ENTRY_BITS_LARGE 0xffffffff8000fe2fUL
269269

270270
/* Bits in the segment table entry */
271-
#define _SEGMENT_ENTRY_BITS 0xfffffffffffffe33UL
272-
#define _SEGMENT_ENTRY_BITS_LARGE 0xfffffffffff0ff33UL
271+
#define _SEGMENT_ENTRY_BITS 0xfffffffffffffe33UL
272+
#define _SEGMENT_ENTRY_BITS_LARGE 0xfffffffffff0ff33UL
273+
#define _SEGMENT_ENTRY_HARDWARE_BITS 0xfffffffffffffe30UL
274+
#define _SEGMENT_ENTRY_HARDWARE_BITS_LARGE 0xfffffffffff00730UL
273275
#define _SEGMENT_ENTRY_ORIGIN_LARGE ~0xfffffUL /* large page address */
274276
#define _SEGMENT_ENTRY_ORIGIN ~0x7ffUL/* page table origin */
275277
#define _SEGMENT_ENTRY_PROTECT 0x200 /* segment protection bit */
@@ -1101,7 +1103,8 @@ int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr,
11011103
pte_t *sptep, pte_t *tptep, pte_t pte);
11021104
void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep);
11031105

1104-
bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long address);
1106+
bool ptep_test_and_clear_uc(struct mm_struct *mm, unsigned long address,
1107+
pte_t *ptep);
11051108
int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
11061109
unsigned char key, bool nq);
11071110
int cond_set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
@@ -1116,6 +1119,10 @@ int set_pgste_bits(struct mm_struct *mm, unsigned long addr,
11161119
int get_pgste(struct mm_struct *mm, unsigned long hva, unsigned long *pgstep);
11171120
int pgste_perform_essa(struct mm_struct *mm, unsigned long hva, int orc,
11181121
unsigned long *oldpte, unsigned long *oldpgste);
1122+
void gmap_pmdp_csp(struct mm_struct *mm, unsigned long vmaddr);
1123+
void gmap_pmdp_invalidate(struct mm_struct *mm, unsigned long vmaddr);
1124+
void gmap_pmdp_idte_local(struct mm_struct *mm, unsigned long vmaddr);
1125+
void gmap_pmdp_idte_global(struct mm_struct *mm, unsigned long vmaddr);
11191126

11201127
/*
11211128
* Certain architectures need to do special things when PTEs

arch/s390/kvm/kvm-s390.c

Lines changed: 70 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,10 @@ static int nested;
172172
module_param(nested, int, S_IRUGO);
173173
MODULE_PARM_DESC(nested, "Nested virtualization support");
174174

175+
/* allow 1m huge page guest backing, if !nested */
176+
static int hpage;
177+
module_param(hpage, int, 0444);
178+
MODULE_PARM_DESC(hpage, "1m huge page backing support");
175179

176180
/*
177181
* For now we handle at most 16 double words as this is what the s390 base
@@ -475,6 +479,11 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
475479
case KVM_CAP_S390_AIS_MIGRATION:
476480
r = 1;
477481
break;
482+
case KVM_CAP_S390_HPAGE_1M:
483+
r = 0;
484+
if (hpage)
485+
r = 1;
486+
break;
478487
case KVM_CAP_S390_MEM_OP:
479488
r = MEM_OP_MAX_SIZE;
480489
break;
@@ -511,19 +520,30 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
511520
}
512521

513522
static void kvm_s390_sync_dirty_log(struct kvm *kvm,
514-
struct kvm_memory_slot *memslot)
523+
struct kvm_memory_slot *memslot)
515524
{
525+
int i;
516526
gfn_t cur_gfn, last_gfn;
517-
unsigned long address;
527+
unsigned long gaddr, vmaddr;
518528
struct gmap *gmap = kvm->arch.gmap;
529+
DECLARE_BITMAP(bitmap, _PAGE_ENTRIES);
519530

520-
/* Loop over all guest pages */
531+
/* Loop over all guest segments */
532+
cur_gfn = memslot->base_gfn;
521533
last_gfn = memslot->base_gfn + memslot->npages;
522-
for (cur_gfn = memslot->base_gfn; cur_gfn <= last_gfn; cur_gfn++) {
523-
address = gfn_to_hva_memslot(memslot, cur_gfn);
534+
for (; cur_gfn <= last_gfn; cur_gfn += _PAGE_ENTRIES) {
535+
gaddr = gfn_to_gpa(cur_gfn);
536+
vmaddr = gfn_to_hva_memslot(memslot, cur_gfn);
537+
if (kvm_is_error_hva(vmaddr))
538+
continue;
539+
540+
bitmap_zero(bitmap, _PAGE_ENTRIES);
541+
gmap_sync_dirty_log_pmd(gmap, bitmap, gaddr, vmaddr);
542+
for (i = 0; i < _PAGE_ENTRIES; i++) {
543+
if (test_bit(i, bitmap))
544+
mark_page_dirty(kvm, cur_gfn + i);
545+
}
524546

525-
if (test_and_clear_guest_dirty(gmap->mm, address))
526-
mark_page_dirty(kvm, cur_gfn);
527547
if (fatal_signal_pending(current))
528548
return;
529549
cond_resched();
@@ -667,6 +687,27 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
667687
VM_EVENT(kvm, 3, "ENABLE: CAP_S390_GS %s",
668688
r ? "(not available)" : "(success)");
669689
break;
690+
case KVM_CAP_S390_HPAGE_1M:
691+
mutex_lock(&kvm->lock);
692+
if (kvm->created_vcpus)
693+
r = -EBUSY;
694+
else if (!hpage || kvm->arch.use_cmma)
695+
r = -EINVAL;
696+
else {
697+
r = 0;
698+
kvm->mm->context.allow_gmap_hpage_1m = 1;
699+
/*
700+
* We might have to create fake 4k page
701+
* tables. To avoid that the hardware works on
702+
* stale PGSTEs, we emulate these instructions.
703+
*/
704+
kvm->arch.use_skf = 0;
705+
kvm->arch.use_pfmfi = 0;
706+
}
707+
mutex_unlock(&kvm->lock);
708+
VM_EVENT(kvm, 3, "ENABLE: CAP_S390_HPAGE %s",
709+
r ? "(not available)" : "(success)");
710+
break;
670711
case KVM_CAP_S390_USER_STSI:
671712
VM_EVENT(kvm, 3, "%s", "ENABLE: CAP_S390_USER_STSI");
672713
kvm->arch.user_stsi = 1;
@@ -714,10 +755,13 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att
714755
if (!sclp.has_cmma)
715756
break;
716757

717-
ret = -EBUSY;
718758
VM_EVENT(kvm, 3, "%s", "ENABLE: CMMA support");
719759
mutex_lock(&kvm->lock);
720-
if (!kvm->created_vcpus) {
760+
if (kvm->created_vcpus)
761+
ret = -EBUSY;
762+
else if (kvm->mm->context.allow_gmap_hpage_1m)
763+
ret = -EINVAL;
764+
else {
721765
kvm->arch.use_cmma = 1;
722766
/* Not compatible with cmma. */
723767
kvm->arch.use_pfmfi = 0;
@@ -1540,6 +1584,7 @@ static long kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
15401584
uint8_t *keys;
15411585
uint64_t hva;
15421586
int srcu_idx, i, r = 0;
1587+
bool unlocked;
15431588

15441589
if (args->flags != 0)
15451590
return -EINVAL;
@@ -1564,9 +1609,11 @@ static long kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
15641609
if (r)
15651610
goto out;
15661611

1612+
i = 0;
15671613
down_read(&current->mm->mmap_sem);
15681614
srcu_idx = srcu_read_lock(&kvm->srcu);
1569-
for (i = 0; i < args->count; i++) {
1615+
while (i < args->count) {
1616+
unlocked = false;
15701617
hva = gfn_to_hva(kvm, args->start_gfn + i);
15711618
if (kvm_is_error_hva(hva)) {
15721619
r = -EFAULT;
@@ -1580,8 +1627,14 @@ static long kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
15801627
}
15811628

15821629
r = set_guest_storage_key(current->mm, hva, keys[i], 0);
1583-
if (r)
1584-
break;
1630+
if (r) {
1631+
r = fixup_user_fault(current, current->mm, hva,
1632+
FAULT_FLAG_WRITE, &unlocked);
1633+
if (r)
1634+
break;
1635+
}
1636+
if (!r)
1637+
i++;
15851638
}
15861639
srcu_read_unlock(&kvm->srcu, srcu_idx);
15871640
up_read(&current->mm->mmap_sem);
@@ -4082,6 +4135,11 @@ static int __init kvm_s390_init(void)
40824135
return -ENODEV;
40834136
}
40844137

4138+
if (nested && hpage) {
4139+
pr_info("nested (vSIE) and hpage (huge page backing) can currently not be activated concurrently");
4140+
return -EINVAL;
4141+
}
4142+
40854143
for (i = 0; i < 16; i++)
40864144
kvm_s390_fac_base[i] |=
40874145
S390_lowcore.stfle_fac_list[i] & nonhyp_mask(i);

0 commit comments

Comments
 (0)