// SPDX-License-Identifier: GPL-2.0
/*
 * Device driver to expose SGX enclave memory to KVM guests.
 *
 * Copyright(c) 2021 Intel Corporation.
 */

#include <linux/miscdevice.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/xarray.h>
#include <asm/sgx.h>
#include <uapi/asm/sgx.h>

#include "encls.h"
#include "sgx.h"

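/*
 * A virtual EPC instance.  One is created for each open of the vepc device
 * and torn down on release.  page_array tracks the EPC pages backing the
 * instance's mappings, indexed by page offset into the fd, and is populated
 * under @lock by the fault handler.
 */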
struct sgx_vepc {
	struct xarray page_array;
	struct mutex lock;
};

/*
 * Temporary SECS pages that cannot be EREMOVE'd because they still have
 * children in other virtual EPC instances, and the lock that protects the
 * list.
 */
static struct mutex zombie_secs_pages_lock;
static struct list_head zombie_secs_pages;

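/*
 * Back the faulting address with an EPC page: look up the page recorded for
 * this offset in page_array, or allocate a fresh EPC page, record it, and
 * insert its PFN into the faulting VMA.
 */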
static int __sgx_vepc_fault(struct sgx_vepc *vepc,
			    struct vm_area_struct *vma, unsigned long addr)
{
	struct sgx_epc_page *epc_page;
	unsigned long index, pfn;
	int ret;

	WARN_ON(!mutex_is_locked(&vepc->lock));

	/* Calculate index of EPC page in virtual EPC's page_array */
	index = vma->vm_pgoff + PFN_DOWN(addr - vma->vm_start);

	epc_page = xa_load(&vepc->page_array, index);
	if (epc_page)
		return 0;

	epc_page = sgx_alloc_epc_page(vepc, false);
	if (IS_ERR(epc_page))
		return PTR_ERR(epc_page);

	ret = xa_err(xa_store(&vepc->page_array, index, epc_page, GFP_KERNEL));
	if (ret)
		goto err_free;

	pfn = PFN_DOWN(sgx_get_epc_phys_addr(epc_page));

	ret = vmf_insert_pfn(vma, addr, pfn);
	if (ret != VM_FAULT_NOPAGE) {
		ret = -EFAULT;
		goto err_delete;
	}

	return 0;

err_delete:
	xa_erase(&vepc->page_array, index);
err_free:
	sgx_free_epc_page(epc_page);
	return ret;
}

static vm_fault_t sgx_vepc_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	struct sgx_vepc *vepc = vma->vm_private_data;
	int ret;

	mutex_lock(&vepc->lock);
	ret = __sgx_vepc_fault(vepc, vma, vmf->address);
	mutex_unlock(&vepc->lock);

	if (!ret)
		return VM_FAULT_NOPAGE;

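	/*
	 * -EBUSY means an EPC page could not be allocated without reclaim;
	 * drop mmap_lock and let the fault be retried rather than delivering
	 * SIGBUS.
	 */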
	if (ret == -EBUSY && (vmf->flags & FAULT_FLAG_ALLOW_RETRY)) {
		mmap_read_unlock(vma->vm_mm);
		return VM_FAULT_RETRY;
	}

	return VM_FAULT_SIGBUS;
}

static const struct vm_operations_struct sgx_vepc_vm_ops = {
	.fault = sgx_vepc_fault,
};

static int sgx_vepc_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct sgx_vepc *vepc = file->private_data;

	if (!(vma->vm_flags & VM_SHARED))
		return -EINVAL;

	vma->vm_ops = &sgx_vepc_vm_ops;
	/* Don't copy VMA in fork() */
	vma->vm_flags |= VM_PFNMAP | VM_IO | VM_DONTDUMP | VM_DONTCOPY;
	vma->vm_private_data = vepc;

	return 0;
}
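
/*
 * Illustrative userspace usage (a sketch, not part of this driver): a VMM
 * such as QEMU would back a guest's EPC section with one vepc mapping,
 * along the lines of:
 *
 *	int fd = open("/dev/sgx_vepc", O_RDWR);
 *	void *epc = mmap(NULL, epc_size, PROT_READ | PROT_WRITE,
 *			 MAP_SHARED, fd, 0);
 *
 * where epc_size is the size of the guest EPC section.  The mapping must be
 * MAP_SHARED (enforced in sgx_vepc_mmap() above), and the backing EPC pages
 * are allocated lazily by sgx_vepc_fault() on first touch.
 */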

static int sgx_vepc_free_page(struct sgx_epc_page *epc_page)
{
	int ret;

	/*
	 * Take a previously guest-owned EPC page and return it to the
	 * general EPC page pool.
	 *
	 * Guests cannot be trusted to have left this page in a good
	 * state, so run EREMOVE on the page unconditionally.  In the
	 * case that a guest properly EREMOVE'd this page, a superfluous
	 * EREMOVE is harmless.
	 */
	ret = __eremove(sgx_get_epc_virt_addr(epc_page));
	if (ret) {
		/*
		 * Only SGX_CHILD_PRESENT is expected, which happens when
		 * EREMOVE'ing an SECS that still has children.  That case
		 * is handled by EREMOVE'ing the SECS again after all pages
		 * in the virtual EPC have been EREMOVE'd; see the comments
		 * below in sgx_vepc_release().
		 *
		 * The user of virtual EPC (KVM) must guarantee that no
		 * logical processor is still running in the enclave in the
		 * guest, otherwise EREMOVE returns SGX_ENCLAVE_ACT, which
		 * cannot be handled here.
		 */
		WARN_ONCE(ret != SGX_CHILD_PRESENT, EREMOVE_ERROR_MESSAGE,
			  ret, ret);
		return ret;
	}

	sgx_free_epc_page(epc_page);

	return 0;
}

static int sgx_vepc_release(struct inode *inode, struct file *file)
{
	struct sgx_vepc *vepc = file->private_data;
	struct sgx_epc_page *epc_page, *tmp, *entry;
	unsigned long index;

	LIST_HEAD(secs_pages);

	xa_for_each(&vepc->page_array, index, entry) {
		/*
		 * Remove all normal, child pages.  sgx_vepc_free_page()
		 * will fail if EREMOVE fails, but this is OK and expected on
		 * SECS pages.  Those can only be EREMOVE'd *after* all their
		 * child pages.  Retries below will clean them up.
		 */
		if (sgx_vepc_free_page(entry))
			continue;

		xa_erase(&vepc->page_array, index);
	}

	/*
	 * Retry EREMOVE'ing pages.  This will clean up any SECS pages that
	 * only had children in this 'epc' area.
	 */
	xa_for_each(&vepc->page_array, index, entry) {
		epc_page = entry;
		/*
		 * An EREMOVE failure here means that the SECS page still
		 * has children.  But, since all children in this 'sgx_vepc'
		 * have been removed, the SECS page must have a child on
		 * another instance.
		 */
		if (sgx_vepc_free_page(epc_page))
			list_add_tail(&epc_page->list, &secs_pages);

		xa_erase(&vepc->page_array, index);
	}

	/*
	 * SECS pages are "pinned" by child pages, and "unpinned" once all
	 * children have been EREMOVE'd.  A child page in this instance
	 * may have pinned an SECS page encountered in an earlier release(),
	 * creating a zombie.  Since some children were EREMOVE'd above,
	 * try to EREMOVE all zombies in the hopes that one was unpinned.
	 */
	mutex_lock(&zombie_secs_pages_lock);
	list_for_each_entry_safe(epc_page, tmp, &zombie_secs_pages, list) {
		/*
		 * Speculatively remove the page from the list of zombies;
		 * if the page is successfully EREMOVE'd, it will be added to
		 * the list of free pages.  If EREMOVE fails, throw the page
		 * on the local list, which will be spliced on at the end.
		 */
		list_del(&epc_page->list);

		if (sgx_vepc_free_page(epc_page))
			list_add_tail(&epc_page->list, &secs_pages);
	}

	if (!list_empty(&secs_pages))
		list_splice_tail(&secs_pages, &zombie_secs_pages);
	mutex_unlock(&zombie_secs_pages_lock);

	kfree(vepc);

	return 0;
}

static int sgx_vepc_open(struct inode *inode, struct file *file)
{
	struct sgx_vepc *vepc;

	vepc = kzalloc(sizeof(struct sgx_vepc), GFP_KERNEL);
	if (!vepc)
		return -ENOMEM;
	mutex_init(&vepc->lock);
	xa_init(&vepc->page_array);

	file->private_data = vepc;

	return 0;
}

static const struct file_operations sgx_vepc_fops = {
	.owner		= THIS_MODULE,
	.open		= sgx_vepc_open,
	.release	= sgx_vepc_release,
	.mmap		= sgx_vepc_mmap,
};

static struct miscdevice sgx_vepc_dev = {
	.minor		= MISC_DYNAMIC_MINOR,
	.name		= "sgx_vepc",
	.nodename	= "sgx_vepc",
	.fops		= &sgx_vepc_fops,
};

int __init sgx_vepc_init(void)
{
	/* SGX virtualization requires KVM to work */
	if (!cpu_feature_enabled(X86_FEATURE_VMX))
		return -ENODEV;

	INIT_LIST_HEAD(&zombie_secs_pages);
	mutex_init(&zombie_secs_pages_lock);

	return misc_register(&sgx_vepc_dev);
}