1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Device driver to expose SGX enclave memory to KVM guests. 4 * 5 * Copyright(c) 2021 Intel Corporation. 6 */ 7 8 #include <linux/kvm_types.h> 9 #include <linux/miscdevice.h> 10 #include <linux/mm.h> 11 #include <linux/mman.h> 12 #include <linux/sched/mm.h> 13 #include <linux/sched/signal.h> 14 #include <linux/slab.h> 15 #include <linux/xarray.h> 16 #include <asm/sgx.h> 17 #include <uapi/asm/sgx.h> 18 19 #include "encls.h" 20 #include "sgx.h" 21 22 struct sgx_vepc { 23 struct xarray page_array; 24 struct mutex lock; 25 }; 26 27 /* 28 * Temporary SECS pages that cannot be EREMOVE'd due to having child in other 29 * virtual EPC instances, and the lock to protect it. 30 */ 31 static struct mutex zombie_secs_pages_lock; 32 static struct list_head zombie_secs_pages; 33 34 static int __sgx_vepc_fault(struct sgx_vepc *vepc, 35 struct vm_area_struct *vma, unsigned long addr) 36 { 37 struct sgx_epc_page *epc_page; 38 unsigned long index, pfn; 39 int ret; 40 41 WARN_ON(!mutex_is_locked(&vepc->lock)); 42 43 /* Calculate index of EPC page in virtual EPC's page_array */ 44 index = vma->vm_pgoff + PFN_DOWN(addr - vma->vm_start); 45 46 epc_page = xa_load(&vepc->page_array, index); 47 if (epc_page) 48 return 0; 49 50 epc_page = sgx_alloc_epc_page(vepc, false); 51 if (IS_ERR(epc_page)) 52 return PTR_ERR(epc_page); 53 54 ret = xa_err(xa_store(&vepc->page_array, index, epc_page, GFP_KERNEL)); 55 if (ret) 56 goto err_free; 57 58 pfn = PFN_DOWN(sgx_get_epc_phys_addr(epc_page)); 59 60 ret = vmf_insert_pfn(vma, addr, pfn); 61 if (ret != VM_FAULT_NOPAGE) { 62 ret = -EFAULT; 63 goto err_delete; 64 } 65 66 return 0; 67 68 err_delete: 69 xa_erase(&vepc->page_array, index); 70 err_free: 71 sgx_free_epc_page(epc_page); 72 return ret; 73 } 74 75 static vm_fault_t sgx_vepc_fault(struct vm_fault *vmf) 76 { 77 struct vm_area_struct *vma = vmf->vma; 78 struct sgx_vepc *vepc = vma->vm_private_data; 79 int ret; 80 81 mutex_lock(&vepc->lock); 82 ret = __sgx_vepc_fault(vepc, vma, vmf->address); 83 mutex_unlock(&vepc->lock); 84 85 if (!ret) 86 return VM_FAULT_NOPAGE; 87 88 if (ret == -EBUSY && (vmf->flags & FAULT_FLAG_ALLOW_RETRY)) { 89 mmap_read_unlock(vma->vm_mm); 90 return VM_FAULT_RETRY; 91 } 92 93 return VM_FAULT_SIGBUS; 94 } 95 96 static const struct vm_operations_struct sgx_vepc_vm_ops = { 97 .fault = sgx_vepc_fault, 98 }; 99 100 static int sgx_vepc_mmap(struct file *file, struct vm_area_struct *vma) 101 { 102 struct sgx_vepc *vepc = file->private_data; 103 104 if (!(vma->vm_flags & VM_SHARED)) 105 return -EINVAL; 106 107 vma->vm_ops = &sgx_vepc_vm_ops; 108 /* Don't copy VMA in fork() */ 109 vm_flags_set(vma, VM_PFNMAP | VM_IO | VM_DONTDUMP | VM_DONTCOPY); 110 vma->vm_private_data = vepc; 111 112 return 0; 113 } 114 115 static int sgx_vepc_remove_page(struct sgx_epc_page *epc_page) 116 { 117 /* 118 * Take a previously guest-owned EPC page and return it to the 119 * general EPC page pool. 120 * 121 * Guests can not be trusted to have left this page in a good 122 * state, so run EREMOVE on the page unconditionally. In the 123 * case that a guest properly EREMOVE'd this page, a superfluous 124 * EREMOVE is harmless. 125 */ 126 return __eremove(sgx_get_epc_virt_addr(epc_page)); 127 } 128 129 static int sgx_vepc_free_page(struct sgx_epc_page *epc_page) 130 { 131 int ret = sgx_vepc_remove_page(epc_page); 132 if (ret) { 133 /* 134 * Only SGX_CHILD_PRESENT is expected, which is because of 135 * EREMOVE'ing an SECS still with child, in which case it can 136 * be handled by EREMOVE'ing the SECS again after all pages in 137 * virtual EPC have been EREMOVE'd. See comments in below in 138 * sgx_vepc_release(). 139 * 140 * The user of virtual EPC (KVM) needs to guarantee there's no 141 * logical processor is still running in the enclave in guest, 142 * otherwise EREMOVE will get SGX_ENCLAVE_ACT which cannot be 143 * handled here. 144 */ 145 WARN_ONCE(ret != SGX_CHILD_PRESENT, EREMOVE_ERROR_MESSAGE, 146 ret, ret); 147 return ret; 148 } 149 150 sgx_free_epc_page(epc_page); 151 return 0; 152 } 153 154 static long sgx_vepc_remove_all(struct sgx_vepc *vepc) 155 { 156 struct sgx_epc_page *entry; 157 unsigned long index; 158 long failures = 0; 159 160 xa_for_each(&vepc->page_array, index, entry) { 161 int ret = sgx_vepc_remove_page(entry); 162 if (ret) { 163 if (ret == SGX_CHILD_PRESENT) { 164 /* The page is a SECS, userspace will retry. */ 165 failures++; 166 } else { 167 /* 168 * Report errors due to #GP or SGX_ENCLAVE_ACT; do not 169 * WARN, as userspace can induce said failures by 170 * calling the ioctl concurrently on multiple vEPCs or 171 * while one or more CPUs is running the enclave. Only 172 * a #PF on EREMOVE indicates a kernel/hardware issue. 173 */ 174 WARN_ON_ONCE(encls_faulted(ret) && 175 ENCLS_TRAPNR(ret) != X86_TRAP_GP); 176 return -EBUSY; 177 } 178 } 179 cond_resched(); 180 } 181 182 /* 183 * Return the number of SECS pages that failed to be removed, so 184 * userspace knows that it has to retry. 185 */ 186 return failures; 187 } 188 189 static int sgx_vepc_release(struct inode *inode, struct file *file) 190 { 191 struct sgx_vepc *vepc = file->private_data; 192 struct sgx_epc_page *epc_page, *tmp, *entry; 193 unsigned long index; 194 195 LIST_HEAD(secs_pages); 196 197 xa_for_each(&vepc->page_array, index, entry) { 198 /* 199 * Remove all normal, child pages. sgx_vepc_free_page() 200 * will fail if EREMOVE fails, but this is OK and expected on 201 * SECS pages. Those can only be EREMOVE'd *after* all their 202 * child pages. Retries below will clean them up. 203 */ 204 if (sgx_vepc_free_page(entry)) 205 continue; 206 207 xa_erase(&vepc->page_array, index); 208 cond_resched(); 209 } 210 211 /* 212 * Retry EREMOVE'ing pages. This will clean up any SECS pages that 213 * only had children in this 'epc' area. 214 */ 215 xa_for_each(&vepc->page_array, index, entry) { 216 epc_page = entry; 217 /* 218 * An EREMOVE failure here means that the SECS page still 219 * has children. But, since all children in this 'sgx_vepc' 220 * have been removed, the SECS page must have a child on 221 * another instance. 222 */ 223 if (sgx_vepc_free_page(epc_page)) 224 list_add_tail(&epc_page->list, &secs_pages); 225 226 xa_erase(&vepc->page_array, index); 227 cond_resched(); 228 } 229 230 /* 231 * SECS pages are "pinned" by child pages, and "unpinned" once all 232 * children have been EREMOVE'd. A child page in this instance 233 * may have pinned an SECS page encountered in an earlier release(), 234 * creating a zombie. Since some children were EREMOVE'd above, 235 * try to EREMOVE all zombies in the hopes that one was unpinned. 236 */ 237 mutex_lock(&zombie_secs_pages_lock); 238 list_for_each_entry_safe(epc_page, tmp, &zombie_secs_pages, list) { 239 /* 240 * Speculatively remove the page from the list of zombies, 241 * if the page is successfully EREMOVE'd it will be added to 242 * the list of free pages. If EREMOVE fails, throw the page 243 * on the local list, which will be spliced on at the end. 244 */ 245 list_del(&epc_page->list); 246 247 if (sgx_vepc_free_page(epc_page)) 248 list_add_tail(&epc_page->list, &secs_pages); 249 cond_resched(); 250 } 251 252 if (!list_empty(&secs_pages)) 253 list_splice_tail(&secs_pages, &zombie_secs_pages); 254 mutex_unlock(&zombie_secs_pages_lock); 255 256 xa_destroy(&vepc->page_array); 257 kfree(vepc); 258 259 sgx_dec_usage_count(); 260 return 0; 261 } 262 263 static int __sgx_vepc_open(struct inode *inode, struct file *file) 264 { 265 struct sgx_vepc *vepc; 266 267 vepc = kzalloc(sizeof(struct sgx_vepc), GFP_KERNEL); 268 if (!vepc) 269 return -ENOMEM; 270 mutex_init(&vepc->lock); 271 xa_init(&vepc->page_array); 272 273 file->private_data = vepc; 274 275 return 0; 276 } 277 278 static int sgx_vepc_open(struct inode *inode, struct file *file) 279 { 280 int ret; 281 282 ret = sgx_inc_usage_count(); 283 if (ret) 284 return ret; 285 286 ret = __sgx_vepc_open(inode, file); 287 if (ret) { 288 sgx_dec_usage_count(); 289 return ret; 290 } 291 292 return 0; 293 } 294 295 static long sgx_vepc_ioctl(struct file *file, 296 unsigned int cmd, unsigned long arg) 297 { 298 struct sgx_vepc *vepc = file->private_data; 299 300 switch (cmd) { 301 case SGX_IOC_VEPC_REMOVE_ALL: 302 if (arg) 303 return -EINVAL; 304 return sgx_vepc_remove_all(vepc); 305 306 default: 307 return -ENOTTY; 308 } 309 } 310 311 static const struct file_operations sgx_vepc_fops = { 312 .owner = THIS_MODULE, 313 .open = sgx_vepc_open, 314 .unlocked_ioctl = sgx_vepc_ioctl, 315 .compat_ioctl = sgx_vepc_ioctl, 316 .release = sgx_vepc_release, 317 .mmap = sgx_vepc_mmap, 318 }; 319 320 static struct miscdevice sgx_vepc_dev = { 321 .minor = MISC_DYNAMIC_MINOR, 322 .name = "sgx_vepc", 323 .nodename = "sgx_vepc", 324 .fops = &sgx_vepc_fops, 325 }; 326 327 int __init sgx_vepc_init(void) 328 { 329 /* SGX virtualization requires KVM to work */ 330 if (!cpu_feature_enabled(X86_FEATURE_VMX)) 331 return -ENODEV; 332 333 INIT_LIST_HEAD(&zombie_secs_pages); 334 mutex_init(&zombie_secs_pages_lock); 335 336 return misc_register(&sgx_vepc_dev); 337 } 338 339 /** 340 * sgx_virt_ecreate() - Run ECREATE on behalf of guest 341 * @pageinfo: Pointer to PAGEINFO structure 342 * @secs: Userspace pointer to SECS page 343 * @trapnr: trap number injected to guest in case of ECREATE error 344 * 345 * Run ECREATE on behalf of guest after KVM traps ECREATE for the purpose 346 * of enforcing policies of guest's enclaves, and return the trap number 347 * which should be injected to guest in case of any ECREATE error. 348 * 349 * Return: 350 * - 0: ECREATE was successful. 351 * - <0: on error. 352 */ 353 int sgx_virt_ecreate(struct sgx_pageinfo *pageinfo, void __user *secs, 354 int *trapnr) 355 { 356 int ret; 357 358 /* 359 * @secs is an untrusted, userspace-provided address. It comes from 360 * KVM and is assumed to be a valid pointer which points somewhere in 361 * userspace. This can fault and call SGX or other fault handlers when 362 * userspace mapping @secs doesn't exist. 363 * 364 * Add a WARN() to make sure @secs is already valid userspace pointer 365 * from caller (KVM), who should already have handled invalid pointer 366 * case (for instance, made by malicious guest). All other checks, 367 * such as alignment of @secs, are deferred to ENCLS itself. 368 */ 369 if (WARN_ON_ONCE(!access_ok(secs, PAGE_SIZE))) 370 return -EINVAL; 371 372 __uaccess_begin(); 373 ret = __ecreate(pageinfo, (void *)secs); 374 __uaccess_end(); 375 376 if (encls_faulted(ret)) { 377 *trapnr = ENCLS_TRAPNR(ret); 378 return -EFAULT; 379 } 380 381 /* ECREATE doesn't return an error code, it faults or succeeds. */ 382 WARN_ON_ONCE(ret); 383 return 0; 384 } 385 EXPORT_SYMBOL_FOR_KVM(sgx_virt_ecreate); 386 387 static int __sgx_virt_einit(void __user *sigstruct, void __user *token, 388 void __user *secs) 389 { 390 int ret; 391 392 /* 393 * Make sure all userspace pointers from caller (KVM) are valid. 394 * All other checks deferred to ENCLS itself. Also see comment 395 * for @secs in sgx_virt_ecreate(). 396 */ 397 #define SGX_EINITTOKEN_SIZE 304 398 if (WARN_ON_ONCE(!access_ok(sigstruct, sizeof(struct sgx_sigstruct)) || 399 !access_ok(token, SGX_EINITTOKEN_SIZE) || 400 !access_ok(secs, PAGE_SIZE))) 401 return -EINVAL; 402 403 __uaccess_begin(); 404 ret = __einit((void *)sigstruct, (void *)token, (void *)secs); 405 __uaccess_end(); 406 407 return ret; 408 } 409 410 /** 411 * sgx_virt_einit() - Run EINIT on behalf of guest 412 * @sigstruct: Userspace pointer to SIGSTRUCT structure 413 * @token: Userspace pointer to EINITTOKEN structure 414 * @secs: Userspace pointer to SECS page 415 * @lepubkeyhash: Pointer to guest's *virtual* SGX_LEPUBKEYHASH MSR values 416 * @trapnr: trap number injected to guest in case of EINIT error 417 * 418 * Run EINIT on behalf of guest after KVM traps EINIT. If SGX_LC is available 419 * in host, SGX driver may rewrite the hardware values at wish, therefore KVM 420 * needs to update hardware values to guest's virtual MSR values in order to 421 * ensure EINIT is executed with expected hardware values. 422 * 423 * Return: 424 * - 0: EINIT was successful. 425 * - <0: on error. 426 */ 427 int sgx_virt_einit(void __user *sigstruct, void __user *token, 428 void __user *secs, u64 *lepubkeyhash, int *trapnr) 429 { 430 int ret; 431 432 if (!cpu_feature_enabled(X86_FEATURE_SGX_LC)) { 433 ret = __sgx_virt_einit(sigstruct, token, secs); 434 } else { 435 preempt_disable(); 436 437 sgx_update_lepubkeyhash(lepubkeyhash); 438 439 ret = __sgx_virt_einit(sigstruct, token, secs); 440 preempt_enable(); 441 } 442 443 /* Propagate up the error from the WARN_ON_ONCE in __sgx_virt_einit() */ 444 if (ret == -EINVAL) 445 return ret; 446 447 if (encls_faulted(ret)) { 448 *trapnr = ENCLS_TRAPNR(ret); 449 return -EFAULT; 450 } 451 452 return ret; 453 } 454 EXPORT_SYMBOL_FOR_KVM(sgx_virt_einit); 455