1 // SPDX-License-Identifier: GPL-2.0 2 /* Copyright(c) 2016-20 Intel Corporation. */ 3 4 #include <linux/lockdep.h> 5 #include <linux/mm.h> 6 #include <linux/mman.h> 7 #include <linux/shmem_fs.h> 8 #include <linux/suspend.h> 9 #include <linux/sched/mm.h> 10 #include <asm/sgx.h> 11 #include "encl.h" 12 #include "encls.h" 13 #include "sgx.h" 14 15 static int sgx_encl_lookup_backing(struct sgx_encl *encl, unsigned long page_index, 16 struct sgx_backing *backing); 17 18 #define PCMDS_PER_PAGE (PAGE_SIZE / sizeof(struct sgx_pcmd)) 19 /* 20 * 32 PCMD entries share a PCMD page. PCMD_FIRST_MASK is used to 21 * determine the page index associated with the first PCMD entry 22 * within a PCMD page. 23 */ 24 #define PCMD_FIRST_MASK GENMASK(4, 0) 25 26 /** 27 * reclaimer_writing_to_pcmd() - Query if any enclave page associated with 28 * a PCMD page is in process of being reclaimed. 29 * @encl: Enclave to which PCMD page belongs 30 * @start_addr: Address of enclave page using first entry within the PCMD page 31 * 32 * When an enclave page is reclaimed some Paging Crypto MetaData (PCMD) is 33 * stored. The PCMD data of a reclaimed enclave page contains enough 34 * information for the processor to verify the page at the time 35 * it is loaded back into the Enclave Page Cache (EPC). 36 * 37 * The backing storage to which enclave pages are reclaimed is laid out as 38 * follows: 39 * Encrypted enclave pages:SECS page:PCMD pages 40 * 41 * Each PCMD page contains the PCMD metadata of 42 * PAGE_SIZE/sizeof(struct sgx_pcmd) enclave pages. 43 * 44 * A PCMD page can only be truncated if it is (a) empty, and (b) not in the 45 * process of getting data (and thus soon being non-empty). (b) is tested with 46 * a check if an enclave page sharing the PCMD page is in the process of being 47 * reclaimed. 
48 * 49 * The reclaimer sets the SGX_ENCL_PAGE_BEING_RECLAIMED flag when it 50 * intends to reclaim that enclave page - it means that the PCMD page 51 * associated with that enclave page is about to get some data and thus 52 * even if the PCMD page is empty, it should not be truncated. 53 * 54 * Context: Enclave mutex (&sgx_encl->lock) must be held. 55 * Return: 1 if the reclaimer is about to write to the PCMD page 56 * 0 if the reclaimer has no intention to write to the PCMD page 57 */ 58 static int reclaimer_writing_to_pcmd(struct sgx_encl *encl, 59 unsigned long start_addr) 60 { 61 int reclaimed = 0; 62 int i; 63 64 /* 65 * PCMD_FIRST_MASK is based on number of PCMD entries within 66 * PCMD page being 32. 67 */ 68 BUILD_BUG_ON(PCMDS_PER_PAGE != 32); 69 70 for (i = 0; i < PCMDS_PER_PAGE; i++) { 71 struct sgx_encl_page *entry; 72 unsigned long addr; 73 74 addr = start_addr + i * PAGE_SIZE; 75 76 /* 77 * Stop when reaching the SECS page - it does not 78 * have a page_array entry and its reclaim is 79 * started and completed with enclave mutex held so 80 * it does not use the SGX_ENCL_PAGE_BEING_RECLAIMED 81 * flag. 82 */ 83 if (addr == encl->base + encl->size) 84 break; 85 86 entry = xa_load(&encl->page_array, PFN_DOWN(addr)); 87 if (!entry) 88 continue; 89 90 /* 91 * VA page slot ID uses same bit as the flag so it is important 92 * to ensure that the page is not already in backing store. 93 */ 94 if (entry->epc_page && 95 (entry->desc & SGX_ENCL_PAGE_BEING_RECLAIMED)) { 96 reclaimed = 1; 97 break; 98 } 99 } 100 101 return reclaimed; 102 } 103 104 /* 105 * Calculate byte offset of a PCMD struct associated with an enclave page. PCMD's 106 * follow right after the EPC data in the backing storage. In addition to the 107 * visible enclave pages, there's one extra page slot for SECS, before PCMD 108 * structs. 
*/
static inline pgoff_t sgx_encl_get_backing_page_pcmd_offset(struct sgx_encl *encl,
							    unsigned long page_index)
{
	/*
	 * Byte offset of the first PCMD struct: enclave size plus the SECS.
	 * NOTE(review): this matches the "one extra page slot" wording only if
	 * sizeof(struct sgx_secs) equals the size of that slot - confirm.
	 */
	pgoff_t epc_end_off = encl->size + sizeof(struct sgx_secs);

	return epc_end_off + page_index * sizeof(struct sgx_pcmd);
}

/*
 * Free a page from the backing storage in the given page index.
 *
 * @page_index is an index into the backing file in units of pages; the byte
 * range [PFN_PHYS(page_index), PFN_PHYS(page_index) + PAGE_SIZE - 1] is
 * truncated from the inode behind encl->backing.
 */
static inline void sgx_encl_truncate_backing_page(struct sgx_encl *encl, unsigned long page_index)
{
	struct inode *inode = file_inode(encl->backing);

	shmem_truncate_range(inode, PFN_PHYS(page_index), PFN_PHYS(page_index) + PAGE_SIZE - 1);
}

/*
 * ELDU: Load an EPC page as unblocked. For more info, see "OS Management of EPC
 * Pages" in the SDM.
 *
 * @encl_page: enclave page descriptor of the page being loaded
 * @epc_page:  destination EPC page
 * @secs_page: EPC page holding the SECS, or NULL when the page being loaded
 *             is the SECS itself (see sgx_encl_load_secs())
 *
 * After the ELDU attempt the PCMD entry of the page is zeroed and the backing
 * page holding the encrypted contents is truncated, regardless of whether
 * ELDU succeeded. The PCMD page is truncated as well when it became empty and
 * no page sharing it is being reclaimed.
 *
 * Return: 0 on success, the error from sgx_encl_lookup_backing() if the
 * backing pages could not be obtained, -EFAULT if ELDU failed.
 */
static int __sgx_encl_eldu(struct sgx_encl_page *encl_page,
			   struct sgx_epc_page *epc_page,
			   struct sgx_epc_page *secs_page)
{
	unsigned long va_offset = encl_page->desc & SGX_ENCL_PAGE_VA_OFFSET_MASK;
	struct sgx_encl *encl = encl_page->encl;
	pgoff_t page_index, page_pcmd_off;
	unsigned long pcmd_first_page;
	struct sgx_pageinfo pginfo;
	struct sgx_backing b;
	bool pcmd_page_empty;
	u8 *pcmd_page;
	int ret;

	/*
	 * Regular pages are indexed by their offset from the enclave base;
	 * the SECS uses the slot right after the last enclave page.
	 */
	if (secs_page)
		page_index = PFN_DOWN(encl_page->desc - encl_page->encl->base);
	else
		page_index = PFN_DOWN(encl->size);

	/*
	 * Address of enclave page using the first entry within the PCMD page.
	 */
	pcmd_first_page = PFN_PHYS(page_index & ~PCMD_FIRST_MASK) + encl->base;

	page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index);

	ret = sgx_encl_lookup_backing(encl, page_index, &b);
	if (ret)
		return ret;

	/* Map both backing pages and describe them to the instruction. */
	pginfo.addr = encl_page->desc & PAGE_MASK;
	pginfo.contents = (unsigned long)kmap_local_page(b.contents);
	pcmd_page = kmap_local_page(b.pcmd);
	pginfo.metadata = (unsigned long)pcmd_page + b.pcmd_offset;

	if (secs_page)
		pginfo.secs = (u64)sgx_get_epc_virt_addr(secs_page);
	else
		pginfo.secs = 0;

	ret = __eldu(&pginfo, sgx_get_epc_virt_addr(epc_page),
		     sgx_get_epc_virt_addr(encl_page->va_page->epc_page) + va_offset);
	if (ret) {
		if (encls_failed(ret))
			ENCLS_WARN(ret, "ELDU");

		/* Any ELDU failure is reported to the caller as -EFAULT. */
		ret = -EFAULT;
	}

	/* Clear this page's PCMD entry; done on the failure path as well. */
	memset(pcmd_page + b.pcmd_offset, 0, sizeof(struct sgx_pcmd));
	set_page_dirty(b.pcmd);

	/*
	 * The area for the PCMD in the page was zeroed above.  Check if the
	 * whole page is now empty meaning that all PCMD's have been zeroed:
	 */
	pcmd_page_empty = !memchr_inv(pcmd_page, 0, PAGE_SIZE);

	kunmap_local(pcmd_page);
	kunmap_local((void *)(unsigned long)pginfo.contents);

	/*
	 * sgx_encl_put_backing() drops a reference on b.pcmd; take an extra
	 * one first so the page can still be mapped and inspected below.
	 */
	get_page(b.pcmd);
	sgx_encl_put_backing(&b);

	sgx_encl_truncate_backing_page(encl, page_index);

	if (pcmd_page_empty && !reclaimer_writing_to_pcmd(encl, pcmd_first_page)) {
		sgx_encl_truncate_backing_page(encl, PFN_DOWN(page_pcmd_off));
		/* Sanity check: the PCMD page must still read back as zeroes. */
		pcmd_page = kmap_local_page(b.pcmd);
		if (memchr_inv(pcmd_page, 0, PAGE_SIZE))
			pr_warn("PCMD page not empty after truncate.\n");
		kunmap_local(pcmd_page);
	}

	/* Drop the extra reference taken before sgx_encl_put_backing(). */
	put_page(b.pcmd);

	return ret;
}

/*
 * Allocate an EPC page and load @encl_page into it with ELDU.
 *
 * On success the VA slot that the page occupied is freed, the VA page is
 * moved to the head of encl->va_pages, the VA offset bits are cleared from
 * encl_page->desc and encl_page->epc_page is set.
 *
 * Return: the new EPC page, or an ERR_PTR() from sgx_alloc_epc_page() or
 * __sgx_encl_eldu() (the EPC page is freed again in the latter case).
 */
static struct sgx_epc_page *sgx_encl_eldu(struct sgx_encl_page *encl_page,
					  struct sgx_epc_page *secs_page)
{

	unsigned long va_offset = encl_page->desc & SGX_ENCL_PAGE_VA_OFFSET_MASK;
	struct sgx_encl *encl = encl_page->encl;
	struct sgx_epc_page *epc_page;
	int ret;

	epc_page = sgx_alloc_epc_page(encl_page, false);
	if (IS_ERR(epc_page))
		return epc_page;

	ret = __sgx_encl_eldu(encl_page, epc_page, secs_page);
	if (ret) {
		sgx_encl_free_epc_page(epc_page);
		return ERR_PTR(ret);
	}

	sgx_free_va_slot(encl_page->va_page, va_offset);
	list_move(&encl_page->va_page->list, &encl->va_pages);
	encl_page->desc &= ~SGX_ENCL_PAGE_VA_OFFSET_MASK;
	encl_page->epc_page = epc_page;

	return epc_page;
}

/*
 * Ensure the SECS page is not swapped out.  Must be called with encl->lock
 * to protect the enclave states including SECS and ensure the SECS page is
 * not swapped out again while being used.
 *
 * Return: the EPC page of the SECS, or an ERR_PTR() from sgx_encl_eldu() if
 * it had to be loaded and that failed.
 */
static struct sgx_epc_page *sgx_encl_load_secs(struct sgx_encl *encl)
{
	struct sgx_epc_page *epc_page = encl->secs.epc_page;

	/* A NULL secs_page argument tells ELDU the SECS itself is loaded. */
	if (!epc_page)
		epc_page = sgx_encl_eldu(&encl->secs, NULL);

	return epc_page;
}

/*
 * Make sure @entry has an EPC page, loading the SECS and the page itself
 * from backing storage when needed.
 *
 * Return: @entry on success, ERR_PTR(-EBUSY) if the page is resident but
 * being reclaimed, or the ERR_PTR() of a failed load.
 */
static struct sgx_encl_page *__sgx_encl_load_page(struct sgx_encl *encl,
						  struct sgx_encl_page *entry)
{
	struct sgx_epc_page *epc_page;

	/* Entry successfully located. */
	if (entry->epc_page) {
		if (entry->desc & SGX_ENCL_PAGE_BEING_RECLAIMED)
			return ERR_PTR(-EBUSY);

		return entry;
	}

	/* The SECS must be resident before a child page can be loaded. */
	epc_page = sgx_encl_load_secs(encl);
	if (IS_ERR(epc_page))
		return ERR_CAST(epc_page);

	epc_page = sgx_encl_eldu(entry, encl->secs.epc_page);
	if (IS_ERR(epc_page))
		return ERR_CAST(epc_page);

	encl->secs_child_cnt++;
	sgx_mark_page_reclaimable(entry->epc_page);

	return entry;
}

/*
 * Look up the enclave page at @addr and load it, after checking that the
 * access bits in @vm_flags do not exceed the page's vm_max_prot_bits.
 *
 * Return: the page entry, ERR_PTR(-EFAULT) if there is no entry or the
 * permission check fails, otherwise the result of __sgx_encl_load_page().
 */
static struct sgx_encl_page *sgx_encl_load_page_in_vma(struct sgx_encl *encl,
						       unsigned long addr,
						       vm_flags_t vm_flags)
{
	unsigned long vm_prot_bits = vm_flags & VM_ACCESS_FLAGS;
	struct sgx_encl_page *entry;

	entry = xa_load(&encl->page_array, PFN_DOWN(addr));
	if (!entry)
		return ERR_PTR(-EFAULT);

	/*
	 * Verify that the page has equal or higher build time
	 * permissions than the VMA permissions (i.e. the subset of {VM_READ,
	 * VM_WRITE, VM_EXECUTE} in vma->vm_flags).
	 */
	if ((entry->vm_max_prot_bits & vm_prot_bits) != vm_prot_bits)
		return ERR_PTR(-EFAULT);

	return __sgx_encl_load_page(encl, entry);
}

/*
 * Look up the enclave page at @addr and load it without any permission
 * check.
 *
 * Return: the page entry, ERR_PTR(-EFAULT) if there is no entry, otherwise
 * the result of __sgx_encl_load_page().
 */
struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl,
					 unsigned long addr)
{
	struct sgx_encl_page *entry;

	entry = xa_load(&encl->page_array, PFN_DOWN(addr));
	if (!entry)
		return ERR_PTR(-EFAULT);

	return __sgx_encl_load_page(encl, entry);
}

/**
 * sgx_encl_eaug_page() - Dynamically add page to initialized enclave
 * @vma:	VMA obtained from fault info from where page is accessed
 * @encl:	enclave accessing the page
 * @addr:	address that triggered the page fault
 *
 * When an initialized enclave accesses a page with no backing EPC page
 * on a SGX2 system then the EPC can be added dynamically via the SGX2
 * ENCLS[EAUG] instruction.
*
 * Returns:	Appropriate vm_fault_t: VM_FAULT_NOPAGE when PTE was installed
 *		successfully, VM_FAULT_SIGBUS or VM_FAULT_OOM as error otherwise.
 */
static vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma,
				     struct sgx_encl *encl, unsigned long addr)
{
	vm_fault_t vmret = VM_FAULT_SIGBUS;
	struct sgx_pageinfo pginfo = {0};
	struct sgx_encl_page *encl_page;
	struct sgx_epc_page *epc_page;
	struct sgx_va_page *va_page;
	unsigned long phys_addr;
	u64 secinfo_flags;
	int ret;

	if (!test_bit(SGX_ENCL_INITIALIZED, &encl->flags))
		return VM_FAULT_SIGBUS;

	/*
	 * Ignore internal permission checking for dynamically added pages.
	 * They matter only for data added during the pre-initialization
	 * phase. The enclave decides the permissions by the means of
	 * EACCEPT, EACCEPTCOPY and EMODPE.
	 */
	secinfo_flags = SGX_SECINFO_R | SGX_SECINFO_W | SGX_SECINFO_X;
	/* The descriptor is allocated before encl->lock is taken. */
	encl_page = sgx_encl_page_alloc(encl, addr - encl->base, secinfo_flags);
	if (IS_ERR(encl_page))
		return VM_FAULT_OOM;

	mutex_lock(&encl->lock);

	/*
	 * For each step below, -EBUSY is turned into VM_FAULT_NOPAGE instead
	 * of the default VM_FAULT_SIGBUS.
	 */
	epc_page = sgx_encl_load_secs(encl);
	if (IS_ERR(epc_page)) {
		if (PTR_ERR(epc_page) == -EBUSY)
			vmret = VM_FAULT_NOPAGE;
		goto err_out_unlock;
	}

	epc_page = sgx_alloc_epc_page(encl_page, false);
	if (IS_ERR(epc_page)) {
		if (PTR_ERR(epc_page) == -EBUSY)
			vmret = VM_FAULT_NOPAGE;
		goto err_out_unlock;
	}

	va_page = sgx_encl_grow(encl, false);
	if (IS_ERR(va_page)) {
		if (PTR_ERR(va_page) == -EBUSY)
			vmret = VM_FAULT_NOPAGE;
		goto err_out_epc;
	}

	/* sgx_encl_grow() may return NULL, in which case nothing is added. */
	if (va_page)
		list_add(&va_page->list, &encl->va_pages);

	ret = xa_insert(&encl->page_array, PFN_DOWN(encl_page->desc),
			encl_page, GFP_KERNEL);
	/*
	 * If ret == -EBUSY then page was created in another flow while
	 * running without encl->lock
	 */
	if (ret)
		goto err_out_shrink;

	pginfo.secs = (unsigned long)sgx_get_epc_virt_addr(encl->secs.epc_page);
	pginfo.addr = encl_page->desc & PAGE_MASK;
	pginfo.metadata = 0;

	ret = __eaug(&pginfo, sgx_get_epc_virt_addr(epc_page));
	if (ret)
		goto err_out;

	encl_page->encl = encl;
	encl_page->epc_page = epc_page;
	encl_page->type = SGX_PAGE_TYPE_REG;
	encl->secs_child_cnt++;

	sgx_mark_page_reclaimable(encl_page->epc_page);

	phys_addr = sgx_get_epc_phys_addr(epc_page);
	/*
	 * Do not undo everything when creating PTE entry fails - next #PF
	 * would find page ready for a PTE.
	 */
	vmret = vmf_insert_pfn(vma, addr, PFN_DOWN(phys_addr));
	if (vmret != VM_FAULT_NOPAGE) {
		mutex_unlock(&encl->lock);
		return VM_FAULT_SIGBUS;
	}
	mutex_unlock(&encl->lock);
	return VM_FAULT_NOPAGE;

	/* Unwind in reverse order of setup; each label falls through. */
err_out:
	xa_erase(&encl->page_array, PFN_DOWN(encl_page->desc));

err_out_shrink:
	sgx_encl_shrink(encl, va_page);
err_out_epc:
	sgx_encl_free_epc_page(epc_page);
err_out_unlock:
	mutex_unlock(&encl->lock);
	kfree(encl_page);

	return vmret;
}

/*
 * Page fault handler for enclave VMAs.
 *
 * Without a page_array entry on an SGX2 system the fault is handed to
 * sgx_encl_eaug_page(). Otherwise the page is loaded under encl->lock and its
 * PFN inserted into the VMA.
 *
 * Return: VM_FAULT_NOPAGE when the PTE was installed or when the page is
 * busy (-EBUSY from the load), VM_FAULT_SIGBUS on any other failure.
 */
static vm_fault_t sgx_vma_fault(struct vm_fault *vmf)
{
	unsigned long addr = (unsigned long)vmf->address;
	struct vm_area_struct *vma = vmf->vma;
	struct sgx_encl_page *entry;
	unsigned long phys_addr;
	struct sgx_encl *encl;
	vm_fault_t ret;

	encl = vma->vm_private_data;

	/*
	 * It's very unlikely but possible that allocating memory for the
	 * mm_list entry of a forked process failed in sgx_vma_open(). When
	 * this happens, vm_private_data is set to NULL.
	 */
	if (unlikely(!encl))
		return VM_FAULT_SIGBUS;

	/*
	 * The page_array keeps track of all enclave pages, whether they
	 * are swapped out or not. If there is no entry for this page and
	 * the system supports SGX2 then it is possible to dynamically add
	 * a new enclave page. This is only possible for an initialized
	 * enclave that will be checked for right away.
	 */
	if (cpu_feature_enabled(X86_FEATURE_SGX2) &&
	    (!xa_load(&encl->page_array, PFN_DOWN(addr))))
		return sgx_encl_eaug_page(vma, encl, addr);

	mutex_lock(&encl->lock);

	entry = sgx_encl_load_page_in_vma(encl, addr, vma->vm_flags);
	if (IS_ERR(entry)) {
		mutex_unlock(&encl->lock);

		/* Page is being reclaimed: return without installing a PTE. */
		if (PTR_ERR(entry) == -EBUSY)
			return VM_FAULT_NOPAGE;

		return VM_FAULT_SIGBUS;
	}

	phys_addr = sgx_get_epc_phys_addr(entry->epc_page);

	ret = vmf_insert_pfn(vma, addr, PFN_DOWN(phys_addr));
	if (ret != VM_FAULT_NOPAGE) {
		mutex_unlock(&encl->lock);

		return VM_FAULT_SIGBUS;
	}

	/* Clear the accessed bit of the PTE for this page; result unused. */
	sgx_encl_test_and_clear_young(vma->vm_mm, entry);
	mutex_unlock(&encl->lock);

	return VM_FAULT_NOPAGE;
}

/*
 * VMA open callback: register the VMA's mm with the enclave. If that fails,
 * vm_private_data is cleared so that later callbacks can detect it.
 */
static void sgx_vma_open(struct vm_area_struct *vma)
{
	struct sgx_encl *encl = vma->vm_private_data;

	/*
	 * It's possible but unlikely that vm_private_data is NULL. This can
	 * happen in a grandchild of a process, when sgx_encl_mm_add() had
	 * failed to allocate memory in this callback.
	 */
	if (unlikely(!encl))
		return;

	if (sgx_encl_mm_add(encl, vma->vm_mm))
		vma->vm_private_data = NULL;
}


/**
 * sgx_encl_may_map() - Check if a requested VMA mapping is allowed
 * @encl:		an enclave pointer
 * @start:		lower bound of the address range, inclusive
 * @end:		upper bound of the address range, exclusive
 * @vm_flags:		VMA flags
 *
 * Iterate through the enclave pages contained within [@start, @end) to verify
 * that the permissions requested by a subset of {VM_READ, VM_WRITE, VM_EXEC}
 * do not contain any permissions that are not contained in the build time
 * permissions of any of the enclave pages within the given address range.
515 * 516 * An enclave creator must declare the strongest permissions that will be 517 * needed for each enclave page. This ensures that mappings have the identical 518 * or weaker permissions than the earlier declared permissions. 519 * 520 * Return: 0 on success, -EACCES otherwise 521 */ 522 int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start, 523 unsigned long end, vm_flags_t vm_flags) 524 { 525 vm_flags_t vm_prot_bits = vm_flags & VM_ACCESS_FLAGS; 526 struct sgx_encl_page *page; 527 unsigned long count = 0; 528 int ret = 0; 529 530 XA_STATE(xas, &encl->page_array, PFN_DOWN(start)); 531 532 /* Disallow mapping outside enclave's address range. */ 533 if (test_bit(SGX_ENCL_INITIALIZED, &encl->flags) && 534 (start < encl->base || end > encl->base + encl->size)) 535 return -EACCES; 536 537 /* 538 * Disallow READ_IMPLIES_EXEC tasks as their VMA permissions might 539 * conflict with the enclave page permissions. 540 */ 541 if (current->personality & READ_IMPLIES_EXEC) 542 return -EACCES; 543 544 mutex_lock(&encl->lock); 545 xas_lock(&xas); 546 xas_for_each(&xas, page, PFN_DOWN(end - 1)) { 547 if (~page->vm_max_prot_bits & vm_prot_bits) { 548 ret = -EACCES; 549 break; 550 } 551 552 /* Reschedule on every XA_CHECK_SCHED iteration. 
*/ 553 if (!(++count % XA_CHECK_SCHED)) { 554 xas_pause(&xas); 555 xas_unlock(&xas); 556 mutex_unlock(&encl->lock); 557 558 cond_resched(); 559 560 mutex_lock(&encl->lock); 561 xas_lock(&xas); 562 } 563 } 564 xas_unlock(&xas); 565 mutex_unlock(&encl->lock); 566 567 return ret; 568 } 569 570 static int sgx_vma_mprotect(struct vm_area_struct *vma, unsigned long start, 571 unsigned long end, unsigned long newflags) 572 { 573 return sgx_encl_may_map(vma->vm_private_data, start, end, newflags); 574 } 575 576 static int sgx_encl_debug_read(struct sgx_encl *encl, struct sgx_encl_page *page, 577 unsigned long addr, void *data) 578 { 579 unsigned long offset = addr & ~PAGE_MASK; 580 int ret; 581 582 583 ret = __edbgrd(sgx_get_epc_virt_addr(page->epc_page) + offset, data); 584 if (ret) 585 return -EIO; 586 587 return 0; 588 } 589 590 static int sgx_encl_debug_write(struct sgx_encl *encl, struct sgx_encl_page *page, 591 unsigned long addr, void *data) 592 { 593 unsigned long offset = addr & ~PAGE_MASK; 594 int ret; 595 596 ret = __edbgwr(sgx_get_epc_virt_addr(page->epc_page) + offset, data); 597 if (ret) 598 return -EIO; 599 600 return 0; 601 } 602 603 /* 604 * Load an enclave page to EPC if required, and take encl->lock. 
605 */ 606 static struct sgx_encl_page *sgx_encl_reserve_page(struct sgx_encl *encl, 607 unsigned long addr, 608 vm_flags_t vm_flags) 609 { 610 struct sgx_encl_page *entry; 611 612 for ( ; ; ) { 613 mutex_lock(&encl->lock); 614 615 entry = sgx_encl_load_page_in_vma(encl, addr, vm_flags); 616 if (PTR_ERR(entry) != -EBUSY) 617 break; 618 619 mutex_unlock(&encl->lock); 620 } 621 622 if (IS_ERR(entry)) 623 mutex_unlock(&encl->lock); 624 625 return entry; 626 } 627 628 static int sgx_vma_access(struct vm_area_struct *vma, unsigned long addr, 629 void *buf, int len, int write) 630 { 631 struct sgx_encl *encl = vma->vm_private_data; 632 struct sgx_encl_page *entry = NULL; 633 char data[sizeof(unsigned long)]; 634 unsigned long align; 635 int offset; 636 int cnt; 637 int ret = 0; 638 int i; 639 640 /* 641 * If process was forked, VMA is still there but vm_private_data is set 642 * to NULL. 643 */ 644 if (!encl) 645 return -EFAULT; 646 647 if (!test_bit(SGX_ENCL_DEBUG, &encl->flags)) 648 return -EFAULT; 649 650 for (i = 0; i < len; i += cnt) { 651 entry = sgx_encl_reserve_page(encl, (addr + i) & PAGE_MASK, 652 vma->vm_flags); 653 if (IS_ERR(entry)) { 654 ret = PTR_ERR(entry); 655 break; 656 } 657 658 align = ALIGN_DOWN(addr + i, sizeof(unsigned long)); 659 offset = (addr + i) & (sizeof(unsigned long) - 1); 660 cnt = sizeof(unsigned long) - offset; 661 cnt = min(cnt, len - i); 662 663 ret = sgx_encl_debug_read(encl, entry, align, data); 664 if (ret) 665 goto out; 666 667 if (write) { 668 memcpy(data + offset, buf + i, cnt); 669 ret = sgx_encl_debug_write(encl, entry, align, data); 670 if (ret) 671 goto out; 672 } else { 673 memcpy(buf + i, data + offset, cnt); 674 } 675 676 out: 677 mutex_unlock(&encl->lock); 678 679 if (ret) 680 break; 681 } 682 683 return ret < 0 ? 
ret : i; 684 } 685 686 const struct vm_operations_struct sgx_vm_ops = { 687 .fault = sgx_vma_fault, 688 .mprotect = sgx_vma_mprotect, 689 .open = sgx_vma_open, 690 .access = sgx_vma_access, 691 }; 692 693 /** 694 * sgx_encl_release - Destroy an enclave instance 695 * @ref: address of a kref inside &sgx_encl 696 * 697 * Used together with kref_put(). Frees all the resources associated with the 698 * enclave and the instance itself. 699 */ 700 void sgx_encl_release(struct kref *ref) 701 { 702 struct sgx_encl *encl = container_of(ref, struct sgx_encl, refcount); 703 unsigned long max_page_index = PFN_DOWN(encl->base + encl->size - 1); 704 struct sgx_va_page *va_page; 705 struct sgx_encl_page *entry; 706 unsigned long count = 0; 707 708 XA_STATE(xas, &encl->page_array, PFN_DOWN(encl->base)); 709 710 xas_lock(&xas); 711 xas_for_each(&xas, entry, max_page_index) { 712 if (entry->epc_page) { 713 /* 714 * The page and its radix tree entry cannot be freed 715 * if the page is being held by the reclaimer. 716 */ 717 if (sgx_unmark_page_reclaimable(entry->epc_page)) 718 continue; 719 720 sgx_encl_free_epc_page(entry->epc_page); 721 encl->secs_child_cnt--; 722 entry->epc_page = NULL; 723 } 724 725 kfree(entry); 726 /* 727 * Invoke scheduler on every XA_CHECK_SCHED iteration 728 * to prevent soft lockups. 
729 */ 730 if (!(++count % XA_CHECK_SCHED)) { 731 xas_pause(&xas); 732 xas_unlock(&xas); 733 734 cond_resched(); 735 736 xas_lock(&xas); 737 } 738 } 739 xas_unlock(&xas); 740 741 xa_destroy(&encl->page_array); 742 743 if (!encl->secs_child_cnt && encl->secs.epc_page) { 744 sgx_encl_free_epc_page(encl->secs.epc_page); 745 encl->secs.epc_page = NULL; 746 } 747 748 while (!list_empty(&encl->va_pages)) { 749 va_page = list_first_entry(&encl->va_pages, struct sgx_va_page, 750 list); 751 list_del(&va_page->list); 752 sgx_encl_free_epc_page(va_page->epc_page); 753 kfree(va_page); 754 } 755 756 if (encl->backing) 757 fput(encl->backing); 758 759 cleanup_srcu_struct(&encl->srcu); 760 761 WARN_ON_ONCE(!list_empty(&encl->mm_list)); 762 763 /* Detect EPC page leak's. */ 764 WARN_ON_ONCE(encl->secs_child_cnt); 765 WARN_ON_ONCE(encl->secs.epc_page); 766 767 kfree(encl); 768 sgx_dec_usage_count(); 769 } 770 771 /* 772 * 'mm' is exiting and no longer needs mmu notifications. 773 */ 774 static void sgx_mmu_notifier_release(struct mmu_notifier *mn, 775 struct mm_struct *mm) 776 { 777 struct sgx_encl_mm *encl_mm = container_of(mn, struct sgx_encl_mm, mmu_notifier); 778 struct sgx_encl_mm *tmp = NULL; 779 bool found = false; 780 781 /* 782 * The enclave itself can remove encl_mm. Note, objects can't be moved 783 * off an RCU protected list, but deletion is ok. 
784 */ 785 spin_lock(&encl_mm->encl->mm_lock); 786 list_for_each_entry(tmp, &encl_mm->encl->mm_list, list) { 787 if (tmp == encl_mm) { 788 list_del_rcu(&encl_mm->list); 789 found = true; 790 break; 791 } 792 } 793 spin_unlock(&encl_mm->encl->mm_lock); 794 795 if (found) { 796 synchronize_srcu(&encl_mm->encl->srcu); 797 mmu_notifier_put(mn); 798 } 799 } 800 801 static void sgx_mmu_notifier_free(struct mmu_notifier *mn) 802 { 803 struct sgx_encl_mm *encl_mm = container_of(mn, struct sgx_encl_mm, mmu_notifier); 804 805 /* 'encl_mm' is going away, put encl_mm->encl reference: */ 806 kref_put(&encl_mm->encl->refcount, sgx_encl_release); 807 808 kfree(encl_mm); 809 } 810 811 static const struct mmu_notifier_ops sgx_mmu_notifier_ops = { 812 .release = sgx_mmu_notifier_release, 813 .free_notifier = sgx_mmu_notifier_free, 814 }; 815 816 static struct sgx_encl_mm *sgx_encl_find_mm(struct sgx_encl *encl, 817 struct mm_struct *mm) 818 { 819 struct sgx_encl_mm *encl_mm = NULL; 820 struct sgx_encl_mm *tmp; 821 int idx; 822 823 idx = srcu_read_lock(&encl->srcu); 824 825 list_for_each_entry_rcu(tmp, &encl->mm_list, list) { 826 if (tmp->mm == mm) { 827 encl_mm = tmp; 828 break; 829 } 830 } 831 832 srcu_read_unlock(&encl->srcu, idx); 833 834 return encl_mm; 835 } 836 837 int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm) 838 { 839 struct sgx_encl_mm *encl_mm; 840 int ret; 841 842 /* 843 * Even though a single enclave may be mapped into an mm more than once, 844 * each 'mm' only appears once on encl->mm_list. This is guaranteed by 845 * holding the mm's mmap lock for write before an mm can be added or 846 * remove to an encl->mm_list. 847 */ 848 mmap_assert_write_locked(mm); 849 850 /* 851 * It's possible that an entry already exists in the mm_list, because it 852 * is removed only on VFS release or process exit. 
853 */ 854 if (sgx_encl_find_mm(encl, mm)) 855 return 0; 856 857 encl_mm = kzalloc(sizeof(*encl_mm), GFP_KERNEL); 858 if (!encl_mm) 859 return -ENOMEM; 860 861 /* Grab a refcount for the encl_mm->encl reference: */ 862 kref_get(&encl->refcount); 863 encl_mm->encl = encl; 864 encl_mm->mm = mm; 865 encl_mm->mmu_notifier.ops = &sgx_mmu_notifier_ops; 866 867 ret = __mmu_notifier_register(&encl_mm->mmu_notifier, mm); 868 if (ret) { 869 kfree(encl_mm); 870 return ret; 871 } 872 873 spin_lock(&encl->mm_lock); 874 list_add_rcu(&encl_mm->list, &encl->mm_list); 875 /* Pairs with smp_rmb() in sgx_zap_enclave_ptes(). */ 876 smp_wmb(); 877 encl->mm_list_version++; 878 spin_unlock(&encl->mm_lock); 879 880 return 0; 881 } 882 883 /** 884 * sgx_encl_cpumask() - Query which CPUs might be accessing the enclave 885 * @encl: the enclave 886 * 887 * Some SGX functions require that no cached linear-to-physical address 888 * mappings are present before they can succeed. For example, ENCLS[EWB] 889 * copies a page from the enclave page cache to regular main memory but 890 * it fails if it cannot ensure that there are no cached 891 * linear-to-physical address mappings referring to the page. 892 * 893 * SGX hardware flushes all cached linear-to-physical mappings on a CPU 894 * when an enclave is exited via ENCLU[EEXIT] or an Asynchronous Enclave 895 * Exit (AEX). Exiting an enclave will thus ensure cached linear-to-physical 896 * address mappings are cleared but coordination with the tracking done within 897 * the SGX hardware is needed to support the SGX functions that depend on this 898 * cache clearing. 899 * 900 * When the ENCLS[ETRACK] function is issued on an enclave the hardware 901 * tracks threads operating inside the enclave at that time. 
The SGX 902 * hardware tracking require that all the identified threads must have 903 * exited the enclave in order to flush the mappings before a function such 904 * as ENCLS[EWB] will be permitted 905 * 906 * The following flow is used to support SGX functions that require that 907 * no cached linear-to-physical address mappings are present: 908 * 1) Execute ENCLS[ETRACK] to initiate hardware tracking. 909 * 2) Use this function (sgx_encl_cpumask()) to query which CPUs might be 910 * accessing the enclave. 911 * 3) Send IPI to identified CPUs, kicking them out of the enclave and 912 * thus flushing all locally cached linear-to-physical address mappings. 913 * 4) Execute SGX function. 914 * 915 * Context: It is required to call this function after ENCLS[ETRACK]. 916 * This will ensure that if any new mm appears (racing with 917 * sgx_encl_mm_add()) then the new mm will enter into the 918 * enclave with fresh linear-to-physical address mappings. 919 * 920 * It is required that all IPIs are completed before a new 921 * ENCLS[ETRACK] is issued so be sure to protect steps 1 to 3 922 * of the above flow with the enclave's mutex. 
923 * 924 * Return: cpumask of CPUs that might be accessing @encl 925 */ 926 const cpumask_t *sgx_encl_cpumask(struct sgx_encl *encl) 927 { 928 cpumask_t *cpumask = &encl->cpumask; 929 struct sgx_encl_mm *encl_mm; 930 int idx; 931 932 cpumask_clear(cpumask); 933 934 idx = srcu_read_lock(&encl->srcu); 935 936 list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) { 937 if (!mmget_not_zero(encl_mm->mm)) 938 continue; 939 940 cpumask_or(cpumask, cpumask, mm_cpumask(encl_mm->mm)); 941 942 mmput_async(encl_mm->mm); 943 } 944 945 srcu_read_unlock(&encl->srcu, idx); 946 947 return cpumask; 948 } 949 950 static struct page *sgx_encl_get_backing_page(struct sgx_encl *encl, 951 pgoff_t index) 952 { 953 struct address_space *mapping = encl->backing->f_mapping; 954 gfp_t gfpmask = mapping_gfp_mask(mapping); 955 956 return shmem_read_mapping_page_gfp(mapping, index, gfpmask); 957 } 958 959 /** 960 * __sgx_encl_get_backing() - Pin the backing storage 961 * @encl: an enclave pointer 962 * @page_index: enclave page index 963 * @backing: data for accessing backing storage for the page 964 * 965 * Pin the backing storage pages for storing the encrypted contents and Paging 966 * Crypto MetaData (PCMD) of an enclave page. 967 * 968 * Return: 969 * 0 on success, 970 * -errno otherwise. 
971 */ 972 static int __sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index, 973 struct sgx_backing *backing) 974 { 975 pgoff_t page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index); 976 struct page *contents; 977 struct page *pcmd; 978 979 contents = sgx_encl_get_backing_page(encl, page_index); 980 if (IS_ERR(contents)) 981 return PTR_ERR(contents); 982 983 pcmd = sgx_encl_get_backing_page(encl, PFN_DOWN(page_pcmd_off)); 984 if (IS_ERR(pcmd)) { 985 put_page(contents); 986 return PTR_ERR(pcmd); 987 } 988 989 backing->contents = contents; 990 backing->pcmd = pcmd; 991 backing->pcmd_offset = page_pcmd_off & (PAGE_SIZE - 1); 992 993 return 0; 994 } 995 996 /* 997 * When called from ksgxd, returns the mem_cgroup of a struct mm stored 998 * in the enclave's mm_list. When not called from ksgxd, just returns 999 * the mem_cgroup of the current task. 1000 */ 1001 static struct mem_cgroup *sgx_encl_get_mem_cgroup(struct sgx_encl *encl) 1002 { 1003 struct mem_cgroup *memcg = NULL; 1004 struct sgx_encl_mm *encl_mm; 1005 int idx; 1006 1007 /* 1008 * If called from normal task context, return the mem_cgroup 1009 * of the current task's mm. The remainder of the handling is for 1010 * ksgxd. 1011 */ 1012 if (!current_is_ksgxd()) 1013 return get_mem_cgroup_from_mm(current->mm); 1014 1015 /* 1016 * Search the enclave's mm_list to find an mm associated with 1017 * this enclave to charge the allocation to. 1018 */ 1019 idx = srcu_read_lock(&encl->srcu); 1020 1021 list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) { 1022 if (!mmget_not_zero(encl_mm->mm)) 1023 continue; 1024 1025 memcg = get_mem_cgroup_from_mm(encl_mm->mm); 1026 1027 mmput_async(encl_mm->mm); 1028 1029 break; 1030 } 1031 1032 srcu_read_unlock(&encl->srcu, idx); 1033 1034 /* 1035 * In the rare case that there isn't an mm associated with 1036 * the enclave, set memcg to the current active mem_cgroup. 1037 * This will be the root mem_cgroup if there is no active 1038 * mem_cgroup. 
1039 */ 1040 if (!memcg) 1041 return get_mem_cgroup_from_mm(NULL); 1042 1043 return memcg; 1044 } 1045 1046 /** 1047 * sgx_encl_alloc_backing() - create a new backing storage page 1048 * @encl: an enclave pointer 1049 * @page_index: enclave page index 1050 * @backing: data for accessing backing storage for the page 1051 * 1052 * When called from ksgxd, sets the active memcg from one of the 1053 * mms in the enclave's mm_list prior to any backing page allocation, 1054 * in order to ensure that shmem page allocations are charged to the 1055 * enclave. Create a backing page for loading data back into an EPC page with 1056 * ELDU. This function takes a reference on a new backing page which 1057 * must be dropped with a corresponding call to sgx_encl_put_backing(). 1058 * 1059 * Return: 1060 * 0 on success, 1061 * -errno otherwise. 1062 */ 1063 int sgx_encl_alloc_backing(struct sgx_encl *encl, unsigned long page_index, 1064 struct sgx_backing *backing) 1065 { 1066 struct mem_cgroup *encl_memcg = sgx_encl_get_mem_cgroup(encl); 1067 struct mem_cgroup *memcg = set_active_memcg(encl_memcg); 1068 int ret; 1069 1070 ret = __sgx_encl_get_backing(encl, page_index, backing); 1071 1072 set_active_memcg(memcg); 1073 mem_cgroup_put(encl_memcg); 1074 1075 return ret; 1076 } 1077 1078 /** 1079 * sgx_encl_lookup_backing() - retrieve an existing backing storage page 1080 * @encl: an enclave pointer 1081 * @page_index: enclave page index 1082 * @backing: data for accessing backing storage for the page 1083 * 1084 * Retrieve a backing page for loading data back into an EPC page with ELDU. 1085 * It is the caller's responsibility to ensure that it is appropriate to use 1086 * sgx_encl_lookup_backing() rather than sgx_encl_alloc_backing(). If lookup is 1087 * not used correctly, this will cause an allocation which is not accounted for. 1088 * This function takes a reference on an existing backing page which must be 1089 * dropped with a corresponding call to sgx_encl_put_backing(). 
1090 * 1091 * Return: 1092 * 0 on success, 1093 * -errno otherwise. 1094 */ 1095 static int sgx_encl_lookup_backing(struct sgx_encl *encl, unsigned long page_index, 1096 struct sgx_backing *backing) 1097 { 1098 return __sgx_encl_get_backing(encl, page_index, backing); 1099 } 1100 1101 /** 1102 * sgx_encl_put_backing() - Unpin the backing storage 1103 * @backing: data for accessing backing storage for the page 1104 */ 1105 void sgx_encl_put_backing(struct sgx_backing *backing) 1106 { 1107 put_page(backing->pcmd); 1108 put_page(backing->contents); 1109 } 1110 1111 static int sgx_encl_test_and_clear_young_cb(pte_t *ptep, unsigned long addr, 1112 void *data) 1113 { 1114 pte_t pte; 1115 int ret; 1116 1117 ret = pte_young(*ptep); 1118 if (ret) { 1119 pte = pte_mkold(*ptep); 1120 set_pte_at((struct mm_struct *)data, addr, ptep, pte); 1121 } 1122 1123 return ret; 1124 } 1125 1126 /** 1127 * sgx_encl_test_and_clear_young() - Test and reset the accessed bit 1128 * @mm: mm_struct that is checked 1129 * @page: enclave page to be tested for recent access 1130 * 1131 * Checks the Access (A) bit from the PTE corresponding to the enclave page and 1132 * clears it. 1133 * 1134 * Return: 1 if the page has been recently accessed and 0 if not. 
1135 */ 1136 int sgx_encl_test_and_clear_young(struct mm_struct *mm, 1137 struct sgx_encl_page *page) 1138 { 1139 unsigned long addr = page->desc & PAGE_MASK; 1140 struct sgx_encl *encl = page->encl; 1141 struct vm_area_struct *vma; 1142 int ret; 1143 1144 ret = sgx_encl_find(mm, addr, &vma); 1145 if (ret) 1146 return 0; 1147 1148 if (encl != vma->vm_private_data) 1149 return 0; 1150 1151 ret = apply_to_page_range(vma->vm_mm, addr, PAGE_SIZE, 1152 sgx_encl_test_and_clear_young_cb, vma->vm_mm); 1153 if (ret < 0) 1154 return 0; 1155 1156 return ret; 1157 } 1158 1159 struct sgx_encl_page *sgx_encl_page_alloc(struct sgx_encl *encl, 1160 unsigned long offset, 1161 u64 secinfo_flags) 1162 { 1163 struct sgx_encl_page *encl_page; 1164 unsigned long prot; 1165 1166 encl_page = kzalloc(sizeof(*encl_page), GFP_KERNEL); 1167 if (!encl_page) 1168 return ERR_PTR(-ENOMEM); 1169 1170 encl_page->desc = encl->base + offset; 1171 encl_page->encl = encl; 1172 1173 prot = _calc_vm_trans(secinfo_flags, SGX_SECINFO_R, PROT_READ) | 1174 _calc_vm_trans(secinfo_flags, SGX_SECINFO_W, PROT_WRITE) | 1175 _calc_vm_trans(secinfo_flags, SGX_SECINFO_X, PROT_EXEC); 1176 1177 /* 1178 * TCS pages must always RW set for CPU access while the SECINFO 1179 * permissions are *always* zero - the CPU ignores the user provided 1180 * values and silently overwrites them with zero permissions. 1181 */ 1182 if ((secinfo_flags & SGX_SECINFO_PAGE_TYPE_MASK) == SGX_SECINFO_TCS) 1183 prot |= PROT_READ | PROT_WRITE; 1184 1185 /* Calculate maximum of the VM flags for the page. */ 1186 encl_page->vm_max_prot_bits = calc_vm_prot_bits(prot, 0); 1187 1188 return encl_page; 1189 } 1190 1191 /** 1192 * sgx_zap_enclave_ptes() - remove PTEs mapping the address from enclave 1193 * @encl: the enclave 1194 * @addr: page aligned pointer to single page for which PTEs will be removed 1195 * 1196 * Multiple VMAs may have an enclave page mapped. Remove the PTE mapping 1197 * @addr from each VMA. 
Ensure that page fault handler is ready to handle
 * new mappings of @addr before calling this function.
 */
void sgx_zap_enclave_ptes(struct sgx_encl *encl, unsigned long addr)
{
	struct vm_area_struct *vma;
	struct sgx_encl_mm *walker;
	unsigned long seen_version;
	int srcu_idx;

	/*
	 * Repeat the whole walk for as long as mm_list_version is found to
	 * have changed between the start and the end of a pass.
	 */
	do {
		seen_version = encl->mm_list_version;

		/* Pairs with smp_wmb() in sgx_encl_mm_add(). */
		smp_rmb();

		srcu_idx = srcu_read_lock(&encl->srcu);

		list_for_each_entry_rcu(walker, &encl->mm_list, list) {
			struct mm_struct *mm = walker->mm;

			/* Skip an mm on which no reference can be taken. */
			if (!mmget_not_zero(mm))
				continue;

			mmap_read_lock(mm);

			/* Zap only when the VMA at @addr belongs to this enclave. */
			if (!sgx_encl_find(mm, addr, &vma) &&
			    encl == vma->vm_private_data)
				zap_vma_ptes(vma, addr, PAGE_SIZE);

			mmap_read_unlock(mm);

			mmput_async(mm);
		}

		srcu_read_unlock(&encl->srcu, srcu_idx);
	} while (unlikely(encl->mm_list_version != seen_version));
}

/**
 * sgx_alloc_va_page() - Allocate a Version Array (VA) page
 * @reclaim:	Reclaim EPC pages directly if none available. Enclave
 *		mutex should not be held if this is set.
 *
 * Allocate a free EPC page and convert it to a Version Array (VA) page.
1240 * 1241 * Return: 1242 * a VA page, 1243 * -errno otherwise 1244 */ 1245 struct sgx_epc_page *sgx_alloc_va_page(bool reclaim) 1246 { 1247 struct sgx_epc_page *epc_page; 1248 int ret; 1249 1250 epc_page = sgx_alloc_epc_page(NULL, reclaim); 1251 if (IS_ERR(epc_page)) 1252 return ERR_CAST(epc_page); 1253 1254 ret = __epa(sgx_get_epc_virt_addr(epc_page)); 1255 if (ret) { 1256 WARN_ONCE(1, "EPA returned %d (0x%x)", ret, ret); 1257 sgx_encl_free_epc_page(epc_page); 1258 return ERR_PTR(-EFAULT); 1259 } 1260 1261 return epc_page; 1262 } 1263 1264 /** 1265 * sgx_alloc_va_slot - allocate a VA slot 1266 * @va_page: a &struct sgx_va_page instance 1267 * 1268 * Allocates a slot from a &struct sgx_va_page instance. 1269 * 1270 * Return: offset of the slot inside the VA page 1271 */ 1272 unsigned int sgx_alloc_va_slot(struct sgx_va_page *va_page) 1273 { 1274 int slot = find_first_zero_bit(va_page->slots, SGX_VA_SLOT_COUNT); 1275 1276 if (slot < SGX_VA_SLOT_COUNT) 1277 set_bit(slot, va_page->slots); 1278 1279 return slot << 3; 1280 } 1281 1282 /** 1283 * sgx_free_va_slot - free a VA slot 1284 * @va_page: a &struct sgx_va_page instance 1285 * @offset: offset of the slot inside the VA page 1286 * 1287 * Frees a slot from a &struct sgx_va_page instance. 1288 */ 1289 void sgx_free_va_slot(struct sgx_va_page *va_page, unsigned int offset) 1290 { 1291 clear_bit(offset >> 3, va_page->slots); 1292 } 1293 1294 /** 1295 * sgx_va_page_full - is the VA page full? 1296 * @va_page: a &struct sgx_va_page instance 1297 * 1298 * Return: true if all slots have been taken 1299 */ 1300 bool sgx_va_page_full(struct sgx_va_page *va_page) 1301 { 1302 int slot = find_first_zero_bit(va_page->slots, SGX_VA_SLOT_COUNT); 1303 1304 return slot == SGX_VA_SLOT_COUNT; 1305 } 1306 1307 /** 1308 * sgx_encl_free_epc_page - free an EPC page assigned to an enclave 1309 * @page: EPC page to be freed 1310 * 1311 * Free an EPC page assigned to an enclave. 
It does EREMOVE for the page, and
 * only upon success, it puts the page back to free page list.  Otherwise, it
 * gives a WARNING to indicate page is leaked.
 */
void sgx_encl_free_epc_page(struct sgx_epc_page *page)
{
	int err;

	/* A page still tracked by the reclaimer is not expected here. */
	WARN_ON_ONCE(page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED);

	err = __eremove(sgx_get_epc_virt_addr(page));

	/* Hand the page back to the free list only if EREMOVE succeeded. */
	if (!WARN_ONCE(err, EREMOVE_ERROR_MESSAGE, err, err))
		sgx_free_epc_page(page);
}