// SPDX-License-Identifier: GPL-2.0
/*  Copyright(c) 2016-20 Intel Corporation. */

#include <linux/file.h>
#include <linux/freezer.h>
#include <linux/highmem.h>
#include <linux/kthread.h>
#include <linux/kvm_types.h>
#include <linux/miscdevice.h>
#include <linux/node.h>
#include <linux/pagemap.h>
#include <linux/ratelimit.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/sysfs.h>
#include <linux/vmalloc.h>
#include <asm/msr.h>
#include <asm/sgx.h>
#include "driver.h"
#include "encl.h"
#include "encls.h"

/* EPC section descriptors; an EPC page refers to its section by index. */
struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS];
/* Number of sections successfully set up by sgx_page_cache_init(). */
static int sgx_nr_epc_sections;
/* The ksgxd kernel thread, set by sgx_page_reclaimer_init(). */
static struct task_struct *ksgxd_tsk;
/* ksgxd waits here; sgx_alloc_epc_page() wakes it when pages run low. */
static DECLARE_WAIT_QUEUE_HEAD(ksgxd_waitq);
/* Maps EPC physical address ranges to their struct sgx_epc_section. */
static DEFINE_XARRAY(sgx_epc_address_space);

/*
 * These variables are part of the state of the reclaimer, and must be accessed
 * with sgx_reclaimer_lock acquired.
 */
static LIST_HEAD(sgx_active_page_list);
static DEFINE_SPINLOCK(sgx_reclaimer_lock);

/*
 * Incremented by sgx_free_epc_page() and decremented when a page is handed
 * out by __sgx_alloc_epc_page_from_node().
 */
static atomic_long_t sgx_nr_free_pages = ATOMIC_LONG_INIT(0);

/* Nodes with one or more EPC sections. */
static nodemask_t sgx_numa_mask;

/*
 * Array with one struct sgx_numa_node for each possible NUMA node. Each
 * entry carries the lock, the free and poisoned page lists and the total
 * EPC size of that node.
 */
static struct sgx_numa_node *sgx_numa_nodes;

/*
 * EPC pages that have not been sanitized yet. Filled by
 * sgx_setup_epc_section() and drained by ksgxd.
 */
static LIST_HEAD(sgx_dirty_page_list);

/*
 * Reset post-kexec EPC pages to the uninitialized state. The pages are removed
 * from the input list, and made available for the page allocator. SECS pages
 * preceding their children in the input list are left intact.
 *
 * Return 0 when sanitization was successful or kthread was stopped, and the
 * number of unsanitized pages otherwise.
 */
static unsigned long __sgx_sanitize_pages(struct list_head *dirty_page_list)
{
	unsigned long left_dirty = 0;
	struct sgx_epc_page *page;
	LIST_HEAD(dirty);
	int ret;

	/* dirty_page_list is thread-local, no need for a lock: */
	while (!list_empty(dirty_page_list)) {
		if (kthread_should_stop())
			return 0;

		page = list_first_entry(dirty_page_list, struct sgx_epc_page, list);

		/*
		 * Checking page->poison without holding the node->lock
		 * is racy, but losing the race (i.e. poison is set just
		 * after the check) just means __eremove() will be uselessly
		 * called for a page that sgx_free_epc_page() will put onto
		 * the node->sgx_poison_page_list later.
		 */
		if (page->poison) {
			struct sgx_epc_section *section = &sgx_epc_sections[page->section];
			struct sgx_numa_node *node = section->node;

			spin_lock(&node->lock);
			list_move(&page->list, &node->sgx_poison_page_list);
			spin_unlock(&node->lock);

			continue;
		}

		ret = __eremove(sgx_get_epc_virt_addr(page));
		if (!ret) {
			/*
			 * page is now sanitized.  Make it available via the SGX
			 * page allocator:
			 */
			list_del(&page->list);
			sgx_free_epc_page(page);
		} else {
			/* The page is not yet clean - move to the dirty list. */
			list_move_tail(&page->list, &dirty);
			left_dirty++;
		}

		cond_resched();
	}

	/* Hand the pages that could not be cleaned back to the caller's list. */
	list_splice(&dirty, dirty_page_list);
	return left_dirty;
}

/*
 * Test and clear the "young" state of the enclave page backing @epc_page in
 * every mm attached to the owning enclave, as reported by
 * sgx_encl_test_and_clear_young(). The walk stops at the first mm that
 * reports the page as young.
 *
 * Return: false if some mm reported the page as young, true otherwise.
 */
static bool sgx_reclaimer_age(struct sgx_epc_page *epc_page)
{
	struct sgx_encl_page *page = epc_page->owner;
	struct sgx_encl *encl = page->encl;
	struct sgx_encl_mm *encl_mm;
	bool ret = true;
	int idx;

	idx = srcu_read_lock(&encl->srcu);

	list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
		/* Skip an mm whose user count has already dropped to zero. */
		if (!mmget_not_zero(encl_mm->mm))
			continue;

		mmap_read_lock(encl_mm->mm);
		ret = !sgx_encl_test_and_clear_young(encl_mm->mm, page);
		mmap_read_unlock(encl_mm->mm);

		mmput_async(encl_mm->mm);

		if (!ret)
			break;
	}

	srcu_read_unlock(&encl->srcu, idx);

	if (!ret)
		return false;

	return true;
}

/*
 * Zap the enclave PTEs that map the page's address and then EBLOCK the EPC
 * page while holding encl->lock. A failing EBLOCK is only warned about.
 */
static void sgx_reclaimer_block(struct sgx_epc_page *epc_page)
{
	struct sgx_encl_page *page = epc_page->owner;
	unsigned long addr = page->desc & PAGE_MASK;
	struct sgx_encl *encl = page->encl;
	int ret;

	sgx_zap_enclave_ptes(encl, addr);

	mutex_lock(&encl->lock);

	ret = __eblock(sgx_get_epc_virt_addr(epc_page));
	if (encls_failed(ret))
		ENCLS_WARN(ret, "EBLOCK");

	mutex_unlock(&encl->lock);
}

/*
 * Issue a single EWB for @epc_page using @va_slot, writing into the contents
 * and PCMD pages of @backing. Both backing pages are mapped only for the
 * duration of the call and are marked dirty afterwards, whatever the result.
 *
 * Return: the value returned by __ewb().
 */
static int __sgx_encl_ewb(struct sgx_epc_page *epc_page, void *va_slot,
			  struct sgx_backing *backing)
{
	struct sgx_pageinfo pginfo;
	int ret;

	pginfo.addr = 0;
	pginfo.secs = 0;

	pginfo.contents = (unsigned long)kmap_local_page(backing->contents);
	pginfo.metadata = (unsigned long)kmap_local_page(backing->pcmd) +
			  backing->pcmd_offset;

	ret = __ewb(&pginfo, sgx_get_epc_virt_addr(epc_page), va_slot);
	set_page_dirty(backing->pcmd);
	set_page_dirty(backing->contents);

	/* Unmap in the reverse order of mapping: PCMD first, then contents. */
	kunmap_local((void *)(unsigned long)(pginfo.metadata -
					      backing->pcmd_offset));
	kunmap_local((void *)(unsigned long)pginfo.contents);

	return ret;
}

/*
 * Intentionally empty: it is used as the on_each_cpu_mask() callback in
 * sgx_encl_ewb(), where the IPI is sent to kick CPUs out of the enclave.
 */
void sgx_ipi_cb(void *info)
{
}

/*
 * Swap page to the regular memory transformed to the blocked state by using
 * EBLOCK, which means that it can no longer be referenced (no new TLB entries).
 *
 * The first trial just tries to write the page assuming that some other thread
 * has reset the count for threads inside the enclave by using ETRACK, and
 * previous thread count has been zeroed out. The second trial calls ETRACK
 * before EWB. If that fails we kick all the HW threads out, and then do EWB,
 * which should be guaranteed to succeed.
 */
static void sgx_encl_ewb(struct sgx_epc_page *epc_page,
			 struct sgx_backing *backing)
{
	struct sgx_encl_page *encl_page = epc_page->owner;
	struct sgx_encl *encl = encl_page->encl;
	struct sgx_va_page *va_page;
	unsigned int va_offset;
	void *va_slot;
	int ret;

	encl_page->desc &= ~SGX_ENCL_PAGE_BEING_RECLAIMED;

	/*
	 * Take a slot from the first VA page of the enclave; a VA page that
	 * became full is rotated to the tail of the list.
	 */
	va_page = list_first_entry(&encl->va_pages, struct sgx_va_page,
				   list);
	va_offset = sgx_alloc_va_slot(va_page);
	va_slot = sgx_get_epc_virt_addr(va_page->epc_page) + va_offset;
	if (sgx_va_page_full(va_page))
		list_move_tail(&va_page->list, &encl->va_pages);

	ret = __sgx_encl_ewb(epc_page, va_slot, backing);
	if (ret == SGX_NOT_TRACKED) {
		ret = __etrack(sgx_get_epc_virt_addr(encl->secs.epc_page));
		if (ret) {
			if (encls_failed(ret))
				ENCLS_WARN(ret, "ETRACK");
		}

		ret = __sgx_encl_ewb(epc_page, va_slot, backing);
		if (ret == SGX_NOT_TRACKED) {
			/*
			 * Slow path, send IPIs to kick cpus out of the
			 * enclave.  Note, it's imperative that the cpu
			 * mask is generated *after* ETRACK, else we'll
			 * miss cpus that entered the enclave between
			 * generating the mask and incrementing epoch.
			 */
			on_each_cpu_mask(sgx_encl_cpumask(encl),
					 sgx_ipi_cb, NULL, 1);
			ret = __sgx_encl_ewb(epc_page, va_slot, backing);
		}
	}

	if (ret) {
		if (encls_failed(ret))
			ENCLS_WARN(ret, "EWB");

		/* The write-out failed: give the VA slot back. */
		sgx_free_va_slot(va_page, va_offset);
	} else {
		/* Record which VA slot holds the state of the swapped page. */
		encl_page->desc |= va_offset;
		encl_page->va_page = va_page;
	}
}

/*
 * Write @epc_page out to @backing with encl->lock held, detach it from its
 * enclave page, decrement the enclave's child page count and release
 * @backing. When that count reaches zero on an initialized enclave, the
 * SECS page is written out to its own backing and freed as well.
 */
static void sgx_reclaimer_write(struct sgx_epc_page *epc_page,
				struct sgx_backing *backing)
{
	struct sgx_encl_page *encl_page = epc_page->owner;
	struct sgx_encl *encl = encl_page->encl;
	struct sgx_backing secs_backing;
	int ret;

	mutex_lock(&encl->lock);

	sgx_encl_ewb(epc_page, backing);
	encl_page->epc_page = NULL;
	encl->secs_child_cnt--;
	sgx_encl_put_backing(backing);

	if (!encl->secs_child_cnt && test_bit(SGX_ENCL_INITIALIZED, &encl->flags)) {
		/* The SECS backing lives at page index PFN_DOWN(encl->size). */
		ret = sgx_encl_alloc_backing(encl, PFN_DOWN(encl->size),
					   &secs_backing);
		if (ret)
			goto out;

		sgx_encl_ewb(encl->secs.epc_page, &secs_backing);

		sgx_encl_free_epc_page(encl->secs.epc_page);
		encl->secs.epc_page = NULL;

		sgx_encl_put_backing(&secs_backing);
	}

out:
	mutex_unlock(&encl->lock);
}

/*
 * Take a fixed number of pages from the head of the active page pool and
 * reclaim them to the enclave's private shmem files. Skip the pages, which have
 * been accessed since the last scan. Move those pages to the tail of active
 * page pool so that the pages get scanned in LRU like fashion.
 *
 * Batch process a chunk of pages (at the moment 16) in order to degrade amount
 * of IPI's and ETRACK's potentially required. sgx_encl_ewb() does degrade a bit
 * among the HW threads with three stage EWB pipeline (EWB, ETRACK + EWB and IPI
 * + EWB) but not sufficiently. Reclaiming one page at a time would also be
 * problematic as it would increase the lock contention too much, which would
 * halt forward progress.
 */
static void sgx_reclaim_pages(void)
{
	struct sgx_epc_page *chunk[SGX_NR_TO_SCAN];
	struct sgx_backing backing[SGX_NR_TO_SCAN];
	struct sgx_encl_page *encl_page;
	struct sgx_epc_page *epc_page;
	pgoff_t page_index;
	int cnt = 0;
	int ret;
	int i;

	/*
	 * Step 1: detach up to SGX_NR_TO_SCAN pages from the head of the
	 * active list, taking a reference on each owning enclave.
	 */
	spin_lock(&sgx_reclaimer_lock);
	for (i = 0; i < SGX_NR_TO_SCAN; i++) {
		if (list_empty(&sgx_active_page_list))
			break;

		epc_page = list_first_entry(&sgx_active_page_list,
					    struct sgx_epc_page, list);
		list_del_init(&epc_page->list);
		encl_page = epc_page->owner;

		if (kref_get_unless_zero(&encl_page->encl->refcount) != 0)
			chunk[cnt++] = epc_page;
		else
			/* The owner is freeing the page. No need to add the
			 * page back to the list of reclaimable pages.
			 */
			epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED;
	}
	spin_unlock(&sgx_reclaimer_lock);

	/*
	 * Step 2: for each candidate, check its age and allocate backing
	 * storage. Candidates that fail either step are put back at the tail
	 * of the active list and their chunk[] slot is cleared.
	 */
	for (i = 0; i < cnt; i++) {
		epc_page = chunk[i];
		encl_page = epc_page->owner;

		if (!sgx_reclaimer_age(epc_page))
			goto skip;

		page_index = PFN_DOWN(encl_page->desc - encl_page->encl->base);

		mutex_lock(&encl_page->encl->lock);
		ret = sgx_encl_alloc_backing(encl_page->encl, page_index, &backing[i]);
		if (ret) {
			mutex_unlock(&encl_page->encl->lock);
			goto skip;
		}

		encl_page->desc |= SGX_ENCL_PAGE_BEING_RECLAIMED;
		mutex_unlock(&encl_page->encl->lock);
		continue;

skip:
		spin_lock(&sgx_reclaimer_lock);
		list_add_tail(&epc_page->list, &sgx_active_page_list);
		spin_unlock(&sgx_reclaimer_lock);

		kref_put(&encl_page->encl->refcount, sgx_encl_release);

		chunk[i] = NULL;
	}

	/* Step 3: block every remaining candidate. */
	for (i = 0; i < cnt; i++) {
		epc_page = chunk[i];
		if (epc_page)
			sgx_reclaimer_block(epc_page);
	}

	/*
	 * Step 4: write each remaining candidate out, drop the enclave
	 * reference taken in step 1 and return the EPC page to the allocator.
	 */
	for (i = 0; i < cnt; i++) {
		epc_page = chunk[i];
		if (!epc_page)
			continue;

		encl_page = epc_page->owner;
		sgx_reclaimer_write(epc_page, &backing[i]);

		kref_put(&encl_page->encl->refcount, sgx_encl_release);
		epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED;

		sgx_free_epc_page(epc_page);
	}
}

/*
 * Return true when the free page count is below @watermark and the active
 * page list is not empty. Both are read here without taking any lock.
 */
static bool sgx_should_reclaim(unsigned long watermark)
{
	return atomic_long_read(&sgx_nr_free_pages) < watermark &&
	       !list_empty(&sgx_active_page_list);
}

/*
 * sgx_reclaim_direct() should be called (without enclave's mutex held)
 * in locations where SGX memory resources might be low and might be
 * needed in order to make forward progress.
 */
void sgx_reclaim_direct(void)
{
	if (sgx_should_reclaim(SGX_NR_LOW_PAGES))
		sgx_reclaim_pages();
}

/*
 * Main function of the ksgxd kernel thread: sanitize the dirty page list
 * first, then reclaim pages whenever sgx_should_reclaim(SGX_NR_HIGH_PAGES)
 * holds, until the thread is asked to stop.
 */
static int ksgxd(void *p)
{
	set_freezable();

	/*
	 * Sanitize pages in order to recover from kexec(). The 2nd pass is
	 * required for SECS pages, whose child pages blocked EREMOVE.
	 */
	__sgx_sanitize_pages(&sgx_dirty_page_list);
	WARN_ON(__sgx_sanitize_pages(&sgx_dirty_page_list));

	while (!kthread_should_stop()) {
		if (try_to_freeze())
			continue;

		wait_event_freezable(ksgxd_waitq,
				     kthread_should_stop() ||
				     sgx_should_reclaim(SGX_NR_HIGH_PAGES));

		if (sgx_should_reclaim(SGX_NR_HIGH_PAGES))
			sgx_reclaim_pages();

		cond_resched();
	}

	return 0;
}

/*
 * Start the ksgxd kernel thread and remember it in ksgxd_tsk.
 *
 * Return: true on success, false if the thread could not be started.
 */
static bool __init sgx_page_reclaimer_init(void)
{
	struct task_struct *tsk;

	tsk = kthread_run(ksgxd, NULL, "ksgxd");
	if (IS_ERR(tsk))
		return false;

	ksgxd_tsk = tsk;

	return true;
}

/* Return true when the calling task is the ksgxd kernel thread. */
bool current_is_ksgxd(void)
{
	return current == ksgxd_tsk;
}

/*
 * Take the first page off the free list of NUMA node @nid, with the node's
 * lock held, and reset its flags.
 *
 * Return: the page, or NULL when the node's free list is empty.
 */
static struct sgx_epc_page *__sgx_alloc_epc_page_from_node(int nid)
{
	struct sgx_numa_node *node = &sgx_numa_nodes[nid];
	struct sgx_epc_page *page = NULL;

	spin_lock(&node->lock);

	if (list_empty(&node->free_page_list)) {
		spin_unlock(&node->lock);
		return NULL;
	}

	page = list_first_entry(&node->free_page_list, struct sgx_epc_page, list);
	list_del_init(&page->list);
	page->flags = 0;

	spin_unlock(&node->lock);
	atomic_long_dec(&sgx_nr_free_pages);

	return page;
}

/**
 * __sgx_alloc_epc_page() - Allocate an EPC page
 *
 * Iterate through NUMA nodes and reserve a free EPC page to the caller. Start
 * from the NUMA node, where the caller is executing.
 *
 * Return:
 * - an EPC page:	A borrowed EPC page was available.
 * - ERR_PTR(-ENOMEM):	Out of EPC pages.
 */
struct sgx_epc_page *__sgx_alloc_epc_page(void)
{
	struct sgx_epc_page *page;
	int nid_of_current = numa_node_id();
	int nid_start, nid;

	/*
	 * Try local node first. If it doesn't have an EPC section,
	 * fall back to the non-local NUMA nodes.
	 */
	if (node_isset(nid_of_current, sgx_numa_mask))
		nid_start = nid_of_current;
	else
		nid_start = next_node_in(nid_of_current, sgx_numa_mask);

	/* Visit every node in sgx_numa_mask once, wrapping around. */
	nid = nid_start;
	do {
		page = __sgx_alloc_epc_page_from_node(nid);
		if (page)
			return page;

		nid = next_node_in(nid, sgx_numa_mask);
	} while (nid != nid_start);

	return ERR_PTR(-ENOMEM);
}

/**
 * sgx_mark_page_reclaimable() - Mark a page as reclaimable
 * @page:	EPC page
 *
 * Mark a page as reclaimable and add it to the active page list. Pages
 * are automatically removed from the active list when freed.
 */
void sgx_mark_page_reclaimable(struct sgx_epc_page *page)
{
	spin_lock(&sgx_reclaimer_lock);
	page->flags |= SGX_EPC_PAGE_RECLAIMER_TRACKED;
	list_add_tail(&page->list, &sgx_active_page_list);
	spin_unlock(&sgx_reclaimer_lock);
}

/**
 * sgx_unmark_page_reclaimable() - Remove a page from the reclaim list
 * @page:	EPC page
 *
 * Clear the reclaimable flag and remove the page from the active page list.
523 * 524 * Return: 525 * 0 on success, 526 * -EBUSY if the page is in the process of being reclaimed 527 */ 528 int sgx_unmark_page_reclaimable(struct sgx_epc_page *page) 529 { 530 spin_lock(&sgx_reclaimer_lock); 531 if (page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED) { 532 /* The page is being reclaimed. */ 533 if (list_empty(&page->list)) { 534 spin_unlock(&sgx_reclaimer_lock); 535 return -EBUSY; 536 } 537 538 list_del(&page->list); 539 page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED; 540 } 541 spin_unlock(&sgx_reclaimer_lock); 542 543 return 0; 544 } 545 546 /** 547 * sgx_alloc_epc_page() - Allocate an EPC page 548 * @owner: the owner of the EPC page 549 * @reclaim: reclaim pages if necessary 550 * 551 * Iterate through EPC sections and borrow a free EPC page to the caller. When a 552 * page is no longer needed it must be released with sgx_free_epc_page(). If 553 * @reclaim is set to true, directly reclaim pages when we are out of pages. No 554 * mm's can be locked when @reclaim is set to true. 555 * 556 * Finally, wake up ksgxd when the number of pages goes below the watermark 557 * before returning back to the caller. 558 * 559 * Return: 560 * an EPC page, 561 * -errno on error 562 */ 563 struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim) 564 { 565 struct sgx_epc_page *page; 566 567 for ( ; ; ) { 568 page = __sgx_alloc_epc_page(); 569 if (!IS_ERR(page)) { 570 page->owner = owner; 571 break; 572 } 573 574 if (list_empty(&sgx_active_page_list)) 575 return ERR_PTR(-ENOMEM); 576 577 if (!reclaim) { 578 page = ERR_PTR(-EBUSY); 579 break; 580 } 581 582 if (signal_pending(current)) { 583 page = ERR_PTR(-ERESTARTSYS); 584 break; 585 } 586 587 sgx_reclaim_pages(); 588 cond_resched(); 589 } 590 591 if (sgx_should_reclaim(SGX_NR_LOW_PAGES)) 592 wake_up(&ksgxd_waitq); 593 594 return page; 595 } 596 597 /** 598 * sgx_free_epc_page() - Free an EPC page 599 * @page: an EPC page 600 * 601 * Put the EPC page back to the list of free pages. 
It's the caller's 602 * responsibility to make sure that the page is in uninitialized state. In other 603 * words, do EREMOVE, EWB or whatever operation is necessary before calling 604 * this function. 605 */ 606 void sgx_free_epc_page(struct sgx_epc_page *page) 607 { 608 struct sgx_epc_section *section = &sgx_epc_sections[page->section]; 609 struct sgx_numa_node *node = section->node; 610 611 spin_lock(&node->lock); 612 613 page->owner = NULL; 614 if (page->poison) 615 list_add(&page->list, &node->sgx_poison_page_list); 616 else 617 list_add_tail(&page->list, &node->free_page_list); 618 page->flags = SGX_EPC_PAGE_IS_FREE; 619 620 spin_unlock(&node->lock); 621 atomic_long_inc(&sgx_nr_free_pages); 622 } 623 624 static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size, 625 unsigned long index, 626 struct sgx_epc_section *section) 627 { 628 unsigned long nr_pages = size >> PAGE_SHIFT; 629 unsigned long i; 630 631 section->virt_addr = memremap(phys_addr, size, MEMREMAP_WB); 632 if (!section->virt_addr) 633 return false; 634 635 section->pages = vmalloc_array(nr_pages, sizeof(struct sgx_epc_page)); 636 if (!section->pages) { 637 memunmap(section->virt_addr); 638 return false; 639 } 640 641 section->phys_addr = phys_addr; 642 xa_store_range(&sgx_epc_address_space, section->phys_addr, 643 phys_addr + size - 1, section, GFP_KERNEL); 644 645 for (i = 0; i < nr_pages; i++) { 646 section->pages[i].section = index; 647 section->pages[i].flags = 0; 648 section->pages[i].owner = NULL; 649 section->pages[i].poison = 0; 650 list_add_tail(§ion->pages[i].list, &sgx_dirty_page_list); 651 } 652 653 return true; 654 } 655 656 bool arch_is_platform_page(u64 paddr) 657 { 658 return !!xa_load(&sgx_epc_address_space, paddr); 659 } 660 EXPORT_SYMBOL_GPL(arch_is_platform_page); 661 662 static struct sgx_epc_page *sgx_paddr_to_page(u64 paddr) 663 { 664 struct sgx_epc_section *section; 665 666 section = xa_load(&sgx_epc_address_space, paddr); 667 if (!section) 668 return NULL; 669 670 
return §ion->pages[PFN_DOWN(paddr - section->phys_addr)]; 671 } 672 673 /* 674 * Called in process context to handle a hardware reported 675 * error in an SGX EPC page. 676 * If the MF_ACTION_REQUIRED bit is set in flags, then the 677 * context is the task that consumed the poison data. Otherwise 678 * this is called from a kernel thread unrelated to the page. 679 */ 680 int arch_memory_failure(unsigned long pfn, int flags) 681 { 682 struct sgx_epc_page *page = sgx_paddr_to_page(pfn << PAGE_SHIFT); 683 struct sgx_epc_section *section; 684 struct sgx_numa_node *node; 685 686 /* 687 * mm/memory-failure.c calls this routine for all errors 688 * where there isn't a "struct page" for the address. But that 689 * includes other address ranges besides SGX. 690 */ 691 if (!page) 692 return -ENXIO; 693 694 /* 695 * If poison was consumed synchronously. Send a SIGBUS to 696 * the task. Hardware has already exited the SGX enclave and 697 * will not allow re-entry to an enclave that has a memory 698 * error. The signal may help the task understand why the 699 * enclave is broken. 700 */ 701 if (flags & MF_ACTION_REQUIRED) 702 force_sig(SIGBUS); 703 704 section = &sgx_epc_sections[page->section]; 705 node = section->node; 706 707 spin_lock(&node->lock); 708 709 /* Already poisoned? Nothing more to do */ 710 if (page->poison) 711 goto out; 712 713 page->poison = 1; 714 715 /* 716 * If the page is on a free list, move it to the per-node 717 * poison page list. 718 */ 719 if (page->flags & SGX_EPC_PAGE_IS_FREE) { 720 list_move(&page->list, &node->sgx_poison_page_list); 721 goto out; 722 } 723 724 sgx_unmark_page_reclaimable(page); 725 726 /* 727 * TBD: Add additional plumbing to enable pre-emptive 728 * action for asynchronous poison notification. 
Until 729 * then just hope that the poison: 730 * a) is not accessed - sgx_free_epc_page() will deal with it 731 * when the user gives it back 732 * b) results in a recoverable machine check rather than 733 * a fatal one 734 */ 735 out: 736 spin_unlock(&node->lock); 737 return 0; 738 } 739 740 /* 741 * A section metric is concatenated in a way that @low bits 12-31 define the 742 * bits 12-31 of the metric and @high bits 0-19 define the bits 32-51 of the 743 * metric. 744 */ 745 static inline u64 __init sgx_calc_section_metric(u64 low, u64 high) 746 { 747 return (low & GENMASK_ULL(31, 12)) + 748 ((high & GENMASK_ULL(19, 0)) << 32); 749 } 750 751 #ifdef CONFIG_NUMA 752 static ssize_t sgx_total_bytes_show(struct device *dev, struct device_attribute *attr, char *buf) 753 { 754 return sysfs_emit(buf, "%lu\n", sgx_numa_nodes[dev->id].size); 755 } 756 static DEVICE_ATTR_RO(sgx_total_bytes); 757 758 static umode_t arch_node_attr_is_visible(struct kobject *kobj, 759 struct attribute *attr, int idx) 760 { 761 /* Make all x86/ attributes invisible when SGX is not initialized: */ 762 if (nodes_empty(sgx_numa_mask)) 763 return 0; 764 765 return attr->mode; 766 } 767 768 static struct attribute *arch_node_dev_attrs[] = { 769 &dev_attr_sgx_total_bytes.attr, 770 NULL, 771 }; 772 773 const struct attribute_group arch_node_dev_group = { 774 .name = "x86", 775 .attrs = arch_node_dev_attrs, 776 .is_visible = arch_node_attr_is_visible, 777 }; 778 779 static void __init arch_update_sysfs_visibility(int nid) 780 { 781 struct node *node = node_devices[nid]; 782 int ret; 783 784 ret = sysfs_update_group(&node->dev.kobj, &arch_node_dev_group); 785 786 if (ret) 787 pr_err("sysfs update failed (%d), files may be invisible", ret); 788 } 789 #else /* !CONFIG_NUMA */ 790 static void __init arch_update_sysfs_visibility(int nid) {} 791 #endif 792 793 static bool __init sgx_page_cache_init(void) 794 { 795 u32 eax, ebx, ecx, edx, type; 796 u64 pa, size; 797 int nid; 798 int i; 799 800 sgx_numa_nodes 
= kmalloc_array(num_possible_nodes(), sizeof(*sgx_numa_nodes), GFP_KERNEL); 801 if (!sgx_numa_nodes) 802 return false; 803 804 for (i = 0; i < ARRAY_SIZE(sgx_epc_sections); i++) { 805 cpuid_count(SGX_CPUID, i + SGX_CPUID_EPC, &eax, &ebx, &ecx, &edx); 806 807 type = eax & SGX_CPUID_EPC_MASK; 808 if (type == SGX_CPUID_EPC_INVALID) 809 break; 810 811 if (type != SGX_CPUID_EPC_SECTION) { 812 pr_err_once("Unknown EPC section type: %u\n", type); 813 break; 814 } 815 816 pa = sgx_calc_section_metric(eax, ebx); 817 size = sgx_calc_section_metric(ecx, edx); 818 819 pr_info("EPC section 0x%llx-0x%llx\n", pa, pa + size - 1); 820 821 if (!sgx_setup_epc_section(pa, size, i, &sgx_epc_sections[i])) { 822 pr_err("No free memory for an EPC section\n"); 823 break; 824 } 825 826 nid = numa_map_to_online_node(phys_to_target_node(pa)); 827 if (nid == NUMA_NO_NODE) { 828 /* The physical address is already printed above. */ 829 pr_warn(FW_BUG "Unable to map EPC section to online node. Fallback to the NUMA node 0.\n"); 830 nid = 0; 831 } 832 833 if (!node_isset(nid, sgx_numa_mask)) { 834 spin_lock_init(&sgx_numa_nodes[nid].lock); 835 INIT_LIST_HEAD(&sgx_numa_nodes[nid].free_page_list); 836 INIT_LIST_HEAD(&sgx_numa_nodes[nid].sgx_poison_page_list); 837 node_set(nid, sgx_numa_mask); 838 sgx_numa_nodes[nid].size = 0; 839 840 /* Make SGX-specific node sysfs files visible: */ 841 arch_update_sysfs_visibility(nid); 842 } 843 844 sgx_epc_sections[i].node = &sgx_numa_nodes[nid]; 845 sgx_numa_nodes[nid].size += size; 846 847 sgx_nr_epc_sections++; 848 } 849 850 if (!sgx_nr_epc_sections) { 851 pr_err("There are zero EPC sections.\n"); 852 return false; 853 } 854 855 for_each_online_node(nid) { 856 if (!node_isset(nid, sgx_numa_mask) && 857 node_state(nid, N_MEMORY) && node_state(nid, N_CPU)) 858 pr_info("node%d has both CPUs and memory but doesn't have an EPC section\n", 859 nid); 860 } 861 862 return true; 863 } 864 865 /* 866 * Update the SGX_LEPUBKEYHASH MSRs to the values specified by caller. 
867 * Bare-metal driver requires to update them to hash of enclave's signer 868 * before EINIT. KVM needs to update them to guest's virtual MSR values 869 * before doing EINIT from guest. 870 */ 871 void sgx_update_lepubkeyhash(u64 *lepubkeyhash) 872 { 873 int i; 874 875 WARN_ON_ONCE(preemptible()); 876 877 for (i = 0; i < 4; i++) 878 wrmsrq(MSR_IA32_SGXLEPUBKEYHASH0 + i, lepubkeyhash[i]); 879 } 880 881 const struct file_operations sgx_provision_fops = { 882 .owner = THIS_MODULE, 883 }; 884 885 static struct miscdevice sgx_dev_provision = { 886 .minor = MISC_DYNAMIC_MINOR, 887 .name = "sgx_provision", 888 .nodename = "sgx_provision", 889 .fops = &sgx_provision_fops, 890 }; 891 892 /** 893 * sgx_set_attribute() - Update allowed attributes given file descriptor 894 * @allowed_attributes: Pointer to allowed enclave attributes 895 * @attribute_fd: File descriptor for specific attribute 896 * 897 * Append enclave attribute indicated by file descriptor to allowed 898 * attributes. Currently only SGX_ATTR_PROVISIONKEY indicated by 899 * /dev/sgx_provision is supported. 
900 * 901 * Return: 902 * -0: SGX_ATTR_PROVISIONKEY is appended to allowed_attributes 903 * -EINVAL: Invalid, or not supported file descriptor 904 */ 905 int sgx_set_attribute(unsigned long *allowed_attributes, 906 unsigned int attribute_fd) 907 { 908 CLASS(fd, f)(attribute_fd); 909 910 if (fd_empty(f)) 911 return -EINVAL; 912 913 if (fd_file(f)->f_op != &sgx_provision_fops) 914 return -EINVAL; 915 916 *allowed_attributes |= SGX_ATTR_PROVISIONKEY; 917 return 0; 918 } 919 EXPORT_SYMBOL_FOR_KVM(sgx_set_attribute); 920 921 static int __init sgx_init(void) 922 { 923 int ret; 924 int i; 925 926 if (!cpu_feature_enabled(X86_FEATURE_SGX)) 927 return -ENODEV; 928 929 if (!sgx_page_cache_init()) 930 return -ENOMEM; 931 932 if (!sgx_page_reclaimer_init()) { 933 ret = -ENOMEM; 934 goto err_page_cache; 935 } 936 937 ret = misc_register(&sgx_dev_provision); 938 if (ret) 939 goto err_kthread; 940 941 /* 942 * Always try to initialize the native *and* KVM drivers. 943 * The KVM driver is less picky than the native one and 944 * can function if the native one is not supported on the 945 * current system or fails to initialize. 946 * 947 * Error out only if both fail to initialize. 948 */ 949 ret = sgx_drv_init(); 950 951 if (sgx_vepc_init() && ret) 952 goto err_provision; 953 954 return 0; 955 956 err_provision: 957 misc_deregister(&sgx_dev_provision); 958 959 err_kthread: 960 kthread_stop(ksgxd_tsk); 961 962 err_page_cache: 963 for (i = 0; i < sgx_nr_epc_sections; i++) { 964 vfree(sgx_epc_sections[i].pages); 965 memunmap(sgx_epc_sections[i].virt_addr); 966 } 967 968 return ret; 969 } 970 971 device_initcall(sgx_init); 972