1 // SPDX-License-Identifier: GPL-2.0 2 /* Copyright(c) 2016-20 Intel Corporation. */ 3 4 #include <linux/file.h> 5 #include <linux/freezer.h> 6 #include <linux/highmem.h> 7 #include <linux/kthread.h> 8 #include <linux/kvm_types.h> 9 #include <linux/miscdevice.h> 10 #include <linux/node.h> 11 #include <linux/pagemap.h> 12 #include <linux/ratelimit.h> 13 #include <linux/sched/mm.h> 14 #include <linux/sched/signal.h> 15 #include <linux/slab.h> 16 #include <linux/sysfs.h> 17 #include <linux/vmalloc.h> 18 19 #include <asm/cpuid/api.h> 20 #include <asm/msr.h> 21 #include <asm/sgx.h> 22 #include <asm/archrandom.h> 23 24 #include "driver.h" 25 #include "encl.h" 26 #include "encls.h" 27 28 struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS]; 29 static int sgx_nr_epc_sections; 30 static struct task_struct *ksgxd_tsk; 31 static DECLARE_WAIT_QUEUE_HEAD(ksgxd_waitq); 32 static DEFINE_XARRAY(sgx_epc_address_space); 33 34 /* 35 * These variables are part of the state of the reclaimer, and must be accessed 36 * with sgx_reclaimer_lock acquired. 37 */ 38 static LIST_HEAD(sgx_active_page_list); 39 static DEFINE_SPINLOCK(sgx_reclaimer_lock); 40 41 static atomic_long_t sgx_nr_free_pages = ATOMIC_LONG_INIT(0); 42 43 /* Nodes with one or more EPC sections. */ 44 static nodemask_t sgx_numa_mask; 45 46 /* 47 * Array with one list_head for each possible NUMA node. Each 48 * list contains all the sgx_epc_section's which are on that 49 * node. 50 */ 51 static struct sgx_numa_node *sgx_numa_nodes; 52 53 static LIST_HEAD(sgx_dirty_page_list); 54 55 /* 56 * Reset post-kexec EPC pages to the uninitialized state. The pages are removed 57 * from the input list, and made available for the page allocator. SECS pages 58 * prepending their children in the input list are left intact. 59 * 60 * Return 0 when sanitization was successful or kthread was stopped, and the 61 * number of unsanitized pages otherwise. 62 */ 63 static unsigned long __sgx_sanitize_pages(struct list_head *dirty_page_list) 64 { 65 unsigned long left_dirty = 0; 66 struct sgx_epc_page *page; 67 LIST_HEAD(dirty); 68 int ret; 69 70 /* dirty_page_list is thread-local, no need for a lock: */ 71 while (!list_empty(dirty_page_list)) { 72 if (kthread_should_stop()) 73 return 0; 74 75 page = list_first_entry(dirty_page_list, struct sgx_epc_page, list); 76 77 /* 78 * Checking page->poison without holding the node->lock 79 * is racy, but losing the race (i.e. poison is set just 80 * after the check) just means __eremove() will be uselessly 81 * called for a page that sgx_free_epc_page() will put onto 82 * the node->sgx_poison_page_list later. 83 */ 84 if (page->poison) { 85 struct sgx_epc_section *section = &sgx_epc_sections[page->section]; 86 struct sgx_numa_node *node = section->node; 87 88 spin_lock(&node->lock); 89 list_move(&page->list, &node->sgx_poison_page_list); 90 spin_unlock(&node->lock); 91 92 continue; 93 } 94 95 ret = __eremove(sgx_get_epc_virt_addr(page)); 96 if (!ret) { 97 /* 98 * page is now sanitized. Make it available via the SGX 99 * page allocator: 100 */ 101 list_del(&page->list); 102 sgx_free_epc_page(page); 103 } else { 104 /* The page is not yet clean - move to the dirty list. */ 105 list_move_tail(&page->list, &dirty); 106 left_dirty++; 107 } 108 109 cond_resched(); 110 } 111 112 list_splice(&dirty, dirty_page_list); 113 return left_dirty; 114 } 115 116 static bool sgx_reclaimer_age(struct sgx_epc_page *epc_page) 117 { 118 struct sgx_encl_page *page = epc_page->owner; 119 struct sgx_encl *encl = page->encl; 120 struct sgx_encl_mm *encl_mm; 121 bool ret = true; 122 int idx; 123 124 idx = srcu_read_lock(&encl->srcu); 125 126 list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) { 127 if (!mmget_not_zero(encl_mm->mm)) 128 continue; 129 130 mmap_read_lock(encl_mm->mm); 131 ret = !sgx_encl_test_and_clear_young(encl_mm->mm, page); 132 mmap_read_unlock(encl_mm->mm); 133 134 mmput_async(encl_mm->mm); 135 136 if (!ret) 137 break; 138 } 139 140 srcu_read_unlock(&encl->srcu, idx); 141 142 if (!ret) 143 return false; 144 145 return true; 146 } 147 148 static void sgx_reclaimer_block(struct sgx_epc_page *epc_page) 149 { 150 struct sgx_encl_page *page = epc_page->owner; 151 unsigned long addr = page->desc & PAGE_MASK; 152 struct sgx_encl *encl = page->encl; 153 int ret; 154 155 sgx_zap_enclave_ptes(encl, addr); 156 157 mutex_lock(&encl->lock); 158 159 ret = __eblock(sgx_get_epc_virt_addr(epc_page)); 160 if (encls_failed(ret)) 161 ENCLS_WARN(ret, "EBLOCK"); 162 163 mutex_unlock(&encl->lock); 164 } 165 166 static int __sgx_encl_ewb(struct sgx_epc_page *epc_page, void *va_slot, 167 struct sgx_backing *backing) 168 { 169 struct sgx_pageinfo pginfo; 170 int ret; 171 172 pginfo.addr = 0; 173 pginfo.secs = 0; 174 175 pginfo.contents = (unsigned long)kmap_local_page(backing->contents); 176 pginfo.metadata = (unsigned long)kmap_local_page(backing->pcmd) + 177 backing->pcmd_offset; 178 179 ret = __ewb(&pginfo, sgx_get_epc_virt_addr(epc_page), va_slot); 180 set_page_dirty(backing->pcmd); 181 set_page_dirty(backing->contents); 182 183 kunmap_local((void *)(unsigned long)(pginfo.metadata - 184 backing->pcmd_offset)); 185 kunmap_local((void *)(unsigned long)pginfo.contents); 186 187 return ret; 188 } 189 190 void sgx_ipi_cb(void *info) 191 { 192 } 193 194 /* 195 * Swap page to the regular memory transformed to the blocked state by using 196 * EBLOCK, which means that it can no longer be referenced (no new TLB entries). 197 * 198 * The first trial just tries to write the page assuming that some other thread 199 * has reset the count for threads inside the enclave by using ETRACK, and 200 * previous thread count has been zeroed out. The second trial calls ETRACK 201 * before EWB. If that fails we kick all the HW threads out, and then do EWB, 202 * which should be guaranteed the succeed. 203 */ 204 static void sgx_encl_ewb(struct sgx_epc_page *epc_page, 205 struct sgx_backing *backing) 206 { 207 struct sgx_encl_page *encl_page = epc_page->owner; 208 struct sgx_encl *encl = encl_page->encl; 209 struct sgx_va_page *va_page; 210 unsigned int va_offset; 211 void *va_slot; 212 int ret; 213 214 encl_page->desc &= ~SGX_ENCL_PAGE_BEING_RECLAIMED; 215 216 va_page = list_first_entry(&encl->va_pages, struct sgx_va_page, 217 list); 218 va_offset = sgx_alloc_va_slot(va_page); 219 va_slot = sgx_get_epc_virt_addr(va_page->epc_page) + va_offset; 220 if (sgx_va_page_full(va_page)) 221 list_move_tail(&va_page->list, &encl->va_pages); 222 223 ret = __sgx_encl_ewb(epc_page, va_slot, backing); 224 if (ret == SGX_NOT_TRACKED) { 225 ret = __etrack(sgx_get_epc_virt_addr(encl->secs.epc_page)); 226 if (ret) { 227 if (encls_failed(ret)) 228 ENCLS_WARN(ret, "ETRACK"); 229 } 230 231 ret = __sgx_encl_ewb(epc_page, va_slot, backing); 232 if (ret == SGX_NOT_TRACKED) { 233 /* 234 * Slow path, send IPIs to kick cpus out of the 235 * enclave. Note, it's imperative that the cpu 236 * mask is generated *after* ETRACK, else we'll 237 * miss cpus that entered the enclave between 238 * generating the mask and incrementing epoch. 239 */ 240 on_each_cpu_mask(sgx_encl_cpumask(encl), 241 sgx_ipi_cb, NULL, 1); 242 ret = __sgx_encl_ewb(epc_page, va_slot, backing); 243 } 244 } 245 246 if (ret) { 247 if (encls_failed(ret)) 248 ENCLS_WARN(ret, "EWB"); 249 250 sgx_free_va_slot(va_page, va_offset); 251 } else { 252 encl_page->desc |= va_offset; 253 encl_page->va_page = va_page; 254 } 255 } 256 257 static void sgx_reclaimer_write(struct sgx_epc_page *epc_page, 258 struct sgx_backing *backing) 259 { 260 struct sgx_encl_page *encl_page = epc_page->owner; 261 struct sgx_encl *encl = encl_page->encl; 262 struct sgx_backing secs_backing; 263 int ret; 264 265 mutex_lock(&encl->lock); 266 267 sgx_encl_ewb(epc_page, backing); 268 encl_page->epc_page = NULL; 269 encl->secs_child_cnt--; 270 sgx_encl_put_backing(backing); 271 272 if (!encl->secs_child_cnt && test_bit(SGX_ENCL_INITIALIZED, &encl->flags)) { 273 ret = sgx_encl_alloc_backing(encl, PFN_DOWN(encl->size), 274 &secs_backing); 275 if (ret) 276 goto out; 277 278 sgx_encl_ewb(encl->secs.epc_page, &secs_backing); 279 280 sgx_encl_free_epc_page(encl->secs.epc_page); 281 encl->secs.epc_page = NULL; 282 283 sgx_encl_put_backing(&secs_backing); 284 } 285 286 out: 287 mutex_unlock(&encl->lock); 288 } 289 290 /* 291 * Take a fixed number of pages from the head of the active page pool and 292 * reclaim them to the enclave's private shmem files. Skip the pages, which have 293 * been accessed since the last scan. Move those pages to the tail of active 294 * page pool so that the pages get scanned in LRU like fashion. 295 * 296 * Batch process a chunk of pages (at the moment 16) in order to degrade amount 297 * of IPI's and ETRACK's potentially required. sgx_encl_ewb() does degrade a bit 298 * among the HW threads with three stage EWB pipeline (EWB, ETRACK + EWB and IPI 299 * + EWB) but not sufficiently. Reclaiming one page at a time would also be 300 * problematic as it would increase the lock contention too much, which would 301 * halt forward progress. 302 */ 303 static void sgx_reclaim_pages(void) 304 { 305 struct sgx_epc_page *chunk[SGX_NR_TO_SCAN]; 306 struct sgx_backing backing[SGX_NR_TO_SCAN]; 307 struct sgx_encl_page *encl_page; 308 struct sgx_epc_page *epc_page; 309 pgoff_t page_index; 310 int cnt = 0; 311 int ret; 312 int i; 313 314 spin_lock(&sgx_reclaimer_lock); 315 for (i = 0; i < SGX_NR_TO_SCAN; i++) { 316 if (list_empty(&sgx_active_page_list)) 317 break; 318 319 epc_page = list_first_entry(&sgx_active_page_list, 320 struct sgx_epc_page, list); 321 list_del_init(&epc_page->list); 322 encl_page = epc_page->owner; 323 324 if (kref_get_unless_zero(&encl_page->encl->refcount) != 0) 325 chunk[cnt++] = epc_page; 326 else 327 /* The owner is freeing the page. No need to add the 328 * page back to the list of reclaimable pages. 329 */ 330 epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED; 331 } 332 spin_unlock(&sgx_reclaimer_lock); 333 334 for (i = 0; i < cnt; i++) { 335 epc_page = chunk[i]; 336 encl_page = epc_page->owner; 337 338 if (!sgx_reclaimer_age(epc_page)) 339 goto skip; 340 341 page_index = PFN_DOWN(encl_page->desc - encl_page->encl->base); 342 343 mutex_lock(&encl_page->encl->lock); 344 ret = sgx_encl_alloc_backing(encl_page->encl, page_index, &backing[i]); 345 if (ret) { 346 mutex_unlock(&encl_page->encl->lock); 347 goto skip; 348 } 349 350 encl_page->desc |= SGX_ENCL_PAGE_BEING_RECLAIMED; 351 mutex_unlock(&encl_page->encl->lock); 352 continue; 353 354 skip: 355 spin_lock(&sgx_reclaimer_lock); 356 list_add_tail(&epc_page->list, &sgx_active_page_list); 357 spin_unlock(&sgx_reclaimer_lock); 358 359 kref_put(&encl_page->encl->refcount, sgx_encl_release); 360 361 chunk[i] = NULL; 362 } 363 364 for (i = 0; i < cnt; i++) { 365 epc_page = chunk[i]; 366 if (epc_page) 367 sgx_reclaimer_block(epc_page); 368 } 369 370 for (i = 0; i < cnt; i++) { 371 epc_page = chunk[i]; 372 if (!epc_page) 373 continue; 374 375 encl_page = epc_page->owner; 376 sgx_reclaimer_write(epc_page, &backing[i]); 377 378 kref_put(&encl_page->encl->refcount, sgx_encl_release); 379 epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED; 380 381 sgx_free_epc_page(epc_page); 382 } 383 } 384 385 static bool sgx_should_reclaim(unsigned long watermark) 386 { 387 return atomic_long_read(&sgx_nr_free_pages) < watermark && 388 !list_empty(&sgx_active_page_list); 389 } 390 391 /* 392 * sgx_reclaim_direct() should be called (without enclave's mutex held) 393 * in locations where SGX memory resources might be low and might be 394 * needed in order to make forward progress. 395 */ 396 void sgx_reclaim_direct(void) 397 { 398 if (sgx_should_reclaim(SGX_NR_LOW_PAGES)) 399 sgx_reclaim_pages(); 400 } 401 402 static int ksgxd(void *p) 403 { 404 set_freezable(); 405 406 /* 407 * Sanitize pages in order to recover from kexec(). The 2nd pass is 408 * required for SECS pages, whose child pages blocked EREMOVE. 409 */ 410 __sgx_sanitize_pages(&sgx_dirty_page_list); 411 WARN_ON(__sgx_sanitize_pages(&sgx_dirty_page_list)); 412 413 while (!kthread_should_stop()) { 414 if (try_to_freeze()) 415 continue; 416 417 wait_event_freezable(ksgxd_waitq, 418 kthread_should_stop() || 419 sgx_should_reclaim(SGX_NR_HIGH_PAGES)); 420 421 if (sgx_should_reclaim(SGX_NR_HIGH_PAGES)) 422 sgx_reclaim_pages(); 423 424 cond_resched(); 425 } 426 427 return 0; 428 } 429 430 static bool __init sgx_page_reclaimer_init(void) 431 { 432 struct task_struct *tsk; 433 434 tsk = kthread_run(ksgxd, NULL, "ksgxd"); 435 if (IS_ERR(tsk)) 436 return false; 437 438 ksgxd_tsk = tsk; 439 440 return true; 441 } 442 443 bool current_is_ksgxd(void) 444 { 445 return current == ksgxd_tsk; 446 } 447 448 static struct sgx_epc_page *__sgx_alloc_epc_page_from_node(int nid) 449 { 450 struct sgx_numa_node *node = &sgx_numa_nodes[nid]; 451 struct sgx_epc_page *page = NULL; 452 453 spin_lock(&node->lock); 454 455 if (list_empty(&node->free_page_list)) { 456 spin_unlock(&node->lock); 457 return NULL; 458 } 459 460 page = list_first_entry(&node->free_page_list, struct sgx_epc_page, list); 461 list_del_init(&page->list); 462 page->flags = 0; 463 464 spin_unlock(&node->lock); 465 atomic_long_dec(&sgx_nr_free_pages); 466 467 return page; 468 } 469 470 /** 471 * __sgx_alloc_epc_page() - Allocate an EPC page 472 * 473 * Iterate through NUMA nodes and reserve ia free EPC page to the caller. Start 474 * from the NUMA node, where the caller is executing. 475 * 476 * Return: 477 * - an EPC page: A borrowed EPC pages were available. 478 * - NULL: Out of EPC pages. 479 */ 480 struct sgx_epc_page *__sgx_alloc_epc_page(void) 481 { 482 struct sgx_epc_page *page; 483 int nid_of_current = numa_node_id(); 484 int nid_start, nid; 485 486 /* 487 * Try local node first. If it doesn't have an EPC section, 488 * fall back to the non-local NUMA nodes. 489 */ 490 if (node_isset(nid_of_current, sgx_numa_mask)) 491 nid_start = nid_of_current; 492 else 493 nid_start = next_node_in(nid_of_current, sgx_numa_mask); 494 495 nid = nid_start; 496 do { 497 page = __sgx_alloc_epc_page_from_node(nid); 498 if (page) 499 return page; 500 501 nid = next_node_in(nid, sgx_numa_mask); 502 } while (nid != nid_start); 503 504 return ERR_PTR(-ENOMEM); 505 } 506 507 /** 508 * sgx_mark_page_reclaimable() - Mark a page as reclaimable 509 * @page: EPC page 510 * 511 * Mark a page as reclaimable and add it to the active page list. Pages 512 * are automatically removed from the active list when freed. 513 */ 514 void sgx_mark_page_reclaimable(struct sgx_epc_page *page) 515 { 516 spin_lock(&sgx_reclaimer_lock); 517 page->flags |= SGX_EPC_PAGE_RECLAIMER_TRACKED; 518 list_add_tail(&page->list, &sgx_active_page_list); 519 spin_unlock(&sgx_reclaimer_lock); 520 } 521 522 /** 523 * sgx_unmark_page_reclaimable() - Remove a page from the reclaim list 524 * @page: EPC page 525 * 526 * Clear the reclaimable flag and remove the page from the active page list. 527 * 528 * Return: 529 * 0 on success, 530 * -EBUSY if the page is in the process of being reclaimed 531 */ 532 int sgx_unmark_page_reclaimable(struct sgx_epc_page *page) 533 { 534 spin_lock(&sgx_reclaimer_lock); 535 if (page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED) { 536 /* The page is being reclaimed. */ 537 if (list_empty(&page->list)) { 538 spin_unlock(&sgx_reclaimer_lock); 539 return -EBUSY; 540 } 541 542 list_del(&page->list); 543 page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED; 544 } 545 spin_unlock(&sgx_reclaimer_lock); 546 547 return 0; 548 } 549 550 /** 551 * sgx_alloc_epc_page() - Allocate an EPC page 552 * @owner: the owner of the EPC page 553 * @reclaim: reclaim pages if necessary 554 * 555 * Iterate through EPC sections and borrow a free EPC page to the caller. When a 556 * page is no longer needed it must be released with sgx_free_epc_page(). If 557 * @reclaim is set to true, directly reclaim pages when we are out of pages. No 558 * mm's can be locked when @reclaim is set to true. 559 * 560 * Finally, wake up ksgxd when the number of pages goes below the watermark 561 * before returning back to the caller. 562 * 563 * Return: 564 * an EPC page, 565 * -errno on error 566 */ 567 struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim) 568 { 569 struct sgx_epc_page *page; 570 571 for ( ; ; ) { 572 page = __sgx_alloc_epc_page(); 573 if (!IS_ERR(page)) { 574 page->owner = owner; 575 break; 576 } 577 578 if (list_empty(&sgx_active_page_list)) 579 return ERR_PTR(-ENOMEM); 580 581 if (!reclaim) { 582 page = ERR_PTR(-EBUSY); 583 break; 584 } 585 586 if (signal_pending(current)) { 587 page = ERR_PTR(-ERESTARTSYS); 588 break; 589 } 590 591 sgx_reclaim_pages(); 592 cond_resched(); 593 } 594 595 if (sgx_should_reclaim(SGX_NR_LOW_PAGES)) 596 wake_up(&ksgxd_waitq); 597 598 return page; 599 } 600 601 /** 602 * sgx_free_epc_page() - Free an EPC page 603 * @page: an EPC page 604 * 605 * Put the EPC page back to the list of free pages. It's the caller's 606 * responsibility to make sure that the page is in uninitialized state. In other 607 * words, do EREMOVE, EWB or whatever operation is necessary before calling 608 * this function. 609 */ 610 void sgx_free_epc_page(struct sgx_epc_page *page) 611 { 612 struct sgx_epc_section *section = &sgx_epc_sections[page->section]; 613 struct sgx_numa_node *node = section->node; 614 615 spin_lock(&node->lock); 616 617 page->owner = NULL; 618 if (page->poison) 619 list_add(&page->list, &node->sgx_poison_page_list); 620 else 621 list_add_tail(&page->list, &node->free_page_list); 622 page->flags = SGX_EPC_PAGE_IS_FREE; 623 624 spin_unlock(&node->lock); 625 atomic_long_inc(&sgx_nr_free_pages); 626 } 627 628 static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size, 629 unsigned long index, 630 struct sgx_epc_section *section) 631 { 632 unsigned long nr_pages = size >> PAGE_SHIFT; 633 unsigned long i; 634 635 section->virt_addr = memremap(phys_addr, size, MEMREMAP_WB); 636 if (!section->virt_addr) 637 return false; 638 639 section->pages = vmalloc_array(nr_pages, sizeof(struct sgx_epc_page)); 640 if (!section->pages) { 641 memunmap(section->virt_addr); 642 return false; 643 } 644 645 section->phys_addr = phys_addr; 646 xa_store_range(&sgx_epc_address_space, section->phys_addr, 647 phys_addr + size - 1, section, GFP_KERNEL); 648 649 for (i = 0; i < nr_pages; i++) { 650 section->pages[i].section = index; 651 section->pages[i].flags = 0; 652 section->pages[i].owner = NULL; 653 section->pages[i].poison = 0; 654 list_add_tail(§ion->pages[i].list, &sgx_dirty_page_list); 655 } 656 657 return true; 658 } 659 660 bool arch_is_platform_page(u64 paddr) 661 { 662 return !!xa_load(&sgx_epc_address_space, paddr); 663 } 664 EXPORT_SYMBOL_GPL(arch_is_platform_page); 665 666 static struct sgx_epc_page *sgx_paddr_to_page(u64 paddr) 667 { 668 struct sgx_epc_section *section; 669 670 section = xa_load(&sgx_epc_address_space, paddr); 671 if (!section) 672 return NULL; 673 674 return §ion->pages[PFN_DOWN(paddr - section->phys_addr)]; 675 } 676 677 /* 678 * Called in process context to handle a hardware reported 679 * error in an SGX EPC page. 680 * If the MF_ACTION_REQUIRED bit is set in flags, then the 681 * context is the task that consumed the poison data. Otherwise 682 * this is called from a kernel thread unrelated to the page. 683 */ 684 int arch_memory_failure(unsigned long pfn, int flags) 685 { 686 struct sgx_epc_page *page = sgx_paddr_to_page(pfn << PAGE_SHIFT); 687 struct sgx_epc_section *section; 688 struct sgx_numa_node *node; 689 690 /* 691 * mm/memory-failure.c calls this routine for all errors 692 * where there isn't a "struct page" for the address. But that 693 * includes other address ranges besides SGX. 694 */ 695 if (!page) 696 return -ENXIO; 697 698 /* 699 * If poison was consumed synchronously. Send a SIGBUS to 700 * the task. Hardware has already exited the SGX enclave and 701 * will not allow re-entry to an enclave that has a memory 702 * error. The signal may help the task understand why the 703 * enclave is broken. 704 */ 705 if (flags & MF_ACTION_REQUIRED) 706 force_sig(SIGBUS); 707 708 section = &sgx_epc_sections[page->section]; 709 node = section->node; 710 711 spin_lock(&node->lock); 712 713 /* Already poisoned? Nothing more to do */ 714 if (page->poison) 715 goto out; 716 717 page->poison = 1; 718 719 /* 720 * If the page is on a free list, move it to the per-node 721 * poison page list. 722 */ 723 if (page->flags & SGX_EPC_PAGE_IS_FREE) { 724 list_move(&page->list, &node->sgx_poison_page_list); 725 goto out; 726 } 727 728 sgx_unmark_page_reclaimable(page); 729 730 /* 731 * TBD: Add additional plumbing to enable pre-emptive 732 * action for asynchronous poison notification. Until 733 * then just hope that the poison: 734 * a) is not accessed - sgx_free_epc_page() will deal with it 735 * when the user gives it back 736 * b) results in a recoverable machine check rather than 737 * a fatal one 738 */ 739 out: 740 spin_unlock(&node->lock); 741 return 0; 742 } 743 744 /* 745 * A section metric is concatenated in a way that @low bits 12-31 define the 746 * bits 12-31 of the metric and @high bits 0-19 define the bits 32-51 of the 747 * metric. 748 */ 749 static inline u64 __init sgx_calc_section_metric(u64 low, u64 high) 750 { 751 return (low & GENMASK_ULL(31, 12)) + 752 ((high & GENMASK_ULL(19, 0)) << 32); 753 } 754 755 #ifdef CONFIG_NUMA 756 static ssize_t sgx_total_bytes_show(struct device *dev, struct device_attribute *attr, char *buf) 757 { 758 return sysfs_emit(buf, "%lu\n", sgx_numa_nodes[dev->id].size); 759 } 760 static DEVICE_ATTR_RO(sgx_total_bytes); 761 762 static umode_t arch_node_attr_is_visible(struct kobject *kobj, 763 struct attribute *attr, int idx) 764 { 765 /* Make all x86/ attributes invisible when SGX is not initialized: */ 766 if (nodes_empty(sgx_numa_mask)) 767 return 0; 768 769 return attr->mode; 770 } 771 772 static struct attribute *arch_node_dev_attrs[] = { 773 &dev_attr_sgx_total_bytes.attr, 774 NULL, 775 }; 776 777 const struct attribute_group arch_node_dev_group = { 778 .name = "x86", 779 .attrs = arch_node_dev_attrs, 780 .is_visible = arch_node_attr_is_visible, 781 }; 782 783 static void __init arch_update_sysfs_visibility(int nid) 784 { 785 struct node *node = node_devices[nid]; 786 int ret; 787 788 ret = sysfs_update_group(&node->dev.kobj, &arch_node_dev_group); 789 790 if (ret) 791 pr_err("sysfs update failed (%d), files may be invisible", ret); 792 } 793 #else /* !CONFIG_NUMA */ 794 static void __init arch_update_sysfs_visibility(int nid) {} 795 #endif 796 797 static bool __init sgx_page_cache_init(void) 798 { 799 u32 eax, ebx, ecx, edx, type; 800 u64 pa, size; 801 int nid; 802 int i; 803 804 sgx_numa_nodes = kmalloc_objs(*sgx_numa_nodes, num_possible_nodes()); 805 if (!sgx_numa_nodes) 806 return false; 807 808 for (i = 0; i < ARRAY_SIZE(sgx_epc_sections); i++) { 809 cpuid_count(SGX_CPUID, i + SGX_CPUID_EPC, &eax, &ebx, &ecx, &edx); 810 811 type = eax & SGX_CPUID_EPC_MASK; 812 if (type == SGX_CPUID_EPC_INVALID) 813 break; 814 815 if (type != SGX_CPUID_EPC_SECTION) { 816 pr_err_once("Unknown EPC section type: %u\n", type); 817 break; 818 } 819 820 pa = sgx_calc_section_metric(eax, ebx); 821 size = sgx_calc_section_metric(ecx, edx); 822 823 pr_info("EPC section 0x%llx-0x%llx\n", pa, pa + size - 1); 824 825 if (!sgx_setup_epc_section(pa, size, i, &sgx_epc_sections[i])) { 826 pr_err("No free memory for an EPC section\n"); 827 break; 828 } 829 830 nid = numa_map_to_online_node(phys_to_target_node(pa)); 831 if (nid == NUMA_NO_NODE) { 832 /* The physical address is already printed above. */ 833 pr_warn(FW_BUG "Unable to map EPC section to online node. Fallback to the NUMA node 0.\n"); 834 nid = 0; 835 } 836 837 if (!node_isset(nid, sgx_numa_mask)) { 838 spin_lock_init(&sgx_numa_nodes[nid].lock); 839 INIT_LIST_HEAD(&sgx_numa_nodes[nid].free_page_list); 840 INIT_LIST_HEAD(&sgx_numa_nodes[nid].sgx_poison_page_list); 841 node_set(nid, sgx_numa_mask); 842 sgx_numa_nodes[nid].size = 0; 843 844 /* Make SGX-specific node sysfs files visible: */ 845 arch_update_sysfs_visibility(nid); 846 } 847 848 sgx_epc_sections[i].node = &sgx_numa_nodes[nid]; 849 sgx_numa_nodes[nid].size += size; 850 851 sgx_nr_epc_sections++; 852 } 853 854 if (!sgx_nr_epc_sections) { 855 pr_err("There are zero EPC sections.\n"); 856 return false; 857 } 858 859 for_each_online_node(nid) { 860 if (!node_isset(nid, sgx_numa_mask) && 861 node_state(nid, N_MEMORY) && node_state(nid, N_CPU)) 862 pr_info("node%d has both CPUs and memory but doesn't have an EPC section\n", 863 nid); 864 } 865 866 return true; 867 } 868 869 /* 870 * Update the SGX_LEPUBKEYHASH MSRs to the values specified by caller. 871 * Bare-metal driver requires to update them to hash of enclave's signer 872 * before EINIT. KVM needs to update them to guest's virtual MSR values 873 * before doing EINIT from guest. 874 */ 875 void sgx_update_lepubkeyhash(u64 *lepubkeyhash) 876 { 877 int i; 878 879 WARN_ON_ONCE(preemptible()); 880 881 for (i = 0; i < 4; i++) 882 wrmsrq(MSR_IA32_SGXLEPUBKEYHASH0 + i, lepubkeyhash[i]); 883 } 884 885 const struct file_operations sgx_provision_fops = { 886 .owner = THIS_MODULE, 887 }; 888 889 static struct miscdevice sgx_dev_provision = { 890 .minor = MISC_DYNAMIC_MINOR, 891 .name = "sgx_provision", 892 .nodename = "sgx_provision", 893 .fops = &sgx_provision_fops, 894 }; 895 896 /** 897 * sgx_set_attribute() - Update allowed attributes given file descriptor 898 * @allowed_attributes: Pointer to allowed enclave attributes 899 * @attribute_fd: File descriptor for specific attribute 900 * 901 * Append enclave attribute indicated by file descriptor to allowed 902 * attributes. Currently only SGX_ATTR_PROVISIONKEY indicated by 903 * /dev/sgx_provision is supported. 904 * 905 * Return: 906 * -0: SGX_ATTR_PROVISIONKEY is appended to allowed_attributes 907 * -EINVAL: Invalid, or not supported file descriptor 908 */ 909 int sgx_set_attribute(unsigned long *allowed_attributes, 910 unsigned int attribute_fd) 911 { 912 CLASS(fd, f)(attribute_fd); 913 914 if (fd_empty(f)) 915 return -EINVAL; 916 917 if (fd_file(f)->f_op != &sgx_provision_fops) 918 return -EINVAL; 919 920 *allowed_attributes |= SGX_ATTR_PROVISIONKEY; 921 return 0; 922 } 923 EXPORT_SYMBOL_FOR_KVM(sgx_set_attribute); 924 925 /* Counter to count the active SGX users */ 926 static int sgx_usage_count; 927 928 /** 929 * sgx_update_svn() - Attempt to call ENCLS[EUPDATESVN]. 930 * 931 * This instruction attempts to update CPUSVN to the 932 * currently loaded microcode update SVN and generate new 933 * cryptographic assets. 934 * 935 * Return: 936 * * %0: - Success or not supported 937 * * %-EAGAIN: - Can be safely retried, failure is due to lack of 938 * * entropy in RNG 939 * * %-EIO: - Unexpected error, retries are not advisable 940 */ 941 static int sgx_update_svn(void) 942 { 943 int ret; 944 945 /* 946 * If EUPDATESVN is not available, it is ok to 947 * silently skip it to comply with legacy behavior. 948 */ 949 if (!cpu_feature_enabled(X86_FEATURE_SGX_EUPDATESVN)) 950 return 0; 951 952 /* 953 * EPC is guaranteed to be empty when there are no users. 954 * Ensure we are on our first user before proceeding further. 955 */ 956 WARN(sgx_usage_count, "Elevated usage count when calling EUPDATESVN\n"); 957 958 for (int i = 0; i < RDRAND_RETRY_LOOPS; i++) { 959 ret = __eupdatesvn(); 960 961 /* Stop on success or unexpected errors: */ 962 if (ret != SGX_INSUFFICIENT_ENTROPY) 963 break; 964 } 965 966 switch (ret) { 967 case 0: 968 /* 969 * SVN successfully updated. 970 * Let users know when the update was successful. 971 */ 972 pr_info("SVN updated successfully\n"); 973 return 0; 974 case SGX_NO_UPDATE: 975 /* 976 * SVN update failed since the current SVN is 977 * not newer than CPUSVN. This is the most 978 * common case and indicates no harm. 979 */ 980 return 0; 981 case SGX_INSUFFICIENT_ENTROPY: 982 /* 983 * SVN update failed due to lack of entropy in DRNG. 984 * Indicate to userspace that it should retry. 985 */ 986 return -EAGAIN; 987 default: 988 break; 989 } 990 991 /* 992 * EUPDATESVN was called when EPC is empty, all other error 993 * codes are unexpected. 994 */ 995 ENCLS_WARN(ret, "EUPDATESVN"); 996 return -EIO; 997 } 998 999 /* Mutex to ensure no concurrent EPC accesses during EUPDATESVN */ 1000 static DEFINE_MUTEX(sgx_svn_lock); 1001 1002 int sgx_inc_usage_count(void) 1003 { 1004 int ret; 1005 1006 guard(mutex)(&sgx_svn_lock); 1007 1008 if (!sgx_usage_count) { 1009 ret = sgx_update_svn(); 1010 if (ret) 1011 return ret; 1012 } 1013 1014 sgx_usage_count++; 1015 1016 return 0; 1017 } 1018 1019 void sgx_dec_usage_count(void) 1020 { 1021 guard(mutex)(&sgx_svn_lock); 1022 sgx_usage_count--; 1023 } 1024 1025 static int __init sgx_init(void) 1026 { 1027 int ret; 1028 int i; 1029 1030 if (!cpu_feature_enabled(X86_FEATURE_SGX)) 1031 return -ENODEV; 1032 1033 if (!sgx_page_cache_init()) 1034 return -ENOMEM; 1035 1036 if (!sgx_page_reclaimer_init()) { 1037 ret = -ENOMEM; 1038 goto err_page_cache; 1039 } 1040 1041 ret = misc_register(&sgx_dev_provision); 1042 if (ret) 1043 goto err_kthread; 1044 1045 /* 1046 * Always try to initialize the native *and* KVM drivers. 1047 * The KVM driver is less picky than the native one and 1048 * can function if the native one is not supported on the 1049 * current system or fails to initialize. 1050 * 1051 * Error out only if both fail to initialize. 1052 */ 1053 ret = sgx_drv_init(); 1054 1055 if (sgx_vepc_init() && ret) 1056 goto err_provision; 1057 1058 return 0; 1059 1060 err_provision: 1061 misc_deregister(&sgx_dev_provision); 1062 1063 err_kthread: 1064 kthread_stop(ksgxd_tsk); 1065 1066 err_page_cache: 1067 for (i = 0; i < sgx_nr_epc_sections; i++) { 1068 vfree(sgx_epc_sections[i].pages); 1069 memunmap(sgx_epc_sections[i].virt_addr); 1070 } 1071 1072 return ret; 1073 } 1074 1075 device_initcall(sgx_init); 1076