// SPDX-License-Identifier: GPL-2.0
/* Copyright(c) 2016-20 Intel Corporation. */

#include <linux/file.h>
#include <linux/freezer.h>
#include <linux/highmem.h>
#include <linux/kthread.h>
#include <linux/miscdevice.h>
#include <linux/node.h>
#include <linux/pagemap.h>
#include <linux/ratelimit.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/sysfs.h>
#include <linux/vmalloc.h>
#include <asm/msr.h>
#include <asm/sgx.h>
#include "driver.h"
#include "encl.h"
#include "encls.h"

struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS];
static int sgx_nr_epc_sections;
static struct task_struct *ksgxd_tsk;
static DECLARE_WAIT_QUEUE_HEAD(ksgxd_waitq);
static DEFINE_XARRAY(sgx_epc_address_space);

/*
 * These variables are part of the state of the reclaimer, and must be accessed
 * with sgx_reclaimer_lock acquired.
 */
static LIST_HEAD(sgx_active_page_list);
static DEFINE_SPINLOCK(sgx_reclaimer_lock);

static atomic_long_t sgx_nr_free_pages = ATOMIC_LONG_INIT(0);

/* Nodes with one or more EPC sections. */
static nodemask_t sgx_numa_mask;

/*
 * Array with one list_head for each possible NUMA node. Each
 * list contains all the sgx_epc_section's which are on that
 * node.
 */
static struct sgx_numa_node *sgx_numa_nodes;

static LIST_HEAD(sgx_dirty_page_list);

/*
 * Reset post-kexec EPC pages to the uninitialized state. The pages are removed
 * from the input list and made available to the page allocator. SECS pages
 * that still precede their child pages in the input list are left intact,
 * because EREMOVE fails on a SECS page until all of its children are gone.
 *
 * Return 0 when sanitization was successful or the kthread was stopped, and
 * the number of unsanitized pages otherwise.
 */
static unsigned long __sgx_sanitize_pages(struct list_head *dirty_page_list)
{
	unsigned long left_dirty = 0;
	struct sgx_epc_page *page;
	LIST_HEAD(dirty);
	int ret;

	/* dirty_page_list is thread-local, no need for a lock: */
	while (!list_empty(dirty_page_list)) {
		if (kthread_should_stop())
			return 0;

		page = list_first_entry(dirty_page_list, struct sgx_epc_page, list);

		/*
		 * Checking page->poison without holding the node->lock
		 * is racy, but losing the race (i.e. poison is set just
		 * after the check) just means __eremove() will be uselessly
		 * called for a page that sgx_free_epc_page() will put onto
		 * the node->sgx_poison_page_list later.
		 */
		if (page->poison) {
			struct sgx_epc_section *section = &sgx_epc_sections[page->section];
			struct sgx_numa_node *node = section->node;

			spin_lock(&node->lock);
			list_move(&page->list, &node->sgx_poison_page_list);
			spin_unlock(&node->lock);

			continue;
		}

		ret = __eremove(sgx_get_epc_virt_addr(page));
		if (!ret) {
			/*
			 * page is now sanitized. Make it available via the SGX
			 * page allocator:
			 */
			list_del(&page->list);
			sgx_free_epc_page(page);
		} else {
			/* The page is not yet clean - move to the dirty list. */
			list_move_tail(&page->list, &dirty);
			left_dirty++;
		}

		cond_resched();
	}

	list_splice(&dirty, dirty_page_list);
	return left_dirty;
}

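/*
 * Check whether the reclaimer-selected page has been accessed since the last
 * scan. Walk every mm attached to the enclave under SRCU and test-and-clear
 * the accessed bit of the PTE mapping the page. Return true when no mm has
 * touched the page, i.e. it has aged out and is a good reclaim candidate.
 */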
static bool sgx_reclaimer_age(struct sgx_epc_page *epc_page)
{
	struct sgx_encl_page *page = epc_page->owner;
	struct sgx_encl *encl = page->encl;
	struct sgx_encl_mm *encl_mm;
	bool ret = true;
	int idx;

	idx = srcu_read_lock(&encl->srcu);

	list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
		if (!mmget_not_zero(encl_mm->mm))
			continue;

		mmap_read_lock(encl_mm->mm);
		ret = !sgx_encl_test_and_clear_young(encl_mm->mm, page);
		mmap_read_unlock(encl_mm->mm);

		mmput_async(encl_mm->mm);

		if (!ret)
			break;
	}

	srcu_read_unlock(&encl->srcu, idx);

	if (!ret)
		return false;

	return true;
}

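/*
 * Make the page inaccessible before it is written back: zap the PTEs mapping
 * it in every address space and mark it blocked with EBLOCK so that no new
 * TLB entries can be created for it.
 */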
static void sgx_reclaimer_block(struct sgx_epc_page *epc_page)
{
	struct sgx_encl_page *page = epc_page->owner;
	unsigned long addr = page->desc & PAGE_MASK;
	struct sgx_encl *encl = page->encl;
	int ret;

	sgx_zap_enclave_ptes(encl, addr);

	mutex_lock(&encl->lock);

	ret = __eblock(sgx_get_epc_virt_addr(epc_page));
	if (encls_failed(ret))
		ENCLS_WARN(ret, "EBLOCK");

	mutex_unlock(&encl->lock);
}

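/*
 * Write the page contents and its PCMD metadata out to the backing storage
 * with EWB. The backing pages are mapped with kmap_local_page() and marked
 * dirty so that the data reaches the shmem file.
 */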
static int __sgx_encl_ewb(struct sgx_epc_page *epc_page, void *va_slot,
			  struct sgx_backing *backing)
{
	struct sgx_pageinfo pginfo;
	int ret;

	pginfo.addr = 0;
	pginfo.secs = 0;

	pginfo.contents = (unsigned long)kmap_local_page(backing->contents);
	pginfo.metadata = (unsigned long)kmap_local_page(backing->pcmd) +
			  backing->pcmd_offset;

	ret = __ewb(&pginfo, sgx_get_epc_virt_addr(epc_page), va_slot);
	set_page_dirty(backing->pcmd);
	set_page_dirty(backing->contents);

	kunmap_local((void *)(unsigned long)(pginfo.metadata -
					     backing->pcmd_offset));
	kunmap_local((void *)(unsigned long)pginfo.contents);

	return ret;
}

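/*
 * The IPI handler is intentionally empty: delivering the interrupt is enough
 * to force logical CPUs out of the enclave, after which the stale TLB entries
 * that made the preceding EWB fail with SGX_NOT_TRACKED are gone.
 */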
void sgx_ipi_cb(void *info)
{
}

/*
 * Swap the page out to regular memory after it has been transformed to the
 * blocked state with EBLOCK, which means it can no longer be referenced (no
 * new TLB entries).
 *
 * The first attempt simply tries to write the page, assuming that some other
 * thread has already reset the tracking count for threads inside the enclave
 * with ETRACK and the previous thread count has reached zero. The second
 * attempt calls ETRACK before EWB. If that fails too, all hardware threads
 * are kicked out of the enclave and EWB is retried, which is then guaranteed
 * to succeed.
 */
static void sgx_encl_ewb(struct sgx_epc_page *epc_page,
			 struct sgx_backing *backing)
{
	struct sgx_encl_page *encl_page = epc_page->owner;
	struct sgx_encl *encl = encl_page->encl;
	struct sgx_va_page *va_page;
	unsigned int va_offset;
	void *va_slot;
	int ret;

	encl_page->desc &= ~SGX_ENCL_PAGE_BEING_RECLAIMED;

	va_page = list_first_entry(&encl->va_pages, struct sgx_va_page,
				   list);
	va_offset = sgx_alloc_va_slot(va_page);
	va_slot = sgx_get_epc_virt_addr(va_page->epc_page) + va_offset;
	if (sgx_va_page_full(va_page))
		list_move_tail(&va_page->list, &encl->va_pages);

	ret = __sgx_encl_ewb(epc_page, va_slot, backing);
	if (ret == SGX_NOT_TRACKED) {
		ret = __etrack(sgx_get_epc_virt_addr(encl->secs.epc_page));
		if (ret) {
			if (encls_failed(ret))
				ENCLS_WARN(ret, "ETRACK");
		}

		ret = __sgx_encl_ewb(epc_page, va_slot, backing);
		if (ret == SGX_NOT_TRACKED) {
			/*
			 * Slow path, send IPIs to kick cpus out of the
			 * enclave. Note, it's imperative that the cpu
			 * mask is generated *after* ETRACK, else we'll
			 * miss cpus that entered the enclave between
			 * generating the mask and incrementing epoch.
			 */
			on_each_cpu_mask(sgx_encl_cpumask(encl),
					 sgx_ipi_cb, NULL, 1);
			ret = __sgx_encl_ewb(epc_page, va_slot, backing);
		}
	}

	if (ret) {
		if (encls_failed(ret))
			ENCLS_WARN(ret, "EWB");

		sgx_free_va_slot(va_page, va_offset);
	} else {
		encl_page->desc |= va_offset;
		encl_page->va_page = va_page;
	}
}

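/*
 * Write the page back to its backing storage and drop the enclave's reference
 * to it. When the last child page of an initialized enclave has been written
 * back, write back and free the SECS page as well.
 */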
static void sgx_reclaimer_write(struct sgx_epc_page *epc_page,
				struct sgx_backing *backing)
{
	struct sgx_encl_page *encl_page = epc_page->owner;
	struct sgx_encl *encl = encl_page->encl;
	struct sgx_backing secs_backing;
	int ret;

	mutex_lock(&encl->lock);

	sgx_encl_ewb(epc_page, backing);
	encl_page->epc_page = NULL;
	encl->secs_child_cnt--;
	sgx_encl_put_backing(backing);

	if (!encl->secs_child_cnt && test_bit(SGX_ENCL_INITIALIZED, &encl->flags)) {
		ret = sgx_encl_alloc_backing(encl, PFN_DOWN(encl->size),
					     &secs_backing);
		if (ret)
			goto out;

		sgx_encl_ewb(encl->secs.epc_page, &secs_backing);

		sgx_encl_free_epc_page(encl->secs.epc_page);
		encl->secs.epc_page = NULL;

		sgx_encl_put_backing(&secs_backing);
	}

out:
	mutex_unlock(&encl->lock);
}

/*
 * Take a fixed number of pages from the head of the active page pool and
 * reclaim them to the enclave's private shmem files. Skip pages that have
 * been accessed since the last scan, and move them to the tail of the active
 * page pool so that the pages get scanned in an LRU-like fashion.
 *
 * Batch-process a chunk of pages (at the moment 16) in order to reduce the
 * number of IPIs and ETRACKs potentially required. sgx_encl_ewb() does spread
 * the cost a bit among the hardware threads with its three-stage EWB pipeline
 * (EWB, ETRACK + EWB and IPI + EWB), but not sufficiently. Reclaiming one page
 * at a time would also be problematic as it would increase lock contention too
 * much, which would halt forward progress.
 */
static void sgx_reclaim_pages(void)
{
	struct sgx_epc_page *chunk[SGX_NR_TO_SCAN];
	struct sgx_backing backing[SGX_NR_TO_SCAN];
	struct sgx_encl_page *encl_page;
	struct sgx_epc_page *epc_page;
	pgoff_t page_index;
	int cnt = 0;
	int ret;
	int i;

	spin_lock(&sgx_reclaimer_lock);
	for (i = 0; i < SGX_NR_TO_SCAN; i++) {
		if (list_empty(&sgx_active_page_list))
			break;

		epc_page = list_first_entry(&sgx_active_page_list,
					    struct sgx_epc_page, list);
		list_del_init(&epc_page->list);
		encl_page = epc_page->owner;

		if (kref_get_unless_zero(&encl_page->encl->refcount) != 0)
			chunk[cnt++] = epc_page;
		else
			/* The owner is freeing the page. No need to add the
			 * page back to the list of reclaimable pages.
			 */
			epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED;
	}
	spin_unlock(&sgx_reclaimer_lock);

	for (i = 0; i < cnt; i++) {
		epc_page = chunk[i];
		encl_page = epc_page->owner;

		if (!sgx_reclaimer_age(epc_page))
			goto skip;

		page_index = PFN_DOWN(encl_page->desc - encl_page->encl->base);

		mutex_lock(&encl_page->encl->lock);
		ret = sgx_encl_alloc_backing(encl_page->encl, page_index, &backing[i]);
		if (ret) {
			mutex_unlock(&encl_page->encl->lock);
			goto skip;
		}

		encl_page->desc |= SGX_ENCL_PAGE_BEING_RECLAIMED;
		mutex_unlock(&encl_page->encl->lock);
		continue;

skip:
		spin_lock(&sgx_reclaimer_lock);
		list_add_tail(&epc_page->list, &sgx_active_page_list);
		spin_unlock(&sgx_reclaimer_lock);

		kref_put(&encl_page->encl->refcount, sgx_encl_release);

		chunk[i] = NULL;
	}

	for (i = 0; i < cnt; i++) {
		epc_page = chunk[i];
		if (epc_page)
			sgx_reclaimer_block(epc_page);
	}

	for (i = 0; i < cnt; i++) {
		epc_page = chunk[i];
		if (!epc_page)
			continue;

		encl_page = epc_page->owner;
		sgx_reclaimer_write(epc_page, &backing[i]);

		kref_put(&encl_page->encl->refcount, sgx_encl_release);
		epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED;

		sgx_free_epc_page(epc_page);
	}
}

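/*
 * Reclaim only when the number of free EPC pages is below @watermark and
 * there is at least one reclaimable page on the active list.
 */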
static bool sgx_should_reclaim(unsigned long watermark)
{
	return atomic_long_read(&sgx_nr_free_pages) < watermark &&
	       !list_empty(&sgx_active_page_list);
}

/*
 * sgx_reclaim_direct() should be called (without enclave's mutex held)
 * in locations where SGX memory resources might be low and might be
 * needed in order to make forward progress.
 */
void sgx_reclaim_direct(void)
{
	if (sgx_should_reclaim(SGX_NR_LOW_PAGES))
		sgx_reclaim_pages();
}

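/*
 * ksgxd sanitizes leftover EPC pages once at start-up and then reclaims pages
 * in the background whenever free EPC memory falls below the high watermark.
 */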
static int ksgxd(void *p)
{
	set_freezable();

	/*
	 * Sanitize pages in order to recover from kexec(). The 2nd pass is
	 * required for SECS pages, whose child pages blocked EREMOVE.
	 */
	__sgx_sanitize_pages(&sgx_dirty_page_list);
	WARN_ON(__sgx_sanitize_pages(&sgx_dirty_page_list));

	while (!kthread_should_stop()) {
		if (try_to_freeze())
			continue;

		wait_event_freezable(ksgxd_waitq,
				     kthread_should_stop() ||
				     sgx_should_reclaim(SGX_NR_HIGH_PAGES));

		if (sgx_should_reclaim(SGX_NR_HIGH_PAGES))
			sgx_reclaim_pages();

		cond_resched();
	}

	return 0;
}

static bool __init sgx_page_reclaimer_init(void)
{
	struct task_struct *tsk;

	tsk = kthread_run(ksgxd, NULL, "ksgxd");
	if (IS_ERR(tsk))
		return false;

	ksgxd_tsk = tsk;

	return true;
}

bool current_is_ksgxd(void)
{
	return current == ksgxd_tsk;
}

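/*
 * Take a free EPC page from the given NUMA node's free list. Returns NULL
 * when the node has no free EPC pages.
 */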
static struct sgx_epc_page *__sgx_alloc_epc_page_from_node(int nid)
{
	struct sgx_numa_node *node = &sgx_numa_nodes[nid];
	struct sgx_epc_page *page = NULL;

	spin_lock(&node->lock);

	if (list_empty(&node->free_page_list)) {
		spin_unlock(&node->lock);
		return NULL;
	}

	page = list_first_entry(&node->free_page_list, struct sgx_epc_page, list);
	list_del_init(&page->list);
	page->flags = 0;

	spin_unlock(&node->lock);
	atomic_long_dec(&sgx_nr_free_pages);

	return page;
}

/**
 * __sgx_alloc_epc_page() - Allocate an EPC page
 *
 * Iterate through the NUMA nodes and reserve a free EPC page for the caller.
 * Start from the NUMA node where the caller is executing.
 *
 * Return:
 * - an EPC page:	A borrowed EPC page was available.
 * - ERR_PTR(-ENOMEM):	Out of EPC pages.
 */
struct sgx_epc_page *__sgx_alloc_epc_page(void)
{
	struct sgx_epc_page *page;
	int nid_of_current = numa_node_id();
	int nid_start, nid;

	/*
	 * Try local node first. If it doesn't have an EPC section,
	 * fall back to the non-local NUMA nodes.
	 */
	if (node_isset(nid_of_current, sgx_numa_mask))
		nid_start = nid_of_current;
	else
		nid_start = next_node_in(nid_of_current, sgx_numa_mask);

	nid = nid_start;
	do {
		page = __sgx_alloc_epc_page_from_node(nid);
		if (page)
			return page;

		nid = next_node_in(nid, sgx_numa_mask);
	} while (nid != nid_start);

	return ERR_PTR(-ENOMEM);
}

/**
 * sgx_mark_page_reclaimable() - Mark a page as reclaimable
 * @page:	EPC page
 *
 * Mark a page as reclaimable and add it to the active page list. Pages
 * are automatically removed from the active list when freed.
 */
void sgx_mark_page_reclaimable(struct sgx_epc_page *page)
{
	spin_lock(&sgx_reclaimer_lock);
	page->flags |= SGX_EPC_PAGE_RECLAIMER_TRACKED;
	list_add_tail(&page->list, &sgx_active_page_list);
	spin_unlock(&sgx_reclaimer_lock);
}

/**
 * sgx_unmark_page_reclaimable() - Remove a page from the reclaim list
 * @page:	EPC page
 *
 * Clear the reclaimable flag and remove the page from the active page list.
 *
 * Return:
 *   0 on success,
 *   -EBUSY if the page is in the process of being reclaimed
 */
int sgx_unmark_page_reclaimable(struct sgx_epc_page *page)
{
	spin_lock(&sgx_reclaimer_lock);
	if (page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED) {
		/* The page is being reclaimed. */
		if (list_empty(&page->list)) {
			spin_unlock(&sgx_reclaimer_lock);
			return -EBUSY;
		}

		list_del(&page->list);
		page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED;
	}
	spin_unlock(&sgx_reclaimer_lock);

	return 0;
}

/**
 * sgx_alloc_epc_page() - Allocate an EPC page
 * @owner:	the owner of the EPC page
 * @reclaim:	reclaim pages if necessary
 *
 * Iterate through the EPC sections and borrow a free EPC page for the caller.
 * When a page is no longer needed it must be released with
 * sgx_free_epc_page(). If @reclaim is set to true, pages are reclaimed
 * directly when we are out of free pages. No mm's can be locked when @reclaim
 * is set to true.
 *
 * Finally, wake up ksgxd when the number of pages goes below the watermark
 * before returning to the caller.
 *
 * Return:
 *   an EPC page,
 *   -errno on error
 */
struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim)
{
	struct sgx_epc_page *page;

	for ( ; ; ) {
		page = __sgx_alloc_epc_page();
		if (!IS_ERR(page)) {
			page->owner = owner;
			break;
		}

		if (list_empty(&sgx_active_page_list))
			return ERR_PTR(-ENOMEM);

		if (!reclaim) {
			page = ERR_PTR(-EBUSY);
			break;
		}

		if (signal_pending(current)) {
			page = ERR_PTR(-ERESTARTSYS);
			break;
		}

		sgx_reclaim_pages();
		cond_resched();
	}

	if (sgx_should_reclaim(SGX_NR_LOW_PAGES))
		wake_up(&ksgxd_waitq);

	return page;
}

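/*
 * Illustrative usage sketch (not an actual call site in this file): a caller
 * that may sleep could allocate with direct reclaim enabled and hand the page
 * back with sgx_free_epc_page() once it is no longer needed (after EREMOVE or
 * equivalent, as required by sgx_free_epc_page()):
 *
 *	epc_page = sgx_alloc_epc_page(encl_page, true);
 *	if (IS_ERR(epc_page))
 *		return PTR_ERR(epc_page);
 *	...
 *	sgx_free_epc_page(epc_page);
 */
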
/**
 * sgx_free_epc_page() - Free an EPC page
 * @page:	an EPC page
 *
 * Put the EPC page back on the list of free pages. It is the caller's
 * responsibility to make sure that the page is in the uninitialized state. In
 * other words, do EREMOVE, EWB or whatever operation is necessary before
 * calling this function.
 */
void sgx_free_epc_page(struct sgx_epc_page *page)
{
	struct sgx_epc_section *section = &sgx_epc_sections[page->section];
	struct sgx_numa_node *node = section->node;

	spin_lock(&node->lock);

	page->owner = NULL;
	if (page->poison)
		list_add(&page->list, &node->sgx_poison_page_list);
	else
		list_add_tail(&page->list, &node->free_page_list);
	page->flags = SGX_EPC_PAGE_IS_FREE;

	spin_unlock(&node->lock);
	atomic_long_inc(&sgx_nr_free_pages);
}

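/*
 * Map an EPC section into the kernel address space, allocate its page array,
 * record its physical range in sgx_epc_address_space, and queue every page on
 * sgx_dirty_page_list for sanitization by ksgxd.
 */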
static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size,
					 unsigned long index,
					 struct sgx_epc_section *section)
{
	unsigned long nr_pages = size >> PAGE_SHIFT;
	unsigned long i;

	section->virt_addr = memremap(phys_addr, size, MEMREMAP_WB);
	if (!section->virt_addr)
		return false;

	section->pages = vmalloc_array(nr_pages, sizeof(struct sgx_epc_page));
	if (!section->pages) {
		memunmap(section->virt_addr);
		return false;
	}

	section->phys_addr = phys_addr;
	xa_store_range(&sgx_epc_address_space, section->phys_addr,
		       phys_addr + size - 1, section, GFP_KERNEL);

	for (i = 0; i < nr_pages; i++) {
		section->pages[i].section = index;
		section->pages[i].flags = 0;
		section->pages[i].owner = NULL;
		section->pages[i].poison = 0;
		list_add_tail(&section->pages[i].list, &sgx_dirty_page_list);
	}

	return true;
}

bool arch_is_platform_page(u64 paddr)
{
	return !!xa_load(&sgx_epc_address_space, paddr);
}
EXPORT_SYMBOL_GPL(arch_is_platform_page);

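/* Translate a physical address into the sgx_epc_page that tracks it. */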
static struct sgx_epc_page *sgx_paddr_to_page(u64 paddr)
{
	struct sgx_epc_section *section;

	section = xa_load(&sgx_epc_address_space, paddr);
	if (!section)
		return NULL;

	return &section->pages[PFN_DOWN(paddr - section->phys_addr)];
}

/*
 * Called in process context to handle a hardware reported
 * error in an SGX EPC page.
 * If the MF_ACTION_REQUIRED bit is set in flags, then the
 * context is the task that consumed the poison data. Otherwise
 * this is called from a kernel thread unrelated to the page.
 */
int arch_memory_failure(unsigned long pfn, int flags)
{
	struct sgx_epc_page *page = sgx_paddr_to_page(pfn << PAGE_SHIFT);
	struct sgx_epc_section *section;
	struct sgx_numa_node *node;

	/*
	 * mm/memory-failure.c calls this routine for all errors
	 * where there isn't a "struct page" for the address. But that
	 * includes other address ranges besides SGX.
	 */
	if (!page)
		return -ENXIO;

	/*
	 * If the poison was consumed synchronously, send a SIGBUS to
	 * the task. Hardware has already exited the SGX enclave and
	 * will not allow re-entry to an enclave that has a memory
	 * error. The signal may help the task understand why the
	 * enclave is broken.
	 */
	if (flags & MF_ACTION_REQUIRED)
		force_sig(SIGBUS);

	section = &sgx_epc_sections[page->section];
	node = section->node;

	spin_lock(&node->lock);

	/* Already poisoned? Nothing more to do */
	if (page->poison)
		goto out;

	page->poison = 1;

	/*
	 * If the page is on a free list, move it to the per-node
	 * poison page list.
	 */
	if (page->flags & SGX_EPC_PAGE_IS_FREE) {
		list_move(&page->list, &node->sgx_poison_page_list);
		goto out;
	}

	sgx_unmark_page_reclaimable(page);

	/*
	 * TBD: Add additional plumbing to enable pre-emptive
	 * action for asynchronous poison notification. Until
	 * then just hope that the poison:
	 * a) is not accessed - sgx_free_epc_page() will deal with it
	 *    when the user gives it back
	 * b) results in a recoverable machine check rather than
	 *    a fatal one
	 */
out:
	spin_unlock(&node->lock);
	return 0;
}

/*
 * A section metric is concatenated from two CPUID register values: bits 12-31
 * of @low define bits 12-31 of the metric, and bits 0-19 of @high define bits
 * 32-51 of the metric.
 */
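/*
 * For example (illustrative values, not taken from real hardware): with
 * low = 0x80001000 and high = 0x1 the metric is
 * (0x80001000 & GENMASK_ULL(31, 12)) + (0x1 << 32) = 0x180001000.
 */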
static inline u64 __init sgx_calc_section_metric(u64 low, u64 high)
{
	return (low & GENMASK_ULL(31, 12)) +
	       ((high & GENMASK_ULL(19, 0)) << 32);
}

#ifdef CONFIG_NUMA
static ssize_t sgx_total_bytes_show(struct device *dev, struct device_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%lu\n", sgx_numa_nodes[dev->id].size);
}
static DEVICE_ATTR_RO(sgx_total_bytes);

static umode_t arch_node_attr_is_visible(struct kobject *kobj,
					 struct attribute *attr, int idx)
{
	/* Make all x86/ attributes invisible when SGX is not initialized: */
	if (nodes_empty(sgx_numa_mask))
		return 0;

	return attr->mode;
}

static struct attribute *arch_node_dev_attrs[] = {
	&dev_attr_sgx_total_bytes.attr,
	NULL,
};

const struct attribute_group arch_node_dev_group = {
	.name = "x86",
	.attrs = arch_node_dev_attrs,
	.is_visible = arch_node_attr_is_visible,
};

static void __init arch_update_sysfs_visibility(int nid)
{
	struct node *node = node_devices[nid];
	int ret;

	ret = sysfs_update_group(&node->dev.kobj, &arch_node_dev_group);

	if (ret)
		pr_err("sysfs update failed (%d), files may be invisible", ret);
}
#else /* !CONFIG_NUMA */
static void __init arch_update_sysfs_visibility(int nid) {}
#endif

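/*
 * Enumerate the EPC sections from the SGX CPUID leaf, map each one, and bind
 * it to the NUMA node that backs its physical address range.
 */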
static bool __init sgx_page_cache_init(void)
{
	u32 eax, ebx, ecx, edx, type;
	u64 pa, size;
	int nid;
	int i;

	sgx_numa_nodes = kmalloc_array(num_possible_nodes(), sizeof(*sgx_numa_nodes), GFP_KERNEL);
	if (!sgx_numa_nodes)
		return false;

	for (i = 0; i < ARRAY_SIZE(sgx_epc_sections); i++) {
		cpuid_count(SGX_CPUID, i + SGX_CPUID_EPC, &eax, &ebx, &ecx, &edx);

		type = eax & SGX_CPUID_EPC_MASK;
		if (type == SGX_CPUID_EPC_INVALID)
			break;

		if (type != SGX_CPUID_EPC_SECTION) {
			pr_err_once("Unknown EPC section type: %u\n", type);
			break;
		}

		pa = sgx_calc_section_metric(eax, ebx);
		size = sgx_calc_section_metric(ecx, edx);

		pr_info("EPC section 0x%llx-0x%llx\n", pa, pa + size - 1);

		if (!sgx_setup_epc_section(pa, size, i, &sgx_epc_sections[i])) {
			pr_err("No free memory for an EPC section\n");
			break;
		}

		nid = numa_map_to_online_node(phys_to_target_node(pa));
		if (nid == NUMA_NO_NODE) {
			/* The physical address is already printed above. */
			pr_warn(FW_BUG "Unable to map EPC section to online node. Fallback to the NUMA node 0.\n");
			nid = 0;
		}

		if (!node_isset(nid, sgx_numa_mask)) {
			spin_lock_init(&sgx_numa_nodes[nid].lock);
			INIT_LIST_HEAD(&sgx_numa_nodes[nid].free_page_list);
			INIT_LIST_HEAD(&sgx_numa_nodes[nid].sgx_poison_page_list);
			node_set(nid, sgx_numa_mask);
			sgx_numa_nodes[nid].size = 0;

			/* Make SGX-specific node sysfs files visible: */
			arch_update_sysfs_visibility(nid);
		}

		sgx_epc_sections[i].node = &sgx_numa_nodes[nid];
		sgx_numa_nodes[nid].size += size;

		sgx_nr_epc_sections++;
	}

	if (!sgx_nr_epc_sections) {
		pr_err("There are zero EPC sections.\n");
		return false;
	}

	for_each_online_node(nid) {
		if (!node_isset(nid, sgx_numa_mask) &&
		    node_state(nid, N_MEMORY) && node_state(nid, N_CPU))
			pr_info("node%d has both CPUs and memory but doesn't have an EPC section\n",
				nid);
	}

	return true;
}

/*
 * Update the SGX_LEPUBKEYHASH MSRs to the values specified by the caller.
 * The bare-metal driver must update them to the hash of the enclave's signer
 * before EINIT. KVM needs to update them to the guest's virtual MSR values
 * before doing EINIT for the guest.
 */
void sgx_update_lepubkeyhash(u64 *lepubkeyhash)
{
	int i;

	WARN_ON_ONCE(preemptible());

	for (i = 0; i < 4; i++)
		wrmsrq(MSR_IA32_SGXLEPUBKEYHASH0 + i, lepubkeyhash[i]);
}

const struct file_operations sgx_provision_fops = {
	.owner = THIS_MODULE,
};

static struct miscdevice sgx_dev_provision = {
	.minor = MISC_DYNAMIC_MINOR,
	.name = "sgx_provision",
	.nodename = "sgx_provision",
	.fops = &sgx_provision_fops,
};

/**
 * sgx_set_attribute() - Update allowed attributes given a file descriptor
 * @allowed_attributes:	Pointer to allowed enclave attributes
 * @attribute_fd:	File descriptor for the specific attribute
 *
 * Append the enclave attribute indicated by the file descriptor to the allowed
 * attributes. Currently only SGX_ATTR_PROVISIONKEY, indicated by
 * /dev/sgx_provision, is supported.
 *
 * Return:
 * 0:		SGX_ATTR_PROVISIONKEY is appended to allowed_attributes
 * -EINVAL:	Invalid, or unsupported file descriptor
 */
int sgx_set_attribute(unsigned long *allowed_attributes,
		      unsigned int attribute_fd)
{
	CLASS(fd, f)(attribute_fd);

	if (fd_empty(f))
		return -EINVAL;

	if (fd_file(f)->f_op != &sgx_provision_fops)
		return -EINVAL;

	*allowed_attributes |= SGX_ATTR_PROVISIONKEY;
	return 0;
}
EXPORT_SYMBOL_GPL(sgx_set_attribute);

static int __init sgx_init(void)
{
	int ret;
	int i;

	if (!cpu_feature_enabled(X86_FEATURE_SGX))
		return -ENODEV;

	if (!sgx_page_cache_init())
		return -ENOMEM;

	if (!sgx_page_reclaimer_init()) {
		ret = -ENOMEM;
		goto err_page_cache;
	}

	ret = misc_register(&sgx_dev_provision);
	if (ret)
		goto err_kthread;

	/*
	 * Always try to initialize the native *and* KVM drivers.
	 * The KVM driver is less picky than the native one and
	 * can function if the native one is not supported on the
	 * current system or fails to initialize.
	 *
	 * Error out only if both fail to initialize.
	 */
	ret = sgx_drv_init();

	if (sgx_vepc_init() && ret)
		goto err_provision;

	return 0;

err_provision:
	misc_deregister(&sgx_dev_provision);

err_kthread:
	kthread_stop(ksgxd_tsk);

err_page_cache:
	for (i = 0; i < sgx_nr_epc_sections; i++) {
		vfree(sgx_epc_sections[i].pages);
		memunmap(sgx_epc_sections[i].virt_addr);
	}

	return ret;
}

device_initcall(sgx_init);