1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Device driver to expose SGX enclave memory to KVM guests.
4 *
5 * Copyright(c) 2021 Intel Corporation.
6 */
7
8 #include <linux/kvm_types.h>
9 #include <linux/miscdevice.h>
10 #include <linux/mm.h>
11 #include <linux/mman.h>
12 #include <linux/sched/mm.h>
13 #include <linux/sched/signal.h>
14 #include <linux/slab.h>
15 #include <linux/xarray.h>
16 #include <asm/sgx.h>
17 #include <uapi/asm/sgx.h>
18
19 #include "encls.h"
20 #include "sgx.h"
21
/*
 * State of one virtual EPC instance.  One instance is allocated per open()
 * of the device and stored in file->private_data.
 */
struct sgx_vepc {
	/*
	 * EPC pages allocated to this instance, keyed by the page index
	 * computed in the fault handler (vma->vm_pgoff plus the page offset
	 * of the faulting address within the VMA).
	 */
	struct xarray page_array;
	/* Taken by sgx_vepc_fault() around __sgx_vepc_fault(). */
	struct mutex lock;
};
26
/*
 * SECS pages that could not be EREMOVE'd when their virtual EPC instance was
 * released because they still have children in other virtual EPC instances,
 * and the lock that protects the list.  Both are initialised in
 * sgx_vepc_init(); the list is re-scanned on every sgx_vepc_release().
 */
static struct mutex zombie_secs_pages_lock;
static struct list_head zombie_secs_pages;
33
__sgx_vepc_fault(struct sgx_vepc * vepc,struct vm_area_struct * vma,unsigned long addr)34 static int __sgx_vepc_fault(struct sgx_vepc *vepc,
35 struct vm_area_struct *vma, unsigned long addr)
36 {
37 struct sgx_epc_page *epc_page;
38 unsigned long index, pfn;
39 int ret;
40
41 WARN_ON(!mutex_is_locked(&vepc->lock));
42
43 /* Calculate index of EPC page in virtual EPC's page_array */
44 index = vma->vm_pgoff + PFN_DOWN(addr - vma->vm_start);
45
46 epc_page = xa_load(&vepc->page_array, index);
47 if (epc_page)
48 return 0;
49
50 epc_page = sgx_alloc_epc_page(vepc, false);
51 if (IS_ERR(epc_page))
52 return PTR_ERR(epc_page);
53
54 ret = xa_err(xa_store(&vepc->page_array, index, epc_page, GFP_KERNEL));
55 if (ret)
56 goto err_free;
57
58 pfn = PFN_DOWN(sgx_get_epc_phys_addr(epc_page));
59
60 ret = vmf_insert_pfn(vma, addr, pfn);
61 if (ret != VM_FAULT_NOPAGE) {
62 ret = -EFAULT;
63 goto err_delete;
64 }
65
66 return 0;
67
68 err_delete:
69 xa_erase(&vepc->page_array, index);
70 err_free:
71 sgx_free_epc_page(epc_page);
72 return ret;
73 }
74
sgx_vepc_fault(struct vm_fault * vmf)75 static vm_fault_t sgx_vepc_fault(struct vm_fault *vmf)
76 {
77 struct vm_area_struct *vma = vmf->vma;
78 struct sgx_vepc *vepc = vma->vm_private_data;
79 int ret;
80
81 mutex_lock(&vepc->lock);
82 ret = __sgx_vepc_fault(vepc, vma, vmf->address);
83 mutex_unlock(&vepc->lock);
84
85 if (!ret)
86 return VM_FAULT_NOPAGE;
87
88 if (ret == -EBUSY && (vmf->flags & FAULT_FLAG_ALLOW_RETRY)) {
89 mmap_read_unlock(vma->vm_mm);
90 return VM_FAULT_RETRY;
91 }
92
93 return VM_FAULT_SIGBUS;
94 }
95
/*
 * VMA operations installed by sgx_vepc_mmap(); only a fault handler is
 * provided, so pages are populated on first access.
 */
static const struct vm_operations_struct sgx_vepc_vm_ops = {
	.fault = sgx_vepc_fault,
};
99
sgx_vepc_mmap(struct file * file,struct vm_area_struct * vma)100 static int sgx_vepc_mmap(struct file *file, struct vm_area_struct *vma)
101 {
102 struct sgx_vepc *vepc = file->private_data;
103
104 if (!(vma->vm_flags & VM_SHARED))
105 return -EINVAL;
106
107 vma->vm_ops = &sgx_vepc_vm_ops;
108 /* Don't copy VMA in fork() */
109 vm_flags_set(vma, VM_PFNMAP | VM_IO | VM_DONTDUMP | VM_DONTCOPY);
110 vma->vm_private_data = vepc;
111
112 return 0;
113 }
114
/*
 * Run EREMOVE on a previously guest-owned EPC page so that it can be
 * returned to the general EPC page pool.
 *
 * A guest cannot be trusted to have left the page in a good state, so
 * EREMOVE is executed unconditionally.  If the guest already EREMOVE'd the
 * page properly, the extra EREMOVE is harmless.
 *
 * Return: the result of __eremove() for the page.
 */
static int sgx_vepc_remove_page(struct sgx_epc_page *epc_page)
{
	void *page_va = sgx_get_epc_virt_addr(epc_page);

	return __eremove(page_va);
}
128
sgx_vepc_free_page(struct sgx_epc_page * epc_page)129 static int sgx_vepc_free_page(struct sgx_epc_page *epc_page)
130 {
131 int ret = sgx_vepc_remove_page(epc_page);
132 if (ret) {
133 /*
134 * Only SGX_CHILD_PRESENT is expected, which is because of
135 * EREMOVE'ing an SECS still with child, in which case it can
136 * be handled by EREMOVE'ing the SECS again after all pages in
137 * virtual EPC have been EREMOVE'd. See comments in below in
138 * sgx_vepc_release().
139 *
140 * The user of virtual EPC (KVM) needs to guarantee there's no
141 * logical processor is still running in the enclave in guest,
142 * otherwise EREMOVE will get SGX_ENCLAVE_ACT which cannot be
143 * handled here.
144 */
145 WARN_ONCE(ret != SGX_CHILD_PRESENT, EREMOVE_ERROR_MESSAGE,
146 ret, ret);
147 return ret;
148 }
149
150 sgx_free_epc_page(epc_page);
151 return 0;
152 }
153
sgx_vepc_remove_all(struct sgx_vepc * vepc)154 static long sgx_vepc_remove_all(struct sgx_vepc *vepc)
155 {
156 struct sgx_epc_page *entry;
157 unsigned long index;
158 long failures = 0;
159
160 xa_for_each(&vepc->page_array, index, entry) {
161 int ret = sgx_vepc_remove_page(entry);
162 if (ret) {
163 if (ret == SGX_CHILD_PRESENT) {
164 /* The page is a SECS, userspace will retry. */
165 failures++;
166 } else {
167 /*
168 * Report errors due to #GP or SGX_ENCLAVE_ACT; do not
169 * WARN, as userspace can induce said failures by
170 * calling the ioctl concurrently on multiple vEPCs or
171 * while one or more CPUs is running the enclave. Only
172 * a #PF on EREMOVE indicates a kernel/hardware issue.
173 */
174 WARN_ON_ONCE(encls_faulted(ret) &&
175 ENCLS_TRAPNR(ret) != X86_TRAP_GP);
176 return -EBUSY;
177 }
178 }
179 cond_resched();
180 }
181
182 /*
183 * Return the number of SECS pages that failed to be removed, so
184 * userspace knows that it has to retry.
185 */
186 return failures;
187 }
188
sgx_vepc_release(struct inode * inode,struct file * file)189 static int sgx_vepc_release(struct inode *inode, struct file *file)
190 {
191 struct sgx_vepc *vepc = file->private_data;
192 struct sgx_epc_page *epc_page, *tmp, *entry;
193 unsigned long index;
194
195 LIST_HEAD(secs_pages);
196
197 xa_for_each(&vepc->page_array, index, entry) {
198 /*
199 * Remove all normal, child pages. sgx_vepc_free_page()
200 * will fail if EREMOVE fails, but this is OK and expected on
201 * SECS pages. Those can only be EREMOVE'd *after* all their
202 * child pages. Retries below will clean them up.
203 */
204 if (sgx_vepc_free_page(entry))
205 continue;
206
207 xa_erase(&vepc->page_array, index);
208 cond_resched();
209 }
210
211 /*
212 * Retry EREMOVE'ing pages. This will clean up any SECS pages that
213 * only had children in this 'epc' area.
214 */
215 xa_for_each(&vepc->page_array, index, entry) {
216 epc_page = entry;
217 /*
218 * An EREMOVE failure here means that the SECS page still
219 * has children. But, since all children in this 'sgx_vepc'
220 * have been removed, the SECS page must have a child on
221 * another instance.
222 */
223 if (sgx_vepc_free_page(epc_page))
224 list_add_tail(&epc_page->list, &secs_pages);
225
226 xa_erase(&vepc->page_array, index);
227 cond_resched();
228 }
229
230 /*
231 * SECS pages are "pinned" by child pages, and "unpinned" once all
232 * children have been EREMOVE'd. A child page in this instance
233 * may have pinned an SECS page encountered in an earlier release(),
234 * creating a zombie. Since some children were EREMOVE'd above,
235 * try to EREMOVE all zombies in the hopes that one was unpinned.
236 */
237 mutex_lock(&zombie_secs_pages_lock);
238 list_for_each_entry_safe(epc_page, tmp, &zombie_secs_pages, list) {
239 /*
240 * Speculatively remove the page from the list of zombies,
241 * if the page is successfully EREMOVE'd it will be added to
242 * the list of free pages. If EREMOVE fails, throw the page
243 * on the local list, which will be spliced on at the end.
244 */
245 list_del(&epc_page->list);
246
247 if (sgx_vepc_free_page(epc_page))
248 list_add_tail(&epc_page->list, &secs_pages);
249 cond_resched();
250 }
251
252 if (!list_empty(&secs_pages))
253 list_splice_tail(&secs_pages, &zombie_secs_pages);
254 mutex_unlock(&zombie_secs_pages_lock);
255
256 xa_destroy(&vepc->page_array);
257 kfree(vepc);
258
259 sgx_dec_usage_count();
260 return 0;
261 }
262
__sgx_vepc_open(struct inode * inode,struct file * file)263 static int __sgx_vepc_open(struct inode *inode, struct file *file)
264 {
265 struct sgx_vepc *vepc;
266
267 vepc = kzalloc_obj(struct sgx_vepc);
268 if (!vepc)
269 return -ENOMEM;
270 mutex_init(&vepc->lock);
271 xa_init(&vepc->page_array);
272
273 file->private_data = vepc;
274
275 return 0;
276 }
277
/*
 * open() handler: take an SGX usage count, then create the instance.
 *
 * The usage count is dropped again if creating the instance fails; on
 * success it is held until sgx_vepc_release().
 *
 * Return: 0 on success, otherwise the error from sgx_inc_usage_count() or
 * __sgx_vepc_open().
 */
static int sgx_vepc_open(struct inode *inode, struct file *file)
{
	int err = sgx_inc_usage_count();

	if (err)
		return err;

	err = __sgx_vepc_open(inode, file);
	if (err)
		sgx_dec_usage_count();

	return err;
}
294
sgx_vepc_ioctl(struct file * file,unsigned int cmd,unsigned long arg)295 static long sgx_vepc_ioctl(struct file *file,
296 unsigned int cmd, unsigned long arg)
297 {
298 struct sgx_vepc *vepc = file->private_data;
299
300 switch (cmd) {
301 case SGX_IOC_VEPC_REMOVE_ALL:
302 if (arg)
303 return -EINVAL;
304 return sgx_vepc_remove_all(vepc);
305
306 default:
307 return -ENOTTY;
308 }
309 }
310
/*
 * File operations of the virtual EPC device.  The native and the compat
 * ioctl entry points share the same handler.
 */
static const struct file_operations sgx_vepc_fops = {
	.owner = THIS_MODULE,
	.open = sgx_vepc_open,
	.unlocked_ioctl = sgx_vepc_ioctl,
	.compat_ioctl = sgx_vepc_ioctl,
	.release = sgx_vepc_release,
	.mmap = sgx_vepc_mmap,
};
319
/*
 * Misc device for virtual EPC, registered by sgx_vepc_init().  The minor
 * number is not fixed (MISC_DYNAMIC_MINOR).
 */
static struct miscdevice sgx_vepc_dev = {
	.minor = MISC_DYNAMIC_MINOR,
	.name = "sgx_vepc",
	.nodename = "sgx_vepc",
	.fops = &sgx_vepc_fops,
};
326
sgx_vepc_init(void)327 int __init sgx_vepc_init(void)
328 {
329 /* SGX virtualization requires KVM to work */
330 if (!cpu_feature_enabled(X86_FEATURE_VMX))
331 return -ENODEV;
332
333 INIT_LIST_HEAD(&zombie_secs_pages);
334 mutex_init(&zombie_secs_pages_lock);
335
336 return misc_register(&sgx_vepc_dev);
337 }
338
339 /**
340 * sgx_virt_ecreate() - Run ECREATE on behalf of guest
341 * @pageinfo: Pointer to PAGEINFO structure
342 * @secs: Userspace pointer to SECS page
343 * @trapnr: trap number injected to guest in case of ECREATE error
344 *
345 * Run ECREATE on behalf of guest after KVM traps ECREATE for the purpose
346 * of enforcing policies of guest's enclaves, and return the trap number
347 * which should be injected to guest in case of any ECREATE error.
348 *
349 * Return:
350 * - 0: ECREATE was successful.
351 * - <0: on error.
352 */
sgx_virt_ecreate(struct sgx_pageinfo * pageinfo,void __user * secs,int * trapnr)353 int sgx_virt_ecreate(struct sgx_pageinfo *pageinfo, void __user *secs,
354 int *trapnr)
355 {
356 int ret;
357
358 /*
359 * @secs is an untrusted, userspace-provided address. It comes from
360 * KVM and is assumed to be a valid pointer which points somewhere in
361 * userspace. This can fault and call SGX or other fault handlers when
362 * userspace mapping @secs doesn't exist.
363 *
364 * Add a WARN() to make sure @secs is already valid userspace pointer
365 * from caller (KVM), who should already have handled invalid pointer
366 * case (for instance, made by malicious guest). All other checks,
367 * such as alignment of @secs, are deferred to ENCLS itself.
368 */
369 if (WARN_ON_ONCE(!access_ok(secs, PAGE_SIZE)))
370 return -EINVAL;
371
372 __uaccess_begin();
373 ret = __ecreate(pageinfo, (void *)secs);
374 __uaccess_end();
375
376 if (encls_faulted(ret)) {
377 *trapnr = ENCLS_TRAPNR(ret);
378 return -EFAULT;
379 }
380
381 /* ECREATE doesn't return an error code, it faults or succeeds. */
382 WARN_ON_ONCE(ret);
383 return 0;
384 }
385 EXPORT_SYMBOL_FOR_KVM(sgx_virt_ecreate);
386
/*
 * Execute EINIT on three userspace pointers supplied by KVM.
 *
 * Return: -EINVAL (with a one-time WARN) if any pointer fails access_ok(),
 * otherwise the raw result of __einit(); decoding a fault is left to the
 * caller.
 */
static int __sgx_virt_einit(void __user *sigstruct, void __user *token,
			    void __user *secs)
{
	int ret;

	/*
	 * Make sure all userspace pointers from caller (KVM) are valid.
	 * All other checks deferred to ENCLS itself.  Also see comment
	 * for @secs in sgx_virt_ecreate().
	 */
/* Size in bytes used to range-check @token. */
#define SGX_EINITTOKEN_SIZE	304
	if (WARN_ON_ONCE(!access_ok(sigstruct, sizeof(struct sgx_sigstruct)) ||
			 !access_ok(token, SGX_EINITTOKEN_SIZE) ||
			 !access_ok(secs, PAGE_SIZE)))
		return -EINVAL;

	/* The ENCLS leaf accesses the user pointers directly. */
	__uaccess_begin();
	ret = __einit((void *)sigstruct, (void *)token, (void *)secs);
	__uaccess_end();

	return ret;
}
409
410 /**
411 * sgx_virt_einit() - Run EINIT on behalf of guest
412 * @sigstruct: Userspace pointer to SIGSTRUCT structure
413 * @token: Userspace pointer to EINITTOKEN structure
414 * @secs: Userspace pointer to SECS page
415 * @lepubkeyhash: Pointer to guest's *virtual* SGX_LEPUBKEYHASH MSR values
416 * @trapnr: trap number injected to guest in case of EINIT error
417 *
418 * Run EINIT on behalf of guest after KVM traps EINIT. If SGX_LC is available
419 * in host, SGX driver may rewrite the hardware values at wish, therefore KVM
420 * needs to update hardware values to guest's virtual MSR values in order to
421 * ensure EINIT is executed with expected hardware values.
422 *
423 * Return:
424 * - 0: EINIT was successful.
425 * - <0: on error.
426 */
sgx_virt_einit(void __user * sigstruct,void __user * token,void __user * secs,u64 * lepubkeyhash,int * trapnr)427 int sgx_virt_einit(void __user *sigstruct, void __user *token,
428 void __user *secs, u64 *lepubkeyhash, int *trapnr)
429 {
430 int ret;
431
432 if (!cpu_feature_enabled(X86_FEATURE_SGX_LC)) {
433 ret = __sgx_virt_einit(sigstruct, token, secs);
434 } else {
435 preempt_disable();
436
437 sgx_update_lepubkeyhash(lepubkeyhash);
438
439 ret = __sgx_virt_einit(sigstruct, token, secs);
440 preempt_enable();
441 }
442
443 /* Propagate up the error from the WARN_ON_ONCE in __sgx_virt_einit() */
444 if (ret == -EINVAL)
445 return ret;
446
447 if (encls_faulted(ret)) {
448 *trapnr = ENCLS_TRAPNR(ret);
449 return -EFAULT;
450 }
451
452 return ret;
453 }
454 EXPORT_SYMBOL_FOR_KVM(sgx_virt_einit);
455