xref: /linux/arch/x86/kernel/cpu/sgx/virt.c (revision 7fc2cd2e4b398c57c9cf961cfea05eadbf34c05c)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Device driver to expose SGX enclave memory to KVM guests.
4  *
5  * Copyright(c) 2021 Intel Corporation.
6  */
7 
8 #include <linux/kvm_types.h>
9 #include <linux/miscdevice.h>
10 #include <linux/mm.h>
11 #include <linux/mman.h>
12 #include <linux/sched/mm.h>
13 #include <linux/sched/signal.h>
14 #include <linux/slab.h>
15 #include <linux/xarray.h>
16 #include <asm/sgx.h>
17 #include <uapi/asm/sgx.h>
18 
19 #include "encls.h"
20 #include "sgx.h"
21 
/*
 * Per-open state of one virtual EPC instance.  Allocated in
 * __sgx_vepc_open(), stored in file->private_data and freed in
 * sgx_vepc_release().
 */
struct sgx_vepc {
	/*
	 * EPC pages backing this instance, indexed by page offset
	 * (vma->vm_pgoff plus the page offset of the address within the VMA).
	 */
	struct xarray page_array;
	/* Held by sgx_vepc_fault() while a page is looked up or installed. */
	struct mutex lock;
};
26 
/*
 * SECS pages that could not be EREMOVE'd when their virtual EPC instance was
 * released because they still have child pages in other virtual EPC
 * instances, and the lock that protects the list.  Both are initialized in
 * sgx_vepc_init(); sgx_vepc_release() retries EREMOVE on every entry and
 * appends any SECS pages it could not free itself.
 */
static struct mutex zombie_secs_pages_lock;
static struct list_head zombie_secs_pages;
33 
34 static int __sgx_vepc_fault(struct sgx_vepc *vepc,
35 			    struct vm_area_struct *vma, unsigned long addr)
36 {
37 	struct sgx_epc_page *epc_page;
38 	unsigned long index, pfn;
39 	int ret;
40 
41 	WARN_ON(!mutex_is_locked(&vepc->lock));
42 
43 	/* Calculate index of EPC page in virtual EPC's page_array */
44 	index = vma->vm_pgoff + PFN_DOWN(addr - vma->vm_start);
45 
46 	epc_page = xa_load(&vepc->page_array, index);
47 	if (epc_page)
48 		return 0;
49 
50 	epc_page = sgx_alloc_epc_page(vepc, false);
51 	if (IS_ERR(epc_page))
52 		return PTR_ERR(epc_page);
53 
54 	ret = xa_err(xa_store(&vepc->page_array, index, epc_page, GFP_KERNEL));
55 	if (ret)
56 		goto err_free;
57 
58 	pfn = PFN_DOWN(sgx_get_epc_phys_addr(epc_page));
59 
60 	ret = vmf_insert_pfn(vma, addr, pfn);
61 	if (ret != VM_FAULT_NOPAGE) {
62 		ret = -EFAULT;
63 		goto err_delete;
64 	}
65 
66 	return 0;
67 
68 err_delete:
69 	xa_erase(&vepc->page_array, index);
70 err_free:
71 	sgx_free_epc_page(epc_page);
72 	return ret;
73 }
74 
75 static vm_fault_t sgx_vepc_fault(struct vm_fault *vmf)
76 {
77 	struct vm_area_struct *vma = vmf->vma;
78 	struct sgx_vepc *vepc = vma->vm_private_data;
79 	int ret;
80 
81 	mutex_lock(&vepc->lock);
82 	ret = __sgx_vepc_fault(vepc, vma, vmf->address);
83 	mutex_unlock(&vepc->lock);
84 
85 	if (!ret)
86 		return VM_FAULT_NOPAGE;
87 
88 	if (ret == -EBUSY && (vmf->flags & FAULT_FLAG_ALLOW_RETRY)) {
89 		mmap_read_unlock(vma->vm_mm);
90 		return VM_FAULT_RETRY;
91 	}
92 
93 	return VM_FAULT_SIGBUS;
94 }
95 
/*
 * VMA operations installed by sgx_vepc_mmap(); only .fault is provided, so
 * pages of a mapping are handled by sgx_vepc_fault() as they are touched.
 */
static const struct vm_operations_struct sgx_vepc_vm_ops = {
	.fault = sgx_vepc_fault,
};
99 
100 static int sgx_vepc_mmap(struct file *file, struct vm_area_struct *vma)
101 {
102 	struct sgx_vepc *vepc = file->private_data;
103 
104 	if (!(vma->vm_flags & VM_SHARED))
105 		return -EINVAL;
106 
107 	vma->vm_ops = &sgx_vepc_vm_ops;
108 	/* Don't copy VMA in fork() */
109 	vm_flags_set(vma, VM_PFNMAP | VM_IO | VM_DONTDUMP | VM_DONTCOPY);
110 	vma->vm_private_data = vepc;
111 
112 	return 0;
113 }
114 
/*
 * Run EREMOVE on a page that was owned by a guest and return the raw
 * __eremove() result.
 *
 * The guest cannot be trusted to have left the page in a clean state, so
 * EREMOVE is issued unconditionally before the page goes back to the general
 * EPC pool; repeating EREMOVE on a page the guest already removed is
 * harmless.
 */
static int sgx_vepc_remove_page(struct sgx_epc_page *epc_page)
{
	void *epc_va = sgx_get_epc_virt_addr(epc_page);

	return __eremove(epc_va);
}
128 
129 static int sgx_vepc_free_page(struct sgx_epc_page *epc_page)
130 {
131 	int ret = sgx_vepc_remove_page(epc_page);
132 	if (ret) {
133 		/*
134 		 * Only SGX_CHILD_PRESENT is expected, which is because of
135 		 * EREMOVE'ing an SECS still with child, in which case it can
136 		 * be handled by EREMOVE'ing the SECS again after all pages in
137 		 * virtual EPC have been EREMOVE'd. See comments in below in
138 		 * sgx_vepc_release().
139 		 *
140 		 * The user of virtual EPC (KVM) needs to guarantee there's no
141 		 * logical processor is still running in the enclave in guest,
142 		 * otherwise EREMOVE will get SGX_ENCLAVE_ACT which cannot be
143 		 * handled here.
144 		 */
145 		WARN_ONCE(ret != SGX_CHILD_PRESENT, EREMOVE_ERROR_MESSAGE,
146 			  ret, ret);
147 		return ret;
148 	}
149 
150 	sgx_free_epc_page(epc_page);
151 	return 0;
152 }
153 
154 static long sgx_vepc_remove_all(struct sgx_vepc *vepc)
155 {
156 	struct sgx_epc_page *entry;
157 	unsigned long index;
158 	long failures = 0;
159 
160 	xa_for_each(&vepc->page_array, index, entry) {
161 		int ret = sgx_vepc_remove_page(entry);
162 		if (ret) {
163 			if (ret == SGX_CHILD_PRESENT) {
164 				/* The page is a SECS, userspace will retry.  */
165 				failures++;
166 			} else {
167 				/*
168 				 * Report errors due to #GP or SGX_ENCLAVE_ACT; do not
169 				 * WARN, as userspace can induce said failures by
170 				 * calling the ioctl concurrently on multiple vEPCs or
171 				 * while one or more CPUs is running the enclave.  Only
172 				 * a #PF on EREMOVE indicates a kernel/hardware issue.
173 				 */
174 				WARN_ON_ONCE(encls_faulted(ret) &&
175 					     ENCLS_TRAPNR(ret) != X86_TRAP_GP);
176 				return -EBUSY;
177 			}
178 		}
179 		cond_resched();
180 	}
181 
182 	/*
183 	 * Return the number of SECS pages that failed to be removed, so
184 	 * userspace knows that it has to retry.
185 	 */
186 	return failures;
187 }
188 
189 static int sgx_vepc_release(struct inode *inode, struct file *file)
190 {
191 	struct sgx_vepc *vepc = file->private_data;
192 	struct sgx_epc_page *epc_page, *tmp, *entry;
193 	unsigned long index;
194 
195 	LIST_HEAD(secs_pages);
196 
197 	xa_for_each(&vepc->page_array, index, entry) {
198 		/*
199 		 * Remove all normal, child pages.  sgx_vepc_free_page()
200 		 * will fail if EREMOVE fails, but this is OK and expected on
201 		 * SECS pages.  Those can only be EREMOVE'd *after* all their
202 		 * child pages. Retries below will clean them up.
203 		 */
204 		if (sgx_vepc_free_page(entry))
205 			continue;
206 
207 		xa_erase(&vepc->page_array, index);
208 		cond_resched();
209 	}
210 
211 	/*
212 	 * Retry EREMOVE'ing pages.  This will clean up any SECS pages that
213 	 * only had children in this 'epc' area.
214 	 */
215 	xa_for_each(&vepc->page_array, index, entry) {
216 		epc_page = entry;
217 		/*
218 		 * An EREMOVE failure here means that the SECS page still
219 		 * has children.  But, since all children in this 'sgx_vepc'
220 		 * have been removed, the SECS page must have a child on
221 		 * another instance.
222 		 */
223 		if (sgx_vepc_free_page(epc_page))
224 			list_add_tail(&epc_page->list, &secs_pages);
225 
226 		xa_erase(&vepc->page_array, index);
227 		cond_resched();
228 	}
229 
230 	/*
231 	 * SECS pages are "pinned" by child pages, and "unpinned" once all
232 	 * children have been EREMOVE'd.  A child page in this instance
233 	 * may have pinned an SECS page encountered in an earlier release(),
234 	 * creating a zombie.  Since some children were EREMOVE'd above,
235 	 * try to EREMOVE all zombies in the hopes that one was unpinned.
236 	 */
237 	mutex_lock(&zombie_secs_pages_lock);
238 	list_for_each_entry_safe(epc_page, tmp, &zombie_secs_pages, list) {
239 		/*
240 		 * Speculatively remove the page from the list of zombies,
241 		 * if the page is successfully EREMOVE'd it will be added to
242 		 * the list of free pages.  If EREMOVE fails, throw the page
243 		 * on the local list, which will be spliced on at the end.
244 		 */
245 		list_del(&epc_page->list);
246 
247 		if (sgx_vepc_free_page(epc_page))
248 			list_add_tail(&epc_page->list, &secs_pages);
249 		cond_resched();
250 	}
251 
252 	if (!list_empty(&secs_pages))
253 		list_splice_tail(&secs_pages, &zombie_secs_pages);
254 	mutex_unlock(&zombie_secs_pages_lock);
255 
256 	xa_destroy(&vepc->page_array);
257 	kfree(vepc);
258 
259 	sgx_dec_usage_count();
260 	return 0;
261 }
262 
263 static int __sgx_vepc_open(struct inode *inode, struct file *file)
264 {
265 	struct sgx_vepc *vepc;
266 
267 	vepc = kzalloc(sizeof(struct sgx_vepc), GFP_KERNEL);
268 	if (!vepc)
269 		return -ENOMEM;
270 	mutex_init(&vepc->lock);
271 	xa_init(&vepc->page_array);
272 
273 	file->private_data = vepc;
274 
275 	return 0;
276 }
277 
/*
 * .open handler: take a usage count, then set up the per-open state.
 *
 * The usage count is dropped again if the setup fails, so a failed open
 * leaves nothing behind.  Returns 0 on success or the error from
 * sgx_inc_usage_count() / __sgx_vepc_open().
 */
static int sgx_vepc_open(struct inode *inode, struct file *file)
{
	int err = sgx_inc_usage_count();

	if (err)
		return err;

	err = __sgx_vepc_open(inode, file);
	if (err)
		sgx_dec_usage_count();

	return err;
}
294 
295 static long sgx_vepc_ioctl(struct file *file,
296 			   unsigned int cmd, unsigned long arg)
297 {
298 	struct sgx_vepc *vepc = file->private_data;
299 
300 	switch (cmd) {
301 	case SGX_IOC_VEPC_REMOVE_ALL:
302 		if (arg)
303 			return -EINVAL;
304 		return sgx_vepc_remove_all(vepc);
305 
306 	default:
307 		return -ENOTTY;
308 	}
309 }
310 
/*
 * File operations of the virtual EPC device.  Native and compat ioctls share
 * one handler.
 */
static const struct file_operations sgx_vepc_fops = {
	.owner		= THIS_MODULE,
	.open		= sgx_vepc_open,
	.unlocked_ioctl	= sgx_vepc_ioctl,
	.compat_ioctl	= sgx_vepc_ioctl,
	.release	= sgx_vepc_release,
	.mmap		= sgx_vepc_mmap,
};
319 
/*
 * Misc device "sgx_vepc" with a dynamically assigned minor; registered by
 * sgx_vepc_init().
 */
static struct miscdevice sgx_vepc_dev = {
	.minor		= MISC_DYNAMIC_MINOR,
	.name		= "sgx_vepc",
	.nodename	= "sgx_vepc",
	.fops		= &sgx_vepc_fops,
};
326 
327 int __init sgx_vepc_init(void)
328 {
329 	/* SGX virtualization requires KVM to work */
330 	if (!cpu_feature_enabled(X86_FEATURE_VMX))
331 		return -ENODEV;
332 
333 	INIT_LIST_HEAD(&zombie_secs_pages);
334 	mutex_init(&zombie_secs_pages_lock);
335 
336 	return misc_register(&sgx_vepc_dev);
337 }
338 
339 /**
340  * sgx_virt_ecreate() - Run ECREATE on behalf of guest
341  * @pageinfo:	Pointer to PAGEINFO structure
342  * @secs:	Userspace pointer to SECS page
343  * @trapnr:	trap number injected to guest in case of ECREATE error
344  *
345  * Run ECREATE on behalf of guest after KVM traps ECREATE for the purpose
346  * of enforcing policies of guest's enclaves, and return the trap number
347  * which should be injected to guest in case of any ECREATE error.
348  *
349  * Return:
350  * -  0:	ECREATE was successful.
351  * - <0:	on error.
352  */
353 int sgx_virt_ecreate(struct sgx_pageinfo *pageinfo, void __user *secs,
354 		     int *trapnr)
355 {
356 	int ret;
357 
358 	/*
359 	 * @secs is an untrusted, userspace-provided address.  It comes from
360 	 * KVM and is assumed to be a valid pointer which points somewhere in
361 	 * userspace.  This can fault and call SGX or other fault handlers when
362 	 * userspace mapping @secs doesn't exist.
363 	 *
364 	 * Add a WARN() to make sure @secs is already valid userspace pointer
365 	 * from caller (KVM), who should already have handled invalid pointer
366 	 * case (for instance, made by malicious guest).  All other checks,
367 	 * such as alignment of @secs, are deferred to ENCLS itself.
368 	 */
369 	if (WARN_ON_ONCE(!access_ok(secs, PAGE_SIZE)))
370 		return -EINVAL;
371 
372 	__uaccess_begin();
373 	ret = __ecreate(pageinfo, (void *)secs);
374 	__uaccess_end();
375 
376 	if (encls_faulted(ret)) {
377 		*trapnr = ENCLS_TRAPNR(ret);
378 		return -EFAULT;
379 	}
380 
381 	/* ECREATE doesn't return an error code, it faults or succeeds. */
382 	WARN_ON_ONCE(ret);
383 	return 0;
384 }
385 EXPORT_SYMBOL_FOR_KVM(sgx_virt_ecreate);
386 
387 static int __sgx_virt_einit(void __user *sigstruct, void __user *token,
388 			    void __user *secs)
389 {
390 	int ret;
391 
392 	/*
393 	 * Make sure all userspace pointers from caller (KVM) are valid.
394 	 * All other checks deferred to ENCLS itself.  Also see comment
395 	 * for @secs in sgx_virt_ecreate().
396 	 */
397 #define SGX_EINITTOKEN_SIZE	304
398 	if (WARN_ON_ONCE(!access_ok(sigstruct, sizeof(struct sgx_sigstruct)) ||
399 			 !access_ok(token, SGX_EINITTOKEN_SIZE) ||
400 			 !access_ok(secs, PAGE_SIZE)))
401 		return -EINVAL;
402 
403 	__uaccess_begin();
404 	ret = __einit((void *)sigstruct, (void *)token, (void *)secs);
405 	__uaccess_end();
406 
407 	return ret;
408 }
409 
410 /**
411  * sgx_virt_einit() - Run EINIT on behalf of guest
412  * @sigstruct:		Userspace pointer to SIGSTRUCT structure
413  * @token:		Userspace pointer to EINITTOKEN structure
414  * @secs:		Userspace pointer to SECS page
415  * @lepubkeyhash:	Pointer to guest's *virtual* SGX_LEPUBKEYHASH MSR values
416  * @trapnr:		trap number injected to guest in case of EINIT error
417  *
418  * Run EINIT on behalf of guest after KVM traps EINIT. If SGX_LC is available
419  * in host, SGX driver may rewrite the hardware values at wish, therefore KVM
420  * needs to update hardware values to guest's virtual MSR values in order to
421  * ensure EINIT is executed with expected hardware values.
422  *
423  * Return:
424  * -  0:	EINIT was successful.
425  * - <0:	on error.
426  */
427 int sgx_virt_einit(void __user *sigstruct, void __user *token,
428 		   void __user *secs, u64 *lepubkeyhash, int *trapnr)
429 {
430 	int ret;
431 
432 	if (!cpu_feature_enabled(X86_FEATURE_SGX_LC)) {
433 		ret = __sgx_virt_einit(sigstruct, token, secs);
434 	} else {
435 		preempt_disable();
436 
437 		sgx_update_lepubkeyhash(lepubkeyhash);
438 
439 		ret = __sgx_virt_einit(sigstruct, token, secs);
440 		preempt_enable();
441 	}
442 
443 	/* Propagate up the error from the WARN_ON_ONCE in __sgx_virt_einit() */
444 	if (ret == -EINVAL)
445 		return ret;
446 
447 	if (encls_faulted(ret)) {
448 		*trapnr = ENCLS_TRAPNR(ret);
449 		return -EFAULT;
450 	}
451 
452 	return ret;
453 }
454 EXPORT_SYMBOL_FOR_KVM(sgx_virt_einit);
455