xref: /linux/arch/x86/kernel/cpu/sgx/virt.c (revision c532de5a67a70f8533d495f8f2aaa9a0491c3ad0)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Device driver to expose SGX enclave memory to KVM guests.
4  *
5  * Copyright(c) 2021 Intel Corporation.
6  */
7 
8 #include <linux/miscdevice.h>
9 #include <linux/mm.h>
10 #include <linux/mman.h>
11 #include <linux/sched/mm.h>
12 #include <linux/sched/signal.h>
13 #include <linux/slab.h>
14 #include <linux/xarray.h>
15 #include <asm/sgx.h>
16 #include <uapi/asm/sgx.h>
17 
18 #include "encls.h"
19 #include "sgx.h"
20 
21 struct sgx_vepc {
22 	struct xarray page_array;
23 	struct mutex lock;
24 };
25 
26 /*
27  * Temporary SECS pages that cannot be EREMOVE'd due to having child in other
28  * virtual EPC instances, and the lock to protect it.
29  */
30 static struct mutex zombie_secs_pages_lock;
31 static struct list_head zombie_secs_pages;
32 
33 static int __sgx_vepc_fault(struct sgx_vepc *vepc,
34 			    struct vm_area_struct *vma, unsigned long addr)
35 {
36 	struct sgx_epc_page *epc_page;
37 	unsigned long index, pfn;
38 	int ret;
39 
40 	WARN_ON(!mutex_is_locked(&vepc->lock));
41 
42 	/* Calculate index of EPC page in virtual EPC's page_array */
43 	index = vma->vm_pgoff + PFN_DOWN(addr - vma->vm_start);
44 
45 	epc_page = xa_load(&vepc->page_array, index);
46 	if (epc_page)
47 		return 0;
48 
49 	epc_page = sgx_alloc_epc_page(vepc, false);
50 	if (IS_ERR(epc_page))
51 		return PTR_ERR(epc_page);
52 
53 	ret = xa_err(xa_store(&vepc->page_array, index, epc_page, GFP_KERNEL));
54 	if (ret)
55 		goto err_free;
56 
57 	pfn = PFN_DOWN(sgx_get_epc_phys_addr(epc_page));
58 
59 	ret = vmf_insert_pfn(vma, addr, pfn);
60 	if (ret != VM_FAULT_NOPAGE) {
61 		ret = -EFAULT;
62 		goto err_delete;
63 	}
64 
65 	return 0;
66 
67 err_delete:
68 	xa_erase(&vepc->page_array, index);
69 err_free:
70 	sgx_free_epc_page(epc_page);
71 	return ret;
72 }
73 
74 static vm_fault_t sgx_vepc_fault(struct vm_fault *vmf)
75 {
76 	struct vm_area_struct *vma = vmf->vma;
77 	struct sgx_vepc *vepc = vma->vm_private_data;
78 	int ret;
79 
80 	mutex_lock(&vepc->lock);
81 	ret = __sgx_vepc_fault(vepc, vma, vmf->address);
82 	mutex_unlock(&vepc->lock);
83 
84 	if (!ret)
85 		return VM_FAULT_NOPAGE;
86 
87 	if (ret == -EBUSY && (vmf->flags & FAULT_FLAG_ALLOW_RETRY)) {
88 		mmap_read_unlock(vma->vm_mm);
89 		return VM_FAULT_RETRY;
90 	}
91 
92 	return VM_FAULT_SIGBUS;
93 }
94 
95 static const struct vm_operations_struct sgx_vepc_vm_ops = {
96 	.fault = sgx_vepc_fault,
97 };
98 
99 static int sgx_vepc_mmap(struct file *file, struct vm_area_struct *vma)
100 {
101 	struct sgx_vepc *vepc = file->private_data;
102 
103 	if (!(vma->vm_flags & VM_SHARED))
104 		return -EINVAL;
105 
106 	vma->vm_ops = &sgx_vepc_vm_ops;
107 	/* Don't copy VMA in fork() */
108 	vm_flags_set(vma, VM_PFNMAP | VM_IO | VM_DONTDUMP | VM_DONTCOPY);
109 	vma->vm_private_data = vepc;
110 
111 	return 0;
112 }
113 
/*
 * EREMOVE a previously guest-owned EPC page so that it can go back to the
 * general EPC page pool.
 *
 * The guest cannot be trusted to have left the page in a good state, so
 * EREMOVE is run unconditionally.  If the guest already EREMOVE'd the page
 * properly, the extra EREMOVE is harmless.
 *
 * Return: the result of __eremove().
 */
static int sgx_vepc_remove_page(struct sgx_epc_page *epc_page)
{
	void *epc_va = sgx_get_epc_virt_addr(epc_page);

	return __eremove(epc_va);
}
127 
128 static int sgx_vepc_free_page(struct sgx_epc_page *epc_page)
129 {
130 	int ret = sgx_vepc_remove_page(epc_page);
131 	if (ret) {
132 		/*
133 		 * Only SGX_CHILD_PRESENT is expected, which is because of
134 		 * EREMOVE'ing an SECS still with child, in which case it can
135 		 * be handled by EREMOVE'ing the SECS again after all pages in
136 		 * virtual EPC have been EREMOVE'd. See comments in below in
137 		 * sgx_vepc_release().
138 		 *
139 		 * The user of virtual EPC (KVM) needs to guarantee there's no
140 		 * logical processor is still running in the enclave in guest,
141 		 * otherwise EREMOVE will get SGX_ENCLAVE_ACT which cannot be
142 		 * handled here.
143 		 */
144 		WARN_ONCE(ret != SGX_CHILD_PRESENT, EREMOVE_ERROR_MESSAGE,
145 			  ret, ret);
146 		return ret;
147 	}
148 
149 	sgx_free_epc_page(epc_page);
150 	return 0;
151 }
152 
153 static long sgx_vepc_remove_all(struct sgx_vepc *vepc)
154 {
155 	struct sgx_epc_page *entry;
156 	unsigned long index;
157 	long failures = 0;
158 
159 	xa_for_each(&vepc->page_array, index, entry) {
160 		int ret = sgx_vepc_remove_page(entry);
161 		if (ret) {
162 			if (ret == SGX_CHILD_PRESENT) {
163 				/* The page is a SECS, userspace will retry.  */
164 				failures++;
165 			} else {
166 				/*
167 				 * Report errors due to #GP or SGX_ENCLAVE_ACT; do not
168 				 * WARN, as userspace can induce said failures by
169 				 * calling the ioctl concurrently on multiple vEPCs or
170 				 * while one or more CPUs is running the enclave.  Only
171 				 * a #PF on EREMOVE indicates a kernel/hardware issue.
172 				 */
173 				WARN_ON_ONCE(encls_faulted(ret) &&
174 					     ENCLS_TRAPNR(ret) != X86_TRAP_GP);
175 				return -EBUSY;
176 			}
177 		}
178 		cond_resched();
179 	}
180 
181 	/*
182 	 * Return the number of SECS pages that failed to be removed, so
183 	 * userspace knows that it has to retry.
184 	 */
185 	return failures;
186 }
187 
188 static int sgx_vepc_release(struct inode *inode, struct file *file)
189 {
190 	struct sgx_vepc *vepc = file->private_data;
191 	struct sgx_epc_page *epc_page, *tmp, *entry;
192 	unsigned long index;
193 
194 	LIST_HEAD(secs_pages);
195 
196 	xa_for_each(&vepc->page_array, index, entry) {
197 		/*
198 		 * Remove all normal, child pages.  sgx_vepc_free_page()
199 		 * will fail if EREMOVE fails, but this is OK and expected on
200 		 * SECS pages.  Those can only be EREMOVE'd *after* all their
201 		 * child pages. Retries below will clean them up.
202 		 */
203 		if (sgx_vepc_free_page(entry))
204 			continue;
205 
206 		xa_erase(&vepc->page_array, index);
207 		cond_resched();
208 	}
209 
210 	/*
211 	 * Retry EREMOVE'ing pages.  This will clean up any SECS pages that
212 	 * only had children in this 'epc' area.
213 	 */
214 	xa_for_each(&vepc->page_array, index, entry) {
215 		epc_page = entry;
216 		/*
217 		 * An EREMOVE failure here means that the SECS page still
218 		 * has children.  But, since all children in this 'sgx_vepc'
219 		 * have been removed, the SECS page must have a child on
220 		 * another instance.
221 		 */
222 		if (sgx_vepc_free_page(epc_page))
223 			list_add_tail(&epc_page->list, &secs_pages);
224 
225 		xa_erase(&vepc->page_array, index);
226 		cond_resched();
227 	}
228 
229 	/*
230 	 * SECS pages are "pinned" by child pages, and "unpinned" once all
231 	 * children have been EREMOVE'd.  A child page in this instance
232 	 * may have pinned an SECS page encountered in an earlier release(),
233 	 * creating a zombie.  Since some children were EREMOVE'd above,
234 	 * try to EREMOVE all zombies in the hopes that one was unpinned.
235 	 */
236 	mutex_lock(&zombie_secs_pages_lock);
237 	list_for_each_entry_safe(epc_page, tmp, &zombie_secs_pages, list) {
238 		/*
239 		 * Speculatively remove the page from the list of zombies,
240 		 * if the page is successfully EREMOVE'd it will be added to
241 		 * the list of free pages.  If EREMOVE fails, throw the page
242 		 * on the local list, which will be spliced on at the end.
243 		 */
244 		list_del(&epc_page->list);
245 
246 		if (sgx_vepc_free_page(epc_page))
247 			list_add_tail(&epc_page->list, &secs_pages);
248 		cond_resched();
249 	}
250 
251 	if (!list_empty(&secs_pages))
252 		list_splice_tail(&secs_pages, &zombie_secs_pages);
253 	mutex_unlock(&zombie_secs_pages_lock);
254 
255 	xa_destroy(&vepc->page_array);
256 	kfree(vepc);
257 
258 	return 0;
259 }
260 
261 static int sgx_vepc_open(struct inode *inode, struct file *file)
262 {
263 	struct sgx_vepc *vepc;
264 
265 	vepc = kzalloc(sizeof(struct sgx_vepc), GFP_KERNEL);
266 	if (!vepc)
267 		return -ENOMEM;
268 	mutex_init(&vepc->lock);
269 	xa_init(&vepc->page_array);
270 
271 	file->private_data = vepc;
272 
273 	return 0;
274 }
275 
276 static long sgx_vepc_ioctl(struct file *file,
277 			   unsigned int cmd, unsigned long arg)
278 {
279 	struct sgx_vepc *vepc = file->private_data;
280 
281 	switch (cmd) {
282 	case SGX_IOC_VEPC_REMOVE_ALL:
283 		if (arg)
284 			return -EINVAL;
285 		return sgx_vepc_remove_all(vepc);
286 
287 	default:
288 		return -ENOTTY;
289 	}
290 }
291 
292 static const struct file_operations sgx_vepc_fops = {
293 	.owner		= THIS_MODULE,
294 	.open		= sgx_vepc_open,
295 	.unlocked_ioctl	= sgx_vepc_ioctl,
296 	.compat_ioctl	= sgx_vepc_ioctl,
297 	.release	= sgx_vepc_release,
298 	.mmap		= sgx_vepc_mmap,
299 };
300 
301 static struct miscdevice sgx_vepc_dev = {
302 	.minor		= MISC_DYNAMIC_MINOR,
303 	.name		= "sgx_vepc",
304 	.nodename	= "sgx_vepc",
305 	.fops		= &sgx_vepc_fops,
306 };
307 
308 int __init sgx_vepc_init(void)
309 {
310 	/* SGX virtualization requires KVM to work */
311 	if (!cpu_feature_enabled(X86_FEATURE_VMX))
312 		return -ENODEV;
313 
314 	INIT_LIST_HEAD(&zombie_secs_pages);
315 	mutex_init(&zombie_secs_pages_lock);
316 
317 	return misc_register(&sgx_vepc_dev);
318 }
319 
320 /**
321  * sgx_virt_ecreate() - Run ECREATE on behalf of guest
322  * @pageinfo:	Pointer to PAGEINFO structure
323  * @secs:	Userspace pointer to SECS page
324  * @trapnr:	trap number injected to guest in case of ECREATE error
325  *
326  * Run ECREATE on behalf of guest after KVM traps ECREATE for the purpose
327  * of enforcing policies of guest's enclaves, and return the trap number
328  * which should be injected to guest in case of any ECREATE error.
329  *
330  * Return:
331  * -  0:	ECREATE was successful.
332  * - <0:	on error.
333  */
334 int sgx_virt_ecreate(struct sgx_pageinfo *pageinfo, void __user *secs,
335 		     int *trapnr)
336 {
337 	int ret;
338 
339 	/*
340 	 * @secs is an untrusted, userspace-provided address.  It comes from
341 	 * KVM and is assumed to be a valid pointer which points somewhere in
342 	 * userspace.  This can fault and call SGX or other fault handlers when
343 	 * userspace mapping @secs doesn't exist.
344 	 *
345 	 * Add a WARN() to make sure @secs is already valid userspace pointer
346 	 * from caller (KVM), who should already have handled invalid pointer
347 	 * case (for instance, made by malicious guest).  All other checks,
348 	 * such as alignment of @secs, are deferred to ENCLS itself.
349 	 */
350 	if (WARN_ON_ONCE(!access_ok(secs, PAGE_SIZE)))
351 		return -EINVAL;
352 
353 	__uaccess_begin();
354 	ret = __ecreate(pageinfo, (void *)secs);
355 	__uaccess_end();
356 
357 	if (encls_faulted(ret)) {
358 		*trapnr = ENCLS_TRAPNR(ret);
359 		return -EFAULT;
360 	}
361 
362 	/* ECREATE doesn't return an error code, it faults or succeeds. */
363 	WARN_ON_ONCE(ret);
364 	return 0;
365 }
366 EXPORT_SYMBOL_GPL(sgx_virt_ecreate);
367 
368 static int __sgx_virt_einit(void __user *sigstruct, void __user *token,
369 			    void __user *secs)
370 {
371 	int ret;
372 
373 	/*
374 	 * Make sure all userspace pointers from caller (KVM) are valid.
375 	 * All other checks deferred to ENCLS itself.  Also see comment
376 	 * for @secs in sgx_virt_ecreate().
377 	 */
378 #define SGX_EINITTOKEN_SIZE	304
379 	if (WARN_ON_ONCE(!access_ok(sigstruct, sizeof(struct sgx_sigstruct)) ||
380 			 !access_ok(token, SGX_EINITTOKEN_SIZE) ||
381 			 !access_ok(secs, PAGE_SIZE)))
382 		return -EINVAL;
383 
384 	__uaccess_begin();
385 	ret = __einit((void *)sigstruct, (void *)token, (void *)secs);
386 	__uaccess_end();
387 
388 	return ret;
389 }
390 
391 /**
392  * sgx_virt_einit() - Run EINIT on behalf of guest
393  * @sigstruct:		Userspace pointer to SIGSTRUCT structure
394  * @token:		Userspace pointer to EINITTOKEN structure
395  * @secs:		Userspace pointer to SECS page
396  * @lepubkeyhash:	Pointer to guest's *virtual* SGX_LEPUBKEYHASH MSR values
397  * @trapnr:		trap number injected to guest in case of EINIT error
398  *
399  * Run EINIT on behalf of guest after KVM traps EINIT. If SGX_LC is available
400  * in host, SGX driver may rewrite the hardware values at wish, therefore KVM
401  * needs to update hardware values to guest's virtual MSR values in order to
402  * ensure EINIT is executed with expected hardware values.
403  *
404  * Return:
405  * -  0:	EINIT was successful.
406  * - <0:	on error.
407  */
408 int sgx_virt_einit(void __user *sigstruct, void __user *token,
409 		   void __user *secs, u64 *lepubkeyhash, int *trapnr)
410 {
411 	int ret;
412 
413 	if (!cpu_feature_enabled(X86_FEATURE_SGX_LC)) {
414 		ret = __sgx_virt_einit(sigstruct, token, secs);
415 	} else {
416 		preempt_disable();
417 
418 		sgx_update_lepubkeyhash(lepubkeyhash);
419 
420 		ret = __sgx_virt_einit(sigstruct, token, secs);
421 		preempt_enable();
422 	}
423 
424 	/* Propagate up the error from the WARN_ON_ONCE in __sgx_virt_einit() */
425 	if (ret == -EINVAL)
426 		return ret;
427 
428 	if (encls_faulted(ret)) {
429 		*trapnr = ENCLS_TRAPNR(ret);
430 		return -EFAULT;
431 	}
432 
433 	return ret;
434 }
435 EXPORT_SYMBOL_GPL(sgx_virt_einit);
436