xref: /linux/drivers/vfio/container.c (revision 19d7df98472851e1d2d11e00c177988d0f49683d)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
4  *
5  * VFIO container (/dev/vfio/vfio)
6  */
7 #include <linux/file.h>
8 #include <linux/slab.h>
9 #include <linux/fs.h>
10 #include <linux/capability.h>
11 #include <linux/iommu.h>
12 #include <linux/miscdevice.h>
13 #include <linux/vfio.h>
14 #include <uapi/linux/vfio.h>
15 
16 #include "vfio.h"
17 
18 struct vfio_container {
19 	struct kref			kref;
20 	struct list_head		group_list;
21 	struct rw_semaphore		group_lock;
22 	struct vfio_iommu_driver	*iommu_driver;
23 	void				*iommu_data;
24 	bool				noiommu;
25 };
26 
27 static struct vfio {
28 	struct list_head		iommu_drivers_list;
29 	struct mutex			iommu_drivers_lock;
30 } vfio;
31 
#ifdef CONFIG_VFIO_NOIOMMU
/*
 * Runtime switch for the no-IOMMU mode, exposed as the module parameter
 * "enable_unsafe_noiommu_mode" (world readable, owner writable).
 */
bool vfio_noiommu __read_mostly;
module_param_named(enable_unsafe_noiommu_mode, vfio_noiommu, bool, 0644);
MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel.  If you do not know what this is for, step away. (default: false)");
#endif
38 
39 static void *vfio_noiommu_open(unsigned long arg)
40 {
41 	if (arg != VFIO_NOIOMMU_IOMMU)
42 		return ERR_PTR(-EINVAL);
43 	if (!capable(CAP_SYS_RAWIO))
44 		return ERR_PTR(-EPERM);
45 
46 	return NULL;
47 }
48 
/* release() callback of the vfio-noiommu backend: nothing to undo. */
static void vfio_noiommu_release(void *iommu_data)
{
	/* vfio_noiommu_open() hands out no private data. */
}
52 
53 static long vfio_noiommu_ioctl(void *iommu_data,
54 			       unsigned int cmd, unsigned long arg)
55 {
56 	if (cmd == VFIO_CHECK_EXTENSION)
57 		return vfio_noiommu && (arg == VFIO_NOIOMMU_IOMMU) ? 1 : 0;
58 
59 	return -ENOTTY;
60 }
61 
62 static int vfio_noiommu_attach_group(void *iommu_data,
63 		struct iommu_group *iommu_group, enum vfio_group_type type)
64 {
65 	return 0;
66 }
67 
/* detach_group() callback of the vfio-noiommu backend: nothing to undo. */
static void vfio_noiommu_detach_group(void *iommu_data,
				      struct iommu_group *iommu_group)
{
	/* vfio_noiommu_attach_group() set nothing up. */
}
72 
73 static const struct vfio_iommu_driver_ops vfio_noiommu_ops = {
74 	.name = "vfio-noiommu",
75 	.owner = THIS_MODULE,
76 	.open = vfio_noiommu_open,
77 	.release = vfio_noiommu_release,
78 	.ioctl = vfio_noiommu_ioctl,
79 	.attach_group = vfio_noiommu_attach_group,
80 	.detach_group = vfio_noiommu_detach_group,
81 };
82 
83 /*
84  * Only noiommu containers can use vfio-noiommu and noiommu containers can only
85  * use vfio-noiommu.
86  */
87 static bool vfio_iommu_driver_allowed(struct vfio_container *container,
88 				      const struct vfio_iommu_driver *driver)
89 {
90 	if (!IS_ENABLED(CONFIG_VFIO_NOIOMMU))
91 		return true;
92 	return container->noiommu == (driver->ops == &vfio_noiommu_ops);
93 }
94 
95 /*
96  * IOMMU driver registration
97  */
98 int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops)
99 {
100 	struct vfio_iommu_driver *driver, *tmp;
101 
102 	if (WARN_ON(!ops->register_device != !ops->unregister_device))
103 		return -EINVAL;
104 
105 	driver = kzalloc(sizeof(*driver), GFP_KERNEL);
106 	if (!driver)
107 		return -ENOMEM;
108 
109 	driver->ops = ops;
110 
111 	mutex_lock(&vfio.iommu_drivers_lock);
112 
113 	/* Check for duplicates */
114 	list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) {
115 		if (tmp->ops == ops) {
116 			mutex_unlock(&vfio.iommu_drivers_lock);
117 			kfree(driver);
118 			return -EINVAL;
119 		}
120 	}
121 
122 	list_add(&driver->vfio_next, &vfio.iommu_drivers_list);
123 
124 	mutex_unlock(&vfio.iommu_drivers_lock);
125 
126 	return 0;
127 }
128 EXPORT_SYMBOL_GPL(vfio_register_iommu_driver);
129 
130 void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops)
131 {
132 	struct vfio_iommu_driver *driver;
133 
134 	mutex_lock(&vfio.iommu_drivers_lock);
135 	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
136 		if (driver->ops == ops) {
137 			list_del(&driver->vfio_next);
138 			mutex_unlock(&vfio.iommu_drivers_lock);
139 			kfree(driver);
140 			return;
141 		}
142 	}
143 	mutex_unlock(&vfio.iommu_drivers_lock);
144 }
145 EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver);
146 
147 /*
148  * Container objects - containers are created when /dev/vfio/vfio is
149  * opened, but their lifecycle extends until the last user is done, so
150  * it's freed via kref.  Must support container/group/device being
151  * closed in any order.
152  */
153 static void vfio_container_release(struct kref *kref)
154 {
155 	struct vfio_container *container;
156 	container = container_of(kref, struct vfio_container, kref);
157 
158 	kfree(container);
159 }
160 
161 static void vfio_container_get(struct vfio_container *container)
162 {
163 	kref_get(&container->kref);
164 }
165 
166 static void vfio_container_put(struct vfio_container *container)
167 {
168 	kref_put(&container->kref, vfio_container_release);
169 }
170 
171 void vfio_device_container_register(struct vfio_device *device)
172 {
173 	struct vfio_iommu_driver *iommu_driver =
174 		device->group->container->iommu_driver;
175 
176 	if (iommu_driver && iommu_driver->ops->register_device)
177 		iommu_driver->ops->register_device(
178 			device->group->container->iommu_data, device);
179 }
180 
181 void vfio_device_container_unregister(struct vfio_device *device)
182 {
183 	struct vfio_iommu_driver *iommu_driver =
184 		device->group->container->iommu_driver;
185 
186 	if (iommu_driver && iommu_driver->ops->unregister_device)
187 		iommu_driver->ops->unregister_device(
188 			device->group->container->iommu_data, device);
189 }
190 
191 long vfio_container_ioctl_check_extension(struct vfio_container *container,
192 					  unsigned long arg)
193 {
194 	struct vfio_iommu_driver *driver;
195 	long ret = 0;
196 
197 	down_read(&container->group_lock);
198 
199 	driver = container->iommu_driver;
200 
201 	switch (arg) {
202 		/* No base extensions yet */
203 	default:
204 		/*
205 		 * If no driver is set, poll all registered drivers for
206 		 * extensions and return the first positive result.  If
207 		 * a driver is already set, further queries will be passed
208 		 * only to that driver.
209 		 */
210 		if (!driver) {
211 			mutex_lock(&vfio.iommu_drivers_lock);
212 			list_for_each_entry(driver, &vfio.iommu_drivers_list,
213 					    vfio_next) {
214 
215 				if (!list_empty(&container->group_list) &&
216 				    !vfio_iommu_driver_allowed(container,
217 							       driver))
218 					continue;
219 				if (!try_module_get(driver->ops->owner))
220 					continue;
221 
222 				ret = driver->ops->ioctl(NULL,
223 							 VFIO_CHECK_EXTENSION,
224 							 arg);
225 				module_put(driver->ops->owner);
226 				if (ret > 0)
227 					break;
228 			}
229 			mutex_unlock(&vfio.iommu_drivers_lock);
230 		} else
231 			ret = driver->ops->ioctl(container->iommu_data,
232 						 VFIO_CHECK_EXTENSION, arg);
233 	}
234 
235 	up_read(&container->group_lock);
236 
237 	return ret;
238 }
239 
240 /* hold write lock on container->group_lock */
241 static int __vfio_container_attach_groups(struct vfio_container *container,
242 					  struct vfio_iommu_driver *driver,
243 					  void *data)
244 {
245 	struct vfio_group *group;
246 	int ret = -ENODEV;
247 
248 	list_for_each_entry(group, &container->group_list, container_next) {
249 		ret = driver->ops->attach_group(data, group->iommu_group,
250 						group->type);
251 		if (ret)
252 			goto unwind;
253 	}
254 
255 	return ret;
256 
257 unwind:
258 	list_for_each_entry_continue_reverse(group, &container->group_list,
259 					     container_next) {
260 		driver->ops->detach_group(data, group->iommu_group);
261 	}
262 
263 	return ret;
264 }
265 
266 static long vfio_ioctl_set_iommu(struct vfio_container *container,
267 				 unsigned long arg)
268 {
269 	struct vfio_iommu_driver *driver;
270 	long ret = -ENODEV;
271 
272 	down_write(&container->group_lock);
273 
274 	/*
275 	 * The container is designed to be an unprivileged interface while
276 	 * the group can be assigned to specific users.  Therefore, only by
277 	 * adding a group to a container does the user get the privilege of
278 	 * enabling the iommu, which may allocate finite resources.  There
279 	 * is no unset_iommu, but by removing all the groups from a container,
280 	 * the container is deprivileged and returns to an unset state.
281 	 */
282 	if (list_empty(&container->group_list) || container->iommu_driver) {
283 		up_write(&container->group_lock);
284 		return -EINVAL;
285 	}
286 
287 	mutex_lock(&vfio.iommu_drivers_lock);
288 	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
289 		void *data;
290 
291 		if (!vfio_iommu_driver_allowed(container, driver))
292 			continue;
293 		if (!try_module_get(driver->ops->owner))
294 			continue;
295 
296 		/*
297 		 * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION,
298 		 * so test which iommu driver reported support for this
299 		 * extension and call open on them.  We also pass them the
300 		 * magic, allowing a single driver to support multiple
301 		 * interfaces if they'd like.
302 		 */
303 		if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) {
304 			module_put(driver->ops->owner);
305 			continue;
306 		}
307 
308 		data = driver->ops->open(arg);
309 		if (IS_ERR(data)) {
310 			ret = PTR_ERR(data);
311 			module_put(driver->ops->owner);
312 			continue;
313 		}
314 
315 		ret = __vfio_container_attach_groups(container, driver, data);
316 		if (ret) {
317 			driver->ops->release(data);
318 			module_put(driver->ops->owner);
319 			continue;
320 		}
321 
322 		container->iommu_driver = driver;
323 		container->iommu_data = data;
324 		break;
325 	}
326 
327 	mutex_unlock(&vfio.iommu_drivers_lock);
328 	up_write(&container->group_lock);
329 
330 	return ret;
331 }
332 
333 static long vfio_fops_unl_ioctl(struct file *filep,
334 				unsigned int cmd, unsigned long arg)
335 {
336 	struct vfio_container *container = filep->private_data;
337 	struct vfio_iommu_driver *driver;
338 	void *data;
339 	long ret = -EINVAL;
340 
341 	if (!container)
342 		return ret;
343 
344 	switch (cmd) {
345 	case VFIO_GET_API_VERSION:
346 		ret = VFIO_API_VERSION;
347 		break;
348 	case VFIO_CHECK_EXTENSION:
349 		ret = vfio_container_ioctl_check_extension(container, arg);
350 		break;
351 	case VFIO_SET_IOMMU:
352 		ret = vfio_ioctl_set_iommu(container, arg);
353 		break;
354 	default:
355 		driver = container->iommu_driver;
356 		data = container->iommu_data;
357 
358 		if (driver) /* passthrough all unrecognized ioctls */
359 			ret = driver->ops->ioctl(data, cmd, arg);
360 	}
361 
362 	return ret;
363 }
364 
365 static int vfio_fops_open(struct inode *inode, struct file *filep)
366 {
367 	struct vfio_container *container;
368 
369 	container = kzalloc(sizeof(*container), GFP_KERNEL);
370 	if (!container)
371 		return -ENOMEM;
372 
373 	INIT_LIST_HEAD(&container->group_list);
374 	init_rwsem(&container->group_lock);
375 	kref_init(&container->kref);
376 
377 	filep->private_data = container;
378 
379 	return 0;
380 }
381 
382 static int vfio_fops_release(struct inode *inode, struct file *filep)
383 {
384 	struct vfio_container *container = filep->private_data;
385 	struct vfio_iommu_driver *driver = container->iommu_driver;
386 
387 	if (driver && driver->ops->notify)
388 		driver->ops->notify(container->iommu_data,
389 				    VFIO_IOMMU_CONTAINER_CLOSE);
390 
391 	filep->private_data = NULL;
392 
393 	vfio_container_put(container);
394 
395 	return 0;
396 }
397 
398 static const struct file_operations vfio_fops = {
399 	.owner		= THIS_MODULE,
400 	.open		= vfio_fops_open,
401 	.release	= vfio_fops_release,
402 	.unlocked_ioctl	= vfio_fops_unl_ioctl,
403 	.compat_ioctl	= compat_ptr_ioctl,
404 };
405 
406 struct vfio_container *vfio_container_from_file(struct file *file)
407 {
408 	struct vfio_container *container;
409 
410 	/* Sanity check, is this really our fd? */
411 	if (file->f_op != &vfio_fops)
412 		return NULL;
413 
414 	container = file->private_data;
415 	WARN_ON(!container); /* fget ensures we don't race vfio_release */
416 	return container;
417 }
418 
419 static struct miscdevice vfio_dev = {
420 	.minor = VFIO_MINOR,
421 	.name = "vfio",
422 	.fops = &vfio_fops,
423 	.nodename = "vfio/vfio",
424 	.mode = S_IRUGO | S_IWUGO,
425 };
426 
427 int vfio_container_attach_group(struct vfio_container *container,
428 				struct vfio_group *group)
429 {
430 	struct vfio_iommu_driver *driver;
431 	int ret = 0;
432 
433 	lockdep_assert_held(&group->group_lock);
434 
435 	if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO))
436 		return -EPERM;
437 
438 	down_write(&container->group_lock);
439 
440 	/* Real groups and fake groups cannot mix */
441 	if (!list_empty(&container->group_list) &&
442 	    container->noiommu != (group->type == VFIO_NO_IOMMU)) {
443 		ret = -EPERM;
444 		goto out_unlock_container;
445 	}
446 
447 	if (group->type == VFIO_IOMMU) {
448 		ret = iommu_group_claim_dma_owner(group->iommu_group, group);
449 		if (ret)
450 			goto out_unlock_container;
451 	}
452 
453 	driver = container->iommu_driver;
454 	if (driver) {
455 		ret = driver->ops->attach_group(container->iommu_data,
456 						group->iommu_group,
457 						group->type);
458 		if (ret) {
459 			if (group->type == VFIO_IOMMU)
460 				iommu_group_release_dma_owner(
461 					group->iommu_group);
462 			goto out_unlock_container;
463 		}
464 	}
465 
466 	group->container = container;
467 	group->container_users = 1;
468 	container->noiommu = (group->type == VFIO_NO_IOMMU);
469 	list_add(&group->container_next, &container->group_list);
470 
471 	/* Get a reference on the container and mark a user within the group */
472 	vfio_container_get(container);
473 
474 out_unlock_container:
475 	up_write(&container->group_lock);
476 	return ret;
477 }
478 
479 void vfio_group_detach_container(struct vfio_group *group)
480 {
481 	struct vfio_container *container = group->container;
482 	struct vfio_iommu_driver *driver;
483 
484 	lockdep_assert_held(&group->group_lock);
485 	WARN_ON(group->container_users != 1);
486 
487 	down_write(&container->group_lock);
488 
489 	driver = container->iommu_driver;
490 	if (driver)
491 		driver->ops->detach_group(container->iommu_data,
492 					  group->iommu_group);
493 
494 	if (group->type == VFIO_IOMMU)
495 		iommu_group_release_dma_owner(group->iommu_group);
496 
497 	group->container = NULL;
498 	group->container_users = 0;
499 	list_del(&group->container_next);
500 
501 	/* Detaching the last group deprivileges a container, remove iommu */
502 	if (driver && list_empty(&container->group_list)) {
503 		driver->ops->release(container->iommu_data);
504 		module_put(driver->ops->owner);
505 		container->iommu_driver = NULL;
506 		container->iommu_data = NULL;
507 	}
508 
509 	up_write(&container->group_lock);
510 
511 	vfio_container_put(container);
512 }
513 
514 int vfio_device_assign_container(struct vfio_device *device)
515 {
516 	struct vfio_group *group = device->group;
517 
518 	lockdep_assert_held(&group->group_lock);
519 
520 	if (!group->container || !group->container->iommu_driver ||
521 	    WARN_ON(!group->container_users))
522 		return -EINVAL;
523 
524 	if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO))
525 		return -EPERM;
526 
527 	get_file(group->opened_file);
528 	group->container_users++;
529 	return 0;
530 }
531 
532 void vfio_device_unassign_container(struct vfio_device *device)
533 {
534 	mutex_lock(&device->group->group_lock);
535 	WARN_ON(device->group->container_users <= 1);
536 	device->group->container_users--;
537 	fput(device->group->opened_file);
538 	mutex_unlock(&device->group->group_lock);
539 }
540 
541 /*
542  * Pin contiguous user pages and return their associated host pages for local
543  * domain only.
544  * @device [in]  : device
545  * @iova [in]    : starting IOVA of user pages to be pinned.
546  * @npage [in]   : count of pages to be pinned.  This count should not
547  *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
548  * @prot [in]    : protection flags
549  * @pages[out]   : array of host pages
550  * Return error or number of pages pinned.
551  *
552  * A driver may only call this function if the vfio_device was created
553  * by vfio_register_emulated_iommu_dev().
554  */
555 int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova,
556 		   int npage, int prot, struct page **pages)
557 {
558 	struct vfio_container *container;
559 	struct vfio_group *group = device->group;
560 	struct vfio_iommu_driver *driver;
561 	int ret;
562 
563 	if (!pages || !npage || !vfio_assert_device_open(device))
564 		return -EINVAL;
565 
566 	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
567 		return -E2BIG;
568 
569 	/* group->container cannot change while a vfio device is open */
570 	container = group->container;
571 	driver = container->iommu_driver;
572 	if (likely(driver && driver->ops->pin_pages))
573 		ret = driver->ops->pin_pages(container->iommu_data,
574 					     group->iommu_group, iova,
575 					     npage, prot, pages);
576 	else
577 		ret = -ENOTTY;
578 
579 	return ret;
580 }
581 EXPORT_SYMBOL(vfio_pin_pages);
582 
583 /*
584  * Unpin contiguous host pages for local domain only.
585  * @device [in]  : device
586  * @iova [in]    : starting address of user pages to be unpinned.
587  * @npage [in]   : count of pages to be unpinned.  This count should not
588  *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
589  */
590 void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage)
591 {
592 	struct vfio_container *container;
593 	struct vfio_iommu_driver *driver;
594 
595 	if (WARN_ON(npage <= 0 || npage > VFIO_PIN_PAGES_MAX_ENTRIES))
596 		return;
597 
598 	if (WARN_ON(!vfio_assert_device_open(device)))
599 		return;
600 
601 	/* group->container cannot change while a vfio device is open */
602 	container = device->group->container;
603 	driver = container->iommu_driver;
604 
605 	driver->ops->unpin_pages(container->iommu_data, iova, npage);
606 }
607 EXPORT_SYMBOL(vfio_unpin_pages);
608 
609 /*
610  * This interface allows the CPUs to perform some sort of virtual DMA on
611  * behalf of the device.
612  *
613  * CPUs read/write from/into a range of IOVAs pointing to user space memory
614  * into/from a kernel buffer.
615  *
616  * As the read/write of user space memory is conducted via the CPUs and is
617  * not a real device DMA, it is not necessary to pin the user space memory.
618  *
619  * @device [in]		: VFIO device
620  * @iova [in]		: base IOVA of a user space buffer
621  * @data [in]		: pointer to kernel buffer
622  * @len [in]		: kernel buffer length
623  * @write		: indicate read or write
624  * Return error code on failure or 0 on success.
625  */
626 int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data,
627 		size_t len, bool write)
628 {
629 	struct vfio_container *container;
630 	struct vfio_iommu_driver *driver;
631 	int ret = 0;
632 
633 	if (!data || len <= 0 || !vfio_assert_device_open(device))
634 		return -EINVAL;
635 
636 	/* group->container cannot change while a vfio device is open */
637 	container = device->group->container;
638 	driver = container->iommu_driver;
639 
640 	if (likely(driver && driver->ops->dma_rw))
641 		ret = driver->ops->dma_rw(container->iommu_data,
642 					  iova, data, len, write);
643 	else
644 		ret = -ENOTTY;
645 	return ret;
646 }
647 EXPORT_SYMBOL(vfio_dma_rw);
648 
649 int __init vfio_container_init(void)
650 {
651 	int ret;
652 
653 	mutex_init(&vfio.iommu_drivers_lock);
654 	INIT_LIST_HEAD(&vfio.iommu_drivers_list);
655 
656 	ret = misc_register(&vfio_dev);
657 	if (ret) {
658 		pr_err("vfio: misc device register failed\n");
659 		return ret;
660 	}
661 
662 	if (IS_ENABLED(CONFIG_VFIO_NOIOMMU)) {
663 		ret = vfio_register_iommu_driver(&vfio_noiommu_ops);
664 		if (ret)
665 			goto err_misc;
666 	}
667 	return 0;
668 
669 err_misc:
670 	misc_deregister(&vfio_dev);
671 	return ret;
672 }
673 
674 void vfio_container_cleanup(void)
675 {
676 	if (IS_ENABLED(CONFIG_VFIO_NOIOMMU))
677 		vfio_unregister_iommu_driver(&vfio_noiommu_ops);
678 	misc_deregister(&vfio_dev);
679 	mutex_destroy(&vfio.iommu_drivers_lock);
680 }
681