xref: /linux/drivers/vfio/vfio_main.c (revision 785562e31dbcd85ca583cf58c446e63aa8a5af08)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * VFIO core
4  *
5  * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
6  *     Author: Alex Williamson <alex.williamson@redhat.com>
7  *
8  * Derived from original vfio:
9  * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
10  * Author: Tom Lyon, pugs@cisco.com
11  */
12 
13 #include <linux/cdev.h>
14 #include <linux/compat.h>
15 #include <linux/device.h>
16 #include <linux/fs.h>
17 #include <linux/idr.h>
18 #include <linux/iommu.h>
19 #if IS_ENABLED(CONFIG_KVM)
20 #include <linux/kvm_host.h>
21 #endif
22 #include <linux/list.h>
23 #include <linux/miscdevice.h>
24 #include <linux/module.h>
25 #include <linux/mount.h>
26 #include <linux/mutex.h>
27 #include <linux/pci.h>
28 #include <linux/pseudo_fs.h>
29 #include <linux/rwsem.h>
30 #include <linux/sched.h>
31 #include <linux/seq_file.h>
32 #include <linux/slab.h>
33 #include <linux/stat.h>
34 #include <linux/string.h>
35 #include <linux/uaccess.h>
36 #include <linux/vfio.h>
37 #include <linux/wait.h>
38 #include <linux/sched/signal.h>
39 #include <linux/pm_runtime.h>
40 #include <linux/interval_tree.h>
41 #include <linux/iova_bitmap.h>
42 #include <linux/iommufd.h>
43 #include "vfio.h"
44 
45 #define DRIVER_VERSION	"0.3"
46 #define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
47 #define DRIVER_DESC	"VFIO - User Level meta-driver"
48 
49 #define VFIO_MAGIC 0x5646494f /* "VFIO" */
50 
/*
 * File-scope state shared by the VFIO core code in this file.
 */
static struct vfio {
	/* Allocator for vfio_device->index (see vfio_init_device()) */
	struct ida			device_ida;
	/* Mount handed to simple_pin_fs(); its superblock backs the anon inodes */
	struct vfsmount			*vfs_mount;
	/* Counter handed to simple_pin_fs()/simple_release_fs() with vfs_mount */
	int				fs_count;
} vfio;
56 
#ifdef CONFIG_VFIO_NOIOMMU
/*
 * Backing variable for the "enable_unsafe_noiommu_mode" module parameter.
 * Only built when CONFIG_VFIO_NOIOMMU is set; the parameter is world
 * readable and owner writable (S_IRUGO | S_IWUSR).
 */
bool vfio_noiommu __read_mostly;
module_param_named(enable_unsafe_noiommu_mode,
		   vfio_noiommu, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel.  If you do not know what this is for, step away. (default: false)");
#endif
63 
/*
 * Maps a set_id (the pointer value cast to unsigned long) to its
 * struct vfio_device_set. The xarray lock also guards device_count.
 */
static DEFINE_XARRAY(vfio_device_set_xa);
65 
66 static char *vfio_device_devnode(const struct device *dev, umode_t *mode)
67 {
68 	return kasprintf(GFP_KERNEL, "vfio/devices/%s", dev_name(dev));
69 }
70 
/*
 * Device class assigned to every vfio_device in vfio_init_device(); node
 * names are produced by vfio_device_devnode().
 */
static const struct class vfio_device_class = {
	.name		= "vfio-dev",
	.devnode	= vfio_device_devnode
};
75 
76 int vfio_assign_device_set(struct vfio_device *device, void *set_id)
77 {
78 	unsigned long idx = (unsigned long)set_id;
79 	struct vfio_device_set *new_dev_set;
80 	struct vfio_device_set *dev_set;
81 
82 	if (WARN_ON(!set_id))
83 		return -EINVAL;
84 
85 	/*
86 	 * Atomically acquire a singleton object in the xarray for this set_id
87 	 */
88 	xa_lock(&vfio_device_set_xa);
89 	dev_set = xa_load(&vfio_device_set_xa, idx);
90 	if (dev_set)
91 		goto found_get_ref;
92 	xa_unlock(&vfio_device_set_xa);
93 
94 	new_dev_set = kzalloc_obj(*new_dev_set);
95 	if (!new_dev_set)
96 		return -ENOMEM;
97 	mutex_init(&new_dev_set->lock);
98 	INIT_LIST_HEAD(&new_dev_set->device_list);
99 	new_dev_set->set_id = set_id;
100 
101 	xa_lock(&vfio_device_set_xa);
102 	dev_set = __xa_cmpxchg(&vfio_device_set_xa, idx, NULL, new_dev_set,
103 			       GFP_KERNEL);
104 	if (!dev_set) {
105 		dev_set = new_dev_set;
106 		goto found_get_ref;
107 	}
108 
109 	kfree(new_dev_set);
110 	if (xa_is_err(dev_set)) {
111 		xa_unlock(&vfio_device_set_xa);
112 		return xa_err(dev_set);
113 	}
114 
115 found_get_ref:
116 	dev_set->device_count++;
117 	xa_unlock(&vfio_device_set_xa);
118 	mutex_lock(&dev_set->lock);
119 	device->dev_set = dev_set;
120 	list_add_tail(&device->dev_set_list, &dev_set->device_list);
121 	mutex_unlock(&dev_set->lock);
122 	return 0;
123 }
124 EXPORT_SYMBOL_GPL(vfio_assign_device_set);
125 
126 static void vfio_release_device_set(struct vfio_device *device)
127 {
128 	struct vfio_device_set *dev_set = device->dev_set;
129 
130 	if (!dev_set)
131 		return;
132 
133 	mutex_lock(&dev_set->lock);
134 	list_del(&device->dev_set_list);
135 	mutex_unlock(&dev_set->lock);
136 
137 	xa_lock(&vfio_device_set_xa);
138 	if (!--dev_set->device_count) {
139 		__xa_erase(&vfio_device_set_xa,
140 			   (unsigned long)dev_set->set_id);
141 		mutex_destroy(&dev_set->lock);
142 		kfree(dev_set);
143 	}
144 	xa_unlock(&vfio_device_set_xa);
145 }
146 
147 unsigned int vfio_device_set_open_count(struct vfio_device_set *dev_set)
148 {
149 	struct vfio_device *cur;
150 	unsigned int open_count = 0;
151 
152 	lockdep_assert_held(&dev_set->lock);
153 
154 	list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
155 		open_count += cur->open_count;
156 	return open_count;
157 }
158 EXPORT_SYMBOL_GPL(vfio_device_set_open_count);
159 
160 struct vfio_device *
161 vfio_find_device_in_devset(struct vfio_device_set *dev_set,
162 			   struct device *dev)
163 {
164 	struct vfio_device *cur;
165 
166 	lockdep_assert_held(&dev_set->lock);
167 
168 	list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
169 		if (cur->dev == dev)
170 			return cur;
171 	return NULL;
172 }
173 EXPORT_SYMBOL_GPL(vfio_find_device_in_devset);
174 
175 /*
176  * Device objects - create, release, get, put, search
177  */
178 /* Device reference always implies a group reference */
179 void vfio_device_put_registration(struct vfio_device *device)
180 {
181 	if (refcount_dec_and_test(&device->refcount))
182 		complete(&device->comp);
183 }
184 EXPORT_SYMBOL_GPL(vfio_device_put_registration);
185 
186 bool vfio_device_try_get_registration(struct vfio_device *device)
187 {
188 	return refcount_inc_not_zero(&device->refcount);
189 }
190 EXPORT_SYMBOL_GPL(vfio_device_try_get_registration);
191 
192 /*
193  * VFIO driver API
194  */
195 /* Release helper called by vfio_put_device() */
196 static void vfio_device_release(struct device *dev)
197 {
198 	struct vfio_device *device =
199 			container_of(dev, struct vfio_device, device);
200 
201 	vfio_release_device_set(device);
202 	ida_free(&vfio.device_ida, device->index);
203 
204 	if (device->ops->release)
205 		device->ops->release(device);
206 
207 	iput(device->inode);
208 	simple_release_fs(&vfio.vfs_mount, &vfio.fs_count);
209 	kvfree(device);
210 }
211 
212 static int vfio_init_device(struct vfio_device *device, struct device *dev,
213 			    const struct vfio_device_ops *ops);
214 
215 /*
216  * Allocate and initialize vfio_device so it can be registered to vfio
217  * core.
218  *
219  * Drivers should use the wrapper vfio_alloc_device() for allocation.
220  * @size is the size of the structure to be allocated, including any
221  * private data used by the driver.
222  *
223  * Driver may provide an @init callback to cover device private data.
224  *
225  * Use vfio_put_device() to release the structure after success return.
226  */
227 struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev,
228 				       const struct vfio_device_ops *ops)
229 {
230 	struct vfio_device *device;
231 	int ret;
232 
233 	if (WARN_ON(size < sizeof(struct vfio_device)))
234 		return ERR_PTR(-EINVAL);
235 
236 	device = kvzalloc(size, GFP_KERNEL);
237 	if (!device)
238 		return ERR_PTR(-ENOMEM);
239 
240 	ret = vfio_init_device(device, dev, ops);
241 	if (ret)
242 		goto out_free;
243 	return device;
244 
245 out_free:
246 	kvfree(device);
247 	return ERR_PTR(ret);
248 }
249 EXPORT_SYMBOL_GPL(_vfio_alloc_device);
250 
251 static int vfio_fs_init_fs_context(struct fs_context *fc)
252 {
253 	return init_pseudo(fc, VFIO_MAGIC) ? 0 : -ENOMEM;
254 }
255 
/*
 * Filesystem type pinned by vfio_fs_inode_new() via simple_pin_fs(); its
 * superblock is where the per-device anonymous inodes are allocated.
 */
static struct file_system_type vfio_fs_type = {
	.name = "vfio",
	.owner = THIS_MODULE,
	.init_fs_context = vfio_fs_init_fs_context,
	.kill_sb = kill_anon_super,
};
262 
263 static struct inode *vfio_fs_inode_new(void)
264 {
265 	struct inode *inode;
266 	int ret;
267 
268 	ret = simple_pin_fs(&vfio_fs_type, &vfio.vfs_mount, &vfio.fs_count);
269 	if (ret)
270 		return ERR_PTR(ret);
271 
272 	inode = alloc_anon_inode(vfio.vfs_mount->mnt_sb);
273 	if (IS_ERR(inode))
274 		simple_release_fs(&vfio.vfs_mount, &vfio.fs_count);
275 
276 	return inode;
277 }
278 
279 /*
280  * Initialize a vfio_device so it can be registered to vfio core.
281  */
282 static int vfio_init_device(struct vfio_device *device, struct device *dev,
283 			    const struct vfio_device_ops *ops)
284 {
285 	int ret;
286 
287 	ret = ida_alloc_max(&vfio.device_ida, MINORMASK, GFP_KERNEL);
288 	if (ret < 0) {
289 		dev_dbg(dev, "Error to alloc index\n");
290 		return ret;
291 	}
292 
293 	device->index = ret;
294 	init_completion(&device->comp);
295 	device->dev = dev;
296 	device->ops = ops;
297 	device->inode = vfio_fs_inode_new();
298 	if (IS_ERR(device->inode)) {
299 		ret = PTR_ERR(device->inode);
300 		goto out_inode;
301 	}
302 
303 	if (ops->init) {
304 		ret = ops->init(device);
305 		if (ret)
306 			goto out_uninit;
307 	}
308 
309 	device_initialize(&device->device);
310 	device->device.release = vfio_device_release;
311 	device->device.class = &vfio_device_class;
312 	device->device.parent = device->dev;
313 	return 0;
314 
315 out_uninit:
316 	iput(device->inode);
317 	simple_release_fs(&vfio.vfs_mount, &vfio.fs_count);
318 out_inode:
319 	vfio_release_device_set(device);
320 	ida_free(&vfio.device_ida, device->index);
321 	return ret;
322 }
323 
324 static int __vfio_register_dev(struct vfio_device *device,
325 			       enum vfio_group_type type)
326 {
327 	int ret;
328 
329 	if (WARN_ON(IS_ENABLED(CONFIG_IOMMUFD) &&
330 		    (!device->ops->bind_iommufd ||
331 		     !device->ops->unbind_iommufd ||
332 		     !device->ops->attach_ioas ||
333 		     !device->ops->detach_ioas)))
334 		return -EINVAL;
335 
336 	/*
337 	 * If the driver doesn't specify a set then the device is added to a
338 	 * singleton set just for itself.
339 	 */
340 	if (!device->dev_set)
341 		vfio_assign_device_set(device, device);
342 
343 	ret = dev_set_name(&device->device, "vfio%d", device->index);
344 	if (ret)
345 		return ret;
346 
347 	ret = vfio_device_set_group(device, type);
348 	if (ret)
349 		return ret;
350 
351 	/*
352 	 * VFIO always sets IOMMU_CACHE because we offer no way for userspace to
353 	 * restore cache coherency. It has to be checked here because it is only
354 	 * valid for cases where we are using iommu groups.
355 	 */
356 	if (type == VFIO_IOMMU && !vfio_device_is_noiommu(device) &&
357 	    !device_iommu_capable(device->dev, IOMMU_CAP_CACHE_COHERENCY)) {
358 		ret = -EINVAL;
359 		goto err_out;
360 	}
361 
362 	ret = vfio_device_add(device);
363 	if (ret)
364 		goto err_out;
365 
366 	/* Refcounting can't start until the driver calls register */
367 	refcount_set(&device->refcount, 1);
368 
369 	vfio_device_group_register(device);
370 	vfio_device_debugfs_init(device);
371 
372 	return 0;
373 err_out:
374 	vfio_device_remove_group(device);
375 	return ret;
376 }
377 
378 int vfio_register_group_dev(struct vfio_device *device)
379 {
380 	return __vfio_register_dev(device, VFIO_IOMMU);
381 }
382 EXPORT_SYMBOL_GPL(vfio_register_group_dev);
383 
384 /*
385  * Register a virtual device without IOMMU backing.  The user of this
386  * device must not be able to directly trigger unmediated DMA.
387  */
388 int vfio_register_emulated_iommu_dev(struct vfio_device *device)
389 {
390 	return __vfio_register_dev(device, VFIO_EMULATED_IOMMU);
391 }
392 EXPORT_SYMBOL_GPL(vfio_register_emulated_iommu_dev);
393 
394 /*
395  * Decrement the device reference count and wait for the device to be
396  * removed.  Open file descriptors for the device... */
397 void vfio_unregister_group_dev(struct vfio_device *device)
398 {
399 	unsigned int i = 0;
400 	bool interrupted = false;
401 	long rc;
402 
403 	/*
404 	 * Prevent new device opened by userspace via the
405 	 * VFIO_GROUP_GET_DEVICE_FD in the group path.
406 	 */
407 	vfio_device_group_unregister(device);
408 
409 	/*
410 	 * Balances vfio_device_add() in register path, also prevents
411 	 * new device opened by userspace in the cdev path.
412 	 */
413 	vfio_device_del(device);
414 
415 	vfio_device_put_registration(device);
416 	rc = try_wait_for_completion(&device->comp);
417 	while (rc <= 0) {
418 		if (device->ops->request)
419 			device->ops->request(device, i++);
420 
421 		if (interrupted) {
422 			rc = wait_for_completion_timeout(&device->comp,
423 							 HZ * 10);
424 		} else {
425 			rc = wait_for_completion_interruptible_timeout(
426 				&device->comp, HZ * 10);
427 			if (rc < 0) {
428 				interrupted = true;
429 				dev_warn(device->dev,
430 					 "Device is currently in use, task"
431 					 " \"%s\" (%d) "
432 					 "blocked until device is released",
433 					 current->comm, task_pid_nr(current));
434 			}
435 		}
436 	}
437 
438 	vfio_device_debugfs_exit(device);
439 	/* Balances vfio_device_set_group in register path */
440 	vfio_device_remove_group(device);
441 }
442 EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);
443 
444 #if IS_ENABLED(CONFIG_KVM)
445 void vfio_device_get_kvm_safe(struct vfio_device *device, struct kvm *kvm)
446 {
447 	void (*pfn)(struct kvm *kvm);
448 	bool (*fn)(struct kvm *kvm);
449 	bool ret;
450 
451 	lockdep_assert_held(&device->dev_set->lock);
452 
453 	if (!kvm)
454 		return;
455 
456 	pfn = symbol_get(kvm_put_kvm);
457 	if (WARN_ON(!pfn))
458 		return;
459 
460 	fn = symbol_get(kvm_get_kvm_safe);
461 	if (WARN_ON(!fn)) {
462 		symbol_put(kvm_put_kvm);
463 		return;
464 	}
465 
466 	ret = fn(kvm);
467 	symbol_put(kvm_get_kvm_safe);
468 	if (!ret) {
469 		symbol_put(kvm_put_kvm);
470 		return;
471 	}
472 
473 	device->put_kvm = pfn;
474 	device->kvm = kvm;
475 }
476 
477 void vfio_device_put_kvm(struct vfio_device *device)
478 {
479 	lockdep_assert_held(&device->dev_set->lock);
480 
481 	if (!device->kvm)
482 		return;
483 
484 	if (WARN_ON(!device->put_kvm))
485 		goto clear;
486 
487 	device->put_kvm(device->kvm);
488 	device->put_kvm = NULL;
489 	symbol_put(kvm_put_kvm);
490 
491 clear:
492 	device->kvm = NULL;
493 }
494 #endif
495 
496 /* true if the vfio_device has open_device() called but not close_device() */
497 static bool vfio_assert_device_open(struct vfio_device *device)
498 {
499 	return !WARN_ON_ONCE(!READ_ONCE(device->open_count));
500 }
501 
502 struct vfio_device_file *
503 vfio_allocate_device_file(struct vfio_device *device)
504 {
505 	struct vfio_device_file *df;
506 
507 	df = kzalloc_obj(*df, GFP_KERNEL_ACCOUNT);
508 	if (!df)
509 		return ERR_PTR(-ENOMEM);
510 
511 	df->device = device;
512 	spin_lock_init(&df->kvm_ref_lock);
513 
514 	return df;
515 }
516 
517 static int vfio_df_device_first_open(struct vfio_device_file *df)
518 {
519 	struct vfio_device *device = df->device;
520 	struct iommufd_ctx *iommufd = df->iommufd;
521 	int ret;
522 
523 	lockdep_assert_held(&device->dev_set->lock);
524 
525 	if (!try_module_get(device->dev->driver->owner))
526 		return -ENODEV;
527 
528 	if (iommufd)
529 		ret = vfio_df_iommufd_bind(df);
530 	else
531 		ret = vfio_device_group_use_iommu(device);
532 	if (ret)
533 		goto err_module_put;
534 
535 	if (device->ops->open_device) {
536 		ret = device->ops->open_device(device);
537 		if (ret)
538 			goto err_unuse_iommu;
539 	}
540 	return 0;
541 
542 err_unuse_iommu:
543 	if (iommufd)
544 		vfio_df_iommufd_unbind(df);
545 	else
546 		vfio_device_group_unuse_iommu(device);
547 err_module_put:
548 	module_put(device->dev->driver->owner);
549 	return ret;
550 }
551 
552 static void vfio_df_device_last_close(struct vfio_device_file *df)
553 {
554 	struct vfio_device *device = df->device;
555 	struct iommufd_ctx *iommufd = df->iommufd;
556 
557 	lockdep_assert_held(&device->dev_set->lock);
558 
559 	if (device->ops->close_device)
560 		device->ops->close_device(device);
561 	if (iommufd)
562 		vfio_df_iommufd_unbind(df);
563 	else
564 		vfio_device_group_unuse_iommu(device);
565 	device->precopy_info_v2 = 0;
566 	module_put(device->dev->driver->owner);
567 }
568 
569 int vfio_df_open(struct vfio_device_file *df)
570 {
571 	struct vfio_device *device = df->device;
572 	int ret = 0;
573 
574 	lockdep_assert_held(&device->dev_set->lock);
575 
576 	/*
577 	 * Only the group path allows the device to be opened multiple
578 	 * times.  The device cdev path doesn't have a secure way for it.
579 	 */
580 	if (device->open_count != 0 && !df->group)
581 		return -EINVAL;
582 
583 	device->open_count++;
584 	if (device->open_count == 1) {
585 		ret = vfio_df_device_first_open(df);
586 		if (ret)
587 			device->open_count--;
588 	}
589 
590 	return ret;
591 }
592 
593 void vfio_df_close(struct vfio_device_file *df)
594 {
595 	struct vfio_device *device = df->device;
596 
597 	lockdep_assert_held(&device->dev_set->lock);
598 
599 	if (!vfio_assert_device_open(device))
600 		return;
601 	if (device->open_count == 1)
602 		vfio_df_device_last_close(df);
603 	device->open_count--;
604 }
605 
606 /*
607  * Wrapper around pm_runtime_resume_and_get().
608  * Return error code on failure or 0 on success.
609  */
610 static inline int vfio_device_pm_runtime_get(struct vfio_device *device)
611 {
612 	struct device *dev = device->dev;
613 
614 	if (dev->driver && dev->driver->pm) {
615 		int ret;
616 
617 		ret = pm_runtime_resume_and_get(dev);
618 		if (ret) {
619 			dev_info_ratelimited(dev,
620 				"vfio: runtime resume failed %d\n", ret);
621 			return -EIO;
622 		}
623 	}
624 
625 	return 0;
626 }
627 
628 /*
629  * Wrapper around pm_runtime_put().
630  */
631 static inline void vfio_device_pm_runtime_put(struct vfio_device *device)
632 {
633 	struct device *dev = device->dev;
634 
635 	if (dev->driver && dev->driver->pm)
636 		pm_runtime_put(dev);
637 }
638 
639 /*
640  * VFIO Device fd
641  */
642 static int vfio_device_fops_release(struct inode *inode, struct file *filep)
643 {
644 	struct vfio_device_file *df = filep->private_data;
645 	struct vfio_device *device = df->device;
646 
647 	if (df->group)
648 		vfio_df_group_close(df);
649 	else
650 		vfio_df_unbind_iommufd(df);
651 
652 	vfio_device_put_registration(device);
653 
654 	kfree(df);
655 
656 	return 0;
657 }
658 
659 /*
660  * vfio_mig_get_next_state - Compute the next step in the FSM
661  * @cur_fsm - The current state the device is in
662  * @new_fsm - The target state to reach
663  * @next_fsm - Pointer to the next step to get to new_fsm
664  *
665  * Return 0 upon success, otherwise -errno
666  * Upon success the next step in the state progression between cur_fsm and
667  * new_fsm will be set in next_fsm.
668  *
669  * This breaks down requests for combination transitions into smaller steps and
670  * returns the next step to get to new_fsm. The function may need to be called
671  * multiple times before reaching new_fsm.
672  *
673  */
674 int vfio_mig_get_next_state(struct vfio_device *device,
675 			    enum vfio_device_mig_state cur_fsm,
676 			    enum vfio_device_mig_state new_fsm,
677 			    enum vfio_device_mig_state *next_fsm)
678 {
679 	enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_PRE_COPY_P2P + 1 };
680 	/*
681 	 * The coding in this table requires the driver to implement the
682 	 * following FSM arcs:
683 	 *         RESUMING -> STOP
684 	 *         STOP -> RESUMING
685 	 *         STOP -> STOP_COPY
686 	 *         STOP_COPY -> STOP
687 	 *
688 	 * If P2P is supported then the driver must also implement these FSM
689 	 * arcs:
690 	 *         RUNNING -> RUNNING_P2P
691 	 *         RUNNING_P2P -> RUNNING
692 	 *         RUNNING_P2P -> STOP
693 	 *         STOP -> RUNNING_P2P
694 	 *
695 	 * If precopy is supported then the driver must support these additional
696 	 * FSM arcs:
697 	 *         RUNNING -> PRE_COPY
698 	 *         PRE_COPY -> RUNNING
699 	 *         PRE_COPY -> STOP_COPY
700 	 * However, if precopy and P2P are supported together then the driver
701 	 * must support these additional arcs beyond the P2P arcs above:
702 	 *         PRE_COPY -> RUNNING
703 	 *         PRE_COPY -> PRE_COPY_P2P
704 	 *         PRE_COPY_P2P -> PRE_COPY
705 	 *         PRE_COPY_P2P -> RUNNING_P2P
706 	 *         PRE_COPY_P2P -> STOP_COPY
707 	 *         RUNNING -> PRE_COPY
708 	 *         RUNNING_P2P -> PRE_COPY_P2P
709 	 *
710 	 * Without P2P and precopy the driver must implement:
711 	 *         RUNNING -> STOP
712 	 *         STOP -> RUNNING
713 	 *
714 	 * The coding will step through multiple states for some combination
715 	 * transitions; if all optional features are supported, this means the
716 	 * following ones:
717 	 *         PRE_COPY -> PRE_COPY_P2P -> STOP_COPY
718 	 *         PRE_COPY -> RUNNING -> RUNNING_P2P
719 	 *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP
720 	 *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP -> RESUMING
721 	 *         PRE_COPY_P2P -> RUNNING_P2P -> RUNNING
722 	 *         PRE_COPY_P2P -> RUNNING_P2P -> STOP
723 	 *         PRE_COPY_P2P -> RUNNING_P2P -> STOP -> RESUMING
724 	 *         RESUMING -> STOP -> RUNNING_P2P
725 	 *         RESUMING -> STOP -> RUNNING_P2P -> PRE_COPY_P2P
726 	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING
727 	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
728 	 *         RESUMING -> STOP -> STOP_COPY
729 	 *         RUNNING -> RUNNING_P2P -> PRE_COPY_P2P
730 	 *         RUNNING -> RUNNING_P2P -> STOP
731 	 *         RUNNING -> RUNNING_P2P -> STOP -> RESUMING
732 	 *         RUNNING -> RUNNING_P2P -> STOP -> STOP_COPY
733 	 *         RUNNING_P2P -> RUNNING -> PRE_COPY
734 	 *         RUNNING_P2P -> STOP -> RESUMING
735 	 *         RUNNING_P2P -> STOP -> STOP_COPY
736 	 *         STOP -> RUNNING_P2P -> PRE_COPY_P2P
737 	 *         STOP -> RUNNING_P2P -> RUNNING
738 	 *         STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
739 	 *         STOP_COPY -> STOP -> RESUMING
740 	 *         STOP_COPY -> STOP -> RUNNING_P2P
741 	 *         STOP_COPY -> STOP -> RUNNING_P2P -> RUNNING
742 	 *
743 	 *  The following transitions are blocked:
744 	 *         STOP_COPY -> PRE_COPY
745 	 *         STOP_COPY -> PRE_COPY_P2P
746 	 */
747 	static const u8 vfio_from_fsm_table[VFIO_DEVICE_NUM_STATES][VFIO_DEVICE_NUM_STATES] = {
748 		[VFIO_DEVICE_STATE_STOP] = {
749 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
750 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
751 			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
752 			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
753 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
754 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
755 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
756 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
757 		},
758 		[VFIO_DEVICE_STATE_RUNNING] = {
759 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
760 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
761 			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
762 			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
763 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
764 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
765 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
766 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
767 		},
768 		[VFIO_DEVICE_STATE_PRE_COPY] = {
769 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING,
770 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
771 			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
772 			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
773 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
774 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING,
775 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING,
776 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
777 		},
778 		[VFIO_DEVICE_STATE_PRE_COPY_P2P] = {
779 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
780 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
781 			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
782 			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
783 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
784 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
785 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
786 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
787 		},
788 		[VFIO_DEVICE_STATE_STOP_COPY] = {
789 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
790 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
791 			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
792 			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
793 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
794 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
795 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
796 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
797 		},
798 		[VFIO_DEVICE_STATE_RESUMING] = {
799 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
800 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
801 			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_STOP,
802 			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_STOP,
803 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
804 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
805 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
806 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
807 		},
808 		[VFIO_DEVICE_STATE_RUNNING_P2P] = {
809 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
810 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
811 			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING,
812 			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
813 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
814 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
815 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
816 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
817 		},
818 		[VFIO_DEVICE_STATE_ERROR] = {
819 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_ERROR,
820 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_ERROR,
821 			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
822 			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
823 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_ERROR,
824 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_ERROR,
825 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_ERROR,
826 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
827 		},
828 	};
829 
830 	static const unsigned int state_flags_table[VFIO_DEVICE_NUM_STATES] = {
831 		[VFIO_DEVICE_STATE_STOP] = VFIO_MIGRATION_STOP_COPY,
832 		[VFIO_DEVICE_STATE_RUNNING] = VFIO_MIGRATION_STOP_COPY,
833 		[VFIO_DEVICE_STATE_PRE_COPY] =
834 			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY,
835 		[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_MIGRATION_STOP_COPY |
836 						   VFIO_MIGRATION_P2P |
837 						   VFIO_MIGRATION_PRE_COPY,
838 		[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_MIGRATION_STOP_COPY,
839 		[VFIO_DEVICE_STATE_RESUMING] = VFIO_MIGRATION_STOP_COPY,
840 		[VFIO_DEVICE_STATE_RUNNING_P2P] =
841 			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P,
842 		[VFIO_DEVICE_STATE_ERROR] = ~0U,
843 	};
844 
845 	if (WARN_ON(cur_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
846 		    (state_flags_table[cur_fsm] & device->migration_flags) !=
847 			state_flags_table[cur_fsm]))
848 		return -EINVAL;
849 
850 	if (new_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
851 	   (state_flags_table[new_fsm] & device->migration_flags) !=
852 			state_flags_table[new_fsm])
853 		return -EINVAL;
854 
855 	/*
856 	 * Arcs touching optional and unsupported states are skipped over. The
857 	 * driver will instead see an arc from the original state to the next
858 	 * logical state, as per the above comment.
859 	 */
860 	*next_fsm = vfio_from_fsm_table[cur_fsm][new_fsm];
861 	while (*next_fsm != VFIO_DEVICE_STATE_ERROR &&
862 	       (state_flags_table[*next_fsm] & device->migration_flags) !=
863 			state_flags_table[*next_fsm])
864 		*next_fsm = vfio_from_fsm_table[*next_fsm][new_fsm];
865 
866 	return (*next_fsm != VFIO_DEVICE_STATE_ERROR) ? 0 : -EINVAL;
867 }
868 EXPORT_SYMBOL_GPL(vfio_mig_get_next_state);
869 
870 /*
871  * Convert the drivers's struct file into a FD number and return it to userspace
872  */
873 static int vfio_ioct_mig_return_fd(struct file *filp, void __user *arg,
874 				   struct vfio_device_feature_mig_state *mig)
875 {
876 	int ret;
877 	int fd;
878 
879 	fd = get_unused_fd_flags(O_CLOEXEC);
880 	if (fd < 0) {
881 		ret = fd;
882 		goto out_fput;
883 	}
884 
885 	mig->data_fd = fd;
886 	if (copy_to_user(arg, mig, sizeof(*mig))) {
887 		ret = -EFAULT;
888 		goto out_put_unused;
889 	}
890 	fd_install(fd, filp);
891 	return 0;
892 
893 out_put_unused:
894 	put_unused_fd(fd);
895 out_fput:
896 	fput(filp);
897 	return ret;
898 }
899 
900 static int
901 vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device,
902 					   u32 flags, void __user *arg,
903 					   size_t argsz)
904 {
905 	size_t minsz =
906 		offsetofend(struct vfio_device_feature_mig_state, data_fd);
907 	struct vfio_device_feature_mig_state mig;
908 	struct file *filp = NULL;
909 	int ret;
910 
911 	if (!device->mig_ops)
912 		return -ENOTTY;
913 
914 	ret = vfio_check_feature(flags, argsz,
915 				 VFIO_DEVICE_FEATURE_SET |
916 				 VFIO_DEVICE_FEATURE_GET,
917 				 sizeof(mig));
918 	if (ret != 1)
919 		return ret;
920 
921 	if (copy_from_user(&mig, arg, minsz))
922 		return -EFAULT;
923 
924 	if (flags & VFIO_DEVICE_FEATURE_GET) {
925 		enum vfio_device_mig_state curr_state;
926 
927 		ret = device->mig_ops->migration_get_state(device,
928 							   &curr_state);
929 		if (ret)
930 			return ret;
931 		mig.device_state = curr_state;
932 		goto out_copy;
933 	}
934 
935 	/* Handle the VFIO_DEVICE_FEATURE_SET */
936 	filp = device->mig_ops->migration_set_state(device, mig.device_state);
937 	if (IS_ERR(filp) || !filp)
938 		goto out_copy;
939 
940 	return vfio_ioct_mig_return_fd(filp, arg, &mig);
941 out_copy:
942 	mig.data_fd = -1;
943 	if (copy_to_user(arg, &mig, sizeof(mig)))
944 		return -EFAULT;
945 	if (IS_ERR(filp))
946 		return PTR_ERR(filp);
947 	return 0;
948 }
949 
950 static int
951 vfio_ioctl_device_feature_migration_data_size(struct vfio_device *device,
952 					      u32 flags, void __user *arg,
953 					      size_t argsz)
954 {
955 	struct vfio_device_feature_mig_data_size data_size = {};
956 	unsigned long stop_copy_length;
957 	int ret;
958 
959 	if (!device->mig_ops)
960 		return -ENOTTY;
961 
962 	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
963 				 sizeof(data_size));
964 	if (ret != 1)
965 		return ret;
966 
967 	ret = device->mig_ops->migration_get_data_size(device, &stop_copy_length);
968 	if (ret)
969 		return ret;
970 
971 	data_size.stop_copy_length = stop_copy_length;
972 	if (copy_to_user(arg, &data_size, sizeof(data_size)))
973 		return -EFAULT;
974 
975 	return 0;
976 }
977 
978 static int
979 vfio_ioctl_device_feature_migration_precopy_info_v2(struct vfio_device *device,
980 						    u32 flags, size_t argsz)
981 {
982 	int ret;
983 
984 	if (!(device->migration_flags & VFIO_MIGRATION_PRE_COPY))
985 		return -EINVAL;
986 
987 	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET, 0);
988 	if (ret != 1)
989 		return ret;
990 
991 	device->precopy_info_v2 = 1;
992 	return 0;
993 }
994 
995 static int vfio_ioctl_device_feature_migration(struct vfio_device *device,
996 					       u32 flags, void __user *arg,
997 					       size_t argsz)
998 {
999 	struct vfio_device_feature_migration mig = {
1000 		.flags = device->migration_flags,
1001 	};
1002 	int ret;
1003 
1004 	if (!device->mig_ops)
1005 		return -ENOTTY;
1006 
1007 	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
1008 				 sizeof(mig));
1009 	if (ret != 1)
1010 		return ret;
1011 	if (copy_to_user(arg, &mig, sizeof(mig)))
1012 		return -EFAULT;
1013 	return 0;
1014 }
1015 
1016 void vfio_combine_iova_ranges(struct rb_root_cached *root, u32 cur_nodes,
1017 			      u32 req_nodes)
1018 {
1019 	struct interval_tree_node *prev, *curr, *comb_start, *comb_end;
1020 	unsigned long min_gap, curr_gap;
1021 
1022 	/* Special shortcut when a single range is required */
1023 	if (req_nodes == 1) {
1024 		unsigned long last;
1025 
1026 		comb_start = interval_tree_iter_first(root, 0, ULONG_MAX);
1027 
1028 		/* Empty list */
1029 		if (WARN_ON_ONCE(!comb_start))
1030 			return;
1031 
1032 		curr = comb_start;
1033 		while (curr) {
1034 			last = curr->last;
1035 			prev = curr;
1036 			curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
1037 			if (prev != comb_start)
1038 				interval_tree_remove(prev, root);
1039 		}
1040 		comb_start->last = last;
1041 		return;
1042 	}
1043 
1044 	/* Combine ranges which have the smallest gap */
1045 	while (cur_nodes > req_nodes) {
1046 		prev = NULL;
1047 		min_gap = ULONG_MAX;
1048 		curr = interval_tree_iter_first(root, 0, ULONG_MAX);
1049 		while (curr) {
1050 			if (prev) {
1051 				curr_gap = curr->start - prev->last;
1052 				if (curr_gap < min_gap) {
1053 					min_gap = curr_gap;
1054 					comb_start = prev;
1055 					comb_end = curr;
1056 				}
1057 			}
1058 			prev = curr;
1059 			curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
1060 		}
1061 
1062 		/* Empty list or no nodes to combine */
1063 		if (WARN_ON_ONCE(min_gap == ULONG_MAX))
1064 			break;
1065 
1066 		comb_start->last = comb_end->last;
1067 		interval_tree_remove(comb_end, root);
1068 		cur_nodes--;
1069 	}
1070 }
1071 EXPORT_SYMBOL_GPL(vfio_combine_iova_ranges);
1072 
1073 /* Ranges should fit into a single kernel page */
1074 #define LOG_MAX_RANGES \
1075 	(PAGE_SIZE / sizeof(struct vfio_device_feature_dma_logging_range))
1076 
1077 static int
1078 vfio_ioctl_device_feature_logging_start(struct vfio_device *device,
1079 					u32 flags, void __user *arg,
1080 					size_t argsz)
1081 {
1082 	size_t minsz =
1083 		offsetofend(struct vfio_device_feature_dma_logging_control,
1084 			    ranges);
1085 	struct vfio_device_feature_dma_logging_range __user *ranges;
1086 	struct vfio_device_feature_dma_logging_control control;
1087 	struct vfio_device_feature_dma_logging_range range;
1088 	struct rb_root_cached root = RB_ROOT_CACHED;
1089 	struct interval_tree_node *nodes;
1090 	u64 iova_end;
1091 	u32 nnodes;
1092 	int i, ret;
1093 
1094 	if (!device->log_ops)
1095 		return -ENOTTY;
1096 
1097 	ret = vfio_check_feature(flags, argsz,
1098 				 VFIO_DEVICE_FEATURE_SET,
1099 				 sizeof(control));
1100 	if (ret != 1)
1101 		return ret;
1102 
1103 	if (copy_from_user(&control, arg, minsz))
1104 		return -EFAULT;
1105 
1106 	nnodes = control.num_ranges;
1107 	if (!nnodes)
1108 		return -EINVAL;
1109 
1110 	if (nnodes > LOG_MAX_RANGES)
1111 		return -E2BIG;
1112 
1113 	ranges = u64_to_user_ptr(control.ranges);
1114 	nodes = kmalloc_objs(struct interval_tree_node, nnodes);
1115 	if (!nodes)
1116 		return -ENOMEM;
1117 
1118 	for (i = 0; i < nnodes; i++) {
1119 		if (copy_from_user(&range, &ranges[i], sizeof(range))) {
1120 			ret = -EFAULT;
1121 			goto end;
1122 		}
1123 		if (!IS_ALIGNED(range.iova, control.page_size) ||
1124 		    !IS_ALIGNED(range.length, control.page_size)) {
1125 			ret = -EINVAL;
1126 			goto end;
1127 		}
1128 
1129 		if (check_add_overflow(range.iova, range.length, &iova_end) ||
1130 		    iova_end > ULONG_MAX) {
1131 			ret = -EOVERFLOW;
1132 			goto end;
1133 		}
1134 
1135 		nodes[i].start = range.iova;
1136 		nodes[i].last = range.iova + range.length - 1;
1137 		if (interval_tree_iter_first(&root, nodes[i].start,
1138 					     nodes[i].last)) {
1139 			/* Range overlapping */
1140 			ret = -EINVAL;
1141 			goto end;
1142 		}
1143 		interval_tree_insert(nodes + i, &root);
1144 	}
1145 
1146 	ret = device->log_ops->log_start(device, &root, nnodes,
1147 					 &control.page_size);
1148 	if (ret)
1149 		goto end;
1150 
1151 	if (copy_to_user(arg, &control, sizeof(control))) {
1152 		ret = -EFAULT;
1153 		device->log_ops->log_stop(device);
1154 	}
1155 
1156 end:
1157 	kfree(nodes);
1158 	return ret;
1159 }
1160 
1161 static int
1162 vfio_ioctl_device_feature_logging_stop(struct vfio_device *device,
1163 				       u32 flags, void __user *arg,
1164 				       size_t argsz)
1165 {
1166 	int ret;
1167 
1168 	if (!device->log_ops)
1169 		return -ENOTTY;
1170 
1171 	ret = vfio_check_feature(flags, argsz,
1172 				 VFIO_DEVICE_FEATURE_SET, 0);
1173 	if (ret != 1)
1174 		return ret;
1175 
1176 	return device->log_ops->log_stop(device);
1177 }
1178 
1179 static int vfio_device_log_read_and_clear(struct iova_bitmap *iter,
1180 					  unsigned long iova, size_t length,
1181 					  void *opaque)
1182 {
1183 	struct vfio_device *device = opaque;
1184 
1185 	return device->log_ops->log_read_and_clear(device, iova, length, iter);
1186 }
1187 
1188 static int
1189 vfio_ioctl_device_feature_logging_report(struct vfio_device *device,
1190 					 u32 flags, void __user *arg,
1191 					 size_t argsz)
1192 {
1193 	size_t minsz =
1194 		offsetofend(struct vfio_device_feature_dma_logging_report,
1195 			    bitmap);
1196 	struct vfio_device_feature_dma_logging_report report;
1197 	struct iova_bitmap *iter;
1198 	u64 iova_end;
1199 	int ret;
1200 
1201 	if (!device->log_ops)
1202 		return -ENOTTY;
1203 
1204 	ret = vfio_check_feature(flags, argsz,
1205 				 VFIO_DEVICE_FEATURE_GET,
1206 				 sizeof(report));
1207 	if (ret != 1)
1208 		return ret;
1209 
1210 	if (copy_from_user(&report, arg, minsz))
1211 		return -EFAULT;
1212 
1213 	if (report.page_size < SZ_4K || !is_power_of_2(report.page_size))
1214 		return -EINVAL;
1215 
1216 	if (check_add_overflow(report.iova, report.length, &iova_end) ||
1217 	    iova_end > ULONG_MAX)
1218 		return -EOVERFLOW;
1219 
1220 	iter = iova_bitmap_alloc(report.iova, report.length,
1221 				 report.page_size,
1222 				 u64_to_user_ptr(report.bitmap));
1223 	if (IS_ERR(iter))
1224 		return PTR_ERR(iter);
1225 
1226 	ret = iova_bitmap_for_each(iter, device,
1227 				   vfio_device_log_read_and_clear);
1228 
1229 	iova_bitmap_free(iter);
1230 	return ret;
1231 }
1232 
1233 static int vfio_ioctl_device_feature(struct vfio_device *device,
1234 				     struct vfio_device_feature __user *arg)
1235 {
1236 	size_t minsz = offsetofend(struct vfio_device_feature, flags);
1237 	struct vfio_device_feature feature;
1238 
1239 	if (copy_from_user(&feature, arg, minsz))
1240 		return -EFAULT;
1241 
1242 	if (feature.argsz < minsz)
1243 		return -EINVAL;
1244 
1245 	/* Check unknown flags */
1246 	if (feature.flags &
1247 	    ~(VFIO_DEVICE_FEATURE_MASK | VFIO_DEVICE_FEATURE_SET |
1248 	      VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_PROBE))
1249 		return -EINVAL;
1250 
1251 	/* GET & SET are mutually exclusive except with PROBE */
1252 	if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) &&
1253 	    (feature.flags & VFIO_DEVICE_FEATURE_SET) &&
1254 	    (feature.flags & VFIO_DEVICE_FEATURE_GET))
1255 		return -EINVAL;
1256 
1257 	switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) {
1258 	case VFIO_DEVICE_FEATURE_MIGRATION:
1259 		return vfio_ioctl_device_feature_migration(
1260 			device, feature.flags, arg->data,
1261 			feature.argsz - minsz);
1262 	case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE:
1263 		return vfio_ioctl_device_feature_mig_device_state(
1264 			device, feature.flags, arg->data,
1265 			feature.argsz - minsz);
1266 	case VFIO_DEVICE_FEATURE_DMA_LOGGING_START:
1267 		return vfio_ioctl_device_feature_logging_start(
1268 			device, feature.flags, arg->data,
1269 			feature.argsz - minsz);
1270 	case VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP:
1271 		return vfio_ioctl_device_feature_logging_stop(
1272 			device, feature.flags, arg->data,
1273 			feature.argsz - minsz);
1274 	case VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT:
1275 		return vfio_ioctl_device_feature_logging_report(
1276 			device, feature.flags, arg->data,
1277 			feature.argsz - minsz);
1278 	case VFIO_DEVICE_FEATURE_MIG_DATA_SIZE:
1279 		return vfio_ioctl_device_feature_migration_data_size(
1280 			device, feature.flags, arg->data,
1281 			feature.argsz - minsz);
1282 	case VFIO_DEVICE_FEATURE_MIG_PRECOPY_INFOv2:
1283 		return vfio_ioctl_device_feature_migration_precopy_info_v2(
1284 			device, feature.flags, feature.argsz - minsz);
1285 	default:
1286 		if (unlikely(!device->ops->device_feature))
1287 			return -ENOTTY;
1288 		return device->ops->device_feature(device, feature.flags,
1289 						   arg->data,
1290 						   feature.argsz - minsz);
1291 	}
1292 }
1293 
1294 static long vfio_get_region_info(struct vfio_device *device,
1295 				 struct vfio_region_info __user *arg)
1296 {
1297 	unsigned long minsz = offsetofend(struct vfio_region_info, offset);
1298 	struct vfio_region_info info = {};
1299 	struct vfio_info_cap caps = {};
1300 	int ret;
1301 
1302 	if (unlikely(!device->ops->get_region_info_caps))
1303 		return -EINVAL;
1304 
1305 	if (copy_from_user(&info, arg, minsz))
1306 		return -EFAULT;
1307 	if (info.argsz < minsz)
1308 		return -EINVAL;
1309 
1310 	ret = device->ops->get_region_info_caps(device, &info, &caps);
1311 	if (ret)
1312 		goto out_free;
1313 
1314 	if (caps.size) {
1315 		info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
1316 		if (info.argsz < sizeof(info) + caps.size) {
1317 			info.argsz = sizeof(info) + caps.size;
1318 			info.cap_offset = 0;
1319 		} else {
1320 			vfio_info_cap_shift(&caps, sizeof(info));
1321 			if (copy_to_user(arg + 1, caps.buf, caps.size)) {
1322 				ret = -EFAULT;
1323 				goto out_free;
1324 			}
1325 			info.cap_offset = sizeof(info);
1326 		}
1327 	}
1328 
1329 	if (copy_to_user(arg, &info, minsz)){
1330 		ret = -EFAULT;
1331 		goto out_free;
1332 	}
1333 
1334 out_free:
1335 	kfree(caps.buf);
1336 	return ret;
1337 }
1338 
1339 static long vfio_device_fops_unl_ioctl(struct file *filep,
1340 				       unsigned int cmd, unsigned long arg)
1341 {
1342 	struct vfio_device_file *df = filep->private_data;
1343 	struct vfio_device *device = df->device;
1344 	void __user *uptr = (void __user *)arg;
1345 	int ret;
1346 
1347 	if (cmd == VFIO_DEVICE_BIND_IOMMUFD)
1348 		return vfio_df_ioctl_bind_iommufd(df, uptr);
1349 
1350 	/* Paired with smp_store_release() following vfio_df_open() */
1351 	if (!smp_load_acquire(&df->access_granted))
1352 		return -EINVAL;
1353 
1354 	ret = vfio_device_pm_runtime_get(device);
1355 	if (ret)
1356 		return ret;
1357 
1358 	/* cdev only ioctls */
1359 	if (IS_ENABLED(CONFIG_VFIO_DEVICE_CDEV) && !df->group) {
1360 		switch (cmd) {
1361 		case VFIO_DEVICE_ATTACH_IOMMUFD_PT:
1362 			ret = vfio_df_ioctl_attach_pt(df, uptr);
1363 			goto out;
1364 
1365 		case VFIO_DEVICE_DETACH_IOMMUFD_PT:
1366 			ret = vfio_df_ioctl_detach_pt(df, uptr);
1367 			goto out;
1368 		}
1369 	}
1370 
1371 	switch (cmd) {
1372 	case VFIO_DEVICE_FEATURE:
1373 		ret = vfio_ioctl_device_feature(device, uptr);
1374 		break;
1375 
1376 	case VFIO_DEVICE_GET_REGION_INFO:
1377 		ret = vfio_get_region_info(device, uptr);
1378 		break;
1379 
1380 	default:
1381 		if (unlikely(!device->ops->ioctl))
1382 			ret = -EINVAL;
1383 		else
1384 			ret = device->ops->ioctl(device, cmd, arg);
1385 		break;
1386 	}
1387 out:
1388 	vfio_device_pm_runtime_put(device);
1389 	return ret;
1390 }
1391 
1392 static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
1393 				     size_t count, loff_t *ppos)
1394 {
1395 	struct vfio_device_file *df = filep->private_data;
1396 	struct vfio_device *device = df->device;
1397 
1398 	/* Paired with smp_store_release() following vfio_df_open() */
1399 	if (!smp_load_acquire(&df->access_granted))
1400 		return -EINVAL;
1401 
1402 	if (unlikely(!device->ops->read))
1403 		return -EINVAL;
1404 
1405 	return device->ops->read(device, buf, count, ppos);
1406 }
1407 
1408 static ssize_t vfio_device_fops_write(struct file *filep,
1409 				      const char __user *buf,
1410 				      size_t count, loff_t *ppos)
1411 {
1412 	struct vfio_device_file *df = filep->private_data;
1413 	struct vfio_device *device = df->device;
1414 
1415 	/* Paired with smp_store_release() following vfio_df_open() */
1416 	if (!smp_load_acquire(&df->access_granted))
1417 		return -EINVAL;
1418 
1419 	if (unlikely(!device->ops->write))
1420 		return -EINVAL;
1421 
1422 	return device->ops->write(device, buf, count, ppos);
1423 }
1424 
1425 static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1426 {
1427 	struct vfio_device_file *df = filep->private_data;
1428 	struct vfio_device *device = df->device;
1429 
1430 	/* Paired with smp_store_release() following vfio_df_open() */
1431 	if (!smp_load_acquire(&df->access_granted))
1432 		return -EINVAL;
1433 
1434 	if (unlikely(!device->ops->mmap))
1435 		return -EINVAL;
1436 
1437 	return device->ops->mmap(device, vma);
1438 }
1439 
#ifdef CONFIG_PROC_FS
/*
 * show_fdinfo hook of vfio_device_fops: prints a "vfio-device-syspath:" line
 * built from the kobject path of the underlying device.  Nothing is printed
 * when the path cannot be obtained.
 */
static void vfio_device_show_fdinfo(struct seq_file *m, struct file *filep)
{
	struct vfio_device_file *df = filep->private_data;
	struct vfio_device *vdev = df->device;
	char *syspath = kobject_get_path(&vdev->dev->kobj, GFP_KERNEL);

	if (syspath) {
		seq_printf(m, "vfio-device-syspath: /sys%s\n", syspath);
		kfree(syspath);
	}
}
#endif
1455 
1456 const struct file_operations vfio_device_fops = {
1457 	.owner		= THIS_MODULE,
1458 	.open		= vfio_device_fops_cdev_open,
1459 	.release	= vfio_device_fops_release,
1460 	.read		= vfio_device_fops_read,
1461 	.write		= vfio_device_fops_write,
1462 	.unlocked_ioctl	= vfio_device_fops_unl_ioctl,
1463 	.compat_ioctl	= compat_ptr_ioctl,
1464 	.mmap		= vfio_device_fops_mmap,
1465 #ifdef CONFIG_PROC_FS
1466 	.show_fdinfo	= vfio_device_show_fdinfo,
1467 #endif
1468 };
1469 
1470 static struct vfio_device *vfio_device_from_file(struct file *file)
1471 {
1472 	struct vfio_device_file *df = file->private_data;
1473 
1474 	if (file->f_op != &vfio_device_fops)
1475 		return NULL;
1476 	return df->device;
1477 }
1478 
1479 /**
1480  * vfio_file_is_valid - True if the file is valid vfio file
1481  * @file: VFIO group file or VFIO device file
1482  */
1483 bool vfio_file_is_valid(struct file *file)
1484 {
1485 	return vfio_group_from_file(file) ||
1486 	       vfio_device_from_file(file);
1487 }
1488 EXPORT_SYMBOL_GPL(vfio_file_is_valid);
1489 
1490 /**
1491  * vfio_file_enforced_coherent - True if the DMA associated with the VFIO file
1492  *        is always CPU cache coherent
1493  * @file: VFIO group file or VFIO device file
1494  *
1495  * Enforced coherency means that the IOMMU ignores things like the PCIe no-snoop
1496  * bit in DMA transactions. A return of false indicates that the user has
1497  * rights to access additional instructions such as wbinvd on x86.
1498  */
1499 bool vfio_file_enforced_coherent(struct file *file)
1500 {
1501 	struct vfio_device *device;
1502 	struct vfio_group *group;
1503 
1504 	group = vfio_group_from_file(file);
1505 	if (group)
1506 		return vfio_group_enforced_coherent(group);
1507 
1508 	device = vfio_device_from_file(file);
1509 	if (device)
1510 		return device_iommu_capable(device->dev,
1511 					    IOMMU_CAP_ENFORCE_CACHE_COHERENCY);
1512 
1513 	return true;
1514 }
1515 EXPORT_SYMBOL_GPL(vfio_file_enforced_coherent);
1516 
1517 static void vfio_device_file_set_kvm(struct file *file, struct kvm *kvm)
1518 {
1519 	struct vfio_device_file *df = file->private_data;
1520 
1521 	/*
1522 	 * The kvm is first recorded in the vfio_device_file, and will
1523 	 * be propagated to vfio_device::kvm when the file is bound to
1524 	 * iommufd successfully in the vfio device cdev path.
1525 	 */
1526 	spin_lock(&df->kvm_ref_lock);
1527 	df->kvm = kvm;
1528 	spin_unlock(&df->kvm_ref_lock);
1529 }
1530 
1531 /**
1532  * vfio_file_set_kvm - Link a kvm with VFIO drivers
1533  * @file: VFIO group file or VFIO device file
1534  * @kvm: KVM to link
1535  *
1536  * When a VFIO device is first opened the KVM will be available in
1537  * device->kvm if one was associated with the file.
1538  */
1539 void vfio_file_set_kvm(struct file *file, struct kvm *kvm)
1540 {
1541 	struct vfio_group *group;
1542 
1543 	group = vfio_group_from_file(file);
1544 	if (group)
1545 		vfio_group_set_kvm(group, kvm);
1546 
1547 	if (vfio_device_from_file(file))
1548 		vfio_device_file_set_kvm(file, kvm);
1549 }
1550 EXPORT_SYMBOL_GPL(vfio_file_set_kvm);
1551 
1552 /*
1553  * Sub-module support
1554  */
1555 /*
1556  * Helper for managing a buffer of info chain capabilities, allocate or
1557  * reallocate a buffer with additional @size, filling in @id and @version
1558  * of the capability.  A pointer to the new capability is returned.
1559  *
1560  * NB. The chain is based at the head of the buffer, so new entries are
1561  * added to the tail, vfio_info_cap_shift() should be called to fixup the
1562  * next offsets prior to copying to the user buffer.
1563  */
1564 struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
1565 					       size_t size, u16 id, u16 version)
1566 {
1567 	void *buf;
1568 	struct vfio_info_cap_header *header, *tmp;
1569 
1570 	/* Ensure that the next capability struct will be aligned */
1571 	size = ALIGN(size, sizeof(u64));
1572 
1573 	buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
1574 	if (!buf) {
1575 		kfree(caps->buf);
1576 		caps->buf = NULL;
1577 		caps->size = 0;
1578 		return ERR_PTR(-ENOMEM);
1579 	}
1580 
1581 	caps->buf = buf;
1582 	header = buf + caps->size;
1583 
1584 	/* Eventually copied to user buffer, zero */
1585 	memset(header, 0, size);
1586 
1587 	header->id = id;
1588 	header->version = version;
1589 
1590 	/* Add to the end of the capability chain */
1591 	for (tmp = buf; tmp->next; tmp = buf + tmp->next)
1592 		; /* nothing */
1593 
1594 	tmp->next = caps->size;
1595 	caps->size += size;
1596 
1597 	return header;
1598 }
1599 EXPORT_SYMBOL_GPL(vfio_info_cap_add);
1600 
1601 void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
1602 {
1603 	struct vfio_info_cap_header *tmp;
1604 	void *buf = (void *)caps->buf;
1605 
1606 	/* Capability structs should start with proper alignment */
1607 	WARN_ON(!IS_ALIGNED(offset, sizeof(u64)));
1608 
1609 	for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
1610 		tmp->next += offset;
1611 }
1612 EXPORT_SYMBOL(vfio_info_cap_shift);
1613 
1614 int vfio_info_add_capability(struct vfio_info_cap *caps,
1615 			     struct vfio_info_cap_header *cap, size_t size)
1616 {
1617 	struct vfio_info_cap_header *header;
1618 
1619 	header = vfio_info_cap_add(caps, size, cap->id, cap->version);
1620 	if (IS_ERR(header))
1621 		return PTR_ERR(header);
1622 
1623 	memcpy(header + 1, cap + 1, size - sizeof(*header));
1624 
1625 	return 0;
1626 }
1627 EXPORT_SYMBOL(vfio_info_add_capability);
1628 
1629 int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
1630 				       int max_irq_type, size_t *data_size)
1631 {
1632 	unsigned long minsz;
1633 	size_t size;
1634 
1635 	minsz = offsetofend(struct vfio_irq_set, count);
1636 
1637 	if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
1638 	    (hdr->count >= (U32_MAX - hdr->start)) ||
1639 	    (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
1640 				VFIO_IRQ_SET_ACTION_TYPE_MASK)))
1641 		return -EINVAL;
1642 
1643 	if (data_size)
1644 		*data_size = 0;
1645 
1646 	if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
1647 		return -EINVAL;
1648 
1649 	switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
1650 	case VFIO_IRQ_SET_DATA_NONE:
1651 		size = 0;
1652 		break;
1653 	case VFIO_IRQ_SET_DATA_BOOL:
1654 		size = sizeof(uint8_t);
1655 		break;
1656 	case VFIO_IRQ_SET_DATA_EVENTFD:
1657 		size = sizeof(int32_t);
1658 		break;
1659 	default:
1660 		return -EINVAL;
1661 	}
1662 
1663 	if (size) {
1664 		if (hdr->argsz - minsz < hdr->count * size)
1665 			return -EINVAL;
1666 
1667 		if (!data_size)
1668 			return -EINVAL;
1669 
1670 		*data_size = hdr->count * size;
1671 	}
1672 
1673 	return 0;
1674 }
1675 EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
1676 
1677 /*
1678  * Pin contiguous user pages and return their associated host pages for local
1679  * domain only.
1680  * @device [in]  : device
1681  * @iova [in]    : starting IOVA of user pages to be pinned.
1682  * @npage [in]   : count of pages to be pinned.  This count should not
1683  *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1684  * @prot [in]    : protection flags
1685  * @pages[out]   : array of host pages
1686  * Return error or number of pages pinned.
1687  *
1688  * A driver may only call this function if the vfio_device was created
1689  * by vfio_register_emulated_iommu_dev() due to vfio_device_container_pin_pages().
1690  */
1691 int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova,
1692 		   int npage, int prot, struct page **pages)
1693 {
1694 	/* group->container cannot change while a vfio device is open */
1695 	if (!pages || !npage || WARN_ON(!vfio_assert_device_open(device)))
1696 		return -EINVAL;
1697 	if (!device->ops->dma_unmap)
1698 		return -EINVAL;
1699 	if (vfio_device_has_container(device))
1700 		return vfio_device_container_pin_pages(device, iova,
1701 						       npage, prot, pages);
1702 	if (device->iommufd_access) {
1703 		int ret;
1704 
1705 		if (iova > ULONG_MAX)
1706 			return -EINVAL;
1707 		/*
1708 		 * VFIO ignores the sub page offset, npages is from the start of
1709 		 * a PAGE_SIZE chunk of IOVA. The caller is expected to recover
1710 		 * the sub page offset by doing:
1711 		 *     pages[0] + (iova % PAGE_SIZE)
1712 		 */
1713 		ret = iommufd_access_pin_pages(
1714 			device->iommufd_access, ALIGN_DOWN(iova, PAGE_SIZE),
1715 			npage * PAGE_SIZE, pages,
1716 			(prot & IOMMU_WRITE) ? IOMMUFD_ACCESS_RW_WRITE : 0);
1717 		if (ret)
1718 			return ret;
1719 		return npage;
1720 	}
1721 	return -EINVAL;
1722 }
1723 EXPORT_SYMBOL(vfio_pin_pages);
1724 
1725 /*
1726  * Unpin contiguous host pages for local domain only.
1727  * @device [in]  : device
1728  * @iova [in]    : starting address of user pages to be unpinned.
1729  * @npage [in]   : count of pages to be unpinned.  This count should not
1730  *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1731  */
1732 void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage)
1733 {
1734 	if (WARN_ON(!vfio_assert_device_open(device)))
1735 		return;
1736 	if (WARN_ON(!device->ops->dma_unmap))
1737 		return;
1738 
1739 	if (vfio_device_has_container(device)) {
1740 		vfio_device_container_unpin_pages(device, iova, npage);
1741 		return;
1742 	}
1743 	if (device->iommufd_access) {
1744 		if (WARN_ON(iova > ULONG_MAX))
1745 			return;
1746 		iommufd_access_unpin_pages(device->iommufd_access,
1747 					   ALIGN_DOWN(iova, PAGE_SIZE),
1748 					   npage * PAGE_SIZE);
1749 		return;
1750 	}
1751 }
1752 EXPORT_SYMBOL(vfio_unpin_pages);
1753 
1754 /*
1755  * This interface allows the CPUs to perform some sort of virtual DMA on
1756  * behalf of the device.
1757  *
1758  * CPUs read/write from/into a range of IOVAs pointing to user space memory
1759  * into/from a kernel buffer.
1760  *
1761  * As the read/write of user space memory is conducted via the CPUs and is
1762  * not a real device DMA, it is not necessary to pin the user space memory.
1763  *
1764  * @device [in]		: VFIO device
1765  * @iova [in]		: base IOVA of a user space buffer
1766  * @data [in]		: pointer to kernel buffer
1767  * @len [in]		: kernel buffer length
1768  * @write		: indicate read or write
1769  * Return error code on failure or 0 on success.
1770  */
1771 int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data,
1772 		size_t len, bool write)
1773 {
1774 	if (!data || len <= 0 || !vfio_assert_device_open(device))
1775 		return -EINVAL;
1776 
1777 	if (vfio_device_has_container(device))
1778 		return vfio_device_container_dma_rw(device, iova,
1779 						    data, len, write);
1780 
1781 	if (device->iommufd_access) {
1782 		unsigned int flags = 0;
1783 
1784 		if (iova > ULONG_MAX)
1785 			return -EINVAL;
1786 
1787 		/* VFIO historically tries to auto-detect a kthread */
1788 		if (!current->mm)
1789 			flags |= IOMMUFD_ACCESS_RW_KTHREAD;
1790 		if (write)
1791 			flags |= IOMMUFD_ACCESS_RW_WRITE;
1792 		return iommufd_access_rw(device->iommufd_access, iova, data,
1793 					 len, flags);
1794 	}
1795 	return -EINVAL;
1796 }
1797 EXPORT_SYMBOL(vfio_dma_rw);
1798 
1799 /*
1800  * Module/class support
1801  */
1802 static int __init vfio_init(void)
1803 {
1804 	int ret;
1805 
1806 	ida_init(&vfio.device_ida);
1807 
1808 	ret = vfio_group_init();
1809 	if (ret)
1810 		return ret;
1811 
1812 	ret = vfio_virqfd_init();
1813 	if (ret)
1814 		goto err_virqfd;
1815 
1816 	/* /sys/class/vfio-dev/vfioX */
1817 	ret = class_register(&vfio_device_class);
1818 	if (ret)
1819 		goto err_dev_class;
1820 
1821 	ret = vfio_cdev_init();
1822 	if (ret)
1823 		goto err_alloc_dev_chrdev;
1824 
1825 	vfio_debugfs_create_root();
1826 	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
1827 	return 0;
1828 
1829 err_alloc_dev_chrdev:
1830 	class_unregister(&vfio_device_class);
1831 err_dev_class:
1832 	vfio_virqfd_exit();
1833 err_virqfd:
1834 	vfio_group_cleanup();
1835 	return ret;
1836 }
1837 
/*
 * Module exit.  Removes the debugfs root, destroys the device IDA, then tears
 * down cdev support, the device class, virqfd and group support, i.e. the
 * pieces vfio_init() set up, in the opposite order.  Finally destroys the
 * vfio_device_set_xa xarray.
 */
static void __exit vfio_cleanup(void)
{
	vfio_debugfs_remove_root();
	ida_destroy(&vfio.device_ida);
	vfio_cdev_cleanup();
	class_unregister(&vfio_device_class);
	vfio_virqfd_exit();
	vfio_group_cleanup();
	xa_destroy(&vfio_device_set_xa);
}
1848 
1849 module_init(vfio_init);
1850 module_exit(vfio_cleanup);
1851 
1852 MODULE_IMPORT_NS("IOMMUFD");
1853 MODULE_VERSION(DRIVER_VERSION);
1854 MODULE_LICENSE("GPL v2");
1855 MODULE_AUTHOR(DRIVER_AUTHOR);
1856 MODULE_DESCRIPTION(DRIVER_DESC);
1857 MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");
1858