// SPDX-License-Identifier: GPL-2.0-only
/*
 * VFIO core
 *
 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * Derived from original vfio:
 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
 * Author: Tom Lyon, pugs@cisco.com
 */

#include <linux/cdev.h>
#include <linux/compat.h>
#include <linux/device.h>
#include <linux/fs.h>
#include <linux/idr.h>
#include <linux/iommu.h>
#if IS_ENABLED(CONFIG_KVM)
#include <linux/kvm_host.h>
#endif
#include <linux/list.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/mount.h>
#include <linux/mutex.h>
#include <linux/pci.h>
#include <linux/pseudo_fs.h>
#include <linux/rwsem.h>
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/wait.h>
#include <linux/sched/signal.h>
#include <linux/pm_runtime.h>
#include <linux/interval_tree.h>
#include <linux/iova_bitmap.h>
#include <linux/iommufd.h>
#include "vfio.h"

#define DRIVER_VERSION	"0.3"
#define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC	"VFIO - User Level meta-driver"

#define VFIO_MAGIC 0x5646494f /* "VFIO" */

static struct vfio {
	struct ida			device_ida;
	struct vfsmount			*vfs_mount;
	int				fs_count;
} vfio;

#ifdef CONFIG_VFIO_NOIOMMU
bool vfio_noiommu __read_mostly;
module_param_named(enable_unsafe_noiommu_mode,
		   vfio_noiommu, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel.  If you do not know what this is for, step away. (default: false)");
#endif

static DEFINE_XARRAY(vfio_device_set_xa);

static char *vfio_device_devnode(const struct device *dev, umode_t *mode)
{
	return kasprintf(GFP_KERNEL, "vfio/devices/%s", dev_name(dev));
}

static const struct class vfio_device_class = {
	.name		= "vfio-dev",
	.devnode	= vfio_device_devnode
};

int vfio_assign_device_set(struct vfio_device *device, void *set_id)
{
	unsigned long idx = (unsigned long)set_id;
	struct vfio_device_set *new_dev_set;
	struct vfio_device_set *dev_set;

	if (WARN_ON(!set_id))
		return -EINVAL;

	/*
	 * Atomically acquire a singleton object in the xarray for this set_id
	 */
	xa_lock(&vfio_device_set_xa);
	dev_set = xa_load(&vfio_device_set_xa, idx);
	if (dev_set)
		goto found_get_ref;
	xa_unlock(&vfio_device_set_xa);
	new_dev_set = kzalloc(sizeof(*new_dev_set), GFP_KERNEL);
	if (!new_dev_set)
		return -ENOMEM;
	mutex_init(&new_dev_set->lock);
	INIT_LIST_HEAD(&new_dev_set->device_list);
	new_dev_set->set_id = set_id;

	xa_lock(&vfio_device_set_xa);
	dev_set = __xa_cmpxchg(&vfio_device_set_xa, idx, NULL, new_dev_set,
			       GFP_KERNEL);
	if (!dev_set) {
		dev_set = new_dev_set;
		goto found_get_ref;
	}

	kfree(new_dev_set);
	if (xa_is_err(dev_set)) {
		xa_unlock(&vfio_device_set_xa);
		return xa_err(dev_set);
	}

found_get_ref:
	dev_set->device_count++;
	xa_unlock(&vfio_device_set_xa);
	mutex_lock(&dev_set->lock);
	device->dev_set = dev_set;
	list_add_tail(&device->dev_set_list, &dev_set->device_list);
	mutex_unlock(&dev_set->lock);
	return 0;
}
EXPORT_SYMBOL_GPL(vfio_assign_device_set);
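
/*
 * Usage sketch (hypothetical driver, not part of this file): a driver whose
 * devices must be handled as one unit, e.g. reset together, can key the set
 * by a shared object such as the PCI slot from its ->init() callback:
 *
 *	static int my_init(struct vfio_device *vdev)
 *	{
 *		struct pci_dev *pdev = to_pci_dev(vdev->dev);
 *
 *		return vfio_assign_device_set(vdev, pdev->slot);
 *	}
 *
 * Devices passing the same set_id pointer share one vfio_device_set and
 * therefore one dev_set->lock.
 */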

static void vfio_release_device_set(struct vfio_device *device)
{
	struct vfio_device_set *dev_set = device->dev_set;

	if (!dev_set)
		return;

	mutex_lock(&dev_set->lock);
	list_del(&device->dev_set_list);
	mutex_unlock(&dev_set->lock);

	xa_lock(&vfio_device_set_xa);
	if (!--dev_set->device_count) {
		__xa_erase(&vfio_device_set_xa,
			   (unsigned long)dev_set->set_id);
		mutex_destroy(&dev_set->lock);
		kfree(dev_set);
	}
	xa_unlock(&vfio_device_set_xa);
}

unsigned int vfio_device_set_open_count(struct vfio_device_set *dev_set)
{
	struct vfio_device *cur;
	unsigned int open_count = 0;

	lockdep_assert_held(&dev_set->lock);

	list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
		open_count += cur->open_count;
	return open_count;
}
EXPORT_SYMBOL_GPL(vfio_device_set_open_count);
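
/*
 * Usage sketch (hedged): a driver can consult the open count under
 * dev_set->lock before acting on the whole set, e.g. only resetting the set
 * when exactly one device in it is held open:
 *
 *	mutex_lock(&vdev->dev_set->lock);
 *	if (vfio_device_set_open_count(vdev->dev_set) == 1)
 *		my_reset_device_set(vdev);	// hypothetical helper
 *	mutex_unlock(&vdev->dev_set->lock);
 */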

struct vfio_device *
vfio_find_device_in_devset(struct vfio_device_set *dev_set,
			   struct device *dev)
{
	struct vfio_device *cur;

	lockdep_assert_held(&dev_set->lock);

	list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
		if (cur->dev == dev)
			return cur;
	return NULL;
}
EXPORT_SYMBOL_GPL(vfio_find_device_in_devset);

/*
 * Device objects - create, release, get, put, search
 */
/* Device reference always implies a group reference */
void vfio_device_put_registration(struct vfio_device *device)
{
	if (refcount_dec_and_test(&device->refcount))
		complete(&device->comp);
}
EXPORT_SYMBOL_GPL(vfio_device_put_registration);

bool vfio_device_try_get_registration(struct vfio_device *device)
{
	return refcount_inc_not_zero(&device->refcount);
}
EXPORT_SYMBOL_GPL(vfio_device_try_get_registration);

/*
 * VFIO driver API
 */
/* Release helper called by vfio_put_device() */
static void vfio_device_release(struct device *dev)
{
	struct vfio_device *device =
			container_of(dev, struct vfio_device, device);

	vfio_release_device_set(device);
	ida_free(&vfio.device_ida, device->index);

	if (device->ops->release)
		device->ops->release(device);

	iput(device->inode);
	simple_release_fs(&vfio.vfs_mount, &vfio.fs_count);
	kvfree(device);
}

static int vfio_init_device(struct vfio_device *device, struct device *dev,
			    const struct vfio_device_ops *ops);

/*
 * Allocate and initialize vfio_device so it can be registered to vfio
 * core.
 *
 * Drivers should use the wrapper vfio_alloc_device() for allocation.
 * @size is the size of the structure to be allocated, including any
 * private data used by the driver.
 *
 * The driver may provide an @init callback to cover device private data.
 *
 * Use vfio_put_device() to release the structure after a successful return.
 */
struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev,
				       const struct vfio_device_ops *ops)
{
	struct vfio_device *device;
	int ret;

	if (WARN_ON(size < sizeof(struct vfio_device)))
		return ERR_PTR(-EINVAL);

	device = kvzalloc(size, GFP_KERNEL);
	if (!device)
		return ERR_PTR(-ENOMEM);

	ret = vfio_init_device(device, dev, ops);
	if (ret)
		goto out_free;
	return device;

out_free:
	kvfree(device);
	return ERR_PTR(ret);
}
EXPORT_SYMBOL_GPL(_vfio_alloc_device);
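
/*
 * Usage sketch for the vfio_alloc_device() wrapper from <linux/vfio.h>
 * (driver names here are hypothetical):
 *
 *	struct my_device {
 *		struct vfio_device vdev;
 *		void __iomem *bar0;	// driver private data
 *	};
 *
 *	struct my_device *my;
 *
 *	my = vfio_alloc_device(my_device, vdev, dev, &my_ops);
 *	if (IS_ERR(my))
 *		return PTR_ERR(my);
 *
 * The matching release is vfio_put_device(&my->vdev).
 */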

static int vfio_fs_init_fs_context(struct fs_context *fc)
{
	return init_pseudo(fc, VFIO_MAGIC) ? 0 : -ENOMEM;
}

static struct file_system_type vfio_fs_type = {
	.name = "vfio",
	.owner = THIS_MODULE,
	.init_fs_context = vfio_fs_init_fs_context,
	.kill_sb = kill_anon_super,
};

static struct inode *vfio_fs_inode_new(void)
{
	struct inode *inode;
	int ret;

	ret = simple_pin_fs(&vfio_fs_type, &vfio.vfs_mount, &vfio.fs_count);
	if (ret)
		return ERR_PTR(ret);

	inode = alloc_anon_inode(vfio.vfs_mount->mnt_sb);
	if (IS_ERR(inode))
		simple_release_fs(&vfio.vfs_mount, &vfio.fs_count);

	return inode;
}

/*
 * Initialize a vfio_device so it can be registered to vfio core.
 */
static int vfio_init_device(struct vfio_device *device, struct device *dev,
			    const struct vfio_device_ops *ops)
{
	int ret;

	ret = ida_alloc_max(&vfio.device_ida, MINORMASK, GFP_KERNEL);
	if (ret < 0) {
		dev_dbg(dev, "Failed to allocate index\n");
		return ret;
	}

	device->index = ret;
	init_completion(&device->comp);
	device->dev = dev;
	device->ops = ops;
	device->inode = vfio_fs_inode_new();
	if (IS_ERR(device->inode)) {
		ret = PTR_ERR(device->inode);
		goto out_inode;
	}

	if (ops->init) {
		ret = ops->init(device);
		if (ret)
			goto out_uninit;
	}

	device_initialize(&device->device);
	device->device.release = vfio_device_release;
	device->device.class = &vfio_device_class;
	device->device.parent = device->dev;
	return 0;

out_uninit:
	iput(device->inode);
	simple_release_fs(&vfio.vfs_mount, &vfio.fs_count);
out_inode:
	vfio_release_device_set(device);
	ida_free(&vfio.device_ida, device->index);
	return ret;
}

static int __vfio_register_dev(struct vfio_device *device,
			       enum vfio_group_type type)
{
	int ret;

	if (WARN_ON(IS_ENABLED(CONFIG_IOMMUFD) &&
		    (!device->ops->bind_iommufd ||
		     !device->ops->unbind_iommufd ||
		     !device->ops->attach_ioas ||
		     !device->ops->detach_ioas)))
		return -EINVAL;

	/*
	 * If the driver doesn't specify a set then the device is added to a
	 * singleton set just for itself.
	 */
	if (!device->dev_set)
		vfio_assign_device_set(device, device);

	ret = dev_set_name(&device->device, "vfio%d", device->index);
	if (ret)
		return ret;

	ret = vfio_device_set_group(device, type);
	if (ret)
		return ret;

	/*
	 * VFIO always sets IOMMU_CACHE because we offer no way for userspace to
	 * restore cache coherency. It has to be checked here because it is only
	 * valid for cases where we are using iommu groups.
	 */
	if (type == VFIO_IOMMU && !vfio_device_is_noiommu(device) &&
	    !device_iommu_capable(device->dev, IOMMU_CAP_CACHE_COHERENCY)) {
		ret = -EINVAL;
		goto err_out;
	}

	ret = vfio_device_add(device);
	if (ret)
		goto err_out;

	/* Refcounting can't start until the driver calls register */
	refcount_set(&device->refcount, 1);

	vfio_device_group_register(device);
	vfio_device_debugfs_init(device);

	return 0;
err_out:
	vfio_device_remove_group(device);
	return ret;
}

int vfio_register_group_dev(struct vfio_device *device)
{
	return __vfio_register_dev(device, VFIO_IOMMU);
}
EXPORT_SYMBOL_GPL(vfio_register_group_dev);

/*
 * Register a virtual device without IOMMU backing.  The user of this
 * device must not be able to directly trigger unmediated DMA.
 */
int vfio_register_emulated_iommu_dev(struct vfio_device *device)
{
	return __vfio_register_dev(device, VFIO_EMULATED_IOMMU);
}
EXPORT_SYMBOL_GPL(vfio_register_emulated_iommu_dev);

/*
 * Decrement the device reference count and wait for the device to be
 * removed.  Open file descriptors for the device hold a registration
 * reference, so this call blocks until all of them have been released.
 */
void vfio_unregister_group_dev(struct vfio_device *device)
{
	unsigned int i = 0;
	bool interrupted = false;
	long rc;

	/*
	 * Prevent new device fds from being opened by userspace via
	 * VFIO_GROUP_GET_DEVICE_FD in the group path.
	 */
	vfio_device_group_unregister(device);

	/*
	 * Balances vfio_device_add() in the register path, and also prevents
	 * the device from being opened by userspace via the cdev path.
	 */
	vfio_device_del(device);

	vfio_device_put_registration(device);
	rc = try_wait_for_completion(&device->comp);
	while (rc <= 0) {
		if (device->ops->request)
			device->ops->request(device, i++);

		if (interrupted) {
			rc = wait_for_completion_timeout(&device->comp,
							 HZ * 10);
		} else {
			rc = wait_for_completion_interruptible_timeout(
				&device->comp, HZ * 10);
			if (rc < 0) {
				interrupted = true;
				dev_warn(device->dev,
					 "Device is currently in use, task"
					 " \"%s\" (%d) "
					 "blocked until device is released",
					 current->comm, task_pid_nr(current));
			}
		}
	}

	vfio_device_debugfs_exit(device);
	/* Balances vfio_device_set_group in register path */
	vfio_device_remove_group(device);
}
EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);
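
/*
 * Lifecycle sketch (hypothetical PCI driver): registration in probe is
 * balanced by unregistration in remove, and both error and teardown paths
 * drop the allocation reference with vfio_put_device():
 *
 *	static int my_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 *	{
 *		struct my_device *my;
 *		int ret;
 *
 *		my = vfio_alloc_device(my_device, vdev, &pdev->dev, &my_ops);
 *		if (IS_ERR(my))
 *			return PTR_ERR(my);
 *
 *		ret = vfio_register_group_dev(&my->vdev);
 *		if (ret) {
 *			vfio_put_device(&my->vdev);
 *			return ret;
 *		}
 *		pci_set_drvdata(pdev, my);
 *		return 0;
 *	}
 *
 *	static void my_remove(struct pci_dev *pdev)
 *	{
 *		struct my_device *my = pci_get_drvdata(pdev);
 *
 *		vfio_unregister_group_dev(&my->vdev);
 *		vfio_put_device(&my->vdev);
 *	}
 */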

#if IS_ENABLED(CONFIG_KVM)
void vfio_device_get_kvm_safe(struct vfio_device *device, struct kvm *kvm)
{
	void (*pfn)(struct kvm *kvm);
	bool (*fn)(struct kvm *kvm);
	bool ret;

	lockdep_assert_held(&device->dev_set->lock);

	if (!kvm)
		return;

	pfn = symbol_get(kvm_put_kvm);
	if (WARN_ON(!pfn))
		return;

	fn = symbol_get(kvm_get_kvm_safe);
	if (WARN_ON(!fn)) {
		symbol_put(kvm_put_kvm);
		return;
	}

	ret = fn(kvm);
	symbol_put(kvm_get_kvm_safe);
	if (!ret) {
		symbol_put(kvm_put_kvm);
		return;
	}

	device->put_kvm = pfn;
	device->kvm = kvm;
}

void vfio_device_put_kvm(struct vfio_device *device)
{
	lockdep_assert_held(&device->dev_set->lock);

	if (!device->kvm)
		return;

	if (WARN_ON(!device->put_kvm))
		goto clear;

	device->put_kvm(device->kvm);
	device->put_kvm = NULL;
	symbol_put(kvm_put_kvm);

clear:
	device->kvm = NULL;
}
#endif

/* true if the vfio_device has open_device() called but not close_device() */
static bool vfio_assert_device_open(struct vfio_device *device)
{
	return !WARN_ON_ONCE(!READ_ONCE(device->open_count));
}

struct vfio_device_file *
vfio_allocate_device_file(struct vfio_device *device)
{
	struct vfio_device_file *df;

	df = kzalloc(sizeof(*df), GFP_KERNEL_ACCOUNT);
	if (!df)
		return ERR_PTR(-ENOMEM);

	df->device = device;
	spin_lock_init(&df->kvm_ref_lock);

	return df;
}

static int vfio_df_device_first_open(struct vfio_device_file *df)
{
	struct vfio_device *device = df->device;
	struct iommufd_ctx *iommufd = df->iommufd;
	int ret;

	lockdep_assert_held(&device->dev_set->lock);

	if (!try_module_get(device->dev->driver->owner))
		return -ENODEV;

	if (iommufd)
		ret = vfio_df_iommufd_bind(df);
	else
		ret = vfio_device_group_use_iommu(device);
	if (ret)
		goto err_module_put;

	if (device->ops->open_device) {
		ret = device->ops->open_device(device);
		if (ret)
			goto err_unuse_iommu;
	}
	return 0;

err_unuse_iommu:
	if (iommufd)
		vfio_df_iommufd_unbind(df);
	else
		vfio_device_group_unuse_iommu(device);
err_module_put:
	module_put(device->dev->driver->owner);
	return ret;
}

static void vfio_df_device_last_close(struct vfio_device_file *df)
{
	struct vfio_device *device = df->device;
	struct iommufd_ctx *iommufd = df->iommufd;

	lockdep_assert_held(&device->dev_set->lock);

	if (device->ops->close_device)
		device->ops->close_device(device);
	if (iommufd)
		vfio_df_iommufd_unbind(df);
	else
		vfio_device_group_unuse_iommu(device);
	device->precopy_info_v2 = 0;
	module_put(device->dev->driver->owner);
}

int vfio_df_open(struct vfio_device_file *df)
{
	struct vfio_device *device = df->device;
	int ret = 0;

	lockdep_assert_held(&device->dev_set->lock);

	/*
	 * Only the group path allows the device to be opened multiple
	 * times.  The cdev path has no secure way to support multiple opens.
	 */
	if (device->open_count != 0 && !df->group)
		return -EINVAL;

	device->open_count++;
	if (device->open_count == 1) {
		ret = vfio_df_device_first_open(df);
		if (ret)
			device->open_count--;
	}

	return ret;
}

void vfio_df_close(struct vfio_device_file *df)
{
	struct vfio_device *device = df->device;

	lockdep_assert_held(&device->dev_set->lock);

	if (!vfio_assert_device_open(device))
		return;
	if (device->open_count == 1)
		vfio_df_device_last_close(df);
	device->open_count--;
}

/*
 * Wrapper around pm_runtime_resume_and_get().
 * Return error code on failure or 0 on success.
 */
static inline int vfio_device_pm_runtime_get(struct vfio_device *device)
{
	struct device *dev = device->dev;

	if (dev->driver && dev->driver->pm) {
		int ret;

		ret = pm_runtime_resume_and_get(dev);
		if (ret) {
			dev_info_ratelimited(dev,
				"vfio: runtime resume failed %d\n", ret);
			return -EIO;
		}
	}

	return 0;
}

/*
 * Wrapper around pm_runtime_put().
 */
static inline void vfio_device_pm_runtime_put(struct vfio_device *device)
{
	struct device *dev = device->dev;

	if (dev->driver && dev->driver->pm)
		pm_runtime_put(dev);
}

/*
 * VFIO Device fd
 */
static int vfio_device_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_device_file *df = filep->private_data;
	struct vfio_device *device = df->device;

	if (df->group)
		vfio_df_group_close(df);
	else
		vfio_df_unbind_iommufd(df);

	vfio_device_put_registration(device);

	kfree(df);

	return 0;
}

/*
 * vfio_mig_get_next_state - Compute the next step in the FSM
 * @cur_fsm - The current state the device is in
 * @new_fsm - The target state to reach
 * @next_fsm - Pointer to the next step to get to new_fsm
 *
 * Return 0 upon success, otherwise -errno
 * Upon success the next step in the state progression between cur_fsm and
 * new_fsm will be set in next_fsm.
 *
 * This breaks down requests for combination transitions into smaller steps and
 * returns the next step to get to new_fsm. The function may need to be called
 * multiple times before reaching new_fsm.
 *
 */
int vfio_mig_get_next_state(struct vfio_device *device,
			    enum vfio_device_mig_state cur_fsm,
			    enum vfio_device_mig_state new_fsm,
			    enum vfio_device_mig_state *next_fsm)
{
	enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_PRE_COPY_P2P + 1 };
	/*
	 * The coding in this table requires the driver to implement the
	 * following FSM arcs:
	 *         RESUMING -> STOP
	 *         STOP -> RESUMING
	 *         STOP -> STOP_COPY
	 *         STOP_COPY -> STOP
	 *
	 * If P2P is supported then the driver must also implement these FSM
	 * arcs:
	 *         RUNNING -> RUNNING_P2P
	 *         RUNNING_P2P -> RUNNING
	 *         RUNNING_P2P -> STOP
	 *         STOP -> RUNNING_P2P
	 *
	 * If precopy is supported then the driver must support these additional
	 * FSM arcs:
	 *         RUNNING -> PRE_COPY
	 *         PRE_COPY -> RUNNING
	 *         PRE_COPY -> STOP_COPY
	 * However, if precopy and P2P are supported together then the driver
	 * must support these additional arcs beyond the P2P arcs above:
	 *         PRE_COPY -> RUNNING
	 *         PRE_COPY -> PRE_COPY_P2P
	 *         PRE_COPY_P2P -> PRE_COPY
	 *         PRE_COPY_P2P -> RUNNING_P2P
	 *         PRE_COPY_P2P -> STOP_COPY
	 *         RUNNING -> PRE_COPY
	 *         RUNNING_P2P -> PRE_COPY_P2P
	 *
	 * Without P2P and precopy the driver must implement:
	 *         RUNNING -> STOP
	 *         STOP -> RUNNING
	 *
	 * The coding will step through multiple states for some combination
	 * transitions; if all optional features are supported, this means the
	 * following ones:
	 *         PRE_COPY -> PRE_COPY_P2P -> STOP_COPY
	 *         PRE_COPY -> RUNNING -> RUNNING_P2P
	 *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP
	 *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP -> RESUMING
	 *         PRE_COPY_P2P -> RUNNING_P2P -> RUNNING
	 *         PRE_COPY_P2P -> RUNNING_P2P -> STOP
	 *         PRE_COPY_P2P -> RUNNING_P2P -> STOP -> RESUMING
	 *         RESUMING -> STOP -> RUNNING_P2P
	 *         RESUMING -> STOP -> RUNNING_P2P -> PRE_COPY_P2P
	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING
	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
	 *         RESUMING -> STOP -> STOP_COPY
	 *         RUNNING -> RUNNING_P2P -> PRE_COPY_P2P
	 *         RUNNING -> RUNNING_P2P -> STOP
	 *         RUNNING -> RUNNING_P2P -> STOP -> RESUMING
	 *         RUNNING -> RUNNING_P2P -> STOP -> STOP_COPY
	 *         RUNNING_P2P -> RUNNING -> PRE_COPY
	 *         RUNNING_P2P -> STOP -> RESUMING
	 *         RUNNING_P2P -> STOP -> STOP_COPY
	 *         STOP -> RUNNING_P2P -> PRE_COPY_P2P
	 *         STOP -> RUNNING_P2P -> RUNNING
	 *         STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
	 *         STOP_COPY -> STOP -> RESUMING
	 *         STOP_COPY -> STOP -> RUNNING_P2P
	 *         STOP_COPY -> STOP -> RUNNING_P2P -> RUNNING
	 *
	 *  The following transitions are blocked:
	 *         STOP_COPY -> PRE_COPY
	 *         STOP_COPY -> PRE_COPY_P2P
	 */
	static const u8 vfio_from_fsm_table[VFIO_DEVICE_NUM_STATES][VFIO_DEVICE_NUM_STATES] = {
		[VFIO_DEVICE_STATE_STOP] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_RUNNING] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_PRE_COPY] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_PRE_COPY_P2P] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_STOP_COPY] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_RESUMING] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_RUNNING_P2P] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_ERROR] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
	};

	static const unsigned int state_flags_table[VFIO_DEVICE_NUM_STATES] = {
		[VFIO_DEVICE_STATE_STOP] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_RUNNING] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_PRE_COPY] =
			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY,
		[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_MIGRATION_STOP_COPY |
						   VFIO_MIGRATION_P2P |
						   VFIO_MIGRATION_PRE_COPY,
		[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_RESUMING] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_RUNNING_P2P] =
			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P,
		[VFIO_DEVICE_STATE_ERROR] = ~0U,
	};

	if (WARN_ON(cur_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
		    (state_flags_table[cur_fsm] & device->migration_flags) !=
			state_flags_table[cur_fsm]))
		return -EINVAL;

	if (new_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
	   (state_flags_table[new_fsm] & device->migration_flags) !=
			state_flags_table[new_fsm])
		return -EINVAL;

	/*
	 * Arcs touching optional and unsupported states are skipped over. The
	 * driver will instead see an arc from the original state to the next
	 * logical state, as per the above comment.
	 */
	*next_fsm = vfio_from_fsm_table[cur_fsm][new_fsm];
	while ((state_flags_table[*next_fsm] & device->migration_flags) !=
			state_flags_table[*next_fsm])
		*next_fsm = vfio_from_fsm_table[*next_fsm][new_fsm];

	return (*next_fsm != VFIO_DEVICE_STATE_ERROR) ? 0 : -EINVAL;
}
EXPORT_SYMBOL_GPL(vfio_mig_get_next_state);
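
/*
 * Typical driver-side use (sketch): walk the FSM one arc at a time until the
 * requested state is reached, executing each returned arc on the device
 * (my_execute_arc is a hypothetical driver helper):
 *
 *	while (cur != new) {
 *		enum vfio_device_mig_state next;
 *
 *		ret = vfio_mig_get_next_state(vdev, cur, new, &next);
 *		if (ret)
 *			break;
 *		ret = my_execute_arc(vdev, cur, next);
 *		if (ret)
 *			break;
 *		cur = next;
 *	}
 */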

/*
 * Convert the driver's struct file into a FD number and return it to userspace
 */
static int vfio_ioct_mig_return_fd(struct file *filp, void __user *arg,
				   struct vfio_device_feature_mig_state *mig)
{
	int ret;
	int fd;

	fd = get_unused_fd_flags(O_CLOEXEC);
	if (fd < 0) {
		ret = fd;
		goto out_fput;
	}

	mig->data_fd = fd;
	if (copy_to_user(arg, mig, sizeof(*mig))) {
		ret = -EFAULT;
		goto out_put_unused;
	}
	fd_install(fd, filp);
	return 0;

out_put_unused:
	put_unused_fd(fd);
out_fput:
	fput(filp);
	return ret;
}

static int
vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device,
					   u32 flags, void __user *arg,
					   size_t argsz)
{
	size_t minsz =
		offsetofend(struct vfio_device_feature_mig_state, data_fd);
	struct vfio_device_feature_mig_state mig;
	struct file *filp = NULL;
	int ret;

	if (!device->mig_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_SET |
				 VFIO_DEVICE_FEATURE_GET,
				 sizeof(mig));
	if (ret != 1)
		return ret;

	if (copy_from_user(&mig, arg, minsz))
		return -EFAULT;

	if (flags & VFIO_DEVICE_FEATURE_GET) {
		enum vfio_device_mig_state curr_state;

		ret = device->mig_ops->migration_get_state(device,
							   &curr_state);
		if (ret)
			return ret;
		mig.device_state = curr_state;
		goto out_copy;
	}

	/* Handle the VFIO_DEVICE_FEATURE_SET */
	filp = device->mig_ops->migration_set_state(device, mig.device_state);
	if (IS_ERR(filp) || !filp)
		goto out_copy;

	return vfio_ioct_mig_return_fd(filp, arg, &mig);
out_copy:
	mig.data_fd = -1;
	if (copy_to_user(arg, &mig, sizeof(mig)))
		return -EFAULT;
	if (IS_ERR(filp))
		return PTR_ERR(filp);
	return 0;
}

static int
vfio_ioctl_device_feature_migration_data_size(struct vfio_device *device,
					      u32 flags, void __user *arg,
					      size_t argsz)
{
	struct vfio_device_feature_mig_data_size data_size = {};
	unsigned long stop_copy_length;
	int ret;

	if (!device->mig_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
				 sizeof(data_size));
	if (ret != 1)
		return ret;

	ret = device->mig_ops->migration_get_data_size(device, &stop_copy_length);
	if (ret)
		return ret;

	data_size.stop_copy_length = stop_copy_length;
	if (copy_to_user(arg, &data_size, sizeof(data_size)))
		return -EFAULT;

	return 0;
}

static int
vfio_ioctl_device_feature_migration_precopy_info_v2(struct vfio_device *device,
						    u32 flags, size_t argsz)
{
	int ret;

	if (!(device->migration_flags & VFIO_MIGRATION_PRE_COPY))
		return -EINVAL;

	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET, 0);
	if (ret != 1)
		return ret;

	device->precopy_info_v2 = 1;
	return 0;
}

static int vfio_ioctl_device_feature_migration(struct vfio_device *device,
					       u32 flags, void __user *arg,
					       size_t argsz)
{
	struct vfio_device_feature_migration mig = {
		.flags = device->migration_flags,
	};
	int ret;

	if (!device->mig_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
				 sizeof(mig));
	if (ret != 1)
		return ret;
	if (copy_to_user(arg, &mig, sizeof(mig)))
		return -EFAULT;
	return 0;
}

void vfio_combine_iova_ranges(struct rb_root_cached *root, u32 cur_nodes,
			      u32 req_nodes)
{
	struct interval_tree_node *prev, *curr, *comb_start, *comb_end;
	unsigned long min_gap, curr_gap;

	/* Special shortcut when a single range is required */
	if (req_nodes == 1) {
		unsigned long last;

		comb_start = interval_tree_iter_first(root, 0, ULONG_MAX);

		/* Empty list */
		if (WARN_ON_ONCE(!comb_start))
			return;

		curr = comb_start;
		while (curr) {
			last = curr->last;
			prev = curr;
			curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
			if (prev != comb_start)
				interval_tree_remove(prev, root);
		}
		comb_start->last = last;
		return;
	}

	/* Combine ranges which have the smallest gap */
	while (cur_nodes > req_nodes) {
		prev = NULL;
		min_gap = ULONG_MAX;
		curr = interval_tree_iter_first(root, 0, ULONG_MAX);
		while (curr) {
			if (prev) {
				curr_gap = curr->start - prev->last;
				if (curr_gap < min_gap) {
					min_gap = curr_gap;
					comb_start = prev;
					comb_end = curr;
				}
			}
			prev = curr;
			curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
		}

		/* Empty list or no nodes to combine */
		if (WARN_ON_ONCE(min_gap == ULONG_MAX))
			break;

		comb_start->last = comb_end->last;
		interval_tree_remove(comb_end, root);
		cur_nodes--;
	}
}
EXPORT_SYMBOL_GPL(vfio_combine_iova_ranges);
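
/*
 * Worked example: given the ranges [0x0, 0xfff], [0x3000, 0x3fff] and
 * [0x100000, 0x100fff] with req_nodes == 2, the first two ranges are
 * separated by the smallest gap (0x2001 bytes), so they are merged into
 * [0x0, 0x3fff], leaving two nodes: [0x0, 0x3fff] and [0x100000, 0x100fff].
 */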

/* Ranges should fit into a single kernel page */
#define LOG_MAX_RANGES \
	(PAGE_SIZE / sizeof(struct vfio_device_feature_dma_logging_range))

static int
vfio_ioctl_device_feature_logging_start(struct vfio_device *device,
					u32 flags, void __user *arg,
					size_t argsz)
{
	size_t minsz =
		offsetofend(struct vfio_device_feature_dma_logging_control,
			    ranges);
	struct vfio_device_feature_dma_logging_range __user *ranges;
	struct vfio_device_feature_dma_logging_control control;
	struct vfio_device_feature_dma_logging_range range;
	struct rb_root_cached root = RB_ROOT_CACHED;
	struct interval_tree_node *nodes;
	u64 iova_end;
	u32 nnodes;
	int i, ret;

	if (!device->log_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_SET,
				 sizeof(control));
	if (ret != 1)
		return ret;

	if (copy_from_user(&control, arg, minsz))
		return -EFAULT;

	nnodes = control.num_ranges;
	if (!nnodes)
		return -EINVAL;

	if (nnodes > LOG_MAX_RANGES)
		return -E2BIG;

	ranges = u64_to_user_ptr(control.ranges);
	nodes = kmalloc_array(nnodes, sizeof(struct interval_tree_node),
			      GFP_KERNEL_ACCOUNT);
	if (!nodes)
		return -ENOMEM;

	for (i = 0; i < nnodes; i++) {
		if (copy_from_user(&range, &ranges[i], sizeof(range))) {
			ret = -EFAULT;
			goto end;
		}
		if (!IS_ALIGNED(range.iova, control.page_size) ||
		    !IS_ALIGNED(range.length, control.page_size)) {
			ret = -EINVAL;
			goto end;
		}

		if (check_add_overflow(range.iova, range.length, &iova_end) ||
		    iova_end > ULONG_MAX) {
			ret = -EOVERFLOW;
			goto end;
		}

		nodes[i].start = range.iova;
		nodes[i].last = range.iova + range.length - 1;
		if (interval_tree_iter_first(&root, nodes[i].start,
					     nodes[i].last)) {
			/* Range overlapping */
			ret = -EINVAL;
			goto end;
		}
		interval_tree_insert(nodes + i, &root);
	}

	ret = device->log_ops->log_start(device, &root, nnodes,
					 &control.page_size);
	if (ret)
		goto end;

	if (copy_to_user(arg, &control, sizeof(control))) {
		ret = -EFAULT;
		device->log_ops->log_stop(device);
	}

end:
	kfree(nodes);
	return ret;
}

static int
vfio_ioctl_device_feature_logging_stop(struct vfio_device *device,
				       u32 flags, void __user *arg,
				       size_t argsz)
{
	int ret;

	if (!device->log_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_SET, 0);
	if (ret != 1)
		return ret;

	return device->log_ops->log_stop(device);
}

static int vfio_device_log_read_and_clear(struct iova_bitmap *iter,
					  unsigned long iova, size_t length,
					  void *opaque)
{
	struct vfio_device *device = opaque;

	return device->log_ops->log_read_and_clear(device, iova, length, iter);
}

static int
vfio_ioctl_device_feature_logging_report(struct vfio_device *device,
					 u32 flags, void __user *arg,
					 size_t argsz)
{
	size_t minsz =
		offsetofend(struct vfio_device_feature_dma_logging_report,
			    bitmap);
	struct vfio_device_feature_dma_logging_report report;
	struct iova_bitmap *iter;
	u64 iova_end;
	int ret;

	if (!device->log_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_GET,
				 sizeof(report));
	if (ret != 1)
		return ret;

	if (copy_from_user(&report, arg, minsz))
		return -EFAULT;

	if (report.page_size < SZ_4K || !is_power_of_2(report.page_size))
		return -EINVAL;

	if (check_add_overflow(report.iova, report.length, &iova_end) ||
	    iova_end > ULONG_MAX)
		return -EOVERFLOW;

	iter = iova_bitmap_alloc(report.iova, report.length,
				 report.page_size,
				 u64_to_user_ptr(report.bitmap));
	if (IS_ERR(iter))
		return PTR_ERR(iter);

	ret = iova_bitmap_for_each(iter, device,
				   vfio_device_log_read_and_clear);

	iova_bitmap_free(iter);
	return ret;
}

static int vfio_ioctl_device_feature(struct vfio_device *device,
				     struct vfio_device_feature __user *arg)
{
	size_t minsz = offsetofend(struct vfio_device_feature, flags);
	struct vfio_device_feature feature;

	if (copy_from_user(&feature, arg, minsz))
		return -EFAULT;

	if (feature.argsz < minsz)
		return -EINVAL;

	/* Check unknown flags */
	if (feature.flags &
	    ~(VFIO_DEVICE_FEATURE_MASK | VFIO_DEVICE_FEATURE_SET |
	      VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_PROBE))
		return -EINVAL;

	/* GET & SET are mutually exclusive except with PROBE */
	if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) &&
	    (feature.flags & VFIO_DEVICE_FEATURE_SET) &&
	    (feature.flags & VFIO_DEVICE_FEATURE_GET))
		return -EINVAL;

	switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) {
	case VFIO_DEVICE_FEATURE_MIGRATION:
		return vfio_ioctl_device_feature_migration(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE:
		return vfio_ioctl_device_feature_mig_device_state(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_DMA_LOGGING_START:
		return vfio_ioctl_device_feature_logging_start(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP:
		return vfio_ioctl_device_feature_logging_stop(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT:
		return vfio_ioctl_device_feature_logging_report(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_MIG_DATA_SIZE:
		return vfio_ioctl_device_feature_migration_data_size(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_MIG_PRECOPY_INFOv2:
		return vfio_ioctl_device_feature_migration_precopy_info_v2(
			device, feature.flags, feature.argsz - minsz);
	default:
		if (unlikely(!device->ops->device_feature))
			return -ENOTTY;
		return device->ops->device_feature(device, feature.flags,
						   arg->data,
						   feature.argsz - minsz);
	}
}

static long vfio_get_region_info(struct vfio_device *device,
				 struct vfio_region_info __user *arg)
{
	unsigned long minsz = offsetofend(struct vfio_region_info, offset);
	struct vfio_region_info info = {};
	struct vfio_info_cap caps = {};
	int ret;

	if (unlikely(!device->ops->get_region_info_caps))
		return -EINVAL;

	if (copy_from_user(&info, arg, minsz))
		return -EFAULT;
	if (info.argsz < minsz)
		return -EINVAL;

	ret = device->ops->get_region_info_caps(device, &info, &caps);
	if (ret)
		goto out_free;

	if (caps.size) {
		info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
		if (info.argsz < sizeof(info) + caps.size) {
			info.argsz = sizeof(info) + caps.size;
			info.cap_offset = 0;
		} else {
			vfio_info_cap_shift(&caps, sizeof(info));
			if (copy_to_user(arg + 1, caps.buf, caps.size)) {
				ret = -EFAULT;
				goto out_free;
			}
			info.cap_offset = sizeof(info);
		}
	}

	if (copy_to_user(arg, &info, minsz)) {
		ret = -EFAULT;
		goto out_free;
	}

out_free:
	kfree(caps.buf);
	return ret;
}

static long vfio_device_fops_unl_ioctl(struct file *filep,
				       unsigned int cmd, unsigned long arg)
{
	struct vfio_device_file *df = filep->private_data;
	struct vfio_device *device = df->device;
	void __user *uptr = (void __user *)arg;
	int ret;

	if (cmd == VFIO_DEVICE_BIND_IOMMUFD)
		return vfio_df_ioctl_bind_iommufd(df, uptr);

	/* Paired with smp_store_release() following vfio_df_open() */
	if (!smp_load_acquire(&df->access_granted))
		return -EINVAL;

	ret = vfio_device_pm_runtime_get(device);
	if (ret)
		return ret;

	/* cdev only ioctls */
	if (IS_ENABLED(CONFIG_VFIO_DEVICE_CDEV) && !df->group) {
		switch (cmd) {
		case VFIO_DEVICE_ATTACH_IOMMUFD_PT:
			ret = vfio_df_ioctl_attach_pt(df, uptr);
			goto out;

		case VFIO_DEVICE_DETACH_IOMMUFD_PT:
			ret = vfio_df_ioctl_detach_pt(df, uptr);
			goto out;
		}
	}

	switch (cmd) {
	case VFIO_DEVICE_FEATURE:
		ret = vfio_ioctl_device_feature(device, uptr);
		break;

	case VFIO_DEVICE_GET_REGION_INFO:
		ret = vfio_get_region_info(device, uptr);
		break;

	default:
		if (unlikely(!device->ops->ioctl))
			ret = -EINVAL;
		else
			ret = device->ops->ioctl(device, cmd, arg);
		break;
	}
out:
	vfio_device_pm_runtime_put(device);
	return ret;
}

static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
				     size_t count, loff_t *ppos)
{
	struct vfio_device_file *df = filep->private_data;
	struct vfio_device *device = df->device;

	/* Paired with smp_store_release() following vfio_df_open() */
	if (!smp_load_acquire(&df->access_granted))
		return -EINVAL;

	if (unlikely(!device->ops->read))
		return -EINVAL;

	return device->ops->read(device, buf, count, ppos);
}

static ssize_t vfio_device_fops_write(struct file *filep,
				      const char __user *buf,
				      size_t count, loff_t *ppos)
{
	struct vfio_device_file *df = filep->private_data;
	struct vfio_device *device = df->device;

	/* Paired with smp_store_release() following vfio_df_open() */
	if (!smp_load_acquire(&df->access_granted))
		return -EINVAL;

	if (unlikely(!device->ops->write))
		return -EINVAL;

	return device->ops->write(device, buf, count, ppos);
}

static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
{
	struct vfio_device_file *df = filep->private_data;
	struct vfio_device *device = df->device;

	/* Paired with smp_store_release() following vfio_df_open() */
	if (!smp_load_acquire(&df->access_granted))
		return -EINVAL;

	if (unlikely(!device->ops->mmap))
		return -EINVAL;

	return device->ops->mmap(device, vma);
}

#ifdef CONFIG_PROC_FS
static void vfio_device_show_fdinfo(struct seq_file *m, struct file *filep)
{
	char *path;
	struct vfio_device_file *df = filep->private_data;
	struct vfio_device *device = df->device;

	path = kobject_get_path(&device->dev->kobj, GFP_KERNEL);
	if (!path)
		return;

	seq_printf(m, "vfio-device-syspath: /sys%s\n", path);
	kfree(path);
}
#endif

const struct file_operations vfio_device_fops = {
	.owner		= THIS_MODULE,
	.open		= vfio_device_fops_cdev_open,
	.release	= vfio_device_fops_release,
	.read		= vfio_device_fops_read,
	.write		= vfio_device_fops_write,
	.unlocked_ioctl	= vfio_device_fops_unl_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
	.mmap		= vfio_device_fops_mmap,
#ifdef CONFIG_PROC_FS
	.show_fdinfo	= vfio_device_show_fdinfo,
#endif
};

static struct vfio_device *vfio_device_from_file(struct file *file)
{
	struct vfio_device_file *df = file->private_data;

	if (file->f_op != &vfio_device_fops)
		return NULL;
	return df->device;
}

/**
 * vfio_file_is_valid - True if the file is valid vfio file
 * @file: VFIO group file or VFIO device file
 */
bool vfio_file_is_valid(struct file *file)
{
	return vfio_group_from_file(file) ||
	       vfio_device_from_file(file);
}
EXPORT_SYMBOL_GPL(vfio_file_is_valid);

/**
 * vfio_file_enforced_coherent - True if the DMA associated with the VFIO file
 *        is always CPU cache coherent
 * @file: VFIO group file or VFIO device file
 *
 * Enforced coherency means that the IOMMU ignores things like the PCIe no-snoop
 * bit in DMA transactions. A return of false indicates that the user has
 * rights to access additional instructions such as wbinvd on x86.
 */
bool vfio_file_enforced_coherent(struct file *file)
{
	struct vfio_device *device;
	struct vfio_group *group;

	group = vfio_group_from_file(file);
	if (group)
		return vfio_group_enforced_coherent(group);

	device = vfio_device_from_file(file);
	if (device)
		return device_iommu_capable(device->dev,
					    IOMMU_CAP_ENFORCE_CACHE_COHERENCY);

	return true;
}
EXPORT_SYMBOL_GPL(vfio_file_enforced_coherent);

static void vfio_device_file_set_kvm(struct file *file, struct kvm *kvm)
{
	struct vfio_device_file *df = file->private_data;

	/*
	 * The kvm is first recorded in the vfio_device_file, and will
	 * be propagated to vfio_device::kvm when the file is bound to
	 * iommufd successfully in the vfio device cdev path.
	 */
	spin_lock(&df->kvm_ref_lock);
	df->kvm = kvm;
	spin_unlock(&df->kvm_ref_lock);
}

/**
 * vfio_file_set_kvm - Link a kvm with VFIO drivers
 * @file: VFIO group file or VFIO device file
 * @kvm: KVM to link
 *
 * When a VFIO device is first opened the KVM will be available in
 * device->kvm if one was associated with the file.
 */
void vfio_file_set_kvm(struct file *file, struct kvm *kvm)
{
	struct vfio_group *group;

	group = vfio_group_from_file(file);
	if (group)
		vfio_group_set_kvm(group, kvm);

	if (vfio_device_from_file(file))
		vfio_device_file_set_kvm(file, kvm);
}
EXPORT_SYMBOL_GPL(vfio_file_set_kvm);

/*
 * Sub-module support
 */
/*
 * Helper for managing a buffer of info chain capabilities, allocate or
 * reallocate a buffer with additional @size, filling in @id and @version
 * of the capability.  A pointer to the new capability is returned.
 *
 * NB. The chain is based at the head of the buffer, so new entries are
 * added to the tail, vfio_info_cap_shift() should be called to fixup the
 * next offsets prior to copying to the user buffer.
 */
struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
					       size_t size, u16 id, u16 version)
{
	void *buf;
	struct vfio_info_cap_header *header, *tmp;

	/* Ensure that the next capability struct will be aligned */
	size = ALIGN(size, sizeof(u64));

	buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
	if (!buf) {
		kfree(caps->buf);
		caps->buf = NULL;
		caps->size = 0;
		return ERR_PTR(-ENOMEM);
	}

	caps->buf = buf;
	header = buf + caps->size;

	/* Eventually copied to user buffer, zero */
	memset(header, 0, size);

	header->id = id;
	header->version = version;

	/* Add to the end of the capability chain */
	for (tmp = buf; tmp->next; tmp = buf + tmp->next)
		; /* nothing */

	tmp->next = caps->size;
	caps->size += size;

	return header;
}
EXPORT_SYMBOL_GPL(vfio_info_cap_add);
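
/*
 * Usage sketch for building a capability chain (a hedged example; the sparse
 * mmap capability is one real user of this helper):
 *
 *	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
 *	struct vfio_region_info_cap_sparse_mmap *sparse;
 *	size_t size = struct_size(sparse, areas, nr_areas);
 *
 *	sparse = (void *)vfio_info_cap_add(&caps, size,
 *			VFIO_REGION_INFO_CAP_SPARSE_MMAP, 1);
 *	if (IS_ERR(sparse))
 *		return PTR_ERR(sparse);
 *	// fill in sparse->nr_areas and sparse->areas[] here
 *
 * Before copying the chain to userspace at a fixed offset, fix up the next
 * pointers with vfio_info_cap_shift(&caps, sizeof(struct vfio_region_info)).
 */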

void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
{
	struct vfio_info_cap_header *tmp;
	void *buf = (void *)caps->buf;

	/* Capability structs should start with proper alignment */
	WARN_ON(!IS_ALIGNED(offset, sizeof(u64)));

	for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
		tmp->next += offset;
}
EXPORT_SYMBOL(vfio_info_cap_shift);

int vfio_info_add_capability(struct vfio_info_cap *caps,
			     struct vfio_info_cap_header *cap, size_t size)
{
	struct vfio_info_cap_header *header;

	header = vfio_info_cap_add(caps, size, cap->id, cap->version);
	if (IS_ERR(header))
		return PTR_ERR(header);

	memcpy(header + 1, cap + 1, size - sizeof(*header));

	return 0;
}
EXPORT_SYMBOL(vfio_info_add_capability);

int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
				       int max_irq_type, size_t *data_size)
{
	unsigned long minsz;
	size_t size;

	minsz = offsetofend(struct vfio_irq_set, count);

	if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
	    (hdr->count >= (U32_MAX - hdr->start)) ||
	    (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
				VFIO_IRQ_SET_ACTION_TYPE_MASK)))
		return -EINVAL;

	if (data_size)
		*data_size = 0;

	if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
		return -EINVAL;

	switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
	case VFIO_IRQ_SET_DATA_NONE:
		size = 0;
		break;
	case VFIO_IRQ_SET_DATA_BOOL:
		size = sizeof(uint8_t);
		break;
	case VFIO_IRQ_SET_DATA_EVENTFD:
		size = sizeof(int32_t);
		break;
	default:
		return -EINVAL;
	}

	if (size) {
		if (hdr->argsz - minsz < hdr->count * size)
			return -EINVAL;

		if (!data_size)
			return -EINVAL;

		*data_size = hdr->count * size;
	}

	return 0;
}
EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
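
/*
 * Usage sketch from a driver's VFIO_DEVICE_SET_IRQS handler (hedged; this
 * mirrors how vfio-pci style drivers consume the helper):
 *
 *	struct vfio_irq_set hdr;
 *	size_t data_size = 0;
 *	u8 *data = NULL;
 *
 *	if (copy_from_user(&hdr, (void __user *)arg, minsz))
 *		return -EFAULT;
 *
 *	ret = vfio_set_irqs_validate_and_prepare(&hdr, max_irqs,
 *						 VFIO_PCI_NUM_IRQS, &data_size);
 *	if (ret)
 *		return ret;
 *
 *	if (data_size) {
 *		data = memdup_user((void __user *)(arg + minsz), data_size);
 *		if (IS_ERR(data))
 *			return PTR_ERR(data);
 *	}
 */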

/*
 * Pin contiguous user pages and return their associated host pages for local
 * domain only.
 * @device [in]  : device
 * @iova [in]    : starting IOVA of user pages to be pinned.
 * @npage [in]   : count of pages to be pinned.  This count should not
 *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
 * @prot [in]    : protection flags
 * @pages[out]   : array of host pages
 * Return error or number of pages pinned.
 *
 * A driver may only call this function if the vfio_device was created
 * by vfio_register_emulated_iommu_dev() due to vfio_device_container_pin_pages().
 */
int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova,
		   int npage, int prot, struct page **pages)
{
	/* group->container cannot change while a vfio device is open */
	if (!pages || !npage || WARN_ON(!vfio_assert_device_open(device)))
		return -EINVAL;
	if (!device->ops->dma_unmap)
		return -EINVAL;
	if (vfio_device_has_container(device))
		return vfio_device_container_pin_pages(device, iova,
						       npage, prot, pages);
	if (device->iommufd_access) {
		int ret;

		if (iova > ULONG_MAX)
			return -EINVAL;
		/*
		 * VFIO ignores the sub page offset, npages is from the start of
		 * a PAGE_SIZE chunk of IOVA. The caller is expected to recover
		 * the sub page offset by doing:
		 *     pages[0] + (iova % PAGE_SIZE)
		 */
		ret = iommufd_access_pin_pages(
			device->iommufd_access, ALIGN_DOWN(iova, PAGE_SIZE),
			npage * PAGE_SIZE, pages,
			(prot & IOMMU_WRITE) ? IOMMUFD_ACCESS_RW_WRITE : 0);
		if (ret)
			return ret;
		return npage;
	}
	return -EINVAL;
}
EXPORT_SYMBOL(vfio_pin_pages);
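
/*
 * Pinning sketch for an emulated-IOMMU (mdev-style) driver: pin one page and
 * recover the sub-page offset as described above (hypothetical caller):
 *
 *	struct page *page;
 *	void *va;
 *	int ret;
 *
 *	ret = vfio_pin_pages(vdev, iova, 1, IOMMU_READ | IOMMU_WRITE, &page);
 *	if (ret != 1)
 *		return ret < 0 ? ret : -EFAULT;
 *
 *	va = kmap_local_page(page);
 *	memcpy(buf, va + (iova % PAGE_SIZE), len);	// len stays in the page
 *	kunmap_local(va);
 *
 *	vfio_unpin_pages(vdev, iova, 1);
 */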

/*
 * Unpin contiguous host pages for local domain only.
 * @device [in]  : device
 * @iova [in]    : starting address of user pages to be unpinned.
 * @npage [in]   : count of pages to be unpinned.  This count should not
 *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
 */
void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage)
{
	if (WARN_ON(!vfio_assert_device_open(device)))
		return;
	if (WARN_ON(!device->ops->dma_unmap))
		return;

	if (vfio_device_has_container(device)) {
		vfio_device_container_unpin_pages(device, iova, npage);
		return;
	}
	if (device->iommufd_access) {
		if (WARN_ON(iova > ULONG_MAX))
			return;
		iommufd_access_unpin_pages(device->iommufd_access,
					   ALIGN_DOWN(iova, PAGE_SIZE),
					   npage * PAGE_SIZE);
		return;
	}
}
EXPORT_SYMBOL(vfio_unpin_pages);

/*
 * This interface allows the CPUs to perform virtual DMA on behalf of the
 * device.
 *
 * CPUs read/write from/into a range of IOVAs pointing to user space memory
 * into/from a kernel buffer.
 *
 * As the read/write of user space memory is conducted via the CPUs and is
 * not a real device DMA, it is not necessary to pin the user space memory.
 *
 * @device [in]		: VFIO device
 * @iova [in]		: base IOVA of a user space buffer
 * @data [in]		: pointer to kernel buffer
 * @len [in]		: kernel buffer length
 * @write		: indicate read or write
 * Return error code on failure or 0 on success.
 */
int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data,
		size_t len, bool write)
{
	if (!data || len <= 0 || !vfio_assert_device_open(device))
		return -EINVAL;

	if (vfio_device_has_container(device))
		return vfio_device_container_dma_rw(device, iova,
						    data, len, write);

	if (device->iommufd_access) {
		unsigned int flags = 0;

		if (iova > ULONG_MAX)
			return -EINVAL;

		/* VFIO historically tries to auto-detect a kthread */
		if (!current->mm)
			flags |= IOMMUFD_ACCESS_RW_KTHREAD;
		if (write)
			flags |= IOMMUFD_ACCESS_RW_WRITE;
		return iommufd_access_rw(device->iommufd_access, iova, data,
					 len, flags);
	}
	return -EINVAL;
}
EXPORT_SYMBOL(vfio_dma_rw);
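
/*
 * Access sketch (hedged, my_desc and MY_DONE are hypothetical): read a
 * descriptor the guest placed at @iova, then write a status byte back:
 *
 *	struct my_desc desc;
 *	u8 status = MY_DONE;
 *	int ret;
 *
 *	ret = vfio_dma_rw(vdev, iova, &desc, sizeof(desc), false);
 *	if (ret)
 *		return ret;
 *
 *	ret = vfio_dma_rw(vdev, iova + offsetof(struct my_desc, status),
 *			  &status, sizeof(status), true);
 */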

/*
 * Module/class support
 */
static int __init vfio_init(void)
{
	int ret;

	ida_init(&vfio.device_ida);

	ret = vfio_group_init();
	if (ret)
		return ret;

	ret = vfio_virqfd_init();
	if (ret)
		goto err_virqfd;

	/* /sys/class/vfio-dev/vfioX */
	ret = class_register(&vfio_device_class);
	if (ret)
		goto err_dev_class;

	ret = vfio_cdev_init();
	if (ret)
		goto err_alloc_dev_chrdev;

	vfio_debugfs_create_root();
	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
	return 0;

err_alloc_dev_chrdev:
	class_unregister(&vfio_device_class);
err_dev_class:
	vfio_virqfd_exit();
err_virqfd:
	vfio_group_cleanup();
	return ret;
}

static void __exit vfio_cleanup(void)
{
	vfio_debugfs_remove_root();
	ida_destroy(&vfio.device_ida);
	vfio_cdev_cleanup();
	class_unregister(&vfio_device_class);
	vfio_virqfd_exit();
	vfio_group_cleanup();
	xa_destroy(&vfio_device_set_xa);
}

module_init(vfio_init);
module_exit(vfio_cleanup);

MODULE_IMPORT_NS("IOMMUFD");
MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);
MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");