xref: /linux/drivers/vfio/vfio_main.c (revision a3ebb59eee2e558e8f8f27fc3f75cd367f17cd8e)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * VFIO core
4  *
5  * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
6  *     Author: Alex Williamson <alex.williamson@redhat.com>
7  *
8  * Derived from original vfio:
9  * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
10  * Author: Tom Lyon, pugs@cisco.com
11  */
12 
13 #include <linux/cdev.h>
14 #include <linux/compat.h>
15 #include <linux/device.h>
16 #include <linux/fs.h>
17 #include <linux/idr.h>
18 #include <linux/iommu.h>
19 #if IS_ENABLED(CONFIG_KVM)
20 #include <linux/kvm_host.h>
21 #endif
22 #include <linux/list.h>
23 #include <linux/miscdevice.h>
24 #include <linux/module.h>
25 #include <linux/mount.h>
26 #include <linux/mutex.h>
27 #include <linux/pci.h>
28 #include <linux/pseudo_fs.h>
29 #include <linux/rwsem.h>
30 #include <linux/sched.h>
31 #include <linux/seq_file.h>
32 #include <linux/slab.h>
33 #include <linux/stat.h>
34 #include <linux/string.h>
35 #include <linux/uaccess.h>
36 #include <linux/vfio.h>
37 #include <linux/wait.h>
38 #include <linux/sched/signal.h>
39 #include <linux/pm_runtime.h>
40 #include <linux/interval_tree.h>
41 #include <linux/iova_bitmap.h>
42 #include <linux/iommufd.h>
43 #include "vfio.h"
44 
45 #define DRIVER_VERSION	"0.3"
46 #define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
47 #define DRIVER_DESC	"VFIO - User Level meta-driver"
48 
49 #define VFIO_MAGIC 0x5646494f /* "VFIO" */
50 
51 static struct vfio {
52 	struct class			*device_class;
53 	struct ida			device_ida;
54 	struct vfsmount			*vfs_mount;
55 	int				fs_count;
56 } vfio;
57 
58 #ifdef CONFIG_VFIO_NOIOMMU
59 bool vfio_noiommu __read_mostly;
60 module_param_named(enable_unsafe_noiommu_mode,
61 		   vfio_noiommu, bool, S_IRUGO | S_IWUSR);
62 MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel.  If you do not know what this is for, step away. (default: false)");
63 #endif
64 
65 static DEFINE_XARRAY(vfio_device_set_xa);
66 
67 int vfio_assign_device_set(struct vfio_device *device, void *set_id)
68 {
69 	unsigned long idx = (unsigned long)set_id;
70 	struct vfio_device_set *new_dev_set;
71 	struct vfio_device_set *dev_set;
72 
73 	if (WARN_ON(!set_id))
74 		return -EINVAL;
75 
76 	/*
77 	 * Atomically acquire a singleton object in the xarray for this set_id
78 	 */
79 	xa_lock(&vfio_device_set_xa);
80 	dev_set = xa_load(&vfio_device_set_xa, idx);
81 	if (dev_set)
82 		goto found_get_ref;
83 	xa_unlock(&vfio_device_set_xa);
84 
85 	new_dev_set = kzalloc(sizeof(*new_dev_set), GFP_KERNEL);
86 	if (!new_dev_set)
87 		return -ENOMEM;
88 	mutex_init(&new_dev_set->lock);
89 	INIT_LIST_HEAD(&new_dev_set->device_list);
90 	new_dev_set->set_id = set_id;
91 
92 	xa_lock(&vfio_device_set_xa);
93 	dev_set = __xa_cmpxchg(&vfio_device_set_xa, idx, NULL, new_dev_set,
94 			       GFP_KERNEL);
95 	if (!dev_set) {
96 		dev_set = new_dev_set;
97 		goto found_get_ref;
98 	}
99 
100 	kfree(new_dev_set);
101 	if (xa_is_err(dev_set)) {
102 		xa_unlock(&vfio_device_set_xa);
103 		return xa_err(dev_set);
104 	}
105 
106 found_get_ref:
107 	dev_set->device_count++;
108 	xa_unlock(&vfio_device_set_xa);
109 	mutex_lock(&dev_set->lock);
110 	device->dev_set = dev_set;
111 	list_add_tail(&device->dev_set_list, &dev_set->device_list);
112 	mutex_unlock(&dev_set->lock);
113 	return 0;
114 }
115 EXPORT_SYMBOL_GPL(vfio_assign_device_set);
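/*
 * Example (illustrative sketch, not part of the original file): a driver
 * whose devices must be reset as a unit can key the set on any stable
 * pointer shared by those devices, for instance their parent device.  The
 * "my_vfio_init" callback name is hypothetical; vfio_assign_device_set()
 * is the helper defined above.
 *
 *	static int my_vfio_init(struct vfio_device *vdev)
 *	{
 *		// Devices sharing a parent land in the same dev_set and
 *		// therefore share dev_set->lock and reset serialization.
 *		return vfio_assign_device_set(vdev, vdev->dev->parent);
 *	}
 */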
116 
117 static void vfio_release_device_set(struct vfio_device *device)
118 {
119 	struct vfio_device_set *dev_set = device->dev_set;
120 
121 	if (!dev_set)
122 		return;
123 
124 	mutex_lock(&dev_set->lock);
125 	list_del(&device->dev_set_list);
126 	mutex_unlock(&dev_set->lock);
127 
128 	xa_lock(&vfio_device_set_xa);
129 	if (!--dev_set->device_count) {
130 		__xa_erase(&vfio_device_set_xa,
131 			   (unsigned long)dev_set->set_id);
132 		mutex_destroy(&dev_set->lock);
133 		kfree(dev_set);
134 	}
135 	xa_unlock(&vfio_device_set_xa);
136 }
137 
138 unsigned int vfio_device_set_open_count(struct vfio_device_set *dev_set)
139 {
140 	struct vfio_device *cur;
141 	unsigned int open_count = 0;
142 
143 	lockdep_assert_held(&dev_set->lock);
144 
145 	list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
146 		open_count += cur->open_count;
147 	return open_count;
148 }
149 EXPORT_SYMBOL_GPL(vfio_device_set_open_count);
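/*
 * Example (illustrative sketch): a driver can use the open count to act
 * only while no device in the set is open, e.g. before issuing a set-wide
 * reset.  "my_try_set_reset" is a hypothetical helper; the dev_set lock
 * must be held, as asserted above.
 *
 *	mutex_lock(&vdev->dev_set->lock);
 *	if (!vfio_device_set_open_count(vdev->dev_set))
 *		ret = my_try_set_reset(vdev->dev_set);
 *	mutex_unlock(&vdev->dev_set->lock);
 */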
150 
151 struct vfio_device *
152 vfio_find_device_in_devset(struct vfio_device_set *dev_set,
153 			   struct device *dev)
154 {
155 	struct vfio_device *cur;
156 
157 	lockdep_assert_held(&dev_set->lock);
158 
159 	list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
160 		if (cur->dev == dev)
161 			return cur;
162 	return NULL;
163 }
164 EXPORT_SYMBOL_GPL(vfio_find_device_in_devset);
165 
166 /*
167  * Device objects - create, release, get, put, search
168  */
169 /* Device reference always implies a group reference */
170 void vfio_device_put_registration(struct vfio_device *device)
171 {
172 	if (refcount_dec_and_test(&device->refcount))
173 		complete(&device->comp);
174 }
175 EXPORT_SYMBOL_GPL(vfio_device_put_registration);
176 
177 bool vfio_device_try_get_registration(struct vfio_device *device)
178 {
179 	return refcount_inc_not_zero(&device->refcount);
180 }
181 EXPORT_SYMBOL_GPL(vfio_device_try_get_registration);
182 
183 /*
184  * VFIO driver API
185  */
186 /* Release helper called by vfio_put_device() */
187 static void vfio_device_release(struct device *dev)
188 {
189 	struct vfio_device *device =
190 			container_of(dev, struct vfio_device, device);
191 
192 	vfio_release_device_set(device);
193 	ida_free(&vfio.device_ida, device->index);
194 
195 	if (device->ops->release)
196 		device->ops->release(device);
197 
198 	iput(device->inode);
199 	simple_release_fs(&vfio.vfs_mount, &vfio.fs_count);
200 	kvfree(device);
201 }
202 
203 static int vfio_init_device(struct vfio_device *device, struct device *dev,
204 			    const struct vfio_device_ops *ops);
205 
206 /*
207  * Allocate and initialize vfio_device so it can be registered to vfio
208  * core.
209  *
210  * Drivers should use the wrapper vfio_alloc_device() for allocation.
211  * @size is the size of the structure to be allocated, including any
212  * private data used by the driver.
213  *
214  * The driver may provide an @init callback to initialize device private data.
215  *
216  * Use vfio_put_device() to release the structure after a successful return.
217  */
218 struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev,
219 				       const struct vfio_device_ops *ops)
220 {
221 	struct vfio_device *device;
222 	int ret;
223 
224 	if (WARN_ON(size < sizeof(struct vfio_device)))
225 		return ERR_PTR(-EINVAL);
226 
227 	device = kvzalloc(size, GFP_KERNEL);
228 	if (!device)
229 		return ERR_PTR(-ENOMEM);
230 
231 	ret = vfio_init_device(device, dev, ops);
232 	if (ret)
233 		goto out_free;
234 	return device;
235 
236 out_free:
237 	kvfree(device);
238 	return ERR_PTR(ret);
239 }
240 EXPORT_SYMBOL_GPL(_vfio_alloc_device);
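/*
 * Example (illustrative sketch): drivers normally use the vfio_alloc_device()
 * wrapper from <linux/vfio.h>, which sizes the allocation from a driver
 * structure that embeds struct vfio_device as its first member.  The
 * "my_device" type and "my_ops" table are hypothetical.
 *
 *	struct my_device {
 *		struct vfio_device vdev;	// must be the first member
 *		void __iomem *regs;
 *	};
 *
 *	struct my_device *my;
 *
 *	my = vfio_alloc_device(my_device, vdev, dev, &my_ops);
 *	if (IS_ERR(my))
 *		return PTR_ERR(my);
 *	// ... on any later error path:
 *	vfio_put_device(&my->vdev);
 */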
241 
242 static int vfio_fs_init_fs_context(struct fs_context *fc)
243 {
244 	return init_pseudo(fc, VFIO_MAGIC) ? 0 : -ENOMEM;
245 }
246 
247 static struct file_system_type vfio_fs_type = {
248 	.name = "vfio",
249 	.owner = THIS_MODULE,
250 	.init_fs_context = vfio_fs_init_fs_context,
251 	.kill_sb = kill_anon_super,
252 };
253 
254 static struct inode *vfio_fs_inode_new(void)
255 {
256 	struct inode *inode;
257 	int ret;
258 
259 	ret = simple_pin_fs(&vfio_fs_type, &vfio.vfs_mount, &vfio.fs_count);
260 	if (ret)
261 		return ERR_PTR(ret);
262 
263 	inode = alloc_anon_inode(vfio.vfs_mount->mnt_sb);
264 	if (IS_ERR(inode))
265 		simple_release_fs(&vfio.vfs_mount, &vfio.fs_count);
266 
267 	return inode;
268 }
269 
270 /*
271  * Initialize a vfio_device so it can be registered to vfio core.
272  */
273 static int vfio_init_device(struct vfio_device *device, struct device *dev,
274 			    const struct vfio_device_ops *ops)
275 {
276 	int ret;
277 
278 	ret = ida_alloc_max(&vfio.device_ida, MINORMASK, GFP_KERNEL);
279 	if (ret < 0) {
280 		dev_dbg(dev, "Failed to allocate device index\n");
281 		return ret;
282 	}
283 
284 	device->index = ret;
285 	init_completion(&device->comp);
286 	device->dev = dev;
287 	device->ops = ops;
288 	device->inode = vfio_fs_inode_new();
289 	if (IS_ERR(device->inode)) {
290 		ret = PTR_ERR(device->inode);
291 		goto out_inode;
292 	}
293 
294 	if (ops->init) {
295 		ret = ops->init(device);
296 		if (ret)
297 			goto out_uninit;
298 	}
299 
300 	device_initialize(&device->device);
301 	device->device.release = vfio_device_release;
302 	device->device.class = vfio.device_class;
303 	device->device.parent = device->dev;
304 	return 0;
305 
306 out_uninit:
307 	iput(device->inode);
308 	simple_release_fs(&vfio.vfs_mount, &vfio.fs_count);
309 out_inode:
310 	vfio_release_device_set(device);
311 	ida_free(&vfio.device_ida, device->index);
312 	return ret;
313 }
314 
315 static int __vfio_register_dev(struct vfio_device *device,
316 			       enum vfio_group_type type)
317 {
318 	int ret;
319 
320 	if (WARN_ON(IS_ENABLED(CONFIG_IOMMUFD) &&
321 		    (!device->ops->bind_iommufd ||
322 		     !device->ops->unbind_iommufd ||
323 		     !device->ops->attach_ioas ||
324 		     !device->ops->detach_ioas)))
325 		return -EINVAL;
326 
327 	/*
328 	 * If the driver doesn't specify a set then the device is added to a
329 	 * singleton set just for itself.
330 	 */
331 	if (!device->dev_set)
332 		vfio_assign_device_set(device, device);
333 
334 	ret = dev_set_name(&device->device, "vfio%d", device->index);
335 	if (ret)
336 		return ret;
337 
338 	ret = vfio_device_set_group(device, type);
339 	if (ret)
340 		return ret;
341 
342 	/*
343 	 * VFIO always sets IOMMU_CACHE because we offer no way for userspace to
344 	 * restore cache coherency. It has to be checked here because it is only
345 	 * valid for cases where we are using iommu groups.
346 	 */
347 	if (type == VFIO_IOMMU && !vfio_device_is_noiommu(device) &&
348 	    !device_iommu_capable(device->dev, IOMMU_CAP_CACHE_COHERENCY)) {
349 		ret = -EINVAL;
350 		goto err_out;
351 	}
352 
353 	ret = vfio_device_add(device);
354 	if (ret)
355 		goto err_out;
356 
357 	/* Refcounting can't start until the driver calls register */
358 	refcount_set(&device->refcount, 1);
359 
360 	vfio_device_group_register(device);
361 	vfio_device_debugfs_init(device);
362 
363 	return 0;
364 err_out:
365 	vfio_device_remove_group(device);
366 	return ret;
367 }
368 
369 int vfio_register_group_dev(struct vfio_device *device)
370 {
371 	return __vfio_register_dev(device, VFIO_IOMMU);
372 }
373 EXPORT_SYMBOL_GPL(vfio_register_group_dev);
374 
375 /*
376  * Register a virtual device without IOMMU backing.  The user of this
377  * device must not be able to directly trigger unmediated DMA.
378  */
379 int vfio_register_emulated_iommu_dev(struct vfio_device *device)
380 {
381 	return __vfio_register_dev(device, VFIO_EMULATED_IOMMU);
382 }
383 EXPORT_SYMBOL_GPL(vfio_register_emulated_iommu_dev);
384 
385 /*
386  * Decrement the device reference count and wait for the device to be
387  * removed.  Open file descriptors for the device keep it referenced; this call blocks until they are all released. */
388 void vfio_unregister_group_dev(struct vfio_device *device)
389 {
390 	unsigned int i = 0;
391 	bool interrupted = false;
392 	long rc;
393 
394 	/*
395 	 * Prevent new devices from being opened by userspace via
396 	 * VFIO_GROUP_GET_DEVICE_FD in the group path.
397 	 */
398 	vfio_device_group_unregister(device);
399 
400 	/*
401 	 * Balances vfio_device_add() in the register path and also prevents
402 	 * new devices from being opened by userspace via the cdev path.
403 	 */
404 	vfio_device_del(device);
405 
406 	vfio_device_put_registration(device);
407 	rc = try_wait_for_completion(&device->comp);
408 	while (rc <= 0) {
409 		if (device->ops->request)
410 			device->ops->request(device, i++);
411 
412 		if (interrupted) {
413 			rc = wait_for_completion_timeout(&device->comp,
414 							 HZ * 10);
415 		} else {
416 			rc = wait_for_completion_interruptible_timeout(
417 				&device->comp, HZ * 10);
418 			if (rc < 0) {
419 				interrupted = true;
420 				dev_warn(device->dev,
421 					 "Device is currently in use, task"
422 					 " \"%s\" (%d) "
423 					 "blocked until device is released",
424 					 current->comm, task_pid_nr(current));
425 			}
426 		}
427 	}
428 
429 	vfio_device_debugfs_exit(device);
430 	/* Balances vfio_device_set_group in register path */
431 	vfio_device_remove_group(device);
432 }
433 EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);
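/*
 * Example (illustrative sketch): a typical physical-device driver pairs the
 * register and unregister calls in its probe/remove path, building on the
 * allocation sketch shown earlier.  "my_device", "my_ops" and the
 * bus-neutral probe/remove signatures are hypothetical.
 *
 *	static int my_probe(struct device *dev)
 *	{
 *		struct my_device *my;
 *		int ret;
 *
 *		my = vfio_alloc_device(my_device, vdev, dev, &my_ops);
 *		if (IS_ERR(my))
 *			return PTR_ERR(my);
 *
 *		ret = vfio_register_group_dev(&my->vdev);
 *		if (ret) {
 *			vfio_put_device(&my->vdev);
 *			return ret;
 *		}
 *		dev_set_drvdata(dev, my);
 *		return 0;
 *	}
 *
 *	static void my_remove(struct device *dev)
 *	{
 *		struct my_device *my = dev_get_drvdata(dev);
 *
 *		vfio_unregister_group_dev(&my->vdev);
 *		vfio_put_device(&my->vdev);
 *	}
 */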
434 
435 #if IS_ENABLED(CONFIG_KVM)
436 void vfio_device_get_kvm_safe(struct vfio_device *device, struct kvm *kvm)
437 {
438 	void (*pfn)(struct kvm *kvm);
439 	bool (*fn)(struct kvm *kvm);
440 	bool ret;
441 
442 	lockdep_assert_held(&device->dev_set->lock);
443 
444 	if (!kvm)
445 		return;
446 
447 	pfn = symbol_get(kvm_put_kvm);
448 	if (WARN_ON(!pfn))
449 		return;
450 
451 	fn = symbol_get(kvm_get_kvm_safe);
452 	if (WARN_ON(!fn)) {
453 		symbol_put(kvm_put_kvm);
454 		return;
455 	}
456 
457 	ret = fn(kvm);
458 	symbol_put(kvm_get_kvm_safe);
459 	if (!ret) {
460 		symbol_put(kvm_put_kvm);
461 		return;
462 	}
463 
464 	device->put_kvm = pfn;
465 	device->kvm = kvm;
466 }
467 
468 void vfio_device_put_kvm(struct vfio_device *device)
469 {
470 	lockdep_assert_held(&device->dev_set->lock);
471 
472 	if (!device->kvm)
473 		return;
474 
475 	if (WARN_ON(!device->put_kvm))
476 		goto clear;
477 
478 	device->put_kvm(device->kvm);
479 	device->put_kvm = NULL;
480 	symbol_put(kvm_put_kvm);
481 
482 clear:
483 	device->kvm = NULL;
484 }
485 #endif
486 
487 /* true if the vfio_device has open_device() called but not close_device() */
488 static bool vfio_assert_device_open(struct vfio_device *device)
489 {
490 	return !WARN_ON_ONCE(!READ_ONCE(device->open_count));
491 }
492 
493 struct vfio_device_file *
494 vfio_allocate_device_file(struct vfio_device *device)
495 {
496 	struct vfio_device_file *df;
497 
498 	df = kzalloc(sizeof(*df), GFP_KERNEL_ACCOUNT);
499 	if (!df)
500 		return ERR_PTR(-ENOMEM);
501 
502 	df->device = device;
503 	spin_lock_init(&df->kvm_ref_lock);
504 
505 	return df;
506 }
507 
508 static int vfio_df_device_first_open(struct vfio_device_file *df)
509 {
510 	struct vfio_device *device = df->device;
511 	struct iommufd_ctx *iommufd = df->iommufd;
512 	int ret;
513 
514 	lockdep_assert_held(&device->dev_set->lock);
515 
516 	if (!try_module_get(device->dev->driver->owner))
517 		return -ENODEV;
518 
519 	if (iommufd)
520 		ret = vfio_df_iommufd_bind(df);
521 	else
522 		ret = vfio_device_group_use_iommu(device);
523 	if (ret)
524 		goto err_module_put;
525 
526 	if (device->ops->open_device) {
527 		ret = device->ops->open_device(device);
528 		if (ret)
529 			goto err_unuse_iommu;
530 	}
531 	return 0;
532 
533 err_unuse_iommu:
534 	if (iommufd)
535 		vfio_df_iommufd_unbind(df);
536 	else
537 		vfio_device_group_unuse_iommu(device);
538 err_module_put:
539 	module_put(device->dev->driver->owner);
540 	return ret;
541 }
542 
543 static void vfio_df_device_last_close(struct vfio_device_file *df)
544 {
545 	struct vfio_device *device = df->device;
546 	struct iommufd_ctx *iommufd = df->iommufd;
547 
548 	lockdep_assert_held(&device->dev_set->lock);
549 
550 	if (device->ops->close_device)
551 		device->ops->close_device(device);
552 	if (iommufd)
553 		vfio_df_iommufd_unbind(df);
554 	else
555 		vfio_device_group_unuse_iommu(device);
556 	module_put(device->dev->driver->owner);
557 }
558 
559 int vfio_df_open(struct vfio_device_file *df)
560 {
561 	struct vfio_device *device = df->device;
562 	int ret = 0;
563 
564 	lockdep_assert_held(&device->dev_set->lock);
565 
566 	/*
567 	 * Only the group path allows the device to be opened multiple
568 	 * times.  The device cdev path has no secure way to support this.
569 	 */
570 	if (device->open_count != 0 && !df->group)
571 		return -EINVAL;
572 
573 	device->open_count++;
574 	if (device->open_count == 1) {
575 		ret = vfio_df_device_first_open(df);
576 		if (ret)
577 			device->open_count--;
578 	}
579 
580 	return ret;
581 }
582 
583 void vfio_df_close(struct vfio_device_file *df)
584 {
585 	struct vfio_device *device = df->device;
586 
587 	lockdep_assert_held(&device->dev_set->lock);
588 
589 	if (!vfio_assert_device_open(device))
590 		return;
591 	if (device->open_count == 1)
592 		vfio_df_device_last_close(df);
593 	device->open_count--;
594 }
595 
596 /*
597  * Wrapper around pm_runtime_resume_and_get().
598  * Return error code on failure or 0 on success.
599  */
600 static inline int vfio_device_pm_runtime_get(struct vfio_device *device)
601 {
602 	struct device *dev = device->dev;
603 
604 	if (dev->driver && dev->driver->pm) {
605 		int ret;
606 
607 		ret = pm_runtime_resume_and_get(dev);
608 		if (ret) {
609 			dev_info_ratelimited(dev,
610 				"vfio: runtime resume failed %d\n", ret);
611 			return -EIO;
612 		}
613 	}
614 
615 	return 0;
616 }
617 
618 /*
619  * Wrapper around pm_runtime_put().
620  */
621 static inline void vfio_device_pm_runtime_put(struct vfio_device *device)
622 {
623 	struct device *dev = device->dev;
624 
625 	if (dev->driver && dev->driver->pm)
626 		pm_runtime_put(dev);
627 }
628 
629 /*
630  * VFIO Device fd
631  */
632 static int vfio_device_fops_release(struct inode *inode, struct file *filep)
633 {
634 	struct vfio_device_file *df = filep->private_data;
635 	struct vfio_device *device = df->device;
636 
637 	if (df->group)
638 		vfio_df_group_close(df);
639 	else
640 		vfio_df_unbind_iommufd(df);
641 
642 	vfio_device_put_registration(device);
643 
644 	kfree(df);
645 
646 	return 0;
647 }
648 
649 /*
650  * vfio_mig_get_next_state - Compute the next step in the FSM
651  * @cur_fsm - The current state the device is in
652  * @new_fsm - The target state to reach
653  * @next_fsm - Pointer to the next step to get to new_fsm
654  *
655  * Return 0 upon success, otherwise -errno
656  * Upon success the next step in the state progression between cur_fsm and
657  * new_fsm will be set in next_fsm.
658  *
659  * This breaks down requests for combination transitions into smaller steps and
660  * returns the next step to get to new_fsm. The function may need to be called
661  * multiple times before reaching new_fsm.
662  *
663  */
664 int vfio_mig_get_next_state(struct vfio_device *device,
665 			    enum vfio_device_mig_state cur_fsm,
666 			    enum vfio_device_mig_state new_fsm,
667 			    enum vfio_device_mig_state *next_fsm)
668 {
669 	enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_PRE_COPY_P2P + 1 };
670 	/*
671 	 * The coding in this table requires the driver to implement the
672 	 * following FSM arcs:
673 	 *         RESUMING -> STOP
674 	 *         STOP -> RESUMING
675 	 *         STOP -> STOP_COPY
676 	 *         STOP_COPY -> STOP
677 	 *
678 	 * If P2P is supported then the driver must also implement these FSM
679 	 * arcs:
680 	 *         RUNNING -> RUNNING_P2P
681 	 *         RUNNING_P2P -> RUNNING
682 	 *         RUNNING_P2P -> STOP
683 	 *         STOP -> RUNNING_P2P
684 	 *
685 	 * If precopy is supported then the driver must support these additional
686 	 * FSM arcs:
687 	 *         RUNNING -> PRE_COPY
688 	 *         PRE_COPY -> RUNNING
689 	 *         PRE_COPY -> STOP_COPY
690 	 * However, if precopy and P2P are supported together then the driver
691 	 * must support these additional arcs beyond the P2P arcs above:
692 	 *         PRE_COPY -> RUNNING
693 	 *         PRE_COPY -> PRE_COPY_P2P
694 	 *         PRE_COPY_P2P -> PRE_COPY
695 	 *         PRE_COPY_P2P -> RUNNING_P2P
696 	 *         PRE_COPY_P2P -> STOP_COPY
697 	 *         RUNNING -> PRE_COPY
698 	 *         RUNNING_P2P -> PRE_COPY_P2P
699 	 *
700 	 * Without P2P and precopy the driver must implement:
701 	 *         RUNNING -> STOP
702 	 *         STOP -> RUNNING
703 	 *
704 	 * The coding will step through multiple states for some combination
705 	 * transitions; if all optional features are supported, this means the
706 	 * following ones:
707 	 *         PRE_COPY -> PRE_COPY_P2P -> STOP_COPY
708 	 *         PRE_COPY -> RUNNING -> RUNNING_P2P
709 	 *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP
710 	 *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP -> RESUMING
711 	 *         PRE_COPY_P2P -> RUNNING_P2P -> RUNNING
712 	 *         PRE_COPY_P2P -> RUNNING_P2P -> STOP
713 	 *         PRE_COPY_P2P -> RUNNING_P2P -> STOP -> RESUMING
714 	 *         RESUMING -> STOP -> RUNNING_P2P
715 	 *         RESUMING -> STOP -> RUNNING_P2P -> PRE_COPY_P2P
716 	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING
717 	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
718 	 *         RESUMING -> STOP -> STOP_COPY
719 	 *         RUNNING -> RUNNING_P2P -> PRE_COPY_P2P
720 	 *         RUNNING -> RUNNING_P2P -> STOP
721 	 *         RUNNING -> RUNNING_P2P -> STOP -> RESUMING
722 	 *         RUNNING -> RUNNING_P2P -> STOP -> STOP_COPY
723 	 *         RUNNING_P2P -> RUNNING -> PRE_COPY
724 	 *         RUNNING_P2P -> STOP -> RESUMING
725 	 *         RUNNING_P2P -> STOP -> STOP_COPY
726 	 *         STOP -> RUNNING_P2P -> PRE_COPY_P2P
727 	 *         STOP -> RUNNING_P2P -> RUNNING
728 	 *         STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
729 	 *         STOP_COPY -> STOP -> RESUMING
730 	 *         STOP_COPY -> STOP -> RUNNING_P2P
731 	 *         STOP_COPY -> STOP -> RUNNING_P2P -> RUNNING
732 	 *
733 	 *  The following transitions are blocked:
734 	 *         STOP_COPY -> PRE_COPY
735 	 *         STOP_COPY -> PRE_COPY_P2P
736 	 */
737 	static const u8 vfio_from_fsm_table[VFIO_DEVICE_NUM_STATES][VFIO_DEVICE_NUM_STATES] = {
738 		[VFIO_DEVICE_STATE_STOP] = {
739 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
740 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
741 			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
742 			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
743 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
744 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
745 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
746 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
747 		},
748 		[VFIO_DEVICE_STATE_RUNNING] = {
749 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
750 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
751 			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
752 			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
753 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
754 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
755 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
756 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
757 		},
758 		[VFIO_DEVICE_STATE_PRE_COPY] = {
759 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING,
760 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
761 			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
762 			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
763 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
764 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING,
765 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING,
766 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
767 		},
768 		[VFIO_DEVICE_STATE_PRE_COPY_P2P] = {
769 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
770 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
771 			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
772 			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
773 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
774 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
775 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
776 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
777 		},
778 		[VFIO_DEVICE_STATE_STOP_COPY] = {
779 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
780 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
781 			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
782 			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
783 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
784 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
785 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
786 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
787 		},
788 		[VFIO_DEVICE_STATE_RESUMING] = {
789 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
790 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
791 			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_STOP,
792 			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_STOP,
793 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
794 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
795 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
796 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
797 		},
798 		[VFIO_DEVICE_STATE_RUNNING_P2P] = {
799 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
800 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
801 			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING,
802 			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
803 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
804 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
805 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
806 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
807 		},
808 		[VFIO_DEVICE_STATE_ERROR] = {
809 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_ERROR,
810 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_ERROR,
811 			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
812 			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
813 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_ERROR,
814 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_ERROR,
815 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_ERROR,
816 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
817 		},
818 	};
819 
820 	static const unsigned int state_flags_table[VFIO_DEVICE_NUM_STATES] = {
821 		[VFIO_DEVICE_STATE_STOP] = VFIO_MIGRATION_STOP_COPY,
822 		[VFIO_DEVICE_STATE_RUNNING] = VFIO_MIGRATION_STOP_COPY,
823 		[VFIO_DEVICE_STATE_PRE_COPY] =
824 			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY,
825 		[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_MIGRATION_STOP_COPY |
826 						   VFIO_MIGRATION_P2P |
827 						   VFIO_MIGRATION_PRE_COPY,
828 		[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_MIGRATION_STOP_COPY,
829 		[VFIO_DEVICE_STATE_RESUMING] = VFIO_MIGRATION_STOP_COPY,
830 		[VFIO_DEVICE_STATE_RUNNING_P2P] =
831 			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P,
832 		[VFIO_DEVICE_STATE_ERROR] = ~0U,
833 	};
834 
835 	if (WARN_ON(cur_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
836 		    (state_flags_table[cur_fsm] & device->migration_flags) !=
837 			state_flags_table[cur_fsm]))
838 		return -EINVAL;
839 
840 	if (new_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
841 	   (state_flags_table[new_fsm] & device->migration_flags) !=
842 			state_flags_table[new_fsm])
843 		return -EINVAL;
844 
845 	/*
846 	 * Arcs touching optional and unsupported states are skipped over. The
847 	 * driver will instead see an arc from the original state to the next
848 	 * logical state, as per the above comment.
849 	 */
850 	*next_fsm = vfio_from_fsm_table[cur_fsm][new_fsm];
851 	while ((state_flags_table[*next_fsm] & device->migration_flags) !=
852 			state_flags_table[*next_fsm])
853 		*next_fsm = vfio_from_fsm_table[*next_fsm][new_fsm];
854 
855 	return (*next_fsm != VFIO_DEVICE_STATE_ERROR) ? 0 : -EINVAL;
856 }
857 EXPORT_SYMBOL_GPL(vfio_mig_get_next_state);
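/*
 * Example (illustrative sketch): a driver's migration_set_state() op is
 * expected to call vfio_mig_get_next_state() in a loop, applying one
 * supported arc at a time until the requested state is reached.  Here "cur"
 * and "new" hold the driver's current and requested states, and
 * "my_step_device" is a hypothetical helper that performs a single arc.
 *
 *	enum vfio_device_mig_state next;
 *	int ret;
 *
 *	while (cur != new) {
 *		ret = vfio_mig_get_next_state(vdev, cur, new, &next);
 *		if (ret)
 *			return ERR_PTR(ret);
 *		ret = my_step_device(vdev, cur, next);
 *		if (ret)
 *			return ERR_PTR(ret);
 *		cur = next;
 *	}
 */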
858 
859 /*
860  * Convert the driver's struct file into an FD number and return it to userspace
861  */
862 static int vfio_ioct_mig_return_fd(struct file *filp, void __user *arg,
863 				   struct vfio_device_feature_mig_state *mig)
864 {
865 	int ret;
866 	int fd;
867 
868 	fd = get_unused_fd_flags(O_CLOEXEC);
869 	if (fd < 0) {
870 		ret = fd;
871 		goto out_fput;
872 	}
873 
874 	mig->data_fd = fd;
875 	if (copy_to_user(arg, mig, sizeof(*mig))) {
876 		ret = -EFAULT;
877 		goto out_put_unused;
878 	}
879 	fd_install(fd, filp);
880 	return 0;
881 
882 out_put_unused:
883 	put_unused_fd(fd);
884 out_fput:
885 	fput(filp);
886 	return ret;
887 }
888 
889 static int
890 vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device,
891 					   u32 flags, void __user *arg,
892 					   size_t argsz)
893 {
894 	size_t minsz =
895 		offsetofend(struct vfio_device_feature_mig_state, data_fd);
896 	struct vfio_device_feature_mig_state mig;
897 	struct file *filp = NULL;
898 	int ret;
899 
900 	if (!device->mig_ops)
901 		return -ENOTTY;
902 
903 	ret = vfio_check_feature(flags, argsz,
904 				 VFIO_DEVICE_FEATURE_SET |
905 				 VFIO_DEVICE_FEATURE_GET,
906 				 sizeof(mig));
907 	if (ret != 1)
908 		return ret;
909 
910 	if (copy_from_user(&mig, arg, minsz))
911 		return -EFAULT;
912 
913 	if (flags & VFIO_DEVICE_FEATURE_GET) {
914 		enum vfio_device_mig_state curr_state;
915 
916 		ret = device->mig_ops->migration_get_state(device,
917 							   &curr_state);
918 		if (ret)
919 			return ret;
920 		mig.device_state = curr_state;
921 		goto out_copy;
922 	}
923 
924 	/* Handle the VFIO_DEVICE_FEATURE_SET */
925 	filp = device->mig_ops->migration_set_state(device, mig.device_state);
926 	if (IS_ERR(filp) || !filp)
927 		goto out_copy;
928 
929 	return vfio_ioct_mig_return_fd(filp, arg, &mig);
930 out_copy:
931 	mig.data_fd = -1;
932 	if (copy_to_user(arg, &mig, sizeof(mig)))
933 		return -EFAULT;
934 	if (IS_ERR(filp))
935 		return PTR_ERR(filp);
936 	return 0;
937 }
938 
939 static int
940 vfio_ioctl_device_feature_migration_data_size(struct vfio_device *device,
941 					      u32 flags, void __user *arg,
942 					      size_t argsz)
943 {
944 	struct vfio_device_feature_mig_data_size data_size = {};
945 	unsigned long stop_copy_length;
946 	int ret;
947 
948 	if (!device->mig_ops)
949 		return -ENOTTY;
950 
951 	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
952 				 sizeof(data_size));
953 	if (ret != 1)
954 		return ret;
955 
956 	ret = device->mig_ops->migration_get_data_size(device, &stop_copy_length);
957 	if (ret)
958 		return ret;
959 
960 	data_size.stop_copy_length = stop_copy_length;
961 	if (copy_to_user(arg, &data_size, sizeof(data_size)))
962 		return -EFAULT;
963 
964 	return 0;
965 }
966 
967 static int vfio_ioctl_device_feature_migration(struct vfio_device *device,
968 					       u32 flags, void __user *arg,
969 					       size_t argsz)
970 {
971 	struct vfio_device_feature_migration mig = {
972 		.flags = device->migration_flags,
973 	};
974 	int ret;
975 
976 	if (!device->mig_ops)
977 		return -ENOTTY;
978 
979 	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
980 				 sizeof(mig));
981 	if (ret != 1)
982 		return ret;
983 	if (copy_to_user(arg, &mig, sizeof(mig)))
984 		return -EFAULT;
985 	return 0;
986 }
987 
988 void vfio_combine_iova_ranges(struct rb_root_cached *root, u32 cur_nodes,
989 			      u32 req_nodes)
990 {
991 	struct interval_tree_node *prev, *curr, *comb_start, *comb_end;
992 	unsigned long min_gap, curr_gap;
993 
994 	/* Special shortcut when a single range is required */
995 	if (req_nodes == 1) {
996 		unsigned long last;
997 
998 		comb_start = interval_tree_iter_first(root, 0, ULONG_MAX);
999 
1000 		/* Empty list */
1001 		if (WARN_ON_ONCE(!comb_start))
1002 			return;
1003 
1004 		curr = comb_start;
1005 		while (curr) {
1006 			last = curr->last;
1007 			prev = curr;
1008 			curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
1009 			if (prev != comb_start)
1010 				interval_tree_remove(prev, root);
1011 		}
1012 		comb_start->last = last;
1013 		return;
1014 	}
1015 
1016 	/* Combine ranges which have the smallest gap */
1017 	while (cur_nodes > req_nodes) {
1018 		prev = NULL;
1019 		min_gap = ULONG_MAX;
1020 		curr = interval_tree_iter_first(root, 0, ULONG_MAX);
1021 		while (curr) {
1022 			if (prev) {
1023 				curr_gap = curr->start - prev->last;
1024 				if (curr_gap < min_gap) {
1025 					min_gap = curr_gap;
1026 					comb_start = prev;
1027 					comb_end = curr;
1028 				}
1029 			}
1030 			prev = curr;
1031 			curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
1032 		}
1033 
1034 		/* Empty list or no nodes to combine */
1035 		if (WARN_ON_ONCE(min_gap == ULONG_MAX))
1036 			break;
1037 
1038 		comb_start->last = comb_end->last;
1039 		interval_tree_remove(comb_end, root);
1040 		cur_nodes--;
1041 	}
1042 }
1043 EXPORT_SYMBOL_GPL(vfio_combine_iova_ranges);
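/*
 * Example (illustrative sketch): a log_start() implementation whose hardware
 * can only track a limited number of ranges may condense the interval tree it
 * is given before programming the device.  "ranges" and "nnodes" stand for
 * the op's tree and node count; "MY_HW_MAX_RANGES" is a hypothetical limit.
 *
 *	if (nnodes > MY_HW_MAX_RANGES) {
 *		vfio_combine_iova_ranges(ranges, nnodes, MY_HW_MAX_RANGES);
 *		nnodes = MY_HW_MAX_RANGES;
 *	}
 */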
1044 
1045 /* Ranges should fit into a single kernel page */
1046 #define LOG_MAX_RANGES \
1047 	(PAGE_SIZE / sizeof(struct vfio_device_feature_dma_logging_range))
1048 
1049 static int
1050 vfio_ioctl_device_feature_logging_start(struct vfio_device *device,
1051 					u32 flags, void __user *arg,
1052 					size_t argsz)
1053 {
1054 	size_t minsz =
1055 		offsetofend(struct vfio_device_feature_dma_logging_control,
1056 			    ranges);
1057 	struct vfio_device_feature_dma_logging_range __user *ranges;
1058 	struct vfio_device_feature_dma_logging_control control;
1059 	struct vfio_device_feature_dma_logging_range range;
1060 	struct rb_root_cached root = RB_ROOT_CACHED;
1061 	struct interval_tree_node *nodes;
1062 	u64 iova_end;
1063 	u32 nnodes;
1064 	int i, ret;
1065 
1066 	if (!device->log_ops)
1067 		return -ENOTTY;
1068 
1069 	ret = vfio_check_feature(flags, argsz,
1070 				 VFIO_DEVICE_FEATURE_SET,
1071 				 sizeof(control));
1072 	if (ret != 1)
1073 		return ret;
1074 
1075 	if (copy_from_user(&control, arg, minsz))
1076 		return -EFAULT;
1077 
1078 	nnodes = control.num_ranges;
1079 	if (!nnodes)
1080 		return -EINVAL;
1081 
1082 	if (nnodes > LOG_MAX_RANGES)
1083 		return -E2BIG;
1084 
1085 	ranges = u64_to_user_ptr(control.ranges);
1086 	nodes = kmalloc_array(nnodes, sizeof(struct interval_tree_node),
1087 			      GFP_KERNEL);
1088 	if (!nodes)
1089 		return -ENOMEM;
1090 
1091 	for (i = 0; i < nnodes; i++) {
1092 		if (copy_from_user(&range, &ranges[i], sizeof(range))) {
1093 			ret = -EFAULT;
1094 			goto end;
1095 		}
1096 		if (!IS_ALIGNED(range.iova, control.page_size) ||
1097 		    !IS_ALIGNED(range.length, control.page_size)) {
1098 			ret = -EINVAL;
1099 			goto end;
1100 		}
1101 
1102 		if (check_add_overflow(range.iova, range.length, &iova_end) ||
1103 		    iova_end > ULONG_MAX) {
1104 			ret = -EOVERFLOW;
1105 			goto end;
1106 		}
1107 
1108 		nodes[i].start = range.iova;
1109 		nodes[i].last = range.iova + range.length - 1;
1110 		if (interval_tree_iter_first(&root, nodes[i].start,
1111 					     nodes[i].last)) {
1112 			/* Range overlapping */
1113 			ret = -EINVAL;
1114 			goto end;
1115 		}
1116 		interval_tree_insert(nodes + i, &root);
1117 	}
1118 
1119 	ret = device->log_ops->log_start(device, &root, nnodes,
1120 					 &control.page_size);
1121 	if (ret)
1122 		goto end;
1123 
1124 	if (copy_to_user(arg, &control, sizeof(control))) {
1125 		ret = -EFAULT;
1126 		device->log_ops->log_stop(device);
1127 	}
1128 
1129 end:
1130 	kfree(nodes);
1131 	return ret;
1132 }
1133 
1134 static int
1135 vfio_ioctl_device_feature_logging_stop(struct vfio_device *device,
1136 				       u32 flags, void __user *arg,
1137 				       size_t argsz)
1138 {
1139 	int ret;
1140 
1141 	if (!device->log_ops)
1142 		return -ENOTTY;
1143 
1144 	ret = vfio_check_feature(flags, argsz,
1145 				 VFIO_DEVICE_FEATURE_SET, 0);
1146 	if (ret != 1)
1147 		return ret;
1148 
1149 	return device->log_ops->log_stop(device);
1150 }
1151 
1152 static int vfio_device_log_read_and_clear(struct iova_bitmap *iter,
1153 					  unsigned long iova, size_t length,
1154 					  void *opaque)
1155 {
1156 	struct vfio_device *device = opaque;
1157 
1158 	return device->log_ops->log_read_and_clear(device, iova, length, iter);
1159 }
1160 
1161 static int
1162 vfio_ioctl_device_feature_logging_report(struct vfio_device *device,
1163 					 u32 flags, void __user *arg,
1164 					 size_t argsz)
1165 {
1166 	size_t minsz =
1167 		offsetofend(struct vfio_device_feature_dma_logging_report,
1168 			    bitmap);
1169 	struct vfio_device_feature_dma_logging_report report;
1170 	struct iova_bitmap *iter;
1171 	u64 iova_end;
1172 	int ret;
1173 
1174 	if (!device->log_ops)
1175 		return -ENOTTY;
1176 
1177 	ret = vfio_check_feature(flags, argsz,
1178 				 VFIO_DEVICE_FEATURE_GET,
1179 				 sizeof(report));
1180 	if (ret != 1)
1181 		return ret;
1182 
1183 	if (copy_from_user(&report, arg, minsz))
1184 		return -EFAULT;
1185 
1186 	if (report.page_size < SZ_4K || !is_power_of_2(report.page_size))
1187 		return -EINVAL;
1188 
1189 	if (check_add_overflow(report.iova, report.length, &iova_end) ||
1190 	    iova_end > ULONG_MAX)
1191 		return -EOVERFLOW;
1192 
1193 	iter = iova_bitmap_alloc(report.iova, report.length,
1194 				 report.page_size,
1195 				 u64_to_user_ptr(report.bitmap));
1196 	if (IS_ERR(iter))
1197 		return PTR_ERR(iter);
1198 
1199 	ret = iova_bitmap_for_each(iter, device,
1200 				   vfio_device_log_read_and_clear);
1201 
1202 	iova_bitmap_free(iter);
1203 	return ret;
1204 }
1205 
1206 static int vfio_ioctl_device_feature(struct vfio_device *device,
1207 				     struct vfio_device_feature __user *arg)
1208 {
1209 	size_t minsz = offsetofend(struct vfio_device_feature, flags);
1210 	struct vfio_device_feature feature;
1211 
1212 	if (copy_from_user(&feature, arg, minsz))
1213 		return -EFAULT;
1214 
1215 	if (feature.argsz < minsz)
1216 		return -EINVAL;
1217 
1218 	/* Check unknown flags */
1219 	if (feature.flags &
1220 	    ~(VFIO_DEVICE_FEATURE_MASK | VFIO_DEVICE_FEATURE_SET |
1221 	      VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_PROBE))
1222 		return -EINVAL;
1223 
1224 	/* GET & SET are mutually exclusive except with PROBE */
1225 	if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) &&
1226 	    (feature.flags & VFIO_DEVICE_FEATURE_SET) &&
1227 	    (feature.flags & VFIO_DEVICE_FEATURE_GET))
1228 		return -EINVAL;
1229 
1230 	switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) {
1231 	case VFIO_DEVICE_FEATURE_MIGRATION:
1232 		return vfio_ioctl_device_feature_migration(
1233 			device, feature.flags, arg->data,
1234 			feature.argsz - minsz);
1235 	case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE:
1236 		return vfio_ioctl_device_feature_mig_device_state(
1237 			device, feature.flags, arg->data,
1238 			feature.argsz - minsz);
1239 	case VFIO_DEVICE_FEATURE_DMA_LOGGING_START:
1240 		return vfio_ioctl_device_feature_logging_start(
1241 			device, feature.flags, arg->data,
1242 			feature.argsz - minsz);
1243 	case VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP:
1244 		return vfio_ioctl_device_feature_logging_stop(
1245 			device, feature.flags, arg->data,
1246 			feature.argsz - minsz);
1247 	case VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT:
1248 		return vfio_ioctl_device_feature_logging_report(
1249 			device, feature.flags, arg->data,
1250 			feature.argsz - minsz);
1251 	case VFIO_DEVICE_FEATURE_MIG_DATA_SIZE:
1252 		return vfio_ioctl_device_feature_migration_data_size(
1253 			device, feature.flags, arg->data,
1254 			feature.argsz - minsz);
1255 	default:
1256 		if (unlikely(!device->ops->device_feature))
1257 			return -ENOTTY;
1258 		return device->ops->device_feature(device, feature.flags,
1259 						   arg->data,
1260 						   feature.argsz - minsz);
1261 	}
1262 }
1263 
1264 static long vfio_get_region_info(struct vfio_device *device,
1265 				 struct vfio_region_info __user *arg)
1266 {
1267 	unsigned long minsz = offsetofend(struct vfio_region_info, offset);
1268 	struct vfio_region_info info = {};
1269 	struct vfio_info_cap caps = {};
1270 	int ret;
1271 
1272 	if (unlikely(!device->ops->get_region_info_caps))
1273 		return -EINVAL;
1274 
1275 	if (copy_from_user(&info, arg, minsz))
1276 		return -EFAULT;
1277 	if (info.argsz < minsz)
1278 		return -EINVAL;
1279 
1280 	ret = device->ops->get_region_info_caps(device, &info, &caps);
1281 	if (ret)
1282 		goto out_free;
1283 
1284 	if (caps.size) {
1285 		info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
1286 		if (info.argsz < sizeof(info) + caps.size) {
1287 			info.argsz = sizeof(info) + caps.size;
1288 			info.cap_offset = 0;
1289 		} else {
1290 			vfio_info_cap_shift(&caps, sizeof(info));
1291 			if (copy_to_user(arg + 1, caps.buf, caps.size)) {
1292 				ret = -EFAULT;
1293 				goto out_free;
1294 			}
1295 			info.cap_offset = sizeof(info);
1296 		}
1297 	}
1298 
1299 	if (copy_to_user(arg, &info, minsz)) {
1300 		ret = -EFAULT;
1301 		goto out_free;
1302 	}
1303 
1304 out_free:
1305 	kfree(caps.buf);
1306 	return ret;
1307 }
1308 
1309 static long vfio_device_fops_unl_ioctl(struct file *filep,
1310 				       unsigned int cmd, unsigned long arg)
1311 {
1312 	struct vfio_device_file *df = filep->private_data;
1313 	struct vfio_device *device = df->device;
1314 	void __user *uptr = (void __user *)arg;
1315 	int ret;
1316 
1317 	if (cmd == VFIO_DEVICE_BIND_IOMMUFD)
1318 		return vfio_df_ioctl_bind_iommufd(df, uptr);
1319 
1320 	/* Paired with smp_store_release() following vfio_df_open() */
1321 	if (!smp_load_acquire(&df->access_granted))
1322 		return -EINVAL;
1323 
1324 	ret = vfio_device_pm_runtime_get(device);
1325 	if (ret)
1326 		return ret;
1327 
1328 	/* cdev only ioctls */
1329 	if (IS_ENABLED(CONFIG_VFIO_DEVICE_CDEV) && !df->group) {
1330 		switch (cmd) {
1331 		case VFIO_DEVICE_ATTACH_IOMMUFD_PT:
1332 			ret = vfio_df_ioctl_attach_pt(df, uptr);
1333 			goto out;
1334 
1335 		case VFIO_DEVICE_DETACH_IOMMUFD_PT:
1336 			ret = vfio_df_ioctl_detach_pt(df, uptr);
1337 			goto out;
1338 		}
1339 	}
1340 
1341 	switch (cmd) {
1342 	case VFIO_DEVICE_FEATURE:
1343 		ret = vfio_ioctl_device_feature(device, uptr);
1344 		break;
1345 
1346 	case VFIO_DEVICE_GET_REGION_INFO:
1347 		ret = vfio_get_region_info(device, uptr);
1348 		break;
1349 
1350 	default:
1351 		if (unlikely(!device->ops->ioctl))
1352 			ret = -EINVAL;
1353 		else
1354 			ret = device->ops->ioctl(device, cmd, arg);
1355 		break;
1356 	}
1357 out:
1358 	vfio_device_pm_runtime_put(device);
1359 	return ret;
1360 }
1361 
1362 static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
1363 				     size_t count, loff_t *ppos)
1364 {
1365 	struct vfio_device_file *df = filep->private_data;
1366 	struct vfio_device *device = df->device;
1367 
1368 	/* Paired with smp_store_release() following vfio_df_open() */
1369 	if (!smp_load_acquire(&df->access_granted))
1370 		return -EINVAL;
1371 
1372 	if (unlikely(!device->ops->read))
1373 		return -EINVAL;
1374 
1375 	return device->ops->read(device, buf, count, ppos);
1376 }
1377 
1378 static ssize_t vfio_device_fops_write(struct file *filep,
1379 				      const char __user *buf,
1380 				      size_t count, loff_t *ppos)
1381 {
1382 	struct vfio_device_file *df = filep->private_data;
1383 	struct vfio_device *device = df->device;
1384 
1385 	/* Paired with smp_store_release() following vfio_df_open() */
1386 	if (!smp_load_acquire(&df->access_granted))
1387 		return -EINVAL;
1388 
1389 	if (unlikely(!device->ops->write))
1390 		return -EINVAL;
1391 
1392 	return device->ops->write(device, buf, count, ppos);
1393 }
1394 
1395 static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1396 {
1397 	struct vfio_device_file *df = filep->private_data;
1398 	struct vfio_device *device = df->device;
1399 
1400 	/* Paired with smp_store_release() following vfio_df_open() */
1401 	if (!smp_load_acquire(&df->access_granted))
1402 		return -EINVAL;
1403 
1404 	if (unlikely(!device->ops->mmap))
1405 		return -EINVAL;
1406 
1407 	return device->ops->mmap(device, vma);
1408 }
1409 
1410 #ifdef CONFIG_PROC_FS
1411 static void vfio_device_show_fdinfo(struct seq_file *m, struct file *filep)
1412 {
1413 	char *path;
1414 	struct vfio_device_file *df = filep->private_data;
1415 	struct vfio_device *device = df->device;
1416 
1417 	path = kobject_get_path(&device->dev->kobj, GFP_KERNEL);
1418 	if (!path)
1419 		return;
1420 
1421 	seq_printf(m, "vfio-device-syspath: /sys%s\n", path);
1422 	kfree(path);
1423 }
1424 #endif
1425 
1426 const struct file_operations vfio_device_fops = {
1427 	.owner		= THIS_MODULE,
1428 	.open		= vfio_device_fops_cdev_open,
1429 	.release	= vfio_device_fops_release,
1430 	.read		= vfio_device_fops_read,
1431 	.write		= vfio_device_fops_write,
1432 	.unlocked_ioctl	= vfio_device_fops_unl_ioctl,
1433 	.compat_ioctl	= compat_ptr_ioctl,
1434 	.mmap		= vfio_device_fops_mmap,
1435 #ifdef CONFIG_PROC_FS
1436 	.show_fdinfo	= vfio_device_show_fdinfo,
1437 #endif
1438 };
1439 
1440 static struct vfio_device *vfio_device_from_file(struct file *file)
1441 {
1442 	struct vfio_device_file *df = file->private_data;
1443 
1444 	if (file->f_op != &vfio_device_fops)
1445 		return NULL;
1446 	return df->device;
1447 }
1448 
1449 /**
1450  * vfio_file_is_valid - True if the file is a valid vfio file
1451  * @file: VFIO group file or VFIO device file
1452  */
1453 bool vfio_file_is_valid(struct file *file)
1454 {
1455 	return vfio_group_from_file(file) ||
1456 	       vfio_device_from_file(file);
1457 }
1458 EXPORT_SYMBOL_GPL(vfio_file_is_valid);
1459 
1460 /**
1461  * vfio_file_enforced_coherent - True if the DMA associated with the VFIO file
1462  *        is always CPU cache coherent
1463  * @file: VFIO group file or VFIO device file
1464  *
1465  * Enforced coherency means that the IOMMU ignores things like the PCIe no-snoop
1466  * bit in DMA transactions. A return of false indicates that the user has
1467  * rights to access additional instructions such as wbinvd on x86.
1468  */
1469 bool vfio_file_enforced_coherent(struct file *file)
1470 {
1471 	struct vfio_device *device;
1472 	struct vfio_group *group;
1473 
1474 	group = vfio_group_from_file(file);
1475 	if (group)
1476 		return vfio_group_enforced_coherent(group);
1477 
1478 	device = vfio_device_from_file(file);
1479 	if (device)
1480 		return device_iommu_capable(device->dev,
1481 					    IOMMU_CAP_ENFORCE_CACHE_COHERENCY);
1482 
1483 	return true;
1484 }
1485 EXPORT_SYMBOL_GPL(vfio_file_enforced_coherent);
1486 
1487 static void vfio_device_file_set_kvm(struct file *file, struct kvm *kvm)
1488 {
1489 	struct vfio_device_file *df = file->private_data;
1490 
1491 	/*
1492 	 * The kvm is first recorded in the vfio_device_file, and will
1493 	 * be propagated to vfio_device::kvm when the file is bound to
1494 	 * iommufd successfully in the vfio device cdev path.
1495 	 */
1496 	spin_lock(&df->kvm_ref_lock);
1497 	df->kvm = kvm;
1498 	spin_unlock(&df->kvm_ref_lock);
1499 }
1500 
1501 /**
1502  * vfio_file_set_kvm - Link a kvm with VFIO drivers
1503  * @file: VFIO group file or VFIO device file
1504  * @kvm: KVM to link
1505  *
1506  * When a VFIO device is first opened the KVM will be available in
1507  * device->kvm if one was associated with the file.
1508  */
1509 void vfio_file_set_kvm(struct file *file, struct kvm *kvm)
1510 {
1511 	struct vfio_group *group;
1512 
1513 	group = vfio_group_from_file(file);
1514 	if (group)
1515 		vfio_group_set_kvm(group, kvm);
1516 
1517 	if (vfio_device_from_file(file))
1518 		vfio_device_file_set_kvm(file, kvm);
1519 }
1520 EXPORT_SYMBOL_GPL(vfio_file_set_kvm);
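/*
 * Example (illustrative sketch): an external consumer such as KVM validates
 * the fd it was handed and then links (and later unlinks) its kvm:
 *
 *	if (!vfio_file_is_valid(file))
 *		return -EINVAL;
 *	vfio_file_set_kvm(file, kvm);
 *	// ... and on teardown:
 *	vfio_file_set_kvm(file, NULL);
 */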
1521 
1522 /*
1523  * Sub-module support
1524  */
1525 /*
1526  * Helper for managing a buffer of info chain capabilities, allocate or
1527  * reallocate a buffer with additional @size, filling in @id and @version
1528  * of the capability.  A pointer to the new capability is returned.
1529  *
1530  * NB. The chain is based at the head of the buffer, so new entries are
1531  * added to the tail; vfio_info_cap_shift() should be called to fix up the
1532  * next offsets prior to copying to the user buffer.
1533  */
1534 struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
1535 					       size_t size, u16 id, u16 version)
1536 {
1537 	void *buf;
1538 	struct vfio_info_cap_header *header, *tmp;
1539 
1540 	/* Ensure that the next capability struct will be aligned */
1541 	size = ALIGN(size, sizeof(u64));
1542 
1543 	buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
1544 	if (!buf) {
1545 		kfree(caps->buf);
1546 		caps->buf = NULL;
1547 		caps->size = 0;
1548 		return ERR_PTR(-ENOMEM);
1549 	}
1550 
1551 	caps->buf = buf;
1552 	header = buf + caps->size;
1553 
1554 	/* Eventually copied to user buffer, zero */
1555 	memset(header, 0, size);
1556 
1557 	header->id = id;
1558 	header->version = version;
1559 
1560 	/* Add to the end of the capability chain */
1561 	for (tmp = buf; tmp->next; tmp = buf + tmp->next)
1562 		; /* nothing */
1563 
1564 	tmp->next = caps->size;
1565 	caps->size += size;
1566 
1567 	return header;
1568 }
1569 EXPORT_SYMBOL_GPL(vfio_info_cap_add);
1570 
1571 void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
1572 {
1573 	struct vfio_info_cap_header *tmp;
1574 	void *buf = (void *)caps->buf;
1575 
1576 	/* Capability structs should start with proper alignment */
1577 	WARN_ON(!IS_ALIGNED(offset, sizeof(u64)));
1578 
1579 	for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
1580 		tmp->next += offset;
1581 }
1582 EXPORT_SYMBOL(vfio_info_cap_shift);
1583 
1584 int vfio_info_add_capability(struct vfio_info_cap *caps,
1585 			     struct vfio_info_cap_header *cap, size_t size)
1586 {
1587 	struct vfio_info_cap_header *header;
1588 
1589 	header = vfio_info_cap_add(caps, size, cap->id, cap->version);
1590 	if (IS_ERR(header))
1591 		return PTR_ERR(header);
1592 
1593 	memcpy(header + 1, cap + 1, size - sizeof(*header));
1594 
1595 	return 0;
1596 }
1597 EXPORT_SYMBOL(vfio_info_add_capability);
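/*
 * Example (illustrative sketch): a driver reporting a typed region appends a
 * struct vfio_region_info_cap_type (from <uapi/linux/vfio.h>) to the chain;
 * MY_REGION_TYPE and MY_REGION_SUBTYPE stand in for driver-specific values.
 *
 *	struct vfio_region_info_cap_type cap_type = {
 *		.header.id = VFIO_REGION_INFO_CAP_TYPE,
 *		.header.version = 1,
 *		.type = MY_REGION_TYPE,
 *		.subtype = MY_REGION_SUBTYPE,
 *	};
 *
 *	ret = vfio_info_add_capability(&caps, &cap_type.header,
 *				       sizeof(cap_type));
 */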
1598 
1599 int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
1600 				       int max_irq_type, size_t *data_size)
1601 {
1602 	unsigned long minsz;
1603 	size_t size;
1604 
1605 	minsz = offsetofend(struct vfio_irq_set, count);
1606 
1607 	if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
1608 	    (hdr->count >= (U32_MAX - hdr->start)) ||
1609 	    (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
1610 				VFIO_IRQ_SET_ACTION_TYPE_MASK)))
1611 		return -EINVAL;
1612 
1613 	if (data_size)
1614 		*data_size = 0;
1615 
1616 	if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
1617 		return -EINVAL;
1618 
1619 	switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
1620 	case VFIO_IRQ_SET_DATA_NONE:
1621 		size = 0;
1622 		break;
1623 	case VFIO_IRQ_SET_DATA_BOOL:
1624 		size = sizeof(uint8_t);
1625 		break;
1626 	case VFIO_IRQ_SET_DATA_EVENTFD:
1627 		size = sizeof(int32_t);
1628 		break;
1629 	default:
1630 		return -EINVAL;
1631 	}
1632 
1633 	if (size) {
1634 		if (hdr->argsz - minsz < hdr->count * size)
1635 			return -EINVAL;
1636 
1637 		if (!data_size)
1638 			return -EINVAL;
1639 
1640 		*data_size = hdr->count * size;
1641 	}
1642 
1643 	return 0;
1644 }
1645 EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
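/*
 * Example (illustrative sketch): the usual SET_IRQS ioctl pattern in a driver
 * validates the header and then copies in the trailing data, if any.  Here
 * "arg", "minsz", "num_irqs" and "max_irq_type" come from the driver's ioctl
 * handler.
 *
 *	struct vfio_irq_set hdr;
 *	size_t data_size = 0;
 *	u8 *data = NULL;
 *
 *	if (copy_from_user(&hdr, (void __user *)arg, minsz))
 *		return -EFAULT;
 *
 *	ret = vfio_set_irqs_validate_and_prepare(&hdr, num_irqs,
 *						 max_irq_type, &data_size);
 *	if (ret)
 *		return ret;
 *
 *	if (data_size) {
 *		data = memdup_user((void __user *)(arg + minsz), data_size);
 *		if (IS_ERR(data))
 *			return PTR_ERR(data);
 *	}
 */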
1646 
1647 /*
1648  * Pin contiguous user pages and return their associated host pages for local
1649  * domain only.
1650  * @device [in]  : device
1651  * @iova [in]    : starting IOVA of user pages to be pinned.
1652  * @npage [in]   : count of pages to be pinned.  This count should not
1653  *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1654  * @prot [in]    : protection flags
1655  * @pages[out]   : array of host pages
1656  * Return error or number of pages pinned.
1657  *
1658  * A driver may only call this function if the vfio_device was created
1659  * by vfio_register_emulated_iommu_dev() due to vfio_device_container_pin_pages().
1660  */
1661 int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova,
1662 		   int npage, int prot, struct page **pages)
1663 {
1664 	/* group->container cannot change while a vfio device is open */
1665 	if (!pages || !npage || WARN_ON(!vfio_assert_device_open(device)))
1666 		return -EINVAL;
1667 	if (!device->ops->dma_unmap)
1668 		return -EINVAL;
1669 	if (vfio_device_has_container(device))
1670 		return vfio_device_container_pin_pages(device, iova,
1671 						       npage, prot, pages);
1672 	if (device->iommufd_access) {
1673 		int ret;
1674 
1675 		if (iova > ULONG_MAX)
1676 			return -EINVAL;
1677 		/*
1678 		 * VFIO ignores the sub page offset, npages is from the start of
1679 		 * a PAGE_SIZE chunk of IOVA. The caller is expected to recover
1680 		 * the sub page offset by doing:
1681 		 *     pages[0] + (iova % PAGE_SIZE)
1682 		 */
1683 		ret = iommufd_access_pin_pages(
1684 			device->iommufd_access, ALIGN_DOWN(iova, PAGE_SIZE),
1685 			npage * PAGE_SIZE, pages,
1686 			(prot & IOMMU_WRITE) ? IOMMUFD_ACCESS_RW_WRITE : 0);
1687 		if (ret)
1688 			return ret;
1689 		return npage;
1690 	}
1691 	return -EINVAL;
1692 }
1693 EXPORT_SYMBOL(vfio_pin_pages);
1694 
1695 /*
1696  * Unpin contiguous host pages for local domain only.
1697  * @device [in]  : device
1698  * @iova [in]    : starting address of user pages to be unpinned.
1699  * @npage [in]   : count of pages to be unpinned.  This count should not
1700  *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1701  */
1702 void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage)
1703 {
1704 	if (WARN_ON(!vfio_assert_device_open(device)))
1705 		return;
1706 	if (WARN_ON(!device->ops->dma_unmap))
1707 		return;
1708 
1709 	if (vfio_device_has_container(device)) {
1710 		vfio_device_container_unpin_pages(device, iova, npage);
1711 		return;
1712 	}
1713 	if (device->iommufd_access) {
1714 		if (WARN_ON(iova > ULONG_MAX))
1715 			return;
1716 		iommufd_access_unpin_pages(device->iommufd_access,
1717 					   ALIGN_DOWN(iova, PAGE_SIZE),
1718 					   npage * PAGE_SIZE);
1719 		return;
1720 	}
1721 }
1722 EXPORT_SYMBOL(vfio_unpin_pages);
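/*
 * Example (illustrative sketch): an emulated-IOMMU driver pins a single page,
 * maps it, and recovers the sub-page offset as described above.  Names other
 * than the vfio and kmap helpers are hypothetical.
 *
 *	struct page *pg;
 *	void *va;
 *	int ret;
 *
 *	ret = vfio_pin_pages(vdev, iova, 1, IOMMU_READ | IOMMU_WRITE, &pg);
 *	if (ret != 1)
 *		return ret < 0 ? ret : -EFAULT;
 *
 *	va = kmap_local_page(pg) + offset_in_page(iova);
 *	// ... access the guest page through "va" ...
 *	kunmap_local(va);
 *	vfio_unpin_pages(vdev, iova, 1);
 */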
1723 
1724 /*
1725  * This interface allows the CPUs to perform some sort of virtual DMA on
1726  * behalf of the device.
1727  *
1728  * CPUs read/write from/into a range of IOVAs pointing to user space memory
1729  * into/from a kernel buffer.
1730  *
1731  * As the read/write of user space memory is conducted via the CPUs and is
1732  * not a real device DMA, it is not necessary to pin the user space memory.
1733  *
1734  * @device [in]		: VFIO device
1735  * @iova [in]		: base IOVA of a user space buffer
1736  * @data [in]		: pointer to kernel buffer
1737  * @len [in]		: kernel buffer length
1738  * @write		: indicate read or write
1739  * Return error code on failure or 0 on success.
1740  */
1741 int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data,
1742 		size_t len, bool write)
1743 {
1744 	if (!data || len <= 0 || !vfio_assert_device_open(device))
1745 		return -EINVAL;
1746 
1747 	if (vfio_device_has_container(device))
1748 		return vfio_device_container_dma_rw(device, iova,
1749 						    data, len, write);
1750 
1751 	if (device->iommufd_access) {
1752 		unsigned int flags = 0;
1753 
1754 		if (iova > ULONG_MAX)
1755 			return -EINVAL;
1756 
1757 		/* VFIO historically tries to auto-detect a kthread */
1758 		if (!current->mm)
1759 			flags |= IOMMUFD_ACCESS_RW_KTHREAD;
1760 		if (write)
1761 			flags |= IOMMUFD_ACCESS_RW_WRITE;
1762 		return iommufd_access_rw(device->iommufd_access, iova, data,
1763 					 len, flags);
1764 	}
1765 	return -EINVAL;
1766 }
1767 EXPORT_SYMBOL(vfio_dma_rw);
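/*
 * Example (illustrative sketch): reading a 4-byte descriptor field from guest
 * memory on behalf of an emulated device; "desc_iova" is hypothetical.
 *
 *	__le32 val;
 *	int ret;
 *
 *	ret = vfio_dma_rw(vdev, desc_iova, &val, sizeof(val), false);
 *	if (ret)
 *		return ret;
 */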
1768 
1769 /*
1770  * Module/class support
1771  */
1772 static int __init vfio_init(void)
1773 {
1774 	int ret;
1775 
1776 	ida_init(&vfio.device_ida);
1777 
1778 	ret = vfio_group_init();
1779 	if (ret)
1780 		return ret;
1781 
1782 	ret = vfio_virqfd_init();
1783 	if (ret)
1784 		goto err_virqfd;
1785 
1786 	/* /sys/class/vfio-dev/vfioX */
1787 	vfio.device_class = class_create("vfio-dev");
1788 	if (IS_ERR(vfio.device_class)) {
1789 		ret = PTR_ERR(vfio.device_class);
1790 		goto err_dev_class;
1791 	}
1792 
1793 	ret = vfio_cdev_init(vfio.device_class);
1794 	if (ret)
1795 		goto err_alloc_dev_chrdev;
1796 
1797 	vfio_debugfs_create_root();
1798 	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
1799 	return 0;
1800 
1801 err_alloc_dev_chrdev:
1802 	class_destroy(vfio.device_class);
1803 	vfio.device_class = NULL;
1804 err_dev_class:
1805 	vfio_virqfd_exit();
1806 err_virqfd:
1807 	vfio_group_cleanup();
1808 	return ret;
1809 }
1810 
1811 static void __exit vfio_cleanup(void)
1812 {
1813 	vfio_debugfs_remove_root();
1814 	ida_destroy(&vfio.device_ida);
1815 	vfio_cdev_cleanup();
1816 	class_destroy(vfio.device_class);
1817 	vfio.device_class = NULL;
1818 	vfio_virqfd_exit();
1819 	vfio_group_cleanup();
1820 	xa_destroy(&vfio_device_set_xa);
1821 }
1822 
1823 module_init(vfio_init);
1824 module_exit(vfio_cleanup);
1825 
1826 MODULE_IMPORT_NS("IOMMUFD");
1827 MODULE_VERSION(DRIVER_VERSION);
1828 MODULE_LICENSE("GPL v2");
1829 MODULE_AUTHOR(DRIVER_AUTHOR);
1830 MODULE_DESCRIPTION(DRIVER_DESC);
1831 MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");
1832