xref: /linux/drivers/vfio/vfio_main.c (revision bf4afc53b77aeaa48b5409da5c8da6bb4eff7f43)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * VFIO core
4  *
5  * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
6  *     Author: Alex Williamson <alex.williamson@redhat.com>
7  *
8  * Derived from original vfio:
9  * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
10  * Author: Tom Lyon, pugs@cisco.com
11  */
12 
13 #include <linux/cdev.h>
14 #include <linux/compat.h>
15 #include <linux/device.h>
16 #include <linux/fs.h>
17 #include <linux/idr.h>
18 #include <linux/iommu.h>
19 #if IS_ENABLED(CONFIG_KVM)
20 #include <linux/kvm_host.h>
21 #endif
22 #include <linux/list.h>
23 #include <linux/miscdevice.h>
24 #include <linux/module.h>
25 #include <linux/mount.h>
26 #include <linux/mutex.h>
27 #include <linux/pci.h>
28 #include <linux/pseudo_fs.h>
29 #include <linux/rwsem.h>
30 #include <linux/sched.h>
31 #include <linux/seq_file.h>
32 #include <linux/slab.h>
33 #include <linux/stat.h>
34 #include <linux/string.h>
35 #include <linux/uaccess.h>
36 #include <linux/vfio.h>
37 #include <linux/wait.h>
38 #include <linux/sched/signal.h>
39 #include <linux/pm_runtime.h>
40 #include <linux/interval_tree.h>
41 #include <linux/iova_bitmap.h>
42 #include <linux/iommufd.h>
43 #include "vfio.h"
44 
45 #define DRIVER_VERSION	"0.3"
46 #define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
47 #define DRIVER_DESC	"VFIO - User Level meta-driver"
48 
49 #define VFIO_MAGIC 0x5646494f /* "VFIO" */
50 
/* Global VFIO core state, singleton for the module. */
static struct vfio {
	struct class			*device_class;	/* class for vfio device nodes */
	struct ida			device_ida;	/* allocator for device->index */
	struct vfsmount			*vfs_mount;	/* pinned vfio pseudo-fs mount */
	int				fs_count;	/* pin count for vfs_mount */
} vfio;
57 
#ifdef CONFIG_VFIO_NOIOMMU
/* Opt-in, kernel-tainting mode that bypasses IOMMU protection entirely. */
bool vfio_noiommu __read_mostly;
module_param_named(enable_unsafe_noiommu_mode,
		   vfio_noiommu, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel.  If you do not know what this is for, step away. (default: false)");
#endif
64 
/* set_id -> struct vfio_device_set lookup; guarded by the xarray's xa_lock. */
static DEFINE_XARRAY(vfio_device_set_xa);
66 
/*
 * Associate @device with the vfio_device_set keyed by @set_id, creating
 * the set on first use.  The membership is refcounted via
 * dev_set->device_count and undone by vfio_release_device_set().
 * Returns 0 on success or -errno.
 */
int vfio_assign_device_set(struct vfio_device *device, void *set_id)
{
	unsigned long idx = (unsigned long)set_id;
	struct vfio_device_set *new_dev_set;
	struct vfio_device_set *dev_set;

	if (WARN_ON(!set_id))
		return -EINVAL;

	/*
	 * Atomically acquire a singleton object in the xarray for this set_id
	 */
	xa_lock(&vfio_device_set_xa);
	dev_set = xa_load(&vfio_device_set_xa, idx);
	if (dev_set)
		goto found_get_ref;
	xa_unlock(&vfio_device_set_xa);

	/* Allocate outside the lock; another thread may race us to insert. */
	new_dev_set = kzalloc_obj(*new_dev_set);
	if (!new_dev_set)
		return -ENOMEM;
	mutex_init(&new_dev_set->lock);
	INIT_LIST_HEAD(&new_dev_set->device_list);
	new_dev_set->set_id = set_id;

	xa_lock(&vfio_device_set_xa);
	dev_set = __xa_cmpxchg(&vfio_device_set_xa, idx, NULL, new_dev_set,
			       GFP_KERNEL);
	if (!dev_set) {
		/* Our allocation won the race and is now in the xarray. */
		dev_set = new_dev_set;
		goto found_get_ref;
	}

	/* Lost the race (or got an xarray error): discard our allocation. */
	kfree(new_dev_set);
	if (xa_is_err(dev_set)) {
		xa_unlock(&vfio_device_set_xa);
		return xa_err(dev_set);
	}

found_get_ref:
	/* Reached with vfio_device_set_xa locked and dev_set valid. */
	dev_set->device_count++;
	xa_unlock(&vfio_device_set_xa);
	mutex_lock(&dev_set->lock);
	device->dev_set = dev_set;
	list_add_tail(&device->dev_set_list, &dev_set->device_list);
	mutex_unlock(&dev_set->lock);
	return 0;
}
EXPORT_SYMBOL_GPL(vfio_assign_device_set);
116 
/*
 * Drop @device's membership in its device set, freeing the set when the
 * last member leaves.  Counterpart of vfio_assign_device_set().
 */
static void vfio_release_device_set(struct vfio_device *device)
{
	struct vfio_device_set *dev_set = device->dev_set;

	/* Device was never assigned to a set (early init failure). */
	if (!dev_set)
		return;

	mutex_lock(&dev_set->lock);
	list_del(&device->dev_set_list);
	mutex_unlock(&dev_set->lock);

	/* device_count is protected by the xarray lock, matching _assign. */
	xa_lock(&vfio_device_set_xa);
	if (!--dev_set->device_count) {
		__xa_erase(&vfio_device_set_xa,
			   (unsigned long)dev_set->set_id);
		mutex_destroy(&dev_set->lock);
		kfree(dev_set);
	}
	xa_unlock(&vfio_device_set_xa);
}
137 
vfio_device_set_open_count(struct vfio_device_set * dev_set)138 unsigned int vfio_device_set_open_count(struct vfio_device_set *dev_set)
139 {
140 	struct vfio_device *cur;
141 	unsigned int open_count = 0;
142 
143 	lockdep_assert_held(&dev_set->lock);
144 
145 	list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
146 		open_count += cur->open_count;
147 	return open_count;
148 }
149 EXPORT_SYMBOL_GPL(vfio_device_set_open_count);
150 
151 struct vfio_device *
vfio_find_device_in_devset(struct vfio_device_set * dev_set,struct device * dev)152 vfio_find_device_in_devset(struct vfio_device_set *dev_set,
153 			   struct device *dev)
154 {
155 	struct vfio_device *cur;
156 
157 	lockdep_assert_held(&dev_set->lock);
158 
159 	list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
160 		if (cur->dev == dev)
161 			return cur;
162 	return NULL;
163 }
164 EXPORT_SYMBOL_GPL(vfio_find_device_in_devset);
165 
166 /*
167  * Device objects - create, release, get, put, search
168  */
169 /* Device reference always implies a group reference */
vfio_device_put_registration(struct vfio_device * device)170 void vfio_device_put_registration(struct vfio_device *device)
171 {
172 	if (refcount_dec_and_test(&device->refcount))
173 		complete(&device->comp);
174 }
175 EXPORT_SYMBOL_GPL(vfio_device_put_registration);
176 
/*
 * Take a registration reference unless unregistration has already driven
 * the refcount to zero.  Returns true if the reference was obtained.
 */
bool vfio_device_try_get_registration(struct vfio_device *device)
{
	return refcount_inc_not_zero(&device->refcount);
}
EXPORT_SYMBOL_GPL(vfio_device_try_get_registration);
182 
183 /*
184  * VFIO driver API
185  */
186 /* Release helper called by vfio_put_device() */
static void vfio_device_release(struct device *dev)
{
	struct vfio_device *device =
			container_of(dev, struct vfio_device, device);

	/* Unwind vfio_init_device() / vfio_assign_device_set(). */
	vfio_release_device_set(device);
	ida_free(&vfio.device_ida, device->index);

	/* Driver-private teardown, if the driver registered one. */
	if (device->ops->release)
		device->ops->release(device);

	/* Release the anon inode and drop the pseudo-fs pin it held. */
	iput(device->inode);
	simple_release_fs(&vfio.vfs_mount, &vfio.fs_count);
	kvfree(device);	/* matches kvzalloc() in _vfio_alloc_device() */
}
202 
203 static int vfio_init_device(struct vfio_device *device, struct device *dev,
204 			    const struct vfio_device_ops *ops);
205 
206 /*
207  * Allocate and initialize vfio_device so it can be registered to vfio
208  * core.
209  *
210  * Drivers should use the wrapper vfio_alloc_device() for allocation.
211  * @size is the size of the structure to be allocated, including any
212  * private data used by the driver.
213  *
214  * Driver may provide an @init callback to cover device private data.
215  *
216  * Use vfio_put_device() to release the structure after success return.
217  */
/*
 * Allocate @size bytes (struct vfio_device plus driver-private tail) and
 * run core initialization.  Returns the device or an ERR_PTR; release
 * with vfio_put_device() on success.
 */
struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev,
				       const struct vfio_device_ops *ops)
{
	struct vfio_device *device;
	int ret;

	/* The driver structure must embed struct vfio_device at its head. */
	if (WARN_ON(size < sizeof(struct vfio_device)))
		return ERR_PTR(-EINVAL);

	device = kvzalloc(size, GFP_KERNEL);
	if (!device)
		return ERR_PTR(-ENOMEM);

	ret = vfio_init_device(device, dev, ops);
	if (ret) {
		kvfree(device);
		return ERR_PTR(ret);
	}

	return device;
}
EXPORT_SYMBOL_GPL(_vfio_alloc_device);
241 
vfio_fs_init_fs_context(struct fs_context * fc)242 static int vfio_fs_init_fs_context(struct fs_context *fc)
243 {
244 	return init_pseudo(fc, VFIO_MAGIC) ? 0 : -ENOMEM;
245 }
246 
/* Backing pseudo-filesystem used to mint per-device anonymous inodes. */
static struct file_system_type vfio_fs_type = {
	.name = "vfio",
	.owner = THIS_MODULE,
	.init_fs_context = vfio_fs_init_fs_context,
	.kill_sb = kill_anon_super,
};
253 
vfio_fs_inode_new(void)254 static struct inode *vfio_fs_inode_new(void)
255 {
256 	struct inode *inode;
257 	int ret;
258 
259 	ret = simple_pin_fs(&vfio_fs_type, &vfio.vfs_mount, &vfio.fs_count);
260 	if (ret)
261 		return ERR_PTR(ret);
262 
263 	inode = alloc_anon_inode(vfio.vfs_mount->mnt_sb);
264 	if (IS_ERR(inode))
265 		simple_release_fs(&vfio.vfs_mount, &vfio.fs_count);
266 
267 	return inode;
268 }
269 
/*
 * Initialize a vfio_device so it can be registered to vfio core.
 *
 * Allocates the device index, pins the vfio pseudo-fs for an anonymous
 * inode, and runs the driver's optional @ops->init.  After the final
 * device_initialize() below, teardown happens via vfio_device_release();
 * before that, the error labels here unwind manually.
 */
static int vfio_init_device(struct vfio_device *device, struct device *dev,
			    const struct vfio_device_ops *ops)
{
	int ret;

	ret = ida_alloc_max(&vfio.device_ida, MINORMASK, GFP_KERNEL);
	if (ret < 0) {
		dev_dbg(dev, "Error to alloc index\n");
		return ret;
	}

	device->index = ret;
	init_completion(&device->comp);
	device->dev = dev;
	device->ops = ops;
	device->inode = vfio_fs_inode_new();
	if (IS_ERR(device->inode)) {
		ret = PTR_ERR(device->inode);
		goto out_inode;
	}

	if (ops->init) {
		ret = ops->init(device);
		if (ret)
			goto out_uninit;
	}

	device_initialize(&device->device);
	device->device.release = vfio_device_release;
	device->device.class = vfio.device_class;
	device->device.parent = device->dev;
	return 0;

out_uninit:
	/* ops->init failed: undo vfio_fs_inode_new(), then common cleanup. */
	iput(device->inode);
	simple_release_fs(&vfio.vfs_mount, &vfio.fs_count);
out_inode:
	/* Releases the dev_set (if the driver assigned one) and the index. */
	vfio_release_device_set(device);
	ida_free(&vfio.device_ida, device->index);
	return ret;
}
314 
/*
 * Common registration path for group-backed and emulated devices.  On
 * success the device is user-visible; on failure the group association
 * created here is unwound.
 */
static int __vfio_register_dev(struct vfio_device *device,
			       enum vfio_group_type type)
{
	int ret;

	/* With iommufd compiled in, drivers must implement all four ops. */
	if (WARN_ON(IS_ENABLED(CONFIG_IOMMUFD) &&
		    (!device->ops->bind_iommufd ||
		     !device->ops->unbind_iommufd ||
		     !device->ops->attach_ioas ||
		     !device->ops->detach_ioas)))
		return -EINVAL;

	/*
	 * If the driver doesn't specify a set then the device is added to a
	 * singleton set just for itself.
	 */
	if (!device->dev_set)
		vfio_assign_device_set(device, device);

	ret = dev_set_name(&device->device, "vfio%d", device->index);
	if (ret)
		return ret;

	ret = vfio_device_set_group(device, type);
	if (ret)
		return ret;

	/*
	 * VFIO always sets IOMMU_CACHE because we offer no way for userspace to
	 * restore cache coherency. It has to be checked here because it is only
	 * valid for cases where we are using iommu groups.
	 */
	if (type == VFIO_IOMMU && !vfio_device_is_noiommu(device) &&
	    !device_iommu_capable(device->dev, IOMMU_CAP_CACHE_COHERENCY)) {
		ret = -EINVAL;
		goto err_out;
	}

	ret = vfio_device_add(device);
	if (ret)
		goto err_out;

	/* Refcounting can't start until the driver calls register */
	refcount_set(&device->refcount, 1);

	vfio_device_group_register(device);
	vfio_device_debugfs_init(device);

	return 0;
err_out:
	/* Balances vfio_device_set_group() above. */
	vfio_device_remove_group(device);
	return ret;
}
368 
/* Register @device on the IOMMU-group-backed (legacy container) path. */
int vfio_register_group_dev(struct vfio_device *device)
{
	return __vfio_register_dev(device, VFIO_IOMMU);
}
EXPORT_SYMBOL_GPL(vfio_register_group_dev);
374 
/*
 * Register a virtual device without IOMMU backing.  The user of this
 * device must not be able to directly trigger unmediated DMA; the
 * device is otherwise registered identically to the group path.
 */
int vfio_register_emulated_iommu_dev(struct vfio_device *device)
{
	return __vfio_register_dev(device, VFIO_EMULATED_IOMMU);
}
EXPORT_SYMBOL_GPL(vfio_register_emulated_iommu_dev);
384 
/*
 * Decrement the device reference count and wait for the device to be
 * removed.  Open file descriptors for the device hold registration
 * references (dropped in vfio_device_fops_release()), so this blocks
 * until userspace releases them; @ops->request is invoked periodically
 * to ask the user to do so.
 */
void vfio_unregister_group_dev(struct vfio_device *device)
{
	unsigned int i = 0;
	bool interrupted = false;
	long rc;

	/*
	 * Prevent new device opened by userspace via the
	 * VFIO_GROUP_GET_DEVICE_FD in the group path.
	 */
	vfio_device_group_unregister(device);

	/*
	 * Balances vfio_device_add() in register path, also prevents
	 * new device opened by userspace in the cdev path.
	 */
	vfio_device_del(device);

	/* Drop the initial reference set in __vfio_register_dev(). */
	vfio_device_put_registration(device);
	rc = try_wait_for_completion(&device->comp);
	while (rc <= 0) {
		/* Nudge the user to release the device; i counts retries. */
		if (device->ops->request)
			device->ops->request(device, i++);

		if (interrupted) {
			rc = wait_for_completion_timeout(&device->comp,
							 HZ * 10);
		} else {
			rc = wait_for_completion_interruptible_timeout(
				&device->comp, HZ * 10);
			if (rc < 0) {
				/* Signal caught: warn once, then wait uninterruptibly. */
				interrupted = true;
				dev_warn(device->dev,
					 "Device is currently in use, task"
					 " \"%s\" (%d) "
					 "blocked until device is released",
					 current->comm, task_pid_nr(current));
			}
		}
	}

	vfio_device_debugfs_exit(device);
	/* Balances vfio_device_set_group in register path */
	vfio_device_remove_group(device);
}
EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);
434 
435 #if IS_ENABLED(CONFIG_KVM)
/*
 * Take a reference on @kvm for @device without creating a hard module
 * dependency on kvm: both the get and put entry points are resolved via
 * symbol_get(), which also pins the kvm module.  On success the matching
 * put function is stashed in device->put_kvm for vfio_device_put_kvm().
 */
void vfio_device_get_kvm_safe(struct vfio_device *device, struct kvm *kvm)
{
	void (*pfn)(struct kvm *kvm);
	bool (*fn)(struct kvm *kvm);
	bool ret;

	lockdep_assert_held(&device->dev_set->lock);

	if (!kvm)
		return;

	/* Resolve the put function first so it is guaranteed available. */
	pfn = symbol_get(kvm_put_kvm);
	if (WARN_ON(!pfn))
		return;

	fn = symbol_get(kvm_get_kvm_safe);
	if (WARN_ON(!fn)) {
		symbol_put(kvm_put_kvm);
		return;
	}

	/* kvm_get_kvm_safe() may fail if the VM is already going away. */
	ret = fn(kvm);
	symbol_put(kvm_get_kvm_safe);
	if (!ret) {
		symbol_put(kvm_put_kvm);
		return;
	}

	device->put_kvm = pfn;
	device->kvm = kvm;
}
467 
/*
 * Release the kvm reference taken by vfio_device_get_kvm_safe() and
 * clear device->kvm.  Safe to call when no reference is held.
 */
void vfio_device_put_kvm(struct vfio_device *device)
{
	lockdep_assert_held(&device->dev_set->lock);

	if (!device->kvm)
		return;

	/* kvm set without put_kvm should be impossible; just clear it. */
	if (WARN_ON(!device->put_kvm))
		goto clear;

	device->put_kvm(device->kvm);
	device->put_kvm = NULL;
	symbol_put(kvm_put_kvm);	/* balances symbol_get() in the get path */

clear:
	device->kvm = NULL;
}
485 #endif
486 
/* true if the vfio_device has open_device() called but not close_device() */
static bool vfio_assert_device_open(struct vfio_device *device)
{
	/* WARNs (once) and returns false if the device is not open. */
	return !WARN_ON_ONCE(!READ_ONCE(device->open_count));
}
492 
493 struct vfio_device_file *
vfio_allocate_device_file(struct vfio_device * device)494 vfio_allocate_device_file(struct vfio_device *device)
495 {
496 	struct vfio_device_file *df;
497 
498 	df = kzalloc_obj(*df, GFP_KERNEL_ACCOUNT);
499 	if (!df)
500 		return ERR_PTR(-ENOMEM);
501 
502 	df->device = device;
503 	spin_lock_init(&df->kvm_ref_lock);
504 
505 	return df;
506 }
507 
/*
 * First opener of a device: bind to iommufd (cdev path) or claim the
 * group IOMMU (group path), then run the driver's open_device hook.
 * Holds a driver-module reference for the duration of the open; undone
 * by vfio_df_device_last_close().
 */
static int vfio_df_device_first_open(struct vfio_device_file *df)
{
	struct vfio_device *device = df->device;
	struct iommufd_ctx *iommufd = df->iommufd;
	int ret;

	lockdep_assert_held(&device->dev_set->lock);

	if (!try_module_get(device->dev->driver->owner))
		return -ENODEV;

	/* df->iommufd is only set on the cdev path. */
	if (iommufd)
		ret = vfio_df_iommufd_bind(df);
	else
		ret = vfio_device_group_use_iommu(device);
	if (ret)
		goto err_module_put;

	if (device->ops->open_device) {
		ret = device->ops->open_device(device);
		if (ret)
			goto err_unuse_iommu;
	}
	return 0;

err_unuse_iommu:
	if (iommufd)
		vfio_df_iommufd_unbind(df);
	else
		vfio_device_group_unuse_iommu(device);
err_module_put:
	module_put(device->dev->driver->owner);
	return ret;
}
542 
/* Last closer: undo everything vfio_df_device_first_open() set up. */
static void vfio_df_device_last_close(struct vfio_device_file *df)
{
	struct vfio_device *device = df->device;
	struct iommufd_ctx *iommufd = df->iommufd;

	lockdep_assert_held(&device->dev_set->lock);

	/* Driver teardown first, then drop the IOMMU binding. */
	if (device->ops->close_device)
		device->ops->close_device(device);
	if (iommufd)
		vfio_df_iommufd_unbind(df);
	else
		vfio_device_group_unuse_iommu(device);
	module_put(device->dev->driver->owner);
}
558 
vfio_df_open(struct vfio_device_file * df)559 int vfio_df_open(struct vfio_device_file *df)
560 {
561 	struct vfio_device *device = df->device;
562 	int ret = 0;
563 
564 	lockdep_assert_held(&device->dev_set->lock);
565 
566 	/*
567 	 * Only the group path allows the device to be opened multiple
568 	 * times.  The device cdev path doesn't have a secure way for it.
569 	 */
570 	if (device->open_count != 0 && !df->group)
571 		return -EINVAL;
572 
573 	device->open_count++;
574 	if (device->open_count == 1) {
575 		ret = vfio_df_device_first_open(df);
576 		if (ret)
577 			device->open_count--;
578 	}
579 
580 	return ret;
581 }
582 
/* Account one close of the device; the last close tears everything down. */
void vfio_df_close(struct vfio_device_file *df)
{
	struct vfio_device *device = df->device;

	lockdep_assert_held(&device->dev_set->lock);

	if (!vfio_assert_device_open(device))
		return;
	/* Tear down while the count is still 1, pairing with first_open. */
	if (device->open_count == 1)
		vfio_df_device_last_close(df);
	device->open_count--;
}
595 
596 /*
597  * Wrapper around pm_runtime_resume_and_get().
598  * Return error code on failure or 0 on success.
599  */
vfio_device_pm_runtime_get(struct vfio_device * device)600 static inline int vfio_device_pm_runtime_get(struct vfio_device *device)
601 {
602 	struct device *dev = device->dev;
603 
604 	if (dev->driver && dev->driver->pm) {
605 		int ret;
606 
607 		ret = pm_runtime_resume_and_get(dev);
608 		if (ret) {
609 			dev_info_ratelimited(dev,
610 				"vfio: runtime resume failed %d\n", ret);
611 			return -EIO;
612 		}
613 	}
614 
615 	return 0;
616 }
617 
618 /*
619  * Wrapper around pm_runtime_put().
620  */
vfio_device_pm_runtime_put(struct vfio_device * device)621 static inline void vfio_device_pm_runtime_put(struct vfio_device *device)
622 {
623 	struct device *dev = device->dev;
624 
625 	if (dev->driver && dev->driver->pm)
626 		pm_runtime_put(dev);
627 }
628 
629 /*
630  * VFIO Device fd
631  */
static int vfio_device_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_device_file *df = filep->private_data;
	struct vfio_device *device = df->device;

	/* Group and cdev opens tore up differently; close accordingly. */
	if (df->group)
		vfio_df_group_close(df);
	else
		vfio_df_unbind_iommufd(df);

	/* Drop the registration reference held for this open file. */
	vfio_device_put_registration(device);

	kfree(df);

	return 0;
}
648 
649 /*
650  * vfio_mig_get_next_state - Compute the next step in the FSM
651  * @cur_fsm - The current state the device is in
652  * @new_fsm - The target state to reach
653  * @next_fsm - Pointer to the next step to get to new_fsm
654  *
655  * Return 0 upon success, otherwise -errno
656  * Upon success the next step in the state progression between cur_fsm and
657  * new_fsm will be set in next_fsm.
658  *
659  * This breaks down requests for combination transitions into smaller steps and
660  * returns the next step to get to new_fsm. The function may need to be called
661  * multiple times before reaching new_fsm.
662  *
663  */
int vfio_mig_get_next_state(struct vfio_device *device,
			    enum vfio_device_mig_state cur_fsm,
			    enum vfio_device_mig_state new_fsm,
			    enum vfio_device_mig_state *next_fsm)
{
	/* Highest FSM state + 1; sizes both lookup tables below. */
	enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_PRE_COPY_P2P + 1 };
	/*
	 * The coding in this table requires the driver to implement the
	 * following FSM arcs:
	 *         RESUMING -> STOP
	 *         STOP -> RESUMING
	 *         STOP -> STOP_COPY
	 *         STOP_COPY -> STOP
	 *
	 * If P2P is supported then the driver must also implement these FSM
	 * arcs:
	 *         RUNNING -> RUNNING_P2P
	 *         RUNNING_P2P -> RUNNING
	 *         RUNNING_P2P -> STOP
	 *         STOP -> RUNNING_P2P
	 *
	 * If precopy is supported then the driver must support these additional
	 * FSM arcs:
	 *         RUNNING -> PRE_COPY
	 *         PRE_COPY -> RUNNING
	 *         PRE_COPY -> STOP_COPY
	 * However, if precopy and P2P are supported together then the driver
	 * must support these additional arcs beyond the P2P arcs above:
	 *         PRE_COPY -> RUNNING
	 *         PRE_COPY -> PRE_COPY_P2P
	 *         PRE_COPY_P2P -> PRE_COPY
	 *         PRE_COPY_P2P -> RUNNING_P2P
	 *         PRE_COPY_P2P -> STOP_COPY
	 *         RUNNING -> PRE_COPY
	 *         RUNNING_P2P -> PRE_COPY_P2P
	 *
	 * Without P2P and precopy the driver must implement:
	 *         RUNNING -> STOP
	 *         STOP -> RUNNING
	 *
	 * The coding will step through multiple states for some combination
	 * transitions; if all optional features are supported, this means the
	 * following ones:
	 *         PRE_COPY -> PRE_COPY_P2P -> STOP_COPY
	 *         PRE_COPY -> RUNNING -> RUNNING_P2P
	 *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP
	 *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP -> RESUMING
	 *         PRE_COPY_P2P -> RUNNING_P2P -> RUNNING
	 *         PRE_COPY_P2P -> RUNNING_P2P -> STOP
	 *         PRE_COPY_P2P -> RUNNING_P2P -> STOP -> RESUMING
	 *         RESUMING -> STOP -> RUNNING_P2P
	 *         RESUMING -> STOP -> RUNNING_P2P -> PRE_COPY_P2P
	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING
	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
	 *         RESUMING -> STOP -> STOP_COPY
	 *         RUNNING -> RUNNING_P2P -> PRE_COPY_P2P
	 *         RUNNING -> RUNNING_P2P -> STOP
	 *         RUNNING -> RUNNING_P2P -> STOP -> RESUMING
	 *         RUNNING -> RUNNING_P2P -> STOP -> STOP_COPY
	 *         RUNNING_P2P -> RUNNING -> PRE_COPY
	 *         RUNNING_P2P -> STOP -> RESUMING
	 *         RUNNING_P2P -> STOP -> STOP_COPY
	 *         STOP -> RUNNING_P2P -> PRE_COPY_P2P
	 *         STOP -> RUNNING_P2P -> RUNNING
	 *         STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
	 *         STOP_COPY -> STOP -> RESUMING
	 *         STOP_COPY -> STOP -> RUNNING_P2P
	 *         STOP_COPY -> STOP -> RUNNING_P2P -> RUNNING
	 *
	 *  The following transitions are blocked:
	 *         STOP_COPY -> PRE_COPY
	 *         STOP_COPY -> PRE_COPY_P2P
	 */
	/* vfio_from_fsm_table[cur][target] yields the next hop toward target. */
	static const u8 vfio_from_fsm_table[VFIO_DEVICE_NUM_STATES][VFIO_DEVICE_NUM_STATES] = {
		[VFIO_DEVICE_STATE_STOP] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_RUNNING] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_PRE_COPY] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_PRE_COPY_P2P] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_STOP_COPY] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_RESUMING] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_RUNNING_P2P] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_ERROR] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
	};

	/* Migration capability flags a device must have for each state. */
	static const unsigned int state_flags_table[VFIO_DEVICE_NUM_STATES] = {
		[VFIO_DEVICE_STATE_STOP] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_RUNNING] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_PRE_COPY] =
			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY,
		[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_MIGRATION_STOP_COPY |
						   VFIO_MIGRATION_P2P |
						   VFIO_MIGRATION_PRE_COPY,
		[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_RESUMING] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_RUNNING_P2P] =
			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P,
		[VFIO_DEVICE_STATE_ERROR] = ~0U,
	};

	/* Current state must be valid and supported by this device. */
	if (WARN_ON(cur_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
		    (state_flags_table[cur_fsm] & device->migration_flags) !=
			state_flags_table[cur_fsm]))
		return -EINVAL;

	/* Target state may legitimately be unsupported; no WARN here. */
	if (new_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
	   (state_flags_table[new_fsm] & device->migration_flags) !=
			state_flags_table[new_fsm])
		return -EINVAL;

	/*
	 * Arcs touching optional and unsupported states are skipped over. The
	 * driver will instead see an arc from the original state to the next
	 * logical state, as per the above comment.
	 */
	*next_fsm = vfio_from_fsm_table[cur_fsm][new_fsm];
	while ((state_flags_table[*next_fsm] & device->migration_flags) !=
			state_flags_table[*next_fsm])
		*next_fsm = vfio_from_fsm_table[*next_fsm][new_fsm];

	/* Landing on ERROR means the requested arc is unreachable. */
	return (*next_fsm != VFIO_DEVICE_STATE_ERROR) ? 0 : -EINVAL;
}
EXPORT_SYMBOL_GPL(vfio_mig_get_next_state);
857 EXPORT_SYMBOL_GPL(vfio_mig_get_next_state);
858 
/*
 * Convert the driver's struct file into a FD number and return it to
 * userspace.  Consumes the @filp reference on all paths: it is either
 * installed into the fd table or fput() on error.
 */
static int vfio_ioct_mig_return_fd(struct file *filp, void __user *arg,
				   struct vfio_device_feature_mig_state *mig)
{
	int ret;
	int fd;

	fd = get_unused_fd_flags(O_CLOEXEC);
	if (fd < 0) {
		ret = fd;
		goto out_fput;
	}

	mig->data_fd = fd;
	if (copy_to_user(arg, mig, sizeof(*mig))) {
		ret = -EFAULT;
		goto out_put_unused;
	}
	/* Publish only after the fd number reached userspace successfully. */
	fd_install(fd, filp);
	return 0;

out_put_unused:
	put_unused_fd(fd);
out_fput:
	fput(filp);
	return ret;
}
888 
/*
 * Handle VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE: GET reports the current
 * migration state; SET requests a transition and may return a data-fd
 * for the migration byte stream.
 */
static int
vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device,
					   u32 flags, void __user *arg,
					   size_t argsz)
{
	size_t minsz =
		offsetofend(struct vfio_device_feature_mig_state, data_fd);
	struct vfio_device_feature_mig_state mig;
	struct file *filp = NULL;
	int ret;

	if (!device->mig_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_SET |
				 VFIO_DEVICE_FEATURE_GET,
				 sizeof(mig));
	if (ret != 1)
		return ret;

	if (copy_from_user(&mig, arg, minsz))
		return -EFAULT;

	if (flags & VFIO_DEVICE_FEATURE_GET) {
		enum vfio_device_mig_state curr_state;

		ret = device->mig_ops->migration_get_state(device,
							   &curr_state);
		if (ret)
			return ret;
		mig.device_state = curr_state;
		goto out_copy;
	}

	/* Handle the VFIO_DEVICE_FEATURE_SET */
	filp = device->mig_ops->migration_set_state(device, mig.device_state);
	if (IS_ERR(filp) || !filp)
		goto out_copy;

	/* Transition produced a data stream; hand its fd to userspace. */
	return vfio_ioct_mig_return_fd(filp, arg, &mig);
out_copy:
	/* No data fd to return (GET, driver error, or fd-less transition). */
	mig.data_fd = -1;
	if (copy_to_user(arg, &mig, sizeof(mig)))
		return -EFAULT;
	if (IS_ERR(filp))
		return PTR_ERR(filp);
	return 0;
}
938 
939 static int
vfio_ioctl_device_feature_migration_data_size(struct vfio_device * device,u32 flags,void __user * arg,size_t argsz)940 vfio_ioctl_device_feature_migration_data_size(struct vfio_device *device,
941 					      u32 flags, void __user *arg,
942 					      size_t argsz)
943 {
944 	struct vfio_device_feature_mig_data_size data_size = {};
945 	unsigned long stop_copy_length;
946 	int ret;
947 
948 	if (!device->mig_ops)
949 		return -ENOTTY;
950 
951 	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
952 				 sizeof(data_size));
953 	if (ret != 1)
954 		return ret;
955 
956 	ret = device->mig_ops->migration_get_data_size(device, &stop_copy_length);
957 	if (ret)
958 		return ret;
959 
960 	data_size.stop_copy_length = stop_copy_length;
961 	if (copy_to_user(arg, &data_size, sizeof(data_size)))
962 		return -EFAULT;
963 
964 	return 0;
965 }
966 
/*
 * VFIO_DEVICE_FEATURE_MIGRATION handler: report the device's supported
 * migration flags to userspace.
 */
static int vfio_ioctl_device_feature_migration(struct vfio_device *device,
					       u32 flags, void __user *arg,
					       size_t argsz)
{
	struct vfio_device_feature_migration mig = {
		.flags = device->migration_flags,
	};
	int err;

	if (!device->mig_ops)
		return -ENOTTY;

	err = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
				 sizeof(mig));
	if (err != 1)
		return err;

	return copy_to_user(arg, &mig, sizeof(mig)) ? -EFAULT : 0;
}
987 
/*
 * Reduce the number of nodes in @root from @cur_nodes to at most @req_nodes
 * by repeatedly merging the pair of adjacent ranges separated by the smallest
 * gap.  Merged nodes absorb the gap between them, so the resulting ranges
 * cover a superset of the original IOVAs.
 */
void vfio_combine_iova_ranges(struct rb_root_cached *root, u32 cur_nodes,
			      u32 req_nodes)
{
	struct interval_tree_node *prev, *curr, *comb_start, *comb_end;
	unsigned long min_gap, curr_gap;

	/* Special shortcut when a single range is required */
	if (req_nodes == 1) {
		unsigned long last;

		comb_start = interval_tree_iter_first(root, 0, ULONG_MAX);

		/* Empty list */
		if (WARN_ON_ONCE(!comb_start))
			return;

		/*
		 * Remove every node but the first, remembering the final
		 * node's ->last so the survivor can be extended over all
		 * of them below.
		 */
		curr = comb_start;
		while (curr) {
			last = curr->last;
			prev = curr;
			curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
			if (prev != comb_start)
				interval_tree_remove(prev, root);
		}
		comb_start->last = last;
		return;
	}

	/* Combine ranges which have the smallest gap */
	while (cur_nodes > req_nodes) {
		/* Scan in IOVA order to find the closest neighbouring pair */
		prev = NULL;
		min_gap = ULONG_MAX;
		curr = interval_tree_iter_first(root, 0, ULONG_MAX);
		while (curr) {
			if (prev) {
				curr_gap = curr->start - prev->last;
				if (curr_gap < min_gap) {
					min_gap = curr_gap;
					comb_start = prev;
					comb_end = curr;
				}
			}
			prev = curr;
			curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
		}

		/* Empty list or no nodes to combine */
		if (WARN_ON_ONCE(min_gap == ULONG_MAX))
			break;

		/* Extend the earlier node over the later one, then drop it */
		comb_start->last = comb_end->last;
		interval_tree_remove(comb_end, root);
		cur_nodes--;
	}
}
EXPORT_SYMBOL_GPL(vfio_combine_iova_ranges);
1044 
1045 /* Ranges should fit into a single kernel page */
1046 #define LOG_MAX_RANGES \
1047 	(PAGE_SIZE / sizeof(struct vfio_device_feature_dma_logging_range))
1048 
/*
 * VFIO_DEVICE_FEATURE_DMA_LOGGING_START handler: validate the user-supplied
 * set of IOVA ranges, build an interval tree from them and hand it to the
 * driver's log_start op.  On success the control struct is copied back to
 * userspace (log_start receives &control.page_size and may update it).
 */
static int
vfio_ioctl_device_feature_logging_start(struct vfio_device *device,
					u32 flags, void __user *arg,
					size_t argsz)
{
	size_t minsz =
		offsetofend(struct vfio_device_feature_dma_logging_control,
			    ranges);
	struct vfio_device_feature_dma_logging_range __user *ranges;
	struct vfio_device_feature_dma_logging_control control;
	struct vfio_device_feature_dma_logging_range range;
	struct rb_root_cached root = RB_ROOT_CACHED;
	struct interval_tree_node *nodes;
	u64 iova_end;
	u32 nnodes;
	int i, ret;

	if (!device->log_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_SET,
				 sizeof(control));
	if (ret != 1)
		return ret;

	if (copy_from_user(&control, arg, minsz))
		return -EFAULT;

	nnodes = control.num_ranges;
	if (!nnodes)
		return -EINVAL;

	/* Bound the allocation: all nodes must fit in a single page */
	if (nnodes > LOG_MAX_RANGES)
		return -E2BIG;

	ranges = u64_to_user_ptr(control.ranges);
	nodes = kmalloc_objs(struct interval_tree_node, nnodes);
	if (!nodes)
		return -ENOMEM;

	for (i = 0; i < nnodes; i++) {
		if (copy_from_user(&range, &ranges[i], sizeof(range))) {
			ret = -EFAULT;
			goto end;
		}
		/* Ranges must be aligned to the requested page size */
		if (!IS_ALIGNED(range.iova, control.page_size) ||
		    !IS_ALIGNED(range.length, control.page_size)) {
			ret = -EINVAL;
			goto end;
		}

		/* Reject ranges whose end overflows or exceeds ULONG_MAX */
		if (check_add_overflow(range.iova, range.length, &iova_end) ||
		    iova_end > ULONG_MAX) {
			ret = -EOVERFLOW;
			goto end;
		}

		nodes[i].start = range.iova;
		nodes[i].last = range.iova + range.length - 1;
		if (interval_tree_iter_first(&root, nodes[i].start,
					     nodes[i].last)) {
			/* Range overlapping */
			ret = -EINVAL;
			goto end;
		}
		interval_tree_insert(nodes + i, &root);
	}

	ret = device->log_ops->log_start(device, &root, nnodes,
					 &control.page_size);
	if (ret)
		goto end;

	if (copy_to_user(arg, &control, sizeof(control))) {
		ret = -EFAULT;
		/* Undo the start if we cannot report back to userspace */
		device->log_ops->log_stop(device);
	}

end:
	kfree(nodes);
	return ret;
}
1132 
1133 static int
vfio_ioctl_device_feature_logging_stop(struct vfio_device * device,u32 flags,void __user * arg,size_t argsz)1134 vfio_ioctl_device_feature_logging_stop(struct vfio_device *device,
1135 				       u32 flags, void __user *arg,
1136 				       size_t argsz)
1137 {
1138 	int ret;
1139 
1140 	if (!device->log_ops)
1141 		return -ENOTTY;
1142 
1143 	ret = vfio_check_feature(flags, argsz,
1144 				 VFIO_DEVICE_FEATURE_SET, 0);
1145 	if (ret != 1)
1146 		return ret;
1147 
1148 	return device->log_ops->log_stop(device);
1149 }
1150 
/*
 * iova_bitmap_for_each() callback: forward one chunk of the report to the
 * driver's log_read_and_clear op.  @opaque carries the vfio_device.
 */
static int vfio_device_log_read_and_clear(struct iova_bitmap *iter,
					  unsigned long iova, size_t length,
					  void *opaque)
{
	struct vfio_device *vdev = opaque;

	return vdev->log_ops->log_read_and_clear(vdev, iova, length, iter);
}
1159 
/*
 * VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT handler: validate the report
 * request, wrap the user-supplied bitmap in an iova_bitmap iterator and
 * walk it, letting the driver fill in (and clear) the dirty bits.
 */
static int
vfio_ioctl_device_feature_logging_report(struct vfio_device *device,
					 u32 flags, void __user *arg,
					 size_t argsz)
{
	size_t minsz =
		offsetofend(struct vfio_device_feature_dma_logging_report,
			    bitmap);
	struct vfio_device_feature_dma_logging_report report;
	struct iova_bitmap *iter;
	u64 iova_end;
	int ret;

	if (!device->log_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_GET,
				 sizeof(report));
	if (ret != 1)
		return ret;

	if (copy_from_user(&report, arg, minsz))
		return -EFAULT;

	/* Page size must be a power of two of at least 4K */
	if (report.page_size < SZ_4K || !is_power_of_2(report.page_size))
		return -EINVAL;

	/* Reject a range whose end overflows or exceeds ULONG_MAX */
	if (check_add_overflow(report.iova, report.length, &iova_end) ||
	    iova_end > ULONG_MAX)
		return -EOVERFLOW;

	iter = iova_bitmap_alloc(report.iova, report.length,
				 report.page_size,
				 u64_to_user_ptr(report.bitmap));
	if (IS_ERR(iter))
		return PTR_ERR(iter);

	/* Drives vfio_device_log_read_and_clear() over the whole range */
	ret = iova_bitmap_for_each(iter, device,
				   vfio_device_log_read_and_clear);

	iova_bitmap_free(iter);
	return ret;
}
1204 
/*
 * VFIO_DEVICE_FEATURE ioctl entry point: validate the common feature
 * header (argsz, flag sanity, GET/SET exclusivity) and dispatch to the
 * core handler for the selected feature, falling back to the driver's
 * device_feature op for features the core does not implement.
 */
static int vfio_ioctl_device_feature(struct vfio_device *device,
				     struct vfio_device_feature __user *arg)
{
	size_t minsz = offsetofend(struct vfio_device_feature, flags);
	struct vfio_device_feature feature;

	if (copy_from_user(&feature, arg, minsz))
		return -EFAULT;

	if (feature.argsz < minsz)
		return -EINVAL;

	/* Check unknown flags */
	if (feature.flags &
	    ~(VFIO_DEVICE_FEATURE_MASK | VFIO_DEVICE_FEATURE_SET |
	      VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_PROBE))
		return -EINVAL;

	/* GET & SET are mutually exclusive except with PROBE */
	if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) &&
	    (feature.flags & VFIO_DEVICE_FEATURE_SET) &&
	    (feature.flags & VFIO_DEVICE_FEATURE_GET))
		return -EINVAL;

	/*
	 * Each handler receives the feature payload (past the header) and
	 * its size; feature.argsz - minsz is the payload size.
	 */
	switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) {
	case VFIO_DEVICE_FEATURE_MIGRATION:
		return vfio_ioctl_device_feature_migration(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE:
		return vfio_ioctl_device_feature_mig_device_state(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_DMA_LOGGING_START:
		return vfio_ioctl_device_feature_logging_start(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP:
		return vfio_ioctl_device_feature_logging_stop(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT:
		return vfio_ioctl_device_feature_logging_report(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_MIG_DATA_SIZE:
		return vfio_ioctl_device_feature_migration_data_size(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	default:
		/* Unknown to the core; give the driver a chance */
		if (unlikely(!device->ops->device_feature))
			return -ENOTTY;
		return device->ops->device_feature(device, feature.flags,
						   arg->data,
						   feature.argsz - minsz);
	}
}
1262 
/*
 * VFIO_DEVICE_GET_REGION_INFO handler: query the driver for region info
 * and an optional capability chain.  If the user buffer is too small for
 * the capabilities, only the required argsz is reported back; otherwise
 * the chain is copied immediately after the fixed-size info struct.
 *
 * Fixes vs. the previous version: kernel brace style on the final
 * copy_to_user() check and removal of a redundant goto that jumped to the
 * label immediately following it.
 */
static long vfio_get_region_info(struct vfio_device *device,
				 struct vfio_region_info __user *arg)
{
	unsigned long minsz = offsetofend(struct vfio_region_info, offset);
	struct vfio_region_info info = {};
	struct vfio_info_cap caps = {};
	int ret;

	if (unlikely(!device->ops->get_region_info_caps))
		return -EINVAL;

	if (copy_from_user(&info, arg, minsz))
		return -EFAULT;
	if (info.argsz < minsz)
		return -EINVAL;

	ret = device->ops->get_region_info_caps(device, &info, &caps);
	if (ret)
		goto out_free;

	if (caps.size) {
		info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
		if (info.argsz < sizeof(info) + caps.size) {
			/* Too small: tell the user how much space is needed */
			info.argsz = sizeof(info) + caps.size;
			info.cap_offset = 0;
		} else {
			/* Chain offsets become relative to the user buffer */
			vfio_info_cap_shift(&caps, sizeof(info));
			if (copy_to_user(arg + 1, caps.buf, caps.size)) {
				ret = -EFAULT;
				goto out_free;
			}
			info.cap_offset = sizeof(info);
		}
	}

	if (copy_to_user(arg, &info, minsz))
		ret = -EFAULT;

out_free:
	kfree(caps.buf);
	return ret;
}
1307 
/*
 * Main ioctl dispatcher for VFIO device files.  BIND_IOMMUFD is the only
 * ioctl permitted before access is granted; everything else runs with a
 * pm-runtime reference held so the device is powered during the call.
 */
static long vfio_device_fops_unl_ioctl(struct file *filep,
				       unsigned int cmd, unsigned long arg)
{
	struct vfio_device_file *df = filep->private_data;
	struct vfio_device *device = df->device;
	void __user *uptr = (void __user *)arg;
	int ret;

	/* Binding may happen before access is granted */
	if (cmd == VFIO_DEVICE_BIND_IOMMUFD)
		return vfio_df_ioctl_bind_iommufd(df, uptr);

	/* Paired with smp_store_release() following vfio_df_open() */
	if (!smp_load_acquire(&df->access_granted))
		return -EINVAL;

	ret = vfio_device_pm_runtime_get(device);
	if (ret)
		return ret;

	/* cdev only ioctls */
	if (IS_ENABLED(CONFIG_VFIO_DEVICE_CDEV) && !df->group) {
		switch (cmd) {
		case VFIO_DEVICE_ATTACH_IOMMUFD_PT:
			ret = vfio_df_ioctl_attach_pt(df, uptr);
			goto out;

		case VFIO_DEVICE_DETACH_IOMMUFD_PT:
			ret = vfio_df_ioctl_detach_pt(df, uptr);
			goto out;
		}
	}

	switch (cmd) {
	case VFIO_DEVICE_FEATURE:
		ret = vfio_ioctl_device_feature(device, uptr);
		break;

	case VFIO_DEVICE_GET_REGION_INFO:
		ret = vfio_get_region_info(device, uptr);
		break;

	default:
		/* Everything else goes to the driver's ioctl op */
		if (unlikely(!device->ops->ioctl))
			ret = -EINVAL;
		else
			ret = device->ops->ioctl(device, cmd, arg);
		break;
	}
out:
	/* Drop the pm-runtime reference taken above on every exit path */
	vfio_device_pm_runtime_put(device);
	return ret;
}
1360 
vfio_device_fops_read(struct file * filep,char __user * buf,size_t count,loff_t * ppos)1361 static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
1362 				     size_t count, loff_t *ppos)
1363 {
1364 	struct vfio_device_file *df = filep->private_data;
1365 	struct vfio_device *device = df->device;
1366 
1367 	/* Paired with smp_store_release() following vfio_df_open() */
1368 	if (!smp_load_acquire(&df->access_granted))
1369 		return -EINVAL;
1370 
1371 	if (unlikely(!device->ops->read))
1372 		return -EINVAL;
1373 
1374 	return device->ops->read(device, buf, count, ppos);
1375 }
1376 
vfio_device_fops_write(struct file * filep,const char __user * buf,size_t count,loff_t * ppos)1377 static ssize_t vfio_device_fops_write(struct file *filep,
1378 				      const char __user *buf,
1379 				      size_t count, loff_t *ppos)
1380 {
1381 	struct vfio_device_file *df = filep->private_data;
1382 	struct vfio_device *device = df->device;
1383 
1384 	/* Paired with smp_store_release() following vfio_df_open() */
1385 	if (!smp_load_acquire(&df->access_granted))
1386 		return -EINVAL;
1387 
1388 	if (unlikely(!device->ops->write))
1389 		return -EINVAL;
1390 
1391 	return device->ops->write(device, buf, count, ppos);
1392 }
1393 
vfio_device_fops_mmap(struct file * filep,struct vm_area_struct * vma)1394 static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1395 {
1396 	struct vfio_device_file *df = filep->private_data;
1397 	struct vfio_device *device = df->device;
1398 
1399 	/* Paired with smp_store_release() following vfio_df_open() */
1400 	if (!smp_load_acquire(&df->access_granted))
1401 		return -EINVAL;
1402 
1403 	if (unlikely(!device->ops->mmap))
1404 		return -EINVAL;
1405 
1406 	return device->ops->mmap(device, vma);
1407 }
1408 
1409 #ifdef CONFIG_PROC_FS
vfio_device_show_fdinfo(struct seq_file * m,struct file * filep)1410 static void vfio_device_show_fdinfo(struct seq_file *m, struct file *filep)
1411 {
1412 	char *path;
1413 	struct vfio_device_file *df = filep->private_data;
1414 	struct vfio_device *device = df->device;
1415 
1416 	path = kobject_get_path(&device->dev->kobj, GFP_KERNEL);
1417 	if (!path)
1418 		return;
1419 
1420 	seq_printf(m, "vfio-device-syspath: /sys%s\n", path);
1421 	kfree(path);
1422 }
1423 #endif
1424 
/* File operations backing VFIO device files (cdev and group-opened) */
const struct file_operations vfio_device_fops = {
	.owner		= THIS_MODULE,
	.open		= vfio_device_fops_cdev_open,
	.release	= vfio_device_fops_release,
	.read		= vfio_device_fops_read,
	.write		= vfio_device_fops_write,
	.unlocked_ioctl	= vfio_device_fops_unl_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
	.mmap		= vfio_device_fops_mmap,
#ifdef CONFIG_PROC_FS
	.show_fdinfo	= vfio_device_show_fdinfo,
#endif
};
1438 
vfio_device_from_file(struct file * file)1439 static struct vfio_device *vfio_device_from_file(struct file *file)
1440 {
1441 	struct vfio_device_file *df = file->private_data;
1442 
1443 	if (file->f_op != &vfio_device_fops)
1444 		return NULL;
1445 	return df->device;
1446 }
1447 
1448 /**
1449  * vfio_file_is_valid - True if the file is valid vfio file
1450  * @file: VFIO group file or VFIO device file
1451  */
vfio_file_is_valid(struct file * file)1452 bool vfio_file_is_valid(struct file *file)
1453 {
1454 	return vfio_group_from_file(file) ||
1455 	       vfio_device_from_file(file);
1456 }
1457 EXPORT_SYMBOL_GPL(vfio_file_is_valid);
1458 
1459 /**
1460  * vfio_file_enforced_coherent - True if the DMA associated with the VFIO file
1461  *        is always CPU cache coherent
1462  * @file: VFIO group file or VFIO device file
1463  *
1464  * Enforced coherency means that the IOMMU ignores things like the PCIe no-snoop
1465  * bit in DMA transactions. A return of false indicates that the user has
1466  * rights to access additional instructions such as wbinvd on x86.
1467  */
vfio_file_enforced_coherent(struct file * file)1468 bool vfio_file_enforced_coherent(struct file *file)
1469 {
1470 	struct vfio_device *device;
1471 	struct vfio_group *group;
1472 
1473 	group = vfio_group_from_file(file);
1474 	if (group)
1475 		return vfio_group_enforced_coherent(group);
1476 
1477 	device = vfio_device_from_file(file);
1478 	if (device)
1479 		return device_iommu_capable(device->dev,
1480 					    IOMMU_CAP_ENFORCE_CACHE_COHERENCY);
1481 
1482 	return true;
1483 }
1484 EXPORT_SYMBOL_GPL(vfio_file_enforced_coherent);
1485 
/* Record @kvm on a device file; kvm_ref_lock serializes against readers */
static void vfio_device_file_set_kvm(struct file *file, struct kvm *kvm)
{
	struct vfio_device_file *df = file->private_data;

	/*
	 * The kvm is first recorded in the vfio_device_file, and will
	 * be propagated to vfio_device::kvm when the file is bound to
	 * iommufd successfully in the vfio device cdev path.
	 */
	spin_lock(&df->kvm_ref_lock);
	df->kvm = kvm;
	spin_unlock(&df->kvm_ref_lock);
}
1499 
/**
 * vfio_file_set_kvm - Link a kvm with VFIO drivers
 * @file: VFIO group file or VFIO device file
 * @kvm: KVM to link
 *
 * When a VFIO device is first opened the KVM will be available in
 * device->kvm if one was associated with the file.
 */
void vfio_file_set_kvm(struct file *file, struct kvm *kvm)
{
	struct vfio_group *group = vfio_group_from_file(file);

	if (group)
		vfio_group_set_kvm(group, kvm);

	if (vfio_device_from_file(file))
		vfio_device_file_set_kvm(file, kvm);
}
EXPORT_SYMBOL_GPL(vfio_file_set_kvm);
1520 
1521 /*
1522  * Sub-module support
1523  */
1524 /*
1525  * Helper for managing a buffer of info chain capabilities, allocate or
1526  * reallocate a buffer with additional @size, filling in @id and @version
1527  * of the capability.  A pointer to the new capability is returned.
1528  *
1529  * NB. The chain is based at the head of the buffer, so new entries are
1530  * added to the tail, vfio_info_cap_shift() should be called to fixup the
1531  * next offsets prior to copying to the user buffer.
1532  */
/*
 * Grow @caps by @size bytes and append a new capability header with @id
 * and @version.  Returns a pointer to the new (zeroed) header, or
 * ERR_PTR(-ENOMEM) with the whole buffer released on allocation failure.
 */
struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
					       size_t size, u16 id, u16 version)
{
	void *buf;
	struct vfio_info_cap_header *header, *tmp;

	/* Ensure that the next capability struct will be aligned */
	size = ALIGN(size, sizeof(u64));

	buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
	if (!buf) {
		/* krealloc keeps the old buffer on failure; drop it here */
		kfree(caps->buf);
		caps->buf = NULL;
		caps->size = 0;
		return ERR_PTR(-ENOMEM);
	}

	caps->buf = buf;
	/* New capability lives at the old end of the buffer */
	header = buf + caps->size;

	/* Eventually copied to user buffer, zero */
	memset(header, 0, size);

	header->id = id;
	header->version = version;

	/* Add to the end of the capability chain */
	for (tmp = buf; tmp->next; tmp = buf + tmp->next)
		; /* nothing */

	/* ->next is a byte offset into the buffer, not a pointer */
	tmp->next = caps->size;
	caps->size += size;

	return header;
}
EXPORT_SYMBOL_GPL(vfio_info_cap_add);
1569 
/*
 * Rebase every ->next offset in the capability chain by @offset, turning
 * buffer-relative offsets into offsets valid in the user's buffer where
 * the chain will land @offset bytes in.
 */
void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
{
	struct vfio_info_cap_header *tmp;
	void *buf = (void *)caps->buf;

	/* Capability structs should start with proper alignment */
	WARN_ON(!IS_ALIGNED(offset, sizeof(u64)));

	/*
	 * After the += the stored value is user-buffer relative, so
	 * subtract @offset again to keep walking within our own buffer.
	 */
	for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
		tmp->next += offset;
}
EXPORT_SYMBOL(vfio_info_cap_shift);
1582 
/*
 * Append a fully-formed capability @cap of @size bytes to @caps: allocate
 * a header with the same id/version and copy the payload after it.
 */
int vfio_info_add_capability(struct vfio_info_cap *caps,
			     struct vfio_info_cap_header *cap, size_t size)
{
	struct vfio_info_cap_header *hdr;

	hdr = vfio_info_cap_add(caps, size, cap->id, cap->version);
	if (IS_ERR(hdr))
		return PTR_ERR(hdr);

	/* Copy the payload that follows the source header */
	memcpy(hdr + 1, cap + 1, size - sizeof(*hdr));
	return 0;
}
EXPORT_SYMBOL(vfio_info_add_capability);
1597 
/*
 * Validate a VFIO_DEVICE_SET_IRQS header against the device's IRQ layout
 * and compute the size of the trailing data array.
 * @hdr [in]       : header to validate
 * @num_irqs [in]  : number of IRQs in the indexed set
 * @max_irq_type [in] : number of valid IRQ indices
 * @data_size [out]: bytes of payload expected after the header (0 for
 *                   VFIO_IRQ_SET_DATA_NONE); may be NULL only when no
 *                   payload is expected
 * Returns 0 on success or -EINVAL on any malformed field.
 */
int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
				       int max_irq_type, size_t *data_size)
{
	unsigned long minsz;
	size_t size;

	minsz = offsetofend(struct vfio_irq_set, count);

	/*
	 * Basic sanity: argsz covers the fixed header, index is in range,
	 * start+count cannot overflow u32, and no unknown flags are set.
	 */
	if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
	    (hdr->count >= (U32_MAX - hdr->start)) ||
	    (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
				VFIO_IRQ_SET_ACTION_TYPE_MASK)))
		return -EINVAL;

	if (data_size)
		*data_size = 0;

	/* The [start, start+count) window must lie within the IRQ set */
	if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
		return -EINVAL;

	/* Per-element payload size depends on the data type */
	switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
	case VFIO_IRQ_SET_DATA_NONE:
		size = 0;
		break;
	case VFIO_IRQ_SET_DATA_BOOL:
		size = sizeof(uint8_t);
		break;
	case VFIO_IRQ_SET_DATA_EVENTFD:
		size = sizeof(int32_t);
		break;
	default:
		return -EINVAL;
	}

	if (size) {
		/* argsz must cover the header plus count payload elements */
		if (hdr->argsz - minsz < hdr->count * size)
			return -EINVAL;

		if (!data_size)
			return -EINVAL;

		*data_size = hdr->count * size;
	}

	return 0;
}
EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
1645 
1646 /*
1647  * Pin contiguous user pages and return their associated host pages for local
1648  * domain only.
1649  * @device [in]  : device
1650  * @iova [in]    : starting IOVA of user pages to be pinned.
1651  * @npage [in]   : count of pages to be pinned.  This count should not
1652  *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1653  * @prot [in]    : protection flags
1654  * @pages[out]   : array of host pages
1655  * Return error or number of pages pinned.
1656  *
1657  * A driver may only call this function if the vfio_device was created
1658  * by vfio_register_emulated_iommu_dev() due to vfio_device_container_pin_pages().
1659  */
int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova,
		   int npage, int prot, struct page **pages)
{
	/* group->container cannot change while a vfio device is open */
	if (!pages || !npage || WARN_ON(!vfio_assert_device_open(device)))
		return -EINVAL;
	/* Drivers that pin must be able to handle unmap callbacks */
	if (!device->ops->dma_unmap)
		return -EINVAL;
	/* Legacy container path */
	if (vfio_device_has_container(device))
		return vfio_device_container_pin_pages(device, iova,
						       npage, prot, pages);
	/* iommufd emulated-access path */
	if (device->iommufd_access) {
		int ret;

		/* iommufd_access_pin_pages() takes an unsigned long IOVA */
		if (iova > ULONG_MAX)
			return -EINVAL;
		/*
		 * VFIO ignores the sub page offset, npages is from the start of
		 * a PAGE_SIZE chunk of IOVA. The caller is expected to recover
		 * the sub page offset by doing:
		 *     pages[0] + (iova % PAGE_SIZE)
		 */
		ret = iommufd_access_pin_pages(
			device->iommufd_access, ALIGN_DOWN(iova, PAGE_SIZE),
			npage * PAGE_SIZE, pages,
			(prot & IOMMU_WRITE) ? IOMMUFD_ACCESS_RW_WRITE : 0);
		if (ret)
			return ret;
		return npage;
	}
	/* Neither a container nor an iommufd access is available */
	return -EINVAL;
}
EXPORT_SYMBOL(vfio_pin_pages);
1693 
1694 /*
1695  * Unpin contiguous host pages for local domain only.
1696  * @device [in]  : device
1697  * @iova [in]    : starting address of user pages to be unpinned.
1698  * @npage [in]   : count of pages to be unpinned.  This count should not
1699  *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1700  */
void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage)
{
	if (WARN_ON(!vfio_assert_device_open(device)))
		return;
	/* Mirrors the dma_unmap requirement checked in vfio_pin_pages() */
	if (WARN_ON(!device->ops->dma_unmap))
		return;

	/* Legacy container path */
	if (vfio_device_has_container(device)) {
		vfio_device_container_unpin_pages(device, iova, npage);
		return;
	}
	/* iommufd emulated-access path; IOVA must fit an unsigned long */
	if (device->iommufd_access) {
		if (WARN_ON(iova > ULONG_MAX))
			return;
		/* Same page-aligned addressing convention as vfio_pin_pages() */
		iommufd_access_unpin_pages(device->iommufd_access,
					   ALIGN_DOWN(iova, PAGE_SIZE),
					   npage * PAGE_SIZE);
		return;
	}
}
EXPORT_SYMBOL(vfio_unpin_pages);
1722 
1723 /*
1724  * This interface allows the CPUs to perform some sort of virtual DMA on
1725  * behalf of the device.
1726  *
1727  * CPUs read/write from/into a range of IOVAs pointing to user space memory
1728  * into/from a kernel buffer.
1729  *
1730  * As the read/write of user space memory is conducted via the CPUs and is
1731  * not a real device DMA, it is not necessary to pin the user space memory.
1732  *
1733  * @device [in]		: VFIO device
1734  * @iova [in]		: base IOVA of a user space buffer
1735  * @data [in]		: pointer to kernel buffer
1736  * @len [in]		: kernel buffer length
1737  * @write		: indicate read or write
1738  * Return error code on failure or 0 on success.
1739  */
int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data,
		size_t len, bool write)
{
	if (!data || len <= 0 || !vfio_assert_device_open(device))
		return -EINVAL;

	/* Legacy container path */
	if (vfio_device_has_container(device))
		return vfio_device_container_dma_rw(device, iova,
						    data, len, write);

	/* iommufd emulated-access path */
	if (device->iommufd_access) {
		unsigned int flags = 0;

		/* iommufd_access_rw() takes an unsigned long IOVA */
		if (iova > ULONG_MAX)
			return -EINVAL;

		/* VFIO historically tries to auto-detect a kthread */
		if (!current->mm)
			flags |= IOMMUFD_ACCESS_RW_KTHREAD;
		if (write)
			flags |= IOMMUFD_ACCESS_RW_WRITE;
		return iommufd_access_rw(device->iommufd_access, iova, data,
					 len, flags);
	}
	/* Neither a container nor an iommufd access is available */
	return -EINVAL;
}
EXPORT_SYMBOL(vfio_dma_rw);
1767 
1768 /*
1769  * Module/class support
1770  */
/*
 * Module init: bring up the group layer, virqfd, the vfio-dev class and
 * the device cdev region, unwinding in reverse order on any failure.
 */
static int __init vfio_init(void)
{
	int ret;

	ida_init(&vfio.device_ida);

	ret = vfio_group_init();
	if (ret)
		return ret;

	ret = vfio_virqfd_init();
	if (ret)
		goto err_virqfd;

	/* /sys/class/vfio-dev/vfioX */
	vfio.device_class = class_create("vfio-dev");
	if (IS_ERR(vfio.device_class)) {
		ret = PTR_ERR(vfio.device_class);
		goto err_dev_class;
	}

	ret = vfio_cdev_init(vfio.device_class);
	if (ret)
		goto err_alloc_dev_chrdev;

	vfio_debugfs_create_root();
	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
	return 0;

err_alloc_dev_chrdev:
	class_destroy(vfio.device_class);
	vfio.device_class = NULL;
err_dev_class:
	vfio_virqfd_exit();
err_virqfd:
	vfio_group_cleanup();
	return ret;
}
1809 
/* Module exit: tear down in reverse order of vfio_init() */
static void __exit vfio_cleanup(void)
{
	vfio_debugfs_remove_root();
	ida_destroy(&vfio.device_ida);
	vfio_cdev_cleanup();
	class_destroy(vfio.device_class);
	vfio.device_class = NULL;
	vfio_virqfd_exit();
	vfio_group_cleanup();
	xa_destroy(&vfio_device_set_xa);
}
1821 
1822 module_init(vfio_init);
1823 module_exit(vfio_cleanup);
1824 
1825 MODULE_IMPORT_NS("IOMMUFD");
1826 MODULE_VERSION(DRIVER_VERSION);
1827 MODULE_LICENSE("GPL v2");
1828 MODULE_AUTHOR(DRIVER_AUTHOR);
1829 MODULE_DESCRIPTION(DRIVER_DESC);
1830 MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");
1831