1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * VFIO core 4 * 5 * Copyright (C) 2012 Red Hat, Inc. All rights reserved. 6 * Author: Alex Williamson <alex.williamson@redhat.com> 7 * 8 * Derived from original vfio: 9 * Copyright 2010 Cisco Systems, Inc. All rights reserved. 10 * Author: Tom Lyon, pugs@cisco.com 11 */ 12 13 #include <linux/cdev.h> 14 #include <linux/compat.h> 15 #include <linux/device.h> 16 #include <linux/fs.h> 17 #include <linux/idr.h> 18 #include <linux/iommu.h> 19 #if IS_ENABLED(CONFIG_KVM) 20 #include <linux/kvm_host.h> 21 #endif 22 #include <linux/list.h> 23 #include <linux/miscdevice.h> 24 #include <linux/module.h> 25 #include <linux/mount.h> 26 #include <linux/mutex.h> 27 #include <linux/pci.h> 28 #include <linux/pseudo_fs.h> 29 #include <linux/rwsem.h> 30 #include <linux/sched.h> 31 #include <linux/seq_file.h> 32 #include <linux/slab.h> 33 #include <linux/stat.h> 34 #include <linux/string.h> 35 #include <linux/uaccess.h> 36 #include <linux/vfio.h> 37 #include <linux/wait.h> 38 #include <linux/sched/signal.h> 39 #include <linux/pm_runtime.h> 40 #include <linux/interval_tree.h> 41 #include <linux/iova_bitmap.h> 42 #include <linux/iommufd.h> 43 #include "vfio.h" 44 45 #define DRIVER_VERSION "0.3" 46 #define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>" 47 #define DRIVER_DESC "VFIO - User Level meta-driver" 48 49 #define VFIO_MAGIC 0x5646494f /* "VFIO" */ 50 51 static struct vfio { 52 struct ida device_ida; 53 struct vfsmount *vfs_mount; 54 int fs_count; 55 } vfio; 56 57 #ifdef CONFIG_VFIO_NOIOMMU 58 bool vfio_noiommu __read_mostly; 59 module_param_named(enable_unsafe_noiommu_mode, 60 vfio_noiommu, bool, S_IRUGO | S_IWUSR); 61 MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode. This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel. If you do not know what this is for, step away. (default: false)"); 62 #endif 63 64 static DEFINE_XARRAY(vfio_device_set_xa); 65 66 static char *vfio_device_devnode(const struct device *dev, umode_t *mode) 67 { 68 return kasprintf(GFP_KERNEL, "vfio/devices/%s", dev_name(dev)); 69 } 70 71 static const struct class vfio_device_class = { 72 .name = "vfio-dev", 73 .devnode = vfio_device_devnode 74 }; 75 76 int vfio_assign_device_set(struct vfio_device *device, void *set_id) 77 { 78 unsigned long idx = (unsigned long)set_id; 79 struct vfio_device_set *new_dev_set; 80 struct vfio_device_set *dev_set; 81 82 if (WARN_ON(!set_id)) 83 return -EINVAL; 84 85 /* 86 * Atomically acquire a singleton object in the xarray for this set_id 87 */ 88 xa_lock(&vfio_device_set_xa); 89 dev_set = xa_load(&vfio_device_set_xa, idx); 90 if (dev_set) 91 goto found_get_ref; 92 xa_unlock(&vfio_device_set_xa); 93 94 new_dev_set = kzalloc_obj(*new_dev_set); 95 if (!new_dev_set) 96 return -ENOMEM; 97 mutex_init(&new_dev_set->lock); 98 INIT_LIST_HEAD(&new_dev_set->device_list); 99 new_dev_set->set_id = set_id; 100 101 xa_lock(&vfio_device_set_xa); 102 dev_set = __xa_cmpxchg(&vfio_device_set_xa, idx, NULL, new_dev_set, 103 GFP_KERNEL); 104 if (!dev_set) { 105 dev_set = new_dev_set; 106 goto found_get_ref; 107 } 108 109 kfree(new_dev_set); 110 if (xa_is_err(dev_set)) { 111 xa_unlock(&vfio_device_set_xa); 112 return xa_err(dev_set); 113 } 114 115 found_get_ref: 116 dev_set->device_count++; 117 xa_unlock(&vfio_device_set_xa); 118 mutex_lock(&dev_set->lock); 119 device->dev_set = dev_set; 120 list_add_tail(&device->dev_set_list, &dev_set->device_list); 121 mutex_unlock(&dev_set->lock); 122 return 0; 123 } 124 EXPORT_SYMBOL_GPL(vfio_assign_device_set); 125 126 static void vfio_release_device_set(struct vfio_device *device) 127 { 128 struct vfio_device_set *dev_set = device->dev_set; 129 130 if (!dev_set) 131 return; 132 133 mutex_lock(&dev_set->lock); 134 list_del(&device->dev_set_list); 135 mutex_unlock(&dev_set->lock); 136 137 xa_lock(&vfio_device_set_xa); 138 if (!--dev_set->device_count) { 139 __xa_erase(&vfio_device_set_xa, 140 (unsigned long)dev_set->set_id); 141 mutex_destroy(&dev_set->lock); 142 kfree(dev_set); 143 } 144 xa_unlock(&vfio_device_set_xa); 145 } 146 147 unsigned int vfio_device_set_open_count(struct vfio_device_set *dev_set) 148 { 149 struct vfio_device *cur; 150 unsigned int open_count = 0; 151 152 lockdep_assert_held(&dev_set->lock); 153 154 list_for_each_entry(cur, &dev_set->device_list, dev_set_list) 155 open_count += cur->open_count; 156 return open_count; 157 } 158 EXPORT_SYMBOL_GPL(vfio_device_set_open_count); 159 160 struct vfio_device * 161 vfio_find_device_in_devset(struct vfio_device_set *dev_set, 162 struct device *dev) 163 { 164 struct vfio_device *cur; 165 166 lockdep_assert_held(&dev_set->lock); 167 168 list_for_each_entry(cur, &dev_set->device_list, dev_set_list) 169 if (cur->dev == dev) 170 return cur; 171 return NULL; 172 } 173 EXPORT_SYMBOL_GPL(vfio_find_device_in_devset); 174 175 /* 176 * Device objects - create, release, get, put, search 177 */ 178 /* Device reference always implies a group reference */ 179 void vfio_device_put_registration(struct vfio_device *device) 180 { 181 if (refcount_dec_and_test(&device->refcount)) 182 complete(&device->comp); 183 } 184 EXPORT_SYMBOL_GPL(vfio_device_put_registration); 185 186 bool vfio_device_try_get_registration(struct vfio_device *device) 187 { 188 return refcount_inc_not_zero(&device->refcount); 189 } 190 EXPORT_SYMBOL_GPL(vfio_device_try_get_registration); 191 192 /* 193 * VFIO driver API 194 */ 195 /* Release helper called by vfio_put_device() */ 196 static void vfio_device_release(struct device *dev) 197 { 198 struct vfio_device *device = 199 container_of(dev, struct vfio_device, device); 200 201 vfio_release_device_set(device); 202 ida_free(&vfio.device_ida, device->index); 203 204 if (device->ops->release) 205 device->ops->release(device); 206 207 iput(device->inode); 208 simple_release_fs(&vfio.vfs_mount, &vfio.fs_count); 209 kvfree(device); 210 } 211 212 static int vfio_init_device(struct vfio_device *device, struct device *dev, 213 const struct vfio_device_ops *ops); 214 215 /* 216 * Allocate and initialize vfio_device so it can be registered to vfio 217 * core. 218 * 219 * Drivers should use the wrapper vfio_alloc_device() for allocation. 220 * @size is the size of the structure to be allocated, including any 221 * private data used by the driver. 222 * 223 * Driver may provide an @init callback to cover device private data. 224 * 225 * Use vfio_put_device() to release the structure after success return. 226 */ 227 struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev, 228 const struct vfio_device_ops *ops) 229 { 230 struct vfio_device *device; 231 int ret; 232 233 if (WARN_ON(size < sizeof(struct vfio_device))) 234 return ERR_PTR(-EINVAL); 235 236 device = kvzalloc(size, GFP_KERNEL); 237 if (!device) 238 return ERR_PTR(-ENOMEM); 239 240 ret = vfio_init_device(device, dev, ops); 241 if (ret) 242 goto out_free; 243 return device; 244 245 out_free: 246 kvfree(device); 247 return ERR_PTR(ret); 248 } 249 EXPORT_SYMBOL_GPL(_vfio_alloc_device); 250 251 static int vfio_fs_init_fs_context(struct fs_context *fc) 252 { 253 return init_pseudo(fc, VFIO_MAGIC) ? 0 : -ENOMEM; 254 } 255 256 static struct file_system_type vfio_fs_type = { 257 .name = "vfio", 258 .owner = THIS_MODULE, 259 .init_fs_context = vfio_fs_init_fs_context, 260 .kill_sb = kill_anon_super, 261 }; 262 263 static struct inode *vfio_fs_inode_new(void) 264 { 265 struct inode *inode; 266 int ret; 267 268 ret = simple_pin_fs(&vfio_fs_type, &vfio.vfs_mount, &vfio.fs_count); 269 if (ret) 270 return ERR_PTR(ret); 271 272 inode = alloc_anon_inode(vfio.vfs_mount->mnt_sb); 273 if (IS_ERR(inode)) 274 simple_release_fs(&vfio.vfs_mount, &vfio.fs_count); 275 276 return inode; 277 } 278 279 /* 280 * Initialize a vfio_device so it can be registered to vfio core. 281 */ 282 static int vfio_init_device(struct vfio_device *device, struct device *dev, 283 const struct vfio_device_ops *ops) 284 { 285 int ret; 286 287 ret = ida_alloc_max(&vfio.device_ida, MINORMASK, GFP_KERNEL); 288 if (ret < 0) { 289 dev_dbg(dev, "Error to alloc index\n"); 290 return ret; 291 } 292 293 device->index = ret; 294 init_completion(&device->comp); 295 device->dev = dev; 296 device->ops = ops; 297 device->inode = vfio_fs_inode_new(); 298 if (IS_ERR(device->inode)) { 299 ret = PTR_ERR(device->inode); 300 goto out_inode; 301 } 302 303 if (ops->init) { 304 ret = ops->init(device); 305 if (ret) 306 goto out_uninit; 307 } 308 309 device_initialize(&device->device); 310 device->device.release = vfio_device_release; 311 device->device.class = &vfio_device_class; 312 device->device.parent = device->dev; 313 return 0; 314 315 out_uninit: 316 iput(device->inode); 317 simple_release_fs(&vfio.vfs_mount, &vfio.fs_count); 318 out_inode: 319 vfio_release_device_set(device); 320 ida_free(&vfio.device_ida, device->index); 321 return ret; 322 } 323 324 static int __vfio_register_dev(struct vfio_device *device, 325 enum vfio_group_type type) 326 { 327 int ret; 328 329 if (WARN_ON(IS_ENABLED(CONFIG_IOMMUFD) && 330 (!device->ops->bind_iommufd || 331 !device->ops->unbind_iommufd || 332 !device->ops->attach_ioas || 333 !device->ops->detach_ioas))) 334 return -EINVAL; 335 336 /* 337 * If the driver doesn't specify a set then the device is added to a 338 * singleton set just for itself. 339 */ 340 if (!device->dev_set) 341 vfio_assign_device_set(device, device); 342 343 ret = dev_set_name(&device->device, "vfio%d", device->index); 344 if (ret) 345 return ret; 346 347 ret = vfio_device_set_group(device, type); 348 if (ret) 349 return ret; 350 351 /* 352 * VFIO always sets IOMMU_CACHE because we offer no way for userspace to 353 * restore cache coherency. It has to be checked here because it is only 354 * valid for cases where we are using iommu groups. 355 */ 356 if (type == VFIO_IOMMU && !vfio_device_is_noiommu(device) && 357 !device_iommu_capable(device->dev, IOMMU_CAP_CACHE_COHERENCY)) { 358 ret = -EINVAL; 359 goto err_out; 360 } 361 362 ret = vfio_device_add(device); 363 if (ret) 364 goto err_out; 365 366 /* Refcounting can't start until the driver calls register */ 367 refcount_set(&device->refcount, 1); 368 369 vfio_device_group_register(device); 370 vfio_device_debugfs_init(device); 371 372 return 0; 373 err_out: 374 vfio_device_remove_group(device); 375 return ret; 376 } 377 378 int vfio_register_group_dev(struct vfio_device *device) 379 { 380 return __vfio_register_dev(device, VFIO_IOMMU); 381 } 382 EXPORT_SYMBOL_GPL(vfio_register_group_dev); 383 384 /* 385 * Register a virtual device without IOMMU backing. The user of this 386 * device must not be able to directly trigger unmediated DMA. 387 */ 388 int vfio_register_emulated_iommu_dev(struct vfio_device *device) 389 { 390 return __vfio_register_dev(device, VFIO_EMULATED_IOMMU); 391 } 392 EXPORT_SYMBOL_GPL(vfio_register_emulated_iommu_dev); 393 394 /* 395 * Decrement the device reference count and wait for the device to be 396 * removed. Open file descriptors for the device... */ 397 void vfio_unregister_group_dev(struct vfio_device *device) 398 { 399 unsigned int i = 0; 400 bool interrupted = false; 401 long rc; 402 403 /* 404 * Prevent new device opened by userspace via the 405 * VFIO_GROUP_GET_DEVICE_FD in the group path. 406 */ 407 vfio_device_group_unregister(device); 408 409 /* 410 * Balances vfio_device_add() in register path, also prevents 411 * new device opened by userspace in the cdev path. 412 */ 413 vfio_device_del(device); 414 415 vfio_device_put_registration(device); 416 rc = try_wait_for_completion(&device->comp); 417 while (rc <= 0) { 418 if (device->ops->request) 419 device->ops->request(device, i++); 420 421 if (interrupted) { 422 rc = wait_for_completion_timeout(&device->comp, 423 HZ * 10); 424 } else { 425 rc = wait_for_completion_interruptible_timeout( 426 &device->comp, HZ * 10); 427 if (rc < 0) { 428 interrupted = true; 429 dev_warn(device->dev, 430 "Device is currently in use, task" 431 " \"%s\" (%d) " 432 "blocked until device is released", 433 current->comm, task_pid_nr(current)); 434 } 435 } 436 } 437 438 vfio_device_debugfs_exit(device); 439 /* Balances vfio_device_set_group in register path */ 440 vfio_device_remove_group(device); 441 } 442 EXPORT_SYMBOL_GPL(vfio_unregister_group_dev); 443 444 #if IS_ENABLED(CONFIG_KVM) 445 void vfio_device_get_kvm_safe(struct vfio_device *device, struct kvm *kvm) 446 { 447 void (*pfn)(struct kvm *kvm); 448 bool (*fn)(struct kvm *kvm); 449 bool ret; 450 451 lockdep_assert_held(&device->dev_set->lock); 452 453 if (!kvm) 454 return; 455 456 pfn = symbol_get(kvm_put_kvm); 457 if (WARN_ON(!pfn)) 458 return; 459 460 fn = symbol_get(kvm_get_kvm_safe); 461 if (WARN_ON(!fn)) { 462 symbol_put(kvm_put_kvm); 463 return; 464 } 465 466 ret = fn(kvm); 467 symbol_put(kvm_get_kvm_safe); 468 if (!ret) { 469 symbol_put(kvm_put_kvm); 470 return; 471 } 472 473 device->put_kvm = pfn; 474 device->kvm = kvm; 475 } 476 477 void vfio_device_put_kvm(struct vfio_device *device) 478 { 479 lockdep_assert_held(&device->dev_set->lock); 480 481 if (!device->kvm) 482 return; 483 484 if (WARN_ON(!device->put_kvm)) 485 goto clear; 486 487 device->put_kvm(device->kvm); 488 device->put_kvm = NULL; 489 symbol_put(kvm_put_kvm); 490 491 clear: 492 device->kvm = NULL; 493 } 494 #endif 495 496 /* true if the vfio_device has open_device() called but not close_device() */ 497 static bool vfio_assert_device_open(struct vfio_device *device) 498 { 499 return !WARN_ON_ONCE(!READ_ONCE(device->open_count)); 500 } 501 502 struct vfio_device_file * 503 vfio_allocate_device_file(struct vfio_device *device) 504 { 505 struct vfio_device_file *df; 506 507 df = kzalloc_obj(*df, GFP_KERNEL_ACCOUNT); 508 if (!df) 509 return ERR_PTR(-ENOMEM); 510 511 df->device = device; 512 spin_lock_init(&df->kvm_ref_lock); 513 514 return df; 515 } 516 517 static int vfio_df_device_first_open(struct vfio_device_file *df) 518 { 519 struct vfio_device *device = df->device; 520 struct iommufd_ctx *iommufd = df->iommufd; 521 int ret; 522 523 lockdep_assert_held(&device->dev_set->lock); 524 525 if (!try_module_get(device->dev->driver->owner)) 526 return -ENODEV; 527 528 if (iommufd) 529 ret = vfio_df_iommufd_bind(df); 530 else 531 ret = vfio_device_group_use_iommu(device); 532 if (ret) 533 goto err_module_put; 534 535 if (device->ops->open_device) { 536 ret = device->ops->open_device(device); 537 if (ret) 538 goto err_unuse_iommu; 539 } 540 return 0; 541 542 err_unuse_iommu: 543 if (iommufd) 544 vfio_df_iommufd_unbind(df); 545 else 546 vfio_device_group_unuse_iommu(device); 547 err_module_put: 548 module_put(device->dev->driver->owner); 549 return ret; 550 } 551 552 static void vfio_df_device_last_close(struct vfio_device_file *df) 553 { 554 struct vfio_device *device = df->device; 555 struct iommufd_ctx *iommufd = df->iommufd; 556 557 lockdep_assert_held(&device->dev_set->lock); 558 559 if (device->ops->close_device) 560 device->ops->close_device(device); 561 if (iommufd) 562 vfio_df_iommufd_unbind(df); 563 else 564 vfio_device_group_unuse_iommu(device); 565 device->precopy_info_v2 = 0; 566 module_put(device->dev->driver->owner); 567 } 568 569 int vfio_df_open(struct vfio_device_file *df) 570 { 571 struct vfio_device *device = df->device; 572 int ret = 0; 573 574 lockdep_assert_held(&device->dev_set->lock); 575 576 /* 577 * Only the group path allows the device to be opened multiple 578 * times. The device cdev path doesn't have a secure way for it. 579 */ 580 if (device->open_count != 0 && !df->group) 581 return -EINVAL; 582 583 device->open_count++; 584 if (device->open_count == 1) { 585 ret = vfio_df_device_first_open(df); 586 if (ret) 587 device->open_count--; 588 } 589 590 return ret; 591 } 592 593 void vfio_df_close(struct vfio_device_file *df) 594 { 595 struct vfio_device *device = df->device; 596 597 lockdep_assert_held(&device->dev_set->lock); 598 599 if (!vfio_assert_device_open(device)) 600 return; 601 if (device->open_count == 1) 602 vfio_df_device_last_close(df); 603 device->open_count--; 604 } 605 606 /* 607 * Wrapper around pm_runtime_resume_and_get(). 608 * Return error code on failure or 0 on success. 609 */ 610 static inline int vfio_device_pm_runtime_get(struct vfio_device *device) 611 { 612 struct device *dev = device->dev; 613 614 if (dev->driver && dev->driver->pm) { 615 int ret; 616 617 ret = pm_runtime_resume_and_get(dev); 618 if (ret) { 619 dev_info_ratelimited(dev, 620 "vfio: runtime resume failed %d\n", ret); 621 return -EIO; 622 } 623 } 624 625 return 0; 626 } 627 628 /* 629 * Wrapper around pm_runtime_put(). 630 */ 631 static inline void vfio_device_pm_runtime_put(struct vfio_device *device) 632 { 633 struct device *dev = device->dev; 634 635 if (dev->driver && dev->driver->pm) 636 pm_runtime_put(dev); 637 } 638 639 /* 640 * VFIO Device fd 641 */ 642 static int vfio_device_fops_release(struct inode *inode, struct file *filep) 643 { 644 struct vfio_device_file *df = filep->private_data; 645 struct vfio_device *device = df->device; 646 647 if (df->group) 648 vfio_df_group_close(df); 649 else 650 vfio_df_unbind_iommufd(df); 651 652 vfio_device_put_registration(device); 653 654 kfree(df); 655 656 return 0; 657 } 658 659 /* 660 * vfio_mig_get_next_state - Compute the next step in the FSM 661 * @cur_fsm - The current state the device is in 662 * @new_fsm - The target state to reach 663 * @next_fsm - Pointer to the next step to get to new_fsm 664 * 665 * Return 0 upon success, otherwise -errno 666 * Upon success the next step in the state progression between cur_fsm and 667 * new_fsm will be set in next_fsm. 668 * 669 * This breaks down requests for combination transitions into smaller steps and 670 * returns the next step to get to new_fsm. The function may need to be called 671 * multiple times before reaching new_fsm. 672 * 673 */ 674 int vfio_mig_get_next_state(struct vfio_device *device, 675 enum vfio_device_mig_state cur_fsm, 676 enum vfio_device_mig_state new_fsm, 677 enum vfio_device_mig_state *next_fsm) 678 { 679 enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_PRE_COPY_P2P + 1 }; 680 /* 681 * The coding in this table requires the driver to implement the 682 * following FSM arcs: 683 * RESUMING -> STOP 684 * STOP -> RESUMING 685 * STOP -> STOP_COPY 686 * STOP_COPY -> STOP 687 * 688 * If P2P is supported then the driver must also implement these FSM 689 * arcs: 690 * RUNNING -> RUNNING_P2P 691 * RUNNING_P2P -> RUNNING 692 * RUNNING_P2P -> STOP 693 * STOP -> RUNNING_P2P 694 * 695 * If precopy is supported then the driver must support these additional 696 * FSM arcs: 697 * RUNNING -> PRE_COPY 698 * PRE_COPY -> RUNNING 699 * PRE_COPY -> STOP_COPY 700 * However, if precopy and P2P are supported together then the driver 701 * must support these additional arcs beyond the P2P arcs above: 702 * PRE_COPY -> RUNNING 703 * PRE_COPY -> PRE_COPY_P2P 704 * PRE_COPY_P2P -> PRE_COPY 705 * PRE_COPY_P2P -> RUNNING_P2P 706 * PRE_COPY_P2P -> STOP_COPY 707 * RUNNING -> PRE_COPY 708 * RUNNING_P2P -> PRE_COPY_P2P 709 * 710 * Without P2P and precopy the driver must implement: 711 * RUNNING -> STOP 712 * STOP -> RUNNING 713 * 714 * The coding will step through multiple states for some combination 715 * transitions; if all optional features are supported, this means the 716 * following ones: 717 * PRE_COPY -> PRE_COPY_P2P -> STOP_COPY 718 * PRE_COPY -> RUNNING -> RUNNING_P2P 719 * PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP 720 * PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP -> RESUMING 721 * PRE_COPY_P2P -> RUNNING_P2P -> RUNNING 722 * PRE_COPY_P2P -> RUNNING_P2P -> STOP 723 * PRE_COPY_P2P -> RUNNING_P2P -> STOP -> RESUMING 724 * RESUMING -> STOP -> RUNNING_P2P 725 * RESUMING -> STOP -> RUNNING_P2P -> PRE_COPY_P2P 726 * RESUMING -> STOP -> RUNNING_P2P -> RUNNING 727 * RESUMING -> STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY 728 * RESUMING -> STOP -> STOP_COPY 729 * RUNNING -> RUNNING_P2P -> PRE_COPY_P2P 730 * RUNNING -> RUNNING_P2P -> STOP 731 * RUNNING -> RUNNING_P2P -> STOP -> RESUMING 732 * RUNNING -> RUNNING_P2P -> STOP -> STOP_COPY 733 * RUNNING_P2P -> RUNNING -> PRE_COPY 734 * RUNNING_P2P -> STOP -> RESUMING 735 * RUNNING_P2P -> STOP -> STOP_COPY 736 * STOP -> RUNNING_P2P -> PRE_COPY_P2P 737 * STOP -> RUNNING_P2P -> RUNNING 738 * STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY 739 * STOP_COPY -> STOP -> RESUMING 740 * STOP_COPY -> STOP -> RUNNING_P2P 741 * STOP_COPY -> STOP -> RUNNING_P2P -> RUNNING 742 * 743 * The following transitions are blocked: 744 * STOP_COPY -> PRE_COPY 745 * STOP_COPY -> PRE_COPY_P2P 746 */ 747 static const u8 vfio_from_fsm_table[VFIO_DEVICE_NUM_STATES][VFIO_DEVICE_NUM_STATES] = { 748 [VFIO_DEVICE_STATE_STOP] = { 749 [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP, 750 [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P, 751 [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P, 752 [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P, 753 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY, 754 [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING, 755 [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P, 756 [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR, 757 }, 758 [VFIO_DEVICE_STATE_RUNNING] = { 759 [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P, 760 [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING, 761 [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY, 762 [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P, 763 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P, 764 [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P, 765 [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P, 766 [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR, 767 }, 768 [VFIO_DEVICE_STATE_PRE_COPY] = { 769 [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING, 770 [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING, 771 [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY, 772 [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P, 773 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_PRE_COPY_P2P, 774 [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING, 775 [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING, 776 [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR, 777 }, 778 [VFIO_DEVICE_STATE_PRE_COPY_P2P] = { 779 [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P, 780 [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P, 781 [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY, 782 [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P, 783 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY, 784 [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P, 785 [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P, 786 [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR, 787 }, 788 [VFIO_DEVICE_STATE_STOP_COPY] = { 789 [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP, 790 [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP, 791 [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR, 792 [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR, 793 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY, 794 [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP, 795 [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP, 796 [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR, 797 }, 798 [VFIO_DEVICE_STATE_RESUMING] = { 799 [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP, 800 [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP, 801 [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_STOP, 802 [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_STOP, 803 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP, 804 [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING, 805 [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP, 806 [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR, 807 }, 808 [VFIO_DEVICE_STATE_RUNNING_P2P] = { 809 [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP, 810 [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING, 811 [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING, 812 [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P, 813 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP, 814 [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP, 815 [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P, 816 [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR, 817 }, 818 [VFIO_DEVICE_STATE_ERROR] = { 819 [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_ERROR, 820 [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_ERROR, 821 [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR, 822 [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR, 823 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_ERROR, 824 [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_ERROR, 825 [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_ERROR, 826 [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR, 827 }, 828 }; 829 830 static const unsigned int state_flags_table[VFIO_DEVICE_NUM_STATES] = { 831 [VFIO_DEVICE_STATE_STOP] = VFIO_MIGRATION_STOP_COPY, 832 [VFIO_DEVICE_STATE_RUNNING] = VFIO_MIGRATION_STOP_COPY, 833 [VFIO_DEVICE_STATE_PRE_COPY] = 834 VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY, 835 [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_MIGRATION_STOP_COPY | 836 VFIO_MIGRATION_P2P | 837 VFIO_MIGRATION_PRE_COPY, 838 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_MIGRATION_STOP_COPY, 839 [VFIO_DEVICE_STATE_RESUMING] = VFIO_MIGRATION_STOP_COPY, 840 [VFIO_DEVICE_STATE_RUNNING_P2P] = 841 VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P, 842 [VFIO_DEVICE_STATE_ERROR] = ~0U, 843 }; 844 845 if (WARN_ON(cur_fsm >= ARRAY_SIZE(vfio_from_fsm_table) || 846 (state_flags_table[cur_fsm] & device->migration_flags) != 847 state_flags_table[cur_fsm])) 848 return -EINVAL; 849 850 if (new_fsm >= ARRAY_SIZE(vfio_from_fsm_table) || 851 (state_flags_table[new_fsm] & device->migration_flags) != 852 state_flags_table[new_fsm]) 853 return -EINVAL; 854 855 /* 856 * Arcs touching optional and unsupported states are skipped over. The 857 * driver will instead see an arc from the original state to the next 858 * logical state, as per the above comment. 859 */ 860 *next_fsm = vfio_from_fsm_table[cur_fsm][new_fsm]; 861 while (*next_fsm != VFIO_DEVICE_STATE_ERROR && 862 (state_flags_table[*next_fsm] & device->migration_flags) != 863 state_flags_table[*next_fsm]) 864 *next_fsm = vfio_from_fsm_table[*next_fsm][new_fsm]; 865 866 return (*next_fsm != VFIO_DEVICE_STATE_ERROR) ? 0 : -EINVAL; 867 } 868 EXPORT_SYMBOL_GPL(vfio_mig_get_next_state); 869 870 /* 871 * Convert the drivers's struct file into a FD number and return it to userspace 872 */ 873 static int vfio_ioct_mig_return_fd(struct file *filp, void __user *arg, 874 struct vfio_device_feature_mig_state *mig) 875 { 876 int ret; 877 int fd; 878 879 fd = get_unused_fd_flags(O_CLOEXEC); 880 if (fd < 0) { 881 ret = fd; 882 goto out_fput; 883 } 884 885 mig->data_fd = fd; 886 if (copy_to_user(arg, mig, sizeof(*mig))) { 887 ret = -EFAULT; 888 goto out_put_unused; 889 } 890 fd_install(fd, filp); 891 return 0; 892 893 out_put_unused: 894 put_unused_fd(fd); 895 out_fput: 896 fput(filp); 897 return ret; 898 } 899 900 static int 901 vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device, 902 u32 flags, void __user *arg, 903 size_t argsz) 904 { 905 size_t minsz = 906 offsetofend(struct vfio_device_feature_mig_state, data_fd); 907 struct vfio_device_feature_mig_state mig; 908 struct file *filp = NULL; 909 int ret; 910 911 if (!device->mig_ops) 912 return -ENOTTY; 913 914 ret = vfio_check_feature(flags, argsz, 915 VFIO_DEVICE_FEATURE_SET | 916 VFIO_DEVICE_FEATURE_GET, 917 sizeof(mig)); 918 if (ret != 1) 919 return ret; 920 921 if (copy_from_user(&mig, arg, minsz)) 922 return -EFAULT; 923 924 if (flags & VFIO_DEVICE_FEATURE_GET) { 925 enum vfio_device_mig_state curr_state; 926 927 ret = device->mig_ops->migration_get_state(device, 928 &curr_state); 929 if (ret) 930 return ret; 931 mig.device_state = curr_state; 932 goto out_copy; 933 } 934 935 /* Handle the VFIO_DEVICE_FEATURE_SET */ 936 filp = device->mig_ops->migration_set_state(device, mig.device_state); 937 if (IS_ERR(filp) || !filp) 938 goto out_copy; 939 940 return vfio_ioct_mig_return_fd(filp, arg, &mig); 941 out_copy: 942 mig.data_fd = -1; 943 if (copy_to_user(arg, &mig, sizeof(mig))) 944 return -EFAULT; 945 if (IS_ERR(filp)) 946 return PTR_ERR(filp); 947 return 0; 948 } 949 950 static int 951 vfio_ioctl_device_feature_migration_data_size(struct vfio_device *device, 952 u32 flags, void __user *arg, 953 size_t argsz) 954 { 955 struct vfio_device_feature_mig_data_size data_size = {}; 956 unsigned long stop_copy_length; 957 int ret; 958 959 if (!device->mig_ops) 960 return -ENOTTY; 961 962 ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET, 963 sizeof(data_size)); 964 if (ret != 1) 965 return ret; 966 967 ret = device->mig_ops->migration_get_data_size(device, &stop_copy_length); 968 if (ret) 969 return ret; 970 971 data_size.stop_copy_length = stop_copy_length; 972 if (copy_to_user(arg, &data_size, sizeof(data_size))) 973 return -EFAULT; 974 975 return 0; 976 } 977 978 static int 979 vfio_ioctl_device_feature_migration_precopy_info_v2(struct vfio_device *device, 980 u32 flags, size_t argsz) 981 { 982 int ret; 983 984 if (!(device->migration_flags & VFIO_MIGRATION_PRE_COPY)) 985 return -EINVAL; 986 987 ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET, 0); 988 if (ret != 1) 989 return ret; 990 991 device->precopy_info_v2 = 1; 992 return 0; 993 } 994 995 static int vfio_ioctl_device_feature_migration(struct vfio_device *device, 996 u32 flags, void __user *arg, 997 size_t argsz) 998 { 999 struct vfio_device_feature_migration mig = { 1000 .flags = device->migration_flags, 1001 }; 1002 int ret; 1003 1004 if (!device->mig_ops) 1005 return -ENOTTY; 1006 1007 ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET, 1008 sizeof(mig)); 1009 if (ret != 1) 1010 return ret; 1011 if (copy_to_user(arg, &mig, sizeof(mig))) 1012 return -EFAULT; 1013 return 0; 1014 } 1015 1016 void vfio_combine_iova_ranges(struct rb_root_cached *root, u32 cur_nodes, 1017 u32 req_nodes) 1018 { 1019 struct interval_tree_node *prev, *curr, *comb_start, *comb_end; 1020 unsigned long min_gap, curr_gap; 1021 1022 /* Special shortcut when a single range is required */ 1023 if (req_nodes == 1) { 1024 unsigned long last; 1025 1026 comb_start = interval_tree_iter_first(root, 0, ULONG_MAX); 1027 1028 /* Empty list */ 1029 if (WARN_ON_ONCE(!comb_start)) 1030 return; 1031 1032 curr = comb_start; 1033 while (curr) { 1034 last = curr->last; 1035 prev = curr; 1036 curr = interval_tree_iter_next(curr, 0, ULONG_MAX); 1037 if (prev != comb_start) 1038 interval_tree_remove(prev, root); 1039 } 1040 comb_start->last = last; 1041 return; 1042 } 1043 1044 /* Combine ranges which have the smallest gap */ 1045 while (cur_nodes > req_nodes) { 1046 prev = NULL; 1047 min_gap = ULONG_MAX; 1048 curr = interval_tree_iter_first(root, 0, ULONG_MAX); 1049 while (curr) { 1050 if (prev) { 1051 curr_gap = curr->start - prev->last; 1052 if (curr_gap < min_gap) { 1053 min_gap = curr_gap; 1054 comb_start = prev; 1055 comb_end = curr; 1056 } 1057 } 1058 prev = curr; 1059 curr = interval_tree_iter_next(curr, 0, ULONG_MAX); 1060 } 1061 1062 /* Empty list or no nodes to combine */ 1063 if (WARN_ON_ONCE(min_gap == ULONG_MAX)) 1064 break; 1065 1066 comb_start->last = comb_end->last; 1067 interval_tree_remove(comb_end, root); 1068 cur_nodes--; 1069 } 1070 } 1071 EXPORT_SYMBOL_GPL(vfio_combine_iova_ranges); 1072 1073 /* Ranges should fit into a single kernel page */ 1074 #define LOG_MAX_RANGES \ 1075 (PAGE_SIZE / sizeof(struct vfio_device_feature_dma_logging_range)) 1076 1077 static int 1078 vfio_ioctl_device_feature_logging_start(struct vfio_device *device, 1079 u32 flags, void __user *arg, 1080 size_t argsz) 1081 { 1082 size_t minsz = 1083 offsetofend(struct vfio_device_feature_dma_logging_control, 1084 ranges); 1085 struct vfio_device_feature_dma_logging_range __user *ranges; 1086 struct vfio_device_feature_dma_logging_control control; 1087 struct vfio_device_feature_dma_logging_range range; 1088 struct rb_root_cached root = RB_ROOT_CACHED; 1089 struct interval_tree_node *nodes; 1090 u64 iova_end; 1091 u32 nnodes; 1092 int i, ret; 1093 1094 if (!device->log_ops) 1095 return -ENOTTY; 1096 1097 ret = vfio_check_feature(flags, argsz, 1098 VFIO_DEVICE_FEATURE_SET, 1099 sizeof(control)); 1100 if (ret != 1) 1101 return ret; 1102 1103 if (copy_from_user(&control, arg, minsz)) 1104 return -EFAULT; 1105 1106 nnodes = control.num_ranges; 1107 if (!nnodes) 1108 return -EINVAL; 1109 1110 if (nnodes > LOG_MAX_RANGES) 1111 return -E2BIG; 1112 1113 ranges = u64_to_user_ptr(control.ranges); 1114 nodes = kmalloc_objs(struct interval_tree_node, nnodes); 1115 if (!nodes) 1116 return -ENOMEM; 1117 1118 for (i = 0; i < nnodes; i++) { 1119 if (copy_from_user(&range, &ranges[i], sizeof(range))) { 1120 ret = -EFAULT; 1121 goto end; 1122 } 1123 if (!IS_ALIGNED(range.iova, control.page_size) || 1124 !IS_ALIGNED(range.length, control.page_size)) { 1125 ret = -EINVAL; 1126 goto end; 1127 } 1128 1129 if (check_add_overflow(range.iova, range.length, &iova_end) || 1130 iova_end > ULONG_MAX) { 1131 ret = -EOVERFLOW; 1132 goto end; 1133 } 1134 1135 nodes[i].start = range.iova; 1136 nodes[i].last = range.iova + range.length - 1; 1137 if (interval_tree_iter_first(&root, nodes[i].start, 1138 nodes[i].last)) { 1139 /* Range overlapping */ 1140 ret = -EINVAL; 1141 goto end; 1142 } 1143 interval_tree_insert(nodes + i, &root); 1144 } 1145 1146 ret = device->log_ops->log_start(device, &root, nnodes, 1147 &control.page_size); 1148 if (ret) 1149 goto end; 1150 1151 if (copy_to_user(arg, &control, sizeof(control))) { 1152 ret = -EFAULT; 1153 device->log_ops->log_stop(device); 1154 } 1155 1156 end: 1157 kfree(nodes); 1158 return ret; 1159 } 1160 1161 static int 1162 vfio_ioctl_device_feature_logging_stop(struct vfio_device *device, 1163 u32 flags, void __user *arg, 1164 size_t argsz) 1165 { 1166 int ret; 1167 1168 if (!device->log_ops) 1169 return -ENOTTY; 1170 1171 ret = vfio_check_feature(flags, argsz, 1172 VFIO_DEVICE_FEATURE_SET, 0); 1173 if (ret != 1) 1174 return ret; 1175 1176 return device->log_ops->log_stop(device); 1177 } 1178 1179 static int vfio_device_log_read_and_clear(struct iova_bitmap *iter, 1180 unsigned long iova, size_t length, 1181 void *opaque) 1182 { 1183 struct vfio_device *device = opaque; 1184 1185 return device->log_ops->log_read_and_clear(device, iova, length, iter); 1186 } 1187 1188 static int 1189 vfio_ioctl_device_feature_logging_report(struct vfio_device *device, 1190 u32 flags, void __user *arg, 1191 size_t argsz) 1192 { 1193 size_t minsz = 1194 offsetofend(struct vfio_device_feature_dma_logging_report, 1195 bitmap); 1196 struct vfio_device_feature_dma_logging_report report; 1197 struct iova_bitmap *iter; 1198 u64 iova_end; 1199 int ret; 1200 1201 if (!device->log_ops) 1202 return -ENOTTY; 1203 1204 ret = vfio_check_feature(flags, argsz, 1205 VFIO_DEVICE_FEATURE_GET, 1206 sizeof(report)); 1207 if (ret != 1) 1208 return ret; 1209 1210 if (copy_from_user(&report, arg, minsz)) 1211 return -EFAULT; 1212 1213 if (report.page_size < SZ_4K || !is_power_of_2(report.page_size)) 1214 return -EINVAL; 1215 1216 if (check_add_overflow(report.iova, report.length, &iova_end) || 1217 iova_end > ULONG_MAX) 1218 return -EOVERFLOW; 1219 1220 iter = iova_bitmap_alloc(report.iova, report.length, 1221 report.page_size, 1222 u64_to_user_ptr(report.bitmap)); 1223 if (IS_ERR(iter)) 1224 return PTR_ERR(iter); 1225 1226 ret = iova_bitmap_for_each(iter, device, 1227 vfio_device_log_read_and_clear); 1228 1229 iova_bitmap_free(iter); 1230 return ret; 1231 } 1232 1233 static int vfio_ioctl_device_feature(struct vfio_device *device, 1234 struct vfio_device_feature __user *arg) 1235 { 1236 size_t minsz = offsetofend(struct vfio_device_feature, flags); 1237 struct vfio_device_feature feature; 1238 1239 if (copy_from_user(&feature, arg, minsz)) 1240 return -EFAULT; 1241 1242 if (feature.argsz < minsz) 1243 return -EINVAL; 1244 1245 /* Check unknown flags */ 1246 if (feature.flags & 1247 ~(VFIO_DEVICE_FEATURE_MASK | VFIO_DEVICE_FEATURE_SET | 1248 VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_PROBE)) 1249 return -EINVAL; 1250 1251 /* GET & SET are mutually exclusive except with PROBE */ 1252 if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) && 1253 (feature.flags & VFIO_DEVICE_FEATURE_SET) && 1254 (feature.flags & VFIO_DEVICE_FEATURE_GET)) 1255 return -EINVAL; 1256 1257 switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) { 1258 case VFIO_DEVICE_FEATURE_MIGRATION: 1259 return vfio_ioctl_device_feature_migration( 1260 device, feature.flags, arg->data, 1261 feature.argsz - minsz); 1262 case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE: 1263 return vfio_ioctl_device_feature_mig_device_state( 1264 device, feature.flags, arg->data, 1265 feature.argsz - minsz); 1266 case VFIO_DEVICE_FEATURE_DMA_LOGGING_START: 1267 return vfio_ioctl_device_feature_logging_start( 1268 device, feature.flags, arg->data, 1269 feature.argsz - minsz); 1270 case VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP: 1271 return vfio_ioctl_device_feature_logging_stop( 1272 device, feature.flags, arg->data, 1273 feature.argsz - minsz); 1274 case VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT: 1275 return vfio_ioctl_device_feature_logging_report( 1276 device, feature.flags, arg->data, 1277 feature.argsz - minsz); 1278 case VFIO_DEVICE_FEATURE_MIG_DATA_SIZE: 1279 return vfio_ioctl_device_feature_migration_data_size( 1280 device, feature.flags, arg->data, 1281 feature.argsz - minsz); 1282 case VFIO_DEVICE_FEATURE_MIG_PRECOPY_INFOv2: 1283 return vfio_ioctl_device_feature_migration_precopy_info_v2( 1284 device, feature.flags, feature.argsz - minsz); 1285 default: 1286 if (unlikely(!device->ops->device_feature)) 1287 return -ENOTTY; 1288 return device->ops->device_feature(device, feature.flags, 1289 arg->data, 1290 feature.argsz - minsz); 1291 } 1292 } 1293 1294 static long vfio_get_region_info(struct vfio_device *device, 1295 struct vfio_region_info __user *arg) 1296 { 1297 unsigned long minsz = offsetofend(struct vfio_region_info, offset); 1298 struct vfio_region_info info = {}; 1299 struct vfio_info_cap caps = {}; 1300 int ret; 1301 1302 if (unlikely(!device->ops->get_region_info_caps)) 1303 return -EINVAL; 1304 1305 if (copy_from_user(&info, arg, minsz)) 1306 return -EFAULT; 1307 if (info.argsz < minsz) 1308 return -EINVAL; 1309 1310 ret = device->ops->get_region_info_caps(device, &info, &caps); 1311 if (ret) 1312 goto out_free; 1313 1314 if (caps.size) { 1315 info.flags |= VFIO_REGION_INFO_FLAG_CAPS; 1316 if (info.argsz < sizeof(info) + caps.size) { 1317 info.argsz = sizeof(info) + caps.size; 1318 info.cap_offset = 0; 1319 } else { 1320 vfio_info_cap_shift(&caps, sizeof(info)); 1321 if (copy_to_user(arg + 1, caps.buf, caps.size)) { 1322 ret = -EFAULT; 1323 goto out_free; 1324 } 1325 info.cap_offset = sizeof(info); 1326 } 1327 } 1328 1329 if (copy_to_user(arg, &info, minsz)){ 1330 ret = -EFAULT; 1331 goto out_free; 1332 } 1333 1334 out_free: 1335 kfree(caps.buf); 1336 return ret; 1337 } 1338 1339 static long vfio_device_fops_unl_ioctl(struct file *filep, 1340 unsigned int cmd, unsigned long arg) 1341 { 1342 struct vfio_device_file *df = filep->private_data; 1343 struct vfio_device *device = df->device; 1344 void __user *uptr = (void __user *)arg; 1345 int ret; 1346 1347 if (cmd == VFIO_DEVICE_BIND_IOMMUFD) 1348 return vfio_df_ioctl_bind_iommufd(df, uptr); 1349 1350 /* Paired with smp_store_release() following vfio_df_open() */ 1351 if (!smp_load_acquire(&df->access_granted)) 1352 return -EINVAL; 1353 1354 ret = vfio_device_pm_runtime_get(device); 1355 if (ret) 1356 return ret; 1357 1358 /* cdev only ioctls */ 1359 if (IS_ENABLED(CONFIG_VFIO_DEVICE_CDEV) && !df->group) { 1360 switch (cmd) { 1361 case VFIO_DEVICE_ATTACH_IOMMUFD_PT: 1362 ret = vfio_df_ioctl_attach_pt(df, uptr); 1363 goto out; 1364 1365 case VFIO_DEVICE_DETACH_IOMMUFD_PT: 1366 ret = vfio_df_ioctl_detach_pt(df, uptr); 1367 goto out; 1368 } 1369 } 1370 1371 switch (cmd) { 1372 case VFIO_DEVICE_FEATURE: 1373 ret = vfio_ioctl_device_feature(device, uptr); 1374 break; 1375 1376 case VFIO_DEVICE_GET_REGION_INFO: 1377 ret = vfio_get_region_info(device, uptr); 1378 break; 1379 1380 default: 1381 if (unlikely(!device->ops->ioctl)) 1382 ret = -EINVAL; 1383 else 1384 ret = device->ops->ioctl(device, cmd, arg); 1385 break; 1386 } 1387 out: 1388 vfio_device_pm_runtime_put(device); 1389 return ret; 1390 } 1391 1392 static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf, 1393 size_t count, loff_t *ppos) 1394 { 1395 struct vfio_device_file *df = filep->private_data; 1396 struct vfio_device *device = df->device; 1397 1398 /* Paired with smp_store_release() following vfio_df_open() */ 1399 if (!smp_load_acquire(&df->access_granted)) 1400 return -EINVAL; 1401 1402 if (unlikely(!device->ops->read)) 1403 return -EINVAL; 1404 1405 return device->ops->read(device, buf, count, ppos); 1406 } 1407 1408 static ssize_t vfio_device_fops_write(struct file *filep, 1409 const char __user *buf, 1410 size_t count, loff_t *ppos) 1411 { 1412 struct vfio_device_file *df = filep->private_data; 1413 struct vfio_device *device = df->device; 1414 1415 /* Paired with smp_store_release() following vfio_df_open() */ 1416 if (!smp_load_acquire(&df->access_granted)) 1417 return -EINVAL; 1418 1419 if (unlikely(!device->ops->write)) 1420 return -EINVAL; 1421 1422 return device->ops->write(device, buf, count, ppos); 1423 } 1424 1425 static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma) 1426 { 1427 struct vfio_device_file *df = filep->private_data; 1428 struct vfio_device *device = df->device; 1429 1430 /* Paired with smp_store_release() following vfio_df_open() */ 1431 if (!smp_load_acquire(&df->access_granted)) 1432 return -EINVAL; 1433 1434 if (unlikely(!device->ops->mmap)) 1435 return -EINVAL; 1436 1437 return device->ops->mmap(device, vma); 1438 } 1439 1440 #ifdef CONFIG_PROC_FS 1441 static void vfio_device_show_fdinfo(struct seq_file *m, struct file *filep) 1442 { 1443 char *path; 1444 struct vfio_device_file *df = filep->private_data; 1445 struct vfio_device *device = df->device; 1446 1447 path = kobject_get_path(&device->dev->kobj, GFP_KERNEL); 1448 if (!path) 1449 return; 1450 1451 seq_printf(m, "vfio-device-syspath: /sys%s\n", path); 1452 kfree(path); 1453 } 1454 #endif 1455 1456 const struct file_operations vfio_device_fops = { 1457 .owner = THIS_MODULE, 1458 .open = vfio_device_fops_cdev_open, 1459 .release = vfio_device_fops_release, 1460 .read = vfio_device_fops_read, 1461 .write = vfio_device_fops_write, 1462 .unlocked_ioctl = vfio_device_fops_unl_ioctl, 1463 .compat_ioctl = compat_ptr_ioctl, 1464 .mmap = vfio_device_fops_mmap, 1465 #ifdef CONFIG_PROC_FS 1466 .show_fdinfo = vfio_device_show_fdinfo, 1467 #endif 1468 }; 1469 1470 static struct vfio_device *vfio_device_from_file(struct file *file) 1471 { 1472 struct vfio_device_file *df = file->private_data; 1473 1474 if (file->f_op != &vfio_device_fops) 1475 return NULL; 1476 return df->device; 1477 } 1478 1479 /** 1480 * vfio_file_is_valid - True if the file is valid vfio file 1481 * @file: VFIO group file or VFIO device file 1482 */ 1483 bool vfio_file_is_valid(struct file *file) 1484 { 1485 return vfio_group_from_file(file) || 1486 vfio_device_from_file(file); 1487 } 1488 EXPORT_SYMBOL_GPL(vfio_file_is_valid); 1489 1490 /** 1491 * vfio_file_enforced_coherent - True if the DMA associated with the VFIO file 1492 * is always CPU cache coherent 1493 * @file: VFIO group file or VFIO device file 1494 * 1495 * Enforced coherency means that the IOMMU ignores things like the PCIe no-snoop 1496 * bit in DMA transactions. A return of false indicates that the user has 1497 * rights to access additional instructions such as wbinvd on x86. 1498 */ 1499 bool vfio_file_enforced_coherent(struct file *file) 1500 { 1501 struct vfio_device *device; 1502 struct vfio_group *group; 1503 1504 group = vfio_group_from_file(file); 1505 if (group) 1506 return vfio_group_enforced_coherent(group); 1507 1508 device = vfio_device_from_file(file); 1509 if (device) 1510 return device_iommu_capable(device->dev, 1511 IOMMU_CAP_ENFORCE_CACHE_COHERENCY); 1512 1513 return true; 1514 } 1515 EXPORT_SYMBOL_GPL(vfio_file_enforced_coherent); 1516 1517 static void vfio_device_file_set_kvm(struct file *file, struct kvm *kvm) 1518 { 1519 struct vfio_device_file *df = file->private_data; 1520 1521 /* 1522 * The kvm is first recorded in the vfio_device_file, and will 1523 * be propagated to vfio_device::kvm when the file is bound to 1524 * iommufd successfully in the vfio device cdev path. 1525 */ 1526 spin_lock(&df->kvm_ref_lock); 1527 df->kvm = kvm; 1528 spin_unlock(&df->kvm_ref_lock); 1529 } 1530 1531 /** 1532 * vfio_file_set_kvm - Link a kvm with VFIO drivers 1533 * @file: VFIO group file or VFIO device file 1534 * @kvm: KVM to link 1535 * 1536 * When a VFIO device is first opened the KVM will be available in 1537 * device->kvm if one was associated with the file. 1538 */ 1539 void vfio_file_set_kvm(struct file *file, struct kvm *kvm) 1540 { 1541 struct vfio_group *group; 1542 1543 group = vfio_group_from_file(file); 1544 if (group) 1545 vfio_group_set_kvm(group, kvm); 1546 1547 if (vfio_device_from_file(file)) 1548 vfio_device_file_set_kvm(file, kvm); 1549 } 1550 EXPORT_SYMBOL_GPL(vfio_file_set_kvm); 1551 1552 /* 1553 * Sub-module support 1554 */ 1555 /* 1556 * Helper for managing a buffer of info chain capabilities, allocate or 1557 * reallocate a buffer with additional @size, filling in @id and @version 1558 * of the capability. A pointer to the new capability is returned. 1559 * 1560 * NB. The chain is based at the head of the buffer, so new entries are 1561 * added to the tail, vfio_info_cap_shift() should be called to fixup the 1562 * next offsets prior to copying to the user buffer. 1563 */ 1564 struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps, 1565 size_t size, u16 id, u16 version) 1566 { 1567 void *buf; 1568 struct vfio_info_cap_header *header, *tmp; 1569 1570 /* Ensure that the next capability struct will be aligned */ 1571 size = ALIGN(size, sizeof(u64)); 1572 1573 buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL); 1574 if (!buf) { 1575 kfree(caps->buf); 1576 caps->buf = NULL; 1577 caps->size = 0; 1578 return ERR_PTR(-ENOMEM); 1579 } 1580 1581 caps->buf = buf; 1582 header = buf + caps->size; 1583 1584 /* Eventually copied to user buffer, zero */ 1585 memset(header, 0, size); 1586 1587 header->id = id; 1588 header->version = version; 1589 1590 /* Add to the end of the capability chain */ 1591 for (tmp = buf; tmp->next; tmp = buf + tmp->next) 1592 ; /* nothing */ 1593 1594 tmp->next = caps->size; 1595 caps->size += size; 1596 1597 return header; 1598 } 1599 EXPORT_SYMBOL_GPL(vfio_info_cap_add); 1600 1601 void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset) 1602 { 1603 struct vfio_info_cap_header *tmp; 1604 void *buf = (void *)caps->buf; 1605 1606 /* Capability structs should start with proper alignment */ 1607 WARN_ON(!IS_ALIGNED(offset, sizeof(u64))); 1608 1609 for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset) 1610 tmp->next += offset; 1611 } 1612 EXPORT_SYMBOL(vfio_info_cap_shift); 1613 1614 int vfio_info_add_capability(struct vfio_info_cap *caps, 1615 struct vfio_info_cap_header *cap, size_t size) 1616 { 1617 struct vfio_info_cap_header *header; 1618 1619 header = vfio_info_cap_add(caps, size, cap->id, cap->version); 1620 if (IS_ERR(header)) 1621 return PTR_ERR(header); 1622 1623 memcpy(header + 1, cap + 1, size - sizeof(*header)); 1624 1625 return 0; 1626 } 1627 EXPORT_SYMBOL(vfio_info_add_capability); 1628 1629 int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs, 1630 int max_irq_type, size_t *data_size) 1631 { 1632 unsigned long minsz; 1633 size_t size; 1634 1635 minsz = offsetofend(struct vfio_irq_set, count); 1636 1637 if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) || 1638 (hdr->count >= (U32_MAX - hdr->start)) || 1639 (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK | 1640 VFIO_IRQ_SET_ACTION_TYPE_MASK))) 1641 return -EINVAL; 1642 1643 if (data_size) 1644 *data_size = 0; 1645 1646 if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs) 1647 return -EINVAL; 1648 1649 switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) { 1650 case VFIO_IRQ_SET_DATA_NONE: 1651 size = 0; 1652 break; 1653 case VFIO_IRQ_SET_DATA_BOOL: 1654 size = sizeof(uint8_t); 1655 break; 1656 case VFIO_IRQ_SET_DATA_EVENTFD: 1657 size = sizeof(int32_t); 1658 break; 1659 default: 1660 return -EINVAL; 1661 } 1662 1663 if (size) { 1664 if (hdr->argsz - minsz < hdr->count * size) 1665 return -EINVAL; 1666 1667 if (!data_size) 1668 return -EINVAL; 1669 1670 *data_size = hdr->count * size; 1671 } 1672 1673 return 0; 1674 } 1675 EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare); 1676 1677 /* 1678 * Pin contiguous user pages and return their associated host pages for local 1679 * domain only. 1680 * @device [in] : device 1681 * @iova [in] : starting IOVA of user pages to be pinned. 1682 * @npage [in] : count of pages to be pinned. This count should not 1683 * be greater than VFIO_PIN_PAGES_MAX_ENTRIES. 1684 * @prot [in] : protection flags 1685 * @pages[out] : array of host pages 1686 * Return error or number of pages pinned. 1687 * 1688 * A driver may only call this function if the vfio_device was created 1689 * by vfio_register_emulated_iommu_dev() due to vfio_device_container_pin_pages(). 1690 */ 1691 int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova, 1692 int npage, int prot, struct page **pages) 1693 { 1694 /* group->container cannot change while a vfio device is open */ 1695 if (!pages || !npage || WARN_ON(!vfio_assert_device_open(device))) 1696 return -EINVAL; 1697 if (!device->ops->dma_unmap) 1698 return -EINVAL; 1699 if (vfio_device_has_container(device)) 1700 return vfio_device_container_pin_pages(device, iova, 1701 npage, prot, pages); 1702 if (device->iommufd_access) { 1703 int ret; 1704 1705 if (iova > ULONG_MAX) 1706 return -EINVAL; 1707 /* 1708 * VFIO ignores the sub page offset, npages is from the start of 1709 * a PAGE_SIZE chunk of IOVA. The caller is expected to recover 1710 * the sub page offset by doing: 1711 * pages[0] + (iova % PAGE_SIZE) 1712 */ 1713 ret = iommufd_access_pin_pages( 1714 device->iommufd_access, ALIGN_DOWN(iova, PAGE_SIZE), 1715 npage * PAGE_SIZE, pages, 1716 (prot & IOMMU_WRITE) ? IOMMUFD_ACCESS_RW_WRITE : 0); 1717 if (ret) 1718 return ret; 1719 return npage; 1720 } 1721 return -EINVAL; 1722 } 1723 EXPORT_SYMBOL(vfio_pin_pages); 1724 1725 /* 1726 * Unpin contiguous host pages for local domain only. 1727 * @device [in] : device 1728 * @iova [in] : starting address of user pages to be unpinned. 1729 * @npage [in] : count of pages to be unpinned. This count should not 1730 * be greater than VFIO_PIN_PAGES_MAX_ENTRIES. 1731 */ 1732 void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage) 1733 { 1734 if (WARN_ON(!vfio_assert_device_open(device))) 1735 return; 1736 if (WARN_ON(!device->ops->dma_unmap)) 1737 return; 1738 1739 if (vfio_device_has_container(device)) { 1740 vfio_device_container_unpin_pages(device, iova, npage); 1741 return; 1742 } 1743 if (device->iommufd_access) { 1744 if (WARN_ON(iova > ULONG_MAX)) 1745 return; 1746 iommufd_access_unpin_pages(device->iommufd_access, 1747 ALIGN_DOWN(iova, PAGE_SIZE), 1748 npage * PAGE_SIZE); 1749 return; 1750 } 1751 } 1752 EXPORT_SYMBOL(vfio_unpin_pages); 1753 1754 /* 1755 * This interface allows the CPUs to perform some sort of virtual DMA on 1756 * behalf of the device. 1757 * 1758 * CPUs read/write from/into a range of IOVAs pointing to user space memory 1759 * into/from a kernel buffer. 1760 * 1761 * As the read/write of user space memory is conducted via the CPUs and is 1762 * not a real device DMA, it is not necessary to pin the user space memory. 1763 * 1764 * @device [in] : VFIO device 1765 * @iova [in] : base IOVA of a user space buffer 1766 * @data [in] : pointer to kernel buffer 1767 * @len [in] : kernel buffer length 1768 * @write : indicate read or write 1769 * Return error code on failure or 0 on success. 1770 */ 1771 int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data, 1772 size_t len, bool write) 1773 { 1774 if (!data || len <= 0 || !vfio_assert_device_open(device)) 1775 return -EINVAL; 1776 1777 if (vfio_device_has_container(device)) 1778 return vfio_device_container_dma_rw(device, iova, 1779 data, len, write); 1780 1781 if (device->iommufd_access) { 1782 unsigned int flags = 0; 1783 1784 if (iova > ULONG_MAX) 1785 return -EINVAL; 1786 1787 /* VFIO historically tries to auto-detect a kthread */ 1788 if (!current->mm) 1789 flags |= IOMMUFD_ACCESS_RW_KTHREAD; 1790 if (write) 1791 flags |= IOMMUFD_ACCESS_RW_WRITE; 1792 return iommufd_access_rw(device->iommufd_access, iova, data, 1793 len, flags); 1794 } 1795 return -EINVAL; 1796 } 1797 EXPORT_SYMBOL(vfio_dma_rw); 1798 1799 /* 1800 * Module/class support 1801 */ 1802 static int __init vfio_init(void) 1803 { 1804 int ret; 1805 1806 ida_init(&vfio.device_ida); 1807 1808 ret = vfio_group_init(); 1809 if (ret) 1810 return ret; 1811 1812 ret = vfio_virqfd_init(); 1813 if (ret) 1814 goto err_virqfd; 1815 1816 /* /sys/class/vfio-dev/vfioX */ 1817 ret = class_register(&vfio_device_class); 1818 if (ret) 1819 goto err_dev_class; 1820 1821 ret = vfio_cdev_init(); 1822 if (ret) 1823 goto err_alloc_dev_chrdev; 1824 1825 vfio_debugfs_create_root(); 1826 pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n"); 1827 return 0; 1828 1829 err_alloc_dev_chrdev: 1830 class_unregister(&vfio_device_class); 1831 err_dev_class: 1832 vfio_virqfd_exit(); 1833 err_virqfd: 1834 vfio_group_cleanup(); 1835 return ret; 1836 } 1837 1838 static void __exit vfio_cleanup(void) 1839 { 1840 vfio_debugfs_remove_root(); 1841 ida_destroy(&vfio.device_ida); 1842 vfio_cdev_cleanup(); 1843 class_unregister(&vfio_device_class); 1844 vfio_virqfd_exit(); 1845 vfio_group_cleanup(); 1846 xa_destroy(&vfio_device_set_xa); 1847 } 1848 1849 module_init(vfio_init); 1850 module_exit(vfio_cleanup); 1851 1852 MODULE_IMPORT_NS("IOMMUFD"); 1853 MODULE_VERSION(DRIVER_VERSION); 1854 MODULE_LICENSE("GPL v2"); 1855 MODULE_AUTHOR(DRIVER_AUTHOR); 1856 MODULE_DESCRIPTION(DRIVER_DESC); 1857 MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce"); 1858