// SPDX-License-Identifier: GPL-2.0-only
/*
 * VFIO core
 *
 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * Derived from original vfio:
 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
 * Author: Tom Lyon, pugs@cisco.com
 */

#include <linux/cdev.h>
#include <linux/compat.h>
#include <linux/device.h>
#include <linux/fs.h>
#include <linux/idr.h>
#include <linux/iommu.h>
#if IS_ENABLED(CONFIG_KVM)
#include <linux/kvm_host.h>
#endif
#include <linux/list.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/mount.h>
#include <linux/mutex.h>
#include <linux/pci.h>
#include <linux/pseudo_fs.h>
#include <linux/rwsem.h>
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/wait.h>
#include <linux/sched/signal.h>
#include <linux/pm_runtime.h>
#include <linux/interval_tree.h>
#include <linux/iova_bitmap.h>
#include <linux/iommufd.h>
#include "vfio.h"

#define DRIVER_VERSION	"0.3"
#define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC	"VFIO - User Level meta-driver"

#define VFIO_MAGIC 0x5646494f /* "VFIO" */

static struct vfio {
	struct ida		device_ida;
	struct vfsmount		*vfs_mount;
	int			fs_count;
} vfio;

#ifdef CONFIG_VFIO_NOIOMMU
bool vfio_noiommu __read_mostly;
module_param_named(enable_unsafe_noiommu_mode,
		   vfio_noiommu, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode. This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel. If you do not know what this is for, step away. (default: false)");
#endif
static DEFINE_XARRAY(vfio_device_set_xa);

/* Produces cdev nodes under /dev/vfio/devices/vfioX */
static char *vfio_device_devnode(const struct device *dev, umode_t *mode)
{
	return kasprintf(GFP_KERNEL, "vfio/devices/%s", dev_name(dev));
}

static const struct class vfio_device_class = {
	.name = "vfio-dev",
	.devnode = vfio_device_devnode,
};

int vfio_assign_device_set(struct vfio_device *device, void *set_id)
{
	unsigned long idx = (unsigned long)set_id;
	struct vfio_device_set *new_dev_set;
	struct vfio_device_set *dev_set;

	if (WARN_ON(!set_id))
		return -EINVAL;

	/*
	 * Atomically acquire a singleton object in the xarray for this set_id
	 */
	xa_lock(&vfio_device_set_xa);
	dev_set = xa_load(&vfio_device_set_xa, idx);
	if (dev_set)
		goto found_get_ref;
	xa_unlock(&vfio_device_set_xa);

	new_dev_set = kzalloc(sizeof(*new_dev_set), GFP_KERNEL);
	if (!new_dev_set)
		return -ENOMEM;
	mutex_init(&new_dev_set->lock);
	INIT_LIST_HEAD(&new_dev_set->device_list);
	new_dev_set->set_id = set_id;

	xa_lock(&vfio_device_set_xa);
	dev_set = __xa_cmpxchg(&vfio_device_set_xa, idx, NULL, new_dev_set,
			       GFP_KERNEL);
	if (!dev_set) {
		dev_set = new_dev_set;
		goto found_get_ref;
	}

	kfree(new_dev_set);
	if (xa_is_err(dev_set)) {
		xa_unlock(&vfio_device_set_xa);
		return xa_err(dev_set);
	}

found_get_ref:
	dev_set->device_count++;
	xa_unlock(&vfio_device_set_xa);
	mutex_lock(&dev_set->lock);
	device->dev_set = dev_set;
	list_add_tail(&device->dev_set_list, &dev_set->device_list);
	mutex_unlock(&dev_set->lock);
	return 0;
}
EXPORT_SYMBOL_GPL(vfio_assign_device_set);
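/*
 * Illustrative sketch (not upstream driver documentation): a driver can
 * pass any stable token as @set_id so that devices which must be reset
 * together land in one vfio_device_set.  vfio-pci, for instance, keys the
 * set on the PCI slot or bus.  The names my_vdev and my_probe below are
 * hypothetical:
 *
 *	static int my_probe(struct pci_dev *pdev, struct my_vdev *v)
 *	{
 *		return vfio_assign_device_set(&v->vdev,
 *					      pdev->slot ? (void *)pdev->slot :
 *							   (void *)pdev->bus);
 *	}
 */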
static void vfio_release_device_set(struct vfio_device *device)
{
	struct vfio_device_set *dev_set = device->dev_set;

	if (!dev_set)
		return;

	mutex_lock(&dev_set->lock);
	list_del(&device->dev_set_list);
	mutex_unlock(&dev_set->lock);

	xa_lock(&vfio_device_set_xa);
	if (!--dev_set->device_count) {
		__xa_erase(&vfio_device_set_xa,
			   (unsigned long)dev_set->set_id);
		mutex_destroy(&dev_set->lock);
		kfree(dev_set);
	}
	xa_unlock(&vfio_device_set_xa);
}

unsigned int vfio_device_set_open_count(struct vfio_device_set *dev_set)
{
	struct vfio_device *cur;
	unsigned int open_count = 0;

	lockdep_assert_held(&dev_set->lock);

	list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
		open_count += cur->open_count;
	return open_count;
}
EXPORT_SYMBOL_GPL(vfio_device_set_open_count);

struct vfio_device *
vfio_find_device_in_devset(struct vfio_device_set *dev_set,
			   struct device *dev)
{
	struct vfio_device *cur;

	lockdep_assert_held(&dev_set->lock);

	list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
		if (cur->dev == dev)
			return cur;
	return NULL;
}
EXPORT_SYMBOL_GPL(vfio_find_device_in_devset);

/*
 * Device objects - create, release, get, put, search
 */
/* Device reference always implies a group reference */
void vfio_device_put_registration(struct vfio_device *device)
{
	if (refcount_dec_and_test(&device->refcount))
		complete(&device->comp);
}
EXPORT_SYMBOL_GPL(vfio_device_put_registration);

bool vfio_device_try_get_registration(struct vfio_device *device)
{
	return refcount_inc_not_zero(&device->refcount);
}
EXPORT_SYMBOL_GPL(vfio_device_try_get_registration);

/*
 * VFIO driver API
 */
/* Release helper called by vfio_put_device() */
static void vfio_device_release(struct device *dev)
{
	struct vfio_device *device =
			container_of(dev, struct vfio_device, device);

	vfio_release_device_set(device);
	ida_free(&vfio.device_ida, device->index);

	if (device->ops->release)
		device->ops->release(device);

	iput(device->inode);
	simple_release_fs(&vfio.vfs_mount, &vfio.fs_count);
	kvfree(device);
}

static int vfio_init_device(struct vfio_device *device, struct device *dev,
			    const struct vfio_device_ops *ops);

/*
 * Allocate and initialize vfio_device so it can be registered to vfio
 * core.
 *
 * Drivers should use the wrapper vfio_alloc_device() for allocation.
 * @size is the size of the structure to be allocated, including any
 * private data used by the driver.
 *
 * Driver may provide an @init callback to cover device private data.
 *
 * Use vfio_put_device() to release the structure after a successful
 * return.
 */
struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev,
				       const struct vfio_device_ops *ops)
{
	struct vfio_device *device;
	int ret;

	if (WARN_ON(size < sizeof(struct vfio_device)))
		return ERR_PTR(-EINVAL);

	device = kvzalloc(size, GFP_KERNEL);
	if (!device)
		return ERR_PTR(-ENOMEM);

	ret = vfio_init_device(device, dev, ops);
	if (ret)
		goto out_free;
	return device;

out_free:
	kvfree(device);
	return ERR_PTR(ret);
}
EXPORT_SYMBOL_GPL(_vfio_alloc_device);

static int vfio_fs_init_fs_context(struct fs_context *fc)
{
	return init_pseudo(fc, VFIO_MAGIC) ? 0 : -ENOMEM;
}

static struct file_system_type vfio_fs_type = {
	.name = "vfio",
	.owner = THIS_MODULE,
	.init_fs_context = vfio_fs_init_fs_context,
	.kill_sb = kill_anon_super,
};

static struct inode *vfio_fs_inode_new(void)
{
	struct inode *inode;
	int ret;

	ret = simple_pin_fs(&vfio_fs_type, &vfio.vfs_mount, &vfio.fs_count);
	if (ret)
		return ERR_PTR(ret);

	inode = alloc_anon_inode(vfio.vfs_mount->mnt_sb);
	if (IS_ERR(inode))
		simple_release_fs(&vfio.vfs_mount, &vfio.fs_count);

	return inode;
}

/*
 * Initialize a vfio_device so it can be registered to vfio core.
 */
static int vfio_init_device(struct vfio_device *device, struct device *dev,
			    const struct vfio_device_ops *ops)
{
	int ret;

	ret = ida_alloc_max(&vfio.device_ida, MINORMASK, GFP_KERNEL);
	if (ret < 0) {
		dev_dbg(dev, "Failed to allocate device index\n");
		return ret;
	}

	device->index = ret;
	init_completion(&device->comp);
	device->dev = dev;
	device->ops = ops;
	device->inode = vfio_fs_inode_new();
	if (IS_ERR(device->inode)) {
		ret = PTR_ERR(device->inode);
		goto out_inode;
	}

	if (ops->init) {
		ret = ops->init(device);
		if (ret)
			goto out_uninit;
	}

	device_initialize(&device->device);
	device->device.release = vfio_device_release;
	device->device.class = &vfio_device_class;
	device->device.parent = device->dev;
	return 0;

out_uninit:
	iput(device->inode);
	simple_release_fs(&vfio.vfs_mount, &vfio.fs_count);
out_inode:
	vfio_release_device_set(device);
	ida_free(&vfio.device_ida, device->index);
	return ret;
}

static int __vfio_register_dev(struct vfio_device *device,
			       enum vfio_group_type type)
{
	int ret;

	if (WARN_ON(IS_ENABLED(CONFIG_IOMMUFD) &&
		    (!device->ops->bind_iommufd ||
		     !device->ops->unbind_iommufd ||
		     !device->ops->attach_ioas ||
		     !device->ops->detach_ioas)))
		return -EINVAL;

	/*
	 * If the driver doesn't specify a set then the device is added to a
	 * singleton set just for itself.
	 */
	if (!device->dev_set)
		vfio_assign_device_set(device, device);

	ret = dev_set_name(&device->device, "vfio%d", device->index);
	if (ret)
		return ret;

	ret = vfio_device_set_group(device, type);
	if (ret)
		return ret;

	/*
	 * VFIO always sets IOMMU_CACHE because we offer no way for userspace
	 * to restore cache coherency. It has to be checked here because it is
	 * only valid for cases where we are using iommu groups.
	 */
	if (type == VFIO_IOMMU && !vfio_device_is_noiommu(device) &&
	    !device_iommu_capable(device->dev, IOMMU_CAP_CACHE_COHERENCY)) {
		ret = -EINVAL;
		goto err_out;
	}

	ret = vfio_device_add(device);
	if (ret)
		goto err_out;

	/* Refcounting can't start until the driver calls register */
	refcount_set(&device->refcount, 1);

	vfio_device_group_register(device);
	vfio_device_debugfs_init(device);

	return 0;
err_out:
	vfio_device_remove_group(device);
	return ret;
}

int vfio_register_group_dev(struct vfio_device *device)
{
	return __vfio_register_dev(device, VFIO_IOMMU);
}
EXPORT_SYMBOL_GPL(vfio_register_group_dev);

/*
 * Register a virtual device without IOMMU backing. The user of this
 * device must not be able to directly trigger unmediated DMA.
 */
int vfio_register_emulated_iommu_dev(struct vfio_device *device)
{
	return __vfio_register_dev(device, VFIO_EMULATED_IOMMU);
}
EXPORT_SYMBOL_GPL(vfio_register_emulated_iommu_dev);
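/*
 * Illustrative registration lifecycle (a sketch; struct my_device, its
 * vdev member and my_ops are hypothetical driver names):
 *
 *	struct my_device *my = vfio_alloc_device(my_device, vdev,
 *						 &pdev->dev, &my_ops);
 *	if (IS_ERR(my))
 *		return PTR_ERR(my);
 *
 *	ret = vfio_register_group_dev(&my->vdev);
 *	if (ret) {
 *		vfio_put_device(&my->vdev);
 *		return ret;
 *	}
 *	...
 *	vfio_unregister_group_dev(&my->vdev);
 *	vfio_put_device(&my->vdev);
 */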
/*
 * Decrement the device reference count and wait for the device to be
 * removed. Open file descriptors for the device... */
void vfio_unregister_group_dev(struct vfio_device *device)
{
	unsigned int i = 0;
	bool interrupted = false;
	long rc;

	/*
	 * Prevent new device fds from being opened by userspace via
	 * VFIO_GROUP_GET_DEVICE_FD in the group path.
	 */
	vfio_device_group_unregister(device);

	/*
	 * Balances vfio_device_add() in the register path, and also prevents
	 * new device fds from being opened by userspace in the cdev path.
	 */
	vfio_device_del(device);

	vfio_device_put_registration(device);
	rc = try_wait_for_completion(&device->comp);
	while (rc <= 0) {
		if (device->ops->request)
			device->ops->request(device, i++);

		if (interrupted) {
			rc = wait_for_completion_timeout(&device->comp,
							 HZ * 10);
		} else {
			rc = wait_for_completion_interruptible_timeout(
				&device->comp, HZ * 10);
			if (rc < 0) {
				interrupted = true;
				dev_warn(device->dev,
					 "Device is currently in use, task \"%s\" (%d) blocked until device is released",
					 current->comm, task_pid_nr(current));
			}
		}
	}

	vfio_device_debugfs_exit(device);
	/* Balances vfio_device_set_group() in the register path */
	vfio_device_remove_group(device);
}
EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);

#if IS_ENABLED(CONFIG_KVM)
void vfio_device_get_kvm_safe(struct vfio_device *device, struct kvm *kvm)
{
	void (*pfn)(struct kvm *kvm);
	bool (*fn)(struct kvm *kvm);
	bool ret;

	lockdep_assert_held(&device->dev_set->lock);

	if (!kvm)
		return;

	pfn = symbol_get(kvm_put_kvm);
	if (WARN_ON(!pfn))
		return;

	fn = symbol_get(kvm_get_kvm_safe);
	if (WARN_ON(!fn)) {
		symbol_put(kvm_put_kvm);
		return;
	}

	ret = fn(kvm);
	symbol_put(kvm_get_kvm_safe);
	if (!ret) {
		symbol_put(kvm_put_kvm);
		return;
	}

	device->put_kvm = pfn;
	device->kvm = kvm;
}

void vfio_device_put_kvm(struct vfio_device *device)
{
	lockdep_assert_held(&device->dev_set->lock);

	if (!device->kvm)
		return;

	if (WARN_ON(!device->put_kvm))
		goto clear;

	device->put_kvm(device->kvm);
	device->put_kvm = NULL;
	symbol_put(kvm_put_kvm);

clear:
	device->kvm = NULL;
}
#endif

/* true if the vfio_device has open_device() called but not close_device() */
static bool vfio_assert_device_open(struct vfio_device *device)
{
	return !WARN_ON_ONCE(!READ_ONCE(device->open_count));
}

struct vfio_device_file *
vfio_allocate_device_file(struct vfio_device *device)
{
	struct vfio_device_file *df;

	df = kzalloc(sizeof(*df), GFP_KERNEL_ACCOUNT);
	if (!df)
		return ERR_PTR(-ENOMEM);

	df->device = device;
	spin_lock_init(&df->kvm_ref_lock);

	return df;
}

static int vfio_df_device_first_open(struct vfio_device_file *df)
{
	struct vfio_device *device = df->device;
	struct iommufd_ctx *iommufd = df->iommufd;
	int ret;

	lockdep_assert_held(&device->dev_set->lock);

	if (!try_module_get(device->dev->driver->owner))
		return -ENODEV;

	if (iommufd)
		ret = vfio_df_iommufd_bind(df);
	else
		ret = vfio_device_group_use_iommu(device);
	if (ret)
		goto err_module_put;

	if (device->ops->open_device) {
		ret = device->ops->open_device(device);
		if (ret)
			goto err_unuse_iommu;
	}
	return 0;

err_unuse_iommu:
	if (iommufd)
		vfio_df_iommufd_unbind(df);
	else
		vfio_device_group_unuse_iommu(device);
err_module_put:
	module_put(device->dev->driver->owner);
	return ret;
}
static void vfio_df_device_last_close(struct vfio_device_file *df)
{
	struct vfio_device *device = df->device;
	struct iommufd_ctx *iommufd = df->iommufd;

	lockdep_assert_held(&device->dev_set->lock);

	if (device->ops->close_device)
		device->ops->close_device(device);
	if (iommufd)
		vfio_df_iommufd_unbind(df);
	else
		vfio_device_group_unuse_iommu(device);
	device->precopy_info_v2 = 0;
	module_put(device->dev->driver->owner);
}

int vfio_df_open(struct vfio_device_file *df)
{
	struct vfio_device *device = df->device;
	int ret = 0;

	lockdep_assert_held(&device->dev_set->lock);

	/*
	 * Only the group path allows the device to be opened multiple
	 * times. The device cdev path doesn't have a secure way for it.
	 */
	if (device->open_count != 0 && !df->group)
		return -EINVAL;

	device->open_count++;
	if (device->open_count == 1) {
		ret = vfio_df_device_first_open(df);
		if (ret)
			device->open_count--;
	}

	return ret;
}

void vfio_df_close(struct vfio_device_file *df)
{
	struct vfio_device *device = df->device;

	lockdep_assert_held(&device->dev_set->lock);

	if (!vfio_assert_device_open(device))
		return;
	if (device->open_count == 1)
		vfio_df_device_last_close(df);
	device->open_count--;
}
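/*
 * Sketch of the calling convention (illustrative only): both helpers
 * above assert that the caller already holds the dev_set lock, e.g.:
 *
 *	mutex_lock(&device->dev_set->lock);
 *	ret = vfio_df_open(df);
 *	mutex_unlock(&device->dev_set->lock);
 */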
/*
 * Wrapper around pm_runtime_resume_and_get().
 * Return error code on failure or 0 on success.
 */
static inline int vfio_device_pm_runtime_get(struct vfio_device *device)
{
	struct device *dev = device->dev;

	if (dev->driver && dev->driver->pm) {
		int ret;

		ret = pm_runtime_resume_and_get(dev);
		if (ret) {
			dev_info_ratelimited(dev,
				"vfio: runtime resume failed %d\n", ret);
			return -EIO;
		}
	}

	return 0;
}

/*
 * Wrapper around pm_runtime_put().
 */
static inline void vfio_device_pm_runtime_put(struct vfio_device *device)
{
	struct device *dev = device->dev;

	if (dev->driver && dev->driver->pm)
		pm_runtime_put(dev);
}

/*
 * VFIO Device fd
 */
static int vfio_device_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_device_file *df = filep->private_data;
	struct vfio_device *device = df->device;

	if (df->group)
		vfio_df_group_close(df);
	else
		vfio_df_unbind_iommufd(df);

	vfio_device_put_registration(device);

	kfree(df);

	return 0;
}

/*
 * vfio_mig_get_next_state - Compute the next step in the FSM
 * @cur_fsm - The current state the device is in
 * @new_fsm - The target state to reach
 * @next_fsm - Pointer to the next step to get to new_fsm
 *
 * Return 0 upon success, otherwise -errno
 * Upon success the next step in the state progression between cur_fsm and
 * new_fsm will be set in next_fsm.
 *
 * This breaks down requests for combination transitions into smaller steps
 * and returns the next step to get to new_fsm. The function may need to be
 * called multiple times before reaching new_fsm.
 */
int vfio_mig_get_next_state(struct vfio_device *device,
			    enum vfio_device_mig_state cur_fsm,
			    enum vfio_device_mig_state new_fsm,
			    enum vfio_device_mig_state *next_fsm)
{
	enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_PRE_COPY_P2P + 1 };
	/*
	 * The coding in this table requires the driver to implement the
	 * following FSM arcs:
	 *         RESUMING -> STOP
	 *         STOP -> RESUMING
	 *         STOP -> STOP_COPY
	 *         STOP_COPY -> STOP
	 *
	 * If P2P is supported then the driver must also implement these FSM
	 * arcs:
	 *         RUNNING -> RUNNING_P2P
	 *         RUNNING_P2P -> RUNNING
	 *         RUNNING_P2P -> STOP
	 *         STOP -> RUNNING_P2P
	 *
	 * If precopy is supported then the driver must support these
	 * additional FSM arcs:
	 *         RUNNING -> PRE_COPY
	 *         PRE_COPY -> RUNNING
	 *         PRE_COPY -> STOP_COPY
	 * However, if precopy and P2P are supported together then the driver
	 * must support these additional arcs beyond the P2P arcs above:
	 *         PRE_COPY -> RUNNING
	 *         PRE_COPY -> PRE_COPY_P2P
	 *         PRE_COPY_P2P -> PRE_COPY
	 *         PRE_COPY_P2P -> RUNNING_P2P
	 *         PRE_COPY_P2P -> STOP_COPY
	 *         RUNNING -> PRE_COPY
	 *         RUNNING_P2P -> PRE_COPY_P2P
	 *
	 * Without P2P and precopy the driver must implement:
	 *         RUNNING -> STOP
	 *         STOP -> RUNNING
	 *
	 * The coding will step through multiple states for some combination
	 * transitions; if all optional features are supported, this means the
	 * following ones:
	 *         PRE_COPY -> PRE_COPY_P2P -> STOP_COPY
	 *         PRE_COPY -> RUNNING -> RUNNING_P2P
	 *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP
	 *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP -> RESUMING
	 *         PRE_COPY_P2P -> RUNNING_P2P -> RUNNING
	 *         PRE_COPY_P2P -> RUNNING_P2P -> STOP
	 *         PRE_COPY_P2P -> RUNNING_P2P -> STOP -> RESUMING
	 *         RESUMING -> STOP -> RUNNING_P2P
	 *         RESUMING -> STOP -> RUNNING_P2P -> PRE_COPY_P2P
	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING
	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
	 *         RESUMING -> STOP -> STOP_COPY
	 *         RUNNING -> RUNNING_P2P -> PRE_COPY_P2P
	 *         RUNNING -> RUNNING_P2P -> STOP
	 *         RUNNING -> RUNNING_P2P -> STOP -> RESUMING
	 *         RUNNING -> RUNNING_P2P -> STOP -> STOP_COPY
	 *         RUNNING_P2P -> RUNNING -> PRE_COPY
	 *         RUNNING_P2P -> STOP -> RESUMING
	 *         RUNNING_P2P -> STOP -> STOP_COPY
	 *         STOP -> RUNNING_P2P -> PRE_COPY_P2P
	 *         STOP -> RUNNING_P2P -> RUNNING
	 *         STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
	 *         STOP_COPY -> STOP -> RESUMING
	 *         STOP_COPY -> STOP -> RUNNING_P2P
	 *         STOP_COPY -> STOP -> RUNNING_P2P -> RUNNING
	 *
	 * The following transitions are blocked:
	 *         STOP_COPY -> PRE_COPY
	 *         STOP_COPY -> PRE_COPY_P2P
	 */
	static const u8 vfio_from_fsm_table[VFIO_DEVICE_NUM_STATES][VFIO_DEVICE_NUM_STATES] = {
		[VFIO_DEVICE_STATE_STOP] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_RUNNING] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_PRE_COPY] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_PRE_COPY_P2P] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_STOP_COPY] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_RESUMING] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_RUNNING_P2P] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_ERROR] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
	};

	static const unsigned int state_flags_table[VFIO_DEVICE_NUM_STATES] = {
		[VFIO_DEVICE_STATE_STOP] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_RUNNING] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_PRE_COPY] =
			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY,
		[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_MIGRATION_STOP_COPY |
						   VFIO_MIGRATION_P2P |
						   VFIO_MIGRATION_PRE_COPY,
		[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_RESUMING] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_RUNNING_P2P] =
			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P,
		[VFIO_DEVICE_STATE_ERROR] = ~0U,
	};

	if (WARN_ON(cur_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
		    (state_flags_table[cur_fsm] & device->migration_flags) !=
			state_flags_table[cur_fsm]))
		return -EINVAL;

	if (new_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
	    (state_flags_table[new_fsm] & device->migration_flags) !=
			state_flags_table[new_fsm])
		return -EINVAL;

	/*
	 * Arcs touching optional and unsupported states are skipped over. The
	 * driver will instead see an arc from the original state to the next
	 * logical state, as per the above comment.
	 */
	*next_fsm = vfio_from_fsm_table[cur_fsm][new_fsm];
	while ((state_flags_table[*next_fsm] & device->migration_flags) !=
			state_flags_table[*next_fsm])
		*next_fsm = vfio_from_fsm_table[*next_fsm][new_fsm];

	return (*next_fsm != VFIO_DEVICE_STATE_ERROR) ? 0 : -EINVAL;
}
EXPORT_SYMBOL_GPL(vfio_mig_get_next_state);
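/*
 * Illustrative sketch of how a migration driver typically consumes the
 * helper above from its migration_set_state() callback (my_execute_arc
 * is a hypothetical per-arc handler):
 *
 *	while (cur != new) {
 *		enum vfio_device_mig_state next;
 *
 *		ret = vfio_mig_get_next_state(vdev, cur, new, &next);
 *		if (ret)
 *			return ERR_PTR(ret);
 *		ret = my_execute_arc(vdev, cur, next);
 *		if (ret)
 *			return ERR_PTR(ret);
 *		cur = next;
 *	}
 */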
/*
 * Convert the driver's struct file into an FD number and return it to
 * userspace.
 */
static int vfio_ioct_mig_return_fd(struct file *filp, void __user *arg,
				   struct vfio_device_feature_mig_state *mig)
{
	int ret;
	int fd;

	fd = get_unused_fd_flags(O_CLOEXEC);
	if (fd < 0) {
		ret = fd;
		goto out_fput;
	}

	mig->data_fd = fd;
	if (copy_to_user(arg, mig, sizeof(*mig))) {
		ret = -EFAULT;
		goto out_put_unused;
	}
	fd_install(fd, filp);
	return 0;

out_put_unused:
	put_unused_fd(fd);
out_fput:
	fput(filp);
	return ret;
}

static int
vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device,
					   u32 flags, void __user *arg,
					   size_t argsz)
{
	size_t minsz =
		offsetofend(struct vfio_device_feature_mig_state, data_fd);
	struct vfio_device_feature_mig_state mig;
	struct file *filp = NULL;
	int ret;

	if (!device->mig_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_SET |
				 VFIO_DEVICE_FEATURE_GET,
				 sizeof(mig));
	if (ret != 1)
		return ret;

	if (copy_from_user(&mig, arg, minsz))
		return -EFAULT;

	if (flags & VFIO_DEVICE_FEATURE_GET) {
		enum vfio_device_mig_state curr_state;

		ret = device->mig_ops->migration_get_state(device,
							   &curr_state);
		if (ret)
			return ret;
		mig.device_state = curr_state;
		goto out_copy;
	}

	/* Handle the VFIO_DEVICE_FEATURE_SET */
	filp = device->mig_ops->migration_set_state(device, mig.device_state);
	if (IS_ERR(filp) || !filp)
		goto out_copy;

	return vfio_ioct_mig_return_fd(filp, arg, &mig);
out_copy:
	mig.data_fd = -1;
	if (copy_to_user(arg, &mig, sizeof(mig)))
		return -EFAULT;
	if (IS_ERR(filp))
		return PTR_ERR(filp);
	return 0;
}

static int
vfio_ioctl_device_feature_migration_data_size(struct vfio_device *device,
					      u32 flags, void __user *arg,
					      size_t argsz)
{
	struct vfio_device_feature_mig_data_size data_size = {};
	unsigned long stop_copy_length;
	int ret;

	if (!device->mig_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
				 sizeof(data_size));
	if (ret != 1)
		return ret;

	ret = device->mig_ops->migration_get_data_size(device,
						       &stop_copy_length);
	if (ret)
		return ret;

	data_size.stop_copy_length = stop_copy_length;
	if (copy_to_user(arg, &data_size, sizeof(data_size)))
		return -EFAULT;

	return 0;
}

static int
vfio_ioctl_device_feature_migration_precopy_info_v2(struct vfio_device *device,
						    u32 flags, size_t argsz)
{
	int ret;

	if (!(device->migration_flags & VFIO_MIGRATION_PRE_COPY))
		return -EINVAL;

	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET, 0);
	if (ret != 1)
		return ret;

	device->precopy_info_v2 = 1;
	return 0;
}

static int vfio_ioctl_device_feature_migration(struct vfio_device *device,
					       u32 flags, void __user *arg,
					       size_t argsz)
{
	struct vfio_device_feature_migration mig = {
		.flags = device->migration_flags,
	};
	int ret;

	if (!device->mig_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
				 sizeof(mig));
	if (ret != 1)
		return ret;
	if (copy_to_user(arg, &mig, sizeof(mig)))
		return -EFAULT;
	return 0;
}

void vfio_combine_iova_ranges(struct rb_root_cached *root, u32 cur_nodes,
			      u32 req_nodes)
{
	struct interval_tree_node *prev, *curr, *comb_start, *comb_end;
	unsigned long min_gap, curr_gap;

	/* Special shortcut when a single range is required */
	if (req_nodes == 1) {
		unsigned long last;

		comb_start = interval_tree_iter_first(root, 0, ULONG_MAX);

		/* Empty list */
		if (WARN_ON_ONCE(!comb_start))
			return;

		curr = comb_start;
		while (curr) {
			last = curr->last;
			prev = curr;
			curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
			if (prev != comb_start)
				interval_tree_remove(prev, root);
		}
		comb_start->last = last;
		return;
	}

	/* Combine ranges which have the smallest gap */
	while (cur_nodes > req_nodes) {
		prev = NULL;
		min_gap = ULONG_MAX;
		curr = interval_tree_iter_first(root, 0, ULONG_MAX);
		while (curr) {
			if (prev) {
				curr_gap = curr->start - prev->last;
				if (curr_gap < min_gap) {
					min_gap = curr_gap;
					comb_start = prev;
					comb_end = curr;
				}
			}
			prev = curr;
			curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
		}

		/* Empty list or no nodes to combine */
		if (WARN_ON_ONCE(min_gap == ULONG_MAX))
			break;

		comb_start->last = comb_end->last;
		interval_tree_remove(comb_end, root);
		cur_nodes--;
	}
}
EXPORT_SYMBOL_GPL(vfio_combine_iova_ranges);
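/*
 * Worked example (illustrative): given the ranges [0x0, 0xfff],
 * [0x2000, 0x2fff] and [0x10000, 0x10fff] with req_nodes == 2, the pair
 * separated by the smallest gap is merged first, yielding
 * [0x0, 0x2fff] and [0x10000, 0x10fff].
 */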
/* Ranges should fit into a single kernel page */
#define LOG_MAX_RANGES \
	(PAGE_SIZE / sizeof(struct vfio_device_feature_dma_logging_range))

static int
vfio_ioctl_device_feature_logging_start(struct vfio_device *device,
					u32 flags, void __user *arg,
					size_t argsz)
{
	size_t minsz =
		offsetofend(struct vfio_device_feature_dma_logging_control,
			    ranges);
	struct vfio_device_feature_dma_logging_range __user *ranges;
	struct vfio_device_feature_dma_logging_control control;
	struct vfio_device_feature_dma_logging_range range;
	struct rb_root_cached root = RB_ROOT_CACHED;
	struct interval_tree_node *nodes;
	u64 iova_end;
	u32 nnodes;
	int i, ret;

	if (!device->log_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_SET,
				 sizeof(control));
	if (ret != 1)
		return ret;

	if (copy_from_user(&control, arg, minsz))
		return -EFAULT;

	nnodes = control.num_ranges;
	if (!nnodes)
		return -EINVAL;

	if (nnodes > LOG_MAX_RANGES)
		return -E2BIG;

	ranges = u64_to_user_ptr(control.ranges);
	nodes = kmalloc_array(nnodes, sizeof(struct interval_tree_node),
			      GFP_KERNEL);
	if (!nodes)
		return -ENOMEM;

	for (i = 0; i < nnodes; i++) {
		if (copy_from_user(&range, &ranges[i], sizeof(range))) {
			ret = -EFAULT;
			goto end;
		}
		if (!IS_ALIGNED(range.iova, control.page_size) ||
		    !IS_ALIGNED(range.length, control.page_size)) {
			ret = -EINVAL;
			goto end;
		}

		if (check_add_overflow(range.iova, range.length, &iova_end) ||
		    iova_end > ULONG_MAX) {
			ret = -EOVERFLOW;
			goto end;
		}

		nodes[i].start = range.iova;
		nodes[i].last = range.iova + range.length - 1;
		if (interval_tree_iter_first(&root, nodes[i].start,
					     nodes[i].last)) {
			/* Range overlapping */
			ret = -EINVAL;
			goto end;
		}
		interval_tree_insert(nodes + i, &root);
	}

	ret = device->log_ops->log_start(device, &root, nnodes,
					 &control.page_size);
	if (ret)
		goto end;

	if (copy_to_user(arg, &control, sizeof(control))) {
		ret = -EFAULT;
		device->log_ops->log_stop(device);
	}

end:
	kfree(nodes);
	return ret;
}

static int
vfio_ioctl_device_feature_logging_stop(struct vfio_device *device,
				       u32 flags, void __user *arg,
				       size_t argsz)
{
	int ret;

	if (!device->log_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_SET, 0);
	if (ret != 1)
		return ret;

	return device->log_ops->log_stop(device);
}
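/*
 * Illustrative userspace sequence for starting dirty tracking over one
 * IOVA range (a sketch with hypothetical values; error handling
 * omitted):
 *
 *	struct {
 *		struct vfio_device_feature feature;
 *		struct vfio_device_feature_dma_logging_control ctrl;
 *	} buf = {};
 *	struct vfio_device_feature_dma_logging_range range = {
 *		.iova = 0,
 *		.length = 1UL << 30,
 *	};
 *
 *	buf.feature.argsz = sizeof(buf);
 *	buf.feature.flags = VFIO_DEVICE_FEATURE_SET |
 *			    VFIO_DEVICE_FEATURE_DMA_LOGGING_START;
 *	buf.ctrl.page_size = 4096;
 *	buf.ctrl.num_ranges = 1;
 *	buf.ctrl.ranges = (uintptr_t)&range;
 *	ioctl(device_fd, VFIO_DEVICE_FEATURE, &buf);
 */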
static int vfio_device_log_read_and_clear(struct iova_bitmap *iter,
					  unsigned long iova, size_t length,
					  void *opaque)
{
	struct vfio_device *device = opaque;

	return device->log_ops->log_read_and_clear(device, iova, length, iter);
}

static int
vfio_ioctl_device_feature_logging_report(struct vfio_device *device,
					 u32 flags, void __user *arg,
					 size_t argsz)
{
	size_t minsz =
		offsetofend(struct vfio_device_feature_dma_logging_report,
			    bitmap);
	struct vfio_device_feature_dma_logging_report report;
	struct iova_bitmap *iter;
	u64 iova_end;
	int ret;

	if (!device->log_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_GET,
				 sizeof(report));
	if (ret != 1)
		return ret;

	if (copy_from_user(&report, arg, minsz))
		return -EFAULT;

	if (report.page_size < SZ_4K || !is_power_of_2(report.page_size))
		return -EINVAL;

	if (check_add_overflow(report.iova, report.length, &iova_end) ||
	    iova_end > ULONG_MAX)
		return -EOVERFLOW;

	iter = iova_bitmap_alloc(report.iova, report.length,
				 report.page_size,
				 u64_to_user_ptr(report.bitmap));
	if (IS_ERR(iter))
		return PTR_ERR(iter);

	ret = iova_bitmap_for_each(iter, device,
				   vfio_device_log_read_and_clear);

	iova_bitmap_free(iter);
	return ret;
}

static int vfio_ioctl_device_feature(struct vfio_device *device,
				     struct vfio_device_feature __user *arg)
{
	size_t minsz = offsetofend(struct vfio_device_feature, flags);
	struct vfio_device_feature feature;

	if (copy_from_user(&feature, arg, minsz))
		return -EFAULT;

	if (feature.argsz < minsz)
		return -EINVAL;

	/* Check unknown flags */
	if (feature.flags &
	    ~(VFIO_DEVICE_FEATURE_MASK | VFIO_DEVICE_FEATURE_SET |
	      VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_PROBE))
		return -EINVAL;

	/* GET & SET are mutually exclusive except with PROBE */
	if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) &&
	    (feature.flags & VFIO_DEVICE_FEATURE_SET) &&
	    (feature.flags & VFIO_DEVICE_FEATURE_GET))
		return -EINVAL;

	switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) {
	case VFIO_DEVICE_FEATURE_MIGRATION:
		return vfio_ioctl_device_feature_migration(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE:
		return vfio_ioctl_device_feature_mig_device_state(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_DMA_LOGGING_START:
		return vfio_ioctl_device_feature_logging_start(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP:
		return vfio_ioctl_device_feature_logging_stop(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT:
		return vfio_ioctl_device_feature_logging_report(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_MIG_DATA_SIZE:
		return vfio_ioctl_device_feature_migration_data_size(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_MIG_PRECOPY_INFOv2:
		return vfio_ioctl_device_feature_migration_precopy_info_v2(
			device, feature.flags, feature.argsz - minsz);
	default:
		if (unlikely(!device->ops->device_feature))
			return -ENOTTY;
		return device->ops->device_feature(device, feature.flags,
						   arg->data,
						   feature.argsz - minsz);
	}
}
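/*
 * Illustrative userspace layout for VFIO_DEVICE_FEATURE (a sketch; error
 * handling omitted): the feature-specific payload immediately follows
 * the header and argsz covers both, e.g. to move a device to STOP_COPY:
 *
 *	struct {
 *		struct vfio_device_feature feature;
 *		struct vfio_device_feature_mig_state state;
 *	} buf = {};
 *
 *	buf.feature.argsz = sizeof(buf);
 *	buf.feature.flags = VFIO_DEVICE_FEATURE_SET |
 *			    VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE;
 *	buf.state.device_state = VFIO_DEVICE_STATE_STOP_COPY;
 *	ioctl(device_fd, VFIO_DEVICE_FEATURE, &buf);
 *
 * On success buf.state.data_fd holds the migration data stream FD.
 */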
static long vfio_get_region_info(struct vfio_device *device,
				 struct vfio_region_info __user *arg)
{
	unsigned long minsz = offsetofend(struct vfio_region_info, offset);
	struct vfio_region_info info = {};
	struct vfio_info_cap caps = {};
	int ret;

	if (unlikely(!device->ops->get_region_info_caps))
		return -EINVAL;

	if (copy_from_user(&info, arg, minsz))
		return -EFAULT;
	if (info.argsz < minsz)
		return -EINVAL;

	ret = device->ops->get_region_info_caps(device, &info, &caps);
	if (ret)
		goto out_free;

	if (caps.size) {
		info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
		if (info.argsz < sizeof(info) + caps.size) {
			info.argsz = sizeof(info) + caps.size;
			info.cap_offset = 0;
		} else {
			vfio_info_cap_shift(&caps, sizeof(info));
			if (copy_to_user(arg + 1, caps.buf, caps.size)) {
				ret = -EFAULT;
				goto out_free;
			}
			info.cap_offset = sizeof(info);
		}
	}

	if (copy_to_user(arg, &info, minsz)) {
		ret = -EFAULT;
		goto out_free;
	}

out_free:
	kfree(caps.buf);
	return ret;
}

static long vfio_device_fops_unl_ioctl(struct file *filep,
				       unsigned int cmd, unsigned long arg)
{
	struct vfio_device_file *df = filep->private_data;
	struct vfio_device *device = df->device;
	void __user *uptr = (void __user *)arg;
	int ret;

	if (cmd == VFIO_DEVICE_BIND_IOMMUFD)
		return vfio_df_ioctl_bind_iommufd(df, uptr);

	/* Paired with smp_store_release() following vfio_df_open() */
	if (!smp_load_acquire(&df->access_granted))
		return -EINVAL;

	ret = vfio_device_pm_runtime_get(device);
	if (ret)
		return ret;

	/* cdev only ioctls */
	if (IS_ENABLED(CONFIG_VFIO_DEVICE_CDEV) && !df->group) {
		switch (cmd) {
		case VFIO_DEVICE_ATTACH_IOMMUFD_PT:
			ret = vfio_df_ioctl_attach_pt(df, uptr);
			goto out;

		case VFIO_DEVICE_DETACH_IOMMUFD_PT:
			ret = vfio_df_ioctl_detach_pt(df, uptr);
			goto out;
		}
	}

	switch (cmd) {
	case VFIO_DEVICE_FEATURE:
		ret = vfio_ioctl_device_feature(device, uptr);
		break;

	case VFIO_DEVICE_GET_REGION_INFO:
		ret = vfio_get_region_info(device, uptr);
		break;

	default:
		if (unlikely(!device->ops->ioctl))
			ret = -EINVAL;
		else
			ret = device->ops->ioctl(device, cmd, arg);
		break;
	}
out:
	vfio_device_pm_runtime_put(device);
	return ret;
}

static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
				     size_t count, loff_t *ppos)
{
	struct vfio_device_file *df = filep->private_data;
	struct vfio_device *device = df->device;

	/* Paired with smp_store_release() following vfio_df_open() */
	if (!smp_load_acquire(&df->access_granted))
		return -EINVAL;

	if (unlikely(!device->ops->read))
		return -EINVAL;

	return device->ops->read(device, buf, count, ppos);
}

static ssize_t vfio_device_fops_write(struct file *filep,
				      const char __user *buf,
				      size_t count, loff_t *ppos)
{
	struct vfio_device_file *df = filep->private_data;
	struct vfio_device *device = df->device;

	/* Paired with smp_store_release() following vfio_df_open() */
	if (!smp_load_acquire(&df->access_granted))
		return -EINVAL;

	if (unlikely(!device->ops->write))
		return -EINVAL;

	return device->ops->write(device, buf, count, ppos);
}

static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
{
	struct vfio_device_file *df = filep->private_data;
	struct vfio_device *device = df->device;

	/* Paired with smp_store_release() following vfio_df_open() */
	if (!smp_load_acquire(&df->access_granted))
		return -EINVAL;

	if (unlikely(!device->ops->mmap))
		return -EINVAL;

	return device->ops->mmap(device, vma);
}

#ifdef CONFIG_PROC_FS
static void vfio_device_show_fdinfo(struct seq_file *m, struct file *filep)
{
	char *path;
	struct vfio_device_file *df = filep->private_data;
	struct vfio_device *device = df->device;

	path = kobject_get_path(&device->dev->kobj, GFP_KERNEL);
	if (!path)
		return;

	seq_printf(m, "vfio-device-syspath: /sys%s\n", path);
	kfree(path);
}
#endif

const struct file_operations vfio_device_fops = {
	.owner = THIS_MODULE,
	.open = vfio_device_fops_cdev_open,
	.release = vfio_device_fops_release,
	.read = vfio_device_fops_read,
	.write = vfio_device_fops_write,
	.unlocked_ioctl = vfio_device_fops_unl_ioctl,
	.compat_ioctl = compat_ptr_ioctl,
	.mmap = vfio_device_fops_mmap,
#ifdef CONFIG_PROC_FS
	.show_fdinfo = vfio_device_show_fdinfo,
#endif
};
static struct vfio_device *vfio_device_from_file(struct file *file)
{
	struct vfio_device_file *df = file->private_data;

	if (file->f_op != &vfio_device_fops)
		return NULL;
	return df->device;
}

/**
 * vfio_file_is_valid - True if the file is a valid vfio file
 * @file: VFIO group file or VFIO device file
 */
bool vfio_file_is_valid(struct file *file)
{
	return vfio_group_from_file(file) ||
	       vfio_device_from_file(file);
}
EXPORT_SYMBOL_GPL(vfio_file_is_valid);

/**
 * vfio_file_enforced_coherent - True if the DMA associated with the VFIO file
 *        is always CPU cache coherent
 * @file: VFIO group file or VFIO device file
 *
 * Enforced coherency means that the IOMMU ignores things like the PCIe
 * no-snoop bit in DMA transactions. A return of false indicates that the
 * user has rights to access additional instructions such as wbinvd on x86.
 */
bool vfio_file_enforced_coherent(struct file *file)
{
	struct vfio_device *device;
	struct vfio_group *group;

	group = vfio_group_from_file(file);
	if (group)
		return vfio_group_enforced_coherent(group);

	device = vfio_device_from_file(file);
	if (device)
		return device_iommu_capable(device->dev,
					    IOMMU_CAP_ENFORCE_CACHE_COHERENCY);

	return true;
}
EXPORT_SYMBOL_GPL(vfio_file_enforced_coherent);

static void vfio_device_file_set_kvm(struct file *file, struct kvm *kvm)
{
	struct vfio_device_file *df = file->private_data;

	/*
	 * The kvm is first recorded in the vfio_device_file, and will
	 * be propagated to vfio_device::kvm when the file is bound to
	 * iommufd successfully in the vfio device cdev path.
	 */
	spin_lock(&df->kvm_ref_lock);
	df->kvm = kvm;
	spin_unlock(&df->kvm_ref_lock);
}

/**
 * vfio_file_set_kvm - Link a kvm with VFIO drivers
 * @file: VFIO group file or VFIO device file
 * @kvm: KVM to link
 *
 * When a VFIO device is first opened the KVM will be available in
 * device->kvm if one was associated with the file.
 */
void vfio_file_set_kvm(struct file *file, struct kvm *kvm)
{
	struct vfio_group *group;

	group = vfio_group_from_file(file);
	if (group)
		vfio_group_set_kvm(group, kvm);

	if (vfio_device_from_file(file))
		vfio_device_file_set_kvm(file, kvm);
}
EXPORT_SYMBOL_GPL(vfio_file_set_kvm);
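/*
 * Illustrative call path (a sketch, not a normative description): KVM's
 * kvm-vfio pseudo device ends up invoking vfio_file_set_kvm() when
 * userspace adds a VFIO file to it, e.g.:
 *
 *	struct kvm_device_attr attr = {
 *		.group = KVM_DEV_VFIO_FILE,
 *		.attr = KVM_DEV_VFIO_FILE_ADD,
 *		.addr = (uintptr_t)&vfio_fd,
 *	};
 *	ioctl(kvm_vfio_dev_fd, KVM_SET_DEVICE_ATTR, &attr);
 */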
/*
 * Sub-module support
 */
/*
 * Helper for managing a buffer of info chain capabilities, allocate or
 * reallocate a buffer with additional @size, filling in @id and @version
 * of the capability. A pointer to the new capability is returned.
 *
 * NB. The chain is based at the head of the buffer, so new entries are
 * added to the tail, vfio_info_cap_shift() should be called to fixup the
 * next offsets prior to copying to the user buffer.
 */
struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
					       size_t size, u16 id,
					       u16 version)
{
	void *buf;
	struct vfio_info_cap_header *header, *tmp;

	/* Ensure that the next capability struct will be aligned */
	size = ALIGN(size, sizeof(u64));

	buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
	if (!buf) {
		kfree(caps->buf);
		caps->buf = NULL;
		caps->size = 0;
		return ERR_PTR(-ENOMEM);
	}

	caps->buf = buf;
	header = buf + caps->size;

	/* Eventually copied to user buffer, zero */
	memset(header, 0, size);

	header->id = id;
	header->version = version;

	/* Add to the end of the capability chain */
	for (tmp = buf; tmp->next; tmp = buf + tmp->next)
		; /* nothing */

	tmp->next = caps->size;
	caps->size += size;

	return header;
}
EXPORT_SYMBOL_GPL(vfio_info_cap_add);

void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
{
	struct vfio_info_cap_header *tmp;
	void *buf = (void *)caps->buf;

	/* Capability structs should start with proper alignment */
	WARN_ON(!IS_ALIGNED(offset, sizeof(u64)));

	for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
		tmp->next += offset;
}
EXPORT_SYMBOL(vfio_info_cap_shift);

int vfio_info_add_capability(struct vfio_info_cap *caps,
			     struct vfio_info_cap_header *cap, size_t size)
{
	struct vfio_info_cap_header *header;

	header = vfio_info_cap_add(caps, size, cap->id, cap->version);
	if (IS_ERR(header))
		return PTR_ERR(header);

	memcpy(header + 1, cap + 1, size - sizeof(*header));

	return 0;
}
EXPORT_SYMBOL(vfio_info_add_capability);
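/*
 * Illustrative sketch of the capability-chain pattern (mirrors what
 * vfio_get_region_info() above does; my_cap and MY_CAP_ID are
 * hypothetical):
 *
 *	struct vfio_info_cap caps = {};
 *	struct my_cap *cap;
 *
 *	cap = (void *)vfio_info_cap_add(&caps, sizeof(*cap), MY_CAP_ID, 1);
 *	if (IS_ERR(cap))
 *		return PTR_ERR(cap);
 *	...fill in cap...
 *
 *	Before copy_to_user(), rebase the chain offsets past the
 *	fixed-size info struct:
 *
 *	vfio_info_cap_shift(&caps, sizeof(info));
 */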
int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
				       int max_irq_type, size_t *data_size)
{
	unsigned long minsz;
	size_t size;

	minsz = offsetofend(struct vfio_irq_set, count);

	if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
	    (hdr->count >= (U32_MAX - hdr->start)) ||
	    (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
			    VFIO_IRQ_SET_ACTION_TYPE_MASK)))
		return -EINVAL;

	if (data_size)
		*data_size = 0;

	if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
		return -EINVAL;

	switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
	case VFIO_IRQ_SET_DATA_NONE:
		size = 0;
		break;
	case VFIO_IRQ_SET_DATA_BOOL:
		size = sizeof(uint8_t);
		break;
	case VFIO_IRQ_SET_DATA_EVENTFD:
		size = sizeof(int32_t);
		break;
	default:
		return -EINVAL;
	}

	if (size) {
		if (hdr->argsz - minsz < hdr->count * size)
			return -EINVAL;

		if (!data_size)
			return -EINVAL;

		*data_size = hdr->count * size;
	}

	return 0;
}
EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);

/*
 * Pin contiguous user pages and return their associated host pages for local
 * domain only.
 * @device [in]  : device
 * @iova [in]    : starting IOVA of user pages to be pinned.
 * @npage [in]   : count of pages to be pinned. This count should not
 *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
 * @prot [in]    : protection flags
 * @pages [out]  : array of host pages
 * Return error or number of pages pinned.
 *
 * A driver may only call this function if the vfio_device was created
 * by vfio_register_emulated_iommu_dev() due to
 * vfio_device_container_pin_pages().
 */
int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova,
		   int npage, int prot, struct page **pages)
{
	/* group->container cannot change while a vfio device is open */
	if (!pages || !npage || WARN_ON(!vfio_assert_device_open(device)))
		return -EINVAL;
	if (!device->ops->dma_unmap)
		return -EINVAL;
	if (vfio_device_has_container(device))
		return vfio_device_container_pin_pages(device, iova,
						       npage, prot, pages);
	if (device->iommufd_access) {
		int ret;

		if (iova > ULONG_MAX)
			return -EINVAL;
		/*
		 * VFIO ignores the sub page offset, npages is from the start
		 * of a PAGE_SIZE chunk of IOVA. The caller is expected to
		 * recover the sub page offset by doing:
		 *     pages[0] + (iova % PAGE_SIZE)
		 */
		ret = iommufd_access_pin_pages(
			device->iommufd_access, ALIGN_DOWN(iova, PAGE_SIZE),
			npage * PAGE_SIZE, pages,
			(prot & IOMMU_WRITE) ? IOMMUFD_ACCESS_RW_WRITE : 0);
		if (ret)
			return ret;
		return npage;
	}
	return -EINVAL;
}
EXPORT_SYMBOL(vfio_pin_pages);

/*
 * Unpin contiguous host pages for local domain only.
 * @device [in]  : device
 * @iova [in]    : starting address of user pages to be unpinned.
 * @npage [in]   : count of pages to be unpinned. This count should not
 *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
 */
void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage)
{
	if (WARN_ON(!vfio_assert_device_open(device)))
		return;
	if (WARN_ON(!device->ops->dma_unmap))
		return;

	if (vfio_device_has_container(device)) {
		vfio_device_container_unpin_pages(device, iova, npage);
		return;
	}
	if (device->iommufd_access) {
		if (WARN_ON(iova > ULONG_MAX))
			return;
		iommufd_access_unpin_pages(device->iommufd_access,
					   ALIGN_DOWN(iova, PAGE_SIZE),
					   npage * PAGE_SIZE);
		return;
	}
}
EXPORT_SYMBOL(vfio_unpin_pages);
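/*
 * Illustrative sketch of a mediated driver pinning one page, touching it
 * through the CPU and unpinning it (hypothetical; error handling
 * abbreviated):
 *
 *	struct page *page;
 *	void *va;
 *	int ret;
 *
 *	ret = vfio_pin_pages(vdev, iova, 1, IOMMU_READ | IOMMU_WRITE,
 *			     &page);
 *	if (ret != 1)
 *		return ret;
 *	va = kmap_local_page(page) + (iova % PAGE_SIZE);
 *	...access va...
 *	kunmap_local(va);
 *	vfio_unpin_pages(vdev, iova, 1);
 */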
/*
 * This interface allows the CPUs to perform some sort of virtual DMA on
 * behalf of the device.
 *
 * CPUs read/write from/into a range of IOVAs pointing to user space memory
 * into/from a kernel buffer.
 *
 * As the read/write of user space memory is conducted via the CPUs and is
 * not a real device DMA, it is not necessary to pin the user space memory.
 *
 * @device [in]  : VFIO device
 * @iova [in]    : base IOVA of a user space buffer
 * @data [in]    : pointer to kernel buffer
 * @len [in]     : kernel buffer length
 * @write        : indicate read or write
 * Return error code on failure or 0 on success.
 */
int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data,
		size_t len, bool write)
{
	if (!data || len <= 0 || !vfio_assert_device_open(device))
		return -EINVAL;

	if (vfio_device_has_container(device))
		return vfio_device_container_dma_rw(device, iova,
						    data, len, write);

	if (device->iommufd_access) {
		unsigned int flags = 0;

		if (iova > ULONG_MAX)
			return -EINVAL;

		/* VFIO historically tries to auto-detect a kthread */
		if (!current->mm)
			flags |= IOMMUFD_ACCESS_RW_KTHREAD;
		if (write)
			flags |= IOMMUFD_ACCESS_RW_WRITE;
		return iommufd_access_rw(device->iommufd_access, iova, data,
					 len, flags);
	}
	return -EINVAL;
}
EXPORT_SYMBOL(vfio_dma_rw);

/*
 * Module/class support
 */
static int __init vfio_init(void)
{
	int ret;

	ida_init(&vfio.device_ida);

	ret = vfio_group_init();
	if (ret)
		return ret;

	ret = vfio_virqfd_init();
	if (ret)
		goto err_virqfd;

	/* /sys/class/vfio-dev/vfioX */
	ret = class_register(&vfio_device_class);
	if (ret)
		goto err_dev_class;

	ret = vfio_cdev_init();
	if (ret)
		goto err_alloc_dev_chrdev;

	vfio_debugfs_create_root();
	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
	return 0;

err_alloc_dev_chrdev:
	class_unregister(&vfio_device_class);
err_dev_class:
	vfio_virqfd_exit();
err_virqfd:
	vfio_group_cleanup();
	return ret;
}

static void __exit vfio_cleanup(void)
{
	vfio_debugfs_remove_root();
	ida_destroy(&vfio.device_ida);
	vfio_cdev_cleanup();
	class_unregister(&vfio_device_class);
	vfio_virqfd_exit();
	vfio_group_cleanup();
	xa_destroy(&vfio_device_set_xa);
}

module_init(vfio_init);
module_exit(vfio_cleanup);

MODULE_IMPORT_NS("IOMMUFD");
MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);
MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");