// SPDX-License-Identifier: GPL-2.0-only
/*
 * VFIO core
 *
 * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
 * Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * Derived from original vfio:
 * Copyright 2010 Cisco Systems, Inc. All rights reserved.
 * Author: Tom Lyon, pugs@cisco.com
 */

#include <linux/cdev.h>
#include <linux/compat.h>
#include <linux/device.h>
#include <linux/fs.h>
#include <linux/idr.h>
#include <linux/iommu.h>
#if IS_ENABLED(CONFIG_KVM)
#include <linux/kvm_host.h>
#endif
#include <linux/list.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/mount.h>
#include <linux/mutex.h>
#include <linux/pci.h>
#include <linux/pseudo_fs.h>
#include <linux/rwsem.h>
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/wait.h>
#include <linux/sched/signal.h>
#include <linux/pm_runtime.h>
#include <linux/interval_tree.h>
#include <linux/iova_bitmap.h>
#include <linux/iommufd.h>
#include "vfio.h"

#define DRIVER_VERSION "0.3"
#define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC "VFIO - User Level meta-driver"

/* Magic for the internal pseudo filesystem used to back device inodes */
#define VFIO_MAGIC 0x5646494f /* "VFIO" */

/* Module-wide singleton state for the VFIO core */
static struct vfio {
	struct class *device_class;	/* class used for vfio device nodes */
	struct ida device_ida;		/* allocator for vfio%d device indexes */
	struct vfsmount *vfs_mount;	/* internal pseudo-fs mount (pinned) */
	int fs_count;			/* pin count for vfs_mount */
} vfio;

#ifdef CONFIG_VFIO_NOIOMMU
bool vfio_noiommu __read_mostly;
module_param_named(enable_unsafe_noiommu_mode,
		   vfio_noiommu, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode. This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel. If you do not know what this is for, step away. (default: false)");
#endif

/* Maps (unsigned long)set_id -> struct vfio_device_set */
static DEFINE_XARRAY(vfio_device_set_xa);

/*
 * vfio_assign_device_set - join @device to the vfio_device_set keyed by
 * @set_id, creating the set on first use.
 *
 * The xarray entry for @set_id acts as a singleton: the lookup and the
 * insertion both happen under xa_lock, and a racing creator loses via
 * __xa_cmpxchg and frees its candidate.  device_count is also only
 * modified under xa_lock.
 *
 * Returns 0 on success, -EINVAL for a NULL @set_id, -ENOMEM on
 * allocation failure, or the xarray store error.
 */
int vfio_assign_device_set(struct vfio_device *device, void *set_id)
{
	unsigned long idx = (unsigned long)set_id;
	struct vfio_device_set *new_dev_set;
	struct vfio_device_set *dev_set;

	if (WARN_ON(!set_id))
		return -EINVAL;

	/*
	 * Atomically acquire a singleton object in the xarray for this set_id
	 */
	xa_lock(&vfio_device_set_xa);
	dev_set = xa_load(&vfio_device_set_xa, idx);
	if (dev_set)
		goto found_get_ref;
	xa_unlock(&vfio_device_set_xa);

	/* Not found: allocate a candidate outside the lock. */
	new_dev_set = kzalloc_obj(*new_dev_set);
	if (!new_dev_set)
		return -ENOMEM;
	mutex_init(&new_dev_set->lock);
	INIT_LIST_HEAD(&new_dev_set->device_list);
	new_dev_set->set_id = set_id;

	xa_lock(&vfio_device_set_xa);
	dev_set = __xa_cmpxchg(&vfio_device_set_xa, idx, NULL, new_dev_set,
			       GFP_KERNEL);
	if (!dev_set) {
		/* We won the race; our candidate is now installed. */
		dev_set = new_dev_set;
		goto found_get_ref;
	}

	/* Someone else installed a set (or the store failed) - drop ours. */
	kfree(new_dev_set);
	if (xa_is_err(dev_set)) {
		xa_unlock(&vfio_device_set_xa);
		return xa_err(dev_set);
	}

found_get_ref:
	dev_set->device_count++;
	xa_unlock(&vfio_device_set_xa);
	mutex_lock(&dev_set->lock);
	device->dev_set = dev_set;
	list_add_tail(&device->dev_set_list, &dev_set->device_list);
	mutex_unlock(&dev_set->lock);
	return 0;
}
EXPORT_SYMBOL_GPL(vfio_assign_device_set);

/*
 * Undo vfio_assign_device_set(): remove @device from its set's list and,
 * if it was the last member, erase and free the set itself.  No-op when
 * the device was never assigned a set.
 */
static void vfio_release_device_set(struct vfio_device *device)
{
	struct vfio_device_set *dev_set = device->dev_set;

	if (!dev_set)
		return;

	mutex_lock(&dev_set->lock);
	list_del(&device->dev_set_list);
	mutex_unlock(&dev_set->lock);

	xa_lock(&vfio_device_set_xa);
	if (!--dev_set->device_count) {
		__xa_erase(&vfio_device_set_xa,
			   (unsigned long)dev_set->set_id);
		mutex_destroy(&dev_set->lock);
		kfree(dev_set);
	}
	xa_unlock(&vfio_device_set_xa);
}

unsigned
int vfio_device_set_open_count(struct vfio_device_set *dev_set) 139 { 140 struct vfio_device *cur; 141 unsigned int open_count = 0; 142 143 lockdep_assert_held(&dev_set->lock); 144 145 list_for_each_entry(cur, &dev_set->device_list, dev_set_list) 146 open_count += cur->open_count; 147 return open_count; 148 } 149 EXPORT_SYMBOL_GPL(vfio_device_set_open_count); 150 151 struct vfio_device * 152 vfio_find_device_in_devset(struct vfio_device_set *dev_set, 153 struct device *dev) 154 { 155 struct vfio_device *cur; 156 157 lockdep_assert_held(&dev_set->lock); 158 159 list_for_each_entry(cur, &dev_set->device_list, dev_set_list) 160 if (cur->dev == dev) 161 return cur; 162 return NULL; 163 } 164 EXPORT_SYMBOL_GPL(vfio_find_device_in_devset); 165 166 /* 167 * Device objects - create, release, get, put, search 168 */ 169 /* Device reference always implies a group reference */ 170 void vfio_device_put_registration(struct vfio_device *device) 171 { 172 if (refcount_dec_and_test(&device->refcount)) 173 complete(&device->comp); 174 } 175 EXPORT_SYMBOL_GPL(vfio_device_put_registration); 176 177 bool vfio_device_try_get_registration(struct vfio_device *device) 178 { 179 return refcount_inc_not_zero(&device->refcount); 180 } 181 EXPORT_SYMBOL_GPL(vfio_device_try_get_registration); 182 183 /* 184 * VFIO driver API 185 */ 186 /* Release helper called by vfio_put_device() */ 187 static void vfio_device_release(struct device *dev) 188 { 189 struct vfio_device *device = 190 container_of(dev, struct vfio_device, device); 191 192 vfio_release_device_set(device); 193 ida_free(&vfio.device_ida, device->index); 194 195 if (device->ops->release) 196 device->ops->release(device); 197 198 iput(device->inode); 199 simple_release_fs(&vfio.vfs_mount, &vfio.fs_count); 200 kvfree(device); 201 } 202 203 static int vfio_init_device(struct vfio_device *device, struct device *dev, 204 const struct vfio_device_ops *ops); 205 206 /* 207 * Allocate and initialize vfio_device so it can be registered to vfio 
208 * core. 209 * 210 * Drivers should use the wrapper vfio_alloc_device() for allocation. 211 * @size is the size of the structure to be allocated, including any 212 * private data used by the driver. 213 * 214 * Driver may provide an @init callback to cover device private data. 215 * 216 * Use vfio_put_device() to release the structure after success return. 217 */ 218 struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev, 219 const struct vfio_device_ops *ops) 220 { 221 struct vfio_device *device; 222 int ret; 223 224 if (WARN_ON(size < sizeof(struct vfio_device))) 225 return ERR_PTR(-EINVAL); 226 227 device = kvzalloc(size, GFP_KERNEL); 228 if (!device) 229 return ERR_PTR(-ENOMEM); 230 231 ret = vfio_init_device(device, dev, ops); 232 if (ret) 233 goto out_free; 234 return device; 235 236 out_free: 237 kvfree(device); 238 return ERR_PTR(ret); 239 } 240 EXPORT_SYMBOL_GPL(_vfio_alloc_device); 241 242 static int vfio_fs_init_fs_context(struct fs_context *fc) 243 { 244 return init_pseudo(fc, VFIO_MAGIC) ? 0 : -ENOMEM; 245 } 246 247 static struct file_system_type vfio_fs_type = { 248 .name = "vfio", 249 .owner = THIS_MODULE, 250 .init_fs_context = vfio_fs_init_fs_context, 251 .kill_sb = kill_anon_super, 252 }; 253 254 static struct inode *vfio_fs_inode_new(void) 255 { 256 struct inode *inode; 257 int ret; 258 259 ret = simple_pin_fs(&vfio_fs_type, &vfio.vfs_mount, &vfio.fs_count); 260 if (ret) 261 return ERR_PTR(ret); 262 263 inode = alloc_anon_inode(vfio.vfs_mount->mnt_sb); 264 if (IS_ERR(inode)) 265 simple_release_fs(&vfio.vfs_mount, &vfio.fs_count); 266 267 return inode; 268 } 269 270 /* 271 * Initialize a vfio_device so it can be registered to vfio core. 
 */
static int vfio_init_device(struct vfio_device *device, struct device *dev,
			    const struct vfio_device_ops *ops)
{
	int ret;

	/* Reserve a unique minor-style index for the vfio%d name. */
	ret = ida_alloc_max(&vfio.device_ida, MINORMASK, GFP_KERNEL);
	if (ret < 0) {
		dev_dbg(dev, "Error to alloc index\n");
		return ret;
	}

	device->index = ret;
	init_completion(&device->comp);
	device->dev = dev;
	device->ops = ops;
	device->inode = vfio_fs_inode_new();
	if (IS_ERR(device->inode)) {
		ret = PTR_ERR(device->inode);
		goto out_inode;
	}

	/* Optional driver hook to initialize its private data. */
	if (ops->init) {
		ret = ops->init(device);
		if (ret)
			goto out_uninit;
	}

	device_initialize(&device->device);
	device->device.release = vfio_device_release;
	device->device.class = vfio.device_class;
	device->device.parent = device->dev;
	return 0;

out_uninit:
	iput(device->inode);
	simple_release_fs(&vfio.vfs_mount, &vfio.fs_count);
out_inode:
	vfio_release_device_set(device);
	ida_free(&vfio.device_ida, device->index);
	return ret;
}

/*
 * Common registration path.  @type selects how the device is backed
 * (real IOMMU group vs emulated IOMMU).
 */
static int __vfio_register_dev(struct vfio_device *device,
			       enum vfio_group_type type)
{
	int ret;

	/* With iommufd enabled, all four iommufd ops are mandatory. */
	if (WARN_ON(IS_ENABLED(CONFIG_IOMMUFD) &&
		    (!device->ops->bind_iommufd ||
		     !device->ops->unbind_iommufd ||
		     !device->ops->attach_ioas ||
		     !device->ops->detach_ioas)))
		return -EINVAL;

	/*
	 * If the driver doesn't specify a set then the device is added to a
	 * singleton set just for itself.
	 */
	if (!device->dev_set)
		vfio_assign_device_set(device, device);

	ret = dev_set_name(&device->device, "vfio%d", device->index);
	if (ret)
		return ret;

	ret = vfio_device_set_group(device, type);
	if (ret)
		return ret;

	/*
	 * VFIO always sets IOMMU_CACHE because we offer no way for userspace to
	 * restore cache coherency. It has to be checked here because it is only
	 * valid for cases where we are using iommu groups.
	 */
	if (type == VFIO_IOMMU && !vfio_device_is_noiommu(device) &&
	    !device_iommu_capable(device->dev, IOMMU_CAP_CACHE_COHERENCY)) {
		ret = -EINVAL;
		goto err_out;
	}

	ret = vfio_device_add(device);
	if (ret)
		goto err_out;

	/* Refcounting can't start until the driver calls register */
	refcount_set(&device->refcount, 1);

	vfio_device_group_register(device);
	vfio_device_debugfs_init(device);

	return 0;
err_out:
	vfio_device_remove_group(device);
	return ret;
}

/* Register a device backed by a real IOMMU group. */
int vfio_register_group_dev(struct vfio_device *device)
{
	return __vfio_register_dev(device, VFIO_IOMMU);
}
EXPORT_SYMBOL_GPL(vfio_register_group_dev);

/*
 * Register a virtual device without IOMMU backing. The user of this
 * device must not be able to directly trigger unmediated DMA.
 */
int vfio_register_emulated_iommu_dev(struct vfio_device *device)
{
	return __vfio_register_dev(device, VFIO_EMULATED_IOMMU);
}
EXPORT_SYMBOL_GPL(vfio_register_emulated_iommu_dev);

/*
 * Decrement the device reference count and wait for the device to be
 * removed.  New opens are prevented first; then this blocks, retrying in
 * 10 second intervals, until the last registration reference is dropped.
 * Each interval the driver's ->request() op (if any) is invoked so it can
 * nudge the user to release the device.  A signal switches the wait to a
 * non-interruptible one and logs a warning naming the blocked task.
 */
void vfio_unregister_group_dev(struct vfio_device *device)
{
	unsigned int i = 0;
	bool interrupted = false;
	long rc;

	/*
	 * Prevent new device opened by userspace via the
	 * VFIO_GROUP_GET_DEVICE_FD in the group path.
	 */
	vfio_device_group_unregister(device);

	/*
	 * Balances vfio_device_add() in register path, also prevents
	 * new device opened by userspace in the cdev path.
	 */
	vfio_device_del(device);

	/* Drop the initial registration reference and wait it out. */
	vfio_device_put_registration(device);
	rc = try_wait_for_completion(&device->comp);
	while (rc <= 0) {
		if (device->ops->request)
			device->ops->request(device, i++);

		if (interrupted) {
			rc = wait_for_completion_timeout(&device->comp,
							 HZ * 10);
		} else {
			rc = wait_for_completion_interruptible_timeout(
				&device->comp, HZ * 10);
			if (rc < 0) {
				interrupted = true;
				dev_warn(device->dev,
					 "Device is currently in use, task"
					 " \"%s\" (%d) "
					 "blocked until device is released",
					 current->comm, task_pid_nr(current));
			}
		}
	}

	vfio_device_debugfs_exit(device);
	/* Balances vfio_device_set_group in register path */
	vfio_device_remove_group(device);
}
EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);

#if IS_ENABLED(CONFIG_KVM)
/*
 * Record @kvm on @device while safely taking a KVM reference.  The
 * kvm_get_kvm_safe/kvm_put_kvm symbols are resolved via symbol_get() so
 * this works when KVM is a module; the kvm_put_kvm module reference is
 * held (stashed in device->put_kvm) until vfio_device_put_kvm().
 */
void vfio_device_get_kvm_safe(struct vfio_device *device, struct kvm *kvm)
{
	void (*pfn)(struct kvm *kvm);
	bool (*fn)(struct kvm *kvm);
	bool ret;

	lockdep_assert_held(&device->dev_set->lock);

	if (!kvm)
		return;

	pfn = symbol_get(kvm_put_kvm);
	if (WARN_ON(!pfn))
		return;

	fn = symbol_get(kvm_get_kvm_safe);
	if (WARN_ON(!fn)) {
		symbol_put(kvm_put_kvm);
		return;
	}

	/* kvm_get_kvm_safe() can fail if the VM is already going away. */
	ret = fn(kvm);
	symbol_put(kvm_get_kvm_safe);
	if (!ret) {
		symbol_put(kvm_put_kvm);
		return;
	}

	device->put_kvm = pfn;
	device->kvm = kvm;
}

/* Drop the KVM reference taken in vfio_device_get_kvm_safe(), if any. */
void vfio_device_put_kvm(struct vfio_device *device)
{
	lockdep_assert_held(&device->dev_set->lock);

	if (!device->kvm)
		return;

	if (WARN_ON(!device->put_kvm))
		goto clear;

	device->put_kvm(device->kvm);
	device->put_kvm = NULL;
	symbol_put(kvm_put_kvm);

clear:
	device->kvm = NULL;
}
#endif

/* true if the vfio_device has open_device() called but not close_device() */
static bool vfio_assert_device_open(struct vfio_device *device)
{
	return !WARN_ON_ONCE(!READ_ONCE(device->open_count));
}

/*
 * Allocate the per-file-descriptor state for @device.
 * Returns ERR_PTR(-ENOMEM) on allocation failure.
 */
struct vfio_device_file *
vfio_allocate_device_file(struct vfio_device *device)
{
	struct vfio_device_file *df;

	df = kzalloc_obj(*df, GFP_KERNEL_ACCOUNT);
	if (!df)
		return ERR_PTR(-ENOMEM);

	df->device = device;
	spin_lock_init(&df->kvm_ref_lock);

	return df;
}

/*
 * First open of a device: pin the driver module, bind to iommufd or the
 * legacy group IOMMU, then call the driver's ->open_device().  Each step
 * is unwound in reverse order on failure.
 */
static int vfio_df_device_first_open(struct vfio_device_file *df)
{
	struct vfio_device *device = df->device;
	struct iommufd_ctx *iommufd = df->iommufd;
	int ret;

	lockdep_assert_held(&device->dev_set->lock);

	if (!try_module_get(device->dev->driver->owner))
		return -ENODEV;

	if (iommufd)
		ret = vfio_df_iommufd_bind(df);
	else
		ret = vfio_device_group_use_iommu(device);
	if (ret)
		goto err_module_put;

	if (device->ops->open_device) {
		ret = device->ops->open_device(device);
		if (ret)
			goto err_unuse_iommu;
	}
	return 0;

err_unuse_iommu:
	if (iommufd)
		vfio_df_iommufd_unbind(df);
	else
		vfio_device_group_unuse_iommu(device);
err_module_put:
	module_put(device->dev->driver->owner);
	return ret;
}

/*
 * Last close of a device: mirror of vfio_df_device_first_open().  Also
 * resets the per-device precopy_info_v2 opt-in so a new user starts with
 * the default behavior.
 */
static void vfio_df_device_last_close(struct vfio_device_file *df)
{
	struct vfio_device *device = df->device;
	struct iommufd_ctx *iommufd = df->iommufd;

	lockdep_assert_held(&device->dev_set->lock);

	if (device->ops->close_device)
		device->ops->close_device(device);
	if (iommufd)
		vfio_df_iommufd_unbind(df);
	else
		vfio_device_group_unuse_iommu(device);
	device->precopy_info_v2 = 0;
	module_put(device->dev->driver->owner);
}

/*
 * Open a device file.  Runs the heavyweight first-open path only when
 * open_count transitions 0 -> 1, and rolls the count back on failure.
 */
int vfio_df_open(struct vfio_device_file *df)
{
	struct vfio_device *device = df->device;
	int ret = 0;

	lockdep_assert_held(&device->dev_set->lock);

	/*
	 * Only the group path allows the device to be opened multiple
	 * times.  The device cdev path doesn't have a secure way for it.
	 */
	if (device->open_count != 0 && !df->group)
		return -EINVAL;

	device->open_count++;
	if (device->open_count == 1) {
		ret = vfio_df_device_first_open(df);
		if (ret)
			device->open_count--;
	}

	return ret;
}

/* Close a device file; runs the last-close path at 1 -> 0. */
void vfio_df_close(struct vfio_device_file *df)
{
	struct vfio_device *device = df->device;

	lockdep_assert_held(&device->dev_set->lock);

	if (!vfio_assert_device_open(device))
		return;
	if (device->open_count == 1)
		vfio_df_device_last_close(df);
	device->open_count--;
}

/*
 * Wrapper around pm_runtime_resume_and_get().
 * Return error code on failure or 0 on success.
 */
static inline int vfio_device_pm_runtime_get(struct vfio_device *device)
{
	struct device *dev = device->dev;

	/* Only drivers that implement PM ops participate. */
	if (dev->driver && dev->driver->pm) {
		int ret;

		ret = pm_runtime_resume_and_get(dev);
		if (ret) {
			dev_info_ratelimited(dev,
				"vfio: runtime resume failed %d\n", ret);
			return -EIO;
		}
	}

	return 0;
}

/*
 * Wrapper around pm_runtime_put().
 */
static inline void vfio_device_pm_runtime_put(struct vfio_device *device)
{
	struct device *dev = device->dev;

	if (dev->driver && dev->driver->pm)
		pm_runtime_put(dev);
}

/*
 * VFIO Device fd
 */
static int vfio_device_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_device_file *df = filep->private_data;
	struct vfio_device *device = df->device;

	/* Tear down whichever path (group or cdev/iommufd) opened us. */
	if (df->group)
		vfio_df_group_close(df);
	else
		vfio_df_unbind_iommufd(df);

	vfio_device_put_registration(device);

	kfree(df);

	return 0;
}

/*
 * vfio_mig_get_next_state - Compute the next step in the FSM
 * @cur_fsm - The current state the device is in
 * @new_fsm - The target state to reach
 * @next_fsm - Pointer to the next step to get to new_fsm
 *
 * Return 0 upon success, otherwise -errno
 * Upon success the next step in the state progression between cur_fsm and
 * new_fsm will be set in next_fsm.
 *
 * This breaks down requests for combination transitions into smaller steps and
 * returns the next step to get to new_fsm. The function may need to be called
 * multiple times before reaching new_fsm.
 *
 */
int vfio_mig_get_next_state(struct vfio_device *device,
			    enum vfio_device_mig_state cur_fsm,
			    enum vfio_device_mig_state new_fsm,
			    enum vfio_device_mig_state *next_fsm)
{
	enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_PRE_COPY_P2P + 1 };
	/*
	 * The coding in this table requires the driver to implement the
	 * following FSM arcs:
	 *         RESUMING -> STOP
	 *         STOP -> RESUMING
	 *         STOP -> STOP_COPY
	 *         STOP_COPY -> STOP
	 *
	 * If P2P is supported then the driver must also implement these FSM
	 * arcs:
	 *         RUNNING -> RUNNING_P2P
	 *         RUNNING_P2P -> RUNNING
	 *         RUNNING_P2P -> STOP
	 *         STOP -> RUNNING_P2P
	 *
	 * If precopy is supported then the driver must support these additional
	 * FSM arcs:
	 *         RUNNING -> PRE_COPY
	 *         PRE_COPY -> RUNNING
	 *         PRE_COPY -> STOP_COPY
	 * However, if precopy and P2P are supported together then the driver
	 * must support these additional arcs beyond the P2P arcs above:
	 *         PRE_COPY -> RUNNING
	 *         PRE_COPY -> PRE_COPY_P2P
	 *         PRE_COPY_P2P -> PRE_COPY
	 *         PRE_COPY_P2P -> RUNNING_P2P
	 *         PRE_COPY_P2P -> STOP_COPY
	 *         RUNNING -> PRE_COPY
	 *         RUNNING_P2P -> PRE_COPY_P2P
	 *
	 * Without P2P and precopy the driver must implement:
	 *         RUNNING -> STOP
	 *         STOP -> RUNNING
	 *
	 * The coding will step through multiple states for some combination
	 * transitions; if all optional features are supported, this means the
	 * following ones:
	 *         PRE_COPY -> PRE_COPY_P2P -> STOP_COPY
	 *         PRE_COPY -> RUNNING -> RUNNING_P2P
	 *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP
	 *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP -> RESUMING
	 *         PRE_COPY_P2P -> RUNNING_P2P -> RUNNING
	 *         PRE_COPY_P2P -> RUNNING_P2P -> STOP
	 *         PRE_COPY_P2P -> RUNNING_P2P -> STOP -> RESUMING
	 *         RESUMING -> STOP -> RUNNING_P2P
	 *         RESUMING -> STOP -> RUNNING_P2P -> PRE_COPY_P2P
	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING
	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
	 *         RESUMING -> STOP -> STOP_COPY
	 *         RUNNING -> RUNNING_P2P -> PRE_COPY_P2P
	 *         RUNNING -> RUNNING_P2P -> STOP
	 *         RUNNING -> RUNNING_P2P -> STOP -> RESUMING
	 *         RUNNING -> RUNNING_P2P -> STOP -> STOP_COPY
	 *         RUNNING_P2P -> RUNNING -> PRE_COPY
	 *         RUNNING_P2P -> STOP -> RESUMING
	 *         RUNNING_P2P -> STOP -> STOP_COPY
	 *         STOP -> RUNNING_P2P -> PRE_COPY_P2P
	 *         STOP -> RUNNING_P2P -> RUNNING
	 *         STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
	 *         STOP_COPY -> STOP -> RESUMING
	 *         STOP_COPY -> STOP -> RUNNING_P2P
	 *         STOP_COPY -> STOP -> RUNNING_P2P -> RUNNING
	 *
	 * The following transitions are blocked:
	 *         STOP_COPY -> PRE_COPY
	 *         STOP_COPY -> PRE_COPY_P2P
	 */
	static const u8 vfio_from_fsm_table[VFIO_DEVICE_NUM_STATES][VFIO_DEVICE_NUM_STATES] = {
		[VFIO_DEVICE_STATE_STOP] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_RUNNING] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_PRE_COPY] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_PRE_COPY_P2P] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_STOP_COPY] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_RESUMING] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_RUNNING_P2P] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_ERROR] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
	};

	/* Migration feature flags a device must advertise to use each state. */
	static const unsigned int state_flags_table[VFIO_DEVICE_NUM_STATES] = {
		[VFIO_DEVICE_STATE_STOP] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_RUNNING] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_PRE_COPY] =
			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY,
		[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_MIGRATION_STOP_COPY |
						   VFIO_MIGRATION_P2P |
						   VFIO_MIGRATION_PRE_COPY,
		[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_RESUMING] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_RUNNING_P2P] =
			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P,
		[VFIO_DEVICE_STATE_ERROR] = ~0U,
	};

	/* cur_fsm coming from the driver must already be valid/supported. */
	if (WARN_ON(cur_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
		    (state_flags_table[cur_fsm] & device->migration_flags) !=
			    state_flags_table[cur_fsm]))
		return -EINVAL;

	/* new_fsm comes from userspace, so reject it without a WARN. */
	if (new_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
	    (state_flags_table[new_fsm] & device->migration_flags) !=
		    state_flags_table[new_fsm])
		return -EINVAL;

	/*
	 * Arcs touching optional and unsupported states are skipped over. The
	 * driver will instead see an arc from the original state to the next
	 * logical state, as per the above comment.
	 */
	*next_fsm = vfio_from_fsm_table[cur_fsm][new_fsm];
	while ((state_flags_table[*next_fsm] & device->migration_flags) !=
			state_flags_table[*next_fsm])
		*next_fsm = vfio_from_fsm_table[*next_fsm][new_fsm];

	return (*next_fsm != VFIO_DEVICE_STATE_ERROR) ? 0 : -EINVAL;
}
EXPORT_SYMBOL_GPL(vfio_mig_get_next_state);

/*
 * Convert the drivers's struct file into a FD number and return it to userspace
 */
static int vfio_ioct_mig_return_fd(struct file *filp, void __user *arg,
				   struct vfio_device_feature_mig_state *mig)
{
	int ret;
	int fd;

	fd = get_unused_fd_flags(O_CLOEXEC);
	if (fd < 0) {
		ret = fd;
		goto out_fput;
	}

	mig->data_fd = fd;
	if (copy_to_user(arg, mig, sizeof(*mig))) {
		ret = -EFAULT;
		goto out_put_unused;
	}
	/* Only publish the fd once userspace has seen its number. */
	fd_install(fd, filp);
	return 0;

out_put_unused:
	put_unused_fd(fd);
out_fput:
	/* On any failure this function consumes the filp reference. */
	fput(filp);
	return ret;
}

/*
 * Handle VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE: GET reports the current
 * migration state, SET asks the driver to transition and may hand back a
 * data fd for the migration stream.
 */
static int
vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device,
					   u32 flags, void __user *arg,
					   size_t argsz)
{
	size_t minsz =
		offsetofend(struct vfio_device_feature_mig_state, data_fd);
	struct vfio_device_feature_mig_state mig;
	struct file *filp = NULL;
	int ret;

	if (!device->mig_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_SET |
				 VFIO_DEVICE_FEATURE_GET,
				 sizeof(mig));
	if (ret != 1)
		return ret;

	if (copy_from_user(&mig, arg, minsz))
		return -EFAULT;

	if (flags & VFIO_DEVICE_FEATURE_GET) {
		enum vfio_device_mig_state curr_state;

		ret = device->mig_ops->migration_get_state(device,
							   &curr_state);
		if (ret)
			return ret;
		mig.device_state = curr_state;
		goto out_copy;
	}

	/* Handle the VFIO_DEVICE_FEATURE_SET */
	filp = device->mig_ops->migration_set_state(device, mig.device_state);
	/* A NULL filp means the transition produced no data stream. */
	if (IS_ERR(filp) || !filp)
		goto out_copy;

	return vfio_ioct_mig_return_fd(filp, arg, &mig);
out_copy:
	/* No data fd to hand out on this path. */
	mig.data_fd = -1;
	if (copy_to_user(arg, &mig, sizeof(mig)))
		return -EFAULT;
	if (IS_ERR(filp))
		return PTR_ERR(filp);
	return 0;
}

/*
 * Handle VFIO_DEVICE_FEATURE_MIG_DATA_SIZE: report the driver's estimated
 * stop-copy data size to userspace.
 */
static int
vfio_ioctl_device_feature_migration_data_size(struct vfio_device *device,
					      u32 flags, void __user *arg,
					      size_t argsz)
{
	struct vfio_device_feature_mig_data_size data_size = {};
	unsigned long stop_copy_length;
	int ret;

	if (!device->mig_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
				 sizeof(data_size));
	if (ret != 1)
		return ret;

	ret = device->mig_ops->migration_get_data_size(device, &stop_copy_length);
	if (ret)
		return ret;

	data_size.stop_copy_length = stop_copy_length;
	if (copy_to_user(arg, &data_size, sizeof(data_size)))
		return -EFAULT;

	return 0;
}

/*
 * SET-only feature that opts this device into v2 precopy info reporting.
 * Requires the device to advertise VFIO_MIGRATION_PRE_COPY; the flag is
 * cleared again on last close (see vfio_df_device_last_close()).
 */
static int
vfio_ioctl_device_feature_migration_precopy_info_v2(struct vfio_device *device,
						    u32 flags, size_t argsz)
{
	int ret;

	if (!(device->migration_flags & VFIO_MIGRATION_PRE_COPY))
		return -EINVAL;

	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET, 0);
	if (ret != 1)
		return ret;

	device->precopy_info_v2 = 1;
	return 0;
}

/*
 * Handle VFIO_DEVICE_FEATURE_MIGRATION: report the device's supported
 * migration feature flags.
 */
static int vfio_ioctl_device_feature_migration(struct vfio_device *device,
					       u32 flags, void __user *arg,
					       size_t argsz)
{
	struct vfio_device_feature_migration mig = {
		.flags = device->migration_flags,
	};
	int ret;

	if (!device->mig_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
				 sizeof(mig));
	if (ret != 1)
		return ret;
	if (copy_to_user(arg, &mig, sizeof(mig)))
		return -EFAULT;
	return 0;
}

/*
 * Reduce an interval tree of @cur_nodes IOVA ranges down to at most
 * @req_nodes ranges by repeatedly merging the pair of adjacent ranges
 * separated by the smallest gap.  The merged range absorbs the gap.
 */
void vfio_combine_iova_ranges(struct rb_root_cached *root, u32 cur_nodes,
			      u32 req_nodes)
{
	struct interval_tree_node *prev, *curr, *comb_start, *comb_end;
	unsigned long min_gap, curr_gap;

	/* Special shortcut when a single range is required */
	if (req_nodes == 1) {
		unsigned long last;

		comb_start = interval_tree_iter_first(root, 0, ULONG_MAX);

		/* Empty list */
		if (WARN_ON_ONCE(!comb_start))
			return;

		/* Keep only the first node, stretched to cover the last. */
		curr = comb_start;
		while (curr) {
			last = curr->last;
			prev = curr;
			curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
			if (prev != comb_start)
				interval_tree_remove(prev, root);
		}
		comb_start->last = last;
		return;
	}

	/* Combine ranges which have the smallest gap */
	while (cur_nodes > req_nodes) {
		prev = NULL;
		min_gap = ULONG_MAX;
		curr = interval_tree_iter_first(root, 0, ULONG_MAX);
		while (curr) {
			if (prev) {
				curr_gap = curr->start - prev->last;
				if (curr_gap < min_gap) {
					min_gap = curr_gap;
					comb_start = prev;
					comb_end = curr;
				}
			}
			prev = curr;
			curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
		}

		/* Empty list or no nodes to combine */
		if (WARN_ON_ONCE(min_gap == ULONG_MAX))
			break;

		comb_start->last = comb_end->last;
		interval_tree_remove(comb_end, root);
		cur_nodes--;
	}
}
EXPORT_SYMBOL_GPL(vfio_combine_iova_ranges);

/* Ranges should fit into a single kernel page */
#define LOG_MAX_RANGES \
	(PAGE_SIZE / sizeof(struct vfio_device_feature_dma_logging_range))

static int
vfio_ioctl_device_feature_logging_start(struct vfio_device *device, 1069 u32 flags, void __user *arg, 1070 size_t argsz) 1071 { 1072 size_t minsz = 1073 offsetofend(struct vfio_device_feature_dma_logging_control, 1074 ranges); 1075 struct vfio_device_feature_dma_logging_range __user *ranges; 1076 struct vfio_device_feature_dma_logging_control control; 1077 struct vfio_device_feature_dma_logging_range range; 1078 struct rb_root_cached root = RB_ROOT_CACHED; 1079 struct interval_tree_node *nodes; 1080 u64 iova_end; 1081 u32 nnodes; 1082 int i, ret; 1083 1084 if (!device->log_ops) 1085 return -ENOTTY; 1086 1087 ret = vfio_check_feature(flags, argsz, 1088 VFIO_DEVICE_FEATURE_SET, 1089 sizeof(control)); 1090 if (ret != 1) 1091 return ret; 1092 1093 if (copy_from_user(&control, arg, minsz)) 1094 return -EFAULT; 1095 1096 nnodes = control.num_ranges; 1097 if (!nnodes) 1098 return -EINVAL; 1099 1100 if (nnodes > LOG_MAX_RANGES) 1101 return -E2BIG; 1102 1103 ranges = u64_to_user_ptr(control.ranges); 1104 nodes = kmalloc_objs(struct interval_tree_node, nnodes); 1105 if (!nodes) 1106 return -ENOMEM; 1107 1108 for (i = 0; i < nnodes; i++) { 1109 if (copy_from_user(&range, &ranges[i], sizeof(range))) { 1110 ret = -EFAULT; 1111 goto end; 1112 } 1113 if (!IS_ALIGNED(range.iova, control.page_size) || 1114 !IS_ALIGNED(range.length, control.page_size)) { 1115 ret = -EINVAL; 1116 goto end; 1117 } 1118 1119 if (check_add_overflow(range.iova, range.length, &iova_end) || 1120 iova_end > ULONG_MAX) { 1121 ret = -EOVERFLOW; 1122 goto end; 1123 } 1124 1125 nodes[i].start = range.iova; 1126 nodes[i].last = range.iova + range.length - 1; 1127 if (interval_tree_iter_first(&root, nodes[i].start, 1128 nodes[i].last)) { 1129 /* Range overlapping */ 1130 ret = -EINVAL; 1131 goto end; 1132 } 1133 interval_tree_insert(nodes + i, &root); 1134 } 1135 1136 ret = device->log_ops->log_start(device, &root, nnodes, 1137 &control.page_size); 1138 if (ret) 1139 goto end; 1140 1141 if (copy_to_user(arg, 
			 &control, sizeof(control))) {
		ret = -EFAULT;
		/* Undo the successful log_start before failing the ioctl */
		device->log_ops->log_stop(device);
	}

end:
	kfree(nodes);
	return ret;
}

/*
 * VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP: no payload (feature size 0), just
 * forward to the driver's log_stop op.
 */
static int
vfio_ioctl_device_feature_logging_stop(struct vfio_device *device,
				       u32 flags, void __user *arg,
				       size_t argsz)
{
	int ret;

	if (!device->log_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_SET, 0);
	if (ret != 1)
		return ret;

	return device->log_ops->log_stop(device);
}

/*
 * iova_bitmap_for_each() callback: forward one IOVA chunk to the driver's
 * log_read_and_clear op, filling the iteration bitmap.
 */
static int vfio_device_log_read_and_clear(struct iova_bitmap *iter,
					  unsigned long iova, size_t length,
					  void *opaque)
{
	struct vfio_device *device = opaque;

	return device->log_ops->log_read_and_clear(device, iova, length, iter);
}

/*
 * VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT: read-and-clear the dirty state of
 * the requested IOVA window into a user-supplied bitmap.
 */
static int
vfio_ioctl_device_feature_logging_report(struct vfio_device *device,
					 u32 flags, void __user *arg,
					 size_t argsz)
{
	size_t minsz =
		offsetofend(struct vfio_device_feature_dma_logging_report,
			    bitmap);
	struct vfio_device_feature_dma_logging_report report;
	struct iova_bitmap *iter;
	u64 iova_end;
	int ret;

	if (!device->log_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_GET,
				 sizeof(report));
	if (ret != 1)
		return ret;

	if (copy_from_user(&report, arg, minsz))
		return -EFAULT;

	/* Bitmap granularity: at least 4K and a power of two */
	if (report.page_size < SZ_4K || !is_power_of_2(report.page_size))
		return -EINVAL;

	/* Reject u64 wrap and (on 32-bit) windows beyond unsigned long */
	if (check_add_overflow(report.iova, report.length, &iova_end) ||
	    iova_end > ULONG_MAX)
		return -EOVERFLOW;

	iter = iova_bitmap_alloc(report.iova, report.length,
				 report.page_size,
				 u64_to_user_ptr(report.bitmap));
	if (IS_ERR(iter))
		return PTR_ERR(iter);

	ret = iova_bitmap_for_each(iter, device,
				   vfio_device_log_read_and_clear);

	iova_bitmap_free(iter);
	return
ret;
}

/*
 * VFIO_DEVICE_FEATURE ioctl: validate the common header, then dispatch on
 * the feature id.  Unknown features fall through to the driver's
 * device_feature op.  Most handlers receive the payload pointer
 * (arg->data) and the payload size (argsz - minsz).
 */
static int vfio_ioctl_device_feature(struct vfio_device *device,
				     struct vfio_device_feature __user *arg)
{
	size_t minsz = offsetofend(struct vfio_device_feature, flags);
	struct vfio_device_feature feature;

	if (copy_from_user(&feature, arg, minsz))
		return -EFAULT;

	if (feature.argsz < minsz)
		return -EINVAL;

	/* Check unknown flags */
	if (feature.flags &
	    ~(VFIO_DEVICE_FEATURE_MASK | VFIO_DEVICE_FEATURE_SET |
	      VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_PROBE))
		return -EINVAL;

	/* GET & SET are mutually exclusive except with PROBE */
	if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) &&
	    (feature.flags & VFIO_DEVICE_FEATURE_SET) &&
	    (feature.flags & VFIO_DEVICE_FEATURE_GET))
		return -EINVAL;

	switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) {
	case VFIO_DEVICE_FEATURE_MIGRATION:
		return vfio_ioctl_device_feature_migration(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE:
		return vfio_ioctl_device_feature_mig_device_state(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_DMA_LOGGING_START:
		return vfio_ioctl_device_feature_logging_start(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP:
		return vfio_ioctl_device_feature_logging_stop(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT:
		return vfio_ioctl_device_feature_logging_report(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_MIG_DATA_SIZE:
		return vfio_ioctl_device_feature_migration_data_size(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_MIG_PRECOPY_INFOv2:
		return
vfio_ioctl_device_feature_migration_precopy_info_v2(
			/*
			 * NOTE(review): unlike the other feature cases, no
			 * arg->data pointer is passed here — presumably the
			 * v2 handler derives the user pointer itself; verify
			 * against its definition.
			 */
			device, feature.flags, feature.argsz - minsz);
	default:
		if (unlikely(!device->ops->device_feature))
			return -ENOTTY;
		return device->ops->device_feature(device, feature.flags,
						   arg->data,
						   feature.argsz - minsz);
	}
}

/*
 * VFIO_DEVICE_GET_REGION_INFO: let the driver fill in the region info and
 * an optional capability chain.  If the user buffer is too small for the
 * chain, only report the required argsz (cap_offset = 0); otherwise the
 * chain is copied immediately after the fixed-size struct (arg + 1).
 */
static long vfio_get_region_info(struct vfio_device *device,
				 struct vfio_region_info __user *arg)
{
	unsigned long minsz = offsetofend(struct vfio_region_info, offset);
	struct vfio_region_info info = {};
	struct vfio_info_cap caps = {};
	int ret;

	if (unlikely(!device->ops->get_region_info_caps))
		return -EINVAL;

	if (copy_from_user(&info, arg, minsz))
		return -EFAULT;
	if (info.argsz < minsz)
		return -EINVAL;

	ret = device->ops->get_region_info_caps(device, &info, &caps);
	if (ret)
		goto out_free;

	if (caps.size) {
		info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
		if (info.argsz < sizeof(info) + caps.size) {
			/* Too small: report the needed size, no chain copied */
			info.argsz = sizeof(info) + caps.size;
			info.cap_offset = 0;
		} else {
			/* Rebase chain offsets past the fixed struct, then copy */
			vfio_info_cap_shift(&caps, sizeof(info));
			if (copy_to_user(arg + 1, caps.buf, caps.size)) {
				ret = -EFAULT;
				goto out_free;
			}
			info.cap_offset = sizeof(info);
		}
	}

	if (copy_to_user(arg, &info, minsz)){
		ret = -EFAULT;
		goto out_free;
	}

out_free:
	/* caps.buf is freed on every path, including success */
	kfree(caps.buf);
	return ret;
}

static long vfio_device_fops_unl_ioctl(struct file *filep,
				       unsigned int cmd, unsigned long arg)
{
	struct vfio_device_file *df = filep->private_data;
	struct vfio_device *device = df->device;
	void __user *uptr = (void __user *)arg;
	int ret;

	/* BIND_IOMMUFD is the only ioctl allowed before access is granted */
	if (cmd == VFIO_DEVICE_BIND_IOMMUFD)
		return vfio_df_ioctl_bind_iommufd(df, uptr);

	/* Paired with smp_store_release() following vfio_df_open() */
	if (!smp_load_acquire(&df->access_granted))
		return -EINVAL;
	/* Keep the device resumed for the duration of the ioctl */
	ret = vfio_device_pm_runtime_get(device);
	if (ret)
		return ret;

	/* cdev only ioctls; anything unmatched falls through to the switch below */
	if (IS_ENABLED(CONFIG_VFIO_DEVICE_CDEV) && !df->group) {
		switch (cmd) {
		case VFIO_DEVICE_ATTACH_IOMMUFD_PT:
			ret = vfio_df_ioctl_attach_pt(df, uptr);
			goto out;

		case VFIO_DEVICE_DETACH_IOMMUFD_PT:
			ret = vfio_df_ioctl_detach_pt(df, uptr);
			goto out;
		}
	}

	switch (cmd) {
	case VFIO_DEVICE_FEATURE:
		ret = vfio_ioctl_device_feature(device, uptr);
		break;

	case VFIO_DEVICE_GET_REGION_INFO:
		ret = vfio_get_region_info(device, uptr);
		break;

	default:
		/* Everything else is handled by the device driver, if any */
		if (unlikely(!device->ops->ioctl))
			ret = -EINVAL;
		else
			ret = device->ops->ioctl(device, cmd, arg);
		break;
	}
out:
	vfio_device_pm_runtime_put(device);
	return ret;
}

/* read(2) on a vfio device fd: gated on open access, forwarded to the driver */
static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
				     size_t count, loff_t *ppos)
{
	struct vfio_device_file *df = filep->private_data;
	struct vfio_device *device = df->device;

	/* Paired with smp_store_release() following vfio_df_open() */
	if (!smp_load_acquire(&df->access_granted))
		return -EINVAL;

	if (unlikely(!device->ops->read))
		return -EINVAL;

	return device->ops->read(device, buf, count, ppos);
}

/* write(2) on a vfio device fd: gated on open access, forwarded to the driver */
static ssize_t vfio_device_fops_write(struct file *filep,
				      const char __user *buf,
				      size_t count, loff_t *ppos)
{
	struct vfio_device_file *df = filep->private_data;
	struct vfio_device *device = df->device;

	/* Paired with smp_store_release() following vfio_df_open() */
	if (!smp_load_acquire(&df->access_granted))
		return -EINVAL;

	if (unlikely(!device->ops->write))
		return -EINVAL;

	return device->ops->write(device, buf, count, ppos);
}

static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
{
	struct vfio_device_file *df = filep->private_data;
	struct vfio_device *device = df->device;

	/* Paired with smp_store_release() following vfio_df_open() */
	if (!smp_load_acquire(&df->access_granted))
		return -EINVAL;

	if (unlikely(!device->ops->mmap))
		return -EINVAL;

	return device->ops->mmap(device, vma);
}

#ifdef CONFIG_PROC_FS
/* .show_fdinfo hook: expose the device's sysfs path for this fd */
static void vfio_device_show_fdinfo(struct seq_file *m, struct file *filep)
{
	char *path;
	struct vfio_device_file *df = filep->private_data;
	struct vfio_device *device = df->device;

	path = kobject_get_path(&device->dev->kobj, GFP_KERNEL);
	if (!path)
		return;

	seq_printf(m, "vfio-device-syspath: /sys%s\n", path);
	kfree(path);
}
#endif

const struct file_operations vfio_device_fops = {
	.owner = THIS_MODULE,
	.open = vfio_device_fops_cdev_open,
	.release = vfio_device_fops_release,
	.read = vfio_device_fops_read,
	.write = vfio_device_fops_write,
	.unlocked_ioctl = vfio_device_fops_unl_ioctl,
	.compat_ioctl = compat_ptr_ioctl,
	.mmap = vfio_device_fops_mmap,
#ifdef CONFIG_PROC_FS
	.show_fdinfo = vfio_device_show_fdinfo,
#endif
};

/*
 * Return the vfio_device behind @file, or NULL if the file is not a vfio
 * device file.  private_data is read before the f_op identity check but
 * only dereferenced after it confirms the file is ours.
 */
static struct vfio_device *vfio_device_from_file(struct file *file)
{
	struct vfio_device_file *df = file->private_data;

	if (file->f_op != &vfio_device_fops)
		return NULL;
	return df->device;
}

/**
 * vfio_file_is_valid - True if the file is valid vfio file
 * @file: VFIO group file or VFIO device file
 */
bool vfio_file_is_valid(struct file *file)
{
	return vfio_group_from_file(file) ||
	       vfio_device_from_file(file);
}
EXPORT_SYMBOL_GPL(vfio_file_is_valid);

/**
 * vfio_file_enforced_coherent - True if the DMA associated with the VFIO file
 *	is always CPU cache coherent
 * @file: VFIO group file or VFIO device file
1484 * 1485 * Enforced coherency means that the IOMMU ignores things like the PCIe no-snoop 1486 * bit in DMA transactions. A return of false indicates that the user has 1487 * rights to access additional instructions such as wbinvd on x86. 1488 */ 1489 bool vfio_file_enforced_coherent(struct file *file) 1490 { 1491 struct vfio_device *device; 1492 struct vfio_group *group; 1493 1494 group = vfio_group_from_file(file); 1495 if (group) 1496 return vfio_group_enforced_coherent(group); 1497 1498 device = vfio_device_from_file(file); 1499 if (device) 1500 return device_iommu_capable(device->dev, 1501 IOMMU_CAP_ENFORCE_CACHE_COHERENCY); 1502 1503 return true; 1504 } 1505 EXPORT_SYMBOL_GPL(vfio_file_enforced_coherent); 1506 1507 static void vfio_device_file_set_kvm(struct file *file, struct kvm *kvm) 1508 { 1509 struct vfio_device_file *df = file->private_data; 1510 1511 /* 1512 * The kvm is first recorded in the vfio_device_file, and will 1513 * be propagated to vfio_device::kvm when the file is bound to 1514 * iommufd successfully in the vfio device cdev path. 1515 */ 1516 spin_lock(&df->kvm_ref_lock); 1517 df->kvm = kvm; 1518 spin_unlock(&df->kvm_ref_lock); 1519 } 1520 1521 /** 1522 * vfio_file_set_kvm - Link a kvm with VFIO drivers 1523 * @file: VFIO group file or VFIO device file 1524 * @kvm: KVM to link 1525 * 1526 * When a VFIO device is first opened the KVM will be available in 1527 * device->kvm if one was associated with the file. 
 */
void vfio_file_set_kvm(struct file *file, struct kvm *kvm)
{
	struct vfio_group *group;

	group = vfio_group_from_file(file);
	if (group)
		vfio_group_set_kvm(group, kvm);

	/* at most one of the two checks matches for a given file */
	if (vfio_device_from_file(file))
		vfio_device_file_set_kvm(file, kvm);
}
EXPORT_SYMBOL_GPL(vfio_file_set_kvm);

/*
 * Sub-module support
 */
/*
 * Helper for managing a buffer of info chain capabilities, allocate or
 * reallocate a buffer with additional @size, filling in @id and @version
 * of the capability. A pointer to the new capability is returned.
 *
 * NB. The chain is based at the head of the buffer, so new entries are
 * added to the tail, vfio_info_cap_shift() should be called to fixup the
 * next offsets prior to copying to the user buffer.
 */
struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
					       size_t size, u16 id, u16 version)
{
	void *buf;
	struct vfio_info_cap_header *header, *tmp;

	/* Ensure that the next capability struct will be aligned */
	size = ALIGN(size, sizeof(u64));

	buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
	if (!buf) {
		/* On failure the whole chain is dropped, not just this entry */
		kfree(caps->buf);
		caps->buf = NULL;
		caps->size = 0;
		return ERR_PTR(-ENOMEM);
	}

	caps->buf = buf;
	header = buf + caps->size;

	/* Eventually copied to user buffer, zero */
	memset(header, 0, size);

	header->id = id;
	header->version = version;

	/* Add to the end of the capability chain */
	for (tmp = buf; tmp->next; tmp = buf + tmp->next)
		; /* nothing */

	tmp->next = caps->size;
	caps->size += size;

	return header;
}
EXPORT_SYMBOL_GPL(vfio_info_cap_add);

void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
{
	struct vfio_info_cap_header *tmp;
	void *buf = (void *)caps->buf;

	/* Capability structs
 * should start with proper alignment */
	WARN_ON(!IS_ALIGNED(offset, sizeof(u64)));

	/*
	 * Each ->next is rebased by @offset for the user's view; subtract it
	 * back when walking so we stay within the in-kernel buffer.
	 */
	for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
		tmp->next += offset;
}
EXPORT_SYMBOL(vfio_info_cap_shift);

/* Append a caller-built capability (header + payload) to the chain */
int vfio_info_add_capability(struct vfio_info_cap *caps,
			     struct vfio_info_cap_header *cap, size_t size)
{
	struct vfio_info_cap_header *header;

	header = vfio_info_cap_add(caps, size, cap->id, cap->version);
	if (IS_ERR(header))
		return PTR_ERR(header);

	/* Copy only the payload; id/version were set by vfio_info_cap_add() */
	memcpy(header + 1, cap + 1, size - sizeof(*header));

	return 0;
}
EXPORT_SYMBOL(vfio_info_add_capability);

/*
 * Validate a VFIO_DEVICE_SET_IRQS header against the device's IRQ counts
 * and, when the data type carries a payload, report the expected payload
 * size through @data_size.
 */
int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
				       int max_irq_type, size_t *data_size)
{
	unsigned long minsz;
	size_t size;

	minsz = offsetofend(struct vfio_irq_set, count);

	/* count >= U32_MAX - start also guards start + count against wrap */
	if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
	    (hdr->count >= (U32_MAX - hdr->start)) ||
	    (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
			    VFIO_IRQ_SET_ACTION_TYPE_MASK)))
		return -EINVAL;

	if (data_size)
		*data_size = 0;

	if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
		return -EINVAL;

	/* Per-IRQ payload element size depends on the data type */
	switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
	case VFIO_IRQ_SET_DATA_NONE:
		size = 0;
		break;
	case VFIO_IRQ_SET_DATA_BOOL:
		size = sizeof(uint8_t);
		break;
	case VFIO_IRQ_SET_DATA_EVENTFD:
		size = sizeof(int32_t);
		break;
	default:
		return -EINVAL;
	}

	if (size) {
		if (hdr->argsz - minsz < hdr->count * size)
			return -EINVAL;

		/* A payload-carrying request requires a data_size out-param */
		if (!data_size)
			return -EINVAL;

		*data_size = hdr->count * size;
	}

	return 0;
}
EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);

/*
 * Pin contiguous user pages and return their associated host pages for local
 * domain only.
 * @device [in]  : device
 * @iova [in]    : starting IOVA of user pages to be pinned.
 * @npage [in]   : count of pages to be pinned.  This count should not
 *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
 * @prot [in]    : protection flags
 * @pages[out]   : array of host pages
 * Return error or number of pages pinned.
 *
 * A driver may only call this function if the vfio_device was created
 * by vfio_register_emulated_iommu_dev() due to vfio_device_container_pin_pages().
 */
int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova,
		   int npage, int prot, struct page **pages)
{
	/* group->container cannot change while a vfio device is open */
	if (!pages || !npage || WARN_ON(!vfio_assert_device_open(device)))
		return -EINVAL;
	/* Pinning requires an unpin path; dma_unmap serves as that marker */
	if (!device->ops->dma_unmap)
		return -EINVAL;
	if (vfio_device_has_container(device))
		return vfio_device_container_pin_pages(device, iova,
						       npage, prot, pages);
	if (device->iommufd_access) {
		int ret;

		/* iommufd access works in unsigned long IOVA space */
		if (iova > ULONG_MAX)
			return -EINVAL;
		/*
		 * VFIO ignores the sub page offset, npages is from the start of
		 * a PAGE_SIZE chunk of IOVA. The caller is expected to recover
		 * the sub page offset by doing:
		 * pages[0] + (iova % PAGE_SIZE)
		 */
		ret = iommufd_access_pin_pages(
			device->iommufd_access, ALIGN_DOWN(iova, PAGE_SIZE),
			npage * PAGE_SIZE, pages,
			(prot & IOMMU_WRITE) ? IOMMUFD_ACCESS_RW_WRITE : 0);
		if (ret)
			return ret;
		return npage;
	}
	/* Neither container nor iommufd backing: nothing can pin */
	return -EINVAL;
}
EXPORT_SYMBOL(vfio_pin_pages);

/*
 * Unpin contiguous host pages for local domain only.
 * @device [in]  : device
 * @iova [in]    : starting address of user pages to be unpinned.
 * @npage [in]   : count of pages to be unpinned.  This count should not
 *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1721 */ 1722 void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage) 1723 { 1724 if (WARN_ON(!vfio_assert_device_open(device))) 1725 return; 1726 if (WARN_ON(!device->ops->dma_unmap)) 1727 return; 1728 1729 if (vfio_device_has_container(device)) { 1730 vfio_device_container_unpin_pages(device, iova, npage); 1731 return; 1732 } 1733 if (device->iommufd_access) { 1734 if (WARN_ON(iova > ULONG_MAX)) 1735 return; 1736 iommufd_access_unpin_pages(device->iommufd_access, 1737 ALIGN_DOWN(iova, PAGE_SIZE), 1738 npage * PAGE_SIZE); 1739 return; 1740 } 1741 } 1742 EXPORT_SYMBOL(vfio_unpin_pages); 1743 1744 /* 1745 * This interface allows the CPUs to perform some sort of virtual DMA on 1746 * behalf of the device. 1747 * 1748 * CPUs read/write from/into a range of IOVAs pointing to user space memory 1749 * into/from a kernel buffer. 1750 * 1751 * As the read/write of user space memory is conducted via the CPUs and is 1752 * not a real device DMA, it is not necessary to pin the user space memory. 1753 * 1754 * @device [in] : VFIO device 1755 * @iova [in] : base IOVA of a user space buffer 1756 * @data [in] : pointer to kernel buffer 1757 * @len [in] : kernel buffer length 1758 * @write : indicate read or write 1759 * Return error code on failure or 0 on success. 
 */
int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data,
		size_t len, bool write)
{
	/* len is size_t, so "<= 0" only rejects a zero length here */
	if (!data || len <= 0 || !vfio_assert_device_open(device))
		return -EINVAL;

	if (vfio_device_has_container(device))
		return vfio_device_container_dma_rw(device, iova,
						    data, len, write);

	if (device->iommufd_access) {
		unsigned int flags = 0;

		/* iommufd access works in unsigned long IOVA space */
		if (iova > ULONG_MAX)
			return -EINVAL;

		/* VFIO historically tries to auto-detect a kthread */
		if (!current->mm)
			flags |= IOMMUFD_ACCESS_RW_KTHREAD;
		if (write)
			flags |= IOMMUFD_ACCESS_RW_WRITE;
		return iommufd_access_rw(device->iommufd_access, iova, data,
					 len, flags);
	}
	/* Neither container nor iommufd backing: nothing to access */
	return -EINVAL;
}
EXPORT_SYMBOL(vfio_dma_rw);

/*
 * Module/class support
 */
static int __init vfio_init(void)
{
	int ret;

	ida_init(&vfio.device_ida);

	ret = vfio_group_init();
	if (ret)
		return ret;

	ret = vfio_virqfd_init();
	if (ret)
		goto err_virqfd;

	/* /sys/class/vfio-dev/vfioX */
	vfio.device_class = class_create("vfio-dev");
	if (IS_ERR(vfio.device_class)) {
		ret = PTR_ERR(vfio.device_class);
		goto err_dev_class;
	}

	ret = vfio_cdev_init(vfio.device_class);
	if (ret)
		goto err_alloc_dev_chrdev;

	vfio_debugfs_create_root();
	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
	return 0;

	/* Unwind in reverse order of initialization */
err_alloc_dev_chrdev:
	class_destroy(vfio.device_class);
	vfio.device_class = NULL;
err_dev_class:
	vfio_virqfd_exit();
err_virqfd:
	vfio_group_cleanup();
	return ret;
}

static void __exit vfio_cleanup(void)
{
	vfio_debugfs_remove_root();
	ida_destroy(&vfio.device_ida);
	vfio_cdev_cleanup();
	class_destroy(vfio.device_class);
	vfio.device_class = NULL;
	vfio_virqfd_exit();
	vfio_group_cleanup();
	/* Tear down the global device-set xarray last */
	xa_destroy(&vfio_device_set_xa);
}

module_init(vfio_init);
module_exit(vfio_cleanup);

MODULE_IMPORT_NS("IOMMUFD");
MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);
MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");