// SPDX-License-Identifier: GPL-2.0-only
/*
 * VFIO core
 *
 * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * Derived from original vfio:
 * Copyright 2010 Cisco Systems, Inc. All rights reserved.
 * Author: Tom Lyon, pugs@cisco.com
 */

#include <linux/cdev.h>
#include <linux/compat.h>
#include <linux/device.h>
#include <linux/fs.h>
#include <linux/idr.h>
#include <linux/iommu.h>
#if IS_ENABLED(CONFIG_KVM)
#include <linux/kvm_host.h>
#endif
#include <linux/list.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/mount.h>
#include <linux/mutex.h>
#include <linux/pci.h>
#include <linux/pseudo_fs.h>
#include <linux/rwsem.h>
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/wait.h>
#include <linux/sched/signal.h>
#include <linux/pm_runtime.h>
#include <linux/interval_tree.h>
#include <linux/iova_bitmap.h>
#include <linux/iommufd.h>
#include "vfio.h"

#define DRIVER_VERSION	"0.3"
#define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC	"VFIO - User Level meta-driver"

#define VFIO_MAGIC 0x5646494f /* "VFIO" */

static struct vfio {
	struct class			*device_class;
	struct ida			device_ida;
	struct vfsmount			*vfs_mount;
	int				fs_count;
} vfio;

#ifdef CONFIG_VFIO_NOIOMMU
bool vfio_noiommu __read_mostly;
module_param_named(enable_unsafe_noiommu_mode,
		   vfio_noiommu, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode. This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel. If you do not know what this is for, step away. (default: false)");
#endif
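
/*
 * Device set usage, a minimal sketch (hypothetical driver): drivers whose
 * devices share state, e.g. a multi-function card with a single reset,
 * group them by calling vfio_assign_device_set() from their ->init()
 * callback with any stable, shared token as @set_id:
 *
 *	static int my_vfio_init(struct vfio_device *vdev)
 *	{
 *		struct pci_dev *pdev = to_pci_dev(vdev->dev);
 *
 *		// All functions in one slot land in the same dev_set; the
 *		// choice of token (slot pointer, etc.) is driver policy.
 *		return vfio_assign_device_set(vdev, pdev->slot);
 *	}
 *
 * Devices that never name a set get a singleton set in
 * __vfio_register_dev() below.
 */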
(default: false)"); 63 #endif 64 65 static DEFINE_XARRAY(vfio_device_set_xa); 66 67 int vfio_assign_device_set(struct vfio_device *device, void *set_id) 68 { 69 unsigned long idx = (unsigned long)set_id; 70 struct vfio_device_set *new_dev_set; 71 struct vfio_device_set *dev_set; 72 73 if (WARN_ON(!set_id)) 74 return -EINVAL; 75 76 /* 77 * Atomically acquire a singleton object in the xarray for this set_id 78 */ 79 xa_lock(&vfio_device_set_xa); 80 dev_set = xa_load(&vfio_device_set_xa, idx); 81 if (dev_set) 82 goto found_get_ref; 83 xa_unlock(&vfio_device_set_xa); 84 85 new_dev_set = kzalloc_obj(*new_dev_set); 86 if (!new_dev_set) 87 return -ENOMEM; 88 mutex_init(&new_dev_set->lock); 89 INIT_LIST_HEAD(&new_dev_set->device_list); 90 new_dev_set->set_id = set_id; 91 92 xa_lock(&vfio_device_set_xa); 93 dev_set = __xa_cmpxchg(&vfio_device_set_xa, idx, NULL, new_dev_set, 94 GFP_KERNEL); 95 if (!dev_set) { 96 dev_set = new_dev_set; 97 goto found_get_ref; 98 } 99 100 kfree(new_dev_set); 101 if (xa_is_err(dev_set)) { 102 xa_unlock(&vfio_device_set_xa); 103 return xa_err(dev_set); 104 } 105 106 found_get_ref: 107 dev_set->device_count++; 108 xa_unlock(&vfio_device_set_xa); 109 mutex_lock(&dev_set->lock); 110 device->dev_set = dev_set; 111 list_add_tail(&device->dev_set_list, &dev_set->device_list); 112 mutex_unlock(&dev_set->lock); 113 return 0; 114 } 115 EXPORT_SYMBOL_GPL(vfio_assign_device_set); 116 117 static void vfio_release_device_set(struct vfio_device *device) 118 { 119 struct vfio_device_set *dev_set = device->dev_set; 120 121 if (!dev_set) 122 return; 123 124 mutex_lock(&dev_set->lock); 125 list_del(&device->dev_set_list); 126 mutex_unlock(&dev_set->lock); 127 128 xa_lock(&vfio_device_set_xa); 129 if (!--dev_set->device_count) { 130 __xa_erase(&vfio_device_set_xa, 131 (unsigned long)dev_set->set_id); 132 mutex_destroy(&dev_set->lock); 133 kfree(dev_set); 134 } 135 xa_unlock(&vfio_device_set_xa); 136 } 137 138 unsigned int vfio_device_set_open_count(struct vfio_device_set *dev_set) 139 { 140 struct vfio_device *cur; 141 unsigned int open_count = 0; 142 143 lockdep_assert_held(&dev_set->lock); 144 145 list_for_each_entry(cur, &dev_set->device_list, dev_set_list) 146 open_count += cur->open_count; 147 return open_count; 148 } 149 EXPORT_SYMBOL_GPL(vfio_device_set_open_count); 150 151 struct vfio_device * 152 vfio_find_device_in_devset(struct vfio_device_set *dev_set, 153 struct device *dev) 154 { 155 struct vfio_device *cur; 156 157 lockdep_assert_held(&dev_set->lock); 158 159 list_for_each_entry(cur, &dev_set->device_list, dev_set_list) 160 if (cur->dev == dev) 161 return cur; 162 return NULL; 163 } 164 EXPORT_SYMBOL_GPL(vfio_find_device_in_devset); 165 166 /* 167 * Device objects - create, release, get, put, search 168 */ 169 /* Device reference always implies a group reference */ 170 void vfio_device_put_registration(struct vfio_device *device) 171 { 172 if (refcount_dec_and_test(&device->refcount)) 173 complete(&device->comp); 174 } 175 EXPORT_SYMBOL_GPL(vfio_device_put_registration); 176 177 bool vfio_device_try_get_registration(struct vfio_device *device) 178 { 179 return refcount_inc_not_zero(&device->refcount); 180 } 181 EXPORT_SYMBOL_GPL(vfio_device_try_get_registration); 182 183 /* 184 * VFIO driver API 185 */ 186 /* Release helper called by vfio_put_device() */ 187 static void vfio_device_release(struct device *dev) 188 { 189 struct vfio_device *device = 190 container_of(dev, struct vfio_device, device); 191 192 vfio_release_device_set(device); 193 ida_free(&vfio.device_ida, 

/*
 * Device objects - create, release, get, put, search
 */
/* Device reference always implies a group reference */
void vfio_device_put_registration(struct vfio_device *device)
{
	if (refcount_dec_and_test(&device->refcount))
		complete(&device->comp);
}
EXPORT_SYMBOL_GPL(vfio_device_put_registration);

bool vfio_device_try_get_registration(struct vfio_device *device)
{
	return refcount_inc_not_zero(&device->refcount);
}
EXPORT_SYMBOL_GPL(vfio_device_try_get_registration);

/*
 * VFIO driver API
 */
/* Release helper called by vfio_put_device() */
static void vfio_device_release(struct device *dev)
{
	struct vfio_device *device =
			container_of(dev, struct vfio_device, device);

	vfio_release_device_set(device);
	ida_free(&vfio.device_ida, device->index);

	if (device->ops->release)
		device->ops->release(device);

	iput(device->inode);
	simple_release_fs(&vfio.vfs_mount, &vfio.fs_count);
	kvfree(device);
}

static int vfio_init_device(struct vfio_device *device, struct device *dev,
			    const struct vfio_device_ops *ops);

/*
 * Allocate and initialize a vfio_device so it can be registered to vfio
 * core.
 *
 * Drivers should use the wrapper vfio_alloc_device() for allocation.
 * @size is the size of the structure to be allocated, including any
 * private data used by the driver.
 *
 * Drivers may provide an @init callback to cover device private data.
 *
 * Use vfio_put_device() to release the structure after a successful return.
 */
struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev,
				       const struct vfio_device_ops *ops)
{
	struct vfio_device *device;
	int ret;

	if (WARN_ON(size < sizeof(struct vfio_device)))
		return ERR_PTR(-EINVAL);

	device = kvzalloc(size, GFP_KERNEL);
	if (!device)
		return ERR_PTR(-ENOMEM);

	ret = vfio_init_device(device, dev, ops);
	if (ret)
		goto out_free;
	return device;

out_free:
	kvfree(device);
	return ERR_PTR(ret);
}
EXPORT_SYMBOL_GPL(_vfio_alloc_device);
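
/*
 * Allocation sketch (hypothetical driver): the usual pattern is to embed
 * struct vfio_device in a driver structure and let the vfio_alloc_device()
 * wrapper size and initialize it; the names below are illustrative only:
 *
 *	struct my_vfio_device {
 *		struct vfio_device vdev;
 *		void __iomem *bar0;
 *	};
 *
 *	struct my_vfio_device *my;
 *
 *	my = vfio_alloc_device(my_vfio_device, vdev, dev, &my_vfio_ops);
 *	if (IS_ERR(my))
 *		return PTR_ERR(my);
 *	// On error paths after this point: vfio_put_device(&my->vdev);
 */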

static int vfio_fs_init_fs_context(struct fs_context *fc)
{
	return init_pseudo(fc, VFIO_MAGIC) ? 0 : -ENOMEM;
}

static struct file_system_type vfio_fs_type = {
	.name = "vfio",
	.owner = THIS_MODULE,
	.init_fs_context = vfio_fs_init_fs_context,
	.kill_sb = kill_anon_super,
};

static struct inode *vfio_fs_inode_new(void)
{
	struct inode *inode;
	int ret;

	ret = simple_pin_fs(&vfio_fs_type, &vfio.vfs_mount, &vfio.fs_count);
	if (ret)
		return ERR_PTR(ret);

	inode = alloc_anon_inode(vfio.vfs_mount->mnt_sb);
	if (IS_ERR(inode))
		simple_release_fs(&vfio.vfs_mount, &vfio.fs_count);

	return inode;
}

/*
 * Initialize a vfio_device so it can be registered to vfio core.
 */
static int vfio_init_device(struct vfio_device *device, struct device *dev,
			    const struct vfio_device_ops *ops)
{
	int ret;

	ret = ida_alloc_max(&vfio.device_ida, MINORMASK, GFP_KERNEL);
	if (ret < 0) {
		dev_dbg(dev, "Failed to allocate device index\n");
		return ret;
	}

	device->index = ret;
	init_completion(&device->comp);
	device->dev = dev;
	device->ops = ops;
	device->inode = vfio_fs_inode_new();
	if (IS_ERR(device->inode)) {
		ret = PTR_ERR(device->inode);
		goto out_inode;
	}

	if (ops->init) {
		ret = ops->init(device);
		if (ret)
			goto out_uninit;
	}

	device_initialize(&device->device);
	device->device.release = vfio_device_release;
	device->device.class = vfio.device_class;
	device->device.parent = device->dev;
	return 0;

out_uninit:
	iput(device->inode);
	simple_release_fs(&vfio.vfs_mount, &vfio.fs_count);
out_inode:
	vfio_release_device_set(device);
	ida_free(&vfio.device_ida, device->index);
	return ret;
}

static int __vfio_register_dev(struct vfio_device *device,
			       enum vfio_group_type type)
{
	int ret;

	if (WARN_ON(IS_ENABLED(CONFIG_IOMMUFD) &&
		    (!device->ops->bind_iommufd ||
		     !device->ops->unbind_iommufd ||
		     !device->ops->attach_ioas ||
		     !device->ops->detach_ioas)))
		return -EINVAL;

	/*
	 * If the driver doesn't specify a set then the device is added to a
	 * singleton set just for itself.
	 */
	if (!device->dev_set)
		vfio_assign_device_set(device, device);

	ret = dev_set_name(&device->device, "vfio%d", device->index);
	if (ret)
		return ret;

	ret = vfio_device_set_group(device, type);
	if (ret)
		return ret;

	/*
	 * VFIO always sets IOMMU_CACHE because we offer no way for userspace to
	 * restore cache coherency. It has to be checked here because it is only
	 * valid for cases where we are using iommu groups.
	 */
	if (type == VFIO_IOMMU && !vfio_device_is_noiommu(device) &&
	    !device_iommu_capable(device->dev, IOMMU_CAP_CACHE_COHERENCY)) {
		ret = -EINVAL;
		goto err_out;
	}

	ret = vfio_device_add(device);
	if (ret)
		goto err_out;

	/* Refcounting can't start until the driver calls register */
	refcount_set(&device->refcount, 1);

	vfio_device_group_register(device);
	vfio_device_debugfs_init(device);

	return 0;
err_out:
	vfio_device_remove_group(device);
	return ret;
}

int vfio_register_group_dev(struct vfio_device *device)
{
	return __vfio_register_dev(device, VFIO_IOMMU);
}
EXPORT_SYMBOL_GPL(vfio_register_group_dev);

/*
 * Register a virtual device without IOMMU backing. The user of this
 * device must not be able to directly trigger unmediated DMA.
 */
int vfio_register_emulated_iommu_dev(struct vfio_device *device)
{
	return __vfio_register_dev(device, VFIO_EMULATED_IOMMU);
}
EXPORT_SYMBOL_GPL(vfio_register_emulated_iommu_dev);
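
/*
 * Registration lifecycle, a minimal sketch (hypothetical driver): probe
 * allocates and registers, remove unregisters and drops the allocation
 * reference; vfio_unregister_group_dev() below blocks until all users of
 * the device have gone away:
 *
 *	static int my_probe(struct device *dev)
 *	{
 *		struct my_vfio_device *my;
 *		int ret;
 *
 *		my = vfio_alloc_device(my_vfio_device, vdev, dev, &my_vfio_ops);
 *		if (IS_ERR(my))
 *			return PTR_ERR(my);
 *		dev_set_drvdata(dev, my);
 *		ret = vfio_register_group_dev(&my->vdev);
 *		if (ret)
 *			vfio_put_device(&my->vdev);
 *		return ret;
 *	}
 *
 *	static void my_remove(struct device *dev)
 *	{
 *		struct my_vfio_device *my = dev_get_drvdata(dev);
 *
 *		vfio_unregister_group_dev(&my->vdev);
 *		vfio_put_device(&my->vdev);
 *	}
 */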

/*
 * Decrement the device reference count and wait for the device to be
 * removed. Open file descriptors for the device... */
void vfio_unregister_group_dev(struct vfio_device *device)
{
	unsigned int i = 0;
	bool interrupted = false;
	long rc;

	/*
	 * Prevent new device opened by userspace via the
	 * VFIO_GROUP_GET_DEVICE_FD in the group path.
	 */
	vfio_device_group_unregister(device);

	/*
	 * Balances vfio_device_add() in register path, also prevents
	 * new device opened by userspace in the cdev path.
	 */
	vfio_device_del(device);

	vfio_device_put_registration(device);
	rc = try_wait_for_completion(&device->comp);
	while (rc <= 0) {
		if (device->ops->request)
			device->ops->request(device, i++);

		if (interrupted) {
			rc = wait_for_completion_timeout(&device->comp,
							 HZ * 10);
		} else {
			rc = wait_for_completion_interruptible_timeout(
				&device->comp, HZ * 10);
			if (rc < 0) {
				interrupted = true;
				dev_warn(device->dev,
					 "Device is currently in use, task"
					 " \"%s\" (%d) "
					 "blocked until device is released",
					 current->comm, task_pid_nr(current));
			}
		}
	}

	vfio_device_debugfs_exit(device);
	/* Balances vfio_device_set_group in register path */
	vfio_device_remove_group(device);
}
EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);

#if IS_ENABLED(CONFIG_KVM)
void vfio_device_get_kvm_safe(struct vfio_device *device, struct kvm *kvm)
{
	void (*pfn)(struct kvm *kvm);
	bool (*fn)(struct kvm *kvm);
	bool ret;

	lockdep_assert_held(&device->dev_set->lock);

	if (!kvm)
		return;

	pfn = symbol_get(kvm_put_kvm);
	if (WARN_ON(!pfn))
		return;

	fn = symbol_get(kvm_get_kvm_safe);
	if (WARN_ON(!fn)) {
		symbol_put(kvm_put_kvm);
		return;
	}

	ret = fn(kvm);
	symbol_put(kvm_get_kvm_safe);
	if (!ret) {
		symbol_put(kvm_put_kvm);
		return;
	}

	device->put_kvm = pfn;
	device->kvm = kvm;
}

void vfio_device_put_kvm(struct vfio_device *device)
{
	lockdep_assert_held(&device->dev_set->lock);

	if (!device->kvm)
		return;

	if (WARN_ON(!device->put_kvm))
		goto clear;

	device->put_kvm(device->kvm);
	device->put_kvm = NULL;
	symbol_put(kvm_put_kvm);

clear:
	device->kvm = NULL;
}
#endif

/* true if the vfio_device has open_device() called but not close_device() */
static bool vfio_assert_device_open(struct vfio_device *device)
{
	return !WARN_ON_ONCE(!READ_ONCE(device->open_count));
}

struct vfio_device_file *
vfio_allocate_device_file(struct vfio_device *device)
{
	struct vfio_device_file *df;

	df = kzalloc(sizeof(*df), GFP_KERNEL_ACCOUNT);
	if (!df)
		return ERR_PTR(-ENOMEM);

	df->device = device;
	spin_lock_init(&df->kvm_ref_lock);

	return df;
}

static int vfio_df_device_first_open(struct vfio_device_file *df)
{
	struct vfio_device *device = df->device;
	struct iommufd_ctx *iommufd = df->iommufd;
	int ret;

	lockdep_assert_held(&device->dev_set->lock);

	if (!try_module_get(device->dev->driver->owner))
		return -ENODEV;

	if (iommufd)
		ret = vfio_df_iommufd_bind(df);
	else
		ret = vfio_device_group_use_iommu(device);
	if (ret)
		goto err_module_put;

	if (device->ops->open_device) {
		ret = device->ops->open_device(device);
		if (ret)
			goto err_unuse_iommu;
	}
	return 0;

err_unuse_iommu:
	if (iommufd)
		vfio_df_iommufd_unbind(df);
	else
		vfio_device_group_unuse_iommu(device);
err_module_put:
	module_put(device->dev->driver->owner);
	return ret;
}

static void vfio_df_device_last_close(struct vfio_device_file *df)
{
	struct vfio_device *device = df->device;
	struct iommufd_ctx *iommufd = df->iommufd;

	lockdep_assert_held(&device->dev_set->lock);

	if (device->ops->close_device)
		device->ops->close_device(device);
	if (iommufd)
		vfio_df_iommufd_unbind(df);
	else
		vfio_device_group_unuse_iommu(device);
	module_put(device->dev->driver->owner);
}

int vfio_df_open(struct vfio_device_file *df)
{
	struct vfio_device *device = df->device;
	int ret = 0;

	lockdep_assert_held(&device->dev_set->lock);

	/*
	 * Only the group path allows the device to be opened multiple
	 * times. The device cdev path doesn't have a secure way for it.
	 */
	if (device->open_count != 0 && !df->group)
		return -EINVAL;

	device->open_count++;
	if (device->open_count == 1) {
		ret = vfio_df_device_first_open(df);
		if (ret)
			device->open_count--;
	}

	return ret;
}

void vfio_df_close(struct vfio_device_file *df)
{
	struct vfio_device *device = df->device;

	lockdep_assert_held(&device->dev_set->lock);

	if (!vfio_assert_device_open(device))
		return;
	if (device->open_count == 1)
		vfio_df_device_last_close(df);
	device->open_count--;
}

/*
 * Wrapper around pm_runtime_resume_and_get().
 * Return error code on failure or 0 on success.
 */
static inline int vfio_device_pm_runtime_get(struct vfio_device *device)
{
	struct device *dev = device->dev;

	if (dev->driver && dev->driver->pm) {
		int ret;

		ret = pm_runtime_resume_and_get(dev);
		if (ret) {
			dev_info_ratelimited(dev,
				"vfio: runtime resume failed %d\n", ret);
			return -EIO;
		}
	}

	return 0;
}

/*
 * Wrapper around pm_runtime_put().
 */
static inline void vfio_device_pm_runtime_put(struct vfio_device *device)
{
	struct device *dev = device->dev;

	if (dev->driver && dev->driver->pm)
		pm_runtime_put(dev);
}

/*
 * VFIO Device fd
 */
static int vfio_device_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_device_file *df = filep->private_data;
	struct vfio_device *device = df->device;

	if (df->group)
		vfio_df_group_close(df);
	else
		vfio_df_unbind_iommufd(df);

	vfio_device_put_registration(device);

	kfree(df);

	return 0;
}
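
/*
 * FSM usage, a minimal sketch (hypothetical driver): inside a driver's
 * migration_set_state() op, vfio_mig_get_next_state() below is looped on,
 * applying one supported arc at a time until the requested state is
 * reached; my_apply_one_step() and vdev->core are illustrative names:
 *
 *	struct file *res = NULL;
 *
 *	while (vdev->mig_state != new_state) {
 *		enum vfio_device_mig_state next;
 *		int ret;
 *
 *		ret = vfio_mig_get_next_state(&vdev->core, vdev->mig_state,
 *					      new_state, &next);
 *		if (ret)
 *			return ERR_PTR(ret);
 *		res = my_apply_one_step(vdev, next); // NULL or a data fd
 *		if (IS_ERR(res))
 *			return res;
 *		vdev->mig_state = next;
 *	}
 *	return res;
 */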

/*
 * vfio_mig_get_next_state - Compute the next step in the FSM
 * @cur_fsm - The current state the device is in
 * @new_fsm - The target state to reach
 * @next_fsm - Pointer to the next step to get to new_fsm
 *
 * Return 0 upon success, otherwise -errno
 * Upon success the next step in the state progression between cur_fsm and
 * new_fsm will be set in next_fsm.
 *
 * This breaks down requests for combination transitions into smaller steps and
 * returns the next step to get to new_fsm. The function may need to be called
 * multiple times before reaching new_fsm.
 */
int vfio_mig_get_next_state(struct vfio_device *device,
			    enum vfio_device_mig_state cur_fsm,
			    enum vfio_device_mig_state new_fsm,
			    enum vfio_device_mig_state *next_fsm)
{
	enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_PRE_COPY_P2P + 1 };
	/*
	 * The coding in this table requires the driver to implement the
	 * following FSM arcs:
	 *         RESUMING -> STOP
	 *         STOP -> RESUMING
	 *         STOP -> STOP_COPY
	 *         STOP_COPY -> STOP
	 *
	 * If P2P is supported then the driver must also implement these FSM
	 * arcs:
	 *         RUNNING -> RUNNING_P2P
	 *         RUNNING_P2P -> RUNNING
	 *         RUNNING_P2P -> STOP
	 *         STOP -> RUNNING_P2P
	 *
	 * If precopy is supported then the driver must support these additional
	 * FSM arcs:
	 *         RUNNING -> PRE_COPY
	 *         PRE_COPY -> RUNNING
	 *         PRE_COPY -> STOP_COPY
	 * However, if precopy and P2P are supported together then the driver
	 * must support these additional arcs beyond the P2P arcs above:
	 *         PRE_COPY -> RUNNING
	 *         PRE_COPY -> PRE_COPY_P2P
	 *         PRE_COPY_P2P -> PRE_COPY
	 *         PRE_COPY_P2P -> RUNNING_P2P
	 *         PRE_COPY_P2P -> STOP_COPY
	 *         RUNNING -> PRE_COPY
	 *         RUNNING_P2P -> PRE_COPY_P2P
	 *
	 * Without P2P and precopy the driver must implement:
	 *         RUNNING -> STOP
	 *         STOP -> RUNNING
	 *
	 * The coding will step through multiple states for some combination
	 * transitions; if all optional features are supported, this means the
	 * following ones:
	 *         PRE_COPY -> PRE_COPY_P2P -> STOP_COPY
	 *         PRE_COPY -> RUNNING -> RUNNING_P2P
	 *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP
	 *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP -> RESUMING
	 *         PRE_COPY_P2P -> RUNNING_P2P -> RUNNING
	 *         PRE_COPY_P2P -> RUNNING_P2P -> STOP
	 *         PRE_COPY_P2P -> RUNNING_P2P -> STOP -> RESUMING
	 *         RESUMING -> STOP -> RUNNING_P2P
	 *         RESUMING -> STOP -> RUNNING_P2P -> PRE_COPY_P2P
	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING
	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
	 *         RESUMING -> STOP -> STOP_COPY
	 *         RUNNING -> RUNNING_P2P -> PRE_COPY_P2P
	 *         RUNNING -> RUNNING_P2P -> STOP
	 *         RUNNING -> RUNNING_P2P -> STOP -> RESUMING
	 *         RUNNING -> RUNNING_P2P -> STOP -> STOP_COPY
	 *         RUNNING_P2P -> RUNNING -> PRE_COPY
	 *         RUNNING_P2P -> STOP -> RESUMING
	 *         RUNNING_P2P -> STOP -> STOP_COPY
	 *         STOP -> RUNNING_P2P -> PRE_COPY_P2P
	 *         STOP -> RUNNING_P2P -> RUNNING
	 *         STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
	 *         STOP_COPY -> STOP -> RESUMING
	 *         STOP_COPY -> STOP -> RUNNING_P2P
	 *         STOP_COPY -> STOP -> RUNNING_P2P -> RUNNING
	 *
	 * The following transitions are blocked:
	 *         STOP_COPY -> PRE_COPY
	 *         STOP_COPY -> PRE_COPY_P2P
	 */
	static const u8 vfio_from_fsm_table[VFIO_DEVICE_NUM_STATES][VFIO_DEVICE_NUM_STATES] = {
		[VFIO_DEVICE_STATE_STOP] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_RUNNING] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_PRE_COPY] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_PRE_COPY_P2P] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_STOP_COPY] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_RESUMING] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_RUNNING_P2P] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_ERROR] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
	};

	static const unsigned int state_flags_table[VFIO_DEVICE_NUM_STATES] = {
		[VFIO_DEVICE_STATE_STOP] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_RUNNING] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_PRE_COPY] =
			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY,
		[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_MIGRATION_STOP_COPY |
						   VFIO_MIGRATION_P2P |
						   VFIO_MIGRATION_PRE_COPY,
		[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_RESUMING] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_RUNNING_P2P] =
			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P,
		[VFIO_DEVICE_STATE_ERROR] = ~0U,
	};

	if (WARN_ON(cur_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
		    (state_flags_table[cur_fsm] & device->migration_flags) !=
			state_flags_table[cur_fsm]))
		return -EINVAL;

	if (new_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
	    (state_flags_table[new_fsm] & device->migration_flags) !=
			state_flags_table[new_fsm])
		return -EINVAL;

	/*
	 * Arcs touching optional and unsupported states are skipped over. The
	 * driver will instead see an arc from the original state to the next
	 * logical state, as per the above comment.
	 */
	*next_fsm = vfio_from_fsm_table[cur_fsm][new_fsm];
	while ((state_flags_table[*next_fsm] & device->migration_flags) !=
			state_flags_table[*next_fsm])
		*next_fsm = vfio_from_fsm_table[*next_fsm][new_fsm];

	return (*next_fsm != VFIO_DEVICE_STATE_ERROR) ? 0 : -EINVAL;
}
EXPORT_SYMBOL_GPL(vfio_mig_get_next_state);

/*
 * Convert the driver's struct file into a FD number and return it to userspace
 */
static int vfio_ioct_mig_return_fd(struct file *filp, void __user *arg,
				   struct vfio_device_feature_mig_state *mig)
{
	int ret;
	int fd;

	fd = get_unused_fd_flags(O_CLOEXEC);
	if (fd < 0) {
		ret = fd;
		goto out_fput;
	}

	mig->data_fd = fd;
	if (copy_to_user(arg, mig, sizeof(*mig))) {
		ret = -EFAULT;
		goto out_put_unused;
	}
	fd_install(fd, filp);
	return 0;

out_put_unused:
	put_unused_fd(fd);
out_fput:
	fput(filp);
	return ret;
}

static int
vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device,
					   u32 flags, void __user *arg,
					   size_t argsz)
{
	size_t minsz =
		offsetofend(struct vfio_device_feature_mig_state, data_fd);
	struct vfio_device_feature_mig_state mig;
	struct file *filp = NULL;
	int ret;

	if (!device->mig_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_SET |
				 VFIO_DEVICE_FEATURE_GET,
				 sizeof(mig));
	if (ret != 1)
		return ret;

	if (copy_from_user(&mig, arg, minsz))
		return -EFAULT;

	if (flags & VFIO_DEVICE_FEATURE_GET) {
		enum vfio_device_mig_state curr_state;

		ret = device->mig_ops->migration_get_state(device,
							   &curr_state);
		if (ret)
			return ret;
		mig.device_state = curr_state;
		goto out_copy;
	}

	/* Handle the VFIO_DEVICE_FEATURE_SET */
	filp = device->mig_ops->migration_set_state(device, mig.device_state);
	if (IS_ERR(filp) || !filp)
		goto out_copy;

	return vfio_ioct_mig_return_fd(filp, arg, &mig);
out_copy:
	mig.data_fd = -1;
	if (copy_to_user(arg, &mig, sizeof(mig)))
		return -EFAULT;
	if (IS_ERR(filp))
		return PTR_ERR(filp);
	return 0;
}
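
/*
 * Userspace view, a minimal sketch: the migration state is driven through
 * VFIO_DEVICE_FEATURE with struct vfio_device_feature_mig_state appended
 * to struct vfio_device_feature (error handling omitted):
 *
 *	struct {
 *		struct vfio_device_feature feat;
 *		struct vfio_device_feature_mig_state mig;
 *	} __attribute__((packed)) set = {
 *		.feat.argsz = sizeof(set),
 *		.feat.flags = VFIO_DEVICE_FEATURE_SET |
 *			      VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE,
 *		.mig.device_state = VFIO_DEVICE_STATE_STOP_COPY,
 *	};
 *
 *	ioctl(device_fd, VFIO_DEVICE_FEATURE, &set);
 *	// On success set.mig.data_fd carries the migration data stream.
 */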

static int
vfio_ioctl_device_feature_migration_data_size(struct vfio_device *device,
					      u32 flags, void __user *arg,
					      size_t argsz)
{
	struct vfio_device_feature_mig_data_size data_size = {};
	unsigned long stop_copy_length;
	int ret;

	if (!device->mig_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
				 sizeof(data_size));
	if (ret != 1)
		return ret;

	ret = device->mig_ops->migration_get_data_size(device, &stop_copy_length);
	if (ret)
		return ret;

	data_size.stop_copy_length = stop_copy_length;
	if (copy_to_user(arg, &data_size, sizeof(data_size)))
		return -EFAULT;

	return 0;
}

static int vfio_ioctl_device_feature_migration(struct vfio_device *device,
					       u32 flags, void __user *arg,
					       size_t argsz)
{
	struct vfio_device_feature_migration mig = {
		.flags = device->migration_flags,
	};
	int ret;

	if (!device->mig_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
				 sizeof(mig));
	if (ret != 1)
		return ret;
	if (copy_to_user(arg, &mig, sizeof(mig)))
		return -EFAULT;
	return 0;
}

void vfio_combine_iova_ranges(struct rb_root_cached *root, u32 cur_nodes,
			      u32 req_nodes)
{
	struct interval_tree_node *prev, *curr, *comb_start, *comb_end;
	unsigned long min_gap, curr_gap;

	/* Special shortcut when a single range is required */
	if (req_nodes == 1) {
		unsigned long last;

		comb_start = interval_tree_iter_first(root, 0, ULONG_MAX);

		/* Empty list */
		if (WARN_ON_ONCE(!comb_start))
			return;

		curr = comb_start;
		while (curr) {
			last = curr->last;
			prev = curr;
			curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
			if (prev != comb_start)
				interval_tree_remove(prev, root);
		}
		comb_start->last = last;
		return;
	}

	/* Combine ranges which have the smallest gap */
	while (cur_nodes > req_nodes) {
		prev = NULL;
		min_gap = ULONG_MAX;
		curr = interval_tree_iter_first(root, 0, ULONG_MAX);
		while (curr) {
			if (prev) {
				curr_gap = curr->start - prev->last;
				if (curr_gap < min_gap) {
					min_gap = curr_gap;
					comb_start = prev;
					comb_end = curr;
				}
			}
			prev = curr;
			curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
		}

		/* Empty list or no nodes to combine */
		if (WARN_ON_ONCE(min_gap == ULONG_MAX))
			break;

		comb_start->last = comb_end->last;
		interval_tree_remove(comb_end, root);
		cur_nodes--;
	}
}
EXPORT_SYMBOL_GPL(vfio_combine_iova_ranges);

/* Ranges should fit into a single kernel page */
#define LOG_MAX_RANGES \
	(PAGE_SIZE / sizeof(struct vfio_device_feature_dma_logging_range))
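
/*
 * Range-combining sketch (hypothetical driver): a DMA-logging driver whose
 * hardware tracks at most a fixed number of ranges can clamp the
 * user-requested interval tree with vfio_combine_iova_ranges() above in
 * its log_start() op before programming the device; MY_HW_MAX_RANGES and
 * my_program_tracker() are illustrative names:
 *
 *	static int my_log_start(struct vfio_device *vdev,
 *				struct rb_root_cached *ranges, u32 nnodes,
 *				u64 *page_size)
 *	{
 *		if (nnodes > MY_HW_MAX_RANGES) {
 *			vfio_combine_iova_ranges(ranges, nnodes,
 *						 MY_HW_MAX_RANGES);
 *			nnodes = MY_HW_MAX_RANGES;
 *		}
 *		return my_program_tracker(vdev, ranges, nnodes, page_size);
 *	}
 */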

static int
vfio_ioctl_device_feature_logging_start(struct vfio_device *device,
					u32 flags, void __user *arg,
					size_t argsz)
{
	size_t minsz =
		offsetofend(struct vfio_device_feature_dma_logging_control,
			    ranges);
	struct vfio_device_feature_dma_logging_range __user *ranges;
	struct vfio_device_feature_dma_logging_control control;
	struct vfio_device_feature_dma_logging_range range;
	struct rb_root_cached root = RB_ROOT_CACHED;
	struct interval_tree_node *nodes;
	u64 iova_end;
	u32 nnodes;
	int i, ret;

	if (!device->log_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_SET,
				 sizeof(control));
	if (ret != 1)
		return ret;

	if (copy_from_user(&control, arg, minsz))
		return -EFAULT;

	nnodes = control.num_ranges;
	if (!nnodes)
		return -EINVAL;

	if (nnodes > LOG_MAX_RANGES)
		return -E2BIG;

	ranges = u64_to_user_ptr(control.ranges);
	nodes = kmalloc_array(nnodes, sizeof(struct interval_tree_node),
			      GFP_KERNEL);
	if (!nodes)
		return -ENOMEM;

	for (i = 0; i < nnodes; i++) {
		if (copy_from_user(&range, &ranges[i], sizeof(range))) {
			ret = -EFAULT;
			goto end;
		}
		if (!IS_ALIGNED(range.iova, control.page_size) ||
		    !IS_ALIGNED(range.length, control.page_size)) {
			ret = -EINVAL;
			goto end;
		}

		if (check_add_overflow(range.iova, range.length, &iova_end) ||
		    iova_end > ULONG_MAX) {
			ret = -EOVERFLOW;
			goto end;
		}

		nodes[i].start = range.iova;
		nodes[i].last = range.iova + range.length - 1;
		if (interval_tree_iter_first(&root, nodes[i].start,
					     nodes[i].last)) {
			/* Range overlapping */
			ret = -EINVAL;
			goto end;
		}
		interval_tree_insert(nodes + i, &root);
	}

	ret = device->log_ops->log_start(device, &root, nnodes,
					 &control.page_size);
	if (ret)
		goto end;

	if (copy_to_user(arg, &control, sizeof(control))) {
		ret = -EFAULT;
		device->log_ops->log_stop(device);
	}

end:
	kfree(nodes);
	return ret;
}

static int
vfio_ioctl_device_feature_logging_stop(struct vfio_device *device,
				       u32 flags, void __user *arg,
				       size_t argsz)
{
	int ret;

	if (!device->log_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_SET, 0);
	if (ret != 1)
		return ret;

	return device->log_ops->log_stop(device);
}

static int vfio_device_log_read_and_clear(struct iova_bitmap *iter,
					  unsigned long iova, size_t length,
					  void *opaque)
{
	struct vfio_device *device = opaque;

	return device->log_ops->log_read_and_clear(device, iova, length, iter);
}

static int
vfio_ioctl_device_feature_logging_report(struct vfio_device *device,
					 u32 flags, void __user *arg,
					 size_t argsz)
{
	size_t minsz =
		offsetofend(struct vfio_device_feature_dma_logging_report,
			    bitmap);
	struct vfio_device_feature_dma_logging_report report;
	struct iova_bitmap *iter;
	u64 iova_end;
	int ret;

	if (!device->log_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_GET,
				 sizeof(report));
	if (ret != 1)
		return ret;

	if (copy_from_user(&report, arg, minsz))
		return -EFAULT;

	if (report.page_size < SZ_4K || !is_power_of_2(report.page_size))
		return -EINVAL;

	if (check_add_overflow(report.iova, report.length, &iova_end) ||
	    iova_end > ULONG_MAX)
		return -EOVERFLOW;

	iter = iova_bitmap_alloc(report.iova, report.length,
				 report.page_size,
				 u64_to_user_ptr(report.bitmap));
	if (IS_ERR(iter))
		return PTR_ERR(iter);

	ret = iova_bitmap_for_each(iter, device,
				   vfio_device_log_read_and_clear);

	iova_bitmap_free(iter);
	return ret;
}
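
/*
 * Userspace view of dirty tracking, a minimal sketch: after starting
 * logging with DMA_LOGGING_START, dirty bits are polled with
 * DMA_LOGGING_REPORT into a user-provided bitmap where each bit covers
 * one page_size unit of IOVA; SIZE and bitmap are caller-supplied:
 *
 *	struct {
 *		struct vfio_device_feature feat;
 *		struct vfio_device_feature_dma_logging_report rep;
 *	} __attribute__((packed)) get = {
 *		.feat.argsz = sizeof(get),
 *		.feat.flags = VFIO_DEVICE_FEATURE_GET |
 *			      VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT,
 *		.rep = { .iova = 0, .length = SIZE, .page_size = 4096,
 *			 .bitmap = (__u64)(uintptr_t)bitmap },
 *	};
 *
 *	ioctl(device_fd, VFIO_DEVICE_FEATURE, &get);
 */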

static int vfio_ioctl_device_feature(struct vfio_device *device,
				     struct vfio_device_feature __user *arg)
{
	size_t minsz = offsetofend(struct vfio_device_feature, flags);
	struct vfio_device_feature feature;

	if (copy_from_user(&feature, arg, minsz))
		return -EFAULT;

	if (feature.argsz < minsz)
		return -EINVAL;

	/* Check unknown flags */
	if (feature.flags &
	    ~(VFIO_DEVICE_FEATURE_MASK | VFIO_DEVICE_FEATURE_SET |
	      VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_PROBE))
		return -EINVAL;

	/* GET & SET are mutually exclusive except with PROBE */
	if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) &&
	    (feature.flags & VFIO_DEVICE_FEATURE_SET) &&
	    (feature.flags & VFIO_DEVICE_FEATURE_GET))
		return -EINVAL;

	switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) {
	case VFIO_DEVICE_FEATURE_MIGRATION:
		return vfio_ioctl_device_feature_migration(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE:
		return vfio_ioctl_device_feature_mig_device_state(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_DMA_LOGGING_START:
		return vfio_ioctl_device_feature_logging_start(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP:
		return vfio_ioctl_device_feature_logging_stop(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT:
		return vfio_ioctl_device_feature_logging_report(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_MIG_DATA_SIZE:
		return vfio_ioctl_device_feature_migration_data_size(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	default:
		if (unlikely(!device->ops->device_feature))
			return -ENOTTY;
		return device->ops->device_feature(device, feature.flags,
						   arg->data,
						   feature.argsz - minsz);
	}
}
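
/*
 * Feature probing, a minimal sketch of the userspace side: PROBE may be
 * combined with GET and/or SET to ask whether the feature and access type
 * are supported without touching device state; success returns 0:
 *
 *	struct vfio_device_feature probe = {
 *		.argsz = sizeof(probe),
 *		.flags = VFIO_DEVICE_FEATURE_PROBE | VFIO_DEVICE_FEATURE_SET |
 *			 VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE,
 *	};
 *
 *	if (!ioctl(device_fd, VFIO_DEVICE_FEATURE, &probe))
 *		// setting the migration state is supported
 */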

static long vfio_get_region_info(struct vfio_device *device,
				 struct vfio_region_info __user *arg)
{
	unsigned long minsz = offsetofend(struct vfio_region_info, offset);
	struct vfio_region_info info = {};
	struct vfio_info_cap caps = {};
	int ret;

	if (unlikely(!device->ops->get_region_info_caps))
		return -EINVAL;

	if (copy_from_user(&info, arg, minsz))
		return -EFAULT;
	if (info.argsz < minsz)
		return -EINVAL;

	ret = device->ops->get_region_info_caps(device, &info, &caps);
	if (ret)
		goto out_free;

	if (caps.size) {
		info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
		if (info.argsz < sizeof(info) + caps.size) {
			info.argsz = sizeof(info) + caps.size;
			info.cap_offset = 0;
		} else {
			vfio_info_cap_shift(&caps, sizeof(info));
			if (copy_to_user(arg + 1, caps.buf, caps.size)) {
				ret = -EFAULT;
				goto out_free;
			}
			info.cap_offset = sizeof(info);
		}
	}

	if (copy_to_user(arg, &info, minsz)) {
		ret = -EFAULT;
		goto out_free;
	}

out_free:
	kfree(caps.buf);
	return ret;
}

static long vfio_device_fops_unl_ioctl(struct file *filep,
				       unsigned int cmd, unsigned long arg)
{
	struct vfio_device_file *df = filep->private_data;
	struct vfio_device *device = df->device;
	void __user *uptr = (void __user *)arg;
	int ret;

	if (cmd == VFIO_DEVICE_BIND_IOMMUFD)
		return vfio_df_ioctl_bind_iommufd(df, uptr);

	/* Paired with smp_store_release() following vfio_df_open() */
	if (!smp_load_acquire(&df->access_granted))
		return -EINVAL;

	ret = vfio_device_pm_runtime_get(device);
	if (ret)
		return ret;

	/* cdev only ioctls */
	if (IS_ENABLED(CONFIG_VFIO_DEVICE_CDEV) && !df->group) {
		switch (cmd) {
		case VFIO_DEVICE_ATTACH_IOMMUFD_PT:
			ret = vfio_df_ioctl_attach_pt(df, uptr);
			goto out;

		case VFIO_DEVICE_DETACH_IOMMUFD_PT:
			ret = vfio_df_ioctl_detach_pt(df, uptr);
			goto out;
		}
	}

	switch (cmd) {
	case VFIO_DEVICE_FEATURE:
		ret = vfio_ioctl_device_feature(device, uptr);
		break;

	case VFIO_DEVICE_GET_REGION_INFO:
		ret = vfio_get_region_info(device, uptr);
		break;

	default:
		if (unlikely(!device->ops->ioctl))
			ret = -EINVAL;
		else
			ret = device->ops->ioctl(device, cmd, arg);
		break;
	}
out:
	vfio_device_pm_runtime_put(device);
	return ret;
}

static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
				     size_t count, loff_t *ppos)
{
	struct vfio_device_file *df = filep->private_data;
	struct vfio_device *device = df->device;

	/* Paired with smp_store_release() following vfio_df_open() */
	if (!smp_load_acquire(&df->access_granted))
		return -EINVAL;

	if (unlikely(!device->ops->read))
		return -EINVAL;

	return device->ops->read(device, buf, count, ppos);
}

static ssize_t vfio_device_fops_write(struct file *filep,
				      const char __user *buf,
				      size_t count, loff_t *ppos)
{
	struct vfio_device_file *df = filep->private_data;
	struct vfio_device *device = df->device;

	/* Paired with smp_store_release() following vfio_df_open() */
	if (!smp_load_acquire(&df->access_granted))
		return -EINVAL;

	if (unlikely(!device->ops->write))
		return -EINVAL;

	return device->ops->write(device, buf, count, ppos);
}

static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
{
	struct vfio_device_file *df = filep->private_data;
	struct vfio_device *device = df->device;

	/* Paired with smp_store_release() following vfio_df_open() */
	if (!smp_load_acquire(&df->access_granted))
		return -EINVAL;

	if (unlikely(!device->ops->mmap))
		return -EINVAL;

	return device->ops->mmap(device, vma);
}

#ifdef CONFIG_PROC_FS
static void vfio_device_show_fdinfo(struct seq_file *m, struct file *filep)
{
	char *path;
	struct vfio_device_file *df = filep->private_data;
	struct vfio_device *device = df->device;

	path = kobject_get_path(&device->dev->kobj, GFP_KERNEL);
	if (!path)
		return;

	seq_printf(m, "vfio-device-syspath: /sys%s\n", path);
	kfree(path);
}
#endif

const struct file_operations vfio_device_fops = {
	.owner = THIS_MODULE,
	.open = vfio_device_fops_cdev_open,
	.release = vfio_device_fops_release,
	.read = vfio_device_fops_read,
	.write = vfio_device_fops_write,
	.unlocked_ioctl = vfio_device_fops_unl_ioctl,
	.compat_ioctl = compat_ptr_ioctl,
	.mmap = vfio_device_fops_mmap,
#ifdef CONFIG_PROC_FS
	.show_fdinfo = vfio_device_show_fdinfo,
#endif
};

static struct vfio_device *vfio_device_from_file(struct file *file)
{
	struct vfio_device_file *df = file->private_data;

	if (file->f_op != &vfio_device_fops)
		return NULL;
	return df->device;
}
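
/*
 * A minimal sketch of the consumer side: KVM's vfio helper receives a
 * vfio group or cdev fd from userspace and, after validating it with
 * vfio_file_is_valid() below, links its kvm with vfio_file_set_kvm() so
 * that drivers can reach it via device->kvm once the device is opened:
 *
 *	struct file *filp = fget(fd);
 *
 *	if (!filp || !vfio_file_is_valid(filp))
 *		return -EBADF;
 *	vfio_file_set_kvm(filp, kvm);
 */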

/**
 * vfio_file_is_valid - True if the file is a valid vfio file
 * @file: VFIO group file or VFIO device file
 */
bool vfio_file_is_valid(struct file *file)
{
	return vfio_group_from_file(file) ||
	       vfio_device_from_file(file);
}
EXPORT_SYMBOL_GPL(vfio_file_is_valid);

/**
 * vfio_file_enforced_coherent - True if the DMA associated with the VFIO file
 *        is always CPU cache coherent
 * @file: VFIO group file or VFIO device file
 *
 * Enforced coherency means that the IOMMU ignores things like the PCIe no-snoop
 * bit in DMA transactions. A return of false indicates that the user has
 * rights to access additional instructions such as wbinvd on x86.
 */
bool vfio_file_enforced_coherent(struct file *file)
{
	struct vfio_device *device;
	struct vfio_group *group;

	group = vfio_group_from_file(file);
	if (group)
		return vfio_group_enforced_coherent(group);

	device = vfio_device_from_file(file);
	if (device)
		return device_iommu_capable(device->dev,
					    IOMMU_CAP_ENFORCE_CACHE_COHERENCY);

	return true;
}
EXPORT_SYMBOL_GPL(vfio_file_enforced_coherent);

static void vfio_device_file_set_kvm(struct file *file, struct kvm *kvm)
{
	struct vfio_device_file *df = file->private_data;

	/*
	 * The kvm is first recorded in the vfio_device_file, and will
	 * be propagated to vfio_device::kvm when the file is bound to
	 * iommufd successfully in the vfio device cdev path.
	 */
	spin_lock(&df->kvm_ref_lock);
	df->kvm = kvm;
	spin_unlock(&df->kvm_ref_lock);
}

/**
 * vfio_file_set_kvm - Link a kvm with VFIO drivers
 * @file: VFIO group file or VFIO device file
 * @kvm: KVM to link
 *
 * When a VFIO device is first opened the KVM will be available in
 * device->kvm if one was associated with the file.
 */
void vfio_file_set_kvm(struct file *file, struct kvm *kvm)
{
	struct vfio_group *group;

	group = vfio_group_from_file(file);
	if (group)
		vfio_group_set_kvm(group, kvm);

	if (vfio_device_from_file(file))
		vfio_device_file_set_kvm(file, kvm);
}
EXPORT_SYMBOL_GPL(vfio_file_set_kvm);

/*
 * Sub-module support
 */
/*
 * Helper for managing a buffer of info chain capabilities, allocate or
 * reallocate a buffer with additional @size, filling in @id and @version
 * of the capability. A pointer to the new capability is returned.
 *
 * NB. The chain is based at the head of the buffer, so new entries are
 * added to the tail, vfio_info_cap_shift() should be called to fixup the
 * next offsets prior to copying to the user buffer.
 */
struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
					       size_t size, u16 id, u16 version)
{
	void *buf;
	struct vfio_info_cap_header *header, *tmp;

	/* Ensure that the next capability struct will be aligned */
	size = ALIGN(size, sizeof(u64));

	buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
	if (!buf) {
		kfree(caps->buf);
		caps->buf = NULL;
		caps->size = 0;
		return ERR_PTR(-ENOMEM);
	}

	caps->buf = buf;
	header = buf + caps->size;

	/* Eventually copied to user buffer, zero */
	memset(header, 0, size);

	header->id = id;
	header->version = version;

	/* Add to the end of the capability chain */
	for (tmp = buf; tmp->next; tmp = buf + tmp->next)
		; /* nothing */

	tmp->next = caps->size;
	caps->size += size;

	return header;
}
EXPORT_SYMBOL_GPL(vfio_info_cap_add);

void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
{
	struct vfio_info_cap_header *tmp;
	void *buf = (void *)caps->buf;

	/* Capability structs should start with proper alignment */
	WARN_ON(!IS_ALIGNED(offset, sizeof(u64)));

	for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
		tmp->next += offset;
}
EXPORT_SYMBOL(vfio_info_cap_shift);

int vfio_info_add_capability(struct vfio_info_cap *caps,
			     struct vfio_info_cap_header *cap, size_t size)
{
	struct vfio_info_cap_header *header;

	header = vfio_info_cap_add(caps, size, cap->id, cap->version);
	if (IS_ERR(header))
		return PTR_ERR(header);

	memcpy(header + 1, cap + 1, size - sizeof(*header));

	return 0;
}
EXPORT_SYMBOL(vfio_info_add_capability);
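
/*
 * Capability chain usage, a minimal sketch (hypothetical sparse-mmap cap):
 * a driver answering VFIO_DEVICE_GET_REGION_INFO appends capabilities to
 * the caps buffer; the core then shifts the chain past the fixed-size
 * info struct and copies it out, as in vfio_get_region_info() above.
 * The header is the first member of the cap struct, so the returned
 * pointer may be cast:
 *
 *	struct vfio_region_info_cap_sparse_mmap *sparse;
 *	size_t size = struct_size(sparse, areas, nr_areas);
 *
 *	sparse = (void *)vfio_info_cap_add(caps, size,
 *			VFIO_REGION_INFO_CAP_SPARSE_MMAP, 1);
 *	if (IS_ERR(sparse))
 *		return PTR_ERR(sparse);
 *	sparse->nr_areas = nr_areas;
 *	// fill sparse->areas[] with offset/size pairs
 */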

int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
				       int max_irq_type, size_t *data_size)
{
	unsigned long minsz;
	size_t size;

	minsz = offsetofend(struct vfio_irq_set, count);

	if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
	    (hdr->count >= (U32_MAX - hdr->start)) ||
	    (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
			    VFIO_IRQ_SET_ACTION_TYPE_MASK)))
		return -EINVAL;

	if (data_size)
		*data_size = 0;

	if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
		return -EINVAL;

	switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
	case VFIO_IRQ_SET_DATA_NONE:
		size = 0;
		break;
	case VFIO_IRQ_SET_DATA_BOOL:
		size = sizeof(uint8_t);
		break;
	case VFIO_IRQ_SET_DATA_EVENTFD:
		size = sizeof(int32_t);
		break;
	default:
		return -EINVAL;
	}

	if (size) {
		if (hdr->argsz - minsz < hdr->count * size)
			return -EINVAL;

		if (!data_size)
			return -EINVAL;

		*data_size = hdr->count * size;
	}

	return 0;
}
EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
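
/*
 * A minimal sketch (hypothetical driver) of the intended caller: a
 * driver's VFIO_DEVICE_SET_IRQS ioctl handler validates the header and
 * learns how much trailing data to copy from userspace; my_num_irqs and
 * MY_NUM_IRQ_TYPES are illustrative:
 *
 *	struct vfio_irq_set hdr;
 *	size_t data_size = 0;
 *	u8 *data = NULL;
 *	int ret;
 *
 *	if (copy_from_user(&hdr, (void __user *)arg, sizeof(hdr)))
 *		return -EFAULT;
 *	ret = vfio_set_irqs_validate_and_prepare(&hdr, my_num_irqs,
 *						 MY_NUM_IRQ_TYPES, &data_size);
 *	if (ret)
 *		return ret;
 *	if (data_size) {
 *		data = memdup_user((void __user *)(arg + sizeof(hdr)),
 *				   data_size);
 *		if (IS_ERR(data))
 *			return PTR_ERR(data);
 *	}
 */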

/*
 * Pin contiguous user pages and return their associated host pages for local
 * domain only.
 * @device [in]  : device
 * @iova [in]    : starting IOVA of user pages to be pinned.
 * @npage [in]   : count of pages to be pinned. This count should not
 *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
 * @prot [in]    : protection flags
 * @pages [out]  : array of host pages
 * Return error or number of pages pinned.
 *
 * A driver may only call this function if the vfio_device was created
 * by vfio_register_emulated_iommu_dev() due to vfio_device_container_pin_pages().
 */
int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova,
		   int npage, int prot, struct page **pages)
{
	/* group->container cannot change while a vfio device is open */
	if (!pages || !npage || WARN_ON(!vfio_assert_device_open(device)))
		return -EINVAL;
	if (!device->ops->dma_unmap)
		return -EINVAL;
	if (vfio_device_has_container(device))
		return vfio_device_container_pin_pages(device, iova,
						       npage, prot, pages);
	if (device->iommufd_access) {
		int ret;

		if (iova > ULONG_MAX)
			return -EINVAL;
		/*
		 * VFIO ignores the sub page offset, npages is from the start of
		 * a PAGE_SIZE chunk of IOVA. The caller is expected to recover
		 * the sub page offset by doing:
		 * pages[0] + (iova % PAGE_SIZE)
		 */
		ret = iommufd_access_pin_pages(
			device->iommufd_access, ALIGN_DOWN(iova, PAGE_SIZE),
			npage * PAGE_SIZE, pages,
			(prot & IOMMU_WRITE) ? IOMMUFD_ACCESS_RW_WRITE : 0);
		if (ret)
			return ret;
		return npage;
	}
	return -EINVAL;
}
EXPORT_SYMBOL(vfio_pin_pages);

/*
 * Unpin contiguous host pages for local domain only.
 * @device [in]  : device
 * @iova [in]    : starting address of user pages to be unpinned.
 * @npage [in]   : count of pages to be unpinned. This count should not
 *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
 */
void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage)
{
	if (WARN_ON(!vfio_assert_device_open(device)))
		return;
	if (WARN_ON(!device->ops->dma_unmap))
		return;

	if (vfio_device_has_container(device)) {
		vfio_device_container_unpin_pages(device, iova, npage);
		return;
	}
	if (device->iommufd_access) {
		if (WARN_ON(iova > ULONG_MAX))
			return;
		iommufd_access_unpin_pages(device->iommufd_access,
					   ALIGN_DOWN(iova, PAGE_SIZE),
					   npage * PAGE_SIZE);
		return;
	}
}
EXPORT_SYMBOL(vfio_unpin_pages);
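
/*
 * Pin/unpin usage, a minimal sketch (hypothetical mdev-style driver):
 * pin one guest page for mediated access, touch it through its host
 * page, then release it; per the note above, the sub-page offset must be
 * re-applied by the caller:
 *
 *	struct page *pg;
 *	void *va;
 *	int ret;
 *
 *	ret = vfio_pin_pages(vdev, iova, 1, IOMMU_READ | IOMMU_WRITE, &pg);
 *	if (ret != 1)
 *		return ret < 0 ? ret : -EFAULT;
 *	va = kmap_local_page(pg);
 *	memcpy(va + (iova % PAGE_SIZE), buf, len); // len bounded by caller
 *	kunmap_local(va);
 *	vfio_unpin_pages(vdev, iova, 1);
 */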

/*
 * This interface allows the CPUs to perform some sort of virtual DMA on
 * behalf of the device.
 *
 * CPUs read/write from/into a range of IOVAs pointing to user space memory
 * into/from a kernel buffer.
 *
 * As the read/write of user space memory is conducted via the CPUs and is
 * not a real device DMA, it is not necessary to pin the user space memory.
 *
 * @device [in]  : VFIO device
 * @iova [in]    : base IOVA of a user space buffer
 * @data [in]    : pointer to kernel buffer
 * @len [in]     : kernel buffer length
 * @write        : indicate read or write
 * Return error code on failure or 0 on success.
 */
int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data,
		size_t len, bool write)
{
	if (!data || len <= 0 || !vfio_assert_device_open(device))
		return -EINVAL;

	if (vfio_device_has_container(device))
		return vfio_device_container_dma_rw(device, iova,
						    data, len, write);

	if (device->iommufd_access) {
		unsigned int flags = 0;

		if (iova > ULONG_MAX)
			return -EINVAL;

		/* VFIO historically tries to auto-detect a kthread */
		if (!current->mm)
			flags |= IOMMUFD_ACCESS_RW_KTHREAD;
		if (write)
			flags |= IOMMUFD_ACCESS_RW_WRITE;
		return iommufd_access_rw(device->iommufd_access, iova, data,
					 len, flags);
	}
	return -EINVAL;
}
EXPORT_SYMBOL(vfio_dma_rw);

/*
 * Module/class support
 */
static int __init vfio_init(void)
{
	int ret;

	ida_init(&vfio.device_ida);

	ret = vfio_group_init();
	if (ret)
		return ret;

	ret = vfio_virqfd_init();
	if (ret)
		goto err_virqfd;

	/* /sys/class/vfio-dev/vfioX */
	vfio.device_class = class_create("vfio-dev");
	if (IS_ERR(vfio.device_class)) {
		ret = PTR_ERR(vfio.device_class);
		goto err_dev_class;
	}

	ret = vfio_cdev_init(vfio.device_class);
	if (ret)
		goto err_alloc_dev_chrdev;

	vfio_debugfs_create_root();
	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
	return 0;

err_alloc_dev_chrdev:
	class_destroy(vfio.device_class);
	vfio.device_class = NULL;
err_dev_class:
	vfio_virqfd_exit();
err_virqfd:
	vfio_group_cleanup();
	return ret;
}

static void __exit vfio_cleanup(void)
{
	vfio_debugfs_remove_root();
	ida_destroy(&vfio.device_ida);
	vfio_cdev_cleanup();
	class_destroy(vfio.device_class);
	vfio.device_class = NULL;
	vfio_virqfd_exit();
	vfio_group_cleanup();
	xa_destroy(&vfio_device_set_xa);
}

module_init(vfio_init);
module_exit(vfio_cleanup);

MODULE_IMPORT_NS("IOMMUFD");
MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);
MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");