// SPDX-License-Identifier: GPL-2.0-only
/*
 * VFIO core
 *
 * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * Derived from original vfio:
 * Copyright 2010 Cisco Systems, Inc. All rights reserved.
 * Author: Tom Lyon, pugs@cisco.com
 */

#include <linux/cdev.h>
#include <linux/compat.h>
#include <linux/device.h>
#include <linux/fs.h>
#include <linux/idr.h>
#include <linux/iommu.h>
#ifdef CONFIG_HAVE_KVM
#include <linux/kvm_host.h>
#endif
#include <linux/list.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/pci.h>
#include <linux/rwsem.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/wait.h>
#include <linux/sched/signal.h>
#include <linux/pm_runtime.h>
#include <linux/interval_tree.h>
#include <linux/iova_bitmap.h>
#include <linux/iommufd.h>
#include "vfio.h"

#define DRIVER_VERSION	"0.3"
#define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC	"VFIO - User Level meta-driver"

static struct vfio {
	struct class			*device_class;
	struct ida			device_ida;
} vfio;

#ifdef CONFIG_VFIO_NOIOMMU
bool vfio_noiommu __read_mostly;
module_param_named(enable_unsafe_noiommu_mode,
		   vfio_noiommu, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode. This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel. If you do not know what this is for, step away. (default: false)");
#endif
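
/*
 * A hedged illustration of the device-set machinery below (hypothetical
 * driver, not an in-tree user): every vfio_device registered with the same
 * set_id pointer is placed in one vfio_device_set, so dev_set->lock and
 * vfio_device_set_open_count() can serialize a reset that is shared by all
 * of them, e.g. from a driver's ->init() callback:
 *
 *	static int my_init(struct vfio_device *vdev)
 *	{
 *		struct pci_dev *pdev = to_pci_dev(vdev->dev);
 *
 *		return vfio_assign_device_set(vdev, pci_physfn(pdev));
 *	}
 */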

static DEFINE_XARRAY(vfio_device_set_xa);

int vfio_assign_device_set(struct vfio_device *device, void *set_id)
{
	unsigned long idx = (unsigned long)set_id;
	struct vfio_device_set *new_dev_set;
	struct vfio_device_set *dev_set;

	if (WARN_ON(!set_id))
		return -EINVAL;

	/*
	 * Atomically acquire a singleton object in the xarray for this set_id
	 */
	xa_lock(&vfio_device_set_xa);
	dev_set = xa_load(&vfio_device_set_xa, idx);
	if (dev_set)
		goto found_get_ref;
	xa_unlock(&vfio_device_set_xa);

	new_dev_set = kzalloc(sizeof(*new_dev_set), GFP_KERNEL);
	if (!new_dev_set)
		return -ENOMEM;
	mutex_init(&new_dev_set->lock);
	INIT_LIST_HEAD(&new_dev_set->device_list);
	new_dev_set->set_id = set_id;

	xa_lock(&vfio_device_set_xa);
	dev_set = __xa_cmpxchg(&vfio_device_set_xa, idx, NULL, new_dev_set,
			       GFP_KERNEL);
	if (!dev_set) {
		dev_set = new_dev_set;
		goto found_get_ref;
	}

	kfree(new_dev_set);
	if (xa_is_err(dev_set)) {
		xa_unlock(&vfio_device_set_xa);
		return xa_err(dev_set);
	}

found_get_ref:
	dev_set->device_count++;
	xa_unlock(&vfio_device_set_xa);
	mutex_lock(&dev_set->lock);
	device->dev_set = dev_set;
	list_add_tail(&device->dev_set_list, &dev_set->device_list);
	mutex_unlock(&dev_set->lock);
	return 0;
}
EXPORT_SYMBOL_GPL(vfio_assign_device_set);

static void vfio_release_device_set(struct vfio_device *device)
{
	struct vfio_device_set *dev_set = device->dev_set;

	if (!dev_set)
		return;

	mutex_lock(&dev_set->lock);
	list_del(&device->dev_set_list);
	mutex_unlock(&dev_set->lock);

	xa_lock(&vfio_device_set_xa);
	if (!--dev_set->device_count) {
		__xa_erase(&vfio_device_set_xa,
			   (unsigned long)dev_set->set_id);
		mutex_destroy(&dev_set->lock);
		kfree(dev_set);
	}
	xa_unlock(&vfio_device_set_xa);
}

unsigned int vfio_device_set_open_count(struct vfio_device_set *dev_set)
{
	struct vfio_device *cur;
	unsigned int open_count = 0;

	lockdep_assert_held(&dev_set->lock);

	list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
		open_count += cur->open_count;
	return open_count;
}
EXPORT_SYMBOL_GPL(vfio_device_set_open_count);

struct vfio_device *
vfio_find_device_in_devset(struct vfio_device_set *dev_set,
			   struct device *dev)
{
	struct vfio_device *cur;

	lockdep_assert_held(&dev_set->lock);

	list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
		if (cur->dev == dev)
			return cur;
	return NULL;
}
EXPORT_SYMBOL_GPL(vfio_find_device_in_devset);

/*
 * Device objects - create, release, get, put, search
 */
/* Device reference always implies a group reference */
void vfio_device_put_registration(struct vfio_device *device)
{
	if (refcount_dec_and_test(&device->refcount))
		complete(&device->comp);
}

bool vfio_device_try_get_registration(struct vfio_device *device)
{
	return refcount_inc_not_zero(&device->refcount);
}

/*
 * VFIO driver API
 */
/* Release helper called by vfio_put_device() */
static void vfio_device_release(struct device *dev)
{
	struct vfio_device *device =
			container_of(dev, struct vfio_device, device);

	vfio_release_device_set(device);
	ida_free(&vfio.device_ida, device->index);

	if (device->ops->release)
		device->ops->release(device);

	kvfree(device);
}

static int vfio_init_device(struct vfio_device *device, struct device *dev,
			    const struct vfio_device_ops *ops);

/*
 * Allocate and initialize vfio_device so it can be registered to vfio
 * core.
 *
 * Drivers should use the wrapper vfio_alloc_device() for allocation.
 * @size is the size of the structure to be allocated, including any
 * private data used by the driver.
 *
 * Drivers may provide an @init callback to cover device private data.
 *
 * Use vfio_put_device() to release the structure after a successful return.
 */
struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev,
				       const struct vfio_device_ops *ops)
{
	struct vfio_device *device;
	int ret;

	if (WARN_ON(size < sizeof(struct vfio_device)))
		return ERR_PTR(-EINVAL);

	device = kvzalloc(size, GFP_KERNEL);
	if (!device)
		return ERR_PTR(-ENOMEM);

	ret = vfio_init_device(device, dev, ops);
	if (ret)
		goto out_free;
	return device;

out_free:
	kvfree(device);
	return ERR_PTR(ret);
}
EXPORT_SYMBOL_GPL(_vfio_alloc_device);

/*
 * Initialize a vfio_device so it can be registered to vfio core.
 */
static int vfio_init_device(struct vfio_device *device, struct device *dev,
			    const struct vfio_device_ops *ops)
{
	int ret;

	ret = ida_alloc_max(&vfio.device_ida, MINORMASK, GFP_KERNEL);
	if (ret < 0) {
		dev_dbg(dev, "Error to alloc index\n");
		return ret;
	}

	device->index = ret;
	init_completion(&device->comp);
	device->dev = dev;
	device->ops = ops;

	if (ops->init) {
		ret = ops->init(device);
		if (ret)
			goto out_uninit;
	}

	device_initialize(&device->device);
	device->device.release = vfio_device_release;
	device->device.class = vfio.device_class;
	device->device.parent = device->dev;
	return 0;

out_uninit:
	vfio_release_device_set(device);
	ida_free(&vfio.device_ida, device->index);
	return ret;
}
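
/*
 * A hedged sketch of the driver-side pattern the helpers above expect
 * (struct my_device, my_ops and the surrounding probe code are hypothetical):
 *
 *	struct my_device {
 *		struct vfio_device vdev;	(must embed vfio_device)
 *		void *priv;
 *	};
 *
 *	my = vfio_alloc_device(my_device, vdev, &pdev->dev, &my_ops);
 *	if (IS_ERR(my))
 *		return PTR_ERR(my);
 *	err = vfio_register_group_dev(&my->vdev);
 *	if (err)
 *		vfio_put_device(&my->vdev);	(drops the reference from alloc)
 */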

static int __vfio_register_dev(struct vfio_device *device,
			       enum vfio_group_type type)
{
	int ret;

	if (WARN_ON(IS_ENABLED(CONFIG_IOMMUFD) &&
		    (!device->ops->bind_iommufd ||
		     !device->ops->unbind_iommufd ||
		     !device->ops->attach_ioas ||
		     !device->ops->detach_ioas)))
		return -EINVAL;

	/*
	 * If the driver doesn't specify a set then the device is added to a
	 * singleton set just for itself.
	 */
	if (!device->dev_set)
		vfio_assign_device_set(device, device);

	ret = dev_set_name(&device->device, "vfio%d", device->index);
	if (ret)
		return ret;

	ret = vfio_device_set_group(device, type);
	if (ret)
		return ret;

	/*
	 * VFIO always sets IOMMU_CACHE because we offer no way for userspace to
	 * restore cache coherency. It has to be checked here because it is only
	 * valid for cases where we are using iommu groups.
	 */
	if (type == VFIO_IOMMU && !vfio_device_is_noiommu(device) &&
	    !device_iommu_capable(device->dev, IOMMU_CAP_CACHE_COHERENCY)) {
		ret = -EINVAL;
		goto err_out;
	}

	ret = vfio_device_add(device);
	if (ret)
		goto err_out;

	/* Refcounting can't start until the driver calls register */
	refcount_set(&device->refcount, 1);

	vfio_device_group_register(device);
	vfio_device_debugfs_init(device);

	return 0;
err_out:
	vfio_device_remove_group(device);
	return ret;
}

int vfio_register_group_dev(struct vfio_device *device)
{
	return __vfio_register_dev(device, VFIO_IOMMU);
}
EXPORT_SYMBOL_GPL(vfio_register_group_dev);

/*
 * Register a virtual device without IOMMU backing. The user of this
 * device must not be able to directly trigger unmediated DMA.
 */
int vfio_register_emulated_iommu_dev(struct vfio_device *device)
{
	return __vfio_register_dev(device, VFIO_EMULATED_IOMMU);
}
EXPORT_SYMBOL_GPL(vfio_register_emulated_iommu_dev);

/*
 * Decrement the device reference count and wait for the device to be
 * removed. Open file descriptors for the device... */
void vfio_unregister_group_dev(struct vfio_device *device)
{
	unsigned int i = 0;
	bool interrupted = false;
	long rc;

	/*
	 * Prevent new device opened by userspace via the
	 * VFIO_GROUP_GET_DEVICE_FD in the group path.
	 */
	vfio_device_group_unregister(device);

	/*
	 * Balances vfio_device_add() in register path, also prevents
	 * new device opened by userspace in the cdev path.
	 */
	vfio_device_del(device);

	vfio_device_put_registration(device);
	rc = try_wait_for_completion(&device->comp);
	while (rc <= 0) {
		if (device->ops->request)
			device->ops->request(device, i++);

		if (interrupted) {
			rc = wait_for_completion_timeout(&device->comp,
							 HZ * 10);
		} else {
			rc = wait_for_completion_interruptible_timeout(
				&device->comp, HZ * 10);
			if (rc < 0) {
				interrupted = true;
				dev_warn(device->dev,
					 "Device is currently in use, task"
					 " \"%s\" (%d) "
					 "blocked until device is released",
					 current->comm, task_pid_nr(current));
			}
		}
	}

	vfio_device_debugfs_exit(device);
	/* Balances vfio_device_set_group in register path */
	vfio_device_remove_group(device);
}
EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);

#ifdef CONFIG_HAVE_KVM
void vfio_device_get_kvm_safe(struct vfio_device *device, struct kvm *kvm)
{
	void (*pfn)(struct kvm *kvm);
	bool (*fn)(struct kvm *kvm);
	bool ret;

	lockdep_assert_held(&device->dev_set->lock);

	if (!kvm)
		return;

	pfn = symbol_get(kvm_put_kvm);
	if (WARN_ON(!pfn))
		return;

	fn = symbol_get(kvm_get_kvm_safe);
	if (WARN_ON(!fn)) {
		symbol_put(kvm_put_kvm);
		return;
	}

	ret = fn(kvm);
	symbol_put(kvm_get_kvm_safe);
	if (!ret) {
		symbol_put(kvm_put_kvm);
		return;
	}

	device->put_kvm = pfn;
	device->kvm = kvm;
}

void vfio_device_put_kvm(struct vfio_device *device)
{
	lockdep_assert_held(&device->dev_set->lock);

	if (!device->kvm)
		return;

	if (WARN_ON(!device->put_kvm))
		goto clear;

	device->put_kvm(device->kvm);
	device->put_kvm = NULL;
	symbol_put(kvm_put_kvm);

clear:
	device->kvm = NULL;
}
#endif

/* true if the vfio_device has open_device() called but not close_device() */
static bool vfio_assert_device_open(struct vfio_device *device)
{
	return !WARN_ON_ONCE(!READ_ONCE(device->open_count));
}

struct vfio_device_file *
vfio_allocate_device_file(struct vfio_device *device)
{
	struct vfio_device_file *df;

	df = kzalloc(sizeof(*df), GFP_KERNEL_ACCOUNT);
	if (!df)
		return ERR_PTR(-ENOMEM);

	df->device = device;
	spin_lock_init(&df->kvm_ref_lock);

	return df;
}

static int vfio_df_device_first_open(struct vfio_device_file *df)
{
	struct vfio_device *device = df->device;
	struct iommufd_ctx *iommufd = df->iommufd;
	int ret;

	lockdep_assert_held(&device->dev_set->lock);

	if (!try_module_get(device->dev->driver->owner))
		return -ENODEV;

	if (iommufd)
		ret = vfio_df_iommufd_bind(df);
	else
		ret = vfio_device_group_use_iommu(device);
	if (ret)
		goto err_module_put;

	if (device->ops->open_device) {
		ret = device->ops->open_device(device);
		if (ret)
			goto err_unuse_iommu;
	}
	return 0;

err_unuse_iommu:
	if (iommufd)
		vfio_df_iommufd_unbind(df);
	else
		vfio_device_group_unuse_iommu(device);
err_module_put:
	module_put(device->dev->driver->owner);
	return ret;
}

static void vfio_df_device_last_close(struct vfio_device_file *df)
{
	struct vfio_device *device = df->device;
	struct iommufd_ctx *iommufd = df->iommufd;

	lockdep_assert_held(&device->dev_set->lock);

	if (device->ops->close_device)
		device->ops->close_device(device);
	if (iommufd)
		vfio_df_iommufd_unbind(df);
	else
		vfio_device_group_unuse_iommu(device);
	module_put(device->dev->driver->owner);
}

int vfio_df_open(struct vfio_device_file *df)
{
	struct vfio_device *device = df->device;
	int ret = 0;

	lockdep_assert_held(&device->dev_set->lock);

	/*
	 * Only the group path allows the device to be opened multiple
	 * times. The device cdev path doesn't have a secure way for it.
	 */
	if (device->open_count != 0 && !df->group)
		return -EINVAL;

	device->open_count++;
	if (device->open_count == 1) {
		ret = vfio_df_device_first_open(df);
		if (ret)
			device->open_count--;
	}

	return ret;
}

void vfio_df_close(struct vfio_device_file *df)
{
	struct vfio_device *device = df->device;

	lockdep_assert_held(&device->dev_set->lock);

	vfio_assert_device_open(device);
	if (device->open_count == 1)
		vfio_df_device_last_close(df);
	device->open_count--;
}

/*
 * Wrapper around pm_runtime_resume_and_get().
 * Return error code on failure or 0 on success.
 */
static inline int vfio_device_pm_runtime_get(struct vfio_device *device)
{
	struct device *dev = device->dev;

	if (dev->driver && dev->driver->pm) {
		int ret;

		ret = pm_runtime_resume_and_get(dev);
		if (ret) {
			dev_info_ratelimited(dev,
				"vfio: runtime resume failed %d\n", ret);
			return -EIO;
		}
	}

	return 0;
}

/*
 * Wrapper around pm_runtime_put().
 */
static inline void vfio_device_pm_runtime_put(struct vfio_device *device)
{
	struct device *dev = device->dev;

	if (dev->driver && dev->driver->pm)
		pm_runtime_put(dev);
}

/*
 * VFIO Device fd
 */
static int vfio_device_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_device_file *df = filep->private_data;
	struct vfio_device *device = df->device;

	if (df->group)
		vfio_df_group_close(df);
	else
		vfio_df_unbind_iommufd(df);

	vfio_device_put_registration(device);

	kfree(df);

	return 0;
}

/*
 * vfio_mig_get_next_state - Compute the next step in the FSM
 * @cur_fsm - The current state the device is in
 * @new_fsm - The target state to reach
 * @next_fsm - Pointer to the next step to get to new_fsm
 *
 * Return 0 upon success, otherwise -errno
 * Upon success the next step in the state progression between cur_fsm and
 * new_fsm will be set in next_fsm.
 *
 * This breaks down requests for combination transitions into smaller steps and
 * returns the next step to get to new_fsm. The function may need to be called
 * multiple times before reaching new_fsm.
 *
 */
int vfio_mig_get_next_state(struct vfio_device *device,
			    enum vfio_device_mig_state cur_fsm,
			    enum vfio_device_mig_state new_fsm,
			    enum vfio_device_mig_state *next_fsm)
{
	enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_PRE_COPY_P2P + 1 };
	/*
	 * The coding in this table requires the driver to implement the
	 * following FSM arcs:
	 *         RESUMING -> STOP
	 *         STOP -> RESUMING
	 *         STOP -> STOP_COPY
	 *         STOP_COPY -> STOP
	 *
	 * If P2P is supported then the driver must also implement these FSM
	 * arcs:
	 *         RUNNING -> RUNNING_P2P
	 *         RUNNING_P2P -> RUNNING
	 *         RUNNING_P2P -> STOP
	 *         STOP -> RUNNING_P2P
	 *
	 * If precopy is supported then the driver must support these additional
	 * FSM arcs:
	 *         RUNNING -> PRE_COPY
	 *         PRE_COPY -> RUNNING
	 *         PRE_COPY -> STOP_COPY
	 * However, if precopy and P2P are supported together then the driver
	 * must support these additional arcs beyond the P2P arcs above:
	 *         PRE_COPY -> RUNNING
	 *         PRE_COPY -> PRE_COPY_P2P
	 *         PRE_COPY_P2P -> PRE_COPY
	 *         PRE_COPY_P2P -> RUNNING_P2P
	 *         PRE_COPY_P2P -> STOP_COPY
	 *         RUNNING -> PRE_COPY
	 *         RUNNING_P2P -> PRE_COPY_P2P
	 *
	 * Without P2P and precopy the driver must implement:
	 *         RUNNING -> STOP
	 *         STOP -> RUNNING
	 *
	 * The coding will step through multiple states for some combination
	 * transitions; if all optional features are supported, this means the
	 * following ones:
	 *         PRE_COPY -> PRE_COPY_P2P -> STOP_COPY
	 *         PRE_COPY -> RUNNING -> RUNNING_P2P
	 *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP
	 *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP -> RESUMING
	 *         PRE_COPY_P2P -> RUNNING_P2P -> RUNNING
	 *         PRE_COPY_P2P -> RUNNING_P2P -> STOP
	 *         PRE_COPY_P2P -> RUNNING_P2P -> STOP -> RESUMING
	 *         RESUMING -> STOP -> RUNNING_P2P
	 *         RESUMING -> STOP -> RUNNING_P2P -> PRE_COPY_P2P
	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING
	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
	 *         RESUMING -> STOP -> STOP_COPY
	 *         RUNNING -> RUNNING_P2P -> PRE_COPY_P2P
	 *         RUNNING -> RUNNING_P2P -> STOP
	 *         RUNNING -> RUNNING_P2P -> STOP -> RESUMING
	 *         RUNNING -> RUNNING_P2P -> STOP -> STOP_COPY
	 *         RUNNING_P2P -> RUNNING -> PRE_COPY
	 *         RUNNING_P2P -> STOP -> RESUMING
	 *         RUNNING_P2P -> STOP -> STOP_COPY
	 *         STOP -> RUNNING_P2P -> PRE_COPY_P2P
	 *         STOP -> RUNNING_P2P -> RUNNING
	 *         STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
	 *         STOP_COPY -> STOP -> RESUMING
	 *         STOP_COPY -> STOP -> RUNNING_P2P
	 *         STOP_COPY -> STOP -> RUNNING_P2P -> RUNNING
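	 *
	 * For example, with every optional feature supported, one userspace
	 * request to move from RUNNING to STOP_COPY is handed to the driver
	 * as three arcs: RUNNING -> RUNNING_P2P, RUNNING_P2P -> STOP and
	 * STOP -> STOP_COPY, matching the RUNNING -> RUNNING_P2P -> STOP ->
	 * STOP_COPY combination listed above.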
	 *
	 * The following transitions are blocked:
	 *         STOP_COPY -> PRE_COPY
	 *         STOP_COPY -> PRE_COPY_P2P
	 */
	static const u8 vfio_from_fsm_table[VFIO_DEVICE_NUM_STATES][VFIO_DEVICE_NUM_STATES] = {
		[VFIO_DEVICE_STATE_STOP] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_RUNNING] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_PRE_COPY] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_PRE_COPY_P2P] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_STOP_COPY] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_RESUMING] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_RUNNING_P2P] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_ERROR] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
	};

	static const unsigned int state_flags_table[VFIO_DEVICE_NUM_STATES] = {
		[VFIO_DEVICE_STATE_STOP] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_RUNNING] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_PRE_COPY] =
			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY,
		[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_MIGRATION_STOP_COPY |
						   VFIO_MIGRATION_P2P |
						   VFIO_MIGRATION_PRE_COPY,
		[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_RESUMING] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_RUNNING_P2P] =
			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P,
		[VFIO_DEVICE_STATE_ERROR] = ~0U,
	};

	if (WARN_ON(cur_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
		    (state_flags_table[cur_fsm] & device->migration_flags) !=
			state_flags_table[cur_fsm]))
		return -EINVAL;

	if (new_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
	    (state_flags_table[new_fsm] & device->migration_flags) !=
			state_flags_table[new_fsm])
		return -EINVAL;

	/*
	 * Arcs touching optional and unsupported states are skipped over. The
	 * driver will instead see an arc from the original state to the next
	 * logical state, as per the above comment.
	 */
	*next_fsm = vfio_from_fsm_table[cur_fsm][new_fsm];
	while ((state_flags_table[*next_fsm] & device->migration_flags) !=
			state_flags_table[*next_fsm])
		*next_fsm = vfio_from_fsm_table[*next_fsm][new_fsm];

	return (*next_fsm != VFIO_DEVICE_STATE_ERROR) ? 0 : -EINVAL;
}
EXPORT_SYMBOL_GPL(vfio_mig_get_next_state);
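
/*
 * A hedged sketch of how a driver's migration_set_state() op is typically
 * built around vfio_mig_get_next_state(); my_step_one_arc() is hypothetical
 * and performs exactly one of the FSM arcs the driver implements:
 *
 *	while (cur != new) {
 *		enum vfio_device_mig_state next;
 *
 *		ret = vfio_mig_get_next_state(vdev, cur, new, &next);
 *		if (ret)
 *			break;
 *		ret = my_step_one_arc(vdev, next);
 *		if (ret)
 *			break;
 *		cur = next;
 *	}
 */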

/*
 * Convert the driver's struct file into an FD number and return it to userspace
 */
static int vfio_ioct_mig_return_fd(struct file *filp, void __user *arg,
				   struct vfio_device_feature_mig_state *mig)
{
	int ret;
	int fd;

	fd = get_unused_fd_flags(O_CLOEXEC);
	if (fd < 0) {
		ret = fd;
		goto out_fput;
	}

	mig->data_fd = fd;
	if (copy_to_user(arg, mig, sizeof(*mig))) {
		ret = -EFAULT;
		goto out_put_unused;
	}
	fd_install(fd, filp);
	return 0;

out_put_unused:
	put_unused_fd(fd);
out_fput:
	fput(filp);
	return ret;
}

static int
vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device,
					   u32 flags, void __user *arg,
					   size_t argsz)
{
	size_t minsz =
		offsetofend(struct vfio_device_feature_mig_state, data_fd);
	struct vfio_device_feature_mig_state mig;
	struct file *filp = NULL;
	int ret;

	if (!device->mig_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_SET |
				 VFIO_DEVICE_FEATURE_GET,
				 sizeof(mig));
	if (ret != 1)
		return ret;

	if (copy_from_user(&mig, arg, minsz))
		return -EFAULT;

	if (flags & VFIO_DEVICE_FEATURE_GET) {
		enum vfio_device_mig_state curr_state;

		ret = device->mig_ops->migration_get_state(device,
							   &curr_state);
		if (ret)
			return ret;
		mig.device_state = curr_state;
		goto out_copy;
	}

	/* Handle the VFIO_DEVICE_FEATURE_SET */
	filp = device->mig_ops->migration_set_state(device, mig.device_state);
	if (IS_ERR(filp) || !filp)
		goto out_copy;

	return vfio_ioct_mig_return_fd(filp, arg, &mig);
out_copy:
	mig.data_fd = -1;
	if (copy_to_user(arg, &mig, sizeof(mig)))
		return -EFAULT;
	if (IS_ERR(filp))
		return PTR_ERR(filp);
	return 0;
}

static int
vfio_ioctl_device_feature_migration_data_size(struct vfio_device *device,
					      u32 flags, void __user *arg,
					      size_t argsz)
{
	struct vfio_device_feature_mig_data_size data_size = {};
	unsigned long stop_copy_length;
	int ret;

	if (!device->mig_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
				 sizeof(data_size));
	if (ret != 1)
		return ret;

	ret = device->mig_ops->migration_get_data_size(device, &stop_copy_length);
	if (ret)
		return ret;

	data_size.stop_copy_length = stop_copy_length;
	if (copy_to_user(arg, &data_size, sizeof(data_size)))
		return -EFAULT;

	return 0;
}

static int vfio_ioctl_device_feature_migration(struct vfio_device *device,
					       u32 flags, void __user *arg,
					       size_t argsz)
{
	struct vfio_device_feature_migration mig = {
		.flags = device->migration_flags,
	};
	int ret;

	if (!device->mig_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
				 sizeof(mig));
	if (ret != 1)
		return ret;
	if (copy_to_user(arg, &mig, sizeof(mig)))
		return -EFAULT;
	return 0;
}

void vfio_combine_iova_ranges(struct rb_root_cached *root, u32 cur_nodes,
			      u32 req_nodes)
{
	struct interval_tree_node *prev, *curr, *comb_start, *comb_end;
	unsigned long min_gap, curr_gap;

	/* Special shortcut when a single range is required */
	if (req_nodes == 1) {
		unsigned long last;

		comb_start = interval_tree_iter_first(root, 0, ULONG_MAX);
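		/*
		 * Collapse the whole tree into this first node (if any): the
		 * walk below records the last covered address while removing
		 * every other node, then extends the first node to span it.
		 */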

		/* Empty list */
		if (WARN_ON_ONCE(!comb_start))
			return;

		curr = comb_start;
		while (curr) {
			last = curr->last;
			prev = curr;
			curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
			if (prev != comb_start)
				interval_tree_remove(prev, root);
		}
		comb_start->last = last;
		return;
	}

	/* Combine ranges which have the smallest gap */
	while (cur_nodes > req_nodes) {
		prev = NULL;
		min_gap = ULONG_MAX;
		curr = interval_tree_iter_first(root, 0, ULONG_MAX);
		while (curr) {
			if (prev) {
				curr_gap = curr->start - prev->last;
				if (curr_gap < min_gap) {
					min_gap = curr_gap;
					comb_start = prev;
					comb_end = curr;
				}
			}
			prev = curr;
			curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
		}

		/* Empty list or no nodes to combine */
		if (WARN_ON_ONCE(min_gap == ULONG_MAX))
			break;

		comb_start->last = comb_end->last;
		interval_tree_remove(comb_end, root);
		cur_nodes--;
	}
}
EXPORT_SYMBOL_GPL(vfio_combine_iova_ranges);

/* Ranges should fit into a single kernel page */
#define LOG_MAX_RANGES \
	(PAGE_SIZE / sizeof(struct vfio_device_feature_dma_logging_range))

static int
vfio_ioctl_device_feature_logging_start(struct vfio_device *device,
					u32 flags, void __user *arg,
					size_t argsz)
{
	size_t minsz =
		offsetofend(struct vfio_device_feature_dma_logging_control,
			    ranges);
	struct vfio_device_feature_dma_logging_range __user *ranges;
	struct vfio_device_feature_dma_logging_control control;
	struct vfio_device_feature_dma_logging_range range;
	struct rb_root_cached root = RB_ROOT_CACHED;
	struct interval_tree_node *nodes;
	u64 iova_end;
	u32 nnodes;
	int i, ret;

	if (!device->log_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_SET,
				 sizeof(control));
	if (ret != 1)
		return ret;

	if (copy_from_user(&control, arg, minsz))
		return -EFAULT;

	nnodes = control.num_ranges;
	if (!nnodes)
		return -EINVAL;

	if (nnodes > LOG_MAX_RANGES)
		return -E2BIG;

	ranges = u64_to_user_ptr(control.ranges);
	nodes = kmalloc_array(nnodes, sizeof(struct interval_tree_node),
			      GFP_KERNEL);
	if (!nodes)
		return -ENOMEM;

	for (i = 0; i < nnodes; i++) {
		if (copy_from_user(&range, &ranges[i], sizeof(range))) {
			ret = -EFAULT;
			goto end;
		}
		if (!IS_ALIGNED(range.iova, control.page_size) ||
		    !IS_ALIGNED(range.length, control.page_size)) {
			ret = -EINVAL;
			goto end;
		}

		if (check_add_overflow(range.iova, range.length, &iova_end) ||
		    iova_end > ULONG_MAX) {
			ret = -EOVERFLOW;
			goto end;
		}

		nodes[i].start = range.iova;
		nodes[i].last = range.iova + range.length - 1;
		if (interval_tree_iter_first(&root, nodes[i].start,
					     nodes[i].last)) {
			/* Range overlapping */
			ret = -EINVAL;
			goto end;
		}
		interval_tree_insert(nodes + i, &root);
	}

	ret = device->log_ops->log_start(device, &root, nnodes,
					 &control.page_size);
	if (ret)
		goto end;

	if (copy_to_user(arg, &control, sizeof(control))) {
		ret = -EFAULT;
		device->log_ops->log_stop(device);
	}

end:
	kfree(nodes);
	return ret;
}
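
/*
 * A hedged userspace sketch of the buffer layout consumed by the handler
 * above: one VFIO_DEVICE_FEATURE(SET | DMA_LOGGING_START) call carries a
 * dma_logging_control header whose ->ranges points at an array of
 * page-aligned {iova, length} ranges, at most LOG_MAX_RANGES long.
 * Error handling omitted:
 *
 *	struct vfio_device_feature_dma_logging_range range = {
 *		.iova = iova,
 *		.length = length,
 *	};
 *	__u8 buf[sizeof(struct vfio_device_feature) +
 *		 sizeof(struct vfio_device_feature_dma_logging_control)] = {};
 *	struct vfio_device_feature *feat = (void *)buf;
 *	struct vfio_device_feature_dma_logging_control *ctrl = (void *)feat->data;
 *
 *	feat->argsz = sizeof(buf);
 *	feat->flags = VFIO_DEVICE_FEATURE_SET |
 *		      VFIO_DEVICE_FEATURE_DMA_LOGGING_START;
 *	ctrl->page_size = 4096;
 *	ctrl->num_ranges = 1;
 *	ctrl->ranges = (__u64)(uintptr_t)&range;
 *	ioctl(device_fd, VFIO_DEVICE_FEATURE, feat);
 */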

static int
vfio_ioctl_device_feature_logging_stop(struct vfio_device *device,
				       u32 flags, void __user *arg,
				       size_t argsz)
{
	int ret;

	if (!device->log_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_SET, 0);
	if (ret != 1)
		return ret;

	return device->log_ops->log_stop(device);
}

static int vfio_device_log_read_and_clear(struct iova_bitmap *iter,
					  unsigned long iova, size_t length,
					  void *opaque)
{
	struct vfio_device *device = opaque;

	return device->log_ops->log_read_and_clear(device, iova, length, iter);
}

static int
vfio_ioctl_device_feature_logging_report(struct vfio_device *device,
					 u32 flags, void __user *arg,
					 size_t argsz)
{
	size_t minsz =
		offsetofend(struct vfio_device_feature_dma_logging_report,
			    bitmap);
	struct vfio_device_feature_dma_logging_report report;
	struct iova_bitmap *iter;
	u64 iova_end;
	int ret;

	if (!device->log_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_GET,
				 sizeof(report));
	if (ret != 1)
		return ret;

	if (copy_from_user(&report, arg, minsz))
		return -EFAULT;

	if (report.page_size < SZ_4K || !is_power_of_2(report.page_size))
		return -EINVAL;

	if (check_add_overflow(report.iova, report.length, &iova_end) ||
	    iova_end > ULONG_MAX)
		return -EOVERFLOW;

	iter = iova_bitmap_alloc(report.iova, report.length,
				 report.page_size,
				 u64_to_user_ptr(report.bitmap));
	if (IS_ERR(iter))
		return PTR_ERR(iter);

	ret = iova_bitmap_for_each(iter, device,
				   vfio_device_log_read_and_clear);

	iova_bitmap_free(iter);
	return ret;
}

static int vfio_ioctl_device_feature(struct vfio_device *device,
				     struct vfio_device_feature __user *arg)
{
	size_t minsz = offsetofend(struct vfio_device_feature, flags);
	struct vfio_device_feature feature;

	if (copy_from_user(&feature, arg, minsz))
		return -EFAULT;

	if (feature.argsz < minsz)
		return -EINVAL;

	/* Check unknown flags */
	if (feature.flags &
	    ~(VFIO_DEVICE_FEATURE_MASK | VFIO_DEVICE_FEATURE_SET |
	      VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_PROBE))
		return -EINVAL;

	/* GET & SET are mutually exclusive except with PROBE */
	if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) &&
	    (feature.flags & VFIO_DEVICE_FEATURE_SET) &&
	    (feature.flags & VFIO_DEVICE_FEATURE_GET))
		return -EINVAL;

	switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) {
	case VFIO_DEVICE_FEATURE_MIGRATION:
		return vfio_ioctl_device_feature_migration(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE:
		return vfio_ioctl_device_feature_mig_device_state(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_DMA_LOGGING_START:
		return vfio_ioctl_device_feature_logging_start(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP:
		return vfio_ioctl_device_feature_logging_stop(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT:
		return vfio_ioctl_device_feature_logging_report(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_MIG_DATA_SIZE:
		return vfio_ioctl_device_feature_migration_data_size(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	default:
		if (unlikely(!device->ops->device_feature))
			return -EINVAL;
		return device->ops->device_feature(device, feature.flags,
						   arg->data,
						   feature.argsz - minsz);
	}
}
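
/*
 * A hedged userspace sketch of the cdev path handled below (error handling
 * omitted; ioas_id is assumed to come from an IOMMU_IOAS_ALLOC call on the
 * iommufd): the device node must be bound to an iommufd and attached to an
 * IOAS before other ioctls are granted:
 *
 *	int iommufd = open("/dev/iommu", O_RDWR);
 *	int device_fd = open("/dev/vfio/devices/vfio0", O_RDWR);
 *	struct vfio_device_bind_iommufd bind = {
 *		.argsz = sizeof(bind),
 *		.iommufd = iommufd,
 *	};
 *	struct vfio_device_attach_iommufd_pt attach = {
 *		.argsz = sizeof(attach),
 *		.pt_id = ioas_id,
 *	};
 *
 *	ioctl(device_fd, VFIO_DEVICE_BIND_IOMMUFD, &bind);
 *	ioctl(device_fd, VFIO_DEVICE_ATTACH_IOMMUFD_PT, &attach);
 */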

static long vfio_device_fops_unl_ioctl(struct file *filep,
				       unsigned int cmd, unsigned long arg)
{
	struct vfio_device_file *df = filep->private_data;
	struct vfio_device *device = df->device;
	void __user *uptr = (void __user *)arg;
	int ret;

	if (cmd == VFIO_DEVICE_BIND_IOMMUFD)
		return vfio_df_ioctl_bind_iommufd(df, uptr);

	/* Paired with smp_store_release() following vfio_df_open() */
	if (!smp_load_acquire(&df->access_granted))
		return -EINVAL;

	ret = vfio_device_pm_runtime_get(device);
	if (ret)
		return ret;

	/* cdev only ioctls */
	if (IS_ENABLED(CONFIG_VFIO_DEVICE_CDEV) && !df->group) {
		switch (cmd) {
		case VFIO_DEVICE_ATTACH_IOMMUFD_PT:
			ret = vfio_df_ioctl_attach_pt(df, uptr);
			goto out;

		case VFIO_DEVICE_DETACH_IOMMUFD_PT:
			ret = vfio_df_ioctl_detach_pt(df, uptr);
			goto out;
		}
	}

	switch (cmd) {
	case VFIO_DEVICE_FEATURE:
		ret = vfio_ioctl_device_feature(device, uptr);
		break;

	default:
		if (unlikely(!device->ops->ioctl))
			ret = -EINVAL;
		else
			ret = device->ops->ioctl(device, cmd, arg);
		break;
	}
out:
	vfio_device_pm_runtime_put(device);
	return ret;
}

static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
				     size_t count, loff_t *ppos)
{
	struct vfio_device_file *df = filep->private_data;
	struct vfio_device *device = df->device;

	/* Paired with smp_store_release() following vfio_df_open() */
	if (!smp_load_acquire(&df->access_granted))
		return -EINVAL;

	if (unlikely(!device->ops->read))
		return -EINVAL;

	return device->ops->read(device, buf, count, ppos);
}

static ssize_t vfio_device_fops_write(struct file *filep,
				      const char __user *buf,
				      size_t count, loff_t *ppos)
{
	struct vfio_device_file *df = filep->private_data;
	struct vfio_device *device = df->device;

	/* Paired with smp_store_release() following vfio_df_open() */
	if (!smp_load_acquire(&df->access_granted))
		return -EINVAL;

	if (unlikely(!device->ops->write))
		return -EINVAL;

	return device->ops->write(device, buf, count, ppos);
}

static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
{
	struct vfio_device_file *df = filep->private_data;
	struct vfio_device *device = df->device;

	/* Paired with smp_store_release() following vfio_df_open() */
	if (!smp_load_acquire(&df->access_granted))
		return -EINVAL;

	if (unlikely(!device->ops->mmap))
		return -EINVAL;

	return device->ops->mmap(device, vma);
}

const struct file_operations vfio_device_fops = {
	.owner		= THIS_MODULE,
	.open		= vfio_device_fops_cdev_open,
	.release	= vfio_device_fops_release,
	.read		= vfio_device_fops_read,
	.write		= vfio_device_fops_write,
	.unlocked_ioctl	= vfio_device_fops_unl_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
	.mmap		= vfio_device_fops_mmap,
};

static struct vfio_device *vfio_device_from_file(struct file *file)
{
	struct vfio_device_file *df = file->private_data;

	if (file->f_op != &vfio_device_fops)
		return NULL;
	return df->device;
}

/**
 * vfio_file_is_valid - True if the file is a valid vfio file
 * @file: VFIO group file or VFIO device file
 */
bool vfio_file_is_valid(struct file *file)
{
	return vfio_group_from_file(file) ||
	       vfio_device_from_file(file);
}
EXPORT_SYMBOL_GPL(vfio_file_is_valid);

/**
 * vfio_file_enforced_coherent - True if the DMA associated with the VFIO file
 *        is always CPU cache coherent
 * @file: VFIO group file or VFIO device file
 *
 * Enforced coherency means that the IOMMU ignores things like the PCIe no-snoop
 * bit in DMA transactions. A return of false indicates that the user has
 * rights to access additional instructions such as wbinvd on x86.
 */
bool vfio_file_enforced_coherent(struct file *file)
{
	struct vfio_device *device;
	struct vfio_group *group;

	group = vfio_group_from_file(file);
	if (group)
		return vfio_group_enforced_coherent(group);

	device = vfio_device_from_file(file);
	if (device)
		return device_iommu_capable(device->dev,
					    IOMMU_CAP_ENFORCE_CACHE_COHERENCY);

	return true;
}
EXPORT_SYMBOL_GPL(vfio_file_enforced_coherent);

static void vfio_device_file_set_kvm(struct file *file, struct kvm *kvm)
{
	struct vfio_device_file *df = file->private_data;

	/*
	 * The kvm is first recorded in the vfio_device_file, and will
	 * be propagated to vfio_device::kvm when the file is bound to
	 * iommufd successfully in the vfio device cdev path.
	 */
	spin_lock(&df->kvm_ref_lock);
	df->kvm = kvm;
	spin_unlock(&df->kvm_ref_lock);
}

/**
 * vfio_file_set_kvm - Link a kvm with VFIO drivers
 * @file: VFIO group file or VFIO device file
 * @kvm: KVM to link
 *
 * When a VFIO device is first opened the KVM will be available in
 * device->kvm if one was associated with the file.
 */
void vfio_file_set_kvm(struct file *file, struct kvm *kvm)
{
	struct vfio_group *group;

	group = vfio_group_from_file(file);
	if (group)
		vfio_group_set_kvm(group, kvm);

	if (vfio_device_from_file(file))
		vfio_device_file_set_kvm(file, kvm);
}
EXPORT_SYMBOL_GPL(vfio_file_set_kvm);

/*
 * Sub-module support
 */
/*
 * Helper for managing a buffer of info chain capabilities, allocate or
 * reallocate a buffer with additional @size, filling in @id and @version
 * of the capability. A pointer to the new capability is returned.
 *
 * NB. The chain is based at the head of the buffer, so new entries are
 * added to the tail; vfio_info_cap_shift() should be called to fixup the
 * next offsets prior to copying to the user buffer.
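 *
 * A hedged sketch of the usual pattern in an ioctl handler (struct my_cap
 * and MY_CAP_ID are hypothetical; error handling omitted):
 *
 *	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
 *
 *	header = vfio_info_cap_add(&caps, sizeof(struct my_cap), MY_CAP_ID, 1);
 *	... fill in the capability body following *header ...
 *	vfio_info_cap_shift(&caps, sizeof(info));
 *	copy_to_user((void __user *)arg + sizeof(info), caps.buf, caps.size);
 *	kfree(caps.buf);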
 */
struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
					       size_t size, u16 id, u16 version)
{
	void *buf;
	struct vfio_info_cap_header *header, *tmp;

	/* Ensure that the next capability struct will be aligned */
	size = ALIGN(size, sizeof(u64));

	buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
	if (!buf) {
		kfree(caps->buf);
		caps->buf = NULL;
		caps->size = 0;
		return ERR_PTR(-ENOMEM);
	}

	caps->buf = buf;
	header = buf + caps->size;

	/* Eventually copied to user buffer, zero */
	memset(header, 0, size);

	header->id = id;
	header->version = version;

	/* Add to the end of the capability chain */
	for (tmp = buf; tmp->next; tmp = buf + tmp->next)
		; /* nothing */

	tmp->next = caps->size;
	caps->size += size;

	return header;
}
EXPORT_SYMBOL_GPL(vfio_info_cap_add);

void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
{
	struct vfio_info_cap_header *tmp;
	void *buf = (void *)caps->buf;

	/* Capability structs should start with proper alignment */
	WARN_ON(!IS_ALIGNED(offset, sizeof(u64)));

	for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
		tmp->next += offset;
}
EXPORT_SYMBOL(vfio_info_cap_shift);

int vfio_info_add_capability(struct vfio_info_cap *caps,
			     struct vfio_info_cap_header *cap, size_t size)
{
	struct vfio_info_cap_header *header;

	header = vfio_info_cap_add(caps, size, cap->id, cap->version);
	if (IS_ERR(header))
		return PTR_ERR(header);

	memcpy(header + 1, cap + 1, size - sizeof(*header));

	return 0;
}
EXPORT_SYMBOL(vfio_info_add_capability);

int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
				       int max_irq_type, size_t *data_size)
{
	unsigned long minsz;
	size_t size;

	minsz = offsetofend(struct vfio_irq_set, count);

	if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
	    (hdr->count >= (U32_MAX - hdr->start)) ||
	    (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
			    VFIO_IRQ_SET_ACTION_TYPE_MASK)))
		return -EINVAL;

	if (data_size)
		*data_size = 0;

	if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
		return -EINVAL;

	switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
	case VFIO_IRQ_SET_DATA_NONE:
		size = 0;
		break;
	case VFIO_IRQ_SET_DATA_BOOL:
		size = sizeof(uint8_t);
		break;
	case VFIO_IRQ_SET_DATA_EVENTFD:
		size = sizeof(int32_t);
		break;
	default:
		return -EINVAL;
	}

	if (size) {
		if (hdr->argsz - minsz < hdr->count * size)
			return -EINVAL;

		if (!data_size)
			return -EINVAL;

		*data_size = hdr->count * size;
	}

	return 0;
}
EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);

/*
 * Pin contiguous user pages and return their associated host pages for local
 * domain only.
 * @device [in]  : device
 * @iova [in]    : starting IOVA of user pages to be pinned.
 * @npage [in]   : count of pages to be pinned. This count should not
 *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
 * @prot [in]    : protection flags
 * @pages[out]   : array of host pages
 * Return error or number of pages pinned.
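 *
 * A hedged usage sketch (hypothetical emulated-IOMMU driver); only whole
 * pages are pinned, so the sub-page offset is re-applied by the caller:
 *
 *	struct page *page;
 *	int ret = vfio_pin_pages(vdev, iova, 1, IOMMU_READ | IOMMU_WRITE, &page);
 *
 *	if (ret == 1) {
 *		void *va = kmap_local_page(page);
 *
 *		... access va + (iova % PAGE_SIZE) ...
 *		kunmap_local(va);
 *		vfio_unpin_pages(vdev, iova, 1);
 *	}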
 *
 * A driver may only call this function if the vfio_device was created
 * by vfio_register_emulated_iommu_dev() due to vfio_device_container_pin_pages().
 */
int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova,
		   int npage, int prot, struct page **pages)
{
	/* group->container cannot change while a vfio device is open */
	if (!pages || !npage || WARN_ON(!vfio_assert_device_open(device)))
		return -EINVAL;
	if (!device->ops->dma_unmap)
		return -EINVAL;
	if (vfio_device_has_container(device))
		return vfio_device_container_pin_pages(device, iova,
						       npage, prot, pages);
	if (device->iommufd_access) {
		int ret;

		if (iova > ULONG_MAX)
			return -EINVAL;
		/*
		 * VFIO ignores the sub page offset, npages is from the start of
		 * a PAGE_SIZE chunk of IOVA. The caller is expected to recover
		 * the sub page offset by doing:
		 *     pages[0] + (iova % PAGE_SIZE)
		 */
		ret = iommufd_access_pin_pages(
			device->iommufd_access, ALIGN_DOWN(iova, PAGE_SIZE),
			npage * PAGE_SIZE, pages,
			(prot & IOMMU_WRITE) ? IOMMUFD_ACCESS_RW_WRITE : 0);
		if (ret)
			return ret;
		return npage;
	}
	return -EINVAL;
}
EXPORT_SYMBOL(vfio_pin_pages);

/*
 * Unpin contiguous host pages for local domain only.
 * @device [in]  : device
 * @iova [in]    : starting address of user pages to be unpinned.
 * @npage [in]   : count of pages to be unpinned. This count should not
 *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
 */
void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage)
{
	if (WARN_ON(!vfio_assert_device_open(device)))
		return;
	if (WARN_ON(!device->ops->dma_unmap))
		return;

	if (vfio_device_has_container(device)) {
		vfio_device_container_unpin_pages(device, iova, npage);
		return;
	}
	if (device->iommufd_access) {
		if (WARN_ON(iova > ULONG_MAX))
			return;
		iommufd_access_unpin_pages(device->iommufd_access,
					   ALIGN_DOWN(iova, PAGE_SIZE),
					   npage * PAGE_SIZE);
		return;
	}
}
EXPORT_SYMBOL(vfio_unpin_pages);

/*
 * This interface allows the CPUs to perform some sort of virtual DMA on
 * behalf of the device.
 *
 * CPUs read/write from/into a range of IOVAs pointing to user space memory
 * into/from a kernel buffer.
 *
 * As the read/write of user space memory is conducted via the CPUs and is
 * not a real device DMA, it is not necessary to pin the user space memory.
 *
 * @device [in]	: VFIO device
 * @iova [in]	: base IOVA of a user space buffer
 * @data [in]	: pointer to kernel buffer
 * @len [in]	: kernel buffer length
 * @write	: indicate read or write
 * Return error code on failure or 0 on success.
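 *
 * A hedged usage sketch (hypothetical emulated-IOMMU driver) reading a
 * 4-byte descriptor from guest IOVA space into a kernel variable:
 *
 *	__le32 desc;
 *	int ret = vfio_dma_rw(vdev, desc_iova, &desc, sizeof(desc), false);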
 */
int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data,
		size_t len, bool write)
{
	if (!data || len <= 0 || !vfio_assert_device_open(device))
		return -EINVAL;

	if (vfio_device_has_container(device))
		return vfio_device_container_dma_rw(device, iova,
						    data, len, write);

	if (device->iommufd_access) {
		unsigned int flags = 0;

		if (iova > ULONG_MAX)
			return -EINVAL;

		/* VFIO historically tries to auto-detect a kthread */
		if (!current->mm)
			flags |= IOMMUFD_ACCESS_RW_KTHREAD;
		if (write)
			flags |= IOMMUFD_ACCESS_RW_WRITE;
		return iommufd_access_rw(device->iommufd_access, iova, data,
					 len, flags);
	}
	return -EINVAL;
}
EXPORT_SYMBOL(vfio_dma_rw);

/*
 * Module/class support
 */
static int __init vfio_init(void)
{
	int ret;

	ida_init(&vfio.device_ida);

	ret = vfio_group_init();
	if (ret)
		return ret;

	ret = vfio_virqfd_init();
	if (ret)
		goto err_virqfd;

	/* /sys/class/vfio-dev/vfioX */
	vfio.device_class = class_create("vfio-dev");
	if (IS_ERR(vfio.device_class)) {
		ret = PTR_ERR(vfio.device_class);
		goto err_dev_class;
	}

	ret = vfio_cdev_init(vfio.device_class);
	if (ret)
		goto err_alloc_dev_chrdev;

	vfio_debugfs_create_root();
	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
	return 0;

err_alloc_dev_chrdev:
	class_destroy(vfio.device_class);
	vfio.device_class = NULL;
err_dev_class:
	vfio_virqfd_exit();
err_virqfd:
	vfio_group_cleanup();
	return ret;
}

static void __exit vfio_cleanup(void)
{
	vfio_debugfs_remove_root();
	ida_destroy(&vfio.device_ida);
	vfio_cdev_cleanup();
	class_destroy(vfio.device_class);
	vfio.device_class = NULL;
	vfio_virqfd_exit();
	vfio_group_cleanup();
	xa_destroy(&vfio_device_set_xa);
}

module_init(vfio_init);
module_exit(vfio_cleanup);

MODULE_IMPORT_NS(IOMMUFD);
MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);
MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");