1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * VFIO core 4 * 5 * Copyright (C) 2012 Red Hat, Inc. All rights reserved. 6 * Author: Alex Williamson <alex.williamson@redhat.com> 7 * 8 * Derived from original vfio: 9 * Copyright 2010 Cisco Systems, Inc. All rights reserved. 10 * Author: Tom Lyon, pugs@cisco.com 11 */ 12 13 #include <linux/cdev.h> 14 #include <linux/compat.h> 15 #include <linux/device.h> 16 #include <linux/fs.h> 17 #include <linux/idr.h> 18 #include <linux/iommu.h> 19 #ifdef CONFIG_HAVE_KVM 20 #include <linux/kvm_host.h> 21 #endif 22 #include <linux/list.h> 23 #include <linux/miscdevice.h> 24 #include <linux/module.h> 25 #include <linux/mutex.h> 26 #include <linux/pci.h> 27 #include <linux/rwsem.h> 28 #include <linux/sched.h> 29 #include <linux/slab.h> 30 #include <linux/stat.h> 31 #include <linux/string.h> 32 #include <linux/uaccess.h> 33 #include <linux/vfio.h> 34 #include <linux/wait.h> 35 #include <linux/sched/signal.h> 36 #include <linux/pm_runtime.h> 37 #include <linux/interval_tree.h> 38 #include <linux/iova_bitmap.h> 39 #include <linux/iommufd.h> 40 #include "vfio.h" 41 42 #define DRIVER_VERSION "0.3" 43 #define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>" 44 #define DRIVER_DESC "VFIO - User Level meta-driver" 45 46 static struct vfio { 47 struct class *device_class; 48 struct ida device_ida; 49 } vfio; 50 51 #ifdef CONFIG_VFIO_NOIOMMU 52 bool vfio_noiommu __read_mostly; 53 module_param_named(enable_unsafe_noiommu_mode, 54 vfio_noiommu, bool, S_IRUGO | S_IWUSR); 55 MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode. This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel. If you do not know what this is for, step away. (default: false)"); 56 #endif 57 58 static DEFINE_XARRAY(vfio_device_set_xa); 59 60 int vfio_assign_device_set(struct vfio_device *device, void *set_id) 61 { 62 unsigned long idx = (unsigned long)set_id; 63 struct vfio_device_set *new_dev_set; 64 struct vfio_device_set *dev_set; 65 66 if (WARN_ON(!set_id)) 67 return -EINVAL; 68 69 /* 70 * Atomically acquire a singleton object in the xarray for this set_id 71 */ 72 xa_lock(&vfio_device_set_xa); 73 dev_set = xa_load(&vfio_device_set_xa, idx); 74 if (dev_set) 75 goto found_get_ref; 76 xa_unlock(&vfio_device_set_xa); 77 78 new_dev_set = kzalloc(sizeof(*new_dev_set), GFP_KERNEL); 79 if (!new_dev_set) 80 return -ENOMEM; 81 mutex_init(&new_dev_set->lock); 82 INIT_LIST_HEAD(&new_dev_set->device_list); 83 new_dev_set->set_id = set_id; 84 85 xa_lock(&vfio_device_set_xa); 86 dev_set = __xa_cmpxchg(&vfio_device_set_xa, idx, NULL, new_dev_set, 87 GFP_KERNEL); 88 if (!dev_set) { 89 dev_set = new_dev_set; 90 goto found_get_ref; 91 } 92 93 kfree(new_dev_set); 94 if (xa_is_err(dev_set)) { 95 xa_unlock(&vfio_device_set_xa); 96 return xa_err(dev_set); 97 } 98 99 found_get_ref: 100 dev_set->device_count++; 101 xa_unlock(&vfio_device_set_xa); 102 mutex_lock(&dev_set->lock); 103 device->dev_set = dev_set; 104 list_add_tail(&device->dev_set_list, &dev_set->device_list); 105 mutex_unlock(&dev_set->lock); 106 return 0; 107 } 108 EXPORT_SYMBOL_GPL(vfio_assign_device_set); 109 110 static void vfio_release_device_set(struct vfio_device *device) 111 { 112 struct vfio_device_set *dev_set = device->dev_set; 113 114 if (!dev_set) 115 return; 116 117 mutex_lock(&dev_set->lock); 118 list_del(&device->dev_set_list); 119 mutex_unlock(&dev_set->lock); 120 121 xa_lock(&vfio_device_set_xa); 122 if (!--dev_set->device_count) { 123 __xa_erase(&vfio_device_set_xa, 124 (unsigned long)dev_set->set_id); 125 mutex_destroy(&dev_set->lock); 126 kfree(dev_set); 127 } 128 xa_unlock(&vfio_device_set_xa); 129 } 130 131 unsigned int vfio_device_set_open_count(struct vfio_device_set *dev_set) 132 { 133 struct vfio_device *cur; 134 unsigned int open_count = 0; 135 136 lockdep_assert_held(&dev_set->lock); 137 138 list_for_each_entry(cur, &dev_set->device_list, dev_set_list) 139 open_count += cur->open_count; 140 return open_count; 141 } 142 EXPORT_SYMBOL_GPL(vfio_device_set_open_count); 143 144 /* 145 * Device objects - create, release, get, put, search 146 */ 147 /* Device reference always implies a group reference */ 148 void vfio_device_put_registration(struct vfio_device *device) 149 { 150 if (refcount_dec_and_test(&device->refcount)) 151 complete(&device->comp); 152 } 153 154 bool vfio_device_try_get_registration(struct vfio_device *device) 155 { 156 return refcount_inc_not_zero(&device->refcount); 157 } 158 159 /* 160 * VFIO driver API 161 */ 162 /* Release helper called by vfio_put_device() */ 163 static void vfio_device_release(struct device *dev) 164 { 165 struct vfio_device *device = 166 container_of(dev, struct vfio_device, device); 167 168 vfio_release_device_set(device); 169 ida_free(&vfio.device_ida, device->index); 170 171 if (device->ops->release) 172 device->ops->release(device); 173 174 kvfree(device); 175 } 176 177 static int vfio_init_device(struct vfio_device *device, struct device *dev, 178 const struct vfio_device_ops *ops); 179 180 /* 181 * Allocate and initialize vfio_device so it can be registered to vfio 182 * core. 183 * 184 * Drivers should use the wrapper vfio_alloc_device() for allocation. 185 * @size is the size of the structure to be allocated, including any 186 * private data used by the driver. 187 * 188 * Driver may provide an @init callback to cover device private data. 189 * 190 * Use vfio_put_device() to release the structure after success return. 191 */ 192 struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev, 193 const struct vfio_device_ops *ops) 194 { 195 struct vfio_device *device; 196 int ret; 197 198 if (WARN_ON(size < sizeof(struct vfio_device))) 199 return ERR_PTR(-EINVAL); 200 201 device = kvzalloc(size, GFP_KERNEL); 202 if (!device) 203 return ERR_PTR(-ENOMEM); 204 205 ret = vfio_init_device(device, dev, ops); 206 if (ret) 207 goto out_free; 208 return device; 209 210 out_free: 211 kvfree(device); 212 return ERR_PTR(ret); 213 } 214 EXPORT_SYMBOL_GPL(_vfio_alloc_device); 215 216 /* 217 * Initialize a vfio_device so it can be registered to vfio core. 218 */ 219 static int vfio_init_device(struct vfio_device *device, struct device *dev, 220 const struct vfio_device_ops *ops) 221 { 222 int ret; 223 224 ret = ida_alloc_max(&vfio.device_ida, MINORMASK, GFP_KERNEL); 225 if (ret < 0) { 226 dev_dbg(dev, "Error to alloc index\n"); 227 return ret; 228 } 229 230 device->index = ret; 231 init_completion(&device->comp); 232 device->dev = dev; 233 device->ops = ops; 234 235 if (ops->init) { 236 ret = ops->init(device); 237 if (ret) 238 goto out_uninit; 239 } 240 241 device_initialize(&device->device); 242 device->device.release = vfio_device_release; 243 device->device.class = vfio.device_class; 244 device->device.parent = device->dev; 245 return 0; 246 247 out_uninit: 248 vfio_release_device_set(device); 249 ida_free(&vfio.device_ida, device->index); 250 return ret; 251 } 252 253 static int __vfio_register_dev(struct vfio_device *device, 254 enum vfio_group_type type) 255 { 256 int ret; 257 258 if (WARN_ON(IS_ENABLED(CONFIG_IOMMUFD) && 259 (!device->ops->bind_iommufd || 260 !device->ops->unbind_iommufd || 261 !device->ops->attach_ioas))) 262 return -EINVAL; 263 264 /* 265 * If the driver doesn't specify a set then the device is added to a 266 * singleton set just for itself. 267 */ 268 if (!device->dev_set) 269 vfio_assign_device_set(device, device); 270 271 ret = dev_set_name(&device->device, "vfio%d", device->index); 272 if (ret) 273 return ret; 274 275 ret = vfio_device_set_group(device, type); 276 if (ret) 277 return ret; 278 279 ret = device_add(&device->device); 280 if (ret) 281 goto err_out; 282 283 /* Refcounting can't start until the driver calls register */ 284 refcount_set(&device->refcount, 1); 285 286 vfio_device_group_register(device); 287 288 return 0; 289 err_out: 290 vfio_device_remove_group(device); 291 return ret; 292 } 293 294 int vfio_register_group_dev(struct vfio_device *device) 295 { 296 return __vfio_register_dev(device, VFIO_IOMMU); 297 } 298 EXPORT_SYMBOL_GPL(vfio_register_group_dev); 299 300 /* 301 * Register a virtual device without IOMMU backing. The user of this 302 * device must not be able to directly trigger unmediated DMA. 303 */ 304 int vfio_register_emulated_iommu_dev(struct vfio_device *device) 305 { 306 return __vfio_register_dev(device, VFIO_EMULATED_IOMMU); 307 } 308 EXPORT_SYMBOL_GPL(vfio_register_emulated_iommu_dev); 309 310 /* 311 * Decrement the device reference count and wait for the device to be 312 * removed. Open file descriptors for the device... */ 313 void vfio_unregister_group_dev(struct vfio_device *device) 314 { 315 unsigned int i = 0; 316 bool interrupted = false; 317 long rc; 318 319 vfio_device_put_registration(device); 320 rc = try_wait_for_completion(&device->comp); 321 while (rc <= 0) { 322 if (device->ops->request) 323 device->ops->request(device, i++); 324 325 if (interrupted) { 326 rc = wait_for_completion_timeout(&device->comp, 327 HZ * 10); 328 } else { 329 rc = wait_for_completion_interruptible_timeout( 330 &device->comp, HZ * 10); 331 if (rc < 0) { 332 interrupted = true; 333 dev_warn(device->dev, 334 "Device is currently in use, task" 335 " \"%s\" (%d) " 336 "blocked until device is released", 337 current->comm, task_pid_nr(current)); 338 } 339 } 340 } 341 342 vfio_device_group_unregister(device); 343 344 /* Balances device_add in register path */ 345 device_del(&device->device); 346 347 /* Balances vfio_device_set_group in register path */ 348 vfio_device_remove_group(device); 349 } 350 EXPORT_SYMBOL_GPL(vfio_unregister_group_dev); 351 352 #ifdef CONFIG_HAVE_KVM 353 void _vfio_device_get_kvm_safe(struct vfio_device *device, struct kvm *kvm) 354 { 355 void (*pfn)(struct kvm *kvm); 356 bool (*fn)(struct kvm *kvm); 357 bool ret; 358 359 lockdep_assert_held(&device->dev_set->lock); 360 361 pfn = symbol_get(kvm_put_kvm); 362 if (WARN_ON(!pfn)) 363 return; 364 365 fn = symbol_get(kvm_get_kvm_safe); 366 if (WARN_ON(!fn)) { 367 symbol_put(kvm_put_kvm); 368 return; 369 } 370 371 ret = fn(kvm); 372 symbol_put(kvm_get_kvm_safe); 373 if (!ret) { 374 symbol_put(kvm_put_kvm); 375 return; 376 } 377 378 device->put_kvm = pfn; 379 device->kvm = kvm; 380 } 381 382 void vfio_device_put_kvm(struct vfio_device *device) 383 { 384 lockdep_assert_held(&device->dev_set->lock); 385 386 if (!device->kvm) 387 return; 388 389 if (WARN_ON(!device->put_kvm)) 390 goto clear; 391 392 device->put_kvm(device->kvm); 393 device->put_kvm = NULL; 394 symbol_put(kvm_put_kvm); 395 396 clear: 397 device->kvm = NULL; 398 } 399 #endif 400 401 /* true if the vfio_device has open_device() called but not close_device() */ 402 static bool vfio_assert_device_open(struct vfio_device *device) 403 { 404 return !WARN_ON_ONCE(!READ_ONCE(device->open_count)); 405 } 406 407 static int vfio_device_first_open(struct vfio_device *device, 408 struct iommufd_ctx *iommufd) 409 { 410 int ret; 411 412 lockdep_assert_held(&device->dev_set->lock); 413 414 if (!try_module_get(device->dev->driver->owner)) 415 return -ENODEV; 416 417 if (iommufd) 418 ret = vfio_iommufd_bind(device, iommufd); 419 else 420 ret = vfio_device_group_use_iommu(device); 421 if (ret) 422 goto err_module_put; 423 424 if (device->ops->open_device) { 425 ret = device->ops->open_device(device); 426 if (ret) 427 goto err_unuse_iommu; 428 } 429 return 0; 430 431 err_unuse_iommu: 432 if (iommufd) 433 vfio_iommufd_unbind(device); 434 else 435 vfio_device_group_unuse_iommu(device); 436 err_module_put: 437 module_put(device->dev->driver->owner); 438 return ret; 439 } 440 441 static void vfio_device_last_close(struct vfio_device *device, 442 struct iommufd_ctx *iommufd) 443 { 444 lockdep_assert_held(&device->dev_set->lock); 445 446 if (device->ops->close_device) 447 device->ops->close_device(device); 448 if (iommufd) 449 vfio_iommufd_unbind(device); 450 else 451 vfio_device_group_unuse_iommu(device); 452 module_put(device->dev->driver->owner); 453 } 454 455 int vfio_device_open(struct vfio_device *device, struct iommufd_ctx *iommufd) 456 { 457 int ret = 0; 458 459 lockdep_assert_held(&device->dev_set->lock); 460 461 device->open_count++; 462 if (device->open_count == 1) { 463 ret = vfio_device_first_open(device, iommufd); 464 if (ret) 465 device->open_count--; 466 } 467 468 return ret; 469 } 470 471 void vfio_device_close(struct vfio_device *device, 472 struct iommufd_ctx *iommufd) 473 { 474 lockdep_assert_held(&device->dev_set->lock); 475 476 vfio_assert_device_open(device); 477 if (device->open_count == 1) 478 vfio_device_last_close(device, iommufd); 479 device->open_count--; 480 } 481 482 /* 483 * Wrapper around pm_runtime_resume_and_get(). 484 * Return error code on failure or 0 on success. 485 */ 486 static inline int vfio_device_pm_runtime_get(struct vfio_device *device) 487 { 488 struct device *dev = device->dev; 489 490 if (dev->driver && dev->driver->pm) { 491 int ret; 492 493 ret = pm_runtime_resume_and_get(dev); 494 if (ret) { 495 dev_info_ratelimited(dev, 496 "vfio: runtime resume failed %d\n", ret); 497 return -EIO; 498 } 499 } 500 501 return 0; 502 } 503 504 /* 505 * Wrapper around pm_runtime_put(). 506 */ 507 static inline void vfio_device_pm_runtime_put(struct vfio_device *device) 508 { 509 struct device *dev = device->dev; 510 511 if (dev->driver && dev->driver->pm) 512 pm_runtime_put(dev); 513 } 514 515 /* 516 * VFIO Device fd 517 */ 518 static int vfio_device_fops_release(struct inode *inode, struct file *filep) 519 { 520 struct vfio_device *device = filep->private_data; 521 522 vfio_device_group_close(device); 523 524 vfio_device_put_registration(device); 525 526 return 0; 527 } 528 529 /* 530 * vfio_mig_get_next_state - Compute the next step in the FSM 531 * @cur_fsm - The current state the device is in 532 * @new_fsm - The target state to reach 533 * @next_fsm - Pointer to the next step to get to new_fsm 534 * 535 * Return 0 upon success, otherwise -errno 536 * Upon success the next step in the state progression between cur_fsm and 537 * new_fsm will be set in next_fsm. 538 * 539 * This breaks down requests for combination transitions into smaller steps and 540 * returns the next step to get to new_fsm. The function may need to be called 541 * multiple times before reaching new_fsm. 542 * 543 */ 544 int vfio_mig_get_next_state(struct vfio_device *device, 545 enum vfio_device_mig_state cur_fsm, 546 enum vfio_device_mig_state new_fsm, 547 enum vfio_device_mig_state *next_fsm) 548 { 549 enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_PRE_COPY_P2P + 1 }; 550 /* 551 * The coding in this table requires the driver to implement the 552 * following FSM arcs: 553 * RESUMING -> STOP 554 * STOP -> RESUMING 555 * STOP -> STOP_COPY 556 * STOP_COPY -> STOP 557 * 558 * If P2P is supported then the driver must also implement these FSM 559 * arcs: 560 * RUNNING -> RUNNING_P2P 561 * RUNNING_P2P -> RUNNING 562 * RUNNING_P2P -> STOP 563 * STOP -> RUNNING_P2P 564 * 565 * If precopy is supported then the driver must support these additional 566 * FSM arcs: 567 * RUNNING -> PRE_COPY 568 * PRE_COPY -> RUNNING 569 * PRE_COPY -> STOP_COPY 570 * However, if precopy and P2P are supported together then the driver 571 * must support these additional arcs beyond the P2P arcs above: 572 * PRE_COPY -> RUNNING 573 * PRE_COPY -> PRE_COPY_P2P 574 * PRE_COPY_P2P -> PRE_COPY 575 * PRE_COPY_P2P -> RUNNING_P2P 576 * PRE_COPY_P2P -> STOP_COPY 577 * RUNNING -> PRE_COPY 578 * RUNNING_P2P -> PRE_COPY_P2P 579 * 580 * Without P2P and precopy the driver must implement: 581 * RUNNING -> STOP 582 * STOP -> RUNNING 583 * 584 * The coding will step through multiple states for some combination 585 * transitions; if all optional features are supported, this means the 586 * following ones: 587 * PRE_COPY -> PRE_COPY_P2P -> STOP_COPY 588 * PRE_COPY -> RUNNING -> RUNNING_P2P 589 * PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP 590 * PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP -> RESUMING 591 * PRE_COPY_P2P -> RUNNING_P2P -> RUNNING 592 * PRE_COPY_P2P -> RUNNING_P2P -> STOP 593 * PRE_COPY_P2P -> RUNNING_P2P -> STOP -> RESUMING 594 * RESUMING -> STOP -> RUNNING_P2P 595 * RESUMING -> STOP -> RUNNING_P2P -> PRE_COPY_P2P 596 * RESUMING -> STOP -> RUNNING_P2P -> RUNNING 597 * RESUMING -> STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY 598 * RESUMING -> STOP -> STOP_COPY 599 * RUNNING -> RUNNING_P2P -> PRE_COPY_P2P 600 * RUNNING -> RUNNING_P2P -> STOP 601 * RUNNING -> RUNNING_P2P -> STOP -> RESUMING 602 * RUNNING -> RUNNING_P2P -> STOP -> STOP_COPY 603 * RUNNING_P2P -> RUNNING -> PRE_COPY 604 * RUNNING_P2P -> STOP -> RESUMING 605 * RUNNING_P2P -> STOP -> STOP_COPY 606 * STOP -> RUNNING_P2P -> PRE_COPY_P2P 607 * STOP -> RUNNING_P2P -> RUNNING 608 * STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY 609 * STOP_COPY -> STOP -> RESUMING 610 * STOP_COPY -> STOP -> RUNNING_P2P 611 * STOP_COPY -> STOP -> RUNNING_P2P -> RUNNING 612 * 613 * The following transitions are blocked: 614 * STOP_COPY -> PRE_COPY 615 * STOP_COPY -> PRE_COPY_P2P 616 */ 617 static const u8 vfio_from_fsm_table[VFIO_DEVICE_NUM_STATES][VFIO_DEVICE_NUM_STATES] = { 618 [VFIO_DEVICE_STATE_STOP] = { 619 [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP, 620 [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P, 621 [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P, 622 [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P, 623 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY, 624 [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING, 625 [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P, 626 [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR, 627 }, 628 [VFIO_DEVICE_STATE_RUNNING] = { 629 [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P, 630 [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING, 631 [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY, 632 [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P, 633 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P, 634 [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P, 635 [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P, 636 [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR, 637 }, 638 [VFIO_DEVICE_STATE_PRE_COPY] = { 639 [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING, 640 [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING, 641 [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY, 642 [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P, 643 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_PRE_COPY_P2P, 644 [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING, 645 [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING, 646 [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR, 647 }, 648 [VFIO_DEVICE_STATE_PRE_COPY_P2P] = { 649 [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P, 650 [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P, 651 [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY, 652 [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P, 653 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY, 654 [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P, 655 [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P, 656 [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR, 657 }, 658 [VFIO_DEVICE_STATE_STOP_COPY] = { 659 [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP, 660 [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP, 661 [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR, 662 [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR, 663 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY, 664 [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP, 665 [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP, 666 [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR, 667 }, 668 [VFIO_DEVICE_STATE_RESUMING] = { 669 [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP, 670 [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP, 671 [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_STOP, 672 [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_STOP, 673 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP, 674 [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING, 675 [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP, 676 [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR, 677 }, 678 [VFIO_DEVICE_STATE_RUNNING_P2P] = { 679 [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP, 680 [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING, 681 [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING, 682 [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P, 683 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP, 684 [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP, 685 [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P, 686 [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR, 687 }, 688 [VFIO_DEVICE_STATE_ERROR] = { 689 [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_ERROR, 690 [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_ERROR, 691 [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR, 692 [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR, 693 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_ERROR, 694 [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_ERROR, 695 [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_ERROR, 696 [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR, 697 }, 698 }; 699 700 static const unsigned int state_flags_table[VFIO_DEVICE_NUM_STATES] = { 701 [VFIO_DEVICE_STATE_STOP] = VFIO_MIGRATION_STOP_COPY, 702 [VFIO_DEVICE_STATE_RUNNING] = VFIO_MIGRATION_STOP_COPY, 703 [VFIO_DEVICE_STATE_PRE_COPY] = 704 VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY, 705 [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_MIGRATION_STOP_COPY | 706 VFIO_MIGRATION_P2P | 707 VFIO_MIGRATION_PRE_COPY, 708 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_MIGRATION_STOP_COPY, 709 [VFIO_DEVICE_STATE_RESUMING] = VFIO_MIGRATION_STOP_COPY, 710 [VFIO_DEVICE_STATE_RUNNING_P2P] = 711 VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P, 712 [VFIO_DEVICE_STATE_ERROR] = ~0U, 713 }; 714 715 if (WARN_ON(cur_fsm >= ARRAY_SIZE(vfio_from_fsm_table) || 716 (state_flags_table[cur_fsm] & device->migration_flags) != 717 state_flags_table[cur_fsm])) 718 return -EINVAL; 719 720 if (new_fsm >= ARRAY_SIZE(vfio_from_fsm_table) || 721 (state_flags_table[new_fsm] & device->migration_flags) != 722 state_flags_table[new_fsm]) 723 return -EINVAL; 724 725 /* 726 * Arcs touching optional and unsupported states are skipped over. The 727 * driver will instead see an arc from the original state to the next 728 * logical state, as per the above comment. 729 */ 730 *next_fsm = vfio_from_fsm_table[cur_fsm][new_fsm]; 731 while ((state_flags_table[*next_fsm] & device->migration_flags) != 732 state_flags_table[*next_fsm]) 733 *next_fsm = vfio_from_fsm_table[*next_fsm][new_fsm]; 734 735 return (*next_fsm != VFIO_DEVICE_STATE_ERROR) ? 0 : -EINVAL; 736 } 737 EXPORT_SYMBOL_GPL(vfio_mig_get_next_state); 738 739 /* 740 * Convert the drivers's struct file into a FD number and return it to userspace 741 */ 742 static int vfio_ioct_mig_return_fd(struct file *filp, void __user *arg, 743 struct vfio_device_feature_mig_state *mig) 744 { 745 int ret; 746 int fd; 747 748 fd = get_unused_fd_flags(O_CLOEXEC); 749 if (fd < 0) { 750 ret = fd; 751 goto out_fput; 752 } 753 754 mig->data_fd = fd; 755 if (copy_to_user(arg, mig, sizeof(*mig))) { 756 ret = -EFAULT; 757 goto out_put_unused; 758 } 759 fd_install(fd, filp); 760 return 0; 761 762 out_put_unused: 763 put_unused_fd(fd); 764 out_fput: 765 fput(filp); 766 return ret; 767 } 768 769 static int 770 vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device, 771 u32 flags, void __user *arg, 772 size_t argsz) 773 { 774 size_t minsz = 775 offsetofend(struct vfio_device_feature_mig_state, data_fd); 776 struct vfio_device_feature_mig_state mig; 777 struct file *filp = NULL; 778 int ret; 779 780 if (!device->mig_ops) 781 return -ENOTTY; 782 783 ret = vfio_check_feature(flags, argsz, 784 VFIO_DEVICE_FEATURE_SET | 785 VFIO_DEVICE_FEATURE_GET, 786 sizeof(mig)); 787 if (ret != 1) 788 return ret; 789 790 if (copy_from_user(&mig, arg, minsz)) 791 return -EFAULT; 792 793 if (flags & VFIO_DEVICE_FEATURE_GET) { 794 enum vfio_device_mig_state curr_state; 795 796 ret = device->mig_ops->migration_get_state(device, 797 &curr_state); 798 if (ret) 799 return ret; 800 mig.device_state = curr_state; 801 goto out_copy; 802 } 803 804 /* Handle the VFIO_DEVICE_FEATURE_SET */ 805 filp = device->mig_ops->migration_set_state(device, mig.device_state); 806 if (IS_ERR(filp) || !filp) 807 goto out_copy; 808 809 return vfio_ioct_mig_return_fd(filp, arg, &mig); 810 out_copy: 811 mig.data_fd = -1; 812 if (copy_to_user(arg, &mig, sizeof(mig))) 813 return -EFAULT; 814 if (IS_ERR(filp)) 815 return PTR_ERR(filp); 816 return 0; 817 } 818 819 static int 820 vfio_ioctl_device_feature_migration_data_size(struct vfio_device *device, 821 u32 flags, void __user *arg, 822 size_t argsz) 823 { 824 struct vfio_device_feature_mig_data_size data_size = {}; 825 unsigned long stop_copy_length; 826 int ret; 827 828 if (!device->mig_ops) 829 return -ENOTTY; 830 831 ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET, 832 sizeof(data_size)); 833 if (ret != 1) 834 return ret; 835 836 ret = device->mig_ops->migration_get_data_size(device, &stop_copy_length); 837 if (ret) 838 return ret; 839 840 data_size.stop_copy_length = stop_copy_length; 841 if (copy_to_user(arg, &data_size, sizeof(data_size))) 842 return -EFAULT; 843 844 return 0; 845 } 846 847 static int vfio_ioctl_device_feature_migration(struct vfio_device *device, 848 u32 flags, void __user *arg, 849 size_t argsz) 850 { 851 struct vfio_device_feature_migration mig = { 852 .flags = device->migration_flags, 853 }; 854 int ret; 855 856 if (!device->mig_ops) 857 return -ENOTTY; 858 859 ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET, 860 sizeof(mig)); 861 if (ret != 1) 862 return ret; 863 if (copy_to_user(arg, &mig, sizeof(mig))) 864 return -EFAULT; 865 return 0; 866 } 867 868 /* Ranges should fit into a single kernel page */ 869 #define LOG_MAX_RANGES \ 870 (PAGE_SIZE / sizeof(struct vfio_device_feature_dma_logging_range)) 871 872 static int 873 vfio_ioctl_device_feature_logging_start(struct vfio_device *device, 874 u32 flags, void __user *arg, 875 size_t argsz) 876 { 877 size_t minsz = 878 offsetofend(struct vfio_device_feature_dma_logging_control, 879 ranges); 880 struct vfio_device_feature_dma_logging_range __user *ranges; 881 struct vfio_device_feature_dma_logging_control control; 882 struct vfio_device_feature_dma_logging_range range; 883 struct rb_root_cached root = RB_ROOT_CACHED; 884 struct interval_tree_node *nodes; 885 u64 iova_end; 886 u32 nnodes; 887 int i, ret; 888 889 if (!device->log_ops) 890 return -ENOTTY; 891 892 ret = vfio_check_feature(flags, argsz, 893 VFIO_DEVICE_FEATURE_SET, 894 sizeof(control)); 895 if (ret != 1) 896 return ret; 897 898 if (copy_from_user(&control, arg, minsz)) 899 return -EFAULT; 900 901 nnodes = control.num_ranges; 902 if (!nnodes) 903 return -EINVAL; 904 905 if (nnodes > LOG_MAX_RANGES) 906 return -E2BIG; 907 908 ranges = u64_to_user_ptr(control.ranges); 909 nodes = kmalloc_array(nnodes, sizeof(struct interval_tree_node), 910 GFP_KERNEL); 911 if (!nodes) 912 return -ENOMEM; 913 914 for (i = 0; i < nnodes; i++) { 915 if (copy_from_user(&range, &ranges[i], sizeof(range))) { 916 ret = -EFAULT; 917 goto end; 918 } 919 if (!IS_ALIGNED(range.iova, control.page_size) || 920 !IS_ALIGNED(range.length, control.page_size)) { 921 ret = -EINVAL; 922 goto end; 923 } 924 925 if (check_add_overflow(range.iova, range.length, &iova_end) || 926 iova_end > ULONG_MAX) { 927 ret = -EOVERFLOW; 928 goto end; 929 } 930 931 nodes[i].start = range.iova; 932 nodes[i].last = range.iova + range.length - 1; 933 if (interval_tree_iter_first(&root, nodes[i].start, 934 nodes[i].last)) { 935 /* Range overlapping */ 936 ret = -EINVAL; 937 goto end; 938 } 939 interval_tree_insert(nodes + i, &root); 940 } 941 942 ret = device->log_ops->log_start(device, &root, nnodes, 943 &control.page_size); 944 if (ret) 945 goto end; 946 947 if (copy_to_user(arg, &control, sizeof(control))) { 948 ret = -EFAULT; 949 device->log_ops->log_stop(device); 950 } 951 952 end: 953 kfree(nodes); 954 return ret; 955 } 956 957 static int 958 vfio_ioctl_device_feature_logging_stop(struct vfio_device *device, 959 u32 flags, void __user *arg, 960 size_t argsz) 961 { 962 int ret; 963 964 if (!device->log_ops) 965 return -ENOTTY; 966 967 ret = vfio_check_feature(flags, argsz, 968 VFIO_DEVICE_FEATURE_SET, 0); 969 if (ret != 1) 970 return ret; 971 972 return device->log_ops->log_stop(device); 973 } 974 975 static int vfio_device_log_read_and_clear(struct iova_bitmap *iter, 976 unsigned long iova, size_t length, 977 void *opaque) 978 { 979 struct vfio_device *device = opaque; 980 981 return device->log_ops->log_read_and_clear(device, iova, length, iter); 982 } 983 984 static int 985 vfio_ioctl_device_feature_logging_report(struct vfio_device *device, 986 u32 flags, void __user *arg, 987 size_t argsz) 988 { 989 size_t minsz = 990 offsetofend(struct vfio_device_feature_dma_logging_report, 991 bitmap); 992 struct vfio_device_feature_dma_logging_report report; 993 struct iova_bitmap *iter; 994 u64 iova_end; 995 int ret; 996 997 if (!device->log_ops) 998 return -ENOTTY; 999 1000 ret = vfio_check_feature(flags, argsz, 1001 VFIO_DEVICE_FEATURE_GET, 1002 sizeof(report)); 1003 if (ret != 1) 1004 return ret; 1005 1006 if (copy_from_user(&report, arg, minsz)) 1007 return -EFAULT; 1008 1009 if (report.page_size < SZ_4K || !is_power_of_2(report.page_size)) 1010 return -EINVAL; 1011 1012 if (check_add_overflow(report.iova, report.length, &iova_end) || 1013 iova_end > ULONG_MAX) 1014 return -EOVERFLOW; 1015 1016 iter = iova_bitmap_alloc(report.iova, report.length, 1017 report.page_size, 1018 u64_to_user_ptr(report.bitmap)); 1019 if (IS_ERR(iter)) 1020 return PTR_ERR(iter); 1021 1022 ret = iova_bitmap_for_each(iter, device, 1023 vfio_device_log_read_and_clear); 1024 1025 iova_bitmap_free(iter); 1026 return ret; 1027 } 1028 1029 static int vfio_ioctl_device_feature(struct vfio_device *device, 1030 struct vfio_device_feature __user *arg) 1031 { 1032 size_t minsz = offsetofend(struct vfio_device_feature, flags); 1033 struct vfio_device_feature feature; 1034 1035 if (copy_from_user(&feature, arg, minsz)) 1036 return -EFAULT; 1037 1038 if (feature.argsz < minsz) 1039 return -EINVAL; 1040 1041 /* Check unknown flags */ 1042 if (feature.flags & 1043 ~(VFIO_DEVICE_FEATURE_MASK | VFIO_DEVICE_FEATURE_SET | 1044 VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_PROBE)) 1045 return -EINVAL; 1046 1047 /* GET & SET are mutually exclusive except with PROBE */ 1048 if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) && 1049 (feature.flags & VFIO_DEVICE_FEATURE_SET) && 1050 (feature.flags & VFIO_DEVICE_FEATURE_GET)) 1051 return -EINVAL; 1052 1053 switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) { 1054 case VFIO_DEVICE_FEATURE_MIGRATION: 1055 return vfio_ioctl_device_feature_migration( 1056 device, feature.flags, arg->data, 1057 feature.argsz - minsz); 1058 case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE: 1059 return vfio_ioctl_device_feature_mig_device_state( 1060 device, feature.flags, arg->data, 1061 feature.argsz - minsz); 1062 case VFIO_DEVICE_FEATURE_DMA_LOGGING_START: 1063 return vfio_ioctl_device_feature_logging_start( 1064 device, feature.flags, arg->data, 1065 feature.argsz - minsz); 1066 case VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP: 1067 return vfio_ioctl_device_feature_logging_stop( 1068 device, feature.flags, arg->data, 1069 feature.argsz - minsz); 1070 case VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT: 1071 return vfio_ioctl_device_feature_logging_report( 1072 device, feature.flags, arg->data, 1073 feature.argsz - minsz); 1074 case VFIO_DEVICE_FEATURE_MIG_DATA_SIZE: 1075 return vfio_ioctl_device_feature_migration_data_size( 1076 device, feature.flags, arg->data, 1077 feature.argsz - minsz); 1078 default: 1079 if (unlikely(!device->ops->device_feature)) 1080 return -EINVAL; 1081 return device->ops->device_feature(device, feature.flags, 1082 arg->data, 1083 feature.argsz - minsz); 1084 } 1085 } 1086 1087 static long vfio_device_fops_unl_ioctl(struct file *filep, 1088 unsigned int cmd, unsigned long arg) 1089 { 1090 struct vfio_device *device = filep->private_data; 1091 int ret; 1092 1093 ret = vfio_device_pm_runtime_get(device); 1094 if (ret) 1095 return ret; 1096 1097 switch (cmd) { 1098 case VFIO_DEVICE_FEATURE: 1099 ret = vfio_ioctl_device_feature(device, (void __user *)arg); 1100 break; 1101 1102 default: 1103 if (unlikely(!device->ops->ioctl)) 1104 ret = -EINVAL; 1105 else 1106 ret = device->ops->ioctl(device, cmd, arg); 1107 break; 1108 } 1109 1110 vfio_device_pm_runtime_put(device); 1111 return ret; 1112 } 1113 1114 static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf, 1115 size_t count, loff_t *ppos) 1116 { 1117 struct vfio_device *device = filep->private_data; 1118 1119 if (unlikely(!device->ops->read)) 1120 return -EINVAL; 1121 1122 return device->ops->read(device, buf, count, ppos); 1123 } 1124 1125 static ssize_t vfio_device_fops_write(struct file *filep, 1126 const char __user *buf, 1127 size_t count, loff_t *ppos) 1128 { 1129 struct vfio_device *device = filep->private_data; 1130 1131 if (unlikely(!device->ops->write)) 1132 return -EINVAL; 1133 1134 return device->ops->write(device, buf, count, ppos); 1135 } 1136 1137 static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma) 1138 { 1139 struct vfio_device *device = filep->private_data; 1140 1141 if (unlikely(!device->ops->mmap)) 1142 return -EINVAL; 1143 1144 return device->ops->mmap(device, vma); 1145 } 1146 1147 const struct file_operations vfio_device_fops = { 1148 .owner = THIS_MODULE, 1149 .release = vfio_device_fops_release, 1150 .read = vfio_device_fops_read, 1151 .write = vfio_device_fops_write, 1152 .unlocked_ioctl = vfio_device_fops_unl_ioctl, 1153 .compat_ioctl = compat_ptr_ioctl, 1154 .mmap = vfio_device_fops_mmap, 1155 }; 1156 1157 /* 1158 * Sub-module support 1159 */ 1160 /* 1161 * Helper for managing a buffer of info chain capabilities, allocate or 1162 * reallocate a buffer with additional @size, filling in @id and @version 1163 * of the capability. A pointer to the new capability is returned. 1164 * 1165 * NB. The chain is based at the head of the buffer, so new entries are 1166 * added to the tail, vfio_info_cap_shift() should be called to fixup the 1167 * next offsets prior to copying to the user buffer. 1168 */ 1169 struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps, 1170 size_t size, u16 id, u16 version) 1171 { 1172 void *buf; 1173 struct vfio_info_cap_header *header, *tmp; 1174 1175 buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL); 1176 if (!buf) { 1177 kfree(caps->buf); 1178 caps->buf = NULL; 1179 caps->size = 0; 1180 return ERR_PTR(-ENOMEM); 1181 } 1182 1183 caps->buf = buf; 1184 header = buf + caps->size; 1185 1186 /* Eventually copied to user buffer, zero */ 1187 memset(header, 0, size); 1188 1189 header->id = id; 1190 header->version = version; 1191 1192 /* Add to the end of the capability chain */ 1193 for (tmp = buf; tmp->next; tmp = buf + tmp->next) 1194 ; /* nothing */ 1195 1196 tmp->next = caps->size; 1197 caps->size += size; 1198 1199 return header; 1200 } 1201 EXPORT_SYMBOL_GPL(vfio_info_cap_add); 1202 1203 void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset) 1204 { 1205 struct vfio_info_cap_header *tmp; 1206 void *buf = (void *)caps->buf; 1207 1208 for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset) 1209 tmp->next += offset; 1210 } 1211 EXPORT_SYMBOL(vfio_info_cap_shift); 1212 1213 int vfio_info_add_capability(struct vfio_info_cap *caps, 1214 struct vfio_info_cap_header *cap, size_t size) 1215 { 1216 struct vfio_info_cap_header *header; 1217 1218 header = vfio_info_cap_add(caps, size, cap->id, cap->version); 1219 if (IS_ERR(header)) 1220 return PTR_ERR(header); 1221 1222 memcpy(header + 1, cap + 1, size - sizeof(*header)); 1223 1224 return 0; 1225 } 1226 EXPORT_SYMBOL(vfio_info_add_capability); 1227 1228 int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs, 1229 int max_irq_type, size_t *data_size) 1230 { 1231 unsigned long minsz; 1232 size_t size; 1233 1234 minsz = offsetofend(struct vfio_irq_set, count); 1235 1236 if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) || 1237 (hdr->count >= (U32_MAX - hdr->start)) || 1238 (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK | 1239 VFIO_IRQ_SET_ACTION_TYPE_MASK))) 1240 return -EINVAL; 1241 1242 if (data_size) 1243 *data_size = 0; 1244 1245 if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs) 1246 return -EINVAL; 1247 1248 switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) { 1249 case VFIO_IRQ_SET_DATA_NONE: 1250 size = 0; 1251 break; 1252 case VFIO_IRQ_SET_DATA_BOOL: 1253 size = sizeof(uint8_t); 1254 break; 1255 case VFIO_IRQ_SET_DATA_EVENTFD: 1256 size = sizeof(int32_t); 1257 break; 1258 default: 1259 return -EINVAL; 1260 } 1261 1262 if (size) { 1263 if (hdr->argsz - minsz < hdr->count * size) 1264 return -EINVAL; 1265 1266 if (!data_size) 1267 return -EINVAL; 1268 1269 *data_size = hdr->count * size; 1270 } 1271 1272 return 0; 1273 } 1274 EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare); 1275 1276 /* 1277 * Pin contiguous user pages and return their associated host pages for local 1278 * domain only. 1279 * @device [in] : device 1280 * @iova [in] : starting IOVA of user pages to be pinned. 1281 * @npage [in] : count of pages to be pinned. This count should not 1282 * be greater than VFIO_PIN_PAGES_MAX_ENTRIES. 1283 * @prot [in] : protection flags 1284 * @pages[out] : array of host pages 1285 * Return error or number of pages pinned. 1286 * 1287 * A driver may only call this function if the vfio_device was created 1288 * by vfio_register_emulated_iommu_dev() due to vfio_device_container_pin_pages(). 1289 */ 1290 int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova, 1291 int npage, int prot, struct page **pages) 1292 { 1293 /* group->container cannot change while a vfio device is open */ 1294 if (!pages || !npage || WARN_ON(!vfio_assert_device_open(device))) 1295 return -EINVAL; 1296 if (vfio_device_has_container(device)) 1297 return vfio_device_container_pin_pages(device, iova, 1298 npage, prot, pages); 1299 if (device->iommufd_access) { 1300 int ret; 1301 1302 if (iova > ULONG_MAX) 1303 return -EINVAL; 1304 /* 1305 * VFIO ignores the sub page offset, npages is from the start of 1306 * a PAGE_SIZE chunk of IOVA. The caller is expected to recover 1307 * the sub page offset by doing: 1308 * pages[0] + (iova % PAGE_SIZE) 1309 */ 1310 ret = iommufd_access_pin_pages( 1311 device->iommufd_access, ALIGN_DOWN(iova, PAGE_SIZE), 1312 npage * PAGE_SIZE, pages, 1313 (prot & IOMMU_WRITE) ? IOMMUFD_ACCESS_RW_WRITE : 0); 1314 if (ret) 1315 return ret; 1316 return npage; 1317 } 1318 return -EINVAL; 1319 } 1320 EXPORT_SYMBOL(vfio_pin_pages); 1321 1322 /* 1323 * Unpin contiguous host pages for local domain only. 1324 * @device [in] : device 1325 * @iova [in] : starting address of user pages to be unpinned. 1326 * @npage [in] : count of pages to be unpinned. This count should not 1327 * be greater than VFIO_PIN_PAGES_MAX_ENTRIES. 1328 */ 1329 void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage) 1330 { 1331 if (WARN_ON(!vfio_assert_device_open(device))) 1332 return; 1333 1334 if (vfio_device_has_container(device)) { 1335 vfio_device_container_unpin_pages(device, iova, npage); 1336 return; 1337 } 1338 if (device->iommufd_access) { 1339 if (WARN_ON(iova > ULONG_MAX)) 1340 return; 1341 iommufd_access_unpin_pages(device->iommufd_access, 1342 ALIGN_DOWN(iova, PAGE_SIZE), 1343 npage * PAGE_SIZE); 1344 return; 1345 } 1346 } 1347 EXPORT_SYMBOL(vfio_unpin_pages); 1348 1349 /* 1350 * This interface allows the CPUs to perform some sort of virtual DMA on 1351 * behalf of the device. 1352 * 1353 * CPUs read/write from/into a range of IOVAs pointing to user space memory 1354 * into/from a kernel buffer. 1355 * 1356 * As the read/write of user space memory is conducted via the CPUs and is 1357 * not a real device DMA, it is not necessary to pin the user space memory. 1358 * 1359 * @device [in] : VFIO device 1360 * @iova [in] : base IOVA of a user space buffer 1361 * @data [in] : pointer to kernel buffer 1362 * @len [in] : kernel buffer length 1363 * @write : indicate read or write 1364 * Return error code on failure or 0 on success. 1365 */ 1366 int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data, 1367 size_t len, bool write) 1368 { 1369 if (!data || len <= 0 || !vfio_assert_device_open(device)) 1370 return -EINVAL; 1371 1372 if (vfio_device_has_container(device)) 1373 return vfio_device_container_dma_rw(device, iova, 1374 data, len, write); 1375 1376 if (device->iommufd_access) { 1377 unsigned int flags = 0; 1378 1379 if (iova > ULONG_MAX) 1380 return -EINVAL; 1381 1382 /* VFIO historically tries to auto-detect a kthread */ 1383 if (!current->mm) 1384 flags |= IOMMUFD_ACCESS_RW_KTHREAD; 1385 if (write) 1386 flags |= IOMMUFD_ACCESS_RW_WRITE; 1387 return iommufd_access_rw(device->iommufd_access, iova, data, 1388 len, flags); 1389 } 1390 return -EINVAL; 1391 } 1392 EXPORT_SYMBOL(vfio_dma_rw); 1393 1394 /* 1395 * Module/class support 1396 */ 1397 static int __init vfio_init(void) 1398 { 1399 int ret; 1400 1401 ida_init(&vfio.device_ida); 1402 1403 ret = vfio_group_init(); 1404 if (ret) 1405 return ret; 1406 1407 ret = vfio_virqfd_init(); 1408 if (ret) 1409 goto err_virqfd; 1410 1411 /* /sys/class/vfio-dev/vfioX */ 1412 vfio.device_class = class_create("vfio-dev"); 1413 if (IS_ERR(vfio.device_class)) { 1414 ret = PTR_ERR(vfio.device_class); 1415 goto err_dev_class; 1416 } 1417 1418 pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n"); 1419 return 0; 1420 1421 err_dev_class: 1422 vfio_virqfd_exit(); 1423 err_virqfd: 1424 vfio_group_cleanup(); 1425 return ret; 1426 } 1427 1428 static void __exit vfio_cleanup(void) 1429 { 1430 ida_destroy(&vfio.device_ida); 1431 class_destroy(vfio.device_class); 1432 vfio.device_class = NULL; 1433 vfio_virqfd_exit(); 1434 vfio_group_cleanup(); 1435 xa_destroy(&vfio_device_set_xa); 1436 } 1437 1438 module_init(vfio_init); 1439 module_exit(vfio_cleanup); 1440 1441 MODULE_VERSION(DRIVER_VERSION); 1442 MODULE_LICENSE("GPL v2"); 1443 MODULE_AUTHOR(DRIVER_AUTHOR); 1444 MODULE_DESCRIPTION(DRIVER_DESC); 1445 MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce"); 1446