// SPDX-License-Identifier: GPL-2.0-only
/*
 * VFIO core
 *
 * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * Derived from original vfio:
 * Copyright 2010 Cisco Systems, Inc. All rights reserved.
 * Author: Tom Lyon, pugs@cisco.com
 */

#include <linux/cdev.h>
#include <linux/compat.h>
#include <linux/device.h>
#include <linux/fs.h>
#include <linux/idr.h>
#include <linux/iommu.h>
#include <linux/list.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/pci.h>
#include <linux/rwsem.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/wait.h>
#include <linux/sched/signal.h>
#include <linux/pm_runtime.h>
#include <linux/interval_tree.h>
#include <linux/iova_bitmap.h>
#include <linux/iommufd.h>
#include "vfio.h"

#define DRIVER_VERSION	"0.3"
#define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC	"VFIO - User Level meta-driver"

static struct vfio {
	struct class			*device_class;
	struct ida			device_ida;
} vfio;

static DEFINE_XARRAY(vfio_device_set_xa);

int vfio_assign_device_set(struct vfio_device *device, void *set_id)
{
	unsigned long idx = (unsigned long)set_id;
	struct vfio_device_set *new_dev_set;
	struct vfio_device_set *dev_set;

	if (WARN_ON(!set_id))
		return -EINVAL;

	/*
	 * Atomically acquire a singleton object in the xarray for this set_id
	 */
	xa_lock(&vfio_device_set_xa);
	dev_set = xa_load(&vfio_device_set_xa, idx);
	if (dev_set)
		goto found_get_ref;
	xa_unlock(&vfio_device_set_xa);

	new_dev_set = kzalloc(sizeof(*new_dev_set), GFP_KERNEL);
	if (!new_dev_set)
		return -ENOMEM;
	mutex_init(&new_dev_set->lock);
	INIT_LIST_HEAD(&new_dev_set->device_list);
	new_dev_set->set_id = set_id;

	xa_lock(&vfio_device_set_xa);
	dev_set = __xa_cmpxchg(&vfio_device_set_xa, idx, NULL, new_dev_set,
			       GFP_KERNEL);
	if (!dev_set) {
		dev_set = new_dev_set;
		goto found_get_ref;
	}

	kfree(new_dev_set);
	if (xa_is_err(dev_set)) {
		xa_unlock(&vfio_device_set_xa);
		return xa_err(dev_set);
	}

found_get_ref:
	dev_set->device_count++;
	xa_unlock(&vfio_device_set_xa);
	mutex_lock(&dev_set->lock);
	device->dev_set = dev_set;
	list_add_tail(&device->dev_set_list, &dev_set->device_list);
	mutex_unlock(&dev_set->lock);
	return 0;
}
EXPORT_SYMBOL_GPL(vfio_assign_device_set);

static void vfio_release_device_set(struct vfio_device *device)
{
	struct vfio_device_set *dev_set = device->dev_set;

	if (!dev_set)
		return;

	mutex_lock(&dev_set->lock);
	list_del(&device->dev_set_list);
	mutex_unlock(&dev_set->lock);

	xa_lock(&vfio_device_set_xa);
	if (!--dev_set->device_count) {
		__xa_erase(&vfio_device_set_xa,
			   (unsigned long)dev_set->set_id);
		mutex_destroy(&dev_set->lock);
		kfree(dev_set);
	}
	xa_unlock(&vfio_device_set_xa);
}

unsigned int vfio_device_set_open_count(struct vfio_device_set *dev_set)
{
	struct vfio_device *cur;
	unsigned int open_count = 0;

	lockdep_assert_held(&dev_set->lock);

	list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
		open_count += cur->open_count;
	return open_count;
}
EXPORT_SYMBOL_GPL(vfio_device_set_open_count);
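
/*
 * Example (illustrative sketch, not part of this file): a driver whose
 * functions share a reset domain can key the device set on any stable
 * pointer, e.g. the physical function's struct pci_dev, so that all
 * related devices land in one vfio_device_set.  The "mydrv_" names are
 * hypothetical.
 *
 *	static int mydrv_init(struct vfio_device *vdev)
 *	{
 *		struct pci_dev *pdev = to_pci_dev(vdev->dev);
 *
 *		return vfio_assign_device_set(vdev, pci_physfn(pdev));
 *	}
 *
 * A set-wide operation (such as a reset affecting every function) can
 * then use vfio_device_set_open_count() under dev_set->lock to refuse
 * the operation while other devices of the set are open:
 *
 *	lockdep_assert_held(&vdev->dev_set->lock);
 *	if (vfio_device_set_open_count(vdev->dev_set) > 1)
 *		return -EBUSY;
 */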

/*
 * Device objects - create, release, get, put, search
 */
/* Device reference always implies a group reference */
void vfio_device_put_registration(struct vfio_device *device)
{
	if (refcount_dec_and_test(&device->refcount))
		complete(&device->comp);
}

bool vfio_device_try_get_registration(struct vfio_device *device)
{
	return refcount_inc_not_zero(&device->refcount);
}

/*
 * VFIO driver API
 */
/* Release helper called by vfio_put_device() */
static void vfio_device_release(struct device *dev)
{
	struct vfio_device *device =
			container_of(dev, struct vfio_device, device);

	vfio_release_device_set(device);
	ida_free(&vfio.device_ida, device->index);

	/*
	 * kvfree() cannot be done here due to a life cycle mess in
	 * vfio-ccw. Before the ccw part is fixed all drivers are
	 * required to support @release and call vfio_free_device()
	 * from there.
	 */
	device->ops->release(device);
}

/*
 * Allocate and initialize a vfio_device so it can be registered to the
 * vfio core.
 *
 * Drivers should use the wrapper vfio_alloc_device() for allocation.
 * @size is the size of the structure to be allocated, including any
 * private data used by the driver.
 *
 * A driver may provide an @init callback to initialize device private
 * data.
 *
 * Use vfio_put_device() to release the structure after a successful
 * return.
 */
struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev,
				       const struct vfio_device_ops *ops)
{
	struct vfio_device *device;
	int ret;

	if (WARN_ON(size < sizeof(struct vfio_device)))
		return ERR_PTR(-EINVAL);

	device = kvzalloc(size, GFP_KERNEL);
	if (!device)
		return ERR_PTR(-ENOMEM);

	ret = vfio_init_device(device, dev, ops);
	if (ret)
		goto out_free;
	return device;

out_free:
	kvfree(device);
	return ERR_PTR(ret);
}
EXPORT_SYMBOL_GPL(_vfio_alloc_device);

/*
 * Initialize a vfio_device so it can be registered to the vfio core.
 *
 * Only the vfio-ccw driver should call this interface.
 */
int vfio_init_device(struct vfio_device *device, struct device *dev,
		     const struct vfio_device_ops *ops)
{
	int ret;

	ret = ida_alloc_max(&vfio.device_ida, MINORMASK, GFP_KERNEL);
	if (ret < 0) {
		dev_dbg(dev, "Failed to allocate device index\n");
		return ret;
	}

	device->index = ret;
	init_completion(&device->comp);
	device->dev = dev;
	device->ops = ops;

	if (ops->init) {
		ret = ops->init(device);
		if (ret)
			goto out_uninit;
	}

	device_initialize(&device->device);
	device->device.release = vfio_device_release;
	device->device.class = vfio.device_class;
	device->device.parent = device->dev;
	return 0;

out_uninit:
	vfio_release_device_set(device);
	ida_free(&vfio.device_ida, device->index);
	return ret;
}
EXPORT_SYMBOL_GPL(vfio_init_device);

/*
 * Helper called by a driver's @release callback to free the device
 * structure. Drivers which have no private data to clean up can simply
 * use this helper as their @release.
 */
void vfio_free_device(struct vfio_device *device)
{
	kvfree(device);
}
EXPORT_SYMBOL_GPL(vfio_free_device);
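
/*
 * Example (illustrative sketch, not part of this file): typical life
 * cycle of a vfio_device from a driver's probe/remove path.  The
 * "mydrv_" names are hypothetical; struct vfio_device must be the
 * first member of the driver structure for vfio_alloc_device().
 *
 *	struct mydrv_device {
 *		struct vfio_device vdev;	// must be placed first
 *		void __iomem *regs;
 *	};
 *
 *	static const struct vfio_device_ops mydrv_ops = {
 *		.init		= mydrv_init,
 *		.release	= vfio_free_device,	// no private cleanup
 *		// open_device/close_device/ioctl etc. as needed
 *	};
 *
 *	static int mydrv_probe(struct device *dev)
 *	{
 *		struct mydrv_device *mydev;
 *		int ret;
 *
 *		mydev = vfio_alloc_device(mydrv_device, vdev, dev, &mydrv_ops);
 *		if (IS_ERR(mydev))
 *			return PTR_ERR(mydev);
 *
 *		ret = vfio_register_group_dev(&mydev->vdev);
 *		if (ret) {
 *			vfio_put_device(&mydev->vdev);
 *			return ret;
 *		}
 *		dev_set_drvdata(dev, mydev);
 *		return 0;
 *	}
 *
 *	static void mydrv_remove(struct device *dev)
 *	{
 *		struct mydrv_device *mydev = dev_get_drvdata(dev);
 *
 *		vfio_unregister_group_dev(&mydev->vdev);
 *		vfio_put_device(&mydev->vdev);
 *	}
 */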

static int __vfio_register_dev(struct vfio_device *device,
			       enum vfio_group_type type)
{
	int ret;

	if (WARN_ON(device->ops->bind_iommufd &&
		    (!device->ops->unbind_iommufd ||
		     !device->ops->attach_ioas)))
		return -EINVAL;

	/*
	 * If the driver doesn't specify a set then the device is added to a
	 * singleton set just for itself.
	 */
	if (!device->dev_set)
		vfio_assign_device_set(device, device);

	ret = dev_set_name(&device->device, "vfio%d", device->index);
	if (ret)
		return ret;

	ret = vfio_device_set_group(device, type);
	if (ret)
		return ret;

	ret = device_add(&device->device);
	if (ret)
		goto err_out;

	/* Refcounting can't start until the driver calls register */
	refcount_set(&device->refcount, 1);

	vfio_device_group_register(device);

	return 0;
err_out:
	vfio_device_remove_group(device);
	return ret;
}

int vfio_register_group_dev(struct vfio_device *device)
{
	return __vfio_register_dev(device, VFIO_IOMMU);
}
EXPORT_SYMBOL_GPL(vfio_register_group_dev);

/*
 * Register a virtual device without IOMMU backing. The user of this
 * device must not be able to directly trigger unmediated DMA.
 */
int vfio_register_emulated_iommu_dev(struct vfio_device *device)
{
	return __vfio_register_dev(device, VFIO_EMULATED_IOMMU);
}
EXPORT_SYMBOL_GPL(vfio_register_emulated_iommu_dev);

/*
 * Decrement the device reference count and wait for the device to be
 * removed. Open file descriptors for the device... */
void vfio_unregister_group_dev(struct vfio_device *device)
{
	unsigned int i = 0;
	bool interrupted = false;
	long rc;

	vfio_device_put_registration(device);
	rc = try_wait_for_completion(&device->comp);
	while (rc <= 0) {
		if (device->ops->request)
			device->ops->request(device, i++);

		if (interrupted) {
			rc = wait_for_completion_timeout(&device->comp,
							 HZ * 10);
		} else {
			rc = wait_for_completion_interruptible_timeout(
				&device->comp, HZ * 10);
			if (rc < 0) {
				interrupted = true;
				dev_warn(device->dev,
					 "Device is currently in use, task"
					 " \"%s\" (%d) "
					 "blocked until device is released",
					 current->comm, task_pid_nr(current));
			}
		}
	}

	vfio_device_group_unregister(device);

	/* Balances device_add in register path */
	device_del(&device->device);

	/* Balances vfio_device_set_group in register path */
	vfio_device_remove_group(device);
}
EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);

/* true if the vfio_device has open_device() called but not close_device() */
static bool vfio_assert_device_open(struct vfio_device *device)
{
	return !WARN_ON_ONCE(!READ_ONCE(device->open_count));
}

static int vfio_device_first_open(struct vfio_device *device,
				  struct iommufd_ctx *iommufd, struct kvm *kvm)
{
	int ret;

	lockdep_assert_held(&device->dev_set->lock);

	if (!try_module_get(device->dev->driver->owner))
		return -ENODEV;

	if (iommufd)
		ret = vfio_iommufd_bind(device, iommufd);
	else
		ret = vfio_device_group_use_iommu(device);
	if (ret)
		goto err_module_put;

	device->kvm = kvm;
	if (device->ops->open_device) {
		ret = device->ops->open_device(device);
		if (ret)
			goto err_unuse_iommu;
	}
	return 0;

err_unuse_iommu:
	device->kvm = NULL;
	if (iommufd)
		vfio_iommufd_unbind(device);
	else
		vfio_device_group_unuse_iommu(device);
err_module_put:
	module_put(device->dev->driver->owner);
	return ret;
}

static void vfio_device_last_close(struct vfio_device *device,
				   struct iommufd_ctx *iommufd)
{
	lockdep_assert_held(&device->dev_set->lock);

	if (device->ops->close_device)
		device->ops->close_device(device);
	device->kvm = NULL;
	if (iommufd)
		vfio_iommufd_unbind(device);
	else
		vfio_device_group_unuse_iommu(device);
	module_put(device->dev->driver->owner);
}

int vfio_device_open(struct vfio_device *device,
		     struct iommufd_ctx *iommufd, struct kvm *kvm)
{
	int ret = 0;

	mutex_lock(&device->dev_set->lock);
	device->open_count++;
	if (device->open_count == 1) {
		ret = vfio_device_first_open(device, iommufd, kvm);
		if (ret)
			device->open_count--;
	}
	mutex_unlock(&device->dev_set->lock);

	return ret;
}

void vfio_device_close(struct vfio_device *device,
		       struct iommufd_ctx *iommufd)
{
	mutex_lock(&device->dev_set->lock);
	vfio_assert_device_open(device);
	if (device->open_count == 1)
		vfio_device_last_close(device, iommufd);
	device->open_count--;
	mutex_unlock(&device->dev_set->lock);
}

/*
 * Wrapper around pm_runtime_resume_and_get().
 * Return error code on failure or 0 on success.
 */
static inline int vfio_device_pm_runtime_get(struct vfio_device *device)
{
	struct device *dev = device->dev;

	if (dev->driver && dev->driver->pm) {
		int ret;

		ret = pm_runtime_resume_and_get(dev);
		if (ret) {
			dev_info_ratelimited(dev,
				"vfio: runtime resume failed %d\n", ret);
			return -EIO;
		}
	}

	return 0;
}

/*
 * Wrapper around pm_runtime_put().
 */
static inline void vfio_device_pm_runtime_put(struct vfio_device *device)
{
	struct device *dev = device->dev;

	if (dev->driver && dev->driver->pm)
		pm_runtime_put(dev);
}

/*
 * VFIO Device fd
 */
static int vfio_device_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_device *device = filep->private_data;

	vfio_device_group_close(device);

	vfio_device_put_registration(device);

	return 0;
}

/*
 * vfio_mig_get_next_state - Compute the next step in the FSM
 * @cur_fsm - The current state the device is in
 * @new_fsm - The target state to reach
 * @next_fsm - Pointer to the next step to get to new_fsm
 *
 * Return 0 upon success, otherwise -errno
 * Upon success the next step in the state progression between cur_fsm and
 * new_fsm will be set in next_fsm.
 *
 * This breaks down requests for combination transitions into smaller steps and
 * returns the next step to get to new_fsm. The function may need to be called
 * multiple times before reaching new_fsm.
 *
 */
int vfio_mig_get_next_state(struct vfio_device *device,
			    enum vfio_device_mig_state cur_fsm,
			    enum vfio_device_mig_state new_fsm,
			    enum vfio_device_mig_state *next_fsm)
{
	enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_RUNNING_P2P + 1 };
	/*
	 * The coding in this table requires the driver to implement the
	 * following FSM arcs:
	 *         RESUMING -> STOP
	 *         STOP -> RESUMING
	 *         STOP -> STOP_COPY
	 *         STOP_COPY -> STOP
	 *
	 * If P2P is supported then the driver must also implement these FSM
	 * arcs:
	 *         RUNNING -> RUNNING_P2P
	 *         RUNNING_P2P -> RUNNING
	 *         RUNNING_P2P -> STOP
	 *         STOP -> RUNNING_P2P
	 * Without P2P the driver must implement:
	 *         RUNNING -> STOP
	 *         STOP -> RUNNING
	 *
	 * The coding will step through multiple states for some combination
	 * transitions; if all optional features are supported, this means the
	 * following ones:
	 *         RESUMING -> STOP -> RUNNING_P2P
	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING
	 *         RESUMING -> STOP -> STOP_COPY
	 *         RUNNING -> RUNNING_P2P -> STOP
	 *         RUNNING -> RUNNING_P2P -> STOP -> RESUMING
	 *         RUNNING -> RUNNING_P2P -> STOP -> STOP_COPY
	 *         RUNNING_P2P -> STOP -> RESUMING
	 *         RUNNING_P2P -> STOP -> STOP_COPY
	 *         STOP -> RUNNING_P2P -> RUNNING
	 *         STOP_COPY -> STOP -> RESUMING
	 *         STOP_COPY -> STOP -> RUNNING_P2P
	 *         STOP_COPY -> STOP -> RUNNING_P2P -> RUNNING
	 */
	static const u8 vfio_from_fsm_table[VFIO_DEVICE_NUM_STATES][VFIO_DEVICE_NUM_STATES] = {
		[VFIO_DEVICE_STATE_STOP] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_RUNNING] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_STOP_COPY] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_RESUMING] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_RUNNING_P2P] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_ERROR] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
	};

	static const unsigned int state_flags_table[VFIO_DEVICE_NUM_STATES] = {
		[VFIO_DEVICE_STATE_STOP] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_RUNNING] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_RESUMING] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_RUNNING_P2P] =
			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P,
		[VFIO_DEVICE_STATE_ERROR] = ~0U,
	};

	if (WARN_ON(cur_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
		    (state_flags_table[cur_fsm] & device->migration_flags) !=
			state_flags_table[cur_fsm]))
		return -EINVAL;

	if (new_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
	    (state_flags_table[new_fsm] & device->migration_flags) !=
			state_flags_table[new_fsm])
		return -EINVAL;

	/*
	 * Arcs touching optional and unsupported states are skipped over. The
	 * driver will instead see an arc from the original state to the next
	 * logical state, as per the above comment.
	 */
	*next_fsm = vfio_from_fsm_table[cur_fsm][new_fsm];
	while ((state_flags_table[*next_fsm] & device->migration_flags) !=
			state_flags_table[*next_fsm])
		*next_fsm = vfio_from_fsm_table[*next_fsm][new_fsm];

	return (*next_fsm != VFIO_DEVICE_STATE_ERROR) ? 0 : -EINVAL;
}
EXPORT_SYMBOL_GPL(vfio_mig_get_next_state);

/*
 * Convert the driver's struct file into a FD number and return it to userspace
 */
static int vfio_ioct_mig_return_fd(struct file *filp, void __user *arg,
				   struct vfio_device_feature_mig_state *mig)
{
	int ret;
	int fd;

	fd = get_unused_fd_flags(O_CLOEXEC);
	if (fd < 0) {
		ret = fd;
		goto out_fput;
	}

	mig->data_fd = fd;
	if (copy_to_user(arg, mig, sizeof(*mig))) {
		ret = -EFAULT;
		goto out_put_unused;
	}
	fd_install(fd, filp);
	return 0;

out_put_unused:
	put_unused_fd(fd);
out_fput:
	fput(filp);
	return ret;
}

static int
vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device,
					   u32 flags, void __user *arg,
					   size_t argsz)
{
	size_t minsz =
		offsetofend(struct vfio_device_feature_mig_state, data_fd);
	struct vfio_device_feature_mig_state mig;
	struct file *filp = NULL;
	int ret;

	if (!device->mig_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_SET |
				 VFIO_DEVICE_FEATURE_GET,
				 sizeof(mig));
	if (ret != 1)
		return ret;

	if (copy_from_user(&mig, arg, minsz))
		return -EFAULT;

	if (flags & VFIO_DEVICE_FEATURE_GET) {
		enum vfio_device_mig_state curr_state;

		ret = device->mig_ops->migration_get_state(device,
							   &curr_state);
		if (ret)
			return ret;
		mig.device_state = curr_state;
		goto out_copy;
	}

	/* Handle the VFIO_DEVICE_FEATURE_SET */
	filp = device->mig_ops->migration_set_state(device, mig.device_state);
	if (IS_ERR(filp) || !filp)
		goto out_copy;

	return vfio_ioct_mig_return_fd(filp, arg, &mig);
out_copy:
	mig.data_fd = -1;
	if (copy_to_user(arg, &mig, sizeof(mig)))
		return -EFAULT;
	if (IS_ERR(filp))
		return PTR_ERR(filp);
	return 0;
}
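
/*
 * Example (illustrative sketch, not part of this file): a driver's
 * migration_set_state() callback, invoked above, usually walks the FSM
 * one arc at a time with vfio_mig_get_next_state() and only implements
 * the mandatory arcs listed in that function.  "struct mydrv_device",
 * its "mig_state" field and "mydrv_step()" (which performs one arc and
 * returns a data-transfer file for STOP_COPY/RESUMING arcs, or NULL)
 * are hypothetical.
 *
 *	static struct file *
 *	mydrv_set_state(struct vfio_device *vdev,
 *			enum vfio_device_mig_state new_state)
 *	{
 *		struct mydrv_device *mydev =
 *			container_of(vdev, struct mydrv_device, vdev);
 *		enum vfio_device_mig_state next_state;
 *		struct file *res = NULL;
 *		int ret;
 *
 *		while (mydev->mig_state != new_state) {
 *			ret = vfio_mig_get_next_state(vdev, mydev->mig_state,
 *						      new_state, &next_state);
 *			if (ret)
 *				return ERR_PTR(ret);
 *			res = mydrv_step(mydev, next_state);
 *			if (IS_ERR(res))
 *				return res;
 *			mydev->mig_state = next_state;
 *			if (WARN_ON(res && new_state != next_state)) {
 *				fput(res);
 *				return ERR_PTR(-EINVAL);
 *			}
 *		}
 *		return res;
 *	}
 */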

static int vfio_ioctl_device_feature_migration(struct vfio_device *device,
					       u32 flags, void __user *arg,
					       size_t argsz)
{
	struct vfio_device_feature_migration mig = {
		.flags = device->migration_flags,
	};
	int ret;

	if (!device->mig_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
				 sizeof(mig));
	if (ret != 1)
		return ret;
	if (copy_to_user(arg, &mig, sizeof(mig)))
		return -EFAULT;
	return 0;
}

/* Ranges should fit into a single kernel page */
#define LOG_MAX_RANGES \
	(PAGE_SIZE / sizeof(struct vfio_device_feature_dma_logging_range))

static int
vfio_ioctl_device_feature_logging_start(struct vfio_device *device,
					u32 flags, void __user *arg,
					size_t argsz)
{
	size_t minsz =
		offsetofend(struct vfio_device_feature_dma_logging_control,
			    ranges);
	struct vfio_device_feature_dma_logging_range __user *ranges;
	struct vfio_device_feature_dma_logging_control control;
	struct vfio_device_feature_dma_logging_range range;
	struct rb_root_cached root = RB_ROOT_CACHED;
	struct interval_tree_node *nodes;
	u64 iova_end;
	u32 nnodes;
	int i, ret;

	if (!device->log_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_SET,
				 sizeof(control));
	if (ret != 1)
		return ret;

	if (copy_from_user(&control, arg, minsz))
		return -EFAULT;

	nnodes = control.num_ranges;
	if (!nnodes)
		return -EINVAL;

	if (nnodes > LOG_MAX_RANGES)
		return -E2BIG;

	ranges = u64_to_user_ptr(control.ranges);
	nodes = kmalloc_array(nnodes, sizeof(struct interval_tree_node),
			      GFP_KERNEL);
	if (!nodes)
		return -ENOMEM;

	for (i = 0; i < nnodes; i++) {
		if (copy_from_user(&range, &ranges[i], sizeof(range))) {
			ret = -EFAULT;
			goto end;
		}
		if (!IS_ALIGNED(range.iova, control.page_size) ||
		    !IS_ALIGNED(range.length, control.page_size)) {
			ret = -EINVAL;
			goto end;
		}

		if (check_add_overflow(range.iova, range.length, &iova_end) ||
		    iova_end > ULONG_MAX) {
			ret = -EOVERFLOW;
			goto end;
		}

		nodes[i].start = range.iova;
		nodes[i].last = range.iova + range.length - 1;
		if (interval_tree_iter_first(&root, nodes[i].start,
					     nodes[i].last)) {
			/* Range overlapping */
			ret = -EINVAL;
			goto end;
		}
		interval_tree_insert(nodes + i, &root);
	}

	ret = device->log_ops->log_start(device, &root, nnodes,
					 &control.page_size);
	if (ret)
		goto end;

	if (copy_to_user(arg, &control, sizeof(control))) {
		ret = -EFAULT;
		device->log_ops->log_stop(device);
	}

end:
	kfree(nodes);
	return ret;
}

static int
vfio_ioctl_device_feature_logging_stop(struct vfio_device *device,
				       u32 flags, void __user *arg,
				       size_t argsz)
{
	int ret;

	if (!device->log_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_SET, 0);
	if (ret != 1)
		return ret;

	return device->log_ops->log_stop(device);
}

static int vfio_device_log_read_and_clear(struct iova_bitmap *iter,
					  unsigned long iova, size_t length,
					  void *opaque)
{
	struct vfio_device *device = opaque;

	return device->log_ops->log_read_and_clear(device, iova, length, iter);
}
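
/*
 * Example (illustrative sketch, not part of this file): the driver-side
 * log_read_and_clear() callback invoked through the adapter above marks
 * dirty ranges in the iova_bitmap with iova_bitmap_set().
 * "mydrv_next_dirty()", an iterator over the device's dirty-tracking
 * state that also clears what it reports, is hypothetical.
 *
 *	static int mydrv_log_read_and_clear(struct vfio_device *vdev,
 *					    unsigned long iova,
 *					    unsigned long length,
 *					    struct iova_bitmap *dirty)
 *	{
 *		unsigned long d_iova, d_len;
 *
 *		while (mydrv_next_dirty(vdev, iova, length, &d_iova, &d_len))
 *			iova_bitmap_set(dirty, d_iova, d_len);
 *		return 0;
 *	}
 */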

static int
vfio_ioctl_device_feature_logging_report(struct vfio_device *device,
					 u32 flags, void __user *arg,
					 size_t argsz)
{
	size_t minsz =
		offsetofend(struct vfio_device_feature_dma_logging_report,
			    bitmap);
	struct vfio_device_feature_dma_logging_report report;
	struct iova_bitmap *iter;
	u64 iova_end;
	int ret;

	if (!device->log_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_GET,
				 sizeof(report));
	if (ret != 1)
		return ret;

	if (copy_from_user(&report, arg, minsz))
		return -EFAULT;

	if (report.page_size < SZ_4K || !is_power_of_2(report.page_size))
		return -EINVAL;

	if (check_add_overflow(report.iova, report.length, &iova_end) ||
	    iova_end > ULONG_MAX)
		return -EOVERFLOW;

	iter = iova_bitmap_alloc(report.iova, report.length,
				 report.page_size,
				 u64_to_user_ptr(report.bitmap));
	if (IS_ERR(iter))
		return PTR_ERR(iter);

	ret = iova_bitmap_for_each(iter, device,
				   vfio_device_log_read_and_clear);

	iova_bitmap_free(iter);
	return ret;
}

static int vfio_ioctl_device_feature(struct vfio_device *device,
				     struct vfio_device_feature __user *arg)
{
	size_t minsz = offsetofend(struct vfio_device_feature, flags);
	struct vfio_device_feature feature;

	if (copy_from_user(&feature, arg, minsz))
		return -EFAULT;

	if (feature.argsz < minsz)
		return -EINVAL;

	/* Check unknown flags */
	if (feature.flags &
	    ~(VFIO_DEVICE_FEATURE_MASK | VFIO_DEVICE_FEATURE_SET |
	      VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_PROBE))
		return -EINVAL;

	/* GET & SET are mutually exclusive except with PROBE */
	if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) &&
	    (feature.flags & VFIO_DEVICE_FEATURE_SET) &&
	    (feature.flags & VFIO_DEVICE_FEATURE_GET))
		return -EINVAL;

	switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) {
	case VFIO_DEVICE_FEATURE_MIGRATION:
		return vfio_ioctl_device_feature_migration(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE:
		return vfio_ioctl_device_feature_mig_device_state(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_DMA_LOGGING_START:
		return vfio_ioctl_device_feature_logging_start(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP:
		return vfio_ioctl_device_feature_logging_stop(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT:
		return vfio_ioctl_device_feature_logging_report(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	default:
		if (unlikely(!device->ops->device_feature))
			return -EINVAL;
		return device->ops->device_feature(device, feature.flags,
						   arg->data,
						   feature.argsz - minsz);
	}
}
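
/*
 * Example (illustrative sketch, not part of this file): a driver's
 * ->device_feature() callback, reached through the default case above,
 * typically validates the request with vfio_check_feature() and handles
 * only the feature IDs it knows.  VFIO_DEVICE_FEATURE_MYDRV_FOO,
 * struct mydrv_foo and mydrv_fill_foo() are hypothetical.
 *
 *	static int mydrv_device_feature(struct vfio_device *vdev, u32 flags,
 *					void __user *arg, size_t argsz)
 *	{
 *		struct mydrv_foo foo = {};
 *		int ret;
 *
 *		switch (flags & VFIO_DEVICE_FEATURE_MASK) {
 *		case VFIO_DEVICE_FEATURE_MYDRV_FOO:
 *			ret = vfio_check_feature(flags, argsz,
 *						 VFIO_DEVICE_FEATURE_GET,
 *						 sizeof(foo));
 *			if (ret != 1)
 *				return ret;
 *			mydrv_fill_foo(vdev, &foo);
 *			if (copy_to_user(arg, &foo, sizeof(foo)))
 *				return -EFAULT;
 *			return 0;
 *		default:
 *			return -ENOTTY;
 *		}
 *	}
 */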

static long vfio_device_fops_unl_ioctl(struct file *filep,
				       unsigned int cmd, unsigned long arg)
{
	struct vfio_device *device = filep->private_data;
	int ret;

	ret = vfio_device_pm_runtime_get(device);
	if (ret)
		return ret;

	switch (cmd) {
	case VFIO_DEVICE_FEATURE:
		ret = vfio_ioctl_device_feature(device, (void __user *)arg);
		break;

	default:
		if (unlikely(!device->ops->ioctl))
			ret = -EINVAL;
		else
			ret = device->ops->ioctl(device, cmd, arg);
		break;
	}

	vfio_device_pm_runtime_put(device);
	return ret;
}

static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
				     size_t count, loff_t *ppos)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->read))
		return -EINVAL;

	return device->ops->read(device, buf, count, ppos);
}

static ssize_t vfio_device_fops_write(struct file *filep,
				      const char __user *buf,
				      size_t count, loff_t *ppos)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->write))
		return -EINVAL;

	return device->ops->write(device, buf, count, ppos);
}

static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->mmap))
		return -EINVAL;

	return device->ops->mmap(device, vma);
}

const struct file_operations vfio_device_fops = {
	.owner		= THIS_MODULE,
	.release	= vfio_device_fops_release,
	.read		= vfio_device_fops_read,
	.write		= vfio_device_fops_write,
	.unlocked_ioctl	= vfio_device_fops_unl_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
	.mmap		= vfio_device_fops_mmap,
};

/*
 * Sub-module support
 */
/*
 * Helper for managing a buffer of info chain capabilities, allocate or
 * reallocate a buffer with additional @size, filling in @id and @version
 * of the capability. A pointer to the new capability is returned.
 *
 * NB. The chain is based at the head of the buffer, so new entries are
 * added to the tail, vfio_info_cap_shift() should be called to fixup the
 * next offsets prior to copying to the user buffer.
 */
struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
					       size_t size, u16 id, u16 version)
{
	void *buf;
	struct vfio_info_cap_header *header, *tmp;

	buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
	if (!buf) {
		kfree(caps->buf);
		caps->buf = NULL;
		caps->size = 0;
		return ERR_PTR(-ENOMEM);
	}

	caps->buf = buf;
	header = buf + caps->size;

	/* Eventually copied to user buffer, zero */
	memset(header, 0, size);

	header->id = id;
	header->version = version;

	/* Add to the end of the capability chain */
	for (tmp = buf; tmp->next; tmp = buf + tmp->next)
		; /* nothing */

	tmp->next = caps->size;
	caps->size += size;

	return header;
}
EXPORT_SYMBOL_GPL(vfio_info_cap_add);

void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
{
	struct vfio_info_cap_header *tmp;
	void *buf = (void *)caps->buf;

	for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
		tmp->next += offset;
}
EXPORT_SYMBOL(vfio_info_cap_shift);

int vfio_info_add_capability(struct vfio_info_cap *caps,
			     struct vfio_info_cap_header *cap, size_t size)
{
	struct vfio_info_cap_header *header;

	header = vfio_info_cap_add(caps, size, cap->id, cap->version);
	if (IS_ERR(header))
		return PTR_ERR(header);

	memcpy(header + 1, cap + 1, size - sizeof(*header));

	return 0;
}
EXPORT_SYMBOL(vfio_info_add_capability);
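
/*
 * Example (illustrative sketch, not part of this file): the usual
 * pattern in a driver's *_INFO ioctl handler is to build the chain in a
 * local struct vfio_info_cap and, when it fits in the user's argsz,
 * copy it out after fixing up the offsets.  "info" and "arg" below are
 * the region-info structure and user pointer of that hypothetical
 * handler.
 *
 *	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
 *	struct vfio_region_info_cap_type cap_type = {
 *		.header.id = VFIO_REGION_INFO_CAP_TYPE,
 *		.header.version = 1,
 *	};
 *	int ret;
 *
 *	ret = vfio_info_add_capability(&caps, &cap_type.header,
 *				       sizeof(cap_type));
 *	if (ret)
 *		return ret;
 *
 *	if (caps.size) {
 *		info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
 *		if (info.argsz < sizeof(info) + caps.size) {
 *			info.argsz = sizeof(info) + caps.size;
 *			info.cap_offset = 0;
 *		} else {
 *			vfio_info_cap_shift(&caps, sizeof(info));
 *			if (copy_to_user((void __user *)arg + sizeof(info),
 *					 caps.buf, caps.size)) {
 *				kfree(caps.buf);
 *				return -EFAULT;
 *			}
 *			info.cap_offset = sizeof(info);
 *		}
 *		kfree(caps.buf);
 *	}
 */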

int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
				       int max_irq_type, size_t *data_size)
{
	unsigned long minsz;
	size_t size;

	minsz = offsetofend(struct vfio_irq_set, count);

	if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
	    (hdr->count >= (U32_MAX - hdr->start)) ||
	    (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
			    VFIO_IRQ_SET_ACTION_TYPE_MASK)))
		return -EINVAL;

	if (data_size)
		*data_size = 0;

	if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
		return -EINVAL;

	switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
	case VFIO_IRQ_SET_DATA_NONE:
		size = 0;
		break;
	case VFIO_IRQ_SET_DATA_BOOL:
		size = sizeof(uint8_t);
		break;
	case VFIO_IRQ_SET_DATA_EVENTFD:
		size = sizeof(int32_t);
		break;
	default:
		return -EINVAL;
	}

	if (size) {
		if (hdr->argsz - minsz < hdr->count * size)
			return -EINVAL;

		if (!data_size)
			return -EINVAL;

		*data_size = hdr->count * size;
	}

	return 0;
}
EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);

/*
 * Pin contiguous user pages and return their associated host pages for local
 * domain only.
 * @device [in]  : device
 * @iova [in]    : starting IOVA of user pages to be pinned.
 * @npage [in]   : count of pages to be pinned. This count should not
 *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
 * @prot [in]    : protection flags
 * @pages[out]   : array of host pages
 * Return error or number of pages pinned.
 *
 * A driver may only call this function if the vfio_device was created
 * by vfio_register_emulated_iommu_dev() due to vfio_device_container_pin_pages().
 */
int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova,
		   int npage, int prot, struct page **pages)
{
	/* group->container cannot change while a vfio device is open */
	if (!pages || !npage || WARN_ON(!vfio_assert_device_open(device)))
		return -EINVAL;
	if (vfio_device_has_container(device))
		return vfio_device_container_pin_pages(device, iova,
						       npage, prot, pages);
	if (device->iommufd_access) {
		int ret;

		if (iova > ULONG_MAX)
			return -EINVAL;
		/*
		 * VFIO ignores the sub page offset, npages is from the start of
		 * a PAGE_SIZE chunk of IOVA. The caller is expected to recover
		 * the sub page offset by doing:
		 *     pages[0] + (iova % PAGE_SIZE)
		 */
		ret = iommufd_access_pin_pages(
			device->iommufd_access, ALIGN_DOWN(iova, PAGE_SIZE),
			npage * PAGE_SIZE, pages,
			(prot & IOMMU_WRITE) ? IOMMUFD_ACCESS_RW_WRITE : 0);
		if (ret)
			return ret;
		return npage;
	}
	return -EINVAL;
}
EXPORT_SYMBOL(vfio_pin_pages);
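
/*
 * Example (illustrative sketch, not part of this file): an emulated
 * IOMMU (mdev-style) driver pinning a single guest page so the CPU can
 * write into it, then releasing the pin.  "vdev", "iova", "data" and
 * "len" come from the hypothetical caller; len must not cross the page
 * boundary.
 *
 *	struct page *page;
 *	void *va;
 *	int ret;
 *
 *	ret = vfio_pin_pages(vdev, iova, 1, IOMMU_READ | IOMMU_WRITE, &page);
 *	if (ret != 1)
 *		return ret < 0 ? ret : -EFAULT;
 *
 *	// vfio_pin_pages() works on PAGE_SIZE chunks; the caller recovers
 *	// the sub-page offset itself.
 *	va = kmap_local_page(page);
 *	memcpy(va + offset_in_page(iova), data, len);
 *	kunmap_local(va);
 *
 *	vfio_unpin_pages(vdev, iova, 1);
 */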

/*
 * Unpin contiguous host pages for local domain only.
 * @device [in]  : device
 * @iova [in]    : starting address of user pages to be unpinned.
 * @npage [in]   : count of pages to be unpinned. This count should not
 *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
 */
void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage)
{
	if (WARN_ON(!vfio_assert_device_open(device)))
		return;

	if (vfio_device_has_container(device)) {
		vfio_device_container_unpin_pages(device, iova, npage);
		return;
	}
	if (device->iommufd_access) {
		if (WARN_ON(iova > ULONG_MAX))
			return;
		iommufd_access_unpin_pages(device->iommufd_access,
					   ALIGN_DOWN(iova, PAGE_SIZE),
					   npage * PAGE_SIZE);
		return;
	}
}
EXPORT_SYMBOL(vfio_unpin_pages);

/*
 * This interface allows the CPUs to perform some sort of virtual DMA on
 * behalf of the device.
 *
 * CPUs read/write from/into a range of IOVAs pointing to user space memory
 * into/from a kernel buffer.
 *
 * As the read/write of user space memory is conducted via the CPUs and is
 * not a real device DMA, it is not necessary to pin the user space memory.
 *
 * @device [in]		: VFIO device
 * @iova [in]		: base IOVA of a user space buffer
 * @data [in]		: pointer to kernel buffer
 * @len [in]		: kernel buffer length
 * @write		: indicate read or write
 * Return error code on failure or 0 on success.
 */
int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data,
		size_t len, bool write)
{
	if (!data || len <= 0 || !vfio_assert_device_open(device))
		return -EINVAL;

	if (vfio_device_has_container(device))
		return vfio_device_container_dma_rw(device, iova,
						    data, len, write);

	if (device->iommufd_access) {
		unsigned int flags = 0;

		if (iova > ULONG_MAX)
			return -EINVAL;

		/* VFIO historically tries to auto-detect a kthread */
		if (!current->mm)
			flags |= IOMMUFD_ACCESS_RW_KTHREAD;
		if (write)
			flags |= IOMMUFD_ACCESS_RW_WRITE;
		return iommufd_access_rw(device->iommufd_access, iova, data,
					 len, flags);
	}
	return -EINVAL;
}
EXPORT_SYMBOL(vfio_dma_rw);

/*
 * Module/class support
 */
static int __init vfio_init(void)
{
	int ret;

	ida_init(&vfio.device_ida);

	ret = vfio_group_init();
	if (ret)
		return ret;

	/* /sys/class/vfio-dev/vfioX */
	vfio.device_class = class_create(THIS_MODULE, "vfio-dev");
	if (IS_ERR(vfio.device_class)) {
		ret = PTR_ERR(vfio.device_class);
		goto err_dev_class;
	}

	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
	return 0;

err_dev_class:
	vfio_group_cleanup();
	return ret;
}

static void __exit vfio_cleanup(void)
{
	ida_destroy(&vfio.device_ida);
	class_destroy(vfio.device_class);
	vfio.device_class = NULL;
	vfio_group_cleanup();
	xa_destroy(&vfio_device_set_xa);
}

module_init(vfio_init);
module_exit(vfio_cleanup);

MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);
MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");