1 // SPDX-License-Identifier: GPL-2.0-only 2 /* Copyright (C) 2021 Intel Corporation 3 * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES 4 * 5 * iommufd provides control over the IOMMU HW objects created by IOMMU kernel 6 * drivers. IOMMU HW objects revolve around IO page tables that map incoming DMA 7 * addresses (IOVA) to CPU addresses. 8 */ 9 #define pr_fmt(fmt) "iommufd: " fmt 10 11 #include <linux/bug.h> 12 #include <linux/file.h> 13 #include <linux/fs.h> 14 #include <linux/iommufd.h> 15 #include <linux/miscdevice.h> 16 #include <linux/module.h> 17 #include <linux/mutex.h> 18 #include <linux/slab.h> 19 #include <uapi/linux/iommufd.h> 20 21 #include "io_pagetable.h" 22 #include "iommufd_private.h" 23 #include "iommufd_test.h" 24 25 struct iommufd_object_ops { 26 void (*destroy)(struct iommufd_object *obj); 27 void (*abort)(struct iommufd_object *obj); 28 }; 29 static const struct iommufd_object_ops iommufd_object_ops[]; 30 static struct miscdevice vfio_misc_dev; 31 32 /* 33 * Allow concurrent access to the object. 34 * 35 * Once another thread can see the object pointer it can prevent object 36 * destruction. Expect for special kernel-only objects there is no in-kernel way 37 * to reliably destroy a single object. Thus all APIs that are creating objects 38 * must use iommufd_object_abort() to handle their errors and only call 39 * iommufd_object_finalize() once object creation cannot fail. 40 */ 41 void iommufd_object_finalize(struct iommufd_ctx *ictx, 42 struct iommufd_object *obj) 43 { 44 XA_STATE(xas, &ictx->objects, obj->id); 45 void *old; 46 47 xa_lock(&ictx->objects); 48 old = xas_store(&xas, obj); 49 xa_unlock(&ictx->objects); 50 /* obj->id was returned from xa_alloc() so the xas_store() cannot fail */ 51 WARN_ON(old != XA_ZERO_ENTRY); 52 } 53 54 /* Undo _iommufd_object_alloc() if iommufd_object_finalize() was not called */ 55 void iommufd_object_abort(struct iommufd_ctx *ictx, struct iommufd_object *obj) 56 { 57 XA_STATE(xas, &ictx->objects, obj->id); 58 void *old; 59 60 xa_lock(&ictx->objects); 61 old = xas_store(&xas, NULL); 62 xa_unlock(&ictx->objects); 63 WARN_ON(old != XA_ZERO_ENTRY); 64 kfree(obj); 65 } 66 67 /* 68 * Abort an object that has been fully initialized and needs destroy, but has 69 * not been finalized. 70 */ 71 void iommufd_object_abort_and_destroy(struct iommufd_ctx *ictx, 72 struct iommufd_object *obj) 73 { 74 if (iommufd_object_ops[obj->type].abort) 75 iommufd_object_ops[obj->type].abort(obj); 76 else 77 iommufd_object_ops[obj->type].destroy(obj); 78 iommufd_object_abort(ictx, obj); 79 } 80 81 struct iommufd_object *iommufd_get_object(struct iommufd_ctx *ictx, u32 id, 82 enum iommufd_object_type type) 83 { 84 struct iommufd_object *obj; 85 86 if (iommufd_should_fail()) 87 return ERR_PTR(-ENOENT); 88 89 xa_lock(&ictx->objects); 90 obj = xa_load(&ictx->objects, id); 91 if (!obj || (type != IOMMUFD_OBJ_ANY && obj->type != type) || 92 !iommufd_lock_obj(obj)) 93 obj = ERR_PTR(-ENOENT); 94 xa_unlock(&ictx->objects); 95 return obj; 96 } 97 98 static int iommufd_object_dec_wait_shortterm(struct iommufd_ctx *ictx, 99 struct iommufd_object *to_destroy) 100 { 101 if (refcount_dec_and_test(&to_destroy->shortterm_users)) 102 return 0; 103 104 if (wait_event_timeout(ictx->destroy_wait, 105 refcount_read(&to_destroy->shortterm_users) == 106 0, 107 msecs_to_jiffies(60000))) 108 return 0; 109 110 pr_crit("Time out waiting for iommufd object to become free\n"); 111 refcount_inc(&to_destroy->shortterm_users); 112 return -EBUSY; 113 } 114 115 /* 116 * Remove the given object id from the xarray if the only reference to the 117 * object is held by the xarray. 118 */ 119 int iommufd_object_remove(struct iommufd_ctx *ictx, 120 struct iommufd_object *to_destroy, u32 id, 121 unsigned int flags) 122 { 123 struct iommufd_object *obj; 124 XA_STATE(xas, &ictx->objects, id); 125 bool zerod_shortterm = false; 126 int ret; 127 128 /* 129 * The purpose of the shortterm_users is to ensure deterministic 130 * destruction of objects used by external drivers and destroyed by this 131 * function. Any temporary increment of the refcount must increment 132 * shortterm_users, such as during ioctl execution. 133 */ 134 if (flags & REMOVE_WAIT_SHORTTERM) { 135 ret = iommufd_object_dec_wait_shortterm(ictx, to_destroy); 136 if (ret) { 137 /* 138 * We have a bug. Put back the callers reference and 139 * defer cleaning this object until close. 140 */ 141 refcount_dec(&to_destroy->users); 142 return ret; 143 } 144 zerod_shortterm = true; 145 } 146 147 xa_lock(&ictx->objects); 148 obj = xas_load(&xas); 149 if (to_destroy) { 150 /* 151 * If the caller is holding a ref on obj we put it here under 152 * the spinlock. 153 */ 154 refcount_dec(&obj->users); 155 156 if (WARN_ON(obj != to_destroy)) { 157 ret = -ENOENT; 158 goto err_xa; 159 } 160 } else if (xa_is_zero(obj) || !obj) { 161 ret = -ENOENT; 162 goto err_xa; 163 } 164 165 if (!refcount_dec_if_one(&obj->users)) { 166 ret = -EBUSY; 167 goto err_xa; 168 } 169 170 xas_store(&xas, NULL); 171 if (ictx->vfio_ioas == container_of(obj, struct iommufd_ioas, obj)) 172 ictx->vfio_ioas = NULL; 173 xa_unlock(&ictx->objects); 174 175 /* 176 * Since users is zero any positive users_shortterm must be racing 177 * iommufd_put_object(), or we have a bug. 178 */ 179 if (!zerod_shortterm) { 180 ret = iommufd_object_dec_wait_shortterm(ictx, obj); 181 if (WARN_ON(ret)) 182 return ret; 183 } 184 185 iommufd_object_ops[obj->type].destroy(obj); 186 kfree(obj); 187 return 0; 188 189 err_xa: 190 if (zerod_shortterm) { 191 /* Restore the xarray owned reference */ 192 refcount_set(&obj->shortterm_users, 1); 193 } 194 xa_unlock(&ictx->objects); 195 196 /* The returned object reference count is zero */ 197 return ret; 198 } 199 200 static int iommufd_destroy(struct iommufd_ucmd *ucmd) 201 { 202 struct iommu_destroy *cmd = ucmd->cmd; 203 204 return iommufd_object_remove(ucmd->ictx, NULL, cmd->id, 0); 205 } 206 207 static int iommufd_fops_open(struct inode *inode, struct file *filp) 208 { 209 struct iommufd_ctx *ictx; 210 211 ictx = kzalloc(sizeof(*ictx), GFP_KERNEL_ACCOUNT); 212 if (!ictx) 213 return -ENOMEM; 214 215 /* 216 * For compatibility with VFIO when /dev/vfio/vfio is opened we default 217 * to the same rlimit accounting as vfio uses. 218 */ 219 if (IS_ENABLED(CONFIG_IOMMUFD_VFIO_CONTAINER) && 220 filp->private_data == &vfio_misc_dev) { 221 ictx->account_mode = IOPT_PAGES_ACCOUNT_MM; 222 pr_info_once("IOMMUFD is providing /dev/vfio/vfio, not VFIO.\n"); 223 } 224 225 init_rwsem(&ictx->ioas_creation_lock); 226 xa_init_flags(&ictx->objects, XA_FLAGS_ALLOC1 | XA_FLAGS_ACCOUNT); 227 xa_init(&ictx->groups); 228 ictx->file = filp; 229 init_waitqueue_head(&ictx->destroy_wait); 230 mutex_init(&ictx->sw_msi_lock); 231 INIT_LIST_HEAD(&ictx->sw_msi_list); 232 filp->private_data = ictx; 233 return 0; 234 } 235 236 static int iommufd_fops_release(struct inode *inode, struct file *filp) 237 { 238 struct iommufd_ctx *ictx = filp->private_data; 239 struct iommufd_sw_msi_map *next; 240 struct iommufd_sw_msi_map *cur; 241 struct iommufd_object *obj; 242 243 /* 244 * The objects in the xarray form a graph of "users" counts, and we have 245 * to destroy them in a depth first manner. Leaf objects will reduce the 246 * users count of interior objects when they are destroyed. 247 * 248 * Repeatedly destroying all the "1 users" leaf objects will progress 249 * until the entire list is destroyed. If this can't progress then there 250 * is some bug related to object refcounting. 251 */ 252 while (!xa_empty(&ictx->objects)) { 253 unsigned int destroyed = 0; 254 unsigned long index; 255 256 xa_for_each(&ictx->objects, index, obj) { 257 if (!refcount_dec_if_one(&obj->users)) 258 continue; 259 destroyed++; 260 xa_erase(&ictx->objects, index); 261 iommufd_object_ops[obj->type].destroy(obj); 262 kfree(obj); 263 } 264 /* Bug related to users refcount */ 265 if (WARN_ON(!destroyed)) 266 break; 267 } 268 WARN_ON(!xa_empty(&ictx->groups)); 269 270 mutex_destroy(&ictx->sw_msi_lock); 271 list_for_each_entry_safe(cur, next, &ictx->sw_msi_list, sw_msi_item) 272 kfree(cur); 273 274 kfree(ictx); 275 return 0; 276 } 277 278 static int iommufd_option(struct iommufd_ucmd *ucmd) 279 { 280 struct iommu_option *cmd = ucmd->cmd; 281 int rc; 282 283 if (cmd->__reserved) 284 return -EOPNOTSUPP; 285 286 switch (cmd->option_id) { 287 case IOMMU_OPTION_RLIMIT_MODE: 288 rc = iommufd_option_rlimit_mode(cmd, ucmd->ictx); 289 break; 290 case IOMMU_OPTION_HUGE_PAGES: 291 rc = iommufd_ioas_option(ucmd); 292 break; 293 default: 294 return -EOPNOTSUPP; 295 } 296 if (rc) 297 return rc; 298 if (copy_to_user(&((struct iommu_option __user *)ucmd->ubuffer)->val64, 299 &cmd->val64, sizeof(cmd->val64))) 300 return -EFAULT; 301 return 0; 302 } 303 304 union ucmd_buffer { 305 struct iommu_destroy destroy; 306 struct iommu_fault_alloc fault; 307 struct iommu_hw_info info; 308 struct iommu_hwpt_alloc hwpt; 309 struct iommu_hwpt_get_dirty_bitmap get_dirty_bitmap; 310 struct iommu_hwpt_invalidate cache; 311 struct iommu_hwpt_set_dirty_tracking set_dirty_tracking; 312 struct iommu_ioas_alloc alloc; 313 struct iommu_ioas_allow_iovas allow_iovas; 314 struct iommu_ioas_copy ioas_copy; 315 struct iommu_ioas_iova_ranges iova_ranges; 316 struct iommu_ioas_map map; 317 struct iommu_ioas_unmap unmap; 318 struct iommu_option option; 319 struct iommu_vdevice_alloc vdev; 320 struct iommu_veventq_alloc veventq; 321 struct iommu_vfio_ioas vfio_ioas; 322 struct iommu_viommu_alloc viommu; 323 #ifdef CONFIG_IOMMUFD_TEST 324 struct iommu_test_cmd test; 325 #endif 326 }; 327 328 struct iommufd_ioctl_op { 329 unsigned int size; 330 unsigned int min_size; 331 unsigned int ioctl_num; 332 int (*execute)(struct iommufd_ucmd *ucmd); 333 }; 334 335 #define IOCTL_OP(_ioctl, _fn, _struct, _last) \ 336 [_IOC_NR(_ioctl) - IOMMUFD_CMD_BASE] = { \ 337 .size = sizeof(_struct) + \ 338 BUILD_BUG_ON_ZERO(sizeof(union ucmd_buffer) < \ 339 sizeof(_struct)), \ 340 .min_size = offsetofend(_struct, _last), \ 341 .ioctl_num = _ioctl, \ 342 .execute = _fn, \ 343 } 344 static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = { 345 IOCTL_OP(IOMMU_DESTROY, iommufd_destroy, struct iommu_destroy, id), 346 IOCTL_OP(IOMMU_FAULT_QUEUE_ALLOC, iommufd_fault_alloc, 347 struct iommu_fault_alloc, out_fault_fd), 348 IOCTL_OP(IOMMU_GET_HW_INFO, iommufd_get_hw_info, struct iommu_hw_info, 349 __reserved), 350 IOCTL_OP(IOMMU_HWPT_ALLOC, iommufd_hwpt_alloc, struct iommu_hwpt_alloc, 351 __reserved), 352 IOCTL_OP(IOMMU_HWPT_GET_DIRTY_BITMAP, iommufd_hwpt_get_dirty_bitmap, 353 struct iommu_hwpt_get_dirty_bitmap, data), 354 IOCTL_OP(IOMMU_HWPT_INVALIDATE, iommufd_hwpt_invalidate, 355 struct iommu_hwpt_invalidate, __reserved), 356 IOCTL_OP(IOMMU_HWPT_SET_DIRTY_TRACKING, iommufd_hwpt_set_dirty_tracking, 357 struct iommu_hwpt_set_dirty_tracking, __reserved), 358 IOCTL_OP(IOMMU_IOAS_ALLOC, iommufd_ioas_alloc_ioctl, 359 struct iommu_ioas_alloc, out_ioas_id), 360 IOCTL_OP(IOMMU_IOAS_ALLOW_IOVAS, iommufd_ioas_allow_iovas, 361 struct iommu_ioas_allow_iovas, allowed_iovas), 362 IOCTL_OP(IOMMU_IOAS_CHANGE_PROCESS, iommufd_ioas_change_process, 363 struct iommu_ioas_change_process, __reserved), 364 IOCTL_OP(IOMMU_IOAS_COPY, iommufd_ioas_copy, struct iommu_ioas_copy, 365 src_iova), 366 IOCTL_OP(IOMMU_IOAS_IOVA_RANGES, iommufd_ioas_iova_ranges, 367 struct iommu_ioas_iova_ranges, out_iova_alignment), 368 IOCTL_OP(IOMMU_IOAS_MAP, iommufd_ioas_map, struct iommu_ioas_map, iova), 369 IOCTL_OP(IOMMU_IOAS_MAP_FILE, iommufd_ioas_map_file, 370 struct iommu_ioas_map_file, iova), 371 IOCTL_OP(IOMMU_IOAS_UNMAP, iommufd_ioas_unmap, struct iommu_ioas_unmap, 372 length), 373 IOCTL_OP(IOMMU_OPTION, iommufd_option, struct iommu_option, val64), 374 IOCTL_OP(IOMMU_VDEVICE_ALLOC, iommufd_vdevice_alloc_ioctl, 375 struct iommu_vdevice_alloc, virt_id), 376 IOCTL_OP(IOMMU_VEVENTQ_ALLOC, iommufd_veventq_alloc, 377 struct iommu_veventq_alloc, out_veventq_fd), 378 IOCTL_OP(IOMMU_VFIO_IOAS, iommufd_vfio_ioas, struct iommu_vfio_ioas, 379 __reserved), 380 IOCTL_OP(IOMMU_VIOMMU_ALLOC, iommufd_viommu_alloc_ioctl, 381 struct iommu_viommu_alloc, out_viommu_id), 382 #ifdef CONFIG_IOMMUFD_TEST 383 IOCTL_OP(IOMMU_TEST_CMD, iommufd_test, struct iommu_test_cmd, last), 384 #endif 385 }; 386 387 static long iommufd_fops_ioctl(struct file *filp, unsigned int cmd, 388 unsigned long arg) 389 { 390 struct iommufd_ctx *ictx = filp->private_data; 391 const struct iommufd_ioctl_op *op; 392 struct iommufd_ucmd ucmd = {}; 393 union ucmd_buffer buf; 394 unsigned int nr; 395 int ret; 396 397 nr = _IOC_NR(cmd); 398 if (nr < IOMMUFD_CMD_BASE || 399 (nr - IOMMUFD_CMD_BASE) >= ARRAY_SIZE(iommufd_ioctl_ops)) 400 return iommufd_vfio_ioctl(ictx, cmd, arg); 401 402 ucmd.ictx = ictx; 403 ucmd.ubuffer = (void __user *)arg; 404 ret = get_user(ucmd.user_size, (u32 __user *)ucmd.ubuffer); 405 if (ret) 406 return ret; 407 408 op = &iommufd_ioctl_ops[nr - IOMMUFD_CMD_BASE]; 409 if (op->ioctl_num != cmd) 410 return -ENOIOCTLCMD; 411 if (ucmd.user_size < op->min_size) 412 return -EINVAL; 413 414 ucmd.cmd = &buf; 415 ret = copy_struct_from_user(ucmd.cmd, op->size, ucmd.ubuffer, 416 ucmd.user_size); 417 if (ret) 418 return ret; 419 ret = op->execute(&ucmd); 420 return ret; 421 } 422 423 static const struct file_operations iommufd_fops = { 424 .owner = THIS_MODULE, 425 .open = iommufd_fops_open, 426 .release = iommufd_fops_release, 427 .unlocked_ioctl = iommufd_fops_ioctl, 428 }; 429 430 /** 431 * iommufd_ctx_get - Get a context reference 432 * @ictx: Context to get 433 * 434 * The caller must already hold a valid reference to ictx. 435 */ 436 void iommufd_ctx_get(struct iommufd_ctx *ictx) 437 { 438 get_file(ictx->file); 439 } 440 EXPORT_SYMBOL_NS_GPL(iommufd_ctx_get, "IOMMUFD"); 441 442 /** 443 * iommufd_ctx_from_file - Acquires a reference to the iommufd context 444 * @file: File to obtain the reference from 445 * 446 * Returns a pointer to the iommufd_ctx, otherwise ERR_PTR. The struct file 447 * remains owned by the caller and the caller must still do fput. On success 448 * the caller is responsible to call iommufd_ctx_put(). 449 */ 450 struct iommufd_ctx *iommufd_ctx_from_file(struct file *file) 451 { 452 struct iommufd_ctx *ictx; 453 454 if (file->f_op != &iommufd_fops) 455 return ERR_PTR(-EBADFD); 456 ictx = file->private_data; 457 iommufd_ctx_get(ictx); 458 return ictx; 459 } 460 EXPORT_SYMBOL_NS_GPL(iommufd_ctx_from_file, "IOMMUFD"); 461 462 /** 463 * iommufd_ctx_from_fd - Acquires a reference to the iommufd context 464 * @fd: File descriptor to obtain the reference from 465 * 466 * Returns a pointer to the iommufd_ctx, otherwise ERR_PTR. On success 467 * the caller is responsible to call iommufd_ctx_put(). 468 */ 469 struct iommufd_ctx *iommufd_ctx_from_fd(int fd) 470 { 471 struct file *file; 472 473 file = fget(fd); 474 if (!file) 475 return ERR_PTR(-EBADF); 476 477 if (file->f_op != &iommufd_fops) { 478 fput(file); 479 return ERR_PTR(-EBADFD); 480 } 481 /* fget is the same as iommufd_ctx_get() */ 482 return file->private_data; 483 } 484 EXPORT_SYMBOL_NS_GPL(iommufd_ctx_from_fd, "IOMMUFD"); 485 486 /** 487 * iommufd_ctx_put - Put back a reference 488 * @ictx: Context to put back 489 */ 490 void iommufd_ctx_put(struct iommufd_ctx *ictx) 491 { 492 fput(ictx->file); 493 } 494 EXPORT_SYMBOL_NS_GPL(iommufd_ctx_put, "IOMMUFD"); 495 496 static const struct iommufd_object_ops iommufd_object_ops[] = { 497 [IOMMUFD_OBJ_ACCESS] = { 498 .destroy = iommufd_access_destroy_object, 499 }, 500 [IOMMUFD_OBJ_DEVICE] = { 501 .destroy = iommufd_device_destroy, 502 }, 503 [IOMMUFD_OBJ_FAULT] = { 504 .destroy = iommufd_fault_destroy, 505 }, 506 [IOMMUFD_OBJ_HWPT_PAGING] = { 507 .destroy = iommufd_hwpt_paging_destroy, 508 .abort = iommufd_hwpt_paging_abort, 509 }, 510 [IOMMUFD_OBJ_HWPT_NESTED] = { 511 .destroy = iommufd_hwpt_nested_destroy, 512 .abort = iommufd_hwpt_nested_abort, 513 }, 514 [IOMMUFD_OBJ_IOAS] = { 515 .destroy = iommufd_ioas_destroy, 516 }, 517 [IOMMUFD_OBJ_VDEVICE] = { 518 .destroy = iommufd_vdevice_destroy, 519 }, 520 [IOMMUFD_OBJ_VEVENTQ] = { 521 .destroy = iommufd_veventq_destroy, 522 .abort = iommufd_veventq_abort, 523 }, 524 [IOMMUFD_OBJ_VIOMMU] = { 525 .destroy = iommufd_viommu_destroy, 526 }, 527 #ifdef CONFIG_IOMMUFD_TEST 528 [IOMMUFD_OBJ_SELFTEST] = { 529 .destroy = iommufd_selftest_destroy, 530 }, 531 #endif 532 }; 533 534 static struct miscdevice iommu_misc_dev = { 535 .minor = MISC_DYNAMIC_MINOR, 536 .name = "iommu", 537 .fops = &iommufd_fops, 538 .nodename = "iommu", 539 .mode = 0660, 540 }; 541 542 543 static struct miscdevice vfio_misc_dev = { 544 .minor = VFIO_MINOR, 545 .name = "vfio", 546 .fops = &iommufd_fops, 547 .nodename = "vfio/vfio", 548 .mode = 0666, 549 }; 550 551 static int __init iommufd_init(void) 552 { 553 int ret; 554 555 ret = misc_register(&iommu_misc_dev); 556 if (ret) 557 return ret; 558 559 if (IS_ENABLED(CONFIG_IOMMUFD_VFIO_CONTAINER)) { 560 ret = misc_register(&vfio_misc_dev); 561 if (ret) 562 goto err_misc; 563 } 564 ret = iommufd_test_init(); 565 if (ret) 566 goto err_vfio_misc; 567 return 0; 568 569 err_vfio_misc: 570 if (IS_ENABLED(CONFIG_IOMMUFD_VFIO_CONTAINER)) 571 misc_deregister(&vfio_misc_dev); 572 err_misc: 573 misc_deregister(&iommu_misc_dev); 574 return ret; 575 } 576 577 static void __exit iommufd_exit(void) 578 { 579 iommufd_test_exit(); 580 if (IS_ENABLED(CONFIG_IOMMUFD_VFIO_CONTAINER)) 581 misc_deregister(&vfio_misc_dev); 582 misc_deregister(&iommu_misc_dev); 583 } 584 585 module_init(iommufd_init); 586 module_exit(iommufd_exit); 587 588 #if IS_ENABLED(CONFIG_IOMMUFD_VFIO_CONTAINER) 589 MODULE_ALIAS_MISCDEV(VFIO_MINOR); 590 MODULE_ALIAS("devname:vfio/vfio"); 591 #endif 592 MODULE_IMPORT_NS("IOMMUFD_INTERNAL"); 593 MODULE_IMPORT_NS("IOMMUFD"); 594 MODULE_DESCRIPTION("I/O Address Space Management for passthrough devices"); 595 MODULE_LICENSE("GPL"); 596