// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (C) 2021 Intel Corporation
 * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES
 *
 * iommufd provides control over the IOMMU HW objects created by IOMMU kernel
 * drivers. IOMMU HW objects revolve around IO page tables that map incoming DMA
 * addresses (IOVA) to CPU addresses.
 */
#define pr_fmt(fmt) "iommufd: " fmt

#include <linux/bug.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/iommufd.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <uapi/linux/iommufd.h>

#include "io_pagetable.h"
#include "iommufd_private.h"
#include "iommufd_test.h"

struct iommufd_object_ops {
	void (*destroy)(struct iommufd_object *obj);
	void (*abort)(struct iommufd_object *obj);
};
static const struct iommufd_object_ops iommufd_object_ops[];
static struct miscdevice vfio_misc_dev;

struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx,
					     size_t size,
					     enum iommufd_object_type type)
{
	struct iommufd_object *obj;
	int rc;

	obj = kzalloc(size, GFP_KERNEL_ACCOUNT);
	if (!obj)
		return ERR_PTR(-ENOMEM);
	obj->type = type;
	/* Starts out biased by 1 until it is removed from the xarray */
	refcount_set(&obj->shortterm_users, 1);
	refcount_set(&obj->users, 1);

	/*
	 * Reserve an ID in the xarray but do not publish the pointer yet since
	 * the caller hasn't initialized it yet. Once the pointer is published
	 * in the xarray and visible to other threads we can't reliably destroy
	 * it anymore, so the caller must complete all errorable operations
	 * before calling iommufd_object_finalize().
	 */
	rc = xa_alloc(&ictx->objects, &obj->id, XA_ZERO_ENTRY,
		      xa_limit_31b, GFP_KERNEL_ACCOUNT);
	if (rc)
		goto out_free;
	return obj;
out_free:
	kfree(obj);
	return ERR_PTR(rc);
}

/*
 * Allow concurrent access to the object.
 *
 * Once another thread can see the object pointer it can prevent object
 * destruction. Except for special kernel-only objects there is no in-kernel
 * way to reliably destroy a single object. Thus all APIs that are creating
 * objects must use iommufd_object_abort() to handle their errors and only call
 * iommufd_object_finalize() once object creation cannot fail.
 */
void iommufd_object_finalize(struct iommufd_ctx *ictx,
			     struct iommufd_object *obj)
{
	void *old;

	old = xa_store(&ictx->objects, obj->id, obj, GFP_KERNEL);
	/* obj->id was returned from xa_alloc() so the xa_store() cannot fail */
	WARN_ON(old);
}

/* Undo _iommufd_object_alloc() if iommufd_object_finalize() was not called */
void iommufd_object_abort(struct iommufd_ctx *ictx, struct iommufd_object *obj)
{
	void *old;

	old = xa_erase(&ictx->objects, obj->id);
	WARN_ON(old);
	kfree(obj);
}
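
/*
 * Illustrative sketch (not part of the driver): a typical object creator
 * follows the allocate/initialize/publish contract described above. The
 * hypothetical_init() step stands in for whatever fallible setup the caller
 * performs before the object becomes visible; real creators use the
 * iommufd_object_alloc() helper macros in iommufd_private.h around
 * _iommufd_object_alloc().
 *
 *	obj = _iommufd_object_alloc(ictx, sizeof(*obj), IOMMUFD_OBJ_IOAS);
 *	if (IS_ERR(obj))
 *		return PTR_ERR(obj);
 *	rc = hypothetical_init(obj);
 *	if (rc) {
 *		// Not yet published, so it can still be reliably undone
 *		iommufd_object_abort(ictx, obj);
 *		return rc;
 *	}
 *	// Publish: past this point creation can no longer fail
 *	iommufd_object_finalize(ictx, obj);
 */
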
/*
 * Abort an object that has been fully initialized and needs to be destroyed,
 * but has not been finalized.
 */
void iommufd_object_abort_and_destroy(struct iommufd_ctx *ictx,
				      struct iommufd_object *obj)
{
	if (iommufd_object_ops[obj->type].abort)
		iommufd_object_ops[obj->type].abort(obj);
	else
		iommufd_object_ops[obj->type].destroy(obj);
	iommufd_object_abort(ictx, obj);
}

struct iommufd_object *iommufd_get_object(struct iommufd_ctx *ictx, u32 id,
					  enum iommufd_object_type type)
{
	struct iommufd_object *obj;

	if (iommufd_should_fail())
		return ERR_PTR(-ENOENT);

	xa_lock(&ictx->objects);
	obj = xa_load(&ictx->objects, id);
	if (!obj || (type != IOMMUFD_OBJ_ANY && obj->type != type) ||
	    !iommufd_lock_obj(obj))
		obj = ERR_PTR(-ENOENT);
	xa_unlock(&ictx->objects);
	return obj;
}
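
/*
 * Illustrative sketch (not part of the driver): callers pin an object for the
 * duration of their work and then release it. iommufd_get_object() takes both
 * the users and shortterm_users references via iommufd_lock_obj(), and
 * iommufd_put_object() (iommufd_private.h) drops them again, waking
 * ictx->destroy_wait when the last short-term reference goes away so that a
 * concurrent destroy can make progress.
 *
 *	obj = iommufd_get_object(ictx, cmd->id, IOMMUFD_OBJ_IOAS);
 *	if (IS_ERR(obj))
 *		return PTR_ERR(obj);
 *	// ... the object cannot be destroyed while it is held ...
 *	iommufd_put_object(ictx, obj);
 */
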
static int iommufd_object_dec_wait_shortterm(struct iommufd_ctx *ictx,
					     struct iommufd_object *to_destroy)
{
	if (refcount_dec_and_test(&to_destroy->shortterm_users))
		return 0;

	if (wait_event_timeout(ictx->destroy_wait,
			       refcount_read(&to_destroy->shortterm_users) ==
				       0,
			       msecs_to_jiffies(10000)))
		return 0;

	pr_crit("Time out waiting for iommufd object to become free\n");
	refcount_inc(&to_destroy->shortterm_users);
	return -EBUSY;
}

/*
 * Remove the given object id from the xarray if the only reference to the
 * object is held by the xarray.
 */
int iommufd_object_remove(struct iommufd_ctx *ictx,
			  struct iommufd_object *to_destroy, u32 id,
			  unsigned int flags)
{
	struct iommufd_object *obj;
	XA_STATE(xas, &ictx->objects, id);
	bool zerod_shortterm = false;
	int ret;

	/*
	 * The purpose of shortterm_users is to ensure deterministic
	 * destruction of objects used by external drivers and destroyed by
	 * this function. Any temporary increment of the refcount must
	 * increment shortterm_users, such as during ioctl execution.
	 */
	if (flags & REMOVE_WAIT_SHORTTERM) {
		ret = iommufd_object_dec_wait_shortterm(ictx, to_destroy);
		if (ret) {
			/*
			 * We have a bug. Put back the caller's reference and
			 * defer cleaning this object until close.
			 */
			refcount_dec(&to_destroy->users);
			return ret;
		}
		zerod_shortterm = true;
	}

	xa_lock(&ictx->objects);
	obj = xas_load(&xas);
	if (to_destroy) {
		/*
		 * If the caller is holding a ref on obj we put it here under
		 * the spinlock.
		 */
		refcount_dec(&obj->users);

		if (WARN_ON(obj != to_destroy)) {
			ret = -ENOENT;
			goto err_xa;
		}
	} else if (xa_is_zero(obj) || !obj) {
		ret = -ENOENT;
		goto err_xa;
	}

	if (!refcount_dec_if_one(&obj->users)) {
		ret = -EBUSY;
		goto err_xa;
	}

	xas_store(&xas, NULL);
	if (ictx->vfio_ioas == container_of(obj, struct iommufd_ioas, obj))
		ictx->vfio_ioas = NULL;
	xa_unlock(&ictx->objects);

	/*
	 * Since users is zero any positive shortterm_users must be racing
	 * iommufd_put_object(), or we have a bug.
	 */
	if (!zerod_shortterm) {
		ret = iommufd_object_dec_wait_shortterm(ictx, obj);
		if (WARN_ON(ret))
			return ret;
	}

	iommufd_object_ops[obj->type].destroy(obj);
	kfree(obj);
	return 0;

err_xa:
	if (zerod_shortterm) {
		/* Restore the xarray owned reference */
		refcount_set(&obj->shortterm_users, 1);
	}
	xa_unlock(&ictx->objects);

	/* The returned object reference count is zero */
	return ret;
}

static int iommufd_destroy(struct iommufd_ucmd *ucmd)
{
	struct iommu_destroy *cmd = ucmd->cmd;

	return iommufd_object_remove(ucmd->ictx, NULL, cmd->id, 0);
}
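
/*
 * Illustrative note (not part of the driver): iommufd_object_remove() serves
 * two kinds of callers. The IOMMU_DESTROY handler above passes
 * to_destroy == NULL and simply reports -ENOENT or -EBUSY back to userspace;
 * -EBUSY means something else still holds a users reference on the object.
 * Kernel callers that already hold a reference pass the object itself,
 * typically with REMOVE_WAIT_SHORTTERM so destruction is deterministic, e.g.
 * a hypothetical teardown path:
 *
 *	iommufd_object_remove(ictx, &my_obj->obj, my_obj->obj.id,
 *			      REMOVE_WAIT_SHORTTERM);
 */
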
static int iommufd_fops_open(struct inode *inode, struct file *filp)
{
	struct iommufd_ctx *ictx;

	ictx = kzalloc(sizeof(*ictx), GFP_KERNEL_ACCOUNT);
	if (!ictx)
		return -ENOMEM;

	/*
	 * For compatibility with VFIO, when /dev/vfio/vfio is opened we
	 * default to the same rlimit accounting as vfio uses.
	 */
	if (IS_ENABLED(CONFIG_IOMMUFD_VFIO_CONTAINER) &&
	    filp->private_data == &vfio_misc_dev) {
		ictx->account_mode = IOPT_PAGES_ACCOUNT_MM;
		pr_info_once("IOMMUFD is providing /dev/vfio/vfio, not VFIO.\n");
	}

	xa_init_flags(&ictx->objects, XA_FLAGS_ALLOC1 | XA_FLAGS_ACCOUNT);
	xa_init(&ictx->groups);
	ictx->file = filp;
	init_waitqueue_head(&ictx->destroy_wait);
	filp->private_data = ictx;
	return 0;
}

static int iommufd_fops_release(struct inode *inode, struct file *filp)
{
	struct iommufd_ctx *ictx = filp->private_data;
	struct iommufd_object *obj;

	/*
	 * The objects in the xarray form a graph of "users" counts, and we
	 * have to destroy them in a depth-first manner. Leaf objects will
	 * reduce the users count of interior objects when they are destroyed.
	 *
	 * Repeatedly destroying all the "1 users" leaf objects will progress
	 * until the entire list is destroyed. If this can't progress then
	 * there is some bug related to object refcounting.
	 */
	while (!xa_empty(&ictx->objects)) {
		unsigned int destroyed = 0;
		unsigned long index;

		xa_for_each(&ictx->objects, index, obj) {
			if (!refcount_dec_if_one(&obj->users))
				continue;
			destroyed++;
			xa_erase(&ictx->objects, index);
			iommufd_object_ops[obj->type].destroy(obj);
			kfree(obj);
		}
		/* Bug related to users refcount */
		if (WARN_ON(!destroyed))
			break;
	}
	WARN_ON(!xa_empty(&ictx->groups));
	kfree(ictx);
	return 0;
}

static int iommufd_option(struct iommufd_ucmd *ucmd)
{
	struct iommu_option *cmd = ucmd->cmd;
	int rc;

	if (cmd->__reserved)
		return -EOPNOTSUPP;

	switch (cmd->option_id) {
	case IOMMU_OPTION_RLIMIT_MODE:
		rc = iommufd_option_rlimit_mode(cmd, ucmd->ictx);
		break;
	case IOMMU_OPTION_HUGE_PAGES:
		rc = iommufd_ioas_option(ucmd);
		break;
	default:
		return -EOPNOTSUPP;
	}
	if (rc)
		return rc;
	if (copy_to_user(&((struct iommu_option __user *)ucmd->ubuffer)->val64,
			 &cmd->val64, sizeof(cmd->val64)))
		return -EFAULT;
	return 0;
}

union ucmd_buffer {
	struct iommu_destroy destroy;
	struct iommu_fault_alloc fault;
	struct iommu_hw_info info;
	struct iommu_hwpt_alloc hwpt;
	struct iommu_hwpt_get_dirty_bitmap get_dirty_bitmap;
	struct iommu_hwpt_invalidate cache;
	struct iommu_hwpt_set_dirty_tracking set_dirty_tracking;
	struct iommu_ioas_alloc alloc;
	struct iommu_ioas_allow_iovas allow_iovas;
	struct iommu_ioas_copy ioas_copy;
	struct iommu_ioas_iova_ranges iova_ranges;
	struct iommu_ioas_map map;
	struct iommu_ioas_unmap unmap;
	struct iommu_option option;
	struct iommu_vfio_ioas vfio_ioas;
#ifdef CONFIG_IOMMUFD_TEST
	struct iommu_test_cmd test;
#endif
};

struct iommufd_ioctl_op {
	unsigned int size;
	unsigned int min_size;
	unsigned int ioctl_num;
	int (*execute)(struct iommufd_ucmd *ucmd);
};

#define IOCTL_OP(_ioctl, _fn, _struct, _last)                         \
	[_IOC_NR(_ioctl) - IOMMUFD_CMD_BASE] = {                      \
		.size = sizeof(_struct) +                             \
			BUILD_BUG_ON_ZERO(sizeof(union ucmd_buffer) < \
					  sizeof(_struct)),           \
		.min_size = offsetofend(_struct, _last),              \
		.ioctl_num = _ioctl,                                  \
		.execute = _fn,                                       \
	}
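
/*
 * Illustrative expansion (not part of the driver): the IOMMU_DESTROY entry
 * below, for example, resolves to roughly
 *
 *	[_IOC_NR(IOMMU_DESTROY) - IOMMUFD_CMD_BASE] = {
 *		.size = sizeof(struct iommu_destroy),
 *		.min_size = offsetofend(struct iommu_destroy, id),
 *		.ioctl_num = IOMMU_DESTROY,
 *		.execute = iommufd_destroy,
 *	},
 *
 * min_size marks the end of the last field userspace must supply, while the
 * BUILD_BUG_ON_ZERO() term only checks at compile time that ucmd_buffer is
 * big enough to hold the full structure and otherwise adds zero to .size.
 */
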
static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = {
	IOCTL_OP(IOMMU_DESTROY, iommufd_destroy, struct iommu_destroy, id),
	IOCTL_OP(IOMMU_FAULT_QUEUE_ALLOC, iommufd_fault_alloc,
		 struct iommu_fault_alloc, out_fault_fd),
	IOCTL_OP(IOMMU_GET_HW_INFO, iommufd_get_hw_info, struct iommu_hw_info,
		 __reserved),
	IOCTL_OP(IOMMU_HWPT_ALLOC, iommufd_hwpt_alloc, struct iommu_hwpt_alloc,
		 __reserved),
	IOCTL_OP(IOMMU_HWPT_GET_DIRTY_BITMAP, iommufd_hwpt_get_dirty_bitmap,
		 struct iommu_hwpt_get_dirty_bitmap, data),
	IOCTL_OP(IOMMU_HWPT_INVALIDATE, iommufd_hwpt_invalidate,
		 struct iommu_hwpt_invalidate, __reserved),
	IOCTL_OP(IOMMU_HWPT_SET_DIRTY_TRACKING, iommufd_hwpt_set_dirty_tracking,
		 struct iommu_hwpt_set_dirty_tracking, __reserved),
	IOCTL_OP(IOMMU_IOAS_ALLOC, iommufd_ioas_alloc_ioctl,
		 struct iommu_ioas_alloc, out_ioas_id),
	IOCTL_OP(IOMMU_IOAS_ALLOW_IOVAS, iommufd_ioas_allow_iovas,
		 struct iommu_ioas_allow_iovas, allowed_iovas),
	IOCTL_OP(IOMMU_IOAS_COPY, iommufd_ioas_copy, struct iommu_ioas_copy,
		 src_iova),
	IOCTL_OP(IOMMU_IOAS_IOVA_RANGES, iommufd_ioas_iova_ranges,
		 struct iommu_ioas_iova_ranges, out_iova_alignment),
	IOCTL_OP(IOMMU_IOAS_MAP, iommufd_ioas_map, struct iommu_ioas_map,
		 iova),
	IOCTL_OP(IOMMU_IOAS_UNMAP, iommufd_ioas_unmap, struct iommu_ioas_unmap,
		 length),
	IOCTL_OP(IOMMU_OPTION, iommufd_option, struct iommu_option,
		 val64),
	IOCTL_OP(IOMMU_VFIO_IOAS, iommufd_vfio_ioas, struct iommu_vfio_ioas,
		 __reserved),
#ifdef CONFIG_IOMMUFD_TEST
	IOCTL_OP(IOMMU_TEST_CMD, iommufd_test, struct iommu_test_cmd, last),
#endif
};

static long iommufd_fops_ioctl(struct file *filp, unsigned int cmd,
			       unsigned long arg)
{
	struct iommufd_ctx *ictx = filp->private_data;
	const struct iommufd_ioctl_op *op;
	struct iommufd_ucmd ucmd = {};
	union ucmd_buffer buf;
	unsigned int nr;
	int ret;

	nr = _IOC_NR(cmd);
	if (nr < IOMMUFD_CMD_BASE ||
	    (nr - IOMMUFD_CMD_BASE) >= ARRAY_SIZE(iommufd_ioctl_ops))
		return iommufd_vfio_ioctl(ictx, cmd, arg);

	ucmd.ictx = ictx;
	ucmd.ubuffer = (void __user *)arg;
	ret = get_user(ucmd.user_size, (u32 __user *)ucmd.ubuffer);
	if (ret)
		return ret;

	op = &iommufd_ioctl_ops[nr - IOMMUFD_CMD_BASE];
	if (op->ioctl_num != cmd)
		return -ENOIOCTLCMD;
	if (ucmd.user_size < op->min_size)
		return -EINVAL;

	ucmd.cmd = &buf;
	ret = copy_struct_from_user(ucmd.cmd, op->size, ucmd.ubuffer,
				    ucmd.user_size);
	if (ret)
		return ret;
	ret = op->execute(&ucmd);
	return ret;
}

static const struct file_operations iommufd_fops = {
	.owner = THIS_MODULE,
	.open = iommufd_fops_open,
	.release = iommufd_fops_release,
	.unlocked_ioctl = iommufd_fops_ioctl,
};
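
/*
 * Illustrative sketch (not part of the driver): the size-based extensibility
 * scheme implemented by iommufd_fops_ioctl() has userspace always pass the
 * structure size it was built against. A minimal user of IOMMU_IOAS_ALLOC,
 * assuming "iommufd" is an already opened /dev/iommu file descriptor, looks
 * roughly like:
 *
 *	struct iommu_ioas_alloc cmd = {
 *		.size = sizeof(cmd),	// checked against min_size above
 *	};
 *
 *	if (ioctl(iommufd, IOMMU_IOAS_ALLOC, &cmd))
 *		return -1;
 *	// cmd.out_ioas_id now names the new IOAS
 *
 * copy_struct_from_user() zero-fills trailing fields a shorter userspace
 * structure did not provide and rejects a longer one unless the extra
 * trailing bytes are all zero.
 */
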
/**
 * iommufd_ctx_get - Get a context reference
 * @ictx: Context to get
 *
 * The caller must already hold a valid reference to ictx.
 */
void iommufd_ctx_get(struct iommufd_ctx *ictx)
{
	get_file(ictx->file);
}
EXPORT_SYMBOL_NS_GPL(iommufd_ctx_get, IOMMUFD);

/**
 * iommufd_ctx_from_file - Acquires a reference to the iommufd context
 * @file: File to obtain the reference from
 *
 * Returns a pointer to the iommufd_ctx, otherwise ERR_PTR. The struct file
 * remains owned by the caller and the caller must still call fput(). On
 * success the caller is responsible for calling iommufd_ctx_put().
 */
struct iommufd_ctx *iommufd_ctx_from_file(struct file *file)
{
	struct iommufd_ctx *ictx;

	if (file->f_op != &iommufd_fops)
		return ERR_PTR(-EBADFD);
	ictx = file->private_data;
	iommufd_ctx_get(ictx);
	return ictx;
}
EXPORT_SYMBOL_NS_GPL(iommufd_ctx_from_file, IOMMUFD);

/**
 * iommufd_ctx_from_fd - Acquires a reference to the iommufd context
 * @fd: File descriptor to obtain the reference from
 *
 * Returns a pointer to the iommufd_ctx, otherwise ERR_PTR. On success
 * the caller is responsible for calling iommufd_ctx_put().
 */
struct iommufd_ctx *iommufd_ctx_from_fd(int fd)
{
	struct file *file;

	file = fget(fd);
	if (!file)
		return ERR_PTR(-EBADF);

	if (file->f_op != &iommufd_fops) {
		fput(file);
		return ERR_PTR(-EBADFD);
	}
	/* fget is the same as iommufd_ctx_get() */
	return file->private_data;
}
EXPORT_SYMBOL_NS_GPL(iommufd_ctx_from_fd, IOMMUFD);

/**
 * iommufd_ctx_put - Put back a reference
 * @ictx: Context to put back
 */
void iommufd_ctx_put(struct iommufd_ctx *ictx)
{
	fput(ictx->file);
}
EXPORT_SYMBOL_NS_GPL(iommufd_ctx_put, IOMMUFD);

static const struct iommufd_object_ops iommufd_object_ops[] = {
	[IOMMUFD_OBJ_ACCESS] = {
		.destroy = iommufd_access_destroy_object,
	},
	[IOMMUFD_OBJ_DEVICE] = {
		.destroy = iommufd_device_destroy,
	},
	[IOMMUFD_OBJ_IOAS] = {
		.destroy = iommufd_ioas_destroy,
	},
	[IOMMUFD_OBJ_HWPT_PAGING] = {
		.destroy = iommufd_hwpt_paging_destroy,
		.abort = iommufd_hwpt_paging_abort,
	},
	[IOMMUFD_OBJ_HWPT_NESTED] = {
		.destroy = iommufd_hwpt_nested_destroy,
		.abort = iommufd_hwpt_nested_abort,
	},
	[IOMMUFD_OBJ_FAULT] = {
		.destroy = iommufd_fault_destroy,
	},
#ifdef CONFIG_IOMMUFD_TEST
	[IOMMUFD_OBJ_SELFTEST] = {
		.destroy = iommufd_selftest_destroy,
	},
#endif
};

static struct miscdevice iommu_misc_dev = {
	.minor = MISC_DYNAMIC_MINOR,
	.name = "iommu",
	.fops = &iommufd_fops,
	.nodename = "iommu",
	.mode = 0660,
};

static struct miscdevice vfio_misc_dev = {
	.minor = VFIO_MINOR,
	.name = "vfio",
	.fops = &iommufd_fops,
	.nodename = "vfio/vfio",
	.mode = 0666,
};

static int __init iommufd_init(void)
{
	int ret;

	ret = misc_register(&iommu_misc_dev);
	if (ret)
		return ret;

	if (IS_ENABLED(CONFIG_IOMMUFD_VFIO_CONTAINER)) {
		ret = misc_register(&vfio_misc_dev);
		if (ret)
			goto err_misc;
	}
	ret = iommufd_test_init();
	if (ret)
		goto err_vfio_misc;
	return 0;

err_vfio_misc:
	if (IS_ENABLED(CONFIG_IOMMUFD_VFIO_CONTAINER))
		misc_deregister(&vfio_misc_dev);
err_misc:
	misc_deregister(&iommu_misc_dev);
	return ret;
}

static void __exit iommufd_exit(void)
{
	iommufd_test_exit();
	if (IS_ENABLED(CONFIG_IOMMUFD_VFIO_CONTAINER))
		misc_deregister(&vfio_misc_dev);
	misc_deregister(&iommu_misc_dev);
}

module_init(iommufd_init);
module_exit(iommufd_exit);

#if IS_ENABLED(CONFIG_IOMMUFD_VFIO_CONTAINER)
MODULE_ALIAS_MISCDEV(VFIO_MINOR);
MODULE_ALIAS("devname:vfio/vfio");
#endif
MODULE_IMPORT_NS(IOMMUFD_INTERNAL);
MODULE_IMPORT_NS(IOMMUFD);
MODULE_DESCRIPTION("I/O Address Space Management for passthrough devices");
MODULE_LICENSE("GPL");