// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (C) 2021 Intel Corporation
 * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES
 *
 * iommufd provides control over the IOMMU HW objects created by IOMMU kernel
 * drivers. IOMMU HW objects revolve around IO page tables that map incoming DMA
 * addresses (IOVA) to CPU addresses.
 */
#define pr_fmt(fmt) "iommufd: " fmt

#include <linux/file.h>
#include <linux/fs.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/miscdevice.h>
#include <linux/mutex.h>
#include <linux/bug.h>
#include <uapi/linux/iommufd.h>
#include <linux/iommufd.h>

#include "io_pagetable.h"
#include "iommufd_private.h"
#include "iommufd_test.h"

struct iommufd_object_ops {
        void (*destroy)(struct iommufd_object *obj);
        void (*abort)(struct iommufd_object *obj);
};
static const struct iommufd_object_ops iommufd_object_ops[];
static struct miscdevice vfio_misc_dev;

struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx,
                                             size_t size,
                                             enum iommufd_object_type type)
{
        struct iommufd_object *obj;
        int rc;

        obj = kzalloc(size, GFP_KERNEL_ACCOUNT);
        if (!obj)
                return ERR_PTR(-ENOMEM);
        obj->type = type;
        /* Starts out biased by 1 until it is removed from the xarray */
        refcount_set(&obj->shortterm_users, 1);
        refcount_set(&obj->users, 1);

        /*
         * Reserve an ID in the xarray but do not publish the pointer yet since
         * the caller hasn't initialized it yet. Once the pointer is published
         * in the xarray and visible to other threads we can't reliably destroy
         * it anymore, so the caller must complete all errorable operations
         * before calling iommufd_object_finalize().
         */
        rc = xa_alloc(&ictx->objects, &obj->id, XA_ZERO_ENTRY,
                      xa_limit_31b, GFP_KERNEL_ACCOUNT);
        if (rc)
                goto out_free;
        return obj;
out_free:
        kfree(obj);
        return ERR_PTR(rc);
}

/*
 * Allow concurrent access to the object.
 *
 * Once another thread can see the object pointer it can prevent object
 * destruction. Except for special kernel-only objects there is no in-kernel way
 * to reliably destroy a single object. Thus all APIs that are creating objects
 * must use iommufd_object_abort() to handle their errors and only call
 * iommufd_object_finalize() once object creation cannot fail.
 */
void iommufd_object_finalize(struct iommufd_ctx *ictx,
                             struct iommufd_object *obj)
{
        void *old;

        old = xa_store(&ictx->objects, obj->id, obj, GFP_KERNEL);
        /* obj->id was returned from xa_alloc() so the xa_store() cannot fail */
        WARN_ON(old);
}

/* Undo _iommufd_object_alloc() if iommufd_object_finalize() was not called */
void iommufd_object_abort(struct iommufd_ctx *ictx, struct iommufd_object *obj)
{
        void *old;

        old = xa_erase(&ictx->objects, obj->id);
        WARN_ON(old);
        kfree(obj);
}
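
/*
 * Illustrative sketch (not part of the driver): a hypothetical object-creating
 * path is expected to follow the alloc/abort/finalize pattern described above.
 * struct iommufd_foo, IOMMUFD_OBJ_FOO and iommufd_foo_init() are made-up names
 * used only for illustration.
 *
 *      struct iommufd_foo {
 *              struct iommufd_object obj;
 *      };
 *
 *      static int iommufd_foo_create(struct iommufd_ucmd *ucmd)
 *      {
 *              struct iommufd_object *obj;
 *              struct iommufd_foo *foo;
 *              int rc;
 *
 *              obj = _iommufd_object_alloc(ucmd->ictx, sizeof(*foo),
 *                                          IOMMUFD_OBJ_FOO);
 *              if (IS_ERR(obj))
 *                      return PTR_ERR(obj);
 *              foo = container_of(obj, struct iommufd_foo, obj);
 *
 *              rc = iommufd_foo_init(foo);     // all fallible work goes here
 *              if (rc) {
 *                      iommufd_object_abort(ucmd->ictx, obj);
 *                      return rc;
 *              }
 *              // Nothing below this point can fail
 *              iommufd_object_finalize(ucmd->ictx, obj);
 *              return 0;
 *      }
 */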

/*
 * Abort an object that has been fully initialized and needs destroy, but has
 * not been finalized.
 */
void iommufd_object_abort_and_destroy(struct iommufd_ctx *ictx,
                                      struct iommufd_object *obj)
{
        if (iommufd_object_ops[obj->type].abort)
                iommufd_object_ops[obj->type].abort(obj);
        else
                iommufd_object_ops[obj->type].destroy(obj);
        iommufd_object_abort(ictx, obj);
}

struct iommufd_object *iommufd_get_object(struct iommufd_ctx *ictx, u32 id,
                                          enum iommufd_object_type type)
{
        struct iommufd_object *obj;

        if (iommufd_should_fail())
                return ERR_PTR(-ENOENT);

        xa_lock(&ictx->objects);
        obj = xa_load(&ictx->objects, id);
        if (!obj || (type != IOMMUFD_OBJ_ANY && obj->type != type) ||
            !iommufd_lock_obj(obj))
                obj = ERR_PTR(-ENOENT);
        xa_unlock(&ictx->objects);
        return obj;
}
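
/*
 * Illustrative sketch (not part of the driver): a typical lookup is expected to
 * pair iommufd_get_object() with iommufd_put_object() (iommufd_private.h) so
 * that the users and shortterm_users references taken by iommufd_lock_obj()
 * are dropped again. iommufd_example_use() is a made-up name.
 *
 *      static int iommufd_example_use(struct iommufd_ucmd *ucmd, u32 ioas_id)
 *      {
 *              struct iommufd_object *obj;
 *
 *              obj = iommufd_get_object(ucmd->ictx, ioas_id, IOMMUFD_OBJ_IOAS);
 *              if (IS_ERR(obj))
 *                      return PTR_ERR(obj);
 *
 *              // ... operate on the object while it is protected ...
 *
 *              iommufd_put_object(ucmd->ictx, obj);
 *              return 0;
 *      }
 */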

static int iommufd_object_dec_wait_shortterm(struct iommufd_ctx *ictx,
                                             struct iommufd_object *to_destroy)
{
        if (refcount_dec_and_test(&to_destroy->shortterm_users))
                return 0;

        if (wait_event_timeout(ictx->destroy_wait,
                               refcount_read(&to_destroy->shortterm_users) == 0,
                               msecs_to_jiffies(10000)))
                return 0;

        pr_crit("Time out waiting for iommufd object to become free\n");
        refcount_inc(&to_destroy->shortterm_users);
        return -EBUSY;
}

/*
 * Remove the given object id from the xarray if the only reference to the
 * object is held by the xarray.
 */
int iommufd_object_remove(struct iommufd_ctx *ictx,
                          struct iommufd_object *to_destroy, u32 id,
                          unsigned int flags)
{
        struct iommufd_object *obj;
        XA_STATE(xas, &ictx->objects, id);
        bool zerod_shortterm = false;
        int ret;

        /*
         * The purpose of the shortterm_users is to ensure deterministic
         * destruction of objects used by external drivers and destroyed by
         * this function. Any temporary increment of the refcount must
         * increment shortterm_users, such as during ioctl execution.
         */
        if (flags & REMOVE_WAIT_SHORTTERM) {
                ret = iommufd_object_dec_wait_shortterm(ictx, to_destroy);
                if (ret) {
                        /*
                         * We have a bug. Put back the caller's reference and
                         * defer cleaning this object until close.
                         */
                        refcount_dec(&to_destroy->users);
                        return ret;
                }
                zerod_shortterm = true;
        }

        xa_lock(&ictx->objects);
        obj = xas_load(&xas);
        if (to_destroy) {
                /*
                 * If the caller is holding a ref on obj we put it here under
                 * the spinlock.
                 */
                refcount_dec(&obj->users);

                if (WARN_ON(obj != to_destroy)) {
                        ret = -ENOENT;
                        goto err_xa;
                }
        } else if (xa_is_zero(obj) || !obj) {
                ret = -ENOENT;
                goto err_xa;
        }

        if (!refcount_dec_if_one(&obj->users)) {
                ret = -EBUSY;
                goto err_xa;
        }

        xas_store(&xas, NULL);
        if (ictx->vfio_ioas == container_of(obj, struct iommufd_ioas, obj))
                ictx->vfio_ioas = NULL;
        xa_unlock(&ictx->objects);

        /*
         * Since users is zero any positive shortterm_users must be racing
         * iommufd_put_object(), or we have a bug.
         */
        if (!zerod_shortterm) {
                ret = iommufd_object_dec_wait_shortterm(ictx, obj);
                if (WARN_ON(ret))
                        return ret;
        }

        iommufd_object_ops[obj->type].destroy(obj);
        kfree(obj);
        return 0;

err_xa:
        if (zerod_shortterm) {
                /* Restore the xarray owned reference */
                refcount_set(&obj->shortterm_users, 1);
        }
        xa_unlock(&ictx->objects);

        /* The returned object reference count is zero */
        return ret;
}

static int iommufd_destroy(struct iommufd_ucmd *ucmd)
{
        struct iommu_destroy *cmd = ucmd->cmd;

        return iommufd_object_remove(ucmd->ictx, NULL, cmd->id, 0);
}
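
/*
 * Illustrative userspace sketch (not part of the driver): IOMMU_DESTROY is
 * expected to be driven like the other iommufd ioctls, with the size field set
 * to the caller's structure size. obj_id names an object previously returned
 * by another iommufd ioctl; error handling is omitted.
 *
 *      #include <sys/ioctl.h>
 *      #include <linux/iommufd.h>
 *
 *      static int destroy_object(int iommufd, __u32 obj_id)
 *      {
 *              struct iommu_destroy cmd = {
 *                      .size = sizeof(cmd),
 *                      .id = obj_id,
 *              };
 *
 *              return ioctl(iommufd, IOMMU_DESTROY, &cmd);
 *      }
 */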

static int iommufd_fops_open(struct inode *inode, struct file *filp)
{
        struct iommufd_ctx *ictx;

        ictx = kzalloc(sizeof(*ictx), GFP_KERNEL_ACCOUNT);
        if (!ictx)
                return -ENOMEM;

        /*
         * For compatibility with VFIO when /dev/vfio/vfio is opened we default
         * to the same rlimit accounting as vfio uses.
         */
        if (IS_ENABLED(CONFIG_IOMMUFD_VFIO_CONTAINER) &&
            filp->private_data == &vfio_misc_dev) {
                ictx->account_mode = IOPT_PAGES_ACCOUNT_MM;
                pr_info_once("IOMMUFD is providing /dev/vfio/vfio, not VFIO.\n");
        }

        xa_init_flags(&ictx->objects, XA_FLAGS_ALLOC1 | XA_FLAGS_ACCOUNT);
        xa_init(&ictx->groups);
        ictx->file = filp;
        init_waitqueue_head(&ictx->destroy_wait);
        filp->private_data = ictx;
        return 0;
}

static int iommufd_fops_release(struct inode *inode, struct file *filp)
{
        struct iommufd_ctx *ictx = filp->private_data;
        struct iommufd_object *obj;

        /*
         * The objects in the xarray form a graph of "users" counts, and we
         * have to destroy them in a depth first manner. Leaf objects will
         * reduce the users count of interior objects when they are destroyed.
         *
         * Repeatedly destroying all the "1 users" leaf objects will progress
         * until the entire list is destroyed. If this can't progress then
         * there is some bug related to object refcounting.
         */
        while (!xa_empty(&ictx->objects)) {
                unsigned int destroyed = 0;
                unsigned long index;

                xa_for_each(&ictx->objects, index, obj) {
                        if (!refcount_dec_if_one(&obj->users))
                                continue;
                        destroyed++;
                        xa_erase(&ictx->objects, index);
                        iommufd_object_ops[obj->type].destroy(obj);
                        kfree(obj);
                }
                /* Bug related to users refcount */
                if (WARN_ON(!destroyed))
                        break;
        }
        WARN_ON(!xa_empty(&ictx->groups));
        kfree(ictx);
        return 0;
}

static int iommufd_option(struct iommufd_ucmd *ucmd)
{
        struct iommu_option *cmd = ucmd->cmd;
        int rc;

        if (cmd->__reserved)
                return -EOPNOTSUPP;

        switch (cmd->option_id) {
        case IOMMU_OPTION_RLIMIT_MODE:
                rc = iommufd_option_rlimit_mode(cmd, ucmd->ictx);
                break;
        case IOMMU_OPTION_HUGE_PAGES:
                rc = iommufd_ioas_option(ucmd);
                break;
        default:
                return -EOPNOTSUPP;
        }
        if (rc)
                return rc;
        if (copy_to_user(&((struct iommu_option __user *)ucmd->ubuffer)->val64,
                         &cmd->val64, sizeof(cmd->val64)))
                return -EFAULT;
        return 0;
}

union ucmd_buffer {
        struct iommu_destroy destroy;
        struct iommu_hw_info info;
        struct iommu_hwpt_alloc hwpt;
        struct iommu_hwpt_get_dirty_bitmap get_dirty_bitmap;
        struct iommu_hwpt_invalidate cache;
        struct iommu_hwpt_set_dirty_tracking set_dirty_tracking;
        struct iommu_ioas_alloc alloc;
        struct iommu_ioas_allow_iovas allow_iovas;
        struct iommu_ioas_copy ioas_copy;
        struct iommu_ioas_iova_ranges iova_ranges;
        struct iommu_ioas_map map;
        struct iommu_ioas_unmap unmap;
        struct iommu_option option;
        struct iommu_vfio_ioas vfio_ioas;
#ifdef CONFIG_IOMMUFD_TEST
        struct iommu_test_cmd test;
#endif
};

struct iommufd_ioctl_op {
        unsigned int size;
        unsigned int min_size;
        unsigned int ioctl_num;
        int (*execute)(struct iommufd_ucmd *ucmd);
};

#define IOCTL_OP(_ioctl, _fn, _struct, _last)                                  \
        [_IOC_NR(_ioctl) - IOMMUFD_CMD_BASE] = {                               \
                .size = sizeof(_struct) +                                      \
                        BUILD_BUG_ON_ZERO(sizeof(union ucmd_buffer) <          \
                                          sizeof(_struct)),                    \
                .min_size = offsetofend(_struct, _last),                       \
                .ioctl_num = _ioctl,                                           \
                .execute = _fn,                                                \
        }
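
/*
 * Illustrative note (not part of the driver): each IOCTL_OP() entry records
 * the full in-kernel structure size and, via offsetofend(), the minimum size a
 * caller must supply up to and including the named _last member.
 * iommufd_fops_ioctl() below is then expected to accept any user_size at or
 * above min_size, with copy_struct_from_user() zero-filling whatever tail the
 * caller did not supply. For example, the IOMMU_DESTROY entry in the table
 * below expands roughly to:
 *
 *      [_IOC_NR(IOMMU_DESTROY) - IOMMUFD_CMD_BASE] = {
 *              .size = sizeof(struct iommu_destroy),
 *              .min_size = offsetofend(struct iommu_destroy, id),
 *              .ioctl_num = IOMMU_DESTROY,
 *              .execute = iommufd_destroy,
 *      },
 */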

static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = {
        IOCTL_OP(IOMMU_DESTROY, iommufd_destroy, struct iommu_destroy, id),
        IOCTL_OP(IOMMU_GET_HW_INFO, iommufd_get_hw_info, struct iommu_hw_info,
                 __reserved),
        IOCTL_OP(IOMMU_HWPT_ALLOC, iommufd_hwpt_alloc, struct iommu_hwpt_alloc,
                 __reserved),
        IOCTL_OP(IOMMU_HWPT_GET_DIRTY_BITMAP, iommufd_hwpt_get_dirty_bitmap,
                 struct iommu_hwpt_get_dirty_bitmap, data),
        IOCTL_OP(IOMMU_HWPT_INVALIDATE, iommufd_hwpt_invalidate,
                 struct iommu_hwpt_invalidate, __reserved),
        IOCTL_OP(IOMMU_HWPT_SET_DIRTY_TRACKING, iommufd_hwpt_set_dirty_tracking,
                 struct iommu_hwpt_set_dirty_tracking, __reserved),
        IOCTL_OP(IOMMU_IOAS_ALLOC, iommufd_ioas_alloc_ioctl,
                 struct iommu_ioas_alloc, out_ioas_id),
        IOCTL_OP(IOMMU_IOAS_ALLOW_IOVAS, iommufd_ioas_allow_iovas,
                 struct iommu_ioas_allow_iovas, allowed_iovas),
        IOCTL_OP(IOMMU_IOAS_COPY, iommufd_ioas_copy, struct iommu_ioas_copy,
                 src_iova),
        IOCTL_OP(IOMMU_IOAS_IOVA_RANGES, iommufd_ioas_iova_ranges,
                 struct iommu_ioas_iova_ranges, out_iova_alignment),
        IOCTL_OP(IOMMU_IOAS_MAP, iommufd_ioas_map, struct iommu_ioas_map,
                 iova),
        IOCTL_OP(IOMMU_IOAS_UNMAP, iommufd_ioas_unmap, struct iommu_ioas_unmap,
                 length),
        IOCTL_OP(IOMMU_OPTION, iommufd_option, struct iommu_option,
                 val64),
        IOCTL_OP(IOMMU_VFIO_IOAS, iommufd_vfio_ioas, struct iommu_vfio_ioas,
                 __reserved),
#ifdef CONFIG_IOMMUFD_TEST
        IOCTL_OP(IOMMU_TEST_CMD, iommufd_test, struct iommu_test_cmd, last),
#endif
};

static long iommufd_fops_ioctl(struct file *filp, unsigned int cmd,
                               unsigned long arg)
{
        struct iommufd_ctx *ictx = filp->private_data;
        const struct iommufd_ioctl_op *op;
        struct iommufd_ucmd ucmd = {};
        union ucmd_buffer buf;
        unsigned int nr;
        int ret;

        nr = _IOC_NR(cmd);
        if (nr < IOMMUFD_CMD_BASE ||
            (nr - IOMMUFD_CMD_BASE) >= ARRAY_SIZE(iommufd_ioctl_ops))
                return iommufd_vfio_ioctl(ictx, cmd, arg);

        ucmd.ictx = ictx;
        ucmd.ubuffer = (void __user *)arg;
        ret = get_user(ucmd.user_size, (u32 __user *)ucmd.ubuffer);
        if (ret)
                return ret;

        op = &iommufd_ioctl_ops[nr - IOMMUFD_CMD_BASE];
        if (op->ioctl_num != cmd)
                return -ENOIOCTLCMD;
        if (ucmd.user_size < op->min_size)
                return -EINVAL;

        ucmd.cmd = &buf;
        ret = copy_struct_from_user(ucmd.cmd, op->size, ucmd.ubuffer,
                                    ucmd.user_size);
        if (ret)
                return ret;
        ret = op->execute(&ucmd);
        return ret;
}

static const struct file_operations iommufd_fops = {
        .owner = THIS_MODULE,
        .open = iommufd_fops_open,
        .release = iommufd_fops_release,
        .unlocked_ioctl = iommufd_fops_ioctl,
};

/**
 * iommufd_ctx_get - Get a context reference
 * @ictx: Context to get
 *
 * The caller must already hold a valid reference to ictx.
 */
void iommufd_ctx_get(struct iommufd_ctx *ictx)
{
        get_file(ictx->file);
}
EXPORT_SYMBOL_NS_GPL(iommufd_ctx_get, IOMMUFD);

/**
 * iommufd_ctx_from_file - Acquires a reference to the iommufd context
 * @file: File to obtain the reference from
 *
 * Returns a pointer to the iommufd_ctx, otherwise ERR_PTR. The struct file
 * remains owned by the caller and the caller must still do fput. On success
 * the caller is responsible to call iommufd_ctx_put().
 */
struct iommufd_ctx *iommufd_ctx_from_file(struct file *file)
{
        struct iommufd_ctx *ictx;

        if (file->f_op != &iommufd_fops)
                return ERR_PTR(-EBADFD);
        ictx = file->private_data;
        iommufd_ctx_get(ictx);
        return ictx;
}
EXPORT_SYMBOL_NS_GPL(iommufd_ctx_from_file, IOMMUFD);
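
/*
 * Illustrative sketch (not part of the driver): an external module is expected
 * to pair iommufd_ctx_from_file() with iommufd_ctx_put() once it is done with
 * the context. example_bind_ctx() is a made-up name and error handling beyond
 * the lookup is omitted.
 *
 *      static int example_bind_ctx(struct file *iommufd_file)
 *      {
 *              struct iommufd_ctx *ictx;
 *
 *              ictx = iommufd_ctx_from_file(iommufd_file);
 *              if (IS_ERR(ictx))
 *                      return PTR_ERR(ictx);
 *
 *              // ... use ictx, e.g. bind devices against it ...
 *
 *              iommufd_ctx_put(ictx);
 *              return 0;
 *      }
 */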

/**
 * iommufd_ctx_from_fd - Acquires a reference to the iommufd context
 * @fd: File descriptor to obtain the reference from
 *
 * Returns a pointer to the iommufd_ctx, otherwise ERR_PTR. On success
 * the caller is responsible to call iommufd_ctx_put().
 */
struct iommufd_ctx *iommufd_ctx_from_fd(int fd)
{
        struct file *file;

        file = fget(fd);
        if (!file)
                return ERR_PTR(-EBADF);

        if (file->f_op != &iommufd_fops) {
                fput(file);
                return ERR_PTR(-EBADFD);
        }
        /* fget is the same as iommufd_ctx_get() */
        return file->private_data;
}
EXPORT_SYMBOL_NS_GPL(iommufd_ctx_from_fd, IOMMUFD);

/**
 * iommufd_ctx_put - Put back a reference
 * @ictx: Context to put back
 */
void iommufd_ctx_put(struct iommufd_ctx *ictx)
{
        fput(ictx->file);
}
EXPORT_SYMBOL_NS_GPL(iommufd_ctx_put, IOMMUFD);

static const struct iommufd_object_ops iommufd_object_ops[] = {
        [IOMMUFD_OBJ_ACCESS] = {
                .destroy = iommufd_access_destroy_object,
        },
        [IOMMUFD_OBJ_DEVICE] = {
                .destroy = iommufd_device_destroy,
        },
        [IOMMUFD_OBJ_IOAS] = {
                .destroy = iommufd_ioas_destroy,
        },
        [IOMMUFD_OBJ_HWPT_PAGING] = {
                .destroy = iommufd_hwpt_paging_destroy,
                .abort = iommufd_hwpt_paging_abort,
        },
        [IOMMUFD_OBJ_HWPT_NESTED] = {
                .destroy = iommufd_hwpt_nested_destroy,
                .abort = iommufd_hwpt_nested_abort,
        },
#ifdef CONFIG_IOMMUFD_TEST
        [IOMMUFD_OBJ_SELFTEST] = {
                .destroy = iommufd_selftest_destroy,
        },
#endif
};

static struct miscdevice iommu_misc_dev = {
        .minor = MISC_DYNAMIC_MINOR,
        .name = "iommu",
        .fops = &iommufd_fops,
        .nodename = "iommu",
        .mode = 0660,
};

static struct miscdevice vfio_misc_dev = {
        .minor = VFIO_MINOR,
        .name = "vfio",
        .fops = &iommufd_fops,
        .nodename = "vfio/vfio",
        .mode = 0666,
};

static int __init iommufd_init(void)
{
        int ret;

        ret = misc_register(&iommu_misc_dev);
        if (ret)
                return ret;

        if (IS_ENABLED(CONFIG_IOMMUFD_VFIO_CONTAINER)) {
                ret = misc_register(&vfio_misc_dev);
                if (ret)
                        goto err_misc;
        }
        ret = iommufd_test_init();
        if (ret)
                goto err_vfio_misc;
        return 0;

err_vfio_misc:
        if (IS_ENABLED(CONFIG_IOMMUFD_VFIO_CONTAINER))
                misc_deregister(&vfio_misc_dev);
err_misc:
        misc_deregister(&iommu_misc_dev);
        return ret;
}

static void __exit iommufd_exit(void)
{
        iommufd_test_exit();
        if (IS_ENABLED(CONFIG_IOMMUFD_VFIO_CONTAINER))
                misc_deregister(&vfio_misc_dev);
        misc_deregister(&iommu_misc_dev);
}

module_init(iommufd_init);
module_exit(iommufd_exit);

#if IS_ENABLED(CONFIG_IOMMUFD_VFIO_CONTAINER)
MODULE_ALIAS_MISCDEV(VFIO_MINOR);
MODULE_ALIAS("devname:vfio/vfio");
#endif
MODULE_IMPORT_NS(IOMMUFD_INTERNAL);
MODULE_IMPORT_NS(IOMMUFD);
MODULE_DESCRIPTION("I/O Address Space Management for passthrough devices");
MODULE_LICENSE("GPL");