// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (C) 2021 Intel Corporation
 * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES
 *
 * iommufd provides control over the IOMMU HW objects created by IOMMU kernel
 * drivers. IOMMU HW objects revolve around IO page tables that map incoming DMA
 * addresses (IOVA) to CPU addresses.
 */
#define pr_fmt(fmt) "iommufd: " fmt

#include <linux/bug.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/iommufd.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <uapi/linux/iommufd.h>

#include "io_pagetable.h"
#include "iommufd_private.h"
#include "iommufd_test.h"

/*
 * Per object type callbacks. The table of these, iommufd_object_ops[], is
 * indexed by obj->type.
 */
struct iommufd_object_ops {
	/* Called on an object right before its memory is kfree()'d */
	void (*destroy)(struct iommufd_object *obj);
	/*
	 * Optional. When set, iommufd_object_abort_and_destroy() calls this in
	 * place of destroy.
	 */
	void (*abort)(struct iommufd_object *obj);
};
/* Forward declarations, both are defined towards the end of this file */
static const struct iommufd_object_ops iommufd_object_ops[];
static struct miscdevice vfio_misc_dev;

/*
 * Allow concurrent access to the object.
 *
 * Once another thread can see the object pointer it can prevent object
 * destruction. Except for special kernel-only objects there is no in-kernel way
 * to reliably destroy a single object. Thus all APIs that are creating objects
 * must use iommufd_object_abort() to handle their errors and only call
 * iommufd_object_finalize() once object creation cannot fail.
40 */ 41 void iommufd_object_finalize(struct iommufd_ctx *ictx, 42 struct iommufd_object *obj) 43 { 44 XA_STATE(xas, &ictx->objects, obj->id); 45 void *old; 46 47 xa_lock(&ictx->objects); 48 old = xas_store(&xas, obj); 49 xa_unlock(&ictx->objects); 50 /* obj->id was returned from xa_alloc() so the xas_store() cannot fail */ 51 WARN_ON(old != XA_ZERO_ENTRY); 52 } 53 54 /* Undo _iommufd_object_alloc() if iommufd_object_finalize() was not called */ 55 void iommufd_object_abort(struct iommufd_ctx *ictx, struct iommufd_object *obj) 56 { 57 XA_STATE(xas, &ictx->objects, obj->id); 58 void *old; 59 60 xa_lock(&ictx->objects); 61 old = xas_store(&xas, NULL); 62 xa_unlock(&ictx->objects); 63 WARN_ON(old != XA_ZERO_ENTRY); 64 kfree(obj); 65 } 66 67 /* 68 * Abort an object that has been fully initialized and needs destroy, but has 69 * not been finalized. 70 */ 71 void iommufd_object_abort_and_destroy(struct iommufd_ctx *ictx, 72 struct iommufd_object *obj) 73 { 74 if (iommufd_object_ops[obj->type].abort) 75 iommufd_object_ops[obj->type].abort(obj); 76 else 77 iommufd_object_ops[obj->type].destroy(obj); 78 iommufd_object_abort(ictx, obj); 79 } 80 81 struct iommufd_object *iommufd_get_object(struct iommufd_ctx *ictx, u32 id, 82 enum iommufd_object_type type) 83 { 84 struct iommufd_object *obj; 85 86 if (iommufd_should_fail()) 87 return ERR_PTR(-ENOENT); 88 89 xa_lock(&ictx->objects); 90 obj = xa_load(&ictx->objects, id); 91 if (!obj || (type != IOMMUFD_OBJ_ANY && obj->type != type) || 92 !iommufd_lock_obj(obj)) 93 obj = ERR_PTR(-ENOENT); 94 xa_unlock(&ictx->objects); 95 return obj; 96 } 97 98 static int iommufd_object_dec_wait_shortterm(struct iommufd_ctx *ictx, 99 struct iommufd_object *to_destroy) 100 { 101 if (refcount_dec_and_test(&to_destroy->shortterm_users)) 102 return 0; 103 104 if (wait_event_timeout(ictx->destroy_wait, 105 refcount_read(&to_destroy->shortterm_users) == 106 0, 107 msecs_to_jiffies(10000))) 108 return 0; 109 110 pr_crit("Time out waiting for iommufd 
object to become free\n"); 111 refcount_inc(&to_destroy->shortterm_users); 112 return -EBUSY; 113 } 114 115 /* 116 * Remove the given object id from the xarray if the only reference to the 117 * object is held by the xarray. 118 */ 119 int iommufd_object_remove(struct iommufd_ctx *ictx, 120 struct iommufd_object *to_destroy, u32 id, 121 unsigned int flags) 122 { 123 struct iommufd_object *obj; 124 XA_STATE(xas, &ictx->objects, id); 125 bool zerod_shortterm = false; 126 int ret; 127 128 /* 129 * The purpose of the shortterm_users is to ensure deterministic 130 * destruction of objects used by external drivers and destroyed by this 131 * function. Any temporary increment of the refcount must increment 132 * shortterm_users, such as during ioctl execution. 133 */ 134 if (flags & REMOVE_WAIT_SHORTTERM) { 135 ret = iommufd_object_dec_wait_shortterm(ictx, to_destroy); 136 if (ret) { 137 /* 138 * We have a bug. Put back the callers reference and 139 * defer cleaning this object until close. 140 */ 141 refcount_dec(&to_destroy->users); 142 return ret; 143 } 144 zerod_shortterm = true; 145 } 146 147 xa_lock(&ictx->objects); 148 obj = xas_load(&xas); 149 if (to_destroy) { 150 /* 151 * If the caller is holding a ref on obj we put it here under 152 * the spinlock. 153 */ 154 refcount_dec(&obj->users); 155 156 if (WARN_ON(obj != to_destroy)) { 157 ret = -ENOENT; 158 goto err_xa; 159 } 160 } else if (xa_is_zero(obj) || !obj) { 161 ret = -ENOENT; 162 goto err_xa; 163 } 164 165 if (!refcount_dec_if_one(&obj->users)) { 166 ret = -EBUSY; 167 goto err_xa; 168 } 169 170 xas_store(&xas, NULL); 171 if (ictx->vfio_ioas == container_of(obj, struct iommufd_ioas, obj)) 172 ictx->vfio_ioas = NULL; 173 xa_unlock(&ictx->objects); 174 175 /* 176 * Since users is zero any positive users_shortterm must be racing 177 * iommufd_put_object(), or we have a bug. 
178 */ 179 if (!zerod_shortterm) { 180 ret = iommufd_object_dec_wait_shortterm(ictx, obj); 181 if (WARN_ON(ret)) 182 return ret; 183 } 184 185 iommufd_object_ops[obj->type].destroy(obj); 186 kfree(obj); 187 return 0; 188 189 err_xa: 190 if (zerod_shortterm) { 191 /* Restore the xarray owned reference */ 192 refcount_set(&obj->shortterm_users, 1); 193 } 194 xa_unlock(&ictx->objects); 195 196 /* The returned object reference count is zero */ 197 return ret; 198 } 199 200 static int iommufd_destroy(struct iommufd_ucmd *ucmd) 201 { 202 struct iommu_destroy *cmd = ucmd->cmd; 203 204 return iommufd_object_remove(ucmd->ictx, NULL, cmd->id, 0); 205 } 206 207 static int iommufd_fops_open(struct inode *inode, struct file *filp) 208 { 209 struct iommufd_ctx *ictx; 210 211 ictx = kzalloc(sizeof(*ictx), GFP_KERNEL_ACCOUNT); 212 if (!ictx) 213 return -ENOMEM; 214 215 /* 216 * For compatibility with VFIO when /dev/vfio/vfio is opened we default 217 * to the same rlimit accounting as vfio uses. 218 */ 219 if (IS_ENABLED(CONFIG_IOMMUFD_VFIO_CONTAINER) && 220 filp->private_data == &vfio_misc_dev) { 221 ictx->account_mode = IOPT_PAGES_ACCOUNT_MM; 222 pr_info_once("IOMMUFD is providing /dev/vfio/vfio, not VFIO.\n"); 223 } 224 225 init_rwsem(&ictx->ioas_creation_lock); 226 xa_init_flags(&ictx->objects, XA_FLAGS_ALLOC1 | XA_FLAGS_ACCOUNT); 227 xa_init(&ictx->groups); 228 ictx->file = filp; 229 init_waitqueue_head(&ictx->destroy_wait); 230 filp->private_data = ictx; 231 return 0; 232 } 233 234 static int iommufd_fops_release(struct inode *inode, struct file *filp) 235 { 236 struct iommufd_ctx *ictx = filp->private_data; 237 struct iommufd_object *obj; 238 239 /* 240 * The objects in the xarray form a graph of "users" counts, and we have 241 * to destroy them in a depth first manner. Leaf objects will reduce the 242 * users count of interior objects when they are destroyed. 
243 * 244 * Repeatedly destroying all the "1 users" leaf objects will progress 245 * until the entire list is destroyed. If this can't progress then there 246 * is some bug related to object refcounting. 247 */ 248 while (!xa_empty(&ictx->objects)) { 249 unsigned int destroyed = 0; 250 unsigned long index; 251 252 xa_for_each(&ictx->objects, index, obj) { 253 if (!refcount_dec_if_one(&obj->users)) 254 continue; 255 destroyed++; 256 xa_erase(&ictx->objects, index); 257 iommufd_object_ops[obj->type].destroy(obj); 258 kfree(obj); 259 } 260 /* Bug related to users refcount */ 261 if (WARN_ON(!destroyed)) 262 break; 263 } 264 WARN_ON(!xa_empty(&ictx->groups)); 265 kfree(ictx); 266 return 0; 267 } 268 269 static int iommufd_option(struct iommufd_ucmd *ucmd) 270 { 271 struct iommu_option *cmd = ucmd->cmd; 272 int rc; 273 274 if (cmd->__reserved) 275 return -EOPNOTSUPP; 276 277 switch (cmd->option_id) { 278 case IOMMU_OPTION_RLIMIT_MODE: 279 rc = iommufd_option_rlimit_mode(cmd, ucmd->ictx); 280 break; 281 case IOMMU_OPTION_HUGE_PAGES: 282 rc = iommufd_ioas_option(ucmd); 283 break; 284 default: 285 return -EOPNOTSUPP; 286 } 287 if (rc) 288 return rc; 289 if (copy_to_user(&((struct iommu_option __user *)ucmd->ubuffer)->val64, 290 &cmd->val64, sizeof(cmd->val64))) 291 return -EFAULT; 292 return 0; 293 } 294 295 union ucmd_buffer { 296 struct iommu_destroy destroy; 297 struct iommu_fault_alloc fault; 298 struct iommu_hw_info info; 299 struct iommu_hwpt_alloc hwpt; 300 struct iommu_hwpt_get_dirty_bitmap get_dirty_bitmap; 301 struct iommu_hwpt_invalidate cache; 302 struct iommu_hwpt_set_dirty_tracking set_dirty_tracking; 303 struct iommu_ioas_alloc alloc; 304 struct iommu_ioas_allow_iovas allow_iovas; 305 struct iommu_ioas_copy ioas_copy; 306 struct iommu_ioas_iova_ranges iova_ranges; 307 struct iommu_ioas_map map; 308 struct iommu_ioas_unmap unmap; 309 struct iommu_option option; 310 struct iommu_vfio_ioas vfio_ioas; 311 struct iommu_viommu_alloc viommu; 312 struct 
iommu_vdevice_alloc vdev; 313 #ifdef CONFIG_IOMMUFD_TEST 314 struct iommu_test_cmd test; 315 #endif 316 }; 317 318 struct iommufd_ioctl_op { 319 unsigned int size; 320 unsigned int min_size; 321 unsigned int ioctl_num; 322 int (*execute)(struct iommufd_ucmd *ucmd); 323 }; 324 325 #define IOCTL_OP(_ioctl, _fn, _struct, _last) \ 326 [_IOC_NR(_ioctl) - IOMMUFD_CMD_BASE] = { \ 327 .size = sizeof(_struct) + \ 328 BUILD_BUG_ON_ZERO(sizeof(union ucmd_buffer) < \ 329 sizeof(_struct)), \ 330 .min_size = offsetofend(_struct, _last), \ 331 .ioctl_num = _ioctl, \ 332 .execute = _fn, \ 333 } 334 static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = { 335 IOCTL_OP(IOMMU_DESTROY, iommufd_destroy, struct iommu_destroy, id), 336 IOCTL_OP(IOMMU_FAULT_QUEUE_ALLOC, iommufd_fault_alloc, struct iommu_fault_alloc, 337 out_fault_fd), 338 IOCTL_OP(IOMMU_GET_HW_INFO, iommufd_get_hw_info, struct iommu_hw_info, 339 __reserved), 340 IOCTL_OP(IOMMU_HWPT_ALLOC, iommufd_hwpt_alloc, struct iommu_hwpt_alloc, 341 __reserved), 342 IOCTL_OP(IOMMU_HWPT_GET_DIRTY_BITMAP, iommufd_hwpt_get_dirty_bitmap, 343 struct iommu_hwpt_get_dirty_bitmap, data), 344 IOCTL_OP(IOMMU_HWPT_INVALIDATE, iommufd_hwpt_invalidate, 345 struct iommu_hwpt_invalidate, __reserved), 346 IOCTL_OP(IOMMU_HWPT_SET_DIRTY_TRACKING, iommufd_hwpt_set_dirty_tracking, 347 struct iommu_hwpt_set_dirty_tracking, __reserved), 348 IOCTL_OP(IOMMU_IOAS_ALLOC, iommufd_ioas_alloc_ioctl, 349 struct iommu_ioas_alloc, out_ioas_id), 350 IOCTL_OP(IOMMU_IOAS_ALLOW_IOVAS, iommufd_ioas_allow_iovas, 351 struct iommu_ioas_allow_iovas, allowed_iovas), 352 IOCTL_OP(IOMMU_IOAS_CHANGE_PROCESS, iommufd_ioas_change_process, 353 struct iommu_ioas_change_process, __reserved), 354 IOCTL_OP(IOMMU_IOAS_COPY, iommufd_ioas_copy, struct iommu_ioas_copy, 355 src_iova), 356 IOCTL_OP(IOMMU_IOAS_IOVA_RANGES, iommufd_ioas_iova_ranges, 357 struct iommu_ioas_iova_ranges, out_iova_alignment), 358 IOCTL_OP(IOMMU_IOAS_MAP, iommufd_ioas_map, struct iommu_ioas_map, 359 iova), 360 
IOCTL_OP(IOMMU_IOAS_MAP_FILE, iommufd_ioas_map_file, 361 struct iommu_ioas_map_file, iova), 362 IOCTL_OP(IOMMU_IOAS_UNMAP, iommufd_ioas_unmap, struct iommu_ioas_unmap, 363 length), 364 IOCTL_OP(IOMMU_OPTION, iommufd_option, struct iommu_option, 365 val64), 366 IOCTL_OP(IOMMU_VFIO_IOAS, iommufd_vfio_ioas, struct iommu_vfio_ioas, 367 __reserved), 368 IOCTL_OP(IOMMU_VIOMMU_ALLOC, iommufd_viommu_alloc_ioctl, 369 struct iommu_viommu_alloc, out_viommu_id), 370 IOCTL_OP(IOMMU_VDEVICE_ALLOC, iommufd_vdevice_alloc_ioctl, 371 struct iommu_vdevice_alloc, virt_id), 372 #ifdef CONFIG_IOMMUFD_TEST 373 IOCTL_OP(IOMMU_TEST_CMD, iommufd_test, struct iommu_test_cmd, last), 374 #endif 375 }; 376 377 static long iommufd_fops_ioctl(struct file *filp, unsigned int cmd, 378 unsigned long arg) 379 { 380 struct iommufd_ctx *ictx = filp->private_data; 381 const struct iommufd_ioctl_op *op; 382 struct iommufd_ucmd ucmd = {}; 383 union ucmd_buffer buf; 384 unsigned int nr; 385 int ret; 386 387 nr = _IOC_NR(cmd); 388 if (nr < IOMMUFD_CMD_BASE || 389 (nr - IOMMUFD_CMD_BASE) >= ARRAY_SIZE(iommufd_ioctl_ops)) 390 return iommufd_vfio_ioctl(ictx, cmd, arg); 391 392 ucmd.ictx = ictx; 393 ucmd.ubuffer = (void __user *)arg; 394 ret = get_user(ucmd.user_size, (u32 __user *)ucmd.ubuffer); 395 if (ret) 396 return ret; 397 398 op = &iommufd_ioctl_ops[nr - IOMMUFD_CMD_BASE]; 399 if (op->ioctl_num != cmd) 400 return -ENOIOCTLCMD; 401 if (ucmd.user_size < op->min_size) 402 return -EINVAL; 403 404 ucmd.cmd = &buf; 405 ret = copy_struct_from_user(ucmd.cmd, op->size, ucmd.ubuffer, 406 ucmd.user_size); 407 if (ret) 408 return ret; 409 ret = op->execute(&ucmd); 410 return ret; 411 } 412 413 static const struct file_operations iommufd_fops = { 414 .owner = THIS_MODULE, 415 .open = iommufd_fops_open, 416 .release = iommufd_fops_release, 417 .unlocked_ioctl = iommufd_fops_ioctl, 418 }; 419 420 /** 421 * iommufd_ctx_get - Get a context reference 422 * @ictx: Context to get 423 * 424 * The caller must already hold a 
valid reference to ictx. 425 */ 426 void iommufd_ctx_get(struct iommufd_ctx *ictx) 427 { 428 get_file(ictx->file); 429 } 430 EXPORT_SYMBOL_NS_GPL(iommufd_ctx_get, IOMMUFD); 431 432 /** 433 * iommufd_ctx_from_file - Acquires a reference to the iommufd context 434 * @file: File to obtain the reference from 435 * 436 * Returns a pointer to the iommufd_ctx, otherwise ERR_PTR. The struct file 437 * remains owned by the caller and the caller must still do fput. On success 438 * the caller is responsible to call iommufd_ctx_put(). 439 */ 440 struct iommufd_ctx *iommufd_ctx_from_file(struct file *file) 441 { 442 struct iommufd_ctx *ictx; 443 444 if (file->f_op != &iommufd_fops) 445 return ERR_PTR(-EBADFD); 446 ictx = file->private_data; 447 iommufd_ctx_get(ictx); 448 return ictx; 449 } 450 EXPORT_SYMBOL_NS_GPL(iommufd_ctx_from_file, IOMMUFD); 451 452 /** 453 * iommufd_ctx_from_fd - Acquires a reference to the iommufd context 454 * @fd: File descriptor to obtain the reference from 455 * 456 * Returns a pointer to the iommufd_ctx, otherwise ERR_PTR. On success 457 * the caller is responsible to call iommufd_ctx_put(). 
458 */ 459 struct iommufd_ctx *iommufd_ctx_from_fd(int fd) 460 { 461 struct file *file; 462 463 file = fget(fd); 464 if (!file) 465 return ERR_PTR(-EBADF); 466 467 if (file->f_op != &iommufd_fops) { 468 fput(file); 469 return ERR_PTR(-EBADFD); 470 } 471 /* fget is the same as iommufd_ctx_get() */ 472 return file->private_data; 473 } 474 EXPORT_SYMBOL_NS_GPL(iommufd_ctx_from_fd, IOMMUFD); 475 476 /** 477 * iommufd_ctx_put - Put back a reference 478 * @ictx: Context to put back 479 */ 480 void iommufd_ctx_put(struct iommufd_ctx *ictx) 481 { 482 fput(ictx->file); 483 } 484 EXPORT_SYMBOL_NS_GPL(iommufd_ctx_put, IOMMUFD); 485 486 static const struct iommufd_object_ops iommufd_object_ops[] = { 487 [IOMMUFD_OBJ_ACCESS] = { 488 .destroy = iommufd_access_destroy_object, 489 }, 490 [IOMMUFD_OBJ_DEVICE] = { 491 .destroy = iommufd_device_destroy, 492 }, 493 [IOMMUFD_OBJ_IOAS] = { 494 .destroy = iommufd_ioas_destroy, 495 }, 496 [IOMMUFD_OBJ_HWPT_PAGING] = { 497 .destroy = iommufd_hwpt_paging_destroy, 498 .abort = iommufd_hwpt_paging_abort, 499 }, 500 [IOMMUFD_OBJ_HWPT_NESTED] = { 501 .destroy = iommufd_hwpt_nested_destroy, 502 .abort = iommufd_hwpt_nested_abort, 503 }, 504 [IOMMUFD_OBJ_FAULT] = { 505 .destroy = iommufd_fault_destroy, 506 }, 507 [IOMMUFD_OBJ_VIOMMU] = { 508 .destroy = iommufd_viommu_destroy, 509 }, 510 [IOMMUFD_OBJ_VDEVICE] = { 511 .destroy = iommufd_vdevice_destroy, 512 }, 513 #ifdef CONFIG_IOMMUFD_TEST 514 [IOMMUFD_OBJ_SELFTEST] = { 515 .destroy = iommufd_selftest_destroy, 516 }, 517 #endif 518 }; 519 520 static struct miscdevice iommu_misc_dev = { 521 .minor = MISC_DYNAMIC_MINOR, 522 .name = "iommu", 523 .fops = &iommufd_fops, 524 .nodename = "iommu", 525 .mode = 0660, 526 }; 527 528 529 static struct miscdevice vfio_misc_dev = { 530 .minor = VFIO_MINOR, 531 .name = "vfio", 532 .fops = &iommufd_fops, 533 .nodename = "vfio/vfio", 534 .mode = 0666, 535 }; 536 537 static int __init iommufd_init(void) 538 { 539 int ret; 540 541 ret = 
misc_register(&iommu_misc_dev); 542 if (ret) 543 return ret; 544 545 if (IS_ENABLED(CONFIG_IOMMUFD_VFIO_CONTAINER)) { 546 ret = misc_register(&vfio_misc_dev); 547 if (ret) 548 goto err_misc; 549 } 550 ret = iommufd_test_init(); 551 if (ret) 552 goto err_vfio_misc; 553 return 0; 554 555 err_vfio_misc: 556 if (IS_ENABLED(CONFIG_IOMMUFD_VFIO_CONTAINER)) 557 misc_deregister(&vfio_misc_dev); 558 err_misc: 559 misc_deregister(&iommu_misc_dev); 560 return ret; 561 } 562 563 static void __exit iommufd_exit(void) 564 { 565 iommufd_test_exit(); 566 if (IS_ENABLED(CONFIG_IOMMUFD_VFIO_CONTAINER)) 567 misc_deregister(&vfio_misc_dev); 568 misc_deregister(&iommu_misc_dev); 569 } 570 571 module_init(iommufd_init); 572 module_exit(iommufd_exit); 573 574 #if IS_ENABLED(CONFIG_IOMMUFD_VFIO_CONTAINER) 575 MODULE_ALIAS_MISCDEV(VFIO_MINOR); 576 MODULE_ALIAS("devname:vfio/vfio"); 577 #endif 578 MODULE_IMPORT_NS(IOMMUFD_INTERNAL); 579 MODULE_IMPORT_NS(IOMMUFD); 580 MODULE_DESCRIPTION("I/O Address Space Management for passthrough devices"); 581 MODULE_LICENSE("GPL"); 582