// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (C) 2021 Intel Corporation
 * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES
 *
 * iommufd provides control over the IOMMU HW objects created by IOMMU kernel
 * drivers. IOMMU HW objects revolve around IO page tables that map incoming DMA
 * addresses (IOVA) to CPU addresses.
 */
#define pr_fmt(fmt) "iommufd: " fmt

#include <linux/file.h>
#include <linux/fs.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/miscdevice.h>
#include <linux/mutex.h>
#include <linux/bug.h>
#include <uapi/linux/iommufd.h>
#include <linux/iommufd.h>

#include "io_pagetable.h"
#include "iommufd_private.h"
#include "iommufd_test.h"

struct iommufd_object_ops {
	void (*destroy)(struct iommufd_object *obj);
	void (*abort)(struct iommufd_object *obj);
};
static const struct iommufd_object_ops iommufd_object_ops[];
static struct miscdevice vfio_misc_dev;

struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx,
					     size_t size,
					     enum iommufd_object_type type)
{
	static struct lock_class_key obj_keys[IOMMUFD_OBJ_MAX];
	struct iommufd_object *obj;
	int rc;

	obj = kzalloc(size, GFP_KERNEL_ACCOUNT);
	if (!obj)
		return ERR_PTR(-ENOMEM);
	obj->type = type;
	/*
	 * In most cases the destroy_rwsem is obtained with try so it doesn't
	 * interact with lockdep, however on destroy we have to sleep. This
	 * means if we have to destroy an object while holding a get on another
	 * object it triggers lockdep. Using one locking class per object type
	 * is a simple and reasonable way to avoid this.
	 */
	__init_rwsem(&obj->destroy_rwsem, "iommufd_object::destroy_rwsem",
		     &obj_keys[type]);
	refcount_set(&obj->users, 1);

	/*
	 * Reserve an ID in the xarray but do not publish the pointer yet since
	 * the caller hasn't initialized it yet. Once the pointer is published
	 * in the xarray and visible to other threads we can't reliably destroy
	 * it anymore, so the caller must complete all operations that can fail
	 * before calling iommufd_object_finalize().
	 */
	rc = xa_alloc(&ictx->objects, &obj->id, XA_ZERO_ENTRY,
		      xa_limit_31b, GFP_KERNEL_ACCOUNT);
	if (rc)
		goto out_free;
	return obj;
out_free:
	kfree(obj);
	return ERR_PTR(rc);
}

/*
 * Allow concurrent access to the object.
 *
 * Once another thread can see the object pointer it can prevent object
 * destruction. Except for special kernel-only objects there is no in-kernel way
 * to reliably destroy a single object. Thus all APIs that are creating objects
 * must use iommufd_object_abort() to handle their errors and only call
 * iommufd_object_finalize() once object creation cannot fail.
 */
void iommufd_object_finalize(struct iommufd_ctx *ictx,
			     struct iommufd_object *obj)
{
	void *old;

	old = xa_store(&ictx->objects, obj->id, obj, GFP_KERNEL);
	/* obj->id was returned from xa_alloc() so the xa_store() cannot fail */
	WARN_ON(old);
}

/* Undo _iommufd_object_alloc() if iommufd_object_finalize() was not called */
void iommufd_object_abort(struct iommufd_ctx *ictx, struct iommufd_object *obj)
{
	void *old;

	old = xa_erase(&ictx->objects, obj->id);
	WARN_ON(old);
	kfree(obj);
}
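/*
 * A minimal sketch of the creation flow the helpers above expect, assuming the
 * iommufd_object_alloc() wrapper macro from iommufd_private.h; the IOAS type
 * and the setup step are illustrative only:
 *
 *	struct iommufd_ioas *ioas;
 *	int rc;
 *
 *	ioas = iommufd_object_alloc(ictx, ioas, IOMMUFD_OBJ_IOAS);
 *	if (IS_ERR(ioas))
 *		return PTR_ERR(ioas);
 *	rc = some_setup_that_can_fail(ioas);
 *	if (rc) {
 *		iommufd_object_abort(ictx, &ioas->obj);
 *		return rc;
 *	}
 *	iommufd_object_finalize(ictx, &ioas->obj);
 *
 * If the failed setup already created state that the type's destroy/abort op
 * must unwind, the error path uses iommufd_object_abort_and_destroy() below
 * instead of iommufd_object_abort(). Only after finalize is the object visible
 * through its ID, and by then creation can no longer fail.
 */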
/*
 * Abort an object that has been fully initialized and needs destroy, but has
 * not been finalized.
 */
void iommufd_object_abort_and_destroy(struct iommufd_ctx *ictx,
				      struct iommufd_object *obj)
{
	if (iommufd_object_ops[obj->type].abort)
		iommufd_object_ops[obj->type].abort(obj);
	else
		iommufd_object_ops[obj->type].destroy(obj);
	iommufd_object_abort(ictx, obj);
}

struct iommufd_object *iommufd_get_object(struct iommufd_ctx *ictx, u32 id,
					  enum iommufd_object_type type)
{
	struct iommufd_object *obj;

	if (iommufd_should_fail())
		return ERR_PTR(-ENOENT);

	xa_lock(&ictx->objects);
	obj = xa_load(&ictx->objects, id);
	if (!obj || (type != IOMMUFD_OBJ_ANY && obj->type != type) ||
	    !iommufd_lock_obj(obj))
		obj = ERR_PTR(-ENOENT);
	xa_unlock(&ictx->objects);
	return obj;
}
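/*
 * A minimal sketch of how an ioctl handler typically consumes a user supplied
 * object ID, assuming the iommufd_put_object() helper from iommufd_private.h
 * as the counterpart of the iommufd_lock_obj() taken above; the IOAS type and
 * the cmd->id name are illustrative:
 *
 *	struct iommufd_object *obj;
 *
 *	obj = iommufd_get_object(ucmd->ictx, cmd->id, IOMMUFD_OBJ_IOAS);
 *	if (IS_ERR(obj))
 *		return PTR_ERR(obj);
 *	... obj->users is elevated and destroy_rwsem is held for read, so the
 *	    object cannot be destroyed while the handler is using it ...
 *	iommufd_put_object(obj);
 */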
/*
 * Remove the given object id from the xarray if the only reference to the
 * object is held by the xarray. The caller must call ops destroy().
 */
static struct iommufd_object *iommufd_object_remove(struct iommufd_ctx *ictx,
						    u32 id, bool extra_put)
{
	struct iommufd_object *obj;
	XA_STATE(xas, &ictx->objects, id);

	xa_lock(&ictx->objects);
	obj = xas_load(&xas);
	if (xa_is_zero(obj) || !obj) {
		obj = ERR_PTR(-ENOENT);
		goto out_xa;
	}

	/*
	 * If the caller is holding a ref on obj we put it here under the
	 * spinlock.
	 */
	if (extra_put)
		refcount_dec(&obj->users);

	if (!refcount_dec_if_one(&obj->users)) {
		obj = ERR_PTR(-EBUSY);
		goto out_xa;
	}

	xas_store(&xas, NULL);
	if (ictx->vfio_ioas == container_of(obj, struct iommufd_ioas, obj))
		ictx->vfio_ioas = NULL;

out_xa:
	xa_unlock(&ictx->objects);

	/* The returned object reference count is zero */
	return obj;
}

/*
 * The caller holds a users refcount and wants to destroy the object. In all
 * cases the caller no longer has a reference on obj after this returns. If the
 * object cannot be removed now it stays in the xarray and is freed when the
 * file is closed; unless allow_fail is set that is treated as a refcounting
 * bug and triggers a WARN.
 */
void __iommufd_object_destroy_user(struct iommufd_ctx *ictx,
				   struct iommufd_object *obj, bool allow_fail)
{
	struct iommufd_object *ret;

	/*
	 * The purpose of the destroy_rwsem is to ensure deterministic
	 * destruction of objects used by external drivers and destroyed by this
	 * function. Any temporary increment of the refcount must hold the read
	 * side of this, such as during ioctl execution.
	 */
	down_write(&obj->destroy_rwsem);
	ret = iommufd_object_remove(ictx, obj->id, true);
	up_write(&obj->destroy_rwsem);

	if (allow_fail && IS_ERR(ret))
		return;

	/*
	 * If there is a bug and we couldn't destroy the object then we did put
	 * back the caller's refcount and will eventually try to free it again
	 * during close.
	 */
	if (WARN_ON(IS_ERR(ret)))
		return;

	iommufd_object_ops[obj->type].destroy(obj);
	kfree(obj);
}

static int iommufd_destroy(struct iommufd_ucmd *ucmd)
{
	struct iommu_destroy *cmd = ucmd->cmd;
	struct iommufd_object *obj;

	obj = iommufd_object_remove(ucmd->ictx, cmd->id, false);
	if (IS_ERR(obj))
		return PTR_ERR(obj);
	iommufd_object_ops[obj->type].destroy(obj);
	kfree(obj);
	return 0;
}
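/*
 * From userspace the handler above is reached with a size-prefixed command
 * structure; a minimal sketch, assuming the struct iommu_destroy layout from
 * include/uapi/linux/iommufd.h and an already opened /dev/iommu file
 * descriptor:
 *
 *	struct iommu_destroy cmd = {
 *		.size = sizeof(cmd),
 *		.id = object_id,
 *	};
 *
 *	if (ioctl(iommufd_fd, IOMMU_DESTROY, &cmd))
 *		return -errno;
 *
 * A failure with EBUSY means something still holds a users reference on the
 * object, e.g. a device is still attached to the HWPT or IOAS being destroyed.
 */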
static int iommufd_fops_open(struct inode *inode, struct file *filp)
{
	struct iommufd_ctx *ictx;

	ictx = kzalloc(sizeof(*ictx), GFP_KERNEL_ACCOUNT);
	if (!ictx)
		return -ENOMEM;

	/*
	 * For compatibility with VFIO when /dev/vfio/vfio is opened we default
	 * to the same rlimit accounting as vfio uses.
	 */
	if (IS_ENABLED(CONFIG_IOMMUFD_VFIO_CONTAINER) &&
	    filp->private_data == &vfio_misc_dev) {
		ictx->account_mode = IOPT_PAGES_ACCOUNT_MM;
		pr_info_once("IOMMUFD is providing /dev/vfio/vfio, not VFIO.\n");
	}

	xa_init_flags(&ictx->objects, XA_FLAGS_ALLOC1 | XA_FLAGS_ACCOUNT);
	xa_init(&ictx->groups);
	ictx->file = filp;
	filp->private_data = ictx;
	return 0;
}

static int iommufd_fops_release(struct inode *inode, struct file *filp)
{
	struct iommufd_ctx *ictx = filp->private_data;
	struct iommufd_object *obj;

	/*
	 * The objects in the xarray form a graph of "users" counts, and we have
	 * to destroy them in a depth first manner. Leaf objects will reduce the
	 * users count of interior objects when they are destroyed.
	 *
	 * Repeatedly destroying all the "1 users" leaf objects will progress
	 * until the entire list is destroyed. If this can't progress then there
	 * is some bug related to object refcounting.
	 */
	while (!xa_empty(&ictx->objects)) {
		unsigned int destroyed = 0;
		unsigned long index;

		xa_for_each(&ictx->objects, index, obj) {
			if (!refcount_dec_if_one(&obj->users))
				continue;
			destroyed++;
			xa_erase(&ictx->objects, index);
			iommufd_object_ops[obj->type].destroy(obj);
			kfree(obj);
		}
		/* Bug related to users refcount */
		if (WARN_ON(!destroyed))
			break;
	}
	WARN_ON(!xa_empty(&ictx->groups));
	kfree(ictx);
	return 0;
}

static int iommufd_option(struct iommufd_ucmd *ucmd)
{
	struct iommu_option *cmd = ucmd->cmd;
	int rc;

	if (cmd->__reserved)
		return -EOPNOTSUPP;

	switch (cmd->option_id) {
	case IOMMU_OPTION_RLIMIT_MODE:
		rc = iommufd_option_rlimit_mode(cmd, ucmd->ictx);
		break;
	case IOMMU_OPTION_HUGE_PAGES:
		rc = iommufd_ioas_option(ucmd);
		break;
	default:
		return -EOPNOTSUPP;
	}
	if (rc)
		return rc;
	if (copy_to_user(&((struct iommu_option __user *)ucmd->ubuffer)->val64,
			 &cmd->val64, sizeof(cmd->val64)))
		return -EFAULT;
	return 0;
}
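/*
 * A userspace sketch of driving the option handler above, assuming the struct
 * iommu_option layout and the IOMMU_OPTION_OP_SET/GET and
 * IOMMU_OPTION_HUGE_PAGES values from include/uapi/linux/iommufd.h; object_id
 * names the IOAS the option applies to:
 *
 *	struct iommu_option cmd = {
 *		.size = sizeof(cmd),
 *		.option_id = IOMMU_OPTION_HUGE_PAGES,
 *		.op = IOMMU_OPTION_OP_SET,
 *		.object_id = ioas_id,
 *		.val64 = 0,
 *	};
 *
 *	if (ioctl(iommufd_fd, IOMMU_OPTION, &cmd))
 *		return -errno;
 *
 * For IOMMU_OPTION_OP_GET the kernel writes the current value back into
 * val64, which is why the handler ends with a copy_to_user() of val64 only.
 */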
union ucmd_buffer {
	struct iommu_destroy destroy;
	struct iommu_hw_info info;
	struct iommu_hwpt_alloc hwpt;
	struct iommu_hwpt_get_dirty_bitmap get_dirty_bitmap;
	struct iommu_hwpt_set_dirty_tracking set_dirty_tracking;
	struct iommu_ioas_alloc alloc;
	struct iommu_ioas_allow_iovas allow_iovas;
	struct iommu_ioas_copy ioas_copy;
	struct iommu_ioas_iova_ranges iova_ranges;
	struct iommu_ioas_map map;
	struct iommu_ioas_unmap unmap;
	struct iommu_option option;
	struct iommu_vfio_ioas vfio_ioas;
#ifdef CONFIG_IOMMUFD_TEST
	struct iommu_test_cmd test;
#endif
};

struct iommufd_ioctl_op {
	unsigned int size;
	unsigned int min_size;
	unsigned int ioctl_num;
	int (*execute)(struct iommufd_ucmd *ucmd);
};

#define IOCTL_OP(_ioctl, _fn, _struct, _last)                                  \
	[_IOC_NR(_ioctl) - IOMMUFD_CMD_BASE] = {                               \
		.size = sizeof(_struct) +                                      \
			BUILD_BUG_ON_ZERO(sizeof(union ucmd_buffer) <          \
					  sizeof(_struct)),                    \
		.min_size = offsetofend(_struct, _last),                       \
		.ioctl_num = _ioctl,                                           \
		.execute = _fn,                                                \
	}
static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = {
	IOCTL_OP(IOMMU_DESTROY, iommufd_destroy, struct iommu_destroy, id),
	IOCTL_OP(IOMMU_GET_HW_INFO, iommufd_get_hw_info, struct iommu_hw_info,
		 __reserved),
	IOCTL_OP(IOMMU_HWPT_ALLOC, iommufd_hwpt_alloc, struct iommu_hwpt_alloc,
		 __reserved),
	IOCTL_OP(IOMMU_HWPT_GET_DIRTY_BITMAP, iommufd_hwpt_get_dirty_bitmap,
		 struct iommu_hwpt_get_dirty_bitmap, data),
	IOCTL_OP(IOMMU_HWPT_SET_DIRTY_TRACKING, iommufd_hwpt_set_dirty_tracking,
		 struct iommu_hwpt_set_dirty_tracking, __reserved),
	IOCTL_OP(IOMMU_IOAS_ALLOC, iommufd_ioas_alloc_ioctl,
		 struct iommu_ioas_alloc, out_ioas_id),
	IOCTL_OP(IOMMU_IOAS_ALLOW_IOVAS, iommufd_ioas_allow_iovas,
		 struct iommu_ioas_allow_iovas, allowed_iovas),
	IOCTL_OP(IOMMU_IOAS_COPY, iommufd_ioas_copy, struct iommu_ioas_copy,
		 src_iova),
	IOCTL_OP(IOMMU_IOAS_IOVA_RANGES, iommufd_ioas_iova_ranges,
		 struct iommu_ioas_iova_ranges, out_iova_alignment),
	IOCTL_OP(IOMMU_IOAS_MAP, iommufd_ioas_map, struct iommu_ioas_map,
		 iova),
	IOCTL_OP(IOMMU_IOAS_UNMAP, iommufd_ioas_unmap, struct iommu_ioas_unmap,
		 length),
	IOCTL_OP(IOMMU_OPTION, iommufd_option, struct iommu_option,
		 val64),
	IOCTL_OP(IOMMU_VFIO_IOAS, iommufd_vfio_ioas, struct iommu_vfio_ioas,
		 __reserved),
#ifdef CONFIG_IOMMUFD_TEST
	IOCTL_OP(IOMMU_TEST_CMD, iommufd_test, struct iommu_test_cmd, last),
#endif
};

static long iommufd_fops_ioctl(struct file *filp, unsigned int cmd,
			       unsigned long arg)
{
	struct iommufd_ctx *ictx = filp->private_data;
	const struct iommufd_ioctl_op *op;
	struct iommufd_ucmd ucmd = {};
	union ucmd_buffer buf;
	unsigned int nr;
	int ret;

	nr = _IOC_NR(cmd);
	if (nr < IOMMUFD_CMD_BASE ||
	    (nr - IOMMUFD_CMD_BASE) >= ARRAY_SIZE(iommufd_ioctl_ops))
		return iommufd_vfio_ioctl(ictx, cmd, arg);

	ucmd.ictx = ictx;
	ucmd.ubuffer = (void __user *)arg;
	ret = get_user(ucmd.user_size, (u32 __user *)ucmd.ubuffer);
	if (ret)
		return ret;

	op = &iommufd_ioctl_ops[nr - IOMMUFD_CMD_BASE];
	if (op->ioctl_num != cmd)
		return -ENOIOCTLCMD;
	if (ucmd.user_size < op->min_size)
		return -EINVAL;

	ucmd.cmd = &buf;
	ret = copy_struct_from_user(ucmd.cmd, op->size, ucmd.ubuffer,
				    ucmd.user_size);
	if (ret)
		return ret;
	ret = op->execute(&ucmd);
	return ret;
}

static const struct file_operations iommufd_fops = {
	.owner = THIS_MODULE,
	.open = iommufd_fops_open,
	.release = iommufd_fops_release,
	.unlocked_ioctl = iommufd_fops_ioctl,
};

/**
 * iommufd_ctx_get - Get a context reference
 * @ictx: Context to get
 *
 * The caller must already hold a valid reference to ictx.
 */
void iommufd_ctx_get(struct iommufd_ctx *ictx)
{
	get_file(ictx->file);
}
EXPORT_SYMBOL_NS_GPL(iommufd_ctx_get, IOMMUFD);

/**
 * iommufd_ctx_from_file - Acquires a reference to the iommufd context
 * @file: File to obtain the reference from
 *
 * Returns a pointer to the iommufd_ctx, otherwise ERR_PTR. The struct file
 * remains owned by the caller and the caller must still do fput. On success
 * the caller is responsible for calling iommufd_ctx_put().
 */
struct iommufd_ctx *iommufd_ctx_from_file(struct file *file)
{
	struct iommufd_ctx *ictx;

	if (file->f_op != &iommufd_fops)
		return ERR_PTR(-EBADFD);
	ictx = file->private_data;
	iommufd_ctx_get(ictx);
	return ictx;
}
EXPORT_SYMBOL_NS_GPL(iommufd_ctx_from_file, IOMMUFD);

/**
 * iommufd_ctx_from_fd - Acquires a reference to the iommufd context
 * @fd: File descriptor to obtain the reference from
 *
 * Returns a pointer to the iommufd_ctx, otherwise ERR_PTR. On success
 * the caller is responsible for calling iommufd_ctx_put().
 */
struct iommufd_ctx *iommufd_ctx_from_fd(int fd)
{
	struct file *file;

	file = fget(fd);
	if (!file)
		return ERR_PTR(-EBADF);

	if (file->f_op != &iommufd_fops) {
		fput(file);
		return ERR_PTR(-EBADFD);
	}
	/* fget is the same as iommufd_ctx_get() */
	return file->private_data;
}
EXPORT_SYMBOL_NS_GPL(iommufd_ctx_from_fd, IOMMUFD);

/**
 * iommufd_ctx_put - Put back a reference
 * @ictx: Context to put back
 */
void iommufd_ctx_put(struct iommufd_ctx *ictx)
{
	fput(ictx->file);
}
EXPORT_SYMBOL_NS_GPL(iommufd_ctx_put, IOMMUFD);
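/*
 * A minimal sketch of how an external kernel driver (e.g. a VFIO-style driver)
 * is expected to use the exported context interfaces above when userspace
 * hands it an iommufd file descriptor; the fd variable and the work in between
 * are illustrative:
 *
 *	struct iommufd_ctx *ictx;
 *
 *	ictx = iommufd_ctx_from_fd(fd);
 *	if (IS_ERR(ictx))
 *		return PTR_ERR(ictx);
 *	... bind devices / create accesses against ictx ...
 *	iommufd_ctx_put(ictx);
 *
 * The reference pins the underlying struct file, so the context stays alive
 * even if userspace closes its fd before the driver calls iommufd_ctx_put().
 */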
static const struct iommufd_object_ops iommufd_object_ops[] = {
	[IOMMUFD_OBJ_ACCESS] = {
		.destroy = iommufd_access_destroy_object,
	},
	[IOMMUFD_OBJ_DEVICE] = {
		.destroy = iommufd_device_destroy,
	},
	[IOMMUFD_OBJ_IOAS] = {
		.destroy = iommufd_ioas_destroy,
	},
	[IOMMUFD_OBJ_HWPT_PAGING] = {
		.destroy = iommufd_hwpt_paging_destroy,
		.abort = iommufd_hwpt_paging_abort,
	},
	[IOMMUFD_OBJ_HWPT_NESTED] = {
		.destroy = iommufd_hwpt_nested_destroy,
		.abort = iommufd_hwpt_nested_abort,
	},
#ifdef CONFIG_IOMMUFD_TEST
	[IOMMUFD_OBJ_SELFTEST] = {
		.destroy = iommufd_selftest_destroy,
	},
#endif
};

static struct miscdevice iommu_misc_dev = {
	.minor = MISC_DYNAMIC_MINOR,
	.name = "iommu",
	.fops = &iommufd_fops,
	.nodename = "iommu",
	.mode = 0660,
};

static struct miscdevice vfio_misc_dev = {
	.minor = VFIO_MINOR,
	.name = "vfio",
	.fops = &iommufd_fops,
	.nodename = "vfio/vfio",
	.mode = 0666,
};

static int __init iommufd_init(void)
{
	int ret;

	ret = misc_register(&iommu_misc_dev);
	if (ret)
		return ret;

	if (IS_ENABLED(CONFIG_IOMMUFD_VFIO_CONTAINER)) {
		ret = misc_register(&vfio_misc_dev);
		if (ret)
			goto err_misc;
	}
	ret = iommufd_test_init();
	if (ret)
		goto err_vfio_misc;
	return 0;

err_vfio_misc:
	if (IS_ENABLED(CONFIG_IOMMUFD_VFIO_CONTAINER))
		misc_deregister(&vfio_misc_dev);
err_misc:
	misc_deregister(&iommu_misc_dev);
	return ret;
}

static void __exit iommufd_exit(void)
{
	iommufd_test_exit();
	if (IS_ENABLED(CONFIG_IOMMUFD_VFIO_CONTAINER))
		misc_deregister(&vfio_misc_dev);
	misc_deregister(&iommu_misc_dev);
}

module_init(iommufd_init);
module_exit(iommufd_exit);

#if IS_ENABLED(CONFIG_IOMMUFD_VFIO_CONTAINER)
MODULE_ALIAS_MISCDEV(VFIO_MINOR);
MODULE_ALIAS("devname:vfio/vfio");
#endif
MODULE_IMPORT_NS(IOMMUFD_INTERNAL);
MODULE_IMPORT_NS(IOMMUFD);
MODULE_DESCRIPTION("I/O Address Space Management for passthrough devices");
MODULE_LICENSE("GPL");