// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES
 */
#include <linux/file.h>
#include <linux/interval_tree.h>
#include <linux/iommu.h>
#include <linux/iommufd.h>
#include <linux/slab.h>
#include <linux/vfio.h>
#include <uapi/linux/vfio.h>
#include <uapi/linux/iommufd.h>

#include "iommufd_private.h"

static struct iommufd_ioas *get_compat_ioas(struct iommufd_ctx *ictx)
{
	struct iommufd_ioas *ioas = ERR_PTR(-ENODEV);

	xa_lock(&ictx->objects);
	if (!ictx->vfio_ioas || !iommufd_lock_obj(&ictx->vfio_ioas->obj))
		goto out_unlock;
	ioas = ictx->vfio_ioas;
out_unlock:
	xa_unlock(&ictx->objects);
	return ioas;
}

/**
 * iommufd_vfio_compat_ioas_get_id - Return the ID of the compatibility IOAS
 * @ictx: Context to operate on
 * @out_ioas_id: The IOAS ID of the compatibility IOAS
 *
 * Return the ID of the current compatibility IOAS. The ID can be passed into
 * other functions that take an ioas_id.
 */
int iommufd_vfio_compat_ioas_get_id(struct iommufd_ctx *ictx, u32 *out_ioas_id)
{
	struct iommufd_ioas *ioas;

	ioas = get_compat_ioas(ictx);
	if (IS_ERR(ioas))
		return PTR_ERR(ioas);
	*out_ioas_id = ioas->obj.id;
	iommufd_put_object(ictx, &ioas->obj);
	return 0;
}
EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_ioas_get_id, IOMMUFD_VFIO);

/**
 * iommufd_vfio_compat_set_no_iommu - Called when a no-iommu device is attached
 * @ictx: Context to operate on
 *
 * This allows selecting VFIO_NOIOMMU_IOMMU and blocks the normal types.
 */
int iommufd_vfio_compat_set_no_iommu(struct iommufd_ctx *ictx)
{
	int ret;

	xa_lock(&ictx->objects);
	if (!ictx->vfio_ioas) {
		ictx->no_iommu_mode = 1;
		ret = 0;
	} else {
		ret = -EINVAL;
	}
	xa_unlock(&ictx->objects);
	return ret;
}
EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_set_no_iommu, IOMMUFD_VFIO);

/**
 * iommufd_vfio_compat_ioas_create - Ensure the compat IOAS is created
 * @ictx: Context to operate on
 *
 * The compatibility IOAS is the IOAS that the VFIO compatibility ioctls operate
 * on since they do not have an IOAS ID input in their ABI. Only attaching a
 * group should cause a default creation of the internal IOAS; this does nothing
 * if an existing IOAS has already been assigned somehow.
 */
int iommufd_vfio_compat_ioas_create(struct iommufd_ctx *ictx)
{
	struct iommufd_ioas *ioas = NULL;
	int ret;

	ioas = iommufd_ioas_alloc(ictx);
	if (IS_ERR(ioas))
		return PTR_ERR(ioas);

	xa_lock(&ictx->objects);
	/*
	 * VFIO won't allow attaching a container to both iommu and no iommu
	 * operation
	 */
	if (ictx->no_iommu_mode) {
		ret = -EINVAL;
		goto out_abort;
	}

	if (ictx->vfio_ioas && iommufd_lock_obj(&ictx->vfio_ioas->obj)) {
		ret = 0;
		iommufd_put_object(ictx, &ictx->vfio_ioas->obj);
		goto out_abort;
	}
	ictx->vfio_ioas = ioas;
	xa_unlock(&ictx->objects);

	/*
	 * An automatically created compat IOAS is treated as a userspace
	 * created object. Userspace can learn the ID via IOMMU_VFIO_IOAS_GET,
	 * and if not manually destroyed it will be destroyed automatically
	 * at iommufd release.
	 */
	iommufd_object_finalize(ictx, &ioas->obj);
	return 0;

out_abort:
	xa_unlock(&ictx->objects);
	iommufd_object_abort(ictx, &ioas->obj);
	return ret;
}
EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_ioas_create, IOMMUFD_VFIO);

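/*
 * Example (illustrative sketch, not part of this driver): how userspace might
 * learn the compat IOAS ID through the iommufd native interface, assuming the
 * struct iommu_vfio_ioas / IOMMU_VFIO_IOAS definitions from <linux/iommufd.h>,
 * the usual <sys/ioctl.h>/<err.h> helpers, and an already opened iommufd file
 * descriptor "iommufd":
 *
 *	struct iommu_vfio_ioas cmd = {
 *		.size = sizeof(cmd),
 *		.op = IOMMU_VFIO_IOAS_GET,
 *	};
 *
 *	if (ioctl(iommufd, IOMMU_VFIO_IOAS, &cmd))
 *		err(1, "IOMMU_VFIO_IOAS_GET failed");
 *	printf("compat IOAS id = %u\n", cmd.ioas_id);
 *
 * The returned ID can then be passed to other iommufd ioctls that take an
 * ioas_id.
 */
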
int iommufd_vfio_ioas(struct iommufd_ucmd *ucmd)
{
	struct iommu_vfio_ioas *cmd = ucmd->cmd;
	struct iommufd_ioas *ioas;

	if (cmd->__reserved)
		return -EOPNOTSUPP;
	switch (cmd->op) {
	case IOMMU_VFIO_IOAS_GET:
		ioas = get_compat_ioas(ucmd->ictx);
		if (IS_ERR(ioas))
			return PTR_ERR(ioas);
		cmd->ioas_id = ioas->obj.id;
		iommufd_put_object(ucmd->ictx, &ioas->obj);
		return iommufd_ucmd_respond(ucmd, sizeof(*cmd));

	case IOMMU_VFIO_IOAS_SET:
		ioas = iommufd_get_ioas(ucmd->ictx, cmd->ioas_id);
		if (IS_ERR(ioas))
			return PTR_ERR(ioas);
		xa_lock(&ucmd->ictx->objects);
		ucmd->ictx->vfio_ioas = ioas;
		xa_unlock(&ucmd->ictx->objects);
		iommufd_put_object(ucmd->ictx, &ioas->obj);
		return 0;

	case IOMMU_VFIO_IOAS_CLEAR:
		xa_lock(&ucmd->ictx->objects);
		ucmd->ictx->vfio_ioas = NULL;
		xa_unlock(&ucmd->ictx->objects);
		return 0;
	default:
		return -EOPNOTSUPP;
	}
}

static int iommufd_vfio_map_dma(struct iommufd_ctx *ictx, unsigned int cmd,
				void __user *arg)
{
	u32 supported_flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
	size_t minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
	struct vfio_iommu_type1_dma_map map;
	int iommu_prot = IOMMU_CACHE;
	struct iommufd_ioas *ioas;
	unsigned long iova;
	int rc;

	if (copy_from_user(&map, arg, minsz))
		return -EFAULT;

	if (map.argsz < minsz || map.flags & ~supported_flags)
		return -EINVAL;

	if (map.flags & VFIO_DMA_MAP_FLAG_READ)
		iommu_prot |= IOMMU_READ;
	if (map.flags & VFIO_DMA_MAP_FLAG_WRITE)
		iommu_prot |= IOMMU_WRITE;

	ioas = get_compat_ioas(ictx);
	if (IS_ERR(ioas))
		return PTR_ERR(ioas);

	/*
	 * Maps created through the legacy interface always use VFIO-compatible
	 * rlimit accounting. If the user wishes to use the faster user-based
	 * rlimit accounting then they must use the new interface.
	 */
	iova = map.iova;
	rc = iopt_map_user_pages(ictx, &ioas->iopt, &iova,
				 u64_to_user_ptr(map.vaddr), map.size,
				 iommu_prot, 0);
	iommufd_put_object(ictx, &ioas->obj);
	return rc;
}

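/*
 * Example (illustrative sketch, not part of this driver): how userspace might
 * map an anonymous buffer through the legacy interface, assuming the VFIO
 * type1 uapi from <linux/vfio.h> and a file descriptor "container" that
 * iommufd is servicing in compatibility mode:
 *
 *	void *buf = mmap(NULL, 1 << 20, PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	struct vfio_iommu_type1_dma_map map = {
 *		.argsz = sizeof(map),
 *		.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
 *		.vaddr = (uintptr_t)buf,
 *		.iova = 0x100000,
 *		.size = 1 << 20,
 *	};
 *
 *	if (ioctl(container, VFIO_IOMMU_MAP_DMA, &map))
 *		err(1, "VFIO_IOMMU_MAP_DMA failed");
 *
 * The iova/size pair must respect the page sizes reported by
 * VFIO_IOMMU_GET_INFO.
 */
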
static int iommufd_vfio_unmap_dma(struct iommufd_ctx *ictx, unsigned int cmd,
				  void __user *arg)
{
	size_t minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);
	/*
	 * VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP is obsoleted by the new
	 * dirty tracking direction:
	 * https://lore.kernel.org/kvm/20220731125503.142683-1-yishaih@nvidia.com/
	 * https://lore.kernel.org/kvm/20220428210933.3583-1-joao.m.martins@oracle.com/
	 */
	u32 supported_flags = VFIO_DMA_UNMAP_FLAG_ALL;
	struct vfio_iommu_type1_dma_unmap unmap;
	unsigned long unmapped = 0;
	struct iommufd_ioas *ioas;
	int rc;

	if (copy_from_user(&unmap, arg, minsz))
		return -EFAULT;

	if (unmap.argsz < minsz || unmap.flags & ~supported_flags)
		return -EINVAL;

	ioas = get_compat_ioas(ictx);
	if (IS_ERR(ioas))
		return PTR_ERR(ioas);

	if (unmap.flags & VFIO_DMA_UNMAP_FLAG_ALL) {
		if (unmap.iova != 0 || unmap.size != 0) {
			rc = -EINVAL;
			goto err_put;
		}
		rc = iopt_unmap_all(&ioas->iopt, &unmapped);
	} else {
		if (READ_ONCE(ioas->iopt.disable_large_pages)) {
			/*
			 * Create cuts at the start and end of the requested
			 * range. If the start IOVA is 0 then it doesn't need to
			 * be cut.
			 */
			unsigned long iovas[] = { unmap.iova + unmap.size - 1,
						  unmap.iova - 1 };

			rc = iopt_cut_iova(&ioas->iopt, iovas,
					   unmap.iova ? 2 : 1);
			if (rc)
				goto err_put;
		}
		rc = iopt_unmap_iova(&ioas->iopt, unmap.iova, unmap.size,
				     &unmapped);
	}
	unmap.size = unmapped;
	if (copy_to_user(arg, &unmap, minsz))
		rc = -EFAULT;

err_put:
	iommufd_put_object(ictx, &ioas->obj);
	return rc;
}

static int iommufd_vfio_cc_iommu(struct iommufd_ctx *ictx)
{
	struct iommufd_hwpt_paging *hwpt_paging;
	struct iommufd_ioas *ioas;
	int rc = 1;

	ioas = get_compat_ioas(ictx);
	if (IS_ERR(ioas))
		return PTR_ERR(ioas);

	mutex_lock(&ioas->mutex);
	list_for_each_entry(hwpt_paging, &ioas->hwpt_list, hwpt_item) {
		if (!hwpt_paging->enforce_cache_coherency) {
			rc = 0;
			break;
		}
	}
	mutex_unlock(&ioas->mutex);

	iommufd_put_object(ictx, &ioas->obj);
	return rc;
}

static int iommufd_vfio_check_extension(struct iommufd_ctx *ictx,
					unsigned long type)
{
	switch (type) {
	case VFIO_TYPE1_IOMMU:
	case VFIO_TYPE1v2_IOMMU:
	case VFIO_UNMAP_ALL:
		return 1;

	case VFIO_NOIOMMU_IOMMU:
		return IS_ENABLED(CONFIG_VFIO_NOIOMMU);

	case VFIO_DMA_CC_IOMMU:
		return iommufd_vfio_cc_iommu(ictx);

	case __VFIO_RESERVED_TYPE1_NESTING_IOMMU:
		return 0;

	/*
	 * VFIO_DMA_MAP_FLAG_VADDR
	 * https://lore.kernel.org/kvm/1611939252-7240-1-git-send-email-steven.sistare@oracle.com/
	 * https://lore.kernel.org/all/Yz777bJZjTyLrHEQ@nvidia.com/
	 *
	 * It is hard to see how this could be implemented safely.
	 */
	case VFIO_UPDATE_VADDR:
	default:
		return 0;
	}
}

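/*
 * Example (illustrative sketch, not part of this driver): how userspace might
 * tear down the mapping created above, or remove everything at once with
 * VFIO_DMA_UNMAP_FLAG_ALL, assuming the same "container" fd as before:
 *
 *	struct vfio_iommu_type1_dma_unmap unmap = {
 *		.argsz = sizeof(unmap),
 *		.iova = 0x100000,
 *		.size = 1 << 20,
 *	};
 *
 *	if (ioctl(container, VFIO_IOMMU_UNMAP_DMA, &unmap))
 *		err(1, "VFIO_IOMMU_UNMAP_DMA failed");
 *	(on return, unmap.size holds the number of bytes actually unmapped)
 *
 *	struct vfio_iommu_type1_dma_unmap unmap_all = {
 *		.argsz = sizeof(unmap_all),
 *		.flags = VFIO_DMA_UNMAP_FLAG_ALL,
 *	};
 *	if (ioctl(container, VFIO_IOMMU_UNMAP_DMA, &unmap_all))
 *		err(1, "VFIO_DMA_UNMAP_FLAG_ALL failed");
 */
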
static int iommufd_vfio_set_iommu(struct iommufd_ctx *ictx, unsigned long type)
{
	bool no_iommu_mode = READ_ONCE(ictx->no_iommu_mode);
	struct iommufd_ioas *ioas = NULL;
	int rc = 0;

	/*
	 * Emulation for NOIOMMU is imperfect in that VFIO blocks almost all
	 * other ioctls. We let them keep working, but they mostly fail since no
	 * IOAS should exist.
	 */
	if (IS_ENABLED(CONFIG_VFIO_NOIOMMU) && type == VFIO_NOIOMMU_IOMMU &&
	    no_iommu_mode) {
		if (!capable(CAP_SYS_RAWIO))
			return -EPERM;
		return 0;
	}

	if ((type != VFIO_TYPE1_IOMMU && type != VFIO_TYPE1v2_IOMMU) ||
	    no_iommu_mode)
		return -EINVAL;

	/* VFIO fails the set_iommu if there is no group */
	ioas = get_compat_ioas(ictx);
	if (IS_ERR(ioas))
		return PTR_ERR(ioas);

	/*
	 * The difference between TYPE1 and TYPE1v2 is the ability to unmap in
	 * the middle of mapped ranges. This is complicated by huge page support,
	 * which creates single large IOPTEs that cannot be split by the iommu
	 * driver. TYPE1 is very old at this point and likely nothing uses it,
	 * but it is simple enough to emulate by disabling the problematic large
	 * IOPTEs. Then we can safely unmap within any range.
	 */
	if (type == VFIO_TYPE1_IOMMU)
		rc = iopt_disable_large_pages(&ioas->iopt);
	iommufd_put_object(ictx, &ioas->obj);
	return rc;
}

static unsigned long iommufd_get_pagesizes(struct iommufd_ioas *ioas)
{
	struct io_pagetable *iopt = &ioas->iopt;
	unsigned long pgsize_bitmap = ULONG_MAX;
	struct iommu_domain *domain;
	unsigned long index;

	down_read(&iopt->domains_rwsem);
	xa_for_each(&iopt->domains, index, domain)
		pgsize_bitmap &= domain->pgsize_bitmap;

	/* See vfio_update_pgsize_bitmap() */
	if (pgsize_bitmap & ~PAGE_MASK) {
		pgsize_bitmap &= PAGE_MASK;
		pgsize_bitmap |= PAGE_SIZE;
	}
	pgsize_bitmap = max(pgsize_bitmap, ioas->iopt.iova_alignment);
	up_read(&iopt->domains_rwsem);
	return pgsize_bitmap;
}

static int iommufd_fill_cap_iova(struct iommufd_ioas *ioas,
				 struct vfio_info_cap_header __user *cur,
				 size_t avail)
{
	struct vfio_iommu_type1_info_cap_iova_range __user *ucap_iovas =
		container_of(cur,
			     struct vfio_iommu_type1_info_cap_iova_range __user,
			     header);
	struct vfio_iommu_type1_info_cap_iova_range cap_iovas = {
		.header = {
			.id = VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE,
			.version = 1,
		},
	};
	struct interval_tree_span_iter span;

	interval_tree_for_each_span(&span, &ioas->iopt.reserved_itree, 0,
				    ULONG_MAX) {
		struct vfio_iova_range range;

		if (!span.is_hole)
			continue;
		range.start = span.start_hole;
		range.end = span.last_hole;
		if (avail >= struct_size(&cap_iovas, iova_ranges,
					 cap_iovas.nr_iovas + 1) &&
		    copy_to_user(&ucap_iovas->iova_ranges[cap_iovas.nr_iovas],
				 &range, sizeof(range)))
			return -EFAULT;
		cap_iovas.nr_iovas++;
	}
	if (avail >= struct_size(&cap_iovas, iova_ranges, cap_iovas.nr_iovas) &&
	    copy_to_user(ucap_iovas, &cap_iovas, sizeof(cap_iovas)))
		return -EFAULT;
	return struct_size(&cap_iovas, iova_ranges, cap_iovas.nr_iovas);
}

static int iommufd_fill_cap_dma_avail(struct iommufd_ioas *ioas,
				      struct vfio_info_cap_header __user *cur,
				      size_t avail)
{
	struct vfio_iommu_type1_info_dma_avail cap_dma = {
		.header = {
			.id = VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL,
			.version = 1,
		},
		/*
		 * iommufd's limit is based on the cgroup's memory limit.
		 * Normally vfio would return U16_MAX here, and provide a module
		 * parameter to adjust it. Since S390 qemu userspace actually
		 * pays attention and needs a value bigger than U16_MAX, return
		 * U32_MAX.
		 */
		.avail = U32_MAX,
	};

	if (avail >= sizeof(cap_dma) &&
	    copy_to_user(cur, &cap_dma, sizeof(cap_dma)))
		return -EFAULT;
	return sizeof(cap_dma);
}

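/*
 * Example (illustrative sketch, not part of this driver): the classic VFIO
 * container setup sequence that lands in iommufd_vfio_set_iommu() when iommufd
 * provides the compatibility container, assuming the same "container" fd:
 *
 *	if (ioctl(container, VFIO_GET_API_VERSION) != VFIO_API_VERSION)
 *		errx(1, "unexpected VFIO API version");
 *	if (!ioctl(container, VFIO_CHECK_EXTENSION, VFIO_TYPE1v2_IOMMU))
 *		errx(1, "TYPE1v2 not supported");
 *	if (ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1v2_IOMMU))
 *		err(1, "VFIO_SET_IOMMU failed");
 *
 * Selecting VFIO_TYPE1_IOMMU instead would additionally disable large IOPTEs
 * so that unmaps inside mapped ranges keep working.
 */
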
static int iommufd_vfio_iommu_get_info(struct iommufd_ctx *ictx,
				       void __user *arg)
{
	typedef int (*fill_cap_fn)(struct iommufd_ioas *ioas,
				   struct vfio_info_cap_header __user *cur,
				   size_t avail);
	static const fill_cap_fn fill_fns[] = {
		iommufd_fill_cap_dma_avail,
		iommufd_fill_cap_iova,
	};
	size_t minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);
	struct vfio_info_cap_header __user *last_cap = NULL;
	struct vfio_iommu_type1_info info = {};
	struct iommufd_ioas *ioas;
	size_t total_cap_size;
	int rc;
	int i;

	if (copy_from_user(&info, arg, minsz))
		return -EFAULT;

	if (info.argsz < minsz)
		return -EINVAL;
	minsz = min_t(size_t, info.argsz, sizeof(info));

	ioas = get_compat_ioas(ictx);
	if (IS_ERR(ioas))
		return PTR_ERR(ioas);

	info.flags = VFIO_IOMMU_INFO_PGSIZES;
	info.iova_pgsizes = iommufd_get_pagesizes(ioas);
	info.cap_offset = 0;

	down_read(&ioas->iopt.iova_rwsem);
	total_cap_size = sizeof(info);
	for (i = 0; i != ARRAY_SIZE(fill_fns); i++) {
		int cap_size;

		if (info.argsz > total_cap_size)
			cap_size = fill_fns[i](ioas, arg + total_cap_size,
					       info.argsz - total_cap_size);
		else
			cap_size = fill_fns[i](ioas, NULL, 0);
		if (cap_size < 0) {
			rc = cap_size;
			goto out_put;
		}
		cap_size = ALIGN(cap_size, sizeof(u64));

		if (last_cap && info.argsz >= total_cap_size &&
		    put_user(total_cap_size, &last_cap->next)) {
			rc = -EFAULT;
			goto out_put;
		}
		last_cap = arg + total_cap_size;
		total_cap_size += cap_size;
	}

	/*
	 * If the user did not provide enough space then only some caps are
	 * returned and the argsz will be updated to the correct amount to get
	 * all caps.
	 */
	if (info.argsz >= total_cap_size)
		info.cap_offset = sizeof(info);
	info.argsz = total_cap_size;
	info.flags |= VFIO_IOMMU_INFO_CAPS;
	if (copy_to_user(arg, &info, minsz)) {
		rc = -EFAULT;
		goto out_put;
	}
	rc = 0;

out_put:
	up_read(&ioas->iopt.iova_rwsem);
	iommufd_put_object(ictx, &ioas->obj);
	return rc;
}

int iommufd_vfio_ioctl(struct iommufd_ctx *ictx, unsigned int cmd,
		       unsigned long arg)
{
	void __user *uarg = (void __user *)arg;

	switch (cmd) {
	case VFIO_GET_API_VERSION:
		return VFIO_API_VERSION;
	case VFIO_SET_IOMMU:
		return iommufd_vfio_set_iommu(ictx, arg);
	case VFIO_CHECK_EXTENSION:
		return iommufd_vfio_check_extension(ictx, arg);
	case VFIO_IOMMU_GET_INFO:
		return iommufd_vfio_iommu_get_info(ictx, uarg);
	case VFIO_IOMMU_MAP_DMA:
		return iommufd_vfio_map_dma(ictx, cmd, uarg);
	case VFIO_IOMMU_UNMAP_DMA:
		return iommufd_vfio_unmap_dma(ictx, cmd, uarg);
	case VFIO_IOMMU_DIRTY_PAGES:
	default:
		return -ENOIOCTLCMD;
	}
	return -ENOIOCTLCMD;
}
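
/*
 * Example (illustrative sketch, not part of this driver): how userspace might
 * query the info structure and walk the capability chain built above, assuming
 * the same "container" fd. The first call discovers the required argsz, the
 * second retrieves the capabilities:
 *
 *	struct vfio_iommu_type1_info probe = { .argsz = sizeof(probe) };
 *
 *	if (ioctl(container, VFIO_IOMMU_GET_INFO, &probe))
 *		err(1, "VFIO_IOMMU_GET_INFO failed");
 *
 *	struct vfio_iommu_type1_info *info = calloc(1, probe.argsz);
 *	info->argsz = probe.argsz;
 *	if (ioctl(container, VFIO_IOMMU_GET_INFO, info))
 *		err(1, "VFIO_IOMMU_GET_INFO failed");
 *
 *	if (info->flags & VFIO_IOMMU_INFO_CAPS && info->cap_offset) {
 *		__u32 off = info->cap_offset;
 *
 *		while (off) {
 *			struct vfio_info_cap_header *hdr =
 *				(void *)info + off;
 *
 *			if (hdr->id == VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE)
 *				... parse the iova_range capability ...
 *			off = hdr->next;
 *		}
 *	}
 */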