// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright © 2015 Intel Corporation.
 *
 * Authors: David Woodhouse <dwmw2@infradead.org>
 */

#include <linux/intel-iommu.h>
#include <linux/mmu_notifier.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>
#include <linux/intel-svm.h>
#include <linux/rculist.h>
#include <linux/pci.h>
#include <linux/pci-ats.h>
#include <linux/dmar.h>
#include <linux/interrupt.h>
#include <linux/mm_types.h>
#include <linux/xarray.h>
#include <linux/ioasid.h>
#include <asm/page.h>
#include <asm/fpu/api.h>
#include <trace/events/intel_iommu.h>

#include "pasid.h"
#include "perf.h"
#include "../iommu-sva-lib.h"

static irqreturn_t prq_event_thread(int irq, void *d);
static void intel_svm_drain_prq(struct device *dev, u32 pasid);
#define to_intel_svm_dev(handle) container_of(handle, struct intel_svm_dev, sva)

#define PRQ_ORDER 0

static DEFINE_XARRAY_ALLOC(pasid_private_array);
static int pasid_private_add(ioasid_t pasid, void *priv)
{
	return xa_alloc(&pasid_private_array, &pasid, priv,
			XA_LIMIT(pasid, pasid), GFP_ATOMIC);
}

static void pasid_private_remove(ioasid_t pasid)
{
	xa_erase(&pasid_private_array, pasid);
}

static void *pasid_private_find(ioasid_t pasid)
{
	return xa_load(&pasid_private_array, pasid);
}

static struct intel_svm_dev *
svm_lookup_device_by_sid(struct intel_svm *svm, u16 sid)
{
	struct intel_svm_dev *sdev = NULL, *t;

	rcu_read_lock();
	list_for_each_entry_rcu(t, &svm->devs, list) {
		if (t->sid == sid) {
			sdev = t;
			break;
		}
	}
	rcu_read_unlock();

	return sdev;
}

static struct intel_svm_dev *
svm_lookup_device_by_dev(struct intel_svm *svm, struct device *dev)
{
	struct intel_svm_dev *sdev = NULL, *t;

	rcu_read_lock();
	list_for_each_entry_rcu(t, &svm->devs, list) {
		if (t->dev == dev) {
			sdev = t;
			break;
		}
	}
	rcu_read_unlock();

	return sdev;
}

int intel_svm_enable_prq(struct intel_iommu *iommu)
{
	struct iopf_queue *iopfq;
	struct page *pages;
	int irq, ret;

	pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, PRQ_ORDER);
	if (!pages) {
		pr_warn("IOMMU: %s: Failed to allocate page request queue\n",
			iommu->name);
		return -ENOMEM;
	}
	iommu->prq = page_address(pages);

	irq = dmar_alloc_hwirq(DMAR_UNITS_SUPPORTED + iommu->seq_id, iommu->node, iommu);
	if (irq <= 0) {
		pr_err("IOMMU: %s: Failed to create IRQ vector for page request queue\n",
		       iommu->name);
		ret = -EINVAL;
		goto free_prq;
	}
	iommu->pr_irq = irq;

	snprintf(iommu->iopfq_name, sizeof(iommu->iopfq_name),
		 "dmar%d-iopfq", iommu->seq_id);
	iopfq = iopf_queue_alloc(iommu->iopfq_name);
	if (!iopfq) {
		pr_err("IOMMU: %s: Failed to allocate iopf queue\n", iommu->name);
		ret = -ENOMEM;
		goto free_hwirq;
	}
	iommu->iopf_queue = iopfq;

	snprintf(iommu->prq_name, sizeof(iommu->prq_name), "dmar%d-prq", iommu->seq_id);

	ret = request_threaded_irq(irq, NULL, prq_event_thread, IRQF_ONESHOT,
				   iommu->prq_name, iommu);
	if (ret) {
		pr_err("IOMMU: %s: Failed to request IRQ for page request queue\n",
		       iommu->name);
		goto free_iopfq;
	}
	dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL);
	dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL);
	dmar_writeq(iommu->reg + DMAR_PQA_REG, virt_to_phys(iommu->prq) | PRQ_ORDER);

	init_completion(&iommu->prq_complete);

	return 0;

free_iopfq:
	iopf_queue_free(iommu->iopf_queue);
	iommu->iopf_queue = NULL;
free_hwirq:
	dmar_free_hwirq(irq);
	iommu->pr_irq = 0;
free_prq:
	free_pages((unsigned long)iommu->prq, PRQ_ORDER);
	iommu->prq = NULL;

	return ret;
}

int intel_svm_finish_prq(struct intel_iommu *iommu)
{
	dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL);
	dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL);
	dmar_writeq(iommu->reg + DMAR_PQA_REG, 0ULL);

	if (iommu->pr_irq) {
		free_irq(iommu->pr_irq, iommu);
		dmar_free_hwirq(iommu->pr_irq);
		iommu->pr_irq = 0;
	}

	if (iommu->iopf_queue) {
		iopf_queue_free(iommu->iopf_queue);
		iommu->iopf_queue = NULL;
	}

	free_pages((unsigned long)iommu->prq, PRQ_ORDER);
	iommu->prq = NULL;

	return 0;
}

static inline bool intel_svm_capable(struct intel_iommu *iommu)
{
	return iommu->flags & VTD_FLAG_SVM_CAPABLE;
}

void intel_svm_check(struct intel_iommu *iommu)
{
	if (!pasid_supported(iommu))
		return;

	if (cpu_feature_enabled(X86_FEATURE_GBPAGES) &&
	    !cap_fl1gp_support(iommu->cap)) {
		pr_err("%s SVM disabled, incompatible 1GB page capability\n",
		       iommu->name);
		return;
	}

	if (cpu_feature_enabled(X86_FEATURE_LA57) &&
	    !cap_5lp_support(iommu->cap)) {
		pr_err("%s SVM disabled, incompatible paging mode\n",
		       iommu->name);
		return;
	}

	iommu->flags |= VTD_FLAG_SVM_CAPABLE;
}

static void __flush_svm_range_dev(struct intel_svm *svm,
				  struct intel_svm_dev *sdev,
				  unsigned long address,
				  unsigned long pages, int ih)
{
	struct device_domain_info *info = get_domain_info(sdev->dev);

	if (WARN_ON(!pages))
		return;

	qi_flush_piotlb(sdev->iommu, sdev->did, svm->pasid, address, pages, ih);
	if (info->ats_enabled)
		qi_flush_dev_iotlb_pasid(sdev->iommu, sdev->sid, info->pfsid,
					 svm->pasid, sdev->qdep, address,
					 order_base_2(pages));
}

static void intel_flush_svm_range_dev(struct intel_svm *svm,
				      struct intel_svm_dev *sdev,
				      unsigned long address,
				      unsigned long pages, int ih)
{
	unsigned long shift = ilog2(__roundup_pow_of_two(pages));
	unsigned long align = (1ULL << (VTD_PAGE_SHIFT + shift));
	unsigned long start = ALIGN_DOWN(address, align);
	unsigned long end = ALIGN(address + (pages << VTD_PAGE_SHIFT), align);

	while (start < end) {
		__flush_svm_range_dev(svm, sdev, start, align >> VTD_PAGE_SHIFT, ih);
		start += align;
	}
}

static void intel_flush_svm_range(struct intel_svm *svm, unsigned long address,
				  unsigned long pages, int ih)
{
	struct intel_svm_dev *sdev;

	rcu_read_lock();
	list_for_each_entry_rcu(sdev, &svm->devs, list)
		intel_flush_svm_range_dev(svm, sdev, address, pages, ih);
	rcu_read_unlock();
}

/* Pages have been freed at this point */
static void intel_invalidate_range(struct mmu_notifier *mn,
				   struct mm_struct *mm,
				   unsigned long start, unsigned long end)
{
	struct intel_svm *svm = container_of(mn, struct intel_svm, notifier);

	intel_flush_svm_range(svm, start,
			      (end - start + PAGE_SIZE - 1) >> VTD_PAGE_SHIFT, 0);
}

static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
{
	struct intel_svm *svm = container_of(mn, struct intel_svm, notifier);
	struct intel_svm_dev *sdev;

	/* This might end up being called from exit_mmap(), *before* the page
	 * tables are cleared. And __mmu_notifier_release() will delete us from
	 * the list of notifiers so that our invalidate_range() callback doesn't
	 * get called when the page tables are cleared. So we need to protect
	 * against hardware accessing those page tables.
	 *
	 * We do it by clearing the entry in the PASID table and then flushing
	 * the IOTLB and the PASID table caches. This might upset hardware;
	 * perhaps we'll want to point the PASID to a dummy PGD (like the zero
	 * page) so that we end up taking a fault that the hardware really
	 * *has* to handle gracefully without affecting other processes.
	 */
	rcu_read_lock();
	list_for_each_entry_rcu(sdev, &svm->devs, list)
		intel_pasid_tear_down_entry(sdev->iommu, sdev->dev,
					    svm->pasid, true);
	rcu_read_unlock();
}

static const struct mmu_notifier_ops intel_mmuops = {
	.release = intel_mm_release,
	.invalidate_range = intel_invalidate_range,
};

static DEFINE_MUTEX(pasid_mutex);

static int pasid_to_svm_sdev(struct device *dev, unsigned int pasid,
			     struct intel_svm **rsvm,
			     struct intel_svm_dev **rsdev)
{
	struct intel_svm_dev *sdev = NULL;
	struct intel_svm *svm;

	/* The caller should hold the pasid_mutex lock */
	if (WARN_ON(!mutex_is_locked(&pasid_mutex)))
		return -EINVAL;

	if (pasid == INVALID_IOASID || pasid >= PASID_MAX)
		return -EINVAL;

	svm = pasid_private_find(pasid);
	if (IS_ERR(svm))
		return PTR_ERR(svm);

	if (!svm)
		goto out;

	/*
	 * If we found an svm for the PASID, there must be at least one
	 * bound device.
	 */
	if (WARN_ON(list_empty(&svm->devs)))
		return -EINVAL;
	sdev = svm_lookup_device_by_dev(svm, dev);

out:
	*rsvm = svm;
	*rsdev = sdev;

	return 0;
}

int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev,
			  struct iommu_gpasid_bind_data *data)
{
	struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
	struct intel_svm_dev *sdev = NULL;
	struct dmar_domain *dmar_domain;
	struct device_domain_info *info;
	struct intel_svm *svm = NULL;
	unsigned long iflags;
	int ret = 0;

	if (WARN_ON(!iommu) || !data)
		return -EINVAL;

	if (data->format != IOMMU_PASID_FORMAT_INTEL_VTD)
		return -EINVAL;

	/* IOMMU core ensures argsz is more than the start of the union */
	if (data->argsz < offsetofend(struct iommu_gpasid_bind_data, vendor.vtd))
		return -EINVAL;

	/* Make sure no undefined flags are used in vendor data */
	if (data->vendor.vtd.flags & ~(IOMMU_SVA_VTD_GPASID_LAST - 1))
		return -EINVAL;

	if (!dev_is_pci(dev))
		return -ENOTSUPP;

	/* VT-d supports devices with full 20 bit PASIDs only */
	if (pci_max_pasids(to_pci_dev(dev)) != PASID_MAX)
		return -EINVAL;

	/*
	 * We only check the host PASID range; we have no knowledge to check
	 * the guest PASID range.
	 */
	if (data->hpasid <= 0 || data->hpasid >= PASID_MAX)
		return -EINVAL;

	info = get_domain_info(dev);
	if (!info)
		return -EINVAL;

	dmar_domain = to_dmar_domain(domain);

	mutex_lock(&pasid_mutex);
	ret = pasid_to_svm_sdev(dev, data->hpasid, &svm, &sdev);
	if (ret)
		goto out;

	if (sdev) {
		/*
		 * Do not allow multiple bindings of the same device-PASID since
		 * there is only one SL page table per PASID. We may revisit
		 * this once sharing a PGD across domains is supported.
		 */
		dev_warn_ratelimited(dev, "Already bound with PASID %u\n",
				     svm->pasid);
		ret = -EBUSY;
		goto out;
	}

	if (!svm) {
		/* We come here when the PASID has never been bound to a device. */
		svm = kzalloc(sizeof(*svm), GFP_KERNEL);
		if (!svm) {
			ret = -ENOMEM;
			goto out;
		}
		/* REVISIT: upper layer/VFIO can track the host process that binds
		 * the PASID. ioasid_set = mm might be sufficient for vfio to
		 * check pasid VMM ownership. We can drop the following line
		 * once the VFIO and IOASID set check is in place.
		 */
		svm->mm = get_task_mm(current);
		svm->pasid = data->hpasid;
		if (data->flags & IOMMU_SVA_GPASID_VAL) {
			svm->gpasid = data->gpasid;
			svm->flags |= SVM_FLAG_GUEST_PASID;
		}
		pasid_private_add(data->hpasid, svm);
		INIT_LIST_HEAD_RCU(&svm->devs);
		mmput(svm->mm);
	}
	sdev = kzalloc(sizeof(*sdev), GFP_KERNEL);
	if (!sdev) {
		ret = -ENOMEM;
		goto out;
	}
	sdev->dev = dev;
	sdev->sid = PCI_DEVID(info->bus, info->devfn);
	sdev->iommu = iommu;

	/* Only count users if device has aux domains */
	if (iommu_dev_feature_enabled(dev, IOMMU_DEV_FEAT_AUX))
		sdev->users = 1;

	/* Set up device context entry for PASID if not enabled already */
	ret = intel_iommu_enable_pasid(iommu, sdev->dev);
	if (ret) {
		dev_err_ratelimited(dev, "Failed to enable PASID capability\n");
		kfree(sdev);
		goto out;
	}

	/*
	 * PASID table is per device for better security. Therefore, for
	 * each bind of a new device even with an existing PASID, we need to
	 * call the nested mode setup function here.
	 */
	spin_lock_irqsave(&iommu->lock, iflags);
	ret = intel_pasid_setup_nested(iommu, dev,
				       (pgd_t *)(uintptr_t)data->gpgd,
				       data->hpasid, &data->vendor.vtd, dmar_domain,
				       data->addr_width);
	spin_unlock_irqrestore(&iommu->lock, iflags);
	if (ret) {
		dev_err_ratelimited(dev, "Failed to set up PASID %llu in nested mode, Err %d\n",
				    data->hpasid, ret);
		/*
		 * The PASID entry should be in cleared state if nested mode
		 * setup failed. So we only need to clear the IOASID tracking
		 * data so that the free call will succeed.
		 */
		kfree(sdev);
		goto out;
	}

	svm->flags |= SVM_FLAG_GUEST_MODE;

	init_rcu_head(&sdev->rcu);
	list_add_rcu(&sdev->list, &svm->devs);
out:
	if (!IS_ERR_OR_NULL(svm) && list_empty(&svm->devs)) {
		pasid_private_remove(data->hpasid);
		kfree(svm);
	}

	mutex_unlock(&pasid_mutex);
	return ret;
}

int intel_svm_unbind_gpasid(struct device *dev, u32 pasid)
{
	struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
	struct intel_svm_dev *sdev;
	struct intel_svm *svm;
	int ret;

	if (WARN_ON(!iommu))
		return -EINVAL;

	mutex_lock(&pasid_mutex);
	ret = pasid_to_svm_sdev(dev, pasid, &svm, &sdev);
	if (ret)
		goto out;

	if (sdev) {
		if (iommu_dev_feature_enabled(dev, IOMMU_DEV_FEAT_AUX))
			sdev->users--;
		if (!sdev->users) {
			list_del_rcu(&sdev->list);
			intel_pasid_tear_down_entry(iommu, dev,
						    svm->pasid, false);
			intel_svm_drain_prq(dev, svm->pasid);
			kfree_rcu(sdev, rcu);

			if (list_empty(&svm->devs)) {
				/*
				 * We do not free the IOASID here because the
				 * IOMMU driver did not allocate it.
				 * Unlike native SVM, an IOASID for guest use is
				 * allocated prior to the bind call.
				 * In any case, if the free call comes before
				 * the unbind, the IOMMU driver will get notified
				 * and perform cleanup.
				 */
				pasid_private_remove(pasid);
				kfree(svm);
			}
		}
	}
out:
	mutex_unlock(&pasid_mutex);
	return ret;
}

static void _load_pasid(void *unused)
{
	update_pasid();
}

static void load_pasid(struct mm_struct *mm, u32 pasid)
{
	mutex_lock(&mm->context.lock);

	/* Synchronize with READ_ONCE in update_pasid(). */
	smp_store_release(&mm->pasid, pasid);

	/* Update PASID MSR on all CPUs running the mm's tasks. */
	on_each_cpu_mask(mm_cpumask(mm), _load_pasid, NULL, true);

	mutex_unlock(&mm->context.lock);
}

static int intel_svm_alloc_pasid(struct device *dev, struct mm_struct *mm,
				 unsigned int flags)
{
	ioasid_t max_pasid = dev_is_pci(dev) ?
			pci_max_pasids(to_pci_dev(dev)) : intel_pasid_max_id;

	return iommu_sva_alloc_pasid(mm, PASID_MIN, max_pasid - 1);
}

static void intel_svm_free_pasid(struct mm_struct *mm)
{
	iommu_sva_free_pasid(mm);
}

static struct iommu_sva *intel_svm_bind_mm(struct intel_iommu *iommu,
					   struct device *dev,
					   struct mm_struct *mm,
					   unsigned int flags)
{
	struct device_domain_info *info = get_domain_info(dev);
	unsigned long iflags, sflags;
	struct intel_svm_dev *sdev;
	struct intel_svm *svm;
	int ret = 0;

	svm = pasid_private_find(mm->pasid);
	if (!svm) {
		svm = kzalloc(sizeof(*svm), GFP_KERNEL);
		if (!svm)
			return ERR_PTR(-ENOMEM);

		svm->pasid = mm->pasid;
		svm->mm = mm;
		svm->flags = flags;
		INIT_LIST_HEAD_RCU(&svm->devs);

		if (!(flags & SVM_FLAG_SUPERVISOR_MODE)) {
			svm->notifier.ops = &intel_mmuops;
			ret = mmu_notifier_register(&svm->notifier, mm);
			if (ret) {
				kfree(svm);
				return ERR_PTR(ret);
			}
		}

		ret = pasid_private_add(svm->pasid, svm);
		if (ret) {
			if (svm->notifier.ops)
				mmu_notifier_unregister(&svm->notifier, mm);
			kfree(svm);
			return ERR_PTR(ret);
		}
	}

	/* Find the matching device in svm list */
	sdev = svm_lookup_device_by_dev(svm, dev);
	if (sdev) {
		sdev->users++;
		goto success;
	}

	sdev = kzalloc(sizeof(*sdev), GFP_KERNEL);
	if (!sdev) {
		ret = -ENOMEM;
		goto free_svm;
	}

	sdev->dev = dev;
	sdev->iommu = iommu;
	sdev->did = FLPT_DEFAULT_DID;
	sdev->sid = PCI_DEVID(info->bus, info->devfn);
	sdev->users = 1;
	sdev->pasid = svm->pasid;
	sdev->sva.dev = dev;
	init_rcu_head(&sdev->rcu);
	if (info->ats_enabled) {
		sdev->dev_iotlb = 1;
		sdev->qdep = info->ats_qdep;
		if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS)
			sdev->qdep = 0;
	}

	/* Setup the pasid table: */
	sflags = (flags & SVM_FLAG_SUPERVISOR_MODE) ?
			PASID_FLAG_SUPERVISOR_MODE : 0;
	sflags |= cpu_feature_enabled(X86_FEATURE_LA57) ? PASID_FLAG_FL5LP : 0;
	spin_lock_irqsave(&iommu->lock, iflags);
	ret = intel_pasid_setup_first_level(iommu, dev, mm->pgd, mm->pasid,
					    FLPT_DEFAULT_DID, sflags);
	spin_unlock_irqrestore(&iommu->lock, iflags);

	if (ret)
		goto free_sdev;

	/* The newly allocated pasid is loaded to the mm. */
	if (!(flags & SVM_FLAG_SUPERVISOR_MODE) && list_empty(&svm->devs))
		load_pasid(mm, svm->pasid);

	list_add_rcu(&sdev->list, &svm->devs);
success:
	return &sdev->sva;

free_sdev:
	kfree(sdev);
free_svm:
	if (list_empty(&svm->devs)) {
		if (svm->notifier.ops)
			mmu_notifier_unregister(&svm->notifier, mm);
		pasid_private_remove(mm->pasid);
		kfree(svm);
	}

	return ERR_PTR(ret);
}

/* Caller must hold pasid_mutex */
static int intel_svm_unbind_mm(struct device *dev, u32 pasid)
{
	struct intel_svm_dev *sdev;
	struct intel_iommu *iommu;
	struct intel_svm *svm;
	struct mm_struct *mm;
	int ret = -EINVAL;

	iommu = device_to_iommu(dev, NULL, NULL);
	if (!iommu)
		goto out;

	ret = pasid_to_svm_sdev(dev, pasid, &svm, &sdev);
	if (ret)
		goto out;
	mm = svm->mm;

	if (sdev) {
		sdev->users--;
		if (!sdev->users) {
			list_del_rcu(&sdev->list);
			/* Flush the PASID cache and IOTLB for this device.
			 * Note that we do depend on the hardware *not* using
			 * the PASID any more. Just as we depend on other
			 * devices never using PASIDs that they have no right
			 * to use. We have a *shared* PASID table, because it's
			 * large and has to be physically contiguous. So it's
			 * hard to be as defensive as we might like. */
			intel_pasid_tear_down_entry(iommu, dev,
						    svm->pasid, false);
			intel_svm_drain_prq(dev, svm->pasid);
			kfree_rcu(sdev, rcu);

			if (list_empty(&svm->devs)) {
				if (svm->notifier.ops) {
					mmu_notifier_unregister(&svm->notifier, mm);
					/* Clear mm's pasid. */
					load_pasid(mm, PASID_DISABLED);
				}
				pasid_private_remove(svm->pasid);
				/* We mandate that no page faults may be outstanding
				 * for the PASID when intel_svm_unbind_mm() is called.
				 * If that is not obeyed, subtle errors will happen.
				 * Let's make them less subtle... */
				memset(svm, 0x6b, sizeof(*svm));
				kfree(svm);
			}
		}
		/* Drop a PASID reference and free it if no reference. */
		intel_svm_free_pasid(mm);
	}
out:
	return ret;
}

/* Page request queue descriptor */
struct page_req_dsc {
	union {
		struct {
			u64 type:8;
			u64 pasid_present:1;
			u64 priv_data_present:1;
			u64 rsvd:6;
			u64 rid:16;
			u64 pasid:20;
			u64 exe_req:1;
			u64 pm_req:1;
			u64 rsvd2:10;
		};
		u64 qw_0;
	};
	union {
		struct {
			u64 rd_req:1;
			u64 wr_req:1;
			u64 lpig:1;
			u64 prg_index:9;
			u64 addr:52;
		};
		u64 qw_1;
	};
	u64 priv_data[2];
};

#define PRQ_RING_MASK	((0x1000 << PRQ_ORDER) - 0x20)

static bool is_canonical_address(u64 addr)
{
	int shift = 64 - (__VIRTUAL_MASK_SHIFT + 1);
	long saddr = (long) addr;

	return (((saddr << shift) >> shift) == saddr);
}

/**
 * intel_svm_drain_prq - Drain page requests and responses for a pasid
 * @dev: target device
 * @pasid: pasid for draining
 *
 * Drain all pending page requests and responses related to @pasid in both
 * software and hardware. This is supposed to be called after the device
 * driver has stopped DMA, the pasid entry has been cleared, and both IOTLB
 * and DevTLB have been invalidated.
 *
 * It waits until all pending page requests for @pasid in the page fault
 * queue are completed by the prq handling thread. Then follow the steps
 * described in VT-d spec CH7.10 to drain all page requests and page
 * responses pending in the hardware.
 */
static void intel_svm_drain_prq(struct device *dev, u32 pasid)
{
	struct device_domain_info *info;
	struct dmar_domain *domain;
	struct intel_iommu *iommu;
	struct qi_desc desc[3];
	struct pci_dev *pdev;
	int head, tail;
	u16 sid, did;
	int qdep;

	info = get_domain_info(dev);
	if (WARN_ON(!info || !dev_is_pci(dev)))
		return;

	if (!info->pri_enabled)
		return;

	iommu = info->iommu;
	domain = info->domain;
	pdev = to_pci_dev(dev);
	sid = PCI_DEVID(info->bus, info->devfn);
	did = domain->iommu_did[iommu->seq_id];
	qdep = pci_ats_queue_depth(pdev);

	/*
	 * Check and wait until all pending page requests in the queue are
	 * handled by the prq handling thread.
	 */
prq_retry:
	reinit_completion(&iommu->prq_complete);
	tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK;
	head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK;
	while (head != tail) {
		struct page_req_dsc *req;

		req = &iommu->prq[head / sizeof(*req)];
		if (!req->pasid_present || req->pasid != pasid) {
			head = (head + sizeof(*req)) & PRQ_RING_MASK;
			continue;
		}

		wait_for_completion(&iommu->prq_complete);
		goto prq_retry;
	}

	iopf_queue_flush_dev(dev);

	/*
	 * Perform steps described in VT-d spec CH7.10 to drain page
	 * requests and responses in hardware.
	 */
	memset(desc, 0, sizeof(desc));
	desc[0].qw0 = QI_IWD_STATUS_DATA(QI_DONE) |
			QI_IWD_FENCE |
			QI_IWD_TYPE;
	desc[1].qw0 = QI_EIOTLB_PASID(pasid) |
			QI_EIOTLB_DID(did) |
			QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) |
			QI_EIOTLB_TYPE;
	desc[2].qw0 = QI_DEV_EIOTLB_PASID(pasid) |
			QI_DEV_EIOTLB_SID(sid) |
			QI_DEV_EIOTLB_QDEP(qdep) |
			QI_DEIOTLB_TYPE |
			QI_DEV_IOTLB_PFSID(info->pfsid);
qi_retry:
	reinit_completion(&iommu->prq_complete);
	qi_submit_sync(iommu, desc, 3, QI_OPT_WAIT_DRAIN);
	if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) {
		wait_for_completion(&iommu->prq_complete);
		goto qi_retry;
	}
}

static int prq_to_iommu_prot(struct page_req_dsc *req)
{
	int prot = 0;

	if (req->rd_req)
		prot |= IOMMU_FAULT_PERM_READ;
	if (req->wr_req)
		prot |= IOMMU_FAULT_PERM_WRITE;
	if (req->exe_req)
		prot |= IOMMU_FAULT_PERM_EXEC;
	if (req->pm_req)
		prot |= IOMMU_FAULT_PERM_PRIV;

	return prot;
}

static int intel_svm_prq_report(struct intel_iommu *iommu, struct device *dev,
				struct page_req_dsc *desc)
{
	struct iommu_fault_event event;

	if (!dev || !dev_is_pci(dev))
		return -ENODEV;

	/* Fill in event data for device specific processing */
	memset(&event, 0, sizeof(struct iommu_fault_event));
	event.fault.type = IOMMU_FAULT_PAGE_REQ;
	event.fault.prm.addr = (u64)desc->addr << VTD_PAGE_SHIFT;
	event.fault.prm.pasid = desc->pasid;
	event.fault.prm.grpid = desc->prg_index;
	event.fault.prm.perm = prq_to_iommu_prot(desc);

	if (desc->lpig)
		event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;
	if (desc->pasid_present) {
		event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PASID_VALID;
		event.fault.prm.flags |= IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID;
	}
	if (desc->priv_data_present) {
		/*
		 * Set the last-page-in-group bit if private data is present;
		 * a page response is then required, just as it is for LPIG.
		 * iommu_report_device_fault() doesn't understand this vendor-
		 * specific requirement, so we set last_page as a workaround.
		 */
		event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;
		event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA;
		event.fault.prm.private_data[0] = desc->priv_data[0];
		event.fault.prm.private_data[1] = desc->priv_data[1];
	} else if (dmar_latency_enabled(iommu, DMAR_LATENCY_PRQ)) {
		/*
		 * If the private data fields are not used by hardware, use them
		 * to monitor the page request handling latency.
		 */
		event.fault.prm.private_data[0] = ktime_to_ns(ktime_get());
	}

	return iommu_report_device_fault(dev, &event);
}

static void handle_bad_prq_event(struct intel_iommu *iommu,
				 struct page_req_dsc *req, int result)
{
	struct qi_desc desc;

	pr_err("%s: Invalid page request: %08llx %08llx\n",
	       iommu->name, ((unsigned long long *)req)[0],
	       ((unsigned long long *)req)[1]);

	/*
	 * Per VT-d spec. v3.0 ch7.7, system software must
	 * respond with page group response if private data
	 * is present (PDP) or last page in group (LPIG) bit
	 * is set. This is an additional VT-d feature beyond
	 * PCI ATS spec.
	 */
	if (!req->lpig && !req->priv_data_present)
		return;

	desc.qw0 = QI_PGRP_PASID(req->pasid) |
			QI_PGRP_DID(req->rid) |
			QI_PGRP_PASID_P(req->pasid_present) |
			QI_PGRP_PDP(req->priv_data_present) |
			QI_PGRP_RESP_CODE(result) |
			QI_PGRP_RESP_TYPE;
	desc.qw1 = QI_PGRP_IDX(req->prg_index) |
			QI_PGRP_LPIG(req->lpig);

	if (req->priv_data_present) {
		desc.qw2 = req->priv_data[0];
		desc.qw3 = req->priv_data[1];
	} else {
		desc.qw2 = 0;
		desc.qw3 = 0;
	}

	qi_submit_sync(iommu, &desc, 1, 0);
}

static irqreturn_t prq_event_thread(int irq, void *d)
{
	struct intel_svm_dev *sdev = NULL;
	struct intel_iommu *iommu = d;
	struct intel_svm *svm = NULL;
	struct page_req_dsc *req;
	int head, tail, handled;
	u64 address;

	/*
	 * Clear PPR bit before reading head/tail registers, to ensure that
	 * we get a new interrupt if needed.
	 */
	writel(DMA_PRS_PPR, iommu->reg + DMAR_PRS_REG);

	tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK;
	head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK;
	handled = (head != tail);
	while (head != tail) {
		req = &iommu->prq[head / sizeof(*req)];
		address = (u64)req->addr << VTD_PAGE_SHIFT;

		if (unlikely(!req->pasid_present)) {
			pr_err("IOMMU: %s: Page request without PASID\n",
			       iommu->name);
bad_req:
			svm = NULL;
			sdev = NULL;
			handle_bad_prq_event(iommu, req, QI_RESP_INVALID);
			goto prq_advance;
		}

		if (unlikely(!is_canonical_address(address))) {
			pr_err("IOMMU: %s: Address is not canonical\n",
			       iommu->name);
			goto bad_req;
		}

		if (unlikely(req->pm_req && (req->rd_req | req->wr_req))) {
			pr_err("IOMMU: %s: Page request in Privilege Mode\n",
			       iommu->name);
			goto bad_req;
		}

		if (unlikely(req->exe_req && req->rd_req)) {
			pr_err("IOMMU: %s: Execution request not supported\n",
			       iommu->name);
			goto bad_req;
		}

		if (!svm || svm->pasid != req->pasid) {
			/*
			 * It can't go away, because the driver is not permitted
			 * to unbind the mm while any page faults are outstanding.
			 */
			svm = pasid_private_find(req->pasid);
			if (IS_ERR_OR_NULL(svm) || (svm->flags & SVM_FLAG_SUPERVISOR_MODE))
				goto bad_req;
		}

		if (!sdev || sdev->sid != req->rid) {
			sdev = svm_lookup_device_by_sid(svm, req->rid);
			if (!sdev)
				goto bad_req;
		}

		sdev->prq_seq_number++;

		/*
		 * If the page request is to be handled outside the IOMMU driver
		 * by the receiver of the fault notification, we skip the page
		 * response here.
		 */
		if (intel_svm_prq_report(iommu, sdev->dev, req))
			handle_bad_prq_event(iommu, req, QI_RESP_INVALID);

		trace_prq_report(iommu, sdev->dev, req->qw_0, req->qw_1,
				 req->priv_data[0], req->priv_data[1],
				 sdev->prq_seq_number);
prq_advance:
		head = (head + sizeof(*req)) & PRQ_RING_MASK;
	}

	dmar_writeq(iommu->reg + DMAR_PQH_REG, tail);

	/*
	 * Clear the page request overflow bit and wake up all threads that
	 * are waiting for the completion of this handling.
	 */
	if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) {
		pr_info_ratelimited("IOMMU: %s: PRQ overflow detected\n",
				    iommu->name);
		head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK;
		tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK;
		if (head == tail) {
			iopf_queue_discard_partial(iommu->iopf_queue);
			writel(DMA_PRS_PRO, iommu->reg + DMAR_PRS_REG);
			pr_info_ratelimited("IOMMU: %s: PRQ overflow cleared",
					    iommu->name);
		}
	}

	if (!completion_done(&iommu->prq_complete))
		complete(&iommu->prq_complete);

	return IRQ_RETVAL(handled);
}

struct iommu_sva *intel_svm_bind(struct device *dev, struct mm_struct *mm, void *drvdata)
{
	struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
	unsigned int flags = 0;
	struct iommu_sva *sva;
	int ret;

	if (drvdata)
		flags = *(unsigned int *)drvdata;

	if (flags & SVM_FLAG_SUPERVISOR_MODE) {
		if (!ecap_srs(iommu->ecap)) {
			dev_err(dev, "%s: Supervisor PASID not supported\n",
				iommu->name);
			return ERR_PTR(-EOPNOTSUPP);
		}

		if (mm) {
			dev_err(dev, "%s: Supervisor PASID with user provided mm\n",
				iommu->name);
			return ERR_PTR(-EINVAL);
		}

		mm = &init_mm;
	}

	mutex_lock(&pasid_mutex);
	ret = intel_svm_alloc_pasid(dev, mm, flags);
	if (ret) {
		mutex_unlock(&pasid_mutex);
		return ERR_PTR(ret);
	}

	sva = intel_svm_bind_mm(iommu, dev, mm, flags);
	if (IS_ERR_OR_NULL(sva))
		intel_svm_free_pasid(mm);
	mutex_unlock(&pasid_mutex);

	return sva;
}

void intel_svm_unbind(struct iommu_sva *sva)
{
	struct intel_svm_dev *sdev = to_intel_svm_dev(sva);

	mutex_lock(&pasid_mutex);
	intel_svm_unbind_mm(sdev->dev, sdev->pasid);
	mutex_unlock(&pasid_mutex);
}

u32 intel_svm_get_pasid(struct iommu_sva *sva)
{
	struct intel_svm_dev *sdev;
	u32 pasid;

	mutex_lock(&pasid_mutex);
	sdev = to_intel_svm_dev(sva);
	pasid = sdev->pasid;
	mutex_unlock(&pasid_mutex);

	return pasid;
}

int intel_svm_page_response(struct device *dev,
			    struct iommu_fault_event *evt,
			    struct iommu_page_response *msg)
{
	struct iommu_fault_page_request *prm;
	struct intel_svm_dev *sdev = NULL;
	struct intel_svm *svm = NULL;
	struct intel_iommu *iommu;
	bool private_present;
	bool pasid_present;
	bool last_page;
	u8 bus, devfn;
	int ret = 0;
	u16 sid;

	if (!dev || !dev_is_pci(dev))
		return -ENODEV;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return -ENODEV;

	if (!msg || !evt)
		return -EINVAL;

	mutex_lock(&pasid_mutex);

	prm = &evt->fault.prm;
	sid = PCI_DEVID(bus, devfn);
	pasid_present = prm->flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID;
	private_present = prm->flags & IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA;
	last_page = prm->flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;

	if (!pasid_present) {
		ret = -EINVAL;
		goto out;
	}

	if (prm->pasid == 0 || prm->pasid >= PASID_MAX) {
		ret = -EINVAL;
		goto out;
	}

	ret = pasid_to_svm_sdev(dev, prm->pasid, &svm, &sdev);
	if (ret || !sdev) {
		ret = -ENODEV;
		goto out;
	}

	/*
	 * For responses from userspace, need to make sure that the
	 * pasid has been bound to its mm.
	 */
	if (svm->flags & SVM_FLAG_GUEST_MODE) {
		struct mm_struct *mm;

		mm = get_task_mm(current);
		if (!mm) {
			ret = -EINVAL;
			goto out;
		}

		if (mm != svm->mm) {
			ret = -ENODEV;
			mmput(mm);
			goto out;
		}

		mmput(mm);
	}

	/*
	 * Per VT-d spec. v3.0 ch7.7, system software must respond
	 * with page group response if private data is present (PDP)
	 * or last page in group (LPIG) bit is set. This is an
	 * additional VT-d requirement beyond PCI ATS spec.
	 */
	if (last_page || private_present) {
		struct qi_desc desc;

		desc.qw0 = QI_PGRP_PASID(prm->pasid) | QI_PGRP_DID(sid) |
				QI_PGRP_PASID_P(pasid_present) |
				QI_PGRP_PDP(private_present) |
				QI_PGRP_RESP_CODE(msg->code) |
				QI_PGRP_RESP_TYPE;
		desc.qw1 = QI_PGRP_IDX(prm->grpid) | QI_PGRP_LPIG(last_page);
		desc.qw2 = 0;
		desc.qw3 = 0;

		if (private_present) {
			desc.qw2 = prm->private_data[0];
			desc.qw3 = prm->private_data[1];
		} else if (prm->private_data[0]) {
			dmar_latency_update(iommu, DMAR_LATENCY_PRQ,
				ktime_to_ns(ktime_get()) - prm->private_data[0]);
		}

		qi_submit_sync(iommu, &desc, 1, 0);
	}
out:
	mutex_unlock(&pasid_mutex);
	return ret;
}
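
/*
 * Usage sketch (illustrative only, not part of this driver): device drivers
 * normally reach the bind/unbind paths above through the generic IOMMU SVA
 * API rather than by calling intel_svm_bind() directly. Assuming a
 * hypothetical PCI driver that owns "dev" and wants to share the current
 * process address space with its device, the flow looks roughly like this:
 *
 *	struct iommu_sva *handle;
 *	u32 pasid;
 *
 *	handle = iommu_sva_bind_device(dev, current->mm, NULL);
 *	if (IS_ERR(handle))
 *		return PTR_ERR(handle);
 *
 *	pasid = iommu_sva_get_pasid(handle);
 *	(program the PASID into the device and issue DMA on user virtual
 *	 addresses; recoverable faults are resolved through the page
 *	 request queue serviced by prq_event_thread() above)
 *
 *	iommu_sva_unbind_device(handle);
 */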