// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright © 2015 Intel Corporation.
 *
 * Authors: David Woodhouse <dwmw2@infradead.org>
 */

#include <linux/intel-iommu.h>
#include <linux/mmu_notifier.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>
#include <linux/intel-svm.h>
#include <linux/rculist.h>
#include <linux/pci.h>
#include <linux/pci-ats.h>
#include <linux/dmar.h>
#include <linux/interrupt.h>
#include <linux/mm_types.h>
#include <linux/xarray.h>
#include <linux/ioasid.h>
#include <asm/page.h>
#include <asm/fpu/api.h>
#include <trace/events/intel_iommu.h>

#include "pasid.h"
#include "perf.h"
#include "../iommu-sva-lib.h"

static irqreturn_t prq_event_thread(int irq, void *d);
static void intel_svm_drain_prq(struct device *dev, u32 pasid);
#define to_intel_svm_dev(handle) container_of(handle, struct intel_svm_dev, sva)

#define PRQ_ORDER 0

static DEFINE_XARRAY_ALLOC(pasid_private_array);
static int pasid_private_add(ioasid_t pasid, void *priv)
{
	return xa_alloc(&pasid_private_array, &pasid, priv,
			XA_LIMIT(pasid, pasid), GFP_ATOMIC);
}

static void pasid_private_remove(ioasid_t pasid)
{
	xa_erase(&pasid_private_array, pasid);
}

static void *pasid_private_find(ioasid_t pasid)
{
	return xa_load(&pasid_private_array, pasid);
}

static struct intel_svm_dev *
svm_lookup_device_by_sid(struct intel_svm *svm, u16 sid)
{
	struct intel_svm_dev *sdev = NULL, *t;

	rcu_read_lock();
	list_for_each_entry_rcu(t, &svm->devs, list) {
		if (t->sid == sid) {
			sdev = t;
			break;
		}
	}
	rcu_read_unlock();

	return sdev;
}

static struct intel_svm_dev *
svm_lookup_device_by_dev(struct intel_svm *svm, struct device *dev)
{
	struct intel_svm_dev *sdev = NULL, *t;

	rcu_read_lock();
	list_for_each_entry_rcu(t, &svm->devs, list) {
		if (t->dev == dev) {
			sdev = t;
			break;
		}
	}
	rcu_read_unlock();

	return sdev;
}

int intel_svm_enable_prq(struct intel_iommu *iommu)
{
	struct iopf_queue *iopfq;
	struct page *pages;
	int irq, ret;

	pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, PRQ_ORDER);
	if (!pages) {
		pr_warn("IOMMU: %s: Failed to allocate page request queue\n",
			iommu->name);
		return -ENOMEM;
	}
	iommu->prq = page_address(pages);

	irq = dmar_alloc_hwirq(DMAR_UNITS_SUPPORTED + iommu->seq_id, iommu->node, iommu);
	if (irq <= 0) {
		pr_err("IOMMU: %s: Failed to create IRQ vector for page request queue\n",
		       iommu->name);
		ret = -EINVAL;
		goto free_prq;
	}
	iommu->pr_irq = irq;

	snprintf(iommu->iopfq_name, sizeof(iommu->iopfq_name),
		 "dmar%d-iopfq", iommu->seq_id);
	iopfq = iopf_queue_alloc(iommu->iopfq_name);
	if (!iopfq) {
		pr_err("IOMMU: %s: Failed to allocate iopf queue\n", iommu->name);
		ret = -ENOMEM;
		goto free_hwirq;
	}
	iommu->iopf_queue = iopfq;

	snprintf(iommu->prq_name, sizeof(iommu->prq_name), "dmar%d-prq", iommu->seq_id);

	ret = request_threaded_irq(irq, NULL, prq_event_thread, IRQF_ONESHOT,
				   iommu->prq_name, iommu);
	if (ret) {
		pr_err("IOMMU: %s: Failed to request IRQ for page request queue\n",
		       iommu->name);
		goto free_iopfq;
	}
	dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL);
	dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL);
	dmar_writeq(iommu->reg + DMAR_PQA_REG, virt_to_phys(iommu->prq) | PRQ_ORDER);

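	/*
	 * prq_complete is signalled by the page request handling thread and
	 * lets intel_svm_drain_prq() wait until pending requests for a PASID
	 * have been processed.
	 */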
	init_completion(&iommu->prq_complete);

	return 0;

free_iopfq:
	iopf_queue_free(iommu->iopf_queue);
	iommu->iopf_queue = NULL;
free_hwirq:
	dmar_free_hwirq(irq);
	iommu->pr_irq = 0;
free_prq:
	free_pages((unsigned long)iommu->prq, PRQ_ORDER);
	iommu->prq = NULL;

	return ret;
}

int intel_svm_finish_prq(struct intel_iommu *iommu)
{
	dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL);
	dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL);
	dmar_writeq(iommu->reg + DMAR_PQA_REG, 0ULL);

	if (iommu->pr_irq) {
		free_irq(iommu->pr_irq, iommu);
		dmar_free_hwirq(iommu->pr_irq);
		iommu->pr_irq = 0;
	}

	if (iommu->iopf_queue) {
		iopf_queue_free(iommu->iopf_queue);
		iommu->iopf_queue = NULL;
	}

	free_pages((unsigned long)iommu->prq, PRQ_ORDER);
	iommu->prq = NULL;

	return 0;
}

static inline bool intel_svm_capable(struct intel_iommu *iommu)
{
	return iommu->flags & VTD_FLAG_SVM_CAPABLE;
}

void intel_svm_check(struct intel_iommu *iommu)
{
	if (!pasid_supported(iommu))
		return;

	if (cpu_feature_enabled(X86_FEATURE_GBPAGES) &&
	    !cap_fl1gp_support(iommu->cap)) {
		pr_err("%s SVM disabled, incompatible 1GB page capability\n",
		       iommu->name);
		return;
	}

	if (cpu_feature_enabled(X86_FEATURE_LA57) &&
	    !cap_5lp_support(iommu->cap)) {
		pr_err("%s SVM disabled, incompatible paging mode\n",
		       iommu->name);
		return;
	}

	iommu->flags |= VTD_FLAG_SVM_CAPABLE;
}

static void __flush_svm_range_dev(struct intel_svm *svm,
				  struct intel_svm_dev *sdev,
				  unsigned long address,
				  unsigned long pages, int ih)
{
	struct device_domain_info *info = get_domain_info(sdev->dev);

	if (WARN_ON(!pages))
		return;

	qi_flush_piotlb(sdev->iommu, sdev->did, svm->pasid, address, pages, ih);
	if (info->ats_enabled)
		qi_flush_dev_iotlb_pasid(sdev->iommu, sdev->sid, info->pfsid,
					 svm->pasid, sdev->qdep, address,
					 order_base_2(pages));
}

static void intel_flush_svm_range_dev(struct intel_svm *svm,
				      struct intel_svm_dev *sdev,
				      unsigned long address,
				      unsigned long pages, int ih)
{
	unsigned long shift = ilog2(__roundup_pow_of_two(pages));
	unsigned long align = (1ULL << (VTD_PAGE_SHIFT + shift));
	unsigned long start = ALIGN_DOWN(address, align);
	unsigned long end = ALIGN(address + (pages << VTD_PAGE_SHIFT), align);

	while (start < end) {
		__flush_svm_range_dev(svm, sdev, start, align >> VTD_PAGE_SHIFT, ih);
		start += align;
	}
}

static void intel_flush_svm_range(struct intel_svm *svm, unsigned long address,
				  unsigned long pages, int ih)
{
	struct intel_svm_dev *sdev;

	rcu_read_lock();
	list_for_each_entry_rcu(sdev, &svm->devs, list)
		intel_flush_svm_range_dev(svm, sdev, address, pages, ih);
	rcu_read_unlock();
}

/* Pages have been freed at this point */
static void intel_invalidate_range(struct mmu_notifier *mn,
				   struct mm_struct *mm,
				   unsigned long start, unsigned long end)
{
	struct intel_svm *svm = container_of(mn, struct intel_svm, notifier);

	intel_flush_svm_range(svm, start,
			      (end - start + PAGE_SIZE - 1) >> VTD_PAGE_SHIFT, 0);
}

static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
{
	struct intel_svm *svm = container_of(mn, struct intel_svm, notifier);
	struct intel_svm_dev *sdev;

	/* This might end up being called from exit_mmap(), *before* the page
	 * tables are cleared. And __mmu_notifier_release() will delete us from
	 * the list of notifiers so that our invalidate_range() callback doesn't
	 * get called when the page tables are cleared. So we need to protect
	 * against hardware accessing those page tables.
	 *
	 * We do it by clearing the entry in the PASID table and then flushing
	 * the IOTLB and the PASID table caches. This might upset hardware;
	 * perhaps we'll want to point the PASID to a dummy PGD (like the zero
	 * page) so that we end up taking a fault that the hardware really
	 * *has* to handle gracefully without affecting other processes.
	 */
	rcu_read_lock();
	list_for_each_entry_rcu(sdev, &svm->devs, list)
		intel_pasid_tear_down_entry(sdev->iommu, sdev->dev,
					    svm->pasid, true);
	rcu_read_unlock();

}

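/*
 * One notifier is registered per mm.  invalidate_range keeps the IOTLB and
 * the device TLBs of every bound device in sync with CPU page table changes;
 * release tears down the PASID entries when the address space goes away.
 */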
static const struct mmu_notifier_ops intel_mmuops = {
	.release = intel_mm_release,
	.invalidate_range = intel_invalidate_range,
};

static DEFINE_MUTEX(pasid_mutex);

static int pasid_to_svm_sdev(struct device *dev, unsigned int pasid,
			     struct intel_svm **rsvm,
			     struct intel_svm_dev **rsdev)
{
	struct intel_svm_dev *sdev = NULL;
	struct intel_svm *svm;

	/* The caller should hold the pasid_mutex lock */
	if (WARN_ON(!mutex_is_locked(&pasid_mutex)))
		return -EINVAL;

	if (pasid == INVALID_IOASID || pasid >= PASID_MAX)
		return -EINVAL;

	svm = pasid_private_find(pasid);
	if (IS_ERR(svm))
		return PTR_ERR(svm);

	if (!svm)
		goto out;

	/*
	 * If we found an svm for the PASID, there must be at least one device
	 * bound to it.
	 */
	if (WARN_ON(list_empty(&svm->devs)))
		return -EINVAL;
	sdev = svm_lookup_device_by_dev(svm, dev);

out:
	*rsvm = svm;
	*rsdev = sdev;

	return 0;
}

int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev,
			  struct iommu_gpasid_bind_data *data)
{
	struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
	struct intel_svm_dev *sdev = NULL;
	struct dmar_domain *dmar_domain;
	struct device_domain_info *info;
	struct intel_svm *svm = NULL;
	unsigned long iflags;
	int ret = 0;

	if (WARN_ON(!iommu) || !data)
		return -EINVAL;

	if (data->format != IOMMU_PASID_FORMAT_INTEL_VTD)
		return -EINVAL;

	/* IOMMU core ensures argsz is more than the start of the union */
	if (data->argsz < offsetofend(struct iommu_gpasid_bind_data, vendor.vtd))
		return -EINVAL;

	/* Make sure no undefined flags are used in vendor data */
	if (data->vendor.vtd.flags & ~(IOMMU_SVA_VTD_GPASID_LAST - 1))
		return -EINVAL;

	if (!dev_is_pci(dev))
		return -ENOTSUPP;

	/* VT-d supports devices with full 20 bit PASIDs only */
	if (pci_max_pasids(to_pci_dev(dev)) != PASID_MAX)
		return -EINVAL;

	/*
	 * We only check the host PASID range; we have no way to check the
	 * guest PASID range.
	 */
	if (data->hpasid <= 0 || data->hpasid >= PASID_MAX)
		return -EINVAL;

	info = get_domain_info(dev);
	if (!info)
		return -EINVAL;

	dmar_domain = to_dmar_domain(domain);

	mutex_lock(&pasid_mutex);
	ret = pasid_to_svm_sdev(dev, data->hpasid, &svm, &sdev);
	if (ret)
		goto out;

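	/*
	 * Three outcomes from the lookup above: an existing (svm, sdev) pair
	 * means this device is already bound to the PASID; an svm without an
	 * sdev means the PASID is in use and only a new device needs to be
	 * added; neither means this is the first bind for the PASID.
	 */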
	if (sdev) {
		/*
		 * Do not allow multiple bindings of the same device-PASID since
		 * there is only one SL page table per PASID. We may revisit
		 * once sharing PGD across domains is supported.
		 */
		dev_warn_ratelimited(dev, "Already bound with PASID %u\n",
				     svm->pasid);
		ret = -EBUSY;
		goto out;
	}

	if (!svm) {
		/* We come here when PASID has never been bound to a device. */
		svm = kzalloc(sizeof(*svm), GFP_KERNEL);
		if (!svm) {
			ret = -ENOMEM;
			goto out;
		}
		/* REVISIT: upper layer/VFIO can track host process that bind
		 * the PASID. ioasid_set = mm might be sufficient for vfio to
		 * check pasid VMM ownership. We can drop the following line
		 * once VFIO and IOASID set check is in place.
		 */
		svm->mm = get_task_mm(current);
		svm->pasid = data->hpasid;
		if (data->flags & IOMMU_SVA_GPASID_VAL) {
			svm->gpasid = data->gpasid;
			svm->flags |= SVM_FLAG_GUEST_PASID;
		}
		pasid_private_add(data->hpasid, svm);
		INIT_LIST_HEAD_RCU(&svm->devs);
		mmput(svm->mm);
	}
	sdev = kzalloc(sizeof(*sdev), GFP_KERNEL);
	if (!sdev) {
		ret = -ENOMEM;
		goto out;
	}
	sdev->dev = dev;
	sdev->sid = PCI_DEVID(info->bus, info->devfn);
	sdev->iommu = iommu;

	/* Only count users if device has aux domains */
	if (iommu_dev_feature_enabled(dev, IOMMU_DEV_FEAT_AUX))
		sdev->users = 1;

	/* Set up device context entry for PASID if not enabled already */
	ret = intel_iommu_enable_pasid(iommu, sdev->dev);
	if (ret) {
		dev_err_ratelimited(dev, "Failed to enable PASID capability\n");
		kfree(sdev);
		goto out;
	}

	/*
	 * PASID table is per device for better security. Therefore, for
	 * each bind of a new device even with an existing PASID, we need to
	 * call the nested mode setup function here.
	 */
	spin_lock_irqsave(&iommu->lock, iflags);
	ret = intel_pasid_setup_nested(iommu, dev,
				       (pgd_t *)(uintptr_t)data->gpgd,
				       data->hpasid, &data->vendor.vtd, dmar_domain,
				       data->addr_width);
	spin_unlock_irqrestore(&iommu->lock, iflags);
	if (ret) {
		dev_err_ratelimited(dev, "Failed to set up PASID %llu in nested mode, Err %d\n",
				    data->hpasid, ret);
		/*
		 * PASID entry should be in cleared state if nested mode
		 * set up failed. So we only need to clear IOASID tracking
		 * data such that free call will succeed.
		 */
		kfree(sdev);
		goto out;
	}

	svm->flags |= SVM_FLAG_GUEST_MODE;

	init_rcu_head(&sdev->rcu);
	list_add_rcu(&sdev->list, &svm->devs);
out:
	if (!IS_ERR_OR_NULL(svm) && list_empty(&svm->devs)) {
		pasid_private_remove(data->hpasid);
		kfree(svm);
	}

	mutex_unlock(&pasid_mutex);
	return ret;
}

int intel_svm_unbind_gpasid(struct device *dev, u32 pasid)
{
	struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
	struct intel_svm_dev *sdev;
	struct intel_svm *svm;
	int ret;

	if (WARN_ON(!iommu))
		return -EINVAL;

	mutex_lock(&pasid_mutex);
	ret = pasid_to_svm_sdev(dev, pasid, &svm, &sdev);
	if (ret)
		goto out;

	if (sdev) {
		if (iommu_dev_feature_enabled(dev, IOMMU_DEV_FEAT_AUX))
			sdev->users--;
		if (!sdev->users) {
			list_del_rcu(&sdev->list);
			intel_pasid_tear_down_entry(iommu, dev,
						    svm->pasid, false);
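			/*
			 * The PASID entry has been torn down above; drain any
			 * page requests that are still pending for this PASID
			 * before the sdev is freed.
			 */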
			intel_svm_drain_prq(dev, svm->pasid);
			kfree_rcu(sdev, rcu);

			if (list_empty(&svm->devs)) {
				/*
				 * We do not free the IOASID here because the
				 * IOMMU driver did not allocate it.
				 * Unlike native SVM, the IOASID for guest use
				 * was allocated prior to the bind call.
				 * In any case, if the free call comes before
				 * the unbind, the IOMMU driver will get
				 * notified and perform cleanup.
				 */
				pasid_private_remove(pasid);
				kfree(svm);
			}
		}
	}
out:
	mutex_unlock(&pasid_mutex);
	return ret;
}

static void _load_pasid(void *unused)
{
	update_pasid();
}

static void load_pasid(struct mm_struct *mm, u32 pasid)
{
	mutex_lock(&mm->context.lock);

	/* Synchronize with READ_ONCE in update_pasid(). */
	smp_store_release(&mm->pasid, pasid);

	/* Update PASID MSR on all CPUs running the mm's tasks. */
	on_each_cpu_mask(mm_cpumask(mm), _load_pasid, NULL, true);

	mutex_unlock(&mm->context.lock);
}

static int intel_svm_alloc_pasid(struct device *dev, struct mm_struct *mm,
				 unsigned int flags)
{
	ioasid_t max_pasid = dev_is_pci(dev) ?
			pci_max_pasids(to_pci_dev(dev)) : intel_pasid_max_id;

	return iommu_sva_alloc_pasid(mm, PASID_MIN, max_pasid - 1);
}

static void intel_svm_free_pasid(struct mm_struct *mm)
{
	iommu_sva_free_pasid(mm);
}

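/*
 * Bind an mm to a device under an already-allocated PASID.  Called with
 * pasid_mutex held.  An existing intel_svm for the mm is reused if there is
 * one; otherwise a new one is created, an mmu notifier is registered for
 * user mms, and the first-level PASID table entry is set up.
 */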
static struct iommu_sva *intel_svm_bind_mm(struct intel_iommu *iommu,
					   struct device *dev,
					   struct mm_struct *mm,
					   unsigned int flags)
{
	struct device_domain_info *info = get_domain_info(dev);
	unsigned long iflags, sflags;
	struct intel_svm_dev *sdev;
	struct intel_svm *svm;
	int ret = 0;

	svm = pasid_private_find(mm->pasid);
	if (!svm) {
		svm = kzalloc(sizeof(*svm), GFP_KERNEL);
		if (!svm)
			return ERR_PTR(-ENOMEM);

		svm->pasid = mm->pasid;
		svm->mm = mm;
		svm->flags = flags;
		INIT_LIST_HEAD_RCU(&svm->devs);

		if (!(flags & SVM_FLAG_SUPERVISOR_MODE)) {
			svm->notifier.ops = &intel_mmuops;
			ret = mmu_notifier_register(&svm->notifier, mm);
			if (ret) {
				kfree(svm);
				return ERR_PTR(ret);
			}
		}

		ret = pasid_private_add(svm->pasid, svm);
		if (ret) {
			if (svm->notifier.ops)
				mmu_notifier_unregister(&svm->notifier, mm);
			kfree(svm);
			return ERR_PTR(ret);
		}
	}

	/* Find the matching device in svm list */
	sdev = svm_lookup_device_by_dev(svm, dev);
	if (sdev) {
		sdev->users++;
		goto success;
	}

	sdev = kzalloc(sizeof(*sdev), GFP_KERNEL);
	if (!sdev) {
		ret = -ENOMEM;
		goto free_svm;
	}

	sdev->dev = dev;
	sdev->iommu = iommu;
	sdev->did = FLPT_DEFAULT_DID;
	sdev->sid = PCI_DEVID(info->bus, info->devfn);
	sdev->users = 1;
	sdev->pasid = svm->pasid;
	sdev->sva.dev = dev;
	init_rcu_head(&sdev->rcu);
	if (info->ats_enabled) {
		sdev->dev_iotlb = 1;
		sdev->qdep = info->ats_qdep;
		if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS)
			sdev->qdep = 0;
	}

	/* Setup the pasid table: */
	sflags = (flags & SVM_FLAG_SUPERVISOR_MODE) ?
			PASID_FLAG_SUPERVISOR_MODE : 0;
	sflags |= cpu_feature_enabled(X86_FEATURE_LA57) ? PASID_FLAG_FL5LP : 0;
	spin_lock_irqsave(&iommu->lock, iflags);
	ret = intel_pasid_setup_first_level(iommu, dev, mm->pgd, mm->pasid,
					    FLPT_DEFAULT_DID, sflags);
	spin_unlock_irqrestore(&iommu->lock, iflags);

	if (ret)
		goto free_sdev;

	/* The newly allocated pasid is loaded to the mm. */
	if (!(flags & SVM_FLAG_SUPERVISOR_MODE) && list_empty(&svm->devs))
		load_pasid(mm, svm->pasid);

	list_add_rcu(&sdev->list, &svm->devs);
success:
	return &sdev->sva;

free_sdev:
	kfree(sdev);
free_svm:
	if (list_empty(&svm->devs)) {
		if (svm->notifier.ops)
			mmu_notifier_unregister(&svm->notifier, mm);
		pasid_private_remove(mm->pasid);
		kfree(svm);
	}

	return ERR_PTR(ret);
}

/* Caller must hold pasid_mutex */
static int intel_svm_unbind_mm(struct device *dev, u32 pasid)
{
	struct intel_svm_dev *sdev;
	struct intel_iommu *iommu;
	struct intel_svm *svm;
	struct mm_struct *mm;
	int ret = -EINVAL;

	iommu = device_to_iommu(dev, NULL, NULL);
	if (!iommu)
		goto out;

	ret = pasid_to_svm_sdev(dev, pasid, &svm, &sdev);
	if (ret)
		goto out;
	mm = svm->mm;

	if (sdev) {
		sdev->users--;
		if (!sdev->users) {
			list_del_rcu(&sdev->list);
			/* Flush the PASID cache and IOTLB for this device.
			 * Note that we do depend on the hardware *not* using
			 * the PASID any more. Just as we depend on other
			 * devices never using PASIDs that they have no right
			 * to use. We have a *shared* PASID table, because it's
			 * large and has to be physically contiguous. So it's
			 * hard to be as defensive as we might like. */
			intel_pasid_tear_down_entry(iommu, dev,
						    svm->pasid, false);
			intel_svm_drain_prq(dev, svm->pasid);
			kfree_rcu(sdev, rcu);

			if (list_empty(&svm->devs)) {
				intel_svm_free_pasid(mm);
				if (svm->notifier.ops) {
					mmu_notifier_unregister(&svm->notifier, mm);
					/* Clear mm's pasid. */
					load_pasid(mm, PASID_DISABLED);
				}
				pasid_private_remove(svm->pasid);
				/* We mandate that no page faults may be outstanding
				 * for the PASID when intel_svm_unbind_mm() is called.
				 * If that is not obeyed, subtle errors will happen.
				 * Let's make them less subtle... */
				memset(svm, 0x6b, sizeof(*svm));
				kfree(svm);
			}
		}
	}
out:
	return ret;
}

/* Page request queue descriptor */
struct page_req_dsc {
	union {
		struct {
			u64 type:8;
			u64 pasid_present:1;
			u64 priv_data_present:1;
			u64 rsvd:6;
			u64 rid:16;
			u64 pasid:20;
			u64 exe_req:1;
			u64 pm_req:1;
			u64 rsvd2:10;
		};
		u64 qw_0;
	};
	union {
		struct {
			u64 rd_req:1;
			u64 wr_req:1;
			u64 lpig:1;
			u64 prg_index:9;
			u64 addr:52;
		};
		u64 qw_1;
	};
	u64 priv_data[2];
};

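/*
 * Each page request descriptor is 32 bytes and the queue occupies
 * (4KiB << PRQ_ORDER).  The mask below wraps a head/tail byte offset within
 * the queue while keeping it descriptor aligned.
 */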
#define PRQ_RING_MASK ((0x1000 << PRQ_ORDER) - 0x20)

static bool is_canonical_address(u64 addr)
{
	int shift = 64 - (__VIRTUAL_MASK_SHIFT + 1);
	long saddr = (long) addr;

	return (((saddr << shift) >> shift) == saddr);
}

/**
 * intel_svm_drain_prq - Drain page requests and responses for a pasid
 * @dev: target device
 * @pasid: pasid for draining
 *
 * Drain all pending page requests and responses related to @pasid in both
 * software and hardware. This is supposed to be called after the device
 * driver has stopped DMA, the pasid entry has been cleared, and both IOTLB
 * and DevTLB have been invalidated.
 *
 * It waits until all pending page requests for @pasid in the page fault
 * queue are completed by the prq handling thread. It then follows the steps
 * described in VT-d spec CH7.10 to drain all page requests and page
 * responses pending in the hardware.
 */
static void intel_svm_drain_prq(struct device *dev, u32 pasid)
{
	struct device_domain_info *info;
	struct dmar_domain *domain;
	struct intel_iommu *iommu;
	struct qi_desc desc[3];
	struct pci_dev *pdev;
	int head, tail;
	u16 sid, did;
	int qdep;

	info = get_domain_info(dev);
	if (WARN_ON(!info || !dev_is_pci(dev)))
		return;

	if (!info->pri_enabled)
		return;

	iommu = info->iommu;
	domain = info->domain;
	pdev = to_pci_dev(dev);
	sid = PCI_DEVID(info->bus, info->devfn);
	did = domain->iommu_did[iommu->seq_id];
	qdep = pci_ats_queue_depth(pdev);

	/*
	 * Check and wait until all pending page requests in the queue are
	 * handled by the prq handling thread.
	 */
prq_retry:
	reinit_completion(&iommu->prq_complete);
	tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK;
	head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK;
	while (head != tail) {
		struct page_req_dsc *req;

		req = &iommu->prq[head / sizeof(*req)];
		if (!req->pasid_present || req->pasid != pasid) {
			head = (head + sizeof(*req)) & PRQ_RING_MASK;
			continue;
		}

		wait_for_completion(&iommu->prq_complete);
		goto prq_retry;
	}

	iopf_queue_flush_dev(dev);

	/*
	 * Perform steps described in VT-d spec CH7.10 to drain page
	 * requests and responses in hardware.
	 */
	memset(desc, 0, sizeof(desc));
	desc[0].qw0 = QI_IWD_STATUS_DATA(QI_DONE) |
			QI_IWD_FENCE |
			QI_IWD_TYPE;
	desc[1].qw0 = QI_EIOTLB_PASID(pasid) |
			QI_EIOTLB_DID(did) |
			QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) |
			QI_EIOTLB_TYPE;
	desc[2].qw0 = QI_DEV_EIOTLB_PASID(pasid) |
			QI_DEV_EIOTLB_SID(sid) |
			QI_DEV_EIOTLB_QDEP(qdep) |
			QI_DEIOTLB_TYPE |
			QI_DEV_IOTLB_PFSID(info->pfsid);
qi_retry:
	reinit_completion(&iommu->prq_complete);
	qi_submit_sync(iommu, desc, 3, QI_OPT_WAIT_DRAIN);
	if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) {
		wait_for_completion(&iommu->prq_complete);
		goto qi_retry;
	}
}

static int prq_to_iommu_prot(struct page_req_dsc *req)
{
	int prot = 0;

	if (req->rd_req)
		prot |= IOMMU_FAULT_PERM_READ;
	if (req->wr_req)
		prot |= IOMMU_FAULT_PERM_WRITE;
	if (req->exe_req)
		prot |= IOMMU_FAULT_PERM_EXEC;
	if (req->pm_req)
		prot |= IOMMU_FAULT_PERM_PRIV;

	return prot;
}

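/*
 * Translate a hardware page request descriptor into an iommu_fault_event
 * and hand it to the generic fault reporting path, which feeds the IOPF
 * queue and any registered fault handler.
 */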
static int intel_svm_prq_report(struct intel_iommu *iommu, struct device *dev,
				struct page_req_dsc *desc)
{
	struct iommu_fault_event event;

	if (!dev || !dev_is_pci(dev))
		return -ENODEV;

	/* Fill in event data for device specific processing */
	memset(&event, 0, sizeof(struct iommu_fault_event));
	event.fault.type = IOMMU_FAULT_PAGE_REQ;
	event.fault.prm.addr = (u64)desc->addr << VTD_PAGE_SHIFT;
	event.fault.prm.pasid = desc->pasid;
	event.fault.prm.grpid = desc->prg_index;
	event.fault.prm.perm = prq_to_iommu_prot(desc);

	if (desc->lpig)
		event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;
	if (desc->pasid_present) {
		event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PASID_VALID;
		event.fault.prm.flags |= IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID;
	}
	if (desc->priv_data_present) {
		/*
		 * Set the last-page-in-group bit if private data is present;
		 * a page response is required in that case, just as it is
		 * for LPIG.
		 * iommu_report_device_fault() doesn't understand this vendor
		 * specific requirement thus we set last_page as a workaround.
		 */
		event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;
		event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA;
		event.fault.prm.private_data[0] = desc->priv_data[0];
		event.fault.prm.private_data[1] = desc->priv_data[1];
	} else if (dmar_latency_enabled(iommu, DMAR_LATENCY_PRQ)) {
		/*
		 * If the private data fields are not used by hardware, use
		 * them to monitor the prq handling latency.
		 */
		event.fault.prm.private_data[0] = ktime_to_ns(ktime_get());
	}

	return iommu_report_device_fault(dev, &event);
}

static void handle_bad_prq_event(struct intel_iommu *iommu,
				 struct page_req_dsc *req, int result)
{
	struct qi_desc desc;

	pr_err("%s: Invalid page request: %08llx %08llx\n",
	       iommu->name, ((unsigned long long *)req)[0],
	       ((unsigned long long *)req)[1]);

	/*
	 * Per VT-d spec. v3.0 ch7.7, system software must
	 * respond with page group response if private data
	 * is present (PDP) or last page in group (LPIG) bit
	 * is set. This is an additional VT-d feature beyond
	 * PCI ATS spec.
	 */
	if (!req->lpig && !req->priv_data_present)
		return;

	desc.qw0 = QI_PGRP_PASID(req->pasid) |
			QI_PGRP_DID(req->rid) |
			QI_PGRP_PASID_P(req->pasid_present) |
			QI_PGRP_PDP(req->priv_data_present) |
			QI_PGRP_RESP_CODE(result) |
			QI_PGRP_RESP_TYPE;
	desc.qw1 = QI_PGRP_IDX(req->prg_index) |
			QI_PGRP_LPIG(req->lpig);

	if (req->priv_data_present) {
		desc.qw2 = req->priv_data[0];
		desc.qw3 = req->priv_data[1];
	} else {
		desc.qw2 = 0;
		desc.qw3 = 0;
	}

	qi_submit_sync(iommu, &desc, 1, 0);
}

static irqreturn_t prq_event_thread(int irq, void *d)
{
	struct intel_svm_dev *sdev = NULL;
	struct intel_iommu *iommu = d;
	struct intel_svm *svm = NULL;
	struct page_req_dsc *req;
	int head, tail, handled;
	u64 address;

	/*
	 * Clear PPR bit before reading head/tail registers, to ensure that
	 * we get a new interrupt if needed.
	 */
	writel(DMA_PRS_PPR, iommu->reg + DMAR_PRS_REG);

	tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK;
	head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK;
	handled = (head != tail);
	while (head != tail) {
		req = &iommu->prq[head / sizeof(*req)];
		address = (u64)req->addr << VTD_PAGE_SHIFT;

		if (unlikely(!req->pasid_present)) {
			pr_err("IOMMU: %s: Page request without PASID\n",
			       iommu->name);
bad_req:
			svm = NULL;
			sdev = NULL;
			handle_bad_prq_event(iommu, req, QI_RESP_INVALID);
			goto prq_advance;
		}

		if (unlikely(!is_canonical_address(address))) {
			pr_err("IOMMU: %s: Address is not canonical\n",
			       iommu->name);
			goto bad_req;
		}

		if (unlikely(req->pm_req && (req->rd_req | req->wr_req))) {
			pr_err("IOMMU: %s: Page request in Privilege Mode\n",
			       iommu->name);
			goto bad_req;
		}

		if (unlikely(req->exe_req && req->rd_req)) {
			pr_err("IOMMU: %s: Execution request not supported\n",
			       iommu->name);
			goto bad_req;
		}

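		/*
		 * The svm and sdev pointers are cached across loop iterations
		 * so consecutive requests from the same PASID and device do
		 * not repeat the lookups.
		 */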
		if (!svm || svm->pasid != req->pasid) {
			/*
			 * It can't go away, because the driver is not permitted
			 * to unbind the mm while any page faults are outstanding.
			 */
			svm = pasid_private_find(req->pasid);
			if (IS_ERR_OR_NULL(svm) || (svm->flags & SVM_FLAG_SUPERVISOR_MODE))
				goto bad_req;
		}

		if (!sdev || sdev->sid != req->rid) {
			sdev = svm_lookup_device_by_sid(svm, req->rid);
			if (!sdev)
				goto bad_req;
		}

		sdev->prq_seq_number++;

		/*
		 * If the prq is handled outside the iommu driver by a
		 * consumer of the fault notification, skip the page
		 * response here.
		 */
		if (intel_svm_prq_report(iommu, sdev->dev, req))
			handle_bad_prq_event(iommu, req, QI_RESP_INVALID);

		trace_prq_report(iommu, sdev->dev, req->qw_0, req->qw_1,
				 req->priv_data[0], req->priv_data[1],
				 sdev->prq_seq_number);
prq_advance:
		head = (head + sizeof(*req)) & PRQ_RING_MASK;
	}

	dmar_writeq(iommu->reg + DMAR_PQH_REG, tail);

	/*
	 * Clear the page request overflow bit and wake up all threads that
	 * are waiting for the completion of this handling.
	 */
	if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) {
		pr_info_ratelimited("IOMMU: %s: PRQ overflow detected\n",
				    iommu->name);
		head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK;
		tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK;
		if (head == tail) {
			iopf_queue_discard_partial(iommu->iopf_queue);
			writel(DMA_PRS_PRO, iommu->reg + DMAR_PRS_REG);
			pr_info_ratelimited("IOMMU: %s: PRQ overflow cleared",
					    iommu->name);
		}
	}

	if (!completion_done(&iommu->prq_complete))
		complete(&iommu->prq_complete);

	return IRQ_RETVAL(handled);
}

struct iommu_sva *intel_svm_bind(struct device *dev, struct mm_struct *mm, void *drvdata)
{
	struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
	unsigned int flags = 0;
	struct iommu_sva *sva;
	int ret;

	if (drvdata)
		flags = *(unsigned int *)drvdata;

	if (flags & SVM_FLAG_SUPERVISOR_MODE) {
		if (!ecap_srs(iommu->ecap)) {
			dev_err(dev, "%s: Supervisor PASID not supported\n",
				iommu->name);
			return ERR_PTR(-EOPNOTSUPP);
		}

		if (mm) {
			dev_err(dev, "%s: Supervisor PASID with user provided mm\n",
				iommu->name);
			return ERR_PTR(-EINVAL);
		}

		mm = &init_mm;
	}

	mutex_lock(&pasid_mutex);
	ret = intel_svm_alloc_pasid(dev, mm, flags);
	if (ret) {
		mutex_unlock(&pasid_mutex);
		return ERR_PTR(ret);
	}

	sva = intel_svm_bind_mm(iommu, dev, mm, flags);
	if (IS_ERR_OR_NULL(sva))
		intel_svm_free_pasid(mm);
	mutex_unlock(&pasid_mutex);

	return sva;
}

void intel_svm_unbind(struct iommu_sva *sva)
{
	struct intel_svm_dev *sdev = to_intel_svm_dev(sva);

	mutex_lock(&pasid_mutex);
	intel_svm_unbind_mm(sdev->dev, sdev->pasid);
	mutex_unlock(&pasid_mutex);
}

u32 intel_svm_get_pasid(struct iommu_sva *sva)
{
	struct intel_svm_dev *sdev;
	u32 pasid;

	mutex_lock(&pasid_mutex);
	sdev = to_intel_svm_dev(sva);
	pasid = sdev->pasid;
	mutex_unlock(&pasid_mutex);

	return pasid;
}

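/*
 * Complete a page request on behalf of the fault consumer by sending a page
 * group response descriptor through the invalidation queue.
 */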
int intel_svm_page_response(struct device *dev,
			    struct iommu_fault_event *evt,
			    struct iommu_page_response *msg)
{
	struct iommu_fault_page_request *prm;
	struct intel_svm_dev *sdev = NULL;
	struct intel_svm *svm = NULL;
	struct intel_iommu *iommu;
	bool private_present;
	bool pasid_present;
	bool last_page;
	u8 bus, devfn;
	int ret = 0;
	u16 sid;

	if (!dev || !dev_is_pci(dev))
		return -ENODEV;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return -ENODEV;

	if (!msg || !evt)
		return -EINVAL;

	mutex_lock(&pasid_mutex);

	prm = &evt->fault.prm;
	sid = PCI_DEVID(bus, devfn);
	pasid_present = prm->flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID;
	private_present = prm->flags & IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA;
	last_page = prm->flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;

	if (!pasid_present) {
		ret = -EINVAL;
		goto out;
	}

	if (prm->pasid == 0 || prm->pasid >= PASID_MAX) {
		ret = -EINVAL;
		goto out;
	}

	ret = pasid_to_svm_sdev(dev, prm->pasid, &svm, &sdev);
	if (ret || !sdev) {
		ret = -ENODEV;
		goto out;
	}

	/*
	 * For responses from userspace, we need to make sure that the
	 * pasid has been bound to its mm.
	 */
	if (svm->flags & SVM_FLAG_GUEST_MODE) {
		struct mm_struct *mm;

		mm = get_task_mm(current);
		if (!mm) {
			ret = -EINVAL;
			goto out;
		}

		if (mm != svm->mm) {
			ret = -ENODEV;
			mmput(mm);
			goto out;
		}

		mmput(mm);
	}

	/*
	 * Per VT-d spec. v3.0 ch7.7, system software must respond
	 * with page group response if private data is present (PDP)
	 * or last page in group (LPIG) bit is set. This is an
	 * additional VT-d requirement beyond PCI ATS spec.
	 */
	if (last_page || private_present) {
		struct qi_desc desc;

		desc.qw0 = QI_PGRP_PASID(prm->pasid) | QI_PGRP_DID(sid) |
				QI_PGRP_PASID_P(pasid_present) |
				QI_PGRP_PDP(private_present) |
				QI_PGRP_RESP_CODE(msg->code) |
				QI_PGRP_RESP_TYPE;
		desc.qw1 = QI_PGRP_IDX(prm->grpid) | QI_PGRP_LPIG(last_page);
		desc.qw2 = 0;
		desc.qw3 = 0;

		if (private_present) {
			desc.qw2 = prm->private_data[0];
			desc.qw3 = prm->private_data[1];
		} else if (prm->private_data[0]) {
			dmar_latency_update(iommu, DMAR_LATENCY_PRQ,
					    ktime_to_ns(ktime_get()) - prm->private_data[0]);
		}

		qi_submit_sync(iommu, &desc, 1, 0);
	}
out:
	mutex_unlock(&pasid_mutex);
	return ret;
}