1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright © 2006-2014 Intel Corporation. 4 * 5 * Authors: David Woodhouse <dwmw2@infradead.org>, 6 * Ashok Raj <ashok.raj@intel.com>, 7 * Shaohua Li <shaohua.li@intel.com>, 8 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>, 9 * Fenghua Yu <fenghua.yu@intel.com> 10 * Joerg Roedel <jroedel@suse.de> 11 */ 12 13 #define pr_fmt(fmt) "DMAR: " fmt 14 #define dev_fmt(fmt) pr_fmt(fmt) 15 16 #include <linux/crash_dump.h> 17 #include <linux/dma-direct.h> 18 #include <linux/dmi.h> 19 #include <linux/memory.h> 20 #include <linux/pci.h> 21 #include <linux/pci-ats.h> 22 #include <linux/spinlock.h> 23 #include <linux/syscore_ops.h> 24 #include <linux/tboot.h> 25 #include <uapi/linux/iommufd.h> 26 27 #include "iommu.h" 28 #include "../dma-iommu.h" 29 #include "../irq_remapping.h" 30 #include "../iommu-pages.h" 31 #include "pasid.h" 32 #include "perfmon.h" 33 34 #define ROOT_SIZE VTD_PAGE_SIZE 35 #define CONTEXT_SIZE VTD_PAGE_SIZE 36 37 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY) 38 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB) 39 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) 40 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e) 41 42 #define IOAPIC_RANGE_START (0xfee00000) 43 #define IOAPIC_RANGE_END (0xfeefffff) 44 #define IOVA_START_ADDR (0x1000) 45 46 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57 47 48 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1) 49 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1) 50 51 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR 52 to match. That way, we can use 'unsigned long' for PFNs with impunity. */ 53 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \ 54 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1)) 55 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT) 56 57 static void __init check_tylersburg_isoch(void); 58 static int rwbf_quirk; 59 60 /* 61 * set to 1 to panic kernel if can't successfully enable VT-d 62 * (used when kernel is launched w/ TXT) 63 */ 64 static int force_on = 0; 65 static int intel_iommu_tboot_noforce; 66 static int no_platform_optin; 67 68 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry)) 69 70 /* 71 * Take a root_entry and return the Lower Context Table Pointer (LCTP) 72 * if marked present. 73 */ 74 static phys_addr_t root_entry_lctp(struct root_entry *re) 75 { 76 if (!(re->lo & 1)) 77 return 0; 78 79 return re->lo & VTD_PAGE_MASK; 80 } 81 82 /* 83 * Take a root_entry and return the Upper Context Table Pointer (UCTP) 84 * if marked present. 
85 */ 86 static phys_addr_t root_entry_uctp(struct root_entry *re) 87 { 88 if (!(re->hi & 1)) 89 return 0; 90 91 return re->hi & VTD_PAGE_MASK; 92 } 93 94 static int device_rid_cmp_key(const void *key, const struct rb_node *node) 95 { 96 struct device_domain_info *info = 97 rb_entry(node, struct device_domain_info, node); 98 const u16 *rid_lhs = key; 99 100 if (*rid_lhs < PCI_DEVID(info->bus, info->devfn)) 101 return -1; 102 103 if (*rid_lhs > PCI_DEVID(info->bus, info->devfn)) 104 return 1; 105 106 return 0; 107 } 108 109 static int device_rid_cmp(struct rb_node *lhs, const struct rb_node *rhs) 110 { 111 struct device_domain_info *info = 112 rb_entry(lhs, struct device_domain_info, node); 113 u16 key = PCI_DEVID(info->bus, info->devfn); 114 115 return device_rid_cmp_key(&key, rhs); 116 } 117 118 /* 119 * Looks up an IOMMU-probed device using its source ID. 120 * 121 * Returns the pointer to the device if there is a match. Otherwise, 122 * returns NULL. 123 * 124 * Note that this helper doesn't guarantee that the device won't be 125 * released by the iommu subsystem after being returned. The caller 126 * should use its own synchronization mechanism to avoid the device 127 * being released during its use if its possibly the case. 128 */ 129 struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid) 130 { 131 struct device_domain_info *info = NULL; 132 struct rb_node *node; 133 unsigned long flags; 134 135 spin_lock_irqsave(&iommu->device_rbtree_lock, flags); 136 node = rb_find(&rid, &iommu->device_rbtree, device_rid_cmp_key); 137 if (node) 138 info = rb_entry(node, struct device_domain_info, node); 139 spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags); 140 141 return info ? info->dev : NULL; 142 } 143 144 static int device_rbtree_insert(struct intel_iommu *iommu, 145 struct device_domain_info *info) 146 { 147 struct rb_node *curr; 148 unsigned long flags; 149 150 spin_lock_irqsave(&iommu->device_rbtree_lock, flags); 151 curr = rb_find_add(&info->node, &iommu->device_rbtree, device_rid_cmp); 152 spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags); 153 if (WARN_ON(curr)) 154 return -EEXIST; 155 156 return 0; 157 } 158 159 static void device_rbtree_remove(struct device_domain_info *info) 160 { 161 struct intel_iommu *iommu = info->iommu; 162 unsigned long flags; 163 164 spin_lock_irqsave(&iommu->device_rbtree_lock, flags); 165 rb_erase(&info->node, &iommu->device_rbtree); 166 spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags); 167 } 168 169 struct dmar_rmrr_unit { 170 struct list_head list; /* list of rmrr units */ 171 struct acpi_dmar_header *hdr; /* ACPI header */ 172 u64 base_address; /* reserved base address*/ 173 u64 end_address; /* reserved end address */ 174 struct dmar_dev_scope *devices; /* target devices */ 175 int devices_cnt; /* target device count */ 176 }; 177 178 struct dmar_atsr_unit { 179 struct list_head list; /* list of ATSR units */ 180 struct acpi_dmar_header *hdr; /* ACPI header */ 181 struct dmar_dev_scope *devices; /* target devices */ 182 int devices_cnt; /* target device count */ 183 u8 include_all:1; /* include all ports */ 184 }; 185 186 struct dmar_satc_unit { 187 struct list_head list; /* list of SATC units */ 188 struct acpi_dmar_header *hdr; /* ACPI header */ 189 struct dmar_dev_scope *devices; /* target devices */ 190 struct intel_iommu *iommu; /* the corresponding iommu */ 191 int devices_cnt; /* target device count */ 192 u8 atc_required:1; /* ATS is required */ 193 }; 194 195 static LIST_HEAD(dmar_atsr_units); 196 static 
LIST_HEAD(dmar_rmrr_units); 197 static LIST_HEAD(dmar_satc_units); 198 199 #define for_each_rmrr_units(rmrr) \ 200 list_for_each_entry(rmrr, &dmar_rmrr_units, list) 201 202 static void intel_iommu_domain_free(struct iommu_domain *domain); 203 204 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON); 205 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON); 206 207 int intel_iommu_enabled = 0; 208 EXPORT_SYMBOL_GPL(intel_iommu_enabled); 209 210 static int intel_iommu_superpage = 1; 211 static int iommu_identity_mapping; 212 static int iommu_skip_te_disable; 213 static int disable_igfx_iommu; 214 215 #define IDENTMAP_AZALIA 4 216 217 const struct iommu_ops intel_iommu_ops; 218 static const struct iommu_dirty_ops intel_dirty_ops; 219 220 static bool translation_pre_enabled(struct intel_iommu *iommu) 221 { 222 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED); 223 } 224 225 static void clear_translation_pre_enabled(struct intel_iommu *iommu) 226 { 227 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED; 228 } 229 230 static void init_translation_status(struct intel_iommu *iommu) 231 { 232 u32 gsts; 233 234 gsts = readl(iommu->reg + DMAR_GSTS_REG); 235 if (gsts & DMA_GSTS_TES) 236 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED; 237 } 238 239 static int __init intel_iommu_setup(char *str) 240 { 241 if (!str) 242 return -EINVAL; 243 244 while (*str) { 245 if (!strncmp(str, "on", 2)) { 246 dmar_disabled = 0; 247 pr_info("IOMMU enabled\n"); 248 } else if (!strncmp(str, "off", 3)) { 249 dmar_disabled = 1; 250 no_platform_optin = 1; 251 pr_info("IOMMU disabled\n"); 252 } else if (!strncmp(str, "igfx_off", 8)) { 253 disable_igfx_iommu = 1; 254 pr_info("Disable GFX device mapping\n"); 255 } else if (!strncmp(str, "forcedac", 8)) { 256 pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n"); 257 iommu_dma_forcedac = true; 258 } else if (!strncmp(str, "strict", 6)) { 259 pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n"); 260 iommu_set_dma_strict(); 261 } else if (!strncmp(str, "sp_off", 6)) { 262 pr_info("Disable supported super page\n"); 263 intel_iommu_superpage = 0; 264 } else if (!strncmp(str, "sm_on", 5)) { 265 pr_info("Enable scalable mode if hardware supports\n"); 266 intel_iommu_sm = 1; 267 } else if (!strncmp(str, "sm_off", 6)) { 268 pr_info("Scalable mode is disallowed\n"); 269 intel_iommu_sm = 0; 270 } else if (!strncmp(str, "tboot_noforce", 13)) { 271 pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n"); 272 intel_iommu_tboot_noforce = 1; 273 } else { 274 pr_notice("Unknown option - '%s'\n", str); 275 } 276 277 str += strcspn(str, ","); 278 while (*str == ',') 279 str++; 280 } 281 282 return 1; 283 } 284 __setup("intel_iommu=", intel_iommu_setup); 285 286 static int domain_pfn_supported(struct dmar_domain *domain, unsigned long pfn) 287 { 288 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT; 289 290 return !(addr_width < BITS_PER_LONG && pfn >> addr_width); 291 } 292 293 /* 294 * Calculate the Supported Adjusted Guest Address Widths of an IOMMU. 295 * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of 296 * the returned SAGAW. 297 */ 298 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu) 299 { 300 unsigned long fl_sagaw, sl_sagaw; 301 302 fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0); 303 sl_sagaw = cap_sagaw(iommu->cap); 304 305 /* Second level only. 
*/ 306 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) 307 return sl_sagaw; 308 309 /* First level only. */ 310 if (!ecap_slts(iommu->ecap)) 311 return fl_sagaw; 312 313 return fl_sagaw & sl_sagaw; 314 } 315 316 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw) 317 { 318 unsigned long sagaw; 319 int agaw; 320 321 sagaw = __iommu_calculate_sagaw(iommu); 322 for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) { 323 if (test_bit(agaw, &sagaw)) 324 break; 325 } 326 327 return agaw; 328 } 329 330 /* 331 * Calculate max SAGAW for each iommu. 332 */ 333 int iommu_calculate_max_sagaw(struct intel_iommu *iommu) 334 { 335 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH); 336 } 337 338 /* 339 * calculate agaw for each iommu. 340 * "SAGAW" may be different across iommus, use a default agaw, and 341 * get a supported less agaw for iommus that don't support the default agaw. 342 */ 343 int iommu_calculate_agaw(struct intel_iommu *iommu) 344 { 345 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH); 346 } 347 348 static bool iommu_paging_structure_coherency(struct intel_iommu *iommu) 349 { 350 return sm_supported(iommu) ? 351 ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap); 352 } 353 354 /* Return the super pagesize bitmap if supported. */ 355 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain) 356 { 357 unsigned long bitmap = 0; 358 359 /* 360 * 1-level super page supports page size of 2MiB, 2-level super page 361 * supports page size of both 2MiB and 1GiB. 362 */ 363 if (domain->iommu_superpage == 1) 364 bitmap |= SZ_2M; 365 else if (domain->iommu_superpage == 2) 366 bitmap |= SZ_2M | SZ_1G; 367 368 return bitmap; 369 } 370 371 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus, 372 u8 devfn, int alloc) 373 { 374 struct root_entry *root = &iommu->root_entry[bus]; 375 struct context_entry *context; 376 u64 *entry; 377 378 /* 379 * Except that the caller requested to allocate a new entry, 380 * returning a copied context entry makes no sense. 381 */ 382 if (!alloc && context_copied(iommu, bus, devfn)) 383 return NULL; 384 385 entry = &root->lo; 386 if (sm_supported(iommu)) { 387 if (devfn >= 0x80) { 388 devfn -= 0x80; 389 entry = &root->hi; 390 } 391 devfn *= 2; 392 } 393 if (*entry & 1) 394 context = phys_to_virt(*entry & VTD_PAGE_MASK); 395 else { 396 unsigned long phy_addr; 397 if (!alloc) 398 return NULL; 399 400 context = iommu_alloc_pages_node_sz(iommu->node, GFP_ATOMIC, 401 SZ_4K); 402 if (!context) 403 return NULL; 404 405 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE); 406 phy_addr = virt_to_phys((void *)context); 407 *entry = phy_addr | 1; 408 __iommu_flush_cache(iommu, entry, sizeof(*entry)); 409 } 410 return &context[devfn]; 411 } 412 413 /** 414 * is_downstream_to_pci_bridge - test if a device belongs to the PCI 415 * sub-hierarchy of a candidate PCI-PCI bridge 416 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy 417 * @bridge: the candidate PCI-PCI bridge 418 * 419 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false. 
420 */ 421 static bool 422 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge) 423 { 424 struct pci_dev *pdev, *pbridge; 425 426 if (!dev_is_pci(dev) || !dev_is_pci(bridge)) 427 return false; 428 429 pdev = to_pci_dev(dev); 430 pbridge = to_pci_dev(bridge); 431 432 if (pbridge->subordinate && 433 pbridge->subordinate->number <= pdev->bus->number && 434 pbridge->subordinate->busn_res.end >= pdev->bus->number) 435 return true; 436 437 return false; 438 } 439 440 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev) 441 { 442 struct dmar_drhd_unit *drhd; 443 u32 vtbar; 444 int rc; 445 446 /* We know that this device on this chipset has its own IOMMU. 447 * If we find it under a different IOMMU, then the BIOS is lying 448 * to us. Hope that the IOMMU for this device is actually 449 * disabled, and it needs no translation... 450 */ 451 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar); 452 if (rc) { 453 /* "can't" happen */ 454 dev_info(&pdev->dev, "failed to run vt-d quirk\n"); 455 return false; 456 } 457 vtbar &= 0xffff0000; 458 459 /* we know that the this iommu should be at offset 0xa000 from vtbar */ 460 drhd = dmar_find_matched_drhd_unit(pdev); 461 if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) { 462 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"); 463 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); 464 return true; 465 } 466 467 return false; 468 } 469 470 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev) 471 { 472 if (!iommu || iommu->drhd->ignored) 473 return true; 474 475 if (dev_is_pci(dev)) { 476 struct pci_dev *pdev = to_pci_dev(dev); 477 478 if (pdev->vendor == PCI_VENDOR_ID_INTEL && 479 pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB && 480 quirk_ioat_snb_local_iommu(pdev)) 481 return true; 482 } 483 484 return false; 485 } 486 487 static struct intel_iommu *device_lookup_iommu(struct device *dev, u8 *bus, u8 *devfn) 488 { 489 struct dmar_drhd_unit *drhd = NULL; 490 struct pci_dev *pdev = NULL; 491 struct intel_iommu *iommu; 492 struct device *tmp; 493 u16 segment = 0; 494 int i; 495 496 if (!dev) 497 return NULL; 498 499 if (dev_is_pci(dev)) { 500 struct pci_dev *pf_pdev; 501 502 pdev = pci_real_dma_dev(to_pci_dev(dev)); 503 504 /* VFs aren't listed in scope tables; we need to look up 505 * the PF instead to find the IOMMU. */ 506 pf_pdev = pci_physfn(pdev); 507 dev = &pf_pdev->dev; 508 segment = pci_domain_nr(pdev->bus); 509 } else if (has_acpi_companion(dev)) 510 dev = &ACPI_COMPANION(dev)->dev; 511 512 rcu_read_lock(); 513 for_each_iommu(iommu, drhd) { 514 if (pdev && segment != drhd->segment) 515 continue; 516 517 for_each_active_dev_scope(drhd->devices, 518 drhd->devices_cnt, i, tmp) { 519 if (tmp == dev) { 520 /* For a VF use its original BDF# not that of the PF 521 * which we used for the IOMMU lookup. Strictly speaking 522 * we could do this for all PCI devices; we only need to 523 * get the BDF# from the scope table for ACPI matches. 
*/ 524 if (pdev && pdev->is_virtfn) 525 goto got_pdev; 526 527 if (bus && devfn) { 528 *bus = drhd->devices[i].bus; 529 *devfn = drhd->devices[i].devfn; 530 } 531 goto out; 532 } 533 534 if (is_downstream_to_pci_bridge(dev, tmp)) 535 goto got_pdev; 536 } 537 538 if (pdev && drhd->include_all) { 539 got_pdev: 540 if (bus && devfn) { 541 *bus = pdev->bus->number; 542 *devfn = pdev->devfn; 543 } 544 goto out; 545 } 546 } 547 iommu = NULL; 548 out: 549 if (iommu_is_dummy(iommu, dev)) 550 iommu = NULL; 551 552 rcu_read_unlock(); 553 554 return iommu; 555 } 556 557 static void domain_flush_cache(struct dmar_domain *domain, 558 void *addr, int size) 559 { 560 if (!domain->iommu_coherency) 561 clflush_cache_range(addr, size); 562 } 563 564 static void free_context_table(struct intel_iommu *iommu) 565 { 566 struct context_entry *context; 567 int i; 568 569 if (!iommu->root_entry) 570 return; 571 572 for (i = 0; i < ROOT_ENTRY_NR; i++) { 573 context = iommu_context_addr(iommu, i, 0, 0); 574 if (context) 575 iommu_free_pages(context); 576 577 if (!sm_supported(iommu)) 578 continue; 579 580 context = iommu_context_addr(iommu, i, 0x80, 0); 581 if (context) 582 iommu_free_pages(context); 583 } 584 585 iommu_free_pages(iommu->root_entry); 586 iommu->root_entry = NULL; 587 } 588 589 #ifdef CONFIG_DMAR_DEBUG 590 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn, 591 u8 bus, u8 devfn, struct dma_pte *parent, int level) 592 { 593 struct dma_pte *pte; 594 int offset; 595 596 while (1) { 597 offset = pfn_level_offset(pfn, level); 598 pte = &parent[offset]; 599 600 pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val); 601 602 if (!dma_pte_present(pte)) { 603 pr_info("page table not present at level %d\n", level - 1); 604 break; 605 } 606 607 if (level == 1 || dma_pte_superpage(pte)) 608 break; 609 610 parent = phys_to_virt(dma_pte_addr(pte)); 611 level--; 612 } 613 } 614 615 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id, 616 unsigned long long addr, u32 pasid) 617 { 618 struct pasid_dir_entry *dir, *pde; 619 struct pasid_entry *entries, *pte; 620 struct context_entry *ctx_entry; 621 struct root_entry *rt_entry; 622 int i, dir_index, index, level; 623 u8 devfn = source_id & 0xff; 624 u8 bus = source_id >> 8; 625 struct dma_pte *pgtable; 626 627 pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr); 628 629 /* root entry dump */ 630 if (!iommu->root_entry) { 631 pr_info("root table is not present\n"); 632 return; 633 } 634 rt_entry = &iommu->root_entry[bus]; 635 636 if (sm_supported(iommu)) 637 pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n", 638 rt_entry->hi, rt_entry->lo); 639 else 640 pr_info("root entry: 0x%016llx", rt_entry->lo); 641 642 /* context entry dump */ 643 ctx_entry = iommu_context_addr(iommu, bus, devfn, 0); 644 if (!ctx_entry) { 645 pr_info("context table is not present\n"); 646 return; 647 } 648 649 pr_info("context entry: hi 0x%016llx, low 0x%016llx\n", 650 ctx_entry->hi, ctx_entry->lo); 651 652 /* legacy mode does not require PASID entries */ 653 if (!sm_supported(iommu)) { 654 if (!context_present(ctx_entry)) { 655 pr_info("legacy mode page table is not present\n"); 656 return; 657 } 658 level = agaw_to_level(ctx_entry->hi & 7); 659 pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK); 660 goto pgtable_walk; 661 } 662 663 if (!context_present(ctx_entry)) { 664 pr_info("pasid directory table is not present\n"); 665 return; 666 } 667 668 /* get the pointer to pasid directory entry */ 669 dir = 
phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK); 670 671 /* For request-without-pasid, get the pasid from context entry */ 672 if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID) 673 pasid = IOMMU_NO_PASID; 674 675 dir_index = pasid >> PASID_PDE_SHIFT; 676 pde = &dir[dir_index]; 677 pr_info("pasid dir entry: 0x%016llx\n", pde->val); 678 679 /* get the pointer to the pasid table entry */ 680 entries = get_pasid_table_from_pde(pde); 681 if (!entries) { 682 pr_info("pasid table is not present\n"); 683 return; 684 } 685 index = pasid & PASID_PTE_MASK; 686 pte = &entries[index]; 687 for (i = 0; i < ARRAY_SIZE(pte->val); i++) 688 pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]); 689 690 if (!pasid_pte_is_present(pte)) { 691 pr_info("scalable mode page table is not present\n"); 692 return; 693 } 694 695 if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) { 696 level = pte->val[2] & BIT_ULL(2) ? 5 : 4; 697 pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK); 698 } else { 699 level = agaw_to_level((pte->val[0] >> 2) & 0x7); 700 pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK); 701 } 702 703 pgtable_walk: 704 pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level); 705 } 706 #endif 707 708 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain, 709 unsigned long pfn, int *target_level, 710 gfp_t gfp) 711 { 712 struct dma_pte *parent, *pte; 713 int level = agaw_to_level(domain->agaw); 714 int offset; 715 716 if (!domain_pfn_supported(domain, pfn)) 717 /* Address beyond IOMMU's addressing capabilities. */ 718 return NULL; 719 720 parent = domain->pgd; 721 722 while (1) { 723 void *tmp_page; 724 725 offset = pfn_level_offset(pfn, level); 726 pte = &parent[offset]; 727 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte))) 728 break; 729 if (level == *target_level) 730 break; 731 732 if (!dma_pte_present(pte)) { 733 uint64_t pteval, tmp; 734 735 tmp_page = iommu_alloc_pages_node_sz(domain->nid, gfp, 736 SZ_4K); 737 738 if (!tmp_page) 739 return NULL; 740 741 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE); 742 pteval = virt_to_phys(tmp_page) | DMA_PTE_READ | 743 DMA_PTE_WRITE; 744 if (domain->use_first_level) 745 pteval |= DMA_FL_PTE_US | DMA_FL_PTE_ACCESS; 746 747 tmp = 0ULL; 748 if (!try_cmpxchg64(&pte->val, &tmp, pteval)) 749 /* Someone else set it while we were thinking; use theirs. 
*/ 750 iommu_free_pages(tmp_page); 751 else 752 domain_flush_cache(domain, pte, sizeof(*pte)); 753 } 754 if (level == 1) 755 break; 756 757 parent = phys_to_virt(dma_pte_addr(pte)); 758 level--; 759 } 760 761 if (!*target_level) 762 *target_level = level; 763 764 return pte; 765 } 766 767 /* return address's pte at specific level */ 768 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain, 769 unsigned long pfn, 770 int level, int *large_page) 771 { 772 struct dma_pte *parent, *pte; 773 int total = agaw_to_level(domain->agaw); 774 int offset; 775 776 parent = domain->pgd; 777 while (level <= total) { 778 offset = pfn_level_offset(pfn, total); 779 pte = &parent[offset]; 780 if (level == total) 781 return pte; 782 783 if (!dma_pte_present(pte)) { 784 *large_page = total; 785 break; 786 } 787 788 if (dma_pte_superpage(pte)) { 789 *large_page = total; 790 return pte; 791 } 792 793 parent = phys_to_virt(dma_pte_addr(pte)); 794 total--; 795 } 796 return NULL; 797 } 798 799 /* clear last level pte, a tlb flush should be followed */ 800 static void dma_pte_clear_range(struct dmar_domain *domain, 801 unsigned long start_pfn, 802 unsigned long last_pfn) 803 { 804 unsigned int large_page; 805 struct dma_pte *first_pte, *pte; 806 807 if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) || 808 WARN_ON(start_pfn > last_pfn)) 809 return; 810 811 /* we don't need lock here; nobody else touches the iova range */ 812 do { 813 large_page = 1; 814 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page); 815 if (!pte) { 816 start_pfn = align_to_level(start_pfn + 1, large_page + 1); 817 continue; 818 } 819 do { 820 dma_clear_pte(pte); 821 start_pfn += lvl_to_nr_pages(large_page); 822 pte++; 823 } while (start_pfn <= last_pfn && !first_pte_in_page(pte)); 824 825 domain_flush_cache(domain, first_pte, 826 (void *)pte - (void *)first_pte); 827 828 } while (start_pfn && start_pfn <= last_pfn); 829 } 830 831 static void dma_pte_free_level(struct dmar_domain *domain, int level, 832 int retain_level, struct dma_pte *pte, 833 unsigned long pfn, unsigned long start_pfn, 834 unsigned long last_pfn) 835 { 836 pfn = max(start_pfn, pfn); 837 pte = &pte[pfn_level_offset(pfn, level)]; 838 839 do { 840 unsigned long level_pfn; 841 struct dma_pte *level_pte; 842 843 if (!dma_pte_present(pte) || dma_pte_superpage(pte)) 844 goto next; 845 846 level_pfn = pfn & level_mask(level); 847 level_pte = phys_to_virt(dma_pte_addr(pte)); 848 849 if (level > 2) { 850 dma_pte_free_level(domain, level - 1, retain_level, 851 level_pte, level_pfn, start_pfn, 852 last_pfn); 853 } 854 855 /* 856 * Free the page table if we're below the level we want to 857 * retain and the range covers the entire table. 858 */ 859 if (level < retain_level && !(start_pfn > level_pfn || 860 last_pfn < level_pfn + level_size(level) - 1)) { 861 dma_clear_pte(pte); 862 domain_flush_cache(domain, pte, sizeof(*pte)); 863 iommu_free_pages(level_pte); 864 } 865 next: 866 pfn += level_size(level); 867 } while (!first_pte_in_page(++pte) && pfn <= last_pfn); 868 } 869 870 /* 871 * clear last level (leaf) ptes and free page table pages below the 872 * level we wish to keep intact. 
873 */ 874 static void dma_pte_free_pagetable(struct dmar_domain *domain, 875 unsigned long start_pfn, 876 unsigned long last_pfn, 877 int retain_level) 878 { 879 dma_pte_clear_range(domain, start_pfn, last_pfn); 880 881 /* We don't need lock here; nobody else touches the iova range */ 882 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level, 883 domain->pgd, 0, start_pfn, last_pfn); 884 885 /* free pgd */ 886 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) { 887 iommu_free_pages(domain->pgd); 888 domain->pgd = NULL; 889 } 890 } 891 892 /* When a page at a given level is being unlinked from its parent, we don't 893 need to *modify* it at all. All we need to do is make a list of all the 894 pages which can be freed just as soon as we've flushed the IOTLB and we 895 know the hardware page-walk will no longer touch them. 896 The 'pte' argument is the *parent* PTE, pointing to the page that is to 897 be freed. */ 898 static void dma_pte_list_pagetables(struct dmar_domain *domain, 899 int level, struct dma_pte *parent_pte, 900 struct iommu_pages_list *freelist) 901 { 902 struct dma_pte *pte = phys_to_virt(dma_pte_addr(parent_pte)); 903 904 iommu_pages_list_add(freelist, pte); 905 906 if (level == 1) 907 return; 908 909 do { 910 if (dma_pte_present(pte) && !dma_pte_superpage(pte)) 911 dma_pte_list_pagetables(domain, level - 1, pte, freelist); 912 pte++; 913 } while (!first_pte_in_page(pte)); 914 } 915 916 static void dma_pte_clear_level(struct dmar_domain *domain, int level, 917 struct dma_pte *pte, unsigned long pfn, 918 unsigned long start_pfn, unsigned long last_pfn, 919 struct iommu_pages_list *freelist) 920 { 921 struct dma_pte *first_pte = NULL, *last_pte = NULL; 922 923 pfn = max(start_pfn, pfn); 924 pte = &pte[pfn_level_offset(pfn, level)]; 925 926 do { 927 unsigned long level_pfn = pfn & level_mask(level); 928 929 if (!dma_pte_present(pte)) 930 goto next; 931 932 /* If range covers entire pagetable, free it */ 933 if (start_pfn <= level_pfn && 934 last_pfn >= level_pfn + level_size(level) - 1) { 935 /* These suborbinate page tables are going away entirely. Don't 936 bother to clear them; we're just going to *free* them. */ 937 if (level > 1 && !dma_pte_superpage(pte)) 938 dma_pte_list_pagetables(domain, level - 1, pte, freelist); 939 940 dma_clear_pte(pte); 941 if (!first_pte) 942 first_pte = pte; 943 last_pte = pte; 944 } else if (level > 1) { 945 /* Recurse down into a level that isn't *entirely* obsolete */ 946 dma_pte_clear_level(domain, level - 1, 947 phys_to_virt(dma_pte_addr(pte)), 948 level_pfn, start_pfn, last_pfn, 949 freelist); 950 } 951 next: 952 pfn = level_pfn + level_size(level); 953 } while (!first_pte_in_page(++pte) && pfn <= last_pfn); 954 955 if (first_pte) 956 domain_flush_cache(domain, first_pte, 957 (void *)++last_pte - (void *)first_pte); 958 } 959 960 /* We can't just free the pages because the IOMMU may still be walking 961 the page tables, and may have cached the intermediate levels. The 962 pages can only be freed after the IOTLB flush has been done. 
*/ 963 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn, 964 unsigned long last_pfn, 965 struct iommu_pages_list *freelist) 966 { 967 if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) || 968 WARN_ON(start_pfn > last_pfn)) 969 return; 970 971 /* we don't need lock here; nobody else touches the iova range */ 972 dma_pte_clear_level(domain, agaw_to_level(domain->agaw), 973 domain->pgd, 0, start_pfn, last_pfn, freelist); 974 975 /* free pgd */ 976 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) { 977 iommu_pages_list_add(freelist, domain->pgd); 978 domain->pgd = NULL; 979 } 980 } 981 982 /* iommu handling */ 983 static int iommu_alloc_root_entry(struct intel_iommu *iommu) 984 { 985 struct root_entry *root; 986 987 root = iommu_alloc_pages_node_sz(iommu->node, GFP_ATOMIC, SZ_4K); 988 if (!root) { 989 pr_err("Allocating root entry for %s failed\n", 990 iommu->name); 991 return -ENOMEM; 992 } 993 994 __iommu_flush_cache(iommu, root, ROOT_SIZE); 995 iommu->root_entry = root; 996 997 return 0; 998 } 999 1000 static void iommu_set_root_entry(struct intel_iommu *iommu) 1001 { 1002 u64 addr; 1003 u32 sts; 1004 unsigned long flag; 1005 1006 addr = virt_to_phys(iommu->root_entry); 1007 if (sm_supported(iommu)) 1008 addr |= DMA_RTADDR_SMT; 1009 1010 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1011 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr); 1012 1013 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG); 1014 1015 /* Make sure hardware complete it */ 1016 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1017 readl, (sts & DMA_GSTS_RTPS), sts); 1018 1019 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1020 1021 /* 1022 * Hardware invalidates all DMA remapping hardware translation 1023 * caches as part of SRTP flow. 
1024 */ 1025 if (cap_esrtps(iommu->cap)) 1026 return; 1027 1028 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL); 1029 if (sm_supported(iommu)) 1030 qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0); 1031 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); 1032 } 1033 1034 void iommu_flush_write_buffer(struct intel_iommu *iommu) 1035 { 1036 u32 val; 1037 unsigned long flag; 1038 1039 if (!rwbf_quirk && !cap_rwbf(iommu->cap)) 1040 return; 1041 1042 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1043 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG); 1044 1045 /* Make sure hardware complete it */ 1046 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1047 readl, (!(val & DMA_GSTS_WBFS)), val); 1048 1049 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1050 } 1051 1052 /* return value determine if we need a write buffer flush */ 1053 static void __iommu_flush_context(struct intel_iommu *iommu, 1054 u16 did, u16 source_id, u8 function_mask, 1055 u64 type) 1056 { 1057 u64 val = 0; 1058 unsigned long flag; 1059 1060 switch (type) { 1061 case DMA_CCMD_GLOBAL_INVL: 1062 val = DMA_CCMD_GLOBAL_INVL; 1063 break; 1064 case DMA_CCMD_DOMAIN_INVL: 1065 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did); 1066 break; 1067 case DMA_CCMD_DEVICE_INVL: 1068 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did) 1069 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask); 1070 break; 1071 default: 1072 pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n", 1073 iommu->name, type); 1074 return; 1075 } 1076 val |= DMA_CCMD_ICC; 1077 1078 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1079 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val); 1080 1081 /* Make sure hardware complete it */ 1082 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG, 1083 dmar_readq, (!(val & DMA_CCMD_ICC)), val); 1084 1085 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1086 } 1087 1088 void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, 1089 unsigned int size_order, u64 type) 1090 { 1091 int tlb_offset = ecap_iotlb_offset(iommu->ecap); 1092 u64 val = 0, val_iva = 0; 1093 unsigned long flag; 1094 1095 switch (type) { 1096 case DMA_TLB_GLOBAL_FLUSH: 1097 /* global flush doesn't need set IVA_REG */ 1098 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT; 1099 break; 1100 case DMA_TLB_DSI_FLUSH: 1101 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did); 1102 break; 1103 case DMA_TLB_PSI_FLUSH: 1104 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did); 1105 /* IH bit is passed in as part of address */ 1106 val_iva = size_order | addr; 1107 break; 1108 default: 1109 pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n", 1110 iommu->name, type); 1111 return; 1112 } 1113 1114 if (cap_write_drain(iommu->cap)) 1115 val |= DMA_TLB_WRITE_DRAIN; 1116 1117 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1118 /* Note: Only uses first TLB reg currently */ 1119 if (val_iva) 1120 dmar_writeq(iommu->reg + tlb_offset, val_iva); 1121 dmar_writeq(iommu->reg + tlb_offset + 8, val); 1122 1123 /* Make sure hardware complete it */ 1124 IOMMU_WAIT_OP(iommu, tlb_offset + 8, 1125 dmar_readq, (!(val & DMA_TLB_IVT)), val); 1126 1127 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1128 1129 /* check IOTLB invalidation granularity */ 1130 if (DMA_TLB_IAIG(val) == 0) 1131 pr_err("Flush IOTLB failed\n"); 1132 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type)) 1133 pr_debug("TLB flush request %Lx, actual %Lx\n", 1134 (unsigned long long)DMA_TLB_IIRG(type), 1135 (unsigned long long)DMA_TLB_IAIG(val)); 1136 } 1137 1138 static 
struct device_domain_info * 1139 domain_lookup_dev_info(struct dmar_domain *domain, 1140 struct intel_iommu *iommu, u8 bus, u8 devfn) 1141 { 1142 struct device_domain_info *info; 1143 unsigned long flags; 1144 1145 spin_lock_irqsave(&domain->lock, flags); 1146 list_for_each_entry(info, &domain->devices, link) { 1147 if (info->iommu == iommu && info->bus == bus && 1148 info->devfn == devfn) { 1149 spin_unlock_irqrestore(&domain->lock, flags); 1150 return info; 1151 } 1152 } 1153 spin_unlock_irqrestore(&domain->lock, flags); 1154 1155 return NULL; 1156 } 1157 1158 /* 1159 * The extra devTLB flush quirk impacts those QAT devices with PCI device 1160 * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device() 1161 * check because it applies only to the built-in QAT devices and it doesn't 1162 * grant additional privileges. 1163 */ 1164 #define BUGGY_QAT_DEVID_MASK 0x4940 1165 static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev) 1166 { 1167 if (pdev->vendor != PCI_VENDOR_ID_INTEL) 1168 return false; 1169 1170 if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK) 1171 return false; 1172 1173 return true; 1174 } 1175 1176 static void iommu_enable_pci_ats(struct device_domain_info *info) 1177 { 1178 struct pci_dev *pdev; 1179 1180 if (!info->ats_supported) 1181 return; 1182 1183 pdev = to_pci_dev(info->dev); 1184 if (!pci_ats_page_aligned(pdev)) 1185 return; 1186 1187 if (!pci_enable_ats(pdev, VTD_PAGE_SHIFT)) 1188 info->ats_enabled = 1; 1189 } 1190 1191 static void iommu_disable_pci_ats(struct device_domain_info *info) 1192 { 1193 if (!info->ats_enabled) 1194 return; 1195 1196 pci_disable_ats(to_pci_dev(info->dev)); 1197 info->ats_enabled = 0; 1198 } 1199 1200 static void iommu_enable_pci_pri(struct device_domain_info *info) 1201 { 1202 struct pci_dev *pdev; 1203 1204 if (!info->ats_enabled || !info->pri_supported) 1205 return; 1206 1207 pdev = to_pci_dev(info->dev); 1208 /* PASID is required in PRG Response Message. 
*/ 1209 if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev)) 1210 return; 1211 1212 if (pci_reset_pri(pdev)) 1213 return; 1214 1215 if (!pci_enable_pri(pdev, PRQ_DEPTH)) 1216 info->pri_enabled = 1; 1217 } 1218 1219 static void iommu_disable_pci_pri(struct device_domain_info *info) 1220 { 1221 if (!info->pri_enabled) 1222 return; 1223 1224 if (WARN_ON(info->iopf_refcount)) 1225 iopf_queue_remove_device(info->iommu->iopf_queue, info->dev); 1226 1227 pci_disable_pri(to_pci_dev(info->dev)); 1228 info->pri_enabled = 0; 1229 } 1230 1231 static void intel_flush_iotlb_all(struct iommu_domain *domain) 1232 { 1233 cache_tag_flush_all(to_dmar_domain(domain)); 1234 } 1235 1236 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu) 1237 { 1238 u32 pmen; 1239 unsigned long flags; 1240 1241 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap)) 1242 return; 1243 1244 raw_spin_lock_irqsave(&iommu->register_lock, flags); 1245 pmen = readl(iommu->reg + DMAR_PMEN_REG); 1246 pmen &= ~DMA_PMEN_EPM; 1247 writel(pmen, iommu->reg + DMAR_PMEN_REG); 1248 1249 /* wait for the protected region status bit to clear */ 1250 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG, 1251 readl, !(pmen & DMA_PMEN_PRS), pmen); 1252 1253 raw_spin_unlock_irqrestore(&iommu->register_lock, flags); 1254 } 1255 1256 static void iommu_enable_translation(struct intel_iommu *iommu) 1257 { 1258 u32 sts; 1259 unsigned long flags; 1260 1261 raw_spin_lock_irqsave(&iommu->register_lock, flags); 1262 iommu->gcmd |= DMA_GCMD_TE; 1263 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); 1264 1265 /* Make sure hardware complete it */ 1266 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1267 readl, (sts & DMA_GSTS_TES), sts); 1268 1269 raw_spin_unlock_irqrestore(&iommu->register_lock, flags); 1270 } 1271 1272 static void iommu_disable_translation(struct intel_iommu *iommu) 1273 { 1274 u32 sts; 1275 unsigned long flag; 1276 1277 if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated && 1278 (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap))) 1279 return; 1280 1281 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1282 iommu->gcmd &= ~DMA_GCMD_TE; 1283 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); 1284 1285 /* Make sure hardware complete it */ 1286 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1287 readl, (!(sts & DMA_GSTS_TES)), sts); 1288 1289 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1290 } 1291 1292 static void disable_dmar_iommu(struct intel_iommu *iommu) 1293 { 1294 /* 1295 * All iommu domains must have been detached from the devices, 1296 * hence there should be no domain IDs in use. 1297 */ 1298 if (WARN_ON(!ida_is_empty(&iommu->domain_ida))) 1299 return; 1300 1301 if (iommu->gcmd & DMA_GCMD_TE) 1302 iommu_disable_translation(iommu); 1303 } 1304 1305 static void free_dmar_iommu(struct intel_iommu *iommu) 1306 { 1307 if (iommu->copied_tables) { 1308 bitmap_free(iommu->copied_tables); 1309 iommu->copied_tables = NULL; 1310 } 1311 1312 /* free context mapping */ 1313 free_context_table(iommu); 1314 1315 if (ecap_prs(iommu->ecap)) 1316 intel_iommu_finish_prq(iommu); 1317 } 1318 1319 /* 1320 * Check and return whether first level is used by default for 1321 * DMA translation. 
1322 */ 1323 static bool first_level_by_default(struct intel_iommu *iommu) 1324 { 1325 /* Only SL is available in legacy mode */ 1326 if (!sm_supported(iommu)) 1327 return false; 1328 1329 /* Only level (either FL or SL) is available, just use it */ 1330 if (ecap_flts(iommu->ecap) ^ ecap_slts(iommu->ecap)) 1331 return ecap_flts(iommu->ecap); 1332 1333 return true; 1334 } 1335 1336 int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) 1337 { 1338 struct iommu_domain_info *info, *curr; 1339 int num, ret = -ENOSPC; 1340 1341 if (domain->domain.type == IOMMU_DOMAIN_SVA) 1342 return 0; 1343 1344 info = kzalloc(sizeof(*info), GFP_KERNEL); 1345 if (!info) 1346 return -ENOMEM; 1347 1348 guard(mutex)(&iommu->did_lock); 1349 curr = xa_load(&domain->iommu_array, iommu->seq_id); 1350 if (curr) { 1351 curr->refcnt++; 1352 kfree(info); 1353 return 0; 1354 } 1355 1356 num = ida_alloc_range(&iommu->domain_ida, IDA_START_DID, 1357 cap_ndoms(iommu->cap) - 1, GFP_KERNEL); 1358 if (num < 0) { 1359 pr_err("%s: No free domain ids\n", iommu->name); 1360 goto err_unlock; 1361 } 1362 1363 info->refcnt = 1; 1364 info->did = num; 1365 info->iommu = iommu; 1366 curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id, 1367 NULL, info, GFP_KERNEL); 1368 if (curr) { 1369 ret = xa_err(curr) ? : -EBUSY; 1370 goto err_clear; 1371 } 1372 1373 return 0; 1374 1375 err_clear: 1376 ida_free(&iommu->domain_ida, info->did); 1377 err_unlock: 1378 kfree(info); 1379 return ret; 1380 } 1381 1382 void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) 1383 { 1384 struct iommu_domain_info *info; 1385 1386 if (domain->domain.type == IOMMU_DOMAIN_SVA) 1387 return; 1388 1389 guard(mutex)(&iommu->did_lock); 1390 info = xa_load(&domain->iommu_array, iommu->seq_id); 1391 if (--info->refcnt == 0) { 1392 ida_free(&iommu->domain_ida, info->did); 1393 xa_erase(&domain->iommu_array, iommu->seq_id); 1394 domain->nid = NUMA_NO_NODE; 1395 kfree(info); 1396 } 1397 } 1398 1399 static void domain_exit(struct dmar_domain *domain) 1400 { 1401 if (domain->pgd) { 1402 struct iommu_pages_list freelist = 1403 IOMMU_PAGES_LIST_INIT(freelist); 1404 1405 domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist); 1406 iommu_put_pages_list(&freelist); 1407 } 1408 1409 if (WARN_ON(!list_empty(&domain->devices))) 1410 return; 1411 1412 kfree(domain->qi_batch); 1413 kfree(domain); 1414 } 1415 1416 /* 1417 * For kdump cases, old valid entries may be cached due to the 1418 * in-flight DMA and copied pgtable, but there is no unmapping 1419 * behaviour for them, thus we need an explicit cache flush for 1420 * the newly-mapped device. For kdump, at this point, the device 1421 * is supposed to finish reset at its driver probe stage, so no 1422 * in-flight DMA will exist, and we don't need to worry anymore 1423 * hereafter. 
1424 */ 1425 static void copied_context_tear_down(struct intel_iommu *iommu, 1426 struct context_entry *context, 1427 u8 bus, u8 devfn) 1428 { 1429 u16 did_old; 1430 1431 if (!context_copied(iommu, bus, devfn)) 1432 return; 1433 1434 assert_spin_locked(&iommu->lock); 1435 1436 did_old = context_domain_id(context); 1437 context_clear_entry(context); 1438 1439 if (did_old < cap_ndoms(iommu->cap)) { 1440 iommu->flush.flush_context(iommu, did_old, 1441 PCI_DEVID(bus, devfn), 1442 DMA_CCMD_MASK_NOBIT, 1443 DMA_CCMD_DEVICE_INVL); 1444 iommu->flush.flush_iotlb(iommu, did_old, 0, 0, 1445 DMA_TLB_DSI_FLUSH); 1446 } 1447 1448 clear_context_copied(iommu, bus, devfn); 1449 } 1450 1451 /* 1452 * It's a non-present to present mapping. If hardware doesn't cache 1453 * non-present entry we only need to flush the write-buffer. If the 1454 * _does_ cache non-present entries, then it does so in the special 1455 * domain #0, which we have to flush: 1456 */ 1457 static void context_present_cache_flush(struct intel_iommu *iommu, u16 did, 1458 u8 bus, u8 devfn) 1459 { 1460 if (cap_caching_mode(iommu->cap)) { 1461 iommu->flush.flush_context(iommu, 0, 1462 PCI_DEVID(bus, devfn), 1463 DMA_CCMD_MASK_NOBIT, 1464 DMA_CCMD_DEVICE_INVL); 1465 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); 1466 } else { 1467 iommu_flush_write_buffer(iommu); 1468 } 1469 } 1470 1471 static int domain_context_mapping_one(struct dmar_domain *domain, 1472 struct intel_iommu *iommu, 1473 u8 bus, u8 devfn) 1474 { 1475 struct device_domain_info *info = 1476 domain_lookup_dev_info(domain, iommu, bus, devfn); 1477 u16 did = domain_id_iommu(domain, iommu); 1478 int translation = CONTEXT_TT_MULTI_LEVEL; 1479 struct dma_pte *pgd = domain->pgd; 1480 struct context_entry *context; 1481 int ret; 1482 1483 pr_debug("Set context mapping for %02x:%02x.%d\n", 1484 bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); 1485 1486 spin_lock(&iommu->lock); 1487 ret = -ENOMEM; 1488 context = iommu_context_addr(iommu, bus, devfn, 1); 1489 if (!context) 1490 goto out_unlock; 1491 1492 ret = 0; 1493 if (context_present(context) && !context_copied(iommu, bus, devfn)) 1494 goto out_unlock; 1495 1496 copied_context_tear_down(iommu, context, bus, devfn); 1497 context_clear_entry(context); 1498 context_set_domain_id(context, did); 1499 1500 if (info && info->ats_supported) 1501 translation = CONTEXT_TT_DEV_IOTLB; 1502 else 1503 translation = CONTEXT_TT_MULTI_LEVEL; 1504 1505 context_set_address_root(context, virt_to_phys(pgd)); 1506 context_set_address_width(context, domain->agaw); 1507 context_set_translation_type(context, translation); 1508 context_set_fault_enable(context); 1509 context_set_present(context); 1510 if (!ecap_coherent(iommu->ecap)) 1511 clflush_cache_range(context, sizeof(*context)); 1512 context_present_cache_flush(iommu, did, bus, devfn); 1513 ret = 0; 1514 1515 out_unlock: 1516 spin_unlock(&iommu->lock); 1517 1518 return ret; 1519 } 1520 1521 static int domain_context_mapping_cb(struct pci_dev *pdev, 1522 u16 alias, void *opaque) 1523 { 1524 struct device_domain_info *info = dev_iommu_priv_get(&pdev->dev); 1525 struct intel_iommu *iommu = info->iommu; 1526 struct dmar_domain *domain = opaque; 1527 1528 return domain_context_mapping_one(domain, iommu, 1529 PCI_BUS_NUM(alias), alias & 0xff); 1530 } 1531 1532 static int 1533 domain_context_mapping(struct dmar_domain *domain, struct device *dev) 1534 { 1535 struct device_domain_info *info = dev_iommu_priv_get(dev); 1536 struct intel_iommu *iommu = info->iommu; 1537 u8 bus = info->bus, devfn = 
info->devfn; 1538 int ret; 1539 1540 if (!dev_is_pci(dev)) 1541 return domain_context_mapping_one(domain, iommu, bus, devfn); 1542 1543 ret = pci_for_each_dma_alias(to_pci_dev(dev), 1544 domain_context_mapping_cb, domain); 1545 if (ret) 1546 return ret; 1547 1548 iommu_enable_pci_ats(info); 1549 1550 return 0; 1551 } 1552 1553 /* Return largest possible superpage level for a given mapping */ 1554 static int hardware_largepage_caps(struct dmar_domain *domain, unsigned long iov_pfn, 1555 unsigned long phy_pfn, unsigned long pages) 1556 { 1557 int support, level = 1; 1558 unsigned long pfnmerge; 1559 1560 support = domain->iommu_superpage; 1561 1562 /* To use a large page, the virtual *and* physical addresses 1563 must be aligned to 2MiB/1GiB/etc. Lower bits set in either 1564 of them will mean we have to use smaller pages. So just 1565 merge them and check both at once. */ 1566 pfnmerge = iov_pfn | phy_pfn; 1567 1568 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) { 1569 pages >>= VTD_STRIDE_SHIFT; 1570 if (!pages) 1571 break; 1572 pfnmerge >>= VTD_STRIDE_SHIFT; 1573 level++; 1574 support--; 1575 } 1576 return level; 1577 } 1578 1579 /* 1580 * Ensure that old small page tables are removed to make room for superpage(s). 1581 * We're going to add new large pages, so make sure we don't remove their parent 1582 * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared. 1583 */ 1584 static void switch_to_super_page(struct dmar_domain *domain, 1585 unsigned long start_pfn, 1586 unsigned long end_pfn, int level) 1587 { 1588 unsigned long lvl_pages = lvl_to_nr_pages(level); 1589 struct dma_pte *pte = NULL; 1590 1591 while (start_pfn <= end_pfn) { 1592 if (!pte) 1593 pte = pfn_to_dma_pte(domain, start_pfn, &level, 1594 GFP_ATOMIC); 1595 1596 if (dma_pte_present(pte)) { 1597 dma_pte_free_pagetable(domain, start_pfn, 1598 start_pfn + lvl_pages - 1, 1599 level + 1); 1600 1601 cache_tag_flush_range(domain, start_pfn << VTD_PAGE_SHIFT, 1602 end_pfn << VTD_PAGE_SHIFT, 0); 1603 } 1604 1605 pte++; 1606 start_pfn += lvl_pages; 1607 if (first_pte_in_page(pte)) 1608 pte = NULL; 1609 } 1610 } 1611 1612 static int 1613 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, 1614 unsigned long phys_pfn, unsigned long nr_pages, int prot, 1615 gfp_t gfp) 1616 { 1617 struct dma_pte *first_pte = NULL, *pte = NULL; 1618 unsigned int largepage_lvl = 0; 1619 unsigned long lvl_pages = 0; 1620 phys_addr_t pteval; 1621 u64 attr; 1622 1623 if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1))) 1624 return -EINVAL; 1625 1626 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0) 1627 return -EINVAL; 1628 1629 if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) { 1630 pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n"); 1631 return -EINVAL; 1632 } 1633 1634 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP); 1635 if (domain->use_first_level) { 1636 attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS; 1637 if (prot & DMA_PTE_WRITE) 1638 attr |= DMA_FL_PTE_DIRTY; 1639 } 1640 1641 domain->has_mappings = true; 1642 1643 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr; 1644 1645 while (nr_pages > 0) { 1646 uint64_t tmp; 1647 1648 if (!pte) { 1649 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, 1650 phys_pfn, nr_pages); 1651 1652 pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl, 1653 gfp); 1654 if (!pte) 1655 return -ENOMEM; 1656 first_pte = 
pte; 1657 1658 lvl_pages = lvl_to_nr_pages(largepage_lvl); 1659 1660 /* It is large page*/ 1661 if (largepage_lvl > 1) { 1662 unsigned long end_pfn; 1663 unsigned long pages_to_remove; 1664 1665 pteval |= DMA_PTE_LARGE_PAGE; 1666 pages_to_remove = min_t(unsigned long, nr_pages, 1667 nr_pte_to_next_page(pte) * lvl_pages); 1668 end_pfn = iov_pfn + pages_to_remove - 1; 1669 switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl); 1670 } else { 1671 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE; 1672 } 1673 1674 } 1675 /* We don't need lock here, nobody else 1676 * touches the iova range 1677 */ 1678 tmp = 0ULL; 1679 if (!try_cmpxchg64_local(&pte->val, &tmp, pteval)) { 1680 static int dumps = 5; 1681 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n", 1682 iov_pfn, tmp, (unsigned long long)pteval); 1683 if (dumps) { 1684 dumps--; 1685 debug_dma_dump_mappings(NULL); 1686 } 1687 WARN_ON(1); 1688 } 1689 1690 nr_pages -= lvl_pages; 1691 iov_pfn += lvl_pages; 1692 phys_pfn += lvl_pages; 1693 pteval += lvl_pages * VTD_PAGE_SIZE; 1694 1695 /* If the next PTE would be the first in a new page, then we 1696 * need to flush the cache on the entries we've just written. 1697 * And then we'll need to recalculate 'pte', so clear it and 1698 * let it get set again in the if (!pte) block above. 1699 * 1700 * If we're done (!nr_pages) we need to flush the cache too. 1701 * 1702 * Also if we've been setting superpages, we may need to 1703 * recalculate 'pte' and switch back to smaller pages for the 1704 * end of the mapping, if the trailing size is not enough to 1705 * use another superpage (i.e. nr_pages < lvl_pages). 1706 */ 1707 pte++; 1708 if (!nr_pages || first_pte_in_page(pte) || 1709 (largepage_lvl > 1 && nr_pages < lvl_pages)) { 1710 domain_flush_cache(domain, first_pte, 1711 (void *)pte - (void *)first_pte); 1712 pte = NULL; 1713 } 1714 } 1715 1716 return 0; 1717 } 1718 1719 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn) 1720 { 1721 struct intel_iommu *iommu = info->iommu; 1722 struct context_entry *context; 1723 u16 did; 1724 1725 spin_lock(&iommu->lock); 1726 context = iommu_context_addr(iommu, bus, devfn, 0); 1727 if (!context) { 1728 spin_unlock(&iommu->lock); 1729 return; 1730 } 1731 1732 did = context_domain_id(context); 1733 context_clear_entry(context); 1734 __iommu_flush_cache(iommu, context, sizeof(*context)); 1735 spin_unlock(&iommu->lock); 1736 intel_context_flush_no_pasid(info, context, did); 1737 } 1738 1739 int __domain_setup_first_level(struct intel_iommu *iommu, 1740 struct device *dev, ioasid_t pasid, 1741 u16 did, pgd_t *pgd, int flags, 1742 struct iommu_domain *old) 1743 { 1744 if (!old) 1745 return intel_pasid_setup_first_level(iommu, dev, pgd, 1746 pasid, did, flags); 1747 return intel_pasid_replace_first_level(iommu, dev, pgd, pasid, did, 1748 iommu_domain_did(old, iommu), 1749 flags); 1750 } 1751 1752 static int domain_setup_second_level(struct intel_iommu *iommu, 1753 struct dmar_domain *domain, 1754 struct device *dev, ioasid_t pasid, 1755 struct iommu_domain *old) 1756 { 1757 if (!old) 1758 return intel_pasid_setup_second_level(iommu, domain, 1759 dev, pasid); 1760 return intel_pasid_replace_second_level(iommu, domain, dev, 1761 iommu_domain_did(old, iommu), 1762 pasid); 1763 } 1764 1765 static int domain_setup_passthrough(struct intel_iommu *iommu, 1766 struct device *dev, ioasid_t pasid, 1767 struct iommu_domain *old) 1768 { 1769 if (!old) 1770 return intel_pasid_setup_pass_through(iommu, dev, pasid); 1771 return 
intel_pasid_replace_pass_through(iommu, dev, 1772 iommu_domain_did(old, iommu), 1773 pasid); 1774 } 1775 1776 static int domain_setup_first_level(struct intel_iommu *iommu, 1777 struct dmar_domain *domain, 1778 struct device *dev, 1779 u32 pasid, struct iommu_domain *old) 1780 { 1781 struct dma_pte *pgd = domain->pgd; 1782 int level, flags = 0; 1783 1784 level = agaw_to_level(domain->agaw); 1785 if (level != 4 && level != 5) 1786 return -EINVAL; 1787 1788 if (level == 5) 1789 flags |= PASID_FLAG_FL5LP; 1790 1791 if (domain->force_snooping) 1792 flags |= PASID_FLAG_PAGE_SNOOP; 1793 1794 return __domain_setup_first_level(iommu, dev, pasid, 1795 domain_id_iommu(domain, iommu), 1796 (pgd_t *)pgd, flags, old); 1797 } 1798 1799 static int dmar_domain_attach_device(struct dmar_domain *domain, 1800 struct device *dev) 1801 { 1802 struct device_domain_info *info = dev_iommu_priv_get(dev); 1803 struct intel_iommu *iommu = info->iommu; 1804 unsigned long flags; 1805 int ret; 1806 1807 ret = domain_attach_iommu(domain, iommu); 1808 if (ret) 1809 return ret; 1810 1811 info->domain = domain; 1812 info->domain_attached = true; 1813 spin_lock_irqsave(&domain->lock, flags); 1814 list_add(&info->link, &domain->devices); 1815 spin_unlock_irqrestore(&domain->lock, flags); 1816 1817 if (dev_is_real_dma_subdevice(dev)) 1818 return 0; 1819 1820 if (!sm_supported(iommu)) 1821 ret = domain_context_mapping(domain, dev); 1822 else if (domain->use_first_level) 1823 ret = domain_setup_first_level(iommu, domain, dev, 1824 IOMMU_NO_PASID, NULL); 1825 else 1826 ret = domain_setup_second_level(iommu, domain, dev, 1827 IOMMU_NO_PASID, NULL); 1828 1829 if (ret) 1830 goto out_block_translation; 1831 1832 ret = cache_tag_assign_domain(domain, dev, IOMMU_NO_PASID); 1833 if (ret) 1834 goto out_block_translation; 1835 1836 return 0; 1837 1838 out_block_translation: 1839 device_block_translation(dev); 1840 return ret; 1841 } 1842 1843 /** 1844 * device_rmrr_is_relaxable - Test whether the RMRR of this device 1845 * is relaxable (ie. is allowed to be not enforced under some conditions) 1846 * @dev: device handle 1847 * 1848 * We assume that PCI USB devices with RMRRs have them largely 1849 * for historical reasons and that the RMRR space is not actively used post 1850 * boot. This exclusion may change if vendors begin to abuse it. 1851 * 1852 * The same exception is made for graphics devices, with the requirement that 1853 * any use of the RMRR regions will be torn down before assigning the device 1854 * to a guest. 1855 * 1856 * Return: true if the RMRR is relaxable, false otherwise 1857 */ 1858 static bool device_rmrr_is_relaxable(struct device *dev) 1859 { 1860 struct pci_dev *pdev; 1861 1862 if (!dev_is_pci(dev)) 1863 return false; 1864 1865 pdev = to_pci_dev(dev); 1866 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev)) 1867 return true; 1868 else 1869 return false; 1870 } 1871 1872 static int device_def_domain_type(struct device *dev) 1873 { 1874 struct device_domain_info *info = dev_iommu_priv_get(dev); 1875 struct intel_iommu *iommu = info->iommu; 1876 1877 /* 1878 * Hardware does not support the passthrough translation mode. 1879 * Always use a dynamaic mapping domain. 
1880 */ 1881 if (!ecap_pass_through(iommu->ecap)) 1882 return IOMMU_DOMAIN_DMA; 1883 1884 if (dev_is_pci(dev)) { 1885 struct pci_dev *pdev = to_pci_dev(dev); 1886 1887 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev)) 1888 return IOMMU_DOMAIN_IDENTITY; 1889 } 1890 1891 return 0; 1892 } 1893 1894 static void intel_iommu_init_qi(struct intel_iommu *iommu) 1895 { 1896 /* 1897 * Start from the sane iommu hardware state. 1898 * If the queued invalidation is already initialized by us 1899 * (for example, while enabling interrupt-remapping) then 1900 * we got the things already rolling from a sane state. 1901 */ 1902 if (!iommu->qi) { 1903 /* 1904 * Clear any previous faults. 1905 */ 1906 dmar_fault(-1, iommu); 1907 /* 1908 * Disable queued invalidation if supported and already enabled 1909 * before OS handover. 1910 */ 1911 dmar_disable_qi(iommu); 1912 } 1913 1914 if (dmar_enable_qi(iommu)) { 1915 /* 1916 * Queued Invalidate not enabled, use Register Based Invalidate 1917 */ 1918 iommu->flush.flush_context = __iommu_flush_context; 1919 iommu->flush.flush_iotlb = __iommu_flush_iotlb; 1920 pr_info("%s: Using Register based invalidation\n", 1921 iommu->name); 1922 } else { 1923 iommu->flush.flush_context = qi_flush_context; 1924 iommu->flush.flush_iotlb = qi_flush_iotlb; 1925 pr_info("%s: Using Queued invalidation\n", iommu->name); 1926 } 1927 } 1928 1929 static int copy_context_table(struct intel_iommu *iommu, 1930 struct root_entry *old_re, 1931 struct context_entry **tbl, 1932 int bus, bool ext) 1933 { 1934 int tbl_idx, pos = 0, idx, devfn, ret = 0, did; 1935 struct context_entry *new_ce = NULL, ce; 1936 struct context_entry *old_ce = NULL; 1937 struct root_entry re; 1938 phys_addr_t old_ce_phys; 1939 1940 tbl_idx = ext ? bus * 2 : bus; 1941 memcpy(&re, old_re, sizeof(re)); 1942 1943 for (devfn = 0; devfn < 256; devfn++) { 1944 /* First calculate the correct index */ 1945 idx = (ext ? 
devfn * 2 : devfn) % 256; 1946 1947 if (idx == 0) { 1948 /* First save what we may have and clean up */ 1949 if (new_ce) { 1950 tbl[tbl_idx] = new_ce; 1951 __iommu_flush_cache(iommu, new_ce, 1952 VTD_PAGE_SIZE); 1953 pos = 1; 1954 } 1955 1956 if (old_ce) 1957 memunmap(old_ce); 1958 1959 ret = 0; 1960 if (devfn < 0x80) 1961 old_ce_phys = root_entry_lctp(&re); 1962 else 1963 old_ce_phys = root_entry_uctp(&re); 1964 1965 if (!old_ce_phys) { 1966 if (ext && devfn == 0) { 1967 /* No LCTP, try UCTP */ 1968 devfn = 0x7f; 1969 continue; 1970 } else { 1971 goto out; 1972 } 1973 } 1974 1975 ret = -ENOMEM; 1976 old_ce = memremap(old_ce_phys, PAGE_SIZE, 1977 MEMREMAP_WB); 1978 if (!old_ce) 1979 goto out; 1980 1981 new_ce = iommu_alloc_pages_node_sz(iommu->node, 1982 GFP_KERNEL, SZ_4K); 1983 if (!new_ce) 1984 goto out_unmap; 1985 1986 ret = 0; 1987 } 1988 1989 /* Now copy the context entry */ 1990 memcpy(&ce, old_ce + idx, sizeof(ce)); 1991 1992 if (!context_present(&ce)) 1993 continue; 1994 1995 did = context_domain_id(&ce); 1996 if (did >= 0 && did < cap_ndoms(iommu->cap)) 1997 ida_alloc_range(&iommu->domain_ida, did, did, GFP_KERNEL); 1998 1999 set_context_copied(iommu, bus, devfn); 2000 new_ce[idx] = ce; 2001 } 2002 2003 tbl[tbl_idx + pos] = new_ce; 2004 2005 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE); 2006 2007 out_unmap: 2008 memunmap(old_ce); 2009 2010 out: 2011 return ret; 2012 } 2013 2014 static int copy_translation_tables(struct intel_iommu *iommu) 2015 { 2016 struct context_entry **ctxt_tbls; 2017 struct root_entry *old_rt; 2018 phys_addr_t old_rt_phys; 2019 int ctxt_table_entries; 2020 u64 rtaddr_reg; 2021 int bus, ret; 2022 bool new_ext, ext; 2023 2024 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG); 2025 ext = !!(rtaddr_reg & DMA_RTADDR_SMT); 2026 new_ext = !!sm_supported(iommu); 2027 2028 /* 2029 * The RTT bit can only be changed when translation is disabled, 2030 * but disabling translation means to open a window for data 2031 * corruption. So bail out and don't copy anything if we would 2032 * have to change the bit. 2033 */ 2034 if (new_ext != ext) 2035 return -EINVAL; 2036 2037 iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL); 2038 if (!iommu->copied_tables) 2039 return -ENOMEM; 2040 2041 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK; 2042 if (!old_rt_phys) 2043 return -EINVAL; 2044 2045 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB); 2046 if (!old_rt) 2047 return -ENOMEM; 2048 2049 /* This is too big for the stack - allocate it from slab */ 2050 ctxt_table_entries = ext ? 512 : 256; 2051 ret = -ENOMEM; 2052 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL); 2053 if (!ctxt_tbls) 2054 goto out_unmap; 2055 2056 for (bus = 0; bus < 256; bus++) { 2057 ret = copy_context_table(iommu, &old_rt[bus], 2058 ctxt_tbls, bus, ext); 2059 if (ret) { 2060 pr_err("%s: Failed to copy context table for bus %d\n", 2061 iommu->name, bus); 2062 continue; 2063 } 2064 } 2065 2066 spin_lock(&iommu->lock); 2067 2068 /* Context tables are copied, now write them to the root_entry table */ 2069 for (bus = 0; bus < 256; bus++) { 2070 int idx = ext ? 
bus * 2 : bus; 2071 u64 val; 2072 2073 if (ctxt_tbls[idx]) { 2074 val = virt_to_phys(ctxt_tbls[idx]) | 1; 2075 iommu->root_entry[bus].lo = val; 2076 } 2077 2078 if (!ext || !ctxt_tbls[idx + 1]) 2079 continue; 2080 2081 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1; 2082 iommu->root_entry[bus].hi = val; 2083 } 2084 2085 spin_unlock(&iommu->lock); 2086 2087 kfree(ctxt_tbls); 2088 2089 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE); 2090 2091 ret = 0; 2092 2093 out_unmap: 2094 memunmap(old_rt); 2095 2096 return ret; 2097 } 2098 2099 static int __init init_dmars(void) 2100 { 2101 struct dmar_drhd_unit *drhd; 2102 struct intel_iommu *iommu; 2103 int ret; 2104 2105 for_each_iommu(iommu, drhd) { 2106 if (drhd->ignored) { 2107 iommu_disable_translation(iommu); 2108 continue; 2109 } 2110 2111 /* 2112 * Find the max pasid size of all IOMMU's in the system. 2113 * We need to ensure the system pasid table is no bigger 2114 * than the smallest supported. 2115 */ 2116 if (pasid_supported(iommu)) { 2117 u32 temp = 2 << ecap_pss(iommu->ecap); 2118 2119 intel_pasid_max_id = min_t(u32, temp, 2120 intel_pasid_max_id); 2121 } 2122 2123 intel_iommu_init_qi(iommu); 2124 init_translation_status(iommu); 2125 2126 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) { 2127 iommu_disable_translation(iommu); 2128 clear_translation_pre_enabled(iommu); 2129 pr_warn("Translation was enabled for %s but we are not in kdump mode\n", 2130 iommu->name); 2131 } 2132 2133 /* 2134 * TBD: 2135 * we could share the same root & context tables 2136 * among all IOMMU's. Need to Split it later. 2137 */ 2138 ret = iommu_alloc_root_entry(iommu); 2139 if (ret) 2140 goto free_iommu; 2141 2142 if (translation_pre_enabled(iommu)) { 2143 pr_info("Translation already enabled - trying to copy translation structures\n"); 2144 2145 ret = copy_translation_tables(iommu); 2146 if (ret) { 2147 /* 2148 * We found the IOMMU with translation 2149 * enabled - but failed to copy over the 2150 * old root-entry table. Try to proceed 2151 * by disabling translation now and 2152 * allocating a clean root-entry table. 2153 * This might cause DMAR faults, but 2154 * probably the dump will still succeed. 2155 */ 2156 pr_err("Failed to copy translation tables from previous kernel for %s\n", 2157 iommu->name); 2158 iommu_disable_translation(iommu); 2159 clear_translation_pre_enabled(iommu); 2160 } else { 2161 pr_info("Copied translation tables from previous kernel for %s\n", 2162 iommu->name); 2163 } 2164 } 2165 2166 intel_svm_check(iommu); 2167 } 2168 2169 /* 2170 * Now that qi is enabled on all iommus, set the root entry and flush 2171 * caches. This is required on some Intel X58 chipsets, otherwise the 2172 * flush_context function will loop forever and the boot hangs. 2173 */ 2174 for_each_active_iommu(iommu, drhd) { 2175 iommu_flush_write_buffer(iommu); 2176 iommu_set_root_entry(iommu); 2177 } 2178 2179 check_tylersburg_isoch(); 2180 2181 /* 2182 * for each drhd 2183 * enable fault log 2184 * global invalidate context cache 2185 * global invalidate iotlb 2186 * enable translation 2187 */ 2188 for_each_iommu(iommu, drhd) { 2189 if (drhd->ignored) { 2190 /* 2191 * we always have to disable PMRs or DMA may fail on 2192 * this device 2193 */ 2194 if (force_on) 2195 iommu_disable_protect_mem_regions(iommu); 2196 continue; 2197 } 2198 2199 iommu_flush_write_buffer(iommu); 2200 2201 if (ecap_prs(iommu->ecap)) { 2202 /* 2203 * Call dmar_alloc_hwirq() with dmar_global_lock held, 2204 * could cause possible lock race condition. 
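 * To avoid that, dmar_global_lock is dropped across the
 * intel_iommu_enable_prq() call below and re-acquired afterwards.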
2205 */ 2206 up_write(&dmar_global_lock); 2207 ret = intel_iommu_enable_prq(iommu); 2208 down_write(&dmar_global_lock); 2209 if (ret) 2210 goto free_iommu; 2211 } 2212 2213 ret = dmar_set_interrupt(iommu); 2214 if (ret) 2215 goto free_iommu; 2216 } 2217 2218 return 0; 2219 2220 free_iommu: 2221 for_each_active_iommu(iommu, drhd) { 2222 disable_dmar_iommu(iommu); 2223 free_dmar_iommu(iommu); 2224 } 2225 2226 return ret; 2227 } 2228 2229 static void __init init_no_remapping_devices(void) 2230 { 2231 struct dmar_drhd_unit *drhd; 2232 struct device *dev; 2233 int i; 2234 2235 for_each_drhd_unit(drhd) { 2236 if (!drhd->include_all) { 2237 for_each_active_dev_scope(drhd->devices, 2238 drhd->devices_cnt, i, dev) 2239 break; 2240 /* ignore DMAR unit if no devices exist */ 2241 if (i == drhd->devices_cnt) 2242 drhd->ignored = 1; 2243 } 2244 } 2245 2246 for_each_active_drhd_unit(drhd) { 2247 if (drhd->include_all) 2248 continue; 2249 2250 for_each_active_dev_scope(drhd->devices, 2251 drhd->devices_cnt, i, dev) 2252 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev))) 2253 break; 2254 if (i < drhd->devices_cnt) 2255 continue; 2256 2257 /* This IOMMU has *only* gfx devices. Either bypass it or 2258 set the gfx_mapped flag, as appropriate */ 2259 drhd->gfx_dedicated = 1; 2260 if (disable_igfx_iommu) 2261 drhd->ignored = 1; 2262 } 2263 } 2264 2265 #ifdef CONFIG_SUSPEND 2266 static int init_iommu_hw(void) 2267 { 2268 struct dmar_drhd_unit *drhd; 2269 struct intel_iommu *iommu = NULL; 2270 int ret; 2271 2272 for_each_active_iommu(iommu, drhd) { 2273 if (iommu->qi) { 2274 ret = dmar_reenable_qi(iommu); 2275 if (ret) 2276 return ret; 2277 } 2278 } 2279 2280 for_each_iommu(iommu, drhd) { 2281 if (drhd->ignored) { 2282 /* 2283 * we always have to disable PMRs or DMA may fail on 2284 * this device 2285 */ 2286 if (force_on) 2287 iommu_disable_protect_mem_regions(iommu); 2288 continue; 2289 } 2290 2291 iommu_flush_write_buffer(iommu); 2292 iommu_set_root_entry(iommu); 2293 iommu_enable_translation(iommu); 2294 iommu_disable_protect_mem_regions(iommu); 2295 } 2296 2297 return 0; 2298 } 2299 2300 static void iommu_flush_all(void) 2301 { 2302 struct dmar_drhd_unit *drhd; 2303 struct intel_iommu *iommu; 2304 2305 for_each_active_iommu(iommu, drhd) { 2306 iommu->flush.flush_context(iommu, 0, 0, 0, 2307 DMA_CCMD_GLOBAL_INVL); 2308 iommu->flush.flush_iotlb(iommu, 0, 0, 0, 2309 DMA_TLB_GLOBAL_FLUSH); 2310 } 2311 } 2312 2313 static int iommu_suspend(void) 2314 { 2315 struct dmar_drhd_unit *drhd; 2316 struct intel_iommu *iommu = NULL; 2317 unsigned long flag; 2318 2319 iommu_flush_all(); 2320 2321 for_each_active_iommu(iommu, drhd) { 2322 iommu_disable_translation(iommu); 2323 2324 raw_spin_lock_irqsave(&iommu->register_lock, flag); 2325 2326 iommu->iommu_state[SR_DMAR_FECTL_REG] = 2327 readl(iommu->reg + DMAR_FECTL_REG); 2328 iommu->iommu_state[SR_DMAR_FEDATA_REG] = 2329 readl(iommu->reg + DMAR_FEDATA_REG); 2330 iommu->iommu_state[SR_DMAR_FEADDR_REG] = 2331 readl(iommu->reg + DMAR_FEADDR_REG); 2332 iommu->iommu_state[SR_DMAR_FEUADDR_REG] = 2333 readl(iommu->reg + DMAR_FEUADDR_REG); 2334 2335 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 2336 } 2337 return 0; 2338 } 2339 2340 static void iommu_resume(void) 2341 { 2342 struct dmar_drhd_unit *drhd; 2343 struct intel_iommu *iommu = NULL; 2344 unsigned long flag; 2345 2346 if (init_iommu_hw()) { 2347 if (force_on) 2348 panic("tboot: IOMMU setup failed, DMAR can not resume!\n"); 2349 else 2350 WARN(1, "IOMMU setup failed, DMAR can not resume!\n"); 2351 return; 
2352 } 2353 2354 for_each_active_iommu(iommu, drhd) { 2355 2356 raw_spin_lock_irqsave(&iommu->register_lock, flag); 2357 2358 writel(iommu->iommu_state[SR_DMAR_FECTL_REG], 2359 iommu->reg + DMAR_FECTL_REG); 2360 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG], 2361 iommu->reg + DMAR_FEDATA_REG); 2362 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG], 2363 iommu->reg + DMAR_FEADDR_REG); 2364 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG], 2365 iommu->reg + DMAR_FEUADDR_REG); 2366 2367 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 2368 } 2369 } 2370 2371 static struct syscore_ops iommu_syscore_ops = { 2372 .resume = iommu_resume, 2373 .suspend = iommu_suspend, 2374 }; 2375 2376 static void __init init_iommu_pm_ops(void) 2377 { 2378 register_syscore_ops(&iommu_syscore_ops); 2379 } 2380 2381 #else 2382 static inline void init_iommu_pm_ops(void) {} 2383 #endif /* CONFIG_PM */ 2384 2385 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr) 2386 { 2387 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) || 2388 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) || 2389 rmrr->end_address <= rmrr->base_address || 2390 arch_rmrr_sanity_check(rmrr)) 2391 return -EINVAL; 2392 2393 return 0; 2394 } 2395 2396 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg) 2397 { 2398 struct acpi_dmar_reserved_memory *rmrr; 2399 struct dmar_rmrr_unit *rmrru; 2400 2401 rmrr = (struct acpi_dmar_reserved_memory *)header; 2402 if (rmrr_sanity_check(rmrr)) { 2403 pr_warn(FW_BUG 2404 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n" 2405 "BIOS vendor: %s; Ver: %s; Product Version: %s\n", 2406 rmrr->base_address, rmrr->end_address, 2407 dmi_get_system_info(DMI_BIOS_VENDOR), 2408 dmi_get_system_info(DMI_BIOS_VERSION), 2409 dmi_get_system_info(DMI_PRODUCT_VERSION)); 2410 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); 2411 } 2412 2413 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL); 2414 if (!rmrru) 2415 goto out; 2416 2417 rmrru->hdr = header; 2418 2419 rmrru->base_address = rmrr->base_address; 2420 rmrru->end_address = rmrr->end_address; 2421 2422 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1), 2423 ((void *)rmrr) + rmrr->header.length, 2424 &rmrru->devices_cnt); 2425 if (rmrru->devices_cnt && rmrru->devices == NULL) 2426 goto free_rmrru; 2427 2428 list_add(&rmrru->list, &dmar_rmrr_units); 2429 2430 return 0; 2431 free_rmrru: 2432 kfree(rmrru); 2433 out: 2434 return -ENOMEM; 2435 } 2436 2437 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr) 2438 { 2439 struct dmar_atsr_unit *atsru; 2440 struct acpi_dmar_atsr *tmp; 2441 2442 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list, 2443 dmar_rcu_check()) { 2444 tmp = (struct acpi_dmar_atsr *)atsru->hdr; 2445 if (atsr->segment != tmp->segment) 2446 continue; 2447 if (atsr->header.length != tmp->header.length) 2448 continue; 2449 if (memcmp(atsr, tmp, atsr->header.length) == 0) 2450 return atsru; 2451 } 2452 2453 return NULL; 2454 } 2455 2456 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg) 2457 { 2458 struct acpi_dmar_atsr *atsr; 2459 struct dmar_atsr_unit *atsru; 2460 2461 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled) 2462 return 0; 2463 2464 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 2465 atsru = dmar_find_atsr(atsr); 2466 if (atsru) 2467 return 0; 2468 2469 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL); 2470 if (!atsru) 2471 return -ENOMEM; 2472 2473 /* 2474 * If memory is allocated from slab by ACPI _DSM method, we need to 2475 * copy the 
memory content because the memory buffer will be freed 2476 * on return. 2477 */ 2478 atsru->hdr = (void *)(atsru + 1); 2479 memcpy(atsru->hdr, hdr, hdr->length); 2480 atsru->include_all = atsr->flags & 0x1; 2481 if (!atsru->include_all) { 2482 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1), 2483 (void *)atsr + atsr->header.length, 2484 &atsru->devices_cnt); 2485 if (atsru->devices_cnt && atsru->devices == NULL) { 2486 kfree(atsru); 2487 return -ENOMEM; 2488 } 2489 } 2490 2491 list_add_rcu(&atsru->list, &dmar_atsr_units); 2492 2493 return 0; 2494 } 2495 2496 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru) 2497 { 2498 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt); 2499 kfree(atsru); 2500 } 2501 2502 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg) 2503 { 2504 struct acpi_dmar_atsr *atsr; 2505 struct dmar_atsr_unit *atsru; 2506 2507 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 2508 atsru = dmar_find_atsr(atsr); 2509 if (atsru) { 2510 list_del_rcu(&atsru->list); 2511 synchronize_rcu(); 2512 intel_iommu_free_atsr(atsru); 2513 } 2514 2515 return 0; 2516 } 2517 2518 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg) 2519 { 2520 int i; 2521 struct device *dev; 2522 struct acpi_dmar_atsr *atsr; 2523 struct dmar_atsr_unit *atsru; 2524 2525 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 2526 atsru = dmar_find_atsr(atsr); 2527 if (!atsru) 2528 return 0; 2529 2530 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) { 2531 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt, 2532 i, dev) 2533 return -EBUSY; 2534 } 2535 2536 return 0; 2537 } 2538 2539 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc) 2540 { 2541 struct dmar_satc_unit *satcu; 2542 struct acpi_dmar_satc *tmp; 2543 2544 list_for_each_entry_rcu(satcu, &dmar_satc_units, list, 2545 dmar_rcu_check()) { 2546 tmp = (struct acpi_dmar_satc *)satcu->hdr; 2547 if (satc->segment != tmp->segment) 2548 continue; 2549 if (satc->header.length != tmp->header.length) 2550 continue; 2551 if (memcmp(satc, tmp, satc->header.length) == 0) 2552 return satcu; 2553 } 2554 2555 return NULL; 2556 } 2557 2558 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg) 2559 { 2560 struct acpi_dmar_satc *satc; 2561 struct dmar_satc_unit *satcu; 2562 2563 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled) 2564 return 0; 2565 2566 satc = container_of(hdr, struct acpi_dmar_satc, header); 2567 satcu = dmar_find_satc(satc); 2568 if (satcu) 2569 return 0; 2570 2571 satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL); 2572 if (!satcu) 2573 return -ENOMEM; 2574 2575 satcu->hdr = (void *)(satcu + 1); 2576 memcpy(satcu->hdr, hdr, hdr->length); 2577 satcu->atc_required = satc->flags & 0x1; 2578 satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1), 2579 (void *)satc + satc->header.length, 2580 &satcu->devices_cnt); 2581 if (satcu->devices_cnt && !satcu->devices) { 2582 kfree(satcu); 2583 return -ENOMEM; 2584 } 2585 list_add_rcu(&satcu->list, &dmar_satc_units); 2586 2587 return 0; 2588 } 2589 2590 static int intel_iommu_add(struct dmar_drhd_unit *dmaru) 2591 { 2592 struct intel_iommu *iommu = dmaru->iommu; 2593 int ret; 2594 2595 /* 2596 * Disable translation if already enabled prior to OS handover. 
2597 */ 2598 if (iommu->gcmd & DMA_GCMD_TE) 2599 iommu_disable_translation(iommu); 2600 2601 ret = iommu_alloc_root_entry(iommu); 2602 if (ret) 2603 goto out; 2604 2605 intel_svm_check(iommu); 2606 2607 if (dmaru->ignored) { 2608 /* 2609 * we always have to disable PMRs or DMA may fail on this device 2610 */ 2611 if (force_on) 2612 iommu_disable_protect_mem_regions(iommu); 2613 return 0; 2614 } 2615 2616 intel_iommu_init_qi(iommu); 2617 iommu_flush_write_buffer(iommu); 2618 2619 if (ecap_prs(iommu->ecap)) { 2620 ret = intel_iommu_enable_prq(iommu); 2621 if (ret) 2622 goto disable_iommu; 2623 } 2624 2625 ret = dmar_set_interrupt(iommu); 2626 if (ret) 2627 goto disable_iommu; 2628 2629 iommu_set_root_entry(iommu); 2630 iommu_enable_translation(iommu); 2631 2632 iommu_disable_protect_mem_regions(iommu); 2633 return 0; 2634 2635 disable_iommu: 2636 disable_dmar_iommu(iommu); 2637 out: 2638 free_dmar_iommu(iommu); 2639 return ret; 2640 } 2641 2642 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert) 2643 { 2644 int ret = 0; 2645 struct intel_iommu *iommu = dmaru->iommu; 2646 2647 if (!intel_iommu_enabled) 2648 return 0; 2649 if (iommu == NULL) 2650 return -EINVAL; 2651 2652 if (insert) { 2653 ret = intel_iommu_add(dmaru); 2654 } else { 2655 disable_dmar_iommu(iommu); 2656 free_dmar_iommu(iommu); 2657 } 2658 2659 return ret; 2660 } 2661 2662 static void intel_iommu_free_dmars(void) 2663 { 2664 struct dmar_rmrr_unit *rmrru, *rmrr_n; 2665 struct dmar_atsr_unit *atsru, *atsr_n; 2666 struct dmar_satc_unit *satcu, *satc_n; 2667 2668 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) { 2669 list_del(&rmrru->list); 2670 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt); 2671 kfree(rmrru); 2672 } 2673 2674 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) { 2675 list_del(&atsru->list); 2676 intel_iommu_free_atsr(atsru); 2677 } 2678 list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) { 2679 list_del(&satcu->list); 2680 dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt); 2681 kfree(satcu); 2682 } 2683 } 2684 2685 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev) 2686 { 2687 struct dmar_satc_unit *satcu; 2688 struct acpi_dmar_satc *satc; 2689 struct device *tmp; 2690 int i; 2691 2692 rcu_read_lock(); 2693 2694 list_for_each_entry_rcu(satcu, &dmar_satc_units, list) { 2695 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header); 2696 if (satc->segment != pci_domain_nr(dev->bus)) 2697 continue; 2698 for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp) 2699 if (to_pci_dev(tmp) == dev) 2700 goto out; 2701 } 2702 satcu = NULL; 2703 out: 2704 rcu_read_unlock(); 2705 return satcu; 2706 } 2707 2708 static bool dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu) 2709 { 2710 struct pci_dev *bridge = NULL; 2711 struct dmar_atsr_unit *atsru; 2712 struct dmar_satc_unit *satcu; 2713 struct acpi_dmar_atsr *atsr; 2714 bool supported = true; 2715 struct pci_bus *bus; 2716 struct device *tmp; 2717 int i; 2718 2719 dev = pci_physfn(dev); 2720 satcu = dmar_find_matched_satc_unit(dev); 2721 if (satcu) 2722 /* 2723 * This device supports ATS as it is in SATC table. 2724 * When IOMMU is in legacy mode, enabling ATS is done 2725 * automatically by HW for the device that requires 2726 * ATS, hence OS should not enable this device ATS 2727 * to avoid duplicated TLB invalidation. 
2728 */ 2729 return !(satcu->atc_required && !sm_supported(iommu)); 2730 2731 for (bus = dev->bus; bus; bus = bus->parent) { 2732 bridge = bus->self; 2733 /* If it's an integrated device, allow ATS */ 2734 if (!bridge) 2735 return true; 2736 /* Connected via non-PCIe: no ATS */ 2737 if (!pci_is_pcie(bridge) || 2738 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE) 2739 return false; 2740 /* If we found the root port, look it up in the ATSR */ 2741 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT) 2742 break; 2743 } 2744 2745 rcu_read_lock(); 2746 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) { 2747 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header); 2748 if (atsr->segment != pci_domain_nr(dev->bus)) 2749 continue; 2750 2751 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp) 2752 if (tmp == &bridge->dev) 2753 goto out; 2754 2755 if (atsru->include_all) 2756 goto out; 2757 } 2758 supported = false; 2759 out: 2760 rcu_read_unlock(); 2761 2762 return supported; 2763 } 2764 2765 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info) 2766 { 2767 int ret; 2768 struct dmar_rmrr_unit *rmrru; 2769 struct dmar_atsr_unit *atsru; 2770 struct dmar_satc_unit *satcu; 2771 struct acpi_dmar_atsr *atsr; 2772 struct acpi_dmar_reserved_memory *rmrr; 2773 struct acpi_dmar_satc *satc; 2774 2775 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING) 2776 return 0; 2777 2778 list_for_each_entry(rmrru, &dmar_rmrr_units, list) { 2779 rmrr = container_of(rmrru->hdr, 2780 struct acpi_dmar_reserved_memory, header); 2781 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 2782 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1), 2783 ((void *)rmrr) + rmrr->header.length, 2784 rmrr->segment, rmrru->devices, 2785 rmrru->devices_cnt); 2786 if (ret < 0) 2787 return ret; 2788 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 2789 dmar_remove_dev_scope(info, rmrr->segment, 2790 rmrru->devices, rmrru->devices_cnt); 2791 } 2792 } 2793 2794 list_for_each_entry(atsru, &dmar_atsr_units, list) { 2795 if (atsru->include_all) 2796 continue; 2797 2798 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header); 2799 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 2800 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1), 2801 (void *)atsr + atsr->header.length, 2802 atsr->segment, atsru->devices, 2803 atsru->devices_cnt); 2804 if (ret > 0) 2805 break; 2806 else if (ret < 0) 2807 return ret; 2808 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 2809 if (dmar_remove_dev_scope(info, atsr->segment, 2810 atsru->devices, atsru->devices_cnt)) 2811 break; 2812 } 2813 } 2814 list_for_each_entry(satcu, &dmar_satc_units, list) { 2815 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header); 2816 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 2817 ret = dmar_insert_dev_scope(info, (void *)(satc + 1), 2818 (void *)satc + satc->header.length, 2819 satc->segment, satcu->devices, 2820 satcu->devices_cnt); 2821 if (ret > 0) 2822 break; 2823 else if (ret < 0) 2824 return ret; 2825 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 2826 if (dmar_remove_dev_scope(info, satc->segment, 2827 satcu->devices, satcu->devices_cnt)) 2828 break; 2829 } 2830 } 2831 2832 return 0; 2833 } 2834 2835 static void intel_disable_iommus(void) 2836 { 2837 struct intel_iommu *iommu = NULL; 2838 struct dmar_drhd_unit *drhd; 2839 2840 for_each_iommu(iommu, drhd) 2841 iommu_disable_translation(iommu); 2842 } 2843 2844 void intel_iommu_shutdown(void) 2845 { 2846 struct dmar_drhd_unit *drhd; 2847 struct intel_iommu 
*iommu = NULL; 2848 2849 if (no_iommu || dmar_disabled) 2850 return; 2851 2852 /* 2853 * All other CPUs were brought down, hotplug interrupts were disabled, 2854 * no lock and RCU checking needed anymore 2855 */ 2856 list_for_each_entry(drhd, &dmar_drhd_units, list) { 2857 iommu = drhd->iommu; 2858 2859 /* Disable PMRs explicitly here. */ 2860 iommu_disable_protect_mem_regions(iommu); 2861 2862 /* Make sure the IOMMUs are switched off */ 2863 iommu_disable_translation(iommu); 2864 } 2865 } 2866 2867 static struct intel_iommu *dev_to_intel_iommu(struct device *dev) 2868 { 2869 struct iommu_device *iommu_dev = dev_to_iommu_device(dev); 2870 2871 return container_of(iommu_dev, struct intel_iommu, iommu); 2872 } 2873 2874 static ssize_t version_show(struct device *dev, 2875 struct device_attribute *attr, char *buf) 2876 { 2877 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 2878 u32 ver = readl(iommu->reg + DMAR_VER_REG); 2879 return sysfs_emit(buf, "%d:%d\n", 2880 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver)); 2881 } 2882 static DEVICE_ATTR_RO(version); 2883 2884 static ssize_t address_show(struct device *dev, 2885 struct device_attribute *attr, char *buf) 2886 { 2887 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 2888 return sysfs_emit(buf, "%llx\n", iommu->reg_phys); 2889 } 2890 static DEVICE_ATTR_RO(address); 2891 2892 static ssize_t cap_show(struct device *dev, 2893 struct device_attribute *attr, char *buf) 2894 { 2895 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 2896 return sysfs_emit(buf, "%llx\n", iommu->cap); 2897 } 2898 static DEVICE_ATTR_RO(cap); 2899 2900 static ssize_t ecap_show(struct device *dev, 2901 struct device_attribute *attr, char *buf) 2902 { 2903 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 2904 return sysfs_emit(buf, "%llx\n", iommu->ecap); 2905 } 2906 static DEVICE_ATTR_RO(ecap); 2907 2908 static ssize_t domains_supported_show(struct device *dev, 2909 struct device_attribute *attr, char *buf) 2910 { 2911 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 2912 return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap)); 2913 } 2914 static DEVICE_ATTR_RO(domains_supported); 2915 2916 static ssize_t domains_used_show(struct device *dev, 2917 struct device_attribute *attr, char *buf) 2918 { 2919 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 2920 unsigned int count = 0; 2921 int id; 2922 2923 for (id = 0; id < cap_ndoms(iommu->cap); id++) 2924 if (ida_exists(&iommu->domain_ida, id)) 2925 count++; 2926 2927 return sysfs_emit(buf, "%d\n", count); 2928 } 2929 static DEVICE_ATTR_RO(domains_used); 2930 2931 static struct attribute *intel_iommu_attrs[] = { 2932 &dev_attr_version.attr, 2933 &dev_attr_address.attr, 2934 &dev_attr_cap.attr, 2935 &dev_attr_ecap.attr, 2936 &dev_attr_domains_supported.attr, 2937 &dev_attr_domains_used.attr, 2938 NULL, 2939 }; 2940 2941 static struct attribute_group intel_iommu_group = { 2942 .name = "intel-iommu", 2943 .attrs = intel_iommu_attrs, 2944 }; 2945 2946 const struct attribute_group *intel_iommu_groups[] = { 2947 &intel_iommu_group, 2948 NULL, 2949 }; 2950 2951 static bool has_external_pci(void) 2952 { 2953 struct pci_dev *pdev = NULL; 2954 2955 for_each_pci_dev(pdev) 2956 if (pdev->external_facing) { 2957 pci_dev_put(pdev); 2958 return true; 2959 } 2960 2961 return false; 2962 } 2963 2964 static int __init platform_optin_force_iommu(void) 2965 { 2966 if (!dmar_platform_optin() || no_platform_optin || !has_external_pci()) 2967 return 0; 2968 2969 if (no_iommu || dmar_disabled) 2970 pr_info("Intel-IOMMU force enabled 
due to platform opt in\n"); 2971 2972 /* 2973 * If Intel-IOMMU is disabled by default, we will apply identity 2974 * map for all devices except those marked as being untrusted. 2975 */ 2976 if (dmar_disabled) 2977 iommu_set_default_passthrough(false); 2978 2979 dmar_disabled = 0; 2980 no_iommu = 0; 2981 2982 return 1; 2983 } 2984 2985 static int __init probe_acpi_namespace_devices(void) 2986 { 2987 struct dmar_drhd_unit *drhd; 2988 /* To avoid a -Wunused-but-set-variable warning. */ 2989 struct intel_iommu *iommu __maybe_unused; 2990 struct device *dev; 2991 int i, ret = 0; 2992 2993 for_each_active_iommu(iommu, drhd) { 2994 for_each_active_dev_scope(drhd->devices, 2995 drhd->devices_cnt, i, dev) { 2996 struct acpi_device_physical_node *pn; 2997 struct acpi_device *adev; 2998 2999 if (dev->bus != &acpi_bus_type) 3000 continue; 3001 3002 up_read(&dmar_global_lock); 3003 adev = to_acpi_device(dev); 3004 mutex_lock(&adev->physical_node_lock); 3005 list_for_each_entry(pn, 3006 &adev->physical_node_list, node) { 3007 ret = iommu_probe_device(pn->dev); 3008 if (ret) 3009 break; 3010 } 3011 mutex_unlock(&adev->physical_node_lock); 3012 down_read(&dmar_global_lock); 3013 3014 if (ret) 3015 return ret; 3016 } 3017 } 3018 3019 return 0; 3020 } 3021 3022 static __init int tboot_force_iommu(void) 3023 { 3024 if (!tboot_enabled()) 3025 return 0; 3026 3027 if (no_iommu || dmar_disabled) 3028 pr_warn("Forcing Intel-IOMMU to enabled\n"); 3029 3030 dmar_disabled = 0; 3031 no_iommu = 0; 3032 3033 return 1; 3034 } 3035 3036 int __init intel_iommu_init(void) 3037 { 3038 int ret = -ENODEV; 3039 struct dmar_drhd_unit *drhd; 3040 struct intel_iommu *iommu; 3041 3042 /* 3043 * Intel IOMMU is required for a TXT/tboot launch or platform 3044 * opt in, so enforce that. 3045 */ 3046 force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) || 3047 platform_optin_force_iommu(); 3048 3049 down_write(&dmar_global_lock); 3050 if (dmar_table_init()) { 3051 if (force_on) 3052 panic("tboot: Failed to initialize DMAR table\n"); 3053 goto out_free_dmar; 3054 } 3055 3056 if (dmar_dev_scope_init() < 0) { 3057 if (force_on) 3058 panic("tboot: Failed to initialize DMAR device scope\n"); 3059 goto out_free_dmar; 3060 } 3061 3062 up_write(&dmar_global_lock); 3063 3064 /* 3065 * The bus notifier takes the dmar_global_lock, so lockdep will 3066 * complain later when we register it under the lock. 3067 */ 3068 dmar_register_bus_notifier(); 3069 3070 down_write(&dmar_global_lock); 3071 3072 if (!no_iommu) 3073 intel_iommu_debugfs_init(); 3074 3075 if (no_iommu || dmar_disabled) { 3076 /* 3077 * We exit the function here to ensure IOMMU's remapping and 3078 * mempool aren't setup, which means that the IOMMU's PMRs 3079 * won't be disabled via the call to init_dmars(). So disable 3080 * it explicitly here. The PMRs were setup by tboot prior to 3081 * calling SENTER, but the kernel is expected to reset/tear 3082 * down the PMRs. 
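 * That is why they are disabled explicitly below when
 * intel_iommu_tboot_noforce is set.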
3083 */ 3084 if (intel_iommu_tboot_noforce) { 3085 for_each_iommu(iommu, drhd) 3086 iommu_disable_protect_mem_regions(iommu); 3087 } 3088 3089 /* 3090 * Make sure the IOMMUs are switched off, even when we 3091 * boot into a kexec kernel and the previous kernel left 3092 * them enabled 3093 */ 3094 intel_disable_iommus(); 3095 goto out_free_dmar; 3096 } 3097 3098 if (list_empty(&dmar_rmrr_units)) 3099 pr_info("No RMRR found\n"); 3100 3101 if (list_empty(&dmar_atsr_units)) 3102 pr_info("No ATSR found\n"); 3103 3104 if (list_empty(&dmar_satc_units)) 3105 pr_info("No SATC found\n"); 3106 3107 init_no_remapping_devices(); 3108 3109 ret = init_dmars(); 3110 if (ret) { 3111 if (force_on) 3112 panic("tboot: Failed to initialize DMARs\n"); 3113 pr_err("Initialization failed\n"); 3114 goto out_free_dmar; 3115 } 3116 up_write(&dmar_global_lock); 3117 3118 init_iommu_pm_ops(); 3119 3120 down_read(&dmar_global_lock); 3121 for_each_active_iommu(iommu, drhd) { 3122 /* 3123 * The flush queue implementation does not perform 3124 * page-selective invalidations that are required for efficient 3125 * TLB flushes in virtual environments. The benefit of batching 3126 * is likely to be much lower than the overhead of synchronizing 3127 * the virtual and physical IOMMU page-tables. 3128 */ 3129 if (cap_caching_mode(iommu->cap) && 3130 !first_level_by_default(iommu)) { 3131 pr_info_once("IOMMU batching disallowed due to virtualization\n"); 3132 iommu_set_dma_strict(); 3133 } 3134 iommu_device_sysfs_add(&iommu->iommu, NULL, 3135 intel_iommu_groups, 3136 "%s", iommu->name); 3137 /* 3138 * The iommu device probe is protected by the iommu_probe_device_lock. 3139 * Release the dmar_global_lock before entering the device probe path 3140 * to avoid unnecessary lock order splat. 3141 */ 3142 up_read(&dmar_global_lock); 3143 iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL); 3144 down_read(&dmar_global_lock); 3145 3146 iommu_pmu_register(iommu); 3147 } 3148 3149 if (probe_acpi_namespace_devices()) 3150 pr_warn("ACPI name space devices didn't probe correctly\n"); 3151 3152 /* Finally, we enable the DMA remapping hardware. */ 3153 for_each_iommu(iommu, drhd) { 3154 if (!drhd->ignored && !translation_pre_enabled(iommu)) 3155 iommu_enable_translation(iommu); 3156 3157 iommu_disable_protect_mem_regions(iommu); 3158 } 3159 up_read(&dmar_global_lock); 3160 3161 pr_info("Intel(R) Virtualization Technology for Directed I/O\n"); 3162 3163 intel_iommu_enabled = 1; 3164 3165 return 0; 3166 3167 out_free_dmar: 3168 intel_iommu_free_dmars(); 3169 up_write(&dmar_global_lock); 3170 return ret; 3171 } 3172 3173 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque) 3174 { 3175 struct device_domain_info *info = opaque; 3176 3177 domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff); 3178 return 0; 3179 } 3180 3181 /* 3182 * NB - intel-iommu lacks any sort of reference counting for the users of 3183 * dependent devices. If multiple endpoints have intersecting dependent 3184 * devices, unbinding the driver from any one of them will possibly leave 3185 * the others unable to operate. 
3186 */ 3187 static void domain_context_clear(struct device_domain_info *info) 3188 { 3189 if (!dev_is_pci(info->dev)) { 3190 domain_context_clear_one(info, info->bus, info->devfn); 3191 return; 3192 } 3193 3194 pci_for_each_dma_alias(to_pci_dev(info->dev), 3195 &domain_context_clear_one_cb, info); 3196 iommu_disable_pci_ats(info); 3197 } 3198 3199 /* 3200 * Clear the page table pointer in context or pasid table entries so that 3201 * all DMA requests without PASID from the device are blocked. If the page 3202 * table has been set, clean up the data structures. 3203 */ 3204 void device_block_translation(struct device *dev) 3205 { 3206 struct device_domain_info *info = dev_iommu_priv_get(dev); 3207 struct intel_iommu *iommu = info->iommu; 3208 unsigned long flags; 3209 3210 /* Device in DMA blocking state. Noting to do. */ 3211 if (!info->domain_attached) 3212 return; 3213 3214 if (info->domain) 3215 cache_tag_unassign_domain(info->domain, dev, IOMMU_NO_PASID); 3216 3217 if (!dev_is_real_dma_subdevice(dev)) { 3218 if (sm_supported(iommu)) 3219 intel_pasid_tear_down_entry(iommu, dev, 3220 IOMMU_NO_PASID, false); 3221 else 3222 domain_context_clear(info); 3223 } 3224 3225 /* Device now in DMA blocking state. */ 3226 info->domain_attached = false; 3227 3228 if (!info->domain) 3229 return; 3230 3231 spin_lock_irqsave(&info->domain->lock, flags); 3232 list_del(&info->link); 3233 spin_unlock_irqrestore(&info->domain->lock, flags); 3234 3235 domain_detach_iommu(info->domain, iommu); 3236 info->domain = NULL; 3237 } 3238 3239 static int blocking_domain_attach_dev(struct iommu_domain *domain, 3240 struct device *dev) 3241 { 3242 struct device_domain_info *info = dev_iommu_priv_get(dev); 3243 3244 iopf_for_domain_remove(info->domain ? &info->domain->domain : NULL, dev); 3245 device_block_translation(dev); 3246 return 0; 3247 } 3248 3249 static int blocking_domain_set_dev_pasid(struct iommu_domain *domain, 3250 struct device *dev, ioasid_t pasid, 3251 struct iommu_domain *old); 3252 3253 static struct iommu_domain blocking_domain = { 3254 .type = IOMMU_DOMAIN_BLOCKED, 3255 .ops = &(const struct iommu_domain_ops) { 3256 .attach_dev = blocking_domain_attach_dev, 3257 .set_dev_pasid = blocking_domain_set_dev_pasid, 3258 } 3259 }; 3260 3261 static int iommu_superpage_capability(struct intel_iommu *iommu, bool first_stage) 3262 { 3263 if (!intel_iommu_superpage) 3264 return 0; 3265 3266 if (first_stage) 3267 return cap_fl1gp_support(iommu->cap) ? 
2 : 1; 3268 3269 return fls(cap_super_page_val(iommu->cap)); 3270 } 3271 3272 static struct dmar_domain *paging_domain_alloc(struct device *dev, bool first_stage) 3273 { 3274 struct device_domain_info *info = dev_iommu_priv_get(dev); 3275 struct intel_iommu *iommu = info->iommu; 3276 struct dmar_domain *domain; 3277 int addr_width; 3278 3279 domain = kzalloc(sizeof(*domain), GFP_KERNEL); 3280 if (!domain) 3281 return ERR_PTR(-ENOMEM); 3282 3283 INIT_LIST_HEAD(&domain->devices); 3284 INIT_LIST_HEAD(&domain->dev_pasids); 3285 INIT_LIST_HEAD(&domain->cache_tags); 3286 spin_lock_init(&domain->lock); 3287 spin_lock_init(&domain->cache_lock); 3288 xa_init(&domain->iommu_array); 3289 3290 domain->nid = dev_to_node(dev); 3291 domain->use_first_level = first_stage; 3292 3293 /* calculate the address width */ 3294 addr_width = agaw_to_width(iommu->agaw); 3295 if (addr_width > cap_mgaw(iommu->cap)) 3296 addr_width = cap_mgaw(iommu->cap); 3297 domain->gaw = addr_width; 3298 domain->agaw = iommu->agaw; 3299 domain->max_addr = __DOMAIN_MAX_ADDR(addr_width); 3300 3301 /* iommu memory access coherency */ 3302 domain->iommu_coherency = iommu_paging_structure_coherency(iommu); 3303 3304 /* pagesize bitmap */ 3305 domain->domain.pgsize_bitmap = SZ_4K; 3306 domain->iommu_superpage = iommu_superpage_capability(iommu, first_stage); 3307 domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain); 3308 3309 /* 3310 * IOVA aperture: First-level translation restricts the input-address 3311 * to a canonical address (i.e., address bits 63:N have the same value 3312 * as address bit [N-1], where N is 48-bits with 4-level paging and 3313 * 57-bits with 5-level paging). Hence, skip bit [N-1]. 3314 */ 3315 domain->domain.geometry.force_aperture = true; 3316 domain->domain.geometry.aperture_start = 0; 3317 if (first_stage) 3318 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1); 3319 else 3320 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw); 3321 3322 /* always allocate the top pgd */ 3323 domain->pgd = iommu_alloc_pages_node_sz(domain->nid, GFP_KERNEL, SZ_4K); 3324 if (!domain->pgd) { 3325 kfree(domain); 3326 return ERR_PTR(-ENOMEM); 3327 } 3328 domain_flush_cache(domain, domain->pgd, PAGE_SIZE); 3329 3330 return domain; 3331 } 3332 3333 static struct iommu_domain * 3334 intel_iommu_domain_alloc_paging_flags(struct device *dev, u32 flags, 3335 const struct iommu_user_data *user_data) 3336 { 3337 struct device_domain_info *info = dev_iommu_priv_get(dev); 3338 bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; 3339 bool nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT; 3340 struct intel_iommu *iommu = info->iommu; 3341 struct dmar_domain *dmar_domain; 3342 struct iommu_domain *domain; 3343 bool first_stage; 3344 3345 if (flags & 3346 (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING | 3347 IOMMU_HWPT_ALLOC_PASID))) 3348 return ERR_PTR(-EOPNOTSUPP); 3349 if (nested_parent && !nested_supported(iommu)) 3350 return ERR_PTR(-EOPNOTSUPP); 3351 if (user_data || (dirty_tracking && !ssads_supported(iommu))) 3352 return ERR_PTR(-EOPNOTSUPP); 3353 3354 /* 3355 * Always allocate the guest compatible page table unless 3356 * IOMMU_HWPT_ALLOC_NEST_PARENT or IOMMU_HWPT_ALLOC_DIRTY_TRACKING 3357 * is specified. 
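 * Both of those features are only implemented on the second-stage page
 * table, so first-stage translation is not used for them.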
3358 */ 3359 if (nested_parent || dirty_tracking) { 3360 if (!sm_supported(iommu) || !ecap_slts(iommu->ecap)) 3361 return ERR_PTR(-EOPNOTSUPP); 3362 first_stage = false; 3363 } else { 3364 first_stage = first_level_by_default(iommu); 3365 } 3366 3367 dmar_domain = paging_domain_alloc(dev, first_stage); 3368 if (IS_ERR(dmar_domain)) 3369 return ERR_CAST(dmar_domain); 3370 domain = &dmar_domain->domain; 3371 domain->type = IOMMU_DOMAIN_UNMANAGED; 3372 domain->owner = &intel_iommu_ops; 3373 domain->ops = intel_iommu_ops.default_domain_ops; 3374 3375 if (nested_parent) { 3376 dmar_domain->nested_parent = true; 3377 INIT_LIST_HEAD(&dmar_domain->s1_domains); 3378 spin_lock_init(&dmar_domain->s1_lock); 3379 } 3380 3381 if (dirty_tracking) { 3382 if (dmar_domain->use_first_level) { 3383 iommu_domain_free(domain); 3384 return ERR_PTR(-EOPNOTSUPP); 3385 } 3386 domain->dirty_ops = &intel_dirty_ops; 3387 } 3388 3389 return domain; 3390 } 3391 3392 static void intel_iommu_domain_free(struct iommu_domain *domain) 3393 { 3394 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 3395 3396 WARN_ON(dmar_domain->nested_parent && 3397 !list_empty(&dmar_domain->s1_domains)); 3398 domain_exit(dmar_domain); 3399 } 3400 3401 int paging_domain_compatible(struct iommu_domain *domain, struct device *dev) 3402 { 3403 struct device_domain_info *info = dev_iommu_priv_get(dev); 3404 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 3405 struct intel_iommu *iommu = info->iommu; 3406 int addr_width; 3407 3408 if (WARN_ON_ONCE(!(domain->type & __IOMMU_DOMAIN_PAGING))) 3409 return -EPERM; 3410 3411 if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap)) 3412 return -EINVAL; 3413 3414 if (domain->dirty_ops && !ssads_supported(iommu)) 3415 return -EINVAL; 3416 3417 if (dmar_domain->iommu_coherency != 3418 iommu_paging_structure_coherency(iommu)) 3419 return -EINVAL; 3420 3421 if (dmar_domain->iommu_superpage != 3422 iommu_superpage_capability(iommu, dmar_domain->use_first_level)) 3423 return -EINVAL; 3424 3425 if (dmar_domain->use_first_level && 3426 (!sm_supported(iommu) || !ecap_flts(iommu->ecap))) 3427 return -EINVAL; 3428 3429 /* check if this iommu agaw is sufficient for max mapped address */ 3430 addr_width = agaw_to_width(iommu->agaw); 3431 if (addr_width > cap_mgaw(iommu->cap)) 3432 addr_width = cap_mgaw(iommu->cap); 3433 3434 if (dmar_domain->gaw > addr_width || dmar_domain->agaw > iommu->agaw) 3435 return -EINVAL; 3436 3437 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) && 3438 context_copied(iommu, info->bus, info->devfn)) 3439 return intel_pasid_setup_sm_context(dev); 3440 3441 return 0; 3442 } 3443 3444 static int intel_iommu_attach_device(struct iommu_domain *domain, 3445 struct device *dev) 3446 { 3447 int ret; 3448 3449 device_block_translation(dev); 3450 3451 ret = paging_domain_compatible(domain, dev); 3452 if (ret) 3453 return ret; 3454 3455 ret = iopf_for_domain_set(domain, dev); 3456 if (ret) 3457 return ret; 3458 3459 ret = dmar_domain_attach_device(to_dmar_domain(domain), dev); 3460 if (ret) 3461 iopf_for_domain_remove(domain, dev); 3462 3463 return ret; 3464 } 3465 3466 static int intel_iommu_map(struct iommu_domain *domain, 3467 unsigned long iova, phys_addr_t hpa, 3468 size_t size, int iommu_prot, gfp_t gfp) 3469 { 3470 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 3471 u64 max_addr; 3472 int prot = 0; 3473 3474 if (iommu_prot & IOMMU_READ) 3475 prot |= DMA_PTE_READ; 3476 if (iommu_prot & IOMMU_WRITE) 3477 prot |= DMA_PTE_WRITE; 3478 if 
(dmar_domain->set_pte_snp) 3479 prot |= DMA_PTE_SNP; 3480 3481 max_addr = iova + size; 3482 if (dmar_domain->max_addr < max_addr) { 3483 u64 end; 3484 3485 /* check if minimum agaw is sufficient for mapped address */ 3486 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1; 3487 if (end < max_addr) { 3488 pr_err("%s: iommu width (%d) is not " 3489 "sufficient for the mapped address (%llx)\n", 3490 __func__, dmar_domain->gaw, max_addr); 3491 return -EFAULT; 3492 } 3493 dmar_domain->max_addr = max_addr; 3494 } 3495 /* Round up size to next multiple of PAGE_SIZE, if it and 3496 the low bits of hpa would take us onto the next page */ 3497 size = aligned_nrpages(hpa, size); 3498 return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT, 3499 hpa >> VTD_PAGE_SHIFT, size, prot, gfp); 3500 } 3501 3502 static int intel_iommu_map_pages(struct iommu_domain *domain, 3503 unsigned long iova, phys_addr_t paddr, 3504 size_t pgsize, size_t pgcount, 3505 int prot, gfp_t gfp, size_t *mapped) 3506 { 3507 unsigned long pgshift = __ffs(pgsize); 3508 size_t size = pgcount << pgshift; 3509 int ret; 3510 3511 if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G) 3512 return -EINVAL; 3513 3514 if (!IS_ALIGNED(iova | paddr, pgsize)) 3515 return -EINVAL; 3516 3517 ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp); 3518 if (!ret && mapped) 3519 *mapped = size; 3520 3521 return ret; 3522 } 3523 3524 static size_t intel_iommu_unmap(struct iommu_domain *domain, 3525 unsigned long iova, size_t size, 3526 struct iommu_iotlb_gather *gather) 3527 { 3528 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 3529 unsigned long start_pfn, last_pfn; 3530 int level = 0; 3531 3532 /* Cope with horrid API which requires us to unmap more than the 3533 size argument if it happens to be a large-page mapping. */ 3534 if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, 3535 &level, GFP_ATOMIC))) 3536 return 0; 3537 3538 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level)) 3539 size = VTD_PAGE_SIZE << level_to_offset_bits(level); 3540 3541 start_pfn = iova >> VTD_PAGE_SHIFT; 3542 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT; 3543 3544 domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist); 3545 3546 if (dmar_domain->max_addr == iova + size) 3547 dmar_domain->max_addr = iova; 3548 3549 /* 3550 * We do not use page-selective IOTLB invalidation in flush queue, 3551 * so there is no need to track page and sync iotlb. 
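 * Pages are therefore only added to the gather list on the strict
 * (non-queued) invalidation path below.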
3552 */ 3553 if (!iommu_iotlb_gather_queued(gather)) 3554 iommu_iotlb_gather_add_page(domain, gather, iova, size); 3555 3556 return size; 3557 } 3558 3559 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain, 3560 unsigned long iova, 3561 size_t pgsize, size_t pgcount, 3562 struct iommu_iotlb_gather *gather) 3563 { 3564 unsigned long pgshift = __ffs(pgsize); 3565 size_t size = pgcount << pgshift; 3566 3567 return intel_iommu_unmap(domain, iova, size, gather); 3568 } 3569 3570 static void intel_iommu_tlb_sync(struct iommu_domain *domain, 3571 struct iommu_iotlb_gather *gather) 3572 { 3573 cache_tag_flush_range(to_dmar_domain(domain), gather->start, 3574 gather->end, 3575 iommu_pages_list_empty(&gather->freelist)); 3576 iommu_put_pages_list(&gather->freelist); 3577 } 3578 3579 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain, 3580 dma_addr_t iova) 3581 { 3582 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 3583 struct dma_pte *pte; 3584 int level = 0; 3585 u64 phys = 0; 3586 3587 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level, 3588 GFP_ATOMIC); 3589 if (pte && dma_pte_present(pte)) 3590 phys = dma_pte_addr(pte) + 3591 (iova & (BIT_MASK(level_to_offset_bits(level) + 3592 VTD_PAGE_SHIFT) - 1)); 3593 3594 return phys; 3595 } 3596 3597 static bool domain_support_force_snooping(struct dmar_domain *domain) 3598 { 3599 struct device_domain_info *info; 3600 bool support = true; 3601 3602 assert_spin_locked(&domain->lock); 3603 list_for_each_entry(info, &domain->devices, link) { 3604 if (!ecap_sc_support(info->iommu->ecap)) { 3605 support = false; 3606 break; 3607 } 3608 } 3609 3610 return support; 3611 } 3612 3613 static void domain_set_force_snooping(struct dmar_domain *domain) 3614 { 3615 struct device_domain_info *info; 3616 3617 assert_spin_locked(&domain->lock); 3618 /* 3619 * Second level page table supports per-PTE snoop control. The 3620 * iommu_map() interface will handle this by setting SNP bit. 
3621 */ 3622 if (!domain->use_first_level) { 3623 domain->set_pte_snp = true; 3624 return; 3625 } 3626 3627 list_for_each_entry(info, &domain->devices, link) 3628 intel_pasid_setup_page_snoop_control(info->iommu, info->dev, 3629 IOMMU_NO_PASID); 3630 } 3631 3632 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain) 3633 { 3634 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 3635 unsigned long flags; 3636 3637 if (dmar_domain->force_snooping) 3638 return true; 3639 3640 spin_lock_irqsave(&dmar_domain->lock, flags); 3641 if (!domain_support_force_snooping(dmar_domain) || 3642 (!dmar_domain->use_first_level && dmar_domain->has_mappings)) { 3643 spin_unlock_irqrestore(&dmar_domain->lock, flags); 3644 return false; 3645 } 3646 3647 domain_set_force_snooping(dmar_domain); 3648 dmar_domain->force_snooping = true; 3649 spin_unlock_irqrestore(&dmar_domain->lock, flags); 3650 3651 return true; 3652 } 3653 3654 static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap) 3655 { 3656 struct device_domain_info *info = dev_iommu_priv_get(dev); 3657 3658 switch (cap) { 3659 case IOMMU_CAP_CACHE_COHERENCY: 3660 case IOMMU_CAP_DEFERRED_FLUSH: 3661 return true; 3662 case IOMMU_CAP_PRE_BOOT_PROTECTION: 3663 return dmar_platform_optin(); 3664 case IOMMU_CAP_ENFORCE_CACHE_COHERENCY: 3665 return ecap_sc_support(info->iommu->ecap); 3666 case IOMMU_CAP_DIRTY_TRACKING: 3667 return ssads_supported(info->iommu); 3668 default: 3669 return false; 3670 } 3671 } 3672 3673 static struct iommu_device *intel_iommu_probe_device(struct device *dev) 3674 { 3675 struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL; 3676 struct device_domain_info *info; 3677 struct intel_iommu *iommu; 3678 u8 bus, devfn; 3679 int ret; 3680 3681 iommu = device_lookup_iommu(dev, &bus, &devfn); 3682 if (!iommu || !iommu->iommu.ops) 3683 return ERR_PTR(-ENODEV); 3684 3685 info = kzalloc(sizeof(*info), GFP_KERNEL); 3686 if (!info) 3687 return ERR_PTR(-ENOMEM); 3688 3689 if (dev_is_real_dma_subdevice(dev)) { 3690 info->bus = pdev->bus->number; 3691 info->devfn = pdev->devfn; 3692 info->segment = pci_domain_nr(pdev->bus); 3693 } else { 3694 info->bus = bus; 3695 info->devfn = devfn; 3696 info->segment = iommu->segment; 3697 } 3698 3699 info->dev = dev; 3700 info->iommu = iommu; 3701 if (dev_is_pci(dev)) { 3702 if (ecap_dev_iotlb_support(iommu->ecap) && 3703 pci_ats_supported(pdev) && 3704 dmar_ats_supported(pdev, iommu)) { 3705 info->ats_supported = 1; 3706 info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev); 3707 3708 /* 3709 * For IOMMU that supports device IOTLB throttling 3710 * (DIT), we assign PFSID to the invalidation desc 3711 * of a VF such that IOMMU HW can gauge queue depth 3712 * at PF level. If DIT is not set, PFSID will be 3713 * treated as reserved, which should be set to 0. 
3714 */ 3715 if (ecap_dit(iommu->ecap)) 3716 info->pfsid = pci_dev_id(pci_physfn(pdev)); 3717 info->ats_qdep = pci_ats_queue_depth(pdev); 3718 } 3719 if (sm_supported(iommu)) { 3720 if (pasid_supported(iommu)) { 3721 int features = pci_pasid_features(pdev); 3722 3723 if (features >= 0) 3724 info->pasid_supported = features | 1; 3725 } 3726 3727 if (info->ats_supported && ecap_prs(iommu->ecap) && 3728 pci_pri_supported(pdev)) 3729 info->pri_supported = 1; 3730 } 3731 } 3732 3733 dev_iommu_priv_set(dev, info); 3734 if (pdev && pci_ats_supported(pdev)) { 3735 pci_prepare_ats(pdev, VTD_PAGE_SHIFT); 3736 ret = device_rbtree_insert(iommu, info); 3737 if (ret) 3738 goto free; 3739 } 3740 3741 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) { 3742 ret = intel_pasid_alloc_table(dev); 3743 if (ret) { 3744 dev_err(dev, "PASID table allocation failed\n"); 3745 goto clear_rbtree; 3746 } 3747 3748 if (!context_copied(iommu, info->bus, info->devfn)) { 3749 ret = intel_pasid_setup_sm_context(dev); 3750 if (ret) 3751 goto free_table; 3752 } 3753 } 3754 3755 intel_iommu_debugfs_create_dev(info); 3756 3757 return &iommu->iommu; 3758 free_table: 3759 intel_pasid_free_table(dev); 3760 clear_rbtree: 3761 device_rbtree_remove(info); 3762 free: 3763 kfree(info); 3764 3765 return ERR_PTR(ret); 3766 } 3767 3768 static void intel_iommu_probe_finalize(struct device *dev) 3769 { 3770 struct device_domain_info *info = dev_iommu_priv_get(dev); 3771 struct intel_iommu *iommu = info->iommu; 3772 3773 /* 3774 * The PCIe spec, in its wisdom, declares that the behaviour of the 3775 * device is undefined if you enable PASID support after ATS support. 3776 * So always enable PASID support on devices which have it, even if 3777 * we can't yet know if we're ever going to use it. 
3778 */ 3779 if (info->pasid_supported && 3780 !pci_enable_pasid(to_pci_dev(dev), info->pasid_supported & ~1)) 3781 info->pasid_enabled = 1; 3782 3783 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) 3784 iommu_enable_pci_ats(info); 3785 iommu_enable_pci_pri(info); 3786 } 3787 3788 static void intel_iommu_release_device(struct device *dev) 3789 { 3790 struct device_domain_info *info = dev_iommu_priv_get(dev); 3791 struct intel_iommu *iommu = info->iommu; 3792 3793 iommu_disable_pci_pri(info); 3794 iommu_disable_pci_ats(info); 3795 3796 if (info->pasid_enabled) { 3797 pci_disable_pasid(to_pci_dev(dev)); 3798 info->pasid_enabled = 0; 3799 } 3800 3801 mutex_lock(&iommu->iopf_lock); 3802 if (dev_is_pci(dev) && pci_ats_supported(to_pci_dev(dev))) 3803 device_rbtree_remove(info); 3804 mutex_unlock(&iommu->iopf_lock); 3805 3806 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) && 3807 !context_copied(iommu, info->bus, info->devfn)) 3808 intel_pasid_teardown_sm_context(dev); 3809 3810 intel_pasid_free_table(dev); 3811 intel_iommu_debugfs_remove_dev(info); 3812 kfree(info); 3813 } 3814 3815 static void intel_iommu_get_resv_regions(struct device *device, 3816 struct list_head *head) 3817 { 3818 int prot = DMA_PTE_READ | DMA_PTE_WRITE; 3819 struct iommu_resv_region *reg; 3820 struct dmar_rmrr_unit *rmrr; 3821 struct device *i_dev; 3822 int i; 3823 3824 rcu_read_lock(); 3825 for_each_rmrr_units(rmrr) { 3826 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt, 3827 i, i_dev) { 3828 struct iommu_resv_region *resv; 3829 enum iommu_resv_type type; 3830 size_t length; 3831 3832 if (i_dev != device && 3833 !is_downstream_to_pci_bridge(device, i_dev)) 3834 continue; 3835 3836 length = rmrr->end_address - rmrr->base_address + 1; 3837 3838 type = device_rmrr_is_relaxable(device) ? 3839 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT; 3840 3841 resv = iommu_alloc_resv_region(rmrr->base_address, 3842 length, prot, type, 3843 GFP_ATOMIC); 3844 if (!resv) 3845 break; 3846 3847 list_add_tail(&resv->list, head); 3848 } 3849 } 3850 rcu_read_unlock(); 3851 3852 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA 3853 if (dev_is_pci(device)) { 3854 struct pci_dev *pdev = to_pci_dev(device); 3855 3856 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) { 3857 reg = iommu_alloc_resv_region(0, 1UL << 24, prot, 3858 IOMMU_RESV_DIRECT_RELAXABLE, 3859 GFP_KERNEL); 3860 if (reg) 3861 list_add_tail(®->list, head); 3862 } 3863 } 3864 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */ 3865 3866 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START, 3867 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1, 3868 0, IOMMU_RESV_MSI, GFP_KERNEL); 3869 if (!reg) 3870 return; 3871 list_add_tail(®->list, head); 3872 } 3873 3874 static struct iommu_group *intel_iommu_device_group(struct device *dev) 3875 { 3876 if (dev_is_pci(dev)) 3877 return pci_device_group(dev); 3878 return generic_device_group(dev); 3879 } 3880 3881 int intel_iommu_enable_iopf(struct device *dev) 3882 { 3883 struct device_domain_info *info = dev_iommu_priv_get(dev); 3884 struct intel_iommu *iommu = info->iommu; 3885 int ret; 3886 3887 if (!info->pri_enabled) 3888 return -ENODEV; 3889 3890 /* pri_enabled is protected by the group mutex. 
*/ 3891 iommu_group_mutex_assert(dev); 3892 if (info->iopf_refcount) { 3893 info->iopf_refcount++; 3894 return 0; 3895 } 3896 3897 ret = iopf_queue_add_device(iommu->iopf_queue, dev); 3898 if (ret) 3899 return ret; 3900 3901 info->iopf_refcount = 1; 3902 3903 return 0; 3904 } 3905 3906 void intel_iommu_disable_iopf(struct device *dev) 3907 { 3908 struct device_domain_info *info = dev_iommu_priv_get(dev); 3909 struct intel_iommu *iommu = info->iommu; 3910 3911 if (WARN_ON(!info->pri_enabled || !info->iopf_refcount)) 3912 return; 3913 3914 iommu_group_mutex_assert(dev); 3915 if (--info->iopf_refcount) 3916 return; 3917 3918 iopf_queue_remove_device(iommu->iopf_queue, dev); 3919 } 3920 3921 static bool intel_iommu_is_attach_deferred(struct device *dev) 3922 { 3923 struct device_domain_info *info = dev_iommu_priv_get(dev); 3924 3925 return translation_pre_enabled(info->iommu) && !info->domain; 3926 } 3927 3928 /* 3929 * Check that the device does not live on an external facing PCI port that is 3930 * marked as untrusted. Such devices should not be able to apply quirks and 3931 * thus not be able to bypass the IOMMU restrictions. 3932 */ 3933 static bool risky_device(struct pci_dev *pdev) 3934 { 3935 if (pdev->untrusted) { 3936 pci_info(pdev, 3937 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n", 3938 pdev->vendor, pdev->device); 3939 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n"); 3940 return true; 3941 } 3942 return false; 3943 } 3944 3945 static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain, 3946 unsigned long iova, size_t size) 3947 { 3948 cache_tag_flush_range_np(to_dmar_domain(domain), iova, iova + size - 1); 3949 3950 return 0; 3951 } 3952 3953 void domain_remove_dev_pasid(struct iommu_domain *domain, 3954 struct device *dev, ioasid_t pasid) 3955 { 3956 struct device_domain_info *info = dev_iommu_priv_get(dev); 3957 struct dev_pasid_info *curr, *dev_pasid = NULL; 3958 struct intel_iommu *iommu = info->iommu; 3959 struct dmar_domain *dmar_domain; 3960 unsigned long flags; 3961 3962 if (!domain) 3963 return; 3964 3965 /* Identity domain has no meta data for pasid. 
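 * There is consequently no dev_pasid bookkeeping to unlink, so bail
 * out early for it.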
*/ 3966 if (domain->type == IOMMU_DOMAIN_IDENTITY) 3967 return; 3968 3969 dmar_domain = to_dmar_domain(domain); 3970 spin_lock_irqsave(&dmar_domain->lock, flags); 3971 list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) { 3972 if (curr->dev == dev && curr->pasid == pasid) { 3973 list_del(&curr->link_domain); 3974 dev_pasid = curr; 3975 break; 3976 } 3977 } 3978 spin_unlock_irqrestore(&dmar_domain->lock, flags); 3979 3980 cache_tag_unassign_domain(dmar_domain, dev, pasid); 3981 domain_detach_iommu(dmar_domain, iommu); 3982 if (!WARN_ON_ONCE(!dev_pasid)) { 3983 intel_iommu_debugfs_remove_dev_pasid(dev_pasid); 3984 kfree(dev_pasid); 3985 } 3986 } 3987 3988 static int blocking_domain_set_dev_pasid(struct iommu_domain *domain, 3989 struct device *dev, ioasid_t pasid, 3990 struct iommu_domain *old) 3991 { 3992 struct device_domain_info *info = dev_iommu_priv_get(dev); 3993 3994 iopf_for_domain_remove(old, dev); 3995 intel_pasid_tear_down_entry(info->iommu, dev, pasid, false); 3996 domain_remove_dev_pasid(old, dev, pasid); 3997 3998 return 0; 3999 } 4000 4001 struct dev_pasid_info * 4002 domain_add_dev_pasid(struct iommu_domain *domain, 4003 struct device *dev, ioasid_t pasid) 4004 { 4005 struct device_domain_info *info = dev_iommu_priv_get(dev); 4006 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4007 struct intel_iommu *iommu = info->iommu; 4008 struct dev_pasid_info *dev_pasid; 4009 unsigned long flags; 4010 int ret; 4011 4012 dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL); 4013 if (!dev_pasid) 4014 return ERR_PTR(-ENOMEM); 4015 4016 ret = domain_attach_iommu(dmar_domain, iommu); 4017 if (ret) 4018 goto out_free; 4019 4020 ret = cache_tag_assign_domain(dmar_domain, dev, pasid); 4021 if (ret) 4022 goto out_detach_iommu; 4023 4024 dev_pasid->dev = dev; 4025 dev_pasid->pasid = pasid; 4026 spin_lock_irqsave(&dmar_domain->lock, flags); 4027 list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids); 4028 spin_unlock_irqrestore(&dmar_domain->lock, flags); 4029 4030 return dev_pasid; 4031 out_detach_iommu: 4032 domain_detach_iommu(dmar_domain, iommu); 4033 out_free: 4034 kfree(dev_pasid); 4035 return ERR_PTR(ret); 4036 } 4037 4038 static int intel_iommu_set_dev_pasid(struct iommu_domain *domain, 4039 struct device *dev, ioasid_t pasid, 4040 struct iommu_domain *old) 4041 { 4042 struct device_domain_info *info = dev_iommu_priv_get(dev); 4043 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4044 struct intel_iommu *iommu = info->iommu; 4045 struct dev_pasid_info *dev_pasid; 4046 int ret; 4047 4048 if (WARN_ON_ONCE(!(domain->type & __IOMMU_DOMAIN_PAGING))) 4049 return -EINVAL; 4050 4051 if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev)) 4052 return -EOPNOTSUPP; 4053 4054 if (domain->dirty_ops) 4055 return -EINVAL; 4056 4057 if (context_copied(iommu, info->bus, info->devfn)) 4058 return -EBUSY; 4059 4060 ret = paging_domain_compatible(domain, dev); 4061 if (ret) 4062 return ret; 4063 4064 dev_pasid = domain_add_dev_pasid(domain, dev, pasid); 4065 if (IS_ERR(dev_pasid)) 4066 return PTR_ERR(dev_pasid); 4067 4068 ret = iopf_for_domain_replace(domain, old, dev); 4069 if (ret) 4070 goto out_remove_dev_pasid; 4071 4072 if (dmar_domain->use_first_level) 4073 ret = domain_setup_first_level(iommu, dmar_domain, 4074 dev, pasid, old); 4075 else 4076 ret = domain_setup_second_level(iommu, dmar_domain, 4077 dev, pasid, old); 4078 if (ret) 4079 goto out_unwind_iopf; 4080 4081 domain_remove_dev_pasid(old, dev, pasid); 4082 4083 
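	/*
	 * The new translation is installed and the old domain's per-PASID
	 * tracking has been released above; only debugfs bookkeeping, which
	 * cannot fail, remains.
	 */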
intel_iommu_debugfs_create_dev_pasid(dev_pasid); 4084 4085 return 0; 4086 4087 out_unwind_iopf: 4088 iopf_for_domain_replace(old, domain, dev); 4089 out_remove_dev_pasid: 4090 domain_remove_dev_pasid(domain, dev, pasid); 4091 return ret; 4092 } 4093 4094 static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type) 4095 { 4096 struct device_domain_info *info = dev_iommu_priv_get(dev); 4097 struct intel_iommu *iommu = info->iommu; 4098 struct iommu_hw_info_vtd *vtd; 4099 4100 vtd = kzalloc(sizeof(*vtd), GFP_KERNEL); 4101 if (!vtd) 4102 return ERR_PTR(-ENOMEM); 4103 4104 vtd->flags = IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17; 4105 vtd->cap_reg = iommu->cap; 4106 vtd->ecap_reg = iommu->ecap; 4107 *length = sizeof(*vtd); 4108 *type = IOMMU_HW_INFO_TYPE_INTEL_VTD; 4109 return vtd; 4110 } 4111 4112 /* 4113 * Set dirty tracking for the device list of a domain. The caller must 4114 * hold the domain->lock when calling it. 4115 */ 4116 static int device_set_dirty_tracking(struct list_head *devices, bool enable) 4117 { 4118 struct device_domain_info *info; 4119 int ret = 0; 4120 4121 list_for_each_entry(info, devices, link) { 4122 ret = intel_pasid_setup_dirty_tracking(info->iommu, info->dev, 4123 IOMMU_NO_PASID, enable); 4124 if (ret) 4125 break; 4126 } 4127 4128 return ret; 4129 } 4130 4131 static int parent_domain_set_dirty_tracking(struct dmar_domain *domain, 4132 bool enable) 4133 { 4134 struct dmar_domain *s1_domain; 4135 unsigned long flags; 4136 int ret; 4137 4138 spin_lock(&domain->s1_lock); 4139 list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) { 4140 spin_lock_irqsave(&s1_domain->lock, flags); 4141 ret = device_set_dirty_tracking(&s1_domain->devices, enable); 4142 spin_unlock_irqrestore(&s1_domain->lock, flags); 4143 if (ret) 4144 goto err_unwind; 4145 } 4146 spin_unlock(&domain->s1_lock); 4147 return 0; 4148 4149 err_unwind: 4150 list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) { 4151 spin_lock_irqsave(&s1_domain->lock, flags); 4152 device_set_dirty_tracking(&s1_domain->devices, 4153 domain->dirty_tracking); 4154 spin_unlock_irqrestore(&s1_domain->lock, flags); 4155 } 4156 spin_unlock(&domain->s1_lock); 4157 return ret; 4158 } 4159 4160 static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain, 4161 bool enable) 4162 { 4163 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4164 int ret; 4165 4166 spin_lock(&dmar_domain->lock); 4167 if (dmar_domain->dirty_tracking == enable) 4168 goto out_unlock; 4169 4170 ret = device_set_dirty_tracking(&dmar_domain->devices, enable); 4171 if (ret) 4172 goto err_unwind; 4173 4174 if (dmar_domain->nested_parent) { 4175 ret = parent_domain_set_dirty_tracking(dmar_domain, enable); 4176 if (ret) 4177 goto err_unwind; 4178 } 4179 4180 dmar_domain->dirty_tracking = enable; 4181 out_unlock: 4182 spin_unlock(&dmar_domain->lock); 4183 4184 return 0; 4185 4186 err_unwind: 4187 device_set_dirty_tracking(&dmar_domain->devices, 4188 dmar_domain->dirty_tracking); 4189 spin_unlock(&dmar_domain->lock); 4190 return ret; 4191 } 4192 4193 static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain, 4194 unsigned long iova, size_t size, 4195 unsigned long flags, 4196 struct iommu_dirty_bitmap *dirty) 4197 { 4198 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4199 unsigned long end = iova + size - 1; 4200 unsigned long pgsize; 4201 4202 /* 4203 * IOMMUFD core calls into a dirty tracking disabled domain without an 4204 * IOVA bitmap set in order to clean dirty bits in all PTEs that might 4205 * have 
occurred when we stopped dirty tracking. This ensures that we 4206 * never inherit dirtied bits from a previous cycle. 4207 */ 4208 if (!dmar_domain->dirty_tracking && dirty->bitmap) 4209 return -EINVAL; 4210 4211 do { 4212 struct dma_pte *pte; 4213 int lvl = 0; 4214 4215 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl, 4216 GFP_ATOMIC); 4217 pgsize = level_size(lvl) << VTD_PAGE_SHIFT; 4218 if (!pte || !dma_pte_present(pte)) { 4219 iova += pgsize; 4220 continue; 4221 } 4222 4223 if (dma_sl_pte_test_and_clear_dirty(pte, flags)) 4224 iommu_dirty_bitmap_record(dirty, iova, pgsize); 4225 iova += pgsize; 4226 } while (iova < end); 4227 4228 return 0; 4229 } 4230 4231 static const struct iommu_dirty_ops intel_dirty_ops = { 4232 .set_dirty_tracking = intel_iommu_set_dirty_tracking, 4233 .read_and_clear_dirty = intel_iommu_read_and_clear_dirty, 4234 }; 4235 4236 static int context_setup_pass_through(struct device *dev, u8 bus, u8 devfn) 4237 { 4238 struct device_domain_info *info = dev_iommu_priv_get(dev); 4239 struct intel_iommu *iommu = info->iommu; 4240 struct context_entry *context; 4241 4242 spin_lock(&iommu->lock); 4243 context = iommu_context_addr(iommu, bus, devfn, 1); 4244 if (!context) { 4245 spin_unlock(&iommu->lock); 4246 return -ENOMEM; 4247 } 4248 4249 if (context_present(context) && !context_copied(iommu, bus, devfn)) { 4250 spin_unlock(&iommu->lock); 4251 return 0; 4252 } 4253 4254 copied_context_tear_down(iommu, context, bus, devfn); 4255 context_clear_entry(context); 4256 context_set_domain_id(context, FLPT_DEFAULT_DID); 4257 4258 /* 4259 * In pass through mode, AW must be programmed to indicate the largest 4260 * AGAW value supported by hardware. And ASR is ignored by hardware. 4261 */ 4262 context_set_address_width(context, iommu->msagaw); 4263 context_set_translation_type(context, CONTEXT_TT_PASS_THROUGH); 4264 context_set_fault_enable(context); 4265 context_set_present(context); 4266 if (!ecap_coherent(iommu->ecap)) 4267 clflush_cache_range(context, sizeof(*context)); 4268 context_present_cache_flush(iommu, FLPT_DEFAULT_DID, bus, devfn); 4269 spin_unlock(&iommu->lock); 4270 4271 return 0; 4272 } 4273 4274 static int context_setup_pass_through_cb(struct pci_dev *pdev, u16 alias, void *data) 4275 { 4276 struct device *dev = data; 4277 4278 return context_setup_pass_through(dev, PCI_BUS_NUM(alias), alias & 0xff); 4279 } 4280 4281 static int device_setup_pass_through(struct device *dev) 4282 { 4283 struct device_domain_info *info = dev_iommu_priv_get(dev); 4284 4285 if (!dev_is_pci(dev)) 4286 return context_setup_pass_through(dev, info->bus, info->devfn); 4287 4288 return pci_for_each_dma_alias(to_pci_dev(dev), 4289 context_setup_pass_through_cb, dev); 4290 } 4291 4292 static int identity_domain_attach_dev(struct iommu_domain *domain, struct device *dev) 4293 { 4294 struct device_domain_info *info = dev_iommu_priv_get(dev); 4295 struct intel_iommu *iommu = info->iommu; 4296 int ret; 4297 4298 device_block_translation(dev); 4299 4300 if (dev_is_real_dma_subdevice(dev)) 4301 return 0; 4302 4303 /* 4304 * No PRI support with the global identity domain. No need to enable or 4305 * disable PRI in this path as the iommu has been put in the blocking 4306 * state. 
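	 * With scalable mode, pass-through is installed in the PASID table
	 * entry for IOMMU_NO_PASID; in legacy mode it is programmed into the
	 * context entry of every DMA alias of the device.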
4307 */ 4308 if (sm_supported(iommu)) 4309 ret = intel_pasid_setup_pass_through(iommu, dev, IOMMU_NO_PASID); 4310 else 4311 ret = device_setup_pass_through(dev); 4312 4313 if (!ret) 4314 info->domain_attached = true; 4315 4316 return ret; 4317 } 4318 4319 static int identity_domain_set_dev_pasid(struct iommu_domain *domain, 4320 struct device *dev, ioasid_t pasid, 4321 struct iommu_domain *old) 4322 { 4323 struct device_domain_info *info = dev_iommu_priv_get(dev); 4324 struct intel_iommu *iommu = info->iommu; 4325 int ret; 4326 4327 if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev)) 4328 return -EOPNOTSUPP; 4329 4330 ret = iopf_for_domain_replace(domain, old, dev); 4331 if (ret) 4332 return ret; 4333 4334 ret = domain_setup_passthrough(iommu, dev, pasid, old); 4335 if (ret) { 4336 iopf_for_domain_replace(old, domain, dev); 4337 return ret; 4338 } 4339 4340 domain_remove_dev_pasid(old, dev, pasid); 4341 return 0; 4342 } 4343 4344 static struct iommu_domain identity_domain = { 4345 .type = IOMMU_DOMAIN_IDENTITY, 4346 .ops = &(const struct iommu_domain_ops) { 4347 .attach_dev = identity_domain_attach_dev, 4348 .set_dev_pasid = identity_domain_set_dev_pasid, 4349 }, 4350 }; 4351 4352 const struct iommu_ops intel_iommu_ops = { 4353 .blocked_domain = &blocking_domain, 4354 .release_domain = &blocking_domain, 4355 .identity_domain = &identity_domain, 4356 .capable = intel_iommu_capable, 4357 .hw_info = intel_iommu_hw_info, 4358 .domain_alloc_paging_flags = intel_iommu_domain_alloc_paging_flags, 4359 .domain_alloc_sva = intel_svm_domain_alloc, 4360 .domain_alloc_nested = intel_iommu_domain_alloc_nested, 4361 .probe_device = intel_iommu_probe_device, 4362 .probe_finalize = intel_iommu_probe_finalize, 4363 .release_device = intel_iommu_release_device, 4364 .get_resv_regions = intel_iommu_get_resv_regions, 4365 .device_group = intel_iommu_device_group, 4366 .is_attach_deferred = intel_iommu_is_attach_deferred, 4367 .def_domain_type = device_def_domain_type, 4368 .pgsize_bitmap = SZ_4K, 4369 .page_response = intel_iommu_page_response, 4370 .default_domain_ops = &(const struct iommu_domain_ops) { 4371 .attach_dev = intel_iommu_attach_device, 4372 .set_dev_pasid = intel_iommu_set_dev_pasid, 4373 .map_pages = intel_iommu_map_pages, 4374 .unmap_pages = intel_iommu_unmap_pages, 4375 .iotlb_sync_map = intel_iommu_iotlb_sync_map, 4376 .flush_iotlb_all = intel_flush_iotlb_all, 4377 .iotlb_sync = intel_iommu_tlb_sync, 4378 .iova_to_phys = intel_iommu_iova_to_phys, 4379 .free = intel_iommu_domain_free, 4380 .enforce_cache_coherency = intel_iommu_enforce_cache_coherency, 4381 } 4382 }; 4383 4384 static void quirk_iommu_igfx(struct pci_dev *dev) 4385 { 4386 if (risky_device(dev)) 4387 return; 4388 4389 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n"); 4390 disable_igfx_iommu = 1; 4391 } 4392 4393 /* G4x/GM45 integrated gfx dmar support is totally busted. 
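   The fixups below set disable_igfx_iommu via quirk_iommu_igfx() for the
   affected integrated graphics PCI IDs, so DMA remapping is simply not
   used for those devices.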
*/ 4394 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx); 4395 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx); 4396 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx); 4397 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx); 4398 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx); 4399 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx); 4400 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx); 4401 4402 /* QM57/QS57 integrated gfx malfunctions with dmar */ 4403 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_iommu_igfx); 4404 4405 /* Broadwell igfx malfunctions with dmar */ 4406 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx); 4407 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx); 4408 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx); 4409 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx); 4410 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx); 4411 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx); 4412 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx); 4413 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx); 4414 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx); 4415 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx); 4416 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx); 4417 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx); 4418 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx); 4419 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx); 4420 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx); 4421 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx); 4422 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx); 4423 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx); 4424 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx); 4425 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx); 4426 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx); 4427 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx); 4428 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx); 4429 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx); 4430 4431 static void quirk_iommu_rwbf(struct pci_dev *dev) 4432 { 4433 if (risky_device(dev)) 4434 return; 4435 4436 /* 4437 * Mobile 4 Series Chipset neglects to set RWBF capability, 4438 * but needs it. Same seems to hold for the desktop versions. 
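	 * Setting rwbf_quirk makes the driver treat the hardware as if the
	 * Required Write-Buffer Flushing (RWBF) capability were set and
	 * issue explicit write-buffer flushes on these chipsets.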
4439 */ 4440 pci_info(dev, "Forcing write-buffer flush capability\n"); 4441 rwbf_quirk = 1; 4442 } 4443 4444 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf); 4445 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf); 4446 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf); 4447 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf); 4448 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf); 4449 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf); 4450 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf); 4451 4452 #define GGC 0x52 4453 #define GGC_MEMORY_SIZE_MASK (0xf << 8) 4454 #define GGC_MEMORY_SIZE_NONE (0x0 << 8) 4455 #define GGC_MEMORY_SIZE_1M (0x1 << 8) 4456 #define GGC_MEMORY_SIZE_2M (0x3 << 8) 4457 #define GGC_MEMORY_VT_ENABLED (0x8 << 8) 4458 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8) 4459 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8) 4460 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8) 4461 4462 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev) 4463 { 4464 unsigned short ggc; 4465 4466 if (risky_device(dev)) 4467 return; 4468 4469 if (pci_read_config_word(dev, GGC, &ggc)) 4470 return; 4471 4472 if (!(ggc & GGC_MEMORY_VT_ENABLED)) { 4473 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n"); 4474 disable_igfx_iommu = 1; 4475 } else if (!disable_igfx_iommu) { 4476 /* we have to ensure the gfx device is idle before we flush */ 4477 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n"); 4478 iommu_set_dma_strict(); 4479 } 4480 } 4481 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt); 4482 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt); 4483 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt); 4484 4485 static void quirk_igfx_skip_te_disable(struct pci_dev *dev) 4486 { 4487 unsigned short ver; 4488 4489 if (!IS_GFX_DEVICE(dev)) 4490 return; 4491 4492 ver = (dev->device >> 8) & 0xff; 4493 if (ver != 0x45 && ver != 0x46 && ver != 0x4c && 4494 ver != 0x4e && ver != 0x8a && ver != 0x98 && 4495 ver != 0x9a && ver != 0xa7 && ver != 0x7d) 4496 return; 4497 4498 if (risky_device(dev)) 4499 return; 4500 4501 pci_info(dev, "Skip IOMMU disabling for graphics\n"); 4502 iommu_skip_te_disable = 1; 4503 } 4504 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable); 4505 4506 /* On Tylersburg chipsets, some BIOSes have been known to enable the 4507 ISOCH DMAR unit for the Azalia sound device, but not give it any 4508 TLB entries, which causes it to deadlock. Check for that. We do 4509 this in a function called from init_dmars(), instead of in a PCI 4510 quirk, because we don't want to print the obnoxious "BIOS broken" 4511 message if VT-d is actually disabled. 4512 */ 4513 static void __init check_tylersburg_isoch(void) 4514 { 4515 struct pci_dev *pdev; 4516 uint32_t vtisochctrl; 4517 4518 /* If there's no Azalia in the system anyway, forget it. */ 4519 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL); 4520 if (!pdev) 4521 return; 4522 4523 if (risky_device(pdev)) { 4524 pci_dev_put(pdev); 4525 return; 4526 } 4527 4528 pci_dev_put(pdev); 4529 4530 /* System Management Registers. Might be hidden, in which case 4531 we can't do the sanity check. But that's OK, because the 4532 known-broken BIOSes _don't_ actually hide it, so far. 
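	   If the register is readable, the checks below accept the
	   configuration when Azalia DMA is routed to the non-isoch DMAR
	   unit or when the isoch unit has the recommended 16 TLB entries;
	   a zero TLB allocation makes Azalia fall back to identity
	   mapping, and anything else just triggers a warning.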
*/ 4533 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL); 4534 if (!pdev) 4535 return; 4536 4537 if (risky_device(pdev)) { 4538 pci_dev_put(pdev); 4539 return; 4540 } 4541 4542 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) { 4543 pci_dev_put(pdev); 4544 return; 4545 } 4546 4547 pci_dev_put(pdev); 4548 4549 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */ 4550 if (vtisochctrl & 1) 4551 return; 4552 4553 /* Drop all bits other than the number of TLB entries */ 4554 vtisochctrl &= 0x1c; 4555 4556 /* If we have the recommended number of TLB entries (16), fine. */ 4557 if (vtisochctrl == 0x10) 4558 return; 4559 4560 /* Zero TLB entries? You get to ride the short bus to school. */ 4561 if (!vtisochctrl) { 4562 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n" 4563 "BIOS vendor: %s; Ver: %s; Product Version: %s\n", 4564 dmi_get_system_info(DMI_BIOS_VENDOR), 4565 dmi_get_system_info(DMI_BIOS_VERSION), 4566 dmi_get_system_info(DMI_PRODUCT_VERSION)); 4567 iommu_identity_mapping |= IDENTMAP_AZALIA; 4568 return; 4569 } 4570 4571 pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n", 4572 vtisochctrl); 4573 } 4574 4575 /* 4576 * Here we deal with a device TLB defect where device may inadvertently issue ATS 4577 * invalidation completion before posted writes initiated with translated address 4578 * that utilized translations matching the invalidation address range, violating 4579 * the invalidation completion ordering. 4580 * Therefore, any use cases that cannot guarantee DMA is stopped before unmap is 4581 * vulnerable to this defect. In other words, any dTLB invalidation initiated not 4582 * under the control of the trusted/privileged host device driver must use this 4583 * quirk. 4584 * Device TLBs are invalidated under the following six conditions: 4585 * 1. Device driver does DMA API unmap IOVA 4586 * 2. Device driver unbind a PASID from a process, sva_unbind_device() 4587 * 3. PASID is torn down, after PASID cache is flushed. e.g. process 4588 * exit_mmap() due to crash 4589 * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where 4590 * VM has to free pages that were unmapped 4591 * 5. Userspace driver unmaps a DMA buffer 4592 * 6. Cache invalidation in vSVA usage (upcoming) 4593 * 4594 * For #1 and #2, device drivers are responsible for stopping DMA traffic 4595 * before unmap/unbind. For #3, iommu driver gets mmu_notifier to 4596 * invalidate TLB the same way as normal user unmap which will use this quirk. 4597 * The dTLB invalidation after PASID cache flush does not need this quirk. 4598 * 4599 * As a reminder, #6 will *NEED* this quirk as we enable nested translation. 4600 */ 4601 void quirk_extra_dev_tlb_flush(struct device_domain_info *info, 4602 unsigned long address, unsigned long mask, 4603 u32 pasid, u16 qdep) 4604 { 4605 u16 sid; 4606 4607 if (likely(!info->dtlb_extra_inval)) 4608 return; 4609 4610 sid = PCI_DEVID(info->bus, info->devfn); 4611 if (pasid == IOMMU_NO_PASID) { 4612 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid, 4613 qdep, address, mask); 4614 } else { 4615 qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid, 4616 pasid, qdep, address, mask); 4617 } 4618 } 4619 4620 #define ecmd_get_status_code(res) (((res) & 0xff) >> 1) 4621 4622 /* 4623 * Function to submit a command to the enhanced command interface. The 4624 * valid enhanced command descriptions are defined in Table 47 of the 4625 * VT-d spec. 
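 * A command is submitted by writing operand B to the ECEO register and the
 * command code together with operand A to the ECMD register, after which
 * the ECRSP register is polled until its IP bit clears.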
The VT-d hardware implementation may support some but not 4626 * all commands, which can be determined by checking the Enhanced 4627 * Command Capability Register. 4628 * 4629 * Return values: 4630 * - 0: Command successful without any error; 4631 * - Negative: software error value; 4632 * - Nonzero positive: failure status code defined in Table 48. 4633 */ 4634 int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob) 4635 { 4636 unsigned long flags; 4637 u64 res; 4638 int ret; 4639 4640 if (!cap_ecmds(iommu->cap)) 4641 return -ENODEV; 4642 4643 raw_spin_lock_irqsave(&iommu->register_lock, flags); 4644 4645 res = dmar_readq(iommu->reg + DMAR_ECRSP_REG); 4646 if (res & DMA_ECMD_ECRSP_IP) { 4647 ret = -EBUSY; 4648 goto err; 4649 } 4650 4651 /* 4652 * Unconditionally write the operand B, because 4653 * - There is no side effect if an ecmd doesn't require an 4654 * operand B, but we set the register to some value. 4655 * - It's not invoked in any critical path. The extra MMIO 4656 * write doesn't bring any performance concerns. 4657 */ 4658 dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob); 4659 dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT)); 4660 4661 IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq, 4662 !(res & DMA_ECMD_ECRSP_IP), res); 4663 4664 if (res & DMA_ECMD_ECRSP_IP) { 4665 ret = -ETIMEDOUT; 4666 goto err; 4667 } 4668 4669 ret = ecmd_get_status_code(res); 4670 err: 4671 raw_spin_unlock_irqrestore(&iommu->register_lock, flags); 4672 4673 return ret; 4674 } 4675
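
/*
 * Illustrative only (not part of the driver): the sketch below shows how a
 * caller is expected to interpret the ecmd_submit_sync() return convention
 * documented above. "ecmd", "oa" and "ob" stand in for a command and
 * operands that the hardware actually advertises support for.
 *
 *	ret = ecmd_submit_sync(iommu, ecmd, oa, ob);
 *	if (ret < 0)
 *		return ret;	// software error (-ENODEV, -EBUSY or -ETIMEDOUT)
 *	else if (ret)
 *		pr_warn("ecmd failed with status code %d\n", ret);
 *	// ret == 0: command completed successfully
 */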