1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright © 2006-2014 Intel Corporation. 4 * 5 * Authors: David Woodhouse <dwmw2@infradead.org>, 6 * Ashok Raj <ashok.raj@intel.com>, 7 * Shaohua Li <shaohua.li@intel.com>, 8 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>, 9 * Fenghua Yu <fenghua.yu@intel.com> 10 * Joerg Roedel <jroedel@suse.de> 11 */ 12 13 #define pr_fmt(fmt) "DMAR: " fmt 14 #define dev_fmt(fmt) pr_fmt(fmt) 15 16 #include <linux/crash_dump.h> 17 #include <linux/dma-direct.h> 18 #include <linux/dmi.h> 19 #include <linux/memory.h> 20 #include <linux/pci.h> 21 #include <linux/pci-ats.h> 22 #include <linux/spinlock.h> 23 #include <linux/syscore_ops.h> 24 #include <linux/tboot.h> 25 #include <uapi/linux/iommufd.h> 26 27 #include "iommu.h" 28 #include "../dma-iommu.h" 29 #include "../irq_remapping.h" 30 #include "../iommu-pages.h" 31 #include "pasid.h" 32 #include "perfmon.h" 33 34 #define ROOT_SIZE VTD_PAGE_SIZE 35 #define CONTEXT_SIZE VTD_PAGE_SIZE 36 37 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY) 38 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB) 39 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) 40 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e) 41 42 #define IOAPIC_RANGE_START (0xfee00000) 43 #define IOAPIC_RANGE_END (0xfeefffff) 44 #define IOVA_START_ADDR (0x1000) 45 46 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57 47 48 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1) 49 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1) 50 51 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR 52 to match. That way, we can use 'unsigned long' for PFNs with impunity. */ 53 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \ 54 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1)) 55 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT) 56 57 static void __init check_tylersburg_isoch(void); 58 static int rwbf_quirk; 59 60 /* 61 * set to 1 to panic kernel if can't successfully enable VT-d 62 * (used when kernel is launched w/ TXT) 63 */ 64 static int force_on = 0; 65 static int intel_iommu_tboot_noforce; 66 static int no_platform_optin; 67 68 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry)) 69 70 /* 71 * Take a root_entry and return the Lower Context Table Pointer (LCTP) 72 * if marked present. 73 */ 74 static phys_addr_t root_entry_lctp(struct root_entry *re) 75 { 76 if (!(re->lo & 1)) 77 return 0; 78 79 return re->lo & VTD_PAGE_MASK; 80 } 81 82 /* 83 * Take a root_entry and return the Upper Context Table Pointer (UCTP) 84 * if marked present. 
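 *
 * For example (illustrative values only): in scalable mode a root entry
 * whose hi field reads 0x0000000012345001 is present (bit 0 set) and its
 * upper-half context table, covering devfn 0x80-0xff, lives at
 * (re->hi & VTD_PAGE_MASK) == 0x12345000, while root_entry_lctp() above
 * returns the table for devfn 0x00-0x7f from re->lo.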
85 */ 86 static phys_addr_t root_entry_uctp(struct root_entry *re) 87 { 88 if (!(re->hi & 1)) 89 return 0; 90 91 return re->hi & VTD_PAGE_MASK; 92 } 93 94 static int device_rid_cmp_key(const void *key, const struct rb_node *node) 95 { 96 struct device_domain_info *info = 97 rb_entry(node, struct device_domain_info, node); 98 const u16 *rid_lhs = key; 99 100 if (*rid_lhs < PCI_DEVID(info->bus, info->devfn)) 101 return -1; 102 103 if (*rid_lhs > PCI_DEVID(info->bus, info->devfn)) 104 return 1; 105 106 return 0; 107 } 108 109 static int device_rid_cmp(struct rb_node *lhs, const struct rb_node *rhs) 110 { 111 struct device_domain_info *info = 112 rb_entry(lhs, struct device_domain_info, node); 113 u16 key = PCI_DEVID(info->bus, info->devfn); 114 115 return device_rid_cmp_key(&key, rhs); 116 } 117 118 /* 119 * Looks up an IOMMU-probed device using its source ID. 120 * 121 * Returns the pointer to the device if there is a match. Otherwise, 122 * returns NULL. 123 * 124 * Note that this helper doesn't guarantee that the device won't be 125 * released by the iommu subsystem after being returned. The caller 126 * should use its own synchronization mechanism to avoid the device 127 * being released during its use if its possibly the case. 128 */ 129 struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid) 130 { 131 struct device_domain_info *info = NULL; 132 struct rb_node *node; 133 unsigned long flags; 134 135 spin_lock_irqsave(&iommu->device_rbtree_lock, flags); 136 node = rb_find(&rid, &iommu->device_rbtree, device_rid_cmp_key); 137 if (node) 138 info = rb_entry(node, struct device_domain_info, node); 139 spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags); 140 141 return info ? info->dev : NULL; 142 } 143 144 static int device_rbtree_insert(struct intel_iommu *iommu, 145 struct device_domain_info *info) 146 { 147 struct rb_node *curr; 148 unsigned long flags; 149 150 spin_lock_irqsave(&iommu->device_rbtree_lock, flags); 151 curr = rb_find_add(&info->node, &iommu->device_rbtree, device_rid_cmp); 152 spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags); 153 if (WARN_ON(curr)) 154 return -EEXIST; 155 156 return 0; 157 } 158 159 static void device_rbtree_remove(struct device_domain_info *info) 160 { 161 struct intel_iommu *iommu = info->iommu; 162 unsigned long flags; 163 164 spin_lock_irqsave(&iommu->device_rbtree_lock, flags); 165 rb_erase(&info->node, &iommu->device_rbtree); 166 spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags); 167 } 168 169 struct dmar_rmrr_unit { 170 struct list_head list; /* list of rmrr units */ 171 struct acpi_dmar_header *hdr; /* ACPI header */ 172 u64 base_address; /* reserved base address*/ 173 u64 end_address; /* reserved end address */ 174 struct dmar_dev_scope *devices; /* target devices */ 175 int devices_cnt; /* target device count */ 176 }; 177 178 struct dmar_atsr_unit { 179 struct list_head list; /* list of ATSR units */ 180 struct acpi_dmar_header *hdr; /* ACPI header */ 181 struct dmar_dev_scope *devices; /* target devices */ 182 int devices_cnt; /* target device count */ 183 u8 include_all:1; /* include all ports */ 184 }; 185 186 struct dmar_satc_unit { 187 struct list_head list; /* list of SATC units */ 188 struct acpi_dmar_header *hdr; /* ACPI header */ 189 struct dmar_dev_scope *devices; /* target devices */ 190 struct intel_iommu *iommu; /* the corresponding iommu */ 191 int devices_cnt; /* target device count */ 192 u8 atc_required:1; /* ATS is required */ 193 }; 194 195 static LIST_HEAD(dmar_atsr_units); 196 static 
LIST_HEAD(dmar_rmrr_units); 197 static LIST_HEAD(dmar_satc_units); 198 199 #define for_each_rmrr_units(rmrr) \ 200 list_for_each_entry(rmrr, &dmar_rmrr_units, list) 201 202 static void intel_iommu_domain_free(struct iommu_domain *domain); 203 204 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON); 205 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON); 206 207 int intel_iommu_enabled = 0; 208 EXPORT_SYMBOL_GPL(intel_iommu_enabled); 209 210 static int intel_iommu_superpage = 1; 211 static int iommu_identity_mapping; 212 static int iommu_skip_te_disable; 213 static int disable_igfx_iommu; 214 215 #define IDENTMAP_AZALIA 4 216 217 const struct iommu_ops intel_iommu_ops; 218 static const struct iommu_dirty_ops intel_dirty_ops; 219 220 static bool translation_pre_enabled(struct intel_iommu *iommu) 221 { 222 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED); 223 } 224 225 static void clear_translation_pre_enabled(struct intel_iommu *iommu) 226 { 227 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED; 228 } 229 230 static void init_translation_status(struct intel_iommu *iommu) 231 { 232 u32 gsts; 233 234 gsts = readl(iommu->reg + DMAR_GSTS_REG); 235 if (gsts & DMA_GSTS_TES) 236 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED; 237 } 238 239 static int __init intel_iommu_setup(char *str) 240 { 241 if (!str) 242 return -EINVAL; 243 244 while (*str) { 245 if (!strncmp(str, "on", 2)) { 246 dmar_disabled = 0; 247 pr_info("IOMMU enabled\n"); 248 } else if (!strncmp(str, "off", 3)) { 249 dmar_disabled = 1; 250 no_platform_optin = 1; 251 pr_info("IOMMU disabled\n"); 252 } else if (!strncmp(str, "igfx_off", 8)) { 253 disable_igfx_iommu = 1; 254 pr_info("Disable GFX device mapping\n"); 255 } else if (!strncmp(str, "forcedac", 8)) { 256 pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n"); 257 iommu_dma_forcedac = true; 258 } else if (!strncmp(str, "strict", 6)) { 259 pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n"); 260 iommu_set_dma_strict(); 261 } else if (!strncmp(str, "sp_off", 6)) { 262 pr_info("Disable supported super page\n"); 263 intel_iommu_superpage = 0; 264 } else if (!strncmp(str, "sm_on", 5)) { 265 pr_info("Enable scalable mode if hardware supports\n"); 266 intel_iommu_sm = 1; 267 } else if (!strncmp(str, "sm_off", 6)) { 268 pr_info("Scalable mode is disallowed\n"); 269 intel_iommu_sm = 0; 270 } else if (!strncmp(str, "tboot_noforce", 13)) { 271 pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n"); 272 intel_iommu_tboot_noforce = 1; 273 } else { 274 pr_notice("Unknown option - '%s'\n", str); 275 } 276 277 str += strcspn(str, ","); 278 while (*str == ',') 279 str++; 280 } 281 282 return 1; 283 } 284 __setup("intel_iommu=", intel_iommu_setup); 285 286 static int domain_pfn_supported(struct dmar_domain *domain, unsigned long pfn) 287 { 288 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT; 289 290 return !(addr_width < BITS_PER_LONG && pfn >> addr_width); 291 } 292 293 /* 294 * Calculate the Supported Adjusted Guest Address Widths of an IOMMU. 295 * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of 296 * the returned SAGAW. 297 */ 298 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu) 299 { 300 unsigned long fl_sagaw, sl_sagaw; 301 302 fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0); 303 sl_sagaw = cap_sagaw(iommu->cap); 304 305 /* Second level only. 
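 * When scalable mode or first-level translation is not available, only
 * the second-level widths advertised in the capability register can be
 * used. A rough worked example with made-up register values: if
 * cap_fl5lp_support() is false, fl_sagaw = BIT(2) = 0x4 (4-level only);
 * if cap_sagaw() reads 0x6 (3- and 4-level second-level tables), the
 * common subset computed at the end of this function is 0x4, i.e. only
 * the 48-bit/4-level AGAW is usable on this IOMMU.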
*/ 306 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) 307 return sl_sagaw; 308 309 /* First level only. */ 310 if (!ecap_slts(iommu->ecap)) 311 return fl_sagaw; 312 313 return fl_sagaw & sl_sagaw; 314 } 315 316 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw) 317 { 318 unsigned long sagaw; 319 int agaw; 320 321 sagaw = __iommu_calculate_sagaw(iommu); 322 for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) { 323 if (test_bit(agaw, &sagaw)) 324 break; 325 } 326 327 return agaw; 328 } 329 330 /* 331 * Calculate max SAGAW for each iommu. 332 */ 333 int iommu_calculate_max_sagaw(struct intel_iommu *iommu) 334 { 335 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH); 336 } 337 338 /* 339 * calculate agaw for each iommu. 340 * "SAGAW" may be different across iommus, use a default agaw, and 341 * get a supported less agaw for iommus that don't support the default agaw. 342 */ 343 int iommu_calculate_agaw(struct intel_iommu *iommu) 344 { 345 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH); 346 } 347 348 static bool iommu_paging_structure_coherency(struct intel_iommu *iommu) 349 { 350 return sm_supported(iommu) ? 351 ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap); 352 } 353 354 /* Return the super pagesize bitmap if supported. */ 355 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain) 356 { 357 unsigned long bitmap = 0; 358 359 /* 360 * 1-level super page supports page size of 2MiB, 2-level super page 361 * supports page size of both 2MiB and 1GiB. 362 */ 363 if (domain->iommu_superpage == 1) 364 bitmap |= SZ_2M; 365 else if (domain->iommu_superpage == 2) 366 bitmap |= SZ_2M | SZ_1G; 367 368 return bitmap; 369 } 370 371 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus, 372 u8 devfn, int alloc) 373 { 374 struct root_entry *root = &iommu->root_entry[bus]; 375 struct context_entry *context; 376 u64 *entry; 377 378 /* 379 * Except that the caller requested to allocate a new entry, 380 * returning a copied context entry makes no sense. 381 */ 382 if (!alloc && context_copied(iommu, bus, devfn)) 383 return NULL; 384 385 entry = &root->lo; 386 if (sm_supported(iommu)) { 387 if (devfn >= 0x80) { 388 devfn -= 0x80; 389 entry = &root->hi; 390 } 391 devfn *= 2; 392 } 393 if (*entry & 1) 394 context = phys_to_virt(*entry & VTD_PAGE_MASK); 395 else { 396 unsigned long phy_addr; 397 if (!alloc) 398 return NULL; 399 400 context = iommu_alloc_page_node(iommu->node, GFP_ATOMIC); 401 if (!context) 402 return NULL; 403 404 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE); 405 phy_addr = virt_to_phys((void *)context); 406 *entry = phy_addr | 1; 407 __iommu_flush_cache(iommu, entry, sizeof(*entry)); 408 } 409 return &context[devfn]; 410 } 411 412 /** 413 * is_downstream_to_pci_bridge - test if a device belongs to the PCI 414 * sub-hierarchy of a candidate PCI-PCI bridge 415 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy 416 * @bridge: the candidate PCI-PCI bridge 417 * 418 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false. 
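 *
 * Example (hypothetical topology): a bridge whose subordinate bus range
 * is 0x04-0x07 (subordinate->number == 0x04, busn_res.end == 0x07) is
 * considered to have a device on bus 0x05 downstream of it, per the bus
 * number range check below.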
419 */ 420 static bool 421 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge) 422 { 423 struct pci_dev *pdev, *pbridge; 424 425 if (!dev_is_pci(dev) || !dev_is_pci(bridge)) 426 return false; 427 428 pdev = to_pci_dev(dev); 429 pbridge = to_pci_dev(bridge); 430 431 if (pbridge->subordinate && 432 pbridge->subordinate->number <= pdev->bus->number && 433 pbridge->subordinate->busn_res.end >= pdev->bus->number) 434 return true; 435 436 return false; 437 } 438 439 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev) 440 { 441 struct dmar_drhd_unit *drhd; 442 u32 vtbar; 443 int rc; 444 445 /* We know that this device on this chipset has its own IOMMU. 446 * If we find it under a different IOMMU, then the BIOS is lying 447 * to us. Hope that the IOMMU for this device is actually 448 * disabled, and it needs no translation... 449 */ 450 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar); 451 if (rc) { 452 /* "can't" happen */ 453 dev_info(&pdev->dev, "failed to run vt-d quirk\n"); 454 return false; 455 } 456 vtbar &= 0xffff0000; 457 458 /* we know that this iommu should be at offset 0xa000 from vtbar */ 459 drhd = dmar_find_matched_drhd_unit(pdev); 460 if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) { 461 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"); 462 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); 463 return true; 464 } 465 466 return false; 467 } 468 469 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev) 470 { 471 if (!iommu || iommu->drhd->ignored) 472 return true; 473 474 if (dev_is_pci(dev)) { 475 struct pci_dev *pdev = to_pci_dev(dev); 476 477 if (pdev->vendor == PCI_VENDOR_ID_INTEL && 478 pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB && 479 quirk_ioat_snb_local_iommu(pdev)) 480 return true; 481 } 482 483 return false; 484 } 485 486 static struct intel_iommu *device_lookup_iommu(struct device *dev, u8 *bus, u8 *devfn) 487 { 488 struct dmar_drhd_unit *drhd = NULL; 489 struct pci_dev *pdev = NULL; 490 struct intel_iommu *iommu; 491 struct device *tmp; 492 u16 segment = 0; 493 int i; 494 495 if (!dev) 496 return NULL; 497 498 if (dev_is_pci(dev)) { 499 struct pci_dev *pf_pdev; 500 501 pdev = pci_real_dma_dev(to_pci_dev(dev)); 502 503 /* VFs aren't listed in scope tables; we need to look up 504 * the PF instead to find the IOMMU. */ 505 pf_pdev = pci_physfn(pdev); 506 dev = &pf_pdev->dev; 507 segment = pci_domain_nr(pdev->bus); 508 } else if (has_acpi_companion(dev)) 509 dev = &ACPI_COMPANION(dev)->dev; 510 511 rcu_read_lock(); 512 for_each_iommu(iommu, drhd) { 513 if (pdev && segment != drhd->segment) 514 continue; 515 516 for_each_active_dev_scope(drhd->devices, 517 drhd->devices_cnt, i, tmp) { 518 if (tmp == dev) { 519 /* For a VF use its original BDF# not that of the PF 520 * which we used for the IOMMU lookup. Strictly speaking 521 * we could do this for all PCI devices; we only need to 522 * get the BDF# from the scope table for ACPI matches. 
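 * For example (hypothetical BDFs): a VF at 0000:3a:00.2 is looked up via
 * its PF 0000:3a:00.0, which is what the DRHD scope matches, but
 * got_pdev below reports bus 0x3a / devfn 0x2 taken from the VF's own
 * pci_dev rather than the PF's devfn 0x0.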
*/ 523 if (pdev && pdev->is_virtfn) 524 goto got_pdev; 525 526 if (bus && devfn) { 527 *bus = drhd->devices[i].bus; 528 *devfn = drhd->devices[i].devfn; 529 } 530 goto out; 531 } 532 533 if (is_downstream_to_pci_bridge(dev, tmp)) 534 goto got_pdev; 535 } 536 537 if (pdev && drhd->include_all) { 538 got_pdev: 539 if (bus && devfn) { 540 *bus = pdev->bus->number; 541 *devfn = pdev->devfn; 542 } 543 goto out; 544 } 545 } 546 iommu = NULL; 547 out: 548 if (iommu_is_dummy(iommu, dev)) 549 iommu = NULL; 550 551 rcu_read_unlock(); 552 553 return iommu; 554 } 555 556 static void domain_flush_cache(struct dmar_domain *domain, 557 void *addr, int size) 558 { 559 if (!domain->iommu_coherency) 560 clflush_cache_range(addr, size); 561 } 562 563 static void free_context_table(struct intel_iommu *iommu) 564 { 565 struct context_entry *context; 566 int i; 567 568 if (!iommu->root_entry) 569 return; 570 571 for (i = 0; i < ROOT_ENTRY_NR; i++) { 572 context = iommu_context_addr(iommu, i, 0, 0); 573 if (context) 574 iommu_free_page(context); 575 576 if (!sm_supported(iommu)) 577 continue; 578 579 context = iommu_context_addr(iommu, i, 0x80, 0); 580 if (context) 581 iommu_free_page(context); 582 } 583 584 iommu_free_page(iommu->root_entry); 585 iommu->root_entry = NULL; 586 } 587 588 #ifdef CONFIG_DMAR_DEBUG 589 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn, 590 u8 bus, u8 devfn, struct dma_pte *parent, int level) 591 { 592 struct dma_pte *pte; 593 int offset; 594 595 while (1) { 596 offset = pfn_level_offset(pfn, level); 597 pte = &parent[offset]; 598 599 pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val); 600 601 if (!dma_pte_present(pte)) { 602 pr_info("page table not present at level %d\n", level - 1); 603 break; 604 } 605 606 if (level == 1 || dma_pte_superpage(pte)) 607 break; 608 609 parent = phys_to_virt(dma_pte_addr(pte)); 610 level--; 611 } 612 } 613 614 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id, 615 unsigned long long addr, u32 pasid) 616 { 617 struct pasid_dir_entry *dir, *pde; 618 struct pasid_entry *entries, *pte; 619 struct context_entry *ctx_entry; 620 struct root_entry *rt_entry; 621 int i, dir_index, index, level; 622 u8 devfn = source_id & 0xff; 623 u8 bus = source_id >> 8; 624 struct dma_pte *pgtable; 625 626 pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr); 627 628 /* root entry dump */ 629 if (!iommu->root_entry) { 630 pr_info("root table is not present\n"); 631 return; 632 } 633 rt_entry = &iommu->root_entry[bus]; 634 635 if (sm_supported(iommu)) 636 pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n", 637 rt_entry->hi, rt_entry->lo); 638 else 639 pr_info("root entry: 0x%016llx", rt_entry->lo); 640 641 /* context entry dump */ 642 ctx_entry = iommu_context_addr(iommu, bus, devfn, 0); 643 if (!ctx_entry) { 644 pr_info("context table is not present\n"); 645 return; 646 } 647 648 pr_info("context entry: hi 0x%016llx, low 0x%016llx\n", 649 ctx_entry->hi, ctx_entry->lo); 650 651 /* legacy mode does not require PASID entries */ 652 if (!sm_supported(iommu)) { 653 if (!context_present(ctx_entry)) { 654 pr_info("legacy mode page table is not present\n"); 655 return; 656 } 657 level = agaw_to_level(ctx_entry->hi & 7); 658 pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK); 659 goto pgtable_walk; 660 } 661 662 if (!context_present(ctx_entry)) { 663 pr_info("pasid directory table is not present\n"); 664 return; 665 } 666 667 /* get the pointer to pasid directory entry */ 668 dir = 
phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK); 669 670 /* For request-without-pasid, get the pasid from context entry */ 671 if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID) 672 pasid = IOMMU_NO_PASID; 673 674 dir_index = pasid >> PASID_PDE_SHIFT; 675 pde = &dir[dir_index]; 676 pr_info("pasid dir entry: 0x%016llx\n", pde->val); 677 678 /* get the pointer to the pasid table entry */ 679 entries = get_pasid_table_from_pde(pde); 680 if (!entries) { 681 pr_info("pasid table is not present\n"); 682 return; 683 } 684 index = pasid & PASID_PTE_MASK; 685 pte = &entries[index]; 686 for (i = 0; i < ARRAY_SIZE(pte->val); i++) 687 pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]); 688 689 if (!pasid_pte_is_present(pte)) { 690 pr_info("scalable mode page table is not present\n"); 691 return; 692 } 693 694 if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) { 695 level = pte->val[2] & BIT_ULL(2) ? 5 : 4; 696 pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK); 697 } else { 698 level = agaw_to_level((pte->val[0] >> 2) & 0x7); 699 pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK); 700 } 701 702 pgtable_walk: 703 pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level); 704 } 705 #endif 706 707 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain, 708 unsigned long pfn, int *target_level, 709 gfp_t gfp) 710 { 711 struct dma_pte *parent, *pte; 712 int level = agaw_to_level(domain->agaw); 713 int offset; 714 715 if (!domain_pfn_supported(domain, pfn)) 716 /* Address beyond IOMMU's addressing capabilities. */ 717 return NULL; 718 719 parent = domain->pgd; 720 721 while (1) { 722 void *tmp_page; 723 724 offset = pfn_level_offset(pfn, level); 725 pte = &parent[offset]; 726 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte))) 727 break; 728 if (level == *target_level) 729 break; 730 731 if (!dma_pte_present(pte)) { 732 uint64_t pteval, tmp; 733 734 tmp_page = iommu_alloc_page_node(domain->nid, gfp); 735 736 if (!tmp_page) 737 return NULL; 738 739 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE); 740 pteval = virt_to_phys(tmp_page) | DMA_PTE_READ | 741 DMA_PTE_WRITE; 742 if (domain->use_first_level) 743 pteval |= DMA_FL_PTE_US | DMA_FL_PTE_ACCESS; 744 745 tmp = 0ULL; 746 if (!try_cmpxchg64(&pte->val, &tmp, pteval)) 747 /* Someone else set it while we were thinking; use theirs. 
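 * try_cmpxchg64() installs the newly allocated table only if the PTE is
 * still clear. If another CPU won the race, the PTE already points to a
 * table that the winner allocated and flushed, so the local page is
 * simply freed and the walk continues through the winner's table, all
 * without taking a lock.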
*/ 748 iommu_free_page(tmp_page); 749 else 750 domain_flush_cache(domain, pte, sizeof(*pte)); 751 } 752 if (level == 1) 753 break; 754 755 parent = phys_to_virt(dma_pte_addr(pte)); 756 level--; 757 } 758 759 if (!*target_level) 760 *target_level = level; 761 762 return pte; 763 } 764 765 /* return address's pte at specific level */ 766 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain, 767 unsigned long pfn, 768 int level, int *large_page) 769 { 770 struct dma_pte *parent, *pte; 771 int total = agaw_to_level(domain->agaw); 772 int offset; 773 774 parent = domain->pgd; 775 while (level <= total) { 776 offset = pfn_level_offset(pfn, total); 777 pte = &parent[offset]; 778 if (level == total) 779 return pte; 780 781 if (!dma_pte_present(pte)) { 782 *large_page = total; 783 break; 784 } 785 786 if (dma_pte_superpage(pte)) { 787 *large_page = total; 788 return pte; 789 } 790 791 parent = phys_to_virt(dma_pte_addr(pte)); 792 total--; 793 } 794 return NULL; 795 } 796 797 /* clear last level pte, a tlb flush should be followed */ 798 static void dma_pte_clear_range(struct dmar_domain *domain, 799 unsigned long start_pfn, 800 unsigned long last_pfn) 801 { 802 unsigned int large_page; 803 struct dma_pte *first_pte, *pte; 804 805 if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) || 806 WARN_ON(start_pfn > last_pfn)) 807 return; 808 809 /* we don't need lock here; nobody else touches the iova range */ 810 do { 811 large_page = 1; 812 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page); 813 if (!pte) { 814 start_pfn = align_to_level(start_pfn + 1, large_page + 1); 815 continue; 816 } 817 do { 818 dma_clear_pte(pte); 819 start_pfn += lvl_to_nr_pages(large_page); 820 pte++; 821 } while (start_pfn <= last_pfn && !first_pte_in_page(pte)); 822 823 domain_flush_cache(domain, first_pte, 824 (void *)pte - (void *)first_pte); 825 826 } while (start_pfn && start_pfn <= last_pfn); 827 } 828 829 static void dma_pte_free_level(struct dmar_domain *domain, int level, 830 int retain_level, struct dma_pte *pte, 831 unsigned long pfn, unsigned long start_pfn, 832 unsigned long last_pfn) 833 { 834 pfn = max(start_pfn, pfn); 835 pte = &pte[pfn_level_offset(pfn, level)]; 836 837 do { 838 unsigned long level_pfn; 839 struct dma_pte *level_pte; 840 841 if (!dma_pte_present(pte) || dma_pte_superpage(pte)) 842 goto next; 843 844 level_pfn = pfn & level_mask(level); 845 level_pte = phys_to_virt(dma_pte_addr(pte)); 846 847 if (level > 2) { 848 dma_pte_free_level(domain, level - 1, retain_level, 849 level_pte, level_pfn, start_pfn, 850 last_pfn); 851 } 852 853 /* 854 * Free the page table if we're below the level we want to 855 * retain and the range covers the entire table. 856 */ 857 if (level < retain_level && !(start_pfn > level_pfn || 858 last_pfn < level_pfn + level_size(level) - 1)) { 859 dma_clear_pte(pte); 860 domain_flush_cache(domain, pte, sizeof(*pte)); 861 iommu_free_page(level_pte); 862 } 863 next: 864 pfn += level_size(level); 865 } while (!first_pte_in_page(++pte) && pfn <= last_pfn); 866 } 867 868 /* 869 * clear last level (leaf) ptes and free page table pages below the 870 * level we wish to keep intact. 
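 *
 * For example, switch_to_super_page() below passes retain_level as the
 * superpage level plus one: for a 2MiB mapping (level 2) that means
 * retain_level == 3, so only the fully covered 4KiB leaf tables under
 * the range are freed, while the level-2 table that will hold the new
 * large PTE is kept in place.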
871 */ 872 static void dma_pte_free_pagetable(struct dmar_domain *domain, 873 unsigned long start_pfn, 874 unsigned long last_pfn, 875 int retain_level) 876 { 877 dma_pte_clear_range(domain, start_pfn, last_pfn); 878 879 /* We don't need lock here; nobody else touches the iova range */ 880 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level, 881 domain->pgd, 0, start_pfn, last_pfn); 882 883 /* free pgd */ 884 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) { 885 iommu_free_page(domain->pgd); 886 domain->pgd = NULL; 887 } 888 } 889 890 /* When a page at a given level is being unlinked from its parent, we don't 891 need to *modify* it at all. All we need to do is make a list of all the 892 pages which can be freed just as soon as we've flushed the IOTLB and we 893 know the hardware page-walk will no longer touch them. 894 The 'pte' argument is the *parent* PTE, pointing to the page that is to 895 be freed. */ 896 static void dma_pte_list_pagetables(struct dmar_domain *domain, 897 int level, struct dma_pte *pte, 898 struct list_head *freelist) 899 { 900 struct page *pg; 901 902 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT); 903 list_add_tail(&pg->lru, freelist); 904 905 if (level == 1) 906 return; 907 908 pte = page_address(pg); 909 do { 910 if (dma_pte_present(pte) && !dma_pte_superpage(pte)) 911 dma_pte_list_pagetables(domain, level - 1, pte, freelist); 912 pte++; 913 } while (!first_pte_in_page(pte)); 914 } 915 916 static void dma_pte_clear_level(struct dmar_domain *domain, int level, 917 struct dma_pte *pte, unsigned long pfn, 918 unsigned long start_pfn, unsigned long last_pfn, 919 struct list_head *freelist) 920 { 921 struct dma_pte *first_pte = NULL, *last_pte = NULL; 922 923 pfn = max(start_pfn, pfn); 924 pte = &pte[pfn_level_offset(pfn, level)]; 925 926 do { 927 unsigned long level_pfn = pfn & level_mask(level); 928 929 if (!dma_pte_present(pte)) 930 goto next; 931 932 /* If range covers entire pagetable, free it */ 933 if (start_pfn <= level_pfn && 934 last_pfn >= level_pfn + level_size(level) - 1) { 935 /* These subordinate page tables are going away entirely. Don't 936 bother to clear them; we're just going to *free* them. */ 937 if (level > 1 && !dma_pte_superpage(pte)) 938 dma_pte_list_pagetables(domain, level - 1, pte, freelist); 939 940 dma_clear_pte(pte); 941 if (!first_pte) 942 first_pte = pte; 943 last_pte = pte; 944 } else if (level > 1) { 945 /* Recurse down into a level that isn't *entirely* obsolete */ 946 dma_pte_clear_level(domain, level - 1, 947 phys_to_virt(dma_pte_addr(pte)), 948 level_pfn, start_pfn, last_pfn, 949 freelist); 950 } 951 next: 952 pfn = level_pfn + level_size(level); 953 } while (!first_pte_in_page(++pte) && pfn <= last_pfn); 954 955 if (first_pte) 956 domain_flush_cache(domain, first_pte, 957 (void *)++last_pte - (void *)first_pte); 958 } 959 960 /* We can't just free the pages because the IOMMU may still be walking 961 the page tables, and may have cached the intermediate levels. The 962 pages can only be freed after the IOTLB flush has been done. 
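   A typical caller therefore does roughly the following (a sketch built
   from helpers used elsewhere in this file); only after the flush are
   the collected page-table pages returned to the allocator:

	LIST_HEAD(freelist);

	domain_unmap(domain, start_pfn, last_pfn, &freelist);
	cache_tag_flush_range(domain, start_pfn << VTD_PAGE_SHIFT,
			      last_pfn << VTD_PAGE_SHIFT, 0);
	iommu_put_pages_list(&freelist);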
*/ 963 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn, 964 unsigned long last_pfn, struct list_head *freelist) 965 { 966 if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) || 967 WARN_ON(start_pfn > last_pfn)) 968 return; 969 970 /* we don't need lock here; nobody else touches the iova range */ 971 dma_pte_clear_level(domain, agaw_to_level(domain->agaw), 972 domain->pgd, 0, start_pfn, last_pfn, freelist); 973 974 /* free pgd */ 975 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) { 976 struct page *pgd_page = virt_to_page(domain->pgd); 977 list_add_tail(&pgd_page->lru, freelist); 978 domain->pgd = NULL; 979 } 980 } 981 982 /* iommu handling */ 983 static int iommu_alloc_root_entry(struct intel_iommu *iommu) 984 { 985 struct root_entry *root; 986 987 root = iommu_alloc_page_node(iommu->node, GFP_ATOMIC); 988 if (!root) { 989 pr_err("Allocating root entry for %s failed\n", 990 iommu->name); 991 return -ENOMEM; 992 } 993 994 __iommu_flush_cache(iommu, root, ROOT_SIZE); 995 iommu->root_entry = root; 996 997 return 0; 998 } 999 1000 static void iommu_set_root_entry(struct intel_iommu *iommu) 1001 { 1002 u64 addr; 1003 u32 sts; 1004 unsigned long flag; 1005 1006 addr = virt_to_phys(iommu->root_entry); 1007 if (sm_supported(iommu)) 1008 addr |= DMA_RTADDR_SMT; 1009 1010 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1011 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr); 1012 1013 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG); 1014 1015 /* Make sure hardware complete it */ 1016 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1017 readl, (sts & DMA_GSTS_RTPS), sts); 1018 1019 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1020 1021 /* 1022 * Hardware invalidates all DMA remapping hardware translation 1023 * caches as part of SRTP flow. 
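 * Hardware advertising the ESRTPS capability performs that invalidation
 * itself, so the manual flushing below is skipped. Otherwise software
 * must issue a global context-cache flush, a PASID-cache flush in
 * scalable mode, and a global IOTLB flush, which is exactly what
 * follows.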
1024 */ 1025 if (cap_esrtps(iommu->cap)) 1026 return; 1027 1028 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL); 1029 if (sm_supported(iommu)) 1030 qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0); 1031 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); 1032 } 1033 1034 void iommu_flush_write_buffer(struct intel_iommu *iommu) 1035 { 1036 u32 val; 1037 unsigned long flag; 1038 1039 if (!rwbf_quirk && !cap_rwbf(iommu->cap)) 1040 return; 1041 1042 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1043 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG); 1044 1045 /* Make sure hardware complete it */ 1046 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1047 readl, (!(val & DMA_GSTS_WBFS)), val); 1048 1049 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1050 } 1051 1052 /* return value determine if we need a write buffer flush */ 1053 static void __iommu_flush_context(struct intel_iommu *iommu, 1054 u16 did, u16 source_id, u8 function_mask, 1055 u64 type) 1056 { 1057 u64 val = 0; 1058 unsigned long flag; 1059 1060 switch (type) { 1061 case DMA_CCMD_GLOBAL_INVL: 1062 val = DMA_CCMD_GLOBAL_INVL; 1063 break; 1064 case DMA_CCMD_DOMAIN_INVL: 1065 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did); 1066 break; 1067 case DMA_CCMD_DEVICE_INVL: 1068 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did) 1069 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask); 1070 break; 1071 default: 1072 pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n", 1073 iommu->name, type); 1074 return; 1075 } 1076 val |= DMA_CCMD_ICC; 1077 1078 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1079 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val); 1080 1081 /* Make sure hardware complete it */ 1082 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG, 1083 dmar_readq, (!(val & DMA_CCMD_ICC)), val); 1084 1085 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1086 } 1087 1088 void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, 1089 unsigned int size_order, u64 type) 1090 { 1091 int tlb_offset = ecap_iotlb_offset(iommu->ecap); 1092 u64 val = 0, val_iva = 0; 1093 unsigned long flag; 1094 1095 switch (type) { 1096 case DMA_TLB_GLOBAL_FLUSH: 1097 /* global flush doesn't need set IVA_REG */ 1098 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT; 1099 break; 1100 case DMA_TLB_DSI_FLUSH: 1101 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did); 1102 break; 1103 case DMA_TLB_PSI_FLUSH: 1104 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did); 1105 /* IH bit is passed in as part of address */ 1106 val_iva = size_order | addr; 1107 break; 1108 default: 1109 pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n", 1110 iommu->name, type); 1111 return; 1112 } 1113 1114 if (cap_write_drain(iommu->cap)) 1115 val |= DMA_TLB_WRITE_DRAIN; 1116 1117 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1118 /* Note: Only uses first TLB reg currently */ 1119 if (val_iva) 1120 dmar_writeq(iommu->reg + tlb_offset, val_iva); 1121 dmar_writeq(iommu->reg + tlb_offset + 8, val); 1122 1123 /* Make sure hardware complete it */ 1124 IOMMU_WAIT_OP(iommu, tlb_offset + 8, 1125 dmar_readq, (!(val & DMA_TLB_IVT)), val); 1126 1127 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1128 1129 /* check IOTLB invalidation granularity */ 1130 if (DMA_TLB_IAIG(val) == 0) 1131 pr_err("Flush IOTLB failed\n"); 1132 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type)) 1133 pr_debug("TLB flush request %Lx, actual %Lx\n", 1134 (unsigned long long)DMA_TLB_IIRG(type), 1135 (unsigned long long)DMA_TLB_IAIG(val)); 1136 } 1137 1138 static 
struct device_domain_info * 1139 domain_lookup_dev_info(struct dmar_domain *domain, 1140 struct intel_iommu *iommu, u8 bus, u8 devfn) 1141 { 1142 struct device_domain_info *info; 1143 unsigned long flags; 1144 1145 spin_lock_irqsave(&domain->lock, flags); 1146 list_for_each_entry(info, &domain->devices, link) { 1147 if (info->iommu == iommu && info->bus == bus && 1148 info->devfn == devfn) { 1149 spin_unlock_irqrestore(&domain->lock, flags); 1150 return info; 1151 } 1152 } 1153 spin_unlock_irqrestore(&domain->lock, flags); 1154 1155 return NULL; 1156 } 1157 1158 /* 1159 * The extra devTLB flush quirk impacts those QAT devices with PCI device 1160 * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device() 1161 * check because it applies only to the built-in QAT devices and it doesn't 1162 * grant additional privileges. 1163 */ 1164 #define BUGGY_QAT_DEVID_MASK 0x4940 1165 static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev) 1166 { 1167 if (pdev->vendor != PCI_VENDOR_ID_INTEL) 1168 return false; 1169 1170 if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK) 1171 return false; 1172 1173 return true; 1174 } 1175 1176 static void iommu_enable_pci_ats(struct device_domain_info *info) 1177 { 1178 struct pci_dev *pdev; 1179 1180 if (!info->ats_supported) 1181 return; 1182 1183 pdev = to_pci_dev(info->dev); 1184 if (!pci_ats_page_aligned(pdev)) 1185 return; 1186 1187 if (!pci_enable_ats(pdev, VTD_PAGE_SHIFT)) 1188 info->ats_enabled = 1; 1189 } 1190 1191 static void iommu_disable_pci_ats(struct device_domain_info *info) 1192 { 1193 if (!info->ats_enabled) 1194 return; 1195 1196 pci_disable_ats(to_pci_dev(info->dev)); 1197 info->ats_enabled = 0; 1198 } 1199 1200 static void iommu_enable_pci_pri(struct device_domain_info *info) 1201 { 1202 struct pci_dev *pdev; 1203 1204 if (!info->ats_enabled || !info->pri_supported) 1205 return; 1206 1207 pdev = to_pci_dev(info->dev); 1208 /* PASID is required in PRG Response Message. 
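 * That is, when the device runs with PASID enabled it must also be able
 * to accept the PASID-tagged PRG Response messages generated on its
 * behalf, otherwise PRI is left disabled here. Note the ordering as
 * well: this runs only once ATS is already enabled, and pci_reset_pri()
 * below clears any stale page requests before PRI is turned on.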
*/ 1209 if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev)) 1210 return; 1211 1212 if (pci_reset_pri(pdev)) 1213 return; 1214 1215 if (!pci_enable_pri(pdev, PRQ_DEPTH)) 1216 info->pri_enabled = 1; 1217 } 1218 1219 static void iommu_disable_pci_pri(struct device_domain_info *info) 1220 { 1221 if (!info->pri_enabled) 1222 return; 1223 1224 if (WARN_ON(info->iopf_refcount)) 1225 iopf_queue_remove_device(info->iommu->iopf_queue, info->dev); 1226 1227 pci_disable_pri(to_pci_dev(info->dev)); 1228 info->pri_enabled = 0; 1229 } 1230 1231 static void intel_flush_iotlb_all(struct iommu_domain *domain) 1232 { 1233 cache_tag_flush_all(to_dmar_domain(domain)); 1234 } 1235 1236 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu) 1237 { 1238 u32 pmen; 1239 unsigned long flags; 1240 1241 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap)) 1242 return; 1243 1244 raw_spin_lock_irqsave(&iommu->register_lock, flags); 1245 pmen = readl(iommu->reg + DMAR_PMEN_REG); 1246 pmen &= ~DMA_PMEN_EPM; 1247 writel(pmen, iommu->reg + DMAR_PMEN_REG); 1248 1249 /* wait for the protected region status bit to clear */ 1250 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG, 1251 readl, !(pmen & DMA_PMEN_PRS), pmen); 1252 1253 raw_spin_unlock_irqrestore(&iommu->register_lock, flags); 1254 } 1255 1256 static void iommu_enable_translation(struct intel_iommu *iommu) 1257 { 1258 u32 sts; 1259 unsigned long flags; 1260 1261 raw_spin_lock_irqsave(&iommu->register_lock, flags); 1262 iommu->gcmd |= DMA_GCMD_TE; 1263 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); 1264 1265 /* Make sure hardware complete it */ 1266 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1267 readl, (sts & DMA_GSTS_TES), sts); 1268 1269 raw_spin_unlock_irqrestore(&iommu->register_lock, flags); 1270 } 1271 1272 static void iommu_disable_translation(struct intel_iommu *iommu) 1273 { 1274 u32 sts; 1275 unsigned long flag; 1276 1277 if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated && 1278 (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap))) 1279 return; 1280 1281 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1282 iommu->gcmd &= ~DMA_GCMD_TE; 1283 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); 1284 1285 /* Make sure hardware complete it */ 1286 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1287 readl, (!(sts & DMA_GSTS_TES)), sts); 1288 1289 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1290 } 1291 1292 static int iommu_init_domains(struct intel_iommu *iommu) 1293 { 1294 u32 ndomains; 1295 1296 ndomains = cap_ndoms(iommu->cap); 1297 pr_debug("%s: Number of Domains supported <%d>\n", 1298 iommu->name, ndomains); 1299 1300 spin_lock_init(&iommu->lock); 1301 1302 iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL); 1303 if (!iommu->domain_ids) 1304 return -ENOMEM; 1305 1306 /* 1307 * If Caching mode is set, then invalid translations are tagged 1308 * with domain-id 0, hence we need to pre-allocate it. We also 1309 * use domain-id 0 as a marker for non-allocated domain-id, so 1310 * make sure it is not used for a real domain. 1311 */ 1312 set_bit(0, iommu->domain_ids); 1313 1314 /* 1315 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid 1316 * entry for first-level or pass-through translation modes should 1317 * be programmed with a domain id different from those used for 1318 * second-level or nested translation. We reserve a domain id for 1319 * this purpose. This domain id is also used for identity domain 1320 * in legacy mode. 
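 *
 * Together with domain id 0 above, this is one of the reserved ids that
 * disable_dmar_iommu() later tolerates when checking that no real
 * domain ids remain allocated (NUM_RESERVED_DID).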
1321 */ 1322 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids); 1323 1324 return 0; 1325 } 1326 1327 static void disable_dmar_iommu(struct intel_iommu *iommu) 1328 { 1329 if (!iommu->domain_ids) 1330 return; 1331 1332 /* 1333 * All iommu domains must have been detached from the devices, 1334 * hence there should be no domain IDs in use. 1335 */ 1336 if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap)) 1337 > NUM_RESERVED_DID)) 1338 return; 1339 1340 if (iommu->gcmd & DMA_GCMD_TE) 1341 iommu_disable_translation(iommu); 1342 } 1343 1344 static void free_dmar_iommu(struct intel_iommu *iommu) 1345 { 1346 if (iommu->domain_ids) { 1347 bitmap_free(iommu->domain_ids); 1348 iommu->domain_ids = NULL; 1349 } 1350 1351 if (iommu->copied_tables) { 1352 bitmap_free(iommu->copied_tables); 1353 iommu->copied_tables = NULL; 1354 } 1355 1356 /* free context mapping */ 1357 free_context_table(iommu); 1358 1359 if (ecap_prs(iommu->ecap)) 1360 intel_iommu_finish_prq(iommu); 1361 } 1362 1363 /* 1364 * Check and return whether first level is used by default for 1365 * DMA translation. 1366 */ 1367 static bool first_level_by_default(struct intel_iommu *iommu) 1368 { 1369 /* Only SL is available in legacy mode */ 1370 if (!sm_supported(iommu)) 1371 return false; 1372 1373 /* Only level (either FL or SL) is available, just use it */ 1374 if (ecap_flts(iommu->ecap) ^ ecap_slts(iommu->ecap)) 1375 return ecap_flts(iommu->ecap); 1376 1377 return true; 1378 } 1379 1380 int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) 1381 { 1382 struct iommu_domain_info *info, *curr; 1383 unsigned long ndomains; 1384 int num, ret = -ENOSPC; 1385 1386 if (domain->domain.type == IOMMU_DOMAIN_SVA) 1387 return 0; 1388 1389 info = kzalloc(sizeof(*info), GFP_KERNEL); 1390 if (!info) 1391 return -ENOMEM; 1392 1393 spin_lock(&iommu->lock); 1394 curr = xa_load(&domain->iommu_array, iommu->seq_id); 1395 if (curr) { 1396 curr->refcnt++; 1397 spin_unlock(&iommu->lock); 1398 kfree(info); 1399 return 0; 1400 } 1401 1402 ndomains = cap_ndoms(iommu->cap); 1403 num = find_first_zero_bit(iommu->domain_ids, ndomains); 1404 if (num >= ndomains) { 1405 pr_err("%s: No free domain ids\n", iommu->name); 1406 goto err_unlock; 1407 } 1408 1409 set_bit(num, iommu->domain_ids); 1410 info->refcnt = 1; 1411 info->did = num; 1412 info->iommu = iommu; 1413 curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id, 1414 NULL, info, GFP_ATOMIC); 1415 if (curr) { 1416 ret = xa_err(curr) ? 
: -EBUSY; 1417 goto err_clear; 1418 } 1419 1420 spin_unlock(&iommu->lock); 1421 return 0; 1422 1423 err_clear: 1424 clear_bit(info->did, iommu->domain_ids); 1425 err_unlock: 1426 spin_unlock(&iommu->lock); 1427 kfree(info); 1428 return ret; 1429 } 1430 1431 void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) 1432 { 1433 struct iommu_domain_info *info; 1434 1435 if (domain->domain.type == IOMMU_DOMAIN_SVA) 1436 return; 1437 1438 spin_lock(&iommu->lock); 1439 info = xa_load(&domain->iommu_array, iommu->seq_id); 1440 if (--info->refcnt == 0) { 1441 clear_bit(info->did, iommu->domain_ids); 1442 xa_erase(&domain->iommu_array, iommu->seq_id); 1443 domain->nid = NUMA_NO_NODE; 1444 kfree(info); 1445 } 1446 spin_unlock(&iommu->lock); 1447 } 1448 1449 static void domain_exit(struct dmar_domain *domain) 1450 { 1451 if (domain->pgd) { 1452 LIST_HEAD(freelist); 1453 1454 domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist); 1455 iommu_put_pages_list(&freelist); 1456 } 1457 1458 if (WARN_ON(!list_empty(&domain->devices))) 1459 return; 1460 1461 kfree(domain->qi_batch); 1462 kfree(domain); 1463 } 1464 1465 /* 1466 * For kdump cases, old valid entries may be cached due to the 1467 * in-flight DMA and copied pgtable, but there is no unmapping 1468 * behaviour for them, thus we need an explicit cache flush for 1469 * the newly-mapped device. For kdump, at this point, the device 1470 * is supposed to finish reset at its driver probe stage, so no 1471 * in-flight DMA will exist, and we don't need to worry anymore 1472 * hereafter. 1473 */ 1474 static void copied_context_tear_down(struct intel_iommu *iommu, 1475 struct context_entry *context, 1476 u8 bus, u8 devfn) 1477 { 1478 u16 did_old; 1479 1480 if (!context_copied(iommu, bus, devfn)) 1481 return; 1482 1483 assert_spin_locked(&iommu->lock); 1484 1485 did_old = context_domain_id(context); 1486 context_clear_entry(context); 1487 1488 if (did_old < cap_ndoms(iommu->cap)) { 1489 iommu->flush.flush_context(iommu, did_old, 1490 PCI_DEVID(bus, devfn), 1491 DMA_CCMD_MASK_NOBIT, 1492 DMA_CCMD_DEVICE_INVL); 1493 iommu->flush.flush_iotlb(iommu, did_old, 0, 0, 1494 DMA_TLB_DSI_FLUSH); 1495 } 1496 1497 clear_context_copied(iommu, bus, devfn); 1498 } 1499 1500 /* 1501 * It's a non-present to present mapping. If hardware doesn't cache 1502 * non-present entry we only need to flush the write-buffer. 
If it 1503 * _does_ cache non-present entries, then it does so in the special 1504 * domain #0, which we have to flush: 1505 */ 1506 static void context_present_cache_flush(struct intel_iommu *iommu, u16 did, 1507 u8 bus, u8 devfn) 1508 { 1509 if (cap_caching_mode(iommu->cap)) { 1510 iommu->flush.flush_context(iommu, 0, 1511 PCI_DEVID(bus, devfn), 1512 DMA_CCMD_MASK_NOBIT, 1513 DMA_CCMD_DEVICE_INVL); 1514 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); 1515 } else { 1516 iommu_flush_write_buffer(iommu); 1517 } 1518 } 1519 1520 static int domain_context_mapping_one(struct dmar_domain *domain, 1521 struct intel_iommu *iommu, 1522 u8 bus, u8 devfn) 1523 { 1524 struct device_domain_info *info = 1525 domain_lookup_dev_info(domain, iommu, bus, devfn); 1526 u16 did = domain_id_iommu(domain, iommu); 1527 int translation = CONTEXT_TT_MULTI_LEVEL; 1528 struct dma_pte *pgd = domain->pgd; 1529 struct context_entry *context; 1530 int ret; 1531 1532 pr_debug("Set context mapping for %02x:%02x.%d\n", 1533 bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); 1534 1535 spin_lock(&iommu->lock); 1536 ret = -ENOMEM; 1537 context = iommu_context_addr(iommu, bus, devfn, 1); 1538 if (!context) 1539 goto out_unlock; 1540 1541 ret = 0; 1542 if (context_present(context) && !context_copied(iommu, bus, devfn)) 1543 goto out_unlock; 1544 1545 copied_context_tear_down(iommu, context, bus, devfn); 1546 context_clear_entry(context); 1547 context_set_domain_id(context, did); 1548 1549 if (info && info->ats_supported) 1550 translation = CONTEXT_TT_DEV_IOTLB; 1551 else 1552 translation = CONTEXT_TT_MULTI_LEVEL; 1553 1554 context_set_address_root(context, virt_to_phys(pgd)); 1555 context_set_address_width(context, domain->agaw); 1556 context_set_translation_type(context, translation); 1557 context_set_fault_enable(context); 1558 context_set_present(context); 1559 if (!ecap_coherent(iommu->ecap)) 1560 clflush_cache_range(context, sizeof(*context)); 1561 context_present_cache_flush(iommu, did, bus, devfn); 1562 ret = 0; 1563 1564 out_unlock: 1565 spin_unlock(&iommu->lock); 1566 1567 return ret; 1568 } 1569 1570 static int domain_context_mapping_cb(struct pci_dev *pdev, 1571 u16 alias, void *opaque) 1572 { 1573 struct device_domain_info *info = dev_iommu_priv_get(&pdev->dev); 1574 struct intel_iommu *iommu = info->iommu; 1575 struct dmar_domain *domain = opaque; 1576 1577 return domain_context_mapping_one(domain, iommu, 1578 PCI_BUS_NUM(alias), alias & 0xff); 1579 } 1580 1581 static int 1582 domain_context_mapping(struct dmar_domain *domain, struct device *dev) 1583 { 1584 struct device_domain_info *info = dev_iommu_priv_get(dev); 1585 struct intel_iommu *iommu = info->iommu; 1586 u8 bus = info->bus, devfn = info->devfn; 1587 int ret; 1588 1589 if (!dev_is_pci(dev)) 1590 return domain_context_mapping_one(domain, iommu, bus, devfn); 1591 1592 ret = pci_for_each_dma_alias(to_pci_dev(dev), 1593 domain_context_mapping_cb, domain); 1594 if (ret) 1595 return ret; 1596 1597 iommu_enable_pci_ats(info); 1598 1599 return 0; 1600 } 1601 1602 /* Return largest possible superpage level for a given mapping */ 1603 static int hardware_largepage_caps(struct dmar_domain *domain, unsigned long iov_pfn, 1604 unsigned long phy_pfn, unsigned long pages) 1605 { 1606 int support, level = 1; 1607 unsigned long pfnmerge; 1608 1609 support = domain->iommu_superpage; 1610 1611 /* To use a large page, the virtual *and* physical addresses 1612 must be aligned to 2MiB/1GiB/etc. Lower bits set in either 1613 of them will mean we have to use smaller pages. 
So just 1614 merge them and check both at once. */ 1615 pfnmerge = iov_pfn | phy_pfn; 1616 1617 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) { 1618 pages >>= VTD_STRIDE_SHIFT; 1619 if (!pages) 1620 break; 1621 pfnmerge >>= VTD_STRIDE_SHIFT; 1622 level++; 1623 support--; 1624 } 1625 return level; 1626 } 1627 1628 /* 1629 * Ensure that old small page tables are removed to make room for superpage(s). 1630 * We're going to add new large pages, so make sure we don't remove their parent 1631 * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared. 1632 */ 1633 static void switch_to_super_page(struct dmar_domain *domain, 1634 unsigned long start_pfn, 1635 unsigned long end_pfn, int level) 1636 { 1637 unsigned long lvl_pages = lvl_to_nr_pages(level); 1638 struct dma_pte *pte = NULL; 1639 1640 while (start_pfn <= end_pfn) { 1641 if (!pte) 1642 pte = pfn_to_dma_pte(domain, start_pfn, &level, 1643 GFP_ATOMIC); 1644 1645 if (dma_pte_present(pte)) { 1646 dma_pte_free_pagetable(domain, start_pfn, 1647 start_pfn + lvl_pages - 1, 1648 level + 1); 1649 1650 cache_tag_flush_range(domain, start_pfn << VTD_PAGE_SHIFT, 1651 end_pfn << VTD_PAGE_SHIFT, 0); 1652 } 1653 1654 pte++; 1655 start_pfn += lvl_pages; 1656 if (first_pte_in_page(pte)) 1657 pte = NULL; 1658 } 1659 } 1660 1661 static int 1662 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, 1663 unsigned long phys_pfn, unsigned long nr_pages, int prot, 1664 gfp_t gfp) 1665 { 1666 struct dma_pte *first_pte = NULL, *pte = NULL; 1667 unsigned int largepage_lvl = 0; 1668 unsigned long lvl_pages = 0; 1669 phys_addr_t pteval; 1670 u64 attr; 1671 1672 if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1))) 1673 return -EINVAL; 1674 1675 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0) 1676 return -EINVAL; 1677 1678 if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) { 1679 pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n"); 1680 return -EINVAL; 1681 } 1682 1683 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP); 1684 attr |= DMA_FL_PTE_PRESENT; 1685 if (domain->use_first_level) { 1686 attr |= DMA_FL_PTE_US | DMA_FL_PTE_ACCESS; 1687 if (prot & DMA_PTE_WRITE) 1688 attr |= DMA_FL_PTE_DIRTY; 1689 } 1690 1691 domain->has_mappings = true; 1692 1693 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr; 1694 1695 while (nr_pages > 0) { 1696 uint64_t tmp; 1697 1698 if (!pte) { 1699 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, 1700 phys_pfn, nr_pages); 1701 1702 pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl, 1703 gfp); 1704 if (!pte) 1705 return -ENOMEM; 1706 first_pte = pte; 1707 1708 lvl_pages = lvl_to_nr_pages(largepage_lvl); 1709 1710 /* It is large page*/ 1711 if (largepage_lvl > 1) { 1712 unsigned long end_pfn; 1713 unsigned long pages_to_remove; 1714 1715 pteval |= DMA_PTE_LARGE_PAGE; 1716 pages_to_remove = min_t(unsigned long, nr_pages, 1717 nr_pte_to_next_page(pte) * lvl_pages); 1718 end_pfn = iov_pfn + pages_to_remove - 1; 1719 switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl); 1720 } else { 1721 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE; 1722 } 1723 1724 } 1725 /* We don't need lock here, nobody else 1726 * touches the iova range 1727 */ 1728 tmp = 0ULL; 1729 if (!try_cmpxchg64_local(&pte->val, &tmp, pteval)) { 1730 static int dumps = 5; 1731 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n", 1732 iov_pfn, tmp, (unsigned long 
long)pteval); 1733 if (dumps) { 1734 dumps--; 1735 debug_dma_dump_mappings(NULL); 1736 } 1737 WARN_ON(1); 1738 } 1739 1740 nr_pages -= lvl_pages; 1741 iov_pfn += lvl_pages; 1742 phys_pfn += lvl_pages; 1743 pteval += lvl_pages * VTD_PAGE_SIZE; 1744 1745 /* If the next PTE would be the first in a new page, then we 1746 * need to flush the cache on the entries we've just written. 1747 * And then we'll need to recalculate 'pte', so clear it and 1748 * let it get set again in the if (!pte) block above. 1749 * 1750 * If we're done (!nr_pages) we need to flush the cache too. 1751 * 1752 * Also if we've been setting superpages, we may need to 1753 * recalculate 'pte' and switch back to smaller pages for the 1754 * end of the mapping, if the trailing size is not enough to 1755 * use another superpage (i.e. nr_pages < lvl_pages). 1756 */ 1757 pte++; 1758 if (!nr_pages || first_pte_in_page(pte) || 1759 (largepage_lvl > 1 && nr_pages < lvl_pages)) { 1760 domain_flush_cache(domain, first_pte, 1761 (void *)pte - (void *)first_pte); 1762 pte = NULL; 1763 } 1764 } 1765 1766 return 0; 1767 } 1768 1769 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn) 1770 { 1771 struct intel_iommu *iommu = info->iommu; 1772 struct context_entry *context; 1773 u16 did; 1774 1775 spin_lock(&iommu->lock); 1776 context = iommu_context_addr(iommu, bus, devfn, 0); 1777 if (!context) { 1778 spin_unlock(&iommu->lock); 1779 return; 1780 } 1781 1782 did = context_domain_id(context); 1783 context_clear_entry(context); 1784 __iommu_flush_cache(iommu, context, sizeof(*context)); 1785 spin_unlock(&iommu->lock); 1786 intel_context_flush_no_pasid(info, context, did); 1787 } 1788 1789 int __domain_setup_first_level(struct intel_iommu *iommu, 1790 struct device *dev, ioasid_t pasid, 1791 u16 did, pgd_t *pgd, int flags, 1792 struct iommu_domain *old) 1793 { 1794 if (!old) 1795 return intel_pasid_setup_first_level(iommu, dev, pgd, 1796 pasid, did, flags); 1797 return intel_pasid_replace_first_level(iommu, dev, pgd, pasid, did, 1798 iommu_domain_did(old, iommu), 1799 flags); 1800 } 1801 1802 static int domain_setup_second_level(struct intel_iommu *iommu, 1803 struct dmar_domain *domain, 1804 struct device *dev, ioasid_t pasid, 1805 struct iommu_domain *old) 1806 { 1807 if (!old) 1808 return intel_pasid_setup_second_level(iommu, domain, 1809 dev, pasid); 1810 return intel_pasid_replace_second_level(iommu, domain, dev, 1811 iommu_domain_did(old, iommu), 1812 pasid); 1813 } 1814 1815 static int domain_setup_passthrough(struct intel_iommu *iommu, 1816 struct device *dev, ioasid_t pasid, 1817 struct iommu_domain *old) 1818 { 1819 if (!old) 1820 return intel_pasid_setup_pass_through(iommu, dev, pasid); 1821 return intel_pasid_replace_pass_through(iommu, dev, 1822 iommu_domain_did(old, iommu), 1823 pasid); 1824 } 1825 1826 static int domain_setup_first_level(struct intel_iommu *iommu, 1827 struct dmar_domain *domain, 1828 struct device *dev, 1829 u32 pasid, struct iommu_domain *old) 1830 { 1831 struct dma_pte *pgd = domain->pgd; 1832 int level, flags = 0; 1833 1834 level = agaw_to_level(domain->agaw); 1835 if (level != 4 && level != 5) 1836 return -EINVAL; 1837 1838 if (level == 5) 1839 flags |= PASID_FLAG_FL5LP; 1840 1841 if (domain->force_snooping) 1842 flags |= PASID_FLAG_PAGE_SNOOP; 1843 1844 return __domain_setup_first_level(iommu, dev, pasid, 1845 domain_id_iommu(domain, iommu), 1846 (pgd_t *)pgd, flags, old); 1847 } 1848 1849 static int dmar_domain_attach_device(struct dmar_domain *domain, 1850 struct 
device *dev) 1851 { 1852 struct device_domain_info *info = dev_iommu_priv_get(dev); 1853 struct intel_iommu *iommu = info->iommu; 1854 unsigned long flags; 1855 int ret; 1856 1857 ret = domain_attach_iommu(domain, iommu); 1858 if (ret) 1859 return ret; 1860 1861 info->domain = domain; 1862 spin_lock_irqsave(&domain->lock, flags); 1863 list_add(&info->link, &domain->devices); 1864 spin_unlock_irqrestore(&domain->lock, flags); 1865 1866 if (dev_is_real_dma_subdevice(dev)) 1867 return 0; 1868 1869 if (!sm_supported(iommu)) 1870 ret = domain_context_mapping(domain, dev); 1871 else if (domain->use_first_level) 1872 ret = domain_setup_first_level(iommu, domain, dev, 1873 IOMMU_NO_PASID, NULL); 1874 else 1875 ret = domain_setup_second_level(iommu, domain, dev, 1876 IOMMU_NO_PASID, NULL); 1877 1878 if (ret) 1879 goto out_block_translation; 1880 1881 ret = cache_tag_assign_domain(domain, dev, IOMMU_NO_PASID); 1882 if (ret) 1883 goto out_block_translation; 1884 1885 return 0; 1886 1887 out_block_translation: 1888 device_block_translation(dev); 1889 return ret; 1890 } 1891 1892 /** 1893 * device_rmrr_is_relaxable - Test whether the RMRR of this device 1894 * is relaxable (i.e. is allowed to be not enforced under some conditions) 1895 * @dev: device handle 1896 * 1897 * We assume that PCI USB devices with RMRRs have them largely 1898 * for historical reasons and that the RMRR space is not actively used post 1899 * boot. This exclusion may change if vendors begin to abuse it. 1900 * 1901 * The same exception is made for graphics devices, with the requirement that 1902 * any use of the RMRR regions will be torn down before assigning the device 1903 * to a guest. 1904 * 1905 * Return: true if the RMRR is relaxable, false otherwise 1906 */ 1907 static bool device_rmrr_is_relaxable(struct device *dev) 1908 { 1909 struct pci_dev *pdev; 1910 1911 if (!dev_is_pci(dev)) 1912 return false; 1913 1914 pdev = to_pci_dev(dev); 1915 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev)) 1916 return true; 1917 else 1918 return false; 1919 } 1920 1921 static int device_def_domain_type(struct device *dev) 1922 { 1923 struct device_domain_info *info = dev_iommu_priv_get(dev); 1924 struct intel_iommu *iommu = info->iommu; 1925 1926 /* 1927 * Hardware does not support the passthrough translation mode. 1928 * Always use a dynamic mapping domain. 1929 */ 1930 if (!ecap_pass_through(iommu->ecap)) 1931 return IOMMU_DOMAIN_DMA; 1932 1933 if (dev_is_pci(dev)) { 1934 struct pci_dev *pdev = to_pci_dev(dev); 1935 1936 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev)) 1937 return IOMMU_DOMAIN_IDENTITY; 1938 } 1939 1940 return 0; 1941 } 1942 1943 static void intel_iommu_init_qi(struct intel_iommu *iommu) 1944 { 1945 /* 1946 * Start from the sane iommu hardware state. 1947 * If the queued invalidation is already initialized by us 1948 * (for example, while enabling interrupt-remapping) then 1949 * we got the things already rolling from a sane state. 1950 */ 1951 if (!iommu->qi) { 1952 /* 1953 * Clear any previous faults. 1954 */ 1955 dmar_fault(-1, iommu); 1956 /* 1957 * Disable queued invalidation if supported and already enabled 1958 * before OS handover. 
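 * Queued invalidation may have been left enabled by the firmware or by
 * a previous kernel (e.g. across kexec), so it is torn down here and
 * dmar_enable_qi() below restarts it from a clean state. If it cannot
 * be enabled at all, the register-based __iommu_flush_context() and
 * __iommu_flush_iotlb() callbacks are used instead.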
1959 */ 1960 dmar_disable_qi(iommu); 1961 } 1962 1963 if (dmar_enable_qi(iommu)) { 1964 /* 1965 * Queued Invalidate not enabled, use Register Based Invalidate 1966 */ 1967 iommu->flush.flush_context = __iommu_flush_context; 1968 iommu->flush.flush_iotlb = __iommu_flush_iotlb; 1969 pr_info("%s: Using Register based invalidation\n", 1970 iommu->name); 1971 } else { 1972 iommu->flush.flush_context = qi_flush_context; 1973 iommu->flush.flush_iotlb = qi_flush_iotlb; 1974 pr_info("%s: Using Queued invalidation\n", iommu->name); 1975 } 1976 } 1977 1978 static int copy_context_table(struct intel_iommu *iommu, 1979 struct root_entry *old_re, 1980 struct context_entry **tbl, 1981 int bus, bool ext) 1982 { 1983 int tbl_idx, pos = 0, idx, devfn, ret = 0, did; 1984 struct context_entry *new_ce = NULL, ce; 1985 struct context_entry *old_ce = NULL; 1986 struct root_entry re; 1987 phys_addr_t old_ce_phys; 1988 1989 tbl_idx = ext ? bus * 2 : bus; 1990 memcpy(&re, old_re, sizeof(re)); 1991 1992 for (devfn = 0; devfn < 256; devfn++) { 1993 /* First calculate the correct index */ 1994 idx = (ext ? devfn * 2 : devfn) % 256; 1995 1996 if (idx == 0) { 1997 /* First save what we may have and clean up */ 1998 if (new_ce) { 1999 tbl[tbl_idx] = new_ce; 2000 __iommu_flush_cache(iommu, new_ce, 2001 VTD_PAGE_SIZE); 2002 pos = 1; 2003 } 2004 2005 if (old_ce) 2006 memunmap(old_ce); 2007 2008 ret = 0; 2009 if (devfn < 0x80) 2010 old_ce_phys = root_entry_lctp(&re); 2011 else 2012 old_ce_phys = root_entry_uctp(&re); 2013 2014 if (!old_ce_phys) { 2015 if (ext && devfn == 0) { 2016 /* No LCTP, try UCTP */ 2017 devfn = 0x7f; 2018 continue; 2019 } else { 2020 goto out; 2021 } 2022 } 2023 2024 ret = -ENOMEM; 2025 old_ce = memremap(old_ce_phys, PAGE_SIZE, 2026 MEMREMAP_WB); 2027 if (!old_ce) 2028 goto out; 2029 2030 new_ce = iommu_alloc_page_node(iommu->node, GFP_KERNEL); 2031 if (!new_ce) 2032 goto out_unmap; 2033 2034 ret = 0; 2035 } 2036 2037 /* Now copy the context entry */ 2038 memcpy(&ce, old_ce + idx, sizeof(ce)); 2039 2040 if (!context_present(&ce)) 2041 continue; 2042 2043 did = context_domain_id(&ce); 2044 if (did >= 0 && did < cap_ndoms(iommu->cap)) 2045 set_bit(did, iommu->domain_ids); 2046 2047 set_context_copied(iommu, bus, devfn); 2048 new_ce[idx] = ce; 2049 } 2050 2051 tbl[tbl_idx + pos] = new_ce; 2052 2053 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE); 2054 2055 out_unmap: 2056 memunmap(old_ce); 2057 2058 out: 2059 return ret; 2060 } 2061 2062 static int copy_translation_tables(struct intel_iommu *iommu) 2063 { 2064 struct context_entry **ctxt_tbls; 2065 struct root_entry *old_rt; 2066 phys_addr_t old_rt_phys; 2067 int ctxt_table_entries; 2068 u64 rtaddr_reg; 2069 int bus, ret; 2070 bool new_ext, ext; 2071 2072 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG); 2073 ext = !!(rtaddr_reg & DMA_RTADDR_SMT); 2074 new_ext = !!sm_supported(iommu); 2075 2076 /* 2077 * The RTT bit can only be changed when translation is disabled, 2078 * but disabling translation means to open a window for data 2079 * corruption. So bail out and don't copy anything if we would 2080 * have to change the bit. 
2081 */ 2082 if (new_ext != ext) 2083 return -EINVAL; 2084 2085 iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL); 2086 if (!iommu->copied_tables) 2087 return -ENOMEM; 2088 2089 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK; 2090 if (!old_rt_phys) 2091 return -EINVAL; 2092 2093 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB); 2094 if (!old_rt) 2095 return -ENOMEM; 2096 2097 /* This is too big for the stack - allocate it from slab */ 2098 ctxt_table_entries = ext ? 512 : 256; 2099 ret = -ENOMEM; 2100 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL); 2101 if (!ctxt_tbls) 2102 goto out_unmap; 2103 2104 for (bus = 0; bus < 256; bus++) { 2105 ret = copy_context_table(iommu, &old_rt[bus], 2106 ctxt_tbls, bus, ext); 2107 if (ret) { 2108 pr_err("%s: Failed to copy context table for bus %d\n", 2109 iommu->name, bus); 2110 continue; 2111 } 2112 } 2113 2114 spin_lock(&iommu->lock); 2115 2116 /* Context tables are copied, now write them to the root_entry table */ 2117 for (bus = 0; bus < 256; bus++) { 2118 int idx = ext ? bus * 2 : bus; 2119 u64 val; 2120 2121 if (ctxt_tbls[idx]) { 2122 val = virt_to_phys(ctxt_tbls[idx]) | 1; 2123 iommu->root_entry[bus].lo = val; 2124 } 2125 2126 if (!ext || !ctxt_tbls[idx + 1]) 2127 continue; 2128 2129 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1; 2130 iommu->root_entry[bus].hi = val; 2131 } 2132 2133 spin_unlock(&iommu->lock); 2134 2135 kfree(ctxt_tbls); 2136 2137 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE); 2138 2139 ret = 0; 2140 2141 out_unmap: 2142 memunmap(old_rt); 2143 2144 return ret; 2145 } 2146 2147 static int __init init_dmars(void) 2148 { 2149 struct dmar_drhd_unit *drhd; 2150 struct intel_iommu *iommu; 2151 int ret; 2152 2153 for_each_iommu(iommu, drhd) { 2154 if (drhd->ignored) { 2155 iommu_disable_translation(iommu); 2156 continue; 2157 } 2158 2159 /* 2160 * Find the max pasid size of all IOMMU's in the system. 2161 * We need to ensure the system pasid table is no bigger 2162 * than the smallest supported. 2163 */ 2164 if (pasid_supported(iommu)) { 2165 u32 temp = 2 << ecap_pss(iommu->ecap); 2166 2167 intel_pasid_max_id = min_t(u32, temp, 2168 intel_pasid_max_id); 2169 } 2170 2171 intel_iommu_init_qi(iommu); 2172 2173 ret = iommu_init_domains(iommu); 2174 if (ret) 2175 goto free_iommu; 2176 2177 init_translation_status(iommu); 2178 2179 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) { 2180 iommu_disable_translation(iommu); 2181 clear_translation_pre_enabled(iommu); 2182 pr_warn("Translation was enabled for %s but we are not in kdump mode\n", 2183 iommu->name); 2184 } 2185 2186 /* 2187 * TBD: 2188 * we could share the same root & context tables 2189 * among all IOMMU's. Need to Split it later. 2190 */ 2191 ret = iommu_alloc_root_entry(iommu); 2192 if (ret) 2193 goto free_iommu; 2194 2195 if (translation_pre_enabled(iommu)) { 2196 pr_info("Translation already enabled - trying to copy translation structures\n"); 2197 2198 ret = copy_translation_tables(iommu); 2199 if (ret) { 2200 /* 2201 * We found the IOMMU with translation 2202 * enabled - but failed to copy over the 2203 * old root-entry table. Try to proceed 2204 * by disabling translation now and 2205 * allocating a clean root-entry table. 2206 * This might cause DMAR faults, but 2207 * probably the dump will still succeed. 
2208 */ 2209 pr_err("Failed to copy translation tables from previous kernel for %s\n", 2210 iommu->name); 2211 iommu_disable_translation(iommu); 2212 clear_translation_pre_enabled(iommu); 2213 } else { 2214 pr_info("Copied translation tables from previous kernel for %s\n", 2215 iommu->name); 2216 } 2217 } 2218 2219 intel_svm_check(iommu); 2220 } 2221 2222 /* 2223 * Now that qi is enabled on all iommus, set the root entry and flush 2224 * caches. This is required on some Intel X58 chipsets, otherwise the 2225 * flush_context function will loop forever and the boot hangs. 2226 */ 2227 for_each_active_iommu(iommu, drhd) { 2228 iommu_flush_write_buffer(iommu); 2229 iommu_set_root_entry(iommu); 2230 } 2231 2232 check_tylersburg_isoch(); 2233 2234 /* 2235 * for each drhd 2236 * enable fault log 2237 * global invalidate context cache 2238 * global invalidate iotlb 2239 * enable translation 2240 */ 2241 for_each_iommu(iommu, drhd) { 2242 if (drhd->ignored) { 2243 /* 2244 * we always have to disable PMRs or DMA may fail on 2245 * this device 2246 */ 2247 if (force_on) 2248 iommu_disable_protect_mem_regions(iommu); 2249 continue; 2250 } 2251 2252 iommu_flush_write_buffer(iommu); 2253 2254 if (ecap_prs(iommu->ecap)) { 2255 /* 2256 * Call dmar_alloc_hwirq() with dmar_global_lock held, 2257 * could cause possible lock race condition. 2258 */ 2259 up_write(&dmar_global_lock); 2260 ret = intel_iommu_enable_prq(iommu); 2261 down_write(&dmar_global_lock); 2262 if (ret) 2263 goto free_iommu; 2264 } 2265 2266 ret = dmar_set_interrupt(iommu); 2267 if (ret) 2268 goto free_iommu; 2269 } 2270 2271 return 0; 2272 2273 free_iommu: 2274 for_each_active_iommu(iommu, drhd) { 2275 disable_dmar_iommu(iommu); 2276 free_dmar_iommu(iommu); 2277 } 2278 2279 return ret; 2280 } 2281 2282 static void __init init_no_remapping_devices(void) 2283 { 2284 struct dmar_drhd_unit *drhd; 2285 struct device *dev; 2286 int i; 2287 2288 for_each_drhd_unit(drhd) { 2289 if (!drhd->include_all) { 2290 for_each_active_dev_scope(drhd->devices, 2291 drhd->devices_cnt, i, dev) 2292 break; 2293 /* ignore DMAR unit if no devices exist */ 2294 if (i == drhd->devices_cnt) 2295 drhd->ignored = 1; 2296 } 2297 } 2298 2299 for_each_active_drhd_unit(drhd) { 2300 if (drhd->include_all) 2301 continue; 2302 2303 for_each_active_dev_scope(drhd->devices, 2304 drhd->devices_cnt, i, dev) 2305 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev))) 2306 break; 2307 if (i < drhd->devices_cnt) 2308 continue; 2309 2310 /* This IOMMU has *only* gfx devices. 
Either bypass it or 2311 set the gfx_mapped flag, as appropriate */ 2312 drhd->gfx_dedicated = 1; 2313 if (disable_igfx_iommu) 2314 drhd->ignored = 1; 2315 } 2316 } 2317 2318 #ifdef CONFIG_SUSPEND 2319 static int init_iommu_hw(void) 2320 { 2321 struct dmar_drhd_unit *drhd; 2322 struct intel_iommu *iommu = NULL; 2323 int ret; 2324 2325 for_each_active_iommu(iommu, drhd) { 2326 if (iommu->qi) { 2327 ret = dmar_reenable_qi(iommu); 2328 if (ret) 2329 return ret; 2330 } 2331 } 2332 2333 for_each_iommu(iommu, drhd) { 2334 if (drhd->ignored) { 2335 /* 2336 * we always have to disable PMRs or DMA may fail on 2337 * this device 2338 */ 2339 if (force_on) 2340 iommu_disable_protect_mem_regions(iommu); 2341 continue; 2342 } 2343 2344 iommu_flush_write_buffer(iommu); 2345 iommu_set_root_entry(iommu); 2346 iommu_enable_translation(iommu); 2347 iommu_disable_protect_mem_regions(iommu); 2348 } 2349 2350 return 0; 2351 } 2352 2353 static void iommu_flush_all(void) 2354 { 2355 struct dmar_drhd_unit *drhd; 2356 struct intel_iommu *iommu; 2357 2358 for_each_active_iommu(iommu, drhd) { 2359 iommu->flush.flush_context(iommu, 0, 0, 0, 2360 DMA_CCMD_GLOBAL_INVL); 2361 iommu->flush.flush_iotlb(iommu, 0, 0, 0, 2362 DMA_TLB_GLOBAL_FLUSH); 2363 } 2364 } 2365 2366 static int iommu_suspend(void) 2367 { 2368 struct dmar_drhd_unit *drhd; 2369 struct intel_iommu *iommu = NULL; 2370 unsigned long flag; 2371 2372 iommu_flush_all(); 2373 2374 for_each_active_iommu(iommu, drhd) { 2375 iommu_disable_translation(iommu); 2376 2377 raw_spin_lock_irqsave(&iommu->register_lock, flag); 2378 2379 iommu->iommu_state[SR_DMAR_FECTL_REG] = 2380 readl(iommu->reg + DMAR_FECTL_REG); 2381 iommu->iommu_state[SR_DMAR_FEDATA_REG] = 2382 readl(iommu->reg + DMAR_FEDATA_REG); 2383 iommu->iommu_state[SR_DMAR_FEADDR_REG] = 2384 readl(iommu->reg + DMAR_FEADDR_REG); 2385 iommu->iommu_state[SR_DMAR_FEUADDR_REG] = 2386 readl(iommu->reg + DMAR_FEUADDR_REG); 2387 2388 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 2389 } 2390 return 0; 2391 } 2392 2393 static void iommu_resume(void) 2394 { 2395 struct dmar_drhd_unit *drhd; 2396 struct intel_iommu *iommu = NULL; 2397 unsigned long flag; 2398 2399 if (init_iommu_hw()) { 2400 if (force_on) 2401 panic("tboot: IOMMU setup failed, DMAR can not resume!\n"); 2402 else 2403 WARN(1, "IOMMU setup failed, DMAR can not resume!\n"); 2404 return; 2405 } 2406 2407 for_each_active_iommu(iommu, drhd) { 2408 2409 raw_spin_lock_irqsave(&iommu->register_lock, flag); 2410 2411 writel(iommu->iommu_state[SR_DMAR_FECTL_REG], 2412 iommu->reg + DMAR_FECTL_REG); 2413 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG], 2414 iommu->reg + DMAR_FEDATA_REG); 2415 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG], 2416 iommu->reg + DMAR_FEADDR_REG); 2417 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG], 2418 iommu->reg + DMAR_FEUADDR_REG); 2419 2420 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 2421 } 2422 } 2423 2424 static struct syscore_ops iommu_syscore_ops = { 2425 .resume = iommu_resume, 2426 .suspend = iommu_suspend, 2427 }; 2428 2429 static void __init init_iommu_pm_ops(void) 2430 { 2431 register_syscore_ops(&iommu_syscore_ops); 2432 } 2433 2434 #else 2435 static inline void init_iommu_pm_ops(void) {} 2436 #endif /* CONFIG_PM */ 2437 2438 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr) 2439 { 2440 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) || 2441 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) || 2442 rmrr->end_address <= rmrr->base_address || 2443 arch_rmrr_sanity_check(rmrr)) 2444 return 
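	/*
	 * For illustration, the conditions above boil down to requiring a
	 * non-empty, page-aligned [base, end] window plus an architecture
	 * veto; a condensed sketch (the helper name rmrr_range_ok() is
	 * hypothetical and exists only in this comment):
	 *
	 *	static bool rmrr_range_ok(u64 base, u64 end)
	 *	{
	 *		return IS_ALIGNED(base, PAGE_SIZE) &&
	 *		       IS_ALIGNED(end + 1, PAGE_SIZE) &&
	 *		       end > base;
	 *	}
	 *
	 * rmrr_sanity_check() rejects the region when that predicate would
	 * be false or when arch_rmrr_sanity_check() objects.
	 */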
-EINVAL; 2445 2446 return 0; 2447 } 2448 2449 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg) 2450 { 2451 struct acpi_dmar_reserved_memory *rmrr; 2452 struct dmar_rmrr_unit *rmrru; 2453 2454 rmrr = (struct acpi_dmar_reserved_memory *)header; 2455 if (rmrr_sanity_check(rmrr)) { 2456 pr_warn(FW_BUG 2457 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n" 2458 "BIOS vendor: %s; Ver: %s; Product Version: %s\n", 2459 rmrr->base_address, rmrr->end_address, 2460 dmi_get_system_info(DMI_BIOS_VENDOR), 2461 dmi_get_system_info(DMI_BIOS_VERSION), 2462 dmi_get_system_info(DMI_PRODUCT_VERSION)); 2463 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); 2464 } 2465 2466 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL); 2467 if (!rmrru) 2468 goto out; 2469 2470 rmrru->hdr = header; 2471 2472 rmrru->base_address = rmrr->base_address; 2473 rmrru->end_address = rmrr->end_address; 2474 2475 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1), 2476 ((void *)rmrr) + rmrr->header.length, 2477 &rmrru->devices_cnt); 2478 if (rmrru->devices_cnt && rmrru->devices == NULL) 2479 goto free_rmrru; 2480 2481 list_add(&rmrru->list, &dmar_rmrr_units); 2482 2483 return 0; 2484 free_rmrru: 2485 kfree(rmrru); 2486 out: 2487 return -ENOMEM; 2488 } 2489 2490 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr) 2491 { 2492 struct dmar_atsr_unit *atsru; 2493 struct acpi_dmar_atsr *tmp; 2494 2495 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list, 2496 dmar_rcu_check()) { 2497 tmp = (struct acpi_dmar_atsr *)atsru->hdr; 2498 if (atsr->segment != tmp->segment) 2499 continue; 2500 if (atsr->header.length != tmp->header.length) 2501 continue; 2502 if (memcmp(atsr, tmp, atsr->header.length) == 0) 2503 return atsru; 2504 } 2505 2506 return NULL; 2507 } 2508 2509 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg) 2510 { 2511 struct acpi_dmar_atsr *atsr; 2512 struct dmar_atsr_unit *atsru; 2513 2514 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled) 2515 return 0; 2516 2517 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 2518 atsru = dmar_find_atsr(atsr); 2519 if (atsru) 2520 return 0; 2521 2522 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL); 2523 if (!atsru) 2524 return -ENOMEM; 2525 2526 /* 2527 * If memory is allocated from slab by ACPI _DSM method, we need to 2528 * copy the memory content because the memory buffer will be freed 2529 * on return. 
2530 */ 2531 atsru->hdr = (void *)(atsru + 1); 2532 memcpy(atsru->hdr, hdr, hdr->length); 2533 atsru->include_all = atsr->flags & 0x1; 2534 if (!atsru->include_all) { 2535 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1), 2536 (void *)atsr + atsr->header.length, 2537 &atsru->devices_cnt); 2538 if (atsru->devices_cnt && atsru->devices == NULL) { 2539 kfree(atsru); 2540 return -ENOMEM; 2541 } 2542 } 2543 2544 list_add_rcu(&atsru->list, &dmar_atsr_units); 2545 2546 return 0; 2547 } 2548 2549 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru) 2550 { 2551 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt); 2552 kfree(atsru); 2553 } 2554 2555 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg) 2556 { 2557 struct acpi_dmar_atsr *atsr; 2558 struct dmar_atsr_unit *atsru; 2559 2560 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 2561 atsru = dmar_find_atsr(atsr); 2562 if (atsru) { 2563 list_del_rcu(&atsru->list); 2564 synchronize_rcu(); 2565 intel_iommu_free_atsr(atsru); 2566 } 2567 2568 return 0; 2569 } 2570 2571 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg) 2572 { 2573 int i; 2574 struct device *dev; 2575 struct acpi_dmar_atsr *atsr; 2576 struct dmar_atsr_unit *atsru; 2577 2578 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 2579 atsru = dmar_find_atsr(atsr); 2580 if (!atsru) 2581 return 0; 2582 2583 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) { 2584 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt, 2585 i, dev) 2586 return -EBUSY; 2587 } 2588 2589 return 0; 2590 } 2591 2592 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc) 2593 { 2594 struct dmar_satc_unit *satcu; 2595 struct acpi_dmar_satc *tmp; 2596 2597 list_for_each_entry_rcu(satcu, &dmar_satc_units, list, 2598 dmar_rcu_check()) { 2599 tmp = (struct acpi_dmar_satc *)satcu->hdr; 2600 if (satc->segment != tmp->segment) 2601 continue; 2602 if (satc->header.length != tmp->header.length) 2603 continue; 2604 if (memcmp(satc, tmp, satc->header.length) == 0) 2605 return satcu; 2606 } 2607 2608 return NULL; 2609 } 2610 2611 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg) 2612 { 2613 struct acpi_dmar_satc *satc; 2614 struct dmar_satc_unit *satcu; 2615 2616 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled) 2617 return 0; 2618 2619 satc = container_of(hdr, struct acpi_dmar_satc, header); 2620 satcu = dmar_find_satc(satc); 2621 if (satcu) 2622 return 0; 2623 2624 satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL); 2625 if (!satcu) 2626 return -ENOMEM; 2627 2628 satcu->hdr = (void *)(satcu + 1); 2629 memcpy(satcu->hdr, hdr, hdr->length); 2630 satcu->atc_required = satc->flags & 0x1; 2631 satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1), 2632 (void *)satc + satc->header.length, 2633 &satcu->devices_cnt); 2634 if (satcu->devices_cnt && !satcu->devices) { 2635 kfree(satcu); 2636 return -ENOMEM; 2637 } 2638 list_add_rcu(&satcu->list, &dmar_satc_units); 2639 2640 return 0; 2641 } 2642 2643 static int intel_iommu_add(struct dmar_drhd_unit *dmaru) 2644 { 2645 struct intel_iommu *iommu = dmaru->iommu; 2646 int ret; 2647 2648 /* 2649 * Disable translation if already enabled prior to OS handover. 
2650 */ 2651 if (iommu->gcmd & DMA_GCMD_TE) 2652 iommu_disable_translation(iommu); 2653 2654 ret = iommu_init_domains(iommu); 2655 if (ret == 0) 2656 ret = iommu_alloc_root_entry(iommu); 2657 if (ret) 2658 goto out; 2659 2660 intel_svm_check(iommu); 2661 2662 if (dmaru->ignored) { 2663 /* 2664 * we always have to disable PMRs or DMA may fail on this device 2665 */ 2666 if (force_on) 2667 iommu_disable_protect_mem_regions(iommu); 2668 return 0; 2669 } 2670 2671 intel_iommu_init_qi(iommu); 2672 iommu_flush_write_buffer(iommu); 2673 2674 if (ecap_prs(iommu->ecap)) { 2675 ret = intel_iommu_enable_prq(iommu); 2676 if (ret) 2677 goto disable_iommu; 2678 } 2679 2680 ret = dmar_set_interrupt(iommu); 2681 if (ret) 2682 goto disable_iommu; 2683 2684 iommu_set_root_entry(iommu); 2685 iommu_enable_translation(iommu); 2686 2687 iommu_disable_protect_mem_regions(iommu); 2688 return 0; 2689 2690 disable_iommu: 2691 disable_dmar_iommu(iommu); 2692 out: 2693 free_dmar_iommu(iommu); 2694 return ret; 2695 } 2696 2697 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert) 2698 { 2699 int ret = 0; 2700 struct intel_iommu *iommu = dmaru->iommu; 2701 2702 if (!intel_iommu_enabled) 2703 return 0; 2704 if (iommu == NULL) 2705 return -EINVAL; 2706 2707 if (insert) { 2708 ret = intel_iommu_add(dmaru); 2709 } else { 2710 disable_dmar_iommu(iommu); 2711 free_dmar_iommu(iommu); 2712 } 2713 2714 return ret; 2715 } 2716 2717 static void intel_iommu_free_dmars(void) 2718 { 2719 struct dmar_rmrr_unit *rmrru, *rmrr_n; 2720 struct dmar_atsr_unit *atsru, *atsr_n; 2721 struct dmar_satc_unit *satcu, *satc_n; 2722 2723 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) { 2724 list_del(&rmrru->list); 2725 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt); 2726 kfree(rmrru); 2727 } 2728 2729 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) { 2730 list_del(&atsru->list); 2731 intel_iommu_free_atsr(atsru); 2732 } 2733 list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) { 2734 list_del(&satcu->list); 2735 dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt); 2736 kfree(satcu); 2737 } 2738 } 2739 2740 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev) 2741 { 2742 struct dmar_satc_unit *satcu; 2743 struct acpi_dmar_satc *satc; 2744 struct device *tmp; 2745 int i; 2746 2747 dev = pci_physfn(dev); 2748 rcu_read_lock(); 2749 2750 list_for_each_entry_rcu(satcu, &dmar_satc_units, list) { 2751 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header); 2752 if (satc->segment != pci_domain_nr(dev->bus)) 2753 continue; 2754 for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp) 2755 if (to_pci_dev(tmp) == dev) 2756 goto out; 2757 } 2758 satcu = NULL; 2759 out: 2760 rcu_read_unlock(); 2761 return satcu; 2762 } 2763 2764 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu) 2765 { 2766 int i, ret = 1; 2767 struct pci_bus *bus; 2768 struct pci_dev *bridge = NULL; 2769 struct device *tmp; 2770 struct acpi_dmar_atsr *atsr; 2771 struct dmar_atsr_unit *atsru; 2772 struct dmar_satc_unit *satcu; 2773 2774 dev = pci_physfn(dev); 2775 satcu = dmar_find_matched_satc_unit(dev); 2776 if (satcu) 2777 /* 2778 * This device supports ATS as it is in SATC table. 2779 * When IOMMU is in legacy mode, enabling ATS is done 2780 * automatically by HW for the device that requires 2781 * ATS, hence OS should not enable this device ATS 2782 * to avoid duplicated TLB invalidation. 
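		 *
		 * Spelled out, the expression below evaluates as:
		 *
		 *	atc_required	sm_supported	result
		 *	0		any		1 (OS may enable ATS)
		 *	1		1		1 (scalable mode, OS enables ATS)
		 *	1		0		0 (legacy mode, HW already did)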
2783 */ 2784 return !(satcu->atc_required && !sm_supported(iommu)); 2785 2786 for (bus = dev->bus; bus; bus = bus->parent) { 2787 bridge = bus->self; 2788 /* If it's an integrated device, allow ATS */ 2789 if (!bridge) 2790 return 1; 2791 /* Connected via non-PCIe: no ATS */ 2792 if (!pci_is_pcie(bridge) || 2793 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE) 2794 return 0; 2795 /* If we found the root port, look it up in the ATSR */ 2796 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT) 2797 break; 2798 } 2799 2800 rcu_read_lock(); 2801 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) { 2802 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header); 2803 if (atsr->segment != pci_domain_nr(dev->bus)) 2804 continue; 2805 2806 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp) 2807 if (tmp == &bridge->dev) 2808 goto out; 2809 2810 if (atsru->include_all) 2811 goto out; 2812 } 2813 ret = 0; 2814 out: 2815 rcu_read_unlock(); 2816 2817 return ret; 2818 } 2819 2820 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info) 2821 { 2822 int ret; 2823 struct dmar_rmrr_unit *rmrru; 2824 struct dmar_atsr_unit *atsru; 2825 struct dmar_satc_unit *satcu; 2826 struct acpi_dmar_atsr *atsr; 2827 struct acpi_dmar_reserved_memory *rmrr; 2828 struct acpi_dmar_satc *satc; 2829 2830 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING) 2831 return 0; 2832 2833 list_for_each_entry(rmrru, &dmar_rmrr_units, list) { 2834 rmrr = container_of(rmrru->hdr, 2835 struct acpi_dmar_reserved_memory, header); 2836 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 2837 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1), 2838 ((void *)rmrr) + rmrr->header.length, 2839 rmrr->segment, rmrru->devices, 2840 rmrru->devices_cnt); 2841 if (ret < 0) 2842 return ret; 2843 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 2844 dmar_remove_dev_scope(info, rmrr->segment, 2845 rmrru->devices, rmrru->devices_cnt); 2846 } 2847 } 2848 2849 list_for_each_entry(atsru, &dmar_atsr_units, list) { 2850 if (atsru->include_all) 2851 continue; 2852 2853 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header); 2854 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 2855 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1), 2856 (void *)atsr + atsr->header.length, 2857 atsr->segment, atsru->devices, 2858 atsru->devices_cnt); 2859 if (ret > 0) 2860 break; 2861 else if (ret < 0) 2862 return ret; 2863 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 2864 if (dmar_remove_dev_scope(info, atsr->segment, 2865 atsru->devices, atsru->devices_cnt)) 2866 break; 2867 } 2868 } 2869 list_for_each_entry(satcu, &dmar_satc_units, list) { 2870 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header); 2871 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 2872 ret = dmar_insert_dev_scope(info, (void *)(satc + 1), 2873 (void *)satc + satc->header.length, 2874 satc->segment, satcu->devices, 2875 satcu->devices_cnt); 2876 if (ret > 0) 2877 break; 2878 else if (ret < 0) 2879 return ret; 2880 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 2881 if (dmar_remove_dev_scope(info, satc->segment, 2882 satcu->devices, satcu->devices_cnt)) 2883 break; 2884 } 2885 } 2886 2887 return 0; 2888 } 2889 2890 static void intel_disable_iommus(void) 2891 { 2892 struct intel_iommu *iommu = NULL; 2893 struct dmar_drhd_unit *drhd; 2894 2895 for_each_iommu(iommu, drhd) 2896 iommu_disable_translation(iommu); 2897 } 2898 2899 void intel_iommu_shutdown(void) 2900 { 2901 struct dmar_drhd_unit *drhd; 2902 struct intel_iommu *iommu = NULL; 2903 2904 
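	/*
	 * Teardown sketch for the code below: bail out early if the IOMMU
	 * was never enabled, otherwise walk every DRHD and disable the
	 * protected memory regions first, then switch translation off.
	 */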
if (no_iommu || dmar_disabled) 2905 return; 2906 2907 /* 2908 * All other CPUs were brought down, hotplug interrupts were disabled, 2909 * no lock and RCU checking needed anymore 2910 */ 2911 list_for_each_entry(drhd, &dmar_drhd_units, list) { 2912 iommu = drhd->iommu; 2913 2914 /* Disable PMRs explicitly here. */ 2915 iommu_disable_protect_mem_regions(iommu); 2916 2917 /* Make sure the IOMMUs are switched off */ 2918 iommu_disable_translation(iommu); 2919 } 2920 } 2921 2922 static struct intel_iommu *dev_to_intel_iommu(struct device *dev) 2923 { 2924 struct iommu_device *iommu_dev = dev_to_iommu_device(dev); 2925 2926 return container_of(iommu_dev, struct intel_iommu, iommu); 2927 } 2928 2929 static ssize_t version_show(struct device *dev, 2930 struct device_attribute *attr, char *buf) 2931 { 2932 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 2933 u32 ver = readl(iommu->reg + DMAR_VER_REG); 2934 return sysfs_emit(buf, "%d:%d\n", 2935 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver)); 2936 } 2937 static DEVICE_ATTR_RO(version); 2938 2939 static ssize_t address_show(struct device *dev, 2940 struct device_attribute *attr, char *buf) 2941 { 2942 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 2943 return sysfs_emit(buf, "%llx\n", iommu->reg_phys); 2944 } 2945 static DEVICE_ATTR_RO(address); 2946 2947 static ssize_t cap_show(struct device *dev, 2948 struct device_attribute *attr, char *buf) 2949 { 2950 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 2951 return sysfs_emit(buf, "%llx\n", iommu->cap); 2952 } 2953 static DEVICE_ATTR_RO(cap); 2954 2955 static ssize_t ecap_show(struct device *dev, 2956 struct device_attribute *attr, char *buf) 2957 { 2958 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 2959 return sysfs_emit(buf, "%llx\n", iommu->ecap); 2960 } 2961 static DEVICE_ATTR_RO(ecap); 2962 2963 static ssize_t domains_supported_show(struct device *dev, 2964 struct device_attribute *attr, char *buf) 2965 { 2966 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 2967 return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap)); 2968 } 2969 static DEVICE_ATTR_RO(domains_supported); 2970 2971 static ssize_t domains_used_show(struct device *dev, 2972 struct device_attribute *attr, char *buf) 2973 { 2974 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 2975 return sysfs_emit(buf, "%d\n", 2976 bitmap_weight(iommu->domain_ids, 2977 cap_ndoms(iommu->cap))); 2978 } 2979 static DEVICE_ATTR_RO(domains_used); 2980 2981 static struct attribute *intel_iommu_attrs[] = { 2982 &dev_attr_version.attr, 2983 &dev_attr_address.attr, 2984 &dev_attr_cap.attr, 2985 &dev_attr_ecap.attr, 2986 &dev_attr_domains_supported.attr, 2987 &dev_attr_domains_used.attr, 2988 NULL, 2989 }; 2990 2991 static struct attribute_group intel_iommu_group = { 2992 .name = "intel-iommu", 2993 .attrs = intel_iommu_attrs, 2994 }; 2995 2996 const struct attribute_group *intel_iommu_groups[] = { 2997 &intel_iommu_group, 2998 NULL, 2999 }; 3000 3001 static bool has_external_pci(void) 3002 { 3003 struct pci_dev *pdev = NULL; 3004 3005 for_each_pci_dev(pdev) 3006 if (pdev->external_facing) { 3007 pci_dev_put(pdev); 3008 return true; 3009 } 3010 3011 return false; 3012 } 3013 3014 static int __init platform_optin_force_iommu(void) 3015 { 3016 if (!dmar_platform_optin() || no_platform_optin || !has_external_pci()) 3017 return 0; 3018 3019 if (no_iommu || dmar_disabled) 3020 pr_info("Intel-IOMMU force enabled due to platform opt in\n"); 3021 3022 /* 3023 * If Intel-IOMMU is disabled by default, we will apply identity 3024 * map for 
all devices except those marked as being untrusted. 3025 */ 3026 if (dmar_disabled) 3027 iommu_set_default_passthrough(false); 3028 3029 dmar_disabled = 0; 3030 no_iommu = 0; 3031 3032 return 1; 3033 } 3034 3035 static int __init probe_acpi_namespace_devices(void) 3036 { 3037 struct dmar_drhd_unit *drhd; 3038 /* To avoid a -Wunused-but-set-variable warning. */ 3039 struct intel_iommu *iommu __maybe_unused; 3040 struct device *dev; 3041 int i, ret = 0; 3042 3043 for_each_active_iommu(iommu, drhd) { 3044 for_each_active_dev_scope(drhd->devices, 3045 drhd->devices_cnt, i, dev) { 3046 struct acpi_device_physical_node *pn; 3047 struct acpi_device *adev; 3048 3049 if (dev->bus != &acpi_bus_type) 3050 continue; 3051 3052 up_read(&dmar_global_lock); 3053 adev = to_acpi_device(dev); 3054 mutex_lock(&adev->physical_node_lock); 3055 list_for_each_entry(pn, 3056 &adev->physical_node_list, node) { 3057 ret = iommu_probe_device(pn->dev); 3058 if (ret) 3059 break; 3060 } 3061 mutex_unlock(&adev->physical_node_lock); 3062 down_read(&dmar_global_lock); 3063 3064 if (ret) 3065 return ret; 3066 } 3067 } 3068 3069 return 0; 3070 } 3071 3072 static __init int tboot_force_iommu(void) 3073 { 3074 if (!tboot_enabled()) 3075 return 0; 3076 3077 if (no_iommu || dmar_disabled) 3078 pr_warn("Forcing Intel-IOMMU to enabled\n"); 3079 3080 dmar_disabled = 0; 3081 no_iommu = 0; 3082 3083 return 1; 3084 } 3085 3086 int __init intel_iommu_init(void) 3087 { 3088 int ret = -ENODEV; 3089 struct dmar_drhd_unit *drhd; 3090 struct intel_iommu *iommu; 3091 3092 /* 3093 * Intel IOMMU is required for a TXT/tboot launch or platform 3094 * opt in, so enforce that. 3095 */ 3096 force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) || 3097 platform_optin_force_iommu(); 3098 3099 down_write(&dmar_global_lock); 3100 if (dmar_table_init()) { 3101 if (force_on) 3102 panic("tboot: Failed to initialize DMAR table\n"); 3103 goto out_free_dmar; 3104 } 3105 3106 if (dmar_dev_scope_init() < 0) { 3107 if (force_on) 3108 panic("tboot: Failed to initialize DMAR device scope\n"); 3109 goto out_free_dmar; 3110 } 3111 3112 up_write(&dmar_global_lock); 3113 3114 /* 3115 * The bus notifier takes the dmar_global_lock, so lockdep will 3116 * complain later when we register it under the lock. 3117 */ 3118 dmar_register_bus_notifier(); 3119 3120 down_write(&dmar_global_lock); 3121 3122 if (!no_iommu) 3123 intel_iommu_debugfs_init(); 3124 3125 if (no_iommu || dmar_disabled) { 3126 /* 3127 * We exit the function here to ensure IOMMU's remapping and 3128 * mempool aren't setup, which means that the IOMMU's PMRs 3129 * won't be disabled via the call to init_dmars(). So disable 3130 * it explicitly here. The PMRs were setup by tboot prior to 3131 * calling SENTER, but the kernel is expected to reset/tear 3132 * down the PMRs. 
3133 */ 3134 if (intel_iommu_tboot_noforce) { 3135 for_each_iommu(iommu, drhd) 3136 iommu_disable_protect_mem_regions(iommu); 3137 } 3138 3139 /* 3140 * Make sure the IOMMUs are switched off, even when we 3141 * boot into a kexec kernel and the previous kernel left 3142 * them enabled 3143 */ 3144 intel_disable_iommus(); 3145 goto out_free_dmar; 3146 } 3147 3148 if (list_empty(&dmar_rmrr_units)) 3149 pr_info("No RMRR found\n"); 3150 3151 if (list_empty(&dmar_atsr_units)) 3152 pr_info("No ATSR found\n"); 3153 3154 if (list_empty(&dmar_satc_units)) 3155 pr_info("No SATC found\n"); 3156 3157 init_no_remapping_devices(); 3158 3159 ret = init_dmars(); 3160 if (ret) { 3161 if (force_on) 3162 panic("tboot: Failed to initialize DMARs\n"); 3163 pr_err("Initialization failed\n"); 3164 goto out_free_dmar; 3165 } 3166 up_write(&dmar_global_lock); 3167 3168 init_iommu_pm_ops(); 3169 3170 down_read(&dmar_global_lock); 3171 for_each_active_iommu(iommu, drhd) { 3172 /* 3173 * The flush queue implementation does not perform 3174 * page-selective invalidations that are required for efficient 3175 * TLB flushes in virtual environments. The benefit of batching 3176 * is likely to be much lower than the overhead of synchronizing 3177 * the virtual and physical IOMMU page-tables. 3178 */ 3179 if (cap_caching_mode(iommu->cap) && 3180 !first_level_by_default(iommu)) { 3181 pr_info_once("IOMMU batching disallowed due to virtualization\n"); 3182 iommu_set_dma_strict(); 3183 } 3184 iommu_device_sysfs_add(&iommu->iommu, NULL, 3185 intel_iommu_groups, 3186 "%s", iommu->name); 3187 /* 3188 * The iommu device probe is protected by the iommu_probe_device_lock. 3189 * Release the dmar_global_lock before entering the device probe path 3190 * to avoid unnecessary lock order splat. 3191 */ 3192 up_read(&dmar_global_lock); 3193 iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL); 3194 down_read(&dmar_global_lock); 3195 3196 iommu_pmu_register(iommu); 3197 } 3198 3199 if (probe_acpi_namespace_devices()) 3200 pr_warn("ACPI name space devices didn't probe correctly\n"); 3201 3202 /* Finally, we enable the DMA remapping hardware. */ 3203 for_each_iommu(iommu, drhd) { 3204 if (!drhd->ignored && !translation_pre_enabled(iommu)) 3205 iommu_enable_translation(iommu); 3206 3207 iommu_disable_protect_mem_regions(iommu); 3208 } 3209 up_read(&dmar_global_lock); 3210 3211 pr_info("Intel(R) Virtualization Technology for Directed I/O\n"); 3212 3213 intel_iommu_enabled = 1; 3214 3215 return 0; 3216 3217 out_free_dmar: 3218 intel_iommu_free_dmars(); 3219 up_write(&dmar_global_lock); 3220 return ret; 3221 } 3222 3223 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque) 3224 { 3225 struct device_domain_info *info = opaque; 3226 3227 domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff); 3228 return 0; 3229 } 3230 3231 /* 3232 * NB - intel-iommu lacks any sort of reference counting for the users of 3233 * dependent devices. If multiple endpoints have intersecting dependent 3234 * devices, unbinding the driver from any one of them will possibly leave 3235 * the others unable to operate. 
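 * For example, two endpoints behind the same PCIe-to-PCI bridge share DMA
 * alias entries, so clearing the context for one of them here can leave the
 * other without a usable translation.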
3236 */ 3237 static void domain_context_clear(struct device_domain_info *info) 3238 { 3239 if (!dev_is_pci(info->dev)) { 3240 domain_context_clear_one(info, info->bus, info->devfn); 3241 return; 3242 } 3243 3244 pci_for_each_dma_alias(to_pci_dev(info->dev), 3245 &domain_context_clear_one_cb, info); 3246 iommu_disable_pci_ats(info); 3247 } 3248 3249 /* 3250 * Clear the page table pointer in context or pasid table entries so that 3251 * all DMA requests without PASID from the device are blocked. If the page 3252 * table has been set, clean up the data structures. 3253 */ 3254 void device_block_translation(struct device *dev) 3255 { 3256 struct device_domain_info *info = dev_iommu_priv_get(dev); 3257 struct intel_iommu *iommu = info->iommu; 3258 unsigned long flags; 3259 3260 if (info->domain) 3261 cache_tag_unassign_domain(info->domain, dev, IOMMU_NO_PASID); 3262 3263 if (!dev_is_real_dma_subdevice(dev)) { 3264 if (sm_supported(iommu)) 3265 intel_pasid_tear_down_entry(iommu, dev, 3266 IOMMU_NO_PASID, false); 3267 else 3268 domain_context_clear(info); 3269 } 3270 3271 if (!info->domain) 3272 return; 3273 3274 spin_lock_irqsave(&info->domain->lock, flags); 3275 list_del(&info->link); 3276 spin_unlock_irqrestore(&info->domain->lock, flags); 3277 3278 domain_detach_iommu(info->domain, iommu); 3279 info->domain = NULL; 3280 } 3281 3282 static int blocking_domain_attach_dev(struct iommu_domain *domain, 3283 struct device *dev) 3284 { 3285 device_block_translation(dev); 3286 return 0; 3287 } 3288 3289 static int blocking_domain_set_dev_pasid(struct iommu_domain *domain, 3290 struct device *dev, ioasid_t pasid, 3291 struct iommu_domain *old); 3292 3293 static struct iommu_domain blocking_domain = { 3294 .type = IOMMU_DOMAIN_BLOCKED, 3295 .ops = &(const struct iommu_domain_ops) { 3296 .attach_dev = blocking_domain_attach_dev, 3297 .set_dev_pasid = blocking_domain_set_dev_pasid, 3298 } 3299 }; 3300 3301 static int iommu_superpage_capability(struct intel_iommu *iommu, bool first_stage) 3302 { 3303 if (!intel_iommu_superpage) 3304 return 0; 3305 3306 if (first_stage) 3307 return cap_fl1gp_support(iommu->cap) ? 
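		/*
		 * The value returned is the number of super-page sizes
		 * supported above 4K: 1 means 2M only, 2 means 2M and 1G.
		 * First-stage tables always have 2M and add 1G when the
		 * FL1GP capability is set; second-stage derives it from the
		 * SLLPS capability field via fls() below.  Roughly, the
		 * domain's pgsize_bitmap then grows as (illustrative sketch
		 * of domain_super_pgsize_bitmap(), not its literal code):
		 *
		 *	bitmap = SZ_4K;
		 *	if (iommu_superpage >= 1)
		 *		bitmap |= SZ_2M;
		 *	if (iommu_superpage >= 2)
		 *		bitmap |= SZ_1G;
		 */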
2 : 1; 3308 3309 return fls(cap_super_page_val(iommu->cap)); 3310 } 3311 3312 static struct dmar_domain *paging_domain_alloc(struct device *dev, bool first_stage) 3313 { 3314 struct device_domain_info *info = dev_iommu_priv_get(dev); 3315 struct intel_iommu *iommu = info->iommu; 3316 struct dmar_domain *domain; 3317 int addr_width; 3318 3319 domain = kzalloc(sizeof(*domain), GFP_KERNEL); 3320 if (!domain) 3321 return ERR_PTR(-ENOMEM); 3322 3323 INIT_LIST_HEAD(&domain->devices); 3324 INIT_LIST_HEAD(&domain->dev_pasids); 3325 INIT_LIST_HEAD(&domain->cache_tags); 3326 spin_lock_init(&domain->lock); 3327 spin_lock_init(&domain->cache_lock); 3328 xa_init(&domain->iommu_array); 3329 3330 domain->nid = dev_to_node(dev); 3331 domain->use_first_level = first_stage; 3332 3333 /* calculate the address width */ 3334 addr_width = agaw_to_width(iommu->agaw); 3335 if (addr_width > cap_mgaw(iommu->cap)) 3336 addr_width = cap_mgaw(iommu->cap); 3337 domain->gaw = addr_width; 3338 domain->agaw = iommu->agaw; 3339 domain->max_addr = __DOMAIN_MAX_ADDR(addr_width); 3340 3341 /* iommu memory access coherency */ 3342 domain->iommu_coherency = iommu_paging_structure_coherency(iommu); 3343 3344 /* pagesize bitmap */ 3345 domain->domain.pgsize_bitmap = SZ_4K; 3346 domain->iommu_superpage = iommu_superpage_capability(iommu, first_stage); 3347 domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain); 3348 3349 /* 3350 * IOVA aperture: First-level translation restricts the input-address 3351 * to a canonical address (i.e., address bits 63:N have the same value 3352 * as address bit [N-1], where N is 48-bits with 4-level paging and 3353 * 57-bits with 5-level paging). Hence, skip bit [N-1]. 3354 */ 3355 domain->domain.geometry.force_aperture = true; 3356 domain->domain.geometry.aperture_start = 0; 3357 if (first_stage) 3358 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1); 3359 else 3360 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw); 3361 3362 /* always allocate the top pgd */ 3363 domain->pgd = iommu_alloc_page_node(domain->nid, GFP_KERNEL); 3364 if (!domain->pgd) { 3365 kfree(domain); 3366 return ERR_PTR(-ENOMEM); 3367 } 3368 domain_flush_cache(domain, domain->pgd, PAGE_SIZE); 3369 3370 return domain; 3371 } 3372 3373 static struct iommu_domain * 3374 intel_iommu_domain_alloc_paging_flags(struct device *dev, u32 flags, 3375 const struct iommu_user_data *user_data) 3376 { 3377 struct device_domain_info *info = dev_iommu_priv_get(dev); 3378 bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; 3379 bool nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT; 3380 struct intel_iommu *iommu = info->iommu; 3381 struct dmar_domain *dmar_domain; 3382 struct iommu_domain *domain; 3383 bool first_stage; 3384 3385 if (flags & 3386 (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING))) 3387 return ERR_PTR(-EOPNOTSUPP); 3388 if (nested_parent && !nested_supported(iommu)) 3389 return ERR_PTR(-EOPNOTSUPP); 3390 if (user_data || (dirty_tracking && !ssads_supported(iommu))) 3391 return ERR_PTR(-EOPNOTSUPP); 3392 3393 /* 3394 * Always allocate the guest compatible page table unless 3395 * IOMMU_HWPT_ALLOC_NEST_PARENT or IOMMU_HWPT_ALLOC_DIRTY_TRACKING 3396 * is specified. 
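	 *
	 * Both flags force a second-stage table: a nested parent is the
	 * stage-2 half of a nested configuration, and dirty tracking here
	 * relies on the second-stage accessed/dirty (SSADS) mechanism,
	 * hence the scalable-mode and ecap_slts() checks below.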
3397 */ 3398 if (nested_parent || dirty_tracking) { 3399 if (!sm_supported(iommu) || !ecap_slts(iommu->ecap)) 3400 return ERR_PTR(-EOPNOTSUPP); 3401 first_stage = false; 3402 } else { 3403 first_stage = first_level_by_default(iommu); 3404 } 3405 3406 dmar_domain = paging_domain_alloc(dev, first_stage); 3407 if (IS_ERR(dmar_domain)) 3408 return ERR_CAST(dmar_domain); 3409 domain = &dmar_domain->domain; 3410 domain->type = IOMMU_DOMAIN_UNMANAGED; 3411 domain->owner = &intel_iommu_ops; 3412 domain->ops = intel_iommu_ops.default_domain_ops; 3413 3414 if (nested_parent) { 3415 dmar_domain->nested_parent = true; 3416 INIT_LIST_HEAD(&dmar_domain->s1_domains); 3417 spin_lock_init(&dmar_domain->s1_lock); 3418 } 3419 3420 if (dirty_tracking) { 3421 if (dmar_domain->use_first_level) { 3422 iommu_domain_free(domain); 3423 return ERR_PTR(-EOPNOTSUPP); 3424 } 3425 domain->dirty_ops = &intel_dirty_ops; 3426 } 3427 3428 return domain; 3429 } 3430 3431 static void intel_iommu_domain_free(struct iommu_domain *domain) 3432 { 3433 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 3434 3435 WARN_ON(dmar_domain->nested_parent && 3436 !list_empty(&dmar_domain->s1_domains)); 3437 domain_exit(dmar_domain); 3438 } 3439 3440 int paging_domain_compatible(struct iommu_domain *domain, struct device *dev) 3441 { 3442 struct device_domain_info *info = dev_iommu_priv_get(dev); 3443 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 3444 struct intel_iommu *iommu = info->iommu; 3445 int addr_width; 3446 3447 if (WARN_ON_ONCE(!(domain->type & __IOMMU_DOMAIN_PAGING))) 3448 return -EPERM; 3449 3450 if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap)) 3451 return -EINVAL; 3452 3453 if (domain->dirty_ops && !ssads_supported(iommu)) 3454 return -EINVAL; 3455 3456 if (dmar_domain->iommu_coherency != 3457 iommu_paging_structure_coherency(iommu)) 3458 return -EINVAL; 3459 3460 if (dmar_domain->iommu_superpage != 3461 iommu_superpage_capability(iommu, dmar_domain->use_first_level)) 3462 return -EINVAL; 3463 3464 if (dmar_domain->use_first_level && 3465 (!sm_supported(iommu) || !ecap_flts(iommu->ecap))) 3466 return -EINVAL; 3467 3468 /* check if this iommu agaw is sufficient for max mapped address */ 3469 addr_width = agaw_to_width(iommu->agaw); 3470 if (addr_width > cap_mgaw(iommu->cap)) 3471 addr_width = cap_mgaw(iommu->cap); 3472 3473 if (dmar_domain->gaw > addr_width || dmar_domain->agaw > iommu->agaw) 3474 return -EINVAL; 3475 3476 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) && 3477 context_copied(iommu, info->bus, info->devfn)) 3478 return intel_pasid_setup_sm_context(dev); 3479 3480 return 0; 3481 } 3482 3483 static int intel_iommu_attach_device(struct iommu_domain *domain, 3484 struct device *dev) 3485 { 3486 int ret; 3487 3488 device_block_translation(dev); 3489 3490 ret = paging_domain_compatible(domain, dev); 3491 if (ret) 3492 return ret; 3493 3494 return dmar_domain_attach_device(to_dmar_domain(domain), dev); 3495 } 3496 3497 static int intel_iommu_map(struct iommu_domain *domain, 3498 unsigned long iova, phys_addr_t hpa, 3499 size_t size, int iommu_prot, gfp_t gfp) 3500 { 3501 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 3502 u64 max_addr; 3503 int prot = 0; 3504 3505 if (iommu_prot & IOMMU_READ) 3506 prot |= DMA_PTE_READ; 3507 if (iommu_prot & IOMMU_WRITE) 3508 prot |= DMA_PTE_WRITE; 3509 if (dmar_domain->set_pte_snp) 3510 prot |= DMA_PTE_SNP; 3511 3512 max_addr = iova + size; 3513 if (dmar_domain->max_addr < max_addr) { 3514 u64 end; 3515 3516 /* check if minimum 
agaw is sufficient for mapped address */ 3517 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1; 3518 if (end < max_addr) { 3519 pr_err("%s: iommu width (%d) is not " 3520 "sufficient for the mapped address (%llx)\n", 3521 __func__, dmar_domain->gaw, max_addr); 3522 return -EFAULT; 3523 } 3524 dmar_domain->max_addr = max_addr; 3525 } 3526 /* Round up size to next multiple of PAGE_SIZE, if it and 3527 the low bits of hpa would take us onto the next page */ 3528 size = aligned_nrpages(hpa, size); 3529 return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT, 3530 hpa >> VTD_PAGE_SHIFT, size, prot, gfp); 3531 } 3532 3533 static int intel_iommu_map_pages(struct iommu_domain *domain, 3534 unsigned long iova, phys_addr_t paddr, 3535 size_t pgsize, size_t pgcount, 3536 int prot, gfp_t gfp, size_t *mapped) 3537 { 3538 unsigned long pgshift = __ffs(pgsize); 3539 size_t size = pgcount << pgshift; 3540 int ret; 3541 3542 if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G) 3543 return -EINVAL; 3544 3545 if (!IS_ALIGNED(iova | paddr, pgsize)) 3546 return -EINVAL; 3547 3548 ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp); 3549 if (!ret && mapped) 3550 *mapped = size; 3551 3552 return ret; 3553 } 3554 3555 static size_t intel_iommu_unmap(struct iommu_domain *domain, 3556 unsigned long iova, size_t size, 3557 struct iommu_iotlb_gather *gather) 3558 { 3559 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 3560 unsigned long start_pfn, last_pfn; 3561 int level = 0; 3562 3563 /* Cope with horrid API which requires us to unmap more than the 3564 size argument if it happens to be a large-page mapping. */ 3565 if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, 3566 &level, GFP_ATOMIC))) 3567 return 0; 3568 3569 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level)) 3570 size = VTD_PAGE_SIZE << level_to_offset_bits(level); 3571 3572 start_pfn = iova >> VTD_PAGE_SHIFT; 3573 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT; 3574 3575 domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist); 3576 3577 if (dmar_domain->max_addr == iova + size) 3578 dmar_domain->max_addr = iova; 3579 3580 /* 3581 * We do not use page-selective IOTLB invalidation in flush queue, 3582 * so there is no need to track page and sync iotlb. 
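	 * Only a non-deferred (strict) unmap therefore records the range in
	 * the gather below; queued unmaps are covered by the coarser
	 * invalidation issued when the flush queue is drained.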
3583 */ 3584 if (!iommu_iotlb_gather_queued(gather)) 3585 iommu_iotlb_gather_add_page(domain, gather, iova, size); 3586 3587 return size; 3588 } 3589 3590 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain, 3591 unsigned long iova, 3592 size_t pgsize, size_t pgcount, 3593 struct iommu_iotlb_gather *gather) 3594 { 3595 unsigned long pgshift = __ffs(pgsize); 3596 size_t size = pgcount << pgshift; 3597 3598 return intel_iommu_unmap(domain, iova, size, gather); 3599 } 3600 3601 static void intel_iommu_tlb_sync(struct iommu_domain *domain, 3602 struct iommu_iotlb_gather *gather) 3603 { 3604 cache_tag_flush_range(to_dmar_domain(domain), gather->start, 3605 gather->end, list_empty(&gather->freelist)); 3606 iommu_put_pages_list(&gather->freelist); 3607 } 3608 3609 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain, 3610 dma_addr_t iova) 3611 { 3612 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 3613 struct dma_pte *pte; 3614 int level = 0; 3615 u64 phys = 0; 3616 3617 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level, 3618 GFP_ATOMIC); 3619 if (pte && dma_pte_present(pte)) 3620 phys = dma_pte_addr(pte) + 3621 (iova & (BIT_MASK(level_to_offset_bits(level) + 3622 VTD_PAGE_SHIFT) - 1)); 3623 3624 return phys; 3625 } 3626 3627 static bool domain_support_force_snooping(struct dmar_domain *domain) 3628 { 3629 struct device_domain_info *info; 3630 bool support = true; 3631 3632 assert_spin_locked(&domain->lock); 3633 list_for_each_entry(info, &domain->devices, link) { 3634 if (!ecap_sc_support(info->iommu->ecap)) { 3635 support = false; 3636 break; 3637 } 3638 } 3639 3640 return support; 3641 } 3642 3643 static void domain_set_force_snooping(struct dmar_domain *domain) 3644 { 3645 struct device_domain_info *info; 3646 3647 assert_spin_locked(&domain->lock); 3648 /* 3649 * Second level page table supports per-PTE snoop control. The 3650 * iommu_map() interface will handle this by setting SNP bit. 
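	 * First-stage page tables have no per-PTE snoop-control bit, so for
	 * first-level domains snooping is instead forced through each
	 * attached device's PASID-table entry in the loop below.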
3651 */ 3652 if (!domain->use_first_level) { 3653 domain->set_pte_snp = true; 3654 return; 3655 } 3656 3657 list_for_each_entry(info, &domain->devices, link) 3658 intel_pasid_setup_page_snoop_control(info->iommu, info->dev, 3659 IOMMU_NO_PASID); 3660 } 3661 3662 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain) 3663 { 3664 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 3665 unsigned long flags; 3666 3667 if (dmar_domain->force_snooping) 3668 return true; 3669 3670 spin_lock_irqsave(&dmar_domain->lock, flags); 3671 if (!domain_support_force_snooping(dmar_domain) || 3672 (!dmar_domain->use_first_level && dmar_domain->has_mappings)) { 3673 spin_unlock_irqrestore(&dmar_domain->lock, flags); 3674 return false; 3675 } 3676 3677 domain_set_force_snooping(dmar_domain); 3678 dmar_domain->force_snooping = true; 3679 spin_unlock_irqrestore(&dmar_domain->lock, flags); 3680 3681 return true; 3682 } 3683 3684 static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap) 3685 { 3686 struct device_domain_info *info = dev_iommu_priv_get(dev); 3687 3688 switch (cap) { 3689 case IOMMU_CAP_CACHE_COHERENCY: 3690 case IOMMU_CAP_DEFERRED_FLUSH: 3691 return true; 3692 case IOMMU_CAP_PRE_BOOT_PROTECTION: 3693 return dmar_platform_optin(); 3694 case IOMMU_CAP_ENFORCE_CACHE_COHERENCY: 3695 return ecap_sc_support(info->iommu->ecap); 3696 case IOMMU_CAP_DIRTY_TRACKING: 3697 return ssads_supported(info->iommu); 3698 default: 3699 return false; 3700 } 3701 } 3702 3703 static struct iommu_device *intel_iommu_probe_device(struct device *dev) 3704 { 3705 struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL; 3706 struct device_domain_info *info; 3707 struct intel_iommu *iommu; 3708 u8 bus, devfn; 3709 int ret; 3710 3711 iommu = device_lookup_iommu(dev, &bus, &devfn); 3712 if (!iommu || !iommu->iommu.ops) 3713 return ERR_PTR(-ENODEV); 3714 3715 info = kzalloc(sizeof(*info), GFP_KERNEL); 3716 if (!info) 3717 return ERR_PTR(-ENOMEM); 3718 3719 if (dev_is_real_dma_subdevice(dev)) { 3720 info->bus = pdev->bus->number; 3721 info->devfn = pdev->devfn; 3722 info->segment = pci_domain_nr(pdev->bus); 3723 } else { 3724 info->bus = bus; 3725 info->devfn = devfn; 3726 info->segment = iommu->segment; 3727 } 3728 3729 info->dev = dev; 3730 info->iommu = iommu; 3731 if (dev_is_pci(dev)) { 3732 if (ecap_dev_iotlb_support(iommu->ecap) && 3733 pci_ats_supported(pdev) && 3734 dmar_ats_supported(pdev, iommu)) { 3735 info->ats_supported = 1; 3736 info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev); 3737 3738 /* 3739 * For IOMMU that supports device IOTLB throttling 3740 * (DIT), we assign PFSID to the invalidation desc 3741 * of a VF such that IOMMU HW can gauge queue depth 3742 * at PF level. If DIT is not set, PFSID will be 3743 * treated as reserved, which should be set to 0. 
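			 * info was allocated with kzalloc(), so pfsid already
			 * holds the required 0; only the DIT-capable branch
			 * below overwrites it with the PF's source-id.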
3744 */ 3745 if (ecap_dit(iommu->ecap)) 3746 info->pfsid = pci_dev_id(pci_physfn(pdev)); 3747 info->ats_qdep = pci_ats_queue_depth(pdev); 3748 } 3749 if (sm_supported(iommu)) { 3750 if (pasid_supported(iommu)) { 3751 int features = pci_pasid_features(pdev); 3752 3753 if (features >= 0) 3754 info->pasid_supported = features | 1; 3755 } 3756 3757 if (info->ats_supported && ecap_prs(iommu->ecap) && 3758 pci_pri_supported(pdev)) 3759 info->pri_supported = 1; 3760 } 3761 } 3762 3763 dev_iommu_priv_set(dev, info); 3764 if (pdev && pci_ats_supported(pdev)) { 3765 pci_prepare_ats(pdev, VTD_PAGE_SHIFT); 3766 ret = device_rbtree_insert(iommu, info); 3767 if (ret) 3768 goto free; 3769 } 3770 3771 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) { 3772 ret = intel_pasid_alloc_table(dev); 3773 if (ret) { 3774 dev_err(dev, "PASID table allocation failed\n"); 3775 goto clear_rbtree; 3776 } 3777 3778 if (!context_copied(iommu, info->bus, info->devfn)) { 3779 ret = intel_pasid_setup_sm_context(dev); 3780 if (ret) 3781 goto free_table; 3782 } 3783 } 3784 3785 intel_iommu_debugfs_create_dev(info); 3786 3787 /* 3788 * The PCIe spec, in its wisdom, declares that the behaviour of the 3789 * device is undefined if you enable PASID support after ATS support. 3790 * So always enable PASID support on devices which have it, even if 3791 * we can't yet know if we're ever going to use it. 3792 */ 3793 if (info->pasid_supported && 3794 !pci_enable_pasid(pdev, info->pasid_supported & ~1)) 3795 info->pasid_enabled = 1; 3796 3797 if (sm_supported(iommu)) 3798 iommu_enable_pci_ats(info); 3799 iommu_enable_pci_pri(info); 3800 3801 return &iommu->iommu; 3802 free_table: 3803 intel_pasid_free_table(dev); 3804 clear_rbtree: 3805 device_rbtree_remove(info); 3806 free: 3807 kfree(info); 3808 3809 return ERR_PTR(ret); 3810 } 3811 3812 static void intel_iommu_release_device(struct device *dev) 3813 { 3814 struct device_domain_info *info = dev_iommu_priv_get(dev); 3815 struct intel_iommu *iommu = info->iommu; 3816 3817 iommu_disable_pci_pri(info); 3818 iommu_disable_pci_ats(info); 3819 3820 if (info->pasid_enabled) { 3821 pci_disable_pasid(to_pci_dev(dev)); 3822 info->pasid_enabled = 0; 3823 } 3824 3825 mutex_lock(&iommu->iopf_lock); 3826 if (dev_is_pci(dev) && pci_ats_supported(to_pci_dev(dev))) 3827 device_rbtree_remove(info); 3828 mutex_unlock(&iommu->iopf_lock); 3829 3830 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) && 3831 !context_copied(iommu, info->bus, info->devfn)) 3832 intel_pasid_teardown_sm_context(dev); 3833 3834 intel_pasid_free_table(dev); 3835 intel_iommu_debugfs_remove_dev(info); 3836 kfree(info); 3837 set_dma_ops(dev, NULL); 3838 } 3839 3840 static void intel_iommu_get_resv_regions(struct device *device, 3841 struct list_head *head) 3842 { 3843 int prot = DMA_PTE_READ | DMA_PTE_WRITE; 3844 struct iommu_resv_region *reg; 3845 struct dmar_rmrr_unit *rmrr; 3846 struct device *i_dev; 3847 int i; 3848 3849 rcu_read_lock(); 3850 for_each_rmrr_units(rmrr) { 3851 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt, 3852 i, i_dev) { 3853 struct iommu_resv_region *resv; 3854 enum iommu_resv_type type; 3855 size_t length; 3856 3857 if (i_dev != device && 3858 !is_downstream_to_pci_bridge(device, i_dev)) 3859 continue; 3860 3861 length = rmrr->end_address - rmrr->base_address + 1; 3862 3863 type = device_rmrr_is_relaxable(device) ? 
			    IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;

			resv = iommu_alloc_resv_region(rmrr->base_address,
						       length, prot, type,
						       GFP_ATOMIC);
			if (!resv)
				break;

			list_add_tail(&resv->list, head);
		}
	}
	rcu_read_unlock();

#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
	if (dev_is_pci(device)) {
		struct pci_dev *pdev = to_pci_dev(device);

		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
						      IOMMU_RESV_DIRECT_RELAXABLE,
						      GFP_KERNEL);
			if (reg)
				list_add_tail(&reg->list, head);
		}
	}
#endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */

	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
				      0, IOMMU_RESV_MSI, GFP_KERNEL);
	if (!reg)
		return;
	list_add_tail(&reg->list, head);
}

static struct iommu_group *intel_iommu_device_group(struct device *dev)
{
	if (dev_is_pci(dev))
		return pci_device_group(dev);
	return generic_device_group(dev);
}

int intel_iommu_enable_iopf(struct device *dev)
{
	struct device_domain_info *info = dev_iommu_priv_get(dev);
	struct intel_iommu *iommu = info->iommu;
	int ret;

	if (!info->pri_enabled)
		return -ENODEV;

	if (info->iopf_refcount) {
		info->iopf_refcount++;
		return 0;
	}

	ret = iopf_queue_add_device(iommu->iopf_queue, dev);
	if (ret)
		return ret;

	info->iopf_refcount = 1;

	return 0;
}

void intel_iommu_disable_iopf(struct device *dev)
{
	struct device_domain_info *info = dev_iommu_priv_get(dev);
	struct intel_iommu *iommu = info->iommu;

	if (WARN_ON(!info->pri_enabled || !info->iopf_refcount))
		return;

	if (--info->iopf_refcount)
		return;

	iopf_queue_remove_device(iommu->iopf_queue, dev);
}

static int
intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
{
	switch (feat) {
	case IOMMU_DEV_FEAT_IOPF:
		return intel_iommu_enable_iopf(dev);

	case IOMMU_DEV_FEAT_SVA:
		return 0;

	default:
		return -ENODEV;
	}
}

static int
intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
{
	switch (feat) {
	case IOMMU_DEV_FEAT_IOPF:
		intel_iommu_disable_iopf(dev);
		return 0;

	case IOMMU_DEV_FEAT_SVA:
		return 0;

	default:
		return -ENODEV;
	}
}

static bool intel_iommu_is_attach_deferred(struct device *dev)
{
	struct device_domain_info *info = dev_iommu_priv_get(dev);

	return translation_pre_enabled(info->iommu) && !info->domain;
}

/*
 * Check that the device does not live on an external facing PCI port that is
 * marked as untrusted. Such devices should not be able to apply quirks and
 * thus not be able to bypass the IOMMU restrictions.
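 * (Typically such devices sit below an external-facing port, e.g. a
 * Thunderbolt/USB4 hot-plug slot, for which the PCI core sets
 * pdev->untrusted.)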
3985 */ 3986 static bool risky_device(struct pci_dev *pdev) 3987 { 3988 if (pdev->untrusted) { 3989 pci_info(pdev, 3990 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n", 3991 pdev->vendor, pdev->device); 3992 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n"); 3993 return true; 3994 } 3995 return false; 3996 } 3997 3998 static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain, 3999 unsigned long iova, size_t size) 4000 { 4001 cache_tag_flush_range_np(to_dmar_domain(domain), iova, iova + size - 1); 4002 4003 return 0; 4004 } 4005 4006 void domain_remove_dev_pasid(struct iommu_domain *domain, 4007 struct device *dev, ioasid_t pasid) 4008 { 4009 struct device_domain_info *info = dev_iommu_priv_get(dev); 4010 struct dev_pasid_info *curr, *dev_pasid = NULL; 4011 struct intel_iommu *iommu = info->iommu; 4012 struct dmar_domain *dmar_domain; 4013 unsigned long flags; 4014 4015 if (!domain) 4016 return; 4017 4018 /* Identity domain has no meta data for pasid. */ 4019 if (domain->type == IOMMU_DOMAIN_IDENTITY) 4020 return; 4021 4022 dmar_domain = to_dmar_domain(domain); 4023 spin_lock_irqsave(&dmar_domain->lock, flags); 4024 list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) { 4025 if (curr->dev == dev && curr->pasid == pasid) { 4026 list_del(&curr->link_domain); 4027 dev_pasid = curr; 4028 break; 4029 } 4030 } 4031 spin_unlock_irqrestore(&dmar_domain->lock, flags); 4032 4033 cache_tag_unassign_domain(dmar_domain, dev, pasid); 4034 domain_detach_iommu(dmar_domain, iommu); 4035 if (!WARN_ON_ONCE(!dev_pasid)) { 4036 intel_iommu_debugfs_remove_dev_pasid(dev_pasid); 4037 kfree(dev_pasid); 4038 } 4039 } 4040 4041 static int blocking_domain_set_dev_pasid(struct iommu_domain *domain, 4042 struct device *dev, ioasid_t pasid, 4043 struct iommu_domain *old) 4044 { 4045 struct device_domain_info *info = dev_iommu_priv_get(dev); 4046 4047 intel_pasid_tear_down_entry(info->iommu, dev, pasid, false); 4048 domain_remove_dev_pasid(old, dev, pasid); 4049 4050 return 0; 4051 } 4052 4053 struct dev_pasid_info * 4054 domain_add_dev_pasid(struct iommu_domain *domain, 4055 struct device *dev, ioasid_t pasid) 4056 { 4057 struct device_domain_info *info = dev_iommu_priv_get(dev); 4058 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4059 struct intel_iommu *iommu = info->iommu; 4060 struct dev_pasid_info *dev_pasid; 4061 unsigned long flags; 4062 int ret; 4063 4064 dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL); 4065 if (!dev_pasid) 4066 return ERR_PTR(-ENOMEM); 4067 4068 ret = domain_attach_iommu(dmar_domain, iommu); 4069 if (ret) 4070 goto out_free; 4071 4072 ret = cache_tag_assign_domain(dmar_domain, dev, pasid); 4073 if (ret) 4074 goto out_detach_iommu; 4075 4076 dev_pasid->dev = dev; 4077 dev_pasid->pasid = pasid; 4078 spin_lock_irqsave(&dmar_domain->lock, flags); 4079 list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids); 4080 spin_unlock_irqrestore(&dmar_domain->lock, flags); 4081 4082 return dev_pasid; 4083 out_detach_iommu: 4084 domain_detach_iommu(dmar_domain, iommu); 4085 out_free: 4086 kfree(dev_pasid); 4087 return ERR_PTR(ret); 4088 } 4089 4090 static int intel_iommu_set_dev_pasid(struct iommu_domain *domain, 4091 struct device *dev, ioasid_t pasid, 4092 struct iommu_domain *old) 4093 { 4094 struct device_domain_info *info = dev_iommu_priv_get(dev); 4095 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4096 struct intel_iommu *iommu = info->iommu; 4097 struct dev_pasid_info *dev_pasid; 4098 int ret; 4099 4100 if 
(WARN_ON_ONCE(!(domain->type & __IOMMU_DOMAIN_PAGING))) 4101 return -EINVAL; 4102 4103 if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev)) 4104 return -EOPNOTSUPP; 4105 4106 if (domain->dirty_ops) 4107 return -EINVAL; 4108 4109 if (context_copied(iommu, info->bus, info->devfn)) 4110 return -EBUSY; 4111 4112 ret = paging_domain_compatible(domain, dev); 4113 if (ret) 4114 return ret; 4115 4116 dev_pasid = domain_add_dev_pasid(domain, dev, pasid); 4117 if (IS_ERR(dev_pasid)) 4118 return PTR_ERR(dev_pasid); 4119 4120 if (dmar_domain->use_first_level) 4121 ret = domain_setup_first_level(iommu, dmar_domain, 4122 dev, pasid, old); 4123 else 4124 ret = domain_setup_second_level(iommu, dmar_domain, 4125 dev, pasid, old); 4126 if (ret) 4127 goto out_remove_dev_pasid; 4128 4129 domain_remove_dev_pasid(old, dev, pasid); 4130 4131 intel_iommu_debugfs_create_dev_pasid(dev_pasid); 4132 4133 return 0; 4134 4135 out_remove_dev_pasid: 4136 domain_remove_dev_pasid(domain, dev, pasid); 4137 return ret; 4138 } 4139 4140 static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type) 4141 { 4142 struct device_domain_info *info = dev_iommu_priv_get(dev); 4143 struct intel_iommu *iommu = info->iommu; 4144 struct iommu_hw_info_vtd *vtd; 4145 4146 vtd = kzalloc(sizeof(*vtd), GFP_KERNEL); 4147 if (!vtd) 4148 return ERR_PTR(-ENOMEM); 4149 4150 vtd->flags = IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17; 4151 vtd->cap_reg = iommu->cap; 4152 vtd->ecap_reg = iommu->ecap; 4153 *length = sizeof(*vtd); 4154 *type = IOMMU_HW_INFO_TYPE_INTEL_VTD; 4155 return vtd; 4156 } 4157 4158 /* 4159 * Set dirty tracking for the device list of a domain. The caller must 4160 * hold the domain->lock when calling it. 4161 */ 4162 static int device_set_dirty_tracking(struct list_head *devices, bool enable) 4163 { 4164 struct device_domain_info *info; 4165 int ret = 0; 4166 4167 list_for_each_entry(info, devices, link) { 4168 ret = intel_pasid_setup_dirty_tracking(info->iommu, info->dev, 4169 IOMMU_NO_PASID, enable); 4170 if (ret) 4171 break; 4172 } 4173 4174 return ret; 4175 } 4176 4177 static int parent_domain_set_dirty_tracking(struct dmar_domain *domain, 4178 bool enable) 4179 { 4180 struct dmar_domain *s1_domain; 4181 unsigned long flags; 4182 int ret; 4183 4184 spin_lock(&domain->s1_lock); 4185 list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) { 4186 spin_lock_irqsave(&s1_domain->lock, flags); 4187 ret = device_set_dirty_tracking(&s1_domain->devices, enable); 4188 spin_unlock_irqrestore(&s1_domain->lock, flags); 4189 if (ret) 4190 goto err_unwind; 4191 } 4192 spin_unlock(&domain->s1_lock); 4193 return 0; 4194 4195 err_unwind: 4196 list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) { 4197 spin_lock_irqsave(&s1_domain->lock, flags); 4198 device_set_dirty_tracking(&s1_domain->devices, 4199 domain->dirty_tracking); 4200 spin_unlock_irqrestore(&s1_domain->lock, flags); 4201 } 4202 spin_unlock(&domain->s1_lock); 4203 return ret; 4204 } 4205 4206 static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain, 4207 bool enable) 4208 { 4209 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4210 int ret; 4211 4212 spin_lock(&dmar_domain->lock); 4213 if (dmar_domain->dirty_tracking == enable) 4214 goto out_unlock; 4215 4216 ret = device_set_dirty_tracking(&dmar_domain->devices, enable); 4217 if (ret) 4218 goto err_unwind; 4219 4220 if (dmar_domain->nested_parent) { 4221 ret = parent_domain_set_dirty_tracking(dmar_domain, enable); 4222 if (ret) 4223 goto err_unwind; 4224 } 4225 4226 
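/* Hardware dirty tracking is now consistent for this domain and any nested user domains; record the new state. */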
dmar_domain->dirty_tracking = enable; 4227 out_unlock: 4228 spin_unlock(&dmar_domain->lock); 4229 4230 return 0; 4231 4232 err_unwind: 4233 device_set_dirty_tracking(&dmar_domain->devices, 4234 dmar_domain->dirty_tracking); 4235 spin_unlock(&dmar_domain->lock); 4236 return ret; 4237 } 4238 4239 static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain, 4240 unsigned long iova, size_t size, 4241 unsigned long flags, 4242 struct iommu_dirty_bitmap *dirty) 4243 { 4244 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4245 unsigned long end = iova + size - 1; 4246 unsigned long pgsize; 4247 4248 /* 4249 * IOMMUFD core calls into a dirty tracking disabled domain without an 4250 * IOVA bitmap set in order to clean dirty bits in all PTEs that might 4251 * have occurred when we stopped dirty tracking. This ensures that we 4252 * never inherit dirtied bits from a previous cycle. 4253 */ 4254 if (!dmar_domain->dirty_tracking && dirty->bitmap) 4255 return -EINVAL; 4256 4257 do { 4258 struct dma_pte *pte; 4259 int lvl = 0; 4260 4261 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl, 4262 GFP_ATOMIC); 4263 pgsize = level_size(lvl) << VTD_PAGE_SHIFT; 4264 if (!pte || !dma_pte_present(pte)) { 4265 iova += pgsize; 4266 continue; 4267 } 4268 4269 if (dma_sl_pte_test_and_clear_dirty(pte, flags)) 4270 iommu_dirty_bitmap_record(dirty, iova, pgsize); 4271 iova += pgsize; 4272 } while (iova < end); 4273 4274 return 0; 4275 } 4276 4277 static const struct iommu_dirty_ops intel_dirty_ops = { 4278 .set_dirty_tracking = intel_iommu_set_dirty_tracking, 4279 .read_and_clear_dirty = intel_iommu_read_and_clear_dirty, 4280 }; 4281 4282 static int context_setup_pass_through(struct device *dev, u8 bus, u8 devfn) 4283 { 4284 struct device_domain_info *info = dev_iommu_priv_get(dev); 4285 struct intel_iommu *iommu = info->iommu; 4286 struct context_entry *context; 4287 4288 spin_lock(&iommu->lock); 4289 context = iommu_context_addr(iommu, bus, devfn, 1); 4290 if (!context) { 4291 spin_unlock(&iommu->lock); 4292 return -ENOMEM; 4293 } 4294 4295 if (context_present(context) && !context_copied(iommu, bus, devfn)) { 4296 spin_unlock(&iommu->lock); 4297 return 0; 4298 } 4299 4300 copied_context_tear_down(iommu, context, bus, devfn); 4301 context_clear_entry(context); 4302 context_set_domain_id(context, FLPT_DEFAULT_DID); 4303 4304 /* 4305 * In pass through mode, AW must be programmed to indicate the largest 4306 * AGAW value supported by hardware. And ASR is ignored by hardware. 
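 * (iommu->msagaw, used below, is that maximum supported AGAW value.)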
4307 */ 4308 context_set_address_width(context, iommu->msagaw); 4309 context_set_translation_type(context, CONTEXT_TT_PASS_THROUGH); 4310 context_set_fault_enable(context); 4311 context_set_present(context); 4312 if (!ecap_coherent(iommu->ecap)) 4313 clflush_cache_range(context, sizeof(*context)); 4314 context_present_cache_flush(iommu, FLPT_DEFAULT_DID, bus, devfn); 4315 spin_unlock(&iommu->lock); 4316 4317 return 0; 4318 } 4319 4320 static int context_setup_pass_through_cb(struct pci_dev *pdev, u16 alias, void *data) 4321 { 4322 struct device *dev = data; 4323 4324 return context_setup_pass_through(dev, PCI_BUS_NUM(alias), alias & 0xff); 4325 } 4326 4327 static int device_setup_pass_through(struct device *dev) 4328 { 4329 struct device_domain_info *info = dev_iommu_priv_get(dev); 4330 4331 if (!dev_is_pci(dev)) 4332 return context_setup_pass_through(dev, info->bus, info->devfn); 4333 4334 return pci_for_each_dma_alias(to_pci_dev(dev), 4335 context_setup_pass_through_cb, dev); 4336 } 4337 4338 static int identity_domain_attach_dev(struct iommu_domain *domain, struct device *dev) 4339 { 4340 struct device_domain_info *info = dev_iommu_priv_get(dev); 4341 struct intel_iommu *iommu = info->iommu; 4342 int ret; 4343 4344 device_block_translation(dev); 4345 4346 if (dev_is_real_dma_subdevice(dev)) 4347 return 0; 4348 4349 if (sm_supported(iommu)) 4350 ret = intel_pasid_setup_pass_through(iommu, dev, IOMMU_NO_PASID); 4351 else 4352 ret = device_setup_pass_through(dev); 4353 4354 return ret; 4355 } 4356 4357 static int identity_domain_set_dev_pasid(struct iommu_domain *domain, 4358 struct device *dev, ioasid_t pasid, 4359 struct iommu_domain *old) 4360 { 4361 struct device_domain_info *info = dev_iommu_priv_get(dev); 4362 struct intel_iommu *iommu = info->iommu; 4363 int ret; 4364 4365 if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev)) 4366 return -EOPNOTSUPP; 4367 4368 ret = domain_setup_passthrough(iommu, dev, pasid, old); 4369 if (ret) 4370 return ret; 4371 4372 domain_remove_dev_pasid(old, dev, pasid); 4373 return 0; 4374 } 4375 4376 static struct iommu_domain identity_domain = { 4377 .type = IOMMU_DOMAIN_IDENTITY, 4378 .ops = &(const struct iommu_domain_ops) { 4379 .attach_dev = identity_domain_attach_dev, 4380 .set_dev_pasid = identity_domain_set_dev_pasid, 4381 }, 4382 }; 4383 4384 const struct iommu_ops intel_iommu_ops = { 4385 .blocked_domain = &blocking_domain, 4386 .release_domain = &blocking_domain, 4387 .identity_domain = &identity_domain, 4388 .capable = intel_iommu_capable, 4389 .hw_info = intel_iommu_hw_info, 4390 .domain_alloc_paging_flags = intel_iommu_domain_alloc_paging_flags, 4391 .domain_alloc_sva = intel_svm_domain_alloc, 4392 .domain_alloc_nested = intel_iommu_domain_alloc_nested, 4393 .probe_device = intel_iommu_probe_device, 4394 .release_device = intel_iommu_release_device, 4395 .get_resv_regions = intel_iommu_get_resv_regions, 4396 .device_group = intel_iommu_device_group, 4397 .dev_enable_feat = intel_iommu_dev_enable_feat, 4398 .dev_disable_feat = intel_iommu_dev_disable_feat, 4399 .is_attach_deferred = intel_iommu_is_attach_deferred, 4400 .def_domain_type = device_def_domain_type, 4401 .pgsize_bitmap = SZ_4K, 4402 .page_response = intel_iommu_page_response, 4403 .default_domain_ops = &(const struct iommu_domain_ops) { 4404 .attach_dev = intel_iommu_attach_device, 4405 .set_dev_pasid = intel_iommu_set_dev_pasid, 4406 .map_pages = intel_iommu_map_pages, 4407 .unmap_pages = intel_iommu_unmap_pages, 4408 .iotlb_sync_map = intel_iommu_iotlb_sync_map, 4409 
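/* iotlb_sync_map handles flushing for newly-mapped ranges where hardware may cache non-present entries. */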
.flush_iotlb_all = intel_flush_iotlb_all, 4410 .iotlb_sync = intel_iommu_tlb_sync, 4411 .iova_to_phys = intel_iommu_iova_to_phys, 4412 .free = intel_iommu_domain_free, 4413 .enforce_cache_coherency = intel_iommu_enforce_cache_coherency, 4414 } 4415 }; 4416 4417 static void quirk_iommu_igfx(struct pci_dev *dev) 4418 { 4419 if (risky_device(dev)) 4420 return; 4421 4422 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n"); 4423 disable_igfx_iommu = 1; 4424 } 4425 4426 /* G4x/GM45 integrated gfx dmar support is totally busted. */ 4427 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx); 4428 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx); 4429 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx); 4430 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx); 4431 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx); 4432 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx); 4433 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx); 4434 4435 /* Broadwell igfx malfunctions with dmar */ 4436 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx); 4437 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx); 4438 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx); 4439 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx); 4440 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx); 4441 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx); 4442 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx); 4443 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx); 4444 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx); 4445 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx); 4446 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx); 4447 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx); 4448 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx); 4449 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx); 4450 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx); 4451 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx); 4452 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx); 4453 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx); 4454 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx); 4455 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx); 4456 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx); 4457 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx); 4458 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx); 4459 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx); 4460 4461 static void quirk_iommu_rwbf(struct pci_dev *dev) 4462 { 4463 if (risky_device(dev)) 4464 return; 4465 4466 /* 4467 * Mobile 4 Series Chipset neglects to set RWBF capability, 4468 * but needs it. Same seems to hold for the desktop versions. 
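 * Setting rwbf_quirk makes the driver issue write-buffer flushes even though the capability bit is not reported.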
4469 */ 4470 pci_info(dev, "Forcing write-buffer flush capability\n"); 4471 rwbf_quirk = 1; 4472 } 4473 4474 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf); 4475 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf); 4476 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf); 4477 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf); 4478 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf); 4479 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf); 4480 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf); 4481 4482 #define GGC 0x52 4483 #define GGC_MEMORY_SIZE_MASK (0xf << 8) 4484 #define GGC_MEMORY_SIZE_NONE (0x0 << 8) 4485 #define GGC_MEMORY_SIZE_1M (0x1 << 8) 4486 #define GGC_MEMORY_SIZE_2M (0x3 << 8) 4487 #define GGC_MEMORY_VT_ENABLED (0x8 << 8) 4488 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8) 4489 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8) 4490 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8) 4491 4492 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev) 4493 { 4494 unsigned short ggc; 4495 4496 if (risky_device(dev)) 4497 return; 4498 4499 if (pci_read_config_word(dev, GGC, &ggc)) 4500 return; 4501 4502 if (!(ggc & GGC_MEMORY_VT_ENABLED)) { 4503 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n"); 4504 disable_igfx_iommu = 1; 4505 } else if (!disable_igfx_iommu) { 4506 /* we have to ensure the gfx device is idle before we flush */ 4507 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n"); 4508 iommu_set_dma_strict(); 4509 } 4510 } 4511 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt); 4512 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt); 4513 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt); 4514 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt); 4515 4516 static void quirk_igfx_skip_te_disable(struct pci_dev *dev) 4517 { 4518 unsigned short ver; 4519 4520 if (!IS_GFX_DEVICE(dev)) 4521 return; 4522 4523 ver = (dev->device >> 8) & 0xff; 4524 if (ver != 0x45 && ver != 0x46 && ver != 0x4c && 4525 ver != 0x4e && ver != 0x8a && ver != 0x98 && 4526 ver != 0x9a && ver != 0xa7 && ver != 0x7d) 4527 return; 4528 4529 if (risky_device(dev)) 4530 return; 4531 4532 pci_info(dev, "Skip IOMMU disabling for graphics\n"); 4533 iommu_skip_te_disable = 1; 4534 } 4535 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable); 4536 4537 /* On Tylersburg chipsets, some BIOSes have been known to enable the 4538 ISOCH DMAR unit for the Azalia sound device, but not give it any 4539 TLB entries, which causes it to deadlock. Check for that. We do 4540 this in a function called from init_dmars(), instead of in a PCI 4541 quirk, because we don't want to print the obnoxious "BIOS broken" 4542 message if VT-d is actually disabled. 4543 */ 4544 static void __init check_tylersburg_isoch(void) 4545 { 4546 struct pci_dev *pdev; 4547 uint32_t vtisochctrl; 4548 4549 /* If there's no Azalia in the system anyway, forget it. */ 4550 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL); 4551 if (!pdev) 4552 return; 4553 4554 if (risky_device(pdev)) { 4555 pci_dev_put(pdev); 4556 return; 4557 } 4558 4559 pci_dev_put(pdev); 4560 4561 /* System Management Registers. Might be hidden, in which case 4562 we can't do the sanity check. 
But that's OK, because the 4563 known-broken BIOSes _don't_ actually hide it, so far. */ 4564 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL); 4565 if (!pdev) 4566 return; 4567 4568 if (risky_device(pdev)) { 4569 pci_dev_put(pdev); 4570 return; 4571 } 4572 4573 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) { 4574 pci_dev_put(pdev); 4575 return; 4576 } 4577 4578 pci_dev_put(pdev); 4579 4580 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */ 4581 if (vtisochctrl & 1) 4582 return; 4583 4584 /* Drop all bits other than the number of TLB entries */ 4585 vtisochctrl &= 0x1c; 4586 4587 /* If we have the recommended number of TLB entries (16), fine. */ 4588 if (vtisochctrl == 0x10) 4589 return; 4590 4591 /* Zero TLB entries? You get to ride the short bus to school. */ 4592 if (!vtisochctrl) { 4593 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n" 4594 "BIOS vendor: %s; Ver: %s; Product Version: %s\n", 4595 dmi_get_system_info(DMI_BIOS_VENDOR), 4596 dmi_get_system_info(DMI_BIOS_VERSION), 4597 dmi_get_system_info(DMI_PRODUCT_VERSION)); 4598 iommu_identity_mapping |= IDENTMAP_AZALIA; 4599 return; 4600 } 4601 4602 pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n", 4603 vtisochctrl); 4604 } 4605 4606 /* 4607 * Here we deal with a device TLB defect where device may inadvertently issue ATS 4608 * invalidation completion before posted writes initiated with translated address 4609 * that utilized translations matching the invalidation address range, violating 4610 * the invalidation completion ordering. 4611 * Therefore, any use cases that cannot guarantee DMA is stopped before unmap is 4612 * vulnerable to this defect. In other words, any dTLB invalidation initiated not 4613 * under the control of the trusted/privileged host device driver must use this 4614 * quirk. 4615 * Device TLBs are invalidated under the following six conditions: 4616 * 1. Device driver does DMA API unmap IOVA 4617 * 2. Device driver unbind a PASID from a process, sva_unbind_device() 4618 * 3. PASID is torn down, after PASID cache is flushed. e.g. process 4619 * exit_mmap() due to crash 4620 * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where 4621 * VM has to free pages that were unmapped 4622 * 5. Userspace driver unmaps a DMA buffer 4623 * 6. Cache invalidation in vSVA usage (upcoming) 4624 * 4625 * For #1 and #2, device drivers are responsible for stopping DMA traffic 4626 * before unmap/unbind. For #3, iommu driver gets mmu_notifier to 4627 * invalidate TLB the same way as normal user unmap which will use this quirk. 4628 * The dTLB invalidation after PASID cache flush does not need this quirk. 4629 * 4630 * As a reminder, #6 will *NEED* this quirk as we enable nested translation. 4631 */ 4632 void quirk_extra_dev_tlb_flush(struct device_domain_info *info, 4633 unsigned long address, unsigned long mask, 4634 u32 pasid, u16 qdep) 4635 { 4636 u16 sid; 4637 4638 if (likely(!info->dtlb_extra_inval)) 4639 return; 4640 4641 sid = PCI_DEVID(info->bus, info->devfn); 4642 if (pasid == IOMMU_NO_PASID) { 4643 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid, 4644 qdep, address, mask); 4645 } else { 4646 qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid, 4647 pasid, qdep, address, mask); 4648 } 4649 } 4650 4651 #define ecmd_get_status_code(res) (((res) & 0xff) >> 1) 4652 4653 /* 4654 * Function to submit a command to the enhanced command interface. 
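 * The command is issued and its completion is polled synchronously under the register lock.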
The 4655 * valid enhanced command descriptions are defined in Table 47 of the 4656 * VT-d spec. The VT-d hardware implementation may support some but not 4657 * all commands, which can be determined by checking the Enhanced 4658 * Command Capability Register. 4659 * 4660 * Return values: 4661 * - 0: Command successful without any error; 4662 * - Negative: software error value; 4663 * - Nonzero positive: failure status code defined in Table 48. 4664 */ 4665 int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob) 4666 { 4667 unsigned long flags; 4668 u64 res; 4669 int ret; 4670 4671 if (!cap_ecmds(iommu->cap)) 4672 return -ENODEV; 4673 4674 raw_spin_lock_irqsave(&iommu->register_lock, flags); 4675 4676 res = dmar_readq(iommu->reg + DMAR_ECRSP_REG); 4677 if (res & DMA_ECMD_ECRSP_IP) { 4678 ret = -EBUSY; 4679 goto err; 4680 } 4681 4682 /* 4683 * Unconditionally write the operand B, because 4684 * - There is no side effect if an ecmd doesn't require an 4685 * operand B, but we set the register to some value. 4686 * - It's not invoked in any critical path. The extra MMIO 4687 * write doesn't bring any performance concerns. 4688 */ 4689 dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob); 4690 dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT)); 4691 4692 IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq, 4693 !(res & DMA_ECMD_ECRSP_IP), res); 4694 4695 if (res & DMA_ECMD_ECRSP_IP) { 4696 ret = -ETIMEDOUT; 4697 goto err; 4698 } 4699 4700 ret = ecmd_get_status_code(res); 4701 err: 4702 raw_spin_unlock_irqrestore(&iommu->register_lock, flags); 4703 4704 return ret; 4705 } 4706