1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright © 2006-2014 Intel Corporation. 4 * 5 * Authors: David Woodhouse <dwmw2@infradead.org>, 6 * Ashok Raj <ashok.raj@intel.com>, 7 * Shaohua Li <shaohua.li@intel.com>, 8 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>, 9 * Fenghua Yu <fenghua.yu@intel.com> 10 * Joerg Roedel <jroedel@suse.de> 11 */ 12 13 #define pr_fmt(fmt) "DMAR: " fmt 14 #define dev_fmt(fmt) pr_fmt(fmt) 15 16 #include <linux/crash_dump.h> 17 #include <linux/dma-direct.h> 18 #include <linux/dmi.h> 19 #include <linux/memory.h> 20 #include <linux/pci.h> 21 #include <linux/pci-ats.h> 22 #include <linux/spinlock.h> 23 #include <linux/syscore_ops.h> 24 #include <linux/tboot.h> 25 #include <uapi/linux/iommufd.h> 26 27 #include "iommu.h" 28 #include "../dma-iommu.h" 29 #include "../irq_remapping.h" 30 #include "../iommu-pages.h" 31 #include "pasid.h" 32 #include "cap_audit.h" 33 #include "perfmon.h" 34 35 #define ROOT_SIZE VTD_PAGE_SIZE 36 #define CONTEXT_SIZE VTD_PAGE_SIZE 37 38 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY) 39 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB) 40 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) 41 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e) 42 43 #define IOAPIC_RANGE_START (0xfee00000) 44 #define IOAPIC_RANGE_END (0xfeefffff) 45 #define IOVA_START_ADDR (0x1000) 46 47 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57 48 49 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1) 50 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1) 51 52 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR 53 to match. That way, we can use 'unsigned long' for PFNs with impunity. 
*/ 54 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \ 55 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1)) 56 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT) 57 58 static void __init check_tylersburg_isoch(void); 59 static int rwbf_quirk; 60 61 /* 62 * set to 1 to panic kernel if can't successfully enable VT-d 63 * (used when kernel is launched w/ TXT) 64 */ 65 static int force_on = 0; 66 static int intel_iommu_tboot_noforce; 67 static int no_platform_optin; 68 69 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry)) 70 71 /* 72 * Take a root_entry and return the Lower Context Table Pointer (LCTP) 73 * if marked present. 74 */ 75 static phys_addr_t root_entry_lctp(struct root_entry *re) 76 { 77 if (!(re->lo & 1)) 78 return 0; 79 80 return re->lo & VTD_PAGE_MASK; 81 } 82 83 /* 84 * Take a root_entry and return the Upper Context Table Pointer (UCTP) 85 * if marked present. 86 */ 87 static phys_addr_t root_entry_uctp(struct root_entry *re) 88 { 89 if (!(re->hi & 1)) 90 return 0; 91 92 return re->hi & VTD_PAGE_MASK; 93 } 94 95 static int device_rid_cmp_key(const void *key, const struct rb_node *node) 96 { 97 struct device_domain_info *info = 98 rb_entry(node, struct device_domain_info, node); 99 const u16 *rid_lhs = key; 100 101 if (*rid_lhs < PCI_DEVID(info->bus, info->devfn)) 102 return -1; 103 104 if (*rid_lhs > PCI_DEVID(info->bus, info->devfn)) 105 return 1; 106 107 return 0; 108 } 109 110 static int device_rid_cmp(struct rb_node *lhs, const struct rb_node *rhs) 111 { 112 struct device_domain_info *info = 113 rb_entry(lhs, struct device_domain_info, node); 114 u16 key = PCI_DEVID(info->bus, info->devfn); 115 116 return device_rid_cmp_key(&key, rhs); 117 } 118 119 /* 120 * Looks up an IOMMU-probed device using its source ID. 121 * 122 * Returns the pointer to the device if there is a match. Otherwise, 123 * returns NULL. 
124 * 125 * Note that this helper doesn't guarantee that the device won't be 126 * released by the iommu subsystem after being returned. The caller 127 * should use its own synchronization mechanism to avoid the device 128 * being released during its use if its possibly the case. 129 */ 130 struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid) 131 { 132 struct device_domain_info *info = NULL; 133 struct rb_node *node; 134 unsigned long flags; 135 136 spin_lock_irqsave(&iommu->device_rbtree_lock, flags); 137 node = rb_find(&rid, &iommu->device_rbtree, device_rid_cmp_key); 138 if (node) 139 info = rb_entry(node, struct device_domain_info, node); 140 spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags); 141 142 return info ? info->dev : NULL; 143 } 144 145 static int device_rbtree_insert(struct intel_iommu *iommu, 146 struct device_domain_info *info) 147 { 148 struct rb_node *curr; 149 unsigned long flags; 150 151 spin_lock_irqsave(&iommu->device_rbtree_lock, flags); 152 curr = rb_find_add(&info->node, &iommu->device_rbtree, device_rid_cmp); 153 spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags); 154 if (WARN_ON(curr)) 155 return -EEXIST; 156 157 return 0; 158 } 159 160 static void device_rbtree_remove(struct device_domain_info *info) 161 { 162 struct intel_iommu *iommu = info->iommu; 163 unsigned long flags; 164 165 spin_lock_irqsave(&iommu->device_rbtree_lock, flags); 166 rb_erase(&info->node, &iommu->device_rbtree); 167 spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags); 168 } 169 170 /* 171 * This domain is a statically identity mapping domain. 172 * 1. This domain creats a static 1:1 mapping to all usable memory. 173 * 2. It maps to each iommu if successful. 174 * 3. Each iommu mapps to this domain if successful. 
 */
static struct dmar_domain *si_domain;
static int hw_pass_through = 1;

/* One reserved memory region reported by firmware, with its device scope. */
struct dmar_rmrr_unit {
	struct list_head list;		/* list of rmrr units	*/
	struct acpi_dmar_header *hdr;	/* ACPI header		*/
	u64	base_address;		/* reserved base address*/
	u64	end_address;		/* reserved end address */
	struct dmar_dev_scope *devices;	/* target devices */
	int	devices_cnt;		/* target device count */
};

/* One ATSR unit reported by firmware, with its device scope. */
struct dmar_atsr_unit {
	struct list_head list;		/* list of ATSR units */
	struct acpi_dmar_header *hdr;	/* ACPI header */
	struct dmar_dev_scope *devices;	/* target devices */
	int devices_cnt;		/* target device count */
	u8 include_all:1;		/* include all ports */
};

/* One SATC unit reported by firmware, with its device scope. */
struct dmar_satc_unit {
	struct list_head list;		/* list of SATC units */
	struct acpi_dmar_header *hdr;	/* ACPI header */
	struct dmar_dev_scope *devices;	/* target devices */
	struct intel_iommu *iommu;	/* the corresponding iommu */
	int devices_cnt;		/* target device count */
	u8 atc_required:1;		/* ATS is required */
};

static LIST_HEAD(dmar_atsr_units);
static LIST_HEAD(dmar_rmrr_units);
static LIST_HEAD(dmar_satc_units);

/* Iterate over every entry on the dmar_rmrr_units list. */
#define for_each_rmrr_units(rmrr) \
	list_for_each_entry(rmrr, &dmar_rmrr_units, list)

static void intel_iommu_domain_free(struct iommu_domain *domain);

/* Build-time defaults; both can be overridden by intel_iommu= below. */
int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);

int intel_iommu_enabled = 0;
EXPORT_SYMBOL_GPL(intel_iommu_enabled);

static int intel_iommu_superpage = 1;
static int iommu_identity_mapping;
static int iommu_skip_te_disable;
static int disable_igfx_iommu;

#define IDENTMAP_AZALIA		4

const struct iommu_ops intel_iommu_ops;
static const struct iommu_dirty_ops intel_dirty_ops;

/* True if VTD_FLAG_TRANS_PRE_ENABLED is set in the IOMMU's flags. */
static bool translation_pre_enabled(struct intel_iommu *iommu)
{
	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
}

/* Clear VTD_FLAG_TRANS_PRE_ENABLED in the IOMMU's flags. */
static void clear_translation_pre_enabled(struct intel_iommu *iommu)
{
	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
}

/*
 * Read the global status register and record in iommu->flags whether
 * DMA_GSTS_TES was already set when we looked at the hardware.
 */
static void init_translation_status(struct intel_iommu *iommu)
{
	u32 gsts;

	gsts = readl(iommu->reg + DMAR_GSTS_REG);
	if (gsts & DMA_GSTS_TES)
		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
}

/*
 * Parse the "intel_iommu=" kernel command line parameter.
 *
 * @str is a comma-separated option list. Each option is recognized by a
 * strncmp() prefix match, so both the order of the tests and the compared
 * lengths matter. Options that are not recognized are reported and
 * skipped.
 *
 * Returns -EINVAL if @str is NULL, otherwise 1.
 */
static int __init intel_iommu_setup(char *str)
{
	if (!str)
		return -EINVAL;

	while (*str) {
		if (!strncmp(str, "on", 2)) {
			dmar_disabled = 0;
			pr_info("IOMMU enabled\n");
		} else if (!strncmp(str, "off", 3)) {
			dmar_disabled = 1;
			no_platform_optin = 1;
			pr_info("IOMMU disabled\n");
		} else if (!strncmp(str, "igfx_off", 8)) {
			disable_igfx_iommu = 1;
			pr_info("Disable GFX device mapping\n");
		} else if (!strncmp(str, "forcedac", 8)) {
			pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
			iommu_dma_forcedac = true;
		} else if (!strncmp(str, "strict", 6)) {
			pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
			iommu_set_dma_strict();
		} else if (!strncmp(str, "sp_off", 6)) {
			pr_info("Disable supported super page\n");
			intel_iommu_superpage = 0;
		} else if (!strncmp(str, "sm_on", 5)) {
			pr_info("Enable scalable mode if hardware supports\n");
			intel_iommu_sm = 1;
		} else if (!strncmp(str, "sm_off", 6)) {
			pr_info("Scalable mode is disallowed\n");
			intel_iommu_sm = 0;
		} else if (!strncmp(str, "tboot_noforce", 13)) {
			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
			intel_iommu_tboot_noforce = 1;
		} else {
			pr_notice("Unknown option - '%s'\n", str);
		}

		/* Advance to the next option, skipping any run of commas. */
		str += strcspn(str, ",");
		while (*str == ',')
			str++;
	}

	return 1;
}
__setup("intel_iommu=", intel_iommu_setup);

/* Non-zero if @domain is an IOMMU_DOMAIN_IDENTITY domain. */
static int domain_type_is_si(struct dmar_domain *domain)
{
	return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
}

/*
 * Non-zero if @pfn is representable in the domain's address width, i.e.
 * the width (in page frames) is at least BITS_PER_LONG or @pfn has no
 * bits set at or above that width.
 */
static int domain_pfn_supported(struct dmar_domain *domain, unsigned long pfn)
{
	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;

	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
}

/*
 * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
 * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
 * the returned SAGAW.
 */
static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
{
	unsigned long fl_sagaw, sl_sagaw;

	/* First level: BIT(2) always, BIT(3) only with cap_fl5lp_support(). */
	fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
	sl_sagaw = cap_sagaw(iommu->cap);

	/* Second level only. */
	if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
		return sl_sagaw;

	/* First level only. */
	if (!ecap_slts(iommu->ecap))
		return fl_sagaw;

	/* Both levels usable: only widths supported by both qualify. */
	return fl_sagaw & sl_sagaw;
}

/*
 * Return the largest agaw not exceeding width_to_agaw(@max_gaw) whose bit
 * is set in the IOMMU's supported mask. The loop runs down to 0, so -1 is
 * returned when no supported value is found.
 */
static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
{
	unsigned long sagaw;
	int agaw;

	sagaw = __iommu_calculate_sagaw(iommu);
	for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
		if (test_bit(agaw, &sagaw))
			break;
	}

	return agaw;
}

/*
 * Calculate max SAGAW for each iommu.
 */
int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
{
	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
}

/*
 * calculate agaw for each iommu.
355 * "SAGAW" may be different across iommus, use a default agaw, and 356 * get a supported less agaw for iommus that don't support the default agaw. 357 */ 358 int iommu_calculate_agaw(struct intel_iommu *iommu) 359 { 360 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH); 361 } 362 363 static bool iommu_paging_structure_coherency(struct intel_iommu *iommu) 364 { 365 return sm_supported(iommu) ? 366 ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap); 367 } 368 369 static void domain_update_iommu_coherency(struct dmar_domain *domain) 370 { 371 struct iommu_domain_info *info; 372 struct dmar_drhd_unit *drhd; 373 struct intel_iommu *iommu; 374 bool found = false; 375 unsigned long i; 376 377 domain->iommu_coherency = true; 378 xa_for_each(&domain->iommu_array, i, info) { 379 found = true; 380 if (!iommu_paging_structure_coherency(info->iommu)) { 381 domain->iommu_coherency = false; 382 break; 383 } 384 } 385 if (found) 386 return; 387 388 /* No hardware attached; use lowest common denominator */ 389 rcu_read_lock(); 390 for_each_active_iommu(iommu, drhd) { 391 if (!iommu_paging_structure_coherency(iommu)) { 392 domain->iommu_coherency = false; 393 break; 394 } 395 } 396 rcu_read_unlock(); 397 } 398 399 static int domain_update_iommu_superpage(struct dmar_domain *domain, 400 struct intel_iommu *skip) 401 { 402 struct dmar_drhd_unit *drhd; 403 struct intel_iommu *iommu; 404 int mask = 0x3; 405 406 if (!intel_iommu_superpage) 407 return 0; 408 409 /* set iommu_superpage to the smallest common denominator */ 410 rcu_read_lock(); 411 for_each_active_iommu(iommu, drhd) { 412 if (iommu != skip) { 413 if (domain && domain->use_first_level) { 414 if (!cap_fl1gp_support(iommu->cap)) 415 mask = 0x1; 416 } else { 417 mask &= cap_super_page_val(iommu->cap); 418 } 419 420 if (!mask) 421 break; 422 } 423 } 424 rcu_read_unlock(); 425 426 return fls(mask); 427 } 428 429 static int domain_update_device_node(struct dmar_domain *domain) 430 { 431 struct 
device_domain_info *info; 432 int nid = NUMA_NO_NODE; 433 unsigned long flags; 434 435 spin_lock_irqsave(&domain->lock, flags); 436 list_for_each_entry(info, &domain->devices, link) { 437 /* 438 * There could possibly be multiple device numa nodes as devices 439 * within the same domain may sit behind different IOMMUs. There 440 * isn't perfect answer in such situation, so we select first 441 * come first served policy. 442 */ 443 nid = dev_to_node(info->dev); 444 if (nid != NUMA_NO_NODE) 445 break; 446 } 447 spin_unlock_irqrestore(&domain->lock, flags); 448 449 return nid; 450 } 451 452 /* Return the super pagesize bitmap if supported. */ 453 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain) 454 { 455 unsigned long bitmap = 0; 456 457 /* 458 * 1-level super page supports page size of 2MiB, 2-level super page 459 * supports page size of both 2MiB and 1GiB. 460 */ 461 if (domain->iommu_superpage == 1) 462 bitmap |= SZ_2M; 463 else if (domain->iommu_superpage == 2) 464 bitmap |= SZ_2M | SZ_1G; 465 466 return bitmap; 467 } 468 469 /* Some capabilities may be different across iommus */ 470 void domain_update_iommu_cap(struct dmar_domain *domain) 471 { 472 domain_update_iommu_coherency(domain); 473 domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL); 474 475 /* 476 * If RHSA is missing, we should default to the device numa domain 477 * as fall back. 478 */ 479 if (domain->nid == NUMA_NO_NODE) 480 domain->nid = domain_update_device_node(domain); 481 482 /* 483 * First-level translation restricts the input-address to a 484 * canonical address (i.e., address bits 63:N have the same 485 * value as address bit [N-1], where N is 48-bits with 4-level 486 * paging and 57-bits with 5-level paging). Hence, skip bit 487 * [N-1]. 
488 */ 489 if (domain->use_first_level) 490 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1); 491 else 492 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw); 493 494 domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain); 495 domain_update_iotlb(domain); 496 } 497 498 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus, 499 u8 devfn, int alloc) 500 { 501 struct root_entry *root = &iommu->root_entry[bus]; 502 struct context_entry *context; 503 u64 *entry; 504 505 /* 506 * Except that the caller requested to allocate a new entry, 507 * returning a copied context entry makes no sense. 508 */ 509 if (!alloc && context_copied(iommu, bus, devfn)) 510 return NULL; 511 512 entry = &root->lo; 513 if (sm_supported(iommu)) { 514 if (devfn >= 0x80) { 515 devfn -= 0x80; 516 entry = &root->hi; 517 } 518 devfn *= 2; 519 } 520 if (*entry & 1) 521 context = phys_to_virt(*entry & VTD_PAGE_MASK); 522 else { 523 unsigned long phy_addr; 524 if (!alloc) 525 return NULL; 526 527 context = iommu_alloc_page_node(iommu->node, GFP_ATOMIC); 528 if (!context) 529 return NULL; 530 531 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE); 532 phy_addr = virt_to_phys((void *)context); 533 *entry = phy_addr | 1; 534 __iommu_flush_cache(iommu, entry, sizeof(*entry)); 535 } 536 return &context[devfn]; 537 } 538 539 /** 540 * is_downstream_to_pci_bridge - test if a device belongs to the PCI 541 * sub-hierarchy of a candidate PCI-PCI bridge 542 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy 543 * @bridge: the candidate PCI-PCI bridge 544 * 545 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false. 
546 */ 547 static bool 548 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge) 549 { 550 struct pci_dev *pdev, *pbridge; 551 552 if (!dev_is_pci(dev) || !dev_is_pci(bridge)) 553 return false; 554 555 pdev = to_pci_dev(dev); 556 pbridge = to_pci_dev(bridge); 557 558 if (pbridge->subordinate && 559 pbridge->subordinate->number <= pdev->bus->number && 560 pbridge->subordinate->busn_res.end >= pdev->bus->number) 561 return true; 562 563 return false; 564 } 565 566 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev) 567 { 568 struct dmar_drhd_unit *drhd; 569 u32 vtbar; 570 int rc; 571 572 /* We know that this device on this chipset has its own IOMMU. 573 * If we find it under a different IOMMU, then the BIOS is lying 574 * to us. Hope that the IOMMU for this device is actually 575 * disabled, and it needs no translation... 576 */ 577 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar); 578 if (rc) { 579 /* "can't" happen */ 580 dev_info(&pdev->dev, "failed to run vt-d quirk\n"); 581 return false; 582 } 583 vtbar &= 0xffff0000; 584 585 /* we know that the this iommu should be at offset 0xa000 from vtbar */ 586 drhd = dmar_find_matched_drhd_unit(pdev); 587 if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) { 588 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"); 589 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); 590 return true; 591 } 592 593 return false; 594 } 595 596 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev) 597 { 598 if (!iommu || iommu->drhd->ignored) 599 return true; 600 601 if (dev_is_pci(dev)) { 602 struct pci_dev *pdev = to_pci_dev(dev); 603 604 if (pdev->vendor == PCI_VENDOR_ID_INTEL && 605 pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB && 606 quirk_ioat_snb_local_iommu(pdev)) 607 return true; 608 } 609 610 return false; 611 } 612 613 static struct intel_iommu *device_lookup_iommu(struct device *dev, u8 *bus, u8 *devfn) 614 { 
615 struct dmar_drhd_unit *drhd = NULL; 616 struct pci_dev *pdev = NULL; 617 struct intel_iommu *iommu; 618 struct device *tmp; 619 u16 segment = 0; 620 int i; 621 622 if (!dev) 623 return NULL; 624 625 if (dev_is_pci(dev)) { 626 struct pci_dev *pf_pdev; 627 628 pdev = pci_real_dma_dev(to_pci_dev(dev)); 629 630 /* VFs aren't listed in scope tables; we need to look up 631 * the PF instead to find the IOMMU. */ 632 pf_pdev = pci_physfn(pdev); 633 dev = &pf_pdev->dev; 634 segment = pci_domain_nr(pdev->bus); 635 } else if (has_acpi_companion(dev)) 636 dev = &ACPI_COMPANION(dev)->dev; 637 638 rcu_read_lock(); 639 for_each_iommu(iommu, drhd) { 640 if (pdev && segment != drhd->segment) 641 continue; 642 643 for_each_active_dev_scope(drhd->devices, 644 drhd->devices_cnt, i, tmp) { 645 if (tmp == dev) { 646 /* For a VF use its original BDF# not that of the PF 647 * which we used for the IOMMU lookup. Strictly speaking 648 * we could do this for all PCI devices; we only need to 649 * get the BDF# from the scope table for ACPI matches. 
*/ 650 if (pdev && pdev->is_virtfn) 651 goto got_pdev; 652 653 if (bus && devfn) { 654 *bus = drhd->devices[i].bus; 655 *devfn = drhd->devices[i].devfn; 656 } 657 goto out; 658 } 659 660 if (is_downstream_to_pci_bridge(dev, tmp)) 661 goto got_pdev; 662 } 663 664 if (pdev && drhd->include_all) { 665 got_pdev: 666 if (bus && devfn) { 667 *bus = pdev->bus->number; 668 *devfn = pdev->devfn; 669 } 670 goto out; 671 } 672 } 673 iommu = NULL; 674 out: 675 if (iommu_is_dummy(iommu, dev)) 676 iommu = NULL; 677 678 rcu_read_unlock(); 679 680 return iommu; 681 } 682 683 static void domain_flush_cache(struct dmar_domain *domain, 684 void *addr, int size) 685 { 686 if (!domain->iommu_coherency) 687 clflush_cache_range(addr, size); 688 } 689 690 static void free_context_table(struct intel_iommu *iommu) 691 { 692 struct context_entry *context; 693 int i; 694 695 if (!iommu->root_entry) 696 return; 697 698 for (i = 0; i < ROOT_ENTRY_NR; i++) { 699 context = iommu_context_addr(iommu, i, 0, 0); 700 if (context) 701 iommu_free_page(context); 702 703 if (!sm_supported(iommu)) 704 continue; 705 706 context = iommu_context_addr(iommu, i, 0x80, 0); 707 if (context) 708 iommu_free_page(context); 709 } 710 711 iommu_free_page(iommu->root_entry); 712 iommu->root_entry = NULL; 713 } 714 715 #ifdef CONFIG_DMAR_DEBUG 716 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn, 717 u8 bus, u8 devfn, struct dma_pte *parent, int level) 718 { 719 struct dma_pte *pte; 720 int offset; 721 722 while (1) { 723 offset = pfn_level_offset(pfn, level); 724 pte = &parent[offset]; 725 if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) { 726 pr_info("PTE not present at level %d\n", level); 727 break; 728 } 729 730 pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val); 731 732 if (level == 1) 733 break; 734 735 parent = phys_to_virt(dma_pte_addr(pte)); 736 level--; 737 } 738 } 739 740 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id, 741 unsigned long 
long addr, u32 pasid) 742 { 743 struct pasid_dir_entry *dir, *pde; 744 struct pasid_entry *entries, *pte; 745 struct context_entry *ctx_entry; 746 struct root_entry *rt_entry; 747 int i, dir_index, index, level; 748 u8 devfn = source_id & 0xff; 749 u8 bus = source_id >> 8; 750 struct dma_pte *pgtable; 751 752 pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr); 753 754 /* root entry dump */ 755 rt_entry = &iommu->root_entry[bus]; 756 if (!rt_entry) { 757 pr_info("root table entry is not present\n"); 758 return; 759 } 760 761 if (sm_supported(iommu)) 762 pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n", 763 rt_entry->hi, rt_entry->lo); 764 else 765 pr_info("root entry: 0x%016llx", rt_entry->lo); 766 767 /* context entry dump */ 768 ctx_entry = iommu_context_addr(iommu, bus, devfn, 0); 769 if (!ctx_entry) { 770 pr_info("context table entry is not present\n"); 771 return; 772 } 773 774 pr_info("context entry: hi 0x%016llx, low 0x%016llx\n", 775 ctx_entry->hi, ctx_entry->lo); 776 777 /* legacy mode does not require PASID entries */ 778 if (!sm_supported(iommu)) { 779 level = agaw_to_level(ctx_entry->hi & 7); 780 pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK); 781 goto pgtable_walk; 782 } 783 784 /* get the pointer to pasid directory entry */ 785 dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK); 786 if (!dir) { 787 pr_info("pasid directory entry is not present\n"); 788 return; 789 } 790 /* For request-without-pasid, get the pasid from context entry */ 791 if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID) 792 pasid = IOMMU_NO_PASID; 793 794 dir_index = pasid >> PASID_PDE_SHIFT; 795 pde = &dir[dir_index]; 796 pr_info("pasid dir entry: 0x%016llx\n", pde->val); 797 798 /* get the pointer to the pasid table entry */ 799 entries = get_pasid_table_from_pde(pde); 800 if (!entries) { 801 pr_info("pasid table entry is not present\n"); 802 return; 803 } 804 index = pasid & PASID_PTE_MASK; 805 pte = &entries[index]; 806 for (i = 0; i < 
ARRAY_SIZE(pte->val); i++) 807 pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]); 808 809 if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) { 810 level = pte->val[2] & BIT_ULL(2) ? 5 : 4; 811 pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK); 812 } else { 813 level = agaw_to_level((pte->val[0] >> 2) & 0x7); 814 pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK); 815 } 816 817 pgtable_walk: 818 pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level); 819 } 820 #endif 821 822 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain, 823 unsigned long pfn, int *target_level, 824 gfp_t gfp) 825 { 826 struct dma_pte *parent, *pte; 827 int level = agaw_to_level(domain->agaw); 828 int offset; 829 830 if (!domain_pfn_supported(domain, pfn)) 831 /* Address beyond IOMMU's addressing capabilities. */ 832 return NULL; 833 834 parent = domain->pgd; 835 836 while (1) { 837 void *tmp_page; 838 839 offset = pfn_level_offset(pfn, level); 840 pte = &parent[offset]; 841 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte))) 842 break; 843 if (level == *target_level) 844 break; 845 846 if (!dma_pte_present(pte)) { 847 uint64_t pteval, tmp; 848 849 tmp_page = iommu_alloc_page_node(domain->nid, gfp); 850 851 if (!tmp_page) 852 return NULL; 853 854 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE); 855 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE; 856 if (domain->use_first_level) 857 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS; 858 859 tmp = 0ULL; 860 if (!try_cmpxchg64(&pte->val, &tmp, pteval)) 861 /* Someone else set it while we were thinking; use theirs. 
*/ 862 iommu_free_page(tmp_page); 863 else 864 domain_flush_cache(domain, pte, sizeof(*pte)); 865 } 866 if (level == 1) 867 break; 868 869 parent = phys_to_virt(dma_pte_addr(pte)); 870 level--; 871 } 872 873 if (!*target_level) 874 *target_level = level; 875 876 return pte; 877 } 878 879 /* return address's pte at specific level */ 880 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain, 881 unsigned long pfn, 882 int level, int *large_page) 883 { 884 struct dma_pte *parent, *pte; 885 int total = agaw_to_level(domain->agaw); 886 int offset; 887 888 parent = domain->pgd; 889 while (level <= total) { 890 offset = pfn_level_offset(pfn, total); 891 pte = &parent[offset]; 892 if (level == total) 893 return pte; 894 895 if (!dma_pte_present(pte)) { 896 *large_page = total; 897 break; 898 } 899 900 if (dma_pte_superpage(pte)) { 901 *large_page = total; 902 return pte; 903 } 904 905 parent = phys_to_virt(dma_pte_addr(pte)); 906 total--; 907 } 908 return NULL; 909 } 910 911 /* clear last level pte, a tlb flush should be followed */ 912 static void dma_pte_clear_range(struct dmar_domain *domain, 913 unsigned long start_pfn, 914 unsigned long last_pfn) 915 { 916 unsigned int large_page; 917 struct dma_pte *first_pte, *pte; 918 919 if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) || 920 WARN_ON(start_pfn > last_pfn)) 921 return; 922 923 /* we don't need lock here; nobody else touches the iova range */ 924 do { 925 large_page = 1; 926 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page); 927 if (!pte) { 928 start_pfn = align_to_level(start_pfn + 1, large_page + 1); 929 continue; 930 } 931 do { 932 dma_clear_pte(pte); 933 start_pfn += lvl_to_nr_pages(large_page); 934 pte++; 935 } while (start_pfn <= last_pfn && !first_pte_in_page(pte)); 936 937 domain_flush_cache(domain, first_pte, 938 (void *)pte - (void *)first_pte); 939 940 } while (start_pfn && start_pfn <= last_pfn); 941 } 942 943 static void dma_pte_free_level(struct dmar_domain 
*domain, int level, 944 int retain_level, struct dma_pte *pte, 945 unsigned long pfn, unsigned long start_pfn, 946 unsigned long last_pfn) 947 { 948 pfn = max(start_pfn, pfn); 949 pte = &pte[pfn_level_offset(pfn, level)]; 950 951 do { 952 unsigned long level_pfn; 953 struct dma_pte *level_pte; 954 955 if (!dma_pte_present(pte) || dma_pte_superpage(pte)) 956 goto next; 957 958 level_pfn = pfn & level_mask(level); 959 level_pte = phys_to_virt(dma_pte_addr(pte)); 960 961 if (level > 2) { 962 dma_pte_free_level(domain, level - 1, retain_level, 963 level_pte, level_pfn, start_pfn, 964 last_pfn); 965 } 966 967 /* 968 * Free the page table if we're below the level we want to 969 * retain and the range covers the entire table. 970 */ 971 if (level < retain_level && !(start_pfn > level_pfn || 972 last_pfn < level_pfn + level_size(level) - 1)) { 973 dma_clear_pte(pte); 974 domain_flush_cache(domain, pte, sizeof(*pte)); 975 iommu_free_page(level_pte); 976 } 977 next: 978 pfn += level_size(level); 979 } while (!first_pte_in_page(++pte) && pfn <= last_pfn); 980 } 981 982 /* 983 * clear last level (leaf) ptes and free page table pages below the 984 * level we wish to keep intact. 985 */ 986 static void dma_pte_free_pagetable(struct dmar_domain *domain, 987 unsigned long start_pfn, 988 unsigned long last_pfn, 989 int retain_level) 990 { 991 dma_pte_clear_range(domain, start_pfn, last_pfn); 992 993 /* We don't need lock here; nobody else touches the iova range */ 994 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level, 995 domain->pgd, 0, start_pfn, last_pfn); 996 997 /* free pgd */ 998 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) { 999 iommu_free_page(domain->pgd); 1000 domain->pgd = NULL; 1001 } 1002 } 1003 1004 /* When a page at a given level is being unlinked from its parent, we don't 1005 need to *modify* it at all. 
   All we need to do is make a list of all the
   pages which can be freed just as soon as we've flushed the IOTLB and we
   know the hardware page-walk will no longer touch them.
   The 'pte' argument is the *parent* PTE, pointing to the page that is to
   be freed. */
static void dma_pte_list_pagetables(struct dmar_domain *domain,
				    int level, struct dma_pte *pte,
				    struct list_head *freelist)
{
	struct page *pg;

	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
	list_add_tail(&pg->lru, freelist);

	/* A level 1 table has no further tables below it. */
	if (level == 1)
		return;

	/* Queue every table referenced from this page, skipping superpages. */
	pte = page_address(pg);
	do {
		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
			dma_pte_list_pagetables(domain, level - 1, pte, freelist);
		pte++;
	} while (!first_pte_in_page(pte));
}

/*
 * Clear the entries of the table @pte at @level that fall within
 * [@start_pfn, @last_pfn]. @pfn is the first page frame number the table
 * covers. Tables that become unreferenced are queued on @freelist rather
 * than freed here.
 */
static void dma_pte_clear_level(struct dmar_domain *domain, int level,
				struct dma_pte *pte, unsigned long pfn,
				unsigned long start_pfn, unsigned long last_pfn,
				struct list_head *freelist)
{
	struct dma_pte *first_pte = NULL, *last_pte = NULL;

	pfn = max(start_pfn, pfn);
	pte = &pte[pfn_level_offset(pfn, level)];

	do {
		unsigned long level_pfn = pfn & level_mask(level);

		if (!dma_pte_present(pte))
			goto next;

		/* If range covers entire pagetable, free it */
		if (start_pfn <= level_pfn &&
		    last_pfn >= level_pfn + level_size(level) - 1) {
			/* These subordinate page tables are going away entirely. Don't
			   bother to clear them; we're just going to *free* them. */
			if (level > 1 && !dma_pte_superpage(pte))
				dma_pte_list_pagetables(domain, level - 1, pte, freelist);

			dma_clear_pte(pte);
			/* Remember the span of cleared entries for one flush below. */
			if (!first_pte)
				first_pte = pte;
			last_pte = pte;
		} else if (level > 1) {
			/* Recurse down into a level that isn't *entirely* obsolete */
			dma_pte_clear_level(domain, level - 1,
					    phys_to_virt(dma_pte_addr(pte)),
					    level_pfn, start_pfn, last_pfn,
					    freelist);
		}
next:
		pfn = level_pfn + level_size(level);
	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);

	if (first_pte)
		domain_flush_cache(domain, first_pte,
				   (void *)++last_pte - (void *)first_pte);
}

/* We can't just free the pages because the IOMMU may still be walking
   the page tables, and may have cached the intermediate levels. The
   pages can only be freed after the IOTLB flush has been done. */
static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
			 unsigned long last_pfn, struct list_head *freelist)
{
	if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
	    WARN_ON(start_pfn > last_pfn))
		return;

	/* we don't need lock here; nobody else touches the iova range */
	dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
			    domain->pgd, 0, start_pfn, last_pfn, freelist);

	/* free pgd */
	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
		struct page *pgd_page = virt_to_page(domain->pgd);
		list_add_tail(&pgd_page->lru, freelist);
		domain->pgd = NULL;
	}
}

/* iommu handling */

/*
 * Allocate the root table for @iommu and store it in iommu->root_entry.
 * Returns 0 on success or -ENOMEM if the page cannot be allocated.
 */
static int iommu_alloc_root_entry(struct intel_iommu *iommu)
{
	struct root_entry *root;

	root = iommu_alloc_page_node(iommu->node, GFP_ATOMIC);
	if (!root) {
		pr_err("Allocating root entry for %s failed\n",
			iommu->name);
		return -ENOMEM;
	}

	__iommu_flush_cache(iommu, root, ROOT_SIZE);
	iommu->root_entry = root;

	return 0;
}

/*
 * Program the root table address into the hardware and wait for the
 * status register to report completion. Unless cap_esrtps() is set for
 * this IOMMU, the context cache, the PASID cache (scalable mode only)
 * and the IOTLB are then invalidated globally.
 */
static void iommu_set_root_entry(struct intel_iommu *iommu)
{
	u64 addr;
	u32 sts;
	unsigned long flag;

	addr = virt_to_phys(iommu->root_entry);
	if (sm_supported(iommu))
		addr |= DMA_RTADDR_SMT;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);

	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (sts & DMA_GSTS_RTPS), sts);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);

	/*
	 * Hardware invalidates all DMA remapping hardware translation
	 * caches as part of SRTP flow.
	 */
	if (cap_esrtps(iommu->cap))
		return;

	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
	if (sm_supported(iommu))
		qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
}

/*
 * Issue a write-buffer flush command and wait until the status register
 * shows DMA_GSTS_WBFS cleared. Does nothing unless rwbf_quirk is set or
 * cap_rwbf() is set for this IOMMU.
 */
void iommu_flush_write_buffer(struct intel_iommu *iommu)
{
	u32 val;
	unsigned long flag;

	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
		return;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (!(val & DMA_GSTS_WBFS)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}

/* return value determine if we need a write buffer flush */
static void __iommu_flush_context(struct intel_iommu *iommu,
				  u16 did, u16 source_id, u8 function_mask,
				  u64 type)
{
	u64 val = 0;
	unsigned long flag;

	switch (type) {
	case DMA_CCMD_GLOBAL_INVL:
		val = DMA_CCMD_GLOBAL_INVL;
		break;
	case DMA_CCMD_DOMAIN_INVL:
		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
break;
	case DMA_CCMD_DEVICE_INVL:
		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
		break;
	default:
		pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n",
			iommu->name, type);
		return;
	}
	/* ICC is part of the command written below; completion is polled on it. */
	val |= DMA_CCMD_ICC;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
		dmar_readq, (!(val & DMA_CCMD_ICC)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}

/*
 * Register-based IOTLB invalidation.
 *
 * @did:        domain id, used for domain- and page-selective flushes
 * @addr:       address for a page-selective flush (IH bit included)
 * @size_order: OR-ed into the address register value for a PSI flush
 * @type:       DMA_TLB_GLOBAL_FLUSH, DMA_TLB_DSI_FLUSH or DMA_TLB_PSI_FLUSH;
 *              anything else is logged and ignored
 *
 * The function returns nothing; a mismatch between requested and actual
 * invalidation granularity is only reported through the log.
 */
static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
				u64 addr, unsigned int size_order, u64 type)
{
	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
	u64 val = 0, val_iva = 0;
	unsigned long flag;

	switch (type) {
	case DMA_TLB_GLOBAL_FLUSH:
		/* global flush doesn't need set IVA_REG */
		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
		break;
	case DMA_TLB_DSI_FLUSH:
		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
		break;
	case DMA_TLB_PSI_FLUSH:
		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
		/* IH bit is passed in as part of address */
		val_iva = size_order | addr;
		break;
	default:
		pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n",
			iommu->name, type);
		return;
	}

	if (cap_write_drain(iommu->cap))
		val |= DMA_TLB_WRITE_DRAIN;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	/* Note: Only uses first TLB reg currently */
	/* The address register (at tlb_offset) is only written when val_iva is non-zero. */
	if (val_iva)
		dmar_writeq(iommu->reg + tlb_offset, val_iva);
	dmar_writeq(iommu->reg + tlb_offset + 8, val);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
		dmar_readq, (!(val & DMA_TLB_IVT)), val);

raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1243 1244 /* check IOTLB invalidation granularity */ 1245 if (DMA_TLB_IAIG(val) == 0) 1246 pr_err("Flush IOTLB failed\n"); 1247 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type)) 1248 pr_debug("TLB flush request %Lx, actual %Lx\n", 1249 (unsigned long long)DMA_TLB_IIRG(type), 1250 (unsigned long long)DMA_TLB_IAIG(val)); 1251 } 1252 1253 static struct device_domain_info * 1254 domain_lookup_dev_info(struct dmar_domain *domain, 1255 struct intel_iommu *iommu, u8 bus, u8 devfn) 1256 { 1257 struct device_domain_info *info; 1258 unsigned long flags; 1259 1260 spin_lock_irqsave(&domain->lock, flags); 1261 list_for_each_entry(info, &domain->devices, link) { 1262 if (info->iommu == iommu && info->bus == bus && 1263 info->devfn == devfn) { 1264 spin_unlock_irqrestore(&domain->lock, flags); 1265 return info; 1266 } 1267 } 1268 spin_unlock_irqrestore(&domain->lock, flags); 1269 1270 return NULL; 1271 } 1272 1273 void domain_update_iotlb(struct dmar_domain *domain) 1274 { 1275 struct dev_pasid_info *dev_pasid; 1276 struct device_domain_info *info; 1277 bool has_iotlb_device = false; 1278 unsigned long flags; 1279 1280 spin_lock_irqsave(&domain->lock, flags); 1281 list_for_each_entry(info, &domain->devices, link) { 1282 if (info->ats_enabled) { 1283 has_iotlb_device = true; 1284 break; 1285 } 1286 } 1287 1288 list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) { 1289 info = dev_iommu_priv_get(dev_pasid->dev); 1290 if (info->ats_enabled) { 1291 has_iotlb_device = true; 1292 break; 1293 } 1294 } 1295 domain->has_iotlb_device = has_iotlb_device; 1296 spin_unlock_irqrestore(&domain->lock, flags); 1297 } 1298 1299 /* 1300 * The extra devTLB flush quirk impacts those QAT devices with PCI device 1301 * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device() 1302 * check because it applies only to the built-in QAT devices and it doesn't 1303 * grant additional privileges. 
1304 */ 1305 #define BUGGY_QAT_DEVID_MASK 0x4940 1306 static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev) 1307 { 1308 if (pdev->vendor != PCI_VENDOR_ID_INTEL) 1309 return false; 1310 1311 if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK) 1312 return false; 1313 1314 return true; 1315 } 1316 1317 static void iommu_enable_pci_caps(struct device_domain_info *info) 1318 { 1319 struct pci_dev *pdev; 1320 1321 if (!dev_is_pci(info->dev)) 1322 return; 1323 1324 pdev = to_pci_dev(info->dev); 1325 1326 /* The PCIe spec, in its wisdom, declares that the behaviour of 1327 the device if you enable PASID support after ATS support is 1328 undefined. So always enable PASID support on devices which 1329 have it, even if we can't yet know if we're ever going to 1330 use it. */ 1331 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1)) 1332 info->pasid_enabled = 1; 1333 1334 if (info->ats_supported && pci_ats_page_aligned(pdev) && 1335 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) { 1336 info->ats_enabled = 1; 1337 domain_update_iotlb(info->domain); 1338 } 1339 } 1340 1341 static void iommu_disable_pci_caps(struct device_domain_info *info) 1342 { 1343 struct pci_dev *pdev; 1344 1345 if (!dev_is_pci(info->dev)) 1346 return; 1347 1348 pdev = to_pci_dev(info->dev); 1349 1350 if (info->ats_enabled) { 1351 pci_disable_ats(pdev); 1352 info->ats_enabled = 0; 1353 domain_update_iotlb(info->domain); 1354 } 1355 1356 if (info->pasid_enabled) { 1357 pci_disable_pasid(pdev); 1358 info->pasid_enabled = 0; 1359 } 1360 } 1361 1362 static void __iommu_flush_dev_iotlb(struct device_domain_info *info, 1363 u64 addr, unsigned int mask) 1364 { 1365 u16 sid, qdep; 1366 1367 if (!info || !info->ats_enabled) 1368 return; 1369 1370 sid = info->bus << 8 | info->devfn; 1371 qdep = info->ats_qdep; 1372 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid, 1373 qdep, addr, mask); 1374 quirk_extra_dev_tlb_flush(info, addr, mask, IOMMU_NO_PASID, qdep); 1375 } 1376 1377 static void 
intel_flush_iotlb_all(struct iommu_domain *domain) 1378 { 1379 cache_tag_flush_all(to_dmar_domain(domain)); 1380 } 1381 1382 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu) 1383 { 1384 u32 pmen; 1385 unsigned long flags; 1386 1387 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap)) 1388 return; 1389 1390 raw_spin_lock_irqsave(&iommu->register_lock, flags); 1391 pmen = readl(iommu->reg + DMAR_PMEN_REG); 1392 pmen &= ~DMA_PMEN_EPM; 1393 writel(pmen, iommu->reg + DMAR_PMEN_REG); 1394 1395 /* wait for the protected region status bit to clear */ 1396 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG, 1397 readl, !(pmen & DMA_PMEN_PRS), pmen); 1398 1399 raw_spin_unlock_irqrestore(&iommu->register_lock, flags); 1400 } 1401 1402 static void iommu_enable_translation(struct intel_iommu *iommu) 1403 { 1404 u32 sts; 1405 unsigned long flags; 1406 1407 raw_spin_lock_irqsave(&iommu->register_lock, flags); 1408 iommu->gcmd |= DMA_GCMD_TE; 1409 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); 1410 1411 /* Make sure hardware complete it */ 1412 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1413 readl, (sts & DMA_GSTS_TES), sts); 1414 1415 raw_spin_unlock_irqrestore(&iommu->register_lock, flags); 1416 } 1417 1418 static void iommu_disable_translation(struct intel_iommu *iommu) 1419 { 1420 u32 sts; 1421 unsigned long flag; 1422 1423 if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated && 1424 (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap))) 1425 return; 1426 1427 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1428 iommu->gcmd &= ~DMA_GCMD_TE; 1429 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); 1430 1431 /* Make sure hardware complete it */ 1432 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1433 readl, (!(sts & DMA_GSTS_TES)), sts); 1434 1435 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1436 } 1437 1438 static int iommu_init_domains(struct intel_iommu *iommu) 1439 { 1440 u32 ndomains; 1441 1442 ndomains = cap_ndoms(iommu->cap); 1443 pr_debug("%s: Number of 
Domains supported <%d>\n", 1444 iommu->name, ndomains); 1445 1446 spin_lock_init(&iommu->lock); 1447 1448 iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL); 1449 if (!iommu->domain_ids) 1450 return -ENOMEM; 1451 1452 /* 1453 * If Caching mode is set, then invalid translations are tagged 1454 * with domain-id 0, hence we need to pre-allocate it. We also 1455 * use domain-id 0 as a marker for non-allocated domain-id, so 1456 * make sure it is not used for a real domain. 1457 */ 1458 set_bit(0, iommu->domain_ids); 1459 1460 /* 1461 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid 1462 * entry for first-level or pass-through translation modes should 1463 * be programmed with a domain id different from those used for 1464 * second-level or nested translation. We reserve a domain id for 1465 * this purpose. 1466 */ 1467 if (sm_supported(iommu)) 1468 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids); 1469 1470 return 0; 1471 } 1472 1473 static void disable_dmar_iommu(struct intel_iommu *iommu) 1474 { 1475 if (!iommu->domain_ids) 1476 return; 1477 1478 /* 1479 * All iommu domains must have been detached from the devices, 1480 * hence there should be no domain IDs in use. 
1481 */ 1482 if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap)) 1483 > NUM_RESERVED_DID)) 1484 return; 1485 1486 if (iommu->gcmd & DMA_GCMD_TE) 1487 iommu_disable_translation(iommu); 1488 } 1489 1490 static void free_dmar_iommu(struct intel_iommu *iommu) 1491 { 1492 if (iommu->domain_ids) { 1493 bitmap_free(iommu->domain_ids); 1494 iommu->domain_ids = NULL; 1495 } 1496 1497 if (iommu->copied_tables) { 1498 bitmap_free(iommu->copied_tables); 1499 iommu->copied_tables = NULL; 1500 } 1501 1502 /* free context mapping */ 1503 free_context_table(iommu); 1504 1505 #ifdef CONFIG_INTEL_IOMMU_SVM 1506 if (pasid_supported(iommu)) { 1507 if (ecap_prs(iommu->ecap)) 1508 intel_svm_finish_prq(iommu); 1509 } 1510 #endif 1511 } 1512 1513 /* 1514 * Check and return whether first level is used by default for 1515 * DMA translation. 1516 */ 1517 static bool first_level_by_default(unsigned int type) 1518 { 1519 /* Only SL is available in legacy mode */ 1520 if (!scalable_mode_support()) 1521 return false; 1522 1523 /* Only level (either FL or SL) is available, just use it */ 1524 if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity()) 1525 return intel_cap_flts_sanity(); 1526 1527 /* Both levels are available, decide it based on domain type */ 1528 return type != IOMMU_DOMAIN_UNMANAGED; 1529 } 1530 1531 static struct dmar_domain *alloc_domain(unsigned int type) 1532 { 1533 struct dmar_domain *domain; 1534 1535 domain = kzalloc(sizeof(*domain), GFP_KERNEL); 1536 if (!domain) 1537 return NULL; 1538 1539 domain->nid = NUMA_NO_NODE; 1540 if (first_level_by_default(type)) 1541 domain->use_first_level = true; 1542 domain->has_iotlb_device = false; 1543 INIT_LIST_HEAD(&domain->devices); 1544 INIT_LIST_HEAD(&domain->dev_pasids); 1545 INIT_LIST_HEAD(&domain->cache_tags); 1546 spin_lock_init(&domain->lock); 1547 spin_lock_init(&domain->cache_lock); 1548 xa_init(&domain->iommu_array); 1549 1550 return domain; 1551 } 1552 1553 int domain_attach_iommu(struct dmar_domain *domain, 
struct intel_iommu *iommu)
{
	struct iommu_domain_info *info, *curr;
	unsigned long ndomains;
	int num, ret = -ENOSPC;

	/* SVA domains are skipped entirely. */
	if (domain->domain.type == IOMMU_DOMAIN_SVA)
		return 0;

	/* Allocated before taking iommu->lock; freed again if not needed. */
	info = kzalloc(sizeof(*info), GFP_KERNEL);
	if (!info)
		return -ENOMEM;

	spin_lock(&iommu->lock);
	/* Already attached to this IOMMU: just take another reference. */
	curr = xa_load(&domain->iommu_array, iommu->seq_id);
	if (curr) {
		curr->refcnt++;
		spin_unlock(&iommu->lock);
		kfree(info);
		return 0;
	}

	/* First attachment: pick a free domain id; ret is still -ENOSPC here. */
	ndomains = cap_ndoms(iommu->cap);
	num = find_first_zero_bit(iommu->domain_ids, ndomains);
	if (num >= ndomains) {
		pr_err("%s: No free domain ids\n", iommu->name);
		goto err_unlock;
	}

	set_bit(num, iommu->domain_ids);
	info->refcnt = 1;
	info->did = num;
	info->iommu = iommu;
	/* Publish the entry; a non-NULL result means the slot could not be claimed. */
	curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
			  NULL, info, GFP_ATOMIC);
	if (curr) {
		ret = xa_err(curr) ? : -EBUSY;
		goto err_clear;
	}
	domain_update_iommu_cap(domain);

	spin_unlock(&iommu->lock);
	return 0;

err_clear:
	clear_bit(info->did, iommu->domain_ids);
err_unlock:
	spin_unlock(&iommu->lock);
	kfree(info);
	return ret;
}

/*
 * Drop one reference on @domain's attachment to @iommu. When the last
 * reference goes away the domain id is released, the xarray entry is
 * erased and the domain capabilities are recomputed. SVA domains are
 * ignored.
 *
 * NOTE(review): the xa_load() result is dereferenced without a NULL check,
 * so this presumably must only be called after a successful
 * domain_attach_iommu() for the same pair - confirm against callers.
 */
void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
{
	struct iommu_domain_info *info;

	if (domain->domain.type == IOMMU_DOMAIN_SVA)
		return;

	spin_lock(&iommu->lock);
	info = xa_load(&domain->iommu_array, iommu->seq_id);
	if (--info->refcnt == 0) {
		clear_bit(info->did, iommu->domain_ids);
		xa_erase(&domain->iommu_array, iommu->seq_id);
		domain->nid = NUMA_NO_NODE;
		domain_update_iommu_cap(domain);
		kfree(info);
	}
	spin_unlock(&iommu->lock);
}

/*
 * Round a guest address width up to the next value of the form 12 + 9 * n,
 * capped at 64.
 */
static int guestwidth_to_adjustwidth(int gaw)
{
	int agaw;
	int r = (gaw - 12) % 9;

	if (r == 0)
		agaw = gaw;
	else
		agaw = gaw + 9 - r;
	if
(agaw > 64)
		agaw = 64;
	return agaw;
}

/*
 * Tear down @domain: unmap the full address range, release the collected
 * page-table pages, then free the structure. If devices are still linked
 * on domain->devices a warning is raised and the structure itself is NOT
 * freed.
 */
static void domain_exit(struct dmar_domain *domain)
{
	if (domain->pgd) {
		LIST_HEAD(freelist);

		domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
		iommu_put_pages_list(&freelist);
	}

	if (WARN_ON(!list_empty(&domain->devices)))
		return;

	kfree(domain);
}

/*
 * Program the context entry for (@bus, @devfn) on @iommu so that it uses
 * @domain, then invalidate as required.
 *
 * Returns 0 on success, and also when a present entry that is not marked
 * as copied already exists (it is left untouched). Returns -ENOMEM when
 * the context entry cannot be obtained or when a page-table level that
 * has to be skipped is not present.
 */
static int domain_context_mapping_one(struct dmar_domain *domain,
				      struct intel_iommu *iommu,
				      u8 bus, u8 devfn)
{
	struct device_domain_info *info =
			domain_lookup_dev_info(domain, iommu, bus, devfn);
	u16 did = domain_id_iommu(domain, iommu);
	int translation = CONTEXT_TT_MULTI_LEVEL;
	struct dma_pte *pgd = domain->pgd;
	struct context_entry *context;
	int agaw, ret;

	if (hw_pass_through && domain_type_is_si(domain))
		translation = CONTEXT_TT_PASS_THROUGH;

	pr_debug("Set context mapping for %02x:%02x.%d\n",
		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));

	spin_lock(&iommu->lock);
	ret = -ENOMEM;
	context = iommu_context_addr(iommu, bus, devfn, 1);
	if (!context)
		goto out_unlock;

	/* A present entry that was not copied is kept as it is. */
	ret = 0;
	if (context_present(context) && !context_copied(iommu, bus, devfn))
		goto out_unlock;

	/*
	 * For kdump cases, old valid entries may be cached due to the
	 * in-flight DMA and copied pgtable, but there is no unmapping
	 * behaviour for them, thus we need an explicit cache flush for
	 * the newly-mapped device. For kdump, at this point, the device
	 * is supposed to finish reset at its driver probe stage, so no
	 * in-flight DMA will exist, and we don't need to worry anymore
	 * hereafter.
*/
	if (context_copied(iommu, bus, devfn)) {
		u16 did_old = context_domain_id(context);

		/* Only flush for a domain id within the range this hardware supports. */
		if (did_old < cap_ndoms(iommu->cap)) {
			iommu->flush.flush_context(iommu, did_old,
						   (((u16)bus) << 8) | devfn,
						   DMA_CCMD_MASK_NOBIT,
						   DMA_CCMD_DEVICE_INVL);
			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
						 DMA_TLB_DSI_FLUSH);
		}

		clear_context_copied(iommu, bus, devfn);
	}

	/* Start from a cleared entry before programming the new domain id. */
	context_clear_entry(context);
	context_set_domain_id(context, did);

	if (translation != CONTEXT_TT_PASS_THROUGH) {
		/*
		 * Skip top levels of page tables for iommu which has
		 * less agaw than default. Unnecessary for PT mode.
		 */
		for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
			ret = -ENOMEM;
			pgd = phys_to_virt(dma_pte_addr(pgd));
			if (!dma_pte_present(pgd))
				goto out_unlock;
		}

		/* Devices known to support ATS get the device-IOTLB translation type. */
		if (info && info->ats_supported)
			translation = CONTEXT_TT_DEV_IOTLB;
		else
			translation = CONTEXT_TT_MULTI_LEVEL;

		context_set_address_root(context, virt_to_phys(pgd));
		context_set_address_width(context, agaw);
	} else {
		/*
		 * In pass through mode, AW must be programmed to
		 * indicate the largest AGAW value supported by
		 * hardware. And ASR is ignored by hardware.
		 */
		context_set_address_width(context, iommu->msagaw);
	}

	/* The present bit is set last, after every other field of the entry. */
	context_set_translation_type(context, translation);
	context_set_fault_enable(context);
	context_set_present(context);
	if (!ecap_coherent(iommu->ecap))
		clflush_cache_range(context, sizeof(*context));

	/*
	 * It's a non-present to present mapping. If hardware doesn't cache
	 * non-present entry we only need to flush the write-buffer.
If the 1745 * _does_ cache non-present entries, then it does so in the special 1746 * domain #0, which we have to flush: 1747 */ 1748 if (cap_caching_mode(iommu->cap)) { 1749 iommu->flush.flush_context(iommu, 0, 1750 (((u16)bus) << 8) | devfn, 1751 DMA_CCMD_MASK_NOBIT, 1752 DMA_CCMD_DEVICE_INVL); 1753 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); 1754 } else { 1755 iommu_flush_write_buffer(iommu); 1756 } 1757 1758 ret = 0; 1759 1760 out_unlock: 1761 spin_unlock(&iommu->lock); 1762 1763 return ret; 1764 } 1765 1766 static int domain_context_mapping_cb(struct pci_dev *pdev, 1767 u16 alias, void *opaque) 1768 { 1769 struct device_domain_info *info = dev_iommu_priv_get(&pdev->dev); 1770 struct intel_iommu *iommu = info->iommu; 1771 struct dmar_domain *domain = opaque; 1772 1773 return domain_context_mapping_one(domain, iommu, 1774 PCI_BUS_NUM(alias), alias & 0xff); 1775 } 1776 1777 static int 1778 domain_context_mapping(struct dmar_domain *domain, struct device *dev) 1779 { 1780 struct device_domain_info *info = dev_iommu_priv_get(dev); 1781 struct intel_iommu *iommu = info->iommu; 1782 u8 bus = info->bus, devfn = info->devfn; 1783 1784 if (!dev_is_pci(dev)) 1785 return domain_context_mapping_one(domain, iommu, bus, devfn); 1786 1787 return pci_for_each_dma_alias(to_pci_dev(dev), 1788 domain_context_mapping_cb, domain); 1789 } 1790 1791 /* Return largest possible superpage level for a given mapping */ 1792 static int hardware_largepage_caps(struct dmar_domain *domain, unsigned long iov_pfn, 1793 unsigned long phy_pfn, unsigned long pages) 1794 { 1795 int support, level = 1; 1796 unsigned long pfnmerge; 1797 1798 support = domain->iommu_superpage; 1799 1800 /* To use a large page, the virtual *and* physical addresses 1801 must be aligned to 2MiB/1GiB/etc. Lower bits set in either 1802 of them will mean we have to use smaller pages. So just 1803 merge them and check both at once. 
*/
	pfnmerge = iov_pfn | phy_pfn;

	/*
	 * Go up one level per iteration while superpage support remains, the
	 * merged pfn is still aligned and at least one page of the next
	 * larger size fits into @pages.
	 */
	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
		pages >>= VTD_STRIDE_SHIFT;
		if (!pages)
			break;
		pfnmerge >>= VTD_STRIDE_SHIFT;
		level++;
		support--;
	}
	return level;
}

/*
 * Ensure that old small page tables are removed to make room for superpage(s).
 * We're going to add new large pages, so make sure we don't remove their parent
 * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
 */
static void switch_to_super_page(struct dmar_domain *domain,
				 unsigned long start_pfn,
				 unsigned long end_pfn, int level)
{
	unsigned long lvl_pages = lvl_to_nr_pages(level);
	struct dma_pte *pte = NULL;

	while (start_pfn <= end_pfn) {
		/*
		 * NOTE(review): the lookup result is used without a NULL
		 * check; presumably the tables down to @level already exist
		 * when this is called - confirm against the caller.
		 */
		if (!pte)
			pte = pfn_to_dma_pte(domain, start_pfn, &level,
					     GFP_ATOMIC);

		if (dma_pte_present(pte)) {
			dma_pte_free_pagetable(domain, start_pfn,
					       start_pfn + lvl_pages - 1,
					       level + 1);

			cache_tag_flush_range(domain, start_pfn << VTD_PAGE_SHIFT,
					      end_pfn << VTD_PAGE_SHIFT, 0);
		}

		/* Step to the next entry; look it up again once a page boundary is crossed. */
		pte++;
		start_pfn += lvl_pages;
		if (first_pte_in_page(pte))
			pte = NULL;
	}
}

/*
 * Map @nr_pages pages starting at DMA pfn @iov_pfn to physical pfn
 * @phys_pfn with protection @prot, using superpages where
 * hardware_largepage_caps() allows it.
 *
 * Returns 0 on success, -EINVAL when the range is not supported by the
 * domain, when @prot grants neither read nor write, or for a read-only
 * mapping on a nested parent domain, and -ENOMEM when a PTE cannot be
 * obtained.
 */
static int
__domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
		 unsigned long phys_pfn, unsigned long nr_pages, int prot,
		 gfp_t gfp)
{
	struct dma_pte *first_pte = NULL, *pte = NULL;
	unsigned int largepage_lvl = 0;
	unsigned long lvl_pages = 0;
	phys_addr_t pteval;
	u64 attr;

	if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)))
		return -EINVAL;

	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
		return -EINVAL;

	if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) {
		pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n");
		return -EINVAL;
	}

	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
	attr |= DMA_FL_PTE_PRESENT;
	/* First-level tables carry additional attribute bits. */
	if (domain->use_first_level) {
		attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
		if (prot & DMA_PTE_WRITE)
			attr |= DMA_FL_PTE_DIRTY;
	}

	domain->has_mappings = true;

	pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;

	while (nr_pages > 0) {
		uint64_t tmp;

		/* (Re)compute the target level and PTE whenever 'pte' was reset. */
		if (!pte) {
			largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
					phys_pfn, nr_pages);

			pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl,
					     gfp);
			if (!pte)
				return -ENOMEM;
			first_pte = pte;

			lvl_pages = lvl_to_nr_pages(largepage_lvl);

			/* It is large page*/
			if (largepage_lvl > 1) {
				unsigned long end_pfn;
				unsigned long pages_to_remove;

				pteval |= DMA_PTE_LARGE_PAGE;
				/* Limit the cleanup to the PTEs left in the current table page. */
				pages_to_remove = min_t(unsigned long, nr_pages,
							nr_pte_to_next_page(pte) * lvl_pages);
				end_pfn = iov_pfn + pages_to_remove - 1;
				switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
			} else {
				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
			}

		}
		/* We don't need lock here, nobody else
		 * touches the iova range
		 */
		tmp = 0ULL;
		/* The PTE is expected to be empty; a non-zero old value is reported. */
		if (!try_cmpxchg64_local(&pte->val, &tmp, pteval)) {
			static int dumps = 5;
			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
				iov_pfn, tmp, (unsigned long long)pteval);
			if (dumps) {
				dumps--;
				debug_dma_dump_mappings(NULL);
			}
			WARN_ON(1);
		}

		nr_pages -= lvl_pages;
		iov_pfn += lvl_pages;
		phys_pfn += lvl_pages;
		pteval += lvl_pages * VTD_PAGE_SIZE;

		/* If the next PTE would be the first in a new page, then we
		 * need to flush the cache on the entries we've just written.
1936 * And then we'll need to recalculate 'pte', so clear it and 1937 * let it get set again in the if (!pte) block above. 1938 * 1939 * If we're done (!nr_pages) we need to flush the cache too. 1940 * 1941 * Also if we've been setting superpages, we may need to 1942 * recalculate 'pte' and switch back to smaller pages for the 1943 * end of the mapping, if the trailing size is not enough to 1944 * use another superpage (i.e. nr_pages < lvl_pages). 1945 */ 1946 pte++; 1947 if (!nr_pages || first_pte_in_page(pte) || 1948 (largepage_lvl > 1 && nr_pages < lvl_pages)) { 1949 domain_flush_cache(domain, first_pte, 1950 (void *)pte - (void *)first_pte); 1951 pte = NULL; 1952 } 1953 } 1954 1955 return 0; 1956 } 1957 1958 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn) 1959 { 1960 struct intel_iommu *iommu = info->iommu; 1961 struct context_entry *context; 1962 u16 did_old; 1963 1964 spin_lock(&iommu->lock); 1965 context = iommu_context_addr(iommu, bus, devfn, 0); 1966 if (!context) { 1967 spin_unlock(&iommu->lock); 1968 return; 1969 } 1970 1971 did_old = context_domain_id(context); 1972 1973 context_clear_entry(context); 1974 __iommu_flush_cache(iommu, context, sizeof(*context)); 1975 spin_unlock(&iommu->lock); 1976 iommu->flush.flush_context(iommu, 1977 did_old, 1978 (((u16)bus) << 8) | devfn, 1979 DMA_CCMD_MASK_NOBIT, 1980 DMA_CCMD_DEVICE_INVL); 1981 1982 iommu->flush.flush_iotlb(iommu, 1983 did_old, 1984 0, 1985 0, 1986 DMA_TLB_DSI_FLUSH); 1987 1988 __iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH); 1989 } 1990 1991 static int domain_setup_first_level(struct intel_iommu *iommu, 1992 struct dmar_domain *domain, 1993 struct device *dev, 1994 u32 pasid) 1995 { 1996 struct dma_pte *pgd = domain->pgd; 1997 int agaw, level; 1998 int flags = 0; 1999 2000 /* 2001 * Skip top levels of page tables for iommu which has 2002 * less agaw than default. Unnecessary for PT mode. 
2003 */ 2004 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { 2005 pgd = phys_to_virt(dma_pte_addr(pgd)); 2006 if (!dma_pte_present(pgd)) 2007 return -ENOMEM; 2008 } 2009 2010 level = agaw_to_level(agaw); 2011 if (level != 4 && level != 5) 2012 return -EINVAL; 2013 2014 if (level == 5) 2015 flags |= PASID_FLAG_FL5LP; 2016 2017 if (domain->force_snooping) 2018 flags |= PASID_FLAG_PAGE_SNOOP; 2019 2020 return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid, 2021 domain_id_iommu(domain, iommu), 2022 flags); 2023 } 2024 2025 static bool dev_is_real_dma_subdevice(struct device *dev) 2026 { 2027 return dev && dev_is_pci(dev) && 2028 pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev); 2029 } 2030 2031 static int iommu_domain_identity_map(struct dmar_domain *domain, 2032 unsigned long first_vpfn, 2033 unsigned long last_vpfn) 2034 { 2035 /* 2036 * RMRR range might have overlap with physical memory range, 2037 * clear it first 2038 */ 2039 dma_pte_clear_range(domain, first_vpfn, last_vpfn); 2040 2041 return __domain_mapping(domain, first_vpfn, 2042 first_vpfn, last_vpfn - first_vpfn + 1, 2043 DMA_PTE_READ|DMA_PTE_WRITE, GFP_KERNEL); 2044 } 2045 2046 static int md_domain_init(struct dmar_domain *domain, int guest_width); 2047 2048 static int __init si_domain_init(int hw) 2049 { 2050 struct dmar_rmrr_unit *rmrr; 2051 struct device *dev; 2052 int i, nid, ret; 2053 2054 si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY); 2055 if (!si_domain) 2056 return -EFAULT; 2057 2058 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { 2059 domain_exit(si_domain); 2060 si_domain = NULL; 2061 return -EFAULT; 2062 } 2063 2064 if (hw) 2065 return 0; 2066 2067 for_each_online_node(nid) { 2068 unsigned long start_pfn, end_pfn; 2069 int i; 2070 2071 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 2072 ret = iommu_domain_identity_map(si_domain, 2073 mm_to_dma_pfn_start(start_pfn), 2074 mm_to_dma_pfn_end(end_pfn)); 2075 if (ret) 2076 return ret; 2077 } 
}

	/*
	 * Identity map the RMRRs so that devices with RMRRs could also use
	 * the si_domain.
	 */
	for_each_rmrr_units(rmrr) {
		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
					  i, dev) {
			unsigned long long start = rmrr->base_address;
			unsigned long long end = rmrr->end_address;

			/* Skip (with a warning) inverted ranges and ranges beyond the domain's address width. */
			if (WARN_ON(end < start ||
				    end >> agaw_to_width(si_domain->agaw)))
				continue;

			ret = iommu_domain_identity_map(si_domain,
					mm_to_dma_pfn_start(start >> PAGE_SHIFT),
					mm_to_dma_pfn_end(end >> PAGE_SHIFT));
			if (ret)
				return ret;
		}
	}

	return 0;
}

/*
 * Attach @dev to @domain: take a reference on the domain for the device's
 * IOMMU, assign cache tags, link the device into domain->devices and
 * program the translation structures (context entry in legacy mode, PASID
 * entry in scalable mode).
 *
 * Returns 0 on success or a negative error code. If programming the
 * translation fails, device_block_translation() is called before the error
 * is returned.
 */
static int dmar_domain_attach_device(struct dmar_domain *domain,
				     struct device *dev)
{
	struct device_domain_info *info = dev_iommu_priv_get(dev);
	struct intel_iommu *iommu = info->iommu;
	unsigned long flags;
	int ret;

	ret = domain_attach_iommu(domain, iommu);
	if (ret)
		return ret;

	ret = cache_tag_assign_domain(domain, dev, IOMMU_NO_PASID);
	if (ret) {
		domain_detach_iommu(domain, iommu);
		return ret;
	}

	info->domain = domain;
	spin_lock_irqsave(&domain->lock, flags);
	list_add(&info->link, &domain->devices);
	spin_unlock_irqrestore(&domain->lock, flags);

	/* Nothing further is programmed for a real-DMA sub-device. */
	if (dev_is_real_dma_subdevice(dev))
		return 0;

	/* Pick the setup path: legacy context entry, or one of the scalable-mode PASID setups. */
	if (!sm_supported(iommu))
		ret = domain_context_mapping(domain, dev);
	else if (hw_pass_through && domain_type_is_si(domain))
		ret = intel_pasid_setup_pass_through(iommu, dev, IOMMU_NO_PASID);
	else if (domain->use_first_level)
		ret = domain_setup_first_level(iommu, domain, dev, IOMMU_NO_PASID);
	else
		ret = intel_pasid_setup_second_level(iommu, domain, dev, IOMMU_NO_PASID);

	if (ret) {
		device_block_translation(dev);
		return ret;
	}

	/* In legacy mode the PCI capabilities stay off for the identity domain. */
	if (sm_supported(info->iommu) || !domain_type_is_si(info->domain))
		iommu_enable_pci_caps(info);

return 0; 2149 } 2150 2151 /** 2152 * device_rmrr_is_relaxable - Test whether the RMRR of this device 2153 * is relaxable (ie. is allowed to be not enforced under some conditions) 2154 * @dev: device handle 2155 * 2156 * We assume that PCI USB devices with RMRRs have them largely 2157 * for historical reasons and that the RMRR space is not actively used post 2158 * boot. This exclusion may change if vendors begin to abuse it. 2159 * 2160 * The same exception is made for graphics devices, with the requirement that 2161 * any use of the RMRR regions will be torn down before assigning the device 2162 * to a guest. 2163 * 2164 * Return: true if the RMRR is relaxable, false otherwise 2165 */ 2166 static bool device_rmrr_is_relaxable(struct device *dev) 2167 { 2168 struct pci_dev *pdev; 2169 2170 if (!dev_is_pci(dev)) 2171 return false; 2172 2173 pdev = to_pci_dev(dev); 2174 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev)) 2175 return true; 2176 else 2177 return false; 2178 } 2179 2180 /* 2181 * Return the required default domain type for a specific device. 2182 * 2183 * @dev: the device in query 2184 * @startup: true if this is during early boot 2185 * 2186 * Returns: 2187 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain 2188 * - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain 2189 * - 0: both identity and dynamic domains work for this device 2190 */ 2191 static int device_def_domain_type(struct device *dev) 2192 { 2193 if (dev_is_pci(dev)) { 2194 struct pci_dev *pdev = to_pci_dev(dev); 2195 2196 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev)) 2197 return IOMMU_DOMAIN_IDENTITY; 2198 } 2199 2200 return 0; 2201 } 2202 2203 static void intel_iommu_init_qi(struct intel_iommu *iommu) 2204 { 2205 /* 2206 * Start from the sane iommu hardware state. 
2207 * If the queued invalidation is already initialized by us 2208 * (for example, while enabling interrupt-remapping) then 2209 * we got the things already rolling from a sane state. 2210 */ 2211 if (!iommu->qi) { 2212 /* 2213 * Clear any previous faults. 2214 */ 2215 dmar_fault(-1, iommu); 2216 /* 2217 * Disable queued invalidation if supported and already enabled 2218 * before OS handover. 2219 */ 2220 dmar_disable_qi(iommu); 2221 } 2222 2223 if (dmar_enable_qi(iommu)) { 2224 /* 2225 * Queued Invalidate not enabled, use Register Based Invalidate 2226 */ 2227 iommu->flush.flush_context = __iommu_flush_context; 2228 iommu->flush.flush_iotlb = __iommu_flush_iotlb; 2229 pr_info("%s: Using Register based invalidation\n", 2230 iommu->name); 2231 } else { 2232 iommu->flush.flush_context = qi_flush_context; 2233 iommu->flush.flush_iotlb = qi_flush_iotlb; 2234 pr_info("%s: Using Queued invalidation\n", iommu->name); 2235 } 2236 } 2237 2238 static int copy_context_table(struct intel_iommu *iommu, 2239 struct root_entry *old_re, 2240 struct context_entry **tbl, 2241 int bus, bool ext) 2242 { 2243 int tbl_idx, pos = 0, idx, devfn, ret = 0, did; 2244 struct context_entry *new_ce = NULL, ce; 2245 struct context_entry *old_ce = NULL; 2246 struct root_entry re; 2247 phys_addr_t old_ce_phys; 2248 2249 tbl_idx = ext ? bus * 2 : bus; 2250 memcpy(&re, old_re, sizeof(re)); 2251 2252 for (devfn = 0; devfn < 256; devfn++) { 2253 /* First calculate the correct index */ 2254 idx = (ext ? 
devfn * 2 : devfn) % 256; 2255 2256 if (idx == 0) { 2257 /* First save what we may have and clean up */ 2258 if (new_ce) { 2259 tbl[tbl_idx] = new_ce; 2260 __iommu_flush_cache(iommu, new_ce, 2261 VTD_PAGE_SIZE); 2262 pos = 1; 2263 } 2264 2265 if (old_ce) 2266 memunmap(old_ce); 2267 2268 ret = 0; 2269 if (devfn < 0x80) 2270 old_ce_phys = root_entry_lctp(&re); 2271 else 2272 old_ce_phys = root_entry_uctp(&re); 2273 2274 if (!old_ce_phys) { 2275 if (ext && devfn == 0) { 2276 /* No LCTP, try UCTP */ 2277 devfn = 0x7f; 2278 continue; 2279 } else { 2280 goto out; 2281 } 2282 } 2283 2284 ret = -ENOMEM; 2285 old_ce = memremap(old_ce_phys, PAGE_SIZE, 2286 MEMREMAP_WB); 2287 if (!old_ce) 2288 goto out; 2289 2290 new_ce = iommu_alloc_page_node(iommu->node, GFP_KERNEL); 2291 if (!new_ce) 2292 goto out_unmap; 2293 2294 ret = 0; 2295 } 2296 2297 /* Now copy the context entry */ 2298 memcpy(&ce, old_ce + idx, sizeof(ce)); 2299 2300 if (!context_present(&ce)) 2301 continue; 2302 2303 did = context_domain_id(&ce); 2304 if (did >= 0 && did < cap_ndoms(iommu->cap)) 2305 set_bit(did, iommu->domain_ids); 2306 2307 set_context_copied(iommu, bus, devfn); 2308 new_ce[idx] = ce; 2309 } 2310 2311 tbl[tbl_idx + pos] = new_ce; 2312 2313 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE); 2314 2315 out_unmap: 2316 memunmap(old_ce); 2317 2318 out: 2319 return ret; 2320 } 2321 2322 static int copy_translation_tables(struct intel_iommu *iommu) 2323 { 2324 struct context_entry **ctxt_tbls; 2325 struct root_entry *old_rt; 2326 phys_addr_t old_rt_phys; 2327 int ctxt_table_entries; 2328 u64 rtaddr_reg; 2329 int bus, ret; 2330 bool new_ext, ext; 2331 2332 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG); 2333 ext = !!(rtaddr_reg & DMA_RTADDR_SMT); 2334 new_ext = !!sm_supported(iommu); 2335 2336 /* 2337 * The RTT bit can only be changed when translation is disabled, 2338 * but disabling translation means to open a window for data 2339 * corruption. 
So bail out and don't copy anything if we would 2340 * have to change the bit. 2341 */ 2342 if (new_ext != ext) 2343 return -EINVAL; 2344 2345 iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL); 2346 if (!iommu->copied_tables) 2347 return -ENOMEM; 2348 2349 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK; 2350 if (!old_rt_phys) 2351 return -EINVAL; 2352 2353 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB); 2354 if (!old_rt) 2355 return -ENOMEM; 2356 2357 /* This is too big for the stack - allocate it from slab */ 2358 ctxt_table_entries = ext ? 512 : 256; 2359 ret = -ENOMEM; 2360 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL); 2361 if (!ctxt_tbls) 2362 goto out_unmap; 2363 2364 for (bus = 0; bus < 256; bus++) { 2365 ret = copy_context_table(iommu, &old_rt[bus], 2366 ctxt_tbls, bus, ext); 2367 if (ret) { 2368 pr_err("%s: Failed to copy context table for bus %d\n", 2369 iommu->name, bus); 2370 continue; 2371 } 2372 } 2373 2374 spin_lock(&iommu->lock); 2375 2376 /* Context tables are copied, now write them to the root_entry table */ 2377 for (bus = 0; bus < 256; bus++) { 2378 int idx = ext ? 
bus * 2 : bus; 2379 u64 val; 2380 2381 if (ctxt_tbls[idx]) { 2382 val = virt_to_phys(ctxt_tbls[idx]) | 1; 2383 iommu->root_entry[bus].lo = val; 2384 } 2385 2386 if (!ext || !ctxt_tbls[idx + 1]) 2387 continue; 2388 2389 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1; 2390 iommu->root_entry[bus].hi = val; 2391 } 2392 2393 spin_unlock(&iommu->lock); 2394 2395 kfree(ctxt_tbls); 2396 2397 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE); 2398 2399 ret = 0; 2400 2401 out_unmap: 2402 memunmap(old_rt); 2403 2404 return ret; 2405 } 2406 2407 static int __init init_dmars(void) 2408 { 2409 struct dmar_drhd_unit *drhd; 2410 struct intel_iommu *iommu; 2411 int ret; 2412 2413 ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL); 2414 if (ret) 2415 goto free_iommu; 2416 2417 for_each_iommu(iommu, drhd) { 2418 if (drhd->ignored) { 2419 iommu_disable_translation(iommu); 2420 continue; 2421 } 2422 2423 /* 2424 * Find the max pasid size of all IOMMU's in the system. 2425 * We need to ensure the system pasid table is no bigger 2426 * than the smallest supported. 2427 */ 2428 if (pasid_supported(iommu)) { 2429 u32 temp = 2 << ecap_pss(iommu->ecap); 2430 2431 intel_pasid_max_id = min_t(u32, temp, 2432 intel_pasid_max_id); 2433 } 2434 2435 intel_iommu_init_qi(iommu); 2436 2437 ret = iommu_init_domains(iommu); 2438 if (ret) 2439 goto free_iommu; 2440 2441 init_translation_status(iommu); 2442 2443 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) { 2444 iommu_disable_translation(iommu); 2445 clear_translation_pre_enabled(iommu); 2446 pr_warn("Translation was enabled for %s but we are not in kdump mode\n", 2447 iommu->name); 2448 } 2449 2450 /* 2451 * TBD: 2452 * we could share the same root & context tables 2453 * among all IOMMU's. Need to Split it later. 
2454 */ 2455 ret = iommu_alloc_root_entry(iommu); 2456 if (ret) 2457 goto free_iommu; 2458 2459 if (translation_pre_enabled(iommu)) { 2460 pr_info("Translation already enabled - trying to copy translation structures\n"); 2461 2462 ret = copy_translation_tables(iommu); 2463 if (ret) { 2464 /* 2465 * We found the IOMMU with translation 2466 * enabled - but failed to copy over the 2467 * old root-entry table. Try to proceed 2468 * by disabling translation now and 2469 * allocating a clean root-entry table. 2470 * This might cause DMAR faults, but 2471 * probably the dump will still succeed. 2472 */ 2473 pr_err("Failed to copy translation tables from previous kernel for %s\n", 2474 iommu->name); 2475 iommu_disable_translation(iommu); 2476 clear_translation_pre_enabled(iommu); 2477 } else { 2478 pr_info("Copied translation tables from previous kernel for %s\n", 2479 iommu->name); 2480 } 2481 } 2482 2483 if (!ecap_pass_through(iommu->ecap)) 2484 hw_pass_through = 0; 2485 intel_svm_check(iommu); 2486 } 2487 2488 /* 2489 * Now that qi is enabled on all iommus, set the root entry and flush 2490 * caches. This is required on some Intel X58 chipsets, otherwise the 2491 * flush_context function will loop forever and the boot hangs. 
2492 */ 2493 for_each_active_iommu(iommu, drhd) { 2494 iommu_flush_write_buffer(iommu); 2495 iommu_set_root_entry(iommu); 2496 } 2497 2498 check_tylersburg_isoch(); 2499 2500 ret = si_domain_init(hw_pass_through); 2501 if (ret) 2502 goto free_iommu; 2503 2504 /* 2505 * for each drhd 2506 * enable fault log 2507 * global invalidate context cache 2508 * global invalidate iotlb 2509 * enable translation 2510 */ 2511 for_each_iommu(iommu, drhd) { 2512 if (drhd->ignored) { 2513 /* 2514 * we always have to disable PMRs or DMA may fail on 2515 * this device 2516 */ 2517 if (force_on) 2518 iommu_disable_protect_mem_regions(iommu); 2519 continue; 2520 } 2521 2522 iommu_flush_write_buffer(iommu); 2523 2524 #ifdef CONFIG_INTEL_IOMMU_SVM 2525 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) { 2526 /* 2527 * Call dmar_alloc_hwirq() with dmar_global_lock held, 2528 * could cause possible lock race condition. 2529 */ 2530 up_write(&dmar_global_lock); 2531 ret = intel_svm_enable_prq(iommu); 2532 down_write(&dmar_global_lock); 2533 if (ret) 2534 goto free_iommu; 2535 } 2536 #endif 2537 ret = dmar_set_interrupt(iommu); 2538 if (ret) 2539 goto free_iommu; 2540 } 2541 2542 return 0; 2543 2544 free_iommu: 2545 for_each_active_iommu(iommu, drhd) { 2546 disable_dmar_iommu(iommu); 2547 free_dmar_iommu(iommu); 2548 } 2549 if (si_domain) { 2550 domain_exit(si_domain); 2551 si_domain = NULL; 2552 } 2553 2554 return ret; 2555 } 2556 2557 static void __init init_no_remapping_devices(void) 2558 { 2559 struct dmar_drhd_unit *drhd; 2560 struct device *dev; 2561 int i; 2562 2563 for_each_drhd_unit(drhd) { 2564 if (!drhd->include_all) { 2565 for_each_active_dev_scope(drhd->devices, 2566 drhd->devices_cnt, i, dev) 2567 break; 2568 /* ignore DMAR unit if no devices exist */ 2569 if (i == drhd->devices_cnt) 2570 drhd->ignored = 1; 2571 } 2572 } 2573 2574 for_each_active_drhd_unit(drhd) { 2575 if (drhd->include_all) 2576 continue; 2577 2578 for_each_active_dev_scope(drhd->devices, 2579 
drhd->devices_cnt, i, dev) 2580 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev))) 2581 break; 2582 if (i < drhd->devices_cnt) 2583 continue; 2584 2585 /* This IOMMU has *only* gfx devices. Either bypass it or 2586 set the gfx_mapped flag, as appropriate */ 2587 drhd->gfx_dedicated = 1; 2588 if (disable_igfx_iommu) 2589 drhd->ignored = 1; 2590 } 2591 } 2592 2593 #ifdef CONFIG_SUSPEND 2594 static int init_iommu_hw(void) 2595 { 2596 struct dmar_drhd_unit *drhd; 2597 struct intel_iommu *iommu = NULL; 2598 int ret; 2599 2600 for_each_active_iommu(iommu, drhd) { 2601 if (iommu->qi) { 2602 ret = dmar_reenable_qi(iommu); 2603 if (ret) 2604 return ret; 2605 } 2606 } 2607 2608 for_each_iommu(iommu, drhd) { 2609 if (drhd->ignored) { 2610 /* 2611 * we always have to disable PMRs or DMA may fail on 2612 * this device 2613 */ 2614 if (force_on) 2615 iommu_disable_protect_mem_regions(iommu); 2616 continue; 2617 } 2618 2619 iommu_flush_write_buffer(iommu); 2620 iommu_set_root_entry(iommu); 2621 iommu_enable_translation(iommu); 2622 iommu_disable_protect_mem_regions(iommu); 2623 } 2624 2625 return 0; 2626 } 2627 2628 static void iommu_flush_all(void) 2629 { 2630 struct dmar_drhd_unit *drhd; 2631 struct intel_iommu *iommu; 2632 2633 for_each_active_iommu(iommu, drhd) { 2634 iommu->flush.flush_context(iommu, 0, 0, 0, 2635 DMA_CCMD_GLOBAL_INVL); 2636 iommu->flush.flush_iotlb(iommu, 0, 0, 0, 2637 DMA_TLB_GLOBAL_FLUSH); 2638 } 2639 } 2640 2641 static int iommu_suspend(void) 2642 { 2643 struct dmar_drhd_unit *drhd; 2644 struct intel_iommu *iommu = NULL; 2645 unsigned long flag; 2646 2647 iommu_flush_all(); 2648 2649 for_each_active_iommu(iommu, drhd) { 2650 iommu_disable_translation(iommu); 2651 2652 raw_spin_lock_irqsave(&iommu->register_lock, flag); 2653 2654 iommu->iommu_state[SR_DMAR_FECTL_REG] = 2655 readl(iommu->reg + DMAR_FECTL_REG); 2656 iommu->iommu_state[SR_DMAR_FEDATA_REG] = 2657 readl(iommu->reg + DMAR_FEDATA_REG); 2658 iommu->iommu_state[SR_DMAR_FEADDR_REG] = 2659 
readl(iommu->reg + DMAR_FEADDR_REG); 2660 iommu->iommu_state[SR_DMAR_FEUADDR_REG] = 2661 readl(iommu->reg + DMAR_FEUADDR_REG); 2662 2663 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 2664 } 2665 return 0; 2666 } 2667 2668 static void iommu_resume(void) 2669 { 2670 struct dmar_drhd_unit *drhd; 2671 struct intel_iommu *iommu = NULL; 2672 unsigned long flag; 2673 2674 if (init_iommu_hw()) { 2675 if (force_on) 2676 panic("tboot: IOMMU setup failed, DMAR can not resume!\n"); 2677 else 2678 WARN(1, "IOMMU setup failed, DMAR can not resume!\n"); 2679 return; 2680 } 2681 2682 for_each_active_iommu(iommu, drhd) { 2683 2684 raw_spin_lock_irqsave(&iommu->register_lock, flag); 2685 2686 writel(iommu->iommu_state[SR_DMAR_FECTL_REG], 2687 iommu->reg + DMAR_FECTL_REG); 2688 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG], 2689 iommu->reg + DMAR_FEDATA_REG); 2690 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG], 2691 iommu->reg + DMAR_FEADDR_REG); 2692 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG], 2693 iommu->reg + DMAR_FEUADDR_REG); 2694 2695 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 2696 } 2697 } 2698 2699 static struct syscore_ops iommu_syscore_ops = { 2700 .resume = iommu_resume, 2701 .suspend = iommu_suspend, 2702 }; 2703 2704 static void __init init_iommu_pm_ops(void) 2705 { 2706 register_syscore_ops(&iommu_syscore_ops); 2707 } 2708 2709 #else 2710 static inline void init_iommu_pm_ops(void) {} 2711 #endif /* CONFIG_PM */ 2712 2713 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr) 2714 { 2715 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) || 2716 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) || 2717 rmrr->end_address <= rmrr->base_address || 2718 arch_rmrr_sanity_check(rmrr)) 2719 return -EINVAL; 2720 2721 return 0; 2722 } 2723 2724 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg) 2725 { 2726 struct acpi_dmar_reserved_memory *rmrr; 2727 struct dmar_rmrr_unit *rmrru; 2728 2729 rmrr = (struct 
acpi_dmar_reserved_memory *)header; 2730 if (rmrr_sanity_check(rmrr)) { 2731 pr_warn(FW_BUG 2732 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n" 2733 "BIOS vendor: %s; Ver: %s; Product Version: %s\n", 2734 rmrr->base_address, rmrr->end_address, 2735 dmi_get_system_info(DMI_BIOS_VENDOR), 2736 dmi_get_system_info(DMI_BIOS_VERSION), 2737 dmi_get_system_info(DMI_PRODUCT_VERSION)); 2738 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); 2739 } 2740 2741 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL); 2742 if (!rmrru) 2743 goto out; 2744 2745 rmrru->hdr = header; 2746 2747 rmrru->base_address = rmrr->base_address; 2748 rmrru->end_address = rmrr->end_address; 2749 2750 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1), 2751 ((void *)rmrr) + rmrr->header.length, 2752 &rmrru->devices_cnt); 2753 if (rmrru->devices_cnt && rmrru->devices == NULL) 2754 goto free_rmrru; 2755 2756 list_add(&rmrru->list, &dmar_rmrr_units); 2757 2758 return 0; 2759 free_rmrru: 2760 kfree(rmrru); 2761 out: 2762 return -ENOMEM; 2763 } 2764 2765 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr) 2766 { 2767 struct dmar_atsr_unit *atsru; 2768 struct acpi_dmar_atsr *tmp; 2769 2770 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list, 2771 dmar_rcu_check()) { 2772 tmp = (struct acpi_dmar_atsr *)atsru->hdr; 2773 if (atsr->segment != tmp->segment) 2774 continue; 2775 if (atsr->header.length != tmp->header.length) 2776 continue; 2777 if (memcmp(atsr, tmp, atsr->header.length) == 0) 2778 return atsru; 2779 } 2780 2781 return NULL; 2782 } 2783 2784 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg) 2785 { 2786 struct acpi_dmar_atsr *atsr; 2787 struct dmar_atsr_unit *atsru; 2788 2789 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled) 2790 return 0; 2791 2792 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 2793 atsru = dmar_find_atsr(atsr); 2794 if (atsru) 2795 return 0; 2796 2797 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL); 2798 
if (!atsru) 2799 return -ENOMEM; 2800 2801 /* 2802 * If memory is allocated from slab by ACPI _DSM method, we need to 2803 * copy the memory content because the memory buffer will be freed 2804 * on return. 2805 */ 2806 atsru->hdr = (void *)(atsru + 1); 2807 memcpy(atsru->hdr, hdr, hdr->length); 2808 atsru->include_all = atsr->flags & 0x1; 2809 if (!atsru->include_all) { 2810 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1), 2811 (void *)atsr + atsr->header.length, 2812 &atsru->devices_cnt); 2813 if (atsru->devices_cnt && atsru->devices == NULL) { 2814 kfree(atsru); 2815 return -ENOMEM; 2816 } 2817 } 2818 2819 list_add_rcu(&atsru->list, &dmar_atsr_units); 2820 2821 return 0; 2822 } 2823 2824 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru) 2825 { 2826 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt); 2827 kfree(atsru); 2828 } 2829 2830 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg) 2831 { 2832 struct acpi_dmar_atsr *atsr; 2833 struct dmar_atsr_unit *atsru; 2834 2835 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 2836 atsru = dmar_find_atsr(atsr); 2837 if (atsru) { 2838 list_del_rcu(&atsru->list); 2839 synchronize_rcu(); 2840 intel_iommu_free_atsr(atsru); 2841 } 2842 2843 return 0; 2844 } 2845 2846 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg) 2847 { 2848 int i; 2849 struct device *dev; 2850 struct acpi_dmar_atsr *atsr; 2851 struct dmar_atsr_unit *atsru; 2852 2853 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 2854 atsru = dmar_find_atsr(atsr); 2855 if (!atsru) 2856 return 0; 2857 2858 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) { 2859 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt, 2860 i, dev) 2861 return -EBUSY; 2862 } 2863 2864 return 0; 2865 } 2866 2867 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc) 2868 { 2869 struct dmar_satc_unit *satcu; 2870 struct acpi_dmar_satc *tmp; 2871 2872 
list_for_each_entry_rcu(satcu, &dmar_satc_units, list, 2873 dmar_rcu_check()) { 2874 tmp = (struct acpi_dmar_satc *)satcu->hdr; 2875 if (satc->segment != tmp->segment) 2876 continue; 2877 if (satc->header.length != tmp->header.length) 2878 continue; 2879 if (memcmp(satc, tmp, satc->header.length) == 0) 2880 return satcu; 2881 } 2882 2883 return NULL; 2884 } 2885 2886 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg) 2887 { 2888 struct acpi_dmar_satc *satc; 2889 struct dmar_satc_unit *satcu; 2890 2891 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled) 2892 return 0; 2893 2894 satc = container_of(hdr, struct acpi_dmar_satc, header); 2895 satcu = dmar_find_satc(satc); 2896 if (satcu) 2897 return 0; 2898 2899 satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL); 2900 if (!satcu) 2901 return -ENOMEM; 2902 2903 satcu->hdr = (void *)(satcu + 1); 2904 memcpy(satcu->hdr, hdr, hdr->length); 2905 satcu->atc_required = satc->flags & 0x1; 2906 satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1), 2907 (void *)satc + satc->header.length, 2908 &satcu->devices_cnt); 2909 if (satcu->devices_cnt && !satcu->devices) { 2910 kfree(satcu); 2911 return -ENOMEM; 2912 } 2913 list_add_rcu(&satcu->list, &dmar_satc_units); 2914 2915 return 0; 2916 } 2917 2918 static int intel_iommu_add(struct dmar_drhd_unit *dmaru) 2919 { 2920 int sp, ret; 2921 struct intel_iommu *iommu = dmaru->iommu; 2922 2923 ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu); 2924 if (ret) 2925 goto out; 2926 2927 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) { 2928 pr_warn("%s: Doesn't support hardware pass through.\n", 2929 iommu->name); 2930 return -ENXIO; 2931 } 2932 2933 sp = domain_update_iommu_superpage(NULL, iommu) - 1; 2934 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) { 2935 pr_warn("%s: Doesn't support large page.\n", 2936 iommu->name); 2937 return -ENXIO; 2938 } 2939 2940 /* 2941 * Disable translation if already enabled prior to OS handover. 
2942 */ 2943 if (iommu->gcmd & DMA_GCMD_TE) 2944 iommu_disable_translation(iommu); 2945 2946 ret = iommu_init_domains(iommu); 2947 if (ret == 0) 2948 ret = iommu_alloc_root_entry(iommu); 2949 if (ret) 2950 goto out; 2951 2952 intel_svm_check(iommu); 2953 2954 if (dmaru->ignored) { 2955 /* 2956 * we always have to disable PMRs or DMA may fail on this device 2957 */ 2958 if (force_on) 2959 iommu_disable_protect_mem_regions(iommu); 2960 return 0; 2961 } 2962 2963 intel_iommu_init_qi(iommu); 2964 iommu_flush_write_buffer(iommu); 2965 2966 #ifdef CONFIG_INTEL_IOMMU_SVM 2967 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) { 2968 ret = intel_svm_enable_prq(iommu); 2969 if (ret) 2970 goto disable_iommu; 2971 } 2972 #endif 2973 ret = dmar_set_interrupt(iommu); 2974 if (ret) 2975 goto disable_iommu; 2976 2977 iommu_set_root_entry(iommu); 2978 iommu_enable_translation(iommu); 2979 2980 iommu_disable_protect_mem_regions(iommu); 2981 return 0; 2982 2983 disable_iommu: 2984 disable_dmar_iommu(iommu); 2985 out: 2986 free_dmar_iommu(iommu); 2987 return ret; 2988 } 2989 2990 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert) 2991 { 2992 int ret = 0; 2993 struct intel_iommu *iommu = dmaru->iommu; 2994 2995 if (!intel_iommu_enabled) 2996 return 0; 2997 if (iommu == NULL) 2998 return -EINVAL; 2999 3000 if (insert) { 3001 ret = intel_iommu_add(dmaru); 3002 } else { 3003 disable_dmar_iommu(iommu); 3004 free_dmar_iommu(iommu); 3005 } 3006 3007 return ret; 3008 } 3009 3010 static void intel_iommu_free_dmars(void) 3011 { 3012 struct dmar_rmrr_unit *rmrru, *rmrr_n; 3013 struct dmar_atsr_unit *atsru, *atsr_n; 3014 struct dmar_satc_unit *satcu, *satc_n; 3015 3016 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) { 3017 list_del(&rmrru->list); 3018 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt); 3019 kfree(rmrru); 3020 } 3021 3022 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) { 3023 list_del(&atsru->list); 3024 
intel_iommu_free_atsr(atsru); 3025 } 3026 list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) { 3027 list_del(&satcu->list); 3028 dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt); 3029 kfree(satcu); 3030 } 3031 } 3032 3033 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev) 3034 { 3035 struct dmar_satc_unit *satcu; 3036 struct acpi_dmar_satc *satc; 3037 struct device *tmp; 3038 int i; 3039 3040 dev = pci_physfn(dev); 3041 rcu_read_lock(); 3042 3043 list_for_each_entry_rcu(satcu, &dmar_satc_units, list) { 3044 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header); 3045 if (satc->segment != pci_domain_nr(dev->bus)) 3046 continue; 3047 for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp) 3048 if (to_pci_dev(tmp) == dev) 3049 goto out; 3050 } 3051 satcu = NULL; 3052 out: 3053 rcu_read_unlock(); 3054 return satcu; 3055 } 3056 3057 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu) 3058 { 3059 int i, ret = 1; 3060 struct pci_bus *bus; 3061 struct pci_dev *bridge = NULL; 3062 struct device *tmp; 3063 struct acpi_dmar_atsr *atsr; 3064 struct dmar_atsr_unit *atsru; 3065 struct dmar_satc_unit *satcu; 3066 3067 dev = pci_physfn(dev); 3068 satcu = dmar_find_matched_satc_unit(dev); 3069 if (satcu) 3070 /* 3071 * This device supports ATS as it is in SATC table. 3072 * When IOMMU is in legacy mode, enabling ATS is done 3073 * automatically by HW for the device that requires 3074 * ATS, hence OS should not enable this device ATS 3075 * to avoid duplicated TLB invalidation. 
3076 */ 3077 return !(satcu->atc_required && !sm_supported(iommu)); 3078 3079 for (bus = dev->bus; bus; bus = bus->parent) { 3080 bridge = bus->self; 3081 /* If it's an integrated device, allow ATS */ 3082 if (!bridge) 3083 return 1; 3084 /* Connected via non-PCIe: no ATS */ 3085 if (!pci_is_pcie(bridge) || 3086 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE) 3087 return 0; 3088 /* If we found the root port, look it up in the ATSR */ 3089 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT) 3090 break; 3091 } 3092 3093 rcu_read_lock(); 3094 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) { 3095 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header); 3096 if (atsr->segment != pci_domain_nr(dev->bus)) 3097 continue; 3098 3099 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp) 3100 if (tmp == &bridge->dev) 3101 goto out; 3102 3103 if (atsru->include_all) 3104 goto out; 3105 } 3106 ret = 0; 3107 out: 3108 rcu_read_unlock(); 3109 3110 return ret; 3111 } 3112 3113 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info) 3114 { 3115 int ret; 3116 struct dmar_rmrr_unit *rmrru; 3117 struct dmar_atsr_unit *atsru; 3118 struct dmar_satc_unit *satcu; 3119 struct acpi_dmar_atsr *atsr; 3120 struct acpi_dmar_reserved_memory *rmrr; 3121 struct acpi_dmar_satc *satc; 3122 3123 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING) 3124 return 0; 3125 3126 list_for_each_entry(rmrru, &dmar_rmrr_units, list) { 3127 rmrr = container_of(rmrru->hdr, 3128 struct acpi_dmar_reserved_memory, header); 3129 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 3130 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1), 3131 ((void *)rmrr) + rmrr->header.length, 3132 rmrr->segment, rmrru->devices, 3133 rmrru->devices_cnt); 3134 if (ret < 0) 3135 return ret; 3136 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 3137 dmar_remove_dev_scope(info, rmrr->segment, 3138 rmrru->devices, rmrru->devices_cnt); 3139 } 3140 } 3141 3142 list_for_each_entry(atsru, 
&dmar_atsr_units, list) { 3143 if (atsru->include_all) 3144 continue; 3145 3146 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header); 3147 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 3148 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1), 3149 (void *)atsr + atsr->header.length, 3150 atsr->segment, atsru->devices, 3151 atsru->devices_cnt); 3152 if (ret > 0) 3153 break; 3154 else if (ret < 0) 3155 return ret; 3156 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 3157 if (dmar_remove_dev_scope(info, atsr->segment, 3158 atsru->devices, atsru->devices_cnt)) 3159 break; 3160 } 3161 } 3162 list_for_each_entry(satcu, &dmar_satc_units, list) { 3163 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header); 3164 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 3165 ret = dmar_insert_dev_scope(info, (void *)(satc + 1), 3166 (void *)satc + satc->header.length, 3167 satc->segment, satcu->devices, 3168 satcu->devices_cnt); 3169 if (ret > 0) 3170 break; 3171 else if (ret < 0) 3172 return ret; 3173 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 3174 if (dmar_remove_dev_scope(info, satc->segment, 3175 satcu->devices, satcu->devices_cnt)) 3176 break; 3177 } 3178 } 3179 3180 return 0; 3181 } 3182 3183 static int intel_iommu_memory_notifier(struct notifier_block *nb, 3184 unsigned long val, void *v) 3185 { 3186 struct memory_notify *mhp = v; 3187 unsigned long start_vpfn = mm_to_dma_pfn_start(mhp->start_pfn); 3188 unsigned long last_vpfn = mm_to_dma_pfn_end(mhp->start_pfn + 3189 mhp->nr_pages - 1); 3190 3191 switch (val) { 3192 case MEM_GOING_ONLINE: 3193 if (iommu_domain_identity_map(si_domain, 3194 start_vpfn, last_vpfn)) { 3195 pr_warn("Failed to build identity map for [%lx-%lx]\n", 3196 start_vpfn, last_vpfn); 3197 return NOTIFY_BAD; 3198 } 3199 break; 3200 3201 case MEM_OFFLINE: 3202 case MEM_CANCEL_ONLINE: 3203 { 3204 LIST_HEAD(freelist); 3205 3206 domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist); 3207 iommu_put_pages_list(&freelist); 3208 } 
3209 break; 3210 } 3211 3212 return NOTIFY_OK; 3213 } 3214 3215 static struct notifier_block intel_iommu_memory_nb = { 3216 .notifier_call = intel_iommu_memory_notifier, 3217 .priority = 0 3218 }; 3219 3220 static void intel_disable_iommus(void) 3221 { 3222 struct intel_iommu *iommu = NULL; 3223 struct dmar_drhd_unit *drhd; 3224 3225 for_each_iommu(iommu, drhd) 3226 iommu_disable_translation(iommu); 3227 } 3228 3229 void intel_iommu_shutdown(void) 3230 { 3231 struct dmar_drhd_unit *drhd; 3232 struct intel_iommu *iommu = NULL; 3233 3234 if (no_iommu || dmar_disabled) 3235 return; 3236 3237 down_write(&dmar_global_lock); 3238 3239 /* Disable PMRs explicitly here. */ 3240 for_each_iommu(iommu, drhd) 3241 iommu_disable_protect_mem_regions(iommu); 3242 3243 /* Make sure the IOMMUs are switched off */ 3244 intel_disable_iommus(); 3245 3246 up_write(&dmar_global_lock); 3247 } 3248 3249 static struct intel_iommu *dev_to_intel_iommu(struct device *dev) 3250 { 3251 struct iommu_device *iommu_dev = dev_to_iommu_device(dev); 3252 3253 return container_of(iommu_dev, struct intel_iommu, iommu); 3254 } 3255 3256 static ssize_t version_show(struct device *dev, 3257 struct device_attribute *attr, char *buf) 3258 { 3259 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3260 u32 ver = readl(iommu->reg + DMAR_VER_REG); 3261 return sysfs_emit(buf, "%d:%d\n", 3262 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver)); 3263 } 3264 static DEVICE_ATTR_RO(version); 3265 3266 static ssize_t address_show(struct device *dev, 3267 struct device_attribute *attr, char *buf) 3268 { 3269 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3270 return sysfs_emit(buf, "%llx\n", iommu->reg_phys); 3271 } 3272 static DEVICE_ATTR_RO(address); 3273 3274 static ssize_t cap_show(struct device *dev, 3275 struct device_attribute *attr, char *buf) 3276 { 3277 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3278 return sysfs_emit(buf, "%llx\n", iommu->cap); 3279 } 3280 static DEVICE_ATTR_RO(cap); 3281 3282 static 
ssize_t ecap_show(struct device *dev, 3283 struct device_attribute *attr, char *buf) 3284 { 3285 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3286 return sysfs_emit(buf, "%llx\n", iommu->ecap); 3287 } 3288 static DEVICE_ATTR_RO(ecap); 3289 3290 static ssize_t domains_supported_show(struct device *dev, 3291 struct device_attribute *attr, char *buf) 3292 { 3293 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3294 return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap)); 3295 } 3296 static DEVICE_ATTR_RO(domains_supported); 3297 3298 static ssize_t domains_used_show(struct device *dev, 3299 struct device_attribute *attr, char *buf) 3300 { 3301 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3302 return sysfs_emit(buf, "%d\n", 3303 bitmap_weight(iommu->domain_ids, 3304 cap_ndoms(iommu->cap))); 3305 } 3306 static DEVICE_ATTR_RO(domains_used); 3307 3308 static struct attribute *intel_iommu_attrs[] = { 3309 &dev_attr_version.attr, 3310 &dev_attr_address.attr, 3311 &dev_attr_cap.attr, 3312 &dev_attr_ecap.attr, 3313 &dev_attr_domains_supported.attr, 3314 &dev_attr_domains_used.attr, 3315 NULL, 3316 }; 3317 3318 static struct attribute_group intel_iommu_group = { 3319 .name = "intel-iommu", 3320 .attrs = intel_iommu_attrs, 3321 }; 3322 3323 const struct attribute_group *intel_iommu_groups[] = { 3324 &intel_iommu_group, 3325 NULL, 3326 }; 3327 3328 static bool has_external_pci(void) 3329 { 3330 struct pci_dev *pdev = NULL; 3331 3332 for_each_pci_dev(pdev) 3333 if (pdev->external_facing) { 3334 pci_dev_put(pdev); 3335 return true; 3336 } 3337 3338 return false; 3339 } 3340 3341 static int __init platform_optin_force_iommu(void) 3342 { 3343 if (!dmar_platform_optin() || no_platform_optin || !has_external_pci()) 3344 return 0; 3345 3346 if (no_iommu || dmar_disabled) 3347 pr_info("Intel-IOMMU force enabled due to platform opt in\n"); 3348 3349 /* 3350 * If Intel-IOMMU is disabled by default, we will apply identity 3351 * map for all devices except those marked as 
being untrusted. 3352 */ 3353 if (dmar_disabled) 3354 iommu_set_default_passthrough(false); 3355 3356 dmar_disabled = 0; 3357 no_iommu = 0; 3358 3359 return 1; 3360 } 3361 3362 static int __init probe_acpi_namespace_devices(void) 3363 { 3364 struct dmar_drhd_unit *drhd; 3365 /* To avoid a -Wunused-but-set-variable warning. */ 3366 struct intel_iommu *iommu __maybe_unused; 3367 struct device *dev; 3368 int i, ret = 0; 3369 3370 for_each_active_iommu(iommu, drhd) { 3371 for_each_active_dev_scope(drhd->devices, 3372 drhd->devices_cnt, i, dev) { 3373 struct acpi_device_physical_node *pn; 3374 struct acpi_device *adev; 3375 3376 if (dev->bus != &acpi_bus_type) 3377 continue; 3378 3379 adev = to_acpi_device(dev); 3380 mutex_lock(&adev->physical_node_lock); 3381 list_for_each_entry(pn, 3382 &adev->physical_node_list, node) { 3383 ret = iommu_probe_device(pn->dev); 3384 if (ret) 3385 break; 3386 } 3387 mutex_unlock(&adev->physical_node_lock); 3388 3389 if (ret) 3390 return ret; 3391 } 3392 } 3393 3394 return 0; 3395 } 3396 3397 static __init int tboot_force_iommu(void) 3398 { 3399 if (!tboot_enabled()) 3400 return 0; 3401 3402 if (no_iommu || dmar_disabled) 3403 pr_warn("Forcing Intel-IOMMU to enabled\n"); 3404 3405 dmar_disabled = 0; 3406 no_iommu = 0; 3407 3408 return 1; 3409 } 3410 3411 int __init intel_iommu_init(void) 3412 { 3413 int ret = -ENODEV; 3414 struct dmar_drhd_unit *drhd; 3415 struct intel_iommu *iommu; 3416 3417 /* 3418 * Intel IOMMU is required for a TXT/tboot launch or platform 3419 * opt in, so enforce that. 
3420 */ 3421 force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) || 3422 platform_optin_force_iommu(); 3423 3424 down_write(&dmar_global_lock); 3425 if (dmar_table_init()) { 3426 if (force_on) 3427 panic("tboot: Failed to initialize DMAR table\n"); 3428 goto out_free_dmar; 3429 } 3430 3431 if (dmar_dev_scope_init() < 0) { 3432 if (force_on) 3433 panic("tboot: Failed to initialize DMAR device scope\n"); 3434 goto out_free_dmar; 3435 } 3436 3437 up_write(&dmar_global_lock); 3438 3439 /* 3440 * The bus notifier takes the dmar_global_lock, so lockdep will 3441 * complain later when we register it under the lock. 3442 */ 3443 dmar_register_bus_notifier(); 3444 3445 down_write(&dmar_global_lock); 3446 3447 if (!no_iommu) 3448 intel_iommu_debugfs_init(); 3449 3450 if (no_iommu || dmar_disabled) { 3451 /* 3452 * We exit the function here to ensure IOMMU's remapping and 3453 * mempool aren't setup, which means that the IOMMU's PMRs 3454 * won't be disabled via the call to init_dmars(). So disable 3455 * it explicitly here. The PMRs were setup by tboot prior to 3456 * calling SENTER, but the kernel is expected to reset/tear 3457 * down the PMRs. 
3458 */ 3459 if (intel_iommu_tboot_noforce) { 3460 for_each_iommu(iommu, drhd) 3461 iommu_disable_protect_mem_regions(iommu); 3462 } 3463 3464 /* 3465 * Make sure the IOMMUs are switched off, even when we 3466 * boot into a kexec kernel and the previous kernel left 3467 * them enabled 3468 */ 3469 intel_disable_iommus(); 3470 goto out_free_dmar; 3471 } 3472 3473 if (list_empty(&dmar_rmrr_units)) 3474 pr_info("No RMRR found\n"); 3475 3476 if (list_empty(&dmar_atsr_units)) 3477 pr_info("No ATSR found\n"); 3478 3479 if (list_empty(&dmar_satc_units)) 3480 pr_info("No SATC found\n"); 3481 3482 init_no_remapping_devices(); 3483 3484 ret = init_dmars(); 3485 if (ret) { 3486 if (force_on) 3487 panic("tboot: Failed to initialize DMARs\n"); 3488 pr_err("Initialization failed\n"); 3489 goto out_free_dmar; 3490 } 3491 up_write(&dmar_global_lock); 3492 3493 init_iommu_pm_ops(); 3494 3495 down_read(&dmar_global_lock); 3496 for_each_active_iommu(iommu, drhd) { 3497 /* 3498 * The flush queue implementation does not perform 3499 * page-selective invalidations that are required for efficient 3500 * TLB flushes in virtual environments. The benefit of batching 3501 * is likely to be much lower than the overhead of synchronizing 3502 * the virtual and physical IOMMU page-tables. 
3503 */ 3504 if (cap_caching_mode(iommu->cap) && 3505 !first_level_by_default(IOMMU_DOMAIN_DMA)) { 3506 pr_info_once("IOMMU batching disallowed due to virtualization\n"); 3507 iommu_set_dma_strict(); 3508 } 3509 iommu_device_sysfs_add(&iommu->iommu, NULL, 3510 intel_iommu_groups, 3511 "%s", iommu->name); 3512 iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL); 3513 3514 iommu_pmu_register(iommu); 3515 } 3516 up_read(&dmar_global_lock); 3517 3518 if (si_domain && !hw_pass_through) 3519 register_memory_notifier(&intel_iommu_memory_nb); 3520 3521 down_read(&dmar_global_lock); 3522 if (probe_acpi_namespace_devices()) 3523 pr_warn("ACPI name space devices didn't probe correctly\n"); 3524 3525 /* Finally, we enable the DMA remapping hardware. */ 3526 for_each_iommu(iommu, drhd) { 3527 if (!drhd->ignored && !translation_pre_enabled(iommu)) 3528 iommu_enable_translation(iommu); 3529 3530 iommu_disable_protect_mem_regions(iommu); 3531 } 3532 up_read(&dmar_global_lock); 3533 3534 pr_info("Intel(R) Virtualization Technology for Directed I/O\n"); 3535 3536 intel_iommu_enabled = 1; 3537 3538 return 0; 3539 3540 out_free_dmar: 3541 intel_iommu_free_dmars(); 3542 up_write(&dmar_global_lock); 3543 return ret; 3544 } 3545 3546 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque) 3547 { 3548 struct device_domain_info *info = opaque; 3549 3550 domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff); 3551 return 0; 3552 } 3553 3554 /* 3555 * NB - intel-iommu lacks any sort of reference counting for the users of 3556 * dependent devices. If multiple endpoints have intersecting dependent 3557 * devices, unbinding the driver from any one of them will possibly leave 3558 * the others unable to operate. 
3559 */ 3560 static void domain_context_clear(struct device_domain_info *info) 3561 { 3562 if (!dev_is_pci(info->dev)) 3563 domain_context_clear_one(info, info->bus, info->devfn); 3564 3565 pci_for_each_dma_alias(to_pci_dev(info->dev), 3566 &domain_context_clear_one_cb, info); 3567 } 3568 3569 /* 3570 * Clear the page table pointer in context or pasid table entries so that 3571 * all DMA requests without PASID from the device are blocked. If the page 3572 * table has been set, clean up the data structures. 3573 */ 3574 void device_block_translation(struct device *dev) 3575 { 3576 struct device_domain_info *info = dev_iommu_priv_get(dev); 3577 struct intel_iommu *iommu = info->iommu; 3578 unsigned long flags; 3579 3580 iommu_disable_pci_caps(info); 3581 if (!dev_is_real_dma_subdevice(dev)) { 3582 if (sm_supported(iommu)) 3583 intel_pasid_tear_down_entry(iommu, dev, 3584 IOMMU_NO_PASID, false); 3585 else 3586 domain_context_clear(info); 3587 } 3588 3589 if (!info->domain) 3590 return; 3591 3592 spin_lock_irqsave(&info->domain->lock, flags); 3593 list_del(&info->link); 3594 spin_unlock_irqrestore(&info->domain->lock, flags); 3595 3596 cache_tag_unassign_domain(info->domain, dev, IOMMU_NO_PASID); 3597 domain_detach_iommu(info->domain, iommu); 3598 info->domain = NULL; 3599 } 3600 3601 static int md_domain_init(struct dmar_domain *domain, int guest_width) 3602 { 3603 int adjust_width; 3604 3605 /* calculate AGAW */ 3606 domain->gaw = guest_width; 3607 adjust_width = guestwidth_to_adjustwidth(guest_width); 3608 domain->agaw = width_to_agaw(adjust_width); 3609 3610 domain->iommu_coherency = false; 3611 domain->iommu_superpage = 0; 3612 domain->max_addr = 0; 3613 3614 /* always allocate the top pgd */ 3615 domain->pgd = iommu_alloc_page_node(domain->nid, GFP_ATOMIC); 3616 if (!domain->pgd) 3617 return -ENOMEM; 3618 domain_flush_cache(domain, domain->pgd, PAGE_SIZE); 3619 return 0; 3620 } 3621 3622 static int blocking_domain_attach_dev(struct iommu_domain *domain, 3623 
struct device *dev) 3624 { 3625 device_block_translation(dev); 3626 return 0; 3627 } 3628 3629 static struct iommu_domain blocking_domain = { 3630 .type = IOMMU_DOMAIN_BLOCKED, 3631 .ops = &(const struct iommu_domain_ops) { 3632 .attach_dev = blocking_domain_attach_dev, 3633 } 3634 }; 3635 3636 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type) 3637 { 3638 struct dmar_domain *dmar_domain; 3639 struct iommu_domain *domain; 3640 3641 switch (type) { 3642 case IOMMU_DOMAIN_DMA: 3643 case IOMMU_DOMAIN_UNMANAGED: 3644 dmar_domain = alloc_domain(type); 3645 if (!dmar_domain) { 3646 pr_err("Can't allocate dmar_domain\n"); 3647 return NULL; 3648 } 3649 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { 3650 pr_err("Domain initialization failed\n"); 3651 domain_exit(dmar_domain); 3652 return NULL; 3653 } 3654 3655 domain = &dmar_domain->domain; 3656 domain->geometry.aperture_start = 0; 3657 domain->geometry.aperture_end = 3658 __DOMAIN_MAX_ADDR(dmar_domain->gaw); 3659 domain->geometry.force_aperture = true; 3660 3661 return domain; 3662 case IOMMU_DOMAIN_IDENTITY: 3663 return &si_domain->domain; 3664 default: 3665 return NULL; 3666 } 3667 3668 return NULL; 3669 } 3670 3671 static struct iommu_domain * 3672 intel_iommu_domain_alloc_user(struct device *dev, u32 flags, 3673 struct iommu_domain *parent, 3674 const struct iommu_user_data *user_data) 3675 { 3676 struct device_domain_info *info = dev_iommu_priv_get(dev); 3677 bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; 3678 bool nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT; 3679 struct intel_iommu *iommu = info->iommu; 3680 struct dmar_domain *dmar_domain; 3681 struct iommu_domain *domain; 3682 3683 /* Must be NESTING domain */ 3684 if (parent) { 3685 if (!nested_supported(iommu) || flags) 3686 return ERR_PTR(-EOPNOTSUPP); 3687 return intel_nested_domain_alloc(parent, user_data); 3688 } 3689 3690 if (flags & 3691 (~(IOMMU_HWPT_ALLOC_NEST_PARENT | 
IOMMU_HWPT_ALLOC_DIRTY_TRACKING))) 3692 return ERR_PTR(-EOPNOTSUPP); 3693 if (nested_parent && !nested_supported(iommu)) 3694 return ERR_PTR(-EOPNOTSUPP); 3695 if (user_data || (dirty_tracking && !ssads_supported(iommu))) 3696 return ERR_PTR(-EOPNOTSUPP); 3697 3698 /* 3699 * domain_alloc_user op needs to fully initialize a domain before 3700 * return, so uses iommu_domain_alloc() here for simple. 3701 */ 3702 domain = iommu_domain_alloc(dev->bus); 3703 if (!domain) 3704 return ERR_PTR(-ENOMEM); 3705 3706 dmar_domain = to_dmar_domain(domain); 3707 3708 if (nested_parent) { 3709 dmar_domain->nested_parent = true; 3710 INIT_LIST_HEAD(&dmar_domain->s1_domains); 3711 spin_lock_init(&dmar_domain->s1_lock); 3712 } 3713 3714 if (dirty_tracking) { 3715 if (dmar_domain->use_first_level) { 3716 iommu_domain_free(domain); 3717 return ERR_PTR(-EOPNOTSUPP); 3718 } 3719 domain->dirty_ops = &intel_dirty_ops; 3720 } 3721 3722 return domain; 3723 } 3724 3725 static void intel_iommu_domain_free(struct iommu_domain *domain) 3726 { 3727 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 3728 3729 WARN_ON(dmar_domain->nested_parent && 3730 !list_empty(&dmar_domain->s1_domains)); 3731 if (domain != &si_domain->domain) 3732 domain_exit(dmar_domain); 3733 } 3734 3735 int prepare_domain_attach_device(struct iommu_domain *domain, 3736 struct device *dev) 3737 { 3738 struct device_domain_info *info = dev_iommu_priv_get(dev); 3739 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 3740 struct intel_iommu *iommu = info->iommu; 3741 int addr_width; 3742 3743 if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap)) 3744 return -EINVAL; 3745 3746 if (domain->dirty_ops && !ssads_supported(iommu)) 3747 return -EINVAL; 3748 3749 /* check if this iommu agaw is sufficient for max mapped address */ 3750 addr_width = agaw_to_width(iommu->agaw); 3751 if (addr_width > cap_mgaw(iommu->cap)) 3752 addr_width = cap_mgaw(iommu->cap); 3753 3754 if (dmar_domain->max_addr > (1LL << 
addr_width)) 3755 return -EINVAL; 3756 dmar_domain->gaw = addr_width; 3757 3758 /* 3759 * Knock out extra levels of page tables if necessary 3760 */ 3761 while (iommu->agaw < dmar_domain->agaw) { 3762 struct dma_pte *pte; 3763 3764 pte = dmar_domain->pgd; 3765 if (dma_pte_present(pte)) { 3766 dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte)); 3767 iommu_free_page(pte); 3768 } 3769 dmar_domain->agaw--; 3770 } 3771 3772 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) && 3773 context_copied(iommu, info->bus, info->devfn)) 3774 return intel_pasid_setup_sm_context(dev); 3775 3776 return 0; 3777 } 3778 3779 static int intel_iommu_attach_device(struct iommu_domain *domain, 3780 struct device *dev) 3781 { 3782 struct device_domain_info *info = dev_iommu_priv_get(dev); 3783 int ret; 3784 3785 if (info->domain) 3786 device_block_translation(dev); 3787 3788 ret = prepare_domain_attach_device(domain, dev); 3789 if (ret) 3790 return ret; 3791 3792 return dmar_domain_attach_device(to_dmar_domain(domain), dev); 3793 } 3794 3795 static int intel_iommu_map(struct iommu_domain *domain, 3796 unsigned long iova, phys_addr_t hpa, 3797 size_t size, int iommu_prot, gfp_t gfp) 3798 { 3799 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 3800 u64 max_addr; 3801 int prot = 0; 3802 3803 if (iommu_prot & IOMMU_READ) 3804 prot |= DMA_PTE_READ; 3805 if (iommu_prot & IOMMU_WRITE) 3806 prot |= DMA_PTE_WRITE; 3807 if (dmar_domain->set_pte_snp) 3808 prot |= DMA_PTE_SNP; 3809 3810 max_addr = iova + size; 3811 if (dmar_domain->max_addr < max_addr) { 3812 u64 end; 3813 3814 /* check if minimum agaw is sufficient for mapped address */ 3815 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1; 3816 if (end < max_addr) { 3817 pr_err("%s: iommu width (%d) is not " 3818 "sufficient for the mapped address (%llx)\n", 3819 __func__, dmar_domain->gaw, max_addr); 3820 return -EFAULT; 3821 } 3822 dmar_domain->max_addr = max_addr; 3823 } 3824 /* Round up size to next multiple of PAGE_SIZE, if it and 
3825 the low bits of hpa would take us onto the next page */ 3826 size = aligned_nrpages(hpa, size); 3827 return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT, 3828 hpa >> VTD_PAGE_SHIFT, size, prot, gfp); 3829 } 3830 3831 static int intel_iommu_map_pages(struct iommu_domain *domain, 3832 unsigned long iova, phys_addr_t paddr, 3833 size_t pgsize, size_t pgcount, 3834 int prot, gfp_t gfp, size_t *mapped) 3835 { 3836 unsigned long pgshift = __ffs(pgsize); 3837 size_t size = pgcount << pgshift; 3838 int ret; 3839 3840 if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G) 3841 return -EINVAL; 3842 3843 if (!IS_ALIGNED(iova | paddr, pgsize)) 3844 return -EINVAL; 3845 3846 ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp); 3847 if (!ret && mapped) 3848 *mapped = size; 3849 3850 return ret; 3851 } 3852 3853 static size_t intel_iommu_unmap(struct iommu_domain *domain, 3854 unsigned long iova, size_t size, 3855 struct iommu_iotlb_gather *gather) 3856 { 3857 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 3858 unsigned long start_pfn, last_pfn; 3859 int level = 0; 3860 3861 /* Cope with horrid API which requires us to unmap more than the 3862 size argument if it happens to be a large-page mapping. */ 3863 if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, 3864 &level, GFP_ATOMIC))) 3865 return 0; 3866 3867 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level)) 3868 size = VTD_PAGE_SIZE << level_to_offset_bits(level); 3869 3870 start_pfn = iova >> VTD_PAGE_SHIFT; 3871 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT; 3872 3873 domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist); 3874 3875 if (dmar_domain->max_addr == iova + size) 3876 dmar_domain->max_addr = iova; 3877 3878 /* 3879 * We do not use page-selective IOTLB invalidation in flush queue, 3880 * so there is no need to track page and sync iotlb. 
3881 */ 3882 if (!iommu_iotlb_gather_queued(gather)) 3883 iommu_iotlb_gather_add_page(domain, gather, iova, size); 3884 3885 return size; 3886 } 3887 3888 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain, 3889 unsigned long iova, 3890 size_t pgsize, size_t pgcount, 3891 struct iommu_iotlb_gather *gather) 3892 { 3893 unsigned long pgshift = __ffs(pgsize); 3894 size_t size = pgcount << pgshift; 3895 3896 return intel_iommu_unmap(domain, iova, size, gather); 3897 } 3898 3899 static void intel_iommu_tlb_sync(struct iommu_domain *domain, 3900 struct iommu_iotlb_gather *gather) 3901 { 3902 cache_tag_flush_range(to_dmar_domain(domain), gather->start, 3903 gather->end, list_empty(&gather->freelist)); 3904 iommu_put_pages_list(&gather->freelist); 3905 } 3906 3907 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain, 3908 dma_addr_t iova) 3909 { 3910 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 3911 struct dma_pte *pte; 3912 int level = 0; 3913 u64 phys = 0; 3914 3915 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level, 3916 GFP_ATOMIC); 3917 if (pte && dma_pte_present(pte)) 3918 phys = dma_pte_addr(pte) + 3919 (iova & (BIT_MASK(level_to_offset_bits(level) + 3920 VTD_PAGE_SHIFT) - 1)); 3921 3922 return phys; 3923 } 3924 3925 static bool domain_support_force_snooping(struct dmar_domain *domain) 3926 { 3927 struct device_domain_info *info; 3928 bool support = true; 3929 3930 assert_spin_locked(&domain->lock); 3931 list_for_each_entry(info, &domain->devices, link) { 3932 if (!ecap_sc_support(info->iommu->ecap)) { 3933 support = false; 3934 break; 3935 } 3936 } 3937 3938 return support; 3939 } 3940 3941 static void domain_set_force_snooping(struct dmar_domain *domain) 3942 { 3943 struct device_domain_info *info; 3944 3945 assert_spin_locked(&domain->lock); 3946 /* 3947 * Second level page table supports per-PTE snoop control. The 3948 * iommu_map() interface will handle this by setting SNP bit. 
3949 */ 3950 if (!domain->use_first_level) { 3951 domain->set_pte_snp = true; 3952 return; 3953 } 3954 3955 list_for_each_entry(info, &domain->devices, link) 3956 intel_pasid_setup_page_snoop_control(info->iommu, info->dev, 3957 IOMMU_NO_PASID); 3958 } 3959 3960 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain) 3961 { 3962 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 3963 unsigned long flags; 3964 3965 if (dmar_domain->force_snooping) 3966 return true; 3967 3968 spin_lock_irqsave(&dmar_domain->lock, flags); 3969 if (!domain_support_force_snooping(dmar_domain) || 3970 (!dmar_domain->use_first_level && dmar_domain->has_mappings)) { 3971 spin_unlock_irqrestore(&dmar_domain->lock, flags); 3972 return false; 3973 } 3974 3975 domain_set_force_snooping(dmar_domain); 3976 dmar_domain->force_snooping = true; 3977 spin_unlock_irqrestore(&dmar_domain->lock, flags); 3978 3979 return true; 3980 } 3981 3982 static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap) 3983 { 3984 struct device_domain_info *info = dev_iommu_priv_get(dev); 3985 3986 switch (cap) { 3987 case IOMMU_CAP_CACHE_COHERENCY: 3988 case IOMMU_CAP_DEFERRED_FLUSH: 3989 return true; 3990 case IOMMU_CAP_PRE_BOOT_PROTECTION: 3991 return dmar_platform_optin(); 3992 case IOMMU_CAP_ENFORCE_CACHE_COHERENCY: 3993 return ecap_sc_support(info->iommu->ecap); 3994 case IOMMU_CAP_DIRTY_TRACKING: 3995 return ssads_supported(info->iommu); 3996 default: 3997 return false; 3998 } 3999 } 4000 4001 static struct iommu_device *intel_iommu_probe_device(struct device *dev) 4002 { 4003 struct pci_dev *pdev = dev_is_pci(dev) ? 
to_pci_dev(dev) : NULL; 4004 struct device_domain_info *info; 4005 struct intel_iommu *iommu; 4006 u8 bus, devfn; 4007 int ret; 4008 4009 iommu = device_lookup_iommu(dev, &bus, &devfn); 4010 if (!iommu || !iommu->iommu.ops) 4011 return ERR_PTR(-ENODEV); 4012 4013 info = kzalloc(sizeof(*info), GFP_KERNEL); 4014 if (!info) 4015 return ERR_PTR(-ENOMEM); 4016 4017 if (dev_is_real_dma_subdevice(dev)) { 4018 info->bus = pdev->bus->number; 4019 info->devfn = pdev->devfn; 4020 info->segment = pci_domain_nr(pdev->bus); 4021 } else { 4022 info->bus = bus; 4023 info->devfn = devfn; 4024 info->segment = iommu->segment; 4025 } 4026 4027 info->dev = dev; 4028 info->iommu = iommu; 4029 if (dev_is_pci(dev)) { 4030 if (ecap_dev_iotlb_support(iommu->ecap) && 4031 pci_ats_supported(pdev) && 4032 dmar_ats_supported(pdev, iommu)) { 4033 info->ats_supported = 1; 4034 info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev); 4035 4036 /* 4037 * For IOMMU that supports device IOTLB throttling 4038 * (DIT), we assign PFSID to the invalidation desc 4039 * of a VF such that IOMMU HW can gauge queue depth 4040 * at PF level. If DIT is not set, PFSID will be 4041 * treated as reserved, which should be set to 0. 
4042 */ 4043 if (ecap_dit(iommu->ecap)) 4044 info->pfsid = pci_dev_id(pci_physfn(pdev)); 4045 info->ats_qdep = pci_ats_queue_depth(pdev); 4046 } 4047 if (sm_supported(iommu)) { 4048 if (pasid_supported(iommu)) { 4049 int features = pci_pasid_features(pdev); 4050 4051 if (features >= 0) 4052 info->pasid_supported = features | 1; 4053 } 4054 4055 if (info->ats_supported && ecap_prs(iommu->ecap) && 4056 pci_pri_supported(pdev)) 4057 info->pri_supported = 1; 4058 } 4059 } 4060 4061 dev_iommu_priv_set(dev, info); 4062 if (pdev && pci_ats_supported(pdev)) { 4063 ret = device_rbtree_insert(iommu, info); 4064 if (ret) 4065 goto free; 4066 } 4067 4068 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) { 4069 ret = intel_pasid_alloc_table(dev); 4070 if (ret) { 4071 dev_err(dev, "PASID table allocation failed\n"); 4072 goto clear_rbtree; 4073 } 4074 4075 if (!context_copied(iommu, info->bus, info->devfn)) { 4076 ret = intel_pasid_setup_sm_context(dev); 4077 if (ret) 4078 goto free_table; 4079 } 4080 } 4081 4082 intel_iommu_debugfs_create_dev(info); 4083 4084 return &iommu->iommu; 4085 free_table: 4086 intel_pasid_free_table(dev); 4087 clear_rbtree: 4088 device_rbtree_remove(info); 4089 free: 4090 kfree(info); 4091 4092 return ERR_PTR(ret); 4093 } 4094 4095 static void intel_iommu_release_device(struct device *dev) 4096 { 4097 struct device_domain_info *info = dev_iommu_priv_get(dev); 4098 struct intel_iommu *iommu = info->iommu; 4099 4100 mutex_lock(&iommu->iopf_lock); 4101 if (dev_is_pci(dev) && pci_ats_supported(to_pci_dev(dev))) 4102 device_rbtree_remove(info); 4103 mutex_unlock(&iommu->iopf_lock); 4104 4105 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) && 4106 !context_copied(iommu, info->bus, info->devfn)) 4107 intel_pasid_teardown_sm_context(dev); 4108 4109 intel_pasid_free_table(dev); 4110 intel_iommu_debugfs_remove_dev(info); 4111 kfree(info); 4112 set_dma_ops(dev, NULL); 4113 } 4114 4115 static void intel_iommu_get_resv_regions(struct device 
*device, 4116 struct list_head *head) 4117 { 4118 int prot = DMA_PTE_READ | DMA_PTE_WRITE; 4119 struct iommu_resv_region *reg; 4120 struct dmar_rmrr_unit *rmrr; 4121 struct device *i_dev; 4122 int i; 4123 4124 rcu_read_lock(); 4125 for_each_rmrr_units(rmrr) { 4126 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt, 4127 i, i_dev) { 4128 struct iommu_resv_region *resv; 4129 enum iommu_resv_type type; 4130 size_t length; 4131 4132 if (i_dev != device && 4133 !is_downstream_to_pci_bridge(device, i_dev)) 4134 continue; 4135 4136 length = rmrr->end_address - rmrr->base_address + 1; 4137 4138 type = device_rmrr_is_relaxable(device) ? 4139 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT; 4140 4141 resv = iommu_alloc_resv_region(rmrr->base_address, 4142 length, prot, type, 4143 GFP_ATOMIC); 4144 if (!resv) 4145 break; 4146 4147 list_add_tail(&resv->list, head); 4148 } 4149 } 4150 rcu_read_unlock(); 4151 4152 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA 4153 if (dev_is_pci(device)) { 4154 struct pci_dev *pdev = to_pci_dev(device); 4155 4156 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) { 4157 reg = iommu_alloc_resv_region(0, 1UL << 24, prot, 4158 IOMMU_RESV_DIRECT_RELAXABLE, 4159 GFP_KERNEL); 4160 if (reg) 4161 list_add_tail(®->list, head); 4162 } 4163 } 4164 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */ 4165 4166 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START, 4167 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1, 4168 0, IOMMU_RESV_MSI, GFP_KERNEL); 4169 if (!reg) 4170 return; 4171 list_add_tail(®->list, head); 4172 } 4173 4174 static struct iommu_group *intel_iommu_device_group(struct device *dev) 4175 { 4176 if (dev_is_pci(dev)) 4177 return pci_device_group(dev); 4178 return generic_device_group(dev); 4179 } 4180 4181 static int intel_iommu_enable_sva(struct device *dev) 4182 { 4183 struct device_domain_info *info = dev_iommu_priv_get(dev); 4184 struct intel_iommu *iommu; 4185 4186 if (!info || dmar_disabled) 4187 return -EINVAL; 4188 4189 iommu = info->iommu; 4190 if (!iommu) 
4191 return -EINVAL; 4192 4193 if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE)) 4194 return -ENODEV; 4195 4196 if (!info->pasid_enabled || !info->ats_enabled) 4197 return -EINVAL; 4198 4199 /* 4200 * Devices having device-specific I/O fault handling should not 4201 * support PCI/PRI. The IOMMU side has no means to check the 4202 * capability of device-specific IOPF. Therefore, IOMMU can only 4203 * default that if the device driver enables SVA on a non-PRI 4204 * device, it will handle IOPF in its own way. 4205 */ 4206 if (!info->pri_supported) 4207 return 0; 4208 4209 /* Devices supporting PRI should have it enabled. */ 4210 if (!info->pri_enabled) 4211 return -EINVAL; 4212 4213 return 0; 4214 } 4215 4216 static int intel_iommu_enable_iopf(struct device *dev) 4217 { 4218 struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL; 4219 struct device_domain_info *info = dev_iommu_priv_get(dev); 4220 struct intel_iommu *iommu; 4221 int ret; 4222 4223 if (!pdev || !info || !info->ats_enabled || !info->pri_supported) 4224 return -ENODEV; 4225 4226 if (info->pri_enabled) 4227 return -EBUSY; 4228 4229 iommu = info->iommu; 4230 if (!iommu) 4231 return -EINVAL; 4232 4233 /* PASID is required in PRG Response Message. 
*/ 4234 if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev)) 4235 return -EINVAL; 4236 4237 ret = pci_reset_pri(pdev); 4238 if (ret) 4239 return ret; 4240 4241 ret = iopf_queue_add_device(iommu->iopf_queue, dev); 4242 if (ret) 4243 return ret; 4244 4245 ret = pci_enable_pri(pdev, PRQ_DEPTH); 4246 if (ret) { 4247 iopf_queue_remove_device(iommu->iopf_queue, dev); 4248 return ret; 4249 } 4250 4251 info->pri_enabled = 1; 4252 4253 return 0; 4254 } 4255 4256 static int intel_iommu_disable_iopf(struct device *dev) 4257 { 4258 struct device_domain_info *info = dev_iommu_priv_get(dev); 4259 struct intel_iommu *iommu = info->iommu; 4260 4261 if (!info->pri_enabled) 4262 return -EINVAL; 4263 4264 /* 4265 * PCIe spec states that by clearing PRI enable bit, the Page 4266 * Request Interface will not issue new page requests, but has 4267 * outstanding page requests that have been transmitted or are 4268 * queued for transmission. This is supposed to be called after 4269 * the device driver has stopped DMA, all PASIDs have been 4270 * unbound and the outstanding PRQs have been drained. 
4271 */ 4272 pci_disable_pri(to_pci_dev(dev)); 4273 info->pri_enabled = 0; 4274 iopf_queue_remove_device(iommu->iopf_queue, dev); 4275 4276 return 0; 4277 } 4278 4279 static int 4280 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat) 4281 { 4282 switch (feat) { 4283 case IOMMU_DEV_FEAT_IOPF: 4284 return intel_iommu_enable_iopf(dev); 4285 4286 case IOMMU_DEV_FEAT_SVA: 4287 return intel_iommu_enable_sva(dev); 4288 4289 default: 4290 return -ENODEV; 4291 } 4292 } 4293 4294 static int 4295 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat) 4296 { 4297 switch (feat) { 4298 case IOMMU_DEV_FEAT_IOPF: 4299 return intel_iommu_disable_iopf(dev); 4300 4301 case IOMMU_DEV_FEAT_SVA: 4302 return 0; 4303 4304 default: 4305 return -ENODEV; 4306 } 4307 } 4308 4309 static bool intel_iommu_is_attach_deferred(struct device *dev) 4310 { 4311 struct device_domain_info *info = dev_iommu_priv_get(dev); 4312 4313 return translation_pre_enabled(info->iommu) && !info->domain; 4314 } 4315 4316 /* 4317 * Check that the device does not live on an external facing PCI port that is 4318 * marked as untrusted. Such devices should not be able to apply quirks and 4319 * thus not be able to bypass the IOMMU restrictions. 
4320 */ 4321 static bool risky_device(struct pci_dev *pdev) 4322 { 4323 if (pdev->untrusted) { 4324 pci_info(pdev, 4325 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n", 4326 pdev->vendor, pdev->device); 4327 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n"); 4328 return true; 4329 } 4330 return false; 4331 } 4332 4333 static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain, 4334 unsigned long iova, size_t size) 4335 { 4336 cache_tag_flush_range_np(to_dmar_domain(domain), iova, iova + size - 1); 4337 4338 return 0; 4339 } 4340 4341 static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid, 4342 struct iommu_domain *domain) 4343 { 4344 struct device_domain_info *info = dev_iommu_priv_get(dev); 4345 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4346 struct dev_pasid_info *curr, *dev_pasid = NULL; 4347 struct intel_iommu *iommu = info->iommu; 4348 unsigned long flags; 4349 4350 spin_lock_irqsave(&dmar_domain->lock, flags); 4351 list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) { 4352 if (curr->dev == dev && curr->pasid == pasid) { 4353 list_del(&curr->link_domain); 4354 dev_pasid = curr; 4355 break; 4356 } 4357 } 4358 WARN_ON_ONCE(!dev_pasid); 4359 spin_unlock_irqrestore(&dmar_domain->lock, flags); 4360 4361 cache_tag_unassign_domain(dmar_domain, dev, pasid); 4362 domain_detach_iommu(dmar_domain, iommu); 4363 intel_iommu_debugfs_remove_dev_pasid(dev_pasid); 4364 kfree(dev_pasid); 4365 intel_pasid_tear_down_entry(iommu, dev, pasid, false); 4366 intel_drain_pasid_prq(dev, pasid); 4367 } 4368 4369 static int intel_iommu_set_dev_pasid(struct iommu_domain *domain, 4370 struct device *dev, ioasid_t pasid) 4371 { 4372 struct device_domain_info *info = dev_iommu_priv_get(dev); 4373 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4374 struct intel_iommu *iommu = info->iommu; 4375 struct dev_pasid_info *dev_pasid; 4376 unsigned long flags; 4377 int ret; 4378 4379 if 
(!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev)) 4380 return -EOPNOTSUPP; 4381 4382 if (domain->dirty_ops) 4383 return -EINVAL; 4384 4385 if (context_copied(iommu, info->bus, info->devfn)) 4386 return -EBUSY; 4387 4388 ret = prepare_domain_attach_device(domain, dev); 4389 if (ret) 4390 return ret; 4391 4392 dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL); 4393 if (!dev_pasid) 4394 return -ENOMEM; 4395 4396 ret = domain_attach_iommu(dmar_domain, iommu); 4397 if (ret) 4398 goto out_free; 4399 4400 ret = cache_tag_assign_domain(dmar_domain, dev, pasid); 4401 if (ret) 4402 goto out_detach_iommu; 4403 4404 if (domain_type_is_si(dmar_domain)) 4405 ret = intel_pasid_setup_pass_through(iommu, dev, pasid); 4406 else if (dmar_domain->use_first_level) 4407 ret = domain_setup_first_level(iommu, dmar_domain, 4408 dev, pasid); 4409 else 4410 ret = intel_pasid_setup_second_level(iommu, dmar_domain, 4411 dev, pasid); 4412 if (ret) 4413 goto out_unassign_tag; 4414 4415 dev_pasid->dev = dev; 4416 dev_pasid->pasid = pasid; 4417 spin_lock_irqsave(&dmar_domain->lock, flags); 4418 list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids); 4419 spin_unlock_irqrestore(&dmar_domain->lock, flags); 4420 4421 if (domain->type & __IOMMU_DOMAIN_PAGING) 4422 intel_iommu_debugfs_create_dev_pasid(dev_pasid); 4423 4424 return 0; 4425 out_unassign_tag: 4426 cache_tag_unassign_domain(dmar_domain, dev, pasid); 4427 out_detach_iommu: 4428 domain_detach_iommu(dmar_domain, iommu); 4429 out_free: 4430 kfree(dev_pasid); 4431 return ret; 4432 } 4433 4434 static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type) 4435 { 4436 struct device_domain_info *info = dev_iommu_priv_get(dev); 4437 struct intel_iommu *iommu = info->iommu; 4438 struct iommu_hw_info_vtd *vtd; 4439 4440 vtd = kzalloc(sizeof(*vtd), GFP_KERNEL); 4441 if (!vtd) 4442 return ERR_PTR(-ENOMEM); 4443 4444 vtd->flags = IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17; 4445 vtd->cap_reg = iommu->cap; 4446 vtd->ecap_reg = 
iommu->ecap; 4447 *length = sizeof(*vtd); 4448 *type = IOMMU_HW_INFO_TYPE_INTEL_VTD; 4449 return vtd; 4450 } 4451 4452 /* 4453 * Set dirty tracking for the device list of a domain. The caller must 4454 * hold the domain->lock when calling it. 4455 */ 4456 static int device_set_dirty_tracking(struct list_head *devices, bool enable) 4457 { 4458 struct device_domain_info *info; 4459 int ret = 0; 4460 4461 list_for_each_entry(info, devices, link) { 4462 ret = intel_pasid_setup_dirty_tracking(info->iommu, info->dev, 4463 IOMMU_NO_PASID, enable); 4464 if (ret) 4465 break; 4466 } 4467 4468 return ret; 4469 } 4470 4471 static int parent_domain_set_dirty_tracking(struct dmar_domain *domain, 4472 bool enable) 4473 { 4474 struct dmar_domain *s1_domain; 4475 unsigned long flags; 4476 int ret; 4477 4478 spin_lock(&domain->s1_lock); 4479 list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) { 4480 spin_lock_irqsave(&s1_domain->lock, flags); 4481 ret = device_set_dirty_tracking(&s1_domain->devices, enable); 4482 spin_unlock_irqrestore(&s1_domain->lock, flags); 4483 if (ret) 4484 goto err_unwind; 4485 } 4486 spin_unlock(&domain->s1_lock); 4487 return 0; 4488 4489 err_unwind: 4490 list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) { 4491 spin_lock_irqsave(&s1_domain->lock, flags); 4492 device_set_dirty_tracking(&s1_domain->devices, 4493 domain->dirty_tracking); 4494 spin_unlock_irqrestore(&s1_domain->lock, flags); 4495 } 4496 spin_unlock(&domain->s1_lock); 4497 return ret; 4498 } 4499 4500 static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain, 4501 bool enable) 4502 { 4503 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4504 int ret; 4505 4506 spin_lock(&dmar_domain->lock); 4507 if (dmar_domain->dirty_tracking == enable) 4508 goto out_unlock; 4509 4510 ret = device_set_dirty_tracking(&dmar_domain->devices, enable); 4511 if (ret) 4512 goto err_unwind; 4513 4514 if (dmar_domain->nested_parent) { 4515 ret = 
parent_domain_set_dirty_tracking(dmar_domain, enable); 4516 if (ret) 4517 goto err_unwind; 4518 } 4519 4520 dmar_domain->dirty_tracking = enable; 4521 out_unlock: 4522 spin_unlock(&dmar_domain->lock); 4523 4524 return 0; 4525 4526 err_unwind: 4527 device_set_dirty_tracking(&dmar_domain->devices, 4528 dmar_domain->dirty_tracking); 4529 spin_unlock(&dmar_domain->lock); 4530 return ret; 4531 } 4532 4533 static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain, 4534 unsigned long iova, size_t size, 4535 unsigned long flags, 4536 struct iommu_dirty_bitmap *dirty) 4537 { 4538 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4539 unsigned long end = iova + size - 1; 4540 unsigned long pgsize; 4541 4542 /* 4543 * IOMMUFD core calls into a dirty tracking disabled domain without an 4544 * IOVA bitmap set in order to clean dirty bits in all PTEs that might 4545 * have occurred when we stopped dirty tracking. This ensures that we 4546 * never inherit dirtied bits from a previous cycle. 
4547 */ 4548 if (!dmar_domain->dirty_tracking && dirty->bitmap) 4549 return -EINVAL; 4550 4551 do { 4552 struct dma_pte *pte; 4553 int lvl = 0; 4554 4555 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl, 4556 GFP_ATOMIC); 4557 pgsize = level_size(lvl) << VTD_PAGE_SHIFT; 4558 if (!pte || !dma_pte_present(pte)) { 4559 iova += pgsize; 4560 continue; 4561 } 4562 4563 if (dma_sl_pte_test_and_clear_dirty(pte, flags)) 4564 iommu_dirty_bitmap_record(dirty, iova, pgsize); 4565 iova += pgsize; 4566 } while (iova < end); 4567 4568 return 0; 4569 } 4570 4571 static const struct iommu_dirty_ops intel_dirty_ops = { 4572 .set_dirty_tracking = intel_iommu_set_dirty_tracking, 4573 .read_and_clear_dirty = intel_iommu_read_and_clear_dirty, 4574 }; 4575 4576 const struct iommu_ops intel_iommu_ops = { 4577 .blocked_domain = &blocking_domain, 4578 .release_domain = &blocking_domain, 4579 .capable = intel_iommu_capable, 4580 .hw_info = intel_iommu_hw_info, 4581 .domain_alloc = intel_iommu_domain_alloc, 4582 .domain_alloc_user = intel_iommu_domain_alloc_user, 4583 .domain_alloc_sva = intel_svm_domain_alloc, 4584 .probe_device = intel_iommu_probe_device, 4585 .release_device = intel_iommu_release_device, 4586 .get_resv_regions = intel_iommu_get_resv_regions, 4587 .device_group = intel_iommu_device_group, 4588 .dev_enable_feat = intel_iommu_dev_enable_feat, 4589 .dev_disable_feat = intel_iommu_dev_disable_feat, 4590 .is_attach_deferred = intel_iommu_is_attach_deferred, 4591 .def_domain_type = device_def_domain_type, 4592 .remove_dev_pasid = intel_iommu_remove_dev_pasid, 4593 .pgsize_bitmap = SZ_4K, 4594 #ifdef CONFIG_INTEL_IOMMU_SVM 4595 .page_response = intel_svm_page_response, 4596 #endif 4597 .default_domain_ops = &(const struct iommu_domain_ops) { 4598 .attach_dev = intel_iommu_attach_device, 4599 .set_dev_pasid = intel_iommu_set_dev_pasid, 4600 .map_pages = intel_iommu_map_pages, 4601 .unmap_pages = intel_iommu_unmap_pages, 4602 .iotlb_sync_map = 
intel_iommu_iotlb_sync_map, 4603 .flush_iotlb_all = intel_flush_iotlb_all, 4604 .iotlb_sync = intel_iommu_tlb_sync, 4605 .iova_to_phys = intel_iommu_iova_to_phys, 4606 .free = intel_iommu_domain_free, 4607 .enforce_cache_coherency = intel_iommu_enforce_cache_coherency, 4608 } 4609 }; 4610 4611 static void quirk_iommu_igfx(struct pci_dev *dev) 4612 { 4613 if (risky_device(dev)) 4614 return; 4615 4616 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n"); 4617 disable_igfx_iommu = 1; 4618 } 4619 4620 /* G4x/GM45 integrated gfx dmar support is totally busted. */ 4621 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx); 4622 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx); 4623 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx); 4624 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx); 4625 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx); 4626 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx); 4627 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx); 4628 4629 /* Broadwell igfx malfunctions with dmar */ 4630 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx); 4631 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx); 4632 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx); 4633 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx); 4634 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx); 4635 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx); 4636 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx); 4637 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx); 4638 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx); 4639 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx); 4640 
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx); 4641 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx); 4642 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx); 4643 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx); 4644 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx); 4645 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx); 4646 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx); 4647 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx); 4648 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx); 4649 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx); 4650 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx); 4651 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx); 4652 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx); 4653 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx); 4654 4655 static void quirk_iommu_rwbf(struct pci_dev *dev) 4656 { 4657 if (risky_device(dev)) 4658 return; 4659 4660 /* 4661 * Mobile 4 Series Chipset neglects to set RWBF capability, 4662 * but needs it. Same seems to hold for the desktop versions. 
4663 */ 4664 pci_info(dev, "Forcing write-buffer flush capability\n"); 4665 rwbf_quirk = 1; 4666 } 4667 4668 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf); 4669 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf); 4670 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf); 4671 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf); 4672 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf); 4673 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf); 4674 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf); 4675 4676 #define GGC 0x52 4677 #define GGC_MEMORY_SIZE_MASK (0xf << 8) 4678 #define GGC_MEMORY_SIZE_NONE (0x0 << 8) 4679 #define GGC_MEMORY_SIZE_1M (0x1 << 8) 4680 #define GGC_MEMORY_SIZE_2M (0x3 << 8) 4681 #define GGC_MEMORY_VT_ENABLED (0x8 << 8) 4682 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8) 4683 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8) 4684 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8) 4685 4686 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev) 4687 { 4688 unsigned short ggc; 4689 4690 if (risky_device(dev)) 4691 return; 4692 4693 if (pci_read_config_word(dev, GGC, &ggc)) 4694 return; 4695 4696 if (!(ggc & GGC_MEMORY_VT_ENABLED)) { 4697 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n"); 4698 disable_igfx_iommu = 1; 4699 } else if (!disable_igfx_iommu) { 4700 /* we have to ensure the gfx device is idle before we flush */ 4701 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n"); 4702 iommu_set_dma_strict(); 4703 } 4704 } 4705 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt); 4706 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt); 4707 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt); 4708 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt); 4709 
4710 static void quirk_igfx_skip_te_disable(struct pci_dev *dev) 4711 { 4712 unsigned short ver; 4713 4714 if (!IS_GFX_DEVICE(dev)) 4715 return; 4716 4717 ver = (dev->device >> 8) & 0xff; 4718 if (ver != 0x45 && ver != 0x46 && ver != 0x4c && 4719 ver != 0x4e && ver != 0x8a && ver != 0x98 && 4720 ver != 0x9a && ver != 0xa7 && ver != 0x7d) 4721 return; 4722 4723 if (risky_device(dev)) 4724 return; 4725 4726 pci_info(dev, "Skip IOMMU disabling for graphics\n"); 4727 iommu_skip_te_disable = 1; 4728 } 4729 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable); 4730 4731 /* On Tylersburg chipsets, some BIOSes have been known to enable the 4732 ISOCH DMAR unit for the Azalia sound device, but not give it any 4733 TLB entries, which causes it to deadlock. Check for that. We do 4734 this in a function called from init_dmars(), instead of in a PCI 4735 quirk, because we don't want to print the obnoxious "BIOS broken" 4736 message if VT-d is actually disabled. 4737 */ 4738 static void __init check_tylersburg_isoch(void) 4739 { 4740 struct pci_dev *pdev; 4741 uint32_t vtisochctrl; 4742 4743 /* If there's no Azalia in the system anyway, forget it. */ 4744 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL); 4745 if (!pdev) 4746 return; 4747 4748 if (risky_device(pdev)) { 4749 pci_dev_put(pdev); 4750 return; 4751 } 4752 4753 pci_dev_put(pdev); 4754 4755 /* System Management Registers. Might be hidden, in which case 4756 we can't do the sanity check. But that's OK, because the 4757 known-broken BIOSes _don't_ actually hide it, so far. */ 4758 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL); 4759 if (!pdev) 4760 return; 4761 4762 if (risky_device(pdev)) { 4763 pci_dev_put(pdev); 4764 return; 4765 } 4766 4767 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) { 4768 pci_dev_put(pdev); 4769 return; 4770 } 4771 4772 pci_dev_put(pdev); 4773 4774 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. 
*/ 4775 if (vtisochctrl & 1) 4776 return; 4777 4778 /* Drop all bits other than the number of TLB entries */ 4779 vtisochctrl &= 0x1c; 4780 4781 /* If we have the recommended number of TLB entries (16), fine. */ 4782 if (vtisochctrl == 0x10) 4783 return; 4784 4785 /* Zero TLB entries? You get to ride the short bus to school. */ 4786 if (!vtisochctrl) { 4787 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n" 4788 "BIOS vendor: %s; Ver: %s; Product Version: %s\n", 4789 dmi_get_system_info(DMI_BIOS_VENDOR), 4790 dmi_get_system_info(DMI_BIOS_VERSION), 4791 dmi_get_system_info(DMI_PRODUCT_VERSION)); 4792 iommu_identity_mapping |= IDENTMAP_AZALIA; 4793 return; 4794 } 4795 4796 pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n", 4797 vtisochctrl); 4798 } 4799 4800 /* 4801 * Here we deal with a device TLB defect where device may inadvertently issue ATS 4802 * invalidation completion before posted writes initiated with translated address 4803 * that utilized translations matching the invalidation address range, violating 4804 * the invalidation completion ordering. 4805 * Therefore, any use cases that cannot guarantee DMA is stopped before unmap is 4806 * vulnerable to this defect. In other words, any dTLB invalidation initiated not 4807 * under the control of the trusted/privileged host device driver must use this 4808 * quirk. 4809 * Device TLBs are invalidated under the following six conditions: 4810 * 1. Device driver does DMA API unmap IOVA 4811 * 2. Device driver unbind a PASID from a process, sva_unbind_device() 4812 * 3. PASID is torn down, after PASID cache is flushed. e.g. process 4813 * exit_mmap() due to crash 4814 * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where 4815 * VM has to free pages that were unmapped 4816 * 5. Userspace driver unmaps a DMA buffer 4817 * 6. 
Cache invalidation in vSVA usage (upcoming) 4818 * 4819 * For #1 and #2, device drivers are responsible for stopping DMA traffic 4820 * before unmap/unbind. For #3, iommu driver gets mmu_notifier to 4821 * invalidate TLB the same way as normal user unmap which will use this quirk. 4822 * The dTLB invalidation after PASID cache flush does not need this quirk. 4823 * 4824 * As a reminder, #6 will *NEED* this quirk as we enable nested translation. 4825 */ 4826 void quirk_extra_dev_tlb_flush(struct device_domain_info *info, 4827 unsigned long address, unsigned long mask, 4828 u32 pasid, u16 qdep) 4829 { 4830 u16 sid; 4831 4832 if (likely(!info->dtlb_extra_inval)) 4833 return; 4834 4835 sid = PCI_DEVID(info->bus, info->devfn); 4836 if (pasid == IOMMU_NO_PASID) { 4837 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid, 4838 qdep, address, mask); 4839 } else { 4840 qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid, 4841 pasid, qdep, address, mask); 4842 } 4843 } 4844 4845 #define ecmd_get_status_code(res) (((res) & 0xff) >> 1) 4846 4847 /* 4848 * Function to submit a command to the enhanced command interface. The 4849 * valid enhanced command descriptions are defined in Table 47 of the 4850 * VT-d spec. The VT-d hardware implementation may support some but not 4851 * all commands, which can be determined by checking the Enhanced 4852 * Command Capability Register. 4853 * 4854 * Return values: 4855 * - 0: Command successful without any error; 4856 * - Negative: software error value; 4857 * - Nonzero positive: failure status code defined in Table 48. 
4858 */ 4859 int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob) 4860 { 4861 unsigned long flags; 4862 u64 res; 4863 int ret; 4864 4865 if (!cap_ecmds(iommu->cap)) 4866 return -ENODEV; 4867 4868 raw_spin_lock_irqsave(&iommu->register_lock, flags); 4869 4870 res = dmar_readq(iommu->reg + DMAR_ECRSP_REG); 4871 if (res & DMA_ECMD_ECRSP_IP) { 4872 ret = -EBUSY; 4873 goto err; 4874 } 4875 4876 /* 4877 * Unconditionally write the operand B, because 4878 * - There is no side effect if an ecmd doesn't require an 4879 * operand B, but we set the register to some value. 4880 * - It's not invoked in any critical path. The extra MMIO 4881 * write doesn't bring any performance concerns. 4882 */ 4883 dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob); 4884 dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT)); 4885 4886 IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq, 4887 !(res & DMA_ECMD_ECRSP_IP), res); 4888 4889 if (res & DMA_ECMD_ECRSP_IP) { 4890 ret = -ETIMEDOUT; 4891 goto err; 4892 } 4893 4894 ret = ecmd_get_status_code(res); 4895 err: 4896 raw_spin_unlock_irqrestore(&iommu->register_lock, flags); 4897 4898 return ret; 4899 } 4900