1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright © 2006-2014 Intel Corporation. 4 * 5 * Authors: David Woodhouse <dwmw2@infradead.org>, 6 * Ashok Raj <ashok.raj@intel.com>, 7 * Shaohua Li <shaohua.li@intel.com>, 8 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>, 9 * Fenghua Yu <fenghua.yu@intel.com> 10 * Joerg Roedel <jroedel@suse.de> 11 */ 12 13 #define pr_fmt(fmt) "DMAR: " fmt 14 #define dev_fmt(fmt) pr_fmt(fmt) 15 16 #include <linux/crash_dump.h> 17 #include <linux/dma-direct.h> 18 #include <linux/dmi.h> 19 #include <linux/memory.h> 20 #include <linux/pci.h> 21 #include <linux/pci-ats.h> 22 #include <linux/spinlock.h> 23 #include <linux/syscore_ops.h> 24 #include <linux/tboot.h> 25 #include <uapi/linux/iommufd.h> 26 27 #include "iommu.h" 28 #include "../dma-iommu.h" 29 #include "../irq_remapping.h" 30 #include "../iommu-pages.h" 31 #include "pasid.h" 32 #include "cap_audit.h" 33 #include "perfmon.h" 34 35 #define ROOT_SIZE VTD_PAGE_SIZE 36 #define CONTEXT_SIZE VTD_PAGE_SIZE 37 38 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY) 39 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB) 40 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) 41 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e) 42 43 #define IOAPIC_RANGE_START (0xfee00000) 44 #define IOAPIC_RANGE_END (0xfeefffff) 45 #define IOVA_START_ADDR (0x1000) 46 47 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57 48 49 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1) 50 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1) 51 52 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR 53 to match. That way, we can use 'unsigned long' for PFNs with impunity. */ 54 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \ 55 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1)) 56 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT) 57 58 static void __init check_tylersburg_isoch(void); 59 static int rwbf_quirk; 60 61 /* 62 * set to 1 to panic kernel if can't successfully enable VT-d 63 * (used when kernel is launched w/ TXT) 64 */ 65 static int force_on = 0; 66 static int intel_iommu_tboot_noforce; 67 static int no_platform_optin; 68 69 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry)) 70 71 /* 72 * Take a root_entry and return the Lower Context Table Pointer (LCTP) 73 * if marked present. 74 */ 75 static phys_addr_t root_entry_lctp(struct root_entry *re) 76 { 77 if (!(re->lo & 1)) 78 return 0; 79 80 return re->lo & VTD_PAGE_MASK; 81 } 82 83 /* 84 * Take a root_entry and return the Upper Context Table Pointer (UCTP) 85 * if marked present. 
 */
static phys_addr_t root_entry_uctp(struct root_entry *re)
{
	if (!(re->hi & 1))
		return 0;

	return re->hi & VTD_PAGE_MASK;
}

static int device_rid_cmp_key(const void *key, const struct rb_node *node)
{
	struct device_domain_info *info =
		rb_entry(node, struct device_domain_info, node);
	const u16 *rid_lhs = key;

	if (*rid_lhs < PCI_DEVID(info->bus, info->devfn))
		return -1;

	if (*rid_lhs > PCI_DEVID(info->bus, info->devfn))
		return 1;

	return 0;
}

static int device_rid_cmp(struct rb_node *lhs, const struct rb_node *rhs)
{
	struct device_domain_info *info =
		rb_entry(lhs, struct device_domain_info, node);
	u16 key = PCI_DEVID(info->bus, info->devfn);

	return device_rid_cmp_key(&key, rhs);
}

/*
 * Looks up an IOMMU-probed device using its source ID.
 *
 * Returns the pointer to the device if there is a match. Otherwise,
 * returns NULL.
 *
 * Note that this helper doesn't guarantee that the device won't be
 * released by the iommu subsystem after being returned. The caller
 * should use its own synchronization mechanism to avoid the device
 * being released during its use if that is a possibility.
 */
struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid)
{
	struct device_domain_info *info = NULL;
	struct rb_node *node;
	unsigned long flags;

	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
	node = rb_find(&rid, &iommu->device_rbtree, device_rid_cmp_key);
	if (node)
		info = rb_entry(node, struct device_domain_info, node);
	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);

	return info ? info->dev : NULL;
}

static int device_rbtree_insert(struct intel_iommu *iommu,
				struct device_domain_info *info)
{
	struct rb_node *curr;
	unsigned long flags;

	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
	curr = rb_find_add(&info->node, &iommu->device_rbtree, device_rid_cmp);
	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
	if (WARN_ON(curr))
		return -EEXIST;

	return 0;
}

static void device_rbtree_remove(struct device_domain_info *info)
{
	struct intel_iommu *iommu = info->iommu;
	unsigned long flags;

	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
	rb_erase(&info->node, &iommu->device_rbtree);
	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
}

/*
 * This domain is a static identity mapping domain.
 *	1. This domain creates a static 1:1 mapping to all usable memory.
 *	2. It maps to each iommu if successful.
 *	3. Each iommu maps to this domain if successful.
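 *
 * (si_domain backs IOMMU_DOMAIN_IDENTITY requests; when hw_pass_through is
 * set, hardware pass-through is programmed instead of this 1:1 page table.)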
175 */ 176 static struct dmar_domain *si_domain; 177 static int hw_pass_through = 1; 178 179 struct dmar_rmrr_unit { 180 struct list_head list; /* list of rmrr units */ 181 struct acpi_dmar_header *hdr; /* ACPI header */ 182 u64 base_address; /* reserved base address*/ 183 u64 end_address; /* reserved end address */ 184 struct dmar_dev_scope *devices; /* target devices */ 185 int devices_cnt; /* target device count */ 186 }; 187 188 struct dmar_atsr_unit { 189 struct list_head list; /* list of ATSR units */ 190 struct acpi_dmar_header *hdr; /* ACPI header */ 191 struct dmar_dev_scope *devices; /* target devices */ 192 int devices_cnt; /* target device count */ 193 u8 include_all:1; /* include all ports */ 194 }; 195 196 struct dmar_satc_unit { 197 struct list_head list; /* list of SATC units */ 198 struct acpi_dmar_header *hdr; /* ACPI header */ 199 struct dmar_dev_scope *devices; /* target devices */ 200 struct intel_iommu *iommu; /* the corresponding iommu */ 201 int devices_cnt; /* target device count */ 202 u8 atc_required:1; /* ATS is required */ 203 }; 204 205 static LIST_HEAD(dmar_atsr_units); 206 static LIST_HEAD(dmar_rmrr_units); 207 static LIST_HEAD(dmar_satc_units); 208 209 #define for_each_rmrr_units(rmrr) \ 210 list_for_each_entry(rmrr, &dmar_rmrr_units, list) 211 212 static void intel_iommu_domain_free(struct iommu_domain *domain); 213 214 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON); 215 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON); 216 217 int intel_iommu_enabled = 0; 218 EXPORT_SYMBOL_GPL(intel_iommu_enabled); 219 220 static int intel_iommu_superpage = 1; 221 static int iommu_identity_mapping; 222 static int iommu_skip_te_disable; 223 static int disable_igfx_iommu; 224 225 #define IDENTMAP_AZALIA 4 226 227 const struct iommu_ops intel_iommu_ops; 228 static const struct iommu_dirty_ops intel_dirty_ops; 229 230 static bool translation_pre_enabled(struct intel_iommu *iommu) 231 { 232 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED); 233 } 234 235 static void clear_translation_pre_enabled(struct intel_iommu *iommu) 236 { 237 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED; 238 } 239 240 static void init_translation_status(struct intel_iommu *iommu) 241 { 242 u32 gsts; 243 244 gsts = readl(iommu->reg + DMAR_GSTS_REG); 245 if (gsts & DMA_GSTS_TES) 246 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED; 247 } 248 249 static int __init intel_iommu_setup(char *str) 250 { 251 if (!str) 252 return -EINVAL; 253 254 while (*str) { 255 if (!strncmp(str, "on", 2)) { 256 dmar_disabled = 0; 257 pr_info("IOMMU enabled\n"); 258 } else if (!strncmp(str, "off", 3)) { 259 dmar_disabled = 1; 260 no_platform_optin = 1; 261 pr_info("IOMMU disabled\n"); 262 } else if (!strncmp(str, "igfx_off", 8)) { 263 disable_igfx_iommu = 1; 264 pr_info("Disable GFX device mapping\n"); 265 } else if (!strncmp(str, "forcedac", 8)) { 266 pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n"); 267 iommu_dma_forcedac = true; 268 } else if (!strncmp(str, "strict", 6)) { 269 pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n"); 270 iommu_set_dma_strict(); 271 } else if (!strncmp(str, "sp_off", 6)) { 272 pr_info("Disable supported super page\n"); 273 intel_iommu_superpage = 0; 274 } else if (!strncmp(str, "sm_on", 5)) { 275 pr_info("Enable scalable mode if hardware supports\n"); 276 intel_iommu_sm = 1; 277 } else if (!strncmp(str, "sm_off", 6)) { 278 pr_info("Scalable mode is disallowed\n"); 279 intel_iommu_sm = 0; 280 } else if (!strncmp(str, 
"tboot_noforce", 13)) { 281 pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n"); 282 intel_iommu_tboot_noforce = 1; 283 } else { 284 pr_notice("Unknown option - '%s'\n", str); 285 } 286 287 str += strcspn(str, ","); 288 while (*str == ',') 289 str++; 290 } 291 292 return 1; 293 } 294 __setup("intel_iommu=", intel_iommu_setup); 295 296 static int domain_type_is_si(struct dmar_domain *domain) 297 { 298 return domain->domain.type == IOMMU_DOMAIN_IDENTITY; 299 } 300 301 static int domain_pfn_supported(struct dmar_domain *domain, unsigned long pfn) 302 { 303 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT; 304 305 return !(addr_width < BITS_PER_LONG && pfn >> addr_width); 306 } 307 308 /* 309 * Calculate the Supported Adjusted Guest Address Widths of an IOMMU. 310 * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of 311 * the returned SAGAW. 312 */ 313 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu) 314 { 315 unsigned long fl_sagaw, sl_sagaw; 316 317 fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0); 318 sl_sagaw = cap_sagaw(iommu->cap); 319 320 /* Second level only. */ 321 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) 322 return sl_sagaw; 323 324 /* First level only. */ 325 if (!ecap_slts(iommu->ecap)) 326 return fl_sagaw; 327 328 return fl_sagaw & sl_sagaw; 329 } 330 331 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw) 332 { 333 unsigned long sagaw; 334 int agaw; 335 336 sagaw = __iommu_calculate_sagaw(iommu); 337 for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) { 338 if (test_bit(agaw, &sagaw)) 339 break; 340 } 341 342 return agaw; 343 } 344 345 /* 346 * Calculate max SAGAW for each iommu. 347 */ 348 int iommu_calculate_max_sagaw(struct intel_iommu *iommu) 349 { 350 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH); 351 } 352 353 /* 354 * calculate agaw for each iommu. 355 * "SAGAW" may be different across iommus, use a default agaw, and 356 * get a supported less agaw for iommus that don't support the default agaw. 357 */ 358 int iommu_calculate_agaw(struct intel_iommu *iommu) 359 { 360 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH); 361 } 362 363 static bool iommu_paging_structure_coherency(struct intel_iommu *iommu) 364 { 365 return sm_supported(iommu) ? 
366 ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap); 367 } 368 369 static void domain_update_iommu_coherency(struct dmar_domain *domain) 370 { 371 struct iommu_domain_info *info; 372 struct dmar_drhd_unit *drhd; 373 struct intel_iommu *iommu; 374 bool found = false; 375 unsigned long i; 376 377 domain->iommu_coherency = true; 378 xa_for_each(&domain->iommu_array, i, info) { 379 found = true; 380 if (!iommu_paging_structure_coherency(info->iommu)) { 381 domain->iommu_coherency = false; 382 break; 383 } 384 } 385 if (found) 386 return; 387 388 /* No hardware attached; use lowest common denominator */ 389 rcu_read_lock(); 390 for_each_active_iommu(iommu, drhd) { 391 if (!iommu_paging_structure_coherency(iommu)) { 392 domain->iommu_coherency = false; 393 break; 394 } 395 } 396 rcu_read_unlock(); 397 } 398 399 static int domain_update_iommu_superpage(struct dmar_domain *domain, 400 struct intel_iommu *skip) 401 { 402 struct dmar_drhd_unit *drhd; 403 struct intel_iommu *iommu; 404 int mask = 0x3; 405 406 if (!intel_iommu_superpage) 407 return 0; 408 409 /* set iommu_superpage to the smallest common denominator */ 410 rcu_read_lock(); 411 for_each_active_iommu(iommu, drhd) { 412 if (iommu != skip) { 413 if (domain && domain->use_first_level) { 414 if (!cap_fl1gp_support(iommu->cap)) 415 mask = 0x1; 416 } else { 417 mask &= cap_super_page_val(iommu->cap); 418 } 419 420 if (!mask) 421 break; 422 } 423 } 424 rcu_read_unlock(); 425 426 return fls(mask); 427 } 428 429 static int domain_update_device_node(struct dmar_domain *domain) 430 { 431 struct device_domain_info *info; 432 int nid = NUMA_NO_NODE; 433 unsigned long flags; 434 435 spin_lock_irqsave(&domain->lock, flags); 436 list_for_each_entry(info, &domain->devices, link) { 437 /* 438 * There could possibly be multiple device numa nodes as devices 439 * within the same domain may sit behind different IOMMUs. There 440 * isn't perfect answer in such situation, so we select first 441 * come first served policy. 442 */ 443 nid = dev_to_node(info->dev); 444 if (nid != NUMA_NO_NODE) 445 break; 446 } 447 spin_unlock_irqrestore(&domain->lock, flags); 448 449 return nid; 450 } 451 452 /* Return the super pagesize bitmap if supported. */ 453 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain) 454 { 455 unsigned long bitmap = 0; 456 457 /* 458 * 1-level super page supports page size of 2MiB, 2-level super page 459 * supports page size of both 2MiB and 1GiB. 460 */ 461 if (domain->iommu_superpage == 1) 462 bitmap |= SZ_2M; 463 else if (domain->iommu_superpage == 2) 464 bitmap |= SZ_2M | SZ_1G; 465 466 return bitmap; 467 } 468 469 /* Some capabilities may be different across iommus */ 470 void domain_update_iommu_cap(struct dmar_domain *domain) 471 { 472 domain_update_iommu_coherency(domain); 473 domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL); 474 475 /* 476 * If RHSA is missing, we should default to the device numa domain 477 * as fall back. 478 */ 479 if (domain->nid == NUMA_NO_NODE) 480 domain->nid = domain_update_device_node(domain); 481 482 /* 483 * First-level translation restricts the input-address to a 484 * canonical address (i.e., address bits 63:N have the same 485 * value as address bit [N-1], where N is 48-bits with 4-level 486 * paging and 57-bits with 5-level paging). Hence, skip bit 487 * [N-1]. 
	 */
	if (domain->use_first_level)
		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
	else
		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);

	domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
	domain_update_iotlb(domain);
}

struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
					 u8 devfn, int alloc)
{
	struct root_entry *root = &iommu->root_entry[bus];
	struct context_entry *context;
	u64 *entry;

	/*
	 * Unless the caller requested to allocate a new entry,
	 * returning a copied context entry makes no sense.
	 */
	if (!alloc && context_copied(iommu, bus, devfn))
		return NULL;

	entry = &root->lo;
	if (sm_supported(iommu)) {
		if (devfn >= 0x80) {
			devfn -= 0x80;
			entry = &root->hi;
		}
		devfn *= 2;
	}
	if (*entry & 1)
		context = phys_to_virt(*entry & VTD_PAGE_MASK);
	else {
		unsigned long phy_addr;
		if (!alloc)
			return NULL;

		context = iommu_alloc_page_node(iommu->node, GFP_ATOMIC);
		if (!context)
			return NULL;

		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
		phy_addr = virt_to_phys((void *)context);
		*entry = phy_addr | 1;
		__iommu_flush_cache(iommu, entry, sizeof(*entry));
	}
	return &context[devfn];
}

/**
 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
 *				 sub-hierarchy of a candidate PCI-PCI bridge
 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
 * @bridge: the candidate PCI-PCI bridge
 *
 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
 */
static bool
is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
{
	struct pci_dev *pdev, *pbridge;

	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
		return false;

	pdev = to_pci_dev(dev);
	pbridge = to_pci_dev(bridge);

	if (pbridge->subordinate &&
	    pbridge->subordinate->number <= pdev->bus->number &&
	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
		return true;

	return false;
}

static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
{
	struct dmar_drhd_unit *drhd;
	u32 vtbar;
	int rc;

	/* We know that this device on this chipset has its own IOMMU.
	 * If we find it under a different IOMMU, then the BIOS is lying
	 * to us. Hope that the IOMMU for this device is actually
	 * disabled, and it needs no translation...
	 */
	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
	if (rc) {
		/* "can't" happen */
		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
		return false;
	}
	vtbar &= 0xffff0000;

	/* we know that this iommu should be at offset 0xa000 from vtbar */
	drhd = dmar_find_matched_drhd_unit(pdev);
	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
		return true;
	}

	return false;
}

static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
{
	if (!iommu || iommu->drhd->ignored)
		return true;

	if (dev_is_pci(dev)) {
		struct pci_dev *pdev = to_pci_dev(dev);

		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
		    quirk_ioat_snb_local_iommu(pdev))
			return true;
	}

	return false;
}

static struct intel_iommu *device_lookup_iommu(struct device *dev, u8 *bus, u8 *devfn)
{
	struct dmar_drhd_unit *drhd = NULL;
	struct pci_dev *pdev = NULL;
	struct intel_iommu *iommu;
	struct device *tmp;
	u16 segment = 0;
	int i;

	if (!dev)
		return NULL;

	if (dev_is_pci(dev)) {
		struct pci_dev *pf_pdev;

		pdev = pci_real_dma_dev(to_pci_dev(dev));

		/* VFs aren't listed in scope tables; we need to look up
		 * the PF instead to find the IOMMU. */
		pf_pdev = pci_physfn(pdev);
		dev = &pf_pdev->dev;
		segment = pci_domain_nr(pdev->bus);
	} else if (has_acpi_companion(dev))
		dev = &ACPI_COMPANION(dev)->dev;

	rcu_read_lock();
	for_each_iommu(iommu, drhd) {
		if (pdev && segment != drhd->segment)
			continue;

		for_each_active_dev_scope(drhd->devices,
					  drhd->devices_cnt, i, tmp) {
			if (tmp == dev) {
				/* For a VF use its original BDF# not that of the PF
				 * which we used for the IOMMU lookup. Strictly speaking
				 * we could do this for all PCI devices; we only need to
				 * get the BDF# from the scope table for ACPI matches.
*/ 650 if (pdev && pdev->is_virtfn) 651 goto got_pdev; 652 653 if (bus && devfn) { 654 *bus = drhd->devices[i].bus; 655 *devfn = drhd->devices[i].devfn; 656 } 657 goto out; 658 } 659 660 if (is_downstream_to_pci_bridge(dev, tmp)) 661 goto got_pdev; 662 } 663 664 if (pdev && drhd->include_all) { 665 got_pdev: 666 if (bus && devfn) { 667 *bus = pdev->bus->number; 668 *devfn = pdev->devfn; 669 } 670 goto out; 671 } 672 } 673 iommu = NULL; 674 out: 675 if (iommu_is_dummy(iommu, dev)) 676 iommu = NULL; 677 678 rcu_read_unlock(); 679 680 return iommu; 681 } 682 683 static void domain_flush_cache(struct dmar_domain *domain, 684 void *addr, int size) 685 { 686 if (!domain->iommu_coherency) 687 clflush_cache_range(addr, size); 688 } 689 690 static void free_context_table(struct intel_iommu *iommu) 691 { 692 struct context_entry *context; 693 int i; 694 695 if (!iommu->root_entry) 696 return; 697 698 for (i = 0; i < ROOT_ENTRY_NR; i++) { 699 context = iommu_context_addr(iommu, i, 0, 0); 700 if (context) 701 iommu_free_page(context); 702 703 if (!sm_supported(iommu)) 704 continue; 705 706 context = iommu_context_addr(iommu, i, 0x80, 0); 707 if (context) 708 iommu_free_page(context); 709 } 710 711 iommu_free_page(iommu->root_entry); 712 iommu->root_entry = NULL; 713 } 714 715 #ifdef CONFIG_DMAR_DEBUG 716 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn, 717 u8 bus, u8 devfn, struct dma_pte *parent, int level) 718 { 719 struct dma_pte *pte; 720 int offset; 721 722 while (1) { 723 offset = pfn_level_offset(pfn, level); 724 pte = &parent[offset]; 725 if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) { 726 pr_info("PTE not present at level %d\n", level); 727 break; 728 } 729 730 pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val); 731 732 if (level == 1) 733 break; 734 735 parent = phys_to_virt(dma_pte_addr(pte)); 736 level--; 737 } 738 } 739 740 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id, 741 unsigned long long addr, u32 pasid) 742 { 743 struct pasid_dir_entry *dir, *pde; 744 struct pasid_entry *entries, *pte; 745 struct context_entry *ctx_entry; 746 struct root_entry *rt_entry; 747 int i, dir_index, index, level; 748 u8 devfn = source_id & 0xff; 749 u8 bus = source_id >> 8; 750 struct dma_pte *pgtable; 751 752 pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr); 753 754 /* root entry dump */ 755 rt_entry = &iommu->root_entry[bus]; 756 if (!rt_entry) { 757 pr_info("root table entry is not present\n"); 758 return; 759 } 760 761 if (sm_supported(iommu)) 762 pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n", 763 rt_entry->hi, rt_entry->lo); 764 else 765 pr_info("root entry: 0x%016llx", rt_entry->lo); 766 767 /* context entry dump */ 768 ctx_entry = iommu_context_addr(iommu, bus, devfn, 0); 769 if (!ctx_entry) { 770 pr_info("context table entry is not present\n"); 771 return; 772 } 773 774 pr_info("context entry: hi 0x%016llx, low 0x%016llx\n", 775 ctx_entry->hi, ctx_entry->lo); 776 777 /* legacy mode does not require PASID entries */ 778 if (!sm_supported(iommu)) { 779 level = agaw_to_level(ctx_entry->hi & 7); 780 pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK); 781 goto pgtable_walk; 782 } 783 784 /* get the pointer to pasid directory entry */ 785 dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK); 786 if (!dir) { 787 pr_info("pasid directory entry is not present\n"); 788 return; 789 } 790 /* For request-without-pasid, get the pasid from context entry */ 791 if (intel_iommu_sm && pasid == 
IOMMU_PASID_INVALID) 792 pasid = IOMMU_NO_PASID; 793 794 dir_index = pasid >> PASID_PDE_SHIFT; 795 pde = &dir[dir_index]; 796 pr_info("pasid dir entry: 0x%016llx\n", pde->val); 797 798 /* get the pointer to the pasid table entry */ 799 entries = get_pasid_table_from_pde(pde); 800 if (!entries) { 801 pr_info("pasid table entry is not present\n"); 802 return; 803 } 804 index = pasid & PASID_PTE_MASK; 805 pte = &entries[index]; 806 for (i = 0; i < ARRAY_SIZE(pte->val); i++) 807 pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]); 808 809 if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) { 810 level = pte->val[2] & BIT_ULL(2) ? 5 : 4; 811 pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK); 812 } else { 813 level = agaw_to_level((pte->val[0] >> 2) & 0x7); 814 pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK); 815 } 816 817 pgtable_walk: 818 pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level); 819 } 820 #endif 821 822 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain, 823 unsigned long pfn, int *target_level, 824 gfp_t gfp) 825 { 826 struct dma_pte *parent, *pte; 827 int level = agaw_to_level(domain->agaw); 828 int offset; 829 830 if (!domain_pfn_supported(domain, pfn)) 831 /* Address beyond IOMMU's addressing capabilities. */ 832 return NULL; 833 834 parent = domain->pgd; 835 836 while (1) { 837 void *tmp_page; 838 839 offset = pfn_level_offset(pfn, level); 840 pte = &parent[offset]; 841 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte))) 842 break; 843 if (level == *target_level) 844 break; 845 846 if (!dma_pte_present(pte)) { 847 uint64_t pteval, tmp; 848 849 tmp_page = iommu_alloc_page_node(domain->nid, gfp); 850 851 if (!tmp_page) 852 return NULL; 853 854 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE); 855 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE; 856 if (domain->use_first_level) 857 pteval |= DMA_FL_PTE_US | DMA_FL_PTE_ACCESS; 858 859 tmp = 0ULL; 860 if (!try_cmpxchg64(&pte->val, &tmp, pteval)) 861 /* Someone else set it while we were thinking; use theirs. 
*/ 862 iommu_free_page(tmp_page); 863 else 864 domain_flush_cache(domain, pte, sizeof(*pte)); 865 } 866 if (level == 1) 867 break; 868 869 parent = phys_to_virt(dma_pte_addr(pte)); 870 level--; 871 } 872 873 if (!*target_level) 874 *target_level = level; 875 876 return pte; 877 } 878 879 /* return address's pte at specific level */ 880 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain, 881 unsigned long pfn, 882 int level, int *large_page) 883 { 884 struct dma_pte *parent, *pte; 885 int total = agaw_to_level(domain->agaw); 886 int offset; 887 888 parent = domain->pgd; 889 while (level <= total) { 890 offset = pfn_level_offset(pfn, total); 891 pte = &parent[offset]; 892 if (level == total) 893 return pte; 894 895 if (!dma_pte_present(pte)) { 896 *large_page = total; 897 break; 898 } 899 900 if (dma_pte_superpage(pte)) { 901 *large_page = total; 902 return pte; 903 } 904 905 parent = phys_to_virt(dma_pte_addr(pte)); 906 total--; 907 } 908 return NULL; 909 } 910 911 /* clear last level pte, a tlb flush should be followed */ 912 static void dma_pte_clear_range(struct dmar_domain *domain, 913 unsigned long start_pfn, 914 unsigned long last_pfn) 915 { 916 unsigned int large_page; 917 struct dma_pte *first_pte, *pte; 918 919 if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) || 920 WARN_ON(start_pfn > last_pfn)) 921 return; 922 923 /* we don't need lock here; nobody else touches the iova range */ 924 do { 925 large_page = 1; 926 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page); 927 if (!pte) { 928 start_pfn = align_to_level(start_pfn + 1, large_page + 1); 929 continue; 930 } 931 do { 932 dma_clear_pte(pte); 933 start_pfn += lvl_to_nr_pages(large_page); 934 pte++; 935 } while (start_pfn <= last_pfn && !first_pte_in_page(pte)); 936 937 domain_flush_cache(domain, first_pte, 938 (void *)pte - (void *)first_pte); 939 940 } while (start_pfn && start_pfn <= last_pfn); 941 } 942 943 static void dma_pte_free_level(struct dmar_domain *domain, int level, 944 int retain_level, struct dma_pte *pte, 945 unsigned long pfn, unsigned long start_pfn, 946 unsigned long last_pfn) 947 { 948 pfn = max(start_pfn, pfn); 949 pte = &pte[pfn_level_offset(pfn, level)]; 950 951 do { 952 unsigned long level_pfn; 953 struct dma_pte *level_pte; 954 955 if (!dma_pte_present(pte) || dma_pte_superpage(pte)) 956 goto next; 957 958 level_pfn = pfn & level_mask(level); 959 level_pte = phys_to_virt(dma_pte_addr(pte)); 960 961 if (level > 2) { 962 dma_pte_free_level(domain, level - 1, retain_level, 963 level_pte, level_pfn, start_pfn, 964 last_pfn); 965 } 966 967 /* 968 * Free the page table if we're below the level we want to 969 * retain and the range covers the entire table. 970 */ 971 if (level < retain_level && !(start_pfn > level_pfn || 972 last_pfn < level_pfn + level_size(level) - 1)) { 973 dma_clear_pte(pte); 974 domain_flush_cache(domain, pte, sizeof(*pte)); 975 iommu_free_page(level_pte); 976 } 977 next: 978 pfn += level_size(level); 979 } while (!first_pte_in_page(++pte) && pfn <= last_pfn); 980 } 981 982 /* 983 * clear last level (leaf) ptes and free page table pages below the 984 * level we wish to keep intact. 
 */
static void dma_pte_free_pagetable(struct dmar_domain *domain,
				   unsigned long start_pfn,
				   unsigned long last_pfn,
				   int retain_level)
{
	dma_pte_clear_range(domain, start_pfn, last_pfn);

	/* We don't need lock here; nobody else touches the iova range */
	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
			   domain->pgd, 0, start_pfn, last_pfn);

	/* free pgd */
	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
		iommu_free_page(domain->pgd);
		domain->pgd = NULL;
	}
}

/* When a page at a given level is being unlinked from its parent, we don't
   need to *modify* it at all. All we need to do is make a list of all the
   pages which can be freed just as soon as we've flushed the IOTLB and we
   know the hardware page-walk will no longer touch them.
   The 'pte' argument is the *parent* PTE, pointing to the page that is to
   be freed. */
static void dma_pte_list_pagetables(struct dmar_domain *domain,
				    int level, struct dma_pte *pte,
				    struct list_head *freelist)
{
	struct page *pg;

	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
	list_add_tail(&pg->lru, freelist);

	if (level == 1)
		return;

	pte = page_address(pg);
	do {
		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
			dma_pte_list_pagetables(domain, level - 1, pte, freelist);
		pte++;
	} while (!first_pte_in_page(pte));
}

static void dma_pte_clear_level(struct dmar_domain *domain, int level,
				struct dma_pte *pte, unsigned long pfn,
				unsigned long start_pfn, unsigned long last_pfn,
				struct list_head *freelist)
{
	struct dma_pte *first_pte = NULL, *last_pte = NULL;

	pfn = max(start_pfn, pfn);
	pte = &pte[pfn_level_offset(pfn, level)];

	do {
		unsigned long level_pfn = pfn & level_mask(level);

		if (!dma_pte_present(pte))
			goto next;

		/* If range covers entire pagetable, free it */
		if (start_pfn <= level_pfn &&
		    last_pfn >= level_pfn + level_size(level) - 1) {
			/* These subordinate page tables are going away entirely. Don't
			   bother to clear them; we're just going to *free* them. */
			if (level > 1 && !dma_pte_superpage(pte))
				dma_pte_list_pagetables(domain, level - 1, pte, freelist);

			dma_clear_pte(pte);
			if (!first_pte)
				first_pte = pte;
			last_pte = pte;
		} else if (level > 1) {
			/* Recurse down into a level that isn't *entirely* obsolete */
			dma_pte_clear_level(domain, level - 1,
					    phys_to_virt(dma_pte_addr(pte)),
					    level_pfn, start_pfn, last_pfn,
					    freelist);
		}
next:
		pfn = level_pfn + level_size(level);
	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);

	if (first_pte)
		domain_flush_cache(domain, first_pte,
				   (void *)++last_pte - (void *)first_pte);
}

/* We can't just free the pages because the IOMMU may still be walking
   the page tables, and may have cached the intermediate levels. The
   pages can only be freed after the IOTLB flush has been done.
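   Callers therefore collect the page-table pages on a freelist here and
   release them via iommu_put_pages_list() once the flush has completed.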
*/ 1077 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn, 1078 unsigned long last_pfn, struct list_head *freelist) 1079 { 1080 if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) || 1081 WARN_ON(start_pfn > last_pfn)) 1082 return; 1083 1084 /* we don't need lock here; nobody else touches the iova range */ 1085 dma_pte_clear_level(domain, agaw_to_level(domain->agaw), 1086 domain->pgd, 0, start_pfn, last_pfn, freelist); 1087 1088 /* free pgd */ 1089 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) { 1090 struct page *pgd_page = virt_to_page(domain->pgd); 1091 list_add_tail(&pgd_page->lru, freelist); 1092 domain->pgd = NULL; 1093 } 1094 } 1095 1096 /* iommu handling */ 1097 static int iommu_alloc_root_entry(struct intel_iommu *iommu) 1098 { 1099 struct root_entry *root; 1100 1101 root = iommu_alloc_page_node(iommu->node, GFP_ATOMIC); 1102 if (!root) { 1103 pr_err("Allocating root entry for %s failed\n", 1104 iommu->name); 1105 return -ENOMEM; 1106 } 1107 1108 __iommu_flush_cache(iommu, root, ROOT_SIZE); 1109 iommu->root_entry = root; 1110 1111 return 0; 1112 } 1113 1114 static void iommu_set_root_entry(struct intel_iommu *iommu) 1115 { 1116 u64 addr; 1117 u32 sts; 1118 unsigned long flag; 1119 1120 addr = virt_to_phys(iommu->root_entry); 1121 if (sm_supported(iommu)) 1122 addr |= DMA_RTADDR_SMT; 1123 1124 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1125 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr); 1126 1127 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG); 1128 1129 /* Make sure hardware complete it */ 1130 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1131 readl, (sts & DMA_GSTS_RTPS), sts); 1132 1133 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1134 1135 /* 1136 * Hardware invalidates all DMA remapping hardware translation 1137 * caches as part of SRTP flow. 
1138 */ 1139 if (cap_esrtps(iommu->cap)) 1140 return; 1141 1142 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL); 1143 if (sm_supported(iommu)) 1144 qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0); 1145 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); 1146 } 1147 1148 void iommu_flush_write_buffer(struct intel_iommu *iommu) 1149 { 1150 u32 val; 1151 unsigned long flag; 1152 1153 if (!rwbf_quirk && !cap_rwbf(iommu->cap)) 1154 return; 1155 1156 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1157 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG); 1158 1159 /* Make sure hardware complete it */ 1160 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1161 readl, (!(val & DMA_GSTS_WBFS)), val); 1162 1163 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1164 } 1165 1166 /* return value determine if we need a write buffer flush */ 1167 static void __iommu_flush_context(struct intel_iommu *iommu, 1168 u16 did, u16 source_id, u8 function_mask, 1169 u64 type) 1170 { 1171 u64 val = 0; 1172 unsigned long flag; 1173 1174 switch (type) { 1175 case DMA_CCMD_GLOBAL_INVL: 1176 val = DMA_CCMD_GLOBAL_INVL; 1177 break; 1178 case DMA_CCMD_DOMAIN_INVL: 1179 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did); 1180 break; 1181 case DMA_CCMD_DEVICE_INVL: 1182 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did) 1183 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask); 1184 break; 1185 default: 1186 pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n", 1187 iommu->name, type); 1188 return; 1189 } 1190 val |= DMA_CCMD_ICC; 1191 1192 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1193 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val); 1194 1195 /* Make sure hardware complete it */ 1196 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG, 1197 dmar_readq, (!(val & DMA_CCMD_ICC)), val); 1198 1199 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1200 } 1201 1202 /* return value determine if we need a write buffer flush */ 1203 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, 1204 u64 addr, unsigned int size_order, u64 type) 1205 { 1206 int tlb_offset = ecap_iotlb_offset(iommu->ecap); 1207 u64 val = 0, val_iva = 0; 1208 unsigned long flag; 1209 1210 switch (type) { 1211 case DMA_TLB_GLOBAL_FLUSH: 1212 /* global flush doesn't need set IVA_REG */ 1213 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT; 1214 break; 1215 case DMA_TLB_DSI_FLUSH: 1216 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did); 1217 break; 1218 case DMA_TLB_PSI_FLUSH: 1219 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did); 1220 /* IH bit is passed in as part of address */ 1221 val_iva = size_order | addr; 1222 break; 1223 default: 1224 pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n", 1225 iommu->name, type); 1226 return; 1227 } 1228 1229 if (cap_write_drain(iommu->cap)) 1230 val |= DMA_TLB_WRITE_DRAIN; 1231 1232 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1233 /* Note: Only uses first TLB reg currently */ 1234 if (val_iva) 1235 dmar_writeq(iommu->reg + tlb_offset, val_iva); 1236 dmar_writeq(iommu->reg + tlb_offset + 8, val); 1237 1238 /* Make sure hardware complete it */ 1239 IOMMU_WAIT_OP(iommu, tlb_offset + 8, 1240 dmar_readq, (!(val & DMA_TLB_IVT)), val); 1241 1242 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1243 1244 /* check IOTLB invalidation granularity */ 1245 if (DMA_TLB_IAIG(val) == 0) 1246 pr_err("Flush IOTLB failed\n"); 1247 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type)) 1248 pr_debug("TLB flush request %Lx, actual %Lx\n", 1249 (unsigned long long)DMA_TLB_IIRG(type), 
1250 (unsigned long long)DMA_TLB_IAIG(val)); 1251 } 1252 1253 static struct device_domain_info * 1254 domain_lookup_dev_info(struct dmar_domain *domain, 1255 struct intel_iommu *iommu, u8 bus, u8 devfn) 1256 { 1257 struct device_domain_info *info; 1258 unsigned long flags; 1259 1260 spin_lock_irqsave(&domain->lock, flags); 1261 list_for_each_entry(info, &domain->devices, link) { 1262 if (info->iommu == iommu && info->bus == bus && 1263 info->devfn == devfn) { 1264 spin_unlock_irqrestore(&domain->lock, flags); 1265 return info; 1266 } 1267 } 1268 spin_unlock_irqrestore(&domain->lock, flags); 1269 1270 return NULL; 1271 } 1272 1273 void domain_update_iotlb(struct dmar_domain *domain) 1274 { 1275 struct dev_pasid_info *dev_pasid; 1276 struct device_domain_info *info; 1277 bool has_iotlb_device = false; 1278 unsigned long flags; 1279 1280 spin_lock_irqsave(&domain->lock, flags); 1281 list_for_each_entry(info, &domain->devices, link) { 1282 if (info->ats_enabled) { 1283 has_iotlb_device = true; 1284 break; 1285 } 1286 } 1287 1288 list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) { 1289 info = dev_iommu_priv_get(dev_pasid->dev); 1290 if (info->ats_enabled) { 1291 has_iotlb_device = true; 1292 break; 1293 } 1294 } 1295 domain->has_iotlb_device = has_iotlb_device; 1296 spin_unlock_irqrestore(&domain->lock, flags); 1297 } 1298 1299 /* 1300 * The extra devTLB flush quirk impacts those QAT devices with PCI device 1301 * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device() 1302 * check because it applies only to the built-in QAT devices and it doesn't 1303 * grant additional privileges. 1304 */ 1305 #define BUGGY_QAT_DEVID_MASK 0x4940 1306 static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev) 1307 { 1308 if (pdev->vendor != PCI_VENDOR_ID_INTEL) 1309 return false; 1310 1311 if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK) 1312 return false; 1313 1314 return true; 1315 } 1316 1317 static void iommu_enable_pci_caps(struct device_domain_info *info) 1318 { 1319 struct pci_dev *pdev; 1320 1321 if (!dev_is_pci(info->dev)) 1322 return; 1323 1324 pdev = to_pci_dev(info->dev); 1325 1326 /* The PCIe spec, in its wisdom, declares that the behaviour of 1327 the device if you enable PASID support after ATS support is 1328 undefined. So always enable PASID support on devices which 1329 have it, even if we can't yet know if we're ever going to 1330 use it. 
*/ 1331 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1)) 1332 info->pasid_enabled = 1; 1333 1334 if (info->ats_supported && pci_ats_page_aligned(pdev) && 1335 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) { 1336 info->ats_enabled = 1; 1337 domain_update_iotlb(info->domain); 1338 } 1339 } 1340 1341 static void iommu_disable_pci_caps(struct device_domain_info *info) 1342 { 1343 struct pci_dev *pdev; 1344 1345 if (!dev_is_pci(info->dev)) 1346 return; 1347 1348 pdev = to_pci_dev(info->dev); 1349 1350 if (info->ats_enabled) { 1351 pci_disable_ats(pdev); 1352 info->ats_enabled = 0; 1353 domain_update_iotlb(info->domain); 1354 } 1355 1356 if (info->pasid_enabled) { 1357 pci_disable_pasid(pdev); 1358 info->pasid_enabled = 0; 1359 } 1360 } 1361 1362 static void intel_flush_iotlb_all(struct iommu_domain *domain) 1363 { 1364 cache_tag_flush_all(to_dmar_domain(domain)); 1365 } 1366 1367 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu) 1368 { 1369 u32 pmen; 1370 unsigned long flags; 1371 1372 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap)) 1373 return; 1374 1375 raw_spin_lock_irqsave(&iommu->register_lock, flags); 1376 pmen = readl(iommu->reg + DMAR_PMEN_REG); 1377 pmen &= ~DMA_PMEN_EPM; 1378 writel(pmen, iommu->reg + DMAR_PMEN_REG); 1379 1380 /* wait for the protected region status bit to clear */ 1381 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG, 1382 readl, !(pmen & DMA_PMEN_PRS), pmen); 1383 1384 raw_spin_unlock_irqrestore(&iommu->register_lock, flags); 1385 } 1386 1387 static void iommu_enable_translation(struct intel_iommu *iommu) 1388 { 1389 u32 sts; 1390 unsigned long flags; 1391 1392 raw_spin_lock_irqsave(&iommu->register_lock, flags); 1393 iommu->gcmd |= DMA_GCMD_TE; 1394 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); 1395 1396 /* Make sure hardware complete it */ 1397 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1398 readl, (sts & DMA_GSTS_TES), sts); 1399 1400 raw_spin_unlock_irqrestore(&iommu->register_lock, flags); 1401 } 1402 1403 static void iommu_disable_translation(struct intel_iommu *iommu) 1404 { 1405 u32 sts; 1406 unsigned long flag; 1407 1408 if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated && 1409 (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap))) 1410 return; 1411 1412 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1413 iommu->gcmd &= ~DMA_GCMD_TE; 1414 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); 1415 1416 /* Make sure hardware complete it */ 1417 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1418 readl, (!(sts & DMA_GSTS_TES)), sts); 1419 1420 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1421 } 1422 1423 static int iommu_init_domains(struct intel_iommu *iommu) 1424 { 1425 u32 ndomains; 1426 1427 ndomains = cap_ndoms(iommu->cap); 1428 pr_debug("%s: Number of Domains supported <%d>\n", 1429 iommu->name, ndomains); 1430 1431 spin_lock_init(&iommu->lock); 1432 1433 iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL); 1434 if (!iommu->domain_ids) 1435 return -ENOMEM; 1436 1437 /* 1438 * If Caching mode is set, then invalid translations are tagged 1439 * with domain-id 0, hence we need to pre-allocate it. We also 1440 * use domain-id 0 as a marker for non-allocated domain-id, so 1441 * make sure it is not used for a real domain. 
1442 */ 1443 set_bit(0, iommu->domain_ids); 1444 1445 /* 1446 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid 1447 * entry for first-level or pass-through translation modes should 1448 * be programmed with a domain id different from those used for 1449 * second-level or nested translation. We reserve a domain id for 1450 * this purpose. 1451 */ 1452 if (sm_supported(iommu)) 1453 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids); 1454 1455 return 0; 1456 } 1457 1458 static void disable_dmar_iommu(struct intel_iommu *iommu) 1459 { 1460 if (!iommu->domain_ids) 1461 return; 1462 1463 /* 1464 * All iommu domains must have been detached from the devices, 1465 * hence there should be no domain IDs in use. 1466 */ 1467 if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap)) 1468 > NUM_RESERVED_DID)) 1469 return; 1470 1471 if (iommu->gcmd & DMA_GCMD_TE) 1472 iommu_disable_translation(iommu); 1473 } 1474 1475 static void free_dmar_iommu(struct intel_iommu *iommu) 1476 { 1477 if (iommu->domain_ids) { 1478 bitmap_free(iommu->domain_ids); 1479 iommu->domain_ids = NULL; 1480 } 1481 1482 if (iommu->copied_tables) { 1483 bitmap_free(iommu->copied_tables); 1484 iommu->copied_tables = NULL; 1485 } 1486 1487 /* free context mapping */ 1488 free_context_table(iommu); 1489 1490 #ifdef CONFIG_INTEL_IOMMU_SVM 1491 if (pasid_supported(iommu)) { 1492 if (ecap_prs(iommu->ecap)) 1493 intel_svm_finish_prq(iommu); 1494 } 1495 #endif 1496 } 1497 1498 /* 1499 * Check and return whether first level is used by default for 1500 * DMA translation. 1501 */ 1502 static bool first_level_by_default(unsigned int type) 1503 { 1504 /* Only SL is available in legacy mode */ 1505 if (!scalable_mode_support()) 1506 return false; 1507 1508 /* Only level (either FL or SL) is available, just use it */ 1509 if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity()) 1510 return intel_cap_flts_sanity(); 1511 1512 /* Both levels are available, decide it based on domain type */ 1513 return type != IOMMU_DOMAIN_UNMANAGED; 1514 } 1515 1516 static struct dmar_domain *alloc_domain(unsigned int type) 1517 { 1518 struct dmar_domain *domain; 1519 1520 domain = kzalloc(sizeof(*domain), GFP_KERNEL); 1521 if (!domain) 1522 return NULL; 1523 1524 domain->nid = NUMA_NO_NODE; 1525 if (first_level_by_default(type)) 1526 domain->use_first_level = true; 1527 domain->has_iotlb_device = false; 1528 INIT_LIST_HEAD(&domain->devices); 1529 INIT_LIST_HEAD(&domain->dev_pasids); 1530 INIT_LIST_HEAD(&domain->cache_tags); 1531 spin_lock_init(&domain->lock); 1532 spin_lock_init(&domain->cache_lock); 1533 xa_init(&domain->iommu_array); 1534 1535 return domain; 1536 } 1537 1538 int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) 1539 { 1540 struct iommu_domain_info *info, *curr; 1541 unsigned long ndomains; 1542 int num, ret = -ENOSPC; 1543 1544 if (domain->domain.type == IOMMU_DOMAIN_SVA) 1545 return 0; 1546 1547 info = kzalloc(sizeof(*info), GFP_KERNEL); 1548 if (!info) 1549 return -ENOMEM; 1550 1551 spin_lock(&iommu->lock); 1552 curr = xa_load(&domain->iommu_array, iommu->seq_id); 1553 if (curr) { 1554 curr->refcnt++; 1555 spin_unlock(&iommu->lock); 1556 kfree(info); 1557 return 0; 1558 } 1559 1560 ndomains = cap_ndoms(iommu->cap); 1561 num = find_first_zero_bit(iommu->domain_ids, ndomains); 1562 if (num >= ndomains) { 1563 pr_err("%s: No free domain ids\n", iommu->name); 1564 goto err_unlock; 1565 } 1566 1567 set_bit(num, iommu->domain_ids); 1568 info->refcnt = 1; 1569 info->did = num; 1570 info->iommu = iommu; 1571 curr = 
xa_cmpxchg(&domain->iommu_array, iommu->seq_id, 1572 NULL, info, GFP_ATOMIC); 1573 if (curr) { 1574 ret = xa_err(curr) ? : -EBUSY; 1575 goto err_clear; 1576 } 1577 domain_update_iommu_cap(domain); 1578 1579 spin_unlock(&iommu->lock); 1580 return 0; 1581 1582 err_clear: 1583 clear_bit(info->did, iommu->domain_ids); 1584 err_unlock: 1585 spin_unlock(&iommu->lock); 1586 kfree(info); 1587 return ret; 1588 } 1589 1590 void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) 1591 { 1592 struct iommu_domain_info *info; 1593 1594 if (domain->domain.type == IOMMU_DOMAIN_SVA) 1595 return; 1596 1597 spin_lock(&iommu->lock); 1598 info = xa_load(&domain->iommu_array, iommu->seq_id); 1599 if (--info->refcnt == 0) { 1600 clear_bit(info->did, iommu->domain_ids); 1601 xa_erase(&domain->iommu_array, iommu->seq_id); 1602 domain->nid = NUMA_NO_NODE; 1603 domain_update_iommu_cap(domain); 1604 kfree(info); 1605 } 1606 spin_unlock(&iommu->lock); 1607 } 1608 1609 static int guestwidth_to_adjustwidth(int gaw) 1610 { 1611 int agaw; 1612 int r = (gaw - 12) % 9; 1613 1614 if (r == 0) 1615 agaw = gaw; 1616 else 1617 agaw = gaw + 9 - r; 1618 if (agaw > 64) 1619 agaw = 64; 1620 return agaw; 1621 } 1622 1623 static void domain_exit(struct dmar_domain *domain) 1624 { 1625 if (domain->pgd) { 1626 LIST_HEAD(freelist); 1627 1628 domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist); 1629 iommu_put_pages_list(&freelist); 1630 } 1631 1632 if (WARN_ON(!list_empty(&domain->devices))) 1633 return; 1634 1635 kfree(domain); 1636 } 1637 1638 static int domain_context_mapping_one(struct dmar_domain *domain, 1639 struct intel_iommu *iommu, 1640 u8 bus, u8 devfn) 1641 { 1642 struct device_domain_info *info = 1643 domain_lookup_dev_info(domain, iommu, bus, devfn); 1644 u16 did = domain_id_iommu(domain, iommu); 1645 int translation = CONTEXT_TT_MULTI_LEVEL; 1646 struct dma_pte *pgd = domain->pgd; 1647 struct context_entry *context; 1648 int agaw, ret; 1649 1650 if (hw_pass_through && domain_type_is_si(domain)) 1651 translation = CONTEXT_TT_PASS_THROUGH; 1652 1653 pr_debug("Set context mapping for %02x:%02x.%d\n", 1654 bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); 1655 1656 spin_lock(&iommu->lock); 1657 ret = -ENOMEM; 1658 context = iommu_context_addr(iommu, bus, devfn, 1); 1659 if (!context) 1660 goto out_unlock; 1661 1662 ret = 0; 1663 if (context_present(context) && !context_copied(iommu, bus, devfn)) 1664 goto out_unlock; 1665 1666 /* 1667 * For kdump cases, old valid entries may be cached due to the 1668 * in-flight DMA and copied pgtable, but there is no unmapping 1669 * behaviour for them, thus we need an explicit cache flush for 1670 * the newly-mapped device. For kdump, at this point, the device 1671 * is supposed to finish reset at its driver probe stage, so no 1672 * in-flight DMA will exist, and we don't need to worry anymore 1673 * hereafter. 1674 */ 1675 if (context_copied(iommu, bus, devfn)) { 1676 u16 did_old = context_domain_id(context); 1677 1678 if (did_old < cap_ndoms(iommu->cap)) { 1679 iommu->flush.flush_context(iommu, did_old, 1680 (((u16)bus) << 8) | devfn, 1681 DMA_CCMD_MASK_NOBIT, 1682 DMA_CCMD_DEVICE_INVL); 1683 iommu->flush.flush_iotlb(iommu, did_old, 0, 0, 1684 DMA_TLB_DSI_FLUSH); 1685 } 1686 1687 clear_context_copied(iommu, bus, devfn); 1688 } 1689 1690 context_clear_entry(context); 1691 context_set_domain_id(context, did); 1692 1693 if (translation != CONTEXT_TT_PASS_THROUGH) { 1694 /* 1695 * Skip top levels of page tables for iommu which has 1696 * less agaw than default. 
Unnecessary for PT mode.
		 */
		for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
			ret = -ENOMEM;
			pgd = phys_to_virt(dma_pte_addr(pgd));
			if (!dma_pte_present(pgd))
				goto out_unlock;
		}

		if (info && info->ats_supported)
			translation = CONTEXT_TT_DEV_IOTLB;
		else
			translation = CONTEXT_TT_MULTI_LEVEL;

		context_set_address_root(context, virt_to_phys(pgd));
		context_set_address_width(context, agaw);
	} else {
		/*
		 * In pass through mode, AW must be programmed to
		 * indicate the largest AGAW value supported by
		 * hardware. And ASR is ignored by hardware.
		 */
		context_set_address_width(context, iommu->msagaw);
	}

	context_set_translation_type(context, translation);
	context_set_fault_enable(context);
	context_set_present(context);
	if (!ecap_coherent(iommu->ecap))
		clflush_cache_range(context, sizeof(*context));

	/*
	 * It's a non-present to present mapping. If hardware doesn't cache
	 * non-present entries we only need to flush the write-buffer. If it
	 * _does_ cache non-present entries, then it does so in the special
	 * domain #0, which we have to flush:
	 */
	if (cap_caching_mode(iommu->cap)) {
		iommu->flush.flush_context(iommu, 0,
					   (((u16)bus) << 8) | devfn,
					   DMA_CCMD_MASK_NOBIT,
					   DMA_CCMD_DEVICE_INVL);
		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
	} else {
		iommu_flush_write_buffer(iommu);
	}

	ret = 0;

out_unlock:
	spin_unlock(&iommu->lock);

	return ret;
}

static int domain_context_mapping_cb(struct pci_dev *pdev,
				     u16 alias, void *opaque)
{
	struct device_domain_info *info = dev_iommu_priv_get(&pdev->dev);
	struct intel_iommu *iommu = info->iommu;
	struct dmar_domain *domain = opaque;

	return domain_context_mapping_one(domain, iommu,
					  PCI_BUS_NUM(alias), alias & 0xff);
}

static int
domain_context_mapping(struct dmar_domain *domain, struct device *dev)
{
	struct device_domain_info *info = dev_iommu_priv_get(dev);
	struct intel_iommu *iommu = info->iommu;
	u8 bus = info->bus, devfn = info->devfn;

	if (!dev_is_pci(dev))
		return domain_context_mapping_one(domain, iommu, bus, devfn);

	return pci_for_each_dma_alias(to_pci_dev(dev),
				      domain_context_mapping_cb, domain);
}

/* Return largest possible superpage level for a given mapping */
static int hardware_largepage_caps(struct dmar_domain *domain, unsigned long iov_pfn,
				   unsigned long phy_pfn, unsigned long pages)
{
	int support, level = 1;
	unsigned long pfnmerge;

	support = domain->iommu_superpage;

	/* To use a large page, the virtual *and* physical addresses
	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
	   of them will mean we have to use smaller pages. So just
	   merge them and check both at once. */
	pfnmerge = iov_pfn | phy_pfn;

	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
		pages >>= VTD_STRIDE_SHIFT;
		if (!pages)
			break;
		pfnmerge >>= VTD_STRIDE_SHIFT;
		level++;
		support--;
	}
	return level;
}

/*
 * Ensure that old small page tables are removed to make room for superpage(s).
 * We're going to add new large pages, so make sure we don't remove their parent
 * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
1806 */ 1807 static void switch_to_super_page(struct dmar_domain *domain, 1808 unsigned long start_pfn, 1809 unsigned long end_pfn, int level) 1810 { 1811 unsigned long lvl_pages = lvl_to_nr_pages(level); 1812 struct dma_pte *pte = NULL; 1813 1814 while (start_pfn <= end_pfn) { 1815 if (!pte) 1816 pte = pfn_to_dma_pte(domain, start_pfn, &level, 1817 GFP_ATOMIC); 1818 1819 if (dma_pte_present(pte)) { 1820 dma_pte_free_pagetable(domain, start_pfn, 1821 start_pfn + lvl_pages - 1, 1822 level + 1); 1823 1824 cache_tag_flush_range(domain, start_pfn << VTD_PAGE_SHIFT, 1825 end_pfn << VTD_PAGE_SHIFT, 0); 1826 } 1827 1828 pte++; 1829 start_pfn += lvl_pages; 1830 if (first_pte_in_page(pte)) 1831 pte = NULL; 1832 } 1833 } 1834 1835 static int 1836 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, 1837 unsigned long phys_pfn, unsigned long nr_pages, int prot, 1838 gfp_t gfp) 1839 { 1840 struct dma_pte *first_pte = NULL, *pte = NULL; 1841 unsigned int largepage_lvl = 0; 1842 unsigned long lvl_pages = 0; 1843 phys_addr_t pteval; 1844 u64 attr; 1845 1846 if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1))) 1847 return -EINVAL; 1848 1849 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0) 1850 return -EINVAL; 1851 1852 if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) { 1853 pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n"); 1854 return -EINVAL; 1855 } 1856 1857 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP); 1858 attr |= DMA_FL_PTE_PRESENT; 1859 if (domain->use_first_level) { 1860 attr |= DMA_FL_PTE_US | DMA_FL_PTE_ACCESS; 1861 if (prot & DMA_PTE_WRITE) 1862 attr |= DMA_FL_PTE_DIRTY; 1863 } 1864 1865 domain->has_mappings = true; 1866 1867 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr; 1868 1869 while (nr_pages > 0) { 1870 uint64_t tmp; 1871 1872 if (!pte) { 1873 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, 1874 phys_pfn, nr_pages); 1875 1876 pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl, 1877 gfp); 1878 if (!pte) 1879 return -ENOMEM; 1880 first_pte = pte; 1881 1882 lvl_pages = lvl_to_nr_pages(largepage_lvl); 1883 1884 /* It is large page*/ 1885 if (largepage_lvl > 1) { 1886 unsigned long end_pfn; 1887 unsigned long pages_to_remove; 1888 1889 pteval |= DMA_PTE_LARGE_PAGE; 1890 pages_to_remove = min_t(unsigned long, nr_pages, 1891 nr_pte_to_next_page(pte) * lvl_pages); 1892 end_pfn = iov_pfn + pages_to_remove - 1; 1893 switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl); 1894 } else { 1895 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE; 1896 } 1897 1898 } 1899 /* We don't need lock here, nobody else 1900 * touches the iova range 1901 */ 1902 tmp = 0ULL; 1903 if (!try_cmpxchg64_local(&pte->val, &tmp, pteval)) { 1904 static int dumps = 5; 1905 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n", 1906 iov_pfn, tmp, (unsigned long long)pteval); 1907 if (dumps) { 1908 dumps--; 1909 debug_dma_dump_mappings(NULL); 1910 } 1911 WARN_ON(1); 1912 } 1913 1914 nr_pages -= lvl_pages; 1915 iov_pfn += lvl_pages; 1916 phys_pfn += lvl_pages; 1917 pteval += lvl_pages * VTD_PAGE_SIZE; 1918 1919 /* If the next PTE would be the first in a new page, then we 1920 * need to flush the cache on the entries we've just written. 1921 * And then we'll need to recalculate 'pte', so clear it and 1922 * let it get set again in the if (!pte) block above. 1923 * 1924 * If we're done (!nr_pages) we need to flush the cache too. 
1925 * 1926 * Also if we've been setting superpages, we may need to 1927 * recalculate 'pte' and switch back to smaller pages for the 1928 * end of the mapping, if the trailing size is not enough to 1929 * use another superpage (i.e. nr_pages < lvl_pages). 1930 */ 1931 pte++; 1932 if (!nr_pages || first_pte_in_page(pte) || 1933 (largepage_lvl > 1 && nr_pages < lvl_pages)) { 1934 domain_flush_cache(domain, first_pte, 1935 (void *)pte - (void *)first_pte); 1936 pte = NULL; 1937 } 1938 } 1939 1940 return 0; 1941 } 1942 1943 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn) 1944 { 1945 struct intel_iommu *iommu = info->iommu; 1946 struct context_entry *context; 1947 u16 did; 1948 1949 spin_lock(&iommu->lock); 1950 context = iommu_context_addr(iommu, bus, devfn, 0); 1951 if (!context) { 1952 spin_unlock(&iommu->lock); 1953 return; 1954 } 1955 1956 did = context_domain_id(context); 1957 context_clear_entry(context); 1958 __iommu_flush_cache(iommu, context, sizeof(*context)); 1959 spin_unlock(&iommu->lock); 1960 intel_context_flush_present(info, context, did, true); 1961 } 1962 1963 static int domain_setup_first_level(struct intel_iommu *iommu, 1964 struct dmar_domain *domain, 1965 struct device *dev, 1966 u32 pasid) 1967 { 1968 struct dma_pte *pgd = domain->pgd; 1969 int agaw, level; 1970 int flags = 0; 1971 1972 /* 1973 * Skip top levels of page tables for iommu which has 1974 * less agaw than default. Unnecessary for PT mode. 1975 */ 1976 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { 1977 pgd = phys_to_virt(dma_pte_addr(pgd)); 1978 if (!dma_pte_present(pgd)) 1979 return -ENOMEM; 1980 } 1981 1982 level = agaw_to_level(agaw); 1983 if (level != 4 && level != 5) 1984 return -EINVAL; 1985 1986 if (level == 5) 1987 flags |= PASID_FLAG_FL5LP; 1988 1989 if (domain->force_snooping) 1990 flags |= PASID_FLAG_PAGE_SNOOP; 1991 1992 return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid, 1993 domain_id_iommu(domain, iommu), 1994 flags); 1995 } 1996 1997 static bool dev_is_real_dma_subdevice(struct device *dev) 1998 { 1999 return dev && dev_is_pci(dev) && 2000 pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev); 2001 } 2002 2003 static int iommu_domain_identity_map(struct dmar_domain *domain, 2004 unsigned long first_vpfn, 2005 unsigned long last_vpfn) 2006 { 2007 /* 2008 * RMRR range might have overlap with physical memory range, 2009 * clear it first 2010 */ 2011 dma_pte_clear_range(domain, first_vpfn, last_vpfn); 2012 2013 return __domain_mapping(domain, first_vpfn, 2014 first_vpfn, last_vpfn - first_vpfn + 1, 2015 DMA_PTE_READ|DMA_PTE_WRITE, GFP_KERNEL); 2016 } 2017 2018 static int md_domain_init(struct dmar_domain *domain, int guest_width); 2019 2020 static int __init si_domain_init(int hw) 2021 { 2022 struct dmar_rmrr_unit *rmrr; 2023 struct device *dev; 2024 int i, nid, ret; 2025 2026 si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY); 2027 if (!si_domain) 2028 return -EFAULT; 2029 2030 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { 2031 domain_exit(si_domain); 2032 si_domain = NULL; 2033 return -EFAULT; 2034 } 2035 2036 if (hw) 2037 return 0; 2038 2039 for_each_online_node(nid) { 2040 unsigned long start_pfn, end_pfn; 2041 int i; 2042 2043 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 2044 ret = iommu_domain_identity_map(si_domain, 2045 mm_to_dma_pfn_start(start_pfn), 2046 mm_to_dma_pfn_end(end_pfn-1)); 2047 if (ret) 2048 return ret; 2049 } 2050 } 2051 2052 /* 2053 * Identity map the RMRRs so that devices with 
RMRRs could also use 2054 * the si_domain. 2055 */ 2056 for_each_rmrr_units(rmrr) { 2057 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt, 2058 i, dev) { 2059 unsigned long long start = rmrr->base_address; 2060 unsigned long long end = rmrr->end_address; 2061 2062 if (WARN_ON(end < start || 2063 end >> agaw_to_width(si_domain->agaw))) 2064 continue; 2065 2066 ret = iommu_domain_identity_map(si_domain, 2067 mm_to_dma_pfn_start(start >> PAGE_SHIFT), 2068 mm_to_dma_pfn_end(end >> PAGE_SHIFT)); 2069 if (ret) 2070 return ret; 2071 } 2072 } 2073 2074 return 0; 2075 } 2076 2077 static int dmar_domain_attach_device(struct dmar_domain *domain, 2078 struct device *dev) 2079 { 2080 struct device_domain_info *info = dev_iommu_priv_get(dev); 2081 struct intel_iommu *iommu = info->iommu; 2082 unsigned long flags; 2083 int ret; 2084 2085 ret = domain_attach_iommu(domain, iommu); 2086 if (ret) 2087 return ret; 2088 2089 info->domain = domain; 2090 spin_lock_irqsave(&domain->lock, flags); 2091 list_add(&info->link, &domain->devices); 2092 spin_unlock_irqrestore(&domain->lock, flags); 2093 2094 if (dev_is_real_dma_subdevice(dev)) 2095 return 0; 2096 2097 if (!sm_supported(iommu)) 2098 ret = domain_context_mapping(domain, dev); 2099 else if (hw_pass_through && domain_type_is_si(domain)) 2100 ret = intel_pasid_setup_pass_through(iommu, dev, IOMMU_NO_PASID); 2101 else if (domain->use_first_level) 2102 ret = domain_setup_first_level(iommu, domain, dev, IOMMU_NO_PASID); 2103 else 2104 ret = intel_pasid_setup_second_level(iommu, domain, dev, IOMMU_NO_PASID); 2105 2106 if (ret) 2107 goto out_block_translation; 2108 2109 if (sm_supported(info->iommu) || !domain_type_is_si(info->domain)) 2110 iommu_enable_pci_caps(info); 2111 2112 ret = cache_tag_assign_domain(domain, dev, IOMMU_NO_PASID); 2113 if (ret) 2114 goto out_block_translation; 2115 2116 return 0; 2117 2118 out_block_translation: 2119 device_block_translation(dev); 2120 return ret; 2121 } 2122 2123 /** 2124 * device_rmrr_is_relaxable - Test whether the RMRR of this device 2125 * is relaxable (ie. is allowed to be not enforced under some conditions) 2126 * @dev: device handle 2127 * 2128 * We assume that PCI USB devices with RMRRs have them largely 2129 * for historical reasons and that the RMRR space is not actively used post 2130 * boot. This exclusion may change if vendors begin to abuse it. 2131 * 2132 * The same exception is made for graphics devices, with the requirement that 2133 * any use of the RMRR regions will be torn down before assigning the device 2134 * to a guest. 2135 * 2136 * Return: true if the RMRR is relaxable, false otherwise 2137 */ 2138 static bool device_rmrr_is_relaxable(struct device *dev) 2139 { 2140 struct pci_dev *pdev; 2141 2142 if (!dev_is_pci(dev)) 2143 return false; 2144 2145 pdev = to_pci_dev(dev); 2146 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev)) 2147 return true; 2148 else 2149 return false; 2150 } 2151 2152 static int device_def_domain_type(struct device *dev) 2153 { 2154 if (dev_is_pci(dev)) { 2155 struct pci_dev *pdev = to_pci_dev(dev); 2156 2157 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev)) 2158 return IOMMU_DOMAIN_IDENTITY; 2159 } 2160 2161 return 0; 2162 } 2163 2164 static void intel_iommu_init_qi(struct intel_iommu *iommu) 2165 { 2166 /* 2167 * Start from the sane iommu hardware state. 2168 * If the queued invalidation is already initialized by us 2169 * (for example, while enabling interrupt-remapping) then 2170 * we got the things already rolling from a sane state. 
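 *
 * If dmar_enable_qi() fails below, the flush hooks fall back to the
 * register-based invalidation routines, so callers can always go
 * through iommu->flush.flush_context()/flush_iotlb() without caring
 * which invalidation mechanism ended up being used.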
2171 */ 2172 if (!iommu->qi) { 2173 /* 2174 * Clear any previous faults. 2175 */ 2176 dmar_fault(-1, iommu); 2177 /* 2178 * Disable queued invalidation if supported and already enabled 2179 * before OS handover. 2180 */ 2181 dmar_disable_qi(iommu); 2182 } 2183 2184 if (dmar_enable_qi(iommu)) { 2185 /* 2186 * Queued Invalidate not enabled, use Register Based Invalidate 2187 */ 2188 iommu->flush.flush_context = __iommu_flush_context; 2189 iommu->flush.flush_iotlb = __iommu_flush_iotlb; 2190 pr_info("%s: Using Register based invalidation\n", 2191 iommu->name); 2192 } else { 2193 iommu->flush.flush_context = qi_flush_context; 2194 iommu->flush.flush_iotlb = qi_flush_iotlb; 2195 pr_info("%s: Using Queued invalidation\n", iommu->name); 2196 } 2197 } 2198 2199 static int copy_context_table(struct intel_iommu *iommu, 2200 struct root_entry *old_re, 2201 struct context_entry **tbl, 2202 int bus, bool ext) 2203 { 2204 int tbl_idx, pos = 0, idx, devfn, ret = 0, did; 2205 struct context_entry *new_ce = NULL, ce; 2206 struct context_entry *old_ce = NULL; 2207 struct root_entry re; 2208 phys_addr_t old_ce_phys; 2209 2210 tbl_idx = ext ? bus * 2 : bus; 2211 memcpy(&re, old_re, sizeof(re)); 2212 2213 for (devfn = 0; devfn < 256; devfn++) { 2214 /* First calculate the correct index */ 2215 idx = (ext ? devfn * 2 : devfn) % 256; 2216 2217 if (idx == 0) { 2218 /* First save what we may have and clean up */ 2219 if (new_ce) { 2220 tbl[tbl_idx] = new_ce; 2221 __iommu_flush_cache(iommu, new_ce, 2222 VTD_PAGE_SIZE); 2223 pos = 1; 2224 } 2225 2226 if (old_ce) 2227 memunmap(old_ce); 2228 2229 ret = 0; 2230 if (devfn < 0x80) 2231 old_ce_phys = root_entry_lctp(&re); 2232 else 2233 old_ce_phys = root_entry_uctp(&re); 2234 2235 if (!old_ce_phys) { 2236 if (ext && devfn == 0) { 2237 /* No LCTP, try UCTP */ 2238 devfn = 0x7f; 2239 continue; 2240 } else { 2241 goto out; 2242 } 2243 } 2244 2245 ret = -ENOMEM; 2246 old_ce = memremap(old_ce_phys, PAGE_SIZE, 2247 MEMREMAP_WB); 2248 if (!old_ce) 2249 goto out; 2250 2251 new_ce = iommu_alloc_page_node(iommu->node, GFP_KERNEL); 2252 if (!new_ce) 2253 goto out_unmap; 2254 2255 ret = 0; 2256 } 2257 2258 /* Now copy the context entry */ 2259 memcpy(&ce, old_ce + idx, sizeof(ce)); 2260 2261 if (!context_present(&ce)) 2262 continue; 2263 2264 did = context_domain_id(&ce); 2265 if (did >= 0 && did < cap_ndoms(iommu->cap)) 2266 set_bit(did, iommu->domain_ids); 2267 2268 set_context_copied(iommu, bus, devfn); 2269 new_ce[idx] = ce; 2270 } 2271 2272 tbl[tbl_idx + pos] = new_ce; 2273 2274 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE); 2275 2276 out_unmap: 2277 memunmap(old_ce); 2278 2279 out: 2280 return ret; 2281 } 2282 2283 static int copy_translation_tables(struct intel_iommu *iommu) 2284 { 2285 struct context_entry **ctxt_tbls; 2286 struct root_entry *old_rt; 2287 phys_addr_t old_rt_phys; 2288 int ctxt_table_entries; 2289 u64 rtaddr_reg; 2290 int bus, ret; 2291 bool new_ext, ext; 2292 2293 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG); 2294 ext = !!(rtaddr_reg & DMA_RTADDR_SMT); 2295 new_ext = !!sm_supported(iommu); 2296 2297 /* 2298 * The RTT bit can only be changed when translation is disabled, 2299 * but disabling translation means to open a window for data 2300 * corruption. So bail out and don't copy anything if we would 2301 * have to change the bit. 
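 *
 * Note that when the old kernel ran in scalable mode, each root entry
 * carries two context-table pointers (lower devfn 0x00-0x7f and upper
 * devfn 0x80-0xff), which is why the copy below allocates 512 table
 * slots and indexes them as "bus * 2" when "ext" is set.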
2302 */ 2303 if (new_ext != ext) 2304 return -EINVAL; 2305 2306 iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL); 2307 if (!iommu->copied_tables) 2308 return -ENOMEM; 2309 2310 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK; 2311 if (!old_rt_phys) 2312 return -EINVAL; 2313 2314 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB); 2315 if (!old_rt) 2316 return -ENOMEM; 2317 2318 /* This is too big for the stack - allocate it from slab */ 2319 ctxt_table_entries = ext ? 512 : 256; 2320 ret = -ENOMEM; 2321 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL); 2322 if (!ctxt_tbls) 2323 goto out_unmap; 2324 2325 for (bus = 0; bus < 256; bus++) { 2326 ret = copy_context_table(iommu, &old_rt[bus], 2327 ctxt_tbls, bus, ext); 2328 if (ret) { 2329 pr_err("%s: Failed to copy context table for bus %d\n", 2330 iommu->name, bus); 2331 continue; 2332 } 2333 } 2334 2335 spin_lock(&iommu->lock); 2336 2337 /* Context tables are copied, now write them to the root_entry table */ 2338 for (bus = 0; bus < 256; bus++) { 2339 int idx = ext ? bus * 2 : bus; 2340 u64 val; 2341 2342 if (ctxt_tbls[idx]) { 2343 val = virt_to_phys(ctxt_tbls[idx]) | 1; 2344 iommu->root_entry[bus].lo = val; 2345 } 2346 2347 if (!ext || !ctxt_tbls[idx + 1]) 2348 continue; 2349 2350 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1; 2351 iommu->root_entry[bus].hi = val; 2352 } 2353 2354 spin_unlock(&iommu->lock); 2355 2356 kfree(ctxt_tbls); 2357 2358 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE); 2359 2360 ret = 0; 2361 2362 out_unmap: 2363 memunmap(old_rt); 2364 2365 return ret; 2366 } 2367 2368 static int __init init_dmars(void) 2369 { 2370 struct dmar_drhd_unit *drhd; 2371 struct intel_iommu *iommu; 2372 int ret; 2373 2374 ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL); 2375 if (ret) 2376 goto free_iommu; 2377 2378 for_each_iommu(iommu, drhd) { 2379 if (drhd->ignored) { 2380 iommu_disable_translation(iommu); 2381 continue; 2382 } 2383 2384 /* 2385 * Find the max pasid size of all IOMMU's in the system. 2386 * We need to ensure the system pasid table is no bigger 2387 * than the smallest supported. 2388 */ 2389 if (pasid_supported(iommu)) { 2390 u32 temp = 2 << ecap_pss(iommu->ecap); 2391 2392 intel_pasid_max_id = min_t(u32, temp, 2393 intel_pasid_max_id); 2394 } 2395 2396 intel_iommu_init_qi(iommu); 2397 2398 ret = iommu_init_domains(iommu); 2399 if (ret) 2400 goto free_iommu; 2401 2402 init_translation_status(iommu); 2403 2404 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) { 2405 iommu_disable_translation(iommu); 2406 clear_translation_pre_enabled(iommu); 2407 pr_warn("Translation was enabled for %s but we are not in kdump mode\n", 2408 iommu->name); 2409 } 2410 2411 /* 2412 * TBD: 2413 * we could share the same root & context tables 2414 * among all IOMMU's. Need to Split it later. 2415 */ 2416 ret = iommu_alloc_root_entry(iommu); 2417 if (ret) 2418 goto free_iommu; 2419 2420 if (translation_pre_enabled(iommu)) { 2421 pr_info("Translation already enabled - trying to copy translation structures\n"); 2422 2423 ret = copy_translation_tables(iommu); 2424 if (ret) { 2425 /* 2426 * We found the IOMMU with translation 2427 * enabled - but failed to copy over the 2428 * old root-entry table. Try to proceed 2429 * by disabling translation now and 2430 * allocating a clean root-entry table. 2431 * This might cause DMAR faults, but 2432 * probably the dump will still succeed. 
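 *
 * In that case translation is disabled and the pre-enabled flag is
 * cleared below, so initialization continues as it would on a cold
 * boot with a freshly allocated root table.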
2433 */ 2434 pr_err("Failed to copy translation tables from previous kernel for %s\n", 2435 iommu->name); 2436 iommu_disable_translation(iommu); 2437 clear_translation_pre_enabled(iommu); 2438 } else { 2439 pr_info("Copied translation tables from previous kernel for %s\n", 2440 iommu->name); 2441 } 2442 } 2443 2444 if (!ecap_pass_through(iommu->ecap)) 2445 hw_pass_through = 0; 2446 intel_svm_check(iommu); 2447 } 2448 2449 /* 2450 * Now that qi is enabled on all iommus, set the root entry and flush 2451 * caches. This is required on some Intel X58 chipsets, otherwise the 2452 * flush_context function will loop forever and the boot hangs. 2453 */ 2454 for_each_active_iommu(iommu, drhd) { 2455 iommu_flush_write_buffer(iommu); 2456 iommu_set_root_entry(iommu); 2457 } 2458 2459 check_tylersburg_isoch(); 2460 2461 ret = si_domain_init(hw_pass_through); 2462 if (ret) 2463 goto free_iommu; 2464 2465 /* 2466 * for each drhd 2467 * enable fault log 2468 * global invalidate context cache 2469 * global invalidate iotlb 2470 * enable translation 2471 */ 2472 for_each_iommu(iommu, drhd) { 2473 if (drhd->ignored) { 2474 /* 2475 * we always have to disable PMRs or DMA may fail on 2476 * this device 2477 */ 2478 if (force_on) 2479 iommu_disable_protect_mem_regions(iommu); 2480 continue; 2481 } 2482 2483 iommu_flush_write_buffer(iommu); 2484 2485 #ifdef CONFIG_INTEL_IOMMU_SVM 2486 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) { 2487 /* 2488 * Call dmar_alloc_hwirq() with dmar_global_lock held, 2489 * could cause possible lock race condition. 2490 */ 2491 up_write(&dmar_global_lock); 2492 ret = intel_svm_enable_prq(iommu); 2493 down_write(&dmar_global_lock); 2494 if (ret) 2495 goto free_iommu; 2496 } 2497 #endif 2498 ret = dmar_set_interrupt(iommu); 2499 if (ret) 2500 goto free_iommu; 2501 } 2502 2503 return 0; 2504 2505 free_iommu: 2506 for_each_active_iommu(iommu, drhd) { 2507 disable_dmar_iommu(iommu); 2508 free_dmar_iommu(iommu); 2509 } 2510 if (si_domain) { 2511 domain_exit(si_domain); 2512 si_domain = NULL; 2513 } 2514 2515 return ret; 2516 } 2517 2518 static void __init init_no_remapping_devices(void) 2519 { 2520 struct dmar_drhd_unit *drhd; 2521 struct device *dev; 2522 int i; 2523 2524 for_each_drhd_unit(drhd) { 2525 if (!drhd->include_all) { 2526 for_each_active_dev_scope(drhd->devices, 2527 drhd->devices_cnt, i, dev) 2528 break; 2529 /* ignore DMAR unit if no devices exist */ 2530 if (i == drhd->devices_cnt) 2531 drhd->ignored = 1; 2532 } 2533 } 2534 2535 for_each_active_drhd_unit(drhd) { 2536 if (drhd->include_all) 2537 continue; 2538 2539 for_each_active_dev_scope(drhd->devices, 2540 drhd->devices_cnt, i, dev) 2541 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev))) 2542 break; 2543 if (i < drhd->devices_cnt) 2544 continue; 2545 2546 /* This IOMMU has *only* gfx devices. 
Either bypass it or 2547 set the gfx_mapped flag, as appropriate */ 2548 drhd->gfx_dedicated = 1; 2549 if (disable_igfx_iommu) 2550 drhd->ignored = 1; 2551 } 2552 } 2553 2554 #ifdef CONFIG_SUSPEND 2555 static int init_iommu_hw(void) 2556 { 2557 struct dmar_drhd_unit *drhd; 2558 struct intel_iommu *iommu = NULL; 2559 int ret; 2560 2561 for_each_active_iommu(iommu, drhd) { 2562 if (iommu->qi) { 2563 ret = dmar_reenable_qi(iommu); 2564 if (ret) 2565 return ret; 2566 } 2567 } 2568 2569 for_each_iommu(iommu, drhd) { 2570 if (drhd->ignored) { 2571 /* 2572 * we always have to disable PMRs or DMA may fail on 2573 * this device 2574 */ 2575 if (force_on) 2576 iommu_disable_protect_mem_regions(iommu); 2577 continue; 2578 } 2579 2580 iommu_flush_write_buffer(iommu); 2581 iommu_set_root_entry(iommu); 2582 iommu_enable_translation(iommu); 2583 iommu_disable_protect_mem_regions(iommu); 2584 } 2585 2586 return 0; 2587 } 2588 2589 static void iommu_flush_all(void) 2590 { 2591 struct dmar_drhd_unit *drhd; 2592 struct intel_iommu *iommu; 2593 2594 for_each_active_iommu(iommu, drhd) { 2595 iommu->flush.flush_context(iommu, 0, 0, 0, 2596 DMA_CCMD_GLOBAL_INVL); 2597 iommu->flush.flush_iotlb(iommu, 0, 0, 0, 2598 DMA_TLB_GLOBAL_FLUSH); 2599 } 2600 } 2601 2602 static int iommu_suspend(void) 2603 { 2604 struct dmar_drhd_unit *drhd; 2605 struct intel_iommu *iommu = NULL; 2606 unsigned long flag; 2607 2608 iommu_flush_all(); 2609 2610 for_each_active_iommu(iommu, drhd) { 2611 iommu_disable_translation(iommu); 2612 2613 raw_spin_lock_irqsave(&iommu->register_lock, flag); 2614 2615 iommu->iommu_state[SR_DMAR_FECTL_REG] = 2616 readl(iommu->reg + DMAR_FECTL_REG); 2617 iommu->iommu_state[SR_DMAR_FEDATA_REG] = 2618 readl(iommu->reg + DMAR_FEDATA_REG); 2619 iommu->iommu_state[SR_DMAR_FEADDR_REG] = 2620 readl(iommu->reg + DMAR_FEADDR_REG); 2621 iommu->iommu_state[SR_DMAR_FEUADDR_REG] = 2622 readl(iommu->reg + DMAR_FEUADDR_REG); 2623 2624 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 2625 } 2626 return 0; 2627 } 2628 2629 static void iommu_resume(void) 2630 { 2631 struct dmar_drhd_unit *drhd; 2632 struct intel_iommu *iommu = NULL; 2633 unsigned long flag; 2634 2635 if (init_iommu_hw()) { 2636 if (force_on) 2637 panic("tboot: IOMMU setup failed, DMAR can not resume!\n"); 2638 else 2639 WARN(1, "IOMMU setup failed, DMAR can not resume!\n"); 2640 return; 2641 } 2642 2643 for_each_active_iommu(iommu, drhd) { 2644 2645 raw_spin_lock_irqsave(&iommu->register_lock, flag); 2646 2647 writel(iommu->iommu_state[SR_DMAR_FECTL_REG], 2648 iommu->reg + DMAR_FECTL_REG); 2649 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG], 2650 iommu->reg + DMAR_FEDATA_REG); 2651 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG], 2652 iommu->reg + DMAR_FEADDR_REG); 2653 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG], 2654 iommu->reg + DMAR_FEUADDR_REG); 2655 2656 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 2657 } 2658 } 2659 2660 static struct syscore_ops iommu_syscore_ops = { 2661 .resume = iommu_resume, 2662 .suspend = iommu_suspend, 2663 }; 2664 2665 static void __init init_iommu_pm_ops(void) 2666 { 2667 register_syscore_ops(&iommu_syscore_ops); 2668 } 2669 2670 #else 2671 static inline void init_iommu_pm_ops(void) {} 2672 #endif /* CONFIG_PM */ 2673 2674 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr) 2675 { 2676 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) || 2677 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) || 2678 rmrr->end_address <= rmrr->base_address || 2679 arch_rmrr_sanity_check(rmrr)) 2680 return 
-EINVAL; 2681 2682 return 0; 2683 } 2684 2685 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg) 2686 { 2687 struct acpi_dmar_reserved_memory *rmrr; 2688 struct dmar_rmrr_unit *rmrru; 2689 2690 rmrr = (struct acpi_dmar_reserved_memory *)header; 2691 if (rmrr_sanity_check(rmrr)) { 2692 pr_warn(FW_BUG 2693 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n" 2694 "BIOS vendor: %s; Ver: %s; Product Version: %s\n", 2695 rmrr->base_address, rmrr->end_address, 2696 dmi_get_system_info(DMI_BIOS_VENDOR), 2697 dmi_get_system_info(DMI_BIOS_VERSION), 2698 dmi_get_system_info(DMI_PRODUCT_VERSION)); 2699 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); 2700 } 2701 2702 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL); 2703 if (!rmrru) 2704 goto out; 2705 2706 rmrru->hdr = header; 2707 2708 rmrru->base_address = rmrr->base_address; 2709 rmrru->end_address = rmrr->end_address; 2710 2711 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1), 2712 ((void *)rmrr) + rmrr->header.length, 2713 &rmrru->devices_cnt); 2714 if (rmrru->devices_cnt && rmrru->devices == NULL) 2715 goto free_rmrru; 2716 2717 list_add(&rmrru->list, &dmar_rmrr_units); 2718 2719 return 0; 2720 free_rmrru: 2721 kfree(rmrru); 2722 out: 2723 return -ENOMEM; 2724 } 2725 2726 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr) 2727 { 2728 struct dmar_atsr_unit *atsru; 2729 struct acpi_dmar_atsr *tmp; 2730 2731 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list, 2732 dmar_rcu_check()) { 2733 tmp = (struct acpi_dmar_atsr *)atsru->hdr; 2734 if (atsr->segment != tmp->segment) 2735 continue; 2736 if (atsr->header.length != tmp->header.length) 2737 continue; 2738 if (memcmp(atsr, tmp, atsr->header.length) == 0) 2739 return atsru; 2740 } 2741 2742 return NULL; 2743 } 2744 2745 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg) 2746 { 2747 struct acpi_dmar_atsr *atsr; 2748 struct dmar_atsr_unit *atsru; 2749 2750 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled) 2751 return 0; 2752 2753 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 2754 atsru = dmar_find_atsr(atsr); 2755 if (atsru) 2756 return 0; 2757 2758 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL); 2759 if (!atsru) 2760 return -ENOMEM; 2761 2762 /* 2763 * If memory is allocated from slab by ACPI _DSM method, we need to 2764 * copy the memory content because the memory buffer will be freed 2765 * on return. 
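 *
 * The copy lives directly behind the dmar_atsr_unit allocation
 * (atsru + 1), so the single kfree() in intel_iommu_free_atsr()
 * releases the header copy together with the unit itself.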
2766 */ 2767 atsru->hdr = (void *)(atsru + 1); 2768 memcpy(atsru->hdr, hdr, hdr->length); 2769 atsru->include_all = atsr->flags & 0x1; 2770 if (!atsru->include_all) { 2771 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1), 2772 (void *)atsr + atsr->header.length, 2773 &atsru->devices_cnt); 2774 if (atsru->devices_cnt && atsru->devices == NULL) { 2775 kfree(atsru); 2776 return -ENOMEM; 2777 } 2778 } 2779 2780 list_add_rcu(&atsru->list, &dmar_atsr_units); 2781 2782 return 0; 2783 } 2784 2785 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru) 2786 { 2787 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt); 2788 kfree(atsru); 2789 } 2790 2791 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg) 2792 { 2793 struct acpi_dmar_atsr *atsr; 2794 struct dmar_atsr_unit *atsru; 2795 2796 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 2797 atsru = dmar_find_atsr(atsr); 2798 if (atsru) { 2799 list_del_rcu(&atsru->list); 2800 synchronize_rcu(); 2801 intel_iommu_free_atsr(atsru); 2802 } 2803 2804 return 0; 2805 } 2806 2807 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg) 2808 { 2809 int i; 2810 struct device *dev; 2811 struct acpi_dmar_atsr *atsr; 2812 struct dmar_atsr_unit *atsru; 2813 2814 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 2815 atsru = dmar_find_atsr(atsr); 2816 if (!atsru) 2817 return 0; 2818 2819 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) { 2820 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt, 2821 i, dev) 2822 return -EBUSY; 2823 } 2824 2825 return 0; 2826 } 2827 2828 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc) 2829 { 2830 struct dmar_satc_unit *satcu; 2831 struct acpi_dmar_satc *tmp; 2832 2833 list_for_each_entry_rcu(satcu, &dmar_satc_units, list, 2834 dmar_rcu_check()) { 2835 tmp = (struct acpi_dmar_satc *)satcu->hdr; 2836 if (satc->segment != tmp->segment) 2837 continue; 2838 if (satc->header.length != tmp->header.length) 2839 continue; 2840 if (memcmp(satc, tmp, satc->header.length) == 0) 2841 return satcu; 2842 } 2843 2844 return NULL; 2845 } 2846 2847 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg) 2848 { 2849 struct acpi_dmar_satc *satc; 2850 struct dmar_satc_unit *satcu; 2851 2852 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled) 2853 return 0; 2854 2855 satc = container_of(hdr, struct acpi_dmar_satc, header); 2856 satcu = dmar_find_satc(satc); 2857 if (satcu) 2858 return 0; 2859 2860 satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL); 2861 if (!satcu) 2862 return -ENOMEM; 2863 2864 satcu->hdr = (void *)(satcu + 1); 2865 memcpy(satcu->hdr, hdr, hdr->length); 2866 satcu->atc_required = satc->flags & 0x1; 2867 satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1), 2868 (void *)satc + satc->header.length, 2869 &satcu->devices_cnt); 2870 if (satcu->devices_cnt && !satcu->devices) { 2871 kfree(satcu); 2872 return -ENOMEM; 2873 } 2874 list_add_rcu(&satcu->list, &dmar_satc_units); 2875 2876 return 0; 2877 } 2878 2879 static int intel_iommu_add(struct dmar_drhd_unit *dmaru) 2880 { 2881 int sp, ret; 2882 struct intel_iommu *iommu = dmaru->iommu; 2883 2884 ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu); 2885 if (ret) 2886 goto out; 2887 2888 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) { 2889 pr_warn("%s: Doesn't support hardware pass through.\n", 2890 iommu->name); 2891 return -ENXIO; 2892 } 2893 2894 sp = domain_update_iommu_superpage(NULL, iommu) - 1; 2895 if (sp >= 0 && 
!(cap_super_page_val(iommu->cap) & (1 << sp))) { 2896 pr_warn("%s: Doesn't support large page.\n", 2897 iommu->name); 2898 return -ENXIO; 2899 } 2900 2901 /* 2902 * Disable translation if already enabled prior to OS handover. 2903 */ 2904 if (iommu->gcmd & DMA_GCMD_TE) 2905 iommu_disable_translation(iommu); 2906 2907 ret = iommu_init_domains(iommu); 2908 if (ret == 0) 2909 ret = iommu_alloc_root_entry(iommu); 2910 if (ret) 2911 goto out; 2912 2913 intel_svm_check(iommu); 2914 2915 if (dmaru->ignored) { 2916 /* 2917 * we always have to disable PMRs or DMA may fail on this device 2918 */ 2919 if (force_on) 2920 iommu_disable_protect_mem_regions(iommu); 2921 return 0; 2922 } 2923 2924 intel_iommu_init_qi(iommu); 2925 iommu_flush_write_buffer(iommu); 2926 2927 #ifdef CONFIG_INTEL_IOMMU_SVM 2928 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) { 2929 ret = intel_svm_enable_prq(iommu); 2930 if (ret) 2931 goto disable_iommu; 2932 } 2933 #endif 2934 ret = dmar_set_interrupt(iommu); 2935 if (ret) 2936 goto disable_iommu; 2937 2938 iommu_set_root_entry(iommu); 2939 iommu_enable_translation(iommu); 2940 2941 iommu_disable_protect_mem_regions(iommu); 2942 return 0; 2943 2944 disable_iommu: 2945 disable_dmar_iommu(iommu); 2946 out: 2947 free_dmar_iommu(iommu); 2948 return ret; 2949 } 2950 2951 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert) 2952 { 2953 int ret = 0; 2954 struct intel_iommu *iommu = dmaru->iommu; 2955 2956 if (!intel_iommu_enabled) 2957 return 0; 2958 if (iommu == NULL) 2959 return -EINVAL; 2960 2961 if (insert) { 2962 ret = intel_iommu_add(dmaru); 2963 } else { 2964 disable_dmar_iommu(iommu); 2965 free_dmar_iommu(iommu); 2966 } 2967 2968 return ret; 2969 } 2970 2971 static void intel_iommu_free_dmars(void) 2972 { 2973 struct dmar_rmrr_unit *rmrru, *rmrr_n; 2974 struct dmar_atsr_unit *atsru, *atsr_n; 2975 struct dmar_satc_unit *satcu, *satc_n; 2976 2977 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) { 2978 list_del(&rmrru->list); 2979 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt); 2980 kfree(rmrru); 2981 } 2982 2983 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) { 2984 list_del(&atsru->list); 2985 intel_iommu_free_atsr(atsru); 2986 } 2987 list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) { 2988 list_del(&satcu->list); 2989 dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt); 2990 kfree(satcu); 2991 } 2992 } 2993 2994 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev) 2995 { 2996 struct dmar_satc_unit *satcu; 2997 struct acpi_dmar_satc *satc; 2998 struct device *tmp; 2999 int i; 3000 3001 dev = pci_physfn(dev); 3002 rcu_read_lock(); 3003 3004 list_for_each_entry_rcu(satcu, &dmar_satc_units, list) { 3005 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header); 3006 if (satc->segment != pci_domain_nr(dev->bus)) 3007 continue; 3008 for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp) 3009 if (to_pci_dev(tmp) == dev) 3010 goto out; 3011 } 3012 satcu = NULL; 3013 out: 3014 rcu_read_unlock(); 3015 return satcu; 3016 } 3017 3018 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu) 3019 { 3020 int i, ret = 1; 3021 struct pci_bus *bus; 3022 struct pci_dev *bridge = NULL; 3023 struct device *tmp; 3024 struct acpi_dmar_atsr *atsr; 3025 struct dmar_atsr_unit *atsru; 3026 struct dmar_satc_unit *satcu; 3027 3028 dev = pci_physfn(dev); 3029 satcu = dmar_find_matched_satc_unit(dev); 3030 if (satcu) 3031 /* 3032 * This device supports ATS as it is in 
SATC table. 3033 * When IOMMU is in legacy mode, enabling ATS is done 3034 * automatically by HW for the device that requires 3035 * ATS, hence OS should not enable this device ATS 3036 * to avoid duplicated TLB invalidation. 3037 */ 3038 return !(satcu->atc_required && !sm_supported(iommu)); 3039 3040 for (bus = dev->bus; bus; bus = bus->parent) { 3041 bridge = bus->self; 3042 /* If it's an integrated device, allow ATS */ 3043 if (!bridge) 3044 return 1; 3045 /* Connected via non-PCIe: no ATS */ 3046 if (!pci_is_pcie(bridge) || 3047 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE) 3048 return 0; 3049 /* If we found the root port, look it up in the ATSR */ 3050 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT) 3051 break; 3052 } 3053 3054 rcu_read_lock(); 3055 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) { 3056 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header); 3057 if (atsr->segment != pci_domain_nr(dev->bus)) 3058 continue; 3059 3060 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp) 3061 if (tmp == &bridge->dev) 3062 goto out; 3063 3064 if (atsru->include_all) 3065 goto out; 3066 } 3067 ret = 0; 3068 out: 3069 rcu_read_unlock(); 3070 3071 return ret; 3072 } 3073 3074 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info) 3075 { 3076 int ret; 3077 struct dmar_rmrr_unit *rmrru; 3078 struct dmar_atsr_unit *atsru; 3079 struct dmar_satc_unit *satcu; 3080 struct acpi_dmar_atsr *atsr; 3081 struct acpi_dmar_reserved_memory *rmrr; 3082 struct acpi_dmar_satc *satc; 3083 3084 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING) 3085 return 0; 3086 3087 list_for_each_entry(rmrru, &dmar_rmrr_units, list) { 3088 rmrr = container_of(rmrru->hdr, 3089 struct acpi_dmar_reserved_memory, header); 3090 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 3091 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1), 3092 ((void *)rmrr) + rmrr->header.length, 3093 rmrr->segment, rmrru->devices, 3094 rmrru->devices_cnt); 3095 if (ret < 0) 3096 return ret; 3097 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 3098 dmar_remove_dev_scope(info, rmrr->segment, 3099 rmrru->devices, rmrru->devices_cnt); 3100 } 3101 } 3102 3103 list_for_each_entry(atsru, &dmar_atsr_units, list) { 3104 if (atsru->include_all) 3105 continue; 3106 3107 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header); 3108 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 3109 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1), 3110 (void *)atsr + atsr->header.length, 3111 atsr->segment, atsru->devices, 3112 atsru->devices_cnt); 3113 if (ret > 0) 3114 break; 3115 else if (ret < 0) 3116 return ret; 3117 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 3118 if (dmar_remove_dev_scope(info, atsr->segment, 3119 atsru->devices, atsru->devices_cnt)) 3120 break; 3121 } 3122 } 3123 list_for_each_entry(satcu, &dmar_satc_units, list) { 3124 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header); 3125 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 3126 ret = dmar_insert_dev_scope(info, (void *)(satc + 1), 3127 (void *)satc + satc->header.length, 3128 satc->segment, satcu->devices, 3129 satcu->devices_cnt); 3130 if (ret > 0) 3131 break; 3132 else if (ret < 0) 3133 return ret; 3134 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 3135 if (dmar_remove_dev_scope(info, satc->segment, 3136 satcu->devices, satcu->devices_cnt)) 3137 break; 3138 } 3139 } 3140 3141 return 0; 3142 } 3143 3144 static int intel_iommu_memory_notifier(struct notifier_block *nb, 3145 unsigned long val, void *v) 3146 { 
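	/*
	 * Keep the static identity mapping of si_domain in sync with
	 * memory hotplug: extend it over a range that is about to come
	 * online, and unmap the range (freeing the page-table pages)
	 * when it goes offline or the online operation is cancelled.
	 */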
3147 struct memory_notify *mhp = v; 3148 unsigned long start_vpfn = mm_to_dma_pfn_start(mhp->start_pfn); 3149 unsigned long last_vpfn = mm_to_dma_pfn_end(mhp->start_pfn + 3150 mhp->nr_pages - 1); 3151 3152 switch (val) { 3153 case MEM_GOING_ONLINE: 3154 if (iommu_domain_identity_map(si_domain, 3155 start_vpfn, last_vpfn)) { 3156 pr_warn("Failed to build identity map for [%lx-%lx]\n", 3157 start_vpfn, last_vpfn); 3158 return NOTIFY_BAD; 3159 } 3160 break; 3161 3162 case MEM_OFFLINE: 3163 case MEM_CANCEL_ONLINE: 3164 { 3165 LIST_HEAD(freelist); 3166 3167 domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist); 3168 iommu_put_pages_list(&freelist); 3169 } 3170 break; 3171 } 3172 3173 return NOTIFY_OK; 3174 } 3175 3176 static struct notifier_block intel_iommu_memory_nb = { 3177 .notifier_call = intel_iommu_memory_notifier, 3178 .priority = 0 3179 }; 3180 3181 static void intel_disable_iommus(void) 3182 { 3183 struct intel_iommu *iommu = NULL; 3184 struct dmar_drhd_unit *drhd; 3185 3186 for_each_iommu(iommu, drhd) 3187 iommu_disable_translation(iommu); 3188 } 3189 3190 void intel_iommu_shutdown(void) 3191 { 3192 struct dmar_drhd_unit *drhd; 3193 struct intel_iommu *iommu = NULL; 3194 3195 if (no_iommu || dmar_disabled) 3196 return; 3197 3198 down_write(&dmar_global_lock); 3199 3200 /* Disable PMRs explicitly here. */ 3201 for_each_iommu(iommu, drhd) 3202 iommu_disable_protect_mem_regions(iommu); 3203 3204 /* Make sure the IOMMUs are switched off */ 3205 intel_disable_iommus(); 3206 3207 up_write(&dmar_global_lock); 3208 } 3209 3210 static struct intel_iommu *dev_to_intel_iommu(struct device *dev) 3211 { 3212 struct iommu_device *iommu_dev = dev_to_iommu_device(dev); 3213 3214 return container_of(iommu_dev, struct intel_iommu, iommu); 3215 } 3216 3217 static ssize_t version_show(struct device *dev, 3218 struct device_attribute *attr, char *buf) 3219 { 3220 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3221 u32 ver = readl(iommu->reg + DMAR_VER_REG); 3222 return sysfs_emit(buf, "%d:%d\n", 3223 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver)); 3224 } 3225 static DEVICE_ATTR_RO(version); 3226 3227 static ssize_t address_show(struct device *dev, 3228 struct device_attribute *attr, char *buf) 3229 { 3230 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3231 return sysfs_emit(buf, "%llx\n", iommu->reg_phys); 3232 } 3233 static DEVICE_ATTR_RO(address); 3234 3235 static ssize_t cap_show(struct device *dev, 3236 struct device_attribute *attr, char *buf) 3237 { 3238 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3239 return sysfs_emit(buf, "%llx\n", iommu->cap); 3240 } 3241 static DEVICE_ATTR_RO(cap); 3242 3243 static ssize_t ecap_show(struct device *dev, 3244 struct device_attribute *attr, char *buf) 3245 { 3246 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3247 return sysfs_emit(buf, "%llx\n", iommu->ecap); 3248 } 3249 static DEVICE_ATTR_RO(ecap); 3250 3251 static ssize_t domains_supported_show(struct device *dev, 3252 struct device_attribute *attr, char *buf) 3253 { 3254 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3255 return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap)); 3256 } 3257 static DEVICE_ATTR_RO(domains_supported); 3258 3259 static ssize_t domains_used_show(struct device *dev, 3260 struct device_attribute *attr, char *buf) 3261 { 3262 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3263 return sysfs_emit(buf, "%d\n", 3264 bitmap_weight(iommu->domain_ids, 3265 cap_ndoms(iommu->cap))); 3266 } 3267 static DEVICE_ATTR_RO(domains_used); 3268 3269 static struct 
attribute *intel_iommu_attrs[] = { 3270 &dev_attr_version.attr, 3271 &dev_attr_address.attr, 3272 &dev_attr_cap.attr, 3273 &dev_attr_ecap.attr, 3274 &dev_attr_domains_supported.attr, 3275 &dev_attr_domains_used.attr, 3276 NULL, 3277 }; 3278 3279 static struct attribute_group intel_iommu_group = { 3280 .name = "intel-iommu", 3281 .attrs = intel_iommu_attrs, 3282 }; 3283 3284 const struct attribute_group *intel_iommu_groups[] = { 3285 &intel_iommu_group, 3286 NULL, 3287 }; 3288 3289 static bool has_external_pci(void) 3290 { 3291 struct pci_dev *pdev = NULL; 3292 3293 for_each_pci_dev(pdev) 3294 if (pdev->external_facing) { 3295 pci_dev_put(pdev); 3296 return true; 3297 } 3298 3299 return false; 3300 } 3301 3302 static int __init platform_optin_force_iommu(void) 3303 { 3304 if (!dmar_platform_optin() || no_platform_optin || !has_external_pci()) 3305 return 0; 3306 3307 if (no_iommu || dmar_disabled) 3308 pr_info("Intel-IOMMU force enabled due to platform opt in\n"); 3309 3310 /* 3311 * If Intel-IOMMU is disabled by default, we will apply identity 3312 * map for all devices except those marked as being untrusted. 3313 */ 3314 if (dmar_disabled) 3315 iommu_set_default_passthrough(false); 3316 3317 dmar_disabled = 0; 3318 no_iommu = 0; 3319 3320 return 1; 3321 } 3322 3323 static int __init probe_acpi_namespace_devices(void) 3324 { 3325 struct dmar_drhd_unit *drhd; 3326 /* To avoid a -Wunused-but-set-variable warning. */ 3327 struct intel_iommu *iommu __maybe_unused; 3328 struct device *dev; 3329 int i, ret = 0; 3330 3331 for_each_active_iommu(iommu, drhd) { 3332 for_each_active_dev_scope(drhd->devices, 3333 drhd->devices_cnt, i, dev) { 3334 struct acpi_device_physical_node *pn; 3335 struct acpi_device *adev; 3336 3337 if (dev->bus != &acpi_bus_type) 3338 continue; 3339 3340 adev = to_acpi_device(dev); 3341 mutex_lock(&adev->physical_node_lock); 3342 list_for_each_entry(pn, 3343 &adev->physical_node_list, node) { 3344 ret = iommu_probe_device(pn->dev); 3345 if (ret) 3346 break; 3347 } 3348 mutex_unlock(&adev->physical_node_lock); 3349 3350 if (ret) 3351 return ret; 3352 } 3353 } 3354 3355 return 0; 3356 } 3357 3358 static __init int tboot_force_iommu(void) 3359 { 3360 if (!tboot_enabled()) 3361 return 0; 3362 3363 if (no_iommu || dmar_disabled) 3364 pr_warn("Forcing Intel-IOMMU to enabled\n"); 3365 3366 dmar_disabled = 0; 3367 no_iommu = 0; 3368 3369 return 1; 3370 } 3371 3372 int __init intel_iommu_init(void) 3373 { 3374 int ret = -ENODEV; 3375 struct dmar_drhd_unit *drhd; 3376 struct intel_iommu *iommu; 3377 3378 /* 3379 * Intel IOMMU is required for a TXT/tboot launch or platform 3380 * opt in, so enforce that. 3381 */ 3382 force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) || 3383 platform_optin_force_iommu(); 3384 3385 down_write(&dmar_global_lock); 3386 if (dmar_table_init()) { 3387 if (force_on) 3388 panic("tboot: Failed to initialize DMAR table\n"); 3389 goto out_free_dmar; 3390 } 3391 3392 if (dmar_dev_scope_init() < 0) { 3393 if (force_on) 3394 panic("tboot: Failed to initialize DMAR device scope\n"); 3395 goto out_free_dmar; 3396 } 3397 3398 up_write(&dmar_global_lock); 3399 3400 /* 3401 * The bus notifier takes the dmar_global_lock, so lockdep will 3402 * complain later when we register it under the lock. 
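 * Hence register it here with the lock dropped and re-acquire the
 * lock right afterwards.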
3403 */ 3404 dmar_register_bus_notifier(); 3405 3406 down_write(&dmar_global_lock); 3407 3408 if (!no_iommu) 3409 intel_iommu_debugfs_init(); 3410 3411 if (no_iommu || dmar_disabled) { 3412 /* 3413 * We exit the function here to ensure IOMMU's remapping and 3414 * mempool aren't setup, which means that the IOMMU's PMRs 3415 * won't be disabled via the call to init_dmars(). So disable 3416 * it explicitly here. The PMRs were setup by tboot prior to 3417 * calling SENTER, but the kernel is expected to reset/tear 3418 * down the PMRs. 3419 */ 3420 if (intel_iommu_tboot_noforce) { 3421 for_each_iommu(iommu, drhd) 3422 iommu_disable_protect_mem_regions(iommu); 3423 } 3424 3425 /* 3426 * Make sure the IOMMUs are switched off, even when we 3427 * boot into a kexec kernel and the previous kernel left 3428 * them enabled 3429 */ 3430 intel_disable_iommus(); 3431 goto out_free_dmar; 3432 } 3433 3434 if (list_empty(&dmar_rmrr_units)) 3435 pr_info("No RMRR found\n"); 3436 3437 if (list_empty(&dmar_atsr_units)) 3438 pr_info("No ATSR found\n"); 3439 3440 if (list_empty(&dmar_satc_units)) 3441 pr_info("No SATC found\n"); 3442 3443 init_no_remapping_devices(); 3444 3445 ret = init_dmars(); 3446 if (ret) { 3447 if (force_on) 3448 panic("tboot: Failed to initialize DMARs\n"); 3449 pr_err("Initialization failed\n"); 3450 goto out_free_dmar; 3451 } 3452 up_write(&dmar_global_lock); 3453 3454 init_iommu_pm_ops(); 3455 3456 down_read(&dmar_global_lock); 3457 for_each_active_iommu(iommu, drhd) { 3458 /* 3459 * The flush queue implementation does not perform 3460 * page-selective invalidations that are required for efficient 3461 * TLB flushes in virtual environments. The benefit of batching 3462 * is likely to be much lower than the overhead of synchronizing 3463 * the virtual and physical IOMMU page-tables. 3464 */ 3465 if (cap_caching_mode(iommu->cap) && 3466 !first_level_by_default(IOMMU_DOMAIN_DMA)) { 3467 pr_info_once("IOMMU batching disallowed due to virtualization\n"); 3468 iommu_set_dma_strict(); 3469 } 3470 iommu_device_sysfs_add(&iommu->iommu, NULL, 3471 intel_iommu_groups, 3472 "%s", iommu->name); 3473 iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL); 3474 3475 iommu_pmu_register(iommu); 3476 } 3477 up_read(&dmar_global_lock); 3478 3479 if (si_domain && !hw_pass_through) 3480 register_memory_notifier(&intel_iommu_memory_nb); 3481 3482 down_read(&dmar_global_lock); 3483 if (probe_acpi_namespace_devices()) 3484 pr_warn("ACPI name space devices didn't probe correctly\n"); 3485 3486 /* Finally, we enable the DMA remapping hardware. */ 3487 for_each_iommu(iommu, drhd) { 3488 if (!drhd->ignored && !translation_pre_enabled(iommu)) 3489 iommu_enable_translation(iommu); 3490 3491 iommu_disable_protect_mem_regions(iommu); 3492 } 3493 up_read(&dmar_global_lock); 3494 3495 pr_info("Intel(R) Virtualization Technology for Directed I/O\n"); 3496 3497 intel_iommu_enabled = 1; 3498 3499 return 0; 3500 3501 out_free_dmar: 3502 intel_iommu_free_dmars(); 3503 up_write(&dmar_global_lock); 3504 return ret; 3505 } 3506 3507 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque) 3508 { 3509 struct device_domain_info *info = opaque; 3510 3511 domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff); 3512 return 0; 3513 } 3514 3515 /* 3516 * NB - intel-iommu lacks any sort of reference counting for the users of 3517 * dependent devices. 
If multiple endpoints have intersecting dependent 3518 * devices, unbinding the driver from any one of them will possibly leave 3519 * the others unable to operate. 3520 */ 3521 static void domain_context_clear(struct device_domain_info *info) 3522 { 3523 if (!dev_is_pci(info->dev)) { 3524 domain_context_clear_one(info, info->bus, info->devfn); return; } 3525 3526 pci_for_each_dma_alias(to_pci_dev(info->dev), 3527 &domain_context_clear_one_cb, info); 3528 } 3529 3530 /* 3531 * Clear the page table pointer in context or pasid table entries so that 3532 * all DMA requests without PASID from the device are blocked. If the page 3533 * table has been set, clean up the data structures. 3534 */ 3535 void device_block_translation(struct device *dev) 3536 { 3537 struct device_domain_info *info = dev_iommu_priv_get(dev); 3538 struct intel_iommu *iommu = info->iommu; 3539 unsigned long flags; 3540 3541 iommu_disable_pci_caps(info); 3542 if (!dev_is_real_dma_subdevice(dev)) { 3543 if (sm_supported(iommu)) 3544 intel_pasid_tear_down_entry(iommu, dev, 3545 IOMMU_NO_PASID, false); 3546 else 3547 domain_context_clear(info); 3548 } 3549 3550 if (!info->domain) 3551 return; 3552 3553 spin_lock_irqsave(&info->domain->lock, flags); 3554 list_del(&info->link); 3555 spin_unlock_irqrestore(&info->domain->lock, flags); 3556 3557 cache_tag_unassign_domain(info->domain, dev, IOMMU_NO_PASID); 3558 domain_detach_iommu(info->domain, iommu); 3559 info->domain = NULL; 3560 } 3561 3562 static int md_domain_init(struct dmar_domain *domain, int guest_width) 3563 { 3564 int adjust_width; 3565 3566 /* calculate AGAW */ 3567 domain->gaw = guest_width; 3568 adjust_width = guestwidth_to_adjustwidth(guest_width); 3569 domain->agaw = width_to_agaw(adjust_width); 3570 3571 domain->iommu_coherency = false; 3572 domain->iommu_superpage = 0; 3573 domain->max_addr = 0; 3574 3575 /* always allocate the top pgd */ 3576 domain->pgd = iommu_alloc_page_node(domain->nid, GFP_ATOMIC); 3577 if (!domain->pgd) 3578 return -ENOMEM; 3579 domain_flush_cache(domain, domain->pgd, PAGE_SIZE); 3580 return 0; 3581 } 3582 3583 static int blocking_domain_attach_dev(struct iommu_domain *domain, 3584 struct device *dev) 3585 { 3586 device_block_translation(dev); 3587 return 0; 3588 } 3589 3590 static struct iommu_domain blocking_domain = { 3591 .type = IOMMU_DOMAIN_BLOCKED, 3592 .ops = &(const struct iommu_domain_ops) { 3593 .attach_dev = blocking_domain_attach_dev, 3594 } 3595 }; 3596 3597 static int iommu_superpage_capability(struct intel_iommu *iommu, bool first_stage) 3598 { 3599 if (!intel_iommu_superpage) 3600 return 0; 3601 3602 if (first_stage) 3603 return cap_fl1gp_support(iommu->cap) ? 
2 : 1; 3604 3605 return fls(cap_super_page_val(iommu->cap)); 3606 } 3607 3608 static struct dmar_domain *paging_domain_alloc(struct device *dev, bool first_stage) 3609 { 3610 struct device_domain_info *info = dev_iommu_priv_get(dev); 3611 struct intel_iommu *iommu = info->iommu; 3612 struct dmar_domain *domain; 3613 int addr_width; 3614 3615 domain = kzalloc(sizeof(*domain), GFP_KERNEL); 3616 if (!domain) 3617 return ERR_PTR(-ENOMEM); 3618 3619 INIT_LIST_HEAD(&domain->devices); 3620 INIT_LIST_HEAD(&domain->dev_pasids); 3621 INIT_LIST_HEAD(&domain->cache_tags); 3622 spin_lock_init(&domain->lock); 3623 spin_lock_init(&domain->cache_lock); 3624 xa_init(&domain->iommu_array); 3625 3626 domain->nid = dev_to_node(dev); 3627 domain->has_iotlb_device = info->ats_enabled; 3628 domain->use_first_level = first_stage; 3629 3630 /* calculate the address width */ 3631 addr_width = agaw_to_width(iommu->agaw); 3632 if (addr_width > cap_mgaw(iommu->cap)) 3633 addr_width = cap_mgaw(iommu->cap); 3634 domain->gaw = addr_width; 3635 domain->agaw = iommu->agaw; 3636 domain->max_addr = __DOMAIN_MAX_ADDR(addr_width); 3637 3638 /* iommu memory access coherency */ 3639 domain->iommu_coherency = iommu_paging_structure_coherency(iommu); 3640 3641 /* pagesize bitmap */ 3642 domain->domain.pgsize_bitmap = SZ_4K; 3643 domain->iommu_superpage = iommu_superpage_capability(iommu, first_stage); 3644 domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain); 3645 3646 /* 3647 * IOVA aperture: First-level translation restricts the input-address 3648 * to a canonical address (i.e., address bits 63:N have the same value 3649 * as address bit [N-1], where N is 48-bits with 4-level paging and 3650 * 57-bits with 5-level paging). Hence, skip bit [N-1]. 3651 */ 3652 domain->domain.geometry.force_aperture = true; 3653 domain->domain.geometry.aperture_start = 0; 3654 if (first_stage) 3655 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1); 3656 else 3657 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw); 3658 3659 /* always allocate the top pgd */ 3660 domain->pgd = iommu_alloc_page_node(domain->nid, GFP_KERNEL); 3661 if (!domain->pgd) { 3662 kfree(domain); 3663 return ERR_PTR(-ENOMEM); 3664 } 3665 domain_flush_cache(domain, domain->pgd, PAGE_SIZE); 3666 3667 return domain; 3668 } 3669 3670 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type) 3671 { 3672 struct dmar_domain *dmar_domain; 3673 struct iommu_domain *domain; 3674 3675 switch (type) { 3676 case IOMMU_DOMAIN_DMA: 3677 case IOMMU_DOMAIN_UNMANAGED: 3678 dmar_domain = alloc_domain(type); 3679 if (!dmar_domain) { 3680 pr_err("Can't allocate dmar_domain\n"); 3681 return NULL; 3682 } 3683 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { 3684 pr_err("Domain initialization failed\n"); 3685 domain_exit(dmar_domain); 3686 return NULL; 3687 } 3688 3689 domain = &dmar_domain->domain; 3690 domain->geometry.aperture_start = 0; 3691 domain->geometry.aperture_end = 3692 __DOMAIN_MAX_ADDR(dmar_domain->gaw); 3693 domain->geometry.force_aperture = true; 3694 3695 return domain; 3696 case IOMMU_DOMAIN_IDENTITY: 3697 return &si_domain->domain; 3698 default: 3699 return NULL; 3700 } 3701 3702 return NULL; 3703 } 3704 3705 static struct iommu_domain * 3706 intel_iommu_domain_alloc_user(struct device *dev, u32 flags, 3707 struct iommu_domain *parent, 3708 const struct iommu_user_data *user_data) 3709 { 3710 struct device_domain_info *info = dev_iommu_priv_get(dev); 3711 bool dirty_tracking = flags & 
IOMMU_HWPT_ALLOC_DIRTY_TRACKING; 3712 bool nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT; 3713 struct intel_iommu *iommu = info->iommu; 3714 struct dmar_domain *dmar_domain; 3715 struct iommu_domain *domain; 3716 3717 /* Must be NESTING domain */ 3718 if (parent) { 3719 if (!nested_supported(iommu) || flags) 3720 return ERR_PTR(-EOPNOTSUPP); 3721 return intel_nested_domain_alloc(parent, user_data); 3722 } 3723 3724 if (flags & 3725 (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING))) 3726 return ERR_PTR(-EOPNOTSUPP); 3727 if (nested_parent && !nested_supported(iommu)) 3728 return ERR_PTR(-EOPNOTSUPP); 3729 if (user_data || (dirty_tracking && !ssads_supported(iommu))) 3730 return ERR_PTR(-EOPNOTSUPP); 3731 3732 /* Do not use first stage for user domain translation. */ 3733 dmar_domain = paging_domain_alloc(dev, false); 3734 if (IS_ERR(dmar_domain)) 3735 return ERR_CAST(dmar_domain); 3736 domain = &dmar_domain->domain; 3737 domain->type = IOMMU_DOMAIN_UNMANAGED; 3738 domain->owner = &intel_iommu_ops; 3739 domain->ops = intel_iommu_ops.default_domain_ops; 3740 3741 if (nested_parent) { 3742 dmar_domain->nested_parent = true; 3743 INIT_LIST_HEAD(&dmar_domain->s1_domains); 3744 spin_lock_init(&dmar_domain->s1_lock); 3745 } 3746 3747 if (dirty_tracking) { 3748 if (dmar_domain->use_first_level) { 3749 iommu_domain_free(domain); 3750 return ERR_PTR(-EOPNOTSUPP); 3751 } 3752 domain->dirty_ops = &intel_dirty_ops; 3753 } 3754 3755 return domain; 3756 } 3757 3758 static void intel_iommu_domain_free(struct iommu_domain *domain) 3759 { 3760 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 3761 3762 WARN_ON(dmar_domain->nested_parent && 3763 !list_empty(&dmar_domain->s1_domains)); 3764 if (domain != &si_domain->domain) 3765 domain_exit(dmar_domain); 3766 } 3767 3768 int prepare_domain_attach_device(struct iommu_domain *domain, 3769 struct device *dev) 3770 { 3771 struct device_domain_info *info = dev_iommu_priv_get(dev); 3772 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 3773 struct intel_iommu *iommu = info->iommu; 3774 int addr_width; 3775 3776 if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap)) 3777 return -EINVAL; 3778 3779 if (domain->dirty_ops && !ssads_supported(iommu)) 3780 return -EINVAL; 3781 3782 /* check if this iommu agaw is sufficient for max mapped address */ 3783 addr_width = agaw_to_width(iommu->agaw); 3784 if (addr_width > cap_mgaw(iommu->cap)) 3785 addr_width = cap_mgaw(iommu->cap); 3786 3787 if (dmar_domain->max_addr > (1LL << addr_width)) 3788 return -EINVAL; 3789 dmar_domain->gaw = addr_width; 3790 3791 /* 3792 * Knock out extra levels of page tables if necessary 3793 */ 3794 while (iommu->agaw < dmar_domain->agaw) { 3795 struct dma_pte *pte; 3796 3797 pte = dmar_domain->pgd; 3798 if (dma_pte_present(pte)) { 3799 dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte)); 3800 iommu_free_page(pte); 3801 } 3802 dmar_domain->agaw--; 3803 } 3804 3805 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) && 3806 context_copied(iommu, info->bus, info->devfn)) 3807 return intel_pasid_setup_sm_context(dev); 3808 3809 return 0; 3810 } 3811 3812 static int intel_iommu_attach_device(struct iommu_domain *domain, 3813 struct device *dev) 3814 { 3815 struct device_domain_info *info = dev_iommu_priv_get(dev); 3816 int ret; 3817 3818 if (info->domain) 3819 device_block_translation(dev); 3820 3821 ret = prepare_domain_attach_device(domain, dev); 3822 if (ret) 3823 return ret; 3824 3825 return dmar_domain_attach_device(to_dmar_domain(domain), 
dev); 3826 } 3827 3828 static int intel_iommu_map(struct iommu_domain *domain, 3829 unsigned long iova, phys_addr_t hpa, 3830 size_t size, int iommu_prot, gfp_t gfp) 3831 { 3832 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 3833 u64 max_addr; 3834 int prot = 0; 3835 3836 if (iommu_prot & IOMMU_READ) 3837 prot |= DMA_PTE_READ; 3838 if (iommu_prot & IOMMU_WRITE) 3839 prot |= DMA_PTE_WRITE; 3840 if (dmar_domain->set_pte_snp) 3841 prot |= DMA_PTE_SNP; 3842 3843 max_addr = iova + size; 3844 if (dmar_domain->max_addr < max_addr) { 3845 u64 end; 3846 3847 /* check if minimum agaw is sufficient for mapped address */ 3848 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1; 3849 if (end < max_addr) { 3850 pr_err("%s: iommu width (%d) is not " 3851 "sufficient for the mapped address (%llx)\n", 3852 __func__, dmar_domain->gaw, max_addr); 3853 return -EFAULT; 3854 } 3855 dmar_domain->max_addr = max_addr; 3856 } 3857 /* Round up size to next multiple of PAGE_SIZE, if it and 3858 the low bits of hpa would take us onto the next page */ 3859 size = aligned_nrpages(hpa, size); 3860 return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT, 3861 hpa >> VTD_PAGE_SHIFT, size, prot, gfp); 3862 } 3863 3864 static int intel_iommu_map_pages(struct iommu_domain *domain, 3865 unsigned long iova, phys_addr_t paddr, 3866 size_t pgsize, size_t pgcount, 3867 int prot, gfp_t gfp, size_t *mapped) 3868 { 3869 unsigned long pgshift = __ffs(pgsize); 3870 size_t size = pgcount << pgshift; 3871 int ret; 3872 3873 if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G) 3874 return -EINVAL; 3875 3876 if (!IS_ALIGNED(iova | paddr, pgsize)) 3877 return -EINVAL; 3878 3879 ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp); 3880 if (!ret && mapped) 3881 *mapped = size; 3882 3883 return ret; 3884 } 3885 3886 static size_t intel_iommu_unmap(struct iommu_domain *domain, 3887 unsigned long iova, size_t size, 3888 struct iommu_iotlb_gather *gather) 3889 { 3890 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 3891 unsigned long start_pfn, last_pfn; 3892 int level = 0; 3893 3894 /* Cope with horrid API which requires us to unmap more than the 3895 size argument if it happens to be a large-page mapping. */ 3896 if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, 3897 &level, GFP_ATOMIC))) 3898 return 0; 3899 3900 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level)) 3901 size = VTD_PAGE_SIZE << level_to_offset_bits(level); 3902 3903 start_pfn = iova >> VTD_PAGE_SHIFT; 3904 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT; 3905 3906 domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist); 3907 3908 if (dmar_domain->max_addr == iova + size) 3909 dmar_domain->max_addr = iova; 3910 3911 /* 3912 * We do not use page-selective IOTLB invalidation in flush queue, 3913 * so there is no need to track page and sync iotlb. 
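 *
 * For the non-queued case the range is added to the gather below and
 * flushed later in intel_iommu_tlb_sync(), which also releases the
 * page-table pages collected in gather->freelist.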
3914 */ 3915 if (!iommu_iotlb_gather_queued(gather)) 3916 iommu_iotlb_gather_add_page(domain, gather, iova, size); 3917 3918 return size; 3919 } 3920 3921 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain, 3922 unsigned long iova, 3923 size_t pgsize, size_t pgcount, 3924 struct iommu_iotlb_gather *gather) 3925 { 3926 unsigned long pgshift = __ffs(pgsize); 3927 size_t size = pgcount << pgshift; 3928 3929 return intel_iommu_unmap(domain, iova, size, gather); 3930 } 3931 3932 static void intel_iommu_tlb_sync(struct iommu_domain *domain, 3933 struct iommu_iotlb_gather *gather) 3934 { 3935 cache_tag_flush_range(to_dmar_domain(domain), gather->start, 3936 gather->end, list_empty(&gather->freelist)); 3937 iommu_put_pages_list(&gather->freelist); 3938 } 3939 3940 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain, 3941 dma_addr_t iova) 3942 { 3943 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 3944 struct dma_pte *pte; 3945 int level = 0; 3946 u64 phys = 0; 3947 3948 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level, 3949 GFP_ATOMIC); 3950 if (pte && dma_pte_present(pte)) 3951 phys = dma_pte_addr(pte) + 3952 (iova & (BIT_MASK(level_to_offset_bits(level) + 3953 VTD_PAGE_SHIFT) - 1)); 3954 3955 return phys; 3956 } 3957 3958 static bool domain_support_force_snooping(struct dmar_domain *domain) 3959 { 3960 struct device_domain_info *info; 3961 bool support = true; 3962 3963 assert_spin_locked(&domain->lock); 3964 list_for_each_entry(info, &domain->devices, link) { 3965 if (!ecap_sc_support(info->iommu->ecap)) { 3966 support = false; 3967 break; 3968 } 3969 } 3970 3971 return support; 3972 } 3973 3974 static void domain_set_force_snooping(struct dmar_domain *domain) 3975 { 3976 struct device_domain_info *info; 3977 3978 assert_spin_locked(&domain->lock); 3979 /* 3980 * Second level page table supports per-PTE snoop control. The 3981 * iommu_map() interface will handle this by setting SNP bit. 
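 *
 * First-stage page tables have no equivalent per-PTE bit, so for
 * first-level domains snooping is enforced through the PASID table
 * entries of the attached devices below.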
3982 */ 3983 if (!domain->use_first_level) { 3984 domain->set_pte_snp = true; 3985 return; 3986 } 3987 3988 list_for_each_entry(info, &domain->devices, link) 3989 intel_pasid_setup_page_snoop_control(info->iommu, info->dev, 3990 IOMMU_NO_PASID); 3991 } 3992 3993 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain) 3994 { 3995 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 3996 unsigned long flags; 3997 3998 if (dmar_domain->force_snooping) 3999 return true; 4000 4001 spin_lock_irqsave(&dmar_domain->lock, flags); 4002 if (!domain_support_force_snooping(dmar_domain) || 4003 (!dmar_domain->use_first_level && dmar_domain->has_mappings)) { 4004 spin_unlock_irqrestore(&dmar_domain->lock, flags); 4005 return false; 4006 } 4007 4008 domain_set_force_snooping(dmar_domain); 4009 dmar_domain->force_snooping = true; 4010 spin_unlock_irqrestore(&dmar_domain->lock, flags); 4011 4012 return true; 4013 } 4014 4015 static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap) 4016 { 4017 struct device_domain_info *info = dev_iommu_priv_get(dev); 4018 4019 switch (cap) { 4020 case IOMMU_CAP_CACHE_COHERENCY: 4021 case IOMMU_CAP_DEFERRED_FLUSH: 4022 return true; 4023 case IOMMU_CAP_PRE_BOOT_PROTECTION: 4024 return dmar_platform_optin(); 4025 case IOMMU_CAP_ENFORCE_CACHE_COHERENCY: 4026 return ecap_sc_support(info->iommu->ecap); 4027 case IOMMU_CAP_DIRTY_TRACKING: 4028 return ssads_supported(info->iommu); 4029 default: 4030 return false; 4031 } 4032 } 4033 4034 static struct iommu_device *intel_iommu_probe_device(struct device *dev) 4035 { 4036 struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL; 4037 struct device_domain_info *info; 4038 struct intel_iommu *iommu; 4039 u8 bus, devfn; 4040 int ret; 4041 4042 iommu = device_lookup_iommu(dev, &bus, &devfn); 4043 if (!iommu || !iommu->iommu.ops) 4044 return ERR_PTR(-ENODEV); 4045 4046 info = kzalloc(sizeof(*info), GFP_KERNEL); 4047 if (!info) 4048 return ERR_PTR(-ENOMEM); 4049 4050 if (dev_is_real_dma_subdevice(dev)) { 4051 info->bus = pdev->bus->number; 4052 info->devfn = pdev->devfn; 4053 info->segment = pci_domain_nr(pdev->bus); 4054 } else { 4055 info->bus = bus; 4056 info->devfn = devfn; 4057 info->segment = iommu->segment; 4058 } 4059 4060 info->dev = dev; 4061 info->iommu = iommu; 4062 if (dev_is_pci(dev)) { 4063 if (ecap_dev_iotlb_support(iommu->ecap) && 4064 pci_ats_supported(pdev) && 4065 dmar_ats_supported(pdev, iommu)) { 4066 info->ats_supported = 1; 4067 info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev); 4068 4069 /* 4070 * For IOMMU that supports device IOTLB throttling 4071 * (DIT), we assign PFSID to the invalidation desc 4072 * of a VF such that IOMMU HW can gauge queue depth 4073 * at PF level. If DIT is not set, PFSID will be 4074 * treated as reserved, which should be set to 0. 
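 *
 * The info structure is zero-allocated above, so pfsid already
 * defaults to 0 when DIT is not supported.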
4075 */ 4076 if (ecap_dit(iommu->ecap)) 4077 info->pfsid = pci_dev_id(pci_physfn(pdev)); 4078 info->ats_qdep = pci_ats_queue_depth(pdev); 4079 } 4080 if (sm_supported(iommu)) { 4081 if (pasid_supported(iommu)) { 4082 int features = pci_pasid_features(pdev); 4083 4084 if (features >= 0) 4085 info->pasid_supported = features | 1; 4086 } 4087 4088 if (info->ats_supported && ecap_prs(iommu->ecap) && 4089 pci_pri_supported(pdev)) 4090 info->pri_supported = 1; 4091 } 4092 } 4093 4094 dev_iommu_priv_set(dev, info); 4095 if (pdev && pci_ats_supported(pdev)) { 4096 ret = device_rbtree_insert(iommu, info); 4097 if (ret) 4098 goto free; 4099 } 4100 4101 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) { 4102 ret = intel_pasid_alloc_table(dev); 4103 if (ret) { 4104 dev_err(dev, "PASID table allocation failed\n"); 4105 goto clear_rbtree; 4106 } 4107 4108 if (!context_copied(iommu, info->bus, info->devfn)) { 4109 ret = intel_pasid_setup_sm_context(dev); 4110 if (ret) 4111 goto free_table; 4112 } 4113 } 4114 4115 intel_iommu_debugfs_create_dev(info); 4116 4117 return &iommu->iommu; 4118 free_table: 4119 intel_pasid_free_table(dev); 4120 clear_rbtree: 4121 device_rbtree_remove(info); 4122 free: 4123 kfree(info); 4124 4125 return ERR_PTR(ret); 4126 } 4127 4128 static void intel_iommu_release_device(struct device *dev) 4129 { 4130 struct device_domain_info *info = dev_iommu_priv_get(dev); 4131 struct intel_iommu *iommu = info->iommu; 4132 4133 mutex_lock(&iommu->iopf_lock); 4134 if (dev_is_pci(dev) && pci_ats_supported(to_pci_dev(dev))) 4135 device_rbtree_remove(info); 4136 mutex_unlock(&iommu->iopf_lock); 4137 4138 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) && 4139 !context_copied(iommu, info->bus, info->devfn)) 4140 intel_pasid_teardown_sm_context(dev); 4141 4142 intel_pasid_free_table(dev); 4143 intel_iommu_debugfs_remove_dev(info); 4144 kfree(info); 4145 set_dma_ops(dev, NULL); 4146 } 4147 4148 static void intel_iommu_get_resv_regions(struct device *device, 4149 struct list_head *head) 4150 { 4151 int prot = DMA_PTE_READ | DMA_PTE_WRITE; 4152 struct iommu_resv_region *reg; 4153 struct dmar_rmrr_unit *rmrr; 4154 struct device *i_dev; 4155 int i; 4156 4157 rcu_read_lock(); 4158 for_each_rmrr_units(rmrr) { 4159 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt, 4160 i, i_dev) { 4161 struct iommu_resv_region *resv; 4162 enum iommu_resv_type type; 4163 size_t length; 4164 4165 if (i_dev != device && 4166 !is_downstream_to_pci_bridge(device, i_dev)) 4167 continue; 4168 4169 length = rmrr->end_address - rmrr->base_address + 1; 4170 4171 type = device_rmrr_is_relaxable(device) ? 
4172 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4173
4174 resv = iommu_alloc_resv_region(rmrr->base_address,
4175 length, prot, type,
4176 GFP_ATOMIC);
4177 if (!resv)
4178 break;
4179
4180 list_add_tail(&resv->list, head);
4181 }
4182 }
4183 rcu_read_unlock();
4184
4185 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4186 if (dev_is_pci(device)) {
4187 struct pci_dev *pdev = to_pci_dev(device);
4188
4189 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4190 reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
4191 IOMMU_RESV_DIRECT_RELAXABLE,
4192 GFP_KERNEL);
4193 if (reg)
4194 list_add_tail(&reg->list, head);
4195 }
4196 }
4197 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4198
4199 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4200 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4201 0, IOMMU_RESV_MSI, GFP_KERNEL);
4202 if (!reg)
4203 return;
4204 list_add_tail(&reg->list, head);
4205 }
4206
4207 static struct iommu_group *intel_iommu_device_group(struct device *dev)
4208 {
4209 if (dev_is_pci(dev))
4210 return pci_device_group(dev);
4211 return generic_device_group(dev);
4212 }
4213
4214 static int intel_iommu_enable_sva(struct device *dev)
4215 {
4216 struct device_domain_info *info = dev_iommu_priv_get(dev);
4217 struct intel_iommu *iommu;
4218
4219 if (!info || dmar_disabled)
4220 return -EINVAL;
4221
4222 iommu = info->iommu;
4223 if (!iommu)
4224 return -EINVAL;
4225
4226 if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4227 return -ENODEV;
4228
4229 if (!info->pasid_enabled || !info->ats_enabled)
4230 return -EINVAL;
4231
4232 /*
4233 * Devices having device-specific I/O fault handling should not
4234 * support PCI/PRI. The IOMMU side has no means to check the
4235 * capability of device-specific IOPF. Therefore, the IOMMU can only
4236 * assume that if the device driver enables SVA on a non-PRI
4237 * device, it will handle IOPF in its own way.
4238 */
4239 if (!info->pri_supported)
4240 return 0;
4241
4242 /* Devices supporting PRI should have it enabled. */
4243 if (!info->pri_enabled)
4244 return -EINVAL;
4245
4246 return 0;
4247 }
4248
4249 static int context_flip_pri(struct device_domain_info *info, bool enable)
4250 {
4251 struct intel_iommu *iommu = info->iommu;
4252 u8 bus = info->bus, devfn = info->devfn;
4253 struct context_entry *context;
4254 u16 did;
4255
4256 spin_lock(&iommu->lock);
4257 if (context_copied(iommu, bus, devfn)) {
4258 spin_unlock(&iommu->lock);
4259 return -EINVAL;
4260 }
4261
4262 context = iommu_context_addr(iommu, bus, devfn, false);
4263 if (!context || !context_present(context)) {
4264 spin_unlock(&iommu->lock);
4265 return -ENODEV;
4266 }
4267 did = context_domain_id(context);
4268
4269 if (enable)
4270 context_set_sm_pre(context);
4271 else
4272 context_clear_sm_pre(context);
4273
4274 if (!ecap_coherent(iommu->ecap))
4275 clflush_cache_range(context, sizeof(*context));
4276 intel_context_flush_present(info, context, did, true);
4277 spin_unlock(&iommu->lock);
4278
4279 return 0;
4280 }
4281
4282 static int intel_iommu_enable_iopf(struct device *dev)
4283 {
4284 struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4285 struct device_domain_info *info = dev_iommu_priv_get(dev);
4286 struct intel_iommu *iommu;
4287 int ret;
4288
4289 if (!pdev || !info || !info->ats_enabled || !info->pri_supported)
4290 return -ENODEV;
4291
4292 if (info->pri_enabled)
4293 return -EBUSY;
4294
4295 iommu = info->iommu;
4296 if (!iommu)
4297 return -EINVAL;
4298
4299 /* PASID is required in PRG Response Message. */
4300 if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev))
4301 return -EINVAL;
4302
4303 ret = pci_reset_pri(pdev);
4304 if (ret)
4305 return ret;
4306
4307 ret = iopf_queue_add_device(iommu->iopf_queue, dev);
4308 if (ret)
4309 return ret;
4310
4311 ret = context_flip_pri(info, true);
4312 if (ret)
4313 goto err_remove_device;
4314
4315 ret = pci_enable_pri(pdev, PRQ_DEPTH);
4316 if (ret)
4317 goto err_clear_pri;
4318
4319 info->pri_enabled = 1;
4320
4321 return 0;
4322 err_clear_pri:
4323 context_flip_pri(info, false);
4324 err_remove_device:
4325 iopf_queue_remove_device(iommu->iopf_queue, dev);
4326
4327 return ret;
4328 }
4329
4330 static int intel_iommu_disable_iopf(struct device *dev)
4331 {
4332 struct device_domain_info *info = dev_iommu_priv_get(dev);
4333 struct intel_iommu *iommu = info->iommu;
4334
4335 if (!info->pri_enabled)
4336 return -EINVAL;
4337
4338 /* Disable new PRI reception: */
4339 context_flip_pri(info, false);
4340
4341 /*
4342 * Remove device from fault queue and acknowledge all outstanding
4343 * PRQs to the device:
4344 */
4345 iopf_queue_remove_device(iommu->iopf_queue, dev);
4346
4347 /*
4348 * The PCIe spec states that clearing the PRI enable bit prevents the
4349 * Page Request Interface from issuing new page requests, but there may
4350 * still be outstanding page requests that have been transmitted or are
4351 * queued for transmission. This is supposed to be called after the
4352 * device driver has stopped DMA, all PASIDs have been unbound and the
4353 * outstanding PRQs have been drained.
4354 */
4355 pci_disable_pri(to_pci_dev(dev));
4356 info->pri_enabled = 0;
4357
4358 return 0;
4359 }
4360
4361 static int
4362 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4363 {
4364 switch (feat) {
4365 case IOMMU_DEV_FEAT_IOPF:
4366 return intel_iommu_enable_iopf(dev);
4367
4368 case IOMMU_DEV_FEAT_SVA:
4369 return intel_iommu_enable_sva(dev);
4370
4371 default:
4372 return -ENODEV;
4373 }
4374 }
4375
4376 static int
4377 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4378 {
4379 switch (feat) {
4380 case IOMMU_DEV_FEAT_IOPF:
4381 return intel_iommu_disable_iopf(dev);
4382
4383 case IOMMU_DEV_FEAT_SVA:
4384 return 0;
4385
4386 default:
4387 return -ENODEV;
4388 }
4389 }
4390
4391 static bool intel_iommu_is_attach_deferred(struct device *dev)
4392 {
4393 struct device_domain_info *info = dev_iommu_priv_get(dev);
4394
4395 return translation_pre_enabled(info->iommu) && !info->domain;
4396 }
4397
4398 /*
4399 * Check that the device does not live on an external-facing PCI port that is
4400 * marked as untrusted. Such devices should not be able to apply quirks and
4401 * thus not be able to bypass the IOMMU restrictions.
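 *
 * Each quirk below therefore bails out early on such devices, e.g.:
 *
 *	if (risky_device(pdev))
 *		return;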
4402 */ 4403 static bool risky_device(struct pci_dev *pdev) 4404 { 4405 if (pdev->untrusted) { 4406 pci_info(pdev, 4407 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n", 4408 pdev->vendor, pdev->device); 4409 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n"); 4410 return true; 4411 } 4412 return false; 4413 } 4414 4415 static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain, 4416 unsigned long iova, size_t size) 4417 { 4418 cache_tag_flush_range_np(to_dmar_domain(domain), iova, iova + size - 1); 4419 4420 return 0; 4421 } 4422 4423 static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid, 4424 struct iommu_domain *domain) 4425 { 4426 struct device_domain_info *info = dev_iommu_priv_get(dev); 4427 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4428 struct dev_pasid_info *curr, *dev_pasid = NULL; 4429 struct intel_iommu *iommu = info->iommu; 4430 unsigned long flags; 4431 4432 spin_lock_irqsave(&dmar_domain->lock, flags); 4433 list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) { 4434 if (curr->dev == dev && curr->pasid == pasid) { 4435 list_del(&curr->link_domain); 4436 dev_pasid = curr; 4437 break; 4438 } 4439 } 4440 WARN_ON_ONCE(!dev_pasid); 4441 spin_unlock_irqrestore(&dmar_domain->lock, flags); 4442 4443 cache_tag_unassign_domain(dmar_domain, dev, pasid); 4444 domain_detach_iommu(dmar_domain, iommu); 4445 intel_iommu_debugfs_remove_dev_pasid(dev_pasid); 4446 kfree(dev_pasid); 4447 intel_pasid_tear_down_entry(iommu, dev, pasid, false); 4448 intel_drain_pasid_prq(dev, pasid); 4449 } 4450 4451 static int intel_iommu_set_dev_pasid(struct iommu_domain *domain, 4452 struct device *dev, ioasid_t pasid) 4453 { 4454 struct device_domain_info *info = dev_iommu_priv_get(dev); 4455 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4456 struct intel_iommu *iommu = info->iommu; 4457 struct dev_pasid_info *dev_pasid; 4458 unsigned long flags; 4459 int ret; 4460 4461 if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev)) 4462 return -EOPNOTSUPP; 4463 4464 if (domain->dirty_ops) 4465 return -EINVAL; 4466 4467 if (context_copied(iommu, info->bus, info->devfn)) 4468 return -EBUSY; 4469 4470 ret = prepare_domain_attach_device(domain, dev); 4471 if (ret) 4472 return ret; 4473 4474 dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL); 4475 if (!dev_pasid) 4476 return -ENOMEM; 4477 4478 ret = domain_attach_iommu(dmar_domain, iommu); 4479 if (ret) 4480 goto out_free; 4481 4482 ret = cache_tag_assign_domain(dmar_domain, dev, pasid); 4483 if (ret) 4484 goto out_detach_iommu; 4485 4486 if (domain_type_is_si(dmar_domain)) 4487 ret = intel_pasid_setup_pass_through(iommu, dev, pasid); 4488 else if (dmar_domain->use_first_level) 4489 ret = domain_setup_first_level(iommu, dmar_domain, 4490 dev, pasid); 4491 else 4492 ret = intel_pasid_setup_second_level(iommu, dmar_domain, 4493 dev, pasid); 4494 if (ret) 4495 goto out_unassign_tag; 4496 4497 dev_pasid->dev = dev; 4498 dev_pasid->pasid = pasid; 4499 spin_lock_irqsave(&dmar_domain->lock, flags); 4500 list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids); 4501 spin_unlock_irqrestore(&dmar_domain->lock, flags); 4502 4503 if (domain->type & __IOMMU_DOMAIN_PAGING) 4504 intel_iommu_debugfs_create_dev_pasid(dev_pasid); 4505 4506 return 0; 4507 out_unassign_tag: 4508 cache_tag_unassign_domain(dmar_domain, dev, pasid); 4509 out_detach_iommu: 4510 domain_detach_iommu(dmar_domain, iommu); 4511 out_free: 4512 kfree(dev_pasid); 4513 return ret; 4514 } 4515 4516 
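/*
 * intel_iommu_hw_info() below reports the raw capability and extended
 * capability registers to user space through iommufd. A rough sketch of
 * how a user-space consumer might read them back (illustrative only; the
 * authoritative layout is the iommufd uAPI in
 * include/uapi/linux/iommufd.h):
 *
 *	struct iommu_hw_info_vtd vtd = {};
 *	struct iommu_hw_info cmd = {
 *		.size = sizeof(cmd),
 *		.dev_id = dev_id,
 *		.data_len = sizeof(vtd),
 *		.data_uptr = (uintptr_t)&vtd,
 *	};
 *
 *	if (!ioctl(iommufd, IOMMU_GET_HW_INFO, &cmd) &&
 *	    cmd.out_data_type == IOMMU_HW_INFO_TYPE_INTEL_VTD)
 *		printf("cap %llx ecap %llx\n", vtd.cap_reg, vtd.ecap_reg);
 */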
static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type) 4517 { 4518 struct device_domain_info *info = dev_iommu_priv_get(dev); 4519 struct intel_iommu *iommu = info->iommu; 4520 struct iommu_hw_info_vtd *vtd; 4521 4522 vtd = kzalloc(sizeof(*vtd), GFP_KERNEL); 4523 if (!vtd) 4524 return ERR_PTR(-ENOMEM); 4525 4526 vtd->flags = IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17; 4527 vtd->cap_reg = iommu->cap; 4528 vtd->ecap_reg = iommu->ecap; 4529 *length = sizeof(*vtd); 4530 *type = IOMMU_HW_INFO_TYPE_INTEL_VTD; 4531 return vtd; 4532 } 4533 4534 /* 4535 * Set dirty tracking for the device list of a domain. The caller must 4536 * hold the domain->lock when calling it. 4537 */ 4538 static int device_set_dirty_tracking(struct list_head *devices, bool enable) 4539 { 4540 struct device_domain_info *info; 4541 int ret = 0; 4542 4543 list_for_each_entry(info, devices, link) { 4544 ret = intel_pasid_setup_dirty_tracking(info->iommu, info->dev, 4545 IOMMU_NO_PASID, enable); 4546 if (ret) 4547 break; 4548 } 4549 4550 return ret; 4551 } 4552 4553 static int parent_domain_set_dirty_tracking(struct dmar_domain *domain, 4554 bool enable) 4555 { 4556 struct dmar_domain *s1_domain; 4557 unsigned long flags; 4558 int ret; 4559 4560 spin_lock(&domain->s1_lock); 4561 list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) { 4562 spin_lock_irqsave(&s1_domain->lock, flags); 4563 ret = device_set_dirty_tracking(&s1_domain->devices, enable); 4564 spin_unlock_irqrestore(&s1_domain->lock, flags); 4565 if (ret) 4566 goto err_unwind; 4567 } 4568 spin_unlock(&domain->s1_lock); 4569 return 0; 4570 4571 err_unwind: 4572 list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) { 4573 spin_lock_irqsave(&s1_domain->lock, flags); 4574 device_set_dirty_tracking(&s1_domain->devices, 4575 domain->dirty_tracking); 4576 spin_unlock_irqrestore(&s1_domain->lock, flags); 4577 } 4578 spin_unlock(&domain->s1_lock); 4579 return ret; 4580 } 4581 4582 static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain, 4583 bool enable) 4584 { 4585 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4586 int ret; 4587 4588 spin_lock(&dmar_domain->lock); 4589 if (dmar_domain->dirty_tracking == enable) 4590 goto out_unlock; 4591 4592 ret = device_set_dirty_tracking(&dmar_domain->devices, enable); 4593 if (ret) 4594 goto err_unwind; 4595 4596 if (dmar_domain->nested_parent) { 4597 ret = parent_domain_set_dirty_tracking(dmar_domain, enable); 4598 if (ret) 4599 goto err_unwind; 4600 } 4601 4602 dmar_domain->dirty_tracking = enable; 4603 out_unlock: 4604 spin_unlock(&dmar_domain->lock); 4605 4606 return 0; 4607 4608 err_unwind: 4609 device_set_dirty_tracking(&dmar_domain->devices, 4610 dmar_domain->dirty_tracking); 4611 spin_unlock(&dmar_domain->lock); 4612 return ret; 4613 } 4614 4615 static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain, 4616 unsigned long iova, size_t size, 4617 unsigned long flags, 4618 struct iommu_dirty_bitmap *dirty) 4619 { 4620 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4621 unsigned long end = iova + size - 1; 4622 unsigned long pgsize; 4623 4624 /* 4625 * IOMMUFD core calls into a dirty tracking disabled domain without an 4626 * IOVA bitmap set in order to clean dirty bits in all PTEs that might 4627 * have occurred when we stopped dirty tracking. This ensures that we 4628 * never inherit dirtied bits from a previous cycle. 
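 *
 * That is: with dirty tracking disabled, only such a clearing walk
 * (dirty->bitmap == NULL) is accepted; an actual read-back into a user
 * bitmap is rejected with -EINVAL below.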
4629 */ 4630 if (!dmar_domain->dirty_tracking && dirty->bitmap) 4631 return -EINVAL; 4632 4633 do { 4634 struct dma_pte *pte; 4635 int lvl = 0; 4636 4637 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl, 4638 GFP_ATOMIC); 4639 pgsize = level_size(lvl) << VTD_PAGE_SHIFT; 4640 if (!pte || !dma_pte_present(pte)) { 4641 iova += pgsize; 4642 continue; 4643 } 4644 4645 if (dma_sl_pte_test_and_clear_dirty(pte, flags)) 4646 iommu_dirty_bitmap_record(dirty, iova, pgsize); 4647 iova += pgsize; 4648 } while (iova < end); 4649 4650 return 0; 4651 } 4652 4653 static const struct iommu_dirty_ops intel_dirty_ops = { 4654 .set_dirty_tracking = intel_iommu_set_dirty_tracking, 4655 .read_and_clear_dirty = intel_iommu_read_and_clear_dirty, 4656 }; 4657 4658 const struct iommu_ops intel_iommu_ops = { 4659 .blocked_domain = &blocking_domain, 4660 .release_domain = &blocking_domain, 4661 .capable = intel_iommu_capable, 4662 .hw_info = intel_iommu_hw_info, 4663 .domain_alloc = intel_iommu_domain_alloc, 4664 .domain_alloc_user = intel_iommu_domain_alloc_user, 4665 .domain_alloc_sva = intel_svm_domain_alloc, 4666 .probe_device = intel_iommu_probe_device, 4667 .release_device = intel_iommu_release_device, 4668 .get_resv_regions = intel_iommu_get_resv_regions, 4669 .device_group = intel_iommu_device_group, 4670 .dev_enable_feat = intel_iommu_dev_enable_feat, 4671 .dev_disable_feat = intel_iommu_dev_disable_feat, 4672 .is_attach_deferred = intel_iommu_is_attach_deferred, 4673 .def_domain_type = device_def_domain_type, 4674 .remove_dev_pasid = intel_iommu_remove_dev_pasid, 4675 .pgsize_bitmap = SZ_4K, 4676 #ifdef CONFIG_INTEL_IOMMU_SVM 4677 .page_response = intel_svm_page_response, 4678 #endif 4679 .default_domain_ops = &(const struct iommu_domain_ops) { 4680 .attach_dev = intel_iommu_attach_device, 4681 .set_dev_pasid = intel_iommu_set_dev_pasid, 4682 .map_pages = intel_iommu_map_pages, 4683 .unmap_pages = intel_iommu_unmap_pages, 4684 .iotlb_sync_map = intel_iommu_iotlb_sync_map, 4685 .flush_iotlb_all = intel_flush_iotlb_all, 4686 .iotlb_sync = intel_iommu_tlb_sync, 4687 .iova_to_phys = intel_iommu_iova_to_phys, 4688 .free = intel_iommu_domain_free, 4689 .enforce_cache_coherency = intel_iommu_enforce_cache_coherency, 4690 } 4691 }; 4692 4693 static void quirk_iommu_igfx(struct pci_dev *dev) 4694 { 4695 if (risky_device(dev)) 4696 return; 4697 4698 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n"); 4699 disable_igfx_iommu = 1; 4700 } 4701 4702 /* G4x/GM45 integrated gfx dmar support is totally busted. 
*/ 4703 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx); 4704 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx); 4705 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx); 4706 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx); 4707 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx); 4708 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx); 4709 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx); 4710 4711 /* Broadwell igfx malfunctions with dmar */ 4712 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx); 4713 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx); 4714 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx); 4715 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx); 4716 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx); 4717 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx); 4718 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx); 4719 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx); 4720 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx); 4721 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx); 4722 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx); 4723 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx); 4724 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx); 4725 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx); 4726 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx); 4727 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx); 4728 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx); 4729 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx); 4730 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx); 4731 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx); 4732 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx); 4733 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx); 4734 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx); 4735 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx); 4736 4737 static void quirk_iommu_rwbf(struct pci_dev *dev) 4738 { 4739 if (risky_device(dev)) 4740 return; 4741 4742 /* 4743 * Mobile 4 Series Chipset neglects to set RWBF capability, 4744 * but needs it. Same seems to hold for the desktop versions. 
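 *
 * Setting rwbf_quirk below makes the driver flush the write buffer even
 * though the capability register does not advertise RWBF; the flush
 * helper is assumed to skip the flush only when neither is set, roughly:
 *
 *	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
 *		return;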
4745 */ 4746 pci_info(dev, "Forcing write-buffer flush capability\n"); 4747 rwbf_quirk = 1; 4748 } 4749 4750 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf); 4751 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf); 4752 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf); 4753 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf); 4754 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf); 4755 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf); 4756 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf); 4757 4758 #define GGC 0x52 4759 #define GGC_MEMORY_SIZE_MASK (0xf << 8) 4760 #define GGC_MEMORY_SIZE_NONE (0x0 << 8) 4761 #define GGC_MEMORY_SIZE_1M (0x1 << 8) 4762 #define GGC_MEMORY_SIZE_2M (0x3 << 8) 4763 #define GGC_MEMORY_VT_ENABLED (0x8 << 8) 4764 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8) 4765 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8) 4766 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8) 4767 4768 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev) 4769 { 4770 unsigned short ggc; 4771 4772 if (risky_device(dev)) 4773 return; 4774 4775 if (pci_read_config_word(dev, GGC, &ggc)) 4776 return; 4777 4778 if (!(ggc & GGC_MEMORY_VT_ENABLED)) { 4779 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n"); 4780 disable_igfx_iommu = 1; 4781 } else if (!disable_igfx_iommu) { 4782 /* we have to ensure the gfx device is idle before we flush */ 4783 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n"); 4784 iommu_set_dma_strict(); 4785 } 4786 } 4787 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt); 4788 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt); 4789 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt); 4790 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt); 4791 4792 static void quirk_igfx_skip_te_disable(struct pci_dev *dev) 4793 { 4794 unsigned short ver; 4795 4796 if (!IS_GFX_DEVICE(dev)) 4797 return; 4798 4799 ver = (dev->device >> 8) & 0xff; 4800 if (ver != 0x45 && ver != 0x46 && ver != 0x4c && 4801 ver != 0x4e && ver != 0x8a && ver != 0x98 && 4802 ver != 0x9a && ver != 0xa7 && ver != 0x7d) 4803 return; 4804 4805 if (risky_device(dev)) 4806 return; 4807 4808 pci_info(dev, "Skip IOMMU disabling for graphics\n"); 4809 iommu_skip_te_disable = 1; 4810 } 4811 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable); 4812 4813 /* On Tylersburg chipsets, some BIOSes have been known to enable the 4814 ISOCH DMAR unit for the Azalia sound device, but not give it any 4815 TLB entries, which causes it to deadlock. Check for that. We do 4816 this in a function called from init_dmars(), instead of in a PCI 4817 quirk, because we don't want to print the obnoxious "BIOS broken" 4818 message if VT-d is actually disabled. 4819 */ 4820 static void __init check_tylersburg_isoch(void) 4821 { 4822 struct pci_dev *pdev; 4823 uint32_t vtisochctrl; 4824 4825 /* If there's no Azalia in the system anyway, forget it. */ 4826 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL); 4827 if (!pdev) 4828 return; 4829 4830 if (risky_device(pdev)) { 4831 pci_dev_put(pdev); 4832 return; 4833 } 4834 4835 pci_dev_put(pdev); 4836 4837 /* System Management Registers. Might be hidden, in which case 4838 we can't do the sanity check. 
But that's OK, because the
4839 known-broken BIOSes _don't_ actually hide it, so far. */
4840 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4841 if (!pdev)
4842 return;
4843
4844 if (risky_device(pdev)) {
4845 pci_dev_put(pdev);
4846 return;
4847 }
4848
4849 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4850 pci_dev_put(pdev);
4851 return;
4852 }
4853
4854 pci_dev_put(pdev);
4855
4856 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4857 if (vtisochctrl & 1)
4858 return;
4859
4860 /* Drop all bits other than the number of TLB entries */
4861 vtisochctrl &= 0x1c;
4862
4863 /* If we have the recommended number of TLB entries (16), fine. */
4864 if (vtisochctrl == 0x10)
4865 return;
4866
4867 /* Zero TLB entries? You get to ride the short bus to school. */
4868 if (!vtisochctrl) {
4869 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4870 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4871 dmi_get_system_info(DMI_BIOS_VENDOR),
4872 dmi_get_system_info(DMI_BIOS_VERSION),
4873 dmi_get_system_info(DMI_PRODUCT_VERSION));
4874 iommu_identity_mapping |= IDENTMAP_AZALIA;
4875 return;
4876 }
4877
4878 pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4879 vtisochctrl);
4880 }
4881
4882 /*
4883 * Here we deal with a device TLB defect where the device may inadvertently issue ATS
4884 * invalidation completion before posted writes initiated with translated address
4885 * that utilized translations matching the invalidation address range, violating
4886 * the invalidation completion ordering.
4887 * Therefore, any use case that cannot guarantee DMA is stopped before unmap is
4888 * vulnerable to this defect. In other words, any dTLB invalidation initiated not
4889 * under the control of the trusted/privileged host device driver must use this
4890 * quirk.
4891 * Device TLBs are invalidated under the following six conditions:
4892 * 1. Device driver does DMA API unmap IOVA
4893 * 2. Device driver unbinds a PASID from a process, sva_unbind_device()
4894 * 3. PASID is torn down, after PASID cache is flushed. e.g. process
4895 * exit_mmap() due to crash
4896 * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where
4897 * VM has to free pages that were unmapped
4898 * 5. Userspace driver unmaps a DMA buffer
4899 * 6. Cache invalidation in vSVA usage (upcoming)
4900 *
4901 * For #1 and #2, device drivers are responsible for stopping DMA traffic
4902 * before unmap/unbind. For #3, the iommu driver is invoked via mmu_notifier to
4903 * invalidate the TLB the same way as a normal user unmap, which will use this quirk.
4904 * The dTLB invalidation after a PASID cache flush does not need this quirk.
4905 *
4906 * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
4907 */
4908 void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
4909 unsigned long address, unsigned long mask,
4910 u32 pasid, u16 qdep)
4911 {
4912 u16 sid;
4913
4914 if (likely(!info->dtlb_extra_inval))
4915 return;
4916
4917 sid = PCI_DEVID(info->bus, info->devfn);
4918 if (pasid == IOMMU_NO_PASID) {
4919 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
4920 qdep, address, mask);
4921 } else {
4922 qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
4923 pasid, qdep, address, mask);
4924 }
4925 }
4926
4927 #define ecmd_get_status_code(res) (((res) & 0xff) >> 1)
4928
4929 /*
4930 * Function to submit a command to the enhanced command interface.
The 4931 * valid enhanced command descriptions are defined in Table 47 of the 4932 * VT-d spec. The VT-d hardware implementation may support some but not 4933 * all commands, which can be determined by checking the Enhanced 4934 * Command Capability Register. 4935 * 4936 * Return values: 4937 * - 0: Command successful without any error; 4938 * - Negative: software error value; 4939 * - Nonzero positive: failure status code defined in Table 48. 4940 */ 4941 int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob) 4942 { 4943 unsigned long flags; 4944 u64 res; 4945 int ret; 4946 4947 if (!cap_ecmds(iommu->cap)) 4948 return -ENODEV; 4949 4950 raw_spin_lock_irqsave(&iommu->register_lock, flags); 4951 4952 res = dmar_readq(iommu->reg + DMAR_ECRSP_REG); 4953 if (res & DMA_ECMD_ECRSP_IP) { 4954 ret = -EBUSY; 4955 goto err; 4956 } 4957 4958 /* 4959 * Unconditionally write the operand B, because 4960 * - There is no side effect if an ecmd doesn't require an 4961 * operand B, but we set the register to some value. 4962 * - It's not invoked in any critical path. The extra MMIO 4963 * write doesn't bring any performance concerns. 4964 */ 4965 dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob); 4966 dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT)); 4967 4968 IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq, 4969 !(res & DMA_ECMD_ECRSP_IP), res); 4970 4971 if (res & DMA_ECMD_ECRSP_IP) { 4972 ret = -ETIMEDOUT; 4973 goto err; 4974 } 4975 4976 ret = ecmd_get_status_code(res); 4977 err: 4978 raw_spin_unlock_irqrestore(&iommu->register_lock, flags); 4979 4980 return ret; 4981 } 4982
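/*
 * Illustrative caller sketch for ecmd_submit_sync(), not part of the
 * driver; the opcode name DMA_ECMD_FREEZE is assumed here purely for
 * illustration. It distinguishes the three return classes documented
 * above: success, a software error, and a status code from Table 48.
 *
 *	int ret = ecmd_submit_sync(iommu, DMA_ECMD_FREEZE, 0, 0);
 *
 *	if (ret < 0)
 *		return ret;
 *	if (ret > 0)
 *		pr_warn("ecmd failed with status code 0x%x\n", ret);
 */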