1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright © 2006-2014 Intel Corporation. 4 * 5 * Authors: David Woodhouse <dwmw2@infradead.org>, 6 * Ashok Raj <ashok.raj@intel.com>, 7 * Shaohua Li <shaohua.li@intel.com>, 8 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>, 9 * Fenghua Yu <fenghua.yu@intel.com> 10 * Joerg Roedel <jroedel@suse.de> 11 */ 12 13 #define pr_fmt(fmt) "DMAR: " fmt 14 #define dev_fmt(fmt) pr_fmt(fmt) 15 16 #include <linux/crash_dump.h> 17 #include <linux/dma-direct.h> 18 #include <linux/dmi.h> 19 #include <linux/memory.h> 20 #include <linux/pci.h> 21 #include <linux/pci-ats.h> 22 #include <linux/spinlock.h> 23 #include <linux/syscore_ops.h> 24 #include <linux/tboot.h> 25 #include <uapi/linux/iommufd.h> 26 27 #include "iommu.h" 28 #include "../dma-iommu.h" 29 #include "../irq_remapping.h" 30 #include "../iommu-pages.h" 31 #include "pasid.h" 32 #include "cap_audit.h" 33 #include "perfmon.h" 34 35 #define ROOT_SIZE VTD_PAGE_SIZE 36 #define CONTEXT_SIZE VTD_PAGE_SIZE 37 38 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY) 39 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB) 40 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) 41 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e) 42 43 #define IOAPIC_RANGE_START (0xfee00000) 44 #define IOAPIC_RANGE_END (0xfeefffff) 45 #define IOVA_START_ADDR (0x1000) 46 47 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57 48 49 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1) 50 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1) 51 52 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR 53 to match. That way, we can use 'unsigned long' for PFNs with impunity. */ 54 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \ 55 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1)) 56 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT) 57 58 static void __init check_tylersburg_isoch(void); 59 static int rwbf_quirk; 60 61 /* 62 * set to 1 to panic kernel if can't successfully enable VT-d 63 * (used when kernel is launched w/ TXT) 64 */ 65 static int force_on = 0; 66 static int intel_iommu_tboot_noforce; 67 static int no_platform_optin; 68 69 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry)) 70 71 /* 72 * Take a root_entry and return the Lower Context Table Pointer (LCTP) 73 * if marked present. 74 */ 75 static phys_addr_t root_entry_lctp(struct root_entry *re) 76 { 77 if (!(re->lo & 1)) 78 return 0; 79 80 return re->lo & VTD_PAGE_MASK; 81 } 82 83 /* 84 * Take a root_entry and return the Upper Context Table Pointer (UCTP) 85 * if marked present. 
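 * As an illustration only (the value below is made up, not taken from any
 * spec table): if re->hi were 0x0000000123456001, bit 0 marks the entry
 * present and the helper returns 0x123456000, i.e. re->hi with the low
 * 12 bits cleared by VTD_PAGE_MASK, giving the 4KiB-aligned table address.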
 */
static phys_addr_t root_entry_uctp(struct root_entry *re)
{
        if (!(re->hi & 1))
                return 0;

        return re->hi & VTD_PAGE_MASK;
}

static int device_rid_cmp_key(const void *key, const struct rb_node *node)
{
        struct device_domain_info *info =
                rb_entry(node, struct device_domain_info, node);
        const u16 *rid_lhs = key;

        if (*rid_lhs < PCI_DEVID(info->bus, info->devfn))
                return -1;

        if (*rid_lhs > PCI_DEVID(info->bus, info->devfn))
                return 1;

        return 0;
}

static int device_rid_cmp(struct rb_node *lhs, const struct rb_node *rhs)
{
        struct device_domain_info *info =
                rb_entry(lhs, struct device_domain_info, node);
        u16 key = PCI_DEVID(info->bus, info->devfn);

        return device_rid_cmp_key(&key, rhs);
}

/*
 * Looks up an IOMMU-probed device using its source ID.
 *
 * Returns the pointer to the device if there is a match. Otherwise,
 * returns NULL.
 *
 * Note that this helper doesn't guarantee that the device won't be
 * released by the iommu subsystem after being returned. The caller
 * should use its own synchronization mechanism to avoid the device
 * being released during its use if that is possibly the case.
 */
struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid)
{
        struct device_domain_info *info = NULL;
        struct rb_node *node;
        unsigned long flags;

        spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
        node = rb_find(&rid, &iommu->device_rbtree, device_rid_cmp_key);
        if (node)
                info = rb_entry(node, struct device_domain_info, node);
        spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);

        return info ? info->dev : NULL;
}

static int device_rbtree_insert(struct intel_iommu *iommu,
                                struct device_domain_info *info)
{
        struct rb_node *curr;
        unsigned long flags;

        spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
        curr = rb_find_add(&info->node, &iommu->device_rbtree, device_rid_cmp);
        spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
        if (WARN_ON(curr))
                return -EEXIST;

        return 0;
}

static void device_rbtree_remove(struct device_domain_info *info)
{
        struct intel_iommu *iommu = info->iommu;
        unsigned long flags;

        spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
        rb_erase(&info->node, &iommu->device_rbtree);
        spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
}

/*
 * This domain is a static identity mapping domain.
 *      1. This domain creates a static 1:1 mapping to all usable memory.
 *      2. It maps to each iommu if successful.
 *      3. Each iommu maps to this domain if successful.
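 *
 * The 1:1 mapping itself is built by si_domain_init() below, which walks
 * every online node's usable memory ranges (plus the RMRR regions) and maps
 * each range identity via iommu_domain_identity_map().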
175 */ 176 static struct dmar_domain *si_domain; 177 static int hw_pass_through = 1; 178 179 struct dmar_rmrr_unit { 180 struct list_head list; /* list of rmrr units */ 181 struct acpi_dmar_header *hdr; /* ACPI header */ 182 u64 base_address; /* reserved base address*/ 183 u64 end_address; /* reserved end address */ 184 struct dmar_dev_scope *devices; /* target devices */ 185 int devices_cnt; /* target device count */ 186 }; 187 188 struct dmar_atsr_unit { 189 struct list_head list; /* list of ATSR units */ 190 struct acpi_dmar_header *hdr; /* ACPI header */ 191 struct dmar_dev_scope *devices; /* target devices */ 192 int devices_cnt; /* target device count */ 193 u8 include_all:1; /* include all ports */ 194 }; 195 196 struct dmar_satc_unit { 197 struct list_head list; /* list of SATC units */ 198 struct acpi_dmar_header *hdr; /* ACPI header */ 199 struct dmar_dev_scope *devices; /* target devices */ 200 struct intel_iommu *iommu; /* the corresponding iommu */ 201 int devices_cnt; /* target device count */ 202 u8 atc_required:1; /* ATS is required */ 203 }; 204 205 static LIST_HEAD(dmar_atsr_units); 206 static LIST_HEAD(dmar_rmrr_units); 207 static LIST_HEAD(dmar_satc_units); 208 209 #define for_each_rmrr_units(rmrr) \ 210 list_for_each_entry(rmrr, &dmar_rmrr_units, list) 211 212 static void intel_iommu_domain_free(struct iommu_domain *domain); 213 214 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON); 215 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON); 216 217 int intel_iommu_enabled = 0; 218 EXPORT_SYMBOL_GPL(intel_iommu_enabled); 219 220 static int intel_iommu_superpage = 1; 221 static int iommu_identity_mapping; 222 static int iommu_skip_te_disable; 223 static int disable_igfx_iommu; 224 225 #define IDENTMAP_AZALIA 4 226 227 const struct iommu_ops intel_iommu_ops; 228 static const struct iommu_dirty_ops intel_dirty_ops; 229 230 static bool translation_pre_enabled(struct intel_iommu *iommu) 231 { 232 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED); 233 } 234 235 static void clear_translation_pre_enabled(struct intel_iommu *iommu) 236 { 237 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED; 238 } 239 240 static void init_translation_status(struct intel_iommu *iommu) 241 { 242 u32 gsts; 243 244 gsts = readl(iommu->reg + DMAR_GSTS_REG); 245 if (gsts & DMA_GSTS_TES) 246 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED; 247 } 248 249 static int __init intel_iommu_setup(char *str) 250 { 251 if (!str) 252 return -EINVAL; 253 254 while (*str) { 255 if (!strncmp(str, "on", 2)) { 256 dmar_disabled = 0; 257 pr_info("IOMMU enabled\n"); 258 } else if (!strncmp(str, "off", 3)) { 259 dmar_disabled = 1; 260 no_platform_optin = 1; 261 pr_info("IOMMU disabled\n"); 262 } else if (!strncmp(str, "igfx_off", 8)) { 263 disable_igfx_iommu = 1; 264 pr_info("Disable GFX device mapping\n"); 265 } else if (!strncmp(str, "forcedac", 8)) { 266 pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n"); 267 iommu_dma_forcedac = true; 268 } else if (!strncmp(str, "strict", 6)) { 269 pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n"); 270 iommu_set_dma_strict(); 271 } else if (!strncmp(str, "sp_off", 6)) { 272 pr_info("Disable supported super page\n"); 273 intel_iommu_superpage = 0; 274 } else if (!strncmp(str, "sm_on", 5)) { 275 pr_info("Enable scalable mode if hardware supports\n"); 276 intel_iommu_sm = 1; 277 } else if (!strncmp(str, "sm_off", 6)) { 278 pr_info("Scalable mode is disallowed\n"); 279 intel_iommu_sm = 0; 280 } else if (!strncmp(str, 
"tboot_noforce", 13)) { 281 pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n"); 282 intel_iommu_tboot_noforce = 1; 283 } else { 284 pr_notice("Unknown option - '%s'\n", str); 285 } 286 287 str += strcspn(str, ","); 288 while (*str == ',') 289 str++; 290 } 291 292 return 1; 293 } 294 __setup("intel_iommu=", intel_iommu_setup); 295 296 static int domain_type_is_si(struct dmar_domain *domain) 297 { 298 return domain->domain.type == IOMMU_DOMAIN_IDENTITY; 299 } 300 301 static int domain_pfn_supported(struct dmar_domain *domain, unsigned long pfn) 302 { 303 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT; 304 305 return !(addr_width < BITS_PER_LONG && pfn >> addr_width); 306 } 307 308 /* 309 * Calculate the Supported Adjusted Guest Address Widths of an IOMMU. 310 * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of 311 * the returned SAGAW. 312 */ 313 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu) 314 { 315 unsigned long fl_sagaw, sl_sagaw; 316 317 fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0); 318 sl_sagaw = cap_sagaw(iommu->cap); 319 320 /* Second level only. */ 321 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) 322 return sl_sagaw; 323 324 /* First level only. */ 325 if (!ecap_slts(iommu->ecap)) 326 return fl_sagaw; 327 328 return fl_sagaw & sl_sagaw; 329 } 330 331 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw) 332 { 333 unsigned long sagaw; 334 int agaw; 335 336 sagaw = __iommu_calculate_sagaw(iommu); 337 for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) { 338 if (test_bit(agaw, &sagaw)) 339 break; 340 } 341 342 return agaw; 343 } 344 345 /* 346 * Calculate max SAGAW for each iommu. 347 */ 348 int iommu_calculate_max_sagaw(struct intel_iommu *iommu) 349 { 350 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH); 351 } 352 353 /* 354 * calculate agaw for each iommu. 355 * "SAGAW" may be different across iommus, use a default agaw, and 356 * get a supported less agaw for iommus that don't support the default agaw. 357 */ 358 int iommu_calculate_agaw(struct intel_iommu *iommu) 359 { 360 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH); 361 } 362 363 static bool iommu_paging_structure_coherency(struct intel_iommu *iommu) 364 { 365 return sm_supported(iommu) ? 
366 ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap); 367 } 368 369 static void domain_update_iommu_coherency(struct dmar_domain *domain) 370 { 371 struct iommu_domain_info *info; 372 struct dmar_drhd_unit *drhd; 373 struct intel_iommu *iommu; 374 bool found = false; 375 unsigned long i; 376 377 domain->iommu_coherency = true; 378 xa_for_each(&domain->iommu_array, i, info) { 379 found = true; 380 if (!iommu_paging_structure_coherency(info->iommu)) { 381 domain->iommu_coherency = false; 382 break; 383 } 384 } 385 if (found) 386 return; 387 388 /* No hardware attached; use lowest common denominator */ 389 rcu_read_lock(); 390 for_each_active_iommu(iommu, drhd) { 391 if (!iommu_paging_structure_coherency(iommu)) { 392 domain->iommu_coherency = false; 393 break; 394 } 395 } 396 rcu_read_unlock(); 397 } 398 399 static int domain_update_iommu_superpage(struct dmar_domain *domain, 400 struct intel_iommu *skip) 401 { 402 struct dmar_drhd_unit *drhd; 403 struct intel_iommu *iommu; 404 int mask = 0x3; 405 406 if (!intel_iommu_superpage) 407 return 0; 408 409 /* set iommu_superpage to the smallest common denominator */ 410 rcu_read_lock(); 411 for_each_active_iommu(iommu, drhd) { 412 if (iommu != skip) { 413 if (domain && domain->use_first_level) { 414 if (!cap_fl1gp_support(iommu->cap)) 415 mask = 0x1; 416 } else { 417 mask &= cap_super_page_val(iommu->cap); 418 } 419 420 if (!mask) 421 break; 422 } 423 } 424 rcu_read_unlock(); 425 426 return fls(mask); 427 } 428 429 static int domain_update_device_node(struct dmar_domain *domain) 430 { 431 struct device_domain_info *info; 432 int nid = NUMA_NO_NODE; 433 unsigned long flags; 434 435 spin_lock_irqsave(&domain->lock, flags); 436 list_for_each_entry(info, &domain->devices, link) { 437 /* 438 * There could possibly be multiple device numa nodes as devices 439 * within the same domain may sit behind different IOMMUs. There 440 * isn't perfect answer in such situation, so we select first 441 * come first served policy. 442 */ 443 nid = dev_to_node(info->dev); 444 if (nid != NUMA_NO_NODE) 445 break; 446 } 447 spin_unlock_irqrestore(&domain->lock, flags); 448 449 return nid; 450 } 451 452 /* Return the super pagesize bitmap if supported. */ 453 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain) 454 { 455 unsigned long bitmap = 0; 456 457 /* 458 * 1-level super page supports page size of 2MiB, 2-level super page 459 * supports page size of both 2MiB and 1GiB. 460 */ 461 if (domain->iommu_superpage == 1) 462 bitmap |= SZ_2M; 463 else if (domain->iommu_superpage == 2) 464 bitmap |= SZ_2M | SZ_1G; 465 466 return bitmap; 467 } 468 469 /* Some capabilities may be different across iommus */ 470 void domain_update_iommu_cap(struct dmar_domain *domain) 471 { 472 domain_update_iommu_coherency(domain); 473 domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL); 474 475 /* 476 * If RHSA is missing, we should default to the device numa domain 477 * as fall back. 478 */ 479 if (domain->nid == NUMA_NO_NODE) 480 domain->nid = domain_update_device_node(domain); 481 482 /* 483 * First-level translation restricts the input-address to a 484 * canonical address (i.e., address bits 63:N have the same 485 * value as address bit [N-1], where N is 48-bits with 4-level 486 * paging and 57-bits with 5-level paging). Hence, skip bit 487 * [N-1]. 
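         * For example, with 4-level first-level paging (N = 48) the aperture
         * set below ends at __DOMAIN_MAX_ADDR(47) = (1ULL << 47) - 1, leaving
         * bit 47 for canonical sign extension rather than for IOVA bits.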
488 */ 489 if (domain->use_first_level) 490 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1); 491 else 492 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw); 493 494 domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain); 495 domain_update_iotlb(domain); 496 } 497 498 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus, 499 u8 devfn, int alloc) 500 { 501 struct root_entry *root = &iommu->root_entry[bus]; 502 struct context_entry *context; 503 u64 *entry; 504 505 /* 506 * Except that the caller requested to allocate a new entry, 507 * returning a copied context entry makes no sense. 508 */ 509 if (!alloc && context_copied(iommu, bus, devfn)) 510 return NULL; 511 512 entry = &root->lo; 513 if (sm_supported(iommu)) { 514 if (devfn >= 0x80) { 515 devfn -= 0x80; 516 entry = &root->hi; 517 } 518 devfn *= 2; 519 } 520 if (*entry & 1) 521 context = phys_to_virt(*entry & VTD_PAGE_MASK); 522 else { 523 unsigned long phy_addr; 524 if (!alloc) 525 return NULL; 526 527 context = iommu_alloc_page_node(iommu->node, GFP_ATOMIC); 528 if (!context) 529 return NULL; 530 531 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE); 532 phy_addr = virt_to_phys((void *)context); 533 *entry = phy_addr | 1; 534 __iommu_flush_cache(iommu, entry, sizeof(*entry)); 535 } 536 return &context[devfn]; 537 } 538 539 /** 540 * is_downstream_to_pci_bridge - test if a device belongs to the PCI 541 * sub-hierarchy of a candidate PCI-PCI bridge 542 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy 543 * @bridge: the candidate PCI-PCI bridge 544 * 545 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false. 546 */ 547 static bool 548 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge) 549 { 550 struct pci_dev *pdev, *pbridge; 551 552 if (!dev_is_pci(dev) || !dev_is_pci(bridge)) 553 return false; 554 555 pdev = to_pci_dev(dev); 556 pbridge = to_pci_dev(bridge); 557 558 if (pbridge->subordinate && 559 pbridge->subordinate->number <= pdev->bus->number && 560 pbridge->subordinate->busn_res.end >= pdev->bus->number) 561 return true; 562 563 return false; 564 } 565 566 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev) 567 { 568 struct dmar_drhd_unit *drhd; 569 u32 vtbar; 570 int rc; 571 572 /* We know that this device on this chipset has its own IOMMU. 573 * If we find it under a different IOMMU, then the BIOS is lying 574 * to us. Hope that the IOMMU for this device is actually 575 * disabled, and it needs no translation... 
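         * The check below reads a chipset register from config offset 0xb0 of
         * device 0, function 0 on the same bus and compares it against the
         * register base of the DRHD unit matched for this device; anything
         * other than the expected 0xa000 offset is treated as a firmware bug.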
 */
        rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
        if (rc) {
                /* "can't" happen */
                dev_info(&pdev->dev, "failed to run vt-d quirk\n");
                return false;
        }
        vtbar &= 0xffff0000;

        /* we know that this iommu should be at offset 0xa000 from vtbar */
        drhd = dmar_find_matched_drhd_unit(pdev);
        if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
                pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
                add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
                return true;
        }

        return false;
}

static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
{
        if (!iommu || iommu->drhd->ignored)
                return true;

        if (dev_is_pci(dev)) {
                struct pci_dev *pdev = to_pci_dev(dev);

                if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
                    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
                    quirk_ioat_snb_local_iommu(pdev))
                        return true;
        }

        return false;
}

static struct intel_iommu *device_lookup_iommu(struct device *dev, u8 *bus, u8 *devfn)
{
        struct dmar_drhd_unit *drhd = NULL;
        struct pci_dev *pdev = NULL;
        struct intel_iommu *iommu;
        struct device *tmp;
        u16 segment = 0;
        int i;

        if (!dev)
                return NULL;

        if (dev_is_pci(dev)) {
                struct pci_dev *pf_pdev;

                pdev = pci_real_dma_dev(to_pci_dev(dev));

                /* VFs aren't listed in scope tables; we need to look up
                 * the PF instead to find the IOMMU. */
                pf_pdev = pci_physfn(pdev);
                dev = &pf_pdev->dev;
                segment = pci_domain_nr(pdev->bus);
        } else if (has_acpi_companion(dev))
                dev = &ACPI_COMPANION(dev)->dev;

        rcu_read_lock();
        for_each_iommu(iommu, drhd) {
                if (pdev && segment != drhd->segment)
                        continue;

                for_each_active_dev_scope(drhd->devices,
                                          drhd->devices_cnt, i, tmp) {
                        if (tmp == dev) {
                                /* For a VF use its original BDF# not that of the PF
                                 * which we used for the IOMMU lookup. Strictly speaking
                                 * we could do this for all PCI devices; we only need to
                                 * get the BDF# from the scope table for ACPI matches.
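                                 * Every path that jumps to got_pdev below (VFs,
                                 * devices behind a scoped bridge, include_all
                                 * units) reports the device's own bus/devfn
                                 * rather than the scope entry's.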
*/ 650 if (pdev && pdev->is_virtfn) 651 goto got_pdev; 652 653 if (bus && devfn) { 654 *bus = drhd->devices[i].bus; 655 *devfn = drhd->devices[i].devfn; 656 } 657 goto out; 658 } 659 660 if (is_downstream_to_pci_bridge(dev, tmp)) 661 goto got_pdev; 662 } 663 664 if (pdev && drhd->include_all) { 665 got_pdev: 666 if (bus && devfn) { 667 *bus = pdev->bus->number; 668 *devfn = pdev->devfn; 669 } 670 goto out; 671 } 672 } 673 iommu = NULL; 674 out: 675 if (iommu_is_dummy(iommu, dev)) 676 iommu = NULL; 677 678 rcu_read_unlock(); 679 680 return iommu; 681 } 682 683 static void domain_flush_cache(struct dmar_domain *domain, 684 void *addr, int size) 685 { 686 if (!domain->iommu_coherency) 687 clflush_cache_range(addr, size); 688 } 689 690 static void free_context_table(struct intel_iommu *iommu) 691 { 692 struct context_entry *context; 693 int i; 694 695 if (!iommu->root_entry) 696 return; 697 698 for (i = 0; i < ROOT_ENTRY_NR; i++) { 699 context = iommu_context_addr(iommu, i, 0, 0); 700 if (context) 701 iommu_free_page(context); 702 703 if (!sm_supported(iommu)) 704 continue; 705 706 context = iommu_context_addr(iommu, i, 0x80, 0); 707 if (context) 708 iommu_free_page(context); 709 } 710 711 iommu_free_page(iommu->root_entry); 712 iommu->root_entry = NULL; 713 } 714 715 #ifdef CONFIG_DMAR_DEBUG 716 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn, 717 u8 bus, u8 devfn, struct dma_pte *parent, int level) 718 { 719 struct dma_pte *pte; 720 int offset; 721 722 while (1) { 723 offset = pfn_level_offset(pfn, level); 724 pte = &parent[offset]; 725 if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) { 726 pr_info("PTE not present at level %d\n", level); 727 break; 728 } 729 730 pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val); 731 732 if (level == 1) 733 break; 734 735 parent = phys_to_virt(dma_pte_addr(pte)); 736 level--; 737 } 738 } 739 740 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id, 741 unsigned long long addr, u32 pasid) 742 { 743 struct pasid_dir_entry *dir, *pde; 744 struct pasid_entry *entries, *pte; 745 struct context_entry *ctx_entry; 746 struct root_entry *rt_entry; 747 int i, dir_index, index, level; 748 u8 devfn = source_id & 0xff; 749 u8 bus = source_id >> 8; 750 struct dma_pte *pgtable; 751 752 pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr); 753 754 /* root entry dump */ 755 rt_entry = &iommu->root_entry[bus]; 756 if (!rt_entry) { 757 pr_info("root table entry is not present\n"); 758 return; 759 } 760 761 if (sm_supported(iommu)) 762 pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n", 763 rt_entry->hi, rt_entry->lo); 764 else 765 pr_info("root entry: 0x%016llx", rt_entry->lo); 766 767 /* context entry dump */ 768 ctx_entry = iommu_context_addr(iommu, bus, devfn, 0); 769 if (!ctx_entry) { 770 pr_info("context table entry is not present\n"); 771 return; 772 } 773 774 pr_info("context entry: hi 0x%016llx, low 0x%016llx\n", 775 ctx_entry->hi, ctx_entry->lo); 776 777 /* legacy mode does not require PASID entries */ 778 if (!sm_supported(iommu)) { 779 level = agaw_to_level(ctx_entry->hi & 7); 780 pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK); 781 goto pgtable_walk; 782 } 783 784 /* get the pointer to pasid directory entry */ 785 dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK); 786 if (!dir) { 787 pr_info("pasid directory entry is not present\n"); 788 return; 789 } 790 /* For request-without-pasid, get the pasid from context entry */ 791 if (intel_iommu_sm && pasid == 
IOMMU_PASID_INVALID) 792 pasid = IOMMU_NO_PASID; 793 794 dir_index = pasid >> PASID_PDE_SHIFT; 795 pde = &dir[dir_index]; 796 pr_info("pasid dir entry: 0x%016llx\n", pde->val); 797 798 /* get the pointer to the pasid table entry */ 799 entries = get_pasid_table_from_pde(pde); 800 if (!entries) { 801 pr_info("pasid table entry is not present\n"); 802 return; 803 } 804 index = pasid & PASID_PTE_MASK; 805 pte = &entries[index]; 806 for (i = 0; i < ARRAY_SIZE(pte->val); i++) 807 pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]); 808 809 if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) { 810 level = pte->val[2] & BIT_ULL(2) ? 5 : 4; 811 pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK); 812 } else { 813 level = agaw_to_level((pte->val[0] >> 2) & 0x7); 814 pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK); 815 } 816 817 pgtable_walk: 818 pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level); 819 } 820 #endif 821 822 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain, 823 unsigned long pfn, int *target_level, 824 gfp_t gfp) 825 { 826 struct dma_pte *parent, *pte; 827 int level = agaw_to_level(domain->agaw); 828 int offset; 829 830 if (!domain_pfn_supported(domain, pfn)) 831 /* Address beyond IOMMU's addressing capabilities. */ 832 return NULL; 833 834 parent = domain->pgd; 835 836 while (1) { 837 void *tmp_page; 838 839 offset = pfn_level_offset(pfn, level); 840 pte = &parent[offset]; 841 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte))) 842 break; 843 if (level == *target_level) 844 break; 845 846 if (!dma_pte_present(pte)) { 847 uint64_t pteval, tmp; 848 849 tmp_page = iommu_alloc_page_node(domain->nid, gfp); 850 851 if (!tmp_page) 852 return NULL; 853 854 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE); 855 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE; 856 if (domain->use_first_level) 857 pteval |= DMA_FL_PTE_US | DMA_FL_PTE_ACCESS; 858 859 tmp = 0ULL; 860 if (!try_cmpxchg64(&pte->val, &tmp, pteval)) 861 /* Someone else set it while we were thinking; use theirs. 
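                                 * try_cmpxchg64() only installs the freshly
                                 * allocated table if the PTE is still clear;
                                 * on failure we free our page and the walk
                                 * continues through the winner's table.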
*/ 862 iommu_free_page(tmp_page); 863 else 864 domain_flush_cache(domain, pte, sizeof(*pte)); 865 } 866 if (level == 1) 867 break; 868 869 parent = phys_to_virt(dma_pte_addr(pte)); 870 level--; 871 } 872 873 if (!*target_level) 874 *target_level = level; 875 876 return pte; 877 } 878 879 /* return address's pte at specific level */ 880 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain, 881 unsigned long pfn, 882 int level, int *large_page) 883 { 884 struct dma_pte *parent, *pte; 885 int total = agaw_to_level(domain->agaw); 886 int offset; 887 888 parent = domain->pgd; 889 while (level <= total) { 890 offset = pfn_level_offset(pfn, total); 891 pte = &parent[offset]; 892 if (level == total) 893 return pte; 894 895 if (!dma_pte_present(pte)) { 896 *large_page = total; 897 break; 898 } 899 900 if (dma_pte_superpage(pte)) { 901 *large_page = total; 902 return pte; 903 } 904 905 parent = phys_to_virt(dma_pte_addr(pte)); 906 total--; 907 } 908 return NULL; 909 } 910 911 /* clear last level pte, a tlb flush should be followed */ 912 static void dma_pte_clear_range(struct dmar_domain *domain, 913 unsigned long start_pfn, 914 unsigned long last_pfn) 915 { 916 unsigned int large_page; 917 struct dma_pte *first_pte, *pte; 918 919 if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) || 920 WARN_ON(start_pfn > last_pfn)) 921 return; 922 923 /* we don't need lock here; nobody else touches the iova range */ 924 do { 925 large_page = 1; 926 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page); 927 if (!pte) { 928 start_pfn = align_to_level(start_pfn + 1, large_page + 1); 929 continue; 930 } 931 do { 932 dma_clear_pte(pte); 933 start_pfn += lvl_to_nr_pages(large_page); 934 pte++; 935 } while (start_pfn <= last_pfn && !first_pte_in_page(pte)); 936 937 domain_flush_cache(domain, first_pte, 938 (void *)pte - (void *)first_pte); 939 940 } while (start_pfn && start_pfn <= last_pfn); 941 } 942 943 static void dma_pte_free_level(struct dmar_domain *domain, int level, 944 int retain_level, struct dma_pte *pte, 945 unsigned long pfn, unsigned long start_pfn, 946 unsigned long last_pfn) 947 { 948 pfn = max(start_pfn, pfn); 949 pte = &pte[pfn_level_offset(pfn, level)]; 950 951 do { 952 unsigned long level_pfn; 953 struct dma_pte *level_pte; 954 955 if (!dma_pte_present(pte) || dma_pte_superpage(pte)) 956 goto next; 957 958 level_pfn = pfn & level_mask(level); 959 level_pte = phys_to_virt(dma_pte_addr(pte)); 960 961 if (level > 2) { 962 dma_pte_free_level(domain, level - 1, retain_level, 963 level_pte, level_pfn, start_pfn, 964 last_pfn); 965 } 966 967 /* 968 * Free the page table if we're below the level we want to 969 * retain and the range covers the entire table. 970 */ 971 if (level < retain_level && !(start_pfn > level_pfn || 972 last_pfn < level_pfn + level_size(level) - 1)) { 973 dma_clear_pte(pte); 974 domain_flush_cache(domain, pte, sizeof(*pte)); 975 iommu_free_page(level_pte); 976 } 977 next: 978 pfn += level_size(level); 979 } while (!first_pte_in_page(++pte) && pfn <= last_pfn); 980 } 981 982 /* 983 * clear last level (leaf) ptes and free page table pages below the 984 * level we wish to keep intact. 
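 * For example, switch_to_super_page() passes retain_level as one above the
 * new superpage level, so the table that will hold the superpage PTE itself
 * is kept while everything underneath it is torn down.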
 */
static void dma_pte_free_pagetable(struct dmar_domain *domain,
                                   unsigned long start_pfn,
                                   unsigned long last_pfn,
                                   int retain_level)
{
        dma_pte_clear_range(domain, start_pfn, last_pfn);

        /* We don't need lock here; nobody else touches the iova range */
        dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
                           domain->pgd, 0, start_pfn, last_pfn);

        /* free pgd */
        if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
                iommu_free_page(domain->pgd);
                domain->pgd = NULL;
        }
}

/* When a page at a given level is being unlinked from its parent, we don't
   need to *modify* it at all. All we need to do is make a list of all the
   pages which can be freed just as soon as we've flushed the IOTLB and we
   know the hardware page-walk will no longer touch them.
   The 'pte' argument is the *parent* PTE, pointing to the page that is to
   be freed. */
static void dma_pte_list_pagetables(struct dmar_domain *domain,
                                    int level, struct dma_pte *pte,
                                    struct list_head *freelist)
{
        struct page *pg;

        pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
        list_add_tail(&pg->lru, freelist);

        if (level == 1)
                return;

        pte = page_address(pg);
        do {
                if (dma_pte_present(pte) && !dma_pte_superpage(pte))
                        dma_pte_list_pagetables(domain, level - 1, pte, freelist);
                pte++;
        } while (!first_pte_in_page(pte));
}

static void dma_pte_clear_level(struct dmar_domain *domain, int level,
                                struct dma_pte *pte, unsigned long pfn,
                                unsigned long start_pfn, unsigned long last_pfn,
                                struct list_head *freelist)
{
        struct dma_pte *first_pte = NULL, *last_pte = NULL;

        pfn = max(start_pfn, pfn);
        pte = &pte[pfn_level_offset(pfn, level)];

        do {
                unsigned long level_pfn = pfn & level_mask(level);

                if (!dma_pte_present(pte))
                        goto next;

                /* If range covers entire pagetable, free it */
                if (start_pfn <= level_pfn &&
                    last_pfn >= level_pfn + level_size(level) - 1) {
                        /* These subordinate page tables are going away entirely. Don't
                           bother to clear them; we're just going to *free* them. */
                        if (level > 1 && !dma_pte_superpage(pte))
                                dma_pte_list_pagetables(domain, level - 1, pte, freelist);

                        dma_clear_pte(pte);
                        if (!first_pte)
                                first_pte = pte;
                        last_pte = pte;
                } else if (level > 1) {
                        /* Recurse down into a level that isn't *entirely* obsolete */
                        dma_pte_clear_level(domain, level - 1,
                                            phys_to_virt(dma_pte_addr(pte)),
                                            level_pfn, start_pfn, last_pfn,
                                            freelist);
                }
next:
                pfn = level_pfn + level_size(level);
        } while (!first_pte_in_page(++pte) && pfn <= last_pfn);

        if (first_pte)
                domain_flush_cache(domain, first_pte,
                                   (void *)++last_pte - (void *)first_pte);
}

/* We can't just free the pages because the IOMMU may still be walking
   the page tables, and may have cached the intermediate levels. The
   pages can only be freed after the IOTLB flush has been done.
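   Callers therefore gather the pages on a freelist and only hand them to
   iommu_put_pages_list() once that is safe, e.g. after the flush or, as in
   domain_exit(), once the domain is no longer attached to any IOMMU.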
*/ 1077 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn, 1078 unsigned long last_pfn, struct list_head *freelist) 1079 { 1080 if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) || 1081 WARN_ON(start_pfn > last_pfn)) 1082 return; 1083 1084 /* we don't need lock here; nobody else touches the iova range */ 1085 dma_pte_clear_level(domain, agaw_to_level(domain->agaw), 1086 domain->pgd, 0, start_pfn, last_pfn, freelist); 1087 1088 /* free pgd */ 1089 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) { 1090 struct page *pgd_page = virt_to_page(domain->pgd); 1091 list_add_tail(&pgd_page->lru, freelist); 1092 domain->pgd = NULL; 1093 } 1094 } 1095 1096 /* iommu handling */ 1097 static int iommu_alloc_root_entry(struct intel_iommu *iommu) 1098 { 1099 struct root_entry *root; 1100 1101 root = iommu_alloc_page_node(iommu->node, GFP_ATOMIC); 1102 if (!root) { 1103 pr_err("Allocating root entry for %s failed\n", 1104 iommu->name); 1105 return -ENOMEM; 1106 } 1107 1108 __iommu_flush_cache(iommu, root, ROOT_SIZE); 1109 iommu->root_entry = root; 1110 1111 return 0; 1112 } 1113 1114 static void iommu_set_root_entry(struct intel_iommu *iommu) 1115 { 1116 u64 addr; 1117 u32 sts; 1118 unsigned long flag; 1119 1120 addr = virt_to_phys(iommu->root_entry); 1121 if (sm_supported(iommu)) 1122 addr |= DMA_RTADDR_SMT; 1123 1124 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1125 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr); 1126 1127 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG); 1128 1129 /* Make sure hardware complete it */ 1130 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1131 readl, (sts & DMA_GSTS_RTPS), sts); 1132 1133 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1134 1135 /* 1136 * Hardware invalidates all DMA remapping hardware translation 1137 * caches as part of SRTP flow. 
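         * Hardware without the ESRTPS capability does not, so in that case
         * the global context-cache, PASID-cache and IOTLB invalidations are
         * issued explicitly below.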
1138 */ 1139 if (cap_esrtps(iommu->cap)) 1140 return; 1141 1142 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL); 1143 if (sm_supported(iommu)) 1144 qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0); 1145 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); 1146 } 1147 1148 void iommu_flush_write_buffer(struct intel_iommu *iommu) 1149 { 1150 u32 val; 1151 unsigned long flag; 1152 1153 if (!rwbf_quirk && !cap_rwbf(iommu->cap)) 1154 return; 1155 1156 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1157 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG); 1158 1159 /* Make sure hardware complete it */ 1160 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1161 readl, (!(val & DMA_GSTS_WBFS)), val); 1162 1163 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1164 } 1165 1166 /* return value determine if we need a write buffer flush */ 1167 static void __iommu_flush_context(struct intel_iommu *iommu, 1168 u16 did, u16 source_id, u8 function_mask, 1169 u64 type) 1170 { 1171 u64 val = 0; 1172 unsigned long flag; 1173 1174 switch (type) { 1175 case DMA_CCMD_GLOBAL_INVL: 1176 val = DMA_CCMD_GLOBAL_INVL; 1177 break; 1178 case DMA_CCMD_DOMAIN_INVL: 1179 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did); 1180 break; 1181 case DMA_CCMD_DEVICE_INVL: 1182 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did) 1183 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask); 1184 break; 1185 default: 1186 pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n", 1187 iommu->name, type); 1188 return; 1189 } 1190 val |= DMA_CCMD_ICC; 1191 1192 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1193 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val); 1194 1195 /* Make sure hardware complete it */ 1196 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG, 1197 dmar_readq, (!(val & DMA_CCMD_ICC)), val); 1198 1199 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1200 } 1201 1202 /* return value determine if we need a write buffer flush */ 1203 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, 1204 u64 addr, unsigned int size_order, u64 type) 1205 { 1206 int tlb_offset = ecap_iotlb_offset(iommu->ecap); 1207 u64 val = 0, val_iva = 0; 1208 unsigned long flag; 1209 1210 switch (type) { 1211 case DMA_TLB_GLOBAL_FLUSH: 1212 /* global flush doesn't need set IVA_REG */ 1213 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT; 1214 break; 1215 case DMA_TLB_DSI_FLUSH: 1216 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did); 1217 break; 1218 case DMA_TLB_PSI_FLUSH: 1219 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did); 1220 /* IH bit is passed in as part of address */ 1221 val_iva = size_order | addr; 1222 break; 1223 default: 1224 pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n", 1225 iommu->name, type); 1226 return; 1227 } 1228 1229 if (cap_write_drain(iommu->cap)) 1230 val |= DMA_TLB_WRITE_DRAIN; 1231 1232 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1233 /* Note: Only uses first TLB reg currently */ 1234 if (val_iva) 1235 dmar_writeq(iommu->reg + tlb_offset, val_iva); 1236 dmar_writeq(iommu->reg + tlb_offset + 8, val); 1237 1238 /* Make sure hardware complete it */ 1239 IOMMU_WAIT_OP(iommu, tlb_offset + 8, 1240 dmar_readq, (!(val & DMA_TLB_IVT)), val); 1241 1242 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1243 1244 /* check IOTLB invalidation granularity */ 1245 if (DMA_TLB_IAIG(val) == 0) 1246 pr_err("Flush IOTLB failed\n"); 1247 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type)) 1248 pr_debug("TLB flush request %Lx, actual %Lx\n", 1249 (unsigned long long)DMA_TLB_IIRG(type), 
1250 (unsigned long long)DMA_TLB_IAIG(val)); 1251 } 1252 1253 static struct device_domain_info * 1254 domain_lookup_dev_info(struct dmar_domain *domain, 1255 struct intel_iommu *iommu, u8 bus, u8 devfn) 1256 { 1257 struct device_domain_info *info; 1258 unsigned long flags; 1259 1260 spin_lock_irqsave(&domain->lock, flags); 1261 list_for_each_entry(info, &domain->devices, link) { 1262 if (info->iommu == iommu && info->bus == bus && 1263 info->devfn == devfn) { 1264 spin_unlock_irqrestore(&domain->lock, flags); 1265 return info; 1266 } 1267 } 1268 spin_unlock_irqrestore(&domain->lock, flags); 1269 1270 return NULL; 1271 } 1272 1273 void domain_update_iotlb(struct dmar_domain *domain) 1274 { 1275 struct dev_pasid_info *dev_pasid; 1276 struct device_domain_info *info; 1277 bool has_iotlb_device = false; 1278 unsigned long flags; 1279 1280 spin_lock_irqsave(&domain->lock, flags); 1281 list_for_each_entry(info, &domain->devices, link) { 1282 if (info->ats_enabled) { 1283 has_iotlb_device = true; 1284 break; 1285 } 1286 } 1287 1288 list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) { 1289 info = dev_iommu_priv_get(dev_pasid->dev); 1290 if (info->ats_enabled) { 1291 has_iotlb_device = true; 1292 break; 1293 } 1294 } 1295 domain->has_iotlb_device = has_iotlb_device; 1296 spin_unlock_irqrestore(&domain->lock, flags); 1297 } 1298 1299 /* 1300 * The extra devTLB flush quirk impacts those QAT devices with PCI device 1301 * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device() 1302 * check because it applies only to the built-in QAT devices and it doesn't 1303 * grant additional privileges. 1304 */ 1305 #define BUGGY_QAT_DEVID_MASK 0x4940 1306 static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev) 1307 { 1308 if (pdev->vendor != PCI_VENDOR_ID_INTEL) 1309 return false; 1310 1311 if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK) 1312 return false; 1313 1314 return true; 1315 } 1316 1317 static void iommu_enable_pci_caps(struct device_domain_info *info) 1318 { 1319 struct pci_dev *pdev; 1320 1321 if (!dev_is_pci(info->dev)) 1322 return; 1323 1324 pdev = to_pci_dev(info->dev); 1325 1326 /* The PCIe spec, in its wisdom, declares that the behaviour of 1327 the device if you enable PASID support after ATS support is 1328 undefined. So always enable PASID support on devices which 1329 have it, even if we can't yet know if we're ever going to 1330 use it. 
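           Enabling the capability here does not by itself start using it;
           when ATS is actually enabled below, domain_update_iotlb() records
           that the domain now has a device IOTLB that must be flushed.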
*/ 1331 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1)) 1332 info->pasid_enabled = 1; 1333 1334 if (info->ats_supported && pci_ats_page_aligned(pdev) && 1335 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) { 1336 info->ats_enabled = 1; 1337 domain_update_iotlb(info->domain); 1338 } 1339 } 1340 1341 static void iommu_disable_pci_caps(struct device_domain_info *info) 1342 { 1343 struct pci_dev *pdev; 1344 1345 if (!dev_is_pci(info->dev)) 1346 return; 1347 1348 pdev = to_pci_dev(info->dev); 1349 1350 if (info->ats_enabled) { 1351 pci_disable_ats(pdev); 1352 info->ats_enabled = 0; 1353 domain_update_iotlb(info->domain); 1354 } 1355 1356 if (info->pasid_enabled) { 1357 pci_disable_pasid(pdev); 1358 info->pasid_enabled = 0; 1359 } 1360 } 1361 1362 static void intel_flush_iotlb_all(struct iommu_domain *domain) 1363 { 1364 cache_tag_flush_all(to_dmar_domain(domain)); 1365 } 1366 1367 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu) 1368 { 1369 u32 pmen; 1370 unsigned long flags; 1371 1372 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap)) 1373 return; 1374 1375 raw_spin_lock_irqsave(&iommu->register_lock, flags); 1376 pmen = readl(iommu->reg + DMAR_PMEN_REG); 1377 pmen &= ~DMA_PMEN_EPM; 1378 writel(pmen, iommu->reg + DMAR_PMEN_REG); 1379 1380 /* wait for the protected region status bit to clear */ 1381 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG, 1382 readl, !(pmen & DMA_PMEN_PRS), pmen); 1383 1384 raw_spin_unlock_irqrestore(&iommu->register_lock, flags); 1385 } 1386 1387 static void iommu_enable_translation(struct intel_iommu *iommu) 1388 { 1389 u32 sts; 1390 unsigned long flags; 1391 1392 raw_spin_lock_irqsave(&iommu->register_lock, flags); 1393 iommu->gcmd |= DMA_GCMD_TE; 1394 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); 1395 1396 /* Make sure hardware complete it */ 1397 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1398 readl, (sts & DMA_GSTS_TES), sts); 1399 1400 raw_spin_unlock_irqrestore(&iommu->register_lock, flags); 1401 } 1402 1403 static void iommu_disable_translation(struct intel_iommu *iommu) 1404 { 1405 u32 sts; 1406 unsigned long flag; 1407 1408 if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated && 1409 (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap))) 1410 return; 1411 1412 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1413 iommu->gcmd &= ~DMA_GCMD_TE; 1414 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); 1415 1416 /* Make sure hardware complete it */ 1417 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1418 readl, (!(sts & DMA_GSTS_TES)), sts); 1419 1420 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1421 } 1422 1423 static int iommu_init_domains(struct intel_iommu *iommu) 1424 { 1425 u32 ndomains; 1426 1427 ndomains = cap_ndoms(iommu->cap); 1428 pr_debug("%s: Number of Domains supported <%d>\n", 1429 iommu->name, ndomains); 1430 1431 spin_lock_init(&iommu->lock); 1432 1433 iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL); 1434 if (!iommu->domain_ids) 1435 return -ENOMEM; 1436 1437 /* 1438 * If Caching mode is set, then invalid translations are tagged 1439 * with domain-id 0, hence we need to pre-allocate it. We also 1440 * use domain-id 0 as a marker for non-allocated domain-id, so 1441 * make sure it is not used for a real domain. 
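         * (disable_dmar_iommu() relies on this later: it treats up to
         * NUM_RESERVED_DID set bits in domain_ids as "no real domains are
         * still attached".)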
1442 */ 1443 set_bit(0, iommu->domain_ids); 1444 1445 /* 1446 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid 1447 * entry for first-level or pass-through translation modes should 1448 * be programmed with a domain id different from those used for 1449 * second-level or nested translation. We reserve a domain id for 1450 * this purpose. 1451 */ 1452 if (sm_supported(iommu)) 1453 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids); 1454 1455 return 0; 1456 } 1457 1458 static void disable_dmar_iommu(struct intel_iommu *iommu) 1459 { 1460 if (!iommu->domain_ids) 1461 return; 1462 1463 /* 1464 * All iommu domains must have been detached from the devices, 1465 * hence there should be no domain IDs in use. 1466 */ 1467 if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap)) 1468 > NUM_RESERVED_DID)) 1469 return; 1470 1471 if (iommu->gcmd & DMA_GCMD_TE) 1472 iommu_disable_translation(iommu); 1473 } 1474 1475 static void free_dmar_iommu(struct intel_iommu *iommu) 1476 { 1477 if (iommu->domain_ids) { 1478 bitmap_free(iommu->domain_ids); 1479 iommu->domain_ids = NULL; 1480 } 1481 1482 if (iommu->copied_tables) { 1483 bitmap_free(iommu->copied_tables); 1484 iommu->copied_tables = NULL; 1485 } 1486 1487 /* free context mapping */ 1488 free_context_table(iommu); 1489 1490 #ifdef CONFIG_INTEL_IOMMU_SVM 1491 if (pasid_supported(iommu)) { 1492 if (ecap_prs(iommu->ecap)) 1493 intel_svm_finish_prq(iommu); 1494 } 1495 #endif 1496 } 1497 1498 /* 1499 * Check and return whether first level is used by default for 1500 * DMA translation. 1501 */ 1502 static bool first_level_by_default(unsigned int type) 1503 { 1504 /* Only SL is available in legacy mode */ 1505 if (!scalable_mode_support()) 1506 return false; 1507 1508 /* Only level (either FL or SL) is available, just use it */ 1509 if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity()) 1510 return intel_cap_flts_sanity(); 1511 1512 /* Both levels are available, decide it based on domain type */ 1513 return type != IOMMU_DOMAIN_UNMANAGED; 1514 } 1515 1516 static struct dmar_domain *alloc_domain(unsigned int type) 1517 { 1518 struct dmar_domain *domain; 1519 1520 domain = kzalloc(sizeof(*domain), GFP_KERNEL); 1521 if (!domain) 1522 return NULL; 1523 1524 domain->nid = NUMA_NO_NODE; 1525 if (first_level_by_default(type)) 1526 domain->use_first_level = true; 1527 domain->has_iotlb_device = false; 1528 INIT_LIST_HEAD(&domain->devices); 1529 INIT_LIST_HEAD(&domain->dev_pasids); 1530 INIT_LIST_HEAD(&domain->cache_tags); 1531 spin_lock_init(&domain->lock); 1532 spin_lock_init(&domain->cache_lock); 1533 xa_init(&domain->iommu_array); 1534 1535 return domain; 1536 } 1537 1538 int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) 1539 { 1540 struct iommu_domain_info *info, *curr; 1541 unsigned long ndomains; 1542 int num, ret = -ENOSPC; 1543 1544 if (domain->domain.type == IOMMU_DOMAIN_SVA) 1545 return 0; 1546 1547 info = kzalloc(sizeof(*info), GFP_KERNEL); 1548 if (!info) 1549 return -ENOMEM; 1550 1551 spin_lock(&iommu->lock); 1552 curr = xa_load(&domain->iommu_array, iommu->seq_id); 1553 if (curr) { 1554 curr->refcnt++; 1555 spin_unlock(&iommu->lock); 1556 kfree(info); 1557 return 0; 1558 } 1559 1560 ndomains = cap_ndoms(iommu->cap); 1561 num = find_first_zero_bit(iommu->domain_ids, ndomains); 1562 if (num >= ndomains) { 1563 pr_err("%s: No free domain ids\n", iommu->name); 1564 goto err_unlock; 1565 } 1566 1567 set_bit(num, iommu->domain_ids); 1568 info->refcnt = 1; 1569 info->did = num; 1570 info->iommu = iommu; 1571 curr = 
xa_cmpxchg(&domain->iommu_array, iommu->seq_id, 1572 NULL, info, GFP_ATOMIC); 1573 if (curr) { 1574 ret = xa_err(curr) ? : -EBUSY; 1575 goto err_clear; 1576 } 1577 domain_update_iommu_cap(domain); 1578 1579 spin_unlock(&iommu->lock); 1580 return 0; 1581 1582 err_clear: 1583 clear_bit(info->did, iommu->domain_ids); 1584 err_unlock: 1585 spin_unlock(&iommu->lock); 1586 kfree(info); 1587 return ret; 1588 } 1589 1590 void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) 1591 { 1592 struct iommu_domain_info *info; 1593 1594 if (domain->domain.type == IOMMU_DOMAIN_SVA) 1595 return; 1596 1597 spin_lock(&iommu->lock); 1598 info = xa_load(&domain->iommu_array, iommu->seq_id); 1599 if (--info->refcnt == 0) { 1600 clear_bit(info->did, iommu->domain_ids); 1601 xa_erase(&domain->iommu_array, iommu->seq_id); 1602 domain->nid = NUMA_NO_NODE; 1603 domain_update_iommu_cap(domain); 1604 kfree(info); 1605 } 1606 spin_unlock(&iommu->lock); 1607 } 1608 1609 static int guestwidth_to_adjustwidth(int gaw) 1610 { 1611 int agaw; 1612 int r = (gaw - 12) % 9; 1613 1614 if (r == 0) 1615 agaw = gaw; 1616 else 1617 agaw = gaw + 9 - r; 1618 if (agaw > 64) 1619 agaw = 64; 1620 return agaw; 1621 } 1622 1623 static void domain_exit(struct dmar_domain *domain) 1624 { 1625 if (domain->pgd) { 1626 LIST_HEAD(freelist); 1627 1628 domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist); 1629 iommu_put_pages_list(&freelist); 1630 } 1631 1632 if (WARN_ON(!list_empty(&domain->devices))) 1633 return; 1634 1635 kfree(domain); 1636 } 1637 1638 static int domain_context_mapping_one(struct dmar_domain *domain, 1639 struct intel_iommu *iommu, 1640 u8 bus, u8 devfn) 1641 { 1642 struct device_domain_info *info = 1643 domain_lookup_dev_info(domain, iommu, bus, devfn); 1644 u16 did = domain_id_iommu(domain, iommu); 1645 int translation = CONTEXT_TT_MULTI_LEVEL; 1646 struct dma_pte *pgd = domain->pgd; 1647 struct context_entry *context; 1648 int agaw, ret; 1649 1650 if (hw_pass_through && domain_type_is_si(domain)) 1651 translation = CONTEXT_TT_PASS_THROUGH; 1652 1653 pr_debug("Set context mapping for %02x:%02x.%d\n", 1654 bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); 1655 1656 spin_lock(&iommu->lock); 1657 ret = -ENOMEM; 1658 context = iommu_context_addr(iommu, bus, devfn, 1); 1659 if (!context) 1660 goto out_unlock; 1661 1662 ret = 0; 1663 if (context_present(context) && !context_copied(iommu, bus, devfn)) 1664 goto out_unlock; 1665 1666 /* 1667 * For kdump cases, old valid entries may be cached due to the 1668 * in-flight DMA and copied pgtable, but there is no unmapping 1669 * behaviour for them, thus we need an explicit cache flush for 1670 * the newly-mapped device. For kdump, at this point, the device 1671 * is supposed to finish reset at its driver probe stage, so no 1672 * in-flight DMA will exist, and we don't need to worry anymore 1673 * hereafter. 1674 */ 1675 if (context_copied(iommu, bus, devfn)) { 1676 u16 did_old = context_domain_id(context); 1677 1678 if (did_old < cap_ndoms(iommu->cap)) { 1679 iommu->flush.flush_context(iommu, did_old, 1680 (((u16)bus) << 8) | devfn, 1681 DMA_CCMD_MASK_NOBIT, 1682 DMA_CCMD_DEVICE_INVL); 1683 iommu->flush.flush_iotlb(iommu, did_old, 0, 0, 1684 DMA_TLB_DSI_FLUSH); 1685 } 1686 1687 clear_context_copied(iommu, bus, devfn); 1688 } 1689 1690 context_clear_entry(context); 1691 context_set_domain_id(context, did); 1692 1693 if (translation != CONTEXT_TT_PASS_THROUGH) { 1694 /* 1695 * Skip top levels of page tables for iommu which has 1696 * less agaw than default. 
Unnecessary for PT mode. 1697 */ 1698 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { 1699 ret = -ENOMEM; 1700 pgd = phys_to_virt(dma_pte_addr(pgd)); 1701 if (!dma_pte_present(pgd)) 1702 goto out_unlock; 1703 } 1704 1705 if (info && info->ats_supported) 1706 translation = CONTEXT_TT_DEV_IOTLB; 1707 else 1708 translation = CONTEXT_TT_MULTI_LEVEL; 1709 1710 context_set_address_root(context, virt_to_phys(pgd)); 1711 context_set_address_width(context, agaw); 1712 } else { 1713 /* 1714 * In pass through mode, AW must be programmed to 1715 * indicate the largest AGAW value supported by 1716 * hardware. And ASR is ignored by hardware. 1717 */ 1718 context_set_address_width(context, iommu->msagaw); 1719 } 1720 1721 context_set_translation_type(context, translation); 1722 context_set_fault_enable(context); 1723 context_set_present(context); 1724 if (!ecap_coherent(iommu->ecap)) 1725 clflush_cache_range(context, sizeof(*context)); 1726 1727 /* 1728 * It's a non-present to present mapping. If hardware doesn't cache 1729 * non-present entry we only need to flush the write-buffer. If the 1730 * _does_ cache non-present entries, then it does so in the special 1731 * domain #0, which we have to flush: 1732 */ 1733 if (cap_caching_mode(iommu->cap)) { 1734 iommu->flush.flush_context(iommu, 0, 1735 (((u16)bus) << 8) | devfn, 1736 DMA_CCMD_MASK_NOBIT, 1737 DMA_CCMD_DEVICE_INVL); 1738 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); 1739 } else { 1740 iommu_flush_write_buffer(iommu); 1741 } 1742 1743 ret = 0; 1744 1745 out_unlock: 1746 spin_unlock(&iommu->lock); 1747 1748 return ret; 1749 } 1750 1751 static int domain_context_mapping_cb(struct pci_dev *pdev, 1752 u16 alias, void *opaque) 1753 { 1754 struct device_domain_info *info = dev_iommu_priv_get(&pdev->dev); 1755 struct intel_iommu *iommu = info->iommu; 1756 struct dmar_domain *domain = opaque; 1757 1758 return domain_context_mapping_one(domain, iommu, 1759 PCI_BUS_NUM(alias), alias & 0xff); 1760 } 1761 1762 static int 1763 domain_context_mapping(struct dmar_domain *domain, struct device *dev) 1764 { 1765 struct device_domain_info *info = dev_iommu_priv_get(dev); 1766 struct intel_iommu *iommu = info->iommu; 1767 u8 bus = info->bus, devfn = info->devfn; 1768 1769 if (!dev_is_pci(dev)) 1770 return domain_context_mapping_one(domain, iommu, bus, devfn); 1771 1772 return pci_for_each_dma_alias(to_pci_dev(dev), 1773 domain_context_mapping_cb, domain); 1774 } 1775 1776 /* Return largest possible superpage level for a given mapping */ 1777 static int hardware_largepage_caps(struct dmar_domain *domain, unsigned long iov_pfn, 1778 unsigned long phy_pfn, unsigned long pages) 1779 { 1780 int support, level = 1; 1781 unsigned long pfnmerge; 1782 1783 support = domain->iommu_superpage; 1784 1785 /* To use a large page, the virtual *and* physical addresses 1786 must be aligned to 2MiB/1GiB/etc. Lower bits set in either 1787 of them will mean we have to use smaller pages. So just 1788 merge them and check both at once. */ 1789 pfnmerge = iov_pfn | phy_pfn; 1790 1791 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) { 1792 pages >>= VTD_STRIDE_SHIFT; 1793 if (!pages) 1794 break; 1795 pfnmerge >>= VTD_STRIDE_SHIFT; 1796 level++; 1797 support--; 1798 } 1799 return level; 1800 } 1801 1802 /* 1803 * Ensure that old small page tables are removed to make room for superpage(s). 1804 * We're going to add new large pages, so make sure we don't remove their parent 1805 * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared. 
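 * The flush below goes through cache_tag_flush_range(), so both the IOTLB
 * and any enabled device TLBs drop the cleared PDEs/PTEs before the caller
 * writes the new superpage entry.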
1806 */ 1807 static void switch_to_super_page(struct dmar_domain *domain, 1808 unsigned long start_pfn, 1809 unsigned long end_pfn, int level) 1810 { 1811 unsigned long lvl_pages = lvl_to_nr_pages(level); 1812 struct dma_pte *pte = NULL; 1813 1814 while (start_pfn <= end_pfn) { 1815 if (!pte) 1816 pte = pfn_to_dma_pte(domain, start_pfn, &level, 1817 GFP_ATOMIC); 1818 1819 if (dma_pte_present(pte)) { 1820 dma_pte_free_pagetable(domain, start_pfn, 1821 start_pfn + lvl_pages - 1, 1822 level + 1); 1823 1824 cache_tag_flush_range(domain, start_pfn << VTD_PAGE_SHIFT, 1825 end_pfn << VTD_PAGE_SHIFT, 0); 1826 } 1827 1828 pte++; 1829 start_pfn += lvl_pages; 1830 if (first_pte_in_page(pte)) 1831 pte = NULL; 1832 } 1833 } 1834 1835 static int 1836 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, 1837 unsigned long phys_pfn, unsigned long nr_pages, int prot, 1838 gfp_t gfp) 1839 { 1840 struct dma_pte *first_pte = NULL, *pte = NULL; 1841 unsigned int largepage_lvl = 0; 1842 unsigned long lvl_pages = 0; 1843 phys_addr_t pteval; 1844 u64 attr; 1845 1846 if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1))) 1847 return -EINVAL; 1848 1849 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0) 1850 return -EINVAL; 1851 1852 if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) { 1853 pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n"); 1854 return -EINVAL; 1855 } 1856 1857 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP); 1858 attr |= DMA_FL_PTE_PRESENT; 1859 if (domain->use_first_level) { 1860 attr |= DMA_FL_PTE_US | DMA_FL_PTE_ACCESS; 1861 if (prot & DMA_PTE_WRITE) 1862 attr |= DMA_FL_PTE_DIRTY; 1863 } 1864 1865 domain->has_mappings = true; 1866 1867 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr; 1868 1869 while (nr_pages > 0) { 1870 uint64_t tmp; 1871 1872 if (!pte) { 1873 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, 1874 phys_pfn, nr_pages); 1875 1876 pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl, 1877 gfp); 1878 if (!pte) 1879 return -ENOMEM; 1880 first_pte = pte; 1881 1882 lvl_pages = lvl_to_nr_pages(largepage_lvl); 1883 1884 /* It is large page*/ 1885 if (largepage_lvl > 1) { 1886 unsigned long end_pfn; 1887 unsigned long pages_to_remove; 1888 1889 pteval |= DMA_PTE_LARGE_PAGE; 1890 pages_to_remove = min_t(unsigned long, nr_pages, 1891 nr_pte_to_next_page(pte) * lvl_pages); 1892 end_pfn = iov_pfn + pages_to_remove - 1; 1893 switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl); 1894 } else { 1895 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE; 1896 } 1897 1898 } 1899 /* We don't need lock here, nobody else 1900 * touches the iova range 1901 */ 1902 tmp = 0ULL; 1903 if (!try_cmpxchg64_local(&pte->val, &tmp, pteval)) { 1904 static int dumps = 5; 1905 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n", 1906 iov_pfn, tmp, (unsigned long long)pteval); 1907 if (dumps) { 1908 dumps--; 1909 debug_dma_dump_mappings(NULL); 1910 } 1911 WARN_ON(1); 1912 } 1913 1914 nr_pages -= lvl_pages; 1915 iov_pfn += lvl_pages; 1916 phys_pfn += lvl_pages; 1917 pteval += lvl_pages * VTD_PAGE_SIZE; 1918 1919 /* If the next PTE would be the first in a new page, then we 1920 * need to flush the cache on the entries we've just written. 1921 * And then we'll need to recalculate 'pte', so clear it and 1922 * let it get set again in the if (!pte) block above. 1923 * 1924 * If we're done (!nr_pages) we need to flush the cache too. 
1925 * 1926 * Also if we've been setting superpages, we may need to 1927 * recalculate 'pte' and switch back to smaller pages for the 1928 * end of the mapping, if the trailing size is not enough to 1929 * use another superpage (i.e. nr_pages < lvl_pages). 1930 */ 1931 pte++; 1932 if (!nr_pages || first_pte_in_page(pte) || 1933 (largepage_lvl > 1 && nr_pages < lvl_pages)) { 1934 domain_flush_cache(domain, first_pte, 1935 (void *)pte - (void *)first_pte); 1936 pte = NULL; 1937 } 1938 } 1939 1940 return 0; 1941 } 1942 1943 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn) 1944 { 1945 struct intel_iommu *iommu = info->iommu; 1946 struct context_entry *context; 1947 1948 spin_lock(&iommu->lock); 1949 context = iommu_context_addr(iommu, bus, devfn, 0); 1950 if (!context) { 1951 spin_unlock(&iommu->lock); 1952 return; 1953 } 1954 1955 context_clear_entry(context); 1956 __iommu_flush_cache(iommu, context, sizeof(*context)); 1957 spin_unlock(&iommu->lock); 1958 intel_context_flush_present(info, context, true); 1959 } 1960 1961 static int domain_setup_first_level(struct intel_iommu *iommu, 1962 struct dmar_domain *domain, 1963 struct device *dev, 1964 u32 pasid) 1965 { 1966 struct dma_pte *pgd = domain->pgd; 1967 int agaw, level; 1968 int flags = 0; 1969 1970 /* 1971 * Skip top levels of page tables for iommu which has 1972 * less agaw than default. Unnecessary for PT mode. 1973 */ 1974 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { 1975 pgd = phys_to_virt(dma_pte_addr(pgd)); 1976 if (!dma_pte_present(pgd)) 1977 return -ENOMEM; 1978 } 1979 1980 level = agaw_to_level(agaw); 1981 if (level != 4 && level != 5) 1982 return -EINVAL; 1983 1984 if (level == 5) 1985 flags |= PASID_FLAG_FL5LP; 1986 1987 if (domain->force_snooping) 1988 flags |= PASID_FLAG_PAGE_SNOOP; 1989 1990 return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid, 1991 domain_id_iommu(domain, iommu), 1992 flags); 1993 } 1994 1995 static bool dev_is_real_dma_subdevice(struct device *dev) 1996 { 1997 return dev && dev_is_pci(dev) && 1998 pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev); 1999 } 2000 2001 static int iommu_domain_identity_map(struct dmar_domain *domain, 2002 unsigned long first_vpfn, 2003 unsigned long last_vpfn) 2004 { 2005 /* 2006 * RMRR range might have overlap with physical memory range, 2007 * clear it first 2008 */ 2009 dma_pte_clear_range(domain, first_vpfn, last_vpfn); 2010 2011 return __domain_mapping(domain, first_vpfn, 2012 first_vpfn, last_vpfn - first_vpfn + 1, 2013 DMA_PTE_READ|DMA_PTE_WRITE, GFP_KERNEL); 2014 } 2015 2016 static int md_domain_init(struct dmar_domain *domain, int guest_width); 2017 2018 static int __init si_domain_init(int hw) 2019 { 2020 struct dmar_rmrr_unit *rmrr; 2021 struct device *dev; 2022 int i, nid, ret; 2023 2024 si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY); 2025 if (!si_domain) 2026 return -EFAULT; 2027 2028 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { 2029 domain_exit(si_domain); 2030 si_domain = NULL; 2031 return -EFAULT; 2032 } 2033 2034 if (hw) 2035 return 0; 2036 2037 for_each_online_node(nid) { 2038 unsigned long start_pfn, end_pfn; 2039 int i; 2040 2041 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 2042 ret = iommu_domain_identity_map(si_domain, 2043 mm_to_dma_pfn_start(start_pfn), 2044 mm_to_dma_pfn_end(end_pfn-1)); 2045 if (ret) 2046 return ret; 2047 } 2048 } 2049 2050 /* 2051 * Identity map the RMRRs so that devices with RMRRs could also use 2052 * the si_domain. 
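         * The base/end addresses come straight from the ACPI RMRR records; a
         * range that is inverted or exceeds the domain's address width is
         * skipped with a warning instead of being mapped.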
2053 */ 2054 for_each_rmrr_units(rmrr) { 2055 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt, 2056 i, dev) { 2057 unsigned long long start = rmrr->base_address; 2058 unsigned long long end = rmrr->end_address; 2059 2060 if (WARN_ON(end < start || 2061 end >> agaw_to_width(si_domain->agaw))) 2062 continue; 2063 2064 ret = iommu_domain_identity_map(si_domain, 2065 mm_to_dma_pfn_start(start >> PAGE_SHIFT), 2066 mm_to_dma_pfn_end(end >> PAGE_SHIFT)); 2067 if (ret) 2068 return ret; 2069 } 2070 } 2071 2072 return 0; 2073 } 2074 2075 static int dmar_domain_attach_device(struct dmar_domain *domain, 2076 struct device *dev) 2077 { 2078 struct device_domain_info *info = dev_iommu_priv_get(dev); 2079 struct intel_iommu *iommu = info->iommu; 2080 unsigned long flags; 2081 int ret; 2082 2083 ret = domain_attach_iommu(domain, iommu); 2084 if (ret) 2085 return ret; 2086 2087 info->domain = domain; 2088 spin_lock_irqsave(&domain->lock, flags); 2089 list_add(&info->link, &domain->devices); 2090 spin_unlock_irqrestore(&domain->lock, flags); 2091 2092 if (dev_is_real_dma_subdevice(dev)) 2093 return 0; 2094 2095 if (!sm_supported(iommu)) 2096 ret = domain_context_mapping(domain, dev); 2097 else if (hw_pass_through && domain_type_is_si(domain)) 2098 ret = intel_pasid_setup_pass_through(iommu, dev, IOMMU_NO_PASID); 2099 else if (domain->use_first_level) 2100 ret = domain_setup_first_level(iommu, domain, dev, IOMMU_NO_PASID); 2101 else 2102 ret = intel_pasid_setup_second_level(iommu, domain, dev, IOMMU_NO_PASID); 2103 2104 if (ret) 2105 goto out_block_translation; 2106 2107 if (sm_supported(info->iommu) || !domain_type_is_si(info->domain)) 2108 iommu_enable_pci_caps(info); 2109 2110 ret = cache_tag_assign_domain(domain, dev, IOMMU_NO_PASID); 2111 if (ret) 2112 goto out_block_translation; 2113 2114 return 0; 2115 2116 out_block_translation: 2117 device_block_translation(dev); 2118 return ret; 2119 } 2120 2121 /** 2122 * device_rmrr_is_relaxable - Test whether the RMRR of this device 2123 * is relaxable (ie. is allowed to be not enforced under some conditions) 2124 * @dev: device handle 2125 * 2126 * We assume that PCI USB devices with RMRRs have them largely 2127 * for historical reasons and that the RMRR space is not actively used post 2128 * boot. This exclusion may change if vendors begin to abuse it. 2129 * 2130 * The same exception is made for graphics devices, with the requirement that 2131 * any use of the RMRR regions will be torn down before assigning the device 2132 * to a guest. 2133 * 2134 * Return: true if the RMRR is relaxable, false otherwise 2135 */ 2136 static bool device_rmrr_is_relaxable(struct device *dev) 2137 { 2138 struct pci_dev *pdev; 2139 2140 if (!dev_is_pci(dev)) 2141 return false; 2142 2143 pdev = to_pci_dev(dev); 2144 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev)) 2145 return true; 2146 else 2147 return false; 2148 } 2149 2150 static int device_def_domain_type(struct device *dev) 2151 { 2152 if (dev_is_pci(dev)) { 2153 struct pci_dev *pdev = to_pci_dev(dev); 2154 2155 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev)) 2156 return IOMMU_DOMAIN_IDENTITY; 2157 } 2158 2159 return 0; 2160 } 2161 2162 static void intel_iommu_init_qi(struct intel_iommu *iommu) 2163 { 2164 /* 2165 * Start from the sane iommu hardware state. 2166 * If the queued invalidation is already initialized by us 2167 * (for example, while enabling interrupt-remapping) then 2168 * we got the things already rolling from a sane state. 
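 * Otherwise clear any stale faults and make sure QI is off before
 * (re-)enabling it; if enabling fails we fall back to the legacy
 * register based invalidation interface below.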
2169 */ 2170 if (!iommu->qi) { 2171 /* 2172 * Clear any previous faults. 2173 */ 2174 dmar_fault(-1, iommu); 2175 /* 2176 * Disable queued invalidation if supported and already enabled 2177 * before OS handover. 2178 */ 2179 dmar_disable_qi(iommu); 2180 } 2181 2182 if (dmar_enable_qi(iommu)) { 2183 /* 2184 * Queued Invalidate not enabled, use Register Based Invalidate 2185 */ 2186 iommu->flush.flush_context = __iommu_flush_context; 2187 iommu->flush.flush_iotlb = __iommu_flush_iotlb; 2188 pr_info("%s: Using Register based invalidation\n", 2189 iommu->name); 2190 } else { 2191 iommu->flush.flush_context = qi_flush_context; 2192 iommu->flush.flush_iotlb = qi_flush_iotlb; 2193 pr_info("%s: Using Queued invalidation\n", iommu->name); 2194 } 2195 } 2196 2197 static int copy_context_table(struct intel_iommu *iommu, 2198 struct root_entry *old_re, 2199 struct context_entry **tbl, 2200 int bus, bool ext) 2201 { 2202 int tbl_idx, pos = 0, idx, devfn, ret = 0, did; 2203 struct context_entry *new_ce = NULL, ce; 2204 struct context_entry *old_ce = NULL; 2205 struct root_entry re; 2206 phys_addr_t old_ce_phys; 2207 2208 tbl_idx = ext ? bus * 2 : bus; 2209 memcpy(&re, old_re, sizeof(re)); 2210 2211 for (devfn = 0; devfn < 256; devfn++) { 2212 /* First calculate the correct index */ 2213 idx = (ext ? devfn * 2 : devfn) % 256; 2214 2215 if (idx == 0) { 2216 /* First save what we may have and clean up */ 2217 if (new_ce) { 2218 tbl[tbl_idx] = new_ce; 2219 __iommu_flush_cache(iommu, new_ce, 2220 VTD_PAGE_SIZE); 2221 pos = 1; 2222 } 2223 2224 if (old_ce) 2225 memunmap(old_ce); 2226 2227 ret = 0; 2228 if (devfn < 0x80) 2229 old_ce_phys = root_entry_lctp(&re); 2230 else 2231 old_ce_phys = root_entry_uctp(&re); 2232 2233 if (!old_ce_phys) { 2234 if (ext && devfn == 0) { 2235 /* No LCTP, try UCTP */ 2236 devfn = 0x7f; 2237 continue; 2238 } else { 2239 goto out; 2240 } 2241 } 2242 2243 ret = -ENOMEM; 2244 old_ce = memremap(old_ce_phys, PAGE_SIZE, 2245 MEMREMAP_WB); 2246 if (!old_ce) 2247 goto out; 2248 2249 new_ce = iommu_alloc_page_node(iommu->node, GFP_KERNEL); 2250 if (!new_ce) 2251 goto out_unmap; 2252 2253 ret = 0; 2254 } 2255 2256 /* Now copy the context entry */ 2257 memcpy(&ce, old_ce + idx, sizeof(ce)); 2258 2259 if (!context_present(&ce)) 2260 continue; 2261 2262 did = context_domain_id(&ce); 2263 if (did >= 0 && did < cap_ndoms(iommu->cap)) 2264 set_bit(did, iommu->domain_ids); 2265 2266 set_context_copied(iommu, bus, devfn); 2267 new_ce[idx] = ce; 2268 } 2269 2270 tbl[tbl_idx + pos] = new_ce; 2271 2272 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE); 2273 2274 out_unmap: 2275 memunmap(old_ce); 2276 2277 out: 2278 return ret; 2279 } 2280 2281 static int copy_translation_tables(struct intel_iommu *iommu) 2282 { 2283 struct context_entry **ctxt_tbls; 2284 struct root_entry *old_rt; 2285 phys_addr_t old_rt_phys; 2286 int ctxt_table_entries; 2287 u64 rtaddr_reg; 2288 int bus, ret; 2289 bool new_ext, ext; 2290 2291 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG); 2292 ext = !!(rtaddr_reg & DMA_RTADDR_SMT); 2293 new_ext = !!sm_supported(iommu); 2294 2295 /* 2296 * The RTT bit can only be changed when translation is disabled, 2297 * but disabling translation means to open a window for data 2298 * corruption. So bail out and don't copy anything if we would 2299 * have to change the bit. 
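 * (ext reflects the format of the pre-enabled root table: DMA_RTADDR_SMT
 * set means the scalable-mode layout. new_ext is the format this kernel
 * would use. The legacy and scalable root/context table layouts differ,
 * so copying across a format change is not attempted.)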
2300 */ 2301 if (new_ext != ext) 2302 return -EINVAL; 2303 2304 iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL); 2305 if (!iommu->copied_tables) 2306 return -ENOMEM; 2307 2308 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK; 2309 if (!old_rt_phys) 2310 return -EINVAL; 2311 2312 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB); 2313 if (!old_rt) 2314 return -ENOMEM; 2315 2316 /* This is too big for the stack - allocate it from slab */ 2317 ctxt_table_entries = ext ? 512 : 256; 2318 ret = -ENOMEM; 2319 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL); 2320 if (!ctxt_tbls) 2321 goto out_unmap; 2322 2323 for (bus = 0; bus < 256; bus++) { 2324 ret = copy_context_table(iommu, &old_rt[bus], 2325 ctxt_tbls, bus, ext); 2326 if (ret) { 2327 pr_err("%s: Failed to copy context table for bus %d\n", 2328 iommu->name, bus); 2329 continue; 2330 } 2331 } 2332 2333 spin_lock(&iommu->lock); 2334 2335 /* Context tables are copied, now write them to the root_entry table */ 2336 for (bus = 0; bus < 256; bus++) { 2337 int idx = ext ? bus * 2 : bus; 2338 u64 val; 2339 2340 if (ctxt_tbls[idx]) { 2341 val = virt_to_phys(ctxt_tbls[idx]) | 1; 2342 iommu->root_entry[bus].lo = val; 2343 } 2344 2345 if (!ext || !ctxt_tbls[idx + 1]) 2346 continue; 2347 2348 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1; 2349 iommu->root_entry[bus].hi = val; 2350 } 2351 2352 spin_unlock(&iommu->lock); 2353 2354 kfree(ctxt_tbls); 2355 2356 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE); 2357 2358 ret = 0; 2359 2360 out_unmap: 2361 memunmap(old_rt); 2362 2363 return ret; 2364 } 2365 2366 static int __init init_dmars(void) 2367 { 2368 struct dmar_drhd_unit *drhd; 2369 struct intel_iommu *iommu; 2370 int ret; 2371 2372 ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL); 2373 if (ret) 2374 goto free_iommu; 2375 2376 for_each_iommu(iommu, drhd) { 2377 if (drhd->ignored) { 2378 iommu_disable_translation(iommu); 2379 continue; 2380 } 2381 2382 /* 2383 * Find the max pasid size of all IOMMU's in the system. 2384 * We need to ensure the system pasid table is no bigger 2385 * than the smallest supported. 2386 */ 2387 if (pasid_supported(iommu)) { 2388 u32 temp = 2 << ecap_pss(iommu->ecap); 2389 2390 intel_pasid_max_id = min_t(u32, temp, 2391 intel_pasid_max_id); 2392 } 2393 2394 intel_iommu_init_qi(iommu); 2395 2396 ret = iommu_init_domains(iommu); 2397 if (ret) 2398 goto free_iommu; 2399 2400 init_translation_status(iommu); 2401 2402 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) { 2403 iommu_disable_translation(iommu); 2404 clear_translation_pre_enabled(iommu); 2405 pr_warn("Translation was enabled for %s but we are not in kdump mode\n", 2406 iommu->name); 2407 } 2408 2409 /* 2410 * TBD: 2411 * we could share the same root & context tables 2412 * among all IOMMU's. Need to Split it later. 2413 */ 2414 ret = iommu_alloc_root_entry(iommu); 2415 if (ret) 2416 goto free_iommu; 2417 2418 if (translation_pre_enabled(iommu)) { 2419 pr_info("Translation already enabled - trying to copy translation structures\n"); 2420 2421 ret = copy_translation_tables(iommu); 2422 if (ret) { 2423 /* 2424 * We found the IOMMU with translation 2425 * enabled - but failed to copy over the 2426 * old root-entry table. Try to proceed 2427 * by disabling translation now and 2428 * allocating a clean root-entry table. 2429 * This might cause DMAR faults, but 2430 * probably the dump will still succeed. 
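 * (This copy path is only reached in a kdump kernel, see the
 * is_kdump_kernel() check above; keeping the old tables live is
 * preferred because devices may still have DMA in flight that was
 * set up by the crashed kernel.)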
2431 */ 2432 pr_err("Failed to copy translation tables from previous kernel for %s\n", 2433 iommu->name); 2434 iommu_disable_translation(iommu); 2435 clear_translation_pre_enabled(iommu); 2436 } else { 2437 pr_info("Copied translation tables from previous kernel for %s\n", 2438 iommu->name); 2439 } 2440 } 2441 2442 if (!ecap_pass_through(iommu->ecap)) 2443 hw_pass_through = 0; 2444 intel_svm_check(iommu); 2445 } 2446 2447 /* 2448 * Now that qi is enabled on all iommus, set the root entry and flush 2449 * caches. This is required on some Intel X58 chipsets, otherwise the 2450 * flush_context function will loop forever and the boot hangs. 2451 */ 2452 for_each_active_iommu(iommu, drhd) { 2453 iommu_flush_write_buffer(iommu); 2454 iommu_set_root_entry(iommu); 2455 } 2456 2457 check_tylersburg_isoch(); 2458 2459 ret = si_domain_init(hw_pass_through); 2460 if (ret) 2461 goto free_iommu; 2462 2463 /* 2464 * for each drhd 2465 * enable fault log 2466 * global invalidate context cache 2467 * global invalidate iotlb 2468 * enable translation 2469 */ 2470 for_each_iommu(iommu, drhd) { 2471 if (drhd->ignored) { 2472 /* 2473 * we always have to disable PMRs or DMA may fail on 2474 * this device 2475 */ 2476 if (force_on) 2477 iommu_disable_protect_mem_regions(iommu); 2478 continue; 2479 } 2480 2481 iommu_flush_write_buffer(iommu); 2482 2483 #ifdef CONFIG_INTEL_IOMMU_SVM 2484 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) { 2485 /* 2486 * Call dmar_alloc_hwirq() with dmar_global_lock held, 2487 * could cause possible lock race condition. 2488 */ 2489 up_write(&dmar_global_lock); 2490 ret = intel_svm_enable_prq(iommu); 2491 down_write(&dmar_global_lock); 2492 if (ret) 2493 goto free_iommu; 2494 } 2495 #endif 2496 ret = dmar_set_interrupt(iommu); 2497 if (ret) 2498 goto free_iommu; 2499 } 2500 2501 return 0; 2502 2503 free_iommu: 2504 for_each_active_iommu(iommu, drhd) { 2505 disable_dmar_iommu(iommu); 2506 free_dmar_iommu(iommu); 2507 } 2508 if (si_domain) { 2509 domain_exit(si_domain); 2510 si_domain = NULL; 2511 } 2512 2513 return ret; 2514 } 2515 2516 static void __init init_no_remapping_devices(void) 2517 { 2518 struct dmar_drhd_unit *drhd; 2519 struct device *dev; 2520 int i; 2521 2522 for_each_drhd_unit(drhd) { 2523 if (!drhd->include_all) { 2524 for_each_active_dev_scope(drhd->devices, 2525 drhd->devices_cnt, i, dev) 2526 break; 2527 /* ignore DMAR unit if no devices exist */ 2528 if (i == drhd->devices_cnt) 2529 drhd->ignored = 1; 2530 } 2531 } 2532 2533 for_each_active_drhd_unit(drhd) { 2534 if (drhd->include_all) 2535 continue; 2536 2537 for_each_active_dev_scope(drhd->devices, 2538 drhd->devices_cnt, i, dev) 2539 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev))) 2540 break; 2541 if (i < drhd->devices_cnt) 2542 continue; 2543 2544 /* This IOMMU has *only* gfx devices. 
Either bypass it or 2545 set the gfx_mapped flag, as appropriate */ 2546 drhd->gfx_dedicated = 1; 2547 if (disable_igfx_iommu) 2548 drhd->ignored = 1; 2549 } 2550 } 2551 2552 #ifdef CONFIG_SUSPEND 2553 static int init_iommu_hw(void) 2554 { 2555 struct dmar_drhd_unit *drhd; 2556 struct intel_iommu *iommu = NULL; 2557 int ret; 2558 2559 for_each_active_iommu(iommu, drhd) { 2560 if (iommu->qi) { 2561 ret = dmar_reenable_qi(iommu); 2562 if (ret) 2563 return ret; 2564 } 2565 } 2566 2567 for_each_iommu(iommu, drhd) { 2568 if (drhd->ignored) { 2569 /* 2570 * we always have to disable PMRs or DMA may fail on 2571 * this device 2572 */ 2573 if (force_on) 2574 iommu_disable_protect_mem_regions(iommu); 2575 continue; 2576 } 2577 2578 iommu_flush_write_buffer(iommu); 2579 iommu_set_root_entry(iommu); 2580 iommu_enable_translation(iommu); 2581 iommu_disable_protect_mem_regions(iommu); 2582 } 2583 2584 return 0; 2585 } 2586 2587 static void iommu_flush_all(void) 2588 { 2589 struct dmar_drhd_unit *drhd; 2590 struct intel_iommu *iommu; 2591 2592 for_each_active_iommu(iommu, drhd) { 2593 iommu->flush.flush_context(iommu, 0, 0, 0, 2594 DMA_CCMD_GLOBAL_INVL); 2595 iommu->flush.flush_iotlb(iommu, 0, 0, 0, 2596 DMA_TLB_GLOBAL_FLUSH); 2597 } 2598 } 2599 2600 static int iommu_suspend(void) 2601 { 2602 struct dmar_drhd_unit *drhd; 2603 struct intel_iommu *iommu = NULL; 2604 unsigned long flag; 2605 2606 iommu_flush_all(); 2607 2608 for_each_active_iommu(iommu, drhd) { 2609 iommu_disable_translation(iommu); 2610 2611 raw_spin_lock_irqsave(&iommu->register_lock, flag); 2612 2613 iommu->iommu_state[SR_DMAR_FECTL_REG] = 2614 readl(iommu->reg + DMAR_FECTL_REG); 2615 iommu->iommu_state[SR_DMAR_FEDATA_REG] = 2616 readl(iommu->reg + DMAR_FEDATA_REG); 2617 iommu->iommu_state[SR_DMAR_FEADDR_REG] = 2618 readl(iommu->reg + DMAR_FEADDR_REG); 2619 iommu->iommu_state[SR_DMAR_FEUADDR_REG] = 2620 readl(iommu->reg + DMAR_FEUADDR_REG); 2621 2622 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 2623 } 2624 return 0; 2625 } 2626 2627 static void iommu_resume(void) 2628 { 2629 struct dmar_drhd_unit *drhd; 2630 struct intel_iommu *iommu = NULL; 2631 unsigned long flag; 2632 2633 if (init_iommu_hw()) { 2634 if (force_on) 2635 panic("tboot: IOMMU setup failed, DMAR can not resume!\n"); 2636 else 2637 WARN(1, "IOMMU setup failed, DMAR can not resume!\n"); 2638 return; 2639 } 2640 2641 for_each_active_iommu(iommu, drhd) { 2642 2643 raw_spin_lock_irqsave(&iommu->register_lock, flag); 2644 2645 writel(iommu->iommu_state[SR_DMAR_FECTL_REG], 2646 iommu->reg + DMAR_FECTL_REG); 2647 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG], 2648 iommu->reg + DMAR_FEDATA_REG); 2649 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG], 2650 iommu->reg + DMAR_FEADDR_REG); 2651 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG], 2652 iommu->reg + DMAR_FEUADDR_REG); 2653 2654 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 2655 } 2656 } 2657 2658 static struct syscore_ops iommu_syscore_ops = { 2659 .resume = iommu_resume, 2660 .suspend = iommu_suspend, 2661 }; 2662 2663 static void __init init_iommu_pm_ops(void) 2664 { 2665 register_syscore_ops(&iommu_syscore_ops); 2666 } 2667 2668 #else 2669 static inline void init_iommu_pm_ops(void) {} 2670 #endif /* CONFIG_PM */ 2671 2672 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr) 2673 { 2674 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) || 2675 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) || 2676 rmrr->end_address <= rmrr->base_address || 2677 arch_rmrr_sanity_check(rmrr)) 2678 return 
-EINVAL; 2679 2680 return 0; 2681 } 2682 2683 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg) 2684 { 2685 struct acpi_dmar_reserved_memory *rmrr; 2686 struct dmar_rmrr_unit *rmrru; 2687 2688 rmrr = (struct acpi_dmar_reserved_memory *)header; 2689 if (rmrr_sanity_check(rmrr)) { 2690 pr_warn(FW_BUG 2691 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n" 2692 "BIOS vendor: %s; Ver: %s; Product Version: %s\n", 2693 rmrr->base_address, rmrr->end_address, 2694 dmi_get_system_info(DMI_BIOS_VENDOR), 2695 dmi_get_system_info(DMI_BIOS_VERSION), 2696 dmi_get_system_info(DMI_PRODUCT_VERSION)); 2697 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); 2698 } 2699 2700 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL); 2701 if (!rmrru) 2702 goto out; 2703 2704 rmrru->hdr = header; 2705 2706 rmrru->base_address = rmrr->base_address; 2707 rmrru->end_address = rmrr->end_address; 2708 2709 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1), 2710 ((void *)rmrr) + rmrr->header.length, 2711 &rmrru->devices_cnt); 2712 if (rmrru->devices_cnt && rmrru->devices == NULL) 2713 goto free_rmrru; 2714 2715 list_add(&rmrru->list, &dmar_rmrr_units); 2716 2717 return 0; 2718 free_rmrru: 2719 kfree(rmrru); 2720 out: 2721 return -ENOMEM; 2722 } 2723 2724 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr) 2725 { 2726 struct dmar_atsr_unit *atsru; 2727 struct acpi_dmar_atsr *tmp; 2728 2729 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list, 2730 dmar_rcu_check()) { 2731 tmp = (struct acpi_dmar_atsr *)atsru->hdr; 2732 if (atsr->segment != tmp->segment) 2733 continue; 2734 if (atsr->header.length != tmp->header.length) 2735 continue; 2736 if (memcmp(atsr, tmp, atsr->header.length) == 0) 2737 return atsru; 2738 } 2739 2740 return NULL; 2741 } 2742 2743 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg) 2744 { 2745 struct acpi_dmar_atsr *atsr; 2746 struct dmar_atsr_unit *atsru; 2747 2748 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled) 2749 return 0; 2750 2751 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 2752 atsru = dmar_find_atsr(atsr); 2753 if (atsru) 2754 return 0; 2755 2756 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL); 2757 if (!atsru) 2758 return -ENOMEM; 2759 2760 /* 2761 * If memory is allocated from slab by ACPI _DSM method, we need to 2762 * copy the memory content because the memory buffer will be freed 2763 * on return. 
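 * That is why atsru is allocated with hdr->length extra bytes and the
 * header is copied right behind the structure.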
2764 */ 2765 atsru->hdr = (void *)(atsru + 1); 2766 memcpy(atsru->hdr, hdr, hdr->length); 2767 atsru->include_all = atsr->flags & 0x1; 2768 if (!atsru->include_all) { 2769 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1), 2770 (void *)atsr + atsr->header.length, 2771 &atsru->devices_cnt); 2772 if (atsru->devices_cnt && atsru->devices == NULL) { 2773 kfree(atsru); 2774 return -ENOMEM; 2775 } 2776 } 2777 2778 list_add_rcu(&atsru->list, &dmar_atsr_units); 2779 2780 return 0; 2781 } 2782 2783 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru) 2784 { 2785 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt); 2786 kfree(atsru); 2787 } 2788 2789 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg) 2790 { 2791 struct acpi_dmar_atsr *atsr; 2792 struct dmar_atsr_unit *atsru; 2793 2794 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 2795 atsru = dmar_find_atsr(atsr); 2796 if (atsru) { 2797 list_del_rcu(&atsru->list); 2798 synchronize_rcu(); 2799 intel_iommu_free_atsr(atsru); 2800 } 2801 2802 return 0; 2803 } 2804 2805 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg) 2806 { 2807 int i; 2808 struct device *dev; 2809 struct acpi_dmar_atsr *atsr; 2810 struct dmar_atsr_unit *atsru; 2811 2812 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 2813 atsru = dmar_find_atsr(atsr); 2814 if (!atsru) 2815 return 0; 2816 2817 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) { 2818 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt, 2819 i, dev) 2820 return -EBUSY; 2821 } 2822 2823 return 0; 2824 } 2825 2826 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc) 2827 { 2828 struct dmar_satc_unit *satcu; 2829 struct acpi_dmar_satc *tmp; 2830 2831 list_for_each_entry_rcu(satcu, &dmar_satc_units, list, 2832 dmar_rcu_check()) { 2833 tmp = (struct acpi_dmar_satc *)satcu->hdr; 2834 if (satc->segment != tmp->segment) 2835 continue; 2836 if (satc->header.length != tmp->header.length) 2837 continue; 2838 if (memcmp(satc, tmp, satc->header.length) == 0) 2839 return satcu; 2840 } 2841 2842 return NULL; 2843 } 2844 2845 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg) 2846 { 2847 struct acpi_dmar_satc *satc; 2848 struct dmar_satc_unit *satcu; 2849 2850 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled) 2851 return 0; 2852 2853 satc = container_of(hdr, struct acpi_dmar_satc, header); 2854 satcu = dmar_find_satc(satc); 2855 if (satcu) 2856 return 0; 2857 2858 satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL); 2859 if (!satcu) 2860 return -ENOMEM; 2861 2862 satcu->hdr = (void *)(satcu + 1); 2863 memcpy(satcu->hdr, hdr, hdr->length); 2864 satcu->atc_required = satc->flags & 0x1; 2865 satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1), 2866 (void *)satc + satc->header.length, 2867 &satcu->devices_cnt); 2868 if (satcu->devices_cnt && !satcu->devices) { 2869 kfree(satcu); 2870 return -ENOMEM; 2871 } 2872 list_add_rcu(&satcu->list, &dmar_satc_units); 2873 2874 return 0; 2875 } 2876 2877 static int intel_iommu_add(struct dmar_drhd_unit *dmaru) 2878 { 2879 int sp, ret; 2880 struct intel_iommu *iommu = dmaru->iommu; 2881 2882 ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu); 2883 if (ret) 2884 goto out; 2885 2886 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) { 2887 pr_warn("%s: Doesn't support hardware pass through.\n", 2888 iommu->name); 2889 return -ENXIO; 2890 } 2891 2892 sp = domain_update_iommu_superpage(NULL, iommu) - 1; 2893 if (sp >= 0 && 
!(cap_super_page_val(iommu->cap) & (1 << sp))) { 2894 pr_warn("%s: Doesn't support large page.\n", 2895 iommu->name); 2896 return -ENXIO; 2897 } 2898 2899 /* 2900 * Disable translation if already enabled prior to OS handover. 2901 */ 2902 if (iommu->gcmd & DMA_GCMD_TE) 2903 iommu_disable_translation(iommu); 2904 2905 ret = iommu_init_domains(iommu); 2906 if (ret == 0) 2907 ret = iommu_alloc_root_entry(iommu); 2908 if (ret) 2909 goto out; 2910 2911 intel_svm_check(iommu); 2912 2913 if (dmaru->ignored) { 2914 /* 2915 * we always have to disable PMRs or DMA may fail on this device 2916 */ 2917 if (force_on) 2918 iommu_disable_protect_mem_regions(iommu); 2919 return 0; 2920 } 2921 2922 intel_iommu_init_qi(iommu); 2923 iommu_flush_write_buffer(iommu); 2924 2925 #ifdef CONFIG_INTEL_IOMMU_SVM 2926 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) { 2927 ret = intel_svm_enable_prq(iommu); 2928 if (ret) 2929 goto disable_iommu; 2930 } 2931 #endif 2932 ret = dmar_set_interrupt(iommu); 2933 if (ret) 2934 goto disable_iommu; 2935 2936 iommu_set_root_entry(iommu); 2937 iommu_enable_translation(iommu); 2938 2939 iommu_disable_protect_mem_regions(iommu); 2940 return 0; 2941 2942 disable_iommu: 2943 disable_dmar_iommu(iommu); 2944 out: 2945 free_dmar_iommu(iommu); 2946 return ret; 2947 } 2948 2949 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert) 2950 { 2951 int ret = 0; 2952 struct intel_iommu *iommu = dmaru->iommu; 2953 2954 if (!intel_iommu_enabled) 2955 return 0; 2956 if (iommu == NULL) 2957 return -EINVAL; 2958 2959 if (insert) { 2960 ret = intel_iommu_add(dmaru); 2961 } else { 2962 disable_dmar_iommu(iommu); 2963 free_dmar_iommu(iommu); 2964 } 2965 2966 return ret; 2967 } 2968 2969 static void intel_iommu_free_dmars(void) 2970 { 2971 struct dmar_rmrr_unit *rmrru, *rmrr_n; 2972 struct dmar_atsr_unit *atsru, *atsr_n; 2973 struct dmar_satc_unit *satcu, *satc_n; 2974 2975 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) { 2976 list_del(&rmrru->list); 2977 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt); 2978 kfree(rmrru); 2979 } 2980 2981 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) { 2982 list_del(&atsru->list); 2983 intel_iommu_free_atsr(atsru); 2984 } 2985 list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) { 2986 list_del(&satcu->list); 2987 dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt); 2988 kfree(satcu); 2989 } 2990 } 2991 2992 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev) 2993 { 2994 struct dmar_satc_unit *satcu; 2995 struct acpi_dmar_satc *satc; 2996 struct device *tmp; 2997 int i; 2998 2999 dev = pci_physfn(dev); 3000 rcu_read_lock(); 3001 3002 list_for_each_entry_rcu(satcu, &dmar_satc_units, list) { 3003 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header); 3004 if (satc->segment != pci_domain_nr(dev->bus)) 3005 continue; 3006 for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp) 3007 if (to_pci_dev(tmp) == dev) 3008 goto out; 3009 } 3010 satcu = NULL; 3011 out: 3012 rcu_read_unlock(); 3013 return satcu; 3014 } 3015 3016 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu) 3017 { 3018 int i, ret = 1; 3019 struct pci_bus *bus; 3020 struct pci_dev *bridge = NULL; 3021 struct device *tmp; 3022 struct acpi_dmar_atsr *atsr; 3023 struct dmar_atsr_unit *atsru; 3024 struct dmar_satc_unit *satcu; 3025 3026 dev = pci_physfn(dev); 3027 satcu = dmar_find_matched_satc_unit(dev); 3028 if (satcu) 3029 /* 3030 * This device supports ATS as it is in 
SATC table. 3031 * When IOMMU is in legacy mode, enabling ATS is done 3032 * automatically by HW for the device that requires 3033 * ATS, hence OS should not enable this device ATS 3034 * to avoid duplicated TLB invalidation. 3035 */ 3036 return !(satcu->atc_required && !sm_supported(iommu)); 3037 3038 for (bus = dev->bus; bus; bus = bus->parent) { 3039 bridge = bus->self; 3040 /* If it's an integrated device, allow ATS */ 3041 if (!bridge) 3042 return 1; 3043 /* Connected via non-PCIe: no ATS */ 3044 if (!pci_is_pcie(bridge) || 3045 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE) 3046 return 0; 3047 /* If we found the root port, look it up in the ATSR */ 3048 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT) 3049 break; 3050 } 3051 3052 rcu_read_lock(); 3053 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) { 3054 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header); 3055 if (atsr->segment != pci_domain_nr(dev->bus)) 3056 continue; 3057 3058 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp) 3059 if (tmp == &bridge->dev) 3060 goto out; 3061 3062 if (atsru->include_all) 3063 goto out; 3064 } 3065 ret = 0; 3066 out: 3067 rcu_read_unlock(); 3068 3069 return ret; 3070 } 3071 3072 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info) 3073 { 3074 int ret; 3075 struct dmar_rmrr_unit *rmrru; 3076 struct dmar_atsr_unit *atsru; 3077 struct dmar_satc_unit *satcu; 3078 struct acpi_dmar_atsr *atsr; 3079 struct acpi_dmar_reserved_memory *rmrr; 3080 struct acpi_dmar_satc *satc; 3081 3082 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING) 3083 return 0; 3084 3085 list_for_each_entry(rmrru, &dmar_rmrr_units, list) { 3086 rmrr = container_of(rmrru->hdr, 3087 struct acpi_dmar_reserved_memory, header); 3088 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 3089 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1), 3090 ((void *)rmrr) + rmrr->header.length, 3091 rmrr->segment, rmrru->devices, 3092 rmrru->devices_cnt); 3093 if (ret < 0) 3094 return ret; 3095 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 3096 dmar_remove_dev_scope(info, rmrr->segment, 3097 rmrru->devices, rmrru->devices_cnt); 3098 } 3099 } 3100 3101 list_for_each_entry(atsru, &dmar_atsr_units, list) { 3102 if (atsru->include_all) 3103 continue; 3104 3105 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header); 3106 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 3107 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1), 3108 (void *)atsr + atsr->header.length, 3109 atsr->segment, atsru->devices, 3110 atsru->devices_cnt); 3111 if (ret > 0) 3112 break; 3113 else if (ret < 0) 3114 return ret; 3115 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 3116 if (dmar_remove_dev_scope(info, atsr->segment, 3117 atsru->devices, atsru->devices_cnt)) 3118 break; 3119 } 3120 } 3121 list_for_each_entry(satcu, &dmar_satc_units, list) { 3122 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header); 3123 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 3124 ret = dmar_insert_dev_scope(info, (void *)(satc + 1), 3125 (void *)satc + satc->header.length, 3126 satc->segment, satcu->devices, 3127 satcu->devices_cnt); 3128 if (ret > 0) 3129 break; 3130 else if (ret < 0) 3131 return ret; 3132 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 3133 if (dmar_remove_dev_scope(info, satc->segment, 3134 satcu->devices, satcu->devices_cnt)) 3135 break; 3136 } 3137 } 3138 3139 return 0; 3140 } 3141 3142 static int intel_iommu_memory_notifier(struct notifier_block *nb, 3143 unsigned long val, void *v) 3144 { 
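/*
 * Keep the static identity domain (si_domain) in sync with memory
 * hotplug: identity map ranges that are going online and unmap ranges
 * that go offline again.
 */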
3145 struct memory_notify *mhp = v; 3146 unsigned long start_vpfn = mm_to_dma_pfn_start(mhp->start_pfn); 3147 unsigned long last_vpfn = mm_to_dma_pfn_end(mhp->start_pfn + 3148 mhp->nr_pages - 1); 3149 3150 switch (val) { 3151 case MEM_GOING_ONLINE: 3152 if (iommu_domain_identity_map(si_domain, 3153 start_vpfn, last_vpfn)) { 3154 pr_warn("Failed to build identity map for [%lx-%lx]\n", 3155 start_vpfn, last_vpfn); 3156 return NOTIFY_BAD; 3157 } 3158 break; 3159 3160 case MEM_OFFLINE: 3161 case MEM_CANCEL_ONLINE: 3162 { 3163 LIST_HEAD(freelist); 3164 3165 domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist); 3166 iommu_put_pages_list(&freelist); 3167 } 3168 break; 3169 } 3170 3171 return NOTIFY_OK; 3172 } 3173 3174 static struct notifier_block intel_iommu_memory_nb = { 3175 .notifier_call = intel_iommu_memory_notifier, 3176 .priority = 0 3177 }; 3178 3179 static void intel_disable_iommus(void) 3180 { 3181 struct intel_iommu *iommu = NULL; 3182 struct dmar_drhd_unit *drhd; 3183 3184 for_each_iommu(iommu, drhd) 3185 iommu_disable_translation(iommu); 3186 } 3187 3188 void intel_iommu_shutdown(void) 3189 { 3190 struct dmar_drhd_unit *drhd; 3191 struct intel_iommu *iommu = NULL; 3192 3193 if (no_iommu || dmar_disabled) 3194 return; 3195 3196 down_write(&dmar_global_lock); 3197 3198 /* Disable PMRs explicitly here. */ 3199 for_each_iommu(iommu, drhd) 3200 iommu_disable_protect_mem_regions(iommu); 3201 3202 /* Make sure the IOMMUs are switched off */ 3203 intel_disable_iommus(); 3204 3205 up_write(&dmar_global_lock); 3206 } 3207 3208 static struct intel_iommu *dev_to_intel_iommu(struct device *dev) 3209 { 3210 struct iommu_device *iommu_dev = dev_to_iommu_device(dev); 3211 3212 return container_of(iommu_dev, struct intel_iommu, iommu); 3213 } 3214 3215 static ssize_t version_show(struct device *dev, 3216 struct device_attribute *attr, char *buf) 3217 { 3218 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3219 u32 ver = readl(iommu->reg + DMAR_VER_REG); 3220 return sysfs_emit(buf, "%d:%d\n", 3221 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver)); 3222 } 3223 static DEVICE_ATTR_RO(version); 3224 3225 static ssize_t address_show(struct device *dev, 3226 struct device_attribute *attr, char *buf) 3227 { 3228 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3229 return sysfs_emit(buf, "%llx\n", iommu->reg_phys); 3230 } 3231 static DEVICE_ATTR_RO(address); 3232 3233 static ssize_t cap_show(struct device *dev, 3234 struct device_attribute *attr, char *buf) 3235 { 3236 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3237 return sysfs_emit(buf, "%llx\n", iommu->cap); 3238 } 3239 static DEVICE_ATTR_RO(cap); 3240 3241 static ssize_t ecap_show(struct device *dev, 3242 struct device_attribute *attr, char *buf) 3243 { 3244 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3245 return sysfs_emit(buf, "%llx\n", iommu->ecap); 3246 } 3247 static DEVICE_ATTR_RO(ecap); 3248 3249 static ssize_t domains_supported_show(struct device *dev, 3250 struct device_attribute *attr, char *buf) 3251 { 3252 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3253 return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap)); 3254 } 3255 static DEVICE_ATTR_RO(domains_supported); 3256 3257 static ssize_t domains_used_show(struct device *dev, 3258 struct device_attribute *attr, char *buf) 3259 { 3260 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3261 return sysfs_emit(buf, "%d\n", 3262 bitmap_weight(iommu->domain_ids, 3263 cap_ndoms(iommu->cap))); 3264 } 3265 static DEVICE_ATTR_RO(domains_used); 3266 3267 static struct 
attribute *intel_iommu_attrs[] = { 3268 &dev_attr_version.attr, 3269 &dev_attr_address.attr, 3270 &dev_attr_cap.attr, 3271 &dev_attr_ecap.attr, 3272 &dev_attr_domains_supported.attr, 3273 &dev_attr_domains_used.attr, 3274 NULL, 3275 }; 3276 3277 static struct attribute_group intel_iommu_group = { 3278 .name = "intel-iommu", 3279 .attrs = intel_iommu_attrs, 3280 }; 3281 3282 const struct attribute_group *intel_iommu_groups[] = { 3283 &intel_iommu_group, 3284 NULL, 3285 }; 3286 3287 static bool has_external_pci(void) 3288 { 3289 struct pci_dev *pdev = NULL; 3290 3291 for_each_pci_dev(pdev) 3292 if (pdev->external_facing) { 3293 pci_dev_put(pdev); 3294 return true; 3295 } 3296 3297 return false; 3298 } 3299 3300 static int __init platform_optin_force_iommu(void) 3301 { 3302 if (!dmar_platform_optin() || no_platform_optin || !has_external_pci()) 3303 return 0; 3304 3305 if (no_iommu || dmar_disabled) 3306 pr_info("Intel-IOMMU force enabled due to platform opt in\n"); 3307 3308 /* 3309 * If Intel-IOMMU is disabled by default, we will apply identity 3310 * map for all devices except those marked as being untrusted. 3311 */ 3312 if (dmar_disabled) 3313 iommu_set_default_passthrough(false); 3314 3315 dmar_disabled = 0; 3316 no_iommu = 0; 3317 3318 return 1; 3319 } 3320 3321 static int __init probe_acpi_namespace_devices(void) 3322 { 3323 struct dmar_drhd_unit *drhd; 3324 /* To avoid a -Wunused-but-set-variable warning. */ 3325 struct intel_iommu *iommu __maybe_unused; 3326 struct device *dev; 3327 int i, ret = 0; 3328 3329 for_each_active_iommu(iommu, drhd) { 3330 for_each_active_dev_scope(drhd->devices, 3331 drhd->devices_cnt, i, dev) { 3332 struct acpi_device_physical_node *pn; 3333 struct acpi_device *adev; 3334 3335 if (dev->bus != &acpi_bus_type) 3336 continue; 3337 3338 adev = to_acpi_device(dev); 3339 mutex_lock(&adev->physical_node_lock); 3340 list_for_each_entry(pn, 3341 &adev->physical_node_list, node) { 3342 ret = iommu_probe_device(pn->dev); 3343 if (ret) 3344 break; 3345 } 3346 mutex_unlock(&adev->physical_node_lock); 3347 3348 if (ret) 3349 return ret; 3350 } 3351 } 3352 3353 return 0; 3354 } 3355 3356 static __init int tboot_force_iommu(void) 3357 { 3358 if (!tboot_enabled()) 3359 return 0; 3360 3361 if (no_iommu || dmar_disabled) 3362 pr_warn("Forcing Intel-IOMMU to enabled\n"); 3363 3364 dmar_disabled = 0; 3365 no_iommu = 0; 3366 3367 return 1; 3368 } 3369 3370 int __init intel_iommu_init(void) 3371 { 3372 int ret = -ENODEV; 3373 struct dmar_drhd_unit *drhd; 3374 struct intel_iommu *iommu; 3375 3376 /* 3377 * Intel IOMMU is required for a TXT/tboot launch or platform 3378 * opt in, so enforce that. 3379 */ 3380 force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) || 3381 platform_optin_force_iommu(); 3382 3383 down_write(&dmar_global_lock); 3384 if (dmar_table_init()) { 3385 if (force_on) 3386 panic("tboot: Failed to initialize DMAR table\n"); 3387 goto out_free_dmar; 3388 } 3389 3390 if (dmar_dev_scope_init() < 0) { 3391 if (force_on) 3392 panic("tboot: Failed to initialize DMAR device scope\n"); 3393 goto out_free_dmar; 3394 } 3395 3396 up_write(&dmar_global_lock); 3397 3398 /* 3399 * The bus notifier takes the dmar_global_lock, so lockdep will 3400 * complain later when we register it under the lock. 
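 * That is why dmar_global_lock was dropped above and is re-taken right
 * after the notifier has been registered.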
3401 */ 3402 dmar_register_bus_notifier(); 3403 3404 down_write(&dmar_global_lock); 3405 3406 if (!no_iommu) 3407 intel_iommu_debugfs_init(); 3408 3409 if (no_iommu || dmar_disabled) { 3410 /* 3411 * We exit the function here to ensure IOMMU's remapping and 3412 * mempool aren't setup, which means that the IOMMU's PMRs 3413 * won't be disabled via the call to init_dmars(). So disable 3414 * it explicitly here. The PMRs were setup by tboot prior to 3415 * calling SENTER, but the kernel is expected to reset/tear 3416 * down the PMRs. 3417 */ 3418 if (intel_iommu_tboot_noforce) { 3419 for_each_iommu(iommu, drhd) 3420 iommu_disable_protect_mem_regions(iommu); 3421 } 3422 3423 /* 3424 * Make sure the IOMMUs are switched off, even when we 3425 * boot into a kexec kernel and the previous kernel left 3426 * them enabled 3427 */ 3428 intel_disable_iommus(); 3429 goto out_free_dmar; 3430 } 3431 3432 if (list_empty(&dmar_rmrr_units)) 3433 pr_info("No RMRR found\n"); 3434 3435 if (list_empty(&dmar_atsr_units)) 3436 pr_info("No ATSR found\n"); 3437 3438 if (list_empty(&dmar_satc_units)) 3439 pr_info("No SATC found\n"); 3440 3441 init_no_remapping_devices(); 3442 3443 ret = init_dmars(); 3444 if (ret) { 3445 if (force_on) 3446 panic("tboot: Failed to initialize DMARs\n"); 3447 pr_err("Initialization failed\n"); 3448 goto out_free_dmar; 3449 } 3450 up_write(&dmar_global_lock); 3451 3452 init_iommu_pm_ops(); 3453 3454 down_read(&dmar_global_lock); 3455 for_each_active_iommu(iommu, drhd) { 3456 /* 3457 * The flush queue implementation does not perform 3458 * page-selective invalidations that are required for efficient 3459 * TLB flushes in virtual environments. The benefit of batching 3460 * is likely to be much lower than the overhead of synchronizing 3461 * the virtual and physical IOMMU page-tables. 3462 */ 3463 if (cap_caching_mode(iommu->cap) && 3464 !first_level_by_default(IOMMU_DOMAIN_DMA)) { 3465 pr_info_once("IOMMU batching disallowed due to virtualization\n"); 3466 iommu_set_dma_strict(); 3467 } 3468 iommu_device_sysfs_add(&iommu->iommu, NULL, 3469 intel_iommu_groups, 3470 "%s", iommu->name); 3471 iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL); 3472 3473 iommu_pmu_register(iommu); 3474 } 3475 up_read(&dmar_global_lock); 3476 3477 if (si_domain && !hw_pass_through) 3478 register_memory_notifier(&intel_iommu_memory_nb); 3479 3480 down_read(&dmar_global_lock); 3481 if (probe_acpi_namespace_devices()) 3482 pr_warn("ACPI name space devices didn't probe correctly\n"); 3483 3484 /* Finally, we enable the DMA remapping hardware. */ 3485 for_each_iommu(iommu, drhd) { 3486 if (!drhd->ignored && !translation_pre_enabled(iommu)) 3487 iommu_enable_translation(iommu); 3488 3489 iommu_disable_protect_mem_regions(iommu); 3490 } 3491 up_read(&dmar_global_lock); 3492 3493 pr_info("Intel(R) Virtualization Technology for Directed I/O\n"); 3494 3495 intel_iommu_enabled = 1; 3496 3497 return 0; 3498 3499 out_free_dmar: 3500 intel_iommu_free_dmars(); 3501 up_write(&dmar_global_lock); 3502 return ret; 3503 } 3504 3505 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque) 3506 { 3507 struct device_domain_info *info = opaque; 3508 3509 domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff); 3510 return 0; 3511 } 3512 3513 /* 3514 * NB - intel-iommu lacks any sort of reference counting for the users of 3515 * dependent devices. 
If multiple endpoints have intersecting dependent 3516 * devices, unbinding the driver from any one of them will possibly leave 3517 * the others unable to operate. 3518 */ 3519 static void domain_context_clear(struct device_domain_info *info) 3520 { 3521 if (!dev_is_pci(info->dev)) 3522 domain_context_clear_one(info, info->bus, info->devfn); 3523 3524 pci_for_each_dma_alias(to_pci_dev(info->dev), 3525 &domain_context_clear_one_cb, info); 3526 } 3527 3528 /* 3529 * Clear the page table pointer in context or pasid table entries so that 3530 * all DMA requests without PASID from the device are blocked. If the page 3531 * table has been set, clean up the data structures. 3532 */ 3533 void device_block_translation(struct device *dev) 3534 { 3535 struct device_domain_info *info = dev_iommu_priv_get(dev); 3536 struct intel_iommu *iommu = info->iommu; 3537 unsigned long flags; 3538 3539 iommu_disable_pci_caps(info); 3540 if (!dev_is_real_dma_subdevice(dev)) { 3541 if (sm_supported(iommu)) 3542 intel_pasid_tear_down_entry(iommu, dev, 3543 IOMMU_NO_PASID, false); 3544 else 3545 domain_context_clear(info); 3546 } 3547 3548 if (!info->domain) 3549 return; 3550 3551 spin_lock_irqsave(&info->domain->lock, flags); 3552 list_del(&info->link); 3553 spin_unlock_irqrestore(&info->domain->lock, flags); 3554 3555 cache_tag_unassign_domain(info->domain, dev, IOMMU_NO_PASID); 3556 domain_detach_iommu(info->domain, iommu); 3557 info->domain = NULL; 3558 } 3559 3560 static int md_domain_init(struct dmar_domain *domain, int guest_width) 3561 { 3562 int adjust_width; 3563 3564 /* calculate AGAW */ 3565 domain->gaw = guest_width; 3566 adjust_width = guestwidth_to_adjustwidth(guest_width); 3567 domain->agaw = width_to_agaw(adjust_width); 3568 3569 domain->iommu_coherency = false; 3570 domain->iommu_superpage = 0; 3571 domain->max_addr = 0; 3572 3573 /* always allocate the top pgd */ 3574 domain->pgd = iommu_alloc_page_node(domain->nid, GFP_ATOMIC); 3575 if (!domain->pgd) 3576 return -ENOMEM; 3577 domain_flush_cache(domain, domain->pgd, PAGE_SIZE); 3578 return 0; 3579 } 3580 3581 static int blocking_domain_attach_dev(struct iommu_domain *domain, 3582 struct device *dev) 3583 { 3584 device_block_translation(dev); 3585 return 0; 3586 } 3587 3588 static struct iommu_domain blocking_domain = { 3589 .type = IOMMU_DOMAIN_BLOCKED, 3590 .ops = &(const struct iommu_domain_ops) { 3591 .attach_dev = blocking_domain_attach_dev, 3592 } 3593 }; 3594 3595 static int iommu_superpage_capability(struct intel_iommu *iommu, bool first_stage) 3596 { 3597 if (!intel_iommu_superpage) 3598 return 0; 3599 3600 if (first_stage) 3601 return cap_fl1gp_support(iommu->cap) ? 
2 : 1; 3602 3603 return fls(cap_super_page_val(iommu->cap)); 3604 } 3605 3606 static struct dmar_domain *paging_domain_alloc(struct device *dev, bool first_stage) 3607 { 3608 struct device_domain_info *info = dev_iommu_priv_get(dev); 3609 struct intel_iommu *iommu = info->iommu; 3610 struct dmar_domain *domain; 3611 int addr_width; 3612 3613 domain = kzalloc(sizeof(*domain), GFP_KERNEL); 3614 if (!domain) 3615 return ERR_PTR(-ENOMEM); 3616 3617 INIT_LIST_HEAD(&domain->devices); 3618 INIT_LIST_HEAD(&domain->dev_pasids); 3619 INIT_LIST_HEAD(&domain->cache_tags); 3620 spin_lock_init(&domain->lock); 3621 spin_lock_init(&domain->cache_lock); 3622 xa_init(&domain->iommu_array); 3623 3624 domain->nid = dev_to_node(dev); 3625 domain->has_iotlb_device = info->ats_enabled; 3626 domain->use_first_level = first_stage; 3627 3628 /* calculate the address width */ 3629 addr_width = agaw_to_width(iommu->agaw); 3630 if (addr_width > cap_mgaw(iommu->cap)) 3631 addr_width = cap_mgaw(iommu->cap); 3632 domain->gaw = addr_width; 3633 domain->agaw = iommu->agaw; 3634 domain->max_addr = __DOMAIN_MAX_ADDR(addr_width); 3635 3636 /* iommu memory access coherency */ 3637 domain->iommu_coherency = iommu_paging_structure_coherency(iommu); 3638 3639 /* pagesize bitmap */ 3640 domain->domain.pgsize_bitmap = SZ_4K; 3641 domain->iommu_superpage = iommu_superpage_capability(iommu, first_stage); 3642 domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain); 3643 3644 /* 3645 * IOVA aperture: First-level translation restricts the input-address 3646 * to a canonical address (i.e., address bits 63:N have the same value 3647 * as address bit [N-1], where N is 48-bits with 4-level paging and 3648 * 57-bits with 5-level paging). Hence, skip bit [N-1]. 3649 */ 3650 domain->domain.geometry.force_aperture = true; 3651 domain->domain.geometry.aperture_start = 0; 3652 if (first_stage) 3653 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1); 3654 else 3655 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw); 3656 3657 /* always allocate the top pgd */ 3658 domain->pgd = iommu_alloc_page_node(domain->nid, GFP_KERNEL); 3659 if (!domain->pgd) { 3660 kfree(domain); 3661 return ERR_PTR(-ENOMEM); 3662 } 3663 domain_flush_cache(domain, domain->pgd, PAGE_SIZE); 3664 3665 return domain; 3666 } 3667 3668 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type) 3669 { 3670 struct dmar_domain *dmar_domain; 3671 struct iommu_domain *domain; 3672 3673 switch (type) { 3674 case IOMMU_DOMAIN_DMA: 3675 case IOMMU_DOMAIN_UNMANAGED: 3676 dmar_domain = alloc_domain(type); 3677 if (!dmar_domain) { 3678 pr_err("Can't allocate dmar_domain\n"); 3679 return NULL; 3680 } 3681 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { 3682 pr_err("Domain initialization failed\n"); 3683 domain_exit(dmar_domain); 3684 return NULL; 3685 } 3686 3687 domain = &dmar_domain->domain; 3688 domain->geometry.aperture_start = 0; 3689 domain->geometry.aperture_end = 3690 __DOMAIN_MAX_ADDR(dmar_domain->gaw); 3691 domain->geometry.force_aperture = true; 3692 3693 return domain; 3694 case IOMMU_DOMAIN_IDENTITY: 3695 return &si_domain->domain; 3696 default: 3697 return NULL; 3698 } 3699 3700 return NULL; 3701 } 3702 3703 static struct iommu_domain * 3704 intel_iommu_domain_alloc_user(struct device *dev, u32 flags, 3705 struct iommu_domain *parent, 3706 const struct iommu_user_data *user_data) 3707 { 3708 struct device_domain_info *info = dev_iommu_priv_get(dev); 3709 bool dirty_tracking = flags & 
IOMMU_HWPT_ALLOC_DIRTY_TRACKING; 3710 bool nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT; 3711 struct intel_iommu *iommu = info->iommu; 3712 struct dmar_domain *dmar_domain; 3713 struct iommu_domain *domain; 3714 3715 /* Must be NESTING domain */ 3716 if (parent) { 3717 if (!nested_supported(iommu) || flags) 3718 return ERR_PTR(-EOPNOTSUPP); 3719 return intel_nested_domain_alloc(parent, user_data); 3720 } 3721 3722 if (flags & 3723 (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING))) 3724 return ERR_PTR(-EOPNOTSUPP); 3725 if (nested_parent && !nested_supported(iommu)) 3726 return ERR_PTR(-EOPNOTSUPP); 3727 if (user_data || (dirty_tracking && !ssads_supported(iommu))) 3728 return ERR_PTR(-EOPNOTSUPP); 3729 3730 /* Do not use first stage for user domain translation. */ 3731 dmar_domain = paging_domain_alloc(dev, false); 3732 if (IS_ERR(dmar_domain)) 3733 return ERR_CAST(dmar_domain); 3734 domain = &dmar_domain->domain; 3735 domain->type = IOMMU_DOMAIN_UNMANAGED; 3736 domain->owner = &intel_iommu_ops; 3737 domain->ops = intel_iommu_ops.default_domain_ops; 3738 3739 if (nested_parent) { 3740 dmar_domain->nested_parent = true; 3741 INIT_LIST_HEAD(&dmar_domain->s1_domains); 3742 spin_lock_init(&dmar_domain->s1_lock); 3743 } 3744 3745 if (dirty_tracking) { 3746 if (dmar_domain->use_first_level) { 3747 iommu_domain_free(domain); 3748 return ERR_PTR(-EOPNOTSUPP); 3749 } 3750 domain->dirty_ops = &intel_dirty_ops; 3751 } 3752 3753 return domain; 3754 } 3755 3756 static void intel_iommu_domain_free(struct iommu_domain *domain) 3757 { 3758 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 3759 3760 WARN_ON(dmar_domain->nested_parent && 3761 !list_empty(&dmar_domain->s1_domains)); 3762 if (domain != &si_domain->domain) 3763 domain_exit(dmar_domain); 3764 } 3765 3766 int prepare_domain_attach_device(struct iommu_domain *domain, 3767 struct device *dev) 3768 { 3769 struct device_domain_info *info = dev_iommu_priv_get(dev); 3770 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 3771 struct intel_iommu *iommu = info->iommu; 3772 int addr_width; 3773 3774 if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap)) 3775 return -EINVAL; 3776 3777 if (domain->dirty_ops && !ssads_supported(iommu)) 3778 return -EINVAL; 3779 3780 /* check if this iommu agaw is sufficient for max mapped address */ 3781 addr_width = agaw_to_width(iommu->agaw); 3782 if (addr_width > cap_mgaw(iommu->cap)) 3783 addr_width = cap_mgaw(iommu->cap); 3784 3785 if (dmar_domain->max_addr > (1LL << addr_width)) 3786 return -EINVAL; 3787 dmar_domain->gaw = addr_width; 3788 3789 /* 3790 * Knock out extra levels of page tables if necessary 3791 */ 3792 while (iommu->agaw < dmar_domain->agaw) { 3793 struct dma_pte *pte; 3794 3795 pte = dmar_domain->pgd; 3796 if (dma_pte_present(pte)) { 3797 dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte)); 3798 iommu_free_page(pte); 3799 } 3800 dmar_domain->agaw--; 3801 } 3802 3803 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) && 3804 context_copied(iommu, info->bus, info->devfn)) 3805 return intel_pasid_setup_sm_context(dev); 3806 3807 return 0; 3808 } 3809 3810 static int intel_iommu_attach_device(struct iommu_domain *domain, 3811 struct device *dev) 3812 { 3813 struct device_domain_info *info = dev_iommu_priv_get(dev); 3814 int ret; 3815 3816 if (info->domain) 3817 device_block_translation(dev); 3818 3819 ret = prepare_domain_attach_device(domain, dev); 3820 if (ret) 3821 return ret; 3822 3823 return dmar_domain_attach_device(to_dmar_domain(domain), 
dev); 3824 } 3825 3826 static int intel_iommu_map(struct iommu_domain *domain, 3827 unsigned long iova, phys_addr_t hpa, 3828 size_t size, int iommu_prot, gfp_t gfp) 3829 { 3830 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 3831 u64 max_addr; 3832 int prot = 0; 3833 3834 if (iommu_prot & IOMMU_READ) 3835 prot |= DMA_PTE_READ; 3836 if (iommu_prot & IOMMU_WRITE) 3837 prot |= DMA_PTE_WRITE; 3838 if (dmar_domain->set_pte_snp) 3839 prot |= DMA_PTE_SNP; 3840 3841 max_addr = iova + size; 3842 if (dmar_domain->max_addr < max_addr) { 3843 u64 end; 3844 3845 /* check if minimum agaw is sufficient for mapped address */ 3846 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1; 3847 if (end < max_addr) { 3848 pr_err("%s: iommu width (%d) is not " 3849 "sufficient for the mapped address (%llx)\n", 3850 __func__, dmar_domain->gaw, max_addr); 3851 return -EFAULT; 3852 } 3853 dmar_domain->max_addr = max_addr; 3854 } 3855 /* Round up size to next multiple of PAGE_SIZE, if it and 3856 the low bits of hpa would take us onto the next page */ 3857 size = aligned_nrpages(hpa, size); 3858 return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT, 3859 hpa >> VTD_PAGE_SHIFT, size, prot, gfp); 3860 } 3861 3862 static int intel_iommu_map_pages(struct iommu_domain *domain, 3863 unsigned long iova, phys_addr_t paddr, 3864 size_t pgsize, size_t pgcount, 3865 int prot, gfp_t gfp, size_t *mapped) 3866 { 3867 unsigned long pgshift = __ffs(pgsize); 3868 size_t size = pgcount << pgshift; 3869 int ret; 3870 3871 if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G) 3872 return -EINVAL; 3873 3874 if (!IS_ALIGNED(iova | paddr, pgsize)) 3875 return -EINVAL; 3876 3877 ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp); 3878 if (!ret && mapped) 3879 *mapped = size; 3880 3881 return ret; 3882 } 3883 3884 static size_t intel_iommu_unmap(struct iommu_domain *domain, 3885 unsigned long iova, size_t size, 3886 struct iommu_iotlb_gather *gather) 3887 { 3888 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 3889 unsigned long start_pfn, last_pfn; 3890 int level = 0; 3891 3892 /* Cope with horrid API which requires us to unmap more than the 3893 size argument if it happens to be a large-page mapping. */ 3894 if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, 3895 &level, GFP_ATOMIC))) 3896 return 0; 3897 3898 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level)) 3899 size = VTD_PAGE_SIZE << level_to_offset_bits(level); 3900 3901 start_pfn = iova >> VTD_PAGE_SHIFT; 3902 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT; 3903 3904 domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist); 3905 3906 if (dmar_domain->max_addr == iova + size) 3907 dmar_domain->max_addr = iova; 3908 3909 /* 3910 * We do not use page-selective IOTLB invalidation in flush queue, 3911 * so there is no need to track page and sync iotlb. 
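 * When the unmap is deferred to a flush queue the eventual invalidation
 * is not page selective, so there is nothing to gain from recording
 * individual pages; only non-queued gathers need the range for
 * intel_iommu_tlb_sync() below.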
3912 */ 3913 if (!iommu_iotlb_gather_queued(gather)) 3914 iommu_iotlb_gather_add_page(domain, gather, iova, size); 3915 3916 return size; 3917 } 3918 3919 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain, 3920 unsigned long iova, 3921 size_t pgsize, size_t pgcount, 3922 struct iommu_iotlb_gather *gather) 3923 { 3924 unsigned long pgshift = __ffs(pgsize); 3925 size_t size = pgcount << pgshift; 3926 3927 return intel_iommu_unmap(domain, iova, size, gather); 3928 } 3929 3930 static void intel_iommu_tlb_sync(struct iommu_domain *domain, 3931 struct iommu_iotlb_gather *gather) 3932 { 3933 cache_tag_flush_range(to_dmar_domain(domain), gather->start, 3934 gather->end, list_empty(&gather->freelist)); 3935 iommu_put_pages_list(&gather->freelist); 3936 } 3937 3938 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain, 3939 dma_addr_t iova) 3940 { 3941 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 3942 struct dma_pte *pte; 3943 int level = 0; 3944 u64 phys = 0; 3945 3946 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level, 3947 GFP_ATOMIC); 3948 if (pte && dma_pte_present(pte)) 3949 phys = dma_pte_addr(pte) + 3950 (iova & (BIT_MASK(level_to_offset_bits(level) + 3951 VTD_PAGE_SHIFT) - 1)); 3952 3953 return phys; 3954 } 3955 3956 static bool domain_support_force_snooping(struct dmar_domain *domain) 3957 { 3958 struct device_domain_info *info; 3959 bool support = true; 3960 3961 assert_spin_locked(&domain->lock); 3962 list_for_each_entry(info, &domain->devices, link) { 3963 if (!ecap_sc_support(info->iommu->ecap)) { 3964 support = false; 3965 break; 3966 } 3967 } 3968 3969 return support; 3970 } 3971 3972 static void domain_set_force_snooping(struct dmar_domain *domain) 3973 { 3974 struct device_domain_info *info; 3975 3976 assert_spin_locked(&domain->lock); 3977 /* 3978 * Second level page table supports per-PTE snoop control. The 3979 * iommu_map() interface will handle this by setting SNP bit. 
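 * First level (first stage) PTEs have no such bit, so for those domains
 * snooping is enforced through the PASID table entry of every attached
 * device instead (intel_pasid_setup_page_snoop_control() below).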
3980 */ 3981 if (!domain->use_first_level) { 3982 domain->set_pte_snp = true; 3983 return; 3984 } 3985 3986 list_for_each_entry(info, &domain->devices, link) 3987 intel_pasid_setup_page_snoop_control(info->iommu, info->dev, 3988 IOMMU_NO_PASID); 3989 } 3990 3991 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain) 3992 { 3993 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 3994 unsigned long flags; 3995 3996 if (dmar_domain->force_snooping) 3997 return true; 3998 3999 spin_lock_irqsave(&dmar_domain->lock, flags); 4000 if (!domain_support_force_snooping(dmar_domain) || 4001 (!dmar_domain->use_first_level && dmar_domain->has_mappings)) { 4002 spin_unlock_irqrestore(&dmar_domain->lock, flags); 4003 return false; 4004 } 4005 4006 domain_set_force_snooping(dmar_domain); 4007 dmar_domain->force_snooping = true; 4008 spin_unlock_irqrestore(&dmar_domain->lock, flags); 4009 4010 return true; 4011 } 4012 4013 static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap) 4014 { 4015 struct device_domain_info *info = dev_iommu_priv_get(dev); 4016 4017 switch (cap) { 4018 case IOMMU_CAP_CACHE_COHERENCY: 4019 case IOMMU_CAP_DEFERRED_FLUSH: 4020 return true; 4021 case IOMMU_CAP_PRE_BOOT_PROTECTION: 4022 return dmar_platform_optin(); 4023 case IOMMU_CAP_ENFORCE_CACHE_COHERENCY: 4024 return ecap_sc_support(info->iommu->ecap); 4025 case IOMMU_CAP_DIRTY_TRACKING: 4026 return ssads_supported(info->iommu); 4027 default: 4028 return false; 4029 } 4030 } 4031 4032 static struct iommu_device *intel_iommu_probe_device(struct device *dev) 4033 { 4034 struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL; 4035 struct device_domain_info *info; 4036 struct intel_iommu *iommu; 4037 u8 bus, devfn; 4038 int ret; 4039 4040 iommu = device_lookup_iommu(dev, &bus, &devfn); 4041 if (!iommu || !iommu->iommu.ops) 4042 return ERR_PTR(-ENODEV); 4043 4044 info = kzalloc(sizeof(*info), GFP_KERNEL); 4045 if (!info) 4046 return ERR_PTR(-ENOMEM); 4047 4048 if (dev_is_real_dma_subdevice(dev)) { 4049 info->bus = pdev->bus->number; 4050 info->devfn = pdev->devfn; 4051 info->segment = pci_domain_nr(pdev->bus); 4052 } else { 4053 info->bus = bus; 4054 info->devfn = devfn; 4055 info->segment = iommu->segment; 4056 } 4057 4058 info->dev = dev; 4059 info->iommu = iommu; 4060 if (dev_is_pci(dev)) { 4061 if (ecap_dev_iotlb_support(iommu->ecap) && 4062 pci_ats_supported(pdev) && 4063 dmar_ats_supported(pdev, iommu)) { 4064 info->ats_supported = 1; 4065 info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev); 4066 4067 /* 4068 * For IOMMU that supports device IOTLB throttling 4069 * (DIT), we assign PFSID to the invalidation desc 4070 * of a VF such that IOMMU HW can gauge queue depth 4071 * at PF level. If DIT is not set, PFSID will be 4072 * treated as reserved, which should be set to 0. 
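 * (pci_physfn() returns the PF for a VF and the device itself
 * otherwise, so pfsid ends up being the PF's bus/devfn requester ID.)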
4073 */ 4074 if (ecap_dit(iommu->ecap)) 4075 info->pfsid = pci_dev_id(pci_physfn(pdev)); 4076 info->ats_qdep = pci_ats_queue_depth(pdev); 4077 } 4078 if (sm_supported(iommu)) { 4079 if (pasid_supported(iommu)) { 4080 int features = pci_pasid_features(pdev); 4081 4082 if (features >= 0) 4083 info->pasid_supported = features | 1; 4084 } 4085 4086 if (info->ats_supported && ecap_prs(iommu->ecap) && 4087 pci_pri_supported(pdev)) 4088 info->pri_supported = 1; 4089 } 4090 } 4091 4092 dev_iommu_priv_set(dev, info); 4093 if (pdev && pci_ats_supported(pdev)) { 4094 ret = device_rbtree_insert(iommu, info); 4095 if (ret) 4096 goto free; 4097 } 4098 4099 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) { 4100 ret = intel_pasid_alloc_table(dev); 4101 if (ret) { 4102 dev_err(dev, "PASID table allocation failed\n"); 4103 goto clear_rbtree; 4104 } 4105 4106 if (!context_copied(iommu, info->bus, info->devfn)) { 4107 ret = intel_pasid_setup_sm_context(dev); 4108 if (ret) 4109 goto free_table; 4110 } 4111 } 4112 4113 intel_iommu_debugfs_create_dev(info); 4114 4115 return &iommu->iommu; 4116 free_table: 4117 intel_pasid_free_table(dev); 4118 clear_rbtree: 4119 device_rbtree_remove(info); 4120 free: 4121 kfree(info); 4122 4123 return ERR_PTR(ret); 4124 } 4125 4126 static void intel_iommu_release_device(struct device *dev) 4127 { 4128 struct device_domain_info *info = dev_iommu_priv_get(dev); 4129 struct intel_iommu *iommu = info->iommu; 4130 4131 mutex_lock(&iommu->iopf_lock); 4132 if (dev_is_pci(dev) && pci_ats_supported(to_pci_dev(dev))) 4133 device_rbtree_remove(info); 4134 mutex_unlock(&iommu->iopf_lock); 4135 4136 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) && 4137 !context_copied(iommu, info->bus, info->devfn)) 4138 intel_pasid_teardown_sm_context(dev); 4139 4140 intel_pasid_free_table(dev); 4141 intel_iommu_debugfs_remove_dev(info); 4142 kfree(info); 4143 set_dma_ops(dev, NULL); 4144 } 4145 4146 static void intel_iommu_get_resv_regions(struct device *device, 4147 struct list_head *head) 4148 { 4149 int prot = DMA_PTE_READ | DMA_PTE_WRITE; 4150 struct iommu_resv_region *reg; 4151 struct dmar_rmrr_unit *rmrr; 4152 struct device *i_dev; 4153 int i; 4154 4155 rcu_read_lock(); 4156 for_each_rmrr_units(rmrr) { 4157 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt, 4158 i, i_dev) { 4159 struct iommu_resv_region *resv; 4160 enum iommu_resv_type type; 4161 size_t length; 4162 4163 if (i_dev != device && 4164 !is_downstream_to_pci_bridge(device, i_dev)) 4165 continue; 4166 4167 length = rmrr->end_address - rmrr->base_address + 1; 4168 4169 type = device_rmrr_is_relaxable(device) ? 
IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT; 4171 4172 resv = iommu_alloc_resv_region(rmrr->base_address, 4173 length, prot, type, 4174 GFP_ATOMIC); 4175 if (!resv) 4176 break; 4177 4178 list_add_tail(&resv->list, head); 4179 } 4180 } 4181 rcu_read_unlock(); 4182 4183 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA 4184 if (dev_is_pci(device)) { 4185 struct pci_dev *pdev = to_pci_dev(device); 4186 4187 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) { 4188 reg = iommu_alloc_resv_region(0, 1UL << 24, prot, 4189 IOMMU_RESV_DIRECT_RELAXABLE, 4190 GFP_KERNEL); 4191 if (reg) 4192 list_add_tail(&reg->list, head); 4193 } 4194 } 4195 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */ 4196 4197 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START, 4198 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1, 4199 0, IOMMU_RESV_MSI, GFP_KERNEL); 4200 if (!reg) 4201 return; 4202 list_add_tail(&reg->list, head); 4203 } 4204 4205 static struct iommu_group *intel_iommu_device_group(struct device *dev) 4206 { 4207 if (dev_is_pci(dev)) 4208 return pci_device_group(dev); 4209 return generic_device_group(dev); 4210 } 4211 4212 static int intel_iommu_enable_sva(struct device *dev) 4213 { 4214 struct device_domain_info *info = dev_iommu_priv_get(dev); 4215 struct intel_iommu *iommu; 4216 4217 if (!info || dmar_disabled) 4218 return -EINVAL; 4219 4220 iommu = info->iommu; 4221 if (!iommu) 4222 return -EINVAL; 4223 4224 if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE)) 4225 return -ENODEV; 4226 4227 if (!info->pasid_enabled || !info->ats_enabled) 4228 return -EINVAL; 4229 4230 /* 4231 * Devices having device-specific I/O fault handling should not 4232 * support PCI/PRI. The IOMMU side has no means to check the 4233 * capability of device-specific IOPF. Therefore, the IOMMU can only 4234 * assume that if the device driver enables SVA on a non-PRI 4235 * device, it will handle IOPF in its own way. 4236 */ 4237 if (!info->pri_supported) 4238 return 0; 4239 4240 /* Devices supporting PRI should have it enabled. */ 4241 if (!info->pri_enabled) 4242 return -EINVAL; 4243 4244 return 0; 4245 } 4246 4247 static int context_flip_pri(struct device_domain_info *info, bool enable) 4248 { 4249 struct intel_iommu *iommu = info->iommu; 4250 u8 bus = info->bus, devfn = info->devfn; 4251 struct context_entry *context; 4252 4253 spin_lock(&iommu->lock); 4254 if (context_copied(iommu, bus, devfn)) { 4255 spin_unlock(&iommu->lock); 4256 return -EINVAL; 4257 } 4258 4259 context = iommu_context_addr(iommu, bus, devfn, false); 4260 if (!context || !context_present(context)) { 4261 spin_unlock(&iommu->lock); 4262 return -ENODEV; 4263 } 4264 4265 if (enable) 4266 context_set_sm_pre(context); 4267 else 4268 context_clear_sm_pre(context); 4269 4270 if (!ecap_coherent(iommu->ecap)) 4271 clflush_cache_range(context, sizeof(*context)); 4272 intel_context_flush_present(info, context, true); 4273 spin_unlock(&iommu->lock); 4274 4275 return 0; 4276 } 4277 4278 static int intel_iommu_enable_iopf(struct device *dev) 4279 { 4280 struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL; 4281 struct device_domain_info *info = dev_iommu_priv_get(dev); 4282 struct intel_iommu *iommu; 4283 int ret; 4284 4285 if (!pdev || !info || !info->ats_enabled || !info->pri_supported) 4286 return -ENODEV; 4287 4288 if (info->pri_enabled) 4289 return -EBUSY; 4290 4291 iommu = info->iommu; 4292 if (!iommu) 4293 return -EINVAL; 4294 4295 /* PASID is required in PRG Response Message.
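 * When PASID is enabled, page requests carry a PASID and the page group
 * response returns it, so the device must advertise the "PRG Response
 * PASID Required" bit (pci_prg_resp_pasid_required()); otherwise IOPF
 * cannot be enabled here.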
*/ 4296 if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev)) 4297 return -EINVAL; 4298 4299 ret = pci_reset_pri(pdev); 4300 if (ret) 4301 return ret; 4302 4303 ret = iopf_queue_add_device(iommu->iopf_queue, dev); 4304 if (ret) 4305 return ret; 4306 4307 ret = context_flip_pri(info, true); 4308 if (ret) 4309 goto err_remove_device; 4310 4311 ret = pci_enable_pri(pdev, PRQ_DEPTH); 4312 if (ret) 4313 goto err_clear_pri; 4314 4315 info->pri_enabled = 1; 4316 4317 return 0; 4318 err_clear_pri: 4319 context_flip_pri(info, false); 4320 err_remove_device: 4321 iopf_queue_remove_device(iommu->iopf_queue, dev); 4322 4323 return ret; 4324 } 4325 4326 static int intel_iommu_disable_iopf(struct device *dev) 4327 { 4328 struct device_domain_info *info = dev_iommu_priv_get(dev); 4329 struct intel_iommu *iommu = info->iommu; 4330 4331 if (!info->pri_enabled) 4332 return -EINVAL; 4333 4334 /* Disable new PRI reception: */ 4335 context_flip_pri(info, false); 4336 4337 /* 4338 * Remove device from fault queue and acknowledge all outstanding 4339 * PRQs to the device: 4340 */ 4341 iopf_queue_remove_device(iommu->iopf_queue, dev); 4342 4343 /* 4344 * PCIe spec states that by clearing PRI enable bit, the Page 4345 * Request Interface will not issue new page requests, but has 4346 * outstanding page requests that have been transmitted or are 4347 * queued for transmission. This is supposed to be called after 4348 * the device driver has stopped DMA, all PASIDs have been 4349 * unbound and the outstanding PRQs have been drained. 4350 */ 4351 pci_disable_pri(to_pci_dev(dev)); 4352 info->pri_enabled = 0; 4353 4354 return 0; 4355 } 4356 4357 static int 4358 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat) 4359 { 4360 switch (feat) { 4361 case IOMMU_DEV_FEAT_IOPF: 4362 return intel_iommu_enable_iopf(dev); 4363 4364 case IOMMU_DEV_FEAT_SVA: 4365 return intel_iommu_enable_sva(dev); 4366 4367 default: 4368 return -ENODEV; 4369 } 4370 } 4371 4372 static int 4373 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat) 4374 { 4375 switch (feat) { 4376 case IOMMU_DEV_FEAT_IOPF: 4377 return intel_iommu_disable_iopf(dev); 4378 4379 case IOMMU_DEV_FEAT_SVA: 4380 return 0; 4381 4382 default: 4383 return -ENODEV; 4384 } 4385 } 4386 4387 static bool intel_iommu_is_attach_deferred(struct device *dev) 4388 { 4389 struct device_domain_info *info = dev_iommu_priv_get(dev); 4390 4391 return translation_pre_enabled(info->iommu) && !info->domain; 4392 } 4393 4394 /* 4395 * Check that the device does not live on an external facing PCI port that is 4396 * marked as untrusted. Such devices should not be able to apply quirks and 4397 * thus not be able to bypass the IOMMU restrictions. 
4398 */ 4399 static bool risky_device(struct pci_dev *pdev) 4400 { 4401 if (pdev->untrusted) { 4402 pci_info(pdev, 4403 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n", 4404 pdev->vendor, pdev->device); 4405 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n"); 4406 return true; 4407 } 4408 return false; 4409 } 4410 4411 static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain, 4412 unsigned long iova, size_t size) 4413 { 4414 cache_tag_flush_range_np(to_dmar_domain(domain), iova, iova + size - 1); 4415 4416 return 0; 4417 } 4418 4419 static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid, 4420 struct iommu_domain *domain) 4421 { 4422 struct device_domain_info *info = dev_iommu_priv_get(dev); 4423 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4424 struct dev_pasid_info *curr, *dev_pasid = NULL; 4425 struct intel_iommu *iommu = info->iommu; 4426 unsigned long flags; 4427 4428 spin_lock_irqsave(&dmar_domain->lock, flags); 4429 list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) { 4430 if (curr->dev == dev && curr->pasid == pasid) { 4431 list_del(&curr->link_domain); 4432 dev_pasid = curr; 4433 break; 4434 } 4435 } 4436 WARN_ON_ONCE(!dev_pasid); 4437 spin_unlock_irqrestore(&dmar_domain->lock, flags); 4438 4439 cache_tag_unassign_domain(dmar_domain, dev, pasid); 4440 domain_detach_iommu(dmar_domain, iommu); 4441 intel_iommu_debugfs_remove_dev_pasid(dev_pasid); 4442 kfree(dev_pasid); 4443 intel_pasid_tear_down_entry(iommu, dev, pasid, false); 4444 intel_drain_pasid_prq(dev, pasid); 4445 } 4446 4447 static int intel_iommu_set_dev_pasid(struct iommu_domain *domain, 4448 struct device *dev, ioasid_t pasid) 4449 { 4450 struct device_domain_info *info = dev_iommu_priv_get(dev); 4451 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4452 struct intel_iommu *iommu = info->iommu; 4453 struct dev_pasid_info *dev_pasid; 4454 unsigned long flags; 4455 int ret; 4456 4457 if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev)) 4458 return -EOPNOTSUPP; 4459 4460 if (domain->dirty_ops) 4461 return -EINVAL; 4462 4463 if (context_copied(iommu, info->bus, info->devfn)) 4464 return -EBUSY; 4465 4466 ret = prepare_domain_attach_device(domain, dev); 4467 if (ret) 4468 return ret; 4469 4470 dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL); 4471 if (!dev_pasid) 4472 return -ENOMEM; 4473 4474 ret = domain_attach_iommu(dmar_domain, iommu); 4475 if (ret) 4476 goto out_free; 4477 4478 ret = cache_tag_assign_domain(dmar_domain, dev, pasid); 4479 if (ret) 4480 goto out_detach_iommu; 4481 4482 if (domain_type_is_si(dmar_domain)) 4483 ret = intel_pasid_setup_pass_through(iommu, dev, pasid); 4484 else if (dmar_domain->use_first_level) 4485 ret = domain_setup_first_level(iommu, dmar_domain, 4486 dev, pasid); 4487 else 4488 ret = intel_pasid_setup_second_level(iommu, dmar_domain, 4489 dev, pasid); 4490 if (ret) 4491 goto out_unassign_tag; 4492 4493 dev_pasid->dev = dev; 4494 dev_pasid->pasid = pasid; 4495 spin_lock_irqsave(&dmar_domain->lock, flags); 4496 list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids); 4497 spin_unlock_irqrestore(&dmar_domain->lock, flags); 4498 4499 if (domain->type & __IOMMU_DOMAIN_PAGING) 4500 intel_iommu_debugfs_create_dev_pasid(dev_pasid); 4501 4502 return 0; 4503 out_unassign_tag: 4504 cache_tag_unassign_domain(dmar_domain, dev, pasid); 4505 out_detach_iommu: 4506 domain_detach_iommu(dmar_domain, iommu); 4507 out_free: 4508 kfree(dev_pasid); 4509 return ret; 4510 } 4511 4512 
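/*
 * Report VT-d hardware information (the raw capability and extended
 * capability register values) to user space through the iommufd HW_INFO
 * interface; the iommufd core is expected to free the returned buffer.
 */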
static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type) 4513 { 4514 struct device_domain_info *info = dev_iommu_priv_get(dev); 4515 struct intel_iommu *iommu = info->iommu; 4516 struct iommu_hw_info_vtd *vtd; 4517 4518 vtd = kzalloc(sizeof(*vtd), GFP_KERNEL); 4519 if (!vtd) 4520 return ERR_PTR(-ENOMEM); 4521 4522 vtd->flags = IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17; 4523 vtd->cap_reg = iommu->cap; 4524 vtd->ecap_reg = iommu->ecap; 4525 *length = sizeof(*vtd); 4526 *type = IOMMU_HW_INFO_TYPE_INTEL_VTD; 4527 return vtd; 4528 } 4529 4530 /* 4531 * Set dirty tracking for the device list of a domain. The caller must 4532 * hold the domain->lock when calling it. 4533 */ 4534 static int device_set_dirty_tracking(struct list_head *devices, bool enable) 4535 { 4536 struct device_domain_info *info; 4537 int ret = 0; 4538 4539 list_for_each_entry(info, devices, link) { 4540 ret = intel_pasid_setup_dirty_tracking(info->iommu, info->dev, 4541 IOMMU_NO_PASID, enable); 4542 if (ret) 4543 break; 4544 } 4545 4546 return ret; 4547 } 4548 4549 static int parent_domain_set_dirty_tracking(struct dmar_domain *domain, 4550 bool enable) 4551 { 4552 struct dmar_domain *s1_domain; 4553 unsigned long flags; 4554 int ret; 4555 4556 spin_lock(&domain->s1_lock); 4557 list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) { 4558 spin_lock_irqsave(&s1_domain->lock, flags); 4559 ret = device_set_dirty_tracking(&s1_domain->devices, enable); 4560 spin_unlock_irqrestore(&s1_domain->lock, flags); 4561 if (ret) 4562 goto err_unwind; 4563 } 4564 spin_unlock(&domain->s1_lock); 4565 return 0; 4566 4567 err_unwind: 4568 list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) { 4569 spin_lock_irqsave(&s1_domain->lock, flags); 4570 device_set_dirty_tracking(&s1_domain->devices, 4571 domain->dirty_tracking); 4572 spin_unlock_irqrestore(&s1_domain->lock, flags); 4573 } 4574 spin_unlock(&domain->s1_lock); 4575 return ret; 4576 } 4577 4578 static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain, 4579 bool enable) 4580 { 4581 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4582 int ret; 4583 4584 spin_lock(&dmar_domain->lock); 4585 if (dmar_domain->dirty_tracking == enable) 4586 goto out_unlock; 4587 4588 ret = device_set_dirty_tracking(&dmar_domain->devices, enable); 4589 if (ret) 4590 goto err_unwind; 4591 4592 if (dmar_domain->nested_parent) { 4593 ret = parent_domain_set_dirty_tracking(dmar_domain, enable); 4594 if (ret) 4595 goto err_unwind; 4596 } 4597 4598 dmar_domain->dirty_tracking = enable; 4599 out_unlock: 4600 spin_unlock(&dmar_domain->lock); 4601 4602 return 0; 4603 4604 err_unwind: 4605 device_set_dirty_tracking(&dmar_domain->devices, 4606 dmar_domain->dirty_tracking); 4607 spin_unlock(&dmar_domain->lock); 4608 return ret; 4609 } 4610 4611 static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain, 4612 unsigned long iova, size_t size, 4613 unsigned long flags, 4614 struct iommu_dirty_bitmap *dirty) 4615 { 4616 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4617 unsigned long end = iova + size - 1; 4618 unsigned long pgsize; 4619 4620 /* 4621 * IOMMUFD core calls into a dirty tracking disabled domain without an 4622 * IOVA bitmap set in order to clean dirty bits in all PTEs that might 4623 * have occurred when we stopped dirty tracking. This ensures that we 4624 * never inherit dirtied bits from a previous cycle. 
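 * Hence the call is only rejected when dirty tracking is disabled and a
 * bitmap was actually supplied.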
4625 */ 4626 if (!dmar_domain->dirty_tracking && dirty->bitmap) 4627 return -EINVAL; 4628 4629 do { 4630 struct dma_pte *pte; 4631 int lvl = 0; 4632 4633 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl, 4634 GFP_ATOMIC); 4635 pgsize = level_size(lvl) << VTD_PAGE_SHIFT; 4636 if (!pte || !dma_pte_present(pte)) { 4637 iova += pgsize; 4638 continue; 4639 } 4640 4641 if (dma_sl_pte_test_and_clear_dirty(pte, flags)) 4642 iommu_dirty_bitmap_record(dirty, iova, pgsize); 4643 iova += pgsize; 4644 } while (iova < end); 4645 4646 return 0; 4647 } 4648 4649 static const struct iommu_dirty_ops intel_dirty_ops = { 4650 .set_dirty_tracking = intel_iommu_set_dirty_tracking, 4651 .read_and_clear_dirty = intel_iommu_read_and_clear_dirty, 4652 }; 4653 4654 const struct iommu_ops intel_iommu_ops = { 4655 .blocked_domain = &blocking_domain, 4656 .release_domain = &blocking_domain, 4657 .capable = intel_iommu_capable, 4658 .hw_info = intel_iommu_hw_info, 4659 .domain_alloc = intel_iommu_domain_alloc, 4660 .domain_alloc_user = intel_iommu_domain_alloc_user, 4661 .domain_alloc_sva = intel_svm_domain_alloc, 4662 .probe_device = intel_iommu_probe_device, 4663 .release_device = intel_iommu_release_device, 4664 .get_resv_regions = intel_iommu_get_resv_regions, 4665 .device_group = intel_iommu_device_group, 4666 .dev_enable_feat = intel_iommu_dev_enable_feat, 4667 .dev_disable_feat = intel_iommu_dev_disable_feat, 4668 .is_attach_deferred = intel_iommu_is_attach_deferred, 4669 .def_domain_type = device_def_domain_type, 4670 .remove_dev_pasid = intel_iommu_remove_dev_pasid, 4671 .pgsize_bitmap = SZ_4K, 4672 #ifdef CONFIG_INTEL_IOMMU_SVM 4673 .page_response = intel_svm_page_response, 4674 #endif 4675 .default_domain_ops = &(const struct iommu_domain_ops) { 4676 .attach_dev = intel_iommu_attach_device, 4677 .set_dev_pasid = intel_iommu_set_dev_pasid, 4678 .map_pages = intel_iommu_map_pages, 4679 .unmap_pages = intel_iommu_unmap_pages, 4680 .iotlb_sync_map = intel_iommu_iotlb_sync_map, 4681 .flush_iotlb_all = intel_flush_iotlb_all, 4682 .iotlb_sync = intel_iommu_tlb_sync, 4683 .iova_to_phys = intel_iommu_iova_to_phys, 4684 .free = intel_iommu_domain_free, 4685 .enforce_cache_coherency = intel_iommu_enforce_cache_coherency, 4686 } 4687 }; 4688 4689 static void quirk_iommu_igfx(struct pci_dev *dev) 4690 { 4691 if (risky_device(dev)) 4692 return; 4693 4694 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n"); 4695 disable_igfx_iommu = 1; 4696 } 4697 4698 /* G4x/GM45 integrated gfx dmar support is totally busted. 
*/ 4699 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx); 4700 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx); 4701 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx); 4702 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx); 4703 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx); 4704 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx); 4705 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx); 4706 4707 /* Broadwell igfx malfunctions with dmar */ 4708 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx); 4709 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx); 4710 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx); 4711 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx); 4712 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx); 4713 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx); 4714 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx); 4715 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx); 4716 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx); 4717 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx); 4718 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx); 4719 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx); 4720 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx); 4721 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx); 4722 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx); 4723 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx); 4724 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx); 4725 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx); 4726 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx); 4727 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx); 4728 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx); 4729 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx); 4730 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx); 4731 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx); 4732 4733 static void quirk_iommu_rwbf(struct pci_dev *dev) 4734 { 4735 if (risky_device(dev)) 4736 return; 4737 4738 /* 4739 * Mobile 4 Series Chipset neglects to set RWBF capability, 4740 * but needs it. Same seems to hold for the desktop versions. 
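 * Setting rwbf_quirk makes the driver behave as if the RWBF capability
 * were advertised, i.e. it issues explicit write-buffer flushes after
 * updating translation structures in memory.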
4741 */ 4742 pci_info(dev, "Forcing write-buffer flush capability\n"); 4743 rwbf_quirk = 1; 4744 } 4745 4746 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf); 4747 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf); 4748 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf); 4749 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf); 4750 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf); 4751 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf); 4752 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf); 4753 4754 #define GGC 0x52 4755 #define GGC_MEMORY_SIZE_MASK (0xf << 8) 4756 #define GGC_MEMORY_SIZE_NONE (0x0 << 8) 4757 #define GGC_MEMORY_SIZE_1M (0x1 << 8) 4758 #define GGC_MEMORY_SIZE_2M (0x3 << 8) 4759 #define GGC_MEMORY_VT_ENABLED (0x8 << 8) 4760 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8) 4761 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8) 4762 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8) 4763 4764 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev) 4765 { 4766 unsigned short ggc; 4767 4768 if (risky_device(dev)) 4769 return; 4770 4771 if (pci_read_config_word(dev, GGC, &ggc)) 4772 return; 4773 4774 if (!(ggc & GGC_MEMORY_VT_ENABLED)) { 4775 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n"); 4776 disable_igfx_iommu = 1; 4777 } else if (!disable_igfx_iommu) { 4778 /* we have to ensure the gfx device is idle before we flush */ 4779 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n"); 4780 iommu_set_dma_strict(); 4781 } 4782 } 4783 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt); 4784 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt); 4785 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt); 4786 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt); 4787 4788 static void quirk_igfx_skip_te_disable(struct pci_dev *dev) 4789 { 4790 unsigned short ver; 4791 4792 if (!IS_GFX_DEVICE(dev)) 4793 return; 4794 4795 ver = (dev->device >> 8) & 0xff; 4796 if (ver != 0x45 && ver != 0x46 && ver != 0x4c && 4797 ver != 0x4e && ver != 0x8a && ver != 0x98 && 4798 ver != 0x9a && ver != 0xa7 && ver != 0x7d) 4799 return; 4800 4801 if (risky_device(dev)) 4802 return; 4803 4804 pci_info(dev, "Skip IOMMU disabling for graphics\n"); 4805 iommu_skip_te_disable = 1; 4806 } 4807 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable); 4808 4809 /* On Tylersburg chipsets, some BIOSes have been known to enable the 4810 ISOCH DMAR unit for the Azalia sound device, but not give it any 4811 TLB entries, which causes it to deadlock. Check for that. We do 4812 this in a function called from init_dmars(), instead of in a PCI 4813 quirk, because we don't want to print the obnoxious "BIOS broken" 4814 message if VT-d is actually disabled. 4815 */ 4816 static void __init check_tylersburg_isoch(void) 4817 { 4818 struct pci_dev *pdev; 4819 uint32_t vtisochctrl; 4820 4821 /* If there's no Azalia in the system anyway, forget it. */ 4822 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL); 4823 if (!pdev) 4824 return; 4825 4826 if (risky_device(pdev)) { 4827 pci_dev_put(pdev); 4828 return; 4829 } 4830 4831 pci_dev_put(pdev); 4832 4833 /* System Management Registers. Might be hidden, in which case 4834 we can't do the sanity check. 
But that's OK, because the 4835 known-broken BIOSes _don't_ actually hide it, so far. */ 4836 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL); 4837 if (!pdev) 4838 return; 4839 4840 if (risky_device(pdev)) { 4841 pci_dev_put(pdev); 4842 return; 4843 } 4844 4845 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) { 4846 pci_dev_put(pdev); 4847 return; 4848 } 4849 4850 pci_dev_put(pdev); 4851 4852 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */ 4853 if (vtisochctrl & 1) 4854 return; 4855 4856 /* Drop all bits other than the number of TLB entries */ 4857 vtisochctrl &= 0x1c; 4858 4859 /* If we have the recommended number of TLB entries (16), fine. */ 4860 if (vtisochctrl == 0x10) 4861 return; 4862 4863 /* Zero TLB entries? You get to ride the short bus to school. */ 4864 if (!vtisochctrl) { 4865 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n" 4866 "BIOS vendor: %s; Ver: %s; Product Version: %s\n", 4867 dmi_get_system_info(DMI_BIOS_VENDOR), 4868 dmi_get_system_info(DMI_BIOS_VERSION), 4869 dmi_get_system_info(DMI_PRODUCT_VERSION)); 4870 iommu_identity_mapping |= IDENTMAP_AZALIA; 4871 return; 4872 } 4873 4874 pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n", 4875 vtisochctrl); 4876 } 4877 4878 /* 4879 * Here we deal with a device TLB defect where device may inadvertently issue ATS 4880 * invalidation completion before posted writes initiated with translated address 4881 * that utilized translations matching the invalidation address range, violating 4882 * the invalidation completion ordering. 4883 * Therefore, any use cases that cannot guarantee DMA is stopped before unmap is 4884 * vulnerable to this defect. In other words, any dTLB invalidation initiated not 4885 * under the control of the trusted/privileged host device driver must use this 4886 * quirk. 4887 * Device TLBs are invalidated under the following six conditions: 4888 * 1. Device driver does DMA API unmap IOVA 4889 * 2. Device driver unbind a PASID from a process, sva_unbind_device() 4890 * 3. PASID is torn down, after PASID cache is flushed. e.g. process 4891 * exit_mmap() due to crash 4892 * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where 4893 * VM has to free pages that were unmapped 4894 * 5. Userspace driver unmaps a DMA buffer 4895 * 6. Cache invalidation in vSVA usage (upcoming) 4896 * 4897 * For #1 and #2, device drivers are responsible for stopping DMA traffic 4898 * before unmap/unbind. For #3, iommu driver gets mmu_notifier to 4899 * invalidate TLB the same way as normal user unmap which will use this quirk. 4900 * The dTLB invalidation after PASID cache flush does not need this quirk. 4901 * 4902 * As a reminder, #6 will *NEED* this quirk as we enable nested translation. 4903 */ 4904 void quirk_extra_dev_tlb_flush(struct device_domain_info *info, 4905 unsigned long address, unsigned long mask, 4906 u32 pasid, u16 qdep) 4907 { 4908 u16 sid; 4909 4910 if (likely(!info->dtlb_extra_inval)) 4911 return; 4912 4913 sid = PCI_DEVID(info->bus, info->devfn); 4914 if (pasid == IOMMU_NO_PASID) { 4915 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid, 4916 qdep, address, mask); 4917 } else { 4918 qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid, 4919 pasid, qdep, address, mask); 4920 } 4921 } 4922 4923 #define ecmd_get_status_code(res) (((res) & 0xff) >> 1) 4924 4925 /* 4926 * Function to submit a command to the enhanced command interface. 
The 4927 * valid enhanced command descriptions are defined in Table 47 of the 4928 * VT-d spec. The VT-d hardware implementation may support some but not 4929 * all commands, which can be determined by checking the Enhanced 4930 * Command Capability Register. 4931 * 4932 * Return values: 4933 * - 0: Command successful without any error; 4934 * - Negative: software error value; 4935 * - Nonzero positive: failure status code defined in Table 48. 4936 */ 4937 int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob) 4938 { 4939 unsigned long flags; 4940 u64 res; 4941 int ret; 4942 4943 if (!cap_ecmds(iommu->cap)) 4944 return -ENODEV; 4945 4946 raw_spin_lock_irqsave(&iommu->register_lock, flags); 4947 4948 res = dmar_readq(iommu->reg + DMAR_ECRSP_REG); 4949 if (res & DMA_ECMD_ECRSP_IP) { 4950 ret = -EBUSY; 4951 goto err; 4952 } 4953 4954 /* 4955 * Unconditionally write the operand B, because 4956 * - There is no side effect if an ecmd doesn't require an 4957 * operand B, but we set the register to some value. 4958 * - It's not invoked in any critical path. The extra MMIO 4959 * write doesn't bring any performance concerns. 4960 */ 4961 dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob); 4962 dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT)); 4963 4964 IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq, 4965 !(res & DMA_ECMD_ECRSP_IP), res); 4966 4967 if (res & DMA_ECMD_ECRSP_IP) { 4968 ret = -ETIMEDOUT; 4969 goto err; 4970 } 4971 4972 ret = ecmd_get_status_code(res); 4973 err: 4974 raw_spin_unlock_irqrestore(&iommu->register_lock, flags); 4975 4976 return ret; 4977 } 4978
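/*
 * Illustrative use only (loosely modeled on how the perfmon code drives
 * this interface; DMA_ECMD_FREEZE refers to the existing command encoding,
 * nothing new is introduced here):
 *
 *	ret = ecmd_submit_sync(iommu, DMA_ECMD_FREEZE, 0, 0);
 *	if (ret < 0)
 *		// software error such as -ENODEV, -EBUSY or -ETIMEDOUT
 *	else if (ret > 0)
 *		// hardware failure status code as defined in Table 48
 */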