1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright © 2006-2014 Intel Corporation. 4 * 5 * Authors: David Woodhouse <dwmw2@infradead.org>, 6 * Ashok Raj <ashok.raj@intel.com>, 7 * Shaohua Li <shaohua.li@intel.com>, 8 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>, 9 * Fenghua Yu <fenghua.yu@intel.com> 10 * Joerg Roedel <jroedel@suse.de> 11 */ 12 13 #define pr_fmt(fmt) "DMAR: " fmt 14 #define dev_fmt(fmt) pr_fmt(fmt) 15 16 #include <linux/crash_dump.h> 17 #include <linux/dma-direct.h> 18 #include <linux/dmi.h> 19 #include <linux/memory.h> 20 #include <linux/pci.h> 21 #include <linux/pci-ats.h> 22 #include <linux/spinlock.h> 23 #include <linux/syscore_ops.h> 24 #include <linux/tboot.h> 25 #include <uapi/linux/iommufd.h> 26 27 #include "iommu.h" 28 #include "../dma-iommu.h" 29 #include "../irq_remapping.h" 30 #include "../iommu-sva.h" 31 #include "pasid.h" 32 #include "cap_audit.h" 33 #include "perfmon.h" 34 35 #define ROOT_SIZE VTD_PAGE_SIZE 36 #define CONTEXT_SIZE VTD_PAGE_SIZE 37 38 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY) 39 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB) 40 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) 41 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e) 42 43 #define IOAPIC_RANGE_START (0xfee00000) 44 #define IOAPIC_RANGE_END (0xfeefffff) 45 #define IOVA_START_ADDR (0x1000) 46 47 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57 48 49 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1) 50 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1) 51 52 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR 53 to match. That way, we can use 'unsigned long' for PFNs with impunity. */ 54 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \ 55 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1)) 56 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT) 57 58 /* IO virtual address start page frame number */ 59 #define IOVA_START_PFN (1) 60 61 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT) 62 63 static void __init check_tylersburg_isoch(void); 64 static int rwbf_quirk; 65 66 /* 67 * set to 1 to panic the kernel if VT-d can't be successfully enabled 68 * (used when kernel is launched w/ TXT) 69 */ 70 static int force_on = 0; 71 static int intel_iommu_tboot_noforce; 72 static int no_platform_optin; 73 74 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry)) 75 76 /* 77 * Take a root_entry and return the Lower Context Table Pointer (LCTP) 78 * if marked present. 79 */ 80 static phys_addr_t root_entry_lctp(struct root_entry *re) 81 { 82 if (!(re->lo & 1)) 83 return 0; 84 85 return re->lo & VTD_PAGE_MASK; 86 } 87 88 /* 89 * Take a root_entry and return the Upper Context Table Pointer (UCTP) 90 * if marked present. 91 */ 92 static phys_addr_t root_entry_uctp(struct root_entry *re) 93 { 94 if (!(re->hi & 1)) 95 return 0; 96 97 return re->hi & VTD_PAGE_MASK; 98 } 99 100 /* 101 * This domain is a static identity mapping domain. 102 * 1. This domain creates a static 1:1 mapping to all usable memory. 103 * 2. It maps to each iommu if successful. 104 * 3. Each iommu maps to this domain if successful.
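* Note: when hardware pass-through is available (hw_pass_through below), domain_context_mapping_one() programs this domain with CONTEXT_TT_PASS_THROUGH, so DMA from attached devices bypasses the static 1:1 page table entirely.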
105 */ 106 static struct dmar_domain *si_domain; 107 static int hw_pass_through = 1; 108 109 struct dmar_rmrr_unit { 110 struct list_head list; /* list of rmrr units */ 111 struct acpi_dmar_header *hdr; /* ACPI header */ 112 u64 base_address; /* reserved base address*/ 113 u64 end_address; /* reserved end address */ 114 struct dmar_dev_scope *devices; /* target devices */ 115 int devices_cnt; /* target device count */ 116 }; 117 118 struct dmar_atsr_unit { 119 struct list_head list; /* list of ATSR units */ 120 struct acpi_dmar_header *hdr; /* ACPI header */ 121 struct dmar_dev_scope *devices; /* target devices */ 122 int devices_cnt; /* target device count */ 123 u8 include_all:1; /* include all ports */ 124 }; 125 126 struct dmar_satc_unit { 127 struct list_head list; /* list of SATC units */ 128 struct acpi_dmar_header *hdr; /* ACPI header */ 129 struct dmar_dev_scope *devices; /* target devices */ 130 struct intel_iommu *iommu; /* the corresponding iommu */ 131 int devices_cnt; /* target device count */ 132 u8 atc_required:1; /* ATS is required */ 133 }; 134 135 static LIST_HEAD(dmar_atsr_units); 136 static LIST_HEAD(dmar_rmrr_units); 137 static LIST_HEAD(dmar_satc_units); 138 139 #define for_each_rmrr_units(rmrr) \ 140 list_for_each_entry(rmrr, &dmar_rmrr_units, list) 141 142 static void intel_iommu_domain_free(struct iommu_domain *domain); 143 144 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON); 145 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON); 146 147 int intel_iommu_enabled = 0; 148 EXPORT_SYMBOL_GPL(intel_iommu_enabled); 149 150 static int dmar_map_gfx = 1; 151 static int intel_iommu_superpage = 1; 152 static int iommu_identity_mapping; 153 static int iommu_skip_te_disable; 154 155 #define IDENTMAP_GFX 2 156 #define IDENTMAP_AZALIA 4 157 158 const struct iommu_ops intel_iommu_ops; 159 static const struct iommu_dirty_ops intel_dirty_ops; 160 161 static bool translation_pre_enabled(struct intel_iommu *iommu) 162 { 163 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED); 164 } 165 166 static void clear_translation_pre_enabled(struct intel_iommu *iommu) 167 { 168 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED; 169 } 170 171 static void init_translation_status(struct intel_iommu *iommu) 172 { 173 u32 gsts; 174 175 gsts = readl(iommu->reg + DMAR_GSTS_REG); 176 if (gsts & DMA_GSTS_TES) 177 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED; 178 } 179 180 static int __init intel_iommu_setup(char *str) 181 { 182 if (!str) 183 return -EINVAL; 184 185 while (*str) { 186 if (!strncmp(str, "on", 2)) { 187 dmar_disabled = 0; 188 pr_info("IOMMU enabled\n"); 189 } else if (!strncmp(str, "off", 3)) { 190 dmar_disabled = 1; 191 no_platform_optin = 1; 192 pr_info("IOMMU disabled\n"); 193 } else if (!strncmp(str, "igfx_off", 8)) { 194 dmar_map_gfx = 0; 195 pr_info("Disable GFX device mapping\n"); 196 } else if (!strncmp(str, "forcedac", 8)) { 197 pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n"); 198 iommu_dma_forcedac = true; 199 } else if (!strncmp(str, "strict", 6)) { 200 pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n"); 201 iommu_set_dma_strict(); 202 } else if (!strncmp(str, "sp_off", 6)) { 203 pr_info("Disable supported super page\n"); 204 intel_iommu_superpage = 0; 205 } else if (!strncmp(str, "sm_on", 5)) { 206 pr_info("Enable scalable mode if hardware supports\n"); 207 intel_iommu_sm = 1; 208 } else if (!strncmp(str, "sm_off", 6)) { 209 pr_info("Scalable mode is disallowed\n"); 210 intel_iommu_sm = 0; 211 } else 
if (!strncmp(str, "tboot_noforce", 13)) { 212 pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n"); 213 intel_iommu_tboot_noforce = 1; 214 } else { 215 pr_notice("Unknown option - '%s'\n", str); 216 } 217 218 str += strcspn(str, ","); 219 while (*str == ',') 220 str++; 221 } 222 223 return 1; 224 } 225 __setup("intel_iommu=", intel_iommu_setup); 226 227 void *alloc_pgtable_page(int node, gfp_t gfp) 228 { 229 struct page *page; 230 void *vaddr = NULL; 231 232 page = alloc_pages_node(node, gfp | __GFP_ZERO, 0); 233 if (page) 234 vaddr = page_address(page); 235 return vaddr; 236 } 237 238 void free_pgtable_page(void *vaddr) 239 { 240 free_page((unsigned long)vaddr); 241 } 242 243 static int domain_type_is_si(struct dmar_domain *domain) 244 { 245 return domain->domain.type == IOMMU_DOMAIN_IDENTITY; 246 } 247 248 static int domain_pfn_supported(struct dmar_domain *domain, unsigned long pfn) 249 { 250 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT; 251 252 return !(addr_width < BITS_PER_LONG && pfn >> addr_width); 253 } 254 255 /* 256 * Calculate the Supported Adjusted Guest Address Widths of an IOMMU. 257 * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of 258 * the returned SAGAW. 259 */ 260 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu) 261 { 262 unsigned long fl_sagaw, sl_sagaw; 263 264 fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0); 265 sl_sagaw = cap_sagaw(iommu->cap); 266 267 /* Second level only. */ 268 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) 269 return sl_sagaw; 270 271 /* First level only. */ 272 if (!ecap_slts(iommu->ecap)) 273 return fl_sagaw; 274 275 return fl_sagaw & sl_sagaw; 276 } 277 278 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw) 279 { 280 unsigned long sagaw; 281 int agaw; 282 283 sagaw = __iommu_calculate_sagaw(iommu); 284 for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) { 285 if (test_bit(agaw, &sagaw)) 286 break; 287 } 288 289 return agaw; 290 } 291 292 /* 293 * Calculate max SAGAW for each iommu. 294 */ 295 int iommu_calculate_max_sagaw(struct intel_iommu *iommu) 296 { 297 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH); 298 } 299 300 /* 301 * calculate agaw for each iommu. 302 * "SAGAW" may be different across iommus, use a default agaw, and 303 * get a supported less agaw for iommus that don't support the default agaw. 304 */ 305 int iommu_calculate_agaw(struct intel_iommu *iommu) 306 { 307 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH); 308 } 309 310 static bool iommu_paging_structure_coherency(struct intel_iommu *iommu) 311 { 312 return sm_supported(iommu) ? 
313 ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap); 314 } 315 316 static void domain_update_iommu_coherency(struct dmar_domain *domain) 317 { 318 struct iommu_domain_info *info; 319 struct dmar_drhd_unit *drhd; 320 struct intel_iommu *iommu; 321 bool found = false; 322 unsigned long i; 323 324 domain->iommu_coherency = true; 325 xa_for_each(&domain->iommu_array, i, info) { 326 found = true; 327 if (!iommu_paging_structure_coherency(info->iommu)) { 328 domain->iommu_coherency = false; 329 break; 330 } 331 } 332 if (found) 333 return; 334 335 /* No hardware attached; use lowest common denominator */ 336 rcu_read_lock(); 337 for_each_active_iommu(iommu, drhd) { 338 if (!iommu_paging_structure_coherency(iommu)) { 339 domain->iommu_coherency = false; 340 break; 341 } 342 } 343 rcu_read_unlock(); 344 } 345 346 static int domain_update_iommu_superpage(struct dmar_domain *domain, 347 struct intel_iommu *skip) 348 { 349 struct dmar_drhd_unit *drhd; 350 struct intel_iommu *iommu; 351 int mask = 0x3; 352 353 if (!intel_iommu_superpage) 354 return 0; 355 356 /* set iommu_superpage to the smallest common denominator */ 357 rcu_read_lock(); 358 for_each_active_iommu(iommu, drhd) { 359 if (iommu != skip) { 360 if (domain && domain->use_first_level) { 361 if (!cap_fl1gp_support(iommu->cap)) 362 mask = 0x1; 363 } else { 364 mask &= cap_super_page_val(iommu->cap); 365 } 366 367 if (!mask) 368 break; 369 } 370 } 371 rcu_read_unlock(); 372 373 return fls(mask); 374 } 375 376 static int domain_update_device_node(struct dmar_domain *domain) 377 { 378 struct device_domain_info *info; 379 int nid = NUMA_NO_NODE; 380 unsigned long flags; 381 382 spin_lock_irqsave(&domain->lock, flags); 383 list_for_each_entry(info, &domain->devices, link) { 384 /* 385 * There could possibly be multiple device numa nodes as devices 386 * within the same domain may sit behind different IOMMUs. There 387 * isn't perfect answer in such situation, so we select first 388 * come first served policy. 389 */ 390 nid = dev_to_node(info->dev); 391 if (nid != NUMA_NO_NODE) 392 break; 393 } 394 spin_unlock_irqrestore(&domain->lock, flags); 395 396 return nid; 397 } 398 399 /* Return the super pagesize bitmap if supported. */ 400 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain) 401 { 402 unsigned long bitmap = 0; 403 404 /* 405 * 1-level super page supports page size of 2MiB, 2-level super page 406 * supports page size of both 2MiB and 1GiB. 407 */ 408 if (domain->iommu_superpage == 1) 409 bitmap |= SZ_2M; 410 else if (domain->iommu_superpage == 2) 411 bitmap |= SZ_2M | SZ_1G; 412 413 return bitmap; 414 } 415 416 /* Some capabilities may be different across iommus */ 417 void domain_update_iommu_cap(struct dmar_domain *domain) 418 { 419 domain_update_iommu_coherency(domain); 420 domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL); 421 422 /* 423 * If RHSA is missing, we should default to the device numa domain 424 * as fall back. 425 */ 426 if (domain->nid == NUMA_NO_NODE) 427 domain->nid = domain_update_device_node(domain); 428 429 /* 430 * First-level translation restricts the input-address to a 431 * canonical address (i.e., address bits 63:N have the same 432 * value as address bit [N-1], where N is 48-bits with 4-level 433 * paging and 57-bits with 5-level paging). Hence, skip bit 434 * [N-1]. 
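* For example, with a 48-bit GAW and 4-level paging the first-level aperture below ends at __DOMAIN_MAX_ADDR(47) = (1ULL << 47) - 1 rather than (1ULL << 48) - 1.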
435 */ 436 if (domain->use_first_level) 437 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1); 438 else 439 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw); 440 441 domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain); 442 domain_update_iotlb(domain); 443 } 444 445 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus, 446 u8 devfn, int alloc) 447 { 448 struct root_entry *root = &iommu->root_entry[bus]; 449 struct context_entry *context; 450 u64 *entry; 451 452 /* 453 * Unless the caller requested to allocate a new entry, 454 * returning a copied context entry makes no sense. 455 */ 456 if (!alloc && context_copied(iommu, bus, devfn)) 457 return NULL; 458 459 entry = &root->lo; 460 if (sm_supported(iommu)) { 461 if (devfn >= 0x80) { 462 devfn -= 0x80; 463 entry = &root->hi; 464 } 465 devfn *= 2; 466 } 467 if (*entry & 1) 468 context = phys_to_virt(*entry & VTD_PAGE_MASK); 469 else { 470 unsigned long phy_addr; 471 if (!alloc) 472 return NULL; 473 474 context = alloc_pgtable_page(iommu->node, GFP_ATOMIC); 475 if (!context) 476 return NULL; 477 478 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE); 479 phy_addr = virt_to_phys((void *)context); 480 *entry = phy_addr | 1; 481 __iommu_flush_cache(iommu, entry, sizeof(*entry)); 482 } 483 return &context[devfn]; 484 } 485 486 /** 487 * is_downstream_to_pci_bridge - test if a device belongs to the PCI 488 * sub-hierarchy of a candidate PCI-PCI bridge 489 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy 490 * @bridge: the candidate PCI-PCI bridge 491 * 492 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false. 493 */ 494 static bool 495 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge) 496 { 497 struct pci_dev *pdev, *pbridge; 498 499 if (!dev_is_pci(dev) || !dev_is_pci(bridge)) 500 return false; 501 502 pdev = to_pci_dev(dev); 503 pbridge = to_pci_dev(bridge); 504 505 if (pbridge->subordinate && 506 pbridge->subordinate->number <= pdev->bus->number && 507 pbridge->subordinate->busn_res.end >= pdev->bus->number) 508 return true; 509 510 return false; 511 } 512 513 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev) 514 { 515 struct dmar_drhd_unit *drhd; 516 u32 vtbar; 517 int rc; 518 519 /* We know that this device on this chipset has its own IOMMU. 520 * If we find it under a different IOMMU, then the BIOS is lying 521 * to us. Hope that the IOMMU for this device is actually 522 * disabled, and it needs no translation...
523 */ 524 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar); 525 if (rc) { 526 /* "can't" happen */ 527 dev_info(&pdev->dev, "failed to run vt-d quirk\n"); 528 return false; 529 } 530 vtbar &= 0xffff0000; 531 532 /* we know that the this iommu should be at offset 0xa000 from vtbar */ 533 drhd = dmar_find_matched_drhd_unit(pdev); 534 if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) { 535 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"); 536 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); 537 return true; 538 } 539 540 return false; 541 } 542 543 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev) 544 { 545 if (!iommu || iommu->drhd->ignored) 546 return true; 547 548 if (dev_is_pci(dev)) { 549 struct pci_dev *pdev = to_pci_dev(dev); 550 551 if (pdev->vendor == PCI_VENDOR_ID_INTEL && 552 pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB && 553 quirk_ioat_snb_local_iommu(pdev)) 554 return true; 555 } 556 557 return false; 558 } 559 560 static struct intel_iommu *device_lookup_iommu(struct device *dev, u8 *bus, u8 *devfn) 561 { 562 struct dmar_drhd_unit *drhd = NULL; 563 struct pci_dev *pdev = NULL; 564 struct intel_iommu *iommu; 565 struct device *tmp; 566 u16 segment = 0; 567 int i; 568 569 if (!dev) 570 return NULL; 571 572 if (dev_is_pci(dev)) { 573 struct pci_dev *pf_pdev; 574 575 pdev = pci_real_dma_dev(to_pci_dev(dev)); 576 577 /* VFs aren't listed in scope tables; we need to look up 578 * the PF instead to find the IOMMU. */ 579 pf_pdev = pci_physfn(pdev); 580 dev = &pf_pdev->dev; 581 segment = pci_domain_nr(pdev->bus); 582 } else if (has_acpi_companion(dev)) 583 dev = &ACPI_COMPANION(dev)->dev; 584 585 rcu_read_lock(); 586 for_each_iommu(iommu, drhd) { 587 if (pdev && segment != drhd->segment) 588 continue; 589 590 for_each_active_dev_scope(drhd->devices, 591 drhd->devices_cnt, i, tmp) { 592 if (tmp == dev) { 593 /* For a VF use its original BDF# not that of the PF 594 * which we used for the IOMMU lookup. Strictly speaking 595 * we could do this for all PCI devices; we only need to 596 * get the BDF# from the scope table for ACPI matches. 
*/ 597 if (pdev && pdev->is_virtfn) 598 goto got_pdev; 599 600 if (bus && devfn) { 601 *bus = drhd->devices[i].bus; 602 *devfn = drhd->devices[i].devfn; 603 } 604 goto out; 605 } 606 607 if (is_downstream_to_pci_bridge(dev, tmp)) 608 goto got_pdev; 609 } 610 611 if (pdev && drhd->include_all) { 612 got_pdev: 613 if (bus && devfn) { 614 *bus = pdev->bus->number; 615 *devfn = pdev->devfn; 616 } 617 goto out; 618 } 619 } 620 iommu = NULL; 621 out: 622 if (iommu_is_dummy(iommu, dev)) 623 iommu = NULL; 624 625 rcu_read_unlock(); 626 627 return iommu; 628 } 629 630 static void domain_flush_cache(struct dmar_domain *domain, 631 void *addr, int size) 632 { 633 if (!domain->iommu_coherency) 634 clflush_cache_range(addr, size); 635 } 636 637 static void free_context_table(struct intel_iommu *iommu) 638 { 639 struct context_entry *context; 640 int i; 641 642 if (!iommu->root_entry) 643 return; 644 645 for (i = 0; i < ROOT_ENTRY_NR; i++) { 646 context = iommu_context_addr(iommu, i, 0, 0); 647 if (context) 648 free_pgtable_page(context); 649 650 if (!sm_supported(iommu)) 651 continue; 652 653 context = iommu_context_addr(iommu, i, 0x80, 0); 654 if (context) 655 free_pgtable_page(context); 656 } 657 658 free_pgtable_page(iommu->root_entry); 659 iommu->root_entry = NULL; 660 } 661 662 #ifdef CONFIG_DMAR_DEBUG 663 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn, 664 u8 bus, u8 devfn, struct dma_pte *parent, int level) 665 { 666 struct dma_pte *pte; 667 int offset; 668 669 while (1) { 670 offset = pfn_level_offset(pfn, level); 671 pte = &parent[offset]; 672 if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) { 673 pr_info("PTE not present at level %d\n", level); 674 break; 675 } 676 677 pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val); 678 679 if (level == 1) 680 break; 681 682 parent = phys_to_virt(dma_pte_addr(pte)); 683 level--; 684 } 685 } 686 687 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id, 688 unsigned long long addr, u32 pasid) 689 { 690 struct pasid_dir_entry *dir, *pde; 691 struct pasid_entry *entries, *pte; 692 struct context_entry *ctx_entry; 693 struct root_entry *rt_entry; 694 int i, dir_index, index, level; 695 u8 devfn = source_id & 0xff; 696 u8 bus = source_id >> 8; 697 struct dma_pte *pgtable; 698 699 pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr); 700 701 /* root entry dump */ 702 rt_entry = &iommu->root_entry[bus]; 703 if (!rt_entry) { 704 pr_info("root table entry is not present\n"); 705 return; 706 } 707 708 if (sm_supported(iommu)) 709 pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n", 710 rt_entry->hi, rt_entry->lo); 711 else 712 pr_info("root entry: 0x%016llx", rt_entry->lo); 713 714 /* context entry dump */ 715 ctx_entry = iommu_context_addr(iommu, bus, devfn, 0); 716 if (!ctx_entry) { 717 pr_info("context table entry is not present\n"); 718 return; 719 } 720 721 pr_info("context entry: hi 0x%016llx, low 0x%016llx\n", 722 ctx_entry->hi, ctx_entry->lo); 723 724 /* legacy mode does not require PASID entries */ 725 if (!sm_supported(iommu)) { 726 level = agaw_to_level(ctx_entry->hi & 7); 727 pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK); 728 goto pgtable_walk; 729 } 730 731 /* get the pointer to pasid directory entry */ 732 dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK); 733 if (!dir) { 734 pr_info("pasid directory entry is not present\n"); 735 return; 736 } 737 /* For request-without-pasid, get the pasid from context entry */ 738 if (intel_iommu_sm && pasid == 
IOMMU_PASID_INVALID) 739 pasid = IOMMU_NO_PASID; 740 741 dir_index = pasid >> PASID_PDE_SHIFT; 742 pde = &dir[dir_index]; 743 pr_info("pasid dir entry: 0x%016llx\n", pde->val); 744 745 /* get the pointer to the pasid table entry */ 746 entries = get_pasid_table_from_pde(pde); 747 if (!entries) { 748 pr_info("pasid table entry is not present\n"); 749 return; 750 } 751 index = pasid & PASID_PTE_MASK; 752 pte = &entries[index]; 753 for (i = 0; i < ARRAY_SIZE(pte->val); i++) 754 pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]); 755 756 if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) { 757 level = pte->val[2] & BIT_ULL(2) ? 5 : 4; 758 pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK); 759 } else { 760 level = agaw_to_level((pte->val[0] >> 2) & 0x7); 761 pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK); 762 } 763 764 pgtable_walk: 765 pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level); 766 } 767 #endif 768 769 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain, 770 unsigned long pfn, int *target_level, 771 gfp_t gfp) 772 { 773 struct dma_pte *parent, *pte; 774 int level = agaw_to_level(domain->agaw); 775 int offset; 776 777 if (!domain_pfn_supported(domain, pfn)) 778 /* Address beyond IOMMU's addressing capabilities. */ 779 return NULL; 780 781 parent = domain->pgd; 782 783 while (1) { 784 void *tmp_page; 785 786 offset = pfn_level_offset(pfn, level); 787 pte = &parent[offset]; 788 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte))) 789 break; 790 if (level == *target_level) 791 break; 792 793 if (!dma_pte_present(pte)) { 794 uint64_t pteval; 795 796 tmp_page = alloc_pgtable_page(domain->nid, gfp); 797 798 if (!tmp_page) 799 return NULL; 800 801 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE); 802 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE; 803 if (domain->use_first_level) 804 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS; 805 806 if (cmpxchg64(&pte->val, 0ULL, pteval)) 807 /* Someone else set it while we were thinking; use theirs. 
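* The lock-free cmpxchg64() above lets concurrent mappers race to install the same intermediate table; the loser simply frees its freshly allocated page below.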
*/ 808 free_pgtable_page(tmp_page); 809 else 810 domain_flush_cache(domain, pte, sizeof(*pte)); 811 } 812 if (level == 1) 813 break; 814 815 parent = phys_to_virt(dma_pte_addr(pte)); 816 level--; 817 } 818 819 if (!*target_level) 820 *target_level = level; 821 822 return pte; 823 } 824 825 /* return address's pte at specific level */ 826 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain, 827 unsigned long pfn, 828 int level, int *large_page) 829 { 830 struct dma_pte *parent, *pte; 831 int total = agaw_to_level(domain->agaw); 832 int offset; 833 834 parent = domain->pgd; 835 while (level <= total) { 836 offset = pfn_level_offset(pfn, total); 837 pte = &parent[offset]; 838 if (level == total) 839 return pte; 840 841 if (!dma_pte_present(pte)) { 842 *large_page = total; 843 break; 844 } 845 846 if (dma_pte_superpage(pte)) { 847 *large_page = total; 848 return pte; 849 } 850 851 parent = phys_to_virt(dma_pte_addr(pte)); 852 total--; 853 } 854 return NULL; 855 } 856 857 /* clear last level pte, a tlb flush should be followed */ 858 static void dma_pte_clear_range(struct dmar_domain *domain, 859 unsigned long start_pfn, 860 unsigned long last_pfn) 861 { 862 unsigned int large_page; 863 struct dma_pte *first_pte, *pte; 864 865 if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) || 866 WARN_ON(start_pfn > last_pfn)) 867 return; 868 869 /* we don't need lock here; nobody else touches the iova range */ 870 do { 871 large_page = 1; 872 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page); 873 if (!pte) { 874 start_pfn = align_to_level(start_pfn + 1, large_page + 1); 875 continue; 876 } 877 do { 878 dma_clear_pte(pte); 879 start_pfn += lvl_to_nr_pages(large_page); 880 pte++; 881 } while (start_pfn <= last_pfn && !first_pte_in_page(pte)); 882 883 domain_flush_cache(domain, first_pte, 884 (void *)pte - (void *)first_pte); 885 886 } while (start_pfn && start_pfn <= last_pfn); 887 } 888 889 static void dma_pte_free_level(struct dmar_domain *domain, int level, 890 int retain_level, struct dma_pte *pte, 891 unsigned long pfn, unsigned long start_pfn, 892 unsigned long last_pfn) 893 { 894 pfn = max(start_pfn, pfn); 895 pte = &pte[pfn_level_offset(pfn, level)]; 896 897 do { 898 unsigned long level_pfn; 899 struct dma_pte *level_pte; 900 901 if (!dma_pte_present(pte) || dma_pte_superpage(pte)) 902 goto next; 903 904 level_pfn = pfn & level_mask(level); 905 level_pte = phys_to_virt(dma_pte_addr(pte)); 906 907 if (level > 2) { 908 dma_pte_free_level(domain, level - 1, retain_level, 909 level_pte, level_pfn, start_pfn, 910 last_pfn); 911 } 912 913 /* 914 * Free the page table if we're below the level we want to 915 * retain and the range covers the entire table. 916 */ 917 if (level < retain_level && !(start_pfn > level_pfn || 918 last_pfn < level_pfn + level_size(level) - 1)) { 919 dma_clear_pte(pte); 920 domain_flush_cache(domain, pte, sizeof(*pte)); 921 free_pgtable_page(level_pte); 922 } 923 next: 924 pfn += level_size(level); 925 } while (!first_pte_in_page(++pte) && pfn <= last_pfn); 926 } 927 928 /* 929 * clear last level (leaf) ptes and free page table pages below the 930 * level we wish to keep intact. 
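* Here retain_level is the lowest level whose page tables must survive: dma_pte_free_level() frees a table only when its level is below retain_level and the [start_pfn, last_pfn] range covers that entire table.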
931 */ 932 static void dma_pte_free_pagetable(struct dmar_domain *domain, 933 unsigned long start_pfn, 934 unsigned long last_pfn, 935 int retain_level) 936 { 937 dma_pte_clear_range(domain, start_pfn, last_pfn); 938 939 /* We don't need lock here; nobody else touches the iova range */ 940 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level, 941 domain->pgd, 0, start_pfn, last_pfn); 942 943 /* free pgd */ 944 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) { 945 free_pgtable_page(domain->pgd); 946 domain->pgd = NULL; 947 } 948 } 949 950 /* When a page at a given level is being unlinked from its parent, we don't 951 need to *modify* it at all. All we need to do is make a list of all the 952 pages which can be freed just as soon as we've flushed the IOTLB and we 953 know the hardware page-walk will no longer touch them. 954 The 'pte' argument is the *parent* PTE, pointing to the page that is to 955 be freed. */ 956 static void dma_pte_list_pagetables(struct dmar_domain *domain, 957 int level, struct dma_pte *pte, 958 struct list_head *freelist) 959 { 960 struct page *pg; 961 962 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT); 963 list_add_tail(&pg->lru, freelist); 964 965 if (level == 1) 966 return; 967 968 pte = page_address(pg); 969 do { 970 if (dma_pte_present(pte) && !dma_pte_superpage(pte)) 971 dma_pte_list_pagetables(domain, level - 1, pte, freelist); 972 pte++; 973 } while (!first_pte_in_page(pte)); 974 } 975 976 static void dma_pte_clear_level(struct dmar_domain *domain, int level, 977 struct dma_pte *pte, unsigned long pfn, 978 unsigned long start_pfn, unsigned long last_pfn, 979 struct list_head *freelist) 980 { 981 struct dma_pte *first_pte = NULL, *last_pte = NULL; 982 983 pfn = max(start_pfn, pfn); 984 pte = &pte[pfn_level_offset(pfn, level)]; 985 986 do { 987 unsigned long level_pfn = pfn & level_mask(level); 988 989 if (!dma_pte_present(pte)) 990 goto next; 991 992 /* If range covers entire pagetable, free it */ 993 if (start_pfn <= level_pfn && 994 last_pfn >= level_pfn + level_size(level) - 1) { 995 /* These subordinate page tables are going away entirely. Don't 996 bother to clear them; we're just going to *free* them. */ 997 if (level > 1 && !dma_pte_superpage(pte)) 998 dma_pte_list_pagetables(domain, level - 1, pte, freelist); 999 1000 dma_clear_pte(pte); 1001 if (!first_pte) 1002 first_pte = pte; 1003 last_pte = pte; 1004 } else if (level > 1) { 1005 /* Recurse down into a level that isn't *entirely* obsolete */ 1006 dma_pte_clear_level(domain, level - 1, 1007 phys_to_virt(dma_pte_addr(pte)), 1008 level_pfn, start_pfn, last_pfn, 1009 freelist); 1010 } 1011 next: 1012 pfn = level_pfn + level_size(level); 1013 } while (!first_pte_in_page(++pte) && pfn <= last_pfn); 1014 1015 if (first_pte) 1016 domain_flush_cache(domain, first_pte, 1017 (void *)++last_pte - (void *)first_pte); 1018 } 1019 1020 /* We can't just free the pages because the IOMMU may still be walking 1021 the page tables, and may have cached the intermediate levels. The 1022 pages can only be freed after the IOTLB flush has been done.
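* Callers therefore gather the pages on a freelist and release them with put_pages_list() only once it is safe to do so.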
*/ 1023 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn, 1024 unsigned long last_pfn, struct list_head *freelist) 1025 { 1026 if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) || 1027 WARN_ON(start_pfn > last_pfn)) 1028 return; 1029 1030 /* we don't need lock here; nobody else touches the iova range */ 1031 dma_pte_clear_level(domain, agaw_to_level(domain->agaw), 1032 domain->pgd, 0, start_pfn, last_pfn, freelist); 1033 1034 /* free pgd */ 1035 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) { 1036 struct page *pgd_page = virt_to_page(domain->pgd); 1037 list_add_tail(&pgd_page->lru, freelist); 1038 domain->pgd = NULL; 1039 } 1040 } 1041 1042 /* iommu handling */ 1043 static int iommu_alloc_root_entry(struct intel_iommu *iommu) 1044 { 1045 struct root_entry *root; 1046 1047 root = alloc_pgtable_page(iommu->node, GFP_ATOMIC); 1048 if (!root) { 1049 pr_err("Allocating root entry for %s failed\n", 1050 iommu->name); 1051 return -ENOMEM; 1052 } 1053 1054 __iommu_flush_cache(iommu, root, ROOT_SIZE); 1055 iommu->root_entry = root; 1056 1057 return 0; 1058 } 1059 1060 static void iommu_set_root_entry(struct intel_iommu *iommu) 1061 { 1062 u64 addr; 1063 u32 sts; 1064 unsigned long flag; 1065 1066 addr = virt_to_phys(iommu->root_entry); 1067 if (sm_supported(iommu)) 1068 addr |= DMA_RTADDR_SMT; 1069 1070 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1071 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr); 1072 1073 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG); 1074 1075 /* Make sure hardware complete it */ 1076 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1077 readl, (sts & DMA_GSTS_RTPS), sts); 1078 1079 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1080 1081 /* 1082 * Hardware invalidates all DMA remapping hardware translation 1083 * caches as part of SRTP flow. 
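* This behaviour is advertised by the ESRTPS capability; when it is absent, the explicit context-cache, PASID-cache and IOTLB flushes below are still required.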
1084 */ 1085 if (cap_esrtps(iommu->cap)) 1086 return; 1087 1088 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL); 1089 if (sm_supported(iommu)) 1090 qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0); 1091 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); 1092 } 1093 1094 void iommu_flush_write_buffer(struct intel_iommu *iommu) 1095 { 1096 u32 val; 1097 unsigned long flag; 1098 1099 if (!rwbf_quirk && !cap_rwbf(iommu->cap)) 1100 return; 1101 1102 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1103 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG); 1104 1105 /* Make sure hardware complete it */ 1106 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1107 readl, (!(val & DMA_GSTS_WBFS)), val); 1108 1109 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1110 } 1111 1112 /* return value determines whether we need a write buffer flush */ 1113 static void __iommu_flush_context(struct intel_iommu *iommu, 1114 u16 did, u16 source_id, u8 function_mask, 1115 u64 type) 1116 { 1117 u64 val = 0; 1118 unsigned long flag; 1119 1120 switch (type) { 1121 case DMA_CCMD_GLOBAL_INVL: 1122 val = DMA_CCMD_GLOBAL_INVL; 1123 break; 1124 case DMA_CCMD_DOMAIN_INVL: 1125 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did); 1126 break; 1127 case DMA_CCMD_DEVICE_INVL: 1128 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did) 1129 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask); 1130 break; 1131 default: 1132 pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n", 1133 iommu->name, type); 1134 return; 1135 } 1136 val |= DMA_CCMD_ICC; 1137 1138 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1139 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val); 1140 1141 /* Make sure hardware complete it */ 1142 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG, 1143 dmar_readq, (!(val & DMA_CCMD_ICC)), val); 1144 1145 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1146 } 1147 1148 /* return value determines whether we need a write buffer flush */ 1149 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, 1150 u64 addr, unsigned int size_order, u64 type) 1151 { 1152 int tlb_offset = ecap_iotlb_offset(iommu->ecap); 1153 u64 val = 0, val_iva = 0; 1154 unsigned long flag; 1155 1156 switch (type) { 1157 case DMA_TLB_GLOBAL_FLUSH: 1158 /* global flush doesn't need to set IVA_REG */ 1159 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT; 1160 break; 1161 case DMA_TLB_DSI_FLUSH: 1162 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did); 1163 break; 1164 case DMA_TLB_PSI_FLUSH: 1165 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did); 1166 /* IH bit is passed in as part of address */ 1167 val_iva = size_order | addr; 1168 break; 1169 default: 1170 pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n", 1171 iommu->name, type); 1172 return; 1173 } 1174 1175 if (cap_write_drain(iommu->cap)) 1176 val |= DMA_TLB_WRITE_DRAIN; 1177 1178 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1179 /* Note: Only uses first TLB reg currently */ 1180 if (val_iva) 1181 dmar_writeq(iommu->reg + tlb_offset, val_iva); 1182 dmar_writeq(iommu->reg + tlb_offset + 8, val); 1183 1184 /* Make sure hardware complete it */ 1185 IOMMU_WAIT_OP(iommu, tlb_offset + 8, 1186 dmar_readq, (!(val & DMA_TLB_IVT)), val); 1187 1188 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1189 1190 /* check IOTLB invalidation granularity */ 1191 if (DMA_TLB_IAIG(val) == 0) 1192 pr_err("Flush IOTLB failed\n"); 1193 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type)) 1194 pr_debug("TLB flush request %Lx, actual %Lx\n", 1195 (unsigned long long)DMA_TLB_IIRG(type),
1196 (unsigned long long)DMA_TLB_IAIG(val)); 1197 } 1198 1199 static struct device_domain_info * 1200 domain_lookup_dev_info(struct dmar_domain *domain, 1201 struct intel_iommu *iommu, u8 bus, u8 devfn) 1202 { 1203 struct device_domain_info *info; 1204 unsigned long flags; 1205 1206 spin_lock_irqsave(&domain->lock, flags); 1207 list_for_each_entry(info, &domain->devices, link) { 1208 if (info->iommu == iommu && info->bus == bus && 1209 info->devfn == devfn) { 1210 spin_unlock_irqrestore(&domain->lock, flags); 1211 return info; 1212 } 1213 } 1214 spin_unlock_irqrestore(&domain->lock, flags); 1215 1216 return NULL; 1217 } 1218 1219 void domain_update_iotlb(struct dmar_domain *domain) 1220 { 1221 struct dev_pasid_info *dev_pasid; 1222 struct device_domain_info *info; 1223 bool has_iotlb_device = false; 1224 unsigned long flags; 1225 1226 spin_lock_irqsave(&domain->lock, flags); 1227 list_for_each_entry(info, &domain->devices, link) { 1228 if (info->ats_enabled) { 1229 has_iotlb_device = true; 1230 break; 1231 } 1232 } 1233 1234 list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) { 1235 info = dev_iommu_priv_get(dev_pasid->dev); 1236 if (info->ats_enabled) { 1237 has_iotlb_device = true; 1238 break; 1239 } 1240 } 1241 domain->has_iotlb_device = has_iotlb_device; 1242 spin_unlock_irqrestore(&domain->lock, flags); 1243 } 1244 1245 /* 1246 * The extra devTLB flush quirk impacts those QAT devices with PCI device 1247 * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device() 1248 * check because it applies only to the built-in QAT devices and it doesn't 1249 * grant additional privileges. 1250 */ 1251 #define BUGGY_QAT_DEVID_MASK 0x4940 1252 static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev) 1253 { 1254 if (pdev->vendor != PCI_VENDOR_ID_INTEL) 1255 return false; 1256 1257 if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK) 1258 return false; 1259 1260 return true; 1261 } 1262 1263 static void iommu_enable_pci_caps(struct device_domain_info *info) 1264 { 1265 struct pci_dev *pdev; 1266 1267 if (!dev_is_pci(info->dev)) 1268 return; 1269 1270 pdev = to_pci_dev(info->dev); 1271 1272 /* The PCIe spec, in its wisdom, declares that the behaviour of 1273 the device if you enable PASID support after ATS support is 1274 undefined. So always enable PASID support on devices which 1275 have it, even if we can't yet know if we're ever going to 1276 use it. 
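* Hence PASID is enabled here first, before ATS is enabled below.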
*/ 1277 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1)) 1278 info->pasid_enabled = 1; 1279 1280 if (info->ats_supported && pci_ats_page_aligned(pdev) && 1281 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) { 1282 info->ats_enabled = 1; 1283 domain_update_iotlb(info->domain); 1284 } 1285 } 1286 1287 static void iommu_disable_pci_caps(struct device_domain_info *info) 1288 { 1289 struct pci_dev *pdev; 1290 1291 if (!dev_is_pci(info->dev)) 1292 return; 1293 1294 pdev = to_pci_dev(info->dev); 1295 1296 if (info->ats_enabled) { 1297 pci_disable_ats(pdev); 1298 info->ats_enabled = 0; 1299 domain_update_iotlb(info->domain); 1300 } 1301 1302 if (info->pasid_enabled) { 1303 pci_disable_pasid(pdev); 1304 info->pasid_enabled = 0; 1305 } 1306 } 1307 1308 static void __iommu_flush_dev_iotlb(struct device_domain_info *info, 1309 u64 addr, unsigned int mask) 1310 { 1311 u16 sid, qdep; 1312 1313 if (!info || !info->ats_enabled) 1314 return; 1315 1316 sid = info->bus << 8 | info->devfn; 1317 qdep = info->ats_qdep; 1318 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid, 1319 qdep, addr, mask); 1320 quirk_extra_dev_tlb_flush(info, addr, mask, IOMMU_NO_PASID, qdep); 1321 } 1322 1323 static void iommu_flush_dev_iotlb(struct dmar_domain *domain, 1324 u64 addr, unsigned mask) 1325 { 1326 struct dev_pasid_info *dev_pasid; 1327 struct device_domain_info *info; 1328 unsigned long flags; 1329 1330 if (!domain->has_iotlb_device) 1331 return; 1332 1333 spin_lock_irqsave(&domain->lock, flags); 1334 list_for_each_entry(info, &domain->devices, link) 1335 __iommu_flush_dev_iotlb(info, addr, mask); 1336 1337 list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) { 1338 info = dev_iommu_priv_get(dev_pasid->dev); 1339 1340 if (!info->ats_enabled) 1341 continue; 1342 1343 qi_flush_dev_iotlb_pasid(info->iommu, 1344 PCI_DEVID(info->bus, info->devfn), 1345 info->pfsid, dev_pasid->pasid, 1346 info->ats_qdep, addr, 1347 mask); 1348 } 1349 spin_unlock_irqrestore(&domain->lock, flags); 1350 } 1351 1352 static void domain_flush_pasid_iotlb(struct intel_iommu *iommu, 1353 struct dmar_domain *domain, u64 addr, 1354 unsigned long npages, bool ih) 1355 { 1356 u16 did = domain_id_iommu(domain, iommu); 1357 struct dev_pasid_info *dev_pasid; 1358 unsigned long flags; 1359 1360 spin_lock_irqsave(&domain->lock, flags); 1361 list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) 1362 qi_flush_piotlb(iommu, did, dev_pasid->pasid, addr, npages, ih); 1363 1364 if (!list_empty(&domain->devices)) 1365 qi_flush_piotlb(iommu, did, IOMMU_NO_PASID, addr, npages, ih); 1366 spin_unlock_irqrestore(&domain->lock, flags); 1367 } 1368 1369 static void __iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did, 1370 unsigned long pfn, unsigned int pages, 1371 int ih) 1372 { 1373 unsigned int aligned_pages = __roundup_pow_of_two(pages); 1374 unsigned long bitmask = aligned_pages - 1; 1375 unsigned int mask = ilog2(aligned_pages); 1376 u64 addr = (u64)pfn << VTD_PAGE_SHIFT; 1377 1378 /* 1379 * PSI masks the low order bits of the base address. If the 1380 * address isn't aligned to the mask, then compute a mask value 1381 * needed to ensure the target range is flushed. 1382 */ 1383 if (unlikely(bitmask & pfn)) { 1384 unsigned long end_pfn = pfn + pages - 1, shared_bits; 1385 1386 /* 1387 * Since end_pfn <= pfn + bitmask, the only way bits 1388 * higher than bitmask can differ in pfn and end_pfn is 1389 * by carrying. 
This means after masking out bitmask, 1390 * high bits starting with the first set bit in 1391 * shared_bits are all equal in both pfn and end_pfn. 1392 */ 1393 shared_bits = ~(pfn ^ end_pfn) & ~bitmask; 1394 mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG; 1395 } 1396 1397 /* 1398 * Fallback to domain selective flush if no PSI support or 1399 * the size is too big. 1400 */ 1401 if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap)) 1402 iommu->flush.flush_iotlb(iommu, did, 0, 0, 1403 DMA_TLB_DSI_FLUSH); 1404 else 1405 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask, 1406 DMA_TLB_PSI_FLUSH); 1407 } 1408 1409 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, 1410 struct dmar_domain *domain, 1411 unsigned long pfn, unsigned int pages, 1412 int ih, int map) 1413 { 1414 unsigned int aligned_pages = __roundup_pow_of_two(pages); 1415 unsigned int mask = ilog2(aligned_pages); 1416 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT; 1417 u16 did = domain_id_iommu(domain, iommu); 1418 1419 if (WARN_ON(!pages)) 1420 return; 1421 1422 if (ih) 1423 ih = 1 << 6; 1424 1425 if (domain->use_first_level) 1426 domain_flush_pasid_iotlb(iommu, domain, addr, pages, ih); 1427 else 1428 __iommu_flush_iotlb_psi(iommu, did, pfn, pages, ih); 1429 1430 /* 1431 * In caching mode, changes of pages from non-present to present require 1432 * flush. However, device IOTLB doesn't need to be flushed in this case. 1433 */ 1434 if (!cap_caching_mode(iommu->cap) || !map) 1435 iommu_flush_dev_iotlb(domain, addr, mask); 1436 } 1437 1438 /* Notification for newly created mappings */ 1439 static void __mapping_notify_one(struct intel_iommu *iommu, struct dmar_domain *domain, 1440 unsigned long pfn, unsigned int pages) 1441 { 1442 /* 1443 * It's a non-present to present mapping. Only flush if caching mode 1444 * and second level. 1445 */ 1446 if (cap_caching_mode(iommu->cap) && !domain->use_first_level) 1447 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1); 1448 else 1449 iommu_flush_write_buffer(iommu); 1450 } 1451 1452 /* 1453 * Flush the relevant caches in nested translation if the domain 1454 * also serves as a parent 1455 */ 1456 static void parent_domain_flush(struct dmar_domain *domain, 1457 unsigned long pfn, 1458 unsigned long pages, int ih) 1459 { 1460 struct dmar_domain *s1_domain; 1461 1462 spin_lock(&domain->s1_lock); 1463 list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) { 1464 struct device_domain_info *device_info; 1465 struct iommu_domain_info *info; 1466 unsigned long flags; 1467 unsigned long i; 1468 1469 xa_for_each(&s1_domain->iommu_array, i, info) 1470 __iommu_flush_iotlb_psi(info->iommu, info->did, 1471 pfn, pages, ih); 1472 1473 if (!s1_domain->has_iotlb_device) 1474 continue; 1475 1476 spin_lock_irqsave(&s1_domain->lock, flags); 1477 list_for_each_entry(device_info, &s1_domain->devices, link) 1478 /* 1479 * Address translation cache in device side caches the 1480 * result of nested translation. There is no easy way 1481 * to identify the exact set of nested translations 1482 * affected by a change in S2. So just flush the entire 1483 * device cache. 
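* Flushing address 0 with a MAX_AGAW_PFN_WIDTH mask, as below, covers the whole address space.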
1484 */ 1485 __iommu_flush_dev_iotlb(device_info, 0, 1486 MAX_AGAW_PFN_WIDTH); 1487 spin_unlock_irqrestore(&s1_domain->lock, flags); 1488 } 1489 spin_unlock(&domain->s1_lock); 1490 } 1491 1492 static void intel_flush_iotlb_all(struct iommu_domain *domain) 1493 { 1494 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 1495 struct iommu_domain_info *info; 1496 unsigned long idx; 1497 1498 xa_for_each(&dmar_domain->iommu_array, idx, info) { 1499 struct intel_iommu *iommu = info->iommu; 1500 u16 did = domain_id_iommu(dmar_domain, iommu); 1501 1502 if (dmar_domain->use_first_level) 1503 domain_flush_pasid_iotlb(iommu, dmar_domain, 0, -1, 0); 1504 else 1505 iommu->flush.flush_iotlb(iommu, did, 0, 0, 1506 DMA_TLB_DSI_FLUSH); 1507 1508 if (!cap_caching_mode(iommu->cap)) 1509 iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH); 1510 } 1511 1512 if (dmar_domain->nested_parent) 1513 parent_domain_flush(dmar_domain, 0, -1, 0); 1514 } 1515 1516 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu) 1517 { 1518 u32 pmen; 1519 unsigned long flags; 1520 1521 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap)) 1522 return; 1523 1524 raw_spin_lock_irqsave(&iommu->register_lock, flags); 1525 pmen = readl(iommu->reg + DMAR_PMEN_REG); 1526 pmen &= ~DMA_PMEN_EPM; 1527 writel(pmen, iommu->reg + DMAR_PMEN_REG); 1528 1529 /* wait for the protected region status bit to clear */ 1530 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG, 1531 readl, !(pmen & DMA_PMEN_PRS), pmen); 1532 1533 raw_spin_unlock_irqrestore(&iommu->register_lock, flags); 1534 } 1535 1536 static void iommu_enable_translation(struct intel_iommu *iommu) 1537 { 1538 u32 sts; 1539 unsigned long flags; 1540 1541 raw_spin_lock_irqsave(&iommu->register_lock, flags); 1542 iommu->gcmd |= DMA_GCMD_TE; 1543 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); 1544 1545 /* Make sure hardware complete it */ 1546 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1547 readl, (sts & DMA_GSTS_TES), sts); 1548 1549 raw_spin_unlock_irqrestore(&iommu->register_lock, flags); 1550 } 1551 1552 static void iommu_disable_translation(struct intel_iommu *iommu) 1553 { 1554 u32 sts; 1555 unsigned long flag; 1556 1557 if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated && 1558 (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap))) 1559 return; 1560 1561 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1562 iommu->gcmd &= ~DMA_GCMD_TE; 1563 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); 1564 1565 /* Make sure hardware complete it */ 1566 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1567 readl, (!(sts & DMA_GSTS_TES)), sts); 1568 1569 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1570 } 1571 1572 static int iommu_init_domains(struct intel_iommu *iommu) 1573 { 1574 u32 ndomains; 1575 1576 ndomains = cap_ndoms(iommu->cap); 1577 pr_debug("%s: Number of Domains supported <%d>\n", 1578 iommu->name, ndomains); 1579 1580 spin_lock_init(&iommu->lock); 1581 1582 iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL); 1583 if (!iommu->domain_ids) 1584 return -ENOMEM; 1585 1586 /* 1587 * If Caching mode is set, then invalid translations are tagged 1588 * with domain-id 0, hence we need to pre-allocate it. We also 1589 * use domain-id 0 as a marker for non-allocated domain-id, so 1590 * make sure it is not used for a real domain. 
1591 */ 1592 set_bit(0, iommu->domain_ids); 1593 1594 /* 1595 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid 1596 * entry for first-level or pass-through translation modes should 1597 * be programmed with a domain id different from those used for 1598 * second-level or nested translation. We reserve a domain id for 1599 * this purpose. 1600 */ 1601 if (sm_supported(iommu)) 1602 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids); 1603 1604 return 0; 1605 } 1606 1607 static void disable_dmar_iommu(struct intel_iommu *iommu) 1608 { 1609 if (!iommu->domain_ids) 1610 return; 1611 1612 /* 1613 * All iommu domains must have been detached from the devices, 1614 * hence there should be no domain IDs in use. 1615 */ 1616 if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap)) 1617 > NUM_RESERVED_DID)) 1618 return; 1619 1620 if (iommu->gcmd & DMA_GCMD_TE) 1621 iommu_disable_translation(iommu); 1622 } 1623 1624 static void free_dmar_iommu(struct intel_iommu *iommu) 1625 { 1626 if (iommu->domain_ids) { 1627 bitmap_free(iommu->domain_ids); 1628 iommu->domain_ids = NULL; 1629 } 1630 1631 if (iommu->copied_tables) { 1632 bitmap_free(iommu->copied_tables); 1633 iommu->copied_tables = NULL; 1634 } 1635 1636 /* free context mapping */ 1637 free_context_table(iommu); 1638 1639 #ifdef CONFIG_INTEL_IOMMU_SVM 1640 if (pasid_supported(iommu)) { 1641 if (ecap_prs(iommu->ecap)) 1642 intel_svm_finish_prq(iommu); 1643 } 1644 #endif 1645 } 1646 1647 /* 1648 * Check and return whether first level is used by default for 1649 * DMA translation. 1650 */ 1651 static bool first_level_by_default(unsigned int type) 1652 { 1653 /* Only SL is available in legacy mode */ 1654 if (!scalable_mode_support()) 1655 return false; 1656 1657 /* Only level (either FL or SL) is available, just use it */ 1658 if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity()) 1659 return intel_cap_flts_sanity(); 1660 1661 /* Both levels are available, decide it based on domain type */ 1662 return type != IOMMU_DOMAIN_UNMANAGED; 1663 } 1664 1665 static struct dmar_domain *alloc_domain(unsigned int type) 1666 { 1667 struct dmar_domain *domain; 1668 1669 domain = kzalloc(sizeof(*domain), GFP_KERNEL); 1670 if (!domain) 1671 return NULL; 1672 1673 domain->nid = NUMA_NO_NODE; 1674 if (first_level_by_default(type)) 1675 domain->use_first_level = true; 1676 domain->has_iotlb_device = false; 1677 INIT_LIST_HEAD(&domain->devices); 1678 INIT_LIST_HEAD(&domain->dev_pasids); 1679 spin_lock_init(&domain->lock); 1680 xa_init(&domain->iommu_array); 1681 1682 return domain; 1683 } 1684 1685 int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) 1686 { 1687 struct iommu_domain_info *info, *curr; 1688 unsigned long ndomains; 1689 int num, ret = -ENOSPC; 1690 1691 info = kzalloc(sizeof(*info), GFP_KERNEL); 1692 if (!info) 1693 return -ENOMEM; 1694 1695 spin_lock(&iommu->lock); 1696 curr = xa_load(&domain->iommu_array, iommu->seq_id); 1697 if (curr) { 1698 curr->refcnt++; 1699 spin_unlock(&iommu->lock); 1700 kfree(info); 1701 return 0; 1702 } 1703 1704 ndomains = cap_ndoms(iommu->cap); 1705 num = find_first_zero_bit(iommu->domain_ids, ndomains); 1706 if (num >= ndomains) { 1707 pr_err("%s: No free domain ids\n", iommu->name); 1708 goto err_unlock; 1709 } 1710 1711 set_bit(num, iommu->domain_ids); 1712 info->refcnt = 1; 1713 info->did = num; 1714 info->iommu = iommu; 1715 curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id, 1716 NULL, info, GFP_ATOMIC); 1717 if (curr) { 1718 ret = xa_err(curr) ? 
: -EBUSY; 1719 goto err_clear; 1720 } 1721 domain_update_iommu_cap(domain); 1722 1723 spin_unlock(&iommu->lock); 1724 return 0; 1725 1726 err_clear: 1727 clear_bit(info->did, iommu->domain_ids); 1728 err_unlock: 1729 spin_unlock(&iommu->lock); 1730 kfree(info); 1731 return ret; 1732 } 1733 1734 void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) 1735 { 1736 struct iommu_domain_info *info; 1737 1738 spin_lock(&iommu->lock); 1739 info = xa_load(&domain->iommu_array, iommu->seq_id); 1740 if (--info->refcnt == 0) { 1741 clear_bit(info->did, iommu->domain_ids); 1742 xa_erase(&domain->iommu_array, iommu->seq_id); 1743 domain->nid = NUMA_NO_NODE; 1744 domain_update_iommu_cap(domain); 1745 kfree(info); 1746 } 1747 spin_unlock(&iommu->lock); 1748 } 1749 1750 static int guestwidth_to_adjustwidth(int gaw) 1751 { 1752 int agaw; 1753 int r = (gaw - 12) % 9; 1754 1755 if (r == 0) 1756 agaw = gaw; 1757 else 1758 agaw = gaw + 9 - r; 1759 if (agaw > 64) 1760 agaw = 64; 1761 return agaw; 1762 } 1763 1764 static void domain_exit(struct dmar_domain *domain) 1765 { 1766 if (domain->pgd) { 1767 LIST_HEAD(freelist); 1768 1769 domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist); 1770 put_pages_list(&freelist); 1771 } 1772 1773 if (WARN_ON(!list_empty(&domain->devices))) 1774 return; 1775 1776 kfree(domain); 1777 } 1778 1779 /* 1780 * Get the PASID directory size for scalable mode context entry. 1781 * Value of X in the PDTS field of a scalable mode context entry 1782 * indicates PASID directory with 2^(X + 7) entries. 1783 */ 1784 static unsigned long context_get_sm_pds(struct pasid_table *table) 1785 { 1786 unsigned long pds, max_pde; 1787 1788 max_pde = table->max_pasid >> PASID_PDE_SHIFT; 1789 pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS); 1790 if (pds < 7) 1791 return 0; 1792 1793 return pds - 7; 1794 } 1795 1796 static int domain_context_mapping_one(struct dmar_domain *domain, 1797 struct intel_iommu *iommu, 1798 struct pasid_table *table, 1799 u8 bus, u8 devfn) 1800 { 1801 struct device_domain_info *info = 1802 domain_lookup_dev_info(domain, iommu, bus, devfn); 1803 u16 did = domain_id_iommu(domain, iommu); 1804 int translation = CONTEXT_TT_MULTI_LEVEL; 1805 struct context_entry *context; 1806 int ret; 1807 1808 if (hw_pass_through && domain_type_is_si(domain)) 1809 translation = CONTEXT_TT_PASS_THROUGH; 1810 1811 pr_debug("Set context mapping for %02x:%02x.%d\n", 1812 bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); 1813 1814 spin_lock(&iommu->lock); 1815 ret = -ENOMEM; 1816 context = iommu_context_addr(iommu, bus, devfn, 1); 1817 if (!context) 1818 goto out_unlock; 1819 1820 ret = 0; 1821 if (context_present(context) && !context_copied(iommu, bus, devfn)) 1822 goto out_unlock; 1823 1824 /* 1825 * For kdump cases, old valid entries may be cached due to the 1826 * in-flight DMA and copied pgtable, but there is no unmapping 1827 * behaviour for them, thus we need an explicit cache flush for 1828 * the newly-mapped device. For kdump, at this point, the device 1829 * is supposed to finish reset at its driver probe stage, so no 1830 * in-flight DMA will exist, and we don't need to worry anymore 1831 * hereafter. 
1832 */ 1833 if (context_copied(iommu, bus, devfn)) { 1834 u16 did_old = context_domain_id(context); 1835 1836 if (did_old < cap_ndoms(iommu->cap)) { 1837 iommu->flush.flush_context(iommu, did_old, 1838 (((u16)bus) << 8) | devfn, 1839 DMA_CCMD_MASK_NOBIT, 1840 DMA_CCMD_DEVICE_INVL); 1841 iommu->flush.flush_iotlb(iommu, did_old, 0, 0, 1842 DMA_TLB_DSI_FLUSH); 1843 } 1844 1845 clear_context_copied(iommu, bus, devfn); 1846 } 1847 1848 context_clear_entry(context); 1849 1850 if (sm_supported(iommu)) { 1851 unsigned long pds; 1852 1853 /* Setup the PASID DIR pointer: */ 1854 pds = context_get_sm_pds(table); 1855 context->lo = (u64)virt_to_phys(table->table) | 1856 context_pdts(pds); 1857 1858 /* Setup the RID_PASID field: */ 1859 context_set_sm_rid2pasid(context, IOMMU_NO_PASID); 1860 1861 /* 1862 * Setup the Device-TLB enable bit and Page request 1863 * Enable bit: 1864 */ 1865 if (info && info->ats_supported) 1866 context_set_sm_dte(context); 1867 if (info && info->pri_supported) 1868 context_set_sm_pre(context); 1869 if (info && info->pasid_supported) 1870 context_set_pasid(context); 1871 } else { 1872 struct dma_pte *pgd = domain->pgd; 1873 int agaw; 1874 1875 context_set_domain_id(context, did); 1876 1877 if (translation != CONTEXT_TT_PASS_THROUGH) { 1878 /* 1879 * Skip top levels of page tables for iommu which has 1880 * less agaw than default. Unnecessary for PT mode. 1881 */ 1882 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { 1883 ret = -ENOMEM; 1884 pgd = phys_to_virt(dma_pte_addr(pgd)); 1885 if (!dma_pte_present(pgd)) 1886 goto out_unlock; 1887 } 1888 1889 if (info && info->ats_supported) 1890 translation = CONTEXT_TT_DEV_IOTLB; 1891 else 1892 translation = CONTEXT_TT_MULTI_LEVEL; 1893 1894 context_set_address_root(context, virt_to_phys(pgd)); 1895 context_set_address_width(context, agaw); 1896 } else { 1897 /* 1898 * In pass through mode, AW must be programmed to 1899 * indicate the largest AGAW value supported by 1900 * hardware. And ASR is ignored by hardware. 1901 */ 1902 context_set_address_width(context, iommu->msagaw); 1903 } 1904 1905 context_set_translation_type(context, translation); 1906 } 1907 1908 context_set_fault_enable(context); 1909 context_set_present(context); 1910 if (!ecap_coherent(iommu->ecap)) 1911 clflush_cache_range(context, sizeof(*context)); 1912 1913 /* 1914 * It's a non-present to present mapping. If hardware doesn't cache 1915 * non-present entry we only need to flush the write-buffer. 
If the 1916 * _does_ cache non-present entries, then it does so in the special 1917 * domain #0, which we have to flush: 1918 */ 1919 if (cap_caching_mode(iommu->cap)) { 1920 iommu->flush.flush_context(iommu, 0, 1921 (((u16)bus) << 8) | devfn, 1922 DMA_CCMD_MASK_NOBIT, 1923 DMA_CCMD_DEVICE_INVL); 1924 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); 1925 } else { 1926 iommu_flush_write_buffer(iommu); 1927 } 1928 1929 ret = 0; 1930 1931 out_unlock: 1932 spin_unlock(&iommu->lock); 1933 1934 return ret; 1935 } 1936 1937 struct domain_context_mapping_data { 1938 struct dmar_domain *domain; 1939 struct intel_iommu *iommu; 1940 struct pasid_table *table; 1941 }; 1942 1943 static int domain_context_mapping_cb(struct pci_dev *pdev, 1944 u16 alias, void *opaque) 1945 { 1946 struct domain_context_mapping_data *data = opaque; 1947 1948 return domain_context_mapping_one(data->domain, data->iommu, 1949 data->table, PCI_BUS_NUM(alias), 1950 alias & 0xff); 1951 } 1952 1953 static int 1954 domain_context_mapping(struct dmar_domain *domain, struct device *dev) 1955 { 1956 struct device_domain_info *info = dev_iommu_priv_get(dev); 1957 struct domain_context_mapping_data data; 1958 struct intel_iommu *iommu = info->iommu; 1959 u8 bus = info->bus, devfn = info->devfn; 1960 struct pasid_table *table; 1961 1962 table = intel_pasid_get_table(dev); 1963 1964 if (!dev_is_pci(dev)) 1965 return domain_context_mapping_one(domain, iommu, table, 1966 bus, devfn); 1967 1968 data.domain = domain; 1969 data.iommu = iommu; 1970 data.table = table; 1971 1972 return pci_for_each_dma_alias(to_pci_dev(dev), 1973 &domain_context_mapping_cb, &data); 1974 } 1975 1976 /* Returns a number of VTD pages, but aligned to MM page size */ 1977 static unsigned long aligned_nrpages(unsigned long host_addr, size_t size) 1978 { 1979 host_addr &= ~PAGE_MASK; 1980 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT; 1981 } 1982 1983 /* Return largest possible superpage level for a given mapping */ 1984 static int hardware_largepage_caps(struct dmar_domain *domain, unsigned long iov_pfn, 1985 unsigned long phy_pfn, unsigned long pages) 1986 { 1987 int support, level = 1; 1988 unsigned long pfnmerge; 1989 1990 support = domain->iommu_superpage; 1991 1992 /* To use a large page, the virtual *and* physical addresses 1993 must be aligned to 2MiB/1GiB/etc. Lower bits set in either 1994 of them will mean we have to use smaller pages. So just 1995 merge them and check both at once. */ 1996 pfnmerge = iov_pfn | phy_pfn; 1997 1998 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) { 1999 pages >>= VTD_STRIDE_SHIFT; 2000 if (!pages) 2001 break; 2002 pfnmerge >>= VTD_STRIDE_SHIFT; 2003 level++; 2004 support--; 2005 } 2006 return level; 2007 } 2008 2009 /* 2010 * Ensure that old small page tables are removed to make room for superpage(s). 2011 * We're going to add new large pages, so make sure we don't remove their parent 2012 * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared. 
2013 */ 2014 static void switch_to_super_page(struct dmar_domain *domain, 2015 unsigned long start_pfn, 2016 unsigned long end_pfn, int level) 2017 { 2018 unsigned long lvl_pages = lvl_to_nr_pages(level); 2019 struct iommu_domain_info *info; 2020 struct dma_pte *pte = NULL; 2021 unsigned long i; 2022 2023 while (start_pfn <= end_pfn) { 2024 if (!pte) 2025 pte = pfn_to_dma_pte(domain, start_pfn, &level, 2026 GFP_ATOMIC); 2027 2028 if (dma_pte_present(pte)) { 2029 dma_pte_free_pagetable(domain, start_pfn, 2030 start_pfn + lvl_pages - 1, 2031 level + 1); 2032 2033 xa_for_each(&domain->iommu_array, i, info) 2034 iommu_flush_iotlb_psi(info->iommu, domain, 2035 start_pfn, lvl_pages, 2036 0, 0); 2037 if (domain->nested_parent) 2038 parent_domain_flush(domain, start_pfn, 2039 lvl_pages, 0); 2040 } 2041 2042 pte++; 2043 start_pfn += lvl_pages; 2044 if (first_pte_in_page(pte)) 2045 pte = NULL; 2046 } 2047 } 2048 2049 static int 2050 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, 2051 unsigned long phys_pfn, unsigned long nr_pages, int prot, 2052 gfp_t gfp) 2053 { 2054 struct dma_pte *first_pte = NULL, *pte = NULL; 2055 unsigned int largepage_lvl = 0; 2056 unsigned long lvl_pages = 0; 2057 phys_addr_t pteval; 2058 u64 attr; 2059 2060 if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1))) 2061 return -EINVAL; 2062 2063 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0) 2064 return -EINVAL; 2065 2066 if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) { 2067 pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n"); 2068 return -EINVAL; 2069 } 2070 2071 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP); 2072 attr |= DMA_FL_PTE_PRESENT; 2073 if (domain->use_first_level) { 2074 attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS; 2075 if (prot & DMA_PTE_WRITE) 2076 attr |= DMA_FL_PTE_DIRTY; 2077 } 2078 2079 domain->has_mappings = true; 2080 2081 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr; 2082 2083 while (nr_pages > 0) { 2084 uint64_t tmp; 2085 2086 if (!pte) { 2087 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, 2088 phys_pfn, nr_pages); 2089 2090 pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl, 2091 gfp); 2092 if (!pte) 2093 return -ENOMEM; 2094 first_pte = pte; 2095 2096 lvl_pages = lvl_to_nr_pages(largepage_lvl); 2097 2098 /* It is large page*/ 2099 if (largepage_lvl > 1) { 2100 unsigned long end_pfn; 2101 unsigned long pages_to_remove; 2102 2103 pteval |= DMA_PTE_LARGE_PAGE; 2104 pages_to_remove = min_t(unsigned long, nr_pages, 2105 nr_pte_to_next_page(pte) * lvl_pages); 2106 end_pfn = iov_pfn + pages_to_remove - 1; 2107 switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl); 2108 } else { 2109 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE; 2110 } 2111 2112 } 2113 /* We don't need lock here, nobody else 2114 * touches the iova range 2115 */ 2116 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval); 2117 if (tmp) { 2118 static int dumps = 5; 2119 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n", 2120 iov_pfn, tmp, (unsigned long long)pteval); 2121 if (dumps) { 2122 dumps--; 2123 debug_dma_dump_mappings(NULL); 2124 } 2125 WARN_ON(1); 2126 } 2127 2128 nr_pages -= lvl_pages; 2129 iov_pfn += lvl_pages; 2130 phys_pfn += lvl_pages; 2131 pteval += lvl_pages * VTD_PAGE_SIZE; 2132 2133 /* If the next PTE would be the first in a new page, then we 2134 * need to flush the cache on the entries we've just written. 
2135 * And then we'll need to recalculate 'pte', so clear it and 2136 * let it get set again in the if (!pte) block above. 2137 * 2138 * If we're done (!nr_pages) we need to flush the cache too. 2139 * 2140 * Also if we've been setting superpages, we may need to 2141 * recalculate 'pte' and switch back to smaller pages for the 2142 * end of the mapping, if the trailing size is not enough to 2143 * use another superpage (i.e. nr_pages < lvl_pages). 2144 */ 2145 pte++; 2146 if (!nr_pages || first_pte_in_page(pte) || 2147 (largepage_lvl > 1 && nr_pages < lvl_pages)) { 2148 domain_flush_cache(domain, first_pte, 2149 (void *)pte - (void *)first_pte); 2150 pte = NULL; 2151 } 2152 } 2153 2154 return 0; 2155 } 2156 2157 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn) 2158 { 2159 struct intel_iommu *iommu = info->iommu; 2160 struct context_entry *context; 2161 u16 did_old; 2162 2163 if (!iommu) 2164 return; 2165 2166 spin_lock(&iommu->lock); 2167 context = iommu_context_addr(iommu, bus, devfn, 0); 2168 if (!context) { 2169 spin_unlock(&iommu->lock); 2170 return; 2171 } 2172 2173 if (sm_supported(iommu)) { 2174 if (hw_pass_through && domain_type_is_si(info->domain)) 2175 did_old = FLPT_DEFAULT_DID; 2176 else 2177 did_old = domain_id_iommu(info->domain, iommu); 2178 } else { 2179 did_old = context_domain_id(context); 2180 } 2181 2182 context_clear_entry(context); 2183 __iommu_flush_cache(iommu, context, sizeof(*context)); 2184 spin_unlock(&iommu->lock); 2185 iommu->flush.flush_context(iommu, 2186 did_old, 2187 (((u16)bus) << 8) | devfn, 2188 DMA_CCMD_MASK_NOBIT, 2189 DMA_CCMD_DEVICE_INVL); 2190 2191 if (sm_supported(iommu)) 2192 qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0); 2193 2194 iommu->flush.flush_iotlb(iommu, 2195 did_old, 2196 0, 2197 0, 2198 DMA_TLB_DSI_FLUSH); 2199 2200 __iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH); 2201 } 2202 2203 static int domain_setup_first_level(struct intel_iommu *iommu, 2204 struct dmar_domain *domain, 2205 struct device *dev, 2206 u32 pasid) 2207 { 2208 struct dma_pte *pgd = domain->pgd; 2209 int agaw, level; 2210 int flags = 0; 2211 2212 /* 2213 * Skip top levels of page tables for iommu which has 2214 * less agaw than default. Unnecessary for PT mode. 
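 *
 * For instance, a domain built with agaw 3 (57-bit, 5-level table)
 * attached through an IOMMU that only supports agaw 2 (48-bit, 4-level)
 * descends one level here and hands the 4-level table below the top PGD
 * to the PASID entry (so PASID_FLAG_FL5LP is not set); each step down
 * drops 9 bits of address width.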
2215 */ 2216 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { 2217 pgd = phys_to_virt(dma_pte_addr(pgd)); 2218 if (!dma_pte_present(pgd)) 2219 return -ENOMEM; 2220 } 2221 2222 level = agaw_to_level(agaw); 2223 if (level != 4 && level != 5) 2224 return -EINVAL; 2225 2226 if (level == 5) 2227 flags |= PASID_FLAG_FL5LP; 2228 2229 if (domain->force_snooping) 2230 flags |= PASID_FLAG_PAGE_SNOOP; 2231 2232 return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid, 2233 domain_id_iommu(domain, iommu), 2234 flags); 2235 } 2236 2237 static bool dev_is_real_dma_subdevice(struct device *dev) 2238 { 2239 return dev && dev_is_pci(dev) && 2240 pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev); 2241 } 2242 2243 static int iommu_domain_identity_map(struct dmar_domain *domain, 2244 unsigned long first_vpfn, 2245 unsigned long last_vpfn) 2246 { 2247 /* 2248 * RMRR range might have overlap with physical memory range, 2249 * clear it first 2250 */ 2251 dma_pte_clear_range(domain, first_vpfn, last_vpfn); 2252 2253 return __domain_mapping(domain, first_vpfn, 2254 first_vpfn, last_vpfn - first_vpfn + 1, 2255 DMA_PTE_READ|DMA_PTE_WRITE, GFP_KERNEL); 2256 } 2257 2258 static int md_domain_init(struct dmar_domain *domain, int guest_width); 2259 2260 static int __init si_domain_init(int hw) 2261 { 2262 struct dmar_rmrr_unit *rmrr; 2263 struct device *dev; 2264 int i, nid, ret; 2265 2266 si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY); 2267 if (!si_domain) 2268 return -EFAULT; 2269 2270 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { 2271 domain_exit(si_domain); 2272 si_domain = NULL; 2273 return -EFAULT; 2274 } 2275 2276 if (hw) 2277 return 0; 2278 2279 for_each_online_node(nid) { 2280 unsigned long start_pfn, end_pfn; 2281 int i; 2282 2283 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 2284 ret = iommu_domain_identity_map(si_domain, 2285 mm_to_dma_pfn_start(start_pfn), 2286 mm_to_dma_pfn_end(end_pfn)); 2287 if (ret) 2288 return ret; 2289 } 2290 } 2291 2292 /* 2293 * Identity map the RMRRs so that devices with RMRRs could also use 2294 * the si_domain. 2295 */ 2296 for_each_rmrr_units(rmrr) { 2297 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt, 2298 i, dev) { 2299 unsigned long long start = rmrr->base_address; 2300 unsigned long long end = rmrr->end_address; 2301 2302 if (WARN_ON(end < start || 2303 end >> agaw_to_width(si_domain->agaw))) 2304 continue; 2305 2306 ret = iommu_domain_identity_map(si_domain, 2307 mm_to_dma_pfn_start(start >> PAGE_SHIFT), 2308 mm_to_dma_pfn_end(end >> PAGE_SHIFT)); 2309 if (ret) 2310 return ret; 2311 } 2312 } 2313 2314 return 0; 2315 } 2316 2317 static int dmar_domain_attach_device(struct dmar_domain *domain, 2318 struct device *dev) 2319 { 2320 struct device_domain_info *info = dev_iommu_priv_get(dev); 2321 struct intel_iommu *iommu = info->iommu; 2322 unsigned long flags; 2323 int ret; 2324 2325 ret = domain_attach_iommu(domain, iommu); 2326 if (ret) 2327 return ret; 2328 info->domain = domain; 2329 spin_lock_irqsave(&domain->lock, flags); 2330 list_add(&info->link, &domain->devices); 2331 spin_unlock_irqrestore(&domain->lock, flags); 2332 2333 /* PASID table is mandatory for a PCI device in scalable mode. 
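 *
 * In scalable mode even DMA that carries no PASID is translated through
 * a PASID-table entry: the RID_PASID slot (IOMMU_NO_PASID, i.e. PASID 0)
 * programmed below as pass-through, first-level or second-level
 * depending on the domain type.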
*/ 2334 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) { 2335 /* Setup the PASID entry for requests without PASID: */ 2336 if (hw_pass_through && domain_type_is_si(domain)) 2337 ret = intel_pasid_setup_pass_through(iommu, 2338 dev, IOMMU_NO_PASID); 2339 else if (domain->use_first_level) 2340 ret = domain_setup_first_level(iommu, domain, dev, 2341 IOMMU_NO_PASID); 2342 else 2343 ret = intel_pasid_setup_second_level(iommu, domain, 2344 dev, IOMMU_NO_PASID); 2345 if (ret) { 2346 dev_err(dev, "Setup RID2PASID failed\n"); 2347 device_block_translation(dev); 2348 return ret; 2349 } 2350 } 2351 2352 ret = domain_context_mapping(domain, dev); 2353 if (ret) { 2354 dev_err(dev, "Domain context map failed\n"); 2355 device_block_translation(dev); 2356 return ret; 2357 } 2358 2359 if (sm_supported(info->iommu) || !domain_type_is_si(info->domain)) 2360 iommu_enable_pci_caps(info); 2361 2362 return 0; 2363 } 2364 2365 /** 2366 * device_rmrr_is_relaxable - Test whether the RMRR of this device 2367 * is relaxable (ie. is allowed to be not enforced under some conditions) 2368 * @dev: device handle 2369 * 2370 * We assume that PCI USB devices with RMRRs have them largely 2371 * for historical reasons and that the RMRR space is not actively used post 2372 * boot. This exclusion may change if vendors begin to abuse it. 2373 * 2374 * The same exception is made for graphics devices, with the requirement that 2375 * any use of the RMRR regions will be torn down before assigning the device 2376 * to a guest. 2377 * 2378 * Return: true if the RMRR is relaxable, false otherwise 2379 */ 2380 static bool device_rmrr_is_relaxable(struct device *dev) 2381 { 2382 struct pci_dev *pdev; 2383 2384 if (!dev_is_pci(dev)) 2385 return false; 2386 2387 pdev = to_pci_dev(dev); 2388 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev)) 2389 return true; 2390 else 2391 return false; 2392 } 2393 2394 /* 2395 * Return the required default domain type for a specific device. 2396 * 2397 * @dev: the device in query 2398 * @startup: true if this is during early boot 2399 * 2400 * Returns: 2401 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain 2402 * - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain 2403 * - 0: both identity and dynamic domains work for this device 2404 */ 2405 static int device_def_domain_type(struct device *dev) 2406 { 2407 if (dev_is_pci(dev)) { 2408 struct pci_dev *pdev = to_pci_dev(dev); 2409 2410 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev)) 2411 return IOMMU_DOMAIN_IDENTITY; 2412 2413 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev)) 2414 return IOMMU_DOMAIN_IDENTITY; 2415 } 2416 2417 return 0; 2418 } 2419 2420 static void intel_iommu_init_qi(struct intel_iommu *iommu) 2421 { 2422 /* 2423 * Start from the sane iommu hardware state. 2424 * If the queued invalidation is already initialized by us 2425 * (for example, while enabling interrupt-remapping) then 2426 * we got the things already rolling from a sane state. 2427 */ 2428 if (!iommu->qi) { 2429 /* 2430 * Clear any previous faults. 2431 */ 2432 dmar_fault(-1, iommu); 2433 /* 2434 * Disable queued invalidation if supported and already enabled 2435 * before OS handover. 
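 *
 * (For example when firmware or a previous kexec/kdump kernel left the
 * invalidation queue running; dmar_enable_qi() below then starts again
 * from a freshly allocated, empty queue.)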
2436 */ 2437 dmar_disable_qi(iommu); 2438 } 2439 2440 if (dmar_enable_qi(iommu)) { 2441 /* 2442 * Queued Invalidate not enabled, use Register Based Invalidate 2443 */ 2444 iommu->flush.flush_context = __iommu_flush_context; 2445 iommu->flush.flush_iotlb = __iommu_flush_iotlb; 2446 pr_info("%s: Using Register based invalidation\n", 2447 iommu->name); 2448 } else { 2449 iommu->flush.flush_context = qi_flush_context; 2450 iommu->flush.flush_iotlb = qi_flush_iotlb; 2451 pr_info("%s: Using Queued invalidation\n", iommu->name); 2452 } 2453 } 2454 2455 static int copy_context_table(struct intel_iommu *iommu, 2456 struct root_entry *old_re, 2457 struct context_entry **tbl, 2458 int bus, bool ext) 2459 { 2460 int tbl_idx, pos = 0, idx, devfn, ret = 0, did; 2461 struct context_entry *new_ce = NULL, ce; 2462 struct context_entry *old_ce = NULL; 2463 struct root_entry re; 2464 phys_addr_t old_ce_phys; 2465 2466 tbl_idx = ext ? bus * 2 : bus; 2467 memcpy(&re, old_re, sizeof(re)); 2468 2469 for (devfn = 0; devfn < 256; devfn++) { 2470 /* First calculate the correct index */ 2471 idx = (ext ? devfn * 2 : devfn) % 256; 2472 2473 if (idx == 0) { 2474 /* First save what we may have and clean up */ 2475 if (new_ce) { 2476 tbl[tbl_idx] = new_ce; 2477 __iommu_flush_cache(iommu, new_ce, 2478 VTD_PAGE_SIZE); 2479 pos = 1; 2480 } 2481 2482 if (old_ce) 2483 memunmap(old_ce); 2484 2485 ret = 0; 2486 if (devfn < 0x80) 2487 old_ce_phys = root_entry_lctp(&re); 2488 else 2489 old_ce_phys = root_entry_uctp(&re); 2490 2491 if (!old_ce_phys) { 2492 if (ext && devfn == 0) { 2493 /* No LCTP, try UCTP */ 2494 devfn = 0x7f; 2495 continue; 2496 } else { 2497 goto out; 2498 } 2499 } 2500 2501 ret = -ENOMEM; 2502 old_ce = memremap(old_ce_phys, PAGE_SIZE, 2503 MEMREMAP_WB); 2504 if (!old_ce) 2505 goto out; 2506 2507 new_ce = alloc_pgtable_page(iommu->node, GFP_KERNEL); 2508 if (!new_ce) 2509 goto out_unmap; 2510 2511 ret = 0; 2512 } 2513 2514 /* Now copy the context entry */ 2515 memcpy(&ce, old_ce + idx, sizeof(ce)); 2516 2517 if (!context_present(&ce)) 2518 continue; 2519 2520 did = context_domain_id(&ce); 2521 if (did >= 0 && did < cap_ndoms(iommu->cap)) 2522 set_bit(did, iommu->domain_ids); 2523 2524 set_context_copied(iommu, bus, devfn); 2525 new_ce[idx] = ce; 2526 } 2527 2528 tbl[tbl_idx + pos] = new_ce; 2529 2530 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE); 2531 2532 out_unmap: 2533 memunmap(old_ce); 2534 2535 out: 2536 return ret; 2537 } 2538 2539 static int copy_translation_tables(struct intel_iommu *iommu) 2540 { 2541 struct context_entry **ctxt_tbls; 2542 struct root_entry *old_rt; 2543 phys_addr_t old_rt_phys; 2544 int ctxt_table_entries; 2545 u64 rtaddr_reg; 2546 int bus, ret; 2547 bool new_ext, ext; 2548 2549 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG); 2550 ext = !!(rtaddr_reg & DMA_RTADDR_SMT); 2551 new_ext = !!sm_supported(iommu); 2552 2553 /* 2554 * The RTT bit can only be changed when translation is disabled, 2555 * but disabling translation means to open a window for data 2556 * corruption. So bail out and don't copy anything if we would 2557 * have to change the bit. 
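 *
 * In practice that means a kernel running in scalable mode on top of a
 * previous kernel that used the legacy root-table format (or vice
 * versa) gives up on copying and falls back to a clean root table.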
2558 */ 2559 if (new_ext != ext) 2560 return -EINVAL; 2561 2562 iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL); 2563 if (!iommu->copied_tables) 2564 return -ENOMEM; 2565 2566 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK; 2567 if (!old_rt_phys) 2568 return -EINVAL; 2569 2570 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB); 2571 if (!old_rt) 2572 return -ENOMEM; 2573 2574 /* This is too big for the stack - allocate it from slab */ 2575 ctxt_table_entries = ext ? 512 : 256; 2576 ret = -ENOMEM; 2577 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL); 2578 if (!ctxt_tbls) 2579 goto out_unmap; 2580 2581 for (bus = 0; bus < 256; bus++) { 2582 ret = copy_context_table(iommu, &old_rt[bus], 2583 ctxt_tbls, bus, ext); 2584 if (ret) { 2585 pr_err("%s: Failed to copy context table for bus %d\n", 2586 iommu->name, bus); 2587 continue; 2588 } 2589 } 2590 2591 spin_lock(&iommu->lock); 2592 2593 /* Context tables are copied, now write them to the root_entry table */ 2594 for (bus = 0; bus < 256; bus++) { 2595 int idx = ext ? bus * 2 : bus; 2596 u64 val; 2597 2598 if (ctxt_tbls[idx]) { 2599 val = virt_to_phys(ctxt_tbls[idx]) | 1; 2600 iommu->root_entry[bus].lo = val; 2601 } 2602 2603 if (!ext || !ctxt_tbls[idx + 1]) 2604 continue; 2605 2606 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1; 2607 iommu->root_entry[bus].hi = val; 2608 } 2609 2610 spin_unlock(&iommu->lock); 2611 2612 kfree(ctxt_tbls); 2613 2614 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE); 2615 2616 ret = 0; 2617 2618 out_unmap: 2619 memunmap(old_rt); 2620 2621 return ret; 2622 } 2623 2624 static int __init init_dmars(void) 2625 { 2626 struct dmar_drhd_unit *drhd; 2627 struct intel_iommu *iommu; 2628 int ret; 2629 2630 ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL); 2631 if (ret) 2632 goto free_iommu; 2633 2634 for_each_iommu(iommu, drhd) { 2635 if (drhd->ignored) { 2636 iommu_disable_translation(iommu); 2637 continue; 2638 } 2639 2640 /* 2641 * Find the max pasid size of all IOMMU's in the system. 2642 * We need to ensure the system pasid table is no bigger 2643 * than the smallest supported. 2644 */ 2645 if (pasid_supported(iommu)) { 2646 u32 temp = 2 << ecap_pss(iommu->ecap); 2647 2648 intel_pasid_max_id = min_t(u32, temp, 2649 intel_pasid_max_id); 2650 } 2651 2652 intel_iommu_init_qi(iommu); 2653 2654 ret = iommu_init_domains(iommu); 2655 if (ret) 2656 goto free_iommu; 2657 2658 init_translation_status(iommu); 2659 2660 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) { 2661 iommu_disable_translation(iommu); 2662 clear_translation_pre_enabled(iommu); 2663 pr_warn("Translation was enabled for %s but we are not in kdump mode\n", 2664 iommu->name); 2665 } 2666 2667 /* 2668 * TBD: 2669 * we could share the same root & context tables 2670 * among all IOMMU's. Need to Split it later. 2671 */ 2672 ret = iommu_alloc_root_entry(iommu); 2673 if (ret) 2674 goto free_iommu; 2675 2676 if (translation_pre_enabled(iommu)) { 2677 pr_info("Translation already enabled - trying to copy translation structures\n"); 2678 2679 ret = copy_translation_tables(iommu); 2680 if (ret) { 2681 /* 2682 * We found the IOMMU with translation 2683 * enabled - but failed to copy over the 2684 * old root-entry table. Try to proceed 2685 * by disabling translation now and 2686 * allocating a clean root-entry table. 2687 * This might cause DMAR faults, but 2688 * probably the dump will still succeed. 
2689 */ 2690 pr_err("Failed to copy translation tables from previous kernel for %s\n", 2691 iommu->name); 2692 iommu_disable_translation(iommu); 2693 clear_translation_pre_enabled(iommu); 2694 } else { 2695 pr_info("Copied translation tables from previous kernel for %s\n", 2696 iommu->name); 2697 } 2698 } 2699 2700 if (!ecap_pass_through(iommu->ecap)) 2701 hw_pass_through = 0; 2702 intel_svm_check(iommu); 2703 } 2704 2705 /* 2706 * Now that qi is enabled on all iommus, set the root entry and flush 2707 * caches. This is required on some Intel X58 chipsets, otherwise the 2708 * flush_context function will loop forever and the boot hangs. 2709 */ 2710 for_each_active_iommu(iommu, drhd) { 2711 iommu_flush_write_buffer(iommu); 2712 iommu_set_root_entry(iommu); 2713 } 2714 2715 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA 2716 dmar_map_gfx = 0; 2717 #endif 2718 2719 if (!dmar_map_gfx) 2720 iommu_identity_mapping |= IDENTMAP_GFX; 2721 2722 check_tylersburg_isoch(); 2723 2724 ret = si_domain_init(hw_pass_through); 2725 if (ret) 2726 goto free_iommu; 2727 2728 /* 2729 * for each drhd 2730 * enable fault log 2731 * global invalidate context cache 2732 * global invalidate iotlb 2733 * enable translation 2734 */ 2735 for_each_iommu(iommu, drhd) { 2736 if (drhd->ignored) { 2737 /* 2738 * we always have to disable PMRs or DMA may fail on 2739 * this device 2740 */ 2741 if (force_on) 2742 iommu_disable_protect_mem_regions(iommu); 2743 continue; 2744 } 2745 2746 iommu_flush_write_buffer(iommu); 2747 2748 #ifdef CONFIG_INTEL_IOMMU_SVM 2749 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) { 2750 /* 2751 * Call dmar_alloc_hwirq() with dmar_global_lock held, 2752 * could cause possible lock race condition. 2753 */ 2754 up_write(&dmar_global_lock); 2755 ret = intel_svm_enable_prq(iommu); 2756 down_write(&dmar_global_lock); 2757 if (ret) 2758 goto free_iommu; 2759 } 2760 #endif 2761 ret = dmar_set_interrupt(iommu); 2762 if (ret) 2763 goto free_iommu; 2764 } 2765 2766 return 0; 2767 2768 free_iommu: 2769 for_each_active_iommu(iommu, drhd) { 2770 disable_dmar_iommu(iommu); 2771 free_dmar_iommu(iommu); 2772 } 2773 if (si_domain) { 2774 domain_exit(si_domain); 2775 si_domain = NULL; 2776 } 2777 2778 return ret; 2779 } 2780 2781 static void __init init_no_remapping_devices(void) 2782 { 2783 struct dmar_drhd_unit *drhd; 2784 struct device *dev; 2785 int i; 2786 2787 for_each_drhd_unit(drhd) { 2788 if (!drhd->include_all) { 2789 for_each_active_dev_scope(drhd->devices, 2790 drhd->devices_cnt, i, dev) 2791 break; 2792 /* ignore DMAR unit if no devices exist */ 2793 if (i == drhd->devices_cnt) 2794 drhd->ignored = 1; 2795 } 2796 } 2797 2798 for_each_active_drhd_unit(drhd) { 2799 if (drhd->include_all) 2800 continue; 2801 2802 for_each_active_dev_scope(drhd->devices, 2803 drhd->devices_cnt, i, dev) 2804 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev))) 2805 break; 2806 if (i < drhd->devices_cnt) 2807 continue; 2808 2809 /* This IOMMU has *only* gfx devices. 
Either bypass it or 2810 set the gfx_mapped flag, as appropriate */ 2811 drhd->gfx_dedicated = 1; 2812 if (!dmar_map_gfx) 2813 drhd->ignored = 1; 2814 } 2815 } 2816 2817 #ifdef CONFIG_SUSPEND 2818 static int init_iommu_hw(void) 2819 { 2820 struct dmar_drhd_unit *drhd; 2821 struct intel_iommu *iommu = NULL; 2822 int ret; 2823 2824 for_each_active_iommu(iommu, drhd) { 2825 if (iommu->qi) { 2826 ret = dmar_reenable_qi(iommu); 2827 if (ret) 2828 return ret; 2829 } 2830 } 2831 2832 for_each_iommu(iommu, drhd) { 2833 if (drhd->ignored) { 2834 /* 2835 * we always have to disable PMRs or DMA may fail on 2836 * this device 2837 */ 2838 if (force_on) 2839 iommu_disable_protect_mem_regions(iommu); 2840 continue; 2841 } 2842 2843 iommu_flush_write_buffer(iommu); 2844 iommu_set_root_entry(iommu); 2845 iommu_enable_translation(iommu); 2846 iommu_disable_protect_mem_regions(iommu); 2847 } 2848 2849 return 0; 2850 } 2851 2852 static void iommu_flush_all(void) 2853 { 2854 struct dmar_drhd_unit *drhd; 2855 struct intel_iommu *iommu; 2856 2857 for_each_active_iommu(iommu, drhd) { 2858 iommu->flush.flush_context(iommu, 0, 0, 0, 2859 DMA_CCMD_GLOBAL_INVL); 2860 iommu->flush.flush_iotlb(iommu, 0, 0, 0, 2861 DMA_TLB_GLOBAL_FLUSH); 2862 } 2863 } 2864 2865 static int iommu_suspend(void) 2866 { 2867 struct dmar_drhd_unit *drhd; 2868 struct intel_iommu *iommu = NULL; 2869 unsigned long flag; 2870 2871 iommu_flush_all(); 2872 2873 for_each_active_iommu(iommu, drhd) { 2874 iommu_disable_translation(iommu); 2875 2876 raw_spin_lock_irqsave(&iommu->register_lock, flag); 2877 2878 iommu->iommu_state[SR_DMAR_FECTL_REG] = 2879 readl(iommu->reg + DMAR_FECTL_REG); 2880 iommu->iommu_state[SR_DMAR_FEDATA_REG] = 2881 readl(iommu->reg + DMAR_FEDATA_REG); 2882 iommu->iommu_state[SR_DMAR_FEADDR_REG] = 2883 readl(iommu->reg + DMAR_FEADDR_REG); 2884 iommu->iommu_state[SR_DMAR_FEUADDR_REG] = 2885 readl(iommu->reg + DMAR_FEUADDR_REG); 2886 2887 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 2888 } 2889 return 0; 2890 } 2891 2892 static void iommu_resume(void) 2893 { 2894 struct dmar_drhd_unit *drhd; 2895 struct intel_iommu *iommu = NULL; 2896 unsigned long flag; 2897 2898 if (init_iommu_hw()) { 2899 if (force_on) 2900 panic("tboot: IOMMU setup failed, DMAR can not resume!\n"); 2901 else 2902 WARN(1, "IOMMU setup failed, DMAR can not resume!\n"); 2903 return; 2904 } 2905 2906 for_each_active_iommu(iommu, drhd) { 2907 2908 raw_spin_lock_irqsave(&iommu->register_lock, flag); 2909 2910 writel(iommu->iommu_state[SR_DMAR_FECTL_REG], 2911 iommu->reg + DMAR_FECTL_REG); 2912 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG], 2913 iommu->reg + DMAR_FEDATA_REG); 2914 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG], 2915 iommu->reg + DMAR_FEADDR_REG); 2916 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG], 2917 iommu->reg + DMAR_FEUADDR_REG); 2918 2919 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 2920 } 2921 } 2922 2923 static struct syscore_ops iommu_syscore_ops = { 2924 .resume = iommu_resume, 2925 .suspend = iommu_suspend, 2926 }; 2927 2928 static void __init init_iommu_pm_ops(void) 2929 { 2930 register_syscore_ops(&iommu_syscore_ops); 2931 } 2932 2933 #else 2934 static inline void init_iommu_pm_ops(void) {} 2935 #endif /* CONFIG_PM */ 2936 2937 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr) 2938 { 2939 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) || 2940 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) || 2941 rmrr->end_address <= rmrr->base_address || 2942 arch_rmrr_sanity_check(rmrr)) 2943 return 
-EINVAL; 2944 2945 return 0; 2946 } 2947 2948 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg) 2949 { 2950 struct acpi_dmar_reserved_memory *rmrr; 2951 struct dmar_rmrr_unit *rmrru; 2952 2953 rmrr = (struct acpi_dmar_reserved_memory *)header; 2954 if (rmrr_sanity_check(rmrr)) { 2955 pr_warn(FW_BUG 2956 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n" 2957 "BIOS vendor: %s; Ver: %s; Product Version: %s\n", 2958 rmrr->base_address, rmrr->end_address, 2959 dmi_get_system_info(DMI_BIOS_VENDOR), 2960 dmi_get_system_info(DMI_BIOS_VERSION), 2961 dmi_get_system_info(DMI_PRODUCT_VERSION)); 2962 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); 2963 } 2964 2965 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL); 2966 if (!rmrru) 2967 goto out; 2968 2969 rmrru->hdr = header; 2970 2971 rmrru->base_address = rmrr->base_address; 2972 rmrru->end_address = rmrr->end_address; 2973 2974 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1), 2975 ((void *)rmrr) + rmrr->header.length, 2976 &rmrru->devices_cnt); 2977 if (rmrru->devices_cnt && rmrru->devices == NULL) 2978 goto free_rmrru; 2979 2980 list_add(&rmrru->list, &dmar_rmrr_units); 2981 2982 return 0; 2983 free_rmrru: 2984 kfree(rmrru); 2985 out: 2986 return -ENOMEM; 2987 } 2988 2989 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr) 2990 { 2991 struct dmar_atsr_unit *atsru; 2992 struct acpi_dmar_atsr *tmp; 2993 2994 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list, 2995 dmar_rcu_check()) { 2996 tmp = (struct acpi_dmar_atsr *)atsru->hdr; 2997 if (atsr->segment != tmp->segment) 2998 continue; 2999 if (atsr->header.length != tmp->header.length) 3000 continue; 3001 if (memcmp(atsr, tmp, atsr->header.length) == 0) 3002 return atsru; 3003 } 3004 3005 return NULL; 3006 } 3007 3008 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg) 3009 { 3010 struct acpi_dmar_atsr *atsr; 3011 struct dmar_atsr_unit *atsru; 3012 3013 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled) 3014 return 0; 3015 3016 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 3017 atsru = dmar_find_atsr(atsr); 3018 if (atsru) 3019 return 0; 3020 3021 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL); 3022 if (!atsru) 3023 return -ENOMEM; 3024 3025 /* 3026 * If memory is allocated from slab by ACPI _DSM method, we need to 3027 * copy the memory content because the memory buffer will be freed 3028 * on return. 
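 *
 * That is why the unit is allocated with room for the header right
 * behind it and atsru->hdr is pointed at (atsru + 1) with the header
 * bytes memcpy'd there, rather than referenced in place.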
3029 */ 3030 atsru->hdr = (void *)(atsru + 1); 3031 memcpy(atsru->hdr, hdr, hdr->length); 3032 atsru->include_all = atsr->flags & 0x1; 3033 if (!atsru->include_all) { 3034 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1), 3035 (void *)atsr + atsr->header.length, 3036 &atsru->devices_cnt); 3037 if (atsru->devices_cnt && atsru->devices == NULL) { 3038 kfree(atsru); 3039 return -ENOMEM; 3040 } 3041 } 3042 3043 list_add_rcu(&atsru->list, &dmar_atsr_units); 3044 3045 return 0; 3046 } 3047 3048 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru) 3049 { 3050 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt); 3051 kfree(atsru); 3052 } 3053 3054 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg) 3055 { 3056 struct acpi_dmar_atsr *atsr; 3057 struct dmar_atsr_unit *atsru; 3058 3059 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 3060 atsru = dmar_find_atsr(atsr); 3061 if (atsru) { 3062 list_del_rcu(&atsru->list); 3063 synchronize_rcu(); 3064 intel_iommu_free_atsr(atsru); 3065 } 3066 3067 return 0; 3068 } 3069 3070 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg) 3071 { 3072 int i; 3073 struct device *dev; 3074 struct acpi_dmar_atsr *atsr; 3075 struct dmar_atsr_unit *atsru; 3076 3077 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 3078 atsru = dmar_find_atsr(atsr); 3079 if (!atsru) 3080 return 0; 3081 3082 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) { 3083 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt, 3084 i, dev) 3085 return -EBUSY; 3086 } 3087 3088 return 0; 3089 } 3090 3091 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc) 3092 { 3093 struct dmar_satc_unit *satcu; 3094 struct acpi_dmar_satc *tmp; 3095 3096 list_for_each_entry_rcu(satcu, &dmar_satc_units, list, 3097 dmar_rcu_check()) { 3098 tmp = (struct acpi_dmar_satc *)satcu->hdr; 3099 if (satc->segment != tmp->segment) 3100 continue; 3101 if (satc->header.length != tmp->header.length) 3102 continue; 3103 if (memcmp(satc, tmp, satc->header.length) == 0) 3104 return satcu; 3105 } 3106 3107 return NULL; 3108 } 3109 3110 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg) 3111 { 3112 struct acpi_dmar_satc *satc; 3113 struct dmar_satc_unit *satcu; 3114 3115 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled) 3116 return 0; 3117 3118 satc = container_of(hdr, struct acpi_dmar_satc, header); 3119 satcu = dmar_find_satc(satc); 3120 if (satcu) 3121 return 0; 3122 3123 satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL); 3124 if (!satcu) 3125 return -ENOMEM; 3126 3127 satcu->hdr = (void *)(satcu + 1); 3128 memcpy(satcu->hdr, hdr, hdr->length); 3129 satcu->atc_required = satc->flags & 0x1; 3130 satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1), 3131 (void *)satc + satc->header.length, 3132 &satcu->devices_cnt); 3133 if (satcu->devices_cnt && !satcu->devices) { 3134 kfree(satcu); 3135 return -ENOMEM; 3136 } 3137 list_add_rcu(&satcu->list, &dmar_satc_units); 3138 3139 return 0; 3140 } 3141 3142 static int intel_iommu_add(struct dmar_drhd_unit *dmaru) 3143 { 3144 int sp, ret; 3145 struct intel_iommu *iommu = dmaru->iommu; 3146 3147 ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu); 3148 if (ret) 3149 goto out; 3150 3151 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) { 3152 pr_warn("%s: Doesn't support hardware pass through.\n", 3153 iommu->name); 3154 return -ENXIO; 3155 } 3156 3157 sp = domain_update_iommu_superpage(NULL, iommu) - 1; 3158 if (sp >= 0 && 
!(cap_super_page_val(iommu->cap) & (1 << sp))) { 3159 pr_warn("%s: Doesn't support large page.\n", 3160 iommu->name); 3161 return -ENXIO; 3162 } 3163 3164 /* 3165 * Disable translation if already enabled prior to OS handover. 3166 */ 3167 if (iommu->gcmd & DMA_GCMD_TE) 3168 iommu_disable_translation(iommu); 3169 3170 ret = iommu_init_domains(iommu); 3171 if (ret == 0) 3172 ret = iommu_alloc_root_entry(iommu); 3173 if (ret) 3174 goto out; 3175 3176 intel_svm_check(iommu); 3177 3178 if (dmaru->ignored) { 3179 /* 3180 * we always have to disable PMRs or DMA may fail on this device 3181 */ 3182 if (force_on) 3183 iommu_disable_protect_mem_regions(iommu); 3184 return 0; 3185 } 3186 3187 intel_iommu_init_qi(iommu); 3188 iommu_flush_write_buffer(iommu); 3189 3190 #ifdef CONFIG_INTEL_IOMMU_SVM 3191 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) { 3192 ret = intel_svm_enable_prq(iommu); 3193 if (ret) 3194 goto disable_iommu; 3195 } 3196 #endif 3197 ret = dmar_set_interrupt(iommu); 3198 if (ret) 3199 goto disable_iommu; 3200 3201 iommu_set_root_entry(iommu); 3202 iommu_enable_translation(iommu); 3203 3204 iommu_disable_protect_mem_regions(iommu); 3205 return 0; 3206 3207 disable_iommu: 3208 disable_dmar_iommu(iommu); 3209 out: 3210 free_dmar_iommu(iommu); 3211 return ret; 3212 } 3213 3214 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert) 3215 { 3216 int ret = 0; 3217 struct intel_iommu *iommu = dmaru->iommu; 3218 3219 if (!intel_iommu_enabled) 3220 return 0; 3221 if (iommu == NULL) 3222 return -EINVAL; 3223 3224 if (insert) { 3225 ret = intel_iommu_add(dmaru); 3226 } else { 3227 disable_dmar_iommu(iommu); 3228 free_dmar_iommu(iommu); 3229 } 3230 3231 return ret; 3232 } 3233 3234 static void intel_iommu_free_dmars(void) 3235 { 3236 struct dmar_rmrr_unit *rmrru, *rmrr_n; 3237 struct dmar_atsr_unit *atsru, *atsr_n; 3238 struct dmar_satc_unit *satcu, *satc_n; 3239 3240 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) { 3241 list_del(&rmrru->list); 3242 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt); 3243 kfree(rmrru); 3244 } 3245 3246 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) { 3247 list_del(&atsru->list); 3248 intel_iommu_free_atsr(atsru); 3249 } 3250 list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) { 3251 list_del(&satcu->list); 3252 dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt); 3253 kfree(satcu); 3254 } 3255 } 3256 3257 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev) 3258 { 3259 struct dmar_satc_unit *satcu; 3260 struct acpi_dmar_satc *satc; 3261 struct device *tmp; 3262 int i; 3263 3264 dev = pci_physfn(dev); 3265 rcu_read_lock(); 3266 3267 list_for_each_entry_rcu(satcu, &dmar_satc_units, list) { 3268 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header); 3269 if (satc->segment != pci_domain_nr(dev->bus)) 3270 continue; 3271 for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp) 3272 if (to_pci_dev(tmp) == dev) 3273 goto out; 3274 } 3275 satcu = NULL; 3276 out: 3277 rcu_read_unlock(); 3278 return satcu; 3279 } 3280 3281 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu) 3282 { 3283 int i, ret = 1; 3284 struct pci_bus *bus; 3285 struct pci_dev *bridge = NULL; 3286 struct device *tmp; 3287 struct acpi_dmar_atsr *atsr; 3288 struct dmar_atsr_unit *atsru; 3289 struct dmar_satc_unit *satcu; 3290 3291 dev = pci_physfn(dev); 3292 satcu = dmar_find_matched_satc_unit(dev); 3293 if (satcu) 3294 /* 3295 * This device supports ATS as it is in 
SATC table. 3296 * When IOMMU is in legacy mode, enabling ATS is done 3297 * automatically by HW for the device that requires 3298 * ATS, hence OS should not enable this device ATS 3299 * to avoid duplicated TLB invalidation. 3300 */ 3301 return !(satcu->atc_required && !sm_supported(iommu)); 3302 3303 for (bus = dev->bus; bus; bus = bus->parent) { 3304 bridge = bus->self; 3305 /* If it's an integrated device, allow ATS */ 3306 if (!bridge) 3307 return 1; 3308 /* Connected via non-PCIe: no ATS */ 3309 if (!pci_is_pcie(bridge) || 3310 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE) 3311 return 0; 3312 /* If we found the root port, look it up in the ATSR */ 3313 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT) 3314 break; 3315 } 3316 3317 rcu_read_lock(); 3318 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) { 3319 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header); 3320 if (atsr->segment != pci_domain_nr(dev->bus)) 3321 continue; 3322 3323 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp) 3324 if (tmp == &bridge->dev) 3325 goto out; 3326 3327 if (atsru->include_all) 3328 goto out; 3329 } 3330 ret = 0; 3331 out: 3332 rcu_read_unlock(); 3333 3334 return ret; 3335 } 3336 3337 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info) 3338 { 3339 int ret; 3340 struct dmar_rmrr_unit *rmrru; 3341 struct dmar_atsr_unit *atsru; 3342 struct dmar_satc_unit *satcu; 3343 struct acpi_dmar_atsr *atsr; 3344 struct acpi_dmar_reserved_memory *rmrr; 3345 struct acpi_dmar_satc *satc; 3346 3347 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING) 3348 return 0; 3349 3350 list_for_each_entry(rmrru, &dmar_rmrr_units, list) { 3351 rmrr = container_of(rmrru->hdr, 3352 struct acpi_dmar_reserved_memory, header); 3353 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 3354 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1), 3355 ((void *)rmrr) + rmrr->header.length, 3356 rmrr->segment, rmrru->devices, 3357 rmrru->devices_cnt); 3358 if (ret < 0) 3359 return ret; 3360 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 3361 dmar_remove_dev_scope(info, rmrr->segment, 3362 rmrru->devices, rmrru->devices_cnt); 3363 } 3364 } 3365 3366 list_for_each_entry(atsru, &dmar_atsr_units, list) { 3367 if (atsru->include_all) 3368 continue; 3369 3370 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header); 3371 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 3372 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1), 3373 (void *)atsr + atsr->header.length, 3374 atsr->segment, atsru->devices, 3375 atsru->devices_cnt); 3376 if (ret > 0) 3377 break; 3378 else if (ret < 0) 3379 return ret; 3380 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 3381 if (dmar_remove_dev_scope(info, atsr->segment, 3382 atsru->devices, atsru->devices_cnt)) 3383 break; 3384 } 3385 } 3386 list_for_each_entry(satcu, &dmar_satc_units, list) { 3387 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header); 3388 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 3389 ret = dmar_insert_dev_scope(info, (void *)(satc + 1), 3390 (void *)satc + satc->header.length, 3391 satc->segment, satcu->devices, 3392 satcu->devices_cnt); 3393 if (ret > 0) 3394 break; 3395 else if (ret < 0) 3396 return ret; 3397 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 3398 if (dmar_remove_dev_scope(info, satc->segment, 3399 satcu->devices, satcu->devices_cnt)) 3400 break; 3401 } 3402 } 3403 3404 return 0; 3405 } 3406 3407 static int intel_iommu_memory_notifier(struct notifier_block *nb, 3408 unsigned long val, void *v) 3409 { 
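	/*
	 * Memory hotplug handler for the static identity map: ranges going
	 * online are added to si_domain before devices can DMA to them, and
	 * ranges going offline are unmapped again with a page-selective IOTLB
	 * flush on every active IOMMU. It is only registered when si_domain is
	 * backed by real page tables, i.e. hardware pass-through is not in use.
	 */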
3410 struct memory_notify *mhp = v; 3411 unsigned long start_vpfn = mm_to_dma_pfn_start(mhp->start_pfn); 3412 unsigned long last_vpfn = mm_to_dma_pfn_end(mhp->start_pfn + 3413 mhp->nr_pages - 1); 3414 3415 switch (val) { 3416 case MEM_GOING_ONLINE: 3417 if (iommu_domain_identity_map(si_domain, 3418 start_vpfn, last_vpfn)) { 3419 pr_warn("Failed to build identity map for [%lx-%lx]\n", 3420 start_vpfn, last_vpfn); 3421 return NOTIFY_BAD; 3422 } 3423 break; 3424 3425 case MEM_OFFLINE: 3426 case MEM_CANCEL_ONLINE: 3427 { 3428 struct dmar_drhd_unit *drhd; 3429 struct intel_iommu *iommu; 3430 LIST_HEAD(freelist); 3431 3432 domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist); 3433 3434 rcu_read_lock(); 3435 for_each_active_iommu(iommu, drhd) 3436 iommu_flush_iotlb_psi(iommu, si_domain, 3437 start_vpfn, mhp->nr_pages, 3438 list_empty(&freelist), 0); 3439 rcu_read_unlock(); 3440 put_pages_list(&freelist); 3441 } 3442 break; 3443 } 3444 3445 return NOTIFY_OK; 3446 } 3447 3448 static struct notifier_block intel_iommu_memory_nb = { 3449 .notifier_call = intel_iommu_memory_notifier, 3450 .priority = 0 3451 }; 3452 3453 static void intel_disable_iommus(void) 3454 { 3455 struct intel_iommu *iommu = NULL; 3456 struct dmar_drhd_unit *drhd; 3457 3458 for_each_iommu(iommu, drhd) 3459 iommu_disable_translation(iommu); 3460 } 3461 3462 void intel_iommu_shutdown(void) 3463 { 3464 struct dmar_drhd_unit *drhd; 3465 struct intel_iommu *iommu = NULL; 3466 3467 if (no_iommu || dmar_disabled) 3468 return; 3469 3470 down_write(&dmar_global_lock); 3471 3472 /* Disable PMRs explicitly here. */ 3473 for_each_iommu(iommu, drhd) 3474 iommu_disable_protect_mem_regions(iommu); 3475 3476 /* Make sure the IOMMUs are switched off */ 3477 intel_disable_iommus(); 3478 3479 up_write(&dmar_global_lock); 3480 } 3481 3482 static struct intel_iommu *dev_to_intel_iommu(struct device *dev) 3483 { 3484 struct iommu_device *iommu_dev = dev_to_iommu_device(dev); 3485 3486 return container_of(iommu_dev, struct intel_iommu, iommu); 3487 } 3488 3489 static ssize_t version_show(struct device *dev, 3490 struct device_attribute *attr, char *buf) 3491 { 3492 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3493 u32 ver = readl(iommu->reg + DMAR_VER_REG); 3494 return sysfs_emit(buf, "%d:%d\n", 3495 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver)); 3496 } 3497 static DEVICE_ATTR_RO(version); 3498 3499 static ssize_t address_show(struct device *dev, 3500 struct device_attribute *attr, char *buf) 3501 { 3502 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3503 return sysfs_emit(buf, "%llx\n", iommu->reg_phys); 3504 } 3505 static DEVICE_ATTR_RO(address); 3506 3507 static ssize_t cap_show(struct device *dev, 3508 struct device_attribute *attr, char *buf) 3509 { 3510 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3511 return sysfs_emit(buf, "%llx\n", iommu->cap); 3512 } 3513 static DEVICE_ATTR_RO(cap); 3514 3515 static ssize_t ecap_show(struct device *dev, 3516 struct device_attribute *attr, char *buf) 3517 { 3518 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3519 return sysfs_emit(buf, "%llx\n", iommu->ecap); 3520 } 3521 static DEVICE_ATTR_RO(ecap); 3522 3523 static ssize_t domains_supported_show(struct device *dev, 3524 struct device_attribute *attr, char *buf) 3525 { 3526 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3527 return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap)); 3528 } 3529 static DEVICE_ATTR_RO(domains_supported); 3530 3531 static ssize_t domains_used_show(struct device *dev, 3532 struct device_attribute 
*attr, char *buf) 3533 { 3534 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3535 return sysfs_emit(buf, "%d\n", 3536 bitmap_weight(iommu->domain_ids, 3537 cap_ndoms(iommu->cap))); 3538 } 3539 static DEVICE_ATTR_RO(domains_used); 3540 3541 static struct attribute *intel_iommu_attrs[] = { 3542 &dev_attr_version.attr, 3543 &dev_attr_address.attr, 3544 &dev_attr_cap.attr, 3545 &dev_attr_ecap.attr, 3546 &dev_attr_domains_supported.attr, 3547 &dev_attr_domains_used.attr, 3548 NULL, 3549 }; 3550 3551 static struct attribute_group intel_iommu_group = { 3552 .name = "intel-iommu", 3553 .attrs = intel_iommu_attrs, 3554 }; 3555 3556 const struct attribute_group *intel_iommu_groups[] = { 3557 &intel_iommu_group, 3558 NULL, 3559 }; 3560 3561 static bool has_external_pci(void) 3562 { 3563 struct pci_dev *pdev = NULL; 3564 3565 for_each_pci_dev(pdev) 3566 if (pdev->external_facing) { 3567 pci_dev_put(pdev); 3568 return true; 3569 } 3570 3571 return false; 3572 } 3573 3574 static int __init platform_optin_force_iommu(void) 3575 { 3576 if (!dmar_platform_optin() || no_platform_optin || !has_external_pci()) 3577 return 0; 3578 3579 if (no_iommu || dmar_disabled) 3580 pr_info("Intel-IOMMU force enabled due to platform opt in\n"); 3581 3582 /* 3583 * If Intel-IOMMU is disabled by default, we will apply identity 3584 * map for all devices except those marked as being untrusted. 3585 */ 3586 if (dmar_disabled) 3587 iommu_set_default_passthrough(false); 3588 3589 dmar_disabled = 0; 3590 no_iommu = 0; 3591 3592 return 1; 3593 } 3594 3595 static int __init probe_acpi_namespace_devices(void) 3596 { 3597 struct dmar_drhd_unit *drhd; 3598 /* To avoid a -Wunused-but-set-variable warning. */ 3599 struct intel_iommu *iommu __maybe_unused; 3600 struct device *dev; 3601 int i, ret = 0; 3602 3603 for_each_active_iommu(iommu, drhd) { 3604 for_each_active_dev_scope(drhd->devices, 3605 drhd->devices_cnt, i, dev) { 3606 struct acpi_device_physical_node *pn; 3607 struct acpi_device *adev; 3608 3609 if (dev->bus != &acpi_bus_type) 3610 continue; 3611 3612 adev = to_acpi_device(dev); 3613 mutex_lock(&adev->physical_node_lock); 3614 list_for_each_entry(pn, 3615 &adev->physical_node_list, node) { 3616 ret = iommu_probe_device(pn->dev); 3617 if (ret) 3618 break; 3619 } 3620 mutex_unlock(&adev->physical_node_lock); 3621 3622 if (ret) 3623 return ret; 3624 } 3625 } 3626 3627 return 0; 3628 } 3629 3630 static __init int tboot_force_iommu(void) 3631 { 3632 if (!tboot_enabled()) 3633 return 0; 3634 3635 if (no_iommu || dmar_disabled) 3636 pr_warn("Forcing Intel-IOMMU to enabled\n"); 3637 3638 dmar_disabled = 0; 3639 no_iommu = 0; 3640 3641 return 1; 3642 } 3643 3644 int __init intel_iommu_init(void) 3645 { 3646 int ret = -ENODEV; 3647 struct dmar_drhd_unit *drhd; 3648 struct intel_iommu *iommu; 3649 3650 /* 3651 * Intel IOMMU is required for a TXT/tboot launch or platform 3652 * opt in, so enforce that. 3653 */ 3654 force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) || 3655 platform_optin_force_iommu(); 3656 3657 down_write(&dmar_global_lock); 3658 if (dmar_table_init()) { 3659 if (force_on) 3660 panic("tboot: Failed to initialize DMAR table\n"); 3661 goto out_free_dmar; 3662 } 3663 3664 if (dmar_dev_scope_init() < 0) { 3665 if (force_on) 3666 panic("tboot: Failed to initialize DMAR device scope\n"); 3667 goto out_free_dmar; 3668 } 3669 3670 up_write(&dmar_global_lock); 3671 3672 /* 3673 * The bus notifier takes the dmar_global_lock, so lockdep will 3674 * complain later when we register it under the lock. 
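 *
 * Hence dmar_global_lock was dropped just above and is only re-taken
 * once the notifier has been registered.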
3675 */ 3676 dmar_register_bus_notifier(); 3677 3678 down_write(&dmar_global_lock); 3679 3680 if (!no_iommu) 3681 intel_iommu_debugfs_init(); 3682 3683 if (no_iommu || dmar_disabled) { 3684 /* 3685 * We exit the function here to ensure IOMMU's remapping and 3686 * mempool aren't setup, which means that the IOMMU's PMRs 3687 * won't be disabled via the call to init_dmars(). So disable 3688 * it explicitly here. The PMRs were setup by tboot prior to 3689 * calling SENTER, but the kernel is expected to reset/tear 3690 * down the PMRs. 3691 */ 3692 if (intel_iommu_tboot_noforce) { 3693 for_each_iommu(iommu, drhd) 3694 iommu_disable_protect_mem_regions(iommu); 3695 } 3696 3697 /* 3698 * Make sure the IOMMUs are switched off, even when we 3699 * boot into a kexec kernel and the previous kernel left 3700 * them enabled 3701 */ 3702 intel_disable_iommus(); 3703 goto out_free_dmar; 3704 } 3705 3706 if (list_empty(&dmar_rmrr_units)) 3707 pr_info("No RMRR found\n"); 3708 3709 if (list_empty(&dmar_atsr_units)) 3710 pr_info("No ATSR found\n"); 3711 3712 if (list_empty(&dmar_satc_units)) 3713 pr_info("No SATC found\n"); 3714 3715 init_no_remapping_devices(); 3716 3717 ret = init_dmars(); 3718 if (ret) { 3719 if (force_on) 3720 panic("tboot: Failed to initialize DMARs\n"); 3721 pr_err("Initialization failed\n"); 3722 goto out_free_dmar; 3723 } 3724 up_write(&dmar_global_lock); 3725 3726 init_iommu_pm_ops(); 3727 3728 down_read(&dmar_global_lock); 3729 for_each_active_iommu(iommu, drhd) { 3730 /* 3731 * The flush queue implementation does not perform 3732 * page-selective invalidations that are required for efficient 3733 * TLB flushes in virtual environments. The benefit of batching 3734 * is likely to be much lower than the overhead of synchronizing 3735 * the virtual and physical IOMMU page-tables. 3736 */ 3737 if (cap_caching_mode(iommu->cap) && 3738 !first_level_by_default(IOMMU_DOMAIN_DMA)) { 3739 pr_info_once("IOMMU batching disallowed due to virtualization\n"); 3740 iommu_set_dma_strict(); 3741 } 3742 iommu_device_sysfs_add(&iommu->iommu, NULL, 3743 intel_iommu_groups, 3744 "%s", iommu->name); 3745 iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL); 3746 3747 iommu_pmu_register(iommu); 3748 } 3749 up_read(&dmar_global_lock); 3750 3751 if (si_domain && !hw_pass_through) 3752 register_memory_notifier(&intel_iommu_memory_nb); 3753 3754 down_read(&dmar_global_lock); 3755 if (probe_acpi_namespace_devices()) 3756 pr_warn("ACPI name space devices didn't probe correctly\n"); 3757 3758 /* Finally, we enable the DMA remapping hardware. */ 3759 for_each_iommu(iommu, drhd) { 3760 if (!drhd->ignored && !translation_pre_enabled(iommu)) 3761 iommu_enable_translation(iommu); 3762 3763 iommu_disable_protect_mem_regions(iommu); 3764 } 3765 up_read(&dmar_global_lock); 3766 3767 pr_info("Intel(R) Virtualization Technology for Directed I/O\n"); 3768 3769 intel_iommu_enabled = 1; 3770 3771 return 0; 3772 3773 out_free_dmar: 3774 intel_iommu_free_dmars(); 3775 up_write(&dmar_global_lock); 3776 return ret; 3777 } 3778 3779 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque) 3780 { 3781 struct device_domain_info *info = opaque; 3782 3783 domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff); 3784 return 0; 3785 } 3786 3787 /* 3788 * NB - intel-iommu lacks any sort of reference counting for the users of 3789 * dependent devices. 
If multiple endpoints have intersecting dependent 3790 * devices, unbinding the driver from any one of them will possibly leave 3791 * the others unable to operate. 3792 */ 3793 static void domain_context_clear(struct device_domain_info *info) 3794 { 3795 if (!dev_is_pci(info->dev)) 3796 domain_context_clear_one(info, info->bus, info->devfn); 3797 3798 pci_for_each_dma_alias(to_pci_dev(info->dev), 3799 &domain_context_clear_one_cb, info); 3800 } 3801 3802 static void dmar_remove_one_dev_info(struct device *dev) 3803 { 3804 struct device_domain_info *info = dev_iommu_priv_get(dev); 3805 struct dmar_domain *domain = info->domain; 3806 struct intel_iommu *iommu = info->iommu; 3807 unsigned long flags; 3808 3809 if (!dev_is_real_dma_subdevice(info->dev)) { 3810 if (dev_is_pci(info->dev) && sm_supported(iommu)) 3811 intel_pasid_tear_down_entry(iommu, info->dev, 3812 IOMMU_NO_PASID, false); 3813 3814 iommu_disable_pci_caps(info); 3815 domain_context_clear(info); 3816 } 3817 3818 spin_lock_irqsave(&domain->lock, flags); 3819 list_del(&info->link); 3820 spin_unlock_irqrestore(&domain->lock, flags); 3821 3822 domain_detach_iommu(domain, iommu); 3823 info->domain = NULL; 3824 } 3825 3826 /* 3827 * Clear the page table pointer in context or pasid table entries so that 3828 * all DMA requests without PASID from the device are blocked. If the page 3829 * table has been set, clean up the data structures. 3830 */ 3831 void device_block_translation(struct device *dev) 3832 { 3833 struct device_domain_info *info = dev_iommu_priv_get(dev); 3834 struct intel_iommu *iommu = info->iommu; 3835 unsigned long flags; 3836 3837 iommu_disable_pci_caps(info); 3838 if (!dev_is_real_dma_subdevice(dev)) { 3839 if (sm_supported(iommu)) 3840 intel_pasid_tear_down_entry(iommu, dev, 3841 IOMMU_NO_PASID, false); 3842 else 3843 domain_context_clear(info); 3844 } 3845 3846 if (!info->domain) 3847 return; 3848 3849 spin_lock_irqsave(&info->domain->lock, flags); 3850 list_del(&info->link); 3851 spin_unlock_irqrestore(&info->domain->lock, flags); 3852 3853 domain_detach_iommu(info->domain, iommu); 3854 info->domain = NULL; 3855 } 3856 3857 static int md_domain_init(struct dmar_domain *domain, int guest_width) 3858 { 3859 int adjust_width; 3860 3861 /* calculate AGAW */ 3862 domain->gaw = guest_width; 3863 adjust_width = guestwidth_to_adjustwidth(guest_width); 3864 domain->agaw = width_to_agaw(adjust_width); 3865 3866 domain->iommu_coherency = false; 3867 domain->iommu_superpage = 0; 3868 domain->max_addr = 0; 3869 3870 /* always allocate the top pgd */ 3871 domain->pgd = alloc_pgtable_page(domain->nid, GFP_ATOMIC); 3872 if (!domain->pgd) 3873 return -ENOMEM; 3874 domain_flush_cache(domain, domain->pgd, PAGE_SIZE); 3875 return 0; 3876 } 3877 3878 static int blocking_domain_attach_dev(struct iommu_domain *domain, 3879 struct device *dev) 3880 { 3881 device_block_translation(dev); 3882 return 0; 3883 } 3884 3885 static struct iommu_domain blocking_domain = { 3886 .type = IOMMU_DOMAIN_BLOCKED, 3887 .ops = &(const struct iommu_domain_ops) { 3888 .attach_dev = blocking_domain_attach_dev, 3889 } 3890 }; 3891 3892 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type) 3893 { 3894 struct dmar_domain *dmar_domain; 3895 struct iommu_domain *domain; 3896 3897 switch (type) { 3898 case IOMMU_DOMAIN_DMA: 3899 case IOMMU_DOMAIN_UNMANAGED: 3900 dmar_domain = alloc_domain(type); 3901 if (!dmar_domain) { 3902 pr_err("Can't allocate dmar_domain\n"); 3903 return NULL; 3904 } 3905 if (md_domain_init(dmar_domain, 
DEFAULT_DOMAIN_ADDRESS_WIDTH)) { 3906 pr_err("Domain initialization failed\n"); 3907 domain_exit(dmar_domain); 3908 return NULL; 3909 } 3910 3911 domain = &dmar_domain->domain; 3912 domain->geometry.aperture_start = 0; 3913 domain->geometry.aperture_end = 3914 __DOMAIN_MAX_ADDR(dmar_domain->gaw); 3915 domain->geometry.force_aperture = true; 3916 3917 return domain; 3918 case IOMMU_DOMAIN_IDENTITY: 3919 return &si_domain->domain; 3920 case IOMMU_DOMAIN_SVA: 3921 return intel_svm_domain_alloc(); 3922 default: 3923 return NULL; 3924 } 3925 3926 return NULL; 3927 } 3928 3929 static struct iommu_domain * 3930 intel_iommu_domain_alloc_user(struct device *dev, u32 flags, 3931 struct iommu_domain *parent, 3932 const struct iommu_user_data *user_data) 3933 { 3934 struct device_domain_info *info = dev_iommu_priv_get(dev); 3935 bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; 3936 bool nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT; 3937 struct intel_iommu *iommu = info->iommu; 3938 struct dmar_domain *dmar_domain; 3939 struct iommu_domain *domain; 3940 3941 /* Must be NESTING domain */ 3942 if (parent) { 3943 if (!nested_supported(iommu) || flags) 3944 return ERR_PTR(-EOPNOTSUPP); 3945 return intel_nested_domain_alloc(parent, user_data); 3946 } 3947 3948 if (flags & 3949 (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING))) 3950 return ERR_PTR(-EOPNOTSUPP); 3951 if (nested_parent && !nested_supported(iommu)) 3952 return ERR_PTR(-EOPNOTSUPP); 3953 if (user_data || (dirty_tracking && !ssads_supported(iommu))) 3954 return ERR_PTR(-EOPNOTSUPP); 3955 3956 /* 3957 * domain_alloc_user op needs to fully initialize a domain before 3958 * return, so uses iommu_domain_alloc() here for simple. 3959 */ 3960 domain = iommu_domain_alloc(dev->bus); 3961 if (!domain) 3962 return ERR_PTR(-ENOMEM); 3963 3964 dmar_domain = to_dmar_domain(domain); 3965 3966 if (nested_parent) { 3967 dmar_domain->nested_parent = true; 3968 INIT_LIST_HEAD(&dmar_domain->s1_domains); 3969 spin_lock_init(&dmar_domain->s1_lock); 3970 } 3971 3972 if (dirty_tracking) { 3973 if (dmar_domain->use_first_level) { 3974 iommu_domain_free(domain); 3975 return ERR_PTR(-EOPNOTSUPP); 3976 } 3977 domain->dirty_ops = &intel_dirty_ops; 3978 } 3979 3980 return domain; 3981 } 3982 3983 static void intel_iommu_domain_free(struct iommu_domain *domain) 3984 { 3985 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 3986 3987 WARN_ON(dmar_domain->nested_parent && 3988 !list_empty(&dmar_domain->s1_domains)); 3989 if (domain != &si_domain->domain) 3990 domain_exit(dmar_domain); 3991 } 3992 3993 int prepare_domain_attach_device(struct iommu_domain *domain, 3994 struct device *dev) 3995 { 3996 struct device_domain_info *info = dev_iommu_priv_get(dev); 3997 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 3998 struct intel_iommu *iommu = info->iommu; 3999 int addr_width; 4000 4001 if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap)) 4002 return -EINVAL; 4003 4004 if (domain->dirty_ops && !ssads_supported(iommu)) 4005 return -EINVAL; 4006 4007 /* check if this iommu agaw is sufficient for max mapped address */ 4008 addr_width = agaw_to_width(iommu->agaw); 4009 if (addr_width > cap_mgaw(iommu->cap)) 4010 addr_width = cap_mgaw(iommu->cap); 4011 4012 if (dmar_domain->max_addr > (1LL << addr_width)) 4013 return -EINVAL; 4014 dmar_domain->gaw = addr_width; 4015 4016 /* 4017 * Knock out extra levels of page tables if necessary 4018 */ 4019 while (iommu->agaw < dmar_domain->agaw) { 4020 struct dma_pte *pte; 4021 
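		/*
		 * Each pass peels off the current top level: pgd moves down to
		 * the table it references (when one is present) and the old top
		 * page is freed, shrinking the walk depth (and the usable
		 * address width by 9 bits per level) until it matches this
		 * IOMMU's agaw.
		 */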
4022 pte = dmar_domain->pgd; 4023 if (dma_pte_present(pte)) { 4024 dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte)); 4025 free_pgtable_page(pte); 4026 } 4027 dmar_domain->agaw--; 4028 } 4029 4030 return 0; 4031 } 4032 4033 static int intel_iommu_attach_device(struct iommu_domain *domain, 4034 struct device *dev) 4035 { 4036 struct device_domain_info *info = dev_iommu_priv_get(dev); 4037 int ret; 4038 4039 if (info->domain) 4040 device_block_translation(dev); 4041 4042 ret = prepare_domain_attach_device(domain, dev); 4043 if (ret) 4044 return ret; 4045 4046 return dmar_domain_attach_device(to_dmar_domain(domain), dev); 4047 } 4048 4049 static int intel_iommu_map(struct iommu_domain *domain, 4050 unsigned long iova, phys_addr_t hpa, 4051 size_t size, int iommu_prot, gfp_t gfp) 4052 { 4053 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4054 u64 max_addr; 4055 int prot = 0; 4056 4057 if (iommu_prot & IOMMU_READ) 4058 prot |= DMA_PTE_READ; 4059 if (iommu_prot & IOMMU_WRITE) 4060 prot |= DMA_PTE_WRITE; 4061 if (dmar_domain->set_pte_snp) 4062 prot |= DMA_PTE_SNP; 4063 4064 max_addr = iova + size; 4065 if (dmar_domain->max_addr < max_addr) { 4066 u64 end; 4067 4068 /* check if minimum agaw is sufficient for mapped address */ 4069 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1; 4070 if (end < max_addr) { 4071 pr_err("%s: iommu width (%d) is not " 4072 "sufficient for the mapped address (%llx)\n", 4073 __func__, dmar_domain->gaw, max_addr); 4074 return -EFAULT; 4075 } 4076 dmar_domain->max_addr = max_addr; 4077 } 4078 /* Round up size to next multiple of PAGE_SIZE, if it and 4079 the low bits of hpa would take us onto the next page */ 4080 size = aligned_nrpages(hpa, size); 4081 return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT, 4082 hpa >> VTD_PAGE_SHIFT, size, prot, gfp); 4083 } 4084 4085 static int intel_iommu_map_pages(struct iommu_domain *domain, 4086 unsigned long iova, phys_addr_t paddr, 4087 size_t pgsize, size_t pgcount, 4088 int prot, gfp_t gfp, size_t *mapped) 4089 { 4090 unsigned long pgshift = __ffs(pgsize); 4091 size_t size = pgcount << pgshift; 4092 int ret; 4093 4094 if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G) 4095 return -EINVAL; 4096 4097 if (!IS_ALIGNED(iova | paddr, pgsize)) 4098 return -EINVAL; 4099 4100 ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp); 4101 if (!ret && mapped) 4102 *mapped = size; 4103 4104 return ret; 4105 } 4106 4107 static size_t intel_iommu_unmap(struct iommu_domain *domain, 4108 unsigned long iova, size_t size, 4109 struct iommu_iotlb_gather *gather) 4110 { 4111 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4112 unsigned long start_pfn, last_pfn; 4113 int level = 0; 4114 4115 /* Cope with horrid API which requires us to unmap more than the 4116 size argument if it happens to be a large-page mapping. */ 4117 if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, 4118 &level, GFP_ATOMIC))) 4119 return 0; 4120 4121 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level)) 4122 size = VTD_PAGE_SIZE << level_to_offset_bits(level); 4123 4124 start_pfn = iova >> VTD_PAGE_SHIFT; 4125 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT; 4126 4127 domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist); 4128 4129 if (dmar_domain->max_addr == iova + size) 4130 dmar_domain->max_addr = iova; 4131 4132 /* 4133 * We do not use page-selective IOTLB invalidation in flush queue, 4134 * so there is no need to track page and sync iotlb. 
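 *
 * In other words, deferred (queued) unmaps are covered by the flush
 * queue's coarser, non-page-selective invalidation, so only synchronous
 * unmaps accumulate their ranges in the gather for
 * intel_iommu_tlb_sync() to flush.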
4135 */ 4136 if (!iommu_iotlb_gather_queued(gather)) 4137 iommu_iotlb_gather_add_page(domain, gather, iova, size); 4138 4139 return size; 4140 } 4141 4142 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain, 4143 unsigned long iova, 4144 size_t pgsize, size_t pgcount, 4145 struct iommu_iotlb_gather *gather) 4146 { 4147 unsigned long pgshift = __ffs(pgsize); 4148 size_t size = pgcount << pgshift; 4149 4150 return intel_iommu_unmap(domain, iova, size, gather); 4151 } 4152 4153 static void intel_iommu_tlb_sync(struct iommu_domain *domain, 4154 struct iommu_iotlb_gather *gather) 4155 { 4156 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4157 unsigned long iova_pfn = IOVA_PFN(gather->start); 4158 size_t size = gather->end - gather->start; 4159 struct iommu_domain_info *info; 4160 unsigned long start_pfn; 4161 unsigned long nrpages; 4162 unsigned long i; 4163 4164 nrpages = aligned_nrpages(gather->start, size); 4165 start_pfn = mm_to_dma_pfn_start(iova_pfn); 4166 4167 xa_for_each(&dmar_domain->iommu_array, i, info) 4168 iommu_flush_iotlb_psi(info->iommu, dmar_domain, 4169 start_pfn, nrpages, 4170 list_empty(&gather->freelist), 0); 4171 4172 if (dmar_domain->nested_parent) 4173 parent_domain_flush(dmar_domain, start_pfn, nrpages, 4174 list_empty(&gather->freelist)); 4175 put_pages_list(&gather->freelist); 4176 } 4177 4178 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain, 4179 dma_addr_t iova) 4180 { 4181 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4182 struct dma_pte *pte; 4183 int level = 0; 4184 u64 phys = 0; 4185 4186 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level, 4187 GFP_ATOMIC); 4188 if (pte && dma_pte_present(pte)) 4189 phys = dma_pte_addr(pte) + 4190 (iova & (BIT_MASK(level_to_offset_bits(level) + 4191 VTD_PAGE_SHIFT) - 1)); 4192 4193 return phys; 4194 } 4195 4196 static bool domain_support_force_snooping(struct dmar_domain *domain) 4197 { 4198 struct device_domain_info *info; 4199 bool support = true; 4200 4201 assert_spin_locked(&domain->lock); 4202 list_for_each_entry(info, &domain->devices, link) { 4203 if (!ecap_sc_support(info->iommu->ecap)) { 4204 support = false; 4205 break; 4206 } 4207 } 4208 4209 return support; 4210 } 4211 4212 static void domain_set_force_snooping(struct dmar_domain *domain) 4213 { 4214 struct device_domain_info *info; 4215 4216 assert_spin_locked(&domain->lock); 4217 /* 4218 * Second level page table supports per-PTE snoop control. The 4219 * iommu_map() interface will handle this by setting SNP bit. 
4220 */ 4221 if (!domain->use_first_level) { 4222 domain->set_pte_snp = true; 4223 return; 4224 } 4225 4226 list_for_each_entry(info, &domain->devices, link) 4227 intel_pasid_setup_page_snoop_control(info->iommu, info->dev, 4228 IOMMU_NO_PASID); 4229 } 4230 4231 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain) 4232 { 4233 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4234 unsigned long flags; 4235 4236 if (dmar_domain->force_snooping) 4237 return true; 4238 4239 spin_lock_irqsave(&dmar_domain->lock, flags); 4240 if (!domain_support_force_snooping(dmar_domain) || 4241 (!dmar_domain->use_first_level && dmar_domain->has_mappings)) { 4242 spin_unlock_irqrestore(&dmar_domain->lock, flags); 4243 return false; 4244 } 4245 4246 domain_set_force_snooping(dmar_domain); 4247 dmar_domain->force_snooping = true; 4248 spin_unlock_irqrestore(&dmar_domain->lock, flags); 4249 4250 return true; 4251 } 4252 4253 static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap) 4254 { 4255 struct device_domain_info *info = dev_iommu_priv_get(dev); 4256 4257 switch (cap) { 4258 case IOMMU_CAP_CACHE_COHERENCY: 4259 case IOMMU_CAP_DEFERRED_FLUSH: 4260 return true; 4261 case IOMMU_CAP_PRE_BOOT_PROTECTION: 4262 return dmar_platform_optin(); 4263 case IOMMU_CAP_ENFORCE_CACHE_COHERENCY: 4264 return ecap_sc_support(info->iommu->ecap); 4265 case IOMMU_CAP_DIRTY_TRACKING: 4266 return ssads_supported(info->iommu); 4267 default: 4268 return false; 4269 } 4270 } 4271 4272 static struct iommu_device *intel_iommu_probe_device(struct device *dev) 4273 { 4274 struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL; 4275 struct device_domain_info *info; 4276 struct intel_iommu *iommu; 4277 u8 bus, devfn; 4278 int ret; 4279 4280 iommu = device_lookup_iommu(dev, &bus, &devfn); 4281 if (!iommu || !iommu->iommu.ops) 4282 return ERR_PTR(-ENODEV); 4283 4284 info = kzalloc(sizeof(*info), GFP_KERNEL); 4285 if (!info) 4286 return ERR_PTR(-ENOMEM); 4287 4288 if (dev_is_real_dma_subdevice(dev)) { 4289 info->bus = pdev->bus->number; 4290 info->devfn = pdev->devfn; 4291 info->segment = pci_domain_nr(pdev->bus); 4292 } else { 4293 info->bus = bus; 4294 info->devfn = devfn; 4295 info->segment = iommu->segment; 4296 } 4297 4298 info->dev = dev; 4299 info->iommu = iommu; 4300 if (dev_is_pci(dev)) { 4301 if (ecap_dev_iotlb_support(iommu->ecap) && 4302 pci_ats_supported(pdev) && 4303 dmar_ats_supported(pdev, iommu)) { 4304 info->ats_supported = 1; 4305 info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev); 4306 4307 /* 4308 * For IOMMU that supports device IOTLB throttling 4309 * (DIT), we assign PFSID to the invalidation desc 4310 * of a VF such that IOMMU HW can gauge queue depth 4311 * at PF level. If DIT is not set, PFSID will be 4312 * treated as reserved, which should be set to 0. 
4313 */ 4314 if (ecap_dit(iommu->ecap)) 4315 info->pfsid = pci_dev_id(pci_physfn(pdev)); 4316 info->ats_qdep = pci_ats_queue_depth(pdev); 4317 } 4318 if (sm_supported(iommu)) { 4319 if (pasid_supported(iommu)) { 4320 int features = pci_pasid_features(pdev); 4321 4322 if (features >= 0) 4323 info->pasid_supported = features | 1; 4324 } 4325 4326 if (info->ats_supported && ecap_prs(iommu->ecap) && 4327 pci_pri_supported(pdev)) 4328 info->pri_supported = 1; 4329 } 4330 } 4331 4332 dev_iommu_priv_set(dev, info); 4333 4334 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) { 4335 ret = intel_pasid_alloc_table(dev); 4336 if (ret) { 4337 dev_err(dev, "PASID table allocation failed\n"); 4338 kfree(info); 4339 return ERR_PTR(ret); 4340 } 4341 } 4342 4343 intel_iommu_debugfs_create_dev(info); 4344 4345 return &iommu->iommu; 4346 } 4347 4348 static void intel_iommu_release_device(struct device *dev) 4349 { 4350 struct device_domain_info *info = dev_iommu_priv_get(dev); 4351 4352 dmar_remove_one_dev_info(dev); 4353 intel_pasid_free_table(dev); 4354 intel_iommu_debugfs_remove_dev(info); 4355 kfree(info); 4356 set_dma_ops(dev, NULL); 4357 } 4358 4359 static void intel_iommu_probe_finalize(struct device *dev) 4360 { 4361 set_dma_ops(dev, NULL); 4362 iommu_setup_dma_ops(dev, 0, U64_MAX); 4363 } 4364 4365 static void intel_iommu_get_resv_regions(struct device *device, 4366 struct list_head *head) 4367 { 4368 int prot = DMA_PTE_READ | DMA_PTE_WRITE; 4369 struct iommu_resv_region *reg; 4370 struct dmar_rmrr_unit *rmrr; 4371 struct device *i_dev; 4372 int i; 4373 4374 rcu_read_lock(); 4375 for_each_rmrr_units(rmrr) { 4376 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt, 4377 i, i_dev) { 4378 struct iommu_resv_region *resv; 4379 enum iommu_resv_type type; 4380 size_t length; 4381 4382 if (i_dev != device && 4383 !is_downstream_to_pci_bridge(device, i_dev)) 4384 continue; 4385 4386 length = rmrr->end_address - rmrr->base_address + 1; 4387 4388 type = device_rmrr_is_relaxable(device) ? 
4389 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT; 4390 4391 resv = iommu_alloc_resv_region(rmrr->base_address, 4392 length, prot, type, 4393 GFP_ATOMIC); 4394 if (!resv) 4395 break; 4396 4397 list_add_tail(&resv->list, head); 4398 } 4399 } 4400 rcu_read_unlock(); 4401 4402 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA 4403 if (dev_is_pci(device)) { 4404 struct pci_dev *pdev = to_pci_dev(device); 4405 4406 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) { 4407 reg = iommu_alloc_resv_region(0, 1UL << 24, prot, 4408 IOMMU_RESV_DIRECT_RELAXABLE, 4409 GFP_KERNEL); 4410 if (reg) 4411 list_add_tail(®->list, head); 4412 } 4413 } 4414 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */ 4415 4416 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START, 4417 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1, 4418 0, IOMMU_RESV_MSI, GFP_KERNEL); 4419 if (!reg) 4420 return; 4421 list_add_tail(®->list, head); 4422 } 4423 4424 static struct iommu_group *intel_iommu_device_group(struct device *dev) 4425 { 4426 if (dev_is_pci(dev)) 4427 return pci_device_group(dev); 4428 return generic_device_group(dev); 4429 } 4430 4431 static int intel_iommu_enable_sva(struct device *dev) 4432 { 4433 struct device_domain_info *info = dev_iommu_priv_get(dev); 4434 struct intel_iommu *iommu; 4435 4436 if (!info || dmar_disabled) 4437 return -EINVAL; 4438 4439 iommu = info->iommu; 4440 if (!iommu) 4441 return -EINVAL; 4442 4443 if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE)) 4444 return -ENODEV; 4445 4446 if (!info->pasid_enabled || !info->ats_enabled) 4447 return -EINVAL; 4448 4449 /* 4450 * Devices having device-specific I/O fault handling should not 4451 * support PCI/PRI. The IOMMU side has no means to check the 4452 * capability of device-specific IOPF. Therefore, IOMMU can only 4453 * default that if the device driver enables SVA on a non-PRI 4454 * device, it will handle IOPF in its own way. 4455 */ 4456 if (!info->pri_supported) 4457 return 0; 4458 4459 /* Devices supporting PRI should have it enabled. */ 4460 if (!info->pri_enabled) 4461 return -EINVAL; 4462 4463 return 0; 4464 } 4465 4466 static int intel_iommu_enable_iopf(struct device *dev) 4467 { 4468 struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL; 4469 struct device_domain_info *info = dev_iommu_priv_get(dev); 4470 struct intel_iommu *iommu; 4471 int ret; 4472 4473 if (!pdev || !info || !info->ats_enabled || !info->pri_supported) 4474 return -ENODEV; 4475 4476 if (info->pri_enabled) 4477 return -EBUSY; 4478 4479 iommu = info->iommu; 4480 if (!iommu) 4481 return -EINVAL; 4482 4483 /* PASID is required in PRG Response Message. 
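 * A device with PASID enabled must also indicate that its PRG
 * responses carry the PASID; otherwise IOPF cannot be enabled for it.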
*/ 4484 if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev)) 4485 return -EINVAL; 4486 4487 ret = pci_reset_pri(pdev); 4488 if (ret) 4489 return ret; 4490 4491 ret = iopf_queue_add_device(iommu->iopf_queue, dev); 4492 if (ret) 4493 return ret; 4494 4495 ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev); 4496 if (ret) 4497 goto iopf_remove_device; 4498 4499 ret = pci_enable_pri(pdev, PRQ_DEPTH); 4500 if (ret) 4501 goto iopf_unregister_handler; 4502 info->pri_enabled = 1; 4503 4504 return 0; 4505 4506 iopf_unregister_handler: 4507 iommu_unregister_device_fault_handler(dev); 4508 iopf_remove_device: 4509 iopf_queue_remove_device(iommu->iopf_queue, dev); 4510 4511 return ret; 4512 } 4513 4514 static int intel_iommu_disable_iopf(struct device *dev) 4515 { 4516 struct device_domain_info *info = dev_iommu_priv_get(dev); 4517 struct intel_iommu *iommu = info->iommu; 4518 4519 if (!info->pri_enabled) 4520 return -EINVAL; 4521 4522 /* 4523 * PCIe spec states that by clearing PRI enable bit, the Page 4524 * Request Interface will not issue new page requests, but has 4525 * outstanding page requests that have been transmitted or are 4526 * queued for transmission. This is supposed to be called after 4527 * the device driver has stopped DMA, all PASIDs have been 4528 * unbound and the outstanding PRQs have been drained. 4529 */ 4530 pci_disable_pri(to_pci_dev(dev)); 4531 info->pri_enabled = 0; 4532 4533 /* 4534 * With PRI disabled and outstanding PRQs drained, unregistering 4535 * fault handler and removing device from iopf queue should never 4536 * fail. 4537 */ 4538 WARN_ON(iommu_unregister_device_fault_handler(dev)); 4539 WARN_ON(iopf_queue_remove_device(iommu->iopf_queue, dev)); 4540 4541 return 0; 4542 } 4543 4544 static int 4545 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat) 4546 { 4547 switch (feat) { 4548 case IOMMU_DEV_FEAT_IOPF: 4549 return intel_iommu_enable_iopf(dev); 4550 4551 case IOMMU_DEV_FEAT_SVA: 4552 return intel_iommu_enable_sva(dev); 4553 4554 default: 4555 return -ENODEV; 4556 } 4557 } 4558 4559 static int 4560 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat) 4561 { 4562 switch (feat) { 4563 case IOMMU_DEV_FEAT_IOPF: 4564 return intel_iommu_disable_iopf(dev); 4565 4566 case IOMMU_DEV_FEAT_SVA: 4567 return 0; 4568 4569 default: 4570 return -ENODEV; 4571 } 4572 } 4573 4574 static bool intel_iommu_is_attach_deferred(struct device *dev) 4575 { 4576 struct device_domain_info *info = dev_iommu_priv_get(dev); 4577 4578 return translation_pre_enabled(info->iommu) && !info->domain; 4579 } 4580 4581 /* 4582 * Check that the device does not live on an external facing PCI port that is 4583 * marked as untrusted. Such devices should not be able to apply quirks and 4584 * thus not be able to bypass the IOMMU restrictions. 
4585 */ 4586 static bool risky_device(struct pci_dev *pdev) 4587 { 4588 if (pdev->untrusted) { 4589 pci_info(pdev, 4590 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n", 4591 pdev->vendor, pdev->device); 4592 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n"); 4593 return true; 4594 } 4595 return false; 4596 } 4597 4598 static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain, 4599 unsigned long iova, size_t size) 4600 { 4601 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4602 unsigned long pages = aligned_nrpages(iova, size); 4603 unsigned long pfn = iova >> VTD_PAGE_SHIFT; 4604 struct iommu_domain_info *info; 4605 unsigned long i; 4606 4607 xa_for_each(&dmar_domain->iommu_array, i, info) 4608 __mapping_notify_one(info->iommu, dmar_domain, pfn, pages); 4609 return 0; 4610 } 4611 4612 static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid) 4613 { 4614 struct device_domain_info *info = dev_iommu_priv_get(dev); 4615 struct dev_pasid_info *curr, *dev_pasid = NULL; 4616 struct intel_iommu *iommu = info->iommu; 4617 struct dmar_domain *dmar_domain; 4618 struct iommu_domain *domain; 4619 unsigned long flags; 4620 4621 domain = iommu_get_domain_for_dev_pasid(dev, pasid, 0); 4622 if (WARN_ON_ONCE(!domain)) 4623 goto out_tear_down; 4624 4625 /* 4626 * The SVA implementation needs to handle its own stuffs like the mm 4627 * notification. Before consolidating that code into iommu core, let 4628 * the intel sva code handle it. 4629 */ 4630 if (domain->type == IOMMU_DOMAIN_SVA) { 4631 intel_svm_remove_dev_pasid(dev, pasid); 4632 goto out_tear_down; 4633 } 4634 4635 dmar_domain = to_dmar_domain(domain); 4636 spin_lock_irqsave(&dmar_domain->lock, flags); 4637 list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) { 4638 if (curr->dev == dev && curr->pasid == pasid) { 4639 list_del(&curr->link_domain); 4640 dev_pasid = curr; 4641 break; 4642 } 4643 } 4644 WARN_ON_ONCE(!dev_pasid); 4645 spin_unlock_irqrestore(&dmar_domain->lock, flags); 4646 4647 domain_detach_iommu(dmar_domain, iommu); 4648 intel_iommu_debugfs_remove_dev_pasid(dev_pasid); 4649 kfree(dev_pasid); 4650 out_tear_down: 4651 intel_pasid_tear_down_entry(iommu, dev, pasid, false); 4652 intel_drain_pasid_prq(dev, pasid); 4653 } 4654 4655 static int intel_iommu_set_dev_pasid(struct iommu_domain *domain, 4656 struct device *dev, ioasid_t pasid) 4657 { 4658 struct device_domain_info *info = dev_iommu_priv_get(dev); 4659 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4660 struct intel_iommu *iommu = info->iommu; 4661 struct dev_pasid_info *dev_pasid; 4662 unsigned long flags; 4663 int ret; 4664 4665 if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev)) 4666 return -EOPNOTSUPP; 4667 4668 if (domain->dirty_ops) 4669 return -EINVAL; 4670 4671 if (context_copied(iommu, info->bus, info->devfn)) 4672 return -EBUSY; 4673 4674 ret = prepare_domain_attach_device(domain, dev); 4675 if (ret) 4676 return ret; 4677 4678 dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL); 4679 if (!dev_pasid) 4680 return -ENOMEM; 4681 4682 ret = domain_attach_iommu(dmar_domain, iommu); 4683 if (ret) 4684 goto out_free; 4685 4686 if (domain_type_is_si(dmar_domain)) 4687 ret = intel_pasid_setup_pass_through(iommu, dev, pasid); 4688 else if (dmar_domain->use_first_level) 4689 ret = domain_setup_first_level(iommu, dmar_domain, 4690 dev, pasid); 4691 else 4692 ret = intel_pasid_setup_second_level(iommu, dmar_domain, 4693 dev, pasid); 4694 if (ret) 4695 goto out_detach_iommu; 
4696 4697 dev_pasid->dev = dev; 4698 dev_pasid->pasid = pasid; 4699 spin_lock_irqsave(&dmar_domain->lock, flags); 4700 list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids); 4701 spin_unlock_irqrestore(&dmar_domain->lock, flags); 4702 4703 if (domain->type & __IOMMU_DOMAIN_PAGING) 4704 intel_iommu_debugfs_create_dev_pasid(dev_pasid); 4705 4706 return 0; 4707 out_detach_iommu: 4708 domain_detach_iommu(dmar_domain, iommu); 4709 out_free: 4710 kfree(dev_pasid); 4711 return ret; 4712 } 4713 4714 static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type) 4715 { 4716 struct device_domain_info *info = dev_iommu_priv_get(dev); 4717 struct intel_iommu *iommu = info->iommu; 4718 struct iommu_hw_info_vtd *vtd; 4719 4720 vtd = kzalloc(sizeof(*vtd), GFP_KERNEL); 4721 if (!vtd) 4722 return ERR_PTR(-ENOMEM); 4723 4724 vtd->flags = IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17; 4725 vtd->cap_reg = iommu->cap; 4726 vtd->ecap_reg = iommu->ecap; 4727 *length = sizeof(*vtd); 4728 *type = IOMMU_HW_INFO_TYPE_INTEL_VTD; 4729 return vtd; 4730 } 4731 4732 /* 4733 * Set dirty tracking for the device list of a domain. The caller must 4734 * hold the domain->lock when calling it. 4735 */ 4736 static int device_set_dirty_tracking(struct list_head *devices, bool enable) 4737 { 4738 struct device_domain_info *info; 4739 int ret = 0; 4740 4741 list_for_each_entry(info, devices, link) { 4742 ret = intel_pasid_setup_dirty_tracking(info->iommu, info->dev, 4743 IOMMU_NO_PASID, enable); 4744 if (ret) 4745 break; 4746 } 4747 4748 return ret; 4749 } 4750 4751 static int parent_domain_set_dirty_tracking(struct dmar_domain *domain, 4752 bool enable) 4753 { 4754 struct dmar_domain *s1_domain; 4755 unsigned long flags; 4756 int ret; 4757 4758 spin_lock(&domain->s1_lock); 4759 list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) { 4760 spin_lock_irqsave(&s1_domain->lock, flags); 4761 ret = device_set_dirty_tracking(&s1_domain->devices, enable); 4762 spin_unlock_irqrestore(&s1_domain->lock, flags); 4763 if (ret) 4764 goto err_unwind; 4765 } 4766 spin_unlock(&domain->s1_lock); 4767 return 0; 4768 4769 err_unwind: 4770 list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) { 4771 spin_lock_irqsave(&s1_domain->lock, flags); 4772 device_set_dirty_tracking(&s1_domain->devices, 4773 domain->dirty_tracking); 4774 spin_unlock_irqrestore(&s1_domain->lock, flags); 4775 } 4776 spin_unlock(&domain->s1_lock); 4777 return ret; 4778 } 4779 4780 static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain, 4781 bool enable) 4782 { 4783 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4784 int ret; 4785 4786 spin_lock(&dmar_domain->lock); 4787 if (dmar_domain->dirty_tracking == enable) 4788 goto out_unlock; 4789 4790 ret = device_set_dirty_tracking(&dmar_domain->devices, enable); 4791 if (ret) 4792 goto err_unwind; 4793 4794 if (dmar_domain->nested_parent) { 4795 ret = parent_domain_set_dirty_tracking(dmar_domain, enable); 4796 if (ret) 4797 goto err_unwind; 4798 } 4799 4800 dmar_domain->dirty_tracking = enable; 4801 out_unlock: 4802 spin_unlock(&dmar_domain->lock); 4803 4804 return 0; 4805 4806 err_unwind: 4807 device_set_dirty_tracking(&dmar_domain->devices, 4808 dmar_domain->dirty_tracking); 4809 spin_unlock(&dmar_domain->lock); 4810 return ret; 4811 } 4812 4813 static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain, 4814 unsigned long iova, size_t size, 4815 unsigned long flags, 4816 struct iommu_dirty_bitmap *dirty) 4817 { 4818 struct dmar_domain *dmar_domain = 
to_dmar_domain(domain); 4819 unsigned long end = iova + size - 1; 4820 unsigned long pgsize; 4821 4822 /* 4823 * IOMMUFD core calls into a dirty tracking disabled domain without an 4824 * IOVA bitmap set in order to clean dirty bits in all PTEs that might 4825 * have occurred when we stopped dirty tracking. This ensures that we 4826 * never inherit dirtied bits from a previous cycle. 4827 */ 4828 if (!dmar_domain->dirty_tracking && dirty->bitmap) 4829 return -EINVAL; 4830 4831 do { 4832 struct dma_pte *pte; 4833 int lvl = 0; 4834 4835 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl, 4836 GFP_ATOMIC); 4837 pgsize = level_size(lvl) << VTD_PAGE_SHIFT; 4838 if (!pte || !dma_pte_present(pte)) { 4839 iova += pgsize; 4840 continue; 4841 } 4842 4843 if (dma_sl_pte_test_and_clear_dirty(pte, flags)) 4844 iommu_dirty_bitmap_record(dirty, iova, pgsize); 4845 iova += pgsize; 4846 } while (iova < end); 4847 4848 return 0; 4849 } 4850 4851 static const struct iommu_dirty_ops intel_dirty_ops = { 4852 .set_dirty_tracking = intel_iommu_set_dirty_tracking, 4853 .read_and_clear_dirty = intel_iommu_read_and_clear_dirty, 4854 }; 4855 4856 const struct iommu_ops intel_iommu_ops = { 4857 .blocked_domain = &blocking_domain, 4858 .capable = intel_iommu_capable, 4859 .hw_info = intel_iommu_hw_info, 4860 .domain_alloc = intel_iommu_domain_alloc, 4861 .domain_alloc_user = intel_iommu_domain_alloc_user, 4862 .probe_device = intel_iommu_probe_device, 4863 .probe_finalize = intel_iommu_probe_finalize, 4864 .release_device = intel_iommu_release_device, 4865 .get_resv_regions = intel_iommu_get_resv_regions, 4866 .device_group = intel_iommu_device_group, 4867 .dev_enable_feat = intel_iommu_dev_enable_feat, 4868 .dev_disable_feat = intel_iommu_dev_disable_feat, 4869 .is_attach_deferred = intel_iommu_is_attach_deferred, 4870 .def_domain_type = device_def_domain_type, 4871 .remove_dev_pasid = intel_iommu_remove_dev_pasid, 4872 .pgsize_bitmap = SZ_4K, 4873 #ifdef CONFIG_INTEL_IOMMU_SVM 4874 .page_response = intel_svm_page_response, 4875 #endif 4876 .default_domain_ops = &(const struct iommu_domain_ops) { 4877 .attach_dev = intel_iommu_attach_device, 4878 .set_dev_pasid = intel_iommu_set_dev_pasid, 4879 .map_pages = intel_iommu_map_pages, 4880 .unmap_pages = intel_iommu_unmap_pages, 4881 .iotlb_sync_map = intel_iommu_iotlb_sync_map, 4882 .flush_iotlb_all = intel_flush_iotlb_all, 4883 .iotlb_sync = intel_iommu_tlb_sync, 4884 .iova_to_phys = intel_iommu_iova_to_phys, 4885 .free = intel_iommu_domain_free, 4886 .enforce_cache_coherency = intel_iommu_enforce_cache_coherency, 4887 } 4888 }; 4889 4890 static void quirk_iommu_igfx(struct pci_dev *dev) 4891 { 4892 if (risky_device(dev)) 4893 return; 4894 4895 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n"); 4896 dmar_map_gfx = 0; 4897 } 4898 4899 /* G4x/GM45 integrated gfx dmar support is totally busted. 
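 * The fixups below disable DMA remapping for the integrated graphics
 * (dmar_map_gfx) on the affected device IDs.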
*/ 4900 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx); 4901 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx); 4902 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx); 4903 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx); 4904 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx); 4905 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx); 4906 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx); 4907 4908 /* Broadwell igfx malfunctions with dmar */ 4909 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx); 4910 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx); 4911 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx); 4912 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx); 4913 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx); 4914 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx); 4915 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx); 4916 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx); 4917 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx); 4918 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx); 4919 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx); 4920 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx); 4921 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx); 4922 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx); 4923 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx); 4924 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx); 4925 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx); 4926 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx); 4927 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx); 4928 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx); 4929 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx); 4930 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx); 4931 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx); 4932 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx); 4933 4934 static void quirk_iommu_rwbf(struct pci_dev *dev) 4935 { 4936 if (risky_device(dev)) 4937 return; 4938 4939 /* 4940 * Mobile 4 Series Chipset neglects to set RWBF capability, 4941 * but needs it. Same seems to hold for the desktop versions. 
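 * Setting rwbf_quirk makes the driver issue explicit write-buffer
 * flushes even though the capability bit is not advertised.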
4942 */ 4943 pci_info(dev, "Forcing write-buffer flush capability\n"); 4944 rwbf_quirk = 1; 4945 } 4946 4947 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf); 4948 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf); 4949 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf); 4950 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf); 4951 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf); 4952 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf); 4953 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf); 4954 4955 #define GGC 0x52 4956 #define GGC_MEMORY_SIZE_MASK (0xf << 8) 4957 #define GGC_MEMORY_SIZE_NONE (0x0 << 8) 4958 #define GGC_MEMORY_SIZE_1M (0x1 << 8) 4959 #define GGC_MEMORY_SIZE_2M (0x3 << 8) 4960 #define GGC_MEMORY_VT_ENABLED (0x8 << 8) 4961 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8) 4962 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8) 4963 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8) 4964 4965 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev) 4966 { 4967 unsigned short ggc; 4968 4969 if (risky_device(dev)) 4970 return; 4971 4972 if (pci_read_config_word(dev, GGC, &ggc)) 4973 return; 4974 4975 if (!(ggc & GGC_MEMORY_VT_ENABLED)) { 4976 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n"); 4977 dmar_map_gfx = 0; 4978 } else if (dmar_map_gfx) { 4979 /* we have to ensure the gfx device is idle before we flush */ 4980 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n"); 4981 iommu_set_dma_strict(); 4982 } 4983 } 4984 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt); 4985 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt); 4986 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt); 4987 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt); 4988 4989 static void quirk_igfx_skip_te_disable(struct pci_dev *dev) 4990 { 4991 unsigned short ver; 4992 4993 if (!IS_GFX_DEVICE(dev)) 4994 return; 4995 4996 ver = (dev->device >> 8) & 0xff; 4997 if (ver != 0x45 && ver != 0x46 && ver != 0x4c && 4998 ver != 0x4e && ver != 0x8a && ver != 0x98 && 4999 ver != 0x9a && ver != 0xa7 && ver != 0x7d) 5000 return; 5001 5002 if (risky_device(dev)) 5003 return; 5004 5005 pci_info(dev, "Skip IOMMU disabling for graphics\n"); 5006 iommu_skip_te_disable = 1; 5007 } 5008 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable); 5009 5010 /* On Tylersburg chipsets, some BIOSes have been known to enable the 5011 ISOCH DMAR unit for the Azalia sound device, but not give it any 5012 TLB entries, which causes it to deadlock. Check for that. We do 5013 this in a function called from init_dmars(), instead of in a PCI 5014 quirk, because we don't want to print the obnoxious "BIOS broken" 5015 message if VT-d is actually disabled. 5016 */ 5017 static void __init check_tylersburg_isoch(void) 5018 { 5019 struct pci_dev *pdev; 5020 uint32_t vtisochctrl; 5021 5022 /* If there's no Azalia in the system anyway, forget it. */ 5023 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL); 5024 if (!pdev) 5025 return; 5026 5027 if (risky_device(pdev)) { 5028 pci_dev_put(pdev); 5029 return; 5030 } 5031 5032 pci_dev_put(pdev); 5033 5034 /* System Management Registers. Might be hidden, in which case 5035 we can't do the sanity check. 
But that's OK, because the 5036 known-broken BIOSes _don't_ actually hide it, so far. */ 5037 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL); 5038 if (!pdev) 5039 return; 5040 5041 if (risky_device(pdev)) { 5042 pci_dev_put(pdev); 5043 return; 5044 } 5045 5046 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) { 5047 pci_dev_put(pdev); 5048 return; 5049 } 5050 5051 pci_dev_put(pdev); 5052 5053 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */ 5054 if (vtisochctrl & 1) 5055 return; 5056 5057 /* Drop all bits other than the number of TLB entries */ 5058 vtisochctrl &= 0x1c; 5059 5060 /* If we have the recommended number of TLB entries (16), fine. */ 5061 if (vtisochctrl == 0x10) 5062 return; 5063 5064 /* Zero TLB entries? You get to ride the short bus to school. */ 5065 if (!vtisochctrl) { 5066 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n" 5067 "BIOS vendor: %s; Ver: %s; Product Version: %s\n", 5068 dmi_get_system_info(DMI_BIOS_VENDOR), 5069 dmi_get_system_info(DMI_BIOS_VERSION), 5070 dmi_get_system_info(DMI_PRODUCT_VERSION)); 5071 iommu_identity_mapping |= IDENTMAP_AZALIA; 5072 return; 5073 } 5074 5075 pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n", 5076 vtisochctrl); 5077 } 5078 5079 /* 5080 * Here we deal with a device TLB defect where device may inadvertently issue ATS 5081 * invalidation completion before posted writes initiated with translated address 5082 * that utilized translations matching the invalidation address range, violating 5083 * the invalidation completion ordering. 5084 * Therefore, any use cases that cannot guarantee DMA is stopped before unmap is 5085 * vulnerable to this defect. In other words, any dTLB invalidation initiated not 5086 * under the control of the trusted/privileged host device driver must use this 5087 * quirk. 5088 * Device TLBs are invalidated under the following six conditions: 5089 * 1. Device driver does DMA API unmap IOVA 5090 * 2. Device driver unbind a PASID from a process, sva_unbind_device() 5091 * 3. PASID is torn down, after PASID cache is flushed. e.g. process 5092 * exit_mmap() due to crash 5093 * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where 5094 * VM has to free pages that were unmapped 5095 * 5. Userspace driver unmaps a DMA buffer 5096 * 6. Cache invalidation in vSVA usage (upcoming) 5097 * 5098 * For #1 and #2, device drivers are responsible for stopping DMA traffic 5099 * before unmap/unbind. For #3, iommu driver gets mmu_notifier to 5100 * invalidate TLB the same way as normal user unmap which will use this quirk. 5101 * The dTLB invalidation after PASID cache flush does not need this quirk. 5102 * 5103 * As a reminder, #6 will *NEED* this quirk as we enable nested translation. 5104 */ 5105 void quirk_extra_dev_tlb_flush(struct device_domain_info *info, 5106 unsigned long address, unsigned long mask, 5107 u32 pasid, u16 qdep) 5108 { 5109 u16 sid; 5110 5111 if (likely(!info->dtlb_extra_inval)) 5112 return; 5113 5114 sid = PCI_DEVID(info->bus, info->devfn); 5115 if (pasid == IOMMU_NO_PASID) { 5116 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid, 5117 qdep, address, mask); 5118 } else { 5119 qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid, 5120 pasid, qdep, address, mask); 5121 } 5122 } 5123 5124 #define ecmd_get_status_code(res) (((res) & 0xff) >> 1) 5125 5126 /* 5127 * Function to submit a command to the enhanced command interface. 
The 5128 * valid enhanced command descriptions are defined in Table 47 of the 5129 * VT-d spec. The VT-d hardware implementation may support some but not 5130 * all commands, which can be determined by checking the Enhanced 5131 * Command Capability Register. 5132 * 5133 * Return values: 5134 * - 0: Command successful without any error; 5135 * - Negative: software error value; 5136 * - Nonzero positive: failure status code defined in Table 48. 5137 */ 5138 int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob) 5139 { 5140 unsigned long flags; 5141 u64 res; 5142 int ret; 5143 5144 if (!cap_ecmds(iommu->cap)) 5145 return -ENODEV; 5146 5147 raw_spin_lock_irqsave(&iommu->register_lock, flags); 5148 5149 res = dmar_readq(iommu->reg + DMAR_ECRSP_REG); 5150 if (res & DMA_ECMD_ECRSP_IP) { 5151 ret = -EBUSY; 5152 goto err; 5153 } 5154 5155 /* 5156 * Unconditionally write the operand B, because 5157 * - There is no side effect if an ecmd doesn't require an 5158 * operand B, but we set the register to some value. 5159 * - It's not invoked in any critical path. The extra MMIO 5160 * write doesn't bring any performance concerns. 5161 */ 5162 dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob); 5163 dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT)); 5164 5165 IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq, 5166 !(res & DMA_ECMD_ECRSP_IP), res); 5167 5168 if (res & DMA_ECMD_ECRSP_IP) { 5169 ret = -ETIMEDOUT; 5170 goto err; 5171 } 5172 5173 ret = ecmd_get_status_code(res); 5174 err: 5175 raw_spin_unlock_irqrestore(&iommu->register_lock, flags); 5176 5177 return ret; 5178 } 5179
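/*
 * Illustrative sketch only, not part of the driver: one way a caller
 * could act on ecmd_submit_sync()'s three-way return convention
 * described above. The helper name and log messages are made up for
 * illustration.
 */
static inline int ecmd_submit_sync_checked(struct intel_iommu *iommu,
					   u8 ecmd, u64 oa, u64 ob)
{
	int ret = ecmd_submit_sync(iommu, ecmd, oa, ob);

	if (ret < 0)		/* software error, e.g. -ENODEV or -ETIMEDOUT */
		pr_err("ecmd 0x%x failed: %d\n", ecmd, ret);
	else if (ret > 0)	/* failure status code reported by hardware */
		pr_warn("ecmd 0x%x rejected by hardware, status 0x%x\n", ecmd, ret);

	return ret;
}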