1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright © 2006-2014 Intel Corporation. 4 * 5 * Authors: David Woodhouse <dwmw2@infradead.org>, 6 * Ashok Raj <ashok.raj@intel.com>, 7 * Shaohua Li <shaohua.li@intel.com>, 8 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>, 9 * Fenghua Yu <fenghua.yu@intel.com> 10 * Joerg Roedel <jroedel@suse.de> 11 */ 12 13 #define pr_fmt(fmt) "DMAR: " fmt 14 #define dev_fmt(fmt) pr_fmt(fmt) 15 16 #include <linux/crash_dump.h> 17 #include <linux/dma-direct.h> 18 #include <linux/dmi.h> 19 #include <linux/memory.h> 20 #include <linux/pci.h> 21 #include <linux/pci-ats.h> 22 #include <linux/spinlock.h> 23 #include <linux/syscore_ops.h> 24 #include <linux/tboot.h> 25 #include <uapi/linux/iommufd.h> 26 27 #include "iommu.h" 28 #include "../dma-iommu.h" 29 #include "../irq_remapping.h" 30 #include "../iommu-sva.h" 31 #include "pasid.h" 32 #include "cap_audit.h" 33 #include "perfmon.h" 34 35 #define ROOT_SIZE VTD_PAGE_SIZE 36 #define CONTEXT_SIZE VTD_PAGE_SIZE 37 38 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY) 39 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB) 40 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) 41 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e) 42 43 #define IOAPIC_RANGE_START (0xfee00000) 44 #define IOAPIC_RANGE_END (0xfeefffff) 45 #define IOVA_START_ADDR (0x1000) 46 47 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57 48 49 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1) 50 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1) 51 52 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR 53 to match. That way, we can use 'unsigned long' for PFNs with impunity. */ 54 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \ 55 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1)) 56 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT) 57 58 /* IO virtual address start page frame number */ 59 #define IOVA_START_PFN (1) 60 61 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT) 62 63 static void __init check_tylersburg_isoch(void); 64 static int rwbf_quirk; 65 66 /* 67 * set to 1 to panic kernel if can't successfully enable VT-d 68 * (used when kernel is launched w/ TXT) 69 */ 70 static int force_on = 0; 71 static int intel_iommu_tboot_noforce; 72 static int no_platform_optin; 73 74 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry)) 75 76 /* 77 * Take a root_entry and return the Lower Context Table Pointer (LCTP) 78 * if marked present. 79 */ 80 static phys_addr_t root_entry_lctp(struct root_entry *re) 81 { 82 if (!(re->lo & 1)) 83 return 0; 84 85 return re->lo & VTD_PAGE_MASK; 86 } 87 88 /* 89 * Take a root_entry and return the Upper Context Table Pointer (UCTP) 90 * if marked present. 91 */ 92 static phys_addr_t root_entry_uctp(struct root_entry *re) 93 { 94 if (!(re->hi & 1)) 95 return 0; 96 97 return re->hi & VTD_PAGE_MASK; 98 } 99 100 /* 101 * This domain is a statically identity mapping domain. 102 * 1. This domain creats a static 1:1 mapping to all usable memory. 103 * 2. It maps to each iommu if successful. 104 * 3. Each iommu mapps to this domain if successful. 
105 */ 106 static struct dmar_domain *si_domain; 107 static int hw_pass_through = 1; 108 109 struct dmar_rmrr_unit { 110 struct list_head list; /* list of rmrr units */ 111 struct acpi_dmar_header *hdr; /* ACPI header */ 112 u64 base_address; /* reserved base address*/ 113 u64 end_address; /* reserved end address */ 114 struct dmar_dev_scope *devices; /* target devices */ 115 int devices_cnt; /* target device count */ 116 }; 117 118 struct dmar_atsr_unit { 119 struct list_head list; /* list of ATSR units */ 120 struct acpi_dmar_header *hdr; /* ACPI header */ 121 struct dmar_dev_scope *devices; /* target devices */ 122 int devices_cnt; /* target device count */ 123 u8 include_all:1; /* include all ports */ 124 }; 125 126 struct dmar_satc_unit { 127 struct list_head list; /* list of SATC units */ 128 struct acpi_dmar_header *hdr; /* ACPI header */ 129 struct dmar_dev_scope *devices; /* target devices */ 130 struct intel_iommu *iommu; /* the corresponding iommu */ 131 int devices_cnt; /* target device count */ 132 u8 atc_required:1; /* ATS is required */ 133 }; 134 135 static LIST_HEAD(dmar_atsr_units); 136 static LIST_HEAD(dmar_rmrr_units); 137 static LIST_HEAD(dmar_satc_units); 138 139 #define for_each_rmrr_units(rmrr) \ 140 list_for_each_entry(rmrr, &dmar_rmrr_units, list) 141 142 static void intel_iommu_domain_free(struct iommu_domain *domain); 143 144 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON); 145 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON); 146 147 int intel_iommu_enabled = 0; 148 EXPORT_SYMBOL_GPL(intel_iommu_enabled); 149 150 static int dmar_map_gfx = 1; 151 static int intel_iommu_superpage = 1; 152 static int iommu_identity_mapping; 153 static int iommu_skip_te_disable; 154 155 #define IDENTMAP_GFX 2 156 #define IDENTMAP_AZALIA 4 157 158 const struct iommu_ops intel_iommu_ops; 159 static const struct iommu_dirty_ops intel_dirty_ops; 160 161 static bool translation_pre_enabled(struct intel_iommu *iommu) 162 { 163 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED); 164 } 165 166 static void clear_translation_pre_enabled(struct intel_iommu *iommu) 167 { 168 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED; 169 } 170 171 static void init_translation_status(struct intel_iommu *iommu) 172 { 173 u32 gsts; 174 175 gsts = readl(iommu->reg + DMAR_GSTS_REG); 176 if (gsts & DMA_GSTS_TES) 177 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED; 178 } 179 180 static int __init intel_iommu_setup(char *str) 181 { 182 if (!str) 183 return -EINVAL; 184 185 while (*str) { 186 if (!strncmp(str, "on", 2)) { 187 dmar_disabled = 0; 188 pr_info("IOMMU enabled\n"); 189 } else if (!strncmp(str, "off", 3)) { 190 dmar_disabled = 1; 191 no_platform_optin = 1; 192 pr_info("IOMMU disabled\n"); 193 } else if (!strncmp(str, "igfx_off", 8)) { 194 dmar_map_gfx = 0; 195 pr_info("Disable GFX device mapping\n"); 196 } else if (!strncmp(str, "forcedac", 8)) { 197 pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n"); 198 iommu_dma_forcedac = true; 199 } else if (!strncmp(str, "strict", 6)) { 200 pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n"); 201 iommu_set_dma_strict(); 202 } else if (!strncmp(str, "sp_off", 6)) { 203 pr_info("Disable supported super page\n"); 204 intel_iommu_superpage = 0; 205 } else if (!strncmp(str, "sm_on", 5)) { 206 pr_info("Enable scalable mode if hardware supports\n"); 207 intel_iommu_sm = 1; 208 } else if (!strncmp(str, "sm_off", 6)) { 209 pr_info("Scalable mode is disallowed\n"); 210 intel_iommu_sm = 0; 211 } else 
if (!strncmp(str, "tboot_noforce", 13)) { 212 pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n"); 213 intel_iommu_tboot_noforce = 1; 214 } else { 215 pr_notice("Unknown option - '%s'\n", str); 216 } 217 218 str += strcspn(str, ","); 219 while (*str == ',') 220 str++; 221 } 222 223 return 1; 224 } 225 __setup("intel_iommu=", intel_iommu_setup); 226 227 void *alloc_pgtable_page(int node, gfp_t gfp) 228 { 229 struct page *page; 230 void *vaddr = NULL; 231 232 page = alloc_pages_node(node, gfp | __GFP_ZERO, 0); 233 if (page) 234 vaddr = page_address(page); 235 return vaddr; 236 } 237 238 void free_pgtable_page(void *vaddr) 239 { 240 free_page((unsigned long)vaddr); 241 } 242 243 static int domain_type_is_si(struct dmar_domain *domain) 244 { 245 return domain->domain.type == IOMMU_DOMAIN_IDENTITY; 246 } 247 248 static int domain_pfn_supported(struct dmar_domain *domain, unsigned long pfn) 249 { 250 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT; 251 252 return !(addr_width < BITS_PER_LONG && pfn >> addr_width); 253 } 254 255 /* 256 * Calculate the Supported Adjusted Guest Address Widths of an IOMMU. 257 * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of 258 * the returned SAGAW. 259 */ 260 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu) 261 { 262 unsigned long fl_sagaw, sl_sagaw; 263 264 fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0); 265 sl_sagaw = cap_sagaw(iommu->cap); 266 267 /* Second level only. */ 268 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) 269 return sl_sagaw; 270 271 /* First level only. */ 272 if (!ecap_slts(iommu->ecap)) 273 return fl_sagaw; 274 275 return fl_sagaw & sl_sagaw; 276 } 277 278 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw) 279 { 280 unsigned long sagaw; 281 int agaw; 282 283 sagaw = __iommu_calculate_sagaw(iommu); 284 for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) { 285 if (test_bit(agaw, &sagaw)) 286 break; 287 } 288 289 return agaw; 290 } 291 292 /* 293 * Calculate max SAGAW for each iommu. 294 */ 295 int iommu_calculate_max_sagaw(struct intel_iommu *iommu) 296 { 297 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH); 298 } 299 300 /* 301 * calculate agaw for each iommu. 302 * "SAGAW" may be different across iommus, use a default agaw, and 303 * get a supported less agaw for iommus that don't support the default agaw. 304 */ 305 int iommu_calculate_agaw(struct intel_iommu *iommu) 306 { 307 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH); 308 } 309 310 static bool iommu_paging_structure_coherency(struct intel_iommu *iommu) 311 { 312 return sm_supported(iommu) ? 
313 ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap); 314 } 315 316 static void domain_update_iommu_coherency(struct dmar_domain *domain) 317 { 318 struct iommu_domain_info *info; 319 struct dmar_drhd_unit *drhd; 320 struct intel_iommu *iommu; 321 bool found = false; 322 unsigned long i; 323 324 domain->iommu_coherency = true; 325 xa_for_each(&domain->iommu_array, i, info) { 326 found = true; 327 if (!iommu_paging_structure_coherency(info->iommu)) { 328 domain->iommu_coherency = false; 329 break; 330 } 331 } 332 if (found) 333 return; 334 335 /* No hardware attached; use lowest common denominator */ 336 rcu_read_lock(); 337 for_each_active_iommu(iommu, drhd) { 338 if (!iommu_paging_structure_coherency(iommu)) { 339 domain->iommu_coherency = false; 340 break; 341 } 342 } 343 rcu_read_unlock(); 344 } 345 346 static int domain_update_iommu_superpage(struct dmar_domain *domain, 347 struct intel_iommu *skip) 348 { 349 struct dmar_drhd_unit *drhd; 350 struct intel_iommu *iommu; 351 int mask = 0x3; 352 353 if (!intel_iommu_superpage) 354 return 0; 355 356 /* set iommu_superpage to the smallest common denominator */ 357 rcu_read_lock(); 358 for_each_active_iommu(iommu, drhd) { 359 if (iommu != skip) { 360 if (domain && domain->use_first_level) { 361 if (!cap_fl1gp_support(iommu->cap)) 362 mask = 0x1; 363 } else { 364 mask &= cap_super_page_val(iommu->cap); 365 } 366 367 if (!mask) 368 break; 369 } 370 } 371 rcu_read_unlock(); 372 373 return fls(mask); 374 } 375 376 static int domain_update_device_node(struct dmar_domain *domain) 377 { 378 struct device_domain_info *info; 379 int nid = NUMA_NO_NODE; 380 unsigned long flags; 381 382 spin_lock_irqsave(&domain->lock, flags); 383 list_for_each_entry(info, &domain->devices, link) { 384 /* 385 * There could possibly be multiple device numa nodes as devices 386 * within the same domain may sit behind different IOMMUs. There 387 * isn't perfect answer in such situation, so we select first 388 * come first served policy. 389 */ 390 nid = dev_to_node(info->dev); 391 if (nid != NUMA_NO_NODE) 392 break; 393 } 394 spin_unlock_irqrestore(&domain->lock, flags); 395 396 return nid; 397 } 398 399 static void domain_update_iotlb(struct dmar_domain *domain); 400 401 /* Return the super pagesize bitmap if supported. */ 402 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain) 403 { 404 unsigned long bitmap = 0; 405 406 /* 407 * 1-level super page supports page size of 2MiB, 2-level super page 408 * supports page size of both 2MiB and 1GiB. 409 */ 410 if (domain->iommu_superpage == 1) 411 bitmap |= SZ_2M; 412 else if (domain->iommu_superpage == 2) 413 bitmap |= SZ_2M | SZ_1G; 414 415 return bitmap; 416 } 417 418 /* Some capabilities may be different across iommus */ 419 void domain_update_iommu_cap(struct dmar_domain *domain) 420 { 421 domain_update_iommu_coherency(domain); 422 domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL); 423 424 /* 425 * If RHSA is missing, we should default to the device numa domain 426 * as fall back. 427 */ 428 if (domain->nid == NUMA_NO_NODE) 429 domain->nid = domain_update_device_node(domain); 430 431 /* 432 * First-level translation restricts the input-address to a 433 * canonical address (i.e., address bits 63:N have the same 434 * value as address bit [N-1], where N is 48-bits with 4-level 435 * paging and 57-bits with 5-level paging). Hence, skip bit 436 * [N-1]. 
437 */ 438 if (domain->use_first_level) 439 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1); 440 else 441 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw); 442 443 domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain); 444 domain_update_iotlb(domain); 445 } 446 447 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus, 448 u8 devfn, int alloc) 449 { 450 struct root_entry *root = &iommu->root_entry[bus]; 451 struct context_entry *context; 452 u64 *entry; 453 454 /* 455 * Except that the caller requested to allocate a new entry, 456 * returning a copied context entry makes no sense. 457 */ 458 if (!alloc && context_copied(iommu, bus, devfn)) 459 return NULL; 460 461 entry = &root->lo; 462 if (sm_supported(iommu)) { 463 if (devfn >= 0x80) { 464 devfn -= 0x80; 465 entry = &root->hi; 466 } 467 devfn *= 2; 468 } 469 if (*entry & 1) 470 context = phys_to_virt(*entry & VTD_PAGE_MASK); 471 else { 472 unsigned long phy_addr; 473 if (!alloc) 474 return NULL; 475 476 context = alloc_pgtable_page(iommu->node, GFP_ATOMIC); 477 if (!context) 478 return NULL; 479 480 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE); 481 phy_addr = virt_to_phys((void *)context); 482 *entry = phy_addr | 1; 483 __iommu_flush_cache(iommu, entry, sizeof(*entry)); 484 } 485 return &context[devfn]; 486 } 487 488 /** 489 * is_downstream_to_pci_bridge - test if a device belongs to the PCI 490 * sub-hierarchy of a candidate PCI-PCI bridge 491 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy 492 * @bridge: the candidate PCI-PCI bridge 493 * 494 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false. 495 */ 496 static bool 497 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge) 498 { 499 struct pci_dev *pdev, *pbridge; 500 501 if (!dev_is_pci(dev) || !dev_is_pci(bridge)) 502 return false; 503 504 pdev = to_pci_dev(dev); 505 pbridge = to_pci_dev(bridge); 506 507 if (pbridge->subordinate && 508 pbridge->subordinate->number <= pdev->bus->number && 509 pbridge->subordinate->busn_res.end >= pdev->bus->number) 510 return true; 511 512 return false; 513 } 514 515 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev) 516 { 517 struct dmar_drhd_unit *drhd; 518 u32 vtbar; 519 int rc; 520 521 /* We know that this device on this chipset has its own IOMMU. 522 * If we find it under a different IOMMU, then the BIOS is lying 523 * to us. Hope that the IOMMU for this device is actually 524 * disabled, and it needs no translation... 
525 */ 526 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar); 527 if (rc) { 528 /* "can't" happen */ 529 dev_info(&pdev->dev, "failed to run vt-d quirk\n"); 530 return false; 531 } 532 vtbar &= 0xffff0000; 533 534 /* we know that the this iommu should be at offset 0xa000 from vtbar */ 535 drhd = dmar_find_matched_drhd_unit(pdev); 536 if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) { 537 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"); 538 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); 539 return true; 540 } 541 542 return false; 543 } 544 545 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev) 546 { 547 if (!iommu || iommu->drhd->ignored) 548 return true; 549 550 if (dev_is_pci(dev)) { 551 struct pci_dev *pdev = to_pci_dev(dev); 552 553 if (pdev->vendor == PCI_VENDOR_ID_INTEL && 554 pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB && 555 quirk_ioat_snb_local_iommu(pdev)) 556 return true; 557 } 558 559 return false; 560 } 561 562 static struct intel_iommu *device_lookup_iommu(struct device *dev, u8 *bus, u8 *devfn) 563 { 564 struct dmar_drhd_unit *drhd = NULL; 565 struct pci_dev *pdev = NULL; 566 struct intel_iommu *iommu; 567 struct device *tmp; 568 u16 segment = 0; 569 int i; 570 571 if (!dev) 572 return NULL; 573 574 if (dev_is_pci(dev)) { 575 struct pci_dev *pf_pdev; 576 577 pdev = pci_real_dma_dev(to_pci_dev(dev)); 578 579 /* VFs aren't listed in scope tables; we need to look up 580 * the PF instead to find the IOMMU. */ 581 pf_pdev = pci_physfn(pdev); 582 dev = &pf_pdev->dev; 583 segment = pci_domain_nr(pdev->bus); 584 } else if (has_acpi_companion(dev)) 585 dev = &ACPI_COMPANION(dev)->dev; 586 587 rcu_read_lock(); 588 for_each_iommu(iommu, drhd) { 589 if (pdev && segment != drhd->segment) 590 continue; 591 592 for_each_active_dev_scope(drhd->devices, 593 drhd->devices_cnt, i, tmp) { 594 if (tmp == dev) { 595 /* For a VF use its original BDF# not that of the PF 596 * which we used for the IOMMU lookup. Strictly speaking 597 * we could do this for all PCI devices; we only need to 598 * get the BDF# from the scope table for ACPI matches. 
*/ 599 if (pdev && pdev->is_virtfn) 600 goto got_pdev; 601 602 if (bus && devfn) { 603 *bus = drhd->devices[i].bus; 604 *devfn = drhd->devices[i].devfn; 605 } 606 goto out; 607 } 608 609 if (is_downstream_to_pci_bridge(dev, tmp)) 610 goto got_pdev; 611 } 612 613 if (pdev && drhd->include_all) { 614 got_pdev: 615 if (bus && devfn) { 616 *bus = pdev->bus->number; 617 *devfn = pdev->devfn; 618 } 619 goto out; 620 } 621 } 622 iommu = NULL; 623 out: 624 if (iommu_is_dummy(iommu, dev)) 625 iommu = NULL; 626 627 rcu_read_unlock(); 628 629 return iommu; 630 } 631 632 static void domain_flush_cache(struct dmar_domain *domain, 633 void *addr, int size) 634 { 635 if (!domain->iommu_coherency) 636 clflush_cache_range(addr, size); 637 } 638 639 static void free_context_table(struct intel_iommu *iommu) 640 { 641 struct context_entry *context; 642 int i; 643 644 if (!iommu->root_entry) 645 return; 646 647 for (i = 0; i < ROOT_ENTRY_NR; i++) { 648 context = iommu_context_addr(iommu, i, 0, 0); 649 if (context) 650 free_pgtable_page(context); 651 652 if (!sm_supported(iommu)) 653 continue; 654 655 context = iommu_context_addr(iommu, i, 0x80, 0); 656 if (context) 657 free_pgtable_page(context); 658 } 659 660 free_pgtable_page(iommu->root_entry); 661 iommu->root_entry = NULL; 662 } 663 664 #ifdef CONFIG_DMAR_DEBUG 665 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn, 666 u8 bus, u8 devfn, struct dma_pte *parent, int level) 667 { 668 struct dma_pte *pte; 669 int offset; 670 671 while (1) { 672 offset = pfn_level_offset(pfn, level); 673 pte = &parent[offset]; 674 if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) { 675 pr_info("PTE not present at level %d\n", level); 676 break; 677 } 678 679 pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val); 680 681 if (level == 1) 682 break; 683 684 parent = phys_to_virt(dma_pte_addr(pte)); 685 level--; 686 } 687 } 688 689 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id, 690 unsigned long long addr, u32 pasid) 691 { 692 struct pasid_dir_entry *dir, *pde; 693 struct pasid_entry *entries, *pte; 694 struct context_entry *ctx_entry; 695 struct root_entry *rt_entry; 696 int i, dir_index, index, level; 697 u8 devfn = source_id & 0xff; 698 u8 bus = source_id >> 8; 699 struct dma_pte *pgtable; 700 701 pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr); 702 703 /* root entry dump */ 704 rt_entry = &iommu->root_entry[bus]; 705 if (!rt_entry) { 706 pr_info("root table entry is not present\n"); 707 return; 708 } 709 710 if (sm_supported(iommu)) 711 pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n", 712 rt_entry->hi, rt_entry->lo); 713 else 714 pr_info("root entry: 0x%016llx", rt_entry->lo); 715 716 /* context entry dump */ 717 ctx_entry = iommu_context_addr(iommu, bus, devfn, 0); 718 if (!ctx_entry) { 719 pr_info("context table entry is not present\n"); 720 return; 721 } 722 723 pr_info("context entry: hi 0x%016llx, low 0x%016llx\n", 724 ctx_entry->hi, ctx_entry->lo); 725 726 /* legacy mode does not require PASID entries */ 727 if (!sm_supported(iommu)) { 728 level = agaw_to_level(ctx_entry->hi & 7); 729 pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK); 730 goto pgtable_walk; 731 } 732 733 /* get the pointer to pasid directory entry */ 734 dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK); 735 if (!dir) { 736 pr_info("pasid directory entry is not present\n"); 737 return; 738 } 739 /* For request-without-pasid, get the pasid from context entry */ 740 if (intel_iommu_sm && pasid == 
IOMMU_PASID_INVALID) 741 pasid = IOMMU_NO_PASID; 742 743 dir_index = pasid >> PASID_PDE_SHIFT; 744 pde = &dir[dir_index]; 745 pr_info("pasid dir entry: 0x%016llx\n", pde->val); 746 747 /* get the pointer to the pasid table entry */ 748 entries = get_pasid_table_from_pde(pde); 749 if (!entries) { 750 pr_info("pasid table entry is not present\n"); 751 return; 752 } 753 index = pasid & PASID_PTE_MASK; 754 pte = &entries[index]; 755 for (i = 0; i < ARRAY_SIZE(pte->val); i++) 756 pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]); 757 758 if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) { 759 level = pte->val[2] & BIT_ULL(2) ? 5 : 4; 760 pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK); 761 } else { 762 level = agaw_to_level((pte->val[0] >> 2) & 0x7); 763 pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK); 764 } 765 766 pgtable_walk: 767 pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level); 768 } 769 #endif 770 771 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain, 772 unsigned long pfn, int *target_level, 773 gfp_t gfp) 774 { 775 struct dma_pte *parent, *pte; 776 int level = agaw_to_level(domain->agaw); 777 int offset; 778 779 if (!domain_pfn_supported(domain, pfn)) 780 /* Address beyond IOMMU's addressing capabilities. */ 781 return NULL; 782 783 parent = domain->pgd; 784 785 while (1) { 786 void *tmp_page; 787 788 offset = pfn_level_offset(pfn, level); 789 pte = &parent[offset]; 790 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte))) 791 break; 792 if (level == *target_level) 793 break; 794 795 if (!dma_pte_present(pte)) { 796 uint64_t pteval; 797 798 tmp_page = alloc_pgtable_page(domain->nid, gfp); 799 800 if (!tmp_page) 801 return NULL; 802 803 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE); 804 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE; 805 if (domain->use_first_level) 806 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS; 807 808 if (cmpxchg64(&pte->val, 0ULL, pteval)) 809 /* Someone else set it while we were thinking; use theirs. 
*/ 810 free_pgtable_page(tmp_page); 811 else 812 domain_flush_cache(domain, pte, sizeof(*pte)); 813 } 814 if (level == 1) 815 break; 816 817 parent = phys_to_virt(dma_pte_addr(pte)); 818 level--; 819 } 820 821 if (!*target_level) 822 *target_level = level; 823 824 return pte; 825 } 826 827 /* return address's pte at specific level */ 828 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain, 829 unsigned long pfn, 830 int level, int *large_page) 831 { 832 struct dma_pte *parent, *pte; 833 int total = agaw_to_level(domain->agaw); 834 int offset; 835 836 parent = domain->pgd; 837 while (level <= total) { 838 offset = pfn_level_offset(pfn, total); 839 pte = &parent[offset]; 840 if (level == total) 841 return pte; 842 843 if (!dma_pte_present(pte)) { 844 *large_page = total; 845 break; 846 } 847 848 if (dma_pte_superpage(pte)) { 849 *large_page = total; 850 return pte; 851 } 852 853 parent = phys_to_virt(dma_pte_addr(pte)); 854 total--; 855 } 856 return NULL; 857 } 858 859 /* clear last level pte, a tlb flush should be followed */ 860 static void dma_pte_clear_range(struct dmar_domain *domain, 861 unsigned long start_pfn, 862 unsigned long last_pfn) 863 { 864 unsigned int large_page; 865 struct dma_pte *first_pte, *pte; 866 867 if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) || 868 WARN_ON(start_pfn > last_pfn)) 869 return; 870 871 /* we don't need lock here; nobody else touches the iova range */ 872 do { 873 large_page = 1; 874 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page); 875 if (!pte) { 876 start_pfn = align_to_level(start_pfn + 1, large_page + 1); 877 continue; 878 } 879 do { 880 dma_clear_pte(pte); 881 start_pfn += lvl_to_nr_pages(large_page); 882 pte++; 883 } while (start_pfn <= last_pfn && !first_pte_in_page(pte)); 884 885 domain_flush_cache(domain, first_pte, 886 (void *)pte - (void *)first_pte); 887 888 } while (start_pfn && start_pfn <= last_pfn); 889 } 890 891 static void dma_pte_free_level(struct dmar_domain *domain, int level, 892 int retain_level, struct dma_pte *pte, 893 unsigned long pfn, unsigned long start_pfn, 894 unsigned long last_pfn) 895 { 896 pfn = max(start_pfn, pfn); 897 pte = &pte[pfn_level_offset(pfn, level)]; 898 899 do { 900 unsigned long level_pfn; 901 struct dma_pte *level_pte; 902 903 if (!dma_pte_present(pte) || dma_pte_superpage(pte)) 904 goto next; 905 906 level_pfn = pfn & level_mask(level); 907 level_pte = phys_to_virt(dma_pte_addr(pte)); 908 909 if (level > 2) { 910 dma_pte_free_level(domain, level - 1, retain_level, 911 level_pte, level_pfn, start_pfn, 912 last_pfn); 913 } 914 915 /* 916 * Free the page table if we're below the level we want to 917 * retain and the range covers the entire table. 918 */ 919 if (level < retain_level && !(start_pfn > level_pfn || 920 last_pfn < level_pfn + level_size(level) - 1)) { 921 dma_clear_pte(pte); 922 domain_flush_cache(domain, pte, sizeof(*pte)); 923 free_pgtable_page(level_pte); 924 } 925 next: 926 pfn += level_size(level); 927 } while (!first_pte_in_page(++pte) && pfn <= last_pfn); 928 } 929 930 /* 931 * clear last level (leaf) ptes and free page table pages below the 932 * level we wish to keep intact. 
933 */ 934 static void dma_pte_free_pagetable(struct dmar_domain *domain, 935 unsigned long start_pfn, 936 unsigned long last_pfn, 937 int retain_level) 938 { 939 dma_pte_clear_range(domain, start_pfn, last_pfn); 940 941 /* We don't need lock here; nobody else touches the iova range */ 942 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level, 943 domain->pgd, 0, start_pfn, last_pfn); 944 945 /* free pgd */ 946 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) { 947 free_pgtable_page(domain->pgd); 948 domain->pgd = NULL; 949 } 950 } 951 952 /* When a page at a given level is being unlinked from its parent, we don't 953 need to *modify* it at all. All we need to do is make a list of all the 954 pages which can be freed just as soon as we've flushed the IOTLB and we 955 know the hardware page-walk will no longer touch them. 956 The 'pte' argument is the *parent* PTE, pointing to the page that is to 957 be freed. */ 958 static void dma_pte_list_pagetables(struct dmar_domain *domain, 959 int level, struct dma_pte *pte, 960 struct list_head *freelist) 961 { 962 struct page *pg; 963 964 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT); 965 list_add_tail(&pg->lru, freelist); 966 967 if (level == 1) 968 return; 969 970 pte = page_address(pg); 971 do { 972 if (dma_pte_present(pte) && !dma_pte_superpage(pte)) 973 dma_pte_list_pagetables(domain, level - 1, pte, freelist); 974 pte++; 975 } while (!first_pte_in_page(pte)); 976 } 977 978 static void dma_pte_clear_level(struct dmar_domain *domain, int level, 979 struct dma_pte *pte, unsigned long pfn, 980 unsigned long start_pfn, unsigned long last_pfn, 981 struct list_head *freelist) 982 { 983 struct dma_pte *first_pte = NULL, *last_pte = NULL; 984 985 pfn = max(start_pfn, pfn); 986 pte = &pte[pfn_level_offset(pfn, level)]; 987 988 do { 989 unsigned long level_pfn = pfn & level_mask(level); 990 991 if (!dma_pte_present(pte)) 992 goto next; 993 994 /* If range covers entire pagetable, free it */ 995 if (start_pfn <= level_pfn && 996 last_pfn >= level_pfn + level_size(level) - 1) { 997 /* These suborbinate page tables are going away entirely. Don't 998 bother to clear them; we're just going to *free* them. */ 999 if (level > 1 && !dma_pte_superpage(pte)) 1000 dma_pte_list_pagetables(domain, level - 1, pte, freelist); 1001 1002 dma_clear_pte(pte); 1003 if (!first_pte) 1004 first_pte = pte; 1005 last_pte = pte; 1006 } else if (level > 1) { 1007 /* Recurse down into a level that isn't *entirely* obsolete */ 1008 dma_pte_clear_level(domain, level - 1, 1009 phys_to_virt(dma_pte_addr(pte)), 1010 level_pfn, start_pfn, last_pfn, 1011 freelist); 1012 } 1013 next: 1014 pfn = level_pfn + level_size(level); 1015 } while (!first_pte_in_page(++pte) && pfn <= last_pfn); 1016 1017 if (first_pte) 1018 domain_flush_cache(domain, first_pte, 1019 (void *)++last_pte - (void *)first_pte); 1020 } 1021 1022 /* We can't just free the pages because the IOMMU may still be walking 1023 the page tables, and may have cached the intermediate levels. The 1024 pages can only be freed after the IOTLB flush has been done. 
*/ 1025 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn, 1026 unsigned long last_pfn, struct list_head *freelist) 1027 { 1028 if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) || 1029 WARN_ON(start_pfn > last_pfn)) 1030 return; 1031 1032 /* we don't need lock here; nobody else touches the iova range */ 1033 dma_pte_clear_level(domain, agaw_to_level(domain->agaw), 1034 domain->pgd, 0, start_pfn, last_pfn, freelist); 1035 1036 /* free pgd */ 1037 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) { 1038 struct page *pgd_page = virt_to_page(domain->pgd); 1039 list_add_tail(&pgd_page->lru, freelist); 1040 domain->pgd = NULL; 1041 } 1042 } 1043 1044 /* iommu handling */ 1045 static int iommu_alloc_root_entry(struct intel_iommu *iommu) 1046 { 1047 struct root_entry *root; 1048 1049 root = alloc_pgtable_page(iommu->node, GFP_ATOMIC); 1050 if (!root) { 1051 pr_err("Allocating root entry for %s failed\n", 1052 iommu->name); 1053 return -ENOMEM; 1054 } 1055 1056 __iommu_flush_cache(iommu, root, ROOT_SIZE); 1057 iommu->root_entry = root; 1058 1059 return 0; 1060 } 1061 1062 static void iommu_set_root_entry(struct intel_iommu *iommu) 1063 { 1064 u64 addr; 1065 u32 sts; 1066 unsigned long flag; 1067 1068 addr = virt_to_phys(iommu->root_entry); 1069 if (sm_supported(iommu)) 1070 addr |= DMA_RTADDR_SMT; 1071 1072 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1073 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr); 1074 1075 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG); 1076 1077 /* Make sure hardware complete it */ 1078 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1079 readl, (sts & DMA_GSTS_RTPS), sts); 1080 1081 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1082 1083 /* 1084 * Hardware invalidates all DMA remapping hardware translation 1085 * caches as part of SRTP flow. 
1086 */ 1087 if (cap_esrtps(iommu->cap)) 1088 return; 1089 1090 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL); 1091 if (sm_supported(iommu)) 1092 qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0); 1093 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); 1094 } 1095 1096 void iommu_flush_write_buffer(struct intel_iommu *iommu) 1097 { 1098 u32 val; 1099 unsigned long flag; 1100 1101 if (!rwbf_quirk && !cap_rwbf(iommu->cap)) 1102 return; 1103 1104 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1105 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG); 1106 1107 /* Make sure hardware complete it */ 1108 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1109 readl, (!(val & DMA_GSTS_WBFS)), val); 1110 1111 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1112 } 1113 1114 /* return value determine if we need a write buffer flush */ 1115 static void __iommu_flush_context(struct intel_iommu *iommu, 1116 u16 did, u16 source_id, u8 function_mask, 1117 u64 type) 1118 { 1119 u64 val = 0; 1120 unsigned long flag; 1121 1122 switch (type) { 1123 case DMA_CCMD_GLOBAL_INVL: 1124 val = DMA_CCMD_GLOBAL_INVL; 1125 break; 1126 case DMA_CCMD_DOMAIN_INVL: 1127 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did); 1128 break; 1129 case DMA_CCMD_DEVICE_INVL: 1130 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did) 1131 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask); 1132 break; 1133 default: 1134 pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n", 1135 iommu->name, type); 1136 return; 1137 } 1138 val |= DMA_CCMD_ICC; 1139 1140 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1141 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val); 1142 1143 /* Make sure hardware complete it */ 1144 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG, 1145 dmar_readq, (!(val & DMA_CCMD_ICC)), val); 1146 1147 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1148 } 1149 1150 /* return value determine if we need a write buffer flush */ 1151 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, 1152 u64 addr, unsigned int size_order, u64 type) 1153 { 1154 int tlb_offset = ecap_iotlb_offset(iommu->ecap); 1155 u64 val = 0, val_iva = 0; 1156 unsigned long flag; 1157 1158 switch (type) { 1159 case DMA_TLB_GLOBAL_FLUSH: 1160 /* global flush doesn't need set IVA_REG */ 1161 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT; 1162 break; 1163 case DMA_TLB_DSI_FLUSH: 1164 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did); 1165 break; 1166 case DMA_TLB_PSI_FLUSH: 1167 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did); 1168 /* IH bit is passed in as part of address */ 1169 val_iva = size_order | addr; 1170 break; 1171 default: 1172 pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n", 1173 iommu->name, type); 1174 return; 1175 } 1176 1177 if (cap_write_drain(iommu->cap)) 1178 val |= DMA_TLB_WRITE_DRAIN; 1179 1180 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1181 /* Note: Only uses first TLB reg currently */ 1182 if (val_iva) 1183 dmar_writeq(iommu->reg + tlb_offset, val_iva); 1184 dmar_writeq(iommu->reg + tlb_offset + 8, val); 1185 1186 /* Make sure hardware complete it */ 1187 IOMMU_WAIT_OP(iommu, tlb_offset + 8, 1188 dmar_readq, (!(val & DMA_TLB_IVT)), val); 1189 1190 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1191 1192 /* check IOTLB invalidation granularity */ 1193 if (DMA_TLB_IAIG(val) == 0) 1194 pr_err("Flush IOTLB failed\n"); 1195 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type)) 1196 pr_debug("TLB flush request %Lx, actual %Lx\n", 1197 (unsigned long long)DMA_TLB_IIRG(type), 
1198 (unsigned long long)DMA_TLB_IAIG(val)); 1199 } 1200 1201 static struct device_domain_info * 1202 domain_lookup_dev_info(struct dmar_domain *domain, 1203 struct intel_iommu *iommu, u8 bus, u8 devfn) 1204 { 1205 struct device_domain_info *info; 1206 unsigned long flags; 1207 1208 spin_lock_irqsave(&domain->lock, flags); 1209 list_for_each_entry(info, &domain->devices, link) { 1210 if (info->iommu == iommu && info->bus == bus && 1211 info->devfn == devfn) { 1212 spin_unlock_irqrestore(&domain->lock, flags); 1213 return info; 1214 } 1215 } 1216 spin_unlock_irqrestore(&domain->lock, flags); 1217 1218 return NULL; 1219 } 1220 1221 static void domain_update_iotlb(struct dmar_domain *domain) 1222 { 1223 struct dev_pasid_info *dev_pasid; 1224 struct device_domain_info *info; 1225 bool has_iotlb_device = false; 1226 unsigned long flags; 1227 1228 spin_lock_irqsave(&domain->lock, flags); 1229 list_for_each_entry(info, &domain->devices, link) { 1230 if (info->ats_enabled) { 1231 has_iotlb_device = true; 1232 break; 1233 } 1234 } 1235 1236 list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) { 1237 info = dev_iommu_priv_get(dev_pasid->dev); 1238 if (info->ats_enabled) { 1239 has_iotlb_device = true; 1240 break; 1241 } 1242 } 1243 domain->has_iotlb_device = has_iotlb_device; 1244 spin_unlock_irqrestore(&domain->lock, flags); 1245 } 1246 1247 /* 1248 * The extra devTLB flush quirk impacts those QAT devices with PCI device 1249 * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device() 1250 * check because it applies only to the built-in QAT devices and it doesn't 1251 * grant additional privileges. 1252 */ 1253 #define BUGGY_QAT_DEVID_MASK 0x4940 1254 static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev) 1255 { 1256 if (pdev->vendor != PCI_VENDOR_ID_INTEL) 1257 return false; 1258 1259 if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK) 1260 return false; 1261 1262 return true; 1263 } 1264 1265 static void iommu_enable_pci_caps(struct device_domain_info *info) 1266 { 1267 struct pci_dev *pdev; 1268 1269 if (!dev_is_pci(info->dev)) 1270 return; 1271 1272 pdev = to_pci_dev(info->dev); 1273 1274 /* The PCIe spec, in its wisdom, declares that the behaviour of 1275 the device if you enable PASID support after ATS support is 1276 undefined. So always enable PASID support on devices which 1277 have it, even if we can't yet know if we're ever going to 1278 use it. 
*/ 1279 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1)) 1280 info->pasid_enabled = 1; 1281 1282 if (info->ats_supported && pci_ats_page_aligned(pdev) && 1283 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) { 1284 info->ats_enabled = 1; 1285 domain_update_iotlb(info->domain); 1286 } 1287 } 1288 1289 static void iommu_disable_pci_caps(struct device_domain_info *info) 1290 { 1291 struct pci_dev *pdev; 1292 1293 if (!dev_is_pci(info->dev)) 1294 return; 1295 1296 pdev = to_pci_dev(info->dev); 1297 1298 if (info->ats_enabled) { 1299 pci_disable_ats(pdev); 1300 info->ats_enabled = 0; 1301 domain_update_iotlb(info->domain); 1302 } 1303 1304 if (info->pasid_enabled) { 1305 pci_disable_pasid(pdev); 1306 info->pasid_enabled = 0; 1307 } 1308 } 1309 1310 static void __iommu_flush_dev_iotlb(struct device_domain_info *info, 1311 u64 addr, unsigned int mask) 1312 { 1313 u16 sid, qdep; 1314 1315 if (!info || !info->ats_enabled) 1316 return; 1317 1318 sid = info->bus << 8 | info->devfn; 1319 qdep = info->ats_qdep; 1320 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid, 1321 qdep, addr, mask); 1322 quirk_extra_dev_tlb_flush(info, addr, mask, IOMMU_NO_PASID, qdep); 1323 } 1324 1325 static void iommu_flush_dev_iotlb(struct dmar_domain *domain, 1326 u64 addr, unsigned mask) 1327 { 1328 struct dev_pasid_info *dev_pasid; 1329 struct device_domain_info *info; 1330 unsigned long flags; 1331 1332 if (!domain->has_iotlb_device) 1333 return; 1334 1335 spin_lock_irqsave(&domain->lock, flags); 1336 list_for_each_entry(info, &domain->devices, link) 1337 __iommu_flush_dev_iotlb(info, addr, mask); 1338 1339 list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) { 1340 info = dev_iommu_priv_get(dev_pasid->dev); 1341 1342 if (!info->ats_enabled) 1343 continue; 1344 1345 qi_flush_dev_iotlb_pasid(info->iommu, 1346 PCI_DEVID(info->bus, info->devfn), 1347 info->pfsid, dev_pasid->pasid, 1348 info->ats_qdep, addr, 1349 mask); 1350 } 1351 spin_unlock_irqrestore(&domain->lock, flags); 1352 } 1353 1354 static void domain_flush_pasid_iotlb(struct intel_iommu *iommu, 1355 struct dmar_domain *domain, u64 addr, 1356 unsigned long npages, bool ih) 1357 { 1358 u16 did = domain_id_iommu(domain, iommu); 1359 struct dev_pasid_info *dev_pasid; 1360 unsigned long flags; 1361 1362 spin_lock_irqsave(&domain->lock, flags); 1363 list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) 1364 qi_flush_piotlb(iommu, did, dev_pasid->pasid, addr, npages, ih); 1365 1366 if (!list_empty(&domain->devices)) 1367 qi_flush_piotlb(iommu, did, IOMMU_NO_PASID, addr, npages, ih); 1368 spin_unlock_irqrestore(&domain->lock, flags); 1369 } 1370 1371 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, 1372 struct dmar_domain *domain, 1373 unsigned long pfn, unsigned int pages, 1374 int ih, int map) 1375 { 1376 unsigned int aligned_pages = __roundup_pow_of_two(pages); 1377 unsigned int mask = ilog2(aligned_pages); 1378 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT; 1379 u16 did = domain_id_iommu(domain, iommu); 1380 1381 if (WARN_ON(!pages)) 1382 return; 1383 1384 if (ih) 1385 ih = 1 << 6; 1386 1387 if (domain->use_first_level) { 1388 domain_flush_pasid_iotlb(iommu, domain, addr, pages, ih); 1389 } else { 1390 unsigned long bitmask = aligned_pages - 1; 1391 1392 /* 1393 * PSI masks the low order bits of the base address. If the 1394 * address isn't aligned to the mask, then compute a mask value 1395 * needed to ensure the target range is flushed. 
1396 */ 1397 if (unlikely(bitmask & pfn)) { 1398 unsigned long end_pfn = pfn + pages - 1, shared_bits; 1399 1400 /* 1401 * Since end_pfn <= pfn + bitmask, the only way bits 1402 * higher than bitmask can differ in pfn and end_pfn is 1403 * by carrying. This means after masking out bitmask, 1404 * high bits starting with the first set bit in 1405 * shared_bits are all equal in both pfn and end_pfn. 1406 */ 1407 shared_bits = ~(pfn ^ end_pfn) & ~bitmask; 1408 mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG; 1409 } 1410 1411 /* 1412 * Fallback to domain selective flush if no PSI support or 1413 * the size is too big. 1414 */ 1415 if (!cap_pgsel_inv(iommu->cap) || 1416 mask > cap_max_amask_val(iommu->cap)) 1417 iommu->flush.flush_iotlb(iommu, did, 0, 0, 1418 DMA_TLB_DSI_FLUSH); 1419 else 1420 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask, 1421 DMA_TLB_PSI_FLUSH); 1422 } 1423 1424 /* 1425 * In caching mode, changes of pages from non-present to present require 1426 * flush. However, device IOTLB doesn't need to be flushed in this case. 1427 */ 1428 if (!cap_caching_mode(iommu->cap) || !map) 1429 iommu_flush_dev_iotlb(domain, addr, mask); 1430 } 1431 1432 /* Notification for newly created mappings */ 1433 static void __mapping_notify_one(struct intel_iommu *iommu, struct dmar_domain *domain, 1434 unsigned long pfn, unsigned int pages) 1435 { 1436 /* 1437 * It's a non-present to present mapping. Only flush if caching mode 1438 * and second level. 1439 */ 1440 if (cap_caching_mode(iommu->cap) && !domain->use_first_level) 1441 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1); 1442 else 1443 iommu_flush_write_buffer(iommu); 1444 } 1445 1446 static void intel_flush_iotlb_all(struct iommu_domain *domain) 1447 { 1448 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 1449 struct iommu_domain_info *info; 1450 unsigned long idx; 1451 1452 xa_for_each(&dmar_domain->iommu_array, idx, info) { 1453 struct intel_iommu *iommu = info->iommu; 1454 u16 did = domain_id_iommu(dmar_domain, iommu); 1455 1456 if (dmar_domain->use_first_level) 1457 domain_flush_pasid_iotlb(iommu, dmar_domain, 0, -1, 0); 1458 else 1459 iommu->flush.flush_iotlb(iommu, did, 0, 0, 1460 DMA_TLB_DSI_FLUSH); 1461 1462 if (!cap_caching_mode(iommu->cap)) 1463 iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH); 1464 } 1465 } 1466 1467 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu) 1468 { 1469 u32 pmen; 1470 unsigned long flags; 1471 1472 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap)) 1473 return; 1474 1475 raw_spin_lock_irqsave(&iommu->register_lock, flags); 1476 pmen = readl(iommu->reg + DMAR_PMEN_REG); 1477 pmen &= ~DMA_PMEN_EPM; 1478 writel(pmen, iommu->reg + DMAR_PMEN_REG); 1479 1480 /* wait for the protected region status bit to clear */ 1481 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG, 1482 readl, !(pmen & DMA_PMEN_PRS), pmen); 1483 1484 raw_spin_unlock_irqrestore(&iommu->register_lock, flags); 1485 } 1486 1487 static void iommu_enable_translation(struct intel_iommu *iommu) 1488 { 1489 u32 sts; 1490 unsigned long flags; 1491 1492 raw_spin_lock_irqsave(&iommu->register_lock, flags); 1493 iommu->gcmd |= DMA_GCMD_TE; 1494 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); 1495 1496 /* Make sure hardware complete it */ 1497 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1498 readl, (sts & DMA_GSTS_TES), sts); 1499 1500 raw_spin_unlock_irqrestore(&iommu->register_lock, flags); 1501 } 1502 1503 static void iommu_disable_translation(struct intel_iommu *iommu) 1504 { 1505 u32 sts; 1506 unsigned long flag; 
1507 1508 if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated && 1509 (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap))) 1510 return; 1511 1512 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1513 iommu->gcmd &= ~DMA_GCMD_TE; 1514 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); 1515 1516 /* Make sure hardware complete it */ 1517 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1518 readl, (!(sts & DMA_GSTS_TES)), sts); 1519 1520 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1521 } 1522 1523 static int iommu_init_domains(struct intel_iommu *iommu) 1524 { 1525 u32 ndomains; 1526 1527 ndomains = cap_ndoms(iommu->cap); 1528 pr_debug("%s: Number of Domains supported <%d>\n", 1529 iommu->name, ndomains); 1530 1531 spin_lock_init(&iommu->lock); 1532 1533 iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL); 1534 if (!iommu->domain_ids) 1535 return -ENOMEM; 1536 1537 /* 1538 * If Caching mode is set, then invalid translations are tagged 1539 * with domain-id 0, hence we need to pre-allocate it. We also 1540 * use domain-id 0 as a marker for non-allocated domain-id, so 1541 * make sure it is not used for a real domain. 1542 */ 1543 set_bit(0, iommu->domain_ids); 1544 1545 /* 1546 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid 1547 * entry for first-level or pass-through translation modes should 1548 * be programmed with a domain id different from those used for 1549 * second-level or nested translation. We reserve a domain id for 1550 * this purpose. 1551 */ 1552 if (sm_supported(iommu)) 1553 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids); 1554 1555 return 0; 1556 } 1557 1558 static void disable_dmar_iommu(struct intel_iommu *iommu) 1559 { 1560 if (!iommu->domain_ids) 1561 return; 1562 1563 /* 1564 * All iommu domains must have been detached from the devices, 1565 * hence there should be no domain IDs in use. 1566 */ 1567 if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap)) 1568 > NUM_RESERVED_DID)) 1569 return; 1570 1571 if (iommu->gcmd & DMA_GCMD_TE) 1572 iommu_disable_translation(iommu); 1573 } 1574 1575 static void free_dmar_iommu(struct intel_iommu *iommu) 1576 { 1577 if (iommu->domain_ids) { 1578 bitmap_free(iommu->domain_ids); 1579 iommu->domain_ids = NULL; 1580 } 1581 1582 if (iommu->copied_tables) { 1583 bitmap_free(iommu->copied_tables); 1584 iommu->copied_tables = NULL; 1585 } 1586 1587 /* free context mapping */ 1588 free_context_table(iommu); 1589 1590 #ifdef CONFIG_INTEL_IOMMU_SVM 1591 if (pasid_supported(iommu)) { 1592 if (ecap_prs(iommu->ecap)) 1593 intel_svm_finish_prq(iommu); 1594 } 1595 #endif 1596 } 1597 1598 /* 1599 * Check and return whether first level is used by default for 1600 * DMA translation. 
1601 */ 1602 static bool first_level_by_default(unsigned int type) 1603 { 1604 /* Only SL is available in legacy mode */ 1605 if (!scalable_mode_support()) 1606 return false; 1607 1608 /* Only level (either FL or SL) is available, just use it */ 1609 if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity()) 1610 return intel_cap_flts_sanity(); 1611 1612 /* Both levels are available, decide it based on domain type */ 1613 return type != IOMMU_DOMAIN_UNMANAGED; 1614 } 1615 1616 static struct dmar_domain *alloc_domain(unsigned int type) 1617 { 1618 struct dmar_domain *domain; 1619 1620 domain = kzalloc(sizeof(*domain), GFP_KERNEL); 1621 if (!domain) 1622 return NULL; 1623 1624 domain->nid = NUMA_NO_NODE; 1625 if (first_level_by_default(type)) 1626 domain->use_first_level = true; 1627 domain->has_iotlb_device = false; 1628 INIT_LIST_HEAD(&domain->devices); 1629 INIT_LIST_HEAD(&domain->dev_pasids); 1630 spin_lock_init(&domain->lock); 1631 xa_init(&domain->iommu_array); 1632 1633 return domain; 1634 } 1635 1636 int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) 1637 { 1638 struct iommu_domain_info *info, *curr; 1639 unsigned long ndomains; 1640 int num, ret = -ENOSPC; 1641 1642 info = kzalloc(sizeof(*info), GFP_KERNEL); 1643 if (!info) 1644 return -ENOMEM; 1645 1646 spin_lock(&iommu->lock); 1647 curr = xa_load(&domain->iommu_array, iommu->seq_id); 1648 if (curr) { 1649 curr->refcnt++; 1650 spin_unlock(&iommu->lock); 1651 kfree(info); 1652 return 0; 1653 } 1654 1655 ndomains = cap_ndoms(iommu->cap); 1656 num = find_first_zero_bit(iommu->domain_ids, ndomains); 1657 if (num >= ndomains) { 1658 pr_err("%s: No free domain ids\n", iommu->name); 1659 goto err_unlock; 1660 } 1661 1662 set_bit(num, iommu->domain_ids); 1663 info->refcnt = 1; 1664 info->did = num; 1665 info->iommu = iommu; 1666 curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id, 1667 NULL, info, GFP_ATOMIC); 1668 if (curr) { 1669 ret = xa_err(curr) ? : -EBUSY; 1670 goto err_clear; 1671 } 1672 domain_update_iommu_cap(domain); 1673 1674 spin_unlock(&iommu->lock); 1675 return 0; 1676 1677 err_clear: 1678 clear_bit(info->did, iommu->domain_ids); 1679 err_unlock: 1680 spin_unlock(&iommu->lock); 1681 kfree(info); 1682 return ret; 1683 } 1684 1685 void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) 1686 { 1687 struct iommu_domain_info *info; 1688 1689 spin_lock(&iommu->lock); 1690 info = xa_load(&domain->iommu_array, iommu->seq_id); 1691 if (--info->refcnt == 0) { 1692 clear_bit(info->did, iommu->domain_ids); 1693 xa_erase(&domain->iommu_array, iommu->seq_id); 1694 domain->nid = NUMA_NO_NODE; 1695 domain_update_iommu_cap(domain); 1696 kfree(info); 1697 } 1698 spin_unlock(&iommu->lock); 1699 } 1700 1701 static int guestwidth_to_adjustwidth(int gaw) 1702 { 1703 int agaw; 1704 int r = (gaw - 12) % 9; 1705 1706 if (r == 0) 1707 agaw = gaw; 1708 else 1709 agaw = gaw + 9 - r; 1710 if (agaw > 64) 1711 agaw = 64; 1712 return agaw; 1713 } 1714 1715 static void domain_exit(struct dmar_domain *domain) 1716 { 1717 if (domain->pgd) { 1718 LIST_HEAD(freelist); 1719 1720 domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist); 1721 put_pages_list(&freelist); 1722 } 1723 1724 if (WARN_ON(!list_empty(&domain->devices))) 1725 return; 1726 1727 kfree(domain); 1728 } 1729 1730 /* 1731 * Get the PASID directory size for scalable mode context entry. 1732 * Value of X in the PDTS field of a scalable mode context entry 1733 * indicates PASID directory with 2^(X + 7) entries. 
1734 */ 1735 static unsigned long context_get_sm_pds(struct pasid_table *table) 1736 { 1737 unsigned long pds, max_pde; 1738 1739 max_pde = table->max_pasid >> PASID_PDE_SHIFT; 1740 pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS); 1741 if (pds < 7) 1742 return 0; 1743 1744 return pds - 7; 1745 } 1746 1747 static int domain_context_mapping_one(struct dmar_domain *domain, 1748 struct intel_iommu *iommu, 1749 struct pasid_table *table, 1750 u8 bus, u8 devfn) 1751 { 1752 struct device_domain_info *info = 1753 domain_lookup_dev_info(domain, iommu, bus, devfn); 1754 u16 did = domain_id_iommu(domain, iommu); 1755 int translation = CONTEXT_TT_MULTI_LEVEL; 1756 struct context_entry *context; 1757 int ret; 1758 1759 if (hw_pass_through && domain_type_is_si(domain)) 1760 translation = CONTEXT_TT_PASS_THROUGH; 1761 1762 pr_debug("Set context mapping for %02x:%02x.%d\n", 1763 bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); 1764 1765 spin_lock(&iommu->lock); 1766 ret = -ENOMEM; 1767 context = iommu_context_addr(iommu, bus, devfn, 1); 1768 if (!context) 1769 goto out_unlock; 1770 1771 ret = 0; 1772 if (context_present(context) && !context_copied(iommu, bus, devfn)) 1773 goto out_unlock; 1774 1775 /* 1776 * For kdump cases, old valid entries may be cached due to the 1777 * in-flight DMA and copied pgtable, but there is no unmapping 1778 * behaviour for them, thus we need an explicit cache flush for 1779 * the newly-mapped device. For kdump, at this point, the device 1780 * is supposed to finish reset at its driver probe stage, so no 1781 * in-flight DMA will exist, and we don't need to worry anymore 1782 * hereafter. 1783 */ 1784 if (context_copied(iommu, bus, devfn)) { 1785 u16 did_old = context_domain_id(context); 1786 1787 if (did_old < cap_ndoms(iommu->cap)) { 1788 iommu->flush.flush_context(iommu, did_old, 1789 (((u16)bus) << 8) | devfn, 1790 DMA_CCMD_MASK_NOBIT, 1791 DMA_CCMD_DEVICE_INVL); 1792 iommu->flush.flush_iotlb(iommu, did_old, 0, 0, 1793 DMA_TLB_DSI_FLUSH); 1794 } 1795 1796 clear_context_copied(iommu, bus, devfn); 1797 } 1798 1799 context_clear_entry(context); 1800 1801 if (sm_supported(iommu)) { 1802 unsigned long pds; 1803 1804 /* Setup the PASID DIR pointer: */ 1805 pds = context_get_sm_pds(table); 1806 context->lo = (u64)virt_to_phys(table->table) | 1807 context_pdts(pds); 1808 1809 /* Setup the RID_PASID field: */ 1810 context_set_sm_rid2pasid(context, IOMMU_NO_PASID); 1811 1812 /* 1813 * Setup the Device-TLB enable bit and Page request 1814 * Enable bit: 1815 */ 1816 if (info && info->ats_supported) 1817 context_set_sm_dte(context); 1818 if (info && info->pri_supported) 1819 context_set_sm_pre(context); 1820 if (info && info->pasid_supported) 1821 context_set_pasid(context); 1822 } else { 1823 struct dma_pte *pgd = domain->pgd; 1824 int agaw; 1825 1826 context_set_domain_id(context, did); 1827 1828 if (translation != CONTEXT_TT_PASS_THROUGH) { 1829 /* 1830 * Skip top levels of page tables for iommu which has 1831 * less agaw than default. Unnecessary for PT mode. 
1832 */ 1833 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { 1834 ret = -ENOMEM; 1835 pgd = phys_to_virt(dma_pte_addr(pgd)); 1836 if (!dma_pte_present(pgd)) 1837 goto out_unlock; 1838 } 1839 1840 if (info && info->ats_supported) 1841 translation = CONTEXT_TT_DEV_IOTLB; 1842 else 1843 translation = CONTEXT_TT_MULTI_LEVEL; 1844 1845 context_set_address_root(context, virt_to_phys(pgd)); 1846 context_set_address_width(context, agaw); 1847 } else { 1848 /* 1849 * In pass through mode, AW must be programmed to 1850 * indicate the largest AGAW value supported by 1851 * hardware. And ASR is ignored by hardware. 1852 */ 1853 context_set_address_width(context, iommu->msagaw); 1854 } 1855 1856 context_set_translation_type(context, translation); 1857 } 1858 1859 context_set_fault_enable(context); 1860 context_set_present(context); 1861 if (!ecap_coherent(iommu->ecap)) 1862 clflush_cache_range(context, sizeof(*context)); 1863 1864 /* 1865 * It's a non-present to present mapping. If hardware doesn't cache 1866 * non-present entry we only need to flush the write-buffer. If the 1867 * _does_ cache non-present entries, then it does so in the special 1868 * domain #0, which we have to flush: 1869 */ 1870 if (cap_caching_mode(iommu->cap)) { 1871 iommu->flush.flush_context(iommu, 0, 1872 (((u16)bus) << 8) | devfn, 1873 DMA_CCMD_MASK_NOBIT, 1874 DMA_CCMD_DEVICE_INVL); 1875 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); 1876 } else { 1877 iommu_flush_write_buffer(iommu); 1878 } 1879 1880 ret = 0; 1881 1882 out_unlock: 1883 spin_unlock(&iommu->lock); 1884 1885 return ret; 1886 } 1887 1888 struct domain_context_mapping_data { 1889 struct dmar_domain *domain; 1890 struct intel_iommu *iommu; 1891 struct pasid_table *table; 1892 }; 1893 1894 static int domain_context_mapping_cb(struct pci_dev *pdev, 1895 u16 alias, void *opaque) 1896 { 1897 struct domain_context_mapping_data *data = opaque; 1898 1899 return domain_context_mapping_one(data->domain, data->iommu, 1900 data->table, PCI_BUS_NUM(alias), 1901 alias & 0xff); 1902 } 1903 1904 static int 1905 domain_context_mapping(struct dmar_domain *domain, struct device *dev) 1906 { 1907 struct device_domain_info *info = dev_iommu_priv_get(dev); 1908 struct domain_context_mapping_data data; 1909 struct intel_iommu *iommu = info->iommu; 1910 u8 bus = info->bus, devfn = info->devfn; 1911 struct pasid_table *table; 1912 1913 table = intel_pasid_get_table(dev); 1914 1915 if (!dev_is_pci(dev)) 1916 return domain_context_mapping_one(domain, iommu, table, 1917 bus, devfn); 1918 1919 data.domain = domain; 1920 data.iommu = iommu; 1921 data.table = table; 1922 1923 return pci_for_each_dma_alias(to_pci_dev(dev), 1924 &domain_context_mapping_cb, &data); 1925 } 1926 1927 /* Returns a number of VTD pages, but aligned to MM page size */ 1928 static unsigned long aligned_nrpages(unsigned long host_addr, size_t size) 1929 { 1930 host_addr &= ~PAGE_MASK; 1931 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT; 1932 } 1933 1934 /* Return largest possible superpage level for a given mapping */ 1935 static int hardware_largepage_caps(struct dmar_domain *domain, unsigned long iov_pfn, 1936 unsigned long phy_pfn, unsigned long pages) 1937 { 1938 int support, level = 1; 1939 unsigned long pfnmerge; 1940 1941 support = domain->iommu_superpage; 1942 1943 /* To use a large page, the virtual *and* physical addresses 1944 must be aligned to 2MiB/1GiB/etc. Lower bits set in either 1945 of them will mean we have to use smaller pages. 
So just 1946 merge them and check both at once. */ 1947 pfnmerge = iov_pfn | phy_pfn; 1948 1949 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) { 1950 pages >>= VTD_STRIDE_SHIFT; 1951 if (!pages) 1952 break; 1953 pfnmerge >>= VTD_STRIDE_SHIFT; 1954 level++; 1955 support--; 1956 } 1957 return level; 1958 } 1959 1960 /* 1961 * Ensure that old small page tables are removed to make room for superpage(s). 1962 * We're going to add new large pages, so make sure we don't remove their parent 1963 * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared. 1964 */ 1965 static void switch_to_super_page(struct dmar_domain *domain, 1966 unsigned long start_pfn, 1967 unsigned long end_pfn, int level) 1968 { 1969 unsigned long lvl_pages = lvl_to_nr_pages(level); 1970 struct iommu_domain_info *info; 1971 struct dma_pte *pte = NULL; 1972 unsigned long i; 1973 1974 while (start_pfn <= end_pfn) { 1975 if (!pte) 1976 pte = pfn_to_dma_pte(domain, start_pfn, &level, 1977 GFP_ATOMIC); 1978 1979 if (dma_pte_present(pte)) { 1980 dma_pte_free_pagetable(domain, start_pfn, 1981 start_pfn + lvl_pages - 1, 1982 level + 1); 1983 1984 xa_for_each(&domain->iommu_array, i, info) 1985 iommu_flush_iotlb_psi(info->iommu, domain, 1986 start_pfn, lvl_pages, 1987 0, 0); 1988 } 1989 1990 pte++; 1991 start_pfn += lvl_pages; 1992 if (first_pte_in_page(pte)) 1993 pte = NULL; 1994 } 1995 } 1996 1997 static int 1998 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, 1999 unsigned long phys_pfn, unsigned long nr_pages, int prot, 2000 gfp_t gfp) 2001 { 2002 struct dma_pte *first_pte = NULL, *pte = NULL; 2003 unsigned int largepage_lvl = 0; 2004 unsigned long lvl_pages = 0; 2005 phys_addr_t pteval; 2006 u64 attr; 2007 2008 if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1))) 2009 return -EINVAL; 2010 2011 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0) 2012 return -EINVAL; 2013 2014 if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) { 2015 pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n"); 2016 return -EINVAL; 2017 } 2018 2019 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP); 2020 attr |= DMA_FL_PTE_PRESENT; 2021 if (domain->use_first_level) { 2022 attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS; 2023 if (prot & DMA_PTE_WRITE) 2024 attr |= DMA_FL_PTE_DIRTY; 2025 } 2026 2027 domain->has_mappings = true; 2028 2029 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr; 2030 2031 while (nr_pages > 0) { 2032 uint64_t tmp; 2033 2034 if (!pte) { 2035 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, 2036 phys_pfn, nr_pages); 2037 2038 pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl, 2039 gfp); 2040 if (!pte) 2041 return -ENOMEM; 2042 first_pte = pte; 2043 2044 lvl_pages = lvl_to_nr_pages(largepage_lvl); 2045 2046 /* It is large page*/ 2047 if (largepage_lvl > 1) { 2048 unsigned long end_pfn; 2049 unsigned long pages_to_remove; 2050 2051 pteval |= DMA_PTE_LARGE_PAGE; 2052 pages_to_remove = min_t(unsigned long, nr_pages, 2053 nr_pte_to_next_page(pte) * lvl_pages); 2054 end_pfn = iov_pfn + pages_to_remove - 1; 2055 switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl); 2056 } else { 2057 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE; 2058 } 2059 2060 } 2061 /* We don't need lock here, nobody else 2062 * touches the iova range 2063 */ 2064 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval); 2065 if (tmp) { 2066 static int dumps = 5; 2067 
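 /*
  * A non-zero previous value means this IOVA was already mapped.
  * Complain loudly, but limit the DMA mapping dump to five times.
  */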
pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n", 2068 iov_pfn, tmp, (unsigned long long)pteval); 2069 if (dumps) { 2070 dumps--; 2071 debug_dma_dump_mappings(NULL); 2072 } 2073 WARN_ON(1); 2074 } 2075 2076 nr_pages -= lvl_pages; 2077 iov_pfn += lvl_pages; 2078 phys_pfn += lvl_pages; 2079 pteval += lvl_pages * VTD_PAGE_SIZE; 2080 2081 /* If the next PTE would be the first in a new page, then we 2082 * need to flush the cache on the entries we've just written. 2083 * And then we'll need to recalculate 'pte', so clear it and 2084 * let it get set again in the if (!pte) block above. 2085 * 2086 * If we're done (!nr_pages) we need to flush the cache too. 2087 * 2088 * Also if we've been setting superpages, we may need to 2089 * recalculate 'pte' and switch back to smaller pages for the 2090 * end of the mapping, if the trailing size is not enough to 2091 * use another superpage (i.e. nr_pages < lvl_pages). 2092 */ 2093 pte++; 2094 if (!nr_pages || first_pte_in_page(pte) || 2095 (largepage_lvl > 1 && nr_pages < lvl_pages)) { 2096 domain_flush_cache(domain, first_pte, 2097 (void *)pte - (void *)first_pte); 2098 pte = NULL; 2099 } 2100 } 2101 2102 return 0; 2103 } 2104 2105 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn) 2106 { 2107 struct intel_iommu *iommu = info->iommu; 2108 struct context_entry *context; 2109 u16 did_old; 2110 2111 if (!iommu) 2112 return; 2113 2114 spin_lock(&iommu->lock); 2115 context = iommu_context_addr(iommu, bus, devfn, 0); 2116 if (!context) { 2117 spin_unlock(&iommu->lock); 2118 return; 2119 } 2120 2121 if (sm_supported(iommu)) { 2122 if (hw_pass_through && domain_type_is_si(info->domain)) 2123 did_old = FLPT_DEFAULT_DID; 2124 else 2125 did_old = domain_id_iommu(info->domain, iommu); 2126 } else { 2127 did_old = context_domain_id(context); 2128 } 2129 2130 context_clear_entry(context); 2131 __iommu_flush_cache(iommu, context, sizeof(*context)); 2132 spin_unlock(&iommu->lock); 2133 iommu->flush.flush_context(iommu, 2134 did_old, 2135 (((u16)bus) << 8) | devfn, 2136 DMA_CCMD_MASK_NOBIT, 2137 DMA_CCMD_DEVICE_INVL); 2138 2139 if (sm_supported(iommu)) 2140 qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0); 2141 2142 iommu->flush.flush_iotlb(iommu, 2143 did_old, 2144 0, 2145 0, 2146 DMA_TLB_DSI_FLUSH); 2147 2148 __iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH); 2149 } 2150 2151 static int domain_setup_first_level(struct intel_iommu *iommu, 2152 struct dmar_domain *domain, 2153 struct device *dev, 2154 u32 pasid) 2155 { 2156 struct dma_pte *pgd = domain->pgd; 2157 int agaw, level; 2158 int flags = 0; 2159 2160 /* 2161 * Skip top levels of page tables for iommu which has 2162 * less agaw than default. Unnecessary for PT mode. 
2163 */ 2164 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { 2165 pgd = phys_to_virt(dma_pte_addr(pgd)); 2166 if (!dma_pte_present(pgd)) 2167 return -ENOMEM; 2168 } 2169 2170 level = agaw_to_level(agaw); 2171 if (level != 4 && level != 5) 2172 return -EINVAL; 2173 2174 if (level == 5) 2175 flags |= PASID_FLAG_FL5LP; 2176 2177 if (domain->force_snooping) 2178 flags |= PASID_FLAG_PAGE_SNOOP; 2179 2180 return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid, 2181 domain_id_iommu(domain, iommu), 2182 flags); 2183 } 2184 2185 static bool dev_is_real_dma_subdevice(struct device *dev) 2186 { 2187 return dev && dev_is_pci(dev) && 2188 pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev); 2189 } 2190 2191 static int iommu_domain_identity_map(struct dmar_domain *domain, 2192 unsigned long first_vpfn, 2193 unsigned long last_vpfn) 2194 { 2195 /* 2196 * RMRR range might have overlap with physical memory range, 2197 * clear it first 2198 */ 2199 dma_pte_clear_range(domain, first_vpfn, last_vpfn); 2200 2201 return __domain_mapping(domain, first_vpfn, 2202 first_vpfn, last_vpfn - first_vpfn + 1, 2203 DMA_PTE_READ|DMA_PTE_WRITE, GFP_KERNEL); 2204 } 2205 2206 static int md_domain_init(struct dmar_domain *domain, int guest_width); 2207 2208 static int __init si_domain_init(int hw) 2209 { 2210 struct dmar_rmrr_unit *rmrr; 2211 struct device *dev; 2212 int i, nid, ret; 2213 2214 si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY); 2215 if (!si_domain) 2216 return -EFAULT; 2217 2218 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { 2219 domain_exit(si_domain); 2220 si_domain = NULL; 2221 return -EFAULT; 2222 } 2223 2224 if (hw) 2225 return 0; 2226 2227 for_each_online_node(nid) { 2228 unsigned long start_pfn, end_pfn; 2229 int i; 2230 2231 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 2232 ret = iommu_domain_identity_map(si_domain, 2233 mm_to_dma_pfn_start(start_pfn), 2234 mm_to_dma_pfn_end(end_pfn)); 2235 if (ret) 2236 return ret; 2237 } 2238 } 2239 2240 /* 2241 * Identity map the RMRRs so that devices with RMRRs could also use 2242 * the si_domain. 2243 */ 2244 for_each_rmrr_units(rmrr) { 2245 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt, 2246 i, dev) { 2247 unsigned long long start = rmrr->base_address; 2248 unsigned long long end = rmrr->end_address; 2249 2250 if (WARN_ON(end < start || 2251 end >> agaw_to_width(si_domain->agaw))) 2252 continue; 2253 2254 ret = iommu_domain_identity_map(si_domain, 2255 mm_to_dma_pfn_start(start >> PAGE_SHIFT), 2256 mm_to_dma_pfn_end(end >> PAGE_SHIFT)); 2257 if (ret) 2258 return ret; 2259 } 2260 } 2261 2262 return 0; 2263 } 2264 2265 static int dmar_domain_attach_device(struct dmar_domain *domain, 2266 struct device *dev) 2267 { 2268 struct device_domain_info *info = dev_iommu_priv_get(dev); 2269 struct intel_iommu *iommu = info->iommu; 2270 unsigned long flags; 2271 int ret; 2272 2273 ret = domain_attach_iommu(domain, iommu); 2274 if (ret) 2275 return ret; 2276 info->domain = domain; 2277 spin_lock_irqsave(&domain->lock, flags); 2278 list_add(&info->link, &domain->devices); 2279 spin_unlock_irqrestore(&domain->lock, flags); 2280 2281 /* PASID table is mandatory for a PCI device in scalable mode. 
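   Real DMA sub-devices, i.e. devices whose DMA is actually issued by
   a different device (pci_real_dma_dev()), are skipped here.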
*/ 2282 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) { 2283 /* Setup the PASID entry for requests without PASID: */ 2284 if (hw_pass_through && domain_type_is_si(domain)) 2285 ret = intel_pasid_setup_pass_through(iommu, 2286 dev, IOMMU_NO_PASID); 2287 else if (domain->use_first_level) 2288 ret = domain_setup_first_level(iommu, domain, dev, 2289 IOMMU_NO_PASID); 2290 else 2291 ret = intel_pasid_setup_second_level(iommu, domain, 2292 dev, IOMMU_NO_PASID); 2293 if (ret) { 2294 dev_err(dev, "Setup RID2PASID failed\n"); 2295 device_block_translation(dev); 2296 return ret; 2297 } 2298 } 2299 2300 ret = domain_context_mapping(domain, dev); 2301 if (ret) { 2302 dev_err(dev, "Domain context map failed\n"); 2303 device_block_translation(dev); 2304 return ret; 2305 } 2306 2307 if (sm_supported(info->iommu) || !domain_type_is_si(info->domain)) 2308 iommu_enable_pci_caps(info); 2309 2310 return 0; 2311 } 2312 2313 /** 2314 * device_rmrr_is_relaxable - Test whether the RMRR of this device 2315 * is relaxable (ie. is allowed to be not enforced under some conditions) 2316 * @dev: device handle 2317 * 2318 * We assume that PCI USB devices with RMRRs have them largely 2319 * for historical reasons and that the RMRR space is not actively used post 2320 * boot. This exclusion may change if vendors begin to abuse it. 2321 * 2322 * The same exception is made for graphics devices, with the requirement that 2323 * any use of the RMRR regions will be torn down before assigning the device 2324 * to a guest. 2325 * 2326 * Return: true if the RMRR is relaxable, false otherwise 2327 */ 2328 static bool device_rmrr_is_relaxable(struct device *dev) 2329 { 2330 struct pci_dev *pdev; 2331 2332 if (!dev_is_pci(dev)) 2333 return false; 2334 2335 pdev = to_pci_dev(dev); 2336 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev)) 2337 return true; 2338 else 2339 return false; 2340 } 2341 2342 /* 2343 * Return the required default domain type for a specific device. 2344 * 2345 * @dev: the device in query 2346 * @startup: true if this is during early boot 2347 * 2348 * Returns: 2349 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain 2350 * - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain 2351 * - 0: both identity and dynamic domains work for this device 2352 */ 2353 static int device_def_domain_type(struct device *dev) 2354 { 2355 if (dev_is_pci(dev)) { 2356 struct pci_dev *pdev = to_pci_dev(dev); 2357 2358 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev)) 2359 return IOMMU_DOMAIN_IDENTITY; 2360 2361 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev)) 2362 return IOMMU_DOMAIN_IDENTITY; 2363 } 2364 2365 return 0; 2366 } 2367 2368 static void intel_iommu_init_qi(struct intel_iommu *iommu) 2369 { 2370 /* 2371 * Start from the sane iommu hardware state. 2372 * If the queued invalidation is already initialized by us 2373 * (for example, while enabling interrupt-remapping) then 2374 * we got the things already rolling from a sane state. 2375 */ 2376 if (!iommu->qi) { 2377 /* 2378 * Clear any previous faults. 2379 */ 2380 dmar_fault(-1, iommu); 2381 /* 2382 * Disable queued invalidation if supported and already enabled 2383 * before OS handover. 
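 * Queued invalidation is then (re-)enabled below via dmar_enable_qi();
 * if that fails we fall back to register-based invalidation.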
2384 */ 2385 dmar_disable_qi(iommu); 2386 } 2387 2388 if (dmar_enable_qi(iommu)) { 2389 /* 2390 * Queued Invalidate not enabled, use Register Based Invalidate 2391 */ 2392 iommu->flush.flush_context = __iommu_flush_context; 2393 iommu->flush.flush_iotlb = __iommu_flush_iotlb; 2394 pr_info("%s: Using Register based invalidation\n", 2395 iommu->name); 2396 } else { 2397 iommu->flush.flush_context = qi_flush_context; 2398 iommu->flush.flush_iotlb = qi_flush_iotlb; 2399 pr_info("%s: Using Queued invalidation\n", iommu->name); 2400 } 2401 } 2402 2403 static int copy_context_table(struct intel_iommu *iommu, 2404 struct root_entry *old_re, 2405 struct context_entry **tbl, 2406 int bus, bool ext) 2407 { 2408 int tbl_idx, pos = 0, idx, devfn, ret = 0, did; 2409 struct context_entry *new_ce = NULL, ce; 2410 struct context_entry *old_ce = NULL; 2411 struct root_entry re; 2412 phys_addr_t old_ce_phys; 2413 2414 tbl_idx = ext ? bus * 2 : bus; 2415 memcpy(&re, old_re, sizeof(re)); 2416 2417 for (devfn = 0; devfn < 256; devfn++) { 2418 /* First calculate the correct index */ 2419 idx = (ext ? devfn * 2 : devfn) % 256; 2420 2421 if (idx == 0) { 2422 /* First save what we may have and clean up */ 2423 if (new_ce) { 2424 tbl[tbl_idx] = new_ce; 2425 __iommu_flush_cache(iommu, new_ce, 2426 VTD_PAGE_SIZE); 2427 pos = 1; 2428 } 2429 2430 if (old_ce) 2431 memunmap(old_ce); 2432 2433 ret = 0; 2434 if (devfn < 0x80) 2435 old_ce_phys = root_entry_lctp(&re); 2436 else 2437 old_ce_phys = root_entry_uctp(&re); 2438 2439 if (!old_ce_phys) { 2440 if (ext && devfn == 0) { 2441 /* No LCTP, try UCTP */ 2442 devfn = 0x7f; 2443 continue; 2444 } else { 2445 goto out; 2446 } 2447 } 2448 2449 ret = -ENOMEM; 2450 old_ce = memremap(old_ce_phys, PAGE_SIZE, 2451 MEMREMAP_WB); 2452 if (!old_ce) 2453 goto out; 2454 2455 new_ce = alloc_pgtable_page(iommu->node, GFP_KERNEL); 2456 if (!new_ce) 2457 goto out_unmap; 2458 2459 ret = 0; 2460 } 2461 2462 /* Now copy the context entry */ 2463 memcpy(&ce, old_ce + idx, sizeof(ce)); 2464 2465 if (!context_present(&ce)) 2466 continue; 2467 2468 did = context_domain_id(&ce); 2469 if (did >= 0 && did < cap_ndoms(iommu->cap)) 2470 set_bit(did, iommu->domain_ids); 2471 2472 set_context_copied(iommu, bus, devfn); 2473 new_ce[idx] = ce; 2474 } 2475 2476 tbl[tbl_idx + pos] = new_ce; 2477 2478 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE); 2479 2480 out_unmap: 2481 memunmap(old_ce); 2482 2483 out: 2484 return ret; 2485 } 2486 2487 static int copy_translation_tables(struct intel_iommu *iommu) 2488 { 2489 struct context_entry **ctxt_tbls; 2490 struct root_entry *old_rt; 2491 phys_addr_t old_rt_phys; 2492 int ctxt_table_entries; 2493 u64 rtaddr_reg; 2494 int bus, ret; 2495 bool new_ext, ext; 2496 2497 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG); 2498 ext = !!(rtaddr_reg & DMA_RTADDR_SMT); 2499 new_ext = !!sm_supported(iommu); 2500 2501 /* 2502 * The RTT bit can only be changed when translation is disabled, 2503 * but disabling translation means to open a window for data 2504 * corruption. So bail out and don't copy anything if we would 2505 * have to change the bit. 
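 * (That is the case when the previous kernel used scalable-mode
 * root/context entries but this kernel does not, or vice versa.)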
2506 */ 2507 if (new_ext != ext) 2508 return -EINVAL; 2509 2510 iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL); 2511 if (!iommu->copied_tables) 2512 return -ENOMEM; 2513 2514 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK; 2515 if (!old_rt_phys) 2516 return -EINVAL; 2517 2518 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB); 2519 if (!old_rt) 2520 return -ENOMEM; 2521 2522 /* This is too big for the stack - allocate it from slab */ 2523 ctxt_table_entries = ext ? 512 : 256; 2524 ret = -ENOMEM; 2525 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL); 2526 if (!ctxt_tbls) 2527 goto out_unmap; 2528 2529 for (bus = 0; bus < 256; bus++) { 2530 ret = copy_context_table(iommu, &old_rt[bus], 2531 ctxt_tbls, bus, ext); 2532 if (ret) { 2533 pr_err("%s: Failed to copy context table for bus %d\n", 2534 iommu->name, bus); 2535 continue; 2536 } 2537 } 2538 2539 spin_lock(&iommu->lock); 2540 2541 /* Context tables are copied, now write them to the root_entry table */ 2542 for (bus = 0; bus < 256; bus++) { 2543 int idx = ext ? bus * 2 : bus; 2544 u64 val; 2545 2546 if (ctxt_tbls[idx]) { 2547 val = virt_to_phys(ctxt_tbls[idx]) | 1; 2548 iommu->root_entry[bus].lo = val; 2549 } 2550 2551 if (!ext || !ctxt_tbls[idx + 1]) 2552 continue; 2553 2554 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1; 2555 iommu->root_entry[bus].hi = val; 2556 } 2557 2558 spin_unlock(&iommu->lock); 2559 2560 kfree(ctxt_tbls); 2561 2562 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE); 2563 2564 ret = 0; 2565 2566 out_unmap: 2567 memunmap(old_rt); 2568 2569 return ret; 2570 } 2571 2572 static int __init init_dmars(void) 2573 { 2574 struct dmar_drhd_unit *drhd; 2575 struct intel_iommu *iommu; 2576 int ret; 2577 2578 ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL); 2579 if (ret) 2580 goto free_iommu; 2581 2582 for_each_iommu(iommu, drhd) { 2583 if (drhd->ignored) { 2584 iommu_disable_translation(iommu); 2585 continue; 2586 } 2587 2588 /* 2589 * Find the max pasid size of all IOMMU's in the system. 2590 * We need to ensure the system pasid table is no bigger 2591 * than the smallest supported. 2592 */ 2593 if (pasid_supported(iommu)) { 2594 u32 temp = 2 << ecap_pss(iommu->ecap); 2595 2596 intel_pasid_max_id = min_t(u32, temp, 2597 intel_pasid_max_id); 2598 } 2599 2600 intel_iommu_init_qi(iommu); 2601 2602 ret = iommu_init_domains(iommu); 2603 if (ret) 2604 goto free_iommu; 2605 2606 init_translation_status(iommu); 2607 2608 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) { 2609 iommu_disable_translation(iommu); 2610 clear_translation_pre_enabled(iommu); 2611 pr_warn("Translation was enabled for %s but we are not in kdump mode\n", 2612 iommu->name); 2613 } 2614 2615 /* 2616 * TBD: 2617 * we could share the same root & context tables 2618 * among all IOMMU's. Need to Split it later. 2619 */ 2620 ret = iommu_alloc_root_entry(iommu); 2621 if (ret) 2622 goto free_iommu; 2623 2624 if (translation_pre_enabled(iommu)) { 2625 pr_info("Translation already enabled - trying to copy translation structures\n"); 2626 2627 ret = copy_translation_tables(iommu); 2628 if (ret) { 2629 /* 2630 * We found the IOMMU with translation 2631 * enabled - but failed to copy over the 2632 * old root-entry table. Try to proceed 2633 * by disabling translation now and 2634 * allocating a clean root-entry table. 2635 * This might cause DMAR faults, but 2636 * probably the dump will still succeed. 
2637 */ 2638 pr_err("Failed to copy translation tables from previous kernel for %s\n", 2639 iommu->name); 2640 iommu_disable_translation(iommu); 2641 clear_translation_pre_enabled(iommu); 2642 } else { 2643 pr_info("Copied translation tables from previous kernel for %s\n", 2644 iommu->name); 2645 } 2646 } 2647 2648 if (!ecap_pass_through(iommu->ecap)) 2649 hw_pass_through = 0; 2650 intel_svm_check(iommu); 2651 } 2652 2653 /* 2654 * Now that qi is enabled on all iommus, set the root entry and flush 2655 * caches. This is required on some Intel X58 chipsets, otherwise the 2656 * flush_context function will loop forever and the boot hangs. 2657 */ 2658 for_each_active_iommu(iommu, drhd) { 2659 iommu_flush_write_buffer(iommu); 2660 iommu_set_root_entry(iommu); 2661 } 2662 2663 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA 2664 dmar_map_gfx = 0; 2665 #endif 2666 2667 if (!dmar_map_gfx) 2668 iommu_identity_mapping |= IDENTMAP_GFX; 2669 2670 check_tylersburg_isoch(); 2671 2672 ret = si_domain_init(hw_pass_through); 2673 if (ret) 2674 goto free_iommu; 2675 2676 /* 2677 * for each drhd 2678 * enable fault log 2679 * global invalidate context cache 2680 * global invalidate iotlb 2681 * enable translation 2682 */ 2683 for_each_iommu(iommu, drhd) { 2684 if (drhd->ignored) { 2685 /* 2686 * we always have to disable PMRs or DMA may fail on 2687 * this device 2688 */ 2689 if (force_on) 2690 iommu_disable_protect_mem_regions(iommu); 2691 continue; 2692 } 2693 2694 iommu_flush_write_buffer(iommu); 2695 2696 #ifdef CONFIG_INTEL_IOMMU_SVM 2697 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) { 2698 /* 2699 * Call dmar_alloc_hwirq() with dmar_global_lock held, 2700 * could cause possible lock race condition. 2701 */ 2702 up_write(&dmar_global_lock); 2703 ret = intel_svm_enable_prq(iommu); 2704 down_write(&dmar_global_lock); 2705 if (ret) 2706 goto free_iommu; 2707 } 2708 #endif 2709 ret = dmar_set_interrupt(iommu); 2710 if (ret) 2711 goto free_iommu; 2712 } 2713 2714 return 0; 2715 2716 free_iommu: 2717 for_each_active_iommu(iommu, drhd) { 2718 disable_dmar_iommu(iommu); 2719 free_dmar_iommu(iommu); 2720 } 2721 if (si_domain) { 2722 domain_exit(si_domain); 2723 si_domain = NULL; 2724 } 2725 2726 return ret; 2727 } 2728 2729 static void __init init_no_remapping_devices(void) 2730 { 2731 struct dmar_drhd_unit *drhd; 2732 struct device *dev; 2733 int i; 2734 2735 for_each_drhd_unit(drhd) { 2736 if (!drhd->include_all) { 2737 for_each_active_dev_scope(drhd->devices, 2738 drhd->devices_cnt, i, dev) 2739 break; 2740 /* ignore DMAR unit if no devices exist */ 2741 if (i == drhd->devices_cnt) 2742 drhd->ignored = 1; 2743 } 2744 } 2745 2746 for_each_active_drhd_unit(drhd) { 2747 if (drhd->include_all) 2748 continue; 2749 2750 for_each_active_dev_scope(drhd->devices, 2751 drhd->devices_cnt, i, dev) 2752 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev))) 2753 break; 2754 if (i < drhd->devices_cnt) 2755 continue; 2756 2757 /* This IOMMU has *only* gfx devices. 
Either bypass it or 2758 set the gfx_mapped flag, as appropriate */ 2759 drhd->gfx_dedicated = 1; 2760 if (!dmar_map_gfx) 2761 drhd->ignored = 1; 2762 } 2763 } 2764 2765 #ifdef CONFIG_SUSPEND 2766 static int init_iommu_hw(void) 2767 { 2768 struct dmar_drhd_unit *drhd; 2769 struct intel_iommu *iommu = NULL; 2770 int ret; 2771 2772 for_each_active_iommu(iommu, drhd) { 2773 if (iommu->qi) { 2774 ret = dmar_reenable_qi(iommu); 2775 if (ret) 2776 return ret; 2777 } 2778 } 2779 2780 for_each_iommu(iommu, drhd) { 2781 if (drhd->ignored) { 2782 /* 2783 * we always have to disable PMRs or DMA may fail on 2784 * this device 2785 */ 2786 if (force_on) 2787 iommu_disable_protect_mem_regions(iommu); 2788 continue; 2789 } 2790 2791 iommu_flush_write_buffer(iommu); 2792 iommu_set_root_entry(iommu); 2793 iommu_enable_translation(iommu); 2794 iommu_disable_protect_mem_regions(iommu); 2795 } 2796 2797 return 0; 2798 } 2799 2800 static void iommu_flush_all(void) 2801 { 2802 struct dmar_drhd_unit *drhd; 2803 struct intel_iommu *iommu; 2804 2805 for_each_active_iommu(iommu, drhd) { 2806 iommu->flush.flush_context(iommu, 0, 0, 0, 2807 DMA_CCMD_GLOBAL_INVL); 2808 iommu->flush.flush_iotlb(iommu, 0, 0, 0, 2809 DMA_TLB_GLOBAL_FLUSH); 2810 } 2811 } 2812 2813 static int iommu_suspend(void) 2814 { 2815 struct dmar_drhd_unit *drhd; 2816 struct intel_iommu *iommu = NULL; 2817 unsigned long flag; 2818 2819 iommu_flush_all(); 2820 2821 for_each_active_iommu(iommu, drhd) { 2822 iommu_disable_translation(iommu); 2823 2824 raw_spin_lock_irqsave(&iommu->register_lock, flag); 2825 2826 iommu->iommu_state[SR_DMAR_FECTL_REG] = 2827 readl(iommu->reg + DMAR_FECTL_REG); 2828 iommu->iommu_state[SR_DMAR_FEDATA_REG] = 2829 readl(iommu->reg + DMAR_FEDATA_REG); 2830 iommu->iommu_state[SR_DMAR_FEADDR_REG] = 2831 readl(iommu->reg + DMAR_FEADDR_REG); 2832 iommu->iommu_state[SR_DMAR_FEUADDR_REG] = 2833 readl(iommu->reg + DMAR_FEUADDR_REG); 2834 2835 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 2836 } 2837 return 0; 2838 } 2839 2840 static void iommu_resume(void) 2841 { 2842 struct dmar_drhd_unit *drhd; 2843 struct intel_iommu *iommu = NULL; 2844 unsigned long flag; 2845 2846 if (init_iommu_hw()) { 2847 if (force_on) 2848 panic("tboot: IOMMU setup failed, DMAR can not resume!\n"); 2849 else 2850 WARN(1, "IOMMU setup failed, DMAR can not resume!\n"); 2851 return; 2852 } 2853 2854 for_each_active_iommu(iommu, drhd) { 2855 2856 raw_spin_lock_irqsave(&iommu->register_lock, flag); 2857 2858 writel(iommu->iommu_state[SR_DMAR_FECTL_REG], 2859 iommu->reg + DMAR_FECTL_REG); 2860 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG], 2861 iommu->reg + DMAR_FEDATA_REG); 2862 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG], 2863 iommu->reg + DMAR_FEADDR_REG); 2864 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG], 2865 iommu->reg + DMAR_FEUADDR_REG); 2866 2867 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 2868 } 2869 } 2870 2871 static struct syscore_ops iommu_syscore_ops = { 2872 .resume = iommu_resume, 2873 .suspend = iommu_suspend, 2874 }; 2875 2876 static void __init init_iommu_pm_ops(void) 2877 { 2878 register_syscore_ops(&iommu_syscore_ops); 2879 } 2880 2881 #else 2882 static inline void init_iommu_pm_ops(void) {} 2883 #endif /* CONFIG_PM */ 2884 2885 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr) 2886 { 2887 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) || 2888 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) || 2889 rmrr->end_address <= rmrr->base_address || 2890 arch_rmrr_sanity_check(rmrr)) 2891 return 
-EINVAL; 2892 2893 return 0; 2894 } 2895 2896 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg) 2897 { 2898 struct acpi_dmar_reserved_memory *rmrr; 2899 struct dmar_rmrr_unit *rmrru; 2900 2901 rmrr = (struct acpi_dmar_reserved_memory *)header; 2902 if (rmrr_sanity_check(rmrr)) { 2903 pr_warn(FW_BUG 2904 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n" 2905 "BIOS vendor: %s; Ver: %s; Product Version: %s\n", 2906 rmrr->base_address, rmrr->end_address, 2907 dmi_get_system_info(DMI_BIOS_VENDOR), 2908 dmi_get_system_info(DMI_BIOS_VERSION), 2909 dmi_get_system_info(DMI_PRODUCT_VERSION)); 2910 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); 2911 } 2912 2913 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL); 2914 if (!rmrru) 2915 goto out; 2916 2917 rmrru->hdr = header; 2918 2919 rmrru->base_address = rmrr->base_address; 2920 rmrru->end_address = rmrr->end_address; 2921 2922 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1), 2923 ((void *)rmrr) + rmrr->header.length, 2924 &rmrru->devices_cnt); 2925 if (rmrru->devices_cnt && rmrru->devices == NULL) 2926 goto free_rmrru; 2927 2928 list_add(&rmrru->list, &dmar_rmrr_units); 2929 2930 return 0; 2931 free_rmrru: 2932 kfree(rmrru); 2933 out: 2934 return -ENOMEM; 2935 } 2936 2937 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr) 2938 { 2939 struct dmar_atsr_unit *atsru; 2940 struct acpi_dmar_atsr *tmp; 2941 2942 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list, 2943 dmar_rcu_check()) { 2944 tmp = (struct acpi_dmar_atsr *)atsru->hdr; 2945 if (atsr->segment != tmp->segment) 2946 continue; 2947 if (atsr->header.length != tmp->header.length) 2948 continue; 2949 if (memcmp(atsr, tmp, atsr->header.length) == 0) 2950 return atsru; 2951 } 2952 2953 return NULL; 2954 } 2955 2956 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg) 2957 { 2958 struct acpi_dmar_atsr *atsr; 2959 struct dmar_atsr_unit *atsru; 2960 2961 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled) 2962 return 0; 2963 2964 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 2965 atsru = dmar_find_atsr(atsr); 2966 if (atsru) 2967 return 0; 2968 2969 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL); 2970 if (!atsru) 2971 return -ENOMEM; 2972 2973 /* 2974 * If memory is allocated from slab by ACPI _DSM method, we need to 2975 * copy the memory content because the memory buffer will be freed 2976 * on return. 
2977 */ 2978 atsru->hdr = (void *)(atsru + 1); 2979 memcpy(atsru->hdr, hdr, hdr->length); 2980 atsru->include_all = atsr->flags & 0x1; 2981 if (!atsru->include_all) { 2982 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1), 2983 (void *)atsr + atsr->header.length, 2984 &atsru->devices_cnt); 2985 if (atsru->devices_cnt && atsru->devices == NULL) { 2986 kfree(atsru); 2987 return -ENOMEM; 2988 } 2989 } 2990 2991 list_add_rcu(&atsru->list, &dmar_atsr_units); 2992 2993 return 0; 2994 } 2995 2996 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru) 2997 { 2998 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt); 2999 kfree(atsru); 3000 } 3001 3002 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg) 3003 { 3004 struct acpi_dmar_atsr *atsr; 3005 struct dmar_atsr_unit *atsru; 3006 3007 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 3008 atsru = dmar_find_atsr(atsr); 3009 if (atsru) { 3010 list_del_rcu(&atsru->list); 3011 synchronize_rcu(); 3012 intel_iommu_free_atsr(atsru); 3013 } 3014 3015 return 0; 3016 } 3017 3018 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg) 3019 { 3020 int i; 3021 struct device *dev; 3022 struct acpi_dmar_atsr *atsr; 3023 struct dmar_atsr_unit *atsru; 3024 3025 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 3026 atsru = dmar_find_atsr(atsr); 3027 if (!atsru) 3028 return 0; 3029 3030 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) { 3031 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt, 3032 i, dev) 3033 return -EBUSY; 3034 } 3035 3036 return 0; 3037 } 3038 3039 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc) 3040 { 3041 struct dmar_satc_unit *satcu; 3042 struct acpi_dmar_satc *tmp; 3043 3044 list_for_each_entry_rcu(satcu, &dmar_satc_units, list, 3045 dmar_rcu_check()) { 3046 tmp = (struct acpi_dmar_satc *)satcu->hdr; 3047 if (satc->segment != tmp->segment) 3048 continue; 3049 if (satc->header.length != tmp->header.length) 3050 continue; 3051 if (memcmp(satc, tmp, satc->header.length) == 0) 3052 return satcu; 3053 } 3054 3055 return NULL; 3056 } 3057 3058 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg) 3059 { 3060 struct acpi_dmar_satc *satc; 3061 struct dmar_satc_unit *satcu; 3062 3063 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled) 3064 return 0; 3065 3066 satc = container_of(hdr, struct acpi_dmar_satc, header); 3067 satcu = dmar_find_satc(satc); 3068 if (satcu) 3069 return 0; 3070 3071 satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL); 3072 if (!satcu) 3073 return -ENOMEM; 3074 3075 satcu->hdr = (void *)(satcu + 1); 3076 memcpy(satcu->hdr, hdr, hdr->length); 3077 satcu->atc_required = satc->flags & 0x1; 3078 satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1), 3079 (void *)satc + satc->header.length, 3080 &satcu->devices_cnt); 3081 if (satcu->devices_cnt && !satcu->devices) { 3082 kfree(satcu); 3083 return -ENOMEM; 3084 } 3085 list_add_rcu(&satcu->list, &dmar_satc_units); 3086 3087 return 0; 3088 } 3089 3090 static int intel_iommu_add(struct dmar_drhd_unit *dmaru) 3091 { 3092 int sp, ret; 3093 struct intel_iommu *iommu = dmaru->iommu; 3094 3095 ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu); 3096 if (ret) 3097 goto out; 3098 3099 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) { 3100 pr_warn("%s: Doesn't support hardware pass through.\n", 3101 iommu->name); 3102 return -ENXIO; 3103 } 3104 3105 sp = domain_update_iommu_superpage(NULL, iommu) - 1; 3106 if (sp >= 0 && 
!(cap_super_page_val(iommu->cap) & (1 << sp))) { 3107 pr_warn("%s: Doesn't support large page.\n", 3108 iommu->name); 3109 return -ENXIO; 3110 } 3111 3112 /* 3113 * Disable translation if already enabled prior to OS handover. 3114 */ 3115 if (iommu->gcmd & DMA_GCMD_TE) 3116 iommu_disable_translation(iommu); 3117 3118 ret = iommu_init_domains(iommu); 3119 if (ret == 0) 3120 ret = iommu_alloc_root_entry(iommu); 3121 if (ret) 3122 goto out; 3123 3124 intel_svm_check(iommu); 3125 3126 if (dmaru->ignored) { 3127 /* 3128 * we always have to disable PMRs or DMA may fail on this device 3129 */ 3130 if (force_on) 3131 iommu_disable_protect_mem_regions(iommu); 3132 return 0; 3133 } 3134 3135 intel_iommu_init_qi(iommu); 3136 iommu_flush_write_buffer(iommu); 3137 3138 #ifdef CONFIG_INTEL_IOMMU_SVM 3139 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) { 3140 ret = intel_svm_enable_prq(iommu); 3141 if (ret) 3142 goto disable_iommu; 3143 } 3144 #endif 3145 ret = dmar_set_interrupt(iommu); 3146 if (ret) 3147 goto disable_iommu; 3148 3149 iommu_set_root_entry(iommu); 3150 iommu_enable_translation(iommu); 3151 3152 iommu_disable_protect_mem_regions(iommu); 3153 return 0; 3154 3155 disable_iommu: 3156 disable_dmar_iommu(iommu); 3157 out: 3158 free_dmar_iommu(iommu); 3159 return ret; 3160 } 3161 3162 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert) 3163 { 3164 int ret = 0; 3165 struct intel_iommu *iommu = dmaru->iommu; 3166 3167 if (!intel_iommu_enabled) 3168 return 0; 3169 if (iommu == NULL) 3170 return -EINVAL; 3171 3172 if (insert) { 3173 ret = intel_iommu_add(dmaru); 3174 } else { 3175 disable_dmar_iommu(iommu); 3176 free_dmar_iommu(iommu); 3177 } 3178 3179 return ret; 3180 } 3181 3182 static void intel_iommu_free_dmars(void) 3183 { 3184 struct dmar_rmrr_unit *rmrru, *rmrr_n; 3185 struct dmar_atsr_unit *atsru, *atsr_n; 3186 struct dmar_satc_unit *satcu, *satc_n; 3187 3188 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) { 3189 list_del(&rmrru->list); 3190 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt); 3191 kfree(rmrru); 3192 } 3193 3194 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) { 3195 list_del(&atsru->list); 3196 intel_iommu_free_atsr(atsru); 3197 } 3198 list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) { 3199 list_del(&satcu->list); 3200 dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt); 3201 kfree(satcu); 3202 } 3203 } 3204 3205 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev) 3206 { 3207 struct dmar_satc_unit *satcu; 3208 struct acpi_dmar_satc *satc; 3209 struct device *tmp; 3210 int i; 3211 3212 dev = pci_physfn(dev); 3213 rcu_read_lock(); 3214 3215 list_for_each_entry_rcu(satcu, &dmar_satc_units, list) { 3216 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header); 3217 if (satc->segment != pci_domain_nr(dev->bus)) 3218 continue; 3219 for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp) 3220 if (to_pci_dev(tmp) == dev) 3221 goto out; 3222 } 3223 satcu = NULL; 3224 out: 3225 rcu_read_unlock(); 3226 return satcu; 3227 } 3228 3229 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu) 3230 { 3231 int i, ret = 1; 3232 struct pci_bus *bus; 3233 struct pci_dev *bridge = NULL; 3234 struct device *tmp; 3235 struct acpi_dmar_atsr *atsr; 3236 struct dmar_atsr_unit *atsru; 3237 struct dmar_satc_unit *satcu; 3238 3239 dev = pci_physfn(dev); 3240 satcu = dmar_find_matched_satc_unit(dev); 3241 if (satcu) 3242 /* 3243 * This device supports ATS as it is in 
SATC table. 3244 * When IOMMU is in legacy mode, enabling ATS is done 3245 * automatically by HW for the device that requires 3246 * ATS, hence OS should not enable this device ATS 3247 * to avoid duplicated TLB invalidation. 3248 */ 3249 return !(satcu->atc_required && !sm_supported(iommu)); 3250 3251 for (bus = dev->bus; bus; bus = bus->parent) { 3252 bridge = bus->self; 3253 /* If it's an integrated device, allow ATS */ 3254 if (!bridge) 3255 return 1; 3256 /* Connected via non-PCIe: no ATS */ 3257 if (!pci_is_pcie(bridge) || 3258 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE) 3259 return 0; 3260 /* If we found the root port, look it up in the ATSR */ 3261 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT) 3262 break; 3263 } 3264 3265 rcu_read_lock(); 3266 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) { 3267 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header); 3268 if (atsr->segment != pci_domain_nr(dev->bus)) 3269 continue; 3270 3271 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp) 3272 if (tmp == &bridge->dev) 3273 goto out; 3274 3275 if (atsru->include_all) 3276 goto out; 3277 } 3278 ret = 0; 3279 out: 3280 rcu_read_unlock(); 3281 3282 return ret; 3283 } 3284 3285 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info) 3286 { 3287 int ret; 3288 struct dmar_rmrr_unit *rmrru; 3289 struct dmar_atsr_unit *atsru; 3290 struct dmar_satc_unit *satcu; 3291 struct acpi_dmar_atsr *atsr; 3292 struct acpi_dmar_reserved_memory *rmrr; 3293 struct acpi_dmar_satc *satc; 3294 3295 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING) 3296 return 0; 3297 3298 list_for_each_entry(rmrru, &dmar_rmrr_units, list) { 3299 rmrr = container_of(rmrru->hdr, 3300 struct acpi_dmar_reserved_memory, header); 3301 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 3302 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1), 3303 ((void *)rmrr) + rmrr->header.length, 3304 rmrr->segment, rmrru->devices, 3305 rmrru->devices_cnt); 3306 if (ret < 0) 3307 return ret; 3308 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 3309 dmar_remove_dev_scope(info, rmrr->segment, 3310 rmrru->devices, rmrru->devices_cnt); 3311 } 3312 } 3313 3314 list_for_each_entry(atsru, &dmar_atsr_units, list) { 3315 if (atsru->include_all) 3316 continue; 3317 3318 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header); 3319 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 3320 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1), 3321 (void *)atsr + atsr->header.length, 3322 atsr->segment, atsru->devices, 3323 atsru->devices_cnt); 3324 if (ret > 0) 3325 break; 3326 else if (ret < 0) 3327 return ret; 3328 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 3329 if (dmar_remove_dev_scope(info, atsr->segment, 3330 atsru->devices, atsru->devices_cnt)) 3331 break; 3332 } 3333 } 3334 list_for_each_entry(satcu, &dmar_satc_units, list) { 3335 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header); 3336 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 3337 ret = dmar_insert_dev_scope(info, (void *)(satc + 1), 3338 (void *)satc + satc->header.length, 3339 satc->segment, satcu->devices, 3340 satcu->devices_cnt); 3341 if (ret > 0) 3342 break; 3343 else if (ret < 0) 3344 return ret; 3345 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 3346 if (dmar_remove_dev_scope(info, satc->segment, 3347 satcu->devices, satcu->devices_cnt)) 3348 break; 3349 } 3350 } 3351 3352 return 0; 3353 } 3354 3355 static int intel_iommu_memory_notifier(struct notifier_block *nb, 3356 unsigned long val, void *v) 3357 { 
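 /*
  * Keep the static identity domain's 1:1 map in sync with memory
  * hotplug: map ranges that are about to come online, and tear down
  * the mappings (and flush the IOTLBs) when memory goes offline.
  */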
3358 struct memory_notify *mhp = v; 3359 unsigned long start_vpfn = mm_to_dma_pfn_start(mhp->start_pfn); 3360 unsigned long last_vpfn = mm_to_dma_pfn_end(mhp->start_pfn + 3361 mhp->nr_pages - 1); 3362 3363 switch (val) { 3364 case MEM_GOING_ONLINE: 3365 if (iommu_domain_identity_map(si_domain, 3366 start_vpfn, last_vpfn)) { 3367 pr_warn("Failed to build identity map for [%lx-%lx]\n", 3368 start_vpfn, last_vpfn); 3369 return NOTIFY_BAD; 3370 } 3371 break; 3372 3373 case MEM_OFFLINE: 3374 case MEM_CANCEL_ONLINE: 3375 { 3376 struct dmar_drhd_unit *drhd; 3377 struct intel_iommu *iommu; 3378 LIST_HEAD(freelist); 3379 3380 domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist); 3381 3382 rcu_read_lock(); 3383 for_each_active_iommu(iommu, drhd) 3384 iommu_flush_iotlb_psi(iommu, si_domain, 3385 start_vpfn, mhp->nr_pages, 3386 list_empty(&freelist), 0); 3387 rcu_read_unlock(); 3388 put_pages_list(&freelist); 3389 } 3390 break; 3391 } 3392 3393 return NOTIFY_OK; 3394 } 3395 3396 static struct notifier_block intel_iommu_memory_nb = { 3397 .notifier_call = intel_iommu_memory_notifier, 3398 .priority = 0 3399 }; 3400 3401 static void intel_disable_iommus(void) 3402 { 3403 struct intel_iommu *iommu = NULL; 3404 struct dmar_drhd_unit *drhd; 3405 3406 for_each_iommu(iommu, drhd) 3407 iommu_disable_translation(iommu); 3408 } 3409 3410 void intel_iommu_shutdown(void) 3411 { 3412 struct dmar_drhd_unit *drhd; 3413 struct intel_iommu *iommu = NULL; 3414 3415 if (no_iommu || dmar_disabled) 3416 return; 3417 3418 down_write(&dmar_global_lock); 3419 3420 /* Disable PMRs explicitly here. */ 3421 for_each_iommu(iommu, drhd) 3422 iommu_disable_protect_mem_regions(iommu); 3423 3424 /* Make sure the IOMMUs are switched off */ 3425 intel_disable_iommus(); 3426 3427 up_write(&dmar_global_lock); 3428 } 3429 3430 static struct intel_iommu *dev_to_intel_iommu(struct device *dev) 3431 { 3432 struct iommu_device *iommu_dev = dev_to_iommu_device(dev); 3433 3434 return container_of(iommu_dev, struct intel_iommu, iommu); 3435 } 3436 3437 static ssize_t version_show(struct device *dev, 3438 struct device_attribute *attr, char *buf) 3439 { 3440 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3441 u32 ver = readl(iommu->reg + DMAR_VER_REG); 3442 return sysfs_emit(buf, "%d:%d\n", 3443 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver)); 3444 } 3445 static DEVICE_ATTR_RO(version); 3446 3447 static ssize_t address_show(struct device *dev, 3448 struct device_attribute *attr, char *buf) 3449 { 3450 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3451 return sysfs_emit(buf, "%llx\n", iommu->reg_phys); 3452 } 3453 static DEVICE_ATTR_RO(address); 3454 3455 static ssize_t cap_show(struct device *dev, 3456 struct device_attribute *attr, char *buf) 3457 { 3458 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3459 return sysfs_emit(buf, "%llx\n", iommu->cap); 3460 } 3461 static DEVICE_ATTR_RO(cap); 3462 3463 static ssize_t ecap_show(struct device *dev, 3464 struct device_attribute *attr, char *buf) 3465 { 3466 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3467 return sysfs_emit(buf, "%llx\n", iommu->ecap); 3468 } 3469 static DEVICE_ATTR_RO(ecap); 3470 3471 static ssize_t domains_supported_show(struct device *dev, 3472 struct device_attribute *attr, char *buf) 3473 { 3474 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3475 return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap)); 3476 } 3477 static DEVICE_ATTR_RO(domains_supported); 3478 3479 static ssize_t domains_used_show(struct device *dev, 3480 struct device_attribute 
*attr, char *buf) 3481 { 3482 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3483 return sysfs_emit(buf, "%d\n", 3484 bitmap_weight(iommu->domain_ids, 3485 cap_ndoms(iommu->cap))); 3486 } 3487 static DEVICE_ATTR_RO(domains_used); 3488 3489 static struct attribute *intel_iommu_attrs[] = { 3490 &dev_attr_version.attr, 3491 &dev_attr_address.attr, 3492 &dev_attr_cap.attr, 3493 &dev_attr_ecap.attr, 3494 &dev_attr_domains_supported.attr, 3495 &dev_attr_domains_used.attr, 3496 NULL, 3497 }; 3498 3499 static struct attribute_group intel_iommu_group = { 3500 .name = "intel-iommu", 3501 .attrs = intel_iommu_attrs, 3502 }; 3503 3504 const struct attribute_group *intel_iommu_groups[] = { 3505 &intel_iommu_group, 3506 NULL, 3507 }; 3508 3509 static bool has_external_pci(void) 3510 { 3511 struct pci_dev *pdev = NULL; 3512 3513 for_each_pci_dev(pdev) 3514 if (pdev->external_facing) { 3515 pci_dev_put(pdev); 3516 return true; 3517 } 3518 3519 return false; 3520 } 3521 3522 static int __init platform_optin_force_iommu(void) 3523 { 3524 if (!dmar_platform_optin() || no_platform_optin || !has_external_pci()) 3525 return 0; 3526 3527 if (no_iommu || dmar_disabled) 3528 pr_info("Intel-IOMMU force enabled due to platform opt in\n"); 3529 3530 /* 3531 * If Intel-IOMMU is disabled by default, we will apply identity 3532 * map for all devices except those marked as being untrusted. 3533 */ 3534 if (dmar_disabled) 3535 iommu_set_default_passthrough(false); 3536 3537 dmar_disabled = 0; 3538 no_iommu = 0; 3539 3540 return 1; 3541 } 3542 3543 static int __init probe_acpi_namespace_devices(void) 3544 { 3545 struct dmar_drhd_unit *drhd; 3546 /* To avoid a -Wunused-but-set-variable warning. */ 3547 struct intel_iommu *iommu __maybe_unused; 3548 struct device *dev; 3549 int i, ret = 0; 3550 3551 for_each_active_iommu(iommu, drhd) { 3552 for_each_active_dev_scope(drhd->devices, 3553 drhd->devices_cnt, i, dev) { 3554 struct acpi_device_physical_node *pn; 3555 struct acpi_device *adev; 3556 3557 if (dev->bus != &acpi_bus_type) 3558 continue; 3559 3560 adev = to_acpi_device(dev); 3561 mutex_lock(&adev->physical_node_lock); 3562 list_for_each_entry(pn, 3563 &adev->physical_node_list, node) { 3564 ret = iommu_probe_device(pn->dev); 3565 if (ret) 3566 break; 3567 } 3568 mutex_unlock(&adev->physical_node_lock); 3569 3570 if (ret) 3571 return ret; 3572 } 3573 } 3574 3575 return 0; 3576 } 3577 3578 static __init int tboot_force_iommu(void) 3579 { 3580 if (!tboot_enabled()) 3581 return 0; 3582 3583 if (no_iommu || dmar_disabled) 3584 pr_warn("Forcing Intel-IOMMU to enabled\n"); 3585 3586 dmar_disabled = 0; 3587 no_iommu = 0; 3588 3589 return 1; 3590 } 3591 3592 int __init intel_iommu_init(void) 3593 { 3594 int ret = -ENODEV; 3595 struct dmar_drhd_unit *drhd; 3596 struct intel_iommu *iommu; 3597 3598 /* 3599 * Intel IOMMU is required for a TXT/tboot launch or platform 3600 * opt in, so enforce that. 3601 */ 3602 force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) || 3603 platform_optin_force_iommu(); 3604 3605 down_write(&dmar_global_lock); 3606 if (dmar_table_init()) { 3607 if (force_on) 3608 panic("tboot: Failed to initialize DMAR table\n"); 3609 goto out_free_dmar; 3610 } 3611 3612 if (dmar_dev_scope_init() < 0) { 3613 if (force_on) 3614 panic("tboot: Failed to initialize DMAR device scope\n"); 3615 goto out_free_dmar; 3616 } 3617 3618 up_write(&dmar_global_lock); 3619 3620 /* 3621 * The bus notifier takes the dmar_global_lock, so lockdep will 3622 * complain later when we register it under the lock. 
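 * Register it here with the lock dropped (see up_write() above) and
 * re-take the lock afterwards.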
3623 */ 3624 dmar_register_bus_notifier(); 3625 3626 down_write(&dmar_global_lock); 3627 3628 if (!no_iommu) 3629 intel_iommu_debugfs_init(); 3630 3631 if (no_iommu || dmar_disabled) { 3632 /* 3633 * We exit the function here to ensure IOMMU's remapping and 3634 * mempool aren't setup, which means that the IOMMU's PMRs 3635 * won't be disabled via the call to init_dmars(). So disable 3636 * it explicitly here. The PMRs were setup by tboot prior to 3637 * calling SENTER, but the kernel is expected to reset/tear 3638 * down the PMRs. 3639 */ 3640 if (intel_iommu_tboot_noforce) { 3641 for_each_iommu(iommu, drhd) 3642 iommu_disable_protect_mem_regions(iommu); 3643 } 3644 3645 /* 3646 * Make sure the IOMMUs are switched off, even when we 3647 * boot into a kexec kernel and the previous kernel left 3648 * them enabled 3649 */ 3650 intel_disable_iommus(); 3651 goto out_free_dmar; 3652 } 3653 3654 if (list_empty(&dmar_rmrr_units)) 3655 pr_info("No RMRR found\n"); 3656 3657 if (list_empty(&dmar_atsr_units)) 3658 pr_info("No ATSR found\n"); 3659 3660 if (list_empty(&dmar_satc_units)) 3661 pr_info("No SATC found\n"); 3662 3663 init_no_remapping_devices(); 3664 3665 ret = init_dmars(); 3666 if (ret) { 3667 if (force_on) 3668 panic("tboot: Failed to initialize DMARs\n"); 3669 pr_err("Initialization failed\n"); 3670 goto out_free_dmar; 3671 } 3672 up_write(&dmar_global_lock); 3673 3674 init_iommu_pm_ops(); 3675 3676 down_read(&dmar_global_lock); 3677 for_each_active_iommu(iommu, drhd) { 3678 /* 3679 * The flush queue implementation does not perform 3680 * page-selective invalidations that are required for efficient 3681 * TLB flushes in virtual environments. The benefit of batching 3682 * is likely to be much lower than the overhead of synchronizing 3683 * the virtual and physical IOMMU page-tables. 3684 */ 3685 if (cap_caching_mode(iommu->cap) && 3686 !first_level_by_default(IOMMU_DOMAIN_DMA)) { 3687 pr_info_once("IOMMU batching disallowed due to virtualization\n"); 3688 iommu_set_dma_strict(); 3689 } 3690 iommu_device_sysfs_add(&iommu->iommu, NULL, 3691 intel_iommu_groups, 3692 "%s", iommu->name); 3693 iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL); 3694 3695 iommu_pmu_register(iommu); 3696 } 3697 up_read(&dmar_global_lock); 3698 3699 if (si_domain && !hw_pass_through) 3700 register_memory_notifier(&intel_iommu_memory_nb); 3701 3702 down_read(&dmar_global_lock); 3703 if (probe_acpi_namespace_devices()) 3704 pr_warn("ACPI name space devices didn't probe correctly\n"); 3705 3706 /* Finally, we enable the DMA remapping hardware. */ 3707 for_each_iommu(iommu, drhd) { 3708 if (!drhd->ignored && !translation_pre_enabled(iommu)) 3709 iommu_enable_translation(iommu); 3710 3711 iommu_disable_protect_mem_regions(iommu); 3712 } 3713 up_read(&dmar_global_lock); 3714 3715 pr_info("Intel(R) Virtualization Technology for Directed I/O\n"); 3716 3717 intel_iommu_enabled = 1; 3718 3719 return 0; 3720 3721 out_free_dmar: 3722 intel_iommu_free_dmars(); 3723 up_write(&dmar_global_lock); 3724 return ret; 3725 } 3726 3727 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque) 3728 { 3729 struct device_domain_info *info = opaque; 3730 3731 domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff); 3732 return 0; 3733 } 3734 3735 /* 3736 * NB - intel-iommu lacks any sort of reference counting for the users of 3737 * dependent devices. 
If multiple endpoints have intersecting dependent 3738 * devices, unbinding the driver from any one of them will possibly leave 3739 * the others unable to operate. 3740 */ 3741 static void domain_context_clear(struct device_domain_info *info) 3742 { 3743 if (!dev_is_pci(info->dev)) 3744 domain_context_clear_one(info, info->bus, info->devfn); 3745 3746 pci_for_each_dma_alias(to_pci_dev(info->dev), 3747 &domain_context_clear_one_cb, info); 3748 } 3749 3750 static void dmar_remove_one_dev_info(struct device *dev) 3751 { 3752 struct device_domain_info *info = dev_iommu_priv_get(dev); 3753 struct dmar_domain *domain = info->domain; 3754 struct intel_iommu *iommu = info->iommu; 3755 unsigned long flags; 3756 3757 if (!dev_is_real_dma_subdevice(info->dev)) { 3758 if (dev_is_pci(info->dev) && sm_supported(iommu)) 3759 intel_pasid_tear_down_entry(iommu, info->dev, 3760 IOMMU_NO_PASID, false); 3761 3762 iommu_disable_pci_caps(info); 3763 domain_context_clear(info); 3764 } 3765 3766 spin_lock_irqsave(&domain->lock, flags); 3767 list_del(&info->link); 3768 spin_unlock_irqrestore(&domain->lock, flags); 3769 3770 domain_detach_iommu(domain, iommu); 3771 info->domain = NULL; 3772 } 3773 3774 /* 3775 * Clear the page table pointer in context or pasid table entries so that 3776 * all DMA requests without PASID from the device are blocked. If the page 3777 * table has been set, clean up the data structures. 3778 */ 3779 void device_block_translation(struct device *dev) 3780 { 3781 struct device_domain_info *info = dev_iommu_priv_get(dev); 3782 struct intel_iommu *iommu = info->iommu; 3783 unsigned long flags; 3784 3785 iommu_disable_pci_caps(info); 3786 if (!dev_is_real_dma_subdevice(dev)) { 3787 if (sm_supported(iommu)) 3788 intel_pasid_tear_down_entry(iommu, dev, 3789 IOMMU_NO_PASID, false); 3790 else 3791 domain_context_clear(info); 3792 } 3793 3794 if (!info->domain) 3795 return; 3796 3797 spin_lock_irqsave(&info->domain->lock, flags); 3798 list_del(&info->link); 3799 spin_unlock_irqrestore(&info->domain->lock, flags); 3800 3801 domain_detach_iommu(info->domain, iommu); 3802 info->domain = NULL; 3803 } 3804 3805 static int md_domain_init(struct dmar_domain *domain, int guest_width) 3806 { 3807 int adjust_width; 3808 3809 /* calculate AGAW */ 3810 domain->gaw = guest_width; 3811 adjust_width = guestwidth_to_adjustwidth(guest_width); 3812 domain->agaw = width_to_agaw(adjust_width); 3813 3814 domain->iommu_coherency = false; 3815 domain->iommu_superpage = 0; 3816 domain->max_addr = 0; 3817 3818 /* always allocate the top pgd */ 3819 domain->pgd = alloc_pgtable_page(domain->nid, GFP_ATOMIC); 3820 if (!domain->pgd) 3821 return -ENOMEM; 3822 domain_flush_cache(domain, domain->pgd, PAGE_SIZE); 3823 return 0; 3824 } 3825 3826 static int blocking_domain_attach_dev(struct iommu_domain *domain, 3827 struct device *dev) 3828 { 3829 device_block_translation(dev); 3830 return 0; 3831 } 3832 3833 static struct iommu_domain blocking_domain = { 3834 .type = IOMMU_DOMAIN_BLOCKED, 3835 .ops = &(const struct iommu_domain_ops) { 3836 .attach_dev = blocking_domain_attach_dev, 3837 } 3838 }; 3839 3840 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type) 3841 { 3842 struct dmar_domain *dmar_domain; 3843 struct iommu_domain *domain; 3844 3845 switch (type) { 3846 case IOMMU_DOMAIN_DMA: 3847 case IOMMU_DOMAIN_UNMANAGED: 3848 dmar_domain = alloc_domain(type); 3849 if (!dmar_domain) { 3850 pr_err("Can't allocate dmar_domain\n"); 3851 return NULL; 3852 } 3853 if (md_domain_init(dmar_domain, 
DEFAULT_DOMAIN_ADDRESS_WIDTH)) { 3854 pr_err("Domain initialization failed\n"); 3855 domain_exit(dmar_domain); 3856 return NULL; 3857 } 3858 3859 domain = &dmar_domain->domain; 3860 domain->geometry.aperture_start = 0; 3861 domain->geometry.aperture_end = 3862 __DOMAIN_MAX_ADDR(dmar_domain->gaw); 3863 domain->geometry.force_aperture = true; 3864 3865 return domain; 3866 case IOMMU_DOMAIN_IDENTITY: 3867 return &si_domain->domain; 3868 case IOMMU_DOMAIN_SVA: 3869 return intel_svm_domain_alloc(); 3870 default: 3871 return NULL; 3872 } 3873 3874 return NULL; 3875 } 3876 3877 static struct iommu_domain * 3878 intel_iommu_domain_alloc_user(struct device *dev, u32 flags, 3879 struct iommu_domain *parent, 3880 const struct iommu_user_data *user_data) 3881 { 3882 struct device_domain_info *info = dev_iommu_priv_get(dev); 3883 bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; 3884 bool nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT; 3885 struct intel_iommu *iommu = info->iommu; 3886 struct iommu_domain *domain; 3887 3888 /* Must be NESTING domain */ 3889 if (parent) { 3890 if (!nested_supported(iommu) || flags) 3891 return ERR_PTR(-EOPNOTSUPP); 3892 return intel_nested_domain_alloc(parent, user_data); 3893 } 3894 3895 if (flags & 3896 (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING))) 3897 return ERR_PTR(-EOPNOTSUPP); 3898 if (nested_parent && !nested_supported(iommu)) 3899 return ERR_PTR(-EOPNOTSUPP); 3900 if (user_data || (dirty_tracking && !ssads_supported(iommu))) 3901 return ERR_PTR(-EOPNOTSUPP); 3902 3903 /* 3904 * domain_alloc_user op needs to fully initialize a domain before 3905 * return, so uses iommu_domain_alloc() here for simple. 3906 */ 3907 domain = iommu_domain_alloc(dev->bus); 3908 if (!domain) 3909 return ERR_PTR(-ENOMEM); 3910 3911 if (nested_parent) 3912 to_dmar_domain(domain)->nested_parent = true; 3913 3914 if (dirty_tracking) { 3915 if (to_dmar_domain(domain)->use_first_level) { 3916 iommu_domain_free(domain); 3917 return ERR_PTR(-EOPNOTSUPP); 3918 } 3919 domain->dirty_ops = &intel_dirty_ops; 3920 } 3921 3922 return domain; 3923 } 3924 3925 static void intel_iommu_domain_free(struct iommu_domain *domain) 3926 { 3927 if (domain != &si_domain->domain) 3928 domain_exit(to_dmar_domain(domain)); 3929 } 3930 3931 int prepare_domain_attach_device(struct iommu_domain *domain, 3932 struct device *dev) 3933 { 3934 struct device_domain_info *info = dev_iommu_priv_get(dev); 3935 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 3936 struct intel_iommu *iommu = info->iommu; 3937 int addr_width; 3938 3939 if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap)) 3940 return -EINVAL; 3941 3942 if (domain->dirty_ops && !ssads_supported(iommu)) 3943 return -EINVAL; 3944 3945 /* check if this iommu agaw is sufficient for max mapped address */ 3946 addr_width = agaw_to_width(iommu->agaw); 3947 if (addr_width > cap_mgaw(iommu->cap)) 3948 addr_width = cap_mgaw(iommu->cap); 3949 3950 if (dmar_domain->max_addr > (1LL << addr_width)) 3951 return -EINVAL; 3952 dmar_domain->gaw = addr_width; 3953 3954 /* 3955 * Knock out extra levels of page tables if necessary 3956 */ 3957 while (iommu->agaw < dmar_domain->agaw) { 3958 struct dma_pte *pte; 3959 3960 pte = dmar_domain->pgd; 3961 if (dma_pte_present(pte)) { 3962 dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte)); 3963 free_pgtable_page(pte); 3964 } 3965 dmar_domain->agaw--; 3966 } 3967 3968 return 0; 3969 } 3970 3971 static int intel_iommu_attach_device(struct iommu_domain *domain, 3972 struct device 
*dev) 3973 { 3974 struct device_domain_info *info = dev_iommu_priv_get(dev); 3975 int ret; 3976 3977 if (info->domain) 3978 device_block_translation(dev); 3979 3980 ret = prepare_domain_attach_device(domain, dev); 3981 if (ret) 3982 return ret; 3983 3984 return dmar_domain_attach_device(to_dmar_domain(domain), dev); 3985 } 3986 3987 static int intel_iommu_map(struct iommu_domain *domain, 3988 unsigned long iova, phys_addr_t hpa, 3989 size_t size, int iommu_prot, gfp_t gfp) 3990 { 3991 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 3992 u64 max_addr; 3993 int prot = 0; 3994 3995 if (iommu_prot & IOMMU_READ) 3996 prot |= DMA_PTE_READ; 3997 if (iommu_prot & IOMMU_WRITE) 3998 prot |= DMA_PTE_WRITE; 3999 if (dmar_domain->set_pte_snp) 4000 prot |= DMA_PTE_SNP; 4001 4002 max_addr = iova + size; 4003 if (dmar_domain->max_addr < max_addr) { 4004 u64 end; 4005 4006 /* check if minimum agaw is sufficient for mapped address */ 4007 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1; 4008 if (end < max_addr) { 4009 pr_err("%s: iommu width (%d) is not " 4010 "sufficient for the mapped address (%llx)\n", 4011 __func__, dmar_domain->gaw, max_addr); 4012 return -EFAULT; 4013 } 4014 dmar_domain->max_addr = max_addr; 4015 } 4016 /* Round up size to next multiple of PAGE_SIZE, if it and 4017 the low bits of hpa would take us onto the next page */ 4018 size = aligned_nrpages(hpa, size); 4019 return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT, 4020 hpa >> VTD_PAGE_SHIFT, size, prot, gfp); 4021 } 4022 4023 static int intel_iommu_map_pages(struct iommu_domain *domain, 4024 unsigned long iova, phys_addr_t paddr, 4025 size_t pgsize, size_t pgcount, 4026 int prot, gfp_t gfp, size_t *mapped) 4027 { 4028 unsigned long pgshift = __ffs(pgsize); 4029 size_t size = pgcount << pgshift; 4030 int ret; 4031 4032 if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G) 4033 return -EINVAL; 4034 4035 if (!IS_ALIGNED(iova | paddr, pgsize)) 4036 return -EINVAL; 4037 4038 ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp); 4039 if (!ret && mapped) 4040 *mapped = size; 4041 4042 return ret; 4043 } 4044 4045 static size_t intel_iommu_unmap(struct iommu_domain *domain, 4046 unsigned long iova, size_t size, 4047 struct iommu_iotlb_gather *gather) 4048 { 4049 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4050 unsigned long start_pfn, last_pfn; 4051 int level = 0; 4052 4053 /* Cope with horrid API which requires us to unmap more than the 4054 size argument if it happens to be a large-page mapping. */ 4055 if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, 4056 &level, GFP_ATOMIC))) 4057 return 0; 4058 4059 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level)) 4060 size = VTD_PAGE_SIZE << level_to_offset_bits(level); 4061 4062 start_pfn = iova >> VTD_PAGE_SHIFT; 4063 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT; 4064 4065 domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist); 4066 4067 if (dmar_domain->max_addr == iova + size) 4068 dmar_domain->max_addr = iova; 4069 4070 /* 4071 * We do not use page-selective IOTLB invalidation in flush queue, 4072 * so there is no need to track page and sync iotlb. 
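 * Pages are only added to the gather for the strict path, where
 * intel_iommu_tlb_sync() later performs the page-selective flush.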
4073 */ 4074 if (!iommu_iotlb_gather_queued(gather)) 4075 iommu_iotlb_gather_add_page(domain, gather, iova, size); 4076 4077 return size; 4078 } 4079 4080 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain, 4081 unsigned long iova, 4082 size_t pgsize, size_t pgcount, 4083 struct iommu_iotlb_gather *gather) 4084 { 4085 unsigned long pgshift = __ffs(pgsize); 4086 size_t size = pgcount << pgshift; 4087 4088 return intel_iommu_unmap(domain, iova, size, gather); 4089 } 4090 4091 static void intel_iommu_tlb_sync(struct iommu_domain *domain, 4092 struct iommu_iotlb_gather *gather) 4093 { 4094 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4095 unsigned long iova_pfn = IOVA_PFN(gather->start); 4096 size_t size = gather->end - gather->start; 4097 struct iommu_domain_info *info; 4098 unsigned long start_pfn; 4099 unsigned long nrpages; 4100 unsigned long i; 4101 4102 nrpages = aligned_nrpages(gather->start, size); 4103 start_pfn = mm_to_dma_pfn_start(iova_pfn); 4104 4105 xa_for_each(&dmar_domain->iommu_array, i, info) 4106 iommu_flush_iotlb_psi(info->iommu, dmar_domain, 4107 start_pfn, nrpages, 4108 list_empty(&gather->freelist), 0); 4109 4110 put_pages_list(&gather->freelist); 4111 } 4112 4113 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain, 4114 dma_addr_t iova) 4115 { 4116 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4117 struct dma_pte *pte; 4118 int level = 0; 4119 u64 phys = 0; 4120 4121 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level, 4122 GFP_ATOMIC); 4123 if (pte && dma_pte_present(pte)) 4124 phys = dma_pte_addr(pte) + 4125 (iova & (BIT_MASK(level_to_offset_bits(level) + 4126 VTD_PAGE_SHIFT) - 1)); 4127 4128 return phys; 4129 } 4130 4131 static bool domain_support_force_snooping(struct dmar_domain *domain) 4132 { 4133 struct device_domain_info *info; 4134 bool support = true; 4135 4136 assert_spin_locked(&domain->lock); 4137 list_for_each_entry(info, &domain->devices, link) { 4138 if (!ecap_sc_support(info->iommu->ecap)) { 4139 support = false; 4140 break; 4141 } 4142 } 4143 4144 return support; 4145 } 4146 4147 static void domain_set_force_snooping(struct dmar_domain *domain) 4148 { 4149 struct device_domain_info *info; 4150 4151 assert_spin_locked(&domain->lock); 4152 /* 4153 * Second level page table supports per-PTE snoop control. The 4154 * iommu_map() interface will handle this by setting SNP bit. 
4155 */ 4156 if (!domain->use_first_level) { 4157 domain->set_pte_snp = true; 4158 return; 4159 } 4160 4161 list_for_each_entry(info, &domain->devices, link) 4162 intel_pasid_setup_page_snoop_control(info->iommu, info->dev, 4163 IOMMU_NO_PASID); 4164 } 4165 4166 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain) 4167 { 4168 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4169 unsigned long flags; 4170 4171 if (dmar_domain->force_snooping) 4172 return true; 4173 4174 spin_lock_irqsave(&dmar_domain->lock, flags); 4175 if (!domain_support_force_snooping(dmar_domain) || 4176 (!dmar_domain->use_first_level && dmar_domain->has_mappings)) { 4177 spin_unlock_irqrestore(&dmar_domain->lock, flags); 4178 return false; 4179 } 4180 4181 domain_set_force_snooping(dmar_domain); 4182 dmar_domain->force_snooping = true; 4183 spin_unlock_irqrestore(&dmar_domain->lock, flags); 4184 4185 return true; 4186 } 4187 4188 static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap) 4189 { 4190 struct device_domain_info *info = dev_iommu_priv_get(dev); 4191 4192 switch (cap) { 4193 case IOMMU_CAP_CACHE_COHERENCY: 4194 case IOMMU_CAP_DEFERRED_FLUSH: 4195 return true; 4196 case IOMMU_CAP_PRE_BOOT_PROTECTION: 4197 return dmar_platform_optin(); 4198 case IOMMU_CAP_ENFORCE_CACHE_COHERENCY: 4199 return ecap_sc_support(info->iommu->ecap); 4200 case IOMMU_CAP_DIRTY_TRACKING: 4201 return ssads_supported(info->iommu); 4202 default: 4203 return false; 4204 } 4205 } 4206 4207 static struct iommu_device *intel_iommu_probe_device(struct device *dev) 4208 { 4209 struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL; 4210 struct device_domain_info *info; 4211 struct intel_iommu *iommu; 4212 u8 bus, devfn; 4213 int ret; 4214 4215 iommu = device_lookup_iommu(dev, &bus, &devfn); 4216 if (!iommu || !iommu->iommu.ops) 4217 return ERR_PTR(-ENODEV); 4218 4219 info = kzalloc(sizeof(*info), GFP_KERNEL); 4220 if (!info) 4221 return ERR_PTR(-ENOMEM); 4222 4223 if (dev_is_real_dma_subdevice(dev)) { 4224 info->bus = pdev->bus->number; 4225 info->devfn = pdev->devfn; 4226 info->segment = pci_domain_nr(pdev->bus); 4227 } else { 4228 info->bus = bus; 4229 info->devfn = devfn; 4230 info->segment = iommu->segment; 4231 } 4232 4233 info->dev = dev; 4234 info->iommu = iommu; 4235 if (dev_is_pci(dev)) { 4236 if (ecap_dev_iotlb_support(iommu->ecap) && 4237 pci_ats_supported(pdev) && 4238 dmar_ats_supported(pdev, iommu)) { 4239 info->ats_supported = 1; 4240 info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev); 4241 4242 /* 4243 * For IOMMU that supports device IOTLB throttling 4244 * (DIT), we assign PFSID to the invalidation desc 4245 * of a VF such that IOMMU HW can gauge queue depth 4246 * at PF level. If DIT is not set, PFSID will be 4247 * treated as reserved, which should be set to 0. 
4248 */ 4249 if (ecap_dit(iommu->ecap)) 4250 info->pfsid = pci_dev_id(pci_physfn(pdev)); 4251 info->ats_qdep = pci_ats_queue_depth(pdev); 4252 } 4253 if (sm_supported(iommu)) { 4254 if (pasid_supported(iommu)) { 4255 int features = pci_pasid_features(pdev); 4256 4257 if (features >= 0) 4258 info->pasid_supported = features | 1; 4259 } 4260 4261 if (info->ats_supported && ecap_prs(iommu->ecap) && 4262 pci_pri_supported(pdev)) 4263 info->pri_supported = 1; 4264 } 4265 } 4266 4267 dev_iommu_priv_set(dev, info); 4268 4269 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) { 4270 ret = intel_pasid_alloc_table(dev); 4271 if (ret) { 4272 dev_err(dev, "PASID table allocation failed\n"); 4273 kfree(info); 4274 return ERR_PTR(ret); 4275 } 4276 } 4277 4278 intel_iommu_debugfs_create_dev(info); 4279 4280 return &iommu->iommu; 4281 } 4282 4283 static void intel_iommu_release_device(struct device *dev) 4284 { 4285 struct device_domain_info *info = dev_iommu_priv_get(dev); 4286 4287 dmar_remove_one_dev_info(dev); 4288 intel_pasid_free_table(dev); 4289 intel_iommu_debugfs_remove_dev(info); 4290 kfree(info); 4291 set_dma_ops(dev, NULL); 4292 } 4293 4294 static void intel_iommu_probe_finalize(struct device *dev) 4295 { 4296 set_dma_ops(dev, NULL); 4297 iommu_setup_dma_ops(dev, 0, U64_MAX); 4298 } 4299 4300 static void intel_iommu_get_resv_regions(struct device *device, 4301 struct list_head *head) 4302 { 4303 int prot = DMA_PTE_READ | DMA_PTE_WRITE; 4304 struct iommu_resv_region *reg; 4305 struct dmar_rmrr_unit *rmrr; 4306 struct device *i_dev; 4307 int i; 4308 4309 rcu_read_lock(); 4310 for_each_rmrr_units(rmrr) { 4311 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt, 4312 i, i_dev) { 4313 struct iommu_resv_region *resv; 4314 enum iommu_resv_type type; 4315 size_t length; 4316 4317 if (i_dev != device && 4318 !is_downstream_to_pci_bridge(device, i_dev)) 4319 continue; 4320 4321 length = rmrr->end_address - rmrr->base_address + 1; 4322 4323 type = device_rmrr_is_relaxable(device) ? 
4324 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT; 4325 4326 resv = iommu_alloc_resv_region(rmrr->base_address, 4327 length, prot, type, 4328 GFP_ATOMIC); 4329 if (!resv) 4330 break; 4331 4332 list_add_tail(&resv->list, head); 4333 } 4334 } 4335 rcu_read_unlock(); 4336 4337 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA 4338 if (dev_is_pci(device)) { 4339 struct pci_dev *pdev = to_pci_dev(device); 4340 4341 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) { 4342 reg = iommu_alloc_resv_region(0, 1UL << 24, prot, 4343 IOMMU_RESV_DIRECT_RELAXABLE, 4344 GFP_KERNEL); 4345 if (reg) 4346 list_add_tail(®->list, head); 4347 } 4348 } 4349 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */ 4350 4351 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START, 4352 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1, 4353 0, IOMMU_RESV_MSI, GFP_KERNEL); 4354 if (!reg) 4355 return; 4356 list_add_tail(®->list, head); 4357 } 4358 4359 static struct iommu_group *intel_iommu_device_group(struct device *dev) 4360 { 4361 if (dev_is_pci(dev)) 4362 return pci_device_group(dev); 4363 return generic_device_group(dev); 4364 } 4365 4366 static int intel_iommu_enable_sva(struct device *dev) 4367 { 4368 struct device_domain_info *info = dev_iommu_priv_get(dev); 4369 struct intel_iommu *iommu; 4370 4371 if (!info || dmar_disabled) 4372 return -EINVAL; 4373 4374 iommu = info->iommu; 4375 if (!iommu) 4376 return -EINVAL; 4377 4378 if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE)) 4379 return -ENODEV; 4380 4381 if (!info->pasid_enabled || !info->ats_enabled) 4382 return -EINVAL; 4383 4384 /* 4385 * Devices having device-specific I/O fault handling should not 4386 * support PCI/PRI. The IOMMU side has no means to check the 4387 * capability of device-specific IOPF. Therefore, IOMMU can only 4388 * default that if the device driver enables SVA on a non-PRI 4389 * device, it will handle IOPF in its own way. 4390 */ 4391 if (!info->pri_supported) 4392 return 0; 4393 4394 /* Devices supporting PRI should have it enabled. */ 4395 if (!info->pri_enabled) 4396 return -EINVAL; 4397 4398 return 0; 4399 } 4400 4401 static int intel_iommu_enable_iopf(struct device *dev) 4402 { 4403 struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL; 4404 struct device_domain_info *info = dev_iommu_priv_get(dev); 4405 struct intel_iommu *iommu; 4406 int ret; 4407 4408 if (!pdev || !info || !info->ats_enabled || !info->pri_supported) 4409 return -ENODEV; 4410 4411 if (info->pri_enabled) 4412 return -EBUSY; 4413 4414 iommu = info->iommu; 4415 if (!iommu) 4416 return -EINVAL; 4417 4418 /* PASID is required in PRG Response Message. 
*/ 4419 if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev)) 4420 return -EINVAL; 4421 4422 ret = pci_reset_pri(pdev); 4423 if (ret) 4424 return ret; 4425 4426 ret = iopf_queue_add_device(iommu->iopf_queue, dev); 4427 if (ret) 4428 return ret; 4429 4430 ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev); 4431 if (ret) 4432 goto iopf_remove_device; 4433 4434 ret = pci_enable_pri(pdev, PRQ_DEPTH); 4435 if (ret) 4436 goto iopf_unregister_handler; 4437 info->pri_enabled = 1; 4438 4439 return 0; 4440 4441 iopf_unregister_handler: 4442 iommu_unregister_device_fault_handler(dev); 4443 iopf_remove_device: 4444 iopf_queue_remove_device(iommu->iopf_queue, dev); 4445 4446 return ret; 4447 } 4448 4449 static int intel_iommu_disable_iopf(struct device *dev) 4450 { 4451 struct device_domain_info *info = dev_iommu_priv_get(dev); 4452 struct intel_iommu *iommu = info->iommu; 4453 4454 if (!info->pri_enabled) 4455 return -EINVAL; 4456 4457 /* 4458 * PCIe spec states that by clearing PRI enable bit, the Page 4459 * Request Interface will not issue new page requests, but has 4460 * outstanding page requests that have been transmitted or are 4461 * queued for transmission. This is supposed to be called after 4462 * the device driver has stopped DMA, all PASIDs have been 4463 * unbound and the outstanding PRQs have been drained. 4464 */ 4465 pci_disable_pri(to_pci_dev(dev)); 4466 info->pri_enabled = 0; 4467 4468 /* 4469 * With PRI disabled and outstanding PRQs drained, unregistering 4470 * fault handler and removing device from iopf queue should never 4471 * fail. 4472 */ 4473 WARN_ON(iommu_unregister_device_fault_handler(dev)); 4474 WARN_ON(iopf_queue_remove_device(iommu->iopf_queue, dev)); 4475 4476 return 0; 4477 } 4478 4479 static int 4480 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat) 4481 { 4482 switch (feat) { 4483 case IOMMU_DEV_FEAT_IOPF: 4484 return intel_iommu_enable_iopf(dev); 4485 4486 case IOMMU_DEV_FEAT_SVA: 4487 return intel_iommu_enable_sva(dev); 4488 4489 default: 4490 return -ENODEV; 4491 } 4492 } 4493 4494 static int 4495 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat) 4496 { 4497 switch (feat) { 4498 case IOMMU_DEV_FEAT_IOPF: 4499 return intel_iommu_disable_iopf(dev); 4500 4501 case IOMMU_DEV_FEAT_SVA: 4502 return 0; 4503 4504 default: 4505 return -ENODEV; 4506 } 4507 } 4508 4509 static bool intel_iommu_is_attach_deferred(struct device *dev) 4510 { 4511 struct device_domain_info *info = dev_iommu_priv_get(dev); 4512 4513 return translation_pre_enabled(info->iommu) && !info->domain; 4514 } 4515 4516 /* 4517 * Check that the device does not live on an external facing PCI port that is 4518 * marked as untrusted. Such devices should not be able to apply quirks and 4519 * thus not be able to bypass the IOMMU restrictions. 
4520 */ 4521 static bool risky_device(struct pci_dev *pdev) 4522 { 4523 if (pdev->untrusted) { 4524 pci_info(pdev, 4525 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n", 4526 pdev->vendor, pdev->device); 4527 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n"); 4528 return true; 4529 } 4530 return false; 4531 } 4532 4533 static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain, 4534 unsigned long iova, size_t size) 4535 { 4536 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4537 unsigned long pages = aligned_nrpages(iova, size); 4538 unsigned long pfn = iova >> VTD_PAGE_SHIFT; 4539 struct iommu_domain_info *info; 4540 unsigned long i; 4541 4542 xa_for_each(&dmar_domain->iommu_array, i, info) 4543 __mapping_notify_one(info->iommu, dmar_domain, pfn, pages); 4544 return 0; 4545 } 4546 4547 static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid) 4548 { 4549 struct device_domain_info *info = dev_iommu_priv_get(dev); 4550 struct dev_pasid_info *curr, *dev_pasid = NULL; 4551 struct intel_iommu *iommu = info->iommu; 4552 struct dmar_domain *dmar_domain; 4553 struct iommu_domain *domain; 4554 unsigned long flags; 4555 4556 domain = iommu_get_domain_for_dev_pasid(dev, pasid, 0); 4557 if (WARN_ON_ONCE(!domain)) 4558 goto out_tear_down; 4559 4560 /* 4561 * The SVA implementation needs to handle its own stuffs like the mm 4562 * notification. Before consolidating that code into iommu core, let 4563 * the intel sva code handle it. 4564 */ 4565 if (domain->type == IOMMU_DOMAIN_SVA) { 4566 intel_svm_remove_dev_pasid(dev, pasid); 4567 goto out_tear_down; 4568 } 4569 4570 dmar_domain = to_dmar_domain(domain); 4571 spin_lock_irqsave(&dmar_domain->lock, flags); 4572 list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) { 4573 if (curr->dev == dev && curr->pasid == pasid) { 4574 list_del(&curr->link_domain); 4575 dev_pasid = curr; 4576 break; 4577 } 4578 } 4579 WARN_ON_ONCE(!dev_pasid); 4580 spin_unlock_irqrestore(&dmar_domain->lock, flags); 4581 4582 domain_detach_iommu(dmar_domain, iommu); 4583 intel_iommu_debugfs_remove_dev_pasid(dev_pasid); 4584 kfree(dev_pasid); 4585 out_tear_down: 4586 intel_pasid_tear_down_entry(iommu, dev, pasid, false); 4587 intel_drain_pasid_prq(dev, pasid); 4588 } 4589 4590 static int intel_iommu_set_dev_pasid(struct iommu_domain *domain, 4591 struct device *dev, ioasid_t pasid) 4592 { 4593 struct device_domain_info *info = dev_iommu_priv_get(dev); 4594 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4595 struct intel_iommu *iommu = info->iommu; 4596 struct dev_pasid_info *dev_pasid; 4597 unsigned long flags; 4598 int ret; 4599 4600 if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev)) 4601 return -EOPNOTSUPP; 4602 4603 if (domain->dirty_ops) 4604 return -EINVAL; 4605 4606 if (context_copied(iommu, info->bus, info->devfn)) 4607 return -EBUSY; 4608 4609 ret = prepare_domain_attach_device(domain, dev); 4610 if (ret) 4611 return ret; 4612 4613 dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL); 4614 if (!dev_pasid) 4615 return -ENOMEM; 4616 4617 ret = domain_attach_iommu(dmar_domain, iommu); 4618 if (ret) 4619 goto out_free; 4620 4621 if (domain_type_is_si(dmar_domain)) 4622 ret = intel_pasid_setup_pass_through(iommu, dev, pasid); 4623 else if (dmar_domain->use_first_level) 4624 ret = domain_setup_first_level(iommu, dmar_domain, 4625 dev, pasid); 4626 else 4627 ret = intel_pasid_setup_second_level(iommu, dmar_domain, 4628 dev, pasid); 4629 if (ret) 4630 goto out_detach_iommu; 
4631 4632 dev_pasid->dev = dev; 4633 dev_pasid->pasid = pasid; 4634 spin_lock_irqsave(&dmar_domain->lock, flags); 4635 list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids); 4636 spin_unlock_irqrestore(&dmar_domain->lock, flags); 4637 4638 if (domain->type & __IOMMU_DOMAIN_PAGING) 4639 intel_iommu_debugfs_create_dev_pasid(dev_pasid); 4640 4641 return 0; 4642 out_detach_iommu: 4643 domain_detach_iommu(dmar_domain, iommu); 4644 out_free: 4645 kfree(dev_pasid); 4646 return ret; 4647 } 4648 4649 static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type) 4650 { 4651 struct device_domain_info *info = dev_iommu_priv_get(dev); 4652 struct intel_iommu *iommu = info->iommu; 4653 struct iommu_hw_info_vtd *vtd; 4654 4655 vtd = kzalloc(sizeof(*vtd), GFP_KERNEL); 4656 if (!vtd) 4657 return ERR_PTR(-ENOMEM); 4658 4659 vtd->flags = IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17; 4660 vtd->cap_reg = iommu->cap; 4661 vtd->ecap_reg = iommu->ecap; 4662 *length = sizeof(*vtd); 4663 *type = IOMMU_HW_INFO_TYPE_INTEL_VTD; 4664 return vtd; 4665 } 4666 4667 static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain, 4668 bool enable) 4669 { 4670 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4671 struct device_domain_info *info; 4672 int ret; 4673 4674 spin_lock(&dmar_domain->lock); 4675 if (dmar_domain->dirty_tracking == enable) 4676 goto out_unlock; 4677 4678 list_for_each_entry(info, &dmar_domain->devices, link) { 4679 ret = intel_pasid_setup_dirty_tracking(info->iommu, 4680 info->domain, info->dev, 4681 IOMMU_NO_PASID, enable); 4682 if (ret) 4683 goto err_unwind; 4684 } 4685 4686 dmar_domain->dirty_tracking = enable; 4687 out_unlock: 4688 spin_unlock(&dmar_domain->lock); 4689 4690 return 0; 4691 4692 err_unwind: 4693 list_for_each_entry(info, &dmar_domain->devices, link) 4694 intel_pasid_setup_dirty_tracking(info->iommu, dmar_domain, 4695 info->dev, IOMMU_NO_PASID, 4696 dmar_domain->dirty_tracking); 4697 spin_unlock(&dmar_domain->lock); 4698 return ret; 4699 } 4700 4701 static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain, 4702 unsigned long iova, size_t size, 4703 unsigned long flags, 4704 struct iommu_dirty_bitmap *dirty) 4705 { 4706 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4707 unsigned long end = iova + size - 1; 4708 unsigned long pgsize; 4709 4710 /* 4711 * IOMMUFD core calls into a dirty tracking disabled domain without an 4712 * IOVA bitmap set in order to clean dirty bits in all PTEs that might 4713 * have occurred when we stopped dirty tracking. This ensures that we 4714 * never inherit dirtied bits from a previous cycle. 
4715 */ 4716 if (!dmar_domain->dirty_tracking && dirty->bitmap) 4717 return -EINVAL; 4718 4719 do { 4720 struct dma_pte *pte; 4721 int lvl = 0; 4722 4723 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl, 4724 GFP_ATOMIC); 4725 pgsize = level_size(lvl) << VTD_PAGE_SHIFT; 4726 if (!pte || !dma_pte_present(pte)) { 4727 iova += pgsize; 4728 continue; 4729 } 4730 4731 if (dma_sl_pte_test_and_clear_dirty(pte, flags)) 4732 iommu_dirty_bitmap_record(dirty, iova, pgsize); 4733 iova += pgsize; 4734 } while (iova < end); 4735 4736 return 0; 4737 } 4738 4739 static const struct iommu_dirty_ops intel_dirty_ops = { 4740 .set_dirty_tracking = intel_iommu_set_dirty_tracking, 4741 .read_and_clear_dirty = intel_iommu_read_and_clear_dirty, 4742 }; 4743 4744 const struct iommu_ops intel_iommu_ops = { 4745 .blocked_domain = &blocking_domain, 4746 .capable = intel_iommu_capable, 4747 .hw_info = intel_iommu_hw_info, 4748 .domain_alloc = intel_iommu_domain_alloc, 4749 .domain_alloc_user = intel_iommu_domain_alloc_user, 4750 .probe_device = intel_iommu_probe_device, 4751 .probe_finalize = intel_iommu_probe_finalize, 4752 .release_device = intel_iommu_release_device, 4753 .get_resv_regions = intel_iommu_get_resv_regions, 4754 .device_group = intel_iommu_device_group, 4755 .dev_enable_feat = intel_iommu_dev_enable_feat, 4756 .dev_disable_feat = intel_iommu_dev_disable_feat, 4757 .is_attach_deferred = intel_iommu_is_attach_deferred, 4758 .def_domain_type = device_def_domain_type, 4759 .remove_dev_pasid = intel_iommu_remove_dev_pasid, 4760 .pgsize_bitmap = SZ_4K, 4761 #ifdef CONFIG_INTEL_IOMMU_SVM 4762 .page_response = intel_svm_page_response, 4763 #endif 4764 .default_domain_ops = &(const struct iommu_domain_ops) { 4765 .attach_dev = intel_iommu_attach_device, 4766 .set_dev_pasid = intel_iommu_set_dev_pasid, 4767 .map_pages = intel_iommu_map_pages, 4768 .unmap_pages = intel_iommu_unmap_pages, 4769 .iotlb_sync_map = intel_iommu_iotlb_sync_map, 4770 .flush_iotlb_all = intel_flush_iotlb_all, 4771 .iotlb_sync = intel_iommu_tlb_sync, 4772 .iova_to_phys = intel_iommu_iova_to_phys, 4773 .free = intel_iommu_domain_free, 4774 .enforce_cache_coherency = intel_iommu_enforce_cache_coherency, 4775 } 4776 }; 4777 4778 static void quirk_iommu_igfx(struct pci_dev *dev) 4779 { 4780 if (risky_device(dev)) 4781 return; 4782 4783 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n"); 4784 dmar_map_gfx = 0; 4785 } 4786 4787 /* G4x/GM45 integrated gfx dmar support is totally busted. 
*/ 4788 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx); 4789 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx); 4790 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx); 4791 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx); 4792 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx); 4793 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx); 4794 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx); 4795 4796 /* Broadwell igfx malfunctions with dmar */ 4797 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx); 4798 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx); 4799 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx); 4800 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx); 4801 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx); 4802 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx); 4803 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx); 4804 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx); 4805 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx); 4806 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx); 4807 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx); 4808 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx); 4809 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx); 4810 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx); 4811 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx); 4812 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx); 4813 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx); 4814 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx); 4815 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx); 4816 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx); 4817 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx); 4818 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx); 4819 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx); 4820 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx); 4821 4822 static void quirk_iommu_rwbf(struct pci_dev *dev) 4823 { 4824 if (risky_device(dev)) 4825 return; 4826 4827 /* 4828 * Mobile 4 Series Chipset neglects to set RWBF capability, 4829 * but needs it. Same seems to hold for the desktop versions. 
4830 */ 4831 pci_info(dev, "Forcing write-buffer flush capability\n"); 4832 rwbf_quirk = 1; 4833 } 4834 4835 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf); 4836 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf); 4837 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf); 4838 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf); 4839 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf); 4840 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf); 4841 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf); 4842 4843 #define GGC 0x52 4844 #define GGC_MEMORY_SIZE_MASK (0xf << 8) 4845 #define GGC_MEMORY_SIZE_NONE (0x0 << 8) 4846 #define GGC_MEMORY_SIZE_1M (0x1 << 8) 4847 #define GGC_MEMORY_SIZE_2M (0x3 << 8) 4848 #define GGC_MEMORY_VT_ENABLED (0x8 << 8) 4849 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8) 4850 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8) 4851 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8) 4852 4853 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev) 4854 { 4855 unsigned short ggc; 4856 4857 if (risky_device(dev)) 4858 return; 4859 4860 if (pci_read_config_word(dev, GGC, &ggc)) 4861 return; 4862 4863 if (!(ggc & GGC_MEMORY_VT_ENABLED)) { 4864 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n"); 4865 dmar_map_gfx = 0; 4866 } else if (dmar_map_gfx) { 4867 /* we have to ensure the gfx device is idle before we flush */ 4868 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n"); 4869 iommu_set_dma_strict(); 4870 } 4871 } 4872 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt); 4873 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt); 4874 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt); 4875 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt); 4876 4877 static void quirk_igfx_skip_te_disable(struct pci_dev *dev) 4878 { 4879 unsigned short ver; 4880 4881 if (!IS_GFX_DEVICE(dev)) 4882 return; 4883 4884 ver = (dev->device >> 8) & 0xff; 4885 if (ver != 0x45 && ver != 0x46 && ver != 0x4c && 4886 ver != 0x4e && ver != 0x8a && ver != 0x98 && 4887 ver != 0x9a && ver != 0xa7 && ver != 0x7d) 4888 return; 4889 4890 if (risky_device(dev)) 4891 return; 4892 4893 pci_info(dev, "Skip IOMMU disabling for graphics\n"); 4894 iommu_skip_te_disable = 1; 4895 } 4896 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable); 4897 4898 /* On Tylersburg chipsets, some BIOSes have been known to enable the 4899 ISOCH DMAR unit for the Azalia sound device, but not give it any 4900 TLB entries, which causes it to deadlock. Check for that. We do 4901 this in a function called from init_dmars(), instead of in a PCI 4902 quirk, because we don't want to print the obnoxious "BIOS broken" 4903 message if VT-d is actually disabled. 4904 */ 4905 static void __init check_tylersburg_isoch(void) 4906 { 4907 struct pci_dev *pdev; 4908 uint32_t vtisochctrl; 4909 4910 /* If there's no Azalia in the system anyway, forget it. */ 4911 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL); 4912 if (!pdev) 4913 return; 4914 4915 if (risky_device(pdev)) { 4916 pci_dev_put(pdev); 4917 return; 4918 } 4919 4920 pci_dev_put(pdev); 4921 4922 /* System Management Registers. Might be hidden, in which case 4923 we can't do the sanity check. 
But that's OK, because the 4924 known-broken BIOSes _don't_ actually hide it, so far. */ 4925 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL); 4926 if (!pdev) 4927 return; 4928 4929 if (risky_device(pdev)) { 4930 pci_dev_put(pdev); 4931 return; 4932 } 4933 4934 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) { 4935 pci_dev_put(pdev); 4936 return; 4937 } 4938 4939 pci_dev_put(pdev); 4940 4941 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */ 4942 if (vtisochctrl & 1) 4943 return; 4944 4945 /* Drop all bits other than the number of TLB entries */ 4946 vtisochctrl &= 0x1c; 4947 4948 /* If we have the recommended number of TLB entries (16), fine. */ 4949 if (vtisochctrl == 0x10) 4950 return; 4951 4952 /* Zero TLB entries? You get to ride the short bus to school. */ 4953 if (!vtisochctrl) { 4954 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n" 4955 "BIOS vendor: %s; Ver: %s; Product Version: %s\n", 4956 dmi_get_system_info(DMI_BIOS_VENDOR), 4957 dmi_get_system_info(DMI_BIOS_VERSION), 4958 dmi_get_system_info(DMI_PRODUCT_VERSION)); 4959 iommu_identity_mapping |= IDENTMAP_AZALIA; 4960 return; 4961 } 4962 4963 pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n", 4964 vtisochctrl); 4965 } 4966 4967 /* 4968 * Here we deal with a device TLB defect where device may inadvertently issue ATS 4969 * invalidation completion before posted writes initiated with translated address 4970 * that utilized translations matching the invalidation address range, violating 4971 * the invalidation completion ordering. 4972 * Therefore, any use cases that cannot guarantee DMA is stopped before unmap is 4973 * vulnerable to this defect. In other words, any dTLB invalidation initiated not 4974 * under the control of the trusted/privileged host device driver must use this 4975 * quirk. 4976 * Device TLBs are invalidated under the following six conditions: 4977 * 1. Device driver does DMA API unmap IOVA 4978 * 2. Device driver unbind a PASID from a process, sva_unbind_device() 4979 * 3. PASID is torn down, after PASID cache is flushed. e.g. process 4980 * exit_mmap() due to crash 4981 * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where 4982 * VM has to free pages that were unmapped 4983 * 5. Userspace driver unmaps a DMA buffer 4984 * 6. Cache invalidation in vSVA usage (upcoming) 4985 * 4986 * For #1 and #2, device drivers are responsible for stopping DMA traffic 4987 * before unmap/unbind. For #3, iommu driver gets mmu_notifier to 4988 * invalidate TLB the same way as normal user unmap which will use this quirk. 4989 * The dTLB invalidation after PASID cache flush does not need this quirk. 4990 * 4991 * As a reminder, #6 will *NEED* this quirk as we enable nested translation. 4992 */ 4993 void quirk_extra_dev_tlb_flush(struct device_domain_info *info, 4994 unsigned long address, unsigned long mask, 4995 u32 pasid, u16 qdep) 4996 { 4997 u16 sid; 4998 4999 if (likely(!info->dtlb_extra_inval)) 5000 return; 5001 5002 sid = PCI_DEVID(info->bus, info->devfn); 5003 if (pasid == IOMMU_NO_PASID) { 5004 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid, 5005 qdep, address, mask); 5006 } else { 5007 qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid, 5008 pasid, qdep, address, mask); 5009 } 5010 } 5011 5012 #define ecmd_get_status_code(res) (((res) & 0xff) >> 1) 5013 5014 /* 5015 * Function to submit a command to the enhanced command interface. 
The 5016 * valid enhanced command descriptions are defined in Table 47 of the 5017 * VT-d spec. The VT-d hardware implementation may support some but not 5018 * all commands, which can be determined by checking the Enhanced 5019 * Command Capability Register. 5020 * 5021 * Return values: 5022 * - 0: Command successful without any error; 5023 * - Negative: software error value; 5024 * - Nonzero positive: failure status code defined in Table 48. 5025 */ 5026 int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob) 5027 { 5028 unsigned long flags; 5029 u64 res; 5030 int ret; 5031 5032 if (!cap_ecmds(iommu->cap)) 5033 return -ENODEV; 5034 5035 raw_spin_lock_irqsave(&iommu->register_lock, flags); 5036 5037 res = dmar_readq(iommu->reg + DMAR_ECRSP_REG); 5038 if (res & DMA_ECMD_ECRSP_IP) { 5039 ret = -EBUSY; 5040 goto err; 5041 } 5042 5043 /* 5044 * Unconditionally write the operand B, because 5045 * - There is no side effect if an ecmd doesn't require an 5046 * operand B, but we set the register to some value. 5047 * - It's not invoked in any critical path. The extra MMIO 5048 * write doesn't bring any performance concerns. 5049 */ 5050 dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob); 5051 dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT)); 5052 5053 IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq, 5054 !(res & DMA_ECMD_ECRSP_IP), res); 5055 5056 if (res & DMA_ECMD_ECRSP_IP) { 5057 ret = -ETIMEDOUT; 5058 goto err; 5059 } 5060 5061 ret = ecmd_get_status_code(res); 5062 err: 5063 raw_spin_unlock_irqrestore(&iommu->register_lock, flags); 5064 5065 return ret; 5066 } 5067