1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright © 2006-2014 Intel Corporation. 4 * 5 * Authors: David Woodhouse <dwmw2@infradead.org>, 6 * Ashok Raj <ashok.raj@intel.com>, 7 * Shaohua Li <shaohua.li@intel.com>, 8 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>, 9 * Fenghua Yu <fenghua.yu@intel.com> 10 * Joerg Roedel <jroedel@suse.de> 11 */ 12 13 #define pr_fmt(fmt) "DMAR: " fmt 14 #define dev_fmt(fmt) pr_fmt(fmt) 15 16 #include <linux/crash_dump.h> 17 #include <linux/dma-direct.h> 18 #include <linux/dmi.h> 19 #include <linux/intel-svm.h> 20 #include <linux/memory.h> 21 #include <linux/pci.h> 22 #include <linux/pci-ats.h> 23 #include <linux/spinlock.h> 24 #include <linux/syscore_ops.h> 25 #include <linux/tboot.h> 26 27 #include "iommu.h" 28 #include "../dma-iommu.h" 29 #include "../irq_remapping.h" 30 #include "../iommu-sva.h" 31 #include "pasid.h" 32 #include "cap_audit.h" 33 34 #define ROOT_SIZE VTD_PAGE_SIZE 35 #define CONTEXT_SIZE VTD_PAGE_SIZE 36 37 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY) 38 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB) 39 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) 40 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e) 41 42 #define IOAPIC_RANGE_START (0xfee00000) 43 #define IOAPIC_RANGE_END (0xfeefffff) 44 #define IOVA_START_ADDR (0x1000) 45 46 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57 47 48 #define MAX_AGAW_WIDTH 64 49 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT) 50 51 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1) 52 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1) 53 54 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR 55 to match. That way, we can use 'unsigned long' for PFNs with impunity. */ 56 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \ 57 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1)) 58 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT) 59 60 /* IO virtual address start page frame number */ 61 #define IOVA_START_PFN (1) 62 63 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT) 64 65 /* page table handling */ 66 #define LEVEL_STRIDE (9) 67 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1) 68 69 static inline int agaw_to_level(int agaw) 70 { 71 return agaw + 2; 72 } 73 74 static inline int agaw_to_width(int agaw) 75 { 76 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH); 77 } 78 79 static inline int width_to_agaw(int width) 80 { 81 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE); 82 } 83 84 static inline unsigned int level_to_offset_bits(int level) 85 { 86 return (level - 1) * LEVEL_STRIDE; 87 } 88 89 static inline int pfn_level_offset(u64 pfn, int level) 90 { 91 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK; 92 } 93 94 static inline u64 level_mask(int level) 95 { 96 return -1ULL << level_to_offset_bits(level); 97 } 98 99 static inline u64 level_size(int level) 100 { 101 return 1ULL << level_to_offset_bits(level); 102 } 103 104 static inline u64 align_to_level(u64 pfn, int level) 105 { 106 return (pfn + level_size(level) - 1) & level_mask(level); 107 } 108 109 static inline unsigned long lvl_to_nr_pages(unsigned int lvl) 110 { 111 return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH); 112 } 113 114 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things 115 are never going to work. 
*/ 116 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn) 117 { 118 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT); 119 } 120 static inline unsigned long page_to_dma_pfn(struct page *pg) 121 { 122 return mm_to_dma_pfn(page_to_pfn(pg)); 123 } 124 static inline unsigned long virt_to_dma_pfn(void *p) 125 { 126 return page_to_dma_pfn(virt_to_page(p)); 127 } 128 129 static void __init check_tylersburg_isoch(void); 130 static int rwbf_quirk; 131 132 /* 133 * set to 1 to panic kernel if can't successfully enable VT-d 134 * (used when kernel is launched w/ TXT) 135 */ 136 static int force_on = 0; 137 static int intel_iommu_tboot_noforce; 138 static int no_platform_optin; 139 140 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry)) 141 142 /* 143 * Take a root_entry and return the Lower Context Table Pointer (LCTP) 144 * if marked present. 145 */ 146 static phys_addr_t root_entry_lctp(struct root_entry *re) 147 { 148 if (!(re->lo & 1)) 149 return 0; 150 151 return re->lo & VTD_PAGE_MASK; 152 } 153 154 /* 155 * Take a root_entry and return the Upper Context Table Pointer (UCTP) 156 * if marked present. 157 */ 158 static phys_addr_t root_entry_uctp(struct root_entry *re) 159 { 160 if (!(re->hi & 1)) 161 return 0; 162 163 return re->hi & VTD_PAGE_MASK; 164 } 165 166 static inline void context_set_present(struct context_entry *context) 167 { 168 context->lo |= 1; 169 } 170 171 static inline void context_set_fault_enable(struct context_entry *context) 172 { 173 context->lo &= (((u64)-1) << 2) | 1; 174 } 175 176 static inline void context_set_translation_type(struct context_entry *context, 177 unsigned long value) 178 { 179 context->lo &= (((u64)-1) << 4) | 3; 180 context->lo |= (value & 3) << 2; 181 } 182 183 static inline void context_set_address_root(struct context_entry *context, 184 unsigned long value) 185 { 186 context->lo &= ~VTD_PAGE_MASK; 187 context->lo |= value & VTD_PAGE_MASK; 188 } 189 190 static inline void context_set_address_width(struct context_entry *context, 191 unsigned long value) 192 { 193 context->hi |= value & 7; 194 } 195 196 static inline void context_set_domain_id(struct context_entry *context, 197 unsigned long value) 198 { 199 context->hi |= (value & ((1 << 16) - 1)) << 8; 200 } 201 202 static inline void context_set_pasid(struct context_entry *context) 203 { 204 context->lo |= CONTEXT_PASIDE; 205 } 206 207 static inline int context_domain_id(struct context_entry *c) 208 { 209 return((c->hi >> 8) & 0xffff); 210 } 211 212 static inline void context_clear_entry(struct context_entry *context) 213 { 214 context->lo = 0; 215 context->hi = 0; 216 } 217 218 static inline bool context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn) 219 { 220 if (!iommu->copied_tables) 221 return false; 222 223 return test_bit(((long)bus << 8) | devfn, iommu->copied_tables); 224 } 225 226 static inline void 227 set_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn) 228 { 229 set_bit(((long)bus << 8) | devfn, iommu->copied_tables); 230 } 231 232 static inline void 233 clear_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn) 234 { 235 clear_bit(((long)bus << 8) | devfn, iommu->copied_tables); 236 } 237 238 /* 239 * This domain is a statically identity mapping domain. 240 * 1. This domain creats a static 1:1 mapping to all usable memory. 241 * 2. It maps to each iommu if successful. 242 * 3. Each iommu mapps to this domain if successful. 
243 */ 244 static struct dmar_domain *si_domain; 245 static int hw_pass_through = 1; 246 247 struct dmar_rmrr_unit { 248 struct list_head list; /* list of rmrr units */ 249 struct acpi_dmar_header *hdr; /* ACPI header */ 250 u64 base_address; /* reserved base address*/ 251 u64 end_address; /* reserved end address */ 252 struct dmar_dev_scope *devices; /* target devices */ 253 int devices_cnt; /* target device count */ 254 }; 255 256 struct dmar_atsr_unit { 257 struct list_head list; /* list of ATSR units */ 258 struct acpi_dmar_header *hdr; /* ACPI header */ 259 struct dmar_dev_scope *devices; /* target devices */ 260 int devices_cnt; /* target device count */ 261 u8 include_all:1; /* include all ports */ 262 }; 263 264 struct dmar_satc_unit { 265 struct list_head list; /* list of SATC units */ 266 struct acpi_dmar_header *hdr; /* ACPI header */ 267 struct dmar_dev_scope *devices; /* target devices */ 268 struct intel_iommu *iommu; /* the corresponding iommu */ 269 int devices_cnt; /* target device count */ 270 u8 atc_required:1; /* ATS is required */ 271 }; 272 273 static LIST_HEAD(dmar_atsr_units); 274 static LIST_HEAD(dmar_rmrr_units); 275 static LIST_HEAD(dmar_satc_units); 276 277 #define for_each_rmrr_units(rmrr) \ 278 list_for_each_entry(rmrr, &dmar_rmrr_units, list) 279 280 static void device_block_translation(struct device *dev); 281 static void intel_iommu_domain_free(struct iommu_domain *domain); 282 283 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON); 284 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON); 285 286 int intel_iommu_enabled = 0; 287 EXPORT_SYMBOL_GPL(intel_iommu_enabled); 288 289 static int dmar_map_gfx = 1; 290 static int intel_iommu_superpage = 1; 291 static int iommu_identity_mapping; 292 static int iommu_skip_te_disable; 293 294 #define IDENTMAP_GFX 2 295 #define IDENTMAP_AZALIA 4 296 297 const struct iommu_ops intel_iommu_ops; 298 299 static bool translation_pre_enabled(struct intel_iommu *iommu) 300 { 301 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED); 302 } 303 304 static void clear_translation_pre_enabled(struct intel_iommu *iommu) 305 { 306 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED; 307 } 308 309 static void init_translation_status(struct intel_iommu *iommu) 310 { 311 u32 gsts; 312 313 gsts = readl(iommu->reg + DMAR_GSTS_REG); 314 if (gsts & DMA_GSTS_TES) 315 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED; 316 } 317 318 static int __init intel_iommu_setup(char *str) 319 { 320 if (!str) 321 return -EINVAL; 322 323 while (*str) { 324 if (!strncmp(str, "on", 2)) { 325 dmar_disabled = 0; 326 pr_info("IOMMU enabled\n"); 327 } else if (!strncmp(str, "off", 3)) { 328 dmar_disabled = 1; 329 no_platform_optin = 1; 330 pr_info("IOMMU disabled\n"); 331 } else if (!strncmp(str, "igfx_off", 8)) { 332 dmar_map_gfx = 0; 333 pr_info("Disable GFX device mapping\n"); 334 } else if (!strncmp(str, "forcedac", 8)) { 335 pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n"); 336 iommu_dma_forcedac = true; 337 } else if (!strncmp(str, "strict", 6)) { 338 pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n"); 339 iommu_set_dma_strict(); 340 } else if (!strncmp(str, "sp_off", 6)) { 341 pr_info("Disable supported super page\n"); 342 intel_iommu_superpage = 0; 343 } else if (!strncmp(str, "sm_on", 5)) { 344 pr_info("Enable scalable mode if hardware supports\n"); 345 intel_iommu_sm = 1; 346 } else if (!strncmp(str, "sm_off", 6)) { 347 pr_info("Scalable mode is disallowed\n"); 348 intel_iommu_sm = 0; 349 } 
else if (!strncmp(str, "tboot_noforce", 13)) { 350 pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n"); 351 intel_iommu_tboot_noforce = 1; 352 } else { 353 pr_notice("Unknown option - '%s'\n", str); 354 } 355 356 str += strcspn(str, ","); 357 while (*str == ',') 358 str++; 359 } 360 361 return 1; 362 } 363 __setup("intel_iommu=", intel_iommu_setup); 364 365 void *alloc_pgtable_page(int node) 366 { 367 struct page *page; 368 void *vaddr = NULL; 369 370 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0); 371 if (page) 372 vaddr = page_address(page); 373 return vaddr; 374 } 375 376 void free_pgtable_page(void *vaddr) 377 { 378 free_page((unsigned long)vaddr); 379 } 380 381 static inline int domain_type_is_si(struct dmar_domain *domain) 382 { 383 return domain->domain.type == IOMMU_DOMAIN_IDENTITY; 384 } 385 386 static inline int domain_pfn_supported(struct dmar_domain *domain, 387 unsigned long pfn) 388 { 389 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT; 390 391 return !(addr_width < BITS_PER_LONG && pfn >> addr_width); 392 } 393 394 /* 395 * Calculate the Supported Adjusted Guest Address Widths of an IOMMU. 396 * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of 397 * the returned SAGAW. 398 */ 399 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu) 400 { 401 unsigned long fl_sagaw, sl_sagaw; 402 403 fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0); 404 sl_sagaw = cap_sagaw(iommu->cap); 405 406 /* Second level only. */ 407 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) 408 return sl_sagaw; 409 410 /* First level only. */ 411 if (!ecap_slts(iommu->ecap)) 412 return fl_sagaw; 413 414 return fl_sagaw & sl_sagaw; 415 } 416 417 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw) 418 { 419 unsigned long sagaw; 420 int agaw; 421 422 sagaw = __iommu_calculate_sagaw(iommu); 423 for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) { 424 if (test_bit(agaw, &sagaw)) 425 break; 426 } 427 428 return agaw; 429 } 430 431 /* 432 * Calculate max SAGAW for each iommu. 433 */ 434 int iommu_calculate_max_sagaw(struct intel_iommu *iommu) 435 { 436 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH); 437 } 438 439 /* 440 * calculate agaw for each iommu. 441 * "SAGAW" may be different across iommus, use a default agaw, and 442 * get a supported less agaw for iommus that don't support the default agaw. 443 */ 444 int iommu_calculate_agaw(struct intel_iommu *iommu) 445 { 446 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH); 447 } 448 449 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu) 450 { 451 return sm_supported(iommu) ? 
452 ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap); 453 } 454 455 static void domain_update_iommu_coherency(struct dmar_domain *domain) 456 { 457 struct iommu_domain_info *info; 458 struct dmar_drhd_unit *drhd; 459 struct intel_iommu *iommu; 460 bool found = false; 461 unsigned long i; 462 463 domain->iommu_coherency = true; 464 xa_for_each(&domain->iommu_array, i, info) { 465 found = true; 466 if (!iommu_paging_structure_coherency(info->iommu)) { 467 domain->iommu_coherency = false; 468 break; 469 } 470 } 471 if (found) 472 return; 473 474 /* No hardware attached; use lowest common denominator */ 475 rcu_read_lock(); 476 for_each_active_iommu(iommu, drhd) { 477 if (!iommu_paging_structure_coherency(iommu)) { 478 domain->iommu_coherency = false; 479 break; 480 } 481 } 482 rcu_read_unlock(); 483 } 484 485 static int domain_update_iommu_superpage(struct dmar_domain *domain, 486 struct intel_iommu *skip) 487 { 488 struct dmar_drhd_unit *drhd; 489 struct intel_iommu *iommu; 490 int mask = 0x3; 491 492 if (!intel_iommu_superpage) 493 return 0; 494 495 /* set iommu_superpage to the smallest common denominator */ 496 rcu_read_lock(); 497 for_each_active_iommu(iommu, drhd) { 498 if (iommu != skip) { 499 if (domain && domain->use_first_level) { 500 if (!cap_fl1gp_support(iommu->cap)) 501 mask = 0x1; 502 } else { 503 mask &= cap_super_page_val(iommu->cap); 504 } 505 506 if (!mask) 507 break; 508 } 509 } 510 rcu_read_unlock(); 511 512 return fls(mask); 513 } 514 515 static int domain_update_device_node(struct dmar_domain *domain) 516 { 517 struct device_domain_info *info; 518 int nid = NUMA_NO_NODE; 519 unsigned long flags; 520 521 spin_lock_irqsave(&domain->lock, flags); 522 list_for_each_entry(info, &domain->devices, link) { 523 /* 524 * There could possibly be multiple device numa nodes as devices 525 * within the same domain may sit behind different IOMMUs. There 526 * isn't perfect answer in such situation, so we select first 527 * come first served policy. 528 */ 529 nid = dev_to_node(info->dev); 530 if (nid != NUMA_NO_NODE) 531 break; 532 } 533 spin_unlock_irqrestore(&domain->lock, flags); 534 535 return nid; 536 } 537 538 static void domain_update_iotlb(struct dmar_domain *domain); 539 540 /* Return the super pagesize bitmap if supported. */ 541 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain) 542 { 543 unsigned long bitmap = 0; 544 545 /* 546 * 1-level super page supports page size of 2MiB, 2-level super page 547 * supports page size of both 2MiB and 1GiB. 548 */ 549 if (domain->iommu_superpage == 1) 550 bitmap |= SZ_2M; 551 else if (domain->iommu_superpage == 2) 552 bitmap |= SZ_2M | SZ_1G; 553 554 return bitmap; 555 } 556 557 /* Some capabilities may be different across iommus */ 558 static void domain_update_iommu_cap(struct dmar_domain *domain) 559 { 560 domain_update_iommu_coherency(domain); 561 domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL); 562 563 /* 564 * If RHSA is missing, we should default to the device numa domain 565 * as fall back. 566 */ 567 if (domain->nid == NUMA_NO_NODE) 568 domain->nid = domain_update_device_node(domain); 569 570 /* 571 * First-level translation restricts the input-address to a 572 * canonical address (i.e., address bits 63:N have the same 573 * value as address bit [N-1], where N is 48-bits with 4-level 574 * paging and 57-bits with 5-level paging). Hence, skip bit 575 * [N-1]. 
576 */ 577 if (domain->use_first_level) 578 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1); 579 else 580 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw); 581 582 domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain); 583 domain_update_iotlb(domain); 584 } 585 586 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus, 587 u8 devfn, int alloc) 588 { 589 struct root_entry *root = &iommu->root_entry[bus]; 590 struct context_entry *context; 591 u64 *entry; 592 593 /* 594 * Except that the caller requested to allocate a new entry, 595 * returning a copied context entry makes no sense. 596 */ 597 if (!alloc && context_copied(iommu, bus, devfn)) 598 return NULL; 599 600 entry = &root->lo; 601 if (sm_supported(iommu)) { 602 if (devfn >= 0x80) { 603 devfn -= 0x80; 604 entry = &root->hi; 605 } 606 devfn *= 2; 607 } 608 if (*entry & 1) 609 context = phys_to_virt(*entry & VTD_PAGE_MASK); 610 else { 611 unsigned long phy_addr; 612 if (!alloc) 613 return NULL; 614 615 context = alloc_pgtable_page(iommu->node); 616 if (!context) 617 return NULL; 618 619 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE); 620 phy_addr = virt_to_phys((void *)context); 621 *entry = phy_addr | 1; 622 __iommu_flush_cache(iommu, entry, sizeof(*entry)); 623 } 624 return &context[devfn]; 625 } 626 627 /** 628 * is_downstream_to_pci_bridge - test if a device belongs to the PCI 629 * sub-hierarchy of a candidate PCI-PCI bridge 630 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy 631 * @bridge: the candidate PCI-PCI bridge 632 * 633 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false. 634 */ 635 static bool 636 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge) 637 { 638 struct pci_dev *pdev, *pbridge; 639 640 if (!dev_is_pci(dev) || !dev_is_pci(bridge)) 641 return false; 642 643 pdev = to_pci_dev(dev); 644 pbridge = to_pci_dev(bridge); 645 646 if (pbridge->subordinate && 647 pbridge->subordinate->number <= pdev->bus->number && 648 pbridge->subordinate->busn_res.end >= pdev->bus->number) 649 return true; 650 651 return false; 652 } 653 654 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev) 655 { 656 struct dmar_drhd_unit *drhd; 657 u32 vtbar; 658 int rc; 659 660 /* We know that this device on this chipset has its own IOMMU. 661 * If we find it under a different IOMMU, then the BIOS is lying 662 * to us. Hope that the IOMMU for this device is actually 663 * disabled, and it needs no translation... 
664 */ 665 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar); 666 if (rc) { 667 /* "can't" happen */ 668 dev_info(&pdev->dev, "failed to run vt-d quirk\n"); 669 return false; 670 } 671 vtbar &= 0xffff0000; 672 673 /* we know that the this iommu should be at offset 0xa000 from vtbar */ 674 drhd = dmar_find_matched_drhd_unit(pdev); 675 if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) { 676 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"); 677 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); 678 return true; 679 } 680 681 return false; 682 } 683 684 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev) 685 { 686 if (!iommu || iommu->drhd->ignored) 687 return true; 688 689 if (dev_is_pci(dev)) { 690 struct pci_dev *pdev = to_pci_dev(dev); 691 692 if (pdev->vendor == PCI_VENDOR_ID_INTEL && 693 pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB && 694 quirk_ioat_snb_local_iommu(pdev)) 695 return true; 696 } 697 698 return false; 699 } 700 701 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn) 702 { 703 struct dmar_drhd_unit *drhd = NULL; 704 struct pci_dev *pdev = NULL; 705 struct intel_iommu *iommu; 706 struct device *tmp; 707 u16 segment = 0; 708 int i; 709 710 if (!dev) 711 return NULL; 712 713 if (dev_is_pci(dev)) { 714 struct pci_dev *pf_pdev; 715 716 pdev = pci_real_dma_dev(to_pci_dev(dev)); 717 718 /* VFs aren't listed in scope tables; we need to look up 719 * the PF instead to find the IOMMU. */ 720 pf_pdev = pci_physfn(pdev); 721 dev = &pf_pdev->dev; 722 segment = pci_domain_nr(pdev->bus); 723 } else if (has_acpi_companion(dev)) 724 dev = &ACPI_COMPANION(dev)->dev; 725 726 rcu_read_lock(); 727 for_each_iommu(iommu, drhd) { 728 if (pdev && segment != drhd->segment) 729 continue; 730 731 for_each_active_dev_scope(drhd->devices, 732 drhd->devices_cnt, i, tmp) { 733 if (tmp == dev) { 734 /* For a VF use its original BDF# not that of the PF 735 * which we used for the IOMMU lookup. Strictly speaking 736 * we could do this for all PCI devices; we only need to 737 * get the BDF# from the scope table for ACPI matches. 
*/ 738 if (pdev && pdev->is_virtfn) 739 goto got_pdev; 740 741 if (bus && devfn) { 742 *bus = drhd->devices[i].bus; 743 *devfn = drhd->devices[i].devfn; 744 } 745 goto out; 746 } 747 748 if (is_downstream_to_pci_bridge(dev, tmp)) 749 goto got_pdev; 750 } 751 752 if (pdev && drhd->include_all) { 753 got_pdev: 754 if (bus && devfn) { 755 *bus = pdev->bus->number; 756 *devfn = pdev->devfn; 757 } 758 goto out; 759 } 760 } 761 iommu = NULL; 762 out: 763 if (iommu_is_dummy(iommu, dev)) 764 iommu = NULL; 765 766 rcu_read_unlock(); 767 768 return iommu; 769 } 770 771 static void domain_flush_cache(struct dmar_domain *domain, 772 void *addr, int size) 773 { 774 if (!domain->iommu_coherency) 775 clflush_cache_range(addr, size); 776 } 777 778 static void free_context_table(struct intel_iommu *iommu) 779 { 780 struct context_entry *context; 781 int i; 782 783 if (!iommu->root_entry) 784 return; 785 786 for (i = 0; i < ROOT_ENTRY_NR; i++) { 787 context = iommu_context_addr(iommu, i, 0, 0); 788 if (context) 789 free_pgtable_page(context); 790 791 if (!sm_supported(iommu)) 792 continue; 793 794 context = iommu_context_addr(iommu, i, 0x80, 0); 795 if (context) 796 free_pgtable_page(context); 797 } 798 799 free_pgtable_page(iommu->root_entry); 800 iommu->root_entry = NULL; 801 } 802 803 #ifdef CONFIG_DMAR_DEBUG 804 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn, 805 u8 bus, u8 devfn, struct dma_pte *parent, int level) 806 { 807 struct dma_pte *pte; 808 int offset; 809 810 while (1) { 811 offset = pfn_level_offset(pfn, level); 812 pte = &parent[offset]; 813 if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) { 814 pr_info("PTE not present at level %d\n", level); 815 break; 816 } 817 818 pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val); 819 820 if (level == 1) 821 break; 822 823 parent = phys_to_virt(dma_pte_addr(pte)); 824 level--; 825 } 826 } 827 828 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id, 829 unsigned long long addr, u32 pasid) 830 { 831 struct pasid_dir_entry *dir, *pde; 832 struct pasid_entry *entries, *pte; 833 struct context_entry *ctx_entry; 834 struct root_entry *rt_entry; 835 int i, dir_index, index, level; 836 u8 devfn = source_id & 0xff; 837 u8 bus = source_id >> 8; 838 struct dma_pte *pgtable; 839 840 pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr); 841 842 /* root entry dump */ 843 rt_entry = &iommu->root_entry[bus]; 844 if (!rt_entry) { 845 pr_info("root table entry is not present\n"); 846 return; 847 } 848 849 if (sm_supported(iommu)) 850 pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n", 851 rt_entry->hi, rt_entry->lo); 852 else 853 pr_info("root entry: 0x%016llx", rt_entry->lo); 854 855 /* context entry dump */ 856 ctx_entry = iommu_context_addr(iommu, bus, devfn, 0); 857 if (!ctx_entry) { 858 pr_info("context table entry is not present\n"); 859 return; 860 } 861 862 pr_info("context entry: hi 0x%016llx, low 0x%016llx\n", 863 ctx_entry->hi, ctx_entry->lo); 864 865 /* legacy mode does not require PASID entries */ 866 if (!sm_supported(iommu)) { 867 level = agaw_to_level(ctx_entry->hi & 7); 868 pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK); 869 goto pgtable_walk; 870 } 871 872 /* get the pointer to pasid directory entry */ 873 dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK); 874 if (!dir) { 875 pr_info("pasid directory entry is not present\n"); 876 return; 877 } 878 /* For request-without-pasid, get the pasid from context entry */ 879 if (intel_iommu_sm && pasid == 
INVALID_IOASID) 880 pasid = PASID_RID2PASID; 881 882 dir_index = pasid >> PASID_PDE_SHIFT; 883 pde = &dir[dir_index]; 884 pr_info("pasid dir entry: 0x%016llx\n", pde->val); 885 886 /* get the pointer to the pasid table entry */ 887 entries = get_pasid_table_from_pde(pde); 888 if (!entries) { 889 pr_info("pasid table entry is not present\n"); 890 return; 891 } 892 index = pasid & PASID_PTE_MASK; 893 pte = &entries[index]; 894 for (i = 0; i < ARRAY_SIZE(pte->val); i++) 895 pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]); 896 897 if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) { 898 level = pte->val[2] & BIT_ULL(2) ? 5 : 4; 899 pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK); 900 } else { 901 level = agaw_to_level((pte->val[0] >> 2) & 0x7); 902 pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK); 903 } 904 905 pgtable_walk: 906 pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level); 907 } 908 #endif 909 910 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain, 911 unsigned long pfn, int *target_level) 912 { 913 struct dma_pte *parent, *pte; 914 int level = agaw_to_level(domain->agaw); 915 int offset; 916 917 BUG_ON(!domain->pgd); 918 919 if (!domain_pfn_supported(domain, pfn)) 920 /* Address beyond IOMMU's addressing capabilities. */ 921 return NULL; 922 923 parent = domain->pgd; 924 925 while (1) { 926 void *tmp_page; 927 928 offset = pfn_level_offset(pfn, level); 929 pte = &parent[offset]; 930 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte))) 931 break; 932 if (level == *target_level) 933 break; 934 935 if (!dma_pte_present(pte)) { 936 uint64_t pteval; 937 938 tmp_page = alloc_pgtable_page(domain->nid); 939 940 if (!tmp_page) 941 return NULL; 942 943 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE); 944 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE; 945 if (domain->use_first_level) 946 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS; 947 948 if (cmpxchg64(&pte->val, 0ULL, pteval)) 949 /* Someone else set it while we were thinking; use theirs. 
*/ 950 free_pgtable_page(tmp_page); 951 else 952 domain_flush_cache(domain, pte, sizeof(*pte)); 953 } 954 if (level == 1) 955 break; 956 957 parent = phys_to_virt(dma_pte_addr(pte)); 958 level--; 959 } 960 961 if (!*target_level) 962 *target_level = level; 963 964 return pte; 965 } 966 967 /* return address's pte at specific level */ 968 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain, 969 unsigned long pfn, 970 int level, int *large_page) 971 { 972 struct dma_pte *parent, *pte; 973 int total = agaw_to_level(domain->agaw); 974 int offset; 975 976 parent = domain->pgd; 977 while (level <= total) { 978 offset = pfn_level_offset(pfn, total); 979 pte = &parent[offset]; 980 if (level == total) 981 return pte; 982 983 if (!dma_pte_present(pte)) { 984 *large_page = total; 985 break; 986 } 987 988 if (dma_pte_superpage(pte)) { 989 *large_page = total; 990 return pte; 991 } 992 993 parent = phys_to_virt(dma_pte_addr(pte)); 994 total--; 995 } 996 return NULL; 997 } 998 999 /* clear last level pte, a tlb flush should be followed */ 1000 static void dma_pte_clear_range(struct dmar_domain *domain, 1001 unsigned long start_pfn, 1002 unsigned long last_pfn) 1003 { 1004 unsigned int large_page; 1005 struct dma_pte *first_pte, *pte; 1006 1007 BUG_ON(!domain_pfn_supported(domain, start_pfn)); 1008 BUG_ON(!domain_pfn_supported(domain, last_pfn)); 1009 BUG_ON(start_pfn > last_pfn); 1010 1011 /* we don't need lock here; nobody else touches the iova range */ 1012 do { 1013 large_page = 1; 1014 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page); 1015 if (!pte) { 1016 start_pfn = align_to_level(start_pfn + 1, large_page + 1); 1017 continue; 1018 } 1019 do { 1020 dma_clear_pte(pte); 1021 start_pfn += lvl_to_nr_pages(large_page); 1022 pte++; 1023 } while (start_pfn <= last_pfn && !first_pte_in_page(pte)); 1024 1025 domain_flush_cache(domain, first_pte, 1026 (void *)pte - (void *)first_pte); 1027 1028 } while (start_pfn && start_pfn <= last_pfn); 1029 } 1030 1031 static void dma_pte_free_level(struct dmar_domain *domain, int level, 1032 int retain_level, struct dma_pte *pte, 1033 unsigned long pfn, unsigned long start_pfn, 1034 unsigned long last_pfn) 1035 { 1036 pfn = max(start_pfn, pfn); 1037 pte = &pte[pfn_level_offset(pfn, level)]; 1038 1039 do { 1040 unsigned long level_pfn; 1041 struct dma_pte *level_pte; 1042 1043 if (!dma_pte_present(pte) || dma_pte_superpage(pte)) 1044 goto next; 1045 1046 level_pfn = pfn & level_mask(level); 1047 level_pte = phys_to_virt(dma_pte_addr(pte)); 1048 1049 if (level > 2) { 1050 dma_pte_free_level(domain, level - 1, retain_level, 1051 level_pte, level_pfn, start_pfn, 1052 last_pfn); 1053 } 1054 1055 /* 1056 * Free the page table if we're below the level we want to 1057 * retain and the range covers the entire table. 1058 */ 1059 if (level < retain_level && !(start_pfn > level_pfn || 1060 last_pfn < level_pfn + level_size(level) - 1)) { 1061 dma_clear_pte(pte); 1062 domain_flush_cache(domain, pte, sizeof(*pte)); 1063 free_pgtable_page(level_pte); 1064 } 1065 next: 1066 pfn += level_size(level); 1067 } while (!first_pte_in_page(++pte) && pfn <= last_pfn); 1068 } 1069 1070 /* 1071 * clear last level (leaf) ptes and free page table pages below the 1072 * level we wish to keep intact. 
1073 */ 1074 static void dma_pte_free_pagetable(struct dmar_domain *domain, 1075 unsigned long start_pfn, 1076 unsigned long last_pfn, 1077 int retain_level) 1078 { 1079 dma_pte_clear_range(domain, start_pfn, last_pfn); 1080 1081 /* We don't need lock here; nobody else touches the iova range */ 1082 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level, 1083 domain->pgd, 0, start_pfn, last_pfn); 1084 1085 /* free pgd */ 1086 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) { 1087 free_pgtable_page(domain->pgd); 1088 domain->pgd = NULL; 1089 } 1090 } 1091 1092 /* When a page at a given level is being unlinked from its parent, we don't 1093 need to *modify* it at all. All we need to do is make a list of all the 1094 pages which can be freed just as soon as we've flushed the IOTLB and we 1095 know the hardware page-walk will no longer touch them. 1096 The 'pte' argument is the *parent* PTE, pointing to the page that is to 1097 be freed. */ 1098 static void dma_pte_list_pagetables(struct dmar_domain *domain, 1099 int level, struct dma_pte *pte, 1100 struct list_head *freelist) 1101 { 1102 struct page *pg; 1103 1104 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT); 1105 list_add_tail(&pg->lru, freelist); 1106 1107 if (level == 1) 1108 return; 1109 1110 pte = page_address(pg); 1111 do { 1112 if (dma_pte_present(pte) && !dma_pte_superpage(pte)) 1113 dma_pte_list_pagetables(domain, level - 1, pte, freelist); 1114 pte++; 1115 } while (!first_pte_in_page(pte)); 1116 } 1117 1118 static void dma_pte_clear_level(struct dmar_domain *domain, int level, 1119 struct dma_pte *pte, unsigned long pfn, 1120 unsigned long start_pfn, unsigned long last_pfn, 1121 struct list_head *freelist) 1122 { 1123 struct dma_pte *first_pte = NULL, *last_pte = NULL; 1124 1125 pfn = max(start_pfn, pfn); 1126 pte = &pte[pfn_level_offset(pfn, level)]; 1127 1128 do { 1129 unsigned long level_pfn = pfn & level_mask(level); 1130 1131 if (!dma_pte_present(pte)) 1132 goto next; 1133 1134 /* If range covers entire pagetable, free it */ 1135 if (start_pfn <= level_pfn && 1136 last_pfn >= level_pfn + level_size(level) - 1) { 1137 /* These suborbinate page tables are going away entirely. Don't 1138 bother to clear them; we're just going to *free* them. */ 1139 if (level > 1 && !dma_pte_superpage(pte)) 1140 dma_pte_list_pagetables(domain, level - 1, pte, freelist); 1141 1142 dma_clear_pte(pte); 1143 if (!first_pte) 1144 first_pte = pte; 1145 last_pte = pte; 1146 } else if (level > 1) { 1147 /* Recurse down into a level that isn't *entirely* obsolete */ 1148 dma_pte_clear_level(domain, level - 1, 1149 phys_to_virt(dma_pte_addr(pte)), 1150 level_pfn, start_pfn, last_pfn, 1151 freelist); 1152 } 1153 next: 1154 pfn = level_pfn + level_size(level); 1155 } while (!first_pte_in_page(++pte) && pfn <= last_pfn); 1156 1157 if (first_pte) 1158 domain_flush_cache(domain, first_pte, 1159 (void *)++last_pte - (void *)first_pte); 1160 } 1161 1162 /* We can't just free the pages because the IOMMU may still be walking 1163 the page tables, and may have cached the intermediate levels. The 1164 pages can only be freed after the IOTLB flush has been done. 
*/ 1165 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn, 1166 unsigned long last_pfn, struct list_head *freelist) 1167 { 1168 BUG_ON(!domain_pfn_supported(domain, start_pfn)); 1169 BUG_ON(!domain_pfn_supported(domain, last_pfn)); 1170 BUG_ON(start_pfn > last_pfn); 1171 1172 /* we don't need lock here; nobody else touches the iova range */ 1173 dma_pte_clear_level(domain, agaw_to_level(domain->agaw), 1174 domain->pgd, 0, start_pfn, last_pfn, freelist); 1175 1176 /* free pgd */ 1177 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) { 1178 struct page *pgd_page = virt_to_page(domain->pgd); 1179 list_add_tail(&pgd_page->lru, freelist); 1180 domain->pgd = NULL; 1181 } 1182 } 1183 1184 /* iommu handling */ 1185 static int iommu_alloc_root_entry(struct intel_iommu *iommu) 1186 { 1187 struct root_entry *root; 1188 1189 root = (struct root_entry *)alloc_pgtable_page(iommu->node); 1190 if (!root) { 1191 pr_err("Allocating root entry for %s failed\n", 1192 iommu->name); 1193 return -ENOMEM; 1194 } 1195 1196 __iommu_flush_cache(iommu, root, ROOT_SIZE); 1197 iommu->root_entry = root; 1198 1199 return 0; 1200 } 1201 1202 static void iommu_set_root_entry(struct intel_iommu *iommu) 1203 { 1204 u64 addr; 1205 u32 sts; 1206 unsigned long flag; 1207 1208 addr = virt_to_phys(iommu->root_entry); 1209 if (sm_supported(iommu)) 1210 addr |= DMA_RTADDR_SMT; 1211 1212 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1213 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr); 1214 1215 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG); 1216 1217 /* Make sure hardware complete it */ 1218 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1219 readl, (sts & DMA_GSTS_RTPS), sts); 1220 1221 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1222 1223 /* 1224 * Hardware invalidates all DMA remapping hardware translation 1225 * caches as part of SRTP flow. 
1226 */ 1227 if (cap_esrtps(iommu->cap)) 1228 return; 1229 1230 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL); 1231 if (sm_supported(iommu)) 1232 qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0); 1233 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); 1234 } 1235 1236 void iommu_flush_write_buffer(struct intel_iommu *iommu) 1237 { 1238 u32 val; 1239 unsigned long flag; 1240 1241 if (!rwbf_quirk && !cap_rwbf(iommu->cap)) 1242 return; 1243 1244 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1245 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG); 1246 1247 /* Make sure hardware complete it */ 1248 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1249 readl, (!(val & DMA_GSTS_WBFS)), val); 1250 1251 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1252 } 1253 1254 /* return value determine if we need a write buffer flush */ 1255 static void __iommu_flush_context(struct intel_iommu *iommu, 1256 u16 did, u16 source_id, u8 function_mask, 1257 u64 type) 1258 { 1259 u64 val = 0; 1260 unsigned long flag; 1261 1262 switch (type) { 1263 case DMA_CCMD_GLOBAL_INVL: 1264 val = DMA_CCMD_GLOBAL_INVL; 1265 break; 1266 case DMA_CCMD_DOMAIN_INVL: 1267 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did); 1268 break; 1269 case DMA_CCMD_DEVICE_INVL: 1270 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did) 1271 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask); 1272 break; 1273 default: 1274 BUG(); 1275 } 1276 val |= DMA_CCMD_ICC; 1277 1278 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1279 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val); 1280 1281 /* Make sure hardware complete it */ 1282 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG, 1283 dmar_readq, (!(val & DMA_CCMD_ICC)), val); 1284 1285 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1286 } 1287 1288 /* return value determine if we need a write buffer flush */ 1289 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, 1290 u64 addr, unsigned int size_order, u64 type) 1291 { 1292 int tlb_offset = ecap_iotlb_offset(iommu->ecap); 1293 u64 val = 0, val_iva = 0; 1294 unsigned long flag; 1295 1296 switch (type) { 1297 case DMA_TLB_GLOBAL_FLUSH: 1298 /* global flush doesn't need set IVA_REG */ 1299 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT; 1300 break; 1301 case DMA_TLB_DSI_FLUSH: 1302 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did); 1303 break; 1304 case DMA_TLB_PSI_FLUSH: 1305 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did); 1306 /* IH bit is passed in as part of address */ 1307 val_iva = size_order | addr; 1308 break; 1309 default: 1310 BUG(); 1311 } 1312 /* Note: set drain read/write */ 1313 #if 0 1314 /* 1315 * This is probably to be super secure.. Looks like we can 1316 * ignore it without any impact. 
1317 */ 1318 if (cap_read_drain(iommu->cap)) 1319 val |= DMA_TLB_READ_DRAIN; 1320 #endif 1321 if (cap_write_drain(iommu->cap)) 1322 val |= DMA_TLB_WRITE_DRAIN; 1323 1324 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1325 /* Note: Only uses first TLB reg currently */ 1326 if (val_iva) 1327 dmar_writeq(iommu->reg + tlb_offset, val_iva); 1328 dmar_writeq(iommu->reg + tlb_offset + 8, val); 1329 1330 /* Make sure hardware complete it */ 1331 IOMMU_WAIT_OP(iommu, tlb_offset + 8, 1332 dmar_readq, (!(val & DMA_TLB_IVT)), val); 1333 1334 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1335 1336 /* check IOTLB invalidation granularity */ 1337 if (DMA_TLB_IAIG(val) == 0) 1338 pr_err("Flush IOTLB failed\n"); 1339 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type)) 1340 pr_debug("TLB flush request %Lx, actual %Lx\n", 1341 (unsigned long long)DMA_TLB_IIRG(type), 1342 (unsigned long long)DMA_TLB_IAIG(val)); 1343 } 1344 1345 static struct device_domain_info * 1346 domain_lookup_dev_info(struct dmar_domain *domain, 1347 struct intel_iommu *iommu, u8 bus, u8 devfn) 1348 { 1349 struct device_domain_info *info; 1350 unsigned long flags; 1351 1352 spin_lock_irqsave(&domain->lock, flags); 1353 list_for_each_entry(info, &domain->devices, link) { 1354 if (info->iommu == iommu && info->bus == bus && 1355 info->devfn == devfn) { 1356 spin_unlock_irqrestore(&domain->lock, flags); 1357 return info; 1358 } 1359 } 1360 spin_unlock_irqrestore(&domain->lock, flags); 1361 1362 return NULL; 1363 } 1364 1365 static void domain_update_iotlb(struct dmar_domain *domain) 1366 { 1367 struct device_domain_info *info; 1368 bool has_iotlb_device = false; 1369 unsigned long flags; 1370 1371 spin_lock_irqsave(&domain->lock, flags); 1372 list_for_each_entry(info, &domain->devices, link) { 1373 if (info->ats_enabled) { 1374 has_iotlb_device = true; 1375 break; 1376 } 1377 } 1378 domain->has_iotlb_device = has_iotlb_device; 1379 spin_unlock_irqrestore(&domain->lock, flags); 1380 } 1381 1382 /* 1383 * The extra devTLB flush quirk impacts those QAT devices with PCI device 1384 * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device() 1385 * check because it applies only to the built-in QAT devices and it doesn't 1386 * grant additional privileges. 1387 */ 1388 #define BUGGY_QAT_DEVID_MASK 0x4940 1389 static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev) 1390 { 1391 if (pdev->vendor != PCI_VENDOR_ID_INTEL) 1392 return false; 1393 1394 if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK) 1395 return false; 1396 1397 return true; 1398 } 1399 1400 static void iommu_enable_pci_caps(struct device_domain_info *info) 1401 { 1402 struct pci_dev *pdev; 1403 1404 if (!dev_is_pci(info->dev)) 1405 return; 1406 1407 pdev = to_pci_dev(info->dev); 1408 /* For IOMMU that supports device IOTLB throttling (DIT), we assign 1409 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge 1410 * queue depth at PF level. If DIT is not set, PFSID will be treated as 1411 * reserved, which should be set to 0. 1412 */ 1413 if (!ecap_dit(info->iommu->ecap)) 1414 info->pfsid = 0; 1415 else { 1416 struct pci_dev *pf_pdev; 1417 1418 /* pdev will be returned if device is not a vf */ 1419 pf_pdev = pci_physfn(pdev); 1420 info->pfsid = pci_dev_id(pf_pdev); 1421 } 1422 1423 /* The PCIe spec, in its wisdom, declares that the behaviour of 1424 the device if you enable PASID support after ATS support is 1425 undefined. 
So always enable PASID support on devices which 1426 have it, even if we can't yet know if we're ever going to 1427 use it. */ 1428 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1)) 1429 info->pasid_enabled = 1; 1430 1431 if (info->pri_supported && 1432 (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1) && 1433 !pci_reset_pri(pdev) && !pci_enable_pri(pdev, PRQ_DEPTH)) 1434 info->pri_enabled = 1; 1435 1436 if (info->ats_supported && pci_ats_page_aligned(pdev) && 1437 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) { 1438 info->ats_enabled = 1; 1439 domain_update_iotlb(info->domain); 1440 info->ats_qdep = pci_ats_queue_depth(pdev); 1441 } 1442 } 1443 1444 static void iommu_disable_pci_caps(struct device_domain_info *info) 1445 { 1446 struct pci_dev *pdev; 1447 1448 if (!dev_is_pci(info->dev)) 1449 return; 1450 1451 pdev = to_pci_dev(info->dev); 1452 1453 if (info->ats_enabled) { 1454 pci_disable_ats(pdev); 1455 info->ats_enabled = 0; 1456 domain_update_iotlb(info->domain); 1457 } 1458 1459 if (info->pri_enabled) { 1460 pci_disable_pri(pdev); 1461 info->pri_enabled = 0; 1462 } 1463 1464 if (info->pasid_enabled) { 1465 pci_disable_pasid(pdev); 1466 info->pasid_enabled = 0; 1467 } 1468 } 1469 1470 static void __iommu_flush_dev_iotlb(struct device_domain_info *info, 1471 u64 addr, unsigned int mask) 1472 { 1473 u16 sid, qdep; 1474 1475 if (!info || !info->ats_enabled) 1476 return; 1477 1478 sid = info->bus << 8 | info->devfn; 1479 qdep = info->ats_qdep; 1480 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid, 1481 qdep, addr, mask); 1482 quirk_extra_dev_tlb_flush(info, addr, mask, PASID_RID2PASID, qdep); 1483 } 1484 1485 static void iommu_flush_dev_iotlb(struct dmar_domain *domain, 1486 u64 addr, unsigned mask) 1487 { 1488 struct device_domain_info *info; 1489 unsigned long flags; 1490 1491 if (!domain->has_iotlb_device) 1492 return; 1493 1494 spin_lock_irqsave(&domain->lock, flags); 1495 list_for_each_entry(info, &domain->devices, link) 1496 __iommu_flush_dev_iotlb(info, addr, mask); 1497 spin_unlock_irqrestore(&domain->lock, flags); 1498 } 1499 1500 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, 1501 struct dmar_domain *domain, 1502 unsigned long pfn, unsigned int pages, 1503 int ih, int map) 1504 { 1505 unsigned int aligned_pages = __roundup_pow_of_two(pages); 1506 unsigned int mask = ilog2(aligned_pages); 1507 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT; 1508 u16 did = domain_id_iommu(domain, iommu); 1509 1510 BUG_ON(pages == 0); 1511 1512 if (ih) 1513 ih = 1 << 6; 1514 1515 if (domain->use_first_level) { 1516 qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, pages, ih); 1517 } else { 1518 unsigned long bitmask = aligned_pages - 1; 1519 1520 /* 1521 * PSI masks the low order bits of the base address. If the 1522 * address isn't aligned to the mask, then compute a mask value 1523 * needed to ensure the target range is flushed. 1524 */ 1525 if (unlikely(bitmask & pfn)) { 1526 unsigned long end_pfn = pfn + pages - 1, shared_bits; 1527 1528 /* 1529 * Since end_pfn <= pfn + bitmask, the only way bits 1530 * higher than bitmask can differ in pfn and end_pfn is 1531 * by carrying. This means after masking out bitmask, 1532 * high bits starting with the first set bit in 1533 * shared_bits are all equal in both pfn and end_pfn. 1534 */ 1535 shared_bits = ~(pfn ^ end_pfn) & ~bitmask; 1536 mask = shared_bits ? 
__ffs(shared_bits) : BITS_PER_LONG; 1537 } 1538 1539 /* 1540 * Fallback to domain selective flush if no PSI support or 1541 * the size is too big. 1542 */ 1543 if (!cap_pgsel_inv(iommu->cap) || 1544 mask > cap_max_amask_val(iommu->cap)) 1545 iommu->flush.flush_iotlb(iommu, did, 0, 0, 1546 DMA_TLB_DSI_FLUSH); 1547 else 1548 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask, 1549 DMA_TLB_PSI_FLUSH); 1550 } 1551 1552 /* 1553 * In caching mode, changes of pages from non-present to present require 1554 * flush. However, device IOTLB doesn't need to be flushed in this case. 1555 */ 1556 if (!cap_caching_mode(iommu->cap) || !map) 1557 iommu_flush_dev_iotlb(domain, addr, mask); 1558 } 1559 1560 /* Notification for newly created mappings */ 1561 static inline void __mapping_notify_one(struct intel_iommu *iommu, 1562 struct dmar_domain *domain, 1563 unsigned long pfn, unsigned int pages) 1564 { 1565 /* 1566 * It's a non-present to present mapping. Only flush if caching mode 1567 * and second level. 1568 */ 1569 if (cap_caching_mode(iommu->cap) && !domain->use_first_level) 1570 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1); 1571 else 1572 iommu_flush_write_buffer(iommu); 1573 } 1574 1575 static void intel_flush_iotlb_all(struct iommu_domain *domain) 1576 { 1577 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 1578 struct iommu_domain_info *info; 1579 unsigned long idx; 1580 1581 xa_for_each(&dmar_domain->iommu_array, idx, info) { 1582 struct intel_iommu *iommu = info->iommu; 1583 u16 did = domain_id_iommu(dmar_domain, iommu); 1584 1585 if (dmar_domain->use_first_level) 1586 qi_flush_piotlb(iommu, did, PASID_RID2PASID, 0, -1, 0); 1587 else 1588 iommu->flush.flush_iotlb(iommu, did, 0, 0, 1589 DMA_TLB_DSI_FLUSH); 1590 1591 if (!cap_caching_mode(iommu->cap)) 1592 iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH); 1593 } 1594 } 1595 1596 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu) 1597 { 1598 u32 pmen; 1599 unsigned long flags; 1600 1601 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap)) 1602 return; 1603 1604 raw_spin_lock_irqsave(&iommu->register_lock, flags); 1605 pmen = readl(iommu->reg + DMAR_PMEN_REG); 1606 pmen &= ~DMA_PMEN_EPM; 1607 writel(pmen, iommu->reg + DMAR_PMEN_REG); 1608 1609 /* wait for the protected region status bit to clear */ 1610 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG, 1611 readl, !(pmen & DMA_PMEN_PRS), pmen); 1612 1613 raw_spin_unlock_irqrestore(&iommu->register_lock, flags); 1614 } 1615 1616 static void iommu_enable_translation(struct intel_iommu *iommu) 1617 { 1618 u32 sts; 1619 unsigned long flags; 1620 1621 raw_spin_lock_irqsave(&iommu->register_lock, flags); 1622 iommu->gcmd |= DMA_GCMD_TE; 1623 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); 1624 1625 /* Make sure hardware complete it */ 1626 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1627 readl, (sts & DMA_GSTS_TES), sts); 1628 1629 raw_spin_unlock_irqrestore(&iommu->register_lock, flags); 1630 } 1631 1632 static void iommu_disable_translation(struct intel_iommu *iommu) 1633 { 1634 u32 sts; 1635 unsigned long flag; 1636 1637 if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated && 1638 (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap))) 1639 return; 1640 1641 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1642 iommu->gcmd &= ~DMA_GCMD_TE; 1643 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); 1644 1645 /* Make sure hardware complete it */ 1646 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1647 readl, (!(sts & DMA_GSTS_TES)), sts); 1648 1649 
raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1650 } 1651 1652 static int iommu_init_domains(struct intel_iommu *iommu) 1653 { 1654 u32 ndomains; 1655 1656 ndomains = cap_ndoms(iommu->cap); 1657 pr_debug("%s: Number of Domains supported <%d>\n", 1658 iommu->name, ndomains); 1659 1660 spin_lock_init(&iommu->lock); 1661 1662 iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL); 1663 if (!iommu->domain_ids) 1664 return -ENOMEM; 1665 1666 /* 1667 * If Caching mode is set, then invalid translations are tagged 1668 * with domain-id 0, hence we need to pre-allocate it. We also 1669 * use domain-id 0 as a marker for non-allocated domain-id, so 1670 * make sure it is not used for a real domain. 1671 */ 1672 set_bit(0, iommu->domain_ids); 1673 1674 /* 1675 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid 1676 * entry for first-level or pass-through translation modes should 1677 * be programmed with a domain id different from those used for 1678 * second-level or nested translation. We reserve a domain id for 1679 * this purpose. 1680 */ 1681 if (sm_supported(iommu)) 1682 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids); 1683 1684 return 0; 1685 } 1686 1687 static void disable_dmar_iommu(struct intel_iommu *iommu) 1688 { 1689 if (!iommu->domain_ids) 1690 return; 1691 1692 /* 1693 * All iommu domains must have been detached from the devices, 1694 * hence there should be no domain IDs in use. 1695 */ 1696 if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap)) 1697 > NUM_RESERVED_DID)) 1698 return; 1699 1700 if (iommu->gcmd & DMA_GCMD_TE) 1701 iommu_disable_translation(iommu); 1702 } 1703 1704 static void free_dmar_iommu(struct intel_iommu *iommu) 1705 { 1706 if (iommu->domain_ids) { 1707 bitmap_free(iommu->domain_ids); 1708 iommu->domain_ids = NULL; 1709 } 1710 1711 if (iommu->copied_tables) { 1712 bitmap_free(iommu->copied_tables); 1713 iommu->copied_tables = NULL; 1714 } 1715 1716 /* free context mapping */ 1717 free_context_table(iommu); 1718 1719 #ifdef CONFIG_INTEL_IOMMU_SVM 1720 if (pasid_supported(iommu)) { 1721 if (ecap_prs(iommu->ecap)) 1722 intel_svm_finish_prq(iommu); 1723 } 1724 if (vccap_pasid(iommu->vccap)) 1725 ioasid_unregister_allocator(&iommu->pasid_allocator); 1726 1727 #endif 1728 } 1729 1730 /* 1731 * Check and return whether first level is used by default for 1732 * DMA translation. 
1733 */ 1734 static bool first_level_by_default(unsigned int type) 1735 { 1736 /* Only SL is available in legacy mode */ 1737 if (!scalable_mode_support()) 1738 return false; 1739 1740 /* Only level (either FL or SL) is available, just use it */ 1741 if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity()) 1742 return intel_cap_flts_sanity(); 1743 1744 /* Both levels are available, decide it based on domain type */ 1745 return type != IOMMU_DOMAIN_UNMANAGED; 1746 } 1747 1748 static struct dmar_domain *alloc_domain(unsigned int type) 1749 { 1750 struct dmar_domain *domain; 1751 1752 domain = kzalloc(sizeof(*domain), GFP_KERNEL); 1753 if (!domain) 1754 return NULL; 1755 1756 domain->nid = NUMA_NO_NODE; 1757 if (first_level_by_default(type)) 1758 domain->use_first_level = true; 1759 domain->has_iotlb_device = false; 1760 INIT_LIST_HEAD(&domain->devices); 1761 spin_lock_init(&domain->lock); 1762 xa_init(&domain->iommu_array); 1763 1764 return domain; 1765 } 1766 1767 static int domain_attach_iommu(struct dmar_domain *domain, 1768 struct intel_iommu *iommu) 1769 { 1770 struct iommu_domain_info *info, *curr; 1771 unsigned long ndomains; 1772 int num, ret = -ENOSPC; 1773 1774 info = kzalloc(sizeof(*info), GFP_KERNEL); 1775 if (!info) 1776 return -ENOMEM; 1777 1778 spin_lock(&iommu->lock); 1779 curr = xa_load(&domain->iommu_array, iommu->seq_id); 1780 if (curr) { 1781 curr->refcnt++; 1782 spin_unlock(&iommu->lock); 1783 kfree(info); 1784 return 0; 1785 } 1786 1787 ndomains = cap_ndoms(iommu->cap); 1788 num = find_first_zero_bit(iommu->domain_ids, ndomains); 1789 if (num >= ndomains) { 1790 pr_err("%s: No free domain ids\n", iommu->name); 1791 goto err_unlock; 1792 } 1793 1794 set_bit(num, iommu->domain_ids); 1795 info->refcnt = 1; 1796 info->did = num; 1797 info->iommu = iommu; 1798 curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id, 1799 NULL, info, GFP_ATOMIC); 1800 if (curr) { 1801 ret = xa_err(curr) ? : -EBUSY; 1802 goto err_clear; 1803 } 1804 domain_update_iommu_cap(domain); 1805 1806 spin_unlock(&iommu->lock); 1807 return 0; 1808 1809 err_clear: 1810 clear_bit(info->did, iommu->domain_ids); 1811 err_unlock: 1812 spin_unlock(&iommu->lock); 1813 kfree(info); 1814 return ret; 1815 } 1816 1817 static void domain_detach_iommu(struct dmar_domain *domain, 1818 struct intel_iommu *iommu) 1819 { 1820 struct iommu_domain_info *info; 1821 1822 spin_lock(&iommu->lock); 1823 info = xa_load(&domain->iommu_array, iommu->seq_id); 1824 if (--info->refcnt == 0) { 1825 clear_bit(info->did, iommu->domain_ids); 1826 xa_erase(&domain->iommu_array, iommu->seq_id); 1827 domain->nid = NUMA_NO_NODE; 1828 domain_update_iommu_cap(domain); 1829 kfree(info); 1830 } 1831 spin_unlock(&iommu->lock); 1832 } 1833 1834 static inline int guestwidth_to_adjustwidth(int gaw) 1835 { 1836 int agaw; 1837 int r = (gaw - 12) % 9; 1838 1839 if (r == 0) 1840 agaw = gaw; 1841 else 1842 agaw = gaw + 9 - r; 1843 if (agaw > 64) 1844 agaw = 64; 1845 return agaw; 1846 } 1847 1848 static void domain_exit(struct dmar_domain *domain) 1849 { 1850 if (domain->pgd) { 1851 LIST_HEAD(freelist); 1852 1853 domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist); 1854 put_pages_list(&freelist); 1855 } 1856 1857 if (WARN_ON(!list_empty(&domain->devices))) 1858 return; 1859 1860 kfree(domain); 1861 } 1862 1863 /* 1864 * Get the PASID directory size for scalable mode context entry. 1865 * Value of X in the PDTS field of a scalable mode context entry 1866 * indicates PASID directory with 2^(X + 7) entries. 
1867 */ 1868 static inline unsigned long context_get_sm_pds(struct pasid_table *table) 1869 { 1870 unsigned long pds, max_pde; 1871 1872 max_pde = table->max_pasid >> PASID_PDE_SHIFT; 1873 pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS); 1874 if (pds < 7) 1875 return 0; 1876 1877 return pds - 7; 1878 } 1879 1880 /* 1881 * Set the RID_PASID field of a scalable mode context entry. The 1882 * IOMMU hardware will use the PASID value set in this field for 1883 * DMA translations of DMA requests without PASID. 1884 */ 1885 static inline void 1886 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid) 1887 { 1888 context->hi |= pasid & ((1 << 20) - 1); 1889 } 1890 1891 /* 1892 * Set the DTE(Device-TLB Enable) field of a scalable mode context 1893 * entry. 1894 */ 1895 static inline void context_set_sm_dte(struct context_entry *context) 1896 { 1897 context->lo |= (1 << 2); 1898 } 1899 1900 /* 1901 * Set the PRE(Page Request Enable) field of a scalable mode context 1902 * entry. 1903 */ 1904 static inline void context_set_sm_pre(struct context_entry *context) 1905 { 1906 context->lo |= (1 << 4); 1907 } 1908 1909 /* Convert value to context PASID directory size field coding. */ 1910 #define context_pdts(pds) (((pds) & 0x7) << 9) 1911 1912 static int domain_context_mapping_one(struct dmar_domain *domain, 1913 struct intel_iommu *iommu, 1914 struct pasid_table *table, 1915 u8 bus, u8 devfn) 1916 { 1917 struct device_domain_info *info = 1918 domain_lookup_dev_info(domain, iommu, bus, devfn); 1919 u16 did = domain_id_iommu(domain, iommu); 1920 int translation = CONTEXT_TT_MULTI_LEVEL; 1921 struct context_entry *context; 1922 int ret; 1923 1924 WARN_ON(did == 0); 1925 1926 if (hw_pass_through && domain_type_is_si(domain)) 1927 translation = CONTEXT_TT_PASS_THROUGH; 1928 1929 pr_debug("Set context mapping for %02x:%02x.%d\n", 1930 bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); 1931 1932 BUG_ON(!domain->pgd); 1933 1934 spin_lock(&iommu->lock); 1935 ret = -ENOMEM; 1936 context = iommu_context_addr(iommu, bus, devfn, 1); 1937 if (!context) 1938 goto out_unlock; 1939 1940 ret = 0; 1941 if (context_present(context) && !context_copied(iommu, bus, devfn)) 1942 goto out_unlock; 1943 1944 /* 1945 * For kdump cases, old valid entries may be cached due to the 1946 * in-flight DMA and copied pgtable, but there is no unmapping 1947 * behaviour for them, thus we need an explicit cache flush for 1948 * the newly-mapped device. For kdump, at this point, the device 1949 * is supposed to finish reset at its driver probe stage, so no 1950 * in-flight DMA will exist, and we don't need to worry anymore 1951 * hereafter. 
1952 */ 1953 if (context_copied(iommu, bus, devfn)) { 1954 u16 did_old = context_domain_id(context); 1955 1956 if (did_old < cap_ndoms(iommu->cap)) { 1957 iommu->flush.flush_context(iommu, did_old, 1958 (((u16)bus) << 8) | devfn, 1959 DMA_CCMD_MASK_NOBIT, 1960 DMA_CCMD_DEVICE_INVL); 1961 iommu->flush.flush_iotlb(iommu, did_old, 0, 0, 1962 DMA_TLB_DSI_FLUSH); 1963 } 1964 1965 clear_context_copied(iommu, bus, devfn); 1966 } 1967 1968 context_clear_entry(context); 1969 1970 if (sm_supported(iommu)) { 1971 unsigned long pds; 1972 1973 WARN_ON(!table); 1974 1975 /* Setup the PASID DIR pointer: */ 1976 pds = context_get_sm_pds(table); 1977 context->lo = (u64)virt_to_phys(table->table) | 1978 context_pdts(pds); 1979 1980 /* Setup the RID_PASID field: */ 1981 context_set_sm_rid2pasid(context, PASID_RID2PASID); 1982 1983 /* 1984 * Setup the Device-TLB enable bit and Page request 1985 * Enable bit: 1986 */ 1987 if (info && info->ats_supported) 1988 context_set_sm_dte(context); 1989 if (info && info->pri_supported) 1990 context_set_sm_pre(context); 1991 if (info && info->pasid_supported) 1992 context_set_pasid(context); 1993 } else { 1994 struct dma_pte *pgd = domain->pgd; 1995 int agaw; 1996 1997 context_set_domain_id(context, did); 1998 1999 if (translation != CONTEXT_TT_PASS_THROUGH) { 2000 /* 2001 * Skip top levels of page tables for iommu which has 2002 * less agaw than default. Unnecessary for PT mode. 2003 */ 2004 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { 2005 ret = -ENOMEM; 2006 pgd = phys_to_virt(dma_pte_addr(pgd)); 2007 if (!dma_pte_present(pgd)) 2008 goto out_unlock; 2009 } 2010 2011 if (info && info->ats_supported) 2012 translation = CONTEXT_TT_DEV_IOTLB; 2013 else 2014 translation = CONTEXT_TT_MULTI_LEVEL; 2015 2016 context_set_address_root(context, virt_to_phys(pgd)); 2017 context_set_address_width(context, agaw); 2018 } else { 2019 /* 2020 * In pass through mode, AW must be programmed to 2021 * indicate the largest AGAW value supported by 2022 * hardware. And ASR is ignored by hardware. 2023 */ 2024 context_set_address_width(context, iommu->msagaw); 2025 } 2026 2027 context_set_translation_type(context, translation); 2028 } 2029 2030 context_set_fault_enable(context); 2031 context_set_present(context); 2032 if (!ecap_coherent(iommu->ecap)) 2033 clflush_cache_range(context, sizeof(*context)); 2034 2035 /* 2036 * It's a non-present to present mapping. If hardware doesn't cache 2037 * non-present entry we only need to flush the write-buffer. 
If the 2038 * _does_ cache non-present entries, then it does so in the special 2039 * domain #0, which we have to flush: 2040 */ 2041 if (cap_caching_mode(iommu->cap)) { 2042 iommu->flush.flush_context(iommu, 0, 2043 (((u16)bus) << 8) | devfn, 2044 DMA_CCMD_MASK_NOBIT, 2045 DMA_CCMD_DEVICE_INVL); 2046 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); 2047 } else { 2048 iommu_flush_write_buffer(iommu); 2049 } 2050 2051 ret = 0; 2052 2053 out_unlock: 2054 spin_unlock(&iommu->lock); 2055 2056 return ret; 2057 } 2058 2059 struct domain_context_mapping_data { 2060 struct dmar_domain *domain; 2061 struct intel_iommu *iommu; 2062 struct pasid_table *table; 2063 }; 2064 2065 static int domain_context_mapping_cb(struct pci_dev *pdev, 2066 u16 alias, void *opaque) 2067 { 2068 struct domain_context_mapping_data *data = opaque; 2069 2070 return domain_context_mapping_one(data->domain, data->iommu, 2071 data->table, PCI_BUS_NUM(alias), 2072 alias & 0xff); 2073 } 2074 2075 static int 2076 domain_context_mapping(struct dmar_domain *domain, struct device *dev) 2077 { 2078 struct domain_context_mapping_data data; 2079 struct pasid_table *table; 2080 struct intel_iommu *iommu; 2081 u8 bus, devfn; 2082 2083 iommu = device_to_iommu(dev, &bus, &devfn); 2084 if (!iommu) 2085 return -ENODEV; 2086 2087 table = intel_pasid_get_table(dev); 2088 2089 if (!dev_is_pci(dev)) 2090 return domain_context_mapping_one(domain, iommu, table, 2091 bus, devfn); 2092 2093 data.domain = domain; 2094 data.iommu = iommu; 2095 data.table = table; 2096 2097 return pci_for_each_dma_alias(to_pci_dev(dev), 2098 &domain_context_mapping_cb, &data); 2099 } 2100 2101 /* Returns a number of VTD pages, but aligned to MM page size */ 2102 static inline unsigned long aligned_nrpages(unsigned long host_addr, 2103 size_t size) 2104 { 2105 host_addr &= ~PAGE_MASK; 2106 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT; 2107 } 2108 2109 /* Return largest possible superpage level for a given mapping */ 2110 static inline int hardware_largepage_caps(struct dmar_domain *domain, 2111 unsigned long iov_pfn, 2112 unsigned long phy_pfn, 2113 unsigned long pages) 2114 { 2115 int support, level = 1; 2116 unsigned long pfnmerge; 2117 2118 support = domain->iommu_superpage; 2119 2120 /* To use a large page, the virtual *and* physical addresses 2121 must be aligned to 2MiB/1GiB/etc. Lower bits set in either 2122 of them will mean we have to use smaller pages. So just 2123 merge them and check both at once. */ 2124 pfnmerge = iov_pfn | phy_pfn; 2125 2126 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) { 2127 pages >>= VTD_STRIDE_SHIFT; 2128 if (!pages) 2129 break; 2130 pfnmerge >>= VTD_STRIDE_SHIFT; 2131 level++; 2132 support--; 2133 } 2134 return level; 2135 } 2136 2137 /* 2138 * Ensure that old small page tables are removed to make room for superpage(s). 2139 * We're going to add new large pages, so make sure we don't remove their parent 2140 * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared. 
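 *
 * For scale: with the 9-bit VT-d stride, level 1 covers 4KiB, level 2 a
 * 2MiB superpage and level 3 a 1GiB superpage, i.e. lvl_to_nr_pages()
 * returns 1, 512 and 262144 VT-d pages respectively.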
2141 */ 2142 static void switch_to_super_page(struct dmar_domain *domain, 2143 unsigned long start_pfn, 2144 unsigned long end_pfn, int level) 2145 { 2146 unsigned long lvl_pages = lvl_to_nr_pages(level); 2147 struct iommu_domain_info *info; 2148 struct dma_pte *pte = NULL; 2149 unsigned long i; 2150 2151 while (start_pfn <= end_pfn) { 2152 if (!pte) 2153 pte = pfn_to_dma_pte(domain, start_pfn, &level); 2154 2155 if (dma_pte_present(pte)) { 2156 dma_pte_free_pagetable(domain, start_pfn, 2157 start_pfn + lvl_pages - 1, 2158 level + 1); 2159 2160 xa_for_each(&domain->iommu_array, i, info) 2161 iommu_flush_iotlb_psi(info->iommu, domain, 2162 start_pfn, lvl_pages, 2163 0, 0); 2164 } 2165 2166 pte++; 2167 start_pfn += lvl_pages; 2168 if (first_pte_in_page(pte)) 2169 pte = NULL; 2170 } 2171 } 2172 2173 static int 2174 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, 2175 unsigned long phys_pfn, unsigned long nr_pages, int prot) 2176 { 2177 struct dma_pte *first_pte = NULL, *pte = NULL; 2178 unsigned int largepage_lvl = 0; 2179 unsigned long lvl_pages = 0; 2180 phys_addr_t pteval; 2181 u64 attr; 2182 2183 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)); 2184 2185 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0) 2186 return -EINVAL; 2187 2188 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP); 2189 attr |= DMA_FL_PTE_PRESENT; 2190 if (domain->use_first_level) { 2191 attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS; 2192 if (prot & DMA_PTE_WRITE) 2193 attr |= DMA_FL_PTE_DIRTY; 2194 } 2195 2196 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr; 2197 2198 while (nr_pages > 0) { 2199 uint64_t tmp; 2200 2201 if (!pte) { 2202 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, 2203 phys_pfn, nr_pages); 2204 2205 pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl); 2206 if (!pte) 2207 return -ENOMEM; 2208 first_pte = pte; 2209 2210 lvl_pages = lvl_to_nr_pages(largepage_lvl); 2211 2212 /* It is large page*/ 2213 if (largepage_lvl > 1) { 2214 unsigned long end_pfn; 2215 unsigned long pages_to_remove; 2216 2217 pteval |= DMA_PTE_LARGE_PAGE; 2218 pages_to_remove = min_t(unsigned long, nr_pages, 2219 nr_pte_to_next_page(pte) * lvl_pages); 2220 end_pfn = iov_pfn + pages_to_remove - 1; 2221 switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl); 2222 } else { 2223 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE; 2224 } 2225 2226 } 2227 /* We don't need lock here, nobody else 2228 * touches the iova range 2229 */ 2230 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval); 2231 if (tmp) { 2232 static int dumps = 5; 2233 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n", 2234 iov_pfn, tmp, (unsigned long long)pteval); 2235 if (dumps) { 2236 dumps--; 2237 debug_dma_dump_mappings(NULL); 2238 } 2239 WARN_ON(1); 2240 } 2241 2242 nr_pages -= lvl_pages; 2243 iov_pfn += lvl_pages; 2244 phys_pfn += lvl_pages; 2245 pteval += lvl_pages * VTD_PAGE_SIZE; 2246 2247 /* If the next PTE would be the first in a new page, then we 2248 * need to flush the cache on the entries we've just written. 2249 * And then we'll need to recalculate 'pte', so clear it and 2250 * let it get set again in the if (!pte) block above. 2251 * 2252 * If we're done (!nr_pages) we need to flush the cache too. 2253 * 2254 * Also if we've been setting superpages, we may need to 2255 * recalculate 'pte' and switch back to smaller pages for the 2256 * end of the mapping, if the trailing size is not enough to 2257 * use another superpage (i.e. nr_pages < lvl_pages). 
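 *
 * An illustrative example (assuming the hardware supports 2MiB
 * superpages): mapping 513 pages at a 2MiB-aligned iov_pfn/phys_pfn
 * first writes one level-2 superpage PTE (lvl_pages = 512); the
 * remaining nr_pages = 1 is < lvl_pages, so the PTEs written so far
 * are flushed, 'pte' is reset, and the last page is mapped with an
 * ordinary 4KiB PTE on the next pass.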
2258 */ 2259 pte++; 2260 if (!nr_pages || first_pte_in_page(pte) || 2261 (largepage_lvl > 1 && nr_pages < lvl_pages)) { 2262 domain_flush_cache(domain, first_pte, 2263 (void *)pte - (void *)first_pte); 2264 pte = NULL; 2265 } 2266 } 2267 2268 return 0; 2269 } 2270 2271 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn) 2272 { 2273 struct intel_iommu *iommu = info->iommu; 2274 struct context_entry *context; 2275 u16 did_old; 2276 2277 if (!iommu) 2278 return; 2279 2280 spin_lock(&iommu->lock); 2281 context = iommu_context_addr(iommu, bus, devfn, 0); 2282 if (!context) { 2283 spin_unlock(&iommu->lock); 2284 return; 2285 } 2286 2287 if (sm_supported(iommu)) { 2288 if (hw_pass_through && domain_type_is_si(info->domain)) 2289 did_old = FLPT_DEFAULT_DID; 2290 else 2291 did_old = domain_id_iommu(info->domain, iommu); 2292 } else { 2293 did_old = context_domain_id(context); 2294 } 2295 2296 context_clear_entry(context); 2297 __iommu_flush_cache(iommu, context, sizeof(*context)); 2298 spin_unlock(&iommu->lock); 2299 iommu->flush.flush_context(iommu, 2300 did_old, 2301 (((u16)bus) << 8) | devfn, 2302 DMA_CCMD_MASK_NOBIT, 2303 DMA_CCMD_DEVICE_INVL); 2304 2305 if (sm_supported(iommu)) 2306 qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0); 2307 2308 iommu->flush.flush_iotlb(iommu, 2309 did_old, 2310 0, 2311 0, 2312 DMA_TLB_DSI_FLUSH); 2313 2314 __iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH); 2315 } 2316 2317 static int domain_setup_first_level(struct intel_iommu *iommu, 2318 struct dmar_domain *domain, 2319 struct device *dev, 2320 u32 pasid) 2321 { 2322 struct dma_pte *pgd = domain->pgd; 2323 int agaw, level; 2324 int flags = 0; 2325 2326 /* 2327 * Skip top levels of page tables for iommu which has 2328 * less agaw than default. Unnecessary for PT mode. 
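 *
 * For example (values chosen purely for illustration): a domain built
 * with agaw 3 (57-bit, five levels) attached to an iommu with agaw 2
 * (48-bit, four levels) descends one level here, so the hardware is
 * handed a four-level table it can actually walk.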
2329 */ 2330 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { 2331 pgd = phys_to_virt(dma_pte_addr(pgd)); 2332 if (!dma_pte_present(pgd)) 2333 return -ENOMEM; 2334 } 2335 2336 level = agaw_to_level(agaw); 2337 if (level != 4 && level != 5) 2338 return -EINVAL; 2339 2340 if (pasid != PASID_RID2PASID) 2341 flags |= PASID_FLAG_SUPERVISOR_MODE; 2342 if (level == 5) 2343 flags |= PASID_FLAG_FL5LP; 2344 2345 if (domain->force_snooping) 2346 flags |= PASID_FLAG_PAGE_SNOOP; 2347 2348 return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid, 2349 domain_id_iommu(domain, iommu), 2350 flags); 2351 } 2352 2353 static bool dev_is_real_dma_subdevice(struct device *dev) 2354 { 2355 return dev && dev_is_pci(dev) && 2356 pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev); 2357 } 2358 2359 static int iommu_domain_identity_map(struct dmar_domain *domain, 2360 unsigned long first_vpfn, 2361 unsigned long last_vpfn) 2362 { 2363 /* 2364 * RMRR range might have overlap with physical memory range, 2365 * clear it first 2366 */ 2367 dma_pte_clear_range(domain, first_vpfn, last_vpfn); 2368 2369 return __domain_mapping(domain, first_vpfn, 2370 first_vpfn, last_vpfn - first_vpfn + 1, 2371 DMA_PTE_READ|DMA_PTE_WRITE); 2372 } 2373 2374 static int md_domain_init(struct dmar_domain *domain, int guest_width); 2375 2376 static int __init si_domain_init(int hw) 2377 { 2378 struct dmar_rmrr_unit *rmrr; 2379 struct device *dev; 2380 int i, nid, ret; 2381 2382 si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY); 2383 if (!si_domain) 2384 return -EFAULT; 2385 2386 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { 2387 domain_exit(si_domain); 2388 si_domain = NULL; 2389 return -EFAULT; 2390 } 2391 2392 if (hw) 2393 return 0; 2394 2395 for_each_online_node(nid) { 2396 unsigned long start_pfn, end_pfn; 2397 int i; 2398 2399 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 2400 ret = iommu_domain_identity_map(si_domain, 2401 mm_to_dma_pfn(start_pfn), 2402 mm_to_dma_pfn(end_pfn)); 2403 if (ret) 2404 return ret; 2405 } 2406 } 2407 2408 /* 2409 * Identity map the RMRRs so that devices with RMRRs could also use 2410 * the si_domain. 2411 */ 2412 for_each_rmrr_units(rmrr) { 2413 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt, 2414 i, dev) { 2415 unsigned long long start = rmrr->base_address; 2416 unsigned long long end = rmrr->end_address; 2417 2418 if (WARN_ON(end < start || 2419 end >> agaw_to_width(si_domain->agaw))) 2420 continue; 2421 2422 ret = iommu_domain_identity_map(si_domain, 2423 mm_to_dma_pfn(start >> PAGE_SHIFT), 2424 mm_to_dma_pfn(end >> PAGE_SHIFT)); 2425 if (ret) 2426 return ret; 2427 } 2428 } 2429 2430 return 0; 2431 } 2432 2433 static int dmar_domain_attach_device(struct dmar_domain *domain, 2434 struct device *dev) 2435 { 2436 struct device_domain_info *info = dev_iommu_priv_get(dev); 2437 struct intel_iommu *iommu; 2438 unsigned long flags; 2439 u8 bus, devfn; 2440 int ret; 2441 2442 iommu = device_to_iommu(dev, &bus, &devfn); 2443 if (!iommu) 2444 return -ENODEV; 2445 2446 ret = domain_attach_iommu(domain, iommu); 2447 if (ret) 2448 return ret; 2449 info->domain = domain; 2450 spin_lock_irqsave(&domain->lock, flags); 2451 list_add(&info->link, &domain->devices); 2452 spin_unlock_irqrestore(&domain->lock, flags); 2453 2454 /* PASID table is mandatory for a PCI device in scalable mode. 
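 * The RID2PASID entry programmed below is what translates DMA requests
 * that carry no PASID; its type mirrors the domain: pass-through,
 * first-level or second-level.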
*/ 2455 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) { 2456 /* Setup the PASID entry for requests without PASID: */ 2457 if (hw_pass_through && domain_type_is_si(domain)) 2458 ret = intel_pasid_setup_pass_through(iommu, domain, 2459 dev, PASID_RID2PASID); 2460 else if (domain->use_first_level) 2461 ret = domain_setup_first_level(iommu, domain, dev, 2462 PASID_RID2PASID); 2463 else 2464 ret = intel_pasid_setup_second_level(iommu, domain, 2465 dev, PASID_RID2PASID); 2466 if (ret) { 2467 dev_err(dev, "Setup RID2PASID failed\n"); 2468 device_block_translation(dev); 2469 return ret; 2470 } 2471 } 2472 2473 ret = domain_context_mapping(domain, dev); 2474 if (ret) { 2475 dev_err(dev, "Domain context map failed\n"); 2476 device_block_translation(dev); 2477 return ret; 2478 } 2479 2480 iommu_enable_pci_caps(info); 2481 2482 return 0; 2483 } 2484 2485 static bool device_has_rmrr(struct device *dev) 2486 { 2487 struct dmar_rmrr_unit *rmrr; 2488 struct device *tmp; 2489 int i; 2490 2491 rcu_read_lock(); 2492 for_each_rmrr_units(rmrr) { 2493 /* 2494 * Return TRUE if this RMRR contains the device that 2495 * is passed in. 2496 */ 2497 for_each_active_dev_scope(rmrr->devices, 2498 rmrr->devices_cnt, i, tmp) 2499 if (tmp == dev || 2500 is_downstream_to_pci_bridge(dev, tmp)) { 2501 rcu_read_unlock(); 2502 return true; 2503 } 2504 } 2505 rcu_read_unlock(); 2506 return false; 2507 } 2508 2509 /** 2510 * device_rmrr_is_relaxable - Test whether the RMRR of this device 2511 * is relaxable (ie. is allowed to be not enforced under some conditions) 2512 * @dev: device handle 2513 * 2514 * We assume that PCI USB devices with RMRRs have them largely 2515 * for historical reasons and that the RMRR space is not actively used post 2516 * boot. This exclusion may change if vendors begin to abuse it. 2517 * 2518 * The same exception is made for graphics devices, with the requirement that 2519 * any use of the RMRR regions will be torn down before assigning the device 2520 * to a guest. 2521 * 2522 * Return: true if the RMRR is relaxable, false otherwise 2523 */ 2524 static bool device_rmrr_is_relaxable(struct device *dev) 2525 { 2526 struct pci_dev *pdev; 2527 2528 if (!dev_is_pci(dev)) 2529 return false; 2530 2531 pdev = to_pci_dev(dev); 2532 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev)) 2533 return true; 2534 else 2535 return false; 2536 } 2537 2538 /* 2539 * There are a couple cases where we need to restrict the functionality of 2540 * devices associated with RMRRs. The first is when evaluating a device for 2541 * identity mapping because problems exist when devices are moved in and out 2542 * of domains and their respective RMRR information is lost. This means that 2543 * a device with associated RMRRs will never be in a "passthrough" domain. 2544 * The second is use of the device through the IOMMU API. This interface 2545 * expects to have full control of the IOVA space for the device. We cannot 2546 * satisfy both the requirement that RMRR access is maintained and have an 2547 * unencumbered IOVA space. We also have no ability to quiesce the device's 2548 * use of the RMRR space or even inform the IOMMU API user of the restriction. 2549 * We therefore prevent devices associated with an RMRR from participating in 2550 * the IOMMU API, which eliminates them from device assignment. 2551 * 2552 * In both cases, devices which have relaxable RMRRs are not concerned by this 2553 * restriction. See device_rmrr_is_relaxable comment. 
2554 */ 2555 static bool device_is_rmrr_locked(struct device *dev) 2556 { 2557 if (!device_has_rmrr(dev)) 2558 return false; 2559 2560 if (device_rmrr_is_relaxable(dev)) 2561 return false; 2562 2563 return true; 2564 } 2565 2566 /* 2567 * Return the required default domain type for a specific device. 2568 * 2569 * @dev: the device in query 2570 * @startup: true if this is during early boot 2571 * 2572 * Returns: 2573 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain 2574 * - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain 2575 * - 0: both identity and dynamic domains work for this device 2576 */ 2577 static int device_def_domain_type(struct device *dev) 2578 { 2579 if (dev_is_pci(dev)) { 2580 struct pci_dev *pdev = to_pci_dev(dev); 2581 2582 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev)) 2583 return IOMMU_DOMAIN_IDENTITY; 2584 2585 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev)) 2586 return IOMMU_DOMAIN_IDENTITY; 2587 } 2588 2589 return 0; 2590 } 2591 2592 static void intel_iommu_init_qi(struct intel_iommu *iommu) 2593 { 2594 /* 2595 * Start from the sane iommu hardware state. 2596 * If the queued invalidation is already initialized by us 2597 * (for example, while enabling interrupt-remapping) then 2598 * we got the things already rolling from a sane state. 2599 */ 2600 if (!iommu->qi) { 2601 /* 2602 * Clear any previous faults. 2603 */ 2604 dmar_fault(-1, iommu); 2605 /* 2606 * Disable queued invalidation if supported and already enabled 2607 * before OS handover. 2608 */ 2609 dmar_disable_qi(iommu); 2610 } 2611 2612 if (dmar_enable_qi(iommu)) { 2613 /* 2614 * Queued Invalidate not enabled, use Register Based Invalidate 2615 */ 2616 iommu->flush.flush_context = __iommu_flush_context; 2617 iommu->flush.flush_iotlb = __iommu_flush_iotlb; 2618 pr_info("%s: Using Register based invalidation\n", 2619 iommu->name); 2620 } else { 2621 iommu->flush.flush_context = qi_flush_context; 2622 iommu->flush.flush_iotlb = qi_flush_iotlb; 2623 pr_info("%s: Using Queued invalidation\n", iommu->name); 2624 } 2625 } 2626 2627 static int copy_context_table(struct intel_iommu *iommu, 2628 struct root_entry *old_re, 2629 struct context_entry **tbl, 2630 int bus, bool ext) 2631 { 2632 int tbl_idx, pos = 0, idx, devfn, ret = 0, did; 2633 struct context_entry *new_ce = NULL, ce; 2634 struct context_entry *old_ce = NULL; 2635 struct root_entry re; 2636 phys_addr_t old_ce_phys; 2637 2638 tbl_idx = ext ? bus * 2 : bus; 2639 memcpy(&re, old_re, sizeof(re)); 2640 2641 for (devfn = 0; devfn < 256; devfn++) { 2642 /* First calculate the correct index */ 2643 idx = (ext ? 
devfn * 2 : devfn) % 256; 2644 2645 if (idx == 0) { 2646 /* First save what we may have and clean up */ 2647 if (new_ce) { 2648 tbl[tbl_idx] = new_ce; 2649 __iommu_flush_cache(iommu, new_ce, 2650 VTD_PAGE_SIZE); 2651 pos = 1; 2652 } 2653 2654 if (old_ce) 2655 memunmap(old_ce); 2656 2657 ret = 0; 2658 if (devfn < 0x80) 2659 old_ce_phys = root_entry_lctp(&re); 2660 else 2661 old_ce_phys = root_entry_uctp(&re); 2662 2663 if (!old_ce_phys) { 2664 if (ext && devfn == 0) { 2665 /* No LCTP, try UCTP */ 2666 devfn = 0x7f; 2667 continue; 2668 } else { 2669 goto out; 2670 } 2671 } 2672 2673 ret = -ENOMEM; 2674 old_ce = memremap(old_ce_phys, PAGE_SIZE, 2675 MEMREMAP_WB); 2676 if (!old_ce) 2677 goto out; 2678 2679 new_ce = alloc_pgtable_page(iommu->node); 2680 if (!new_ce) 2681 goto out_unmap; 2682 2683 ret = 0; 2684 } 2685 2686 /* Now copy the context entry */ 2687 memcpy(&ce, old_ce + idx, sizeof(ce)); 2688 2689 if (!context_present(&ce)) 2690 continue; 2691 2692 did = context_domain_id(&ce); 2693 if (did >= 0 && did < cap_ndoms(iommu->cap)) 2694 set_bit(did, iommu->domain_ids); 2695 2696 set_context_copied(iommu, bus, devfn); 2697 new_ce[idx] = ce; 2698 } 2699 2700 tbl[tbl_idx + pos] = new_ce; 2701 2702 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE); 2703 2704 out_unmap: 2705 memunmap(old_ce); 2706 2707 out: 2708 return ret; 2709 } 2710 2711 static int copy_translation_tables(struct intel_iommu *iommu) 2712 { 2713 struct context_entry **ctxt_tbls; 2714 struct root_entry *old_rt; 2715 phys_addr_t old_rt_phys; 2716 int ctxt_table_entries; 2717 u64 rtaddr_reg; 2718 int bus, ret; 2719 bool new_ext, ext; 2720 2721 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG); 2722 ext = !!(rtaddr_reg & DMA_RTADDR_SMT); 2723 new_ext = !!sm_supported(iommu); 2724 2725 /* 2726 * The RTT bit can only be changed when translation is disabled, 2727 * but disabling translation means to open a window for data 2728 * corruption. So bail out and don't copy anything if we would 2729 * have to change the bit. 2730 */ 2731 if (new_ext != ext) 2732 return -EINVAL; 2733 2734 iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL); 2735 if (!iommu->copied_tables) 2736 return -ENOMEM; 2737 2738 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK; 2739 if (!old_rt_phys) 2740 return -EINVAL; 2741 2742 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB); 2743 if (!old_rt) 2744 return -ENOMEM; 2745 2746 /* This is too big for the stack - allocate it from slab */ 2747 ctxt_table_entries = ext ? 512 : 256; 2748 ret = -ENOMEM; 2749 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL); 2750 if (!ctxt_tbls) 2751 goto out_unmap; 2752 2753 for (bus = 0; bus < 256; bus++) { 2754 ret = copy_context_table(iommu, &old_rt[bus], 2755 ctxt_tbls, bus, ext); 2756 if (ret) { 2757 pr_err("%s: Failed to copy context table for bus %d\n", 2758 iommu->name, bus); 2759 continue; 2760 } 2761 } 2762 2763 spin_lock(&iommu->lock); 2764 2765 /* Context tables are copied, now write them to the root_entry table */ 2766 for (bus = 0; bus < 256; bus++) { 2767 int idx = ext ? 
bus * 2 : bus; 2768 u64 val; 2769 2770 if (ctxt_tbls[idx]) { 2771 val = virt_to_phys(ctxt_tbls[idx]) | 1; 2772 iommu->root_entry[bus].lo = val; 2773 } 2774 2775 if (!ext || !ctxt_tbls[idx + 1]) 2776 continue; 2777 2778 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1; 2779 iommu->root_entry[bus].hi = val; 2780 } 2781 2782 spin_unlock(&iommu->lock); 2783 2784 kfree(ctxt_tbls); 2785 2786 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE); 2787 2788 ret = 0; 2789 2790 out_unmap: 2791 memunmap(old_rt); 2792 2793 return ret; 2794 } 2795 2796 #ifdef CONFIG_INTEL_IOMMU_SVM 2797 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data) 2798 { 2799 struct intel_iommu *iommu = data; 2800 ioasid_t ioasid; 2801 2802 if (!iommu) 2803 return INVALID_IOASID; 2804 /* 2805 * VT-d virtual command interface always uses the full 20 bit 2806 * PASID range. Host can partition guest PASID range based on 2807 * policies but it is out of guest's control. 2808 */ 2809 if (min < PASID_MIN || max > intel_pasid_max_id) 2810 return INVALID_IOASID; 2811 2812 if (vcmd_alloc_pasid(iommu, &ioasid)) 2813 return INVALID_IOASID; 2814 2815 return ioasid; 2816 } 2817 2818 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data) 2819 { 2820 struct intel_iommu *iommu = data; 2821 2822 if (!iommu) 2823 return; 2824 /* 2825 * Sanity check the ioasid owner is done at upper layer, e.g. VFIO 2826 * We can only free the PASID when all the devices are unbound. 2827 */ 2828 if (ioasid_find(NULL, ioasid, NULL)) { 2829 pr_alert("Cannot free active IOASID %d\n", ioasid); 2830 return; 2831 } 2832 vcmd_free_pasid(iommu, ioasid); 2833 } 2834 2835 static void register_pasid_allocator(struct intel_iommu *iommu) 2836 { 2837 /* 2838 * If we are running in the host, no need for custom allocator 2839 * in that PASIDs are allocated from the host system-wide. 2840 */ 2841 if (!cap_caching_mode(iommu->cap)) 2842 return; 2843 2844 if (!sm_supported(iommu)) { 2845 pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n"); 2846 return; 2847 } 2848 2849 /* 2850 * Register a custom PASID allocator if we are running in a guest, 2851 * guest PASID must be obtained via virtual command interface. 2852 * There can be multiple vIOMMUs in each guest but only one allocator 2853 * is active. All vIOMMU allocators will eventually be calling the same 2854 * host allocator. 2855 */ 2856 if (!vccap_pasid(iommu->vccap)) 2857 return; 2858 2859 pr_info("Register custom PASID allocator\n"); 2860 iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc; 2861 iommu->pasid_allocator.free = intel_vcmd_ioasid_free; 2862 iommu->pasid_allocator.pdata = (void *)iommu; 2863 if (ioasid_register_allocator(&iommu->pasid_allocator)) { 2864 pr_warn("Custom PASID allocator failed, scalable mode disabled\n"); 2865 /* 2866 * Disable scalable mode on this IOMMU if there 2867 * is no custom allocator. Mixing SM capable vIOMMU 2868 * and non-SM vIOMMU are not supported. 2869 */ 2870 intel_iommu_sm = 0; 2871 } 2872 } 2873 #endif 2874 2875 static int __init init_dmars(void) 2876 { 2877 struct dmar_drhd_unit *drhd; 2878 struct intel_iommu *iommu; 2879 int ret; 2880 2881 ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL); 2882 if (ret) 2883 goto free_iommu; 2884 2885 for_each_iommu(iommu, drhd) { 2886 if (drhd->ignored) { 2887 iommu_disable_translation(iommu); 2888 continue; 2889 } 2890 2891 /* 2892 * Find the max pasid size of all IOMMU's in the system. 2893 * We need to ensure the system pasid table is no bigger 2894 * than the smallest supported. 
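 *
 * Per the VT-d spec, ecap_pss() reports the supported PASID width as
 * (PSS + 1) bits, so "2 << ecap_pss()" below is 2^(PSS + 1) PASIDs;
 * e.g. PSS = 19 yields the full 20-bit (1M entry) PASID space.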
2895 */ 2896 if (pasid_supported(iommu)) { 2897 u32 temp = 2 << ecap_pss(iommu->ecap); 2898 2899 intel_pasid_max_id = min_t(u32, temp, 2900 intel_pasid_max_id); 2901 } 2902 2903 intel_iommu_init_qi(iommu); 2904 2905 ret = iommu_init_domains(iommu); 2906 if (ret) 2907 goto free_iommu; 2908 2909 init_translation_status(iommu); 2910 2911 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) { 2912 iommu_disable_translation(iommu); 2913 clear_translation_pre_enabled(iommu); 2914 pr_warn("Translation was enabled for %s but we are not in kdump mode\n", 2915 iommu->name); 2916 } 2917 2918 /* 2919 * TBD: 2920 * we could share the same root & context tables 2921 * among all IOMMU's. Need to Split it later. 2922 */ 2923 ret = iommu_alloc_root_entry(iommu); 2924 if (ret) 2925 goto free_iommu; 2926 2927 if (translation_pre_enabled(iommu)) { 2928 pr_info("Translation already enabled - trying to copy translation structures\n"); 2929 2930 ret = copy_translation_tables(iommu); 2931 if (ret) { 2932 /* 2933 * We found the IOMMU with translation 2934 * enabled - but failed to copy over the 2935 * old root-entry table. Try to proceed 2936 * by disabling translation now and 2937 * allocating a clean root-entry table. 2938 * This might cause DMAR faults, but 2939 * probably the dump will still succeed. 2940 */ 2941 pr_err("Failed to copy translation tables from previous kernel for %s\n", 2942 iommu->name); 2943 iommu_disable_translation(iommu); 2944 clear_translation_pre_enabled(iommu); 2945 } else { 2946 pr_info("Copied translation tables from previous kernel for %s\n", 2947 iommu->name); 2948 } 2949 } 2950 2951 if (!ecap_pass_through(iommu->ecap)) 2952 hw_pass_through = 0; 2953 intel_svm_check(iommu); 2954 } 2955 2956 /* 2957 * Now that qi is enabled on all iommus, set the root entry and flush 2958 * caches. This is required on some Intel X58 chipsets, otherwise the 2959 * flush_context function will loop forever and the boot hangs. 2960 */ 2961 for_each_active_iommu(iommu, drhd) { 2962 iommu_flush_write_buffer(iommu); 2963 #ifdef CONFIG_INTEL_IOMMU_SVM 2964 register_pasid_allocator(iommu); 2965 #endif 2966 iommu_set_root_entry(iommu); 2967 } 2968 2969 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA 2970 dmar_map_gfx = 0; 2971 #endif 2972 2973 if (!dmar_map_gfx) 2974 iommu_identity_mapping |= IDENTMAP_GFX; 2975 2976 check_tylersburg_isoch(); 2977 2978 ret = si_domain_init(hw_pass_through); 2979 if (ret) 2980 goto free_iommu; 2981 2982 /* 2983 * for each drhd 2984 * enable fault log 2985 * global invalidate context cache 2986 * global invalidate iotlb 2987 * enable translation 2988 */ 2989 for_each_iommu(iommu, drhd) { 2990 if (drhd->ignored) { 2991 /* 2992 * we always have to disable PMRs or DMA may fail on 2993 * this device 2994 */ 2995 if (force_on) 2996 iommu_disable_protect_mem_regions(iommu); 2997 continue; 2998 } 2999 3000 iommu_flush_write_buffer(iommu); 3001 3002 #ifdef CONFIG_INTEL_IOMMU_SVM 3003 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) { 3004 /* 3005 * Call dmar_alloc_hwirq() with dmar_global_lock held, 3006 * could cause possible lock race condition. 
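 *
 * (Hence the up_write()/down_write() pair around
 * intel_svm_enable_prq() just below.)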
3007 */ 3008 up_write(&dmar_global_lock); 3009 ret = intel_svm_enable_prq(iommu); 3010 down_write(&dmar_global_lock); 3011 if (ret) 3012 goto free_iommu; 3013 } 3014 #endif 3015 ret = dmar_set_interrupt(iommu); 3016 if (ret) 3017 goto free_iommu; 3018 } 3019 3020 return 0; 3021 3022 free_iommu: 3023 for_each_active_iommu(iommu, drhd) { 3024 disable_dmar_iommu(iommu); 3025 free_dmar_iommu(iommu); 3026 } 3027 if (si_domain) { 3028 domain_exit(si_domain); 3029 si_domain = NULL; 3030 } 3031 3032 return ret; 3033 } 3034 3035 static void __init init_no_remapping_devices(void) 3036 { 3037 struct dmar_drhd_unit *drhd; 3038 struct device *dev; 3039 int i; 3040 3041 for_each_drhd_unit(drhd) { 3042 if (!drhd->include_all) { 3043 for_each_active_dev_scope(drhd->devices, 3044 drhd->devices_cnt, i, dev) 3045 break; 3046 /* ignore DMAR unit if no devices exist */ 3047 if (i == drhd->devices_cnt) 3048 drhd->ignored = 1; 3049 } 3050 } 3051 3052 for_each_active_drhd_unit(drhd) { 3053 if (drhd->include_all) 3054 continue; 3055 3056 for_each_active_dev_scope(drhd->devices, 3057 drhd->devices_cnt, i, dev) 3058 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev))) 3059 break; 3060 if (i < drhd->devices_cnt) 3061 continue; 3062 3063 /* This IOMMU has *only* gfx devices. Either bypass it or 3064 set the gfx_mapped flag, as appropriate */ 3065 drhd->gfx_dedicated = 1; 3066 if (!dmar_map_gfx) 3067 drhd->ignored = 1; 3068 } 3069 } 3070 3071 #ifdef CONFIG_SUSPEND 3072 static int init_iommu_hw(void) 3073 { 3074 struct dmar_drhd_unit *drhd; 3075 struct intel_iommu *iommu = NULL; 3076 3077 for_each_active_iommu(iommu, drhd) 3078 if (iommu->qi) 3079 dmar_reenable_qi(iommu); 3080 3081 for_each_iommu(iommu, drhd) { 3082 if (drhd->ignored) { 3083 /* 3084 * we always have to disable PMRs or DMA may fail on 3085 * this device 3086 */ 3087 if (force_on) 3088 iommu_disable_protect_mem_regions(iommu); 3089 continue; 3090 } 3091 3092 iommu_flush_write_buffer(iommu); 3093 iommu_set_root_entry(iommu); 3094 iommu_enable_translation(iommu); 3095 iommu_disable_protect_mem_regions(iommu); 3096 } 3097 3098 return 0; 3099 } 3100 3101 static void iommu_flush_all(void) 3102 { 3103 struct dmar_drhd_unit *drhd; 3104 struct intel_iommu *iommu; 3105 3106 for_each_active_iommu(iommu, drhd) { 3107 iommu->flush.flush_context(iommu, 0, 0, 0, 3108 DMA_CCMD_GLOBAL_INVL); 3109 iommu->flush.flush_iotlb(iommu, 0, 0, 0, 3110 DMA_TLB_GLOBAL_FLUSH); 3111 } 3112 } 3113 3114 static int iommu_suspend(void) 3115 { 3116 struct dmar_drhd_unit *drhd; 3117 struct intel_iommu *iommu = NULL; 3118 unsigned long flag; 3119 3120 for_each_active_iommu(iommu, drhd) { 3121 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32), 3122 GFP_KERNEL); 3123 if (!iommu->iommu_state) 3124 goto nomem; 3125 } 3126 3127 iommu_flush_all(); 3128 3129 for_each_active_iommu(iommu, drhd) { 3130 iommu_disable_translation(iommu); 3131 3132 raw_spin_lock_irqsave(&iommu->register_lock, flag); 3133 3134 iommu->iommu_state[SR_DMAR_FECTL_REG] = 3135 readl(iommu->reg + DMAR_FECTL_REG); 3136 iommu->iommu_state[SR_DMAR_FEDATA_REG] = 3137 readl(iommu->reg + DMAR_FEDATA_REG); 3138 iommu->iommu_state[SR_DMAR_FEADDR_REG] = 3139 readl(iommu->reg + DMAR_FEADDR_REG); 3140 iommu->iommu_state[SR_DMAR_FEUADDR_REG] = 3141 readl(iommu->reg + DMAR_FEUADDR_REG); 3142 3143 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 3144 } 3145 return 0; 3146 3147 nomem: 3148 for_each_active_iommu(iommu, drhd) 3149 kfree(iommu->iommu_state); 3150 3151 return -ENOMEM; 3152 } 3153 3154 static void 
iommu_resume(void) 3155 { 3156 struct dmar_drhd_unit *drhd; 3157 struct intel_iommu *iommu = NULL; 3158 unsigned long flag; 3159 3160 if (init_iommu_hw()) { 3161 if (force_on) 3162 panic("tboot: IOMMU setup failed, DMAR can not resume!\n"); 3163 else 3164 WARN(1, "IOMMU setup failed, DMAR can not resume!\n"); 3165 return; 3166 } 3167 3168 for_each_active_iommu(iommu, drhd) { 3169 3170 raw_spin_lock_irqsave(&iommu->register_lock, flag); 3171 3172 writel(iommu->iommu_state[SR_DMAR_FECTL_REG], 3173 iommu->reg + DMAR_FECTL_REG); 3174 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG], 3175 iommu->reg + DMAR_FEDATA_REG); 3176 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG], 3177 iommu->reg + DMAR_FEADDR_REG); 3178 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG], 3179 iommu->reg + DMAR_FEUADDR_REG); 3180 3181 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 3182 } 3183 3184 for_each_active_iommu(iommu, drhd) 3185 kfree(iommu->iommu_state); 3186 } 3187 3188 static struct syscore_ops iommu_syscore_ops = { 3189 .resume = iommu_resume, 3190 .suspend = iommu_suspend, 3191 }; 3192 3193 static void __init init_iommu_pm_ops(void) 3194 { 3195 register_syscore_ops(&iommu_syscore_ops); 3196 } 3197 3198 #else 3199 static inline void init_iommu_pm_ops(void) {} 3200 #endif /* CONFIG_PM */ 3201 3202 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr) 3203 { 3204 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) || 3205 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) || 3206 rmrr->end_address <= rmrr->base_address || 3207 arch_rmrr_sanity_check(rmrr)) 3208 return -EINVAL; 3209 3210 return 0; 3211 } 3212 3213 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg) 3214 { 3215 struct acpi_dmar_reserved_memory *rmrr; 3216 struct dmar_rmrr_unit *rmrru; 3217 3218 rmrr = (struct acpi_dmar_reserved_memory *)header; 3219 if (rmrr_sanity_check(rmrr)) { 3220 pr_warn(FW_BUG 3221 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n" 3222 "BIOS vendor: %s; Ver: %s; Product Version: %s\n", 3223 rmrr->base_address, rmrr->end_address, 3224 dmi_get_system_info(DMI_BIOS_VENDOR), 3225 dmi_get_system_info(DMI_BIOS_VERSION), 3226 dmi_get_system_info(DMI_PRODUCT_VERSION)); 3227 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); 3228 } 3229 3230 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL); 3231 if (!rmrru) 3232 goto out; 3233 3234 rmrru->hdr = header; 3235 3236 rmrru->base_address = rmrr->base_address; 3237 rmrru->end_address = rmrr->end_address; 3238 3239 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1), 3240 ((void *)rmrr) + rmrr->header.length, 3241 &rmrru->devices_cnt); 3242 if (rmrru->devices_cnt && rmrru->devices == NULL) 3243 goto free_rmrru; 3244 3245 list_add(&rmrru->list, &dmar_rmrr_units); 3246 3247 return 0; 3248 free_rmrru: 3249 kfree(rmrru); 3250 out: 3251 return -ENOMEM; 3252 } 3253 3254 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr) 3255 { 3256 struct dmar_atsr_unit *atsru; 3257 struct acpi_dmar_atsr *tmp; 3258 3259 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list, 3260 dmar_rcu_check()) { 3261 tmp = (struct acpi_dmar_atsr *)atsru->hdr; 3262 if (atsr->segment != tmp->segment) 3263 continue; 3264 if (atsr->header.length != tmp->header.length) 3265 continue; 3266 if (memcmp(atsr, tmp, atsr->header.length) == 0) 3267 return atsru; 3268 } 3269 3270 return NULL; 3271 } 3272 3273 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg) 3274 { 3275 struct acpi_dmar_atsr *atsr; 3276 struct dmar_atsr_unit *atsru; 3277 3278 if (system_state 
>= SYSTEM_RUNNING && !intel_iommu_enabled) 3279 return 0; 3280 3281 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 3282 atsru = dmar_find_atsr(atsr); 3283 if (atsru) 3284 return 0; 3285 3286 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL); 3287 if (!atsru) 3288 return -ENOMEM; 3289 3290 /* 3291 * If memory is allocated from slab by ACPI _DSM method, we need to 3292 * copy the memory content because the memory buffer will be freed 3293 * on return. 3294 */ 3295 atsru->hdr = (void *)(atsru + 1); 3296 memcpy(atsru->hdr, hdr, hdr->length); 3297 atsru->include_all = atsr->flags & 0x1; 3298 if (!atsru->include_all) { 3299 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1), 3300 (void *)atsr + atsr->header.length, 3301 &atsru->devices_cnt); 3302 if (atsru->devices_cnt && atsru->devices == NULL) { 3303 kfree(atsru); 3304 return -ENOMEM; 3305 } 3306 } 3307 3308 list_add_rcu(&atsru->list, &dmar_atsr_units); 3309 3310 return 0; 3311 } 3312 3313 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru) 3314 { 3315 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt); 3316 kfree(atsru); 3317 } 3318 3319 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg) 3320 { 3321 struct acpi_dmar_atsr *atsr; 3322 struct dmar_atsr_unit *atsru; 3323 3324 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 3325 atsru = dmar_find_atsr(atsr); 3326 if (atsru) { 3327 list_del_rcu(&atsru->list); 3328 synchronize_rcu(); 3329 intel_iommu_free_atsr(atsru); 3330 } 3331 3332 return 0; 3333 } 3334 3335 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg) 3336 { 3337 int i; 3338 struct device *dev; 3339 struct acpi_dmar_atsr *atsr; 3340 struct dmar_atsr_unit *atsru; 3341 3342 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 3343 atsru = dmar_find_atsr(atsr); 3344 if (!atsru) 3345 return 0; 3346 3347 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) { 3348 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt, 3349 i, dev) 3350 return -EBUSY; 3351 } 3352 3353 return 0; 3354 } 3355 3356 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc) 3357 { 3358 struct dmar_satc_unit *satcu; 3359 struct acpi_dmar_satc *tmp; 3360 3361 list_for_each_entry_rcu(satcu, &dmar_satc_units, list, 3362 dmar_rcu_check()) { 3363 tmp = (struct acpi_dmar_satc *)satcu->hdr; 3364 if (satc->segment != tmp->segment) 3365 continue; 3366 if (satc->header.length != tmp->header.length) 3367 continue; 3368 if (memcmp(satc, tmp, satc->header.length) == 0) 3369 return satcu; 3370 } 3371 3372 return NULL; 3373 } 3374 3375 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg) 3376 { 3377 struct acpi_dmar_satc *satc; 3378 struct dmar_satc_unit *satcu; 3379 3380 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled) 3381 return 0; 3382 3383 satc = container_of(hdr, struct acpi_dmar_satc, header); 3384 satcu = dmar_find_satc(satc); 3385 if (satcu) 3386 return 0; 3387 3388 satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL); 3389 if (!satcu) 3390 return -ENOMEM; 3391 3392 satcu->hdr = (void *)(satcu + 1); 3393 memcpy(satcu->hdr, hdr, hdr->length); 3394 satcu->atc_required = satc->flags & 0x1; 3395 satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1), 3396 (void *)satc + satc->header.length, 3397 &satcu->devices_cnt); 3398 if (satcu->devices_cnt && !satcu->devices) { 3399 kfree(satcu); 3400 return -ENOMEM; 3401 } 3402 list_add_rcu(&satcu->list, &dmar_satc_units); 3403 3404 return 0; 3405 } 3406 3407 static int intel_iommu_add(struct 
dmar_drhd_unit *dmaru) 3408 { 3409 int sp, ret; 3410 struct intel_iommu *iommu = dmaru->iommu; 3411 3412 ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu); 3413 if (ret) 3414 goto out; 3415 3416 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) { 3417 pr_warn("%s: Doesn't support hardware pass through.\n", 3418 iommu->name); 3419 return -ENXIO; 3420 } 3421 3422 sp = domain_update_iommu_superpage(NULL, iommu) - 1; 3423 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) { 3424 pr_warn("%s: Doesn't support large page.\n", 3425 iommu->name); 3426 return -ENXIO; 3427 } 3428 3429 /* 3430 * Disable translation if already enabled prior to OS handover. 3431 */ 3432 if (iommu->gcmd & DMA_GCMD_TE) 3433 iommu_disable_translation(iommu); 3434 3435 ret = iommu_init_domains(iommu); 3436 if (ret == 0) 3437 ret = iommu_alloc_root_entry(iommu); 3438 if (ret) 3439 goto out; 3440 3441 intel_svm_check(iommu); 3442 3443 if (dmaru->ignored) { 3444 /* 3445 * we always have to disable PMRs or DMA may fail on this device 3446 */ 3447 if (force_on) 3448 iommu_disable_protect_mem_regions(iommu); 3449 return 0; 3450 } 3451 3452 intel_iommu_init_qi(iommu); 3453 iommu_flush_write_buffer(iommu); 3454 3455 #ifdef CONFIG_INTEL_IOMMU_SVM 3456 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) { 3457 ret = intel_svm_enable_prq(iommu); 3458 if (ret) 3459 goto disable_iommu; 3460 } 3461 #endif 3462 ret = dmar_set_interrupt(iommu); 3463 if (ret) 3464 goto disable_iommu; 3465 3466 iommu_set_root_entry(iommu); 3467 iommu_enable_translation(iommu); 3468 3469 iommu_disable_protect_mem_regions(iommu); 3470 return 0; 3471 3472 disable_iommu: 3473 disable_dmar_iommu(iommu); 3474 out: 3475 free_dmar_iommu(iommu); 3476 return ret; 3477 } 3478 3479 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert) 3480 { 3481 int ret = 0; 3482 struct intel_iommu *iommu = dmaru->iommu; 3483 3484 if (!intel_iommu_enabled) 3485 return 0; 3486 if (iommu == NULL) 3487 return -EINVAL; 3488 3489 if (insert) { 3490 ret = intel_iommu_add(dmaru); 3491 } else { 3492 disable_dmar_iommu(iommu); 3493 free_dmar_iommu(iommu); 3494 } 3495 3496 return ret; 3497 } 3498 3499 static void intel_iommu_free_dmars(void) 3500 { 3501 struct dmar_rmrr_unit *rmrru, *rmrr_n; 3502 struct dmar_atsr_unit *atsru, *atsr_n; 3503 struct dmar_satc_unit *satcu, *satc_n; 3504 3505 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) { 3506 list_del(&rmrru->list); 3507 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt); 3508 kfree(rmrru); 3509 } 3510 3511 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) { 3512 list_del(&atsru->list); 3513 intel_iommu_free_atsr(atsru); 3514 } 3515 list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) { 3516 list_del(&satcu->list); 3517 dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt); 3518 kfree(satcu); 3519 } 3520 } 3521 3522 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev) 3523 { 3524 struct dmar_satc_unit *satcu; 3525 struct acpi_dmar_satc *satc; 3526 struct device *tmp; 3527 int i; 3528 3529 dev = pci_physfn(dev); 3530 rcu_read_lock(); 3531 3532 list_for_each_entry_rcu(satcu, &dmar_satc_units, list) { 3533 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header); 3534 if (satc->segment != pci_domain_nr(dev->bus)) 3535 continue; 3536 for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp) 3537 if (to_pci_dev(tmp) == dev) 3538 goto out; 3539 } 3540 satcu = NULL; 3541 out: 3542 rcu_read_unlock(); 3543 return satcu; 3544 } 3545 
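/*
 * Decide whether ATS may be used for @dev behind this IOMMU. A matching
 * SATC entry is authoritative: in legacy (non-scalable) mode the hardware
 * enables ATS itself for SATC devices that require it, so the OS must not
 * enable it a second time. Without a SATC entry, walk up to the root port
 * and allow ATS only for integrated devices or for devices whose root port
 * is listed in (or covered by an include-all) ATSR unit.
 */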
3546 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu) 3547 { 3548 int i, ret = 1; 3549 struct pci_bus *bus; 3550 struct pci_dev *bridge = NULL; 3551 struct device *tmp; 3552 struct acpi_dmar_atsr *atsr; 3553 struct dmar_atsr_unit *atsru; 3554 struct dmar_satc_unit *satcu; 3555 3556 dev = pci_physfn(dev); 3557 satcu = dmar_find_matched_satc_unit(dev); 3558 if (satcu) 3559 /* 3560 * This device supports ATS as it is in SATC table. 3561 * When IOMMU is in legacy mode, enabling ATS is done 3562 * automatically by HW for the device that requires 3563 * ATS, hence OS should not enable this device ATS 3564 * to avoid duplicated TLB invalidation. 3565 */ 3566 return !(satcu->atc_required && !sm_supported(iommu)); 3567 3568 for (bus = dev->bus; bus; bus = bus->parent) { 3569 bridge = bus->self; 3570 /* If it's an integrated device, allow ATS */ 3571 if (!bridge) 3572 return 1; 3573 /* Connected via non-PCIe: no ATS */ 3574 if (!pci_is_pcie(bridge) || 3575 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE) 3576 return 0; 3577 /* If we found the root port, look it up in the ATSR */ 3578 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT) 3579 break; 3580 } 3581 3582 rcu_read_lock(); 3583 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) { 3584 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header); 3585 if (atsr->segment != pci_domain_nr(dev->bus)) 3586 continue; 3587 3588 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp) 3589 if (tmp == &bridge->dev) 3590 goto out; 3591 3592 if (atsru->include_all) 3593 goto out; 3594 } 3595 ret = 0; 3596 out: 3597 rcu_read_unlock(); 3598 3599 return ret; 3600 } 3601 3602 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info) 3603 { 3604 int ret; 3605 struct dmar_rmrr_unit *rmrru; 3606 struct dmar_atsr_unit *atsru; 3607 struct dmar_satc_unit *satcu; 3608 struct acpi_dmar_atsr *atsr; 3609 struct acpi_dmar_reserved_memory *rmrr; 3610 struct acpi_dmar_satc *satc; 3611 3612 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING) 3613 return 0; 3614 3615 list_for_each_entry(rmrru, &dmar_rmrr_units, list) { 3616 rmrr = container_of(rmrru->hdr, 3617 struct acpi_dmar_reserved_memory, header); 3618 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 3619 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1), 3620 ((void *)rmrr) + rmrr->header.length, 3621 rmrr->segment, rmrru->devices, 3622 rmrru->devices_cnt); 3623 if (ret < 0) 3624 return ret; 3625 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 3626 dmar_remove_dev_scope(info, rmrr->segment, 3627 rmrru->devices, rmrru->devices_cnt); 3628 } 3629 } 3630 3631 list_for_each_entry(atsru, &dmar_atsr_units, list) { 3632 if (atsru->include_all) 3633 continue; 3634 3635 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header); 3636 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 3637 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1), 3638 (void *)atsr + atsr->header.length, 3639 atsr->segment, atsru->devices, 3640 atsru->devices_cnt); 3641 if (ret > 0) 3642 break; 3643 else if (ret < 0) 3644 return ret; 3645 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 3646 if (dmar_remove_dev_scope(info, atsr->segment, 3647 atsru->devices, atsru->devices_cnt)) 3648 break; 3649 } 3650 } 3651 list_for_each_entry(satcu, &dmar_satc_units, list) { 3652 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header); 3653 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 3654 ret = dmar_insert_dev_scope(info, (void *)(satc + 1), 3655 (void *)satc + satc->header.length, 3656 
satc->segment, satcu->devices, 3657 satcu->devices_cnt); 3658 if (ret > 0) 3659 break; 3660 else if (ret < 0) 3661 return ret; 3662 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 3663 if (dmar_remove_dev_scope(info, satc->segment, 3664 satcu->devices, satcu->devices_cnt)) 3665 break; 3666 } 3667 } 3668 3669 return 0; 3670 } 3671 3672 static int intel_iommu_memory_notifier(struct notifier_block *nb, 3673 unsigned long val, void *v) 3674 { 3675 struct memory_notify *mhp = v; 3676 unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn); 3677 unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn + 3678 mhp->nr_pages - 1); 3679 3680 switch (val) { 3681 case MEM_GOING_ONLINE: 3682 if (iommu_domain_identity_map(si_domain, 3683 start_vpfn, last_vpfn)) { 3684 pr_warn("Failed to build identity map for [%lx-%lx]\n", 3685 start_vpfn, last_vpfn); 3686 return NOTIFY_BAD; 3687 } 3688 break; 3689 3690 case MEM_OFFLINE: 3691 case MEM_CANCEL_ONLINE: 3692 { 3693 struct dmar_drhd_unit *drhd; 3694 struct intel_iommu *iommu; 3695 LIST_HEAD(freelist); 3696 3697 domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist); 3698 3699 rcu_read_lock(); 3700 for_each_active_iommu(iommu, drhd) 3701 iommu_flush_iotlb_psi(iommu, si_domain, 3702 start_vpfn, mhp->nr_pages, 3703 list_empty(&freelist), 0); 3704 rcu_read_unlock(); 3705 put_pages_list(&freelist); 3706 } 3707 break; 3708 } 3709 3710 return NOTIFY_OK; 3711 } 3712 3713 static struct notifier_block intel_iommu_memory_nb = { 3714 .notifier_call = intel_iommu_memory_notifier, 3715 .priority = 0 3716 }; 3717 3718 static void intel_disable_iommus(void) 3719 { 3720 struct intel_iommu *iommu = NULL; 3721 struct dmar_drhd_unit *drhd; 3722 3723 for_each_iommu(iommu, drhd) 3724 iommu_disable_translation(iommu); 3725 } 3726 3727 void intel_iommu_shutdown(void) 3728 { 3729 struct dmar_drhd_unit *drhd; 3730 struct intel_iommu *iommu = NULL; 3731 3732 if (no_iommu || dmar_disabled) 3733 return; 3734 3735 down_write(&dmar_global_lock); 3736 3737 /* Disable PMRs explicitly here. 
*/ 3738 for_each_iommu(iommu, drhd) 3739 iommu_disable_protect_mem_regions(iommu); 3740 3741 /* Make sure the IOMMUs are switched off */ 3742 intel_disable_iommus(); 3743 3744 up_write(&dmar_global_lock); 3745 } 3746 3747 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev) 3748 { 3749 struct iommu_device *iommu_dev = dev_to_iommu_device(dev); 3750 3751 return container_of(iommu_dev, struct intel_iommu, iommu); 3752 } 3753 3754 static ssize_t version_show(struct device *dev, 3755 struct device_attribute *attr, char *buf) 3756 { 3757 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3758 u32 ver = readl(iommu->reg + DMAR_VER_REG); 3759 return sprintf(buf, "%d:%d\n", 3760 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver)); 3761 } 3762 static DEVICE_ATTR_RO(version); 3763 3764 static ssize_t address_show(struct device *dev, 3765 struct device_attribute *attr, char *buf) 3766 { 3767 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3768 return sprintf(buf, "%llx\n", iommu->reg_phys); 3769 } 3770 static DEVICE_ATTR_RO(address); 3771 3772 static ssize_t cap_show(struct device *dev, 3773 struct device_attribute *attr, char *buf) 3774 { 3775 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3776 return sprintf(buf, "%llx\n", iommu->cap); 3777 } 3778 static DEVICE_ATTR_RO(cap); 3779 3780 static ssize_t ecap_show(struct device *dev, 3781 struct device_attribute *attr, char *buf) 3782 { 3783 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3784 return sprintf(buf, "%llx\n", iommu->ecap); 3785 } 3786 static DEVICE_ATTR_RO(ecap); 3787 3788 static ssize_t domains_supported_show(struct device *dev, 3789 struct device_attribute *attr, char *buf) 3790 { 3791 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3792 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap)); 3793 } 3794 static DEVICE_ATTR_RO(domains_supported); 3795 3796 static ssize_t domains_used_show(struct device *dev, 3797 struct device_attribute *attr, char *buf) 3798 { 3799 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3800 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids, 3801 cap_ndoms(iommu->cap))); 3802 } 3803 static DEVICE_ATTR_RO(domains_used); 3804 3805 static struct attribute *intel_iommu_attrs[] = { 3806 &dev_attr_version.attr, 3807 &dev_attr_address.attr, 3808 &dev_attr_cap.attr, 3809 &dev_attr_ecap.attr, 3810 &dev_attr_domains_supported.attr, 3811 &dev_attr_domains_used.attr, 3812 NULL, 3813 }; 3814 3815 static struct attribute_group intel_iommu_group = { 3816 .name = "intel-iommu", 3817 .attrs = intel_iommu_attrs, 3818 }; 3819 3820 const struct attribute_group *intel_iommu_groups[] = { 3821 &intel_iommu_group, 3822 NULL, 3823 }; 3824 3825 static inline bool has_external_pci(void) 3826 { 3827 struct pci_dev *pdev = NULL; 3828 3829 for_each_pci_dev(pdev) 3830 if (pdev->external_facing) { 3831 pci_dev_put(pdev); 3832 return true; 3833 } 3834 3835 return false; 3836 } 3837 3838 static int __init platform_optin_force_iommu(void) 3839 { 3840 if (!dmar_platform_optin() || no_platform_optin || !has_external_pci()) 3841 return 0; 3842 3843 if (no_iommu || dmar_disabled) 3844 pr_info("Intel-IOMMU force enabled due to platform opt in\n"); 3845 3846 /* 3847 * If Intel-IOMMU is disabled by default, we will apply identity 3848 * map for all devices except those marked as being untrusted. 
3849 */ 3850 if (dmar_disabled) 3851 iommu_set_default_passthrough(false); 3852 3853 dmar_disabled = 0; 3854 no_iommu = 0; 3855 3856 return 1; 3857 } 3858 3859 static int __init probe_acpi_namespace_devices(void) 3860 { 3861 struct dmar_drhd_unit *drhd; 3862 /* To avoid a -Wunused-but-set-variable warning. */ 3863 struct intel_iommu *iommu __maybe_unused; 3864 struct device *dev; 3865 int i, ret = 0; 3866 3867 for_each_active_iommu(iommu, drhd) { 3868 for_each_active_dev_scope(drhd->devices, 3869 drhd->devices_cnt, i, dev) { 3870 struct acpi_device_physical_node *pn; 3871 struct iommu_group *group; 3872 struct acpi_device *adev; 3873 3874 if (dev->bus != &acpi_bus_type) 3875 continue; 3876 3877 adev = to_acpi_device(dev); 3878 mutex_lock(&adev->physical_node_lock); 3879 list_for_each_entry(pn, 3880 &adev->physical_node_list, node) { 3881 group = iommu_group_get(pn->dev); 3882 if (group) { 3883 iommu_group_put(group); 3884 continue; 3885 } 3886 3887 ret = iommu_probe_device(pn->dev); 3888 if (ret) 3889 break; 3890 } 3891 mutex_unlock(&adev->physical_node_lock); 3892 3893 if (ret) 3894 return ret; 3895 } 3896 } 3897 3898 return 0; 3899 } 3900 3901 static __init int tboot_force_iommu(void) 3902 { 3903 if (!tboot_enabled()) 3904 return 0; 3905 3906 if (no_iommu || dmar_disabled) 3907 pr_warn("Forcing Intel-IOMMU to enabled\n"); 3908 3909 dmar_disabled = 0; 3910 no_iommu = 0; 3911 3912 return 1; 3913 } 3914 3915 int __init intel_iommu_init(void) 3916 { 3917 int ret = -ENODEV; 3918 struct dmar_drhd_unit *drhd; 3919 struct intel_iommu *iommu; 3920 3921 /* 3922 * Intel IOMMU is required for a TXT/tboot launch or platform 3923 * opt in, so enforce that. 3924 */ 3925 force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) || 3926 platform_optin_force_iommu(); 3927 3928 down_write(&dmar_global_lock); 3929 if (dmar_table_init()) { 3930 if (force_on) 3931 panic("tboot: Failed to initialize DMAR table\n"); 3932 goto out_free_dmar; 3933 } 3934 3935 if (dmar_dev_scope_init() < 0) { 3936 if (force_on) 3937 panic("tboot: Failed to initialize DMAR device scope\n"); 3938 goto out_free_dmar; 3939 } 3940 3941 up_write(&dmar_global_lock); 3942 3943 /* 3944 * The bus notifier takes the dmar_global_lock, so lockdep will 3945 * complain later when we register it under the lock. 3946 */ 3947 dmar_register_bus_notifier(); 3948 3949 down_write(&dmar_global_lock); 3950 3951 if (!no_iommu) 3952 intel_iommu_debugfs_init(); 3953 3954 if (no_iommu || dmar_disabled) { 3955 /* 3956 * We exit the function here to ensure IOMMU's remapping and 3957 * mempool aren't setup, which means that the IOMMU's PMRs 3958 * won't be disabled via the call to init_dmars(). So disable 3959 * it explicitly here. The PMRs were setup by tboot prior to 3960 * calling SENTER, but the kernel is expected to reset/tear 3961 * down the PMRs. 
3962 */ 3963 if (intel_iommu_tboot_noforce) { 3964 for_each_iommu(iommu, drhd) 3965 iommu_disable_protect_mem_regions(iommu); 3966 } 3967 3968 /* 3969 * Make sure the IOMMUs are switched off, even when we 3970 * boot into a kexec kernel and the previous kernel left 3971 * them enabled 3972 */ 3973 intel_disable_iommus(); 3974 goto out_free_dmar; 3975 } 3976 3977 if (list_empty(&dmar_rmrr_units)) 3978 pr_info("No RMRR found\n"); 3979 3980 if (list_empty(&dmar_atsr_units)) 3981 pr_info("No ATSR found\n"); 3982 3983 if (list_empty(&dmar_satc_units)) 3984 pr_info("No SATC found\n"); 3985 3986 init_no_remapping_devices(); 3987 3988 ret = init_dmars(); 3989 if (ret) { 3990 if (force_on) 3991 panic("tboot: Failed to initialize DMARs\n"); 3992 pr_err("Initialization failed\n"); 3993 goto out_free_dmar; 3994 } 3995 up_write(&dmar_global_lock); 3996 3997 init_iommu_pm_ops(); 3998 3999 down_read(&dmar_global_lock); 4000 for_each_active_iommu(iommu, drhd) { 4001 /* 4002 * The flush queue implementation does not perform 4003 * page-selective invalidations that are required for efficient 4004 * TLB flushes in virtual environments. The benefit of batching 4005 * is likely to be much lower than the overhead of synchronizing 4006 * the virtual and physical IOMMU page-tables. 4007 */ 4008 if (cap_caching_mode(iommu->cap)) { 4009 pr_info_once("IOMMU batching disallowed due to virtualization\n"); 4010 iommu_set_dma_strict(); 4011 } 4012 iommu_device_sysfs_add(&iommu->iommu, NULL, 4013 intel_iommu_groups, 4014 "%s", iommu->name); 4015 iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL); 4016 } 4017 up_read(&dmar_global_lock); 4018 4019 if (si_domain && !hw_pass_through) 4020 register_memory_notifier(&intel_iommu_memory_nb); 4021 4022 down_read(&dmar_global_lock); 4023 if (probe_acpi_namespace_devices()) 4024 pr_warn("ACPI name space devices didn't probe correctly\n"); 4025 4026 /* Finally, we enable the DMA remapping hardware. */ 4027 for_each_iommu(iommu, drhd) { 4028 if (!drhd->ignored && !translation_pre_enabled(iommu)) 4029 iommu_enable_translation(iommu); 4030 4031 iommu_disable_protect_mem_regions(iommu); 4032 } 4033 up_read(&dmar_global_lock); 4034 4035 pr_info("Intel(R) Virtualization Technology for Directed I/O\n"); 4036 4037 intel_iommu_enabled = 1; 4038 4039 return 0; 4040 4041 out_free_dmar: 4042 intel_iommu_free_dmars(); 4043 up_write(&dmar_global_lock); 4044 return ret; 4045 } 4046 4047 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque) 4048 { 4049 struct device_domain_info *info = opaque; 4050 4051 domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff); 4052 return 0; 4053 } 4054 4055 /* 4056 * NB - intel-iommu lacks any sort of reference counting for the users of 4057 * dependent devices. If multiple endpoints have intersecting dependent 4058 * devices, unbinding the driver from any one of them will possibly leave 4059 * the others unable to operate. 
4060 */ 4061 static void domain_context_clear(struct device_domain_info *info) 4062 { 4063 if (!info->iommu || !info->dev || !dev_is_pci(info->dev)) 4064 return; 4065 4066 pci_for_each_dma_alias(to_pci_dev(info->dev), 4067 &domain_context_clear_one_cb, info); 4068 } 4069 4070 static void dmar_remove_one_dev_info(struct device *dev) 4071 { 4072 struct device_domain_info *info = dev_iommu_priv_get(dev); 4073 struct dmar_domain *domain = info->domain; 4074 struct intel_iommu *iommu = info->iommu; 4075 unsigned long flags; 4076 4077 if (!dev_is_real_dma_subdevice(info->dev)) { 4078 if (dev_is_pci(info->dev) && sm_supported(iommu)) 4079 intel_pasid_tear_down_entry(iommu, info->dev, 4080 PASID_RID2PASID, false); 4081 4082 iommu_disable_pci_caps(info); 4083 domain_context_clear(info); 4084 } 4085 4086 spin_lock_irqsave(&domain->lock, flags); 4087 list_del(&info->link); 4088 spin_unlock_irqrestore(&domain->lock, flags); 4089 4090 domain_detach_iommu(domain, iommu); 4091 info->domain = NULL; 4092 } 4093 4094 /* 4095 * Clear the page table pointer in context or pasid table entries so that 4096 * all DMA requests without PASID from the device are blocked. If the page 4097 * table has been set, clean up the data structures. 4098 */ 4099 static void device_block_translation(struct device *dev) 4100 { 4101 struct device_domain_info *info = dev_iommu_priv_get(dev); 4102 struct intel_iommu *iommu = info->iommu; 4103 unsigned long flags; 4104 4105 iommu_disable_pci_caps(info); 4106 if (!dev_is_real_dma_subdevice(dev)) { 4107 if (sm_supported(iommu)) 4108 intel_pasid_tear_down_entry(iommu, dev, 4109 PASID_RID2PASID, false); 4110 else 4111 domain_context_clear(info); 4112 } 4113 4114 if (!info->domain) 4115 return; 4116 4117 spin_lock_irqsave(&info->domain->lock, flags); 4118 list_del(&info->link); 4119 spin_unlock_irqrestore(&info->domain->lock, flags); 4120 4121 domain_detach_iommu(info->domain, iommu); 4122 info->domain = NULL; 4123 } 4124 4125 static int md_domain_init(struct dmar_domain *domain, int guest_width) 4126 { 4127 int adjust_width; 4128 4129 /* calculate AGAW */ 4130 domain->gaw = guest_width; 4131 adjust_width = guestwidth_to_adjustwidth(guest_width); 4132 domain->agaw = width_to_agaw(adjust_width); 4133 4134 domain->iommu_coherency = false; 4135 domain->iommu_superpage = 0; 4136 domain->max_addr = 0; 4137 4138 /* always allocate the top pgd */ 4139 domain->pgd = alloc_pgtable_page(domain->nid); 4140 if (!domain->pgd) 4141 return -ENOMEM; 4142 domain_flush_cache(domain, domain->pgd, PAGE_SIZE); 4143 return 0; 4144 } 4145 4146 static int blocking_domain_attach_dev(struct iommu_domain *domain, 4147 struct device *dev) 4148 { 4149 device_block_translation(dev); 4150 return 0; 4151 } 4152 4153 static struct iommu_domain blocking_domain = { 4154 .ops = &(const struct iommu_domain_ops) { 4155 .attach_dev = blocking_domain_attach_dev, 4156 .free = intel_iommu_domain_free 4157 } 4158 }; 4159 4160 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type) 4161 { 4162 struct dmar_domain *dmar_domain; 4163 struct iommu_domain *domain; 4164 4165 switch (type) { 4166 case IOMMU_DOMAIN_BLOCKED: 4167 return &blocking_domain; 4168 case IOMMU_DOMAIN_DMA: 4169 case IOMMU_DOMAIN_DMA_FQ: 4170 case IOMMU_DOMAIN_UNMANAGED: 4171 dmar_domain = alloc_domain(type); 4172 if (!dmar_domain) { 4173 pr_err("Can't allocate dmar_domain\n"); 4174 return NULL; 4175 } 4176 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { 4177 pr_err("Domain initialization failed\n"); 4178 domain_exit(dmar_domain); 4179 
return NULL; 4180 } 4181 4182 domain = &dmar_domain->domain; 4183 domain->geometry.aperture_start = 0; 4184 domain->geometry.aperture_end = 4185 __DOMAIN_MAX_ADDR(dmar_domain->gaw); 4186 domain->geometry.force_aperture = true; 4187 4188 return domain; 4189 case IOMMU_DOMAIN_IDENTITY: 4190 return &si_domain->domain; 4191 case IOMMU_DOMAIN_SVA: 4192 return intel_svm_domain_alloc(); 4193 default: 4194 return NULL; 4195 } 4196 4197 return NULL; 4198 } 4199 4200 static void intel_iommu_domain_free(struct iommu_domain *domain) 4201 { 4202 if (domain != &si_domain->domain && domain != &blocking_domain) 4203 domain_exit(to_dmar_domain(domain)); 4204 } 4205 4206 static int prepare_domain_attach_device(struct iommu_domain *domain, 4207 struct device *dev) 4208 { 4209 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4210 struct intel_iommu *iommu; 4211 int addr_width; 4212 4213 iommu = device_to_iommu(dev, NULL, NULL); 4214 if (!iommu) 4215 return -ENODEV; 4216 4217 if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap)) 4218 return -EINVAL; 4219 4220 /* check if this iommu agaw is sufficient for max mapped address */ 4221 addr_width = agaw_to_width(iommu->agaw); 4222 if (addr_width > cap_mgaw(iommu->cap)) 4223 addr_width = cap_mgaw(iommu->cap); 4224 4225 if (dmar_domain->max_addr > (1LL << addr_width)) 4226 return -EINVAL; 4227 dmar_domain->gaw = addr_width; 4228 4229 /* 4230 * Knock out extra levels of page tables if necessary 4231 */ 4232 while (iommu->agaw < dmar_domain->agaw) { 4233 struct dma_pte *pte; 4234 4235 pte = dmar_domain->pgd; 4236 if (dma_pte_present(pte)) { 4237 dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte)); 4238 free_pgtable_page(pte); 4239 } 4240 dmar_domain->agaw--; 4241 } 4242 4243 return 0; 4244 } 4245 4246 static int intel_iommu_attach_device(struct iommu_domain *domain, 4247 struct device *dev) 4248 { 4249 struct device_domain_info *info = dev_iommu_priv_get(dev); 4250 int ret; 4251 4252 if (domain->type == IOMMU_DOMAIN_UNMANAGED && 4253 device_is_rmrr_locked(dev)) { 4254 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. 
Contact your platform vendor.\n"); 4255 return -EPERM; 4256 } 4257 4258 if (info->domain) 4259 device_block_translation(dev); 4260 4261 ret = prepare_domain_attach_device(domain, dev); 4262 if (ret) 4263 return ret; 4264 4265 return dmar_domain_attach_device(to_dmar_domain(domain), dev); 4266 } 4267 4268 static int intel_iommu_map(struct iommu_domain *domain, 4269 unsigned long iova, phys_addr_t hpa, 4270 size_t size, int iommu_prot, gfp_t gfp) 4271 { 4272 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4273 u64 max_addr; 4274 int prot = 0; 4275 4276 if (iommu_prot & IOMMU_READ) 4277 prot |= DMA_PTE_READ; 4278 if (iommu_prot & IOMMU_WRITE) 4279 prot |= DMA_PTE_WRITE; 4280 if (dmar_domain->set_pte_snp) 4281 prot |= DMA_PTE_SNP; 4282 4283 max_addr = iova + size; 4284 if (dmar_domain->max_addr < max_addr) { 4285 u64 end; 4286 4287 /* check if minimum agaw is sufficient for mapped address */ 4288 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1; 4289 if (end < max_addr) { 4290 pr_err("%s: iommu width (%d) is not " 4291 "sufficient for the mapped address (%llx)\n", 4292 __func__, dmar_domain->gaw, max_addr); 4293 return -EFAULT; 4294 } 4295 dmar_domain->max_addr = max_addr; 4296 } 4297 /* Round up size to next multiple of PAGE_SIZE, if it and 4298 the low bits of hpa would take us onto the next page */ 4299 size = aligned_nrpages(hpa, size); 4300 return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT, 4301 hpa >> VTD_PAGE_SHIFT, size, prot); 4302 } 4303 4304 static int intel_iommu_map_pages(struct iommu_domain *domain, 4305 unsigned long iova, phys_addr_t paddr, 4306 size_t pgsize, size_t pgcount, 4307 int prot, gfp_t gfp, size_t *mapped) 4308 { 4309 unsigned long pgshift = __ffs(pgsize); 4310 size_t size = pgcount << pgshift; 4311 int ret; 4312 4313 if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G) 4314 return -EINVAL; 4315 4316 if (!IS_ALIGNED(iova | paddr, pgsize)) 4317 return -EINVAL; 4318 4319 ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp); 4320 if (!ret && mapped) 4321 *mapped = size; 4322 4323 return ret; 4324 } 4325 4326 static size_t intel_iommu_unmap(struct iommu_domain *domain, 4327 unsigned long iova, size_t size, 4328 struct iommu_iotlb_gather *gather) 4329 { 4330 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4331 unsigned long start_pfn, last_pfn; 4332 int level = 0; 4333 4334 /* Cope with horrid API which requires us to unmap more than the 4335 size argument if it happens to be a large-page mapping. 
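   For example, if the IOVA is covered by a 2MiB superpage, pfn_to_dma_pte()
   returns level == 2 and level_to_offset_bits(2) == 9, so the size below is
   rounded up to VTD_PAGE_SIZE << 9 (2MiB) even if the caller asked to unmap
   only 4KiB.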
*/ 4336 BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level)); 4337 4338 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level)) 4339 size = VTD_PAGE_SIZE << level_to_offset_bits(level); 4340 4341 start_pfn = iova >> VTD_PAGE_SHIFT; 4342 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT; 4343 4344 domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist); 4345 4346 if (dmar_domain->max_addr == iova + size) 4347 dmar_domain->max_addr = iova; 4348 4349 iommu_iotlb_gather_add_page(domain, gather, iova, size); 4350 4351 return size; 4352 } 4353 4354 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain, 4355 unsigned long iova, 4356 size_t pgsize, size_t pgcount, 4357 struct iommu_iotlb_gather *gather) 4358 { 4359 unsigned long pgshift = __ffs(pgsize); 4360 size_t size = pgcount << pgshift; 4361 4362 return intel_iommu_unmap(domain, iova, size, gather); 4363 } 4364 4365 static void intel_iommu_tlb_sync(struct iommu_domain *domain, 4366 struct iommu_iotlb_gather *gather) 4367 { 4368 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4369 unsigned long iova_pfn = IOVA_PFN(gather->start); 4370 size_t size = gather->end - gather->start; 4371 struct iommu_domain_info *info; 4372 unsigned long start_pfn; 4373 unsigned long nrpages; 4374 unsigned long i; 4375 4376 nrpages = aligned_nrpages(gather->start, size); 4377 start_pfn = mm_to_dma_pfn(iova_pfn); 4378 4379 xa_for_each(&dmar_domain->iommu_array, i, info) 4380 iommu_flush_iotlb_psi(info->iommu, dmar_domain, 4381 start_pfn, nrpages, 4382 list_empty(&gather->freelist), 0); 4383 4384 put_pages_list(&gather->freelist); 4385 } 4386 4387 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain, 4388 dma_addr_t iova) 4389 { 4390 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4391 struct dma_pte *pte; 4392 int level = 0; 4393 u64 phys = 0; 4394 4395 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level); 4396 if (pte && dma_pte_present(pte)) 4397 phys = dma_pte_addr(pte) + 4398 (iova & (BIT_MASK(level_to_offset_bits(level) + 4399 VTD_PAGE_SHIFT) - 1)); 4400 4401 return phys; 4402 } 4403 4404 static bool domain_support_force_snooping(struct dmar_domain *domain) 4405 { 4406 struct device_domain_info *info; 4407 bool support = true; 4408 4409 assert_spin_locked(&domain->lock); 4410 list_for_each_entry(info, &domain->devices, link) { 4411 if (!ecap_sc_support(info->iommu->ecap)) { 4412 support = false; 4413 break; 4414 } 4415 } 4416 4417 return support; 4418 } 4419 4420 static void domain_set_force_snooping(struct dmar_domain *domain) 4421 { 4422 struct device_domain_info *info; 4423 4424 assert_spin_locked(&domain->lock); 4425 /* 4426 * Second level page table supports per-PTE snoop control. The 4427 * iommu_map() interface will handle this by setting SNP bit. 
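 *
 * First-level (scalable-mode) page tables carry no equivalent per-PTE
 * snoop bit, so for those domains snooping is enforced per device below
 * via the page-snoop control in each PASID-table entry instead.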
4428 */ 4429 if (!domain->use_first_level) { 4430 domain->set_pte_snp = true; 4431 return; 4432 } 4433 4434 list_for_each_entry(info, &domain->devices, link) 4435 intel_pasid_setup_page_snoop_control(info->iommu, info->dev, 4436 PASID_RID2PASID); 4437 } 4438 4439 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain) 4440 { 4441 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4442 unsigned long flags; 4443 4444 if (dmar_domain->force_snooping) 4445 return true; 4446 4447 spin_lock_irqsave(&dmar_domain->lock, flags); 4448 if (!domain_support_force_snooping(dmar_domain)) { 4449 spin_unlock_irqrestore(&dmar_domain->lock, flags); 4450 return false; 4451 } 4452 4453 domain_set_force_snooping(dmar_domain); 4454 dmar_domain->force_snooping = true; 4455 spin_unlock_irqrestore(&dmar_domain->lock, flags); 4456 4457 return true; 4458 } 4459 4460 static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap) 4461 { 4462 struct device_domain_info *info = dev_iommu_priv_get(dev); 4463 4464 switch (cap) { 4465 case IOMMU_CAP_CACHE_COHERENCY: 4466 return true; 4467 case IOMMU_CAP_INTR_REMAP: 4468 return irq_remapping_enabled == 1; 4469 case IOMMU_CAP_PRE_BOOT_PROTECTION: 4470 return dmar_platform_optin(); 4471 case IOMMU_CAP_ENFORCE_CACHE_COHERENCY: 4472 return ecap_sc_support(info->iommu->ecap); 4473 default: 4474 return false; 4475 } 4476 } 4477 4478 static struct iommu_device *intel_iommu_probe_device(struct device *dev) 4479 { 4480 struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL; 4481 struct device_domain_info *info; 4482 struct intel_iommu *iommu; 4483 u8 bus, devfn; 4484 int ret; 4485 4486 iommu = device_to_iommu(dev, &bus, &devfn); 4487 if (!iommu || !iommu->iommu.ops) 4488 return ERR_PTR(-ENODEV); 4489 4490 info = kzalloc(sizeof(*info), GFP_KERNEL); 4491 if (!info) 4492 return ERR_PTR(-ENOMEM); 4493 4494 if (dev_is_real_dma_subdevice(dev)) { 4495 info->bus = pdev->bus->number; 4496 info->devfn = pdev->devfn; 4497 info->segment = pci_domain_nr(pdev->bus); 4498 } else { 4499 info->bus = bus; 4500 info->devfn = devfn; 4501 info->segment = iommu->segment; 4502 } 4503 4504 info->dev = dev; 4505 info->iommu = iommu; 4506 if (dev_is_pci(dev)) { 4507 if (ecap_dev_iotlb_support(iommu->ecap) && 4508 pci_ats_supported(pdev) && 4509 dmar_ats_supported(pdev, iommu)) { 4510 info->ats_supported = 1; 4511 info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev); 4512 } 4513 if (sm_supported(iommu)) { 4514 if (pasid_supported(iommu)) { 4515 int features = pci_pasid_features(pdev); 4516 4517 if (features >= 0) 4518 info->pasid_supported = features | 1; 4519 } 4520 4521 if (info->ats_supported && ecap_prs(iommu->ecap) && 4522 pci_pri_supported(pdev)) 4523 info->pri_supported = 1; 4524 } 4525 } 4526 4527 dev_iommu_priv_set(dev, info); 4528 4529 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) { 4530 ret = intel_pasid_alloc_table(dev); 4531 if (ret) { 4532 dev_err(dev, "PASID table allocation failed\n"); 4533 dev_iommu_priv_set(dev, NULL); 4534 kfree(info); 4535 return ERR_PTR(ret); 4536 } 4537 } 4538 4539 return &iommu->iommu; 4540 } 4541 4542 static void intel_iommu_release_device(struct device *dev) 4543 { 4544 struct device_domain_info *info = dev_iommu_priv_get(dev); 4545 4546 dmar_remove_one_dev_info(dev); 4547 intel_pasid_free_table(dev); 4548 dev_iommu_priv_set(dev, NULL); 4549 kfree(info); 4550 set_dma_ops(dev, NULL); 4551 } 4552 4553 static void intel_iommu_probe_finalize(struct device *dev) 4554 { 4555 set_dma_ops(dev, NULL); 4556 
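	/*
	 * Clearing any previously installed per-device DMA ops first lets
	 * iommu_setup_dma_ops() below choose the appropriate ops (IOMMU DMA
	 * ops or direct mapping) based on the device's default domain type.
	 */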
iommu_setup_dma_ops(dev, 0, U64_MAX); 4557 } 4558 4559 static void intel_iommu_get_resv_regions(struct device *device, 4560 struct list_head *head) 4561 { 4562 int prot = DMA_PTE_READ | DMA_PTE_WRITE; 4563 struct iommu_resv_region *reg; 4564 struct dmar_rmrr_unit *rmrr; 4565 struct device *i_dev; 4566 int i; 4567 4568 rcu_read_lock(); 4569 for_each_rmrr_units(rmrr) { 4570 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt, 4571 i, i_dev) { 4572 struct iommu_resv_region *resv; 4573 enum iommu_resv_type type; 4574 size_t length; 4575 4576 if (i_dev != device && 4577 !is_downstream_to_pci_bridge(device, i_dev)) 4578 continue; 4579 4580 length = rmrr->end_address - rmrr->base_address + 1; 4581 4582 type = device_rmrr_is_relaxable(device) ? 4583 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT; 4584 4585 resv = iommu_alloc_resv_region(rmrr->base_address, 4586 length, prot, type, 4587 GFP_ATOMIC); 4588 if (!resv) 4589 break; 4590 4591 list_add_tail(&resv->list, head); 4592 } 4593 } 4594 rcu_read_unlock(); 4595 4596 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA 4597 if (dev_is_pci(device)) { 4598 struct pci_dev *pdev = to_pci_dev(device); 4599 4600 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) { 4601 reg = iommu_alloc_resv_region(0, 1UL << 24, prot, 4602 IOMMU_RESV_DIRECT_RELAXABLE, 4603 GFP_KERNEL); 4604 if (reg) 4605 list_add_tail(&reg->list, head); 4606 } 4607 } 4608 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */ 4609 4610 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START, 4611 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1, 4612 0, IOMMU_RESV_MSI, GFP_KERNEL); 4613 if (!reg) 4614 return; 4615 list_add_tail(&reg->list, head); 4616 } 4617 4618 static struct iommu_group *intel_iommu_device_group(struct device *dev) 4619 { 4620 if (dev_is_pci(dev)) 4621 return pci_device_group(dev); 4622 return generic_device_group(dev); 4623 } 4624 4625 static int intel_iommu_enable_sva(struct device *dev) 4626 { 4627 struct device_domain_info *info = dev_iommu_priv_get(dev); 4628 struct intel_iommu *iommu; 4629 int ret; 4630 4631 if (!info || dmar_disabled) 4632 return -EINVAL; 4633 4634 iommu = info->iommu; 4635 if (!iommu) 4636 return -EINVAL; 4637 4638 if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE)) 4639 return -ENODEV; 4640 4641 if (!info->pasid_enabled || !info->pri_enabled || !info->ats_enabled) 4642 return -EINVAL; 4643 4644 ret = iopf_queue_add_device(iommu->iopf_queue, dev); 4645 if (!ret) 4646 ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev); 4647 4648 return ret; 4649 } 4650 4651 static int intel_iommu_disable_sva(struct device *dev) 4652 { 4653 struct device_domain_info *info = dev_iommu_priv_get(dev); 4654 struct intel_iommu *iommu = info->iommu; 4655 int ret; 4656 4657 ret = iommu_unregister_device_fault_handler(dev); 4658 if (!ret) 4659 ret = iopf_queue_remove_device(iommu->iopf_queue, dev); 4660 4661 return ret; 4662 } 4663 4664 static int intel_iommu_enable_iopf(struct device *dev) 4665 { 4666 struct device_domain_info *info = dev_iommu_priv_get(dev); 4667 4668 if (info && info->pri_supported) 4669 return 0; 4670 4671 return -ENODEV; 4672 } 4673 4674 static int 4675 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat) 4676 { 4677 switch (feat) { 4678 case IOMMU_DEV_FEAT_IOPF: 4679 return intel_iommu_enable_iopf(dev); 4680 4681 case IOMMU_DEV_FEAT_SVA: 4682 return intel_iommu_enable_sva(dev); 4683 4684 default: 4685 return -ENODEV; 4686 } 4687 } 4688 4689 static int 4690 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat) 4691 { 4692 switch
(feat) { 4693 case IOMMU_DEV_FEAT_IOPF: 4694 return 0; 4695 4696 case IOMMU_DEV_FEAT_SVA: 4697 return intel_iommu_disable_sva(dev); 4698 4699 default: 4700 return -ENODEV; 4701 } 4702 } 4703 4704 static bool intel_iommu_is_attach_deferred(struct device *dev) 4705 { 4706 struct device_domain_info *info = dev_iommu_priv_get(dev); 4707 4708 return translation_pre_enabled(info->iommu) && !info->domain; 4709 } 4710 4711 /* 4712 * Check that the device does not live on an external facing PCI port that is 4713 * marked as untrusted. Such devices should not be able to apply quirks and 4714 * thus not be able to bypass the IOMMU restrictions. 4715 */ 4716 static bool risky_device(struct pci_dev *pdev) 4717 { 4718 if (pdev->untrusted) { 4719 pci_info(pdev, 4720 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n", 4721 pdev->vendor, pdev->device); 4722 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n"); 4723 return true; 4724 } 4725 return false; 4726 } 4727 4728 static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain, 4729 unsigned long iova, size_t size) 4730 { 4731 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4732 unsigned long pages = aligned_nrpages(iova, size); 4733 unsigned long pfn = iova >> VTD_PAGE_SHIFT; 4734 struct iommu_domain_info *info; 4735 unsigned long i; 4736 4737 xa_for_each(&dmar_domain->iommu_array, i, info) 4738 __mapping_notify_one(info->iommu, dmar_domain, pfn, pages); 4739 } 4740 4741 static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid) 4742 { 4743 struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL); 4744 struct iommu_domain *domain; 4745 4746 /* Domain type specific cleanup: */ 4747 domain = iommu_get_domain_for_dev_pasid(dev, pasid, 0); 4748 if (domain) { 4749 switch (domain->type) { 4750 case IOMMU_DOMAIN_SVA: 4751 intel_svm_remove_dev_pasid(dev, pasid); 4752 break; 4753 default: 4754 /* should never reach here */ 4755 WARN_ON(1); 4756 break; 4757 } 4758 } 4759 4760 intel_pasid_tear_down_entry(iommu, dev, pasid, false); 4761 } 4762 4763 const struct iommu_ops intel_iommu_ops = { 4764 .capable = intel_iommu_capable, 4765 .domain_alloc = intel_iommu_domain_alloc, 4766 .probe_device = intel_iommu_probe_device, 4767 .probe_finalize = intel_iommu_probe_finalize, 4768 .release_device = intel_iommu_release_device, 4769 .get_resv_regions = intel_iommu_get_resv_regions, 4770 .device_group = intel_iommu_device_group, 4771 .dev_enable_feat = intel_iommu_dev_enable_feat, 4772 .dev_disable_feat = intel_iommu_dev_disable_feat, 4773 .is_attach_deferred = intel_iommu_is_attach_deferred, 4774 .def_domain_type = device_def_domain_type, 4775 .remove_dev_pasid = intel_iommu_remove_dev_pasid, 4776 .pgsize_bitmap = SZ_4K, 4777 #ifdef CONFIG_INTEL_IOMMU_SVM 4778 .page_response = intel_svm_page_response, 4779 #endif 4780 .default_domain_ops = &(const struct iommu_domain_ops) { 4781 .attach_dev = intel_iommu_attach_device, 4782 .map_pages = intel_iommu_map_pages, 4783 .unmap_pages = intel_iommu_unmap_pages, 4784 .iotlb_sync_map = intel_iommu_iotlb_sync_map, 4785 .flush_iotlb_all = intel_flush_iotlb_all, 4786 .iotlb_sync = intel_iommu_tlb_sync, 4787 .iova_to_phys = intel_iommu_iova_to_phys, 4788 .free = intel_iommu_domain_free, 4789 .enforce_cache_coherency = intel_iommu_enforce_cache_coherency, 4790 } 4791 }; 4792 4793 static void quirk_iommu_igfx(struct pci_dev *dev) 4794 { 4795 if (risky_device(dev)) 4796 return; 4797 4798 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n"); 4799 
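	/*
	 * dmar_map_gfx is checked later in init_no_remapping_devices(): with
	 * it cleared, a DMAR unit that covers only graphics devices is
	 * ignored, leaving the integrated GPU untranslated instead of relying
	 * on the broken gfx DMAR support.
	 */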
dmar_map_gfx = 0; 4800 } 4801 4802 /* G4x/GM45 integrated gfx dmar support is totally busted. */ 4803 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx); 4804 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx); 4805 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx); 4806 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx); 4807 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx); 4808 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx); 4809 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx); 4810 4811 /* Broadwell igfx malfunctions with dmar */ 4812 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx); 4813 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx); 4814 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx); 4815 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx); 4816 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx); 4817 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx); 4818 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx); 4819 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx); 4820 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx); 4821 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx); 4822 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx); 4823 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx); 4824 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx); 4825 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx); 4826 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx); 4827 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx); 4828 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx); 4829 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx); 4830 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx); 4831 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx); 4832 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx); 4833 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx); 4834 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx); 4835 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx); 4836 4837 static void quirk_iommu_rwbf(struct pci_dev *dev) 4838 { 4839 if (risky_device(dev)) 4840 return; 4841 4842 /* 4843 * Mobile 4 Series Chipset neglects to set RWBF capability, 4844 * but needs it. Same seems to hold for the desktop versions. 
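 *
 * Setting rwbf_quirk below makes the driver behave as if the capability
 * register advertised RWBF, i.e. the write buffer is explicitly flushed
 * after translation-structure updates on these chipsets.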
4845 */ 4846 pci_info(dev, "Forcing write-buffer flush capability\n"); 4847 rwbf_quirk = 1; 4848 } 4849 4850 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf); 4851 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf); 4852 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf); 4853 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf); 4854 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf); 4855 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf); 4856 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf); 4857 4858 #define GGC 0x52 4859 #define GGC_MEMORY_SIZE_MASK (0xf << 8) 4860 #define GGC_MEMORY_SIZE_NONE (0x0 << 8) 4861 #define GGC_MEMORY_SIZE_1M (0x1 << 8) 4862 #define GGC_MEMORY_SIZE_2M (0x3 << 8) 4863 #define GGC_MEMORY_VT_ENABLED (0x8 << 8) 4864 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8) 4865 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8) 4866 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8) 4867 4868 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev) 4869 { 4870 unsigned short ggc; 4871 4872 if (risky_device(dev)) 4873 return; 4874 4875 if (pci_read_config_word(dev, GGC, &ggc)) 4876 return; 4877 4878 if (!(ggc & GGC_MEMORY_VT_ENABLED)) { 4879 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n"); 4880 dmar_map_gfx = 0; 4881 } else if (dmar_map_gfx) { 4882 /* we have to ensure the gfx device is idle before we flush */ 4883 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n"); 4884 iommu_set_dma_strict(); 4885 } 4886 } 4887 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt); 4888 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt); 4889 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt); 4890 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt); 4891 4892 static void quirk_igfx_skip_te_disable(struct pci_dev *dev) 4893 { 4894 unsigned short ver; 4895 4896 if (!IS_GFX_DEVICE(dev)) 4897 return; 4898 4899 ver = (dev->device >> 8) & 0xff; 4900 if (ver != 0x45 && ver != 0x46 && ver != 0x4c && 4901 ver != 0x4e && ver != 0x8a && ver != 0x98 && 4902 ver != 0x9a && ver != 0xa7) 4903 return; 4904 4905 if (risky_device(dev)) 4906 return; 4907 4908 pci_info(dev, "Skip IOMMU disabling for graphics\n"); 4909 iommu_skip_te_disable = 1; 4910 } 4911 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable); 4912 4913 /* On Tylersburg chipsets, some BIOSes have been known to enable the 4914 ISOCH DMAR unit for the Azalia sound device, but not give it any 4915 TLB entries, which causes it to deadlock. Check for that. We do 4916 this in a function called from init_dmars(), instead of in a PCI 4917 quirk, because we don't want to print the obnoxious "BIOS broken" 4918 message if VT-d is actually disabled. 4919 */ 4920 static void __init check_tylersburg_isoch(void) 4921 { 4922 struct pci_dev *pdev; 4923 uint32_t vtisochctrl; 4924 4925 /* If there's no Azalia in the system anyway, forget it. */ 4926 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL); 4927 if (!pdev) 4928 return; 4929 4930 if (risky_device(pdev)) { 4931 pci_dev_put(pdev); 4932 return; 4933 } 4934 4935 pci_dev_put(pdev); 4936 4937 /* System Management Registers. Might be hidden, in which case 4938 we can't do the sanity check. But that's OK, because the 4939 known-broken BIOSes _don't_ actually hide it, so far. 
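   The 8086:342e device probed below carries the isochronous-DMA control
   register (read at config offset 0x188 further down), which tells us how
   many TLB entries the BIOS granted to the isoch DMAR unit.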
*/ 4940 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL); 4941 if (!pdev) 4942 return; 4943 4944 if (risky_device(pdev)) { 4945 pci_dev_put(pdev); 4946 return; 4947 } 4948 4949 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) { 4950 pci_dev_put(pdev); 4951 return; 4952 } 4953 4954 pci_dev_put(pdev); 4955 4956 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */ 4957 if (vtisochctrl & 1) 4958 return; 4959 4960 /* Drop all bits other than the number of TLB entries */ 4961 vtisochctrl &= 0x1c; 4962 4963 /* If we have the recommended number of TLB entries (16), fine. */ 4964 if (vtisochctrl == 0x10) 4965 return; 4966 4967 /* Zero TLB entries? You get to ride the short bus to school. */ 4968 if (!vtisochctrl) { 4969 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n" 4970 "BIOS vendor: %s; Ver: %s; Product Version: %s\n", 4971 dmi_get_system_info(DMI_BIOS_VENDOR), 4972 dmi_get_system_info(DMI_BIOS_VERSION), 4973 dmi_get_system_info(DMI_PRODUCT_VERSION)); 4974 iommu_identity_mapping |= IDENTMAP_AZALIA; 4975 return; 4976 } 4977 4978 pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n", 4979 vtisochctrl); 4980 } 4981 4982 /* 4983 * Here we deal with a device TLB defect where the device may inadvertently issue an ATS 4984 * invalidation completion before posted writes that were initiated with translated addresses 4985 * using translations in the invalidated address range have completed, violating 4986 * the invalidation completion ordering. 4987 * Therefore, any use case that cannot guarantee DMA is stopped before unmap is 4988 * vulnerable to this defect. In other words, any dTLB invalidation that is not initiated 4989 * under the control of the trusted/privileged host device driver must use this 4990 * quirk. 4991 * Device TLBs are invalidated under the following six conditions: 4992 * 1. Device driver does a DMA API unmap of an IOVA 4993 * 2. Device driver unbinds a PASID from a process, sva_unbind_device() 4994 * 3. A PASID is torn down after the PASID cache is flushed, e.g. process 4995 * exit_mmap() due to a crash 4996 * 4. Under SVA usage, called from mmu_notifier.invalidate_range() when 4997 * the VM has to free pages that were unmapped 4998 * 5. Userspace driver unmaps a DMA buffer 4999 * 6. Cache invalidation in vSVA usage (upcoming) 5000 * 5001 * For #1 and #2, device drivers are responsible for stopping DMA traffic 5002 * before unmap/unbind. For #3, the iommu driver is called via the mmu_notifier and 5003 * invalidates the TLB the same way as a normal user unmap, which will use this quirk. 5004 * The dTLB invalidation after a PASID cache flush does not need this quirk. 5005 * 5006 * As a reminder, #6 will *NEED* this quirk once nested translation is enabled. 5007 */ 5008 void quirk_extra_dev_tlb_flush(struct device_domain_info *info, 5009 unsigned long address, unsigned long mask, 5010 u32 pasid, u16 qdep) 5011 { 5012 u16 sid; 5013 5014 if (likely(!info->dtlb_extra_inval)) 5015 return; 5016 5017 sid = PCI_DEVID(info->bus, info->devfn); 5018 if (pasid == PASID_RID2PASID) { 5019 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid, 5020 qdep, address, mask); 5021 } else { 5022 qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid, 5023 pasid, qdep, address, mask); 5024 } 5025 } 5026
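/*
 * Illustrative sketch only (kept under #if 0, not built): one way an
 * invalidation path might pair a regular device-IOTLB flush with the quirk
 * above. The helper name is hypothetical; the calls mirror the
 * qi_flush_dev_iotlb() usage and quirk_extra_dev_tlb_flush() above, and the
 * device_domain_info fields used (bus, devfn, pfsid, ats_qdep, iommu) are
 * assumed to be the ones declared in iommu.h.
 */
#if 0
static void example_flush_dev_iotlb(struct device_domain_info *info,
				    u64 addr, unsigned int mask)
{
	u16 sid = PCI_DEVID(info->bus, info->devfn);

	/* Regular device-IOTLB invalidation for the range... */
	qi_flush_dev_iotlb(info->iommu, sid, info->pfsid, info->ats_qdep,
			   addr, mask);

	/* ...plus the extra flush on hardware that needs the quirk. */
	quirk_extra_dev_tlb_flush(info, addr, mask, PASID_RID2PASID,
				  info->ats_qdep);
}
#endif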