1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright © 2006-2014 Intel Corporation. 4 * 5 * Authors: David Woodhouse <dwmw2@infradead.org>, 6 * Ashok Raj <ashok.raj@intel.com>, 7 * Shaohua Li <shaohua.li@intel.com>, 8 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>, 9 * Fenghua Yu <fenghua.yu@intel.com> 10 * Joerg Roedel <jroedel@suse.de> 11 */ 12 13 #define pr_fmt(fmt) "DMAR: " fmt 14 #define dev_fmt(fmt) pr_fmt(fmt) 15 16 #include <linux/crash_dump.h> 17 #include <linux/dma-direct.h> 18 #include <linux/dmi.h> 19 #include <linux/memory.h> 20 #include <linux/pci.h> 21 #include <linux/pci-ats.h> 22 #include <linux/spinlock.h> 23 #include <linux/syscore_ops.h> 24 #include <linux/tboot.h> 25 26 #include "iommu.h" 27 #include "../dma-iommu.h" 28 #include "../irq_remapping.h" 29 #include "../iommu-sva.h" 30 #include "pasid.h" 31 #include "cap_audit.h" 32 #include "perfmon.h" 33 34 #define ROOT_SIZE VTD_PAGE_SIZE 35 #define CONTEXT_SIZE VTD_PAGE_SIZE 36 37 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY) 38 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB) 39 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) 40 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e) 41 42 #define IOAPIC_RANGE_START (0xfee00000) 43 #define IOAPIC_RANGE_END (0xfeefffff) 44 #define IOVA_START_ADDR (0x1000) 45 46 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57 47 48 #define MAX_AGAW_WIDTH 64 49 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT) 50 51 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1) 52 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1) 53 54 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR 55 to match. That way, we can use 'unsigned long' for PFNs with impunity. */ 56 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \ 57 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1)) 58 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT) 59 60 /* IO virtual address start page frame number */ 61 #define IOVA_START_PFN (1) 62 63 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT) 64 65 /* page table handling */ 66 #define LEVEL_STRIDE (9) 67 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1) 68 69 static inline int agaw_to_level(int agaw) 70 { 71 return agaw + 2; 72 } 73 74 static inline int agaw_to_width(int agaw) 75 { 76 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH); 77 } 78 79 static inline int width_to_agaw(int width) 80 { 81 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE); 82 } 83 84 static inline unsigned int level_to_offset_bits(int level) 85 { 86 return (level - 1) * LEVEL_STRIDE; 87 } 88 89 static inline int pfn_level_offset(u64 pfn, int level) 90 { 91 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK; 92 } 93 94 static inline u64 level_mask(int level) 95 { 96 return -1ULL << level_to_offset_bits(level); 97 } 98 99 static inline u64 level_size(int level) 100 { 101 return 1ULL << level_to_offset_bits(level); 102 } 103 104 static inline u64 align_to_level(u64 pfn, int level) 105 { 106 return (pfn + level_size(level) - 1) & level_mask(level); 107 } 108 109 static inline unsigned long lvl_to_nr_pages(unsigned int lvl) 110 { 111 return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH); 112 } 113 114 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things 115 are never going to work. 
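   (In the common x86 configuration PAGE_SHIFT == VTD_PAGE_SHIFT == 12, so
   the mm/dma PFN conversions below are shifts by zero.)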
 */
static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
{
	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
}
static inline unsigned long page_to_dma_pfn(struct page *pg)
{
	return mm_to_dma_pfn(page_to_pfn(pg));
}
static inline unsigned long virt_to_dma_pfn(void *p)
{
	return page_to_dma_pfn(virt_to_page(p));
}

static void __init check_tylersburg_isoch(void);
static int rwbf_quirk;

/*
 * Set to 1 to panic the kernel if VT-d can't be enabled successfully
 * (used when the kernel is launched with TXT).
 */
static int force_on = 0;
static int intel_iommu_tboot_noforce;
static int no_platform_optin;

#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))

/*
 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
 * if marked present.
 */
static phys_addr_t root_entry_lctp(struct root_entry *re)
{
	if (!(re->lo & 1))
		return 0;

	return re->lo & VTD_PAGE_MASK;
}

/*
 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
 * if marked present.
 */
static phys_addr_t root_entry_uctp(struct root_entry *re)
{
	if (!(re->hi & 1))
		return 0;

	return re->hi & VTD_PAGE_MASK;
}

static inline void context_set_present(struct context_entry *context)
{
	context->lo |= 1;
}

static inline void context_set_fault_enable(struct context_entry *context)
{
	context->lo &= (((u64)-1) << 2) | 1;
}

static inline void context_set_translation_type(struct context_entry *context,
						unsigned long value)
{
	context->lo &= (((u64)-1) << 4) | 3;
	context->lo |= (value & 3) << 2;
}

static inline void context_set_address_root(struct context_entry *context,
					    unsigned long value)
{
	context->lo &= ~VTD_PAGE_MASK;
	context->lo |= value & VTD_PAGE_MASK;
}

static inline void context_set_address_width(struct context_entry *context,
					     unsigned long value)
{
	context->hi |= value & 7;
}

static inline void context_set_domain_id(struct context_entry *context,
					 unsigned long value)
{
	context->hi |= (value & ((1 << 16) - 1)) << 8;
}

static inline void context_set_pasid(struct context_entry *context)
{
	context->lo |= CONTEXT_PASIDE;
}

static inline int context_domain_id(struct context_entry *c)
{
	return((c->hi >> 8) & 0xffff);
}

static inline void context_clear_entry(struct context_entry *context)
{
	context->lo = 0;
	context->hi = 0;
}

static inline bool context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
{
	if (!iommu->copied_tables)
		return false;

	return test_bit(((long)bus << 8) | devfn, iommu->copied_tables);
}

static inline void
set_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
{
	set_bit(((long)bus << 8) | devfn, iommu->copied_tables);
}

static inline void
clear_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
{
	clear_bit(((long)bus << 8) | devfn, iommu->copied_tables);
}

/*
 * This domain is a statically identity mapping domain.
 *	1. This domain creates a static 1:1 mapping to all usable memory.
 *	2. It maps to each iommu if successful.
 *	3. Each iommu maps to this domain if successful.
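 *
 * (si_domain backs the IOMMU_DOMAIN_IDENTITY domain type; see
 *  domain_type_is_si() below.)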
243 */ 244 static struct dmar_domain *si_domain; 245 static int hw_pass_through = 1; 246 247 struct dmar_rmrr_unit { 248 struct list_head list; /* list of rmrr units */ 249 struct acpi_dmar_header *hdr; /* ACPI header */ 250 u64 base_address; /* reserved base address*/ 251 u64 end_address; /* reserved end address */ 252 struct dmar_dev_scope *devices; /* target devices */ 253 int devices_cnt; /* target device count */ 254 }; 255 256 struct dmar_atsr_unit { 257 struct list_head list; /* list of ATSR units */ 258 struct acpi_dmar_header *hdr; /* ACPI header */ 259 struct dmar_dev_scope *devices; /* target devices */ 260 int devices_cnt; /* target device count */ 261 u8 include_all:1; /* include all ports */ 262 }; 263 264 struct dmar_satc_unit { 265 struct list_head list; /* list of SATC units */ 266 struct acpi_dmar_header *hdr; /* ACPI header */ 267 struct dmar_dev_scope *devices; /* target devices */ 268 struct intel_iommu *iommu; /* the corresponding iommu */ 269 int devices_cnt; /* target device count */ 270 u8 atc_required:1; /* ATS is required */ 271 }; 272 273 static LIST_HEAD(dmar_atsr_units); 274 static LIST_HEAD(dmar_rmrr_units); 275 static LIST_HEAD(dmar_satc_units); 276 277 #define for_each_rmrr_units(rmrr) \ 278 list_for_each_entry(rmrr, &dmar_rmrr_units, list) 279 280 static void device_block_translation(struct device *dev); 281 static void intel_iommu_domain_free(struct iommu_domain *domain); 282 283 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON); 284 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON); 285 286 int intel_iommu_enabled = 0; 287 EXPORT_SYMBOL_GPL(intel_iommu_enabled); 288 289 static int dmar_map_gfx = 1; 290 static int intel_iommu_superpage = 1; 291 static int iommu_identity_mapping; 292 static int iommu_skip_te_disable; 293 294 #define IDENTMAP_GFX 2 295 #define IDENTMAP_AZALIA 4 296 297 const struct iommu_ops intel_iommu_ops; 298 299 static bool translation_pre_enabled(struct intel_iommu *iommu) 300 { 301 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED); 302 } 303 304 static void clear_translation_pre_enabled(struct intel_iommu *iommu) 305 { 306 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED; 307 } 308 309 static void init_translation_status(struct intel_iommu *iommu) 310 { 311 u32 gsts; 312 313 gsts = readl(iommu->reg + DMAR_GSTS_REG); 314 if (gsts & DMA_GSTS_TES) 315 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED; 316 } 317 318 static int __init intel_iommu_setup(char *str) 319 { 320 if (!str) 321 return -EINVAL; 322 323 while (*str) { 324 if (!strncmp(str, "on", 2)) { 325 dmar_disabled = 0; 326 pr_info("IOMMU enabled\n"); 327 } else if (!strncmp(str, "off", 3)) { 328 dmar_disabled = 1; 329 no_platform_optin = 1; 330 pr_info("IOMMU disabled\n"); 331 } else if (!strncmp(str, "igfx_off", 8)) { 332 dmar_map_gfx = 0; 333 pr_info("Disable GFX device mapping\n"); 334 } else if (!strncmp(str, "forcedac", 8)) { 335 pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n"); 336 iommu_dma_forcedac = true; 337 } else if (!strncmp(str, "strict", 6)) { 338 pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n"); 339 iommu_set_dma_strict(); 340 } else if (!strncmp(str, "sp_off", 6)) { 341 pr_info("Disable supported super page\n"); 342 intel_iommu_superpage = 0; 343 } else if (!strncmp(str, "sm_on", 5)) { 344 pr_info("Enable scalable mode if hardware supports\n"); 345 intel_iommu_sm = 1; 346 } else if (!strncmp(str, "sm_off", 6)) { 347 pr_info("Scalable mode is disallowed\n"); 348 intel_iommu_sm = 0; 349 } 
else if (!strncmp(str, "tboot_noforce", 13)) { 350 pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n"); 351 intel_iommu_tboot_noforce = 1; 352 } else { 353 pr_notice("Unknown option - '%s'\n", str); 354 } 355 356 str += strcspn(str, ","); 357 while (*str == ',') 358 str++; 359 } 360 361 return 1; 362 } 363 __setup("intel_iommu=", intel_iommu_setup); 364 365 void *alloc_pgtable_page(int node, gfp_t gfp) 366 { 367 struct page *page; 368 void *vaddr = NULL; 369 370 page = alloc_pages_node(node, gfp | __GFP_ZERO, 0); 371 if (page) 372 vaddr = page_address(page); 373 return vaddr; 374 } 375 376 void free_pgtable_page(void *vaddr) 377 { 378 free_page((unsigned long)vaddr); 379 } 380 381 static inline int domain_type_is_si(struct dmar_domain *domain) 382 { 383 return domain->domain.type == IOMMU_DOMAIN_IDENTITY; 384 } 385 386 static inline int domain_pfn_supported(struct dmar_domain *domain, 387 unsigned long pfn) 388 { 389 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT; 390 391 return !(addr_width < BITS_PER_LONG && pfn >> addr_width); 392 } 393 394 /* 395 * Calculate the Supported Adjusted Guest Address Widths of an IOMMU. 396 * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of 397 * the returned SAGAW. 398 */ 399 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu) 400 { 401 unsigned long fl_sagaw, sl_sagaw; 402 403 fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0); 404 sl_sagaw = cap_sagaw(iommu->cap); 405 406 /* Second level only. */ 407 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) 408 return sl_sagaw; 409 410 /* First level only. */ 411 if (!ecap_slts(iommu->ecap)) 412 return fl_sagaw; 413 414 return fl_sagaw & sl_sagaw; 415 } 416 417 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw) 418 { 419 unsigned long sagaw; 420 int agaw; 421 422 sagaw = __iommu_calculate_sagaw(iommu); 423 for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) { 424 if (test_bit(agaw, &sagaw)) 425 break; 426 } 427 428 return agaw; 429 } 430 431 /* 432 * Calculate max SAGAW for each iommu. 433 */ 434 int iommu_calculate_max_sagaw(struct intel_iommu *iommu) 435 { 436 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH); 437 } 438 439 /* 440 * calculate agaw for each iommu. 441 * "SAGAW" may be different across iommus, use a default agaw, and 442 * get a supported less agaw for iommus that don't support the default agaw. 443 */ 444 int iommu_calculate_agaw(struct intel_iommu *iommu) 445 { 446 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH); 447 } 448 449 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu) 450 { 451 return sm_supported(iommu) ? 
452 ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap); 453 } 454 455 static void domain_update_iommu_coherency(struct dmar_domain *domain) 456 { 457 struct iommu_domain_info *info; 458 struct dmar_drhd_unit *drhd; 459 struct intel_iommu *iommu; 460 bool found = false; 461 unsigned long i; 462 463 domain->iommu_coherency = true; 464 xa_for_each(&domain->iommu_array, i, info) { 465 found = true; 466 if (!iommu_paging_structure_coherency(info->iommu)) { 467 domain->iommu_coherency = false; 468 break; 469 } 470 } 471 if (found) 472 return; 473 474 /* No hardware attached; use lowest common denominator */ 475 rcu_read_lock(); 476 for_each_active_iommu(iommu, drhd) { 477 if (!iommu_paging_structure_coherency(iommu)) { 478 domain->iommu_coherency = false; 479 break; 480 } 481 } 482 rcu_read_unlock(); 483 } 484 485 static int domain_update_iommu_superpage(struct dmar_domain *domain, 486 struct intel_iommu *skip) 487 { 488 struct dmar_drhd_unit *drhd; 489 struct intel_iommu *iommu; 490 int mask = 0x3; 491 492 if (!intel_iommu_superpage) 493 return 0; 494 495 /* set iommu_superpage to the smallest common denominator */ 496 rcu_read_lock(); 497 for_each_active_iommu(iommu, drhd) { 498 if (iommu != skip) { 499 if (domain && domain->use_first_level) { 500 if (!cap_fl1gp_support(iommu->cap)) 501 mask = 0x1; 502 } else { 503 mask &= cap_super_page_val(iommu->cap); 504 } 505 506 if (!mask) 507 break; 508 } 509 } 510 rcu_read_unlock(); 511 512 return fls(mask); 513 } 514 515 static int domain_update_device_node(struct dmar_domain *domain) 516 { 517 struct device_domain_info *info; 518 int nid = NUMA_NO_NODE; 519 unsigned long flags; 520 521 spin_lock_irqsave(&domain->lock, flags); 522 list_for_each_entry(info, &domain->devices, link) { 523 /* 524 * There could possibly be multiple device numa nodes as devices 525 * within the same domain may sit behind different IOMMUs. There 526 * isn't perfect answer in such situation, so we select first 527 * come first served policy. 528 */ 529 nid = dev_to_node(info->dev); 530 if (nid != NUMA_NO_NODE) 531 break; 532 } 533 spin_unlock_irqrestore(&domain->lock, flags); 534 535 return nid; 536 } 537 538 static void domain_update_iotlb(struct dmar_domain *domain); 539 540 /* Return the super pagesize bitmap if supported. */ 541 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain) 542 { 543 unsigned long bitmap = 0; 544 545 /* 546 * 1-level super page supports page size of 2MiB, 2-level super page 547 * supports page size of both 2MiB and 1GiB. 548 */ 549 if (domain->iommu_superpage == 1) 550 bitmap |= SZ_2M; 551 else if (domain->iommu_superpage == 2) 552 bitmap |= SZ_2M | SZ_1G; 553 554 return bitmap; 555 } 556 557 /* Some capabilities may be different across iommus */ 558 static void domain_update_iommu_cap(struct dmar_domain *domain) 559 { 560 domain_update_iommu_coherency(domain); 561 domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL); 562 563 /* 564 * If RHSA is missing, we should default to the device numa domain 565 * as fall back. 566 */ 567 if (domain->nid == NUMA_NO_NODE) 568 domain->nid = domain_update_device_node(domain); 569 570 /* 571 * First-level translation restricts the input-address to a 572 * canonical address (i.e., address bits 63:N have the same 573 * value as address bit [N-1], where N is 48-bits with 4-level 574 * paging and 57-bits with 5-level paging). Hence, skip bit 575 * [N-1]. 
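	 * For example, with 4-level first-level paging (gaw == 48) the
	 * aperture below ends at __DOMAIN_MAX_ADDR(47), the top of the
	 * lower canonical half.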
576 */ 577 if (domain->use_first_level) 578 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1); 579 else 580 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw); 581 582 domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain); 583 domain_update_iotlb(domain); 584 } 585 586 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus, 587 u8 devfn, int alloc) 588 { 589 struct root_entry *root = &iommu->root_entry[bus]; 590 struct context_entry *context; 591 u64 *entry; 592 593 /* 594 * Except that the caller requested to allocate a new entry, 595 * returning a copied context entry makes no sense. 596 */ 597 if (!alloc && context_copied(iommu, bus, devfn)) 598 return NULL; 599 600 entry = &root->lo; 601 if (sm_supported(iommu)) { 602 if (devfn >= 0x80) { 603 devfn -= 0x80; 604 entry = &root->hi; 605 } 606 devfn *= 2; 607 } 608 if (*entry & 1) 609 context = phys_to_virt(*entry & VTD_PAGE_MASK); 610 else { 611 unsigned long phy_addr; 612 if (!alloc) 613 return NULL; 614 615 context = alloc_pgtable_page(iommu->node, GFP_ATOMIC); 616 if (!context) 617 return NULL; 618 619 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE); 620 phy_addr = virt_to_phys((void *)context); 621 *entry = phy_addr | 1; 622 __iommu_flush_cache(iommu, entry, sizeof(*entry)); 623 } 624 return &context[devfn]; 625 } 626 627 /** 628 * is_downstream_to_pci_bridge - test if a device belongs to the PCI 629 * sub-hierarchy of a candidate PCI-PCI bridge 630 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy 631 * @bridge: the candidate PCI-PCI bridge 632 * 633 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false. 634 */ 635 static bool 636 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge) 637 { 638 struct pci_dev *pdev, *pbridge; 639 640 if (!dev_is_pci(dev) || !dev_is_pci(bridge)) 641 return false; 642 643 pdev = to_pci_dev(dev); 644 pbridge = to_pci_dev(bridge); 645 646 if (pbridge->subordinate && 647 pbridge->subordinate->number <= pdev->bus->number && 648 pbridge->subordinate->busn_res.end >= pdev->bus->number) 649 return true; 650 651 return false; 652 } 653 654 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev) 655 { 656 struct dmar_drhd_unit *drhd; 657 u32 vtbar; 658 int rc; 659 660 /* We know that this device on this chipset has its own IOMMU. 661 * If we find it under a different IOMMU, then the BIOS is lying 662 * to us. Hope that the IOMMU for this device is actually 663 * disabled, and it needs no translation... 
664 */ 665 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar); 666 if (rc) { 667 /* "can't" happen */ 668 dev_info(&pdev->dev, "failed to run vt-d quirk\n"); 669 return false; 670 } 671 vtbar &= 0xffff0000; 672 673 /* we know that the this iommu should be at offset 0xa000 from vtbar */ 674 drhd = dmar_find_matched_drhd_unit(pdev); 675 if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) { 676 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"); 677 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); 678 return true; 679 } 680 681 return false; 682 } 683 684 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev) 685 { 686 if (!iommu || iommu->drhd->ignored) 687 return true; 688 689 if (dev_is_pci(dev)) { 690 struct pci_dev *pdev = to_pci_dev(dev); 691 692 if (pdev->vendor == PCI_VENDOR_ID_INTEL && 693 pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB && 694 quirk_ioat_snb_local_iommu(pdev)) 695 return true; 696 } 697 698 return false; 699 } 700 701 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn) 702 { 703 struct dmar_drhd_unit *drhd = NULL; 704 struct pci_dev *pdev = NULL; 705 struct intel_iommu *iommu; 706 struct device *tmp; 707 u16 segment = 0; 708 int i; 709 710 if (!dev) 711 return NULL; 712 713 if (dev_is_pci(dev)) { 714 struct pci_dev *pf_pdev; 715 716 pdev = pci_real_dma_dev(to_pci_dev(dev)); 717 718 /* VFs aren't listed in scope tables; we need to look up 719 * the PF instead to find the IOMMU. */ 720 pf_pdev = pci_physfn(pdev); 721 dev = &pf_pdev->dev; 722 segment = pci_domain_nr(pdev->bus); 723 } else if (has_acpi_companion(dev)) 724 dev = &ACPI_COMPANION(dev)->dev; 725 726 rcu_read_lock(); 727 for_each_iommu(iommu, drhd) { 728 if (pdev && segment != drhd->segment) 729 continue; 730 731 for_each_active_dev_scope(drhd->devices, 732 drhd->devices_cnt, i, tmp) { 733 if (tmp == dev) { 734 /* For a VF use its original BDF# not that of the PF 735 * which we used for the IOMMU lookup. Strictly speaking 736 * we could do this for all PCI devices; we only need to 737 * get the BDF# from the scope table for ACPI matches. 
*/ 738 if (pdev && pdev->is_virtfn) 739 goto got_pdev; 740 741 if (bus && devfn) { 742 *bus = drhd->devices[i].bus; 743 *devfn = drhd->devices[i].devfn; 744 } 745 goto out; 746 } 747 748 if (is_downstream_to_pci_bridge(dev, tmp)) 749 goto got_pdev; 750 } 751 752 if (pdev && drhd->include_all) { 753 got_pdev: 754 if (bus && devfn) { 755 *bus = pdev->bus->number; 756 *devfn = pdev->devfn; 757 } 758 goto out; 759 } 760 } 761 iommu = NULL; 762 out: 763 if (iommu_is_dummy(iommu, dev)) 764 iommu = NULL; 765 766 rcu_read_unlock(); 767 768 return iommu; 769 } 770 771 static void domain_flush_cache(struct dmar_domain *domain, 772 void *addr, int size) 773 { 774 if (!domain->iommu_coherency) 775 clflush_cache_range(addr, size); 776 } 777 778 static void free_context_table(struct intel_iommu *iommu) 779 { 780 struct context_entry *context; 781 int i; 782 783 if (!iommu->root_entry) 784 return; 785 786 for (i = 0; i < ROOT_ENTRY_NR; i++) { 787 context = iommu_context_addr(iommu, i, 0, 0); 788 if (context) 789 free_pgtable_page(context); 790 791 if (!sm_supported(iommu)) 792 continue; 793 794 context = iommu_context_addr(iommu, i, 0x80, 0); 795 if (context) 796 free_pgtable_page(context); 797 } 798 799 free_pgtable_page(iommu->root_entry); 800 iommu->root_entry = NULL; 801 } 802 803 #ifdef CONFIG_DMAR_DEBUG 804 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn, 805 u8 bus, u8 devfn, struct dma_pte *parent, int level) 806 { 807 struct dma_pte *pte; 808 int offset; 809 810 while (1) { 811 offset = pfn_level_offset(pfn, level); 812 pte = &parent[offset]; 813 if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) { 814 pr_info("PTE not present at level %d\n", level); 815 break; 816 } 817 818 pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val); 819 820 if (level == 1) 821 break; 822 823 parent = phys_to_virt(dma_pte_addr(pte)); 824 level--; 825 } 826 } 827 828 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id, 829 unsigned long long addr, u32 pasid) 830 { 831 struct pasid_dir_entry *dir, *pde; 832 struct pasid_entry *entries, *pte; 833 struct context_entry *ctx_entry; 834 struct root_entry *rt_entry; 835 int i, dir_index, index, level; 836 u8 devfn = source_id & 0xff; 837 u8 bus = source_id >> 8; 838 struct dma_pte *pgtable; 839 840 pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr); 841 842 /* root entry dump */ 843 rt_entry = &iommu->root_entry[bus]; 844 if (!rt_entry) { 845 pr_info("root table entry is not present\n"); 846 return; 847 } 848 849 if (sm_supported(iommu)) 850 pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n", 851 rt_entry->hi, rt_entry->lo); 852 else 853 pr_info("root entry: 0x%016llx", rt_entry->lo); 854 855 /* context entry dump */ 856 ctx_entry = iommu_context_addr(iommu, bus, devfn, 0); 857 if (!ctx_entry) { 858 pr_info("context table entry is not present\n"); 859 return; 860 } 861 862 pr_info("context entry: hi 0x%016llx, low 0x%016llx\n", 863 ctx_entry->hi, ctx_entry->lo); 864 865 /* legacy mode does not require PASID entries */ 866 if (!sm_supported(iommu)) { 867 level = agaw_to_level(ctx_entry->hi & 7); 868 pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK); 869 goto pgtable_walk; 870 } 871 872 /* get the pointer to pasid directory entry */ 873 dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK); 874 if (!dir) { 875 pr_info("pasid directory entry is not present\n"); 876 return; 877 } 878 /* For request-without-pasid, get the pasid from context entry */ 879 if (intel_iommu_sm && pasid == 
IOMMU_PASID_INVALID) 880 pasid = PASID_RID2PASID; 881 882 dir_index = pasid >> PASID_PDE_SHIFT; 883 pde = &dir[dir_index]; 884 pr_info("pasid dir entry: 0x%016llx\n", pde->val); 885 886 /* get the pointer to the pasid table entry */ 887 entries = get_pasid_table_from_pde(pde); 888 if (!entries) { 889 pr_info("pasid table entry is not present\n"); 890 return; 891 } 892 index = pasid & PASID_PTE_MASK; 893 pte = &entries[index]; 894 for (i = 0; i < ARRAY_SIZE(pte->val); i++) 895 pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]); 896 897 if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) { 898 level = pte->val[2] & BIT_ULL(2) ? 5 : 4; 899 pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK); 900 } else { 901 level = agaw_to_level((pte->val[0] >> 2) & 0x7); 902 pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK); 903 } 904 905 pgtable_walk: 906 pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level); 907 } 908 #endif 909 910 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain, 911 unsigned long pfn, int *target_level, 912 gfp_t gfp) 913 { 914 struct dma_pte *parent, *pte; 915 int level = agaw_to_level(domain->agaw); 916 int offset; 917 918 if (!domain_pfn_supported(domain, pfn)) 919 /* Address beyond IOMMU's addressing capabilities. */ 920 return NULL; 921 922 parent = domain->pgd; 923 924 while (1) { 925 void *tmp_page; 926 927 offset = pfn_level_offset(pfn, level); 928 pte = &parent[offset]; 929 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte))) 930 break; 931 if (level == *target_level) 932 break; 933 934 if (!dma_pte_present(pte)) { 935 uint64_t pteval; 936 937 tmp_page = alloc_pgtable_page(domain->nid, gfp); 938 939 if (!tmp_page) 940 return NULL; 941 942 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE); 943 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE; 944 if (domain->use_first_level) 945 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS; 946 947 if (cmpxchg64(&pte->val, 0ULL, pteval)) 948 /* Someone else set it while we were thinking; use theirs. 
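				 * (cmpxchg64() installs tmp_page only if the PTE
				 * was still clear, so losing the race just means
				 * freeing our spare page.)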
*/ 949 free_pgtable_page(tmp_page); 950 else 951 domain_flush_cache(domain, pte, sizeof(*pte)); 952 } 953 if (level == 1) 954 break; 955 956 parent = phys_to_virt(dma_pte_addr(pte)); 957 level--; 958 } 959 960 if (!*target_level) 961 *target_level = level; 962 963 return pte; 964 } 965 966 /* return address's pte at specific level */ 967 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain, 968 unsigned long pfn, 969 int level, int *large_page) 970 { 971 struct dma_pte *parent, *pte; 972 int total = agaw_to_level(domain->agaw); 973 int offset; 974 975 parent = domain->pgd; 976 while (level <= total) { 977 offset = pfn_level_offset(pfn, total); 978 pte = &parent[offset]; 979 if (level == total) 980 return pte; 981 982 if (!dma_pte_present(pte)) { 983 *large_page = total; 984 break; 985 } 986 987 if (dma_pte_superpage(pte)) { 988 *large_page = total; 989 return pte; 990 } 991 992 parent = phys_to_virt(dma_pte_addr(pte)); 993 total--; 994 } 995 return NULL; 996 } 997 998 /* clear last level pte, a tlb flush should be followed */ 999 static void dma_pte_clear_range(struct dmar_domain *domain, 1000 unsigned long start_pfn, 1001 unsigned long last_pfn) 1002 { 1003 unsigned int large_page; 1004 struct dma_pte *first_pte, *pte; 1005 1006 if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) || 1007 WARN_ON(start_pfn > last_pfn)) 1008 return; 1009 1010 /* we don't need lock here; nobody else touches the iova range */ 1011 do { 1012 large_page = 1; 1013 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page); 1014 if (!pte) { 1015 start_pfn = align_to_level(start_pfn + 1, large_page + 1); 1016 continue; 1017 } 1018 do { 1019 dma_clear_pte(pte); 1020 start_pfn += lvl_to_nr_pages(large_page); 1021 pte++; 1022 } while (start_pfn <= last_pfn && !first_pte_in_page(pte)); 1023 1024 domain_flush_cache(domain, first_pte, 1025 (void *)pte - (void *)first_pte); 1026 1027 } while (start_pfn && start_pfn <= last_pfn); 1028 } 1029 1030 static void dma_pte_free_level(struct dmar_domain *domain, int level, 1031 int retain_level, struct dma_pte *pte, 1032 unsigned long pfn, unsigned long start_pfn, 1033 unsigned long last_pfn) 1034 { 1035 pfn = max(start_pfn, pfn); 1036 pte = &pte[pfn_level_offset(pfn, level)]; 1037 1038 do { 1039 unsigned long level_pfn; 1040 struct dma_pte *level_pte; 1041 1042 if (!dma_pte_present(pte) || dma_pte_superpage(pte)) 1043 goto next; 1044 1045 level_pfn = pfn & level_mask(level); 1046 level_pte = phys_to_virt(dma_pte_addr(pte)); 1047 1048 if (level > 2) { 1049 dma_pte_free_level(domain, level - 1, retain_level, 1050 level_pte, level_pfn, start_pfn, 1051 last_pfn); 1052 } 1053 1054 /* 1055 * Free the page table if we're below the level we want to 1056 * retain and the range covers the entire table. 1057 */ 1058 if (level < retain_level && !(start_pfn > level_pfn || 1059 last_pfn < level_pfn + level_size(level) - 1)) { 1060 dma_clear_pte(pte); 1061 domain_flush_cache(domain, pte, sizeof(*pte)); 1062 free_pgtable_page(level_pte); 1063 } 1064 next: 1065 pfn += level_size(level); 1066 } while (!first_pte_in_page(++pte) && pfn <= last_pfn); 1067 } 1068 1069 /* 1070 * clear last level (leaf) ptes and free page table pages below the 1071 * level we wish to keep intact. 
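 * As with dma_pte_clear_range(), callers are expected to flush the
 * IOTLB afterwards.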
1072 */ 1073 static void dma_pte_free_pagetable(struct dmar_domain *domain, 1074 unsigned long start_pfn, 1075 unsigned long last_pfn, 1076 int retain_level) 1077 { 1078 dma_pte_clear_range(domain, start_pfn, last_pfn); 1079 1080 /* We don't need lock here; nobody else touches the iova range */ 1081 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level, 1082 domain->pgd, 0, start_pfn, last_pfn); 1083 1084 /* free pgd */ 1085 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) { 1086 free_pgtable_page(domain->pgd); 1087 domain->pgd = NULL; 1088 } 1089 } 1090 1091 /* When a page at a given level is being unlinked from its parent, we don't 1092 need to *modify* it at all. All we need to do is make a list of all the 1093 pages which can be freed just as soon as we've flushed the IOTLB and we 1094 know the hardware page-walk will no longer touch them. 1095 The 'pte' argument is the *parent* PTE, pointing to the page that is to 1096 be freed. */ 1097 static void dma_pte_list_pagetables(struct dmar_domain *domain, 1098 int level, struct dma_pte *pte, 1099 struct list_head *freelist) 1100 { 1101 struct page *pg; 1102 1103 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT); 1104 list_add_tail(&pg->lru, freelist); 1105 1106 if (level == 1) 1107 return; 1108 1109 pte = page_address(pg); 1110 do { 1111 if (dma_pte_present(pte) && !dma_pte_superpage(pte)) 1112 dma_pte_list_pagetables(domain, level - 1, pte, freelist); 1113 pte++; 1114 } while (!first_pte_in_page(pte)); 1115 } 1116 1117 static void dma_pte_clear_level(struct dmar_domain *domain, int level, 1118 struct dma_pte *pte, unsigned long pfn, 1119 unsigned long start_pfn, unsigned long last_pfn, 1120 struct list_head *freelist) 1121 { 1122 struct dma_pte *first_pte = NULL, *last_pte = NULL; 1123 1124 pfn = max(start_pfn, pfn); 1125 pte = &pte[pfn_level_offset(pfn, level)]; 1126 1127 do { 1128 unsigned long level_pfn = pfn & level_mask(level); 1129 1130 if (!dma_pte_present(pte)) 1131 goto next; 1132 1133 /* If range covers entire pagetable, free it */ 1134 if (start_pfn <= level_pfn && 1135 last_pfn >= level_pfn + level_size(level) - 1) { 1136 /* These suborbinate page tables are going away entirely. Don't 1137 bother to clear them; we're just going to *free* them. */ 1138 if (level > 1 && !dma_pte_superpage(pte)) 1139 dma_pte_list_pagetables(domain, level - 1, pte, freelist); 1140 1141 dma_clear_pte(pte); 1142 if (!first_pte) 1143 first_pte = pte; 1144 last_pte = pte; 1145 } else if (level > 1) { 1146 /* Recurse down into a level that isn't *entirely* obsolete */ 1147 dma_pte_clear_level(domain, level - 1, 1148 phys_to_virt(dma_pte_addr(pte)), 1149 level_pfn, start_pfn, last_pfn, 1150 freelist); 1151 } 1152 next: 1153 pfn = level_pfn + level_size(level); 1154 } while (!first_pte_in_page(++pte) && pfn <= last_pfn); 1155 1156 if (first_pte) 1157 domain_flush_cache(domain, first_pte, 1158 (void *)++last_pte - (void *)first_pte); 1159 } 1160 1161 /* We can't just free the pages because the IOMMU may still be walking 1162 the page tables, and may have cached the intermediate levels. The 1163 pages can only be freed after the IOTLB flush has been done. 
*/ 1164 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn, 1165 unsigned long last_pfn, struct list_head *freelist) 1166 { 1167 if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) || 1168 WARN_ON(start_pfn > last_pfn)) 1169 return; 1170 1171 /* we don't need lock here; nobody else touches the iova range */ 1172 dma_pte_clear_level(domain, agaw_to_level(domain->agaw), 1173 domain->pgd, 0, start_pfn, last_pfn, freelist); 1174 1175 /* free pgd */ 1176 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) { 1177 struct page *pgd_page = virt_to_page(domain->pgd); 1178 list_add_tail(&pgd_page->lru, freelist); 1179 domain->pgd = NULL; 1180 } 1181 } 1182 1183 /* iommu handling */ 1184 static int iommu_alloc_root_entry(struct intel_iommu *iommu) 1185 { 1186 struct root_entry *root; 1187 1188 root = alloc_pgtable_page(iommu->node, GFP_ATOMIC); 1189 if (!root) { 1190 pr_err("Allocating root entry for %s failed\n", 1191 iommu->name); 1192 return -ENOMEM; 1193 } 1194 1195 __iommu_flush_cache(iommu, root, ROOT_SIZE); 1196 iommu->root_entry = root; 1197 1198 return 0; 1199 } 1200 1201 static void iommu_set_root_entry(struct intel_iommu *iommu) 1202 { 1203 u64 addr; 1204 u32 sts; 1205 unsigned long flag; 1206 1207 addr = virt_to_phys(iommu->root_entry); 1208 if (sm_supported(iommu)) 1209 addr |= DMA_RTADDR_SMT; 1210 1211 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1212 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr); 1213 1214 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG); 1215 1216 /* Make sure hardware complete it */ 1217 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1218 readl, (sts & DMA_GSTS_RTPS), sts); 1219 1220 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1221 1222 /* 1223 * Hardware invalidates all DMA remapping hardware translation 1224 * caches as part of SRTP flow. 
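	 * The cap_esrtps() check below skips the manual global
	 * invalidations when the hardware reports this behaviour (ESRTPS).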
1225 */ 1226 if (cap_esrtps(iommu->cap)) 1227 return; 1228 1229 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL); 1230 if (sm_supported(iommu)) 1231 qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0); 1232 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); 1233 } 1234 1235 void iommu_flush_write_buffer(struct intel_iommu *iommu) 1236 { 1237 u32 val; 1238 unsigned long flag; 1239 1240 if (!rwbf_quirk && !cap_rwbf(iommu->cap)) 1241 return; 1242 1243 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1244 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG); 1245 1246 /* Make sure hardware complete it */ 1247 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1248 readl, (!(val & DMA_GSTS_WBFS)), val); 1249 1250 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1251 } 1252 1253 /* return value determine if we need a write buffer flush */ 1254 static void __iommu_flush_context(struct intel_iommu *iommu, 1255 u16 did, u16 source_id, u8 function_mask, 1256 u64 type) 1257 { 1258 u64 val = 0; 1259 unsigned long flag; 1260 1261 switch (type) { 1262 case DMA_CCMD_GLOBAL_INVL: 1263 val = DMA_CCMD_GLOBAL_INVL; 1264 break; 1265 case DMA_CCMD_DOMAIN_INVL: 1266 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did); 1267 break; 1268 case DMA_CCMD_DEVICE_INVL: 1269 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did) 1270 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask); 1271 break; 1272 default: 1273 pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n", 1274 iommu->name, type); 1275 return; 1276 } 1277 val |= DMA_CCMD_ICC; 1278 1279 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1280 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val); 1281 1282 /* Make sure hardware complete it */ 1283 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG, 1284 dmar_readq, (!(val & DMA_CCMD_ICC)), val); 1285 1286 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1287 } 1288 1289 /* return value determine if we need a write buffer flush */ 1290 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, 1291 u64 addr, unsigned int size_order, u64 type) 1292 { 1293 int tlb_offset = ecap_iotlb_offset(iommu->ecap); 1294 u64 val = 0, val_iva = 0; 1295 unsigned long flag; 1296 1297 switch (type) { 1298 case DMA_TLB_GLOBAL_FLUSH: 1299 /* global flush doesn't need set IVA_REG */ 1300 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT; 1301 break; 1302 case DMA_TLB_DSI_FLUSH: 1303 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did); 1304 break; 1305 case DMA_TLB_PSI_FLUSH: 1306 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did); 1307 /* IH bit is passed in as part of address */ 1308 val_iva = size_order | addr; 1309 break; 1310 default: 1311 pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n", 1312 iommu->name, type); 1313 return; 1314 } 1315 1316 if (cap_write_drain(iommu->cap)) 1317 val |= DMA_TLB_WRITE_DRAIN; 1318 1319 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1320 /* Note: Only uses first TLB reg currently */ 1321 if (val_iva) 1322 dmar_writeq(iommu->reg + tlb_offset, val_iva); 1323 dmar_writeq(iommu->reg + tlb_offset + 8, val); 1324 1325 /* Make sure hardware complete it */ 1326 IOMMU_WAIT_OP(iommu, tlb_offset + 8, 1327 dmar_readq, (!(val & DMA_TLB_IVT)), val); 1328 1329 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1330 1331 /* check IOTLB invalidation granularity */ 1332 if (DMA_TLB_IAIG(val) == 0) 1333 pr_err("Flush IOTLB failed\n"); 1334 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type)) 1335 pr_debug("TLB flush request %Lx, actual %Lx\n", 1336 (unsigned long long)DMA_TLB_IIRG(type), 
1337 (unsigned long long)DMA_TLB_IAIG(val)); 1338 } 1339 1340 static struct device_domain_info * 1341 domain_lookup_dev_info(struct dmar_domain *domain, 1342 struct intel_iommu *iommu, u8 bus, u8 devfn) 1343 { 1344 struct device_domain_info *info; 1345 unsigned long flags; 1346 1347 spin_lock_irqsave(&domain->lock, flags); 1348 list_for_each_entry(info, &domain->devices, link) { 1349 if (info->iommu == iommu && info->bus == bus && 1350 info->devfn == devfn) { 1351 spin_unlock_irqrestore(&domain->lock, flags); 1352 return info; 1353 } 1354 } 1355 spin_unlock_irqrestore(&domain->lock, flags); 1356 1357 return NULL; 1358 } 1359 1360 static void domain_update_iotlb(struct dmar_domain *domain) 1361 { 1362 struct device_domain_info *info; 1363 bool has_iotlb_device = false; 1364 unsigned long flags; 1365 1366 spin_lock_irqsave(&domain->lock, flags); 1367 list_for_each_entry(info, &domain->devices, link) { 1368 if (info->ats_enabled) { 1369 has_iotlb_device = true; 1370 break; 1371 } 1372 } 1373 domain->has_iotlb_device = has_iotlb_device; 1374 spin_unlock_irqrestore(&domain->lock, flags); 1375 } 1376 1377 /* 1378 * The extra devTLB flush quirk impacts those QAT devices with PCI device 1379 * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device() 1380 * check because it applies only to the built-in QAT devices and it doesn't 1381 * grant additional privileges. 1382 */ 1383 #define BUGGY_QAT_DEVID_MASK 0x4940 1384 static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev) 1385 { 1386 if (pdev->vendor != PCI_VENDOR_ID_INTEL) 1387 return false; 1388 1389 if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK) 1390 return false; 1391 1392 return true; 1393 } 1394 1395 static void iommu_enable_pci_caps(struct device_domain_info *info) 1396 { 1397 struct pci_dev *pdev; 1398 1399 if (!dev_is_pci(info->dev)) 1400 return; 1401 1402 pdev = to_pci_dev(info->dev); 1403 1404 /* The PCIe spec, in its wisdom, declares that the behaviour of 1405 the device if you enable PASID support after ATS support is 1406 undefined. So always enable PASID support on devices which 1407 have it, even if we can't yet know if we're ever going to 1408 use it. 
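	   Hence PASID is enabled first below, before ATS.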
*/ 1409 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1)) 1410 info->pasid_enabled = 1; 1411 1412 if (info->ats_supported && pci_ats_page_aligned(pdev) && 1413 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) { 1414 info->ats_enabled = 1; 1415 domain_update_iotlb(info->domain); 1416 } 1417 } 1418 1419 static void iommu_disable_pci_caps(struct device_domain_info *info) 1420 { 1421 struct pci_dev *pdev; 1422 1423 if (!dev_is_pci(info->dev)) 1424 return; 1425 1426 pdev = to_pci_dev(info->dev); 1427 1428 if (info->ats_enabled) { 1429 pci_disable_ats(pdev); 1430 info->ats_enabled = 0; 1431 domain_update_iotlb(info->domain); 1432 } 1433 1434 if (info->pasid_enabled) { 1435 pci_disable_pasid(pdev); 1436 info->pasid_enabled = 0; 1437 } 1438 } 1439 1440 static void __iommu_flush_dev_iotlb(struct device_domain_info *info, 1441 u64 addr, unsigned int mask) 1442 { 1443 u16 sid, qdep; 1444 1445 if (!info || !info->ats_enabled) 1446 return; 1447 1448 sid = info->bus << 8 | info->devfn; 1449 qdep = info->ats_qdep; 1450 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid, 1451 qdep, addr, mask); 1452 quirk_extra_dev_tlb_flush(info, addr, mask, PASID_RID2PASID, qdep); 1453 } 1454 1455 static void iommu_flush_dev_iotlb(struct dmar_domain *domain, 1456 u64 addr, unsigned mask) 1457 { 1458 struct device_domain_info *info; 1459 unsigned long flags; 1460 1461 if (!domain->has_iotlb_device) 1462 return; 1463 1464 spin_lock_irqsave(&domain->lock, flags); 1465 list_for_each_entry(info, &domain->devices, link) 1466 __iommu_flush_dev_iotlb(info, addr, mask); 1467 spin_unlock_irqrestore(&domain->lock, flags); 1468 } 1469 1470 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, 1471 struct dmar_domain *domain, 1472 unsigned long pfn, unsigned int pages, 1473 int ih, int map) 1474 { 1475 unsigned int aligned_pages = __roundup_pow_of_two(pages); 1476 unsigned int mask = ilog2(aligned_pages); 1477 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT; 1478 u16 did = domain_id_iommu(domain, iommu); 1479 1480 if (WARN_ON(!pages)) 1481 return; 1482 1483 if (ih) 1484 ih = 1 << 6; 1485 1486 if (domain->use_first_level) { 1487 qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, pages, ih); 1488 } else { 1489 unsigned long bitmask = aligned_pages - 1; 1490 1491 /* 1492 * PSI masks the low order bits of the base address. If the 1493 * address isn't aligned to the mask, then compute a mask value 1494 * needed to ensure the target range is flushed. 1495 */ 1496 if (unlikely(bitmask & pfn)) { 1497 unsigned long end_pfn = pfn + pages - 1, shared_bits; 1498 1499 /* 1500 * Since end_pfn <= pfn + bitmask, the only way bits 1501 * higher than bitmask can differ in pfn and end_pfn is 1502 * by carrying. This means after masking out bitmask, 1503 * high bits starting with the first set bit in 1504 * shared_bits are all equal in both pfn and end_pfn. 1505 */ 1506 shared_bits = ~(pfn ^ end_pfn) & ~bitmask; 1507 mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG; 1508 } 1509 1510 /* 1511 * Fallback to domain selective flush if no PSI support or 1512 * the size is too big. 1513 */ 1514 if (!cap_pgsel_inv(iommu->cap) || 1515 mask > cap_max_amask_val(iommu->cap)) 1516 iommu->flush.flush_iotlb(iommu, did, 0, 0, 1517 DMA_TLB_DSI_FLUSH); 1518 else 1519 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask, 1520 DMA_TLB_PSI_FLUSH); 1521 } 1522 1523 /* 1524 * In caching mode, changes of pages from non-present to present require 1525 * flush. However, device IOTLB doesn't need to be flushed in this case. 
1526 */ 1527 if (!cap_caching_mode(iommu->cap) || !map) 1528 iommu_flush_dev_iotlb(domain, addr, mask); 1529 } 1530 1531 /* Notification for newly created mappings */ 1532 static inline void __mapping_notify_one(struct intel_iommu *iommu, 1533 struct dmar_domain *domain, 1534 unsigned long pfn, unsigned int pages) 1535 { 1536 /* 1537 * It's a non-present to present mapping. Only flush if caching mode 1538 * and second level. 1539 */ 1540 if (cap_caching_mode(iommu->cap) && !domain->use_first_level) 1541 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1); 1542 else 1543 iommu_flush_write_buffer(iommu); 1544 } 1545 1546 static void intel_flush_iotlb_all(struct iommu_domain *domain) 1547 { 1548 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 1549 struct iommu_domain_info *info; 1550 unsigned long idx; 1551 1552 xa_for_each(&dmar_domain->iommu_array, idx, info) { 1553 struct intel_iommu *iommu = info->iommu; 1554 u16 did = domain_id_iommu(dmar_domain, iommu); 1555 1556 if (dmar_domain->use_first_level) 1557 qi_flush_piotlb(iommu, did, PASID_RID2PASID, 0, -1, 0); 1558 else 1559 iommu->flush.flush_iotlb(iommu, did, 0, 0, 1560 DMA_TLB_DSI_FLUSH); 1561 1562 if (!cap_caching_mode(iommu->cap)) 1563 iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH); 1564 } 1565 } 1566 1567 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu) 1568 { 1569 u32 pmen; 1570 unsigned long flags; 1571 1572 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap)) 1573 return; 1574 1575 raw_spin_lock_irqsave(&iommu->register_lock, flags); 1576 pmen = readl(iommu->reg + DMAR_PMEN_REG); 1577 pmen &= ~DMA_PMEN_EPM; 1578 writel(pmen, iommu->reg + DMAR_PMEN_REG); 1579 1580 /* wait for the protected region status bit to clear */ 1581 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG, 1582 readl, !(pmen & DMA_PMEN_PRS), pmen); 1583 1584 raw_spin_unlock_irqrestore(&iommu->register_lock, flags); 1585 } 1586 1587 static void iommu_enable_translation(struct intel_iommu *iommu) 1588 { 1589 u32 sts; 1590 unsigned long flags; 1591 1592 raw_spin_lock_irqsave(&iommu->register_lock, flags); 1593 iommu->gcmd |= DMA_GCMD_TE; 1594 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); 1595 1596 /* Make sure hardware complete it */ 1597 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1598 readl, (sts & DMA_GSTS_TES), sts); 1599 1600 raw_spin_unlock_irqrestore(&iommu->register_lock, flags); 1601 } 1602 1603 static void iommu_disable_translation(struct intel_iommu *iommu) 1604 { 1605 u32 sts; 1606 unsigned long flag; 1607 1608 if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated && 1609 (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap))) 1610 return; 1611 1612 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1613 iommu->gcmd &= ~DMA_GCMD_TE; 1614 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); 1615 1616 /* Make sure hardware complete it */ 1617 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1618 readl, (!(sts & DMA_GSTS_TES)), sts); 1619 1620 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1621 } 1622 1623 static int iommu_init_domains(struct intel_iommu *iommu) 1624 { 1625 u32 ndomains; 1626 1627 ndomains = cap_ndoms(iommu->cap); 1628 pr_debug("%s: Number of Domains supported <%d>\n", 1629 iommu->name, ndomains); 1630 1631 spin_lock_init(&iommu->lock); 1632 1633 iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL); 1634 if (!iommu->domain_ids) 1635 return -ENOMEM; 1636 1637 /* 1638 * If Caching mode is set, then invalid translations are tagged 1639 * with domain-id 0, hence we need to pre-allocate it. 
We also 1640 * use domain-id 0 as a marker for non-allocated domain-id, so 1641 * make sure it is not used for a real domain. 1642 */ 1643 set_bit(0, iommu->domain_ids); 1644 1645 /* 1646 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid 1647 * entry for first-level or pass-through translation modes should 1648 * be programmed with a domain id different from those used for 1649 * second-level or nested translation. We reserve a domain id for 1650 * this purpose. 1651 */ 1652 if (sm_supported(iommu)) 1653 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids); 1654 1655 return 0; 1656 } 1657 1658 static void disable_dmar_iommu(struct intel_iommu *iommu) 1659 { 1660 if (!iommu->domain_ids) 1661 return; 1662 1663 /* 1664 * All iommu domains must have been detached from the devices, 1665 * hence there should be no domain IDs in use. 1666 */ 1667 if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap)) 1668 > NUM_RESERVED_DID)) 1669 return; 1670 1671 if (iommu->gcmd & DMA_GCMD_TE) 1672 iommu_disable_translation(iommu); 1673 } 1674 1675 static void free_dmar_iommu(struct intel_iommu *iommu) 1676 { 1677 if (iommu->domain_ids) { 1678 bitmap_free(iommu->domain_ids); 1679 iommu->domain_ids = NULL; 1680 } 1681 1682 if (iommu->copied_tables) { 1683 bitmap_free(iommu->copied_tables); 1684 iommu->copied_tables = NULL; 1685 } 1686 1687 /* free context mapping */ 1688 free_context_table(iommu); 1689 1690 #ifdef CONFIG_INTEL_IOMMU_SVM 1691 if (pasid_supported(iommu)) { 1692 if (ecap_prs(iommu->ecap)) 1693 intel_svm_finish_prq(iommu); 1694 } 1695 #endif 1696 } 1697 1698 /* 1699 * Check and return whether first level is used by default for 1700 * DMA translation. 1701 */ 1702 static bool first_level_by_default(unsigned int type) 1703 { 1704 /* Only SL is available in legacy mode */ 1705 if (!scalable_mode_support()) 1706 return false; 1707 1708 /* Only level (either FL or SL) is available, just use it */ 1709 if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity()) 1710 return intel_cap_flts_sanity(); 1711 1712 /* Both levels are available, decide it based on domain type */ 1713 return type != IOMMU_DOMAIN_UNMANAGED; 1714 } 1715 1716 static struct dmar_domain *alloc_domain(unsigned int type) 1717 { 1718 struct dmar_domain *domain; 1719 1720 domain = kzalloc(sizeof(*domain), GFP_KERNEL); 1721 if (!domain) 1722 return NULL; 1723 1724 domain->nid = NUMA_NO_NODE; 1725 if (first_level_by_default(type)) 1726 domain->use_first_level = true; 1727 domain->has_iotlb_device = false; 1728 INIT_LIST_HEAD(&domain->devices); 1729 spin_lock_init(&domain->lock); 1730 xa_init(&domain->iommu_array); 1731 1732 return domain; 1733 } 1734 1735 static int domain_attach_iommu(struct dmar_domain *domain, 1736 struct intel_iommu *iommu) 1737 { 1738 struct iommu_domain_info *info, *curr; 1739 unsigned long ndomains; 1740 int num, ret = -ENOSPC; 1741 1742 info = kzalloc(sizeof(*info), GFP_KERNEL); 1743 if (!info) 1744 return -ENOMEM; 1745 1746 spin_lock(&iommu->lock); 1747 curr = xa_load(&domain->iommu_array, iommu->seq_id); 1748 if (curr) { 1749 curr->refcnt++; 1750 spin_unlock(&iommu->lock); 1751 kfree(info); 1752 return 0; 1753 } 1754 1755 ndomains = cap_ndoms(iommu->cap); 1756 num = find_first_zero_bit(iommu->domain_ids, ndomains); 1757 if (num >= ndomains) { 1758 pr_err("%s: No free domain ids\n", iommu->name); 1759 goto err_unlock; 1760 } 1761 1762 set_bit(num, iommu->domain_ids); 1763 info->refcnt = 1; 1764 info->did = num; 1765 info->iommu = iommu; 1766 curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id, 1767 
NULL, info, GFP_ATOMIC); 1768 if (curr) { 1769 ret = xa_err(curr) ? : -EBUSY; 1770 goto err_clear; 1771 } 1772 domain_update_iommu_cap(domain); 1773 1774 spin_unlock(&iommu->lock); 1775 return 0; 1776 1777 err_clear: 1778 clear_bit(info->did, iommu->domain_ids); 1779 err_unlock: 1780 spin_unlock(&iommu->lock); 1781 kfree(info); 1782 return ret; 1783 } 1784 1785 static void domain_detach_iommu(struct dmar_domain *domain, 1786 struct intel_iommu *iommu) 1787 { 1788 struct iommu_domain_info *info; 1789 1790 spin_lock(&iommu->lock); 1791 info = xa_load(&domain->iommu_array, iommu->seq_id); 1792 if (--info->refcnt == 0) { 1793 clear_bit(info->did, iommu->domain_ids); 1794 xa_erase(&domain->iommu_array, iommu->seq_id); 1795 domain->nid = NUMA_NO_NODE; 1796 domain_update_iommu_cap(domain); 1797 kfree(info); 1798 } 1799 spin_unlock(&iommu->lock); 1800 } 1801 1802 static inline int guestwidth_to_adjustwidth(int gaw) 1803 { 1804 int agaw; 1805 int r = (gaw - 12) % 9; 1806 1807 if (r == 0) 1808 agaw = gaw; 1809 else 1810 agaw = gaw + 9 - r; 1811 if (agaw > 64) 1812 agaw = 64; 1813 return agaw; 1814 } 1815 1816 static void domain_exit(struct dmar_domain *domain) 1817 { 1818 if (domain->pgd) { 1819 LIST_HEAD(freelist); 1820 1821 domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist); 1822 put_pages_list(&freelist); 1823 } 1824 1825 if (WARN_ON(!list_empty(&domain->devices))) 1826 return; 1827 1828 kfree(domain); 1829 } 1830 1831 /* 1832 * Get the PASID directory size for scalable mode context entry. 1833 * Value of X in the PDTS field of a scalable mode context entry 1834 * indicates PASID directory with 2^(X + 7) entries. 1835 */ 1836 static inline unsigned long context_get_sm_pds(struct pasid_table *table) 1837 { 1838 unsigned long pds, max_pde; 1839 1840 max_pde = table->max_pasid >> PASID_PDE_SHIFT; 1841 pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS); 1842 if (pds < 7) 1843 return 0; 1844 1845 return pds - 7; 1846 } 1847 1848 /* 1849 * Set the RID_PASID field of a scalable mode context entry. The 1850 * IOMMU hardware will use the PASID value set in this field for 1851 * DMA translations of DMA requests without PASID. 1852 */ 1853 static inline void 1854 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid) 1855 { 1856 context->hi |= pasid & ((1 << 20) - 1); 1857 } 1858 1859 /* 1860 * Set the DTE(Device-TLB Enable) field of a scalable mode context 1861 * entry. 1862 */ 1863 static inline void context_set_sm_dte(struct context_entry *context) 1864 { 1865 context->lo |= BIT_ULL(2); 1866 } 1867 1868 /* 1869 * Set the PRE(Page Request Enable) field of a scalable mode context 1870 * entry. 1871 */ 1872 static inline void context_set_sm_pre(struct context_entry *context) 1873 { 1874 context->lo |= BIT_ULL(4); 1875 } 1876 1877 /* Convert value to context PASID directory size field coding. 
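   A value X placed in the 3-bit PDTS field (bits 11:9 of the low context
   qword, see the macro below) selects a PASID directory with 2^(X + 7)
   entries, matching context_get_sm_pds() above.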
*/ 1878 #define context_pdts(pds) (((pds) & 0x7) << 9) 1879 1880 static int domain_context_mapping_one(struct dmar_domain *domain, 1881 struct intel_iommu *iommu, 1882 struct pasid_table *table, 1883 u8 bus, u8 devfn) 1884 { 1885 struct device_domain_info *info = 1886 domain_lookup_dev_info(domain, iommu, bus, devfn); 1887 u16 did = domain_id_iommu(domain, iommu); 1888 int translation = CONTEXT_TT_MULTI_LEVEL; 1889 struct context_entry *context; 1890 int ret; 1891 1892 if (hw_pass_through && domain_type_is_si(domain)) 1893 translation = CONTEXT_TT_PASS_THROUGH; 1894 1895 pr_debug("Set context mapping for %02x:%02x.%d\n", 1896 bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); 1897 1898 spin_lock(&iommu->lock); 1899 ret = -ENOMEM; 1900 context = iommu_context_addr(iommu, bus, devfn, 1); 1901 if (!context) 1902 goto out_unlock; 1903 1904 ret = 0; 1905 if (context_present(context) && !context_copied(iommu, bus, devfn)) 1906 goto out_unlock; 1907 1908 /* 1909 * For kdump cases, old valid entries may be cached due to the 1910 * in-flight DMA and copied pgtable, but there is no unmapping 1911 * behaviour for them, thus we need an explicit cache flush for 1912 * the newly-mapped device. For kdump, at this point, the device 1913 * is supposed to finish reset at its driver probe stage, so no 1914 * in-flight DMA will exist, and we don't need to worry anymore 1915 * hereafter. 1916 */ 1917 if (context_copied(iommu, bus, devfn)) { 1918 u16 did_old = context_domain_id(context); 1919 1920 if (did_old < cap_ndoms(iommu->cap)) { 1921 iommu->flush.flush_context(iommu, did_old, 1922 (((u16)bus) << 8) | devfn, 1923 DMA_CCMD_MASK_NOBIT, 1924 DMA_CCMD_DEVICE_INVL); 1925 iommu->flush.flush_iotlb(iommu, did_old, 0, 0, 1926 DMA_TLB_DSI_FLUSH); 1927 } 1928 1929 clear_context_copied(iommu, bus, devfn); 1930 } 1931 1932 context_clear_entry(context); 1933 1934 if (sm_supported(iommu)) { 1935 unsigned long pds; 1936 1937 /* Setup the PASID DIR pointer: */ 1938 pds = context_get_sm_pds(table); 1939 context->lo = (u64)virt_to_phys(table->table) | 1940 context_pdts(pds); 1941 1942 /* Setup the RID_PASID field: */ 1943 context_set_sm_rid2pasid(context, PASID_RID2PASID); 1944 1945 /* 1946 * Setup the Device-TLB enable bit and Page request 1947 * Enable bit: 1948 */ 1949 if (info && info->ats_supported) 1950 context_set_sm_dte(context); 1951 if (info && info->pri_supported) 1952 context_set_sm_pre(context); 1953 if (info && info->pasid_supported) 1954 context_set_pasid(context); 1955 } else { 1956 struct dma_pte *pgd = domain->pgd; 1957 int agaw; 1958 1959 context_set_domain_id(context, did); 1960 1961 if (translation != CONTEXT_TT_PASS_THROUGH) { 1962 /* 1963 * Skip top levels of page tables for iommu which has 1964 * less agaw than default. Unnecessary for PT mode. 1965 */ 1966 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { 1967 ret = -ENOMEM; 1968 pgd = phys_to_virt(dma_pte_addr(pgd)); 1969 if (!dma_pte_present(pgd)) 1970 goto out_unlock; 1971 } 1972 1973 if (info && info->ats_supported) 1974 translation = CONTEXT_TT_DEV_IOTLB; 1975 else 1976 translation = CONTEXT_TT_MULTI_LEVEL; 1977 1978 context_set_address_root(context, virt_to_phys(pgd)); 1979 context_set_address_width(context, agaw); 1980 } else { 1981 /* 1982 * In pass through mode, AW must be programmed to 1983 * indicate the largest AGAW value supported by 1984 * hardware. And ASR is ignored by hardware. 
1985 */ 1986 context_set_address_width(context, iommu->msagaw); 1987 } 1988 1989 context_set_translation_type(context, translation); 1990 } 1991 1992 context_set_fault_enable(context); 1993 context_set_present(context); 1994 if (!ecap_coherent(iommu->ecap)) 1995 clflush_cache_range(context, sizeof(*context)); 1996 1997 /* 1998 * It's a non-present to present mapping. If hardware doesn't cache 1999 * non-present entry we only need to flush the write-buffer. If the 2000 * _does_ cache non-present entries, then it does so in the special 2001 * domain #0, which we have to flush: 2002 */ 2003 if (cap_caching_mode(iommu->cap)) { 2004 iommu->flush.flush_context(iommu, 0, 2005 (((u16)bus) << 8) | devfn, 2006 DMA_CCMD_MASK_NOBIT, 2007 DMA_CCMD_DEVICE_INVL); 2008 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); 2009 } else { 2010 iommu_flush_write_buffer(iommu); 2011 } 2012 2013 ret = 0; 2014 2015 out_unlock: 2016 spin_unlock(&iommu->lock); 2017 2018 return ret; 2019 } 2020 2021 struct domain_context_mapping_data { 2022 struct dmar_domain *domain; 2023 struct intel_iommu *iommu; 2024 struct pasid_table *table; 2025 }; 2026 2027 static int domain_context_mapping_cb(struct pci_dev *pdev, 2028 u16 alias, void *opaque) 2029 { 2030 struct domain_context_mapping_data *data = opaque; 2031 2032 return domain_context_mapping_one(data->domain, data->iommu, 2033 data->table, PCI_BUS_NUM(alias), 2034 alias & 0xff); 2035 } 2036 2037 static int 2038 domain_context_mapping(struct dmar_domain *domain, struct device *dev) 2039 { 2040 struct domain_context_mapping_data data; 2041 struct pasid_table *table; 2042 struct intel_iommu *iommu; 2043 u8 bus, devfn; 2044 2045 iommu = device_to_iommu(dev, &bus, &devfn); 2046 if (!iommu) 2047 return -ENODEV; 2048 2049 table = intel_pasid_get_table(dev); 2050 2051 if (!dev_is_pci(dev)) 2052 return domain_context_mapping_one(domain, iommu, table, 2053 bus, devfn); 2054 2055 data.domain = domain; 2056 data.iommu = iommu; 2057 data.table = table; 2058 2059 return pci_for_each_dma_alias(to_pci_dev(dev), 2060 &domain_context_mapping_cb, &data); 2061 } 2062 2063 /* Returns a number of VTD pages, but aligned to MM page size */ 2064 static inline unsigned long aligned_nrpages(unsigned long host_addr, 2065 size_t size) 2066 { 2067 host_addr &= ~PAGE_MASK; 2068 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT; 2069 } 2070 2071 /* Return largest possible superpage level for a given mapping */ 2072 static inline int hardware_largepage_caps(struct dmar_domain *domain, 2073 unsigned long iov_pfn, 2074 unsigned long phy_pfn, 2075 unsigned long pages) 2076 { 2077 int support, level = 1; 2078 unsigned long pfnmerge; 2079 2080 support = domain->iommu_superpage; 2081 2082 /* To use a large page, the virtual *and* physical addresses 2083 must be aligned to 2MiB/1GiB/etc. Lower bits set in either 2084 of them will mean we have to use smaller pages. So just 2085 merge them and check both at once. */ 2086 pfnmerge = iov_pfn | phy_pfn; 2087 2088 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) { 2089 pages >>= VTD_STRIDE_SHIFT; 2090 if (!pages) 2091 break; 2092 pfnmerge >>= VTD_STRIDE_SHIFT; 2093 level++; 2094 support--; 2095 } 2096 return level; 2097 } 2098 2099 /* 2100 * Ensure that old small page tables are removed to make room for superpage(s). 2101 * We're going to add new large pages, so make sure we don't remove their parent 2102 * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared. 
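 * For example, when a 2MiB (level 2) superpage is about to be written,
 * the level-1 table that previously mapped that range is freed and the
 * IOTLB is flushed for those PFNs on every IOMMU the domain is
 * attached to.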
2103 */ 2104 static void switch_to_super_page(struct dmar_domain *domain, 2105 unsigned long start_pfn, 2106 unsigned long end_pfn, int level) 2107 { 2108 unsigned long lvl_pages = lvl_to_nr_pages(level); 2109 struct iommu_domain_info *info; 2110 struct dma_pte *pte = NULL; 2111 unsigned long i; 2112 2113 while (start_pfn <= end_pfn) { 2114 if (!pte) 2115 pte = pfn_to_dma_pte(domain, start_pfn, &level, 2116 GFP_ATOMIC); 2117 2118 if (dma_pte_present(pte)) { 2119 dma_pte_free_pagetable(domain, start_pfn, 2120 start_pfn + lvl_pages - 1, 2121 level + 1); 2122 2123 xa_for_each(&domain->iommu_array, i, info) 2124 iommu_flush_iotlb_psi(info->iommu, domain, 2125 start_pfn, lvl_pages, 2126 0, 0); 2127 } 2128 2129 pte++; 2130 start_pfn += lvl_pages; 2131 if (first_pte_in_page(pte)) 2132 pte = NULL; 2133 } 2134 } 2135 2136 static int 2137 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, 2138 unsigned long phys_pfn, unsigned long nr_pages, int prot, 2139 gfp_t gfp) 2140 { 2141 struct dma_pte *first_pte = NULL, *pte = NULL; 2142 unsigned int largepage_lvl = 0; 2143 unsigned long lvl_pages = 0; 2144 phys_addr_t pteval; 2145 u64 attr; 2146 2147 if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1))) 2148 return -EINVAL; 2149 2150 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0) 2151 return -EINVAL; 2152 2153 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP); 2154 attr |= DMA_FL_PTE_PRESENT; 2155 if (domain->use_first_level) { 2156 attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS; 2157 if (prot & DMA_PTE_WRITE) 2158 attr |= DMA_FL_PTE_DIRTY; 2159 } 2160 2161 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr; 2162 2163 while (nr_pages > 0) { 2164 uint64_t tmp; 2165 2166 if (!pte) { 2167 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, 2168 phys_pfn, nr_pages); 2169 2170 pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl, 2171 gfp); 2172 if (!pte) 2173 return -ENOMEM; 2174 first_pte = pte; 2175 2176 lvl_pages = lvl_to_nr_pages(largepage_lvl); 2177 2178 /* It is large page*/ 2179 if (largepage_lvl > 1) { 2180 unsigned long end_pfn; 2181 unsigned long pages_to_remove; 2182 2183 pteval |= DMA_PTE_LARGE_PAGE; 2184 pages_to_remove = min_t(unsigned long, nr_pages, 2185 nr_pte_to_next_page(pte) * lvl_pages); 2186 end_pfn = iov_pfn + pages_to_remove - 1; 2187 switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl); 2188 } else { 2189 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE; 2190 } 2191 2192 } 2193 /* We don't need lock here, nobody else 2194 * touches the iova range 2195 */ 2196 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval); 2197 if (tmp) { 2198 static int dumps = 5; 2199 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n", 2200 iov_pfn, tmp, (unsigned long long)pteval); 2201 if (dumps) { 2202 dumps--; 2203 debug_dma_dump_mappings(NULL); 2204 } 2205 WARN_ON(1); 2206 } 2207 2208 nr_pages -= lvl_pages; 2209 iov_pfn += lvl_pages; 2210 phys_pfn += lvl_pages; 2211 pteval += lvl_pages * VTD_PAGE_SIZE; 2212 2213 /* If the next PTE would be the first in a new page, then we 2214 * need to flush the cache on the entries we've just written. 2215 * And then we'll need to recalculate 'pte', so clear it and 2216 * let it get set again in the if (!pte) block above. 2217 * 2218 * If we're done (!nr_pages) we need to flush the cache too. 
2219 * 2220 * Also if we've been setting superpages, we may need to 2221 * recalculate 'pte' and switch back to smaller pages for the 2222 * end of the mapping, if the trailing size is not enough to 2223 * use another superpage (i.e. nr_pages < lvl_pages). 2224 */ 2225 pte++; 2226 if (!nr_pages || first_pte_in_page(pte) || 2227 (largepage_lvl > 1 && nr_pages < lvl_pages)) { 2228 domain_flush_cache(domain, first_pte, 2229 (void *)pte - (void *)first_pte); 2230 pte = NULL; 2231 } 2232 } 2233 2234 return 0; 2235 } 2236 2237 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn) 2238 { 2239 struct intel_iommu *iommu = info->iommu; 2240 struct context_entry *context; 2241 u16 did_old; 2242 2243 if (!iommu) 2244 return; 2245 2246 spin_lock(&iommu->lock); 2247 context = iommu_context_addr(iommu, bus, devfn, 0); 2248 if (!context) { 2249 spin_unlock(&iommu->lock); 2250 return; 2251 } 2252 2253 if (sm_supported(iommu)) { 2254 if (hw_pass_through && domain_type_is_si(info->domain)) 2255 did_old = FLPT_DEFAULT_DID; 2256 else 2257 did_old = domain_id_iommu(info->domain, iommu); 2258 } else { 2259 did_old = context_domain_id(context); 2260 } 2261 2262 context_clear_entry(context); 2263 __iommu_flush_cache(iommu, context, sizeof(*context)); 2264 spin_unlock(&iommu->lock); 2265 iommu->flush.flush_context(iommu, 2266 did_old, 2267 (((u16)bus) << 8) | devfn, 2268 DMA_CCMD_MASK_NOBIT, 2269 DMA_CCMD_DEVICE_INVL); 2270 2271 if (sm_supported(iommu)) 2272 qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0); 2273 2274 iommu->flush.flush_iotlb(iommu, 2275 did_old, 2276 0, 2277 0, 2278 DMA_TLB_DSI_FLUSH); 2279 2280 __iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH); 2281 } 2282 2283 static int domain_setup_first_level(struct intel_iommu *iommu, 2284 struct dmar_domain *domain, 2285 struct device *dev, 2286 u32 pasid) 2287 { 2288 struct dma_pte *pgd = domain->pgd; 2289 int agaw, level; 2290 int flags = 0; 2291 2292 /* 2293 * Skip top levels of page tables for iommu which has 2294 * less agaw than default. Unnecessary for PT mode. 
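 * For example, if the domain carries a 5-level table but this IOMMU
 * only supports 4 levels, the walk below descends through the first
 * entry of the top table so that the 4-level sub-table is what gets
 * handed to intel_pasid_setup_first_level() as the page-table root.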
2295 */ 2296 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { 2297 pgd = phys_to_virt(dma_pte_addr(pgd)); 2298 if (!dma_pte_present(pgd)) 2299 return -ENOMEM; 2300 } 2301 2302 level = agaw_to_level(agaw); 2303 if (level != 4 && level != 5) 2304 return -EINVAL; 2305 2306 if (level == 5) 2307 flags |= PASID_FLAG_FL5LP; 2308 2309 if (domain->force_snooping) 2310 flags |= PASID_FLAG_PAGE_SNOOP; 2311 2312 return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid, 2313 domain_id_iommu(domain, iommu), 2314 flags); 2315 } 2316 2317 static bool dev_is_real_dma_subdevice(struct device *dev) 2318 { 2319 return dev && dev_is_pci(dev) && 2320 pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev); 2321 } 2322 2323 static int iommu_domain_identity_map(struct dmar_domain *domain, 2324 unsigned long first_vpfn, 2325 unsigned long last_vpfn) 2326 { 2327 /* 2328 * RMRR range might have overlap with physical memory range, 2329 * clear it first 2330 */ 2331 dma_pte_clear_range(domain, first_vpfn, last_vpfn); 2332 2333 return __domain_mapping(domain, first_vpfn, 2334 first_vpfn, last_vpfn - first_vpfn + 1, 2335 DMA_PTE_READ|DMA_PTE_WRITE, GFP_KERNEL); 2336 } 2337 2338 static int md_domain_init(struct dmar_domain *domain, int guest_width); 2339 2340 static int __init si_domain_init(int hw) 2341 { 2342 struct dmar_rmrr_unit *rmrr; 2343 struct device *dev; 2344 int i, nid, ret; 2345 2346 si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY); 2347 if (!si_domain) 2348 return -EFAULT; 2349 2350 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { 2351 domain_exit(si_domain); 2352 si_domain = NULL; 2353 return -EFAULT; 2354 } 2355 2356 if (hw) 2357 return 0; 2358 2359 for_each_online_node(nid) { 2360 unsigned long start_pfn, end_pfn; 2361 int i; 2362 2363 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 2364 ret = iommu_domain_identity_map(si_domain, 2365 mm_to_dma_pfn(start_pfn), 2366 mm_to_dma_pfn(end_pfn)); 2367 if (ret) 2368 return ret; 2369 } 2370 } 2371 2372 /* 2373 * Identity map the RMRRs so that devices with RMRRs could also use 2374 * the si_domain. 2375 */ 2376 for_each_rmrr_units(rmrr) { 2377 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt, 2378 i, dev) { 2379 unsigned long long start = rmrr->base_address; 2380 unsigned long long end = rmrr->end_address; 2381 2382 if (WARN_ON(end < start || 2383 end >> agaw_to_width(si_domain->agaw))) 2384 continue; 2385 2386 ret = iommu_domain_identity_map(si_domain, 2387 mm_to_dma_pfn(start >> PAGE_SHIFT), 2388 mm_to_dma_pfn(end >> PAGE_SHIFT)); 2389 if (ret) 2390 return ret; 2391 } 2392 } 2393 2394 return 0; 2395 } 2396 2397 static int dmar_domain_attach_device(struct dmar_domain *domain, 2398 struct device *dev) 2399 { 2400 struct device_domain_info *info = dev_iommu_priv_get(dev); 2401 struct intel_iommu *iommu; 2402 unsigned long flags; 2403 u8 bus, devfn; 2404 int ret; 2405 2406 iommu = device_to_iommu(dev, &bus, &devfn); 2407 if (!iommu) 2408 return -ENODEV; 2409 2410 ret = domain_attach_iommu(domain, iommu); 2411 if (ret) 2412 return ret; 2413 info->domain = domain; 2414 spin_lock_irqsave(&domain->lock, flags); 2415 list_add(&info->link, &domain->devices); 2416 spin_unlock_irqrestore(&domain->lock, flags); 2417 2418 /* PASID table is mandatory for a PCI device in scalable mode. 
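 * The RID2PASID entry set up below is the one the IOMMU uses for DMA
 * requests that arrive without a PASID.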
*/ 2419 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) { 2420 /* Setup the PASID entry for requests without PASID: */ 2421 if (hw_pass_through && domain_type_is_si(domain)) 2422 ret = intel_pasid_setup_pass_through(iommu, domain, 2423 dev, PASID_RID2PASID); 2424 else if (domain->use_first_level) 2425 ret = domain_setup_first_level(iommu, domain, dev, 2426 PASID_RID2PASID); 2427 else 2428 ret = intel_pasid_setup_second_level(iommu, domain, 2429 dev, PASID_RID2PASID); 2430 if (ret) { 2431 dev_err(dev, "Setup RID2PASID failed\n"); 2432 device_block_translation(dev); 2433 return ret; 2434 } 2435 } 2436 2437 ret = domain_context_mapping(domain, dev); 2438 if (ret) { 2439 dev_err(dev, "Domain context map failed\n"); 2440 device_block_translation(dev); 2441 return ret; 2442 } 2443 2444 iommu_enable_pci_caps(info); 2445 2446 return 0; 2447 } 2448 2449 static bool device_has_rmrr(struct device *dev) 2450 { 2451 struct dmar_rmrr_unit *rmrr; 2452 struct device *tmp; 2453 int i; 2454 2455 rcu_read_lock(); 2456 for_each_rmrr_units(rmrr) { 2457 /* 2458 * Return TRUE if this RMRR contains the device that 2459 * is passed in. 2460 */ 2461 for_each_active_dev_scope(rmrr->devices, 2462 rmrr->devices_cnt, i, tmp) 2463 if (tmp == dev || 2464 is_downstream_to_pci_bridge(dev, tmp)) { 2465 rcu_read_unlock(); 2466 return true; 2467 } 2468 } 2469 rcu_read_unlock(); 2470 return false; 2471 } 2472 2473 /** 2474 * device_rmrr_is_relaxable - Test whether the RMRR of this device 2475 * is relaxable (ie. is allowed to be not enforced under some conditions) 2476 * @dev: device handle 2477 * 2478 * We assume that PCI USB devices with RMRRs have them largely 2479 * for historical reasons and that the RMRR space is not actively used post 2480 * boot. This exclusion may change if vendors begin to abuse it. 2481 * 2482 * The same exception is made for graphics devices, with the requirement that 2483 * any use of the RMRR regions will be torn down before assigning the device 2484 * to a guest. 2485 * 2486 * Return: true if the RMRR is relaxable, false otherwise 2487 */ 2488 static bool device_rmrr_is_relaxable(struct device *dev) 2489 { 2490 struct pci_dev *pdev; 2491 2492 if (!dev_is_pci(dev)) 2493 return false; 2494 2495 pdev = to_pci_dev(dev); 2496 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev)) 2497 return true; 2498 else 2499 return false; 2500 } 2501 2502 /* 2503 * There are a couple cases where we need to restrict the functionality of 2504 * devices associated with RMRRs. The first is when evaluating a device for 2505 * identity mapping because problems exist when devices are moved in and out 2506 * of domains and their respective RMRR information is lost. This means that 2507 * a device with associated RMRRs will never be in a "passthrough" domain. 2508 * The second is use of the device through the IOMMU API. This interface 2509 * expects to have full control of the IOVA space for the device. We cannot 2510 * satisfy both the requirement that RMRR access is maintained and have an 2511 * unencumbered IOVA space. We also have no ability to quiesce the device's 2512 * use of the RMRR space or even inform the IOMMU API user of the restriction. 2513 * We therefore prevent devices associated with an RMRR from participating in 2514 * the IOMMU API, which eliminates them from device assignment. 2515 * 2516 * In both cases, devices which have relaxable RMRRs are not concerned by this 2517 * restriction. See device_rmrr_is_relaxable comment. 
2518 */ 2519 static bool device_is_rmrr_locked(struct device *dev) 2520 { 2521 if (!device_has_rmrr(dev)) 2522 return false; 2523 2524 if (device_rmrr_is_relaxable(dev)) 2525 return false; 2526 2527 return true; 2528 } 2529 2530 /* 2531 * Return the required default domain type for a specific device. 2532 * 2533 * @dev: the device in query 2534 * @startup: true if this is during early boot 2535 * 2536 * Returns: 2537 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain 2538 * - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain 2539 * - 0: both identity and dynamic domains work for this device 2540 */ 2541 static int device_def_domain_type(struct device *dev) 2542 { 2543 if (dev_is_pci(dev)) { 2544 struct pci_dev *pdev = to_pci_dev(dev); 2545 2546 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev)) 2547 return IOMMU_DOMAIN_IDENTITY; 2548 2549 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev)) 2550 return IOMMU_DOMAIN_IDENTITY; 2551 } 2552 2553 return 0; 2554 } 2555 2556 static void intel_iommu_init_qi(struct intel_iommu *iommu) 2557 { 2558 /* 2559 * Start from the sane iommu hardware state. 2560 * If the queued invalidation is already initialized by us 2561 * (for example, while enabling interrupt-remapping) then 2562 * we got the things already rolling from a sane state. 2563 */ 2564 if (!iommu->qi) { 2565 /* 2566 * Clear any previous faults. 2567 */ 2568 dmar_fault(-1, iommu); 2569 /* 2570 * Disable queued invalidation if supported and already enabled 2571 * before OS handover. 2572 */ 2573 dmar_disable_qi(iommu); 2574 } 2575 2576 if (dmar_enable_qi(iommu)) { 2577 /* 2578 * Queued Invalidate not enabled, use Register Based Invalidate 2579 */ 2580 iommu->flush.flush_context = __iommu_flush_context; 2581 iommu->flush.flush_iotlb = __iommu_flush_iotlb; 2582 pr_info("%s: Using Register based invalidation\n", 2583 iommu->name); 2584 } else { 2585 iommu->flush.flush_context = qi_flush_context; 2586 iommu->flush.flush_iotlb = qi_flush_iotlb; 2587 pr_info("%s: Using Queued invalidation\n", iommu->name); 2588 } 2589 } 2590 2591 static int copy_context_table(struct intel_iommu *iommu, 2592 struct root_entry *old_re, 2593 struct context_entry **tbl, 2594 int bus, bool ext) 2595 { 2596 int tbl_idx, pos = 0, idx, devfn, ret = 0, did; 2597 struct context_entry *new_ce = NULL, ce; 2598 struct context_entry *old_ce = NULL; 2599 struct root_entry re; 2600 phys_addr_t old_ce_phys; 2601 2602 tbl_idx = ext ? bus * 2 : bus; 2603 memcpy(&re, old_re, sizeof(re)); 2604 2605 for (devfn = 0; devfn < 256; devfn++) { 2606 /* First calculate the correct index */ 2607 idx = (ext ? 
devfn * 2 : devfn) % 256; 2608 2609 if (idx == 0) { 2610 /* First save what we may have and clean up */ 2611 if (new_ce) { 2612 tbl[tbl_idx] = new_ce; 2613 __iommu_flush_cache(iommu, new_ce, 2614 VTD_PAGE_SIZE); 2615 pos = 1; 2616 } 2617 2618 if (old_ce) 2619 memunmap(old_ce); 2620 2621 ret = 0; 2622 if (devfn < 0x80) 2623 old_ce_phys = root_entry_lctp(&re); 2624 else 2625 old_ce_phys = root_entry_uctp(&re); 2626 2627 if (!old_ce_phys) { 2628 if (ext && devfn == 0) { 2629 /* No LCTP, try UCTP */ 2630 devfn = 0x7f; 2631 continue; 2632 } else { 2633 goto out; 2634 } 2635 } 2636 2637 ret = -ENOMEM; 2638 old_ce = memremap(old_ce_phys, PAGE_SIZE, 2639 MEMREMAP_WB); 2640 if (!old_ce) 2641 goto out; 2642 2643 new_ce = alloc_pgtable_page(iommu->node, GFP_KERNEL); 2644 if (!new_ce) 2645 goto out_unmap; 2646 2647 ret = 0; 2648 } 2649 2650 /* Now copy the context entry */ 2651 memcpy(&ce, old_ce + idx, sizeof(ce)); 2652 2653 if (!context_present(&ce)) 2654 continue; 2655 2656 did = context_domain_id(&ce); 2657 if (did >= 0 && did < cap_ndoms(iommu->cap)) 2658 set_bit(did, iommu->domain_ids); 2659 2660 set_context_copied(iommu, bus, devfn); 2661 new_ce[idx] = ce; 2662 } 2663 2664 tbl[tbl_idx + pos] = new_ce; 2665 2666 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE); 2667 2668 out_unmap: 2669 memunmap(old_ce); 2670 2671 out: 2672 return ret; 2673 } 2674 2675 static int copy_translation_tables(struct intel_iommu *iommu) 2676 { 2677 struct context_entry **ctxt_tbls; 2678 struct root_entry *old_rt; 2679 phys_addr_t old_rt_phys; 2680 int ctxt_table_entries; 2681 u64 rtaddr_reg; 2682 int bus, ret; 2683 bool new_ext, ext; 2684 2685 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG); 2686 ext = !!(rtaddr_reg & DMA_RTADDR_SMT); 2687 new_ext = !!sm_supported(iommu); 2688 2689 /* 2690 * The RTT bit can only be changed when translation is disabled, 2691 * but disabling translation means to open a window for data 2692 * corruption. So bail out and don't copy anything if we would 2693 * have to change the bit. 2694 */ 2695 if (new_ext != ext) 2696 return -EINVAL; 2697 2698 iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL); 2699 if (!iommu->copied_tables) 2700 return -ENOMEM; 2701 2702 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK; 2703 if (!old_rt_phys) 2704 return -EINVAL; 2705 2706 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB); 2707 if (!old_rt) 2708 return -ENOMEM; 2709 2710 /* This is too big for the stack - allocate it from slab */ 2711 ctxt_table_entries = ext ? 512 : 256; 2712 ret = -ENOMEM; 2713 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL); 2714 if (!ctxt_tbls) 2715 goto out_unmap; 2716 2717 for (bus = 0; bus < 256; bus++) { 2718 ret = copy_context_table(iommu, &old_rt[bus], 2719 ctxt_tbls, bus, ext); 2720 if (ret) { 2721 pr_err("%s: Failed to copy context table for bus %d\n", 2722 iommu->name, bus); 2723 continue; 2724 } 2725 } 2726 2727 spin_lock(&iommu->lock); 2728 2729 /* Context tables are copied, now write them to the root_entry table */ 2730 for (bus = 0; bus < 256; bus++) { 2731 int idx = ext ? 
bus * 2 : bus; 2732 u64 val; 2733 2734 if (ctxt_tbls[idx]) { 2735 val = virt_to_phys(ctxt_tbls[idx]) | 1; 2736 iommu->root_entry[bus].lo = val; 2737 } 2738 2739 if (!ext || !ctxt_tbls[idx + 1]) 2740 continue; 2741 2742 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1; 2743 iommu->root_entry[bus].hi = val; 2744 } 2745 2746 spin_unlock(&iommu->lock); 2747 2748 kfree(ctxt_tbls); 2749 2750 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE); 2751 2752 ret = 0; 2753 2754 out_unmap: 2755 memunmap(old_rt); 2756 2757 return ret; 2758 } 2759 2760 static int __init init_dmars(void) 2761 { 2762 struct dmar_drhd_unit *drhd; 2763 struct intel_iommu *iommu; 2764 int ret; 2765 2766 ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL); 2767 if (ret) 2768 goto free_iommu; 2769 2770 for_each_iommu(iommu, drhd) { 2771 if (drhd->ignored) { 2772 iommu_disable_translation(iommu); 2773 continue; 2774 } 2775 2776 /* 2777 * Find the max pasid size of all IOMMU's in the system. 2778 * We need to ensure the system pasid table is no bigger 2779 * than the smallest supported. 2780 */ 2781 if (pasid_supported(iommu)) { 2782 u32 temp = 2 << ecap_pss(iommu->ecap); 2783 2784 intel_pasid_max_id = min_t(u32, temp, 2785 intel_pasid_max_id); 2786 } 2787 2788 intel_iommu_init_qi(iommu); 2789 2790 ret = iommu_init_domains(iommu); 2791 if (ret) 2792 goto free_iommu; 2793 2794 init_translation_status(iommu); 2795 2796 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) { 2797 iommu_disable_translation(iommu); 2798 clear_translation_pre_enabled(iommu); 2799 pr_warn("Translation was enabled for %s but we are not in kdump mode\n", 2800 iommu->name); 2801 } 2802 2803 /* 2804 * TBD: 2805 * we could share the same root & context tables 2806 * among all IOMMU's. Need to Split it later. 2807 */ 2808 ret = iommu_alloc_root_entry(iommu); 2809 if (ret) 2810 goto free_iommu; 2811 2812 if (translation_pre_enabled(iommu)) { 2813 pr_info("Translation already enabled - trying to copy translation structures\n"); 2814 2815 ret = copy_translation_tables(iommu); 2816 if (ret) { 2817 /* 2818 * We found the IOMMU with translation 2819 * enabled - but failed to copy over the 2820 * old root-entry table. Try to proceed 2821 * by disabling translation now and 2822 * allocating a clean root-entry table. 2823 * This might cause DMAR faults, but 2824 * probably the dump will still succeed. 2825 */ 2826 pr_err("Failed to copy translation tables from previous kernel for %s\n", 2827 iommu->name); 2828 iommu_disable_translation(iommu); 2829 clear_translation_pre_enabled(iommu); 2830 } else { 2831 pr_info("Copied translation tables from previous kernel for %s\n", 2832 iommu->name); 2833 } 2834 } 2835 2836 if (!ecap_pass_through(iommu->ecap)) 2837 hw_pass_through = 0; 2838 intel_svm_check(iommu); 2839 } 2840 2841 /* 2842 * Now that qi is enabled on all iommus, set the root entry and flush 2843 * caches. This is required on some Intel X58 chipsets, otherwise the 2844 * flush_context function will loop forever and the boot hangs. 
2845 */ 2846 for_each_active_iommu(iommu, drhd) { 2847 iommu_flush_write_buffer(iommu); 2848 iommu_set_root_entry(iommu); 2849 } 2850 2851 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA 2852 dmar_map_gfx = 0; 2853 #endif 2854 2855 if (!dmar_map_gfx) 2856 iommu_identity_mapping |= IDENTMAP_GFX; 2857 2858 check_tylersburg_isoch(); 2859 2860 ret = si_domain_init(hw_pass_through); 2861 if (ret) 2862 goto free_iommu; 2863 2864 /* 2865 * for each drhd 2866 * enable fault log 2867 * global invalidate context cache 2868 * global invalidate iotlb 2869 * enable translation 2870 */ 2871 for_each_iommu(iommu, drhd) { 2872 if (drhd->ignored) { 2873 /* 2874 * we always have to disable PMRs or DMA may fail on 2875 * this device 2876 */ 2877 if (force_on) 2878 iommu_disable_protect_mem_regions(iommu); 2879 continue; 2880 } 2881 2882 iommu_flush_write_buffer(iommu); 2883 2884 #ifdef CONFIG_INTEL_IOMMU_SVM 2885 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) { 2886 /* 2887 * Call dmar_alloc_hwirq() with dmar_global_lock held, 2888 * could cause possible lock race condition. 2889 */ 2890 up_write(&dmar_global_lock); 2891 ret = intel_svm_enable_prq(iommu); 2892 down_write(&dmar_global_lock); 2893 if (ret) 2894 goto free_iommu; 2895 } 2896 #endif 2897 ret = dmar_set_interrupt(iommu); 2898 if (ret) 2899 goto free_iommu; 2900 } 2901 2902 return 0; 2903 2904 free_iommu: 2905 for_each_active_iommu(iommu, drhd) { 2906 disable_dmar_iommu(iommu); 2907 free_dmar_iommu(iommu); 2908 } 2909 if (si_domain) { 2910 domain_exit(si_domain); 2911 si_domain = NULL; 2912 } 2913 2914 return ret; 2915 } 2916 2917 static void __init init_no_remapping_devices(void) 2918 { 2919 struct dmar_drhd_unit *drhd; 2920 struct device *dev; 2921 int i; 2922 2923 for_each_drhd_unit(drhd) { 2924 if (!drhd->include_all) { 2925 for_each_active_dev_scope(drhd->devices, 2926 drhd->devices_cnt, i, dev) 2927 break; 2928 /* ignore DMAR unit if no devices exist */ 2929 if (i == drhd->devices_cnt) 2930 drhd->ignored = 1; 2931 } 2932 } 2933 2934 for_each_active_drhd_unit(drhd) { 2935 if (drhd->include_all) 2936 continue; 2937 2938 for_each_active_dev_scope(drhd->devices, 2939 drhd->devices_cnt, i, dev) 2940 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev))) 2941 break; 2942 if (i < drhd->devices_cnt) 2943 continue; 2944 2945 /* This IOMMU has *only* gfx devices. 
Either bypass it or 2946 set the gfx_mapped flag, as appropriate */ 2947 drhd->gfx_dedicated = 1; 2948 if (!dmar_map_gfx) 2949 drhd->ignored = 1; 2950 } 2951 } 2952 2953 #ifdef CONFIG_SUSPEND 2954 static int init_iommu_hw(void) 2955 { 2956 struct dmar_drhd_unit *drhd; 2957 struct intel_iommu *iommu = NULL; 2958 int ret; 2959 2960 for_each_active_iommu(iommu, drhd) { 2961 if (iommu->qi) { 2962 ret = dmar_reenable_qi(iommu); 2963 if (ret) 2964 return ret; 2965 } 2966 } 2967 2968 for_each_iommu(iommu, drhd) { 2969 if (drhd->ignored) { 2970 /* 2971 * we always have to disable PMRs or DMA may fail on 2972 * this device 2973 */ 2974 if (force_on) 2975 iommu_disable_protect_mem_regions(iommu); 2976 continue; 2977 } 2978 2979 iommu_flush_write_buffer(iommu); 2980 iommu_set_root_entry(iommu); 2981 iommu_enable_translation(iommu); 2982 iommu_disable_protect_mem_regions(iommu); 2983 } 2984 2985 return 0; 2986 } 2987 2988 static void iommu_flush_all(void) 2989 { 2990 struct dmar_drhd_unit *drhd; 2991 struct intel_iommu *iommu; 2992 2993 for_each_active_iommu(iommu, drhd) { 2994 iommu->flush.flush_context(iommu, 0, 0, 0, 2995 DMA_CCMD_GLOBAL_INVL); 2996 iommu->flush.flush_iotlb(iommu, 0, 0, 0, 2997 DMA_TLB_GLOBAL_FLUSH); 2998 } 2999 } 3000 3001 static int iommu_suspend(void) 3002 { 3003 struct dmar_drhd_unit *drhd; 3004 struct intel_iommu *iommu = NULL; 3005 unsigned long flag; 3006 3007 for_each_active_iommu(iommu, drhd) { 3008 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32), 3009 GFP_KERNEL); 3010 if (!iommu->iommu_state) 3011 goto nomem; 3012 } 3013 3014 iommu_flush_all(); 3015 3016 for_each_active_iommu(iommu, drhd) { 3017 iommu_disable_translation(iommu); 3018 3019 raw_spin_lock_irqsave(&iommu->register_lock, flag); 3020 3021 iommu->iommu_state[SR_DMAR_FECTL_REG] = 3022 readl(iommu->reg + DMAR_FECTL_REG); 3023 iommu->iommu_state[SR_DMAR_FEDATA_REG] = 3024 readl(iommu->reg + DMAR_FEDATA_REG); 3025 iommu->iommu_state[SR_DMAR_FEADDR_REG] = 3026 readl(iommu->reg + DMAR_FEADDR_REG); 3027 iommu->iommu_state[SR_DMAR_FEUADDR_REG] = 3028 readl(iommu->reg + DMAR_FEUADDR_REG); 3029 3030 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 3031 } 3032 return 0; 3033 3034 nomem: 3035 for_each_active_iommu(iommu, drhd) 3036 kfree(iommu->iommu_state); 3037 3038 return -ENOMEM; 3039 } 3040 3041 static void iommu_resume(void) 3042 { 3043 struct dmar_drhd_unit *drhd; 3044 struct intel_iommu *iommu = NULL; 3045 unsigned long flag; 3046 3047 if (init_iommu_hw()) { 3048 if (force_on) 3049 panic("tboot: IOMMU setup failed, DMAR can not resume!\n"); 3050 else 3051 WARN(1, "IOMMU setup failed, DMAR can not resume!\n"); 3052 return; 3053 } 3054 3055 for_each_active_iommu(iommu, drhd) { 3056 3057 raw_spin_lock_irqsave(&iommu->register_lock, flag); 3058 3059 writel(iommu->iommu_state[SR_DMAR_FECTL_REG], 3060 iommu->reg + DMAR_FECTL_REG); 3061 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG], 3062 iommu->reg + DMAR_FEDATA_REG); 3063 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG], 3064 iommu->reg + DMAR_FEADDR_REG); 3065 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG], 3066 iommu->reg + DMAR_FEUADDR_REG); 3067 3068 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 3069 } 3070 3071 for_each_active_iommu(iommu, drhd) 3072 kfree(iommu->iommu_state); 3073 } 3074 3075 static struct syscore_ops iommu_syscore_ops = { 3076 .resume = iommu_resume, 3077 .suspend = iommu_suspend, 3078 }; 3079 3080 static void __init init_iommu_pm_ops(void) 3081 { 3082 register_syscore_ops(&iommu_syscore_ops); 3083 } 3084 3085 #else 3086 
static inline void init_iommu_pm_ops(void) {} 3087 #endif /* CONFIG_SUSPEND */ 3088 3089 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr) 3090 { 3091 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) || 3092 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) || 3093 rmrr->end_address <= rmrr->base_address || 3094 arch_rmrr_sanity_check(rmrr)) 3095 return -EINVAL; 3096 3097 return 0; 3098 } 3099 3100 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg) 3101 { 3102 struct acpi_dmar_reserved_memory *rmrr; 3103 struct dmar_rmrr_unit *rmrru; 3104 3105 rmrr = (struct acpi_dmar_reserved_memory *)header; 3106 if (rmrr_sanity_check(rmrr)) { 3107 pr_warn(FW_BUG 3108 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n" 3109 "BIOS vendor: %s; Ver: %s; Product Version: %s\n", 3110 rmrr->base_address, rmrr->end_address, 3111 dmi_get_system_info(DMI_BIOS_VENDOR), 3112 dmi_get_system_info(DMI_BIOS_VERSION), 3113 dmi_get_system_info(DMI_PRODUCT_VERSION)); 3114 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); 3115 } 3116 3117 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL); 3118 if (!rmrru) 3119 goto out; 3120 3121 rmrru->hdr = header; 3122 3123 rmrru->base_address = rmrr->base_address; 3124 rmrru->end_address = rmrr->end_address; 3125 3126 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1), 3127 ((void *)rmrr) + rmrr->header.length, 3128 &rmrru->devices_cnt); 3129 if (rmrru->devices_cnt && rmrru->devices == NULL) 3130 goto free_rmrru; 3131 3132 list_add(&rmrru->list, &dmar_rmrr_units); 3133 3134 return 0; 3135 free_rmrru: 3136 kfree(rmrru); 3137 out: 3138 return -ENOMEM; 3139 } 3140 3141 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr) 3142 { 3143 struct dmar_atsr_unit *atsru; 3144 struct acpi_dmar_atsr *tmp; 3145 3146 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list, 3147 dmar_rcu_check()) { 3148 tmp = (struct acpi_dmar_atsr *)atsru->hdr; 3149 if (atsr->segment != tmp->segment) 3150 continue; 3151 if (atsr->header.length != tmp->header.length) 3152 continue; 3153 if (memcmp(atsr, tmp, atsr->header.length) == 0) 3154 return atsru; 3155 } 3156 3157 return NULL; 3158 } 3159 3160 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg) 3161 { 3162 struct acpi_dmar_atsr *atsr; 3163 struct dmar_atsr_unit *atsru; 3164 3165 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled) 3166 return 0; 3167 3168 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 3169 atsru = dmar_find_atsr(atsr); 3170 if (atsru) 3171 return 0; 3172 3173 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL); 3174 if (!atsru) 3175 return -ENOMEM; 3176 3177 /* 3178 * If memory is allocated from slab by the ACPI _DSM method, we need to 3179 * copy the memory content because the memory buffer will be freed 3180 * on return.
3181 */ 3182 atsru->hdr = (void *)(atsru + 1); 3183 memcpy(atsru->hdr, hdr, hdr->length); 3184 atsru->include_all = atsr->flags & 0x1; 3185 if (!atsru->include_all) { 3186 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1), 3187 (void *)atsr + atsr->header.length, 3188 &atsru->devices_cnt); 3189 if (atsru->devices_cnt && atsru->devices == NULL) { 3190 kfree(atsru); 3191 return -ENOMEM; 3192 } 3193 } 3194 3195 list_add_rcu(&atsru->list, &dmar_atsr_units); 3196 3197 return 0; 3198 } 3199 3200 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru) 3201 { 3202 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt); 3203 kfree(atsru); 3204 } 3205 3206 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg) 3207 { 3208 struct acpi_dmar_atsr *atsr; 3209 struct dmar_atsr_unit *atsru; 3210 3211 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 3212 atsru = dmar_find_atsr(atsr); 3213 if (atsru) { 3214 list_del_rcu(&atsru->list); 3215 synchronize_rcu(); 3216 intel_iommu_free_atsr(atsru); 3217 } 3218 3219 return 0; 3220 } 3221 3222 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg) 3223 { 3224 int i; 3225 struct device *dev; 3226 struct acpi_dmar_atsr *atsr; 3227 struct dmar_atsr_unit *atsru; 3228 3229 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 3230 atsru = dmar_find_atsr(atsr); 3231 if (!atsru) 3232 return 0; 3233 3234 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) { 3235 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt, 3236 i, dev) 3237 return -EBUSY; 3238 } 3239 3240 return 0; 3241 } 3242 3243 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc) 3244 { 3245 struct dmar_satc_unit *satcu; 3246 struct acpi_dmar_satc *tmp; 3247 3248 list_for_each_entry_rcu(satcu, &dmar_satc_units, list, 3249 dmar_rcu_check()) { 3250 tmp = (struct acpi_dmar_satc *)satcu->hdr; 3251 if (satc->segment != tmp->segment) 3252 continue; 3253 if (satc->header.length != tmp->header.length) 3254 continue; 3255 if (memcmp(satc, tmp, satc->header.length) == 0) 3256 return satcu; 3257 } 3258 3259 return NULL; 3260 } 3261 3262 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg) 3263 { 3264 struct acpi_dmar_satc *satc; 3265 struct dmar_satc_unit *satcu; 3266 3267 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled) 3268 return 0; 3269 3270 satc = container_of(hdr, struct acpi_dmar_satc, header); 3271 satcu = dmar_find_satc(satc); 3272 if (satcu) 3273 return 0; 3274 3275 satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL); 3276 if (!satcu) 3277 return -ENOMEM; 3278 3279 satcu->hdr = (void *)(satcu + 1); 3280 memcpy(satcu->hdr, hdr, hdr->length); 3281 satcu->atc_required = satc->flags & 0x1; 3282 satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1), 3283 (void *)satc + satc->header.length, 3284 &satcu->devices_cnt); 3285 if (satcu->devices_cnt && !satcu->devices) { 3286 kfree(satcu); 3287 return -ENOMEM; 3288 } 3289 list_add_rcu(&satcu->list, &dmar_satc_units); 3290 3291 return 0; 3292 } 3293 3294 static int intel_iommu_add(struct dmar_drhd_unit *dmaru) 3295 { 3296 int sp, ret; 3297 struct intel_iommu *iommu = dmaru->iommu; 3298 3299 ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu); 3300 if (ret) 3301 goto out; 3302 3303 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) { 3304 pr_warn("%s: Doesn't support hardware pass through.\n", 3305 iommu->name); 3306 return -ENXIO; 3307 } 3308 3309 sp = domain_update_iommu_superpage(NULL, iommu) - 1; 3310 if (sp >= 0 && 
!(cap_super_page_val(iommu->cap) & (1 << sp))) { 3311 pr_warn("%s: Doesn't support large page.\n", 3312 iommu->name); 3313 return -ENXIO; 3314 } 3315 3316 /* 3317 * Disable translation if already enabled prior to OS handover. 3318 */ 3319 if (iommu->gcmd & DMA_GCMD_TE) 3320 iommu_disable_translation(iommu); 3321 3322 ret = iommu_init_domains(iommu); 3323 if (ret == 0) 3324 ret = iommu_alloc_root_entry(iommu); 3325 if (ret) 3326 goto out; 3327 3328 intel_svm_check(iommu); 3329 3330 if (dmaru->ignored) { 3331 /* 3332 * we always have to disable PMRs or DMA may fail on this device 3333 */ 3334 if (force_on) 3335 iommu_disable_protect_mem_regions(iommu); 3336 return 0; 3337 } 3338 3339 intel_iommu_init_qi(iommu); 3340 iommu_flush_write_buffer(iommu); 3341 3342 #ifdef CONFIG_INTEL_IOMMU_SVM 3343 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) { 3344 ret = intel_svm_enable_prq(iommu); 3345 if (ret) 3346 goto disable_iommu; 3347 } 3348 #endif 3349 ret = dmar_set_interrupt(iommu); 3350 if (ret) 3351 goto disable_iommu; 3352 3353 iommu_set_root_entry(iommu); 3354 iommu_enable_translation(iommu); 3355 3356 iommu_disable_protect_mem_regions(iommu); 3357 return 0; 3358 3359 disable_iommu: 3360 disable_dmar_iommu(iommu); 3361 out: 3362 free_dmar_iommu(iommu); 3363 return ret; 3364 } 3365 3366 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert) 3367 { 3368 int ret = 0; 3369 struct intel_iommu *iommu = dmaru->iommu; 3370 3371 if (!intel_iommu_enabled) 3372 return 0; 3373 if (iommu == NULL) 3374 return -EINVAL; 3375 3376 if (insert) { 3377 ret = intel_iommu_add(dmaru); 3378 } else { 3379 disable_dmar_iommu(iommu); 3380 free_dmar_iommu(iommu); 3381 } 3382 3383 return ret; 3384 } 3385 3386 static void intel_iommu_free_dmars(void) 3387 { 3388 struct dmar_rmrr_unit *rmrru, *rmrr_n; 3389 struct dmar_atsr_unit *atsru, *atsr_n; 3390 struct dmar_satc_unit *satcu, *satc_n; 3391 3392 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) { 3393 list_del(&rmrru->list); 3394 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt); 3395 kfree(rmrru); 3396 } 3397 3398 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) { 3399 list_del(&atsru->list); 3400 intel_iommu_free_atsr(atsru); 3401 } 3402 list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) { 3403 list_del(&satcu->list); 3404 dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt); 3405 kfree(satcu); 3406 } 3407 } 3408 3409 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev) 3410 { 3411 struct dmar_satc_unit *satcu; 3412 struct acpi_dmar_satc *satc; 3413 struct device *tmp; 3414 int i; 3415 3416 dev = pci_physfn(dev); 3417 rcu_read_lock(); 3418 3419 list_for_each_entry_rcu(satcu, &dmar_satc_units, list) { 3420 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header); 3421 if (satc->segment != pci_domain_nr(dev->bus)) 3422 continue; 3423 for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp) 3424 if (to_pci_dev(tmp) == dev) 3425 goto out; 3426 } 3427 satcu = NULL; 3428 out: 3429 rcu_read_unlock(); 3430 return satcu; 3431 } 3432 3433 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu) 3434 { 3435 int i, ret = 1; 3436 struct pci_bus *bus; 3437 struct pci_dev *bridge = NULL; 3438 struct device *tmp; 3439 struct acpi_dmar_atsr *atsr; 3440 struct dmar_atsr_unit *atsru; 3441 struct dmar_satc_unit *satcu; 3442 3443 dev = pci_physfn(dev); 3444 satcu = dmar_find_matched_satc_unit(dev); 3445 if (satcu) 3446 /* 3447 * This device supports ATS as it is in 
SATC table. 3448 * When IOMMU is in legacy mode, enabling ATS is done 3449 * automatically by HW for the device that requires 3450 * ATS, hence OS should not enable this device ATS 3451 * to avoid duplicated TLB invalidation. 3452 */ 3453 return !(satcu->atc_required && !sm_supported(iommu)); 3454 3455 for (bus = dev->bus; bus; bus = bus->parent) { 3456 bridge = bus->self; 3457 /* If it's an integrated device, allow ATS */ 3458 if (!bridge) 3459 return 1; 3460 /* Connected via non-PCIe: no ATS */ 3461 if (!pci_is_pcie(bridge) || 3462 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE) 3463 return 0; 3464 /* If we found the root port, look it up in the ATSR */ 3465 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT) 3466 break; 3467 } 3468 3469 rcu_read_lock(); 3470 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) { 3471 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header); 3472 if (atsr->segment != pci_domain_nr(dev->bus)) 3473 continue; 3474 3475 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp) 3476 if (tmp == &bridge->dev) 3477 goto out; 3478 3479 if (atsru->include_all) 3480 goto out; 3481 } 3482 ret = 0; 3483 out: 3484 rcu_read_unlock(); 3485 3486 return ret; 3487 } 3488 3489 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info) 3490 { 3491 int ret; 3492 struct dmar_rmrr_unit *rmrru; 3493 struct dmar_atsr_unit *atsru; 3494 struct dmar_satc_unit *satcu; 3495 struct acpi_dmar_atsr *atsr; 3496 struct acpi_dmar_reserved_memory *rmrr; 3497 struct acpi_dmar_satc *satc; 3498 3499 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING) 3500 return 0; 3501 3502 list_for_each_entry(rmrru, &dmar_rmrr_units, list) { 3503 rmrr = container_of(rmrru->hdr, 3504 struct acpi_dmar_reserved_memory, header); 3505 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 3506 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1), 3507 ((void *)rmrr) + rmrr->header.length, 3508 rmrr->segment, rmrru->devices, 3509 rmrru->devices_cnt); 3510 if (ret < 0) 3511 return ret; 3512 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 3513 dmar_remove_dev_scope(info, rmrr->segment, 3514 rmrru->devices, rmrru->devices_cnt); 3515 } 3516 } 3517 3518 list_for_each_entry(atsru, &dmar_atsr_units, list) { 3519 if (atsru->include_all) 3520 continue; 3521 3522 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header); 3523 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 3524 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1), 3525 (void *)atsr + atsr->header.length, 3526 atsr->segment, atsru->devices, 3527 atsru->devices_cnt); 3528 if (ret > 0) 3529 break; 3530 else if (ret < 0) 3531 return ret; 3532 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 3533 if (dmar_remove_dev_scope(info, atsr->segment, 3534 atsru->devices, atsru->devices_cnt)) 3535 break; 3536 } 3537 } 3538 list_for_each_entry(satcu, &dmar_satc_units, list) { 3539 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header); 3540 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 3541 ret = dmar_insert_dev_scope(info, (void *)(satc + 1), 3542 (void *)satc + satc->header.length, 3543 satc->segment, satcu->devices, 3544 satcu->devices_cnt); 3545 if (ret > 0) 3546 break; 3547 else if (ret < 0) 3548 return ret; 3549 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 3550 if (dmar_remove_dev_scope(info, satc->segment, 3551 satcu->devices, satcu->devices_cnt)) 3552 break; 3553 } 3554 } 3555 3556 return 0; 3557 } 3558 3559 static int intel_iommu_memory_notifier(struct notifier_block *nb, 3560 unsigned long val, void *v) 3561 { 
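	/*
	 * Keep the si_domain identity mappings in sync with memory hotplug:
	 * map ranges that are coming online, unmap and flush ranges that go
	 * offline (or whose onlining is cancelled).
	 */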
3562 struct memory_notify *mhp = v; 3563 unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn); 3564 unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn + 3565 mhp->nr_pages - 1); 3566 3567 switch (val) { 3568 case MEM_GOING_ONLINE: 3569 if (iommu_domain_identity_map(si_domain, 3570 start_vpfn, last_vpfn)) { 3571 pr_warn("Failed to build identity map for [%lx-%lx]\n", 3572 start_vpfn, last_vpfn); 3573 return NOTIFY_BAD; 3574 } 3575 break; 3576 3577 case MEM_OFFLINE: 3578 case MEM_CANCEL_ONLINE: 3579 { 3580 struct dmar_drhd_unit *drhd; 3581 struct intel_iommu *iommu; 3582 LIST_HEAD(freelist); 3583 3584 domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist); 3585 3586 rcu_read_lock(); 3587 for_each_active_iommu(iommu, drhd) 3588 iommu_flush_iotlb_psi(iommu, si_domain, 3589 start_vpfn, mhp->nr_pages, 3590 list_empty(&freelist), 0); 3591 rcu_read_unlock(); 3592 put_pages_list(&freelist); 3593 } 3594 break; 3595 } 3596 3597 return NOTIFY_OK; 3598 } 3599 3600 static struct notifier_block intel_iommu_memory_nb = { 3601 .notifier_call = intel_iommu_memory_notifier, 3602 .priority = 0 3603 }; 3604 3605 static void intel_disable_iommus(void) 3606 { 3607 struct intel_iommu *iommu = NULL; 3608 struct dmar_drhd_unit *drhd; 3609 3610 for_each_iommu(iommu, drhd) 3611 iommu_disable_translation(iommu); 3612 } 3613 3614 void intel_iommu_shutdown(void) 3615 { 3616 struct dmar_drhd_unit *drhd; 3617 struct intel_iommu *iommu = NULL; 3618 3619 if (no_iommu || dmar_disabled) 3620 return; 3621 3622 down_write(&dmar_global_lock); 3623 3624 /* Disable PMRs explicitly here. */ 3625 for_each_iommu(iommu, drhd) 3626 iommu_disable_protect_mem_regions(iommu); 3627 3628 /* Make sure the IOMMUs are switched off */ 3629 intel_disable_iommus(); 3630 3631 up_write(&dmar_global_lock); 3632 } 3633 3634 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev) 3635 { 3636 struct iommu_device *iommu_dev = dev_to_iommu_device(dev); 3637 3638 return container_of(iommu_dev, struct intel_iommu, iommu); 3639 } 3640 3641 static ssize_t version_show(struct device *dev, 3642 struct device_attribute *attr, char *buf) 3643 { 3644 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3645 u32 ver = readl(iommu->reg + DMAR_VER_REG); 3646 return sysfs_emit(buf, "%d:%d\n", 3647 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver)); 3648 } 3649 static DEVICE_ATTR_RO(version); 3650 3651 static ssize_t address_show(struct device *dev, 3652 struct device_attribute *attr, char *buf) 3653 { 3654 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3655 return sysfs_emit(buf, "%llx\n", iommu->reg_phys); 3656 } 3657 static DEVICE_ATTR_RO(address); 3658 3659 static ssize_t cap_show(struct device *dev, 3660 struct device_attribute *attr, char *buf) 3661 { 3662 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3663 return sysfs_emit(buf, "%llx\n", iommu->cap); 3664 } 3665 static DEVICE_ATTR_RO(cap); 3666 3667 static ssize_t ecap_show(struct device *dev, 3668 struct device_attribute *attr, char *buf) 3669 { 3670 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3671 return sysfs_emit(buf, "%llx\n", iommu->ecap); 3672 } 3673 static DEVICE_ATTR_RO(ecap); 3674 3675 static ssize_t domains_supported_show(struct device *dev, 3676 struct device_attribute *attr, char *buf) 3677 { 3678 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3679 return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap)); 3680 } 3681 static DEVICE_ATTR_RO(domains_supported); 3682 3683 static ssize_t domains_used_show(struct device *dev, 3684 struct device_attribute 
*attr, char *buf) 3685 { 3686 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3687 return sysfs_emit(buf, "%d\n", 3688 bitmap_weight(iommu->domain_ids, 3689 cap_ndoms(iommu->cap))); 3690 } 3691 static DEVICE_ATTR_RO(domains_used); 3692 3693 static struct attribute *intel_iommu_attrs[] = { 3694 &dev_attr_version.attr, 3695 &dev_attr_address.attr, 3696 &dev_attr_cap.attr, 3697 &dev_attr_ecap.attr, 3698 &dev_attr_domains_supported.attr, 3699 &dev_attr_domains_used.attr, 3700 NULL, 3701 }; 3702 3703 static struct attribute_group intel_iommu_group = { 3704 .name = "intel-iommu", 3705 .attrs = intel_iommu_attrs, 3706 }; 3707 3708 const struct attribute_group *intel_iommu_groups[] = { 3709 &intel_iommu_group, 3710 NULL, 3711 }; 3712 3713 static inline bool has_external_pci(void) 3714 { 3715 struct pci_dev *pdev = NULL; 3716 3717 for_each_pci_dev(pdev) 3718 if (pdev->external_facing) { 3719 pci_dev_put(pdev); 3720 return true; 3721 } 3722 3723 return false; 3724 } 3725 3726 static int __init platform_optin_force_iommu(void) 3727 { 3728 if (!dmar_platform_optin() || no_platform_optin || !has_external_pci()) 3729 return 0; 3730 3731 if (no_iommu || dmar_disabled) 3732 pr_info("Intel-IOMMU force enabled due to platform opt in\n"); 3733 3734 /* 3735 * If Intel-IOMMU is disabled by default, we will apply identity 3736 * map for all devices except those marked as being untrusted. 3737 */ 3738 if (dmar_disabled) 3739 iommu_set_default_passthrough(false); 3740 3741 dmar_disabled = 0; 3742 no_iommu = 0; 3743 3744 return 1; 3745 } 3746 3747 static int __init probe_acpi_namespace_devices(void) 3748 { 3749 struct dmar_drhd_unit *drhd; 3750 /* To avoid a -Wunused-but-set-variable warning. */ 3751 struct intel_iommu *iommu __maybe_unused; 3752 struct device *dev; 3753 int i, ret = 0; 3754 3755 for_each_active_iommu(iommu, drhd) { 3756 for_each_active_dev_scope(drhd->devices, 3757 drhd->devices_cnt, i, dev) { 3758 struct acpi_device_physical_node *pn; 3759 struct iommu_group *group; 3760 struct acpi_device *adev; 3761 3762 if (dev->bus != &acpi_bus_type) 3763 continue; 3764 3765 adev = to_acpi_device(dev); 3766 mutex_lock(&adev->physical_node_lock); 3767 list_for_each_entry(pn, 3768 &adev->physical_node_list, node) { 3769 group = iommu_group_get(pn->dev); 3770 if (group) { 3771 iommu_group_put(group); 3772 continue; 3773 } 3774 3775 ret = iommu_probe_device(pn->dev); 3776 if (ret) 3777 break; 3778 } 3779 mutex_unlock(&adev->physical_node_lock); 3780 3781 if (ret) 3782 return ret; 3783 } 3784 } 3785 3786 return 0; 3787 } 3788 3789 static __init int tboot_force_iommu(void) 3790 { 3791 if (!tboot_enabled()) 3792 return 0; 3793 3794 if (no_iommu || dmar_disabled) 3795 pr_warn("Forcing Intel-IOMMU to enabled\n"); 3796 3797 dmar_disabled = 0; 3798 no_iommu = 0; 3799 3800 return 1; 3801 } 3802 3803 int __init intel_iommu_init(void) 3804 { 3805 int ret = -ENODEV; 3806 struct dmar_drhd_unit *drhd; 3807 struct intel_iommu *iommu; 3808 3809 /* 3810 * Intel IOMMU is required for a TXT/tboot launch or platform 3811 * opt in, so enforce that. 
3812 */ 3813 force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) || 3814 platform_optin_force_iommu(); 3815 3816 down_write(&dmar_global_lock); 3817 if (dmar_table_init()) { 3818 if (force_on) 3819 panic("tboot: Failed to initialize DMAR table\n"); 3820 goto out_free_dmar; 3821 } 3822 3823 if (dmar_dev_scope_init() < 0) { 3824 if (force_on) 3825 panic("tboot: Failed to initialize DMAR device scope\n"); 3826 goto out_free_dmar; 3827 } 3828 3829 up_write(&dmar_global_lock); 3830 3831 /* 3832 * The bus notifier takes the dmar_global_lock, so lockdep will 3833 * complain later when we register it under the lock. 3834 */ 3835 dmar_register_bus_notifier(); 3836 3837 down_write(&dmar_global_lock); 3838 3839 if (!no_iommu) 3840 intel_iommu_debugfs_init(); 3841 3842 if (no_iommu || dmar_disabled) { 3843 /* 3844 * We exit the function here to ensure IOMMU's remapping and 3845 * mempool aren't setup, which means that the IOMMU's PMRs 3846 * won't be disabled via the call to init_dmars(). So disable 3847 * it explicitly here. The PMRs were setup by tboot prior to 3848 * calling SENTER, but the kernel is expected to reset/tear 3849 * down the PMRs. 3850 */ 3851 if (intel_iommu_tboot_noforce) { 3852 for_each_iommu(iommu, drhd) 3853 iommu_disable_protect_mem_regions(iommu); 3854 } 3855 3856 /* 3857 * Make sure the IOMMUs are switched off, even when we 3858 * boot into a kexec kernel and the previous kernel left 3859 * them enabled 3860 */ 3861 intel_disable_iommus(); 3862 goto out_free_dmar; 3863 } 3864 3865 if (list_empty(&dmar_rmrr_units)) 3866 pr_info("No RMRR found\n"); 3867 3868 if (list_empty(&dmar_atsr_units)) 3869 pr_info("No ATSR found\n"); 3870 3871 if (list_empty(&dmar_satc_units)) 3872 pr_info("No SATC found\n"); 3873 3874 init_no_remapping_devices(); 3875 3876 ret = init_dmars(); 3877 if (ret) { 3878 if (force_on) 3879 panic("tboot: Failed to initialize DMARs\n"); 3880 pr_err("Initialization failed\n"); 3881 goto out_free_dmar; 3882 } 3883 up_write(&dmar_global_lock); 3884 3885 init_iommu_pm_ops(); 3886 3887 down_read(&dmar_global_lock); 3888 for_each_active_iommu(iommu, drhd) { 3889 /* 3890 * The flush queue implementation does not perform 3891 * page-selective invalidations that are required for efficient 3892 * TLB flushes in virtual environments. The benefit of batching 3893 * is likely to be much lower than the overhead of synchronizing 3894 * the virtual and physical IOMMU page-tables. 3895 */ 3896 if (cap_caching_mode(iommu->cap) && 3897 !first_level_by_default(IOMMU_DOMAIN_DMA)) { 3898 pr_info_once("IOMMU batching disallowed due to virtualization\n"); 3899 iommu_set_dma_strict(); 3900 } 3901 iommu_device_sysfs_add(&iommu->iommu, NULL, 3902 intel_iommu_groups, 3903 "%s", iommu->name); 3904 iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL); 3905 3906 iommu_pmu_register(iommu); 3907 } 3908 up_read(&dmar_global_lock); 3909 3910 if (si_domain && !hw_pass_through) 3911 register_memory_notifier(&intel_iommu_memory_nb); 3912 3913 down_read(&dmar_global_lock); 3914 if (probe_acpi_namespace_devices()) 3915 pr_warn("ACPI name space devices didn't probe correctly\n"); 3916 3917 /* Finally, we enable the DMA remapping hardware. 
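 * IOMMUs whose translation was already pre-enabled (e.g. inherited from
 * the previous kernel) are left as they are.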
*/ 3918 for_each_iommu(iommu, drhd) { 3919 if (!drhd->ignored && !translation_pre_enabled(iommu)) 3920 iommu_enable_translation(iommu); 3921 3922 iommu_disable_protect_mem_regions(iommu); 3923 } 3924 up_read(&dmar_global_lock); 3925 3926 pr_info("Intel(R) Virtualization Technology for Directed I/O\n"); 3927 3928 intel_iommu_enabled = 1; 3929 3930 return 0; 3931 3932 out_free_dmar: 3933 intel_iommu_free_dmars(); 3934 up_write(&dmar_global_lock); 3935 return ret; 3936 } 3937 3938 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque) 3939 { 3940 struct device_domain_info *info = opaque; 3941 3942 domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff); 3943 return 0; 3944 } 3945 3946 /* 3947 * NB - intel-iommu lacks any sort of reference counting for the users of 3948 * dependent devices. If multiple endpoints have intersecting dependent 3949 * devices, unbinding the driver from any one of them will possibly leave 3950 * the others unable to operate. 3951 */ 3952 static void domain_context_clear(struct device_domain_info *info) 3953 { 3954 if (!info->iommu || !info->dev || !dev_is_pci(info->dev)) 3955 return; 3956 3957 pci_for_each_dma_alias(to_pci_dev(info->dev), 3958 &domain_context_clear_one_cb, info); 3959 } 3960 3961 static void dmar_remove_one_dev_info(struct device *dev) 3962 { 3963 struct device_domain_info *info = dev_iommu_priv_get(dev); 3964 struct dmar_domain *domain = info->domain; 3965 struct intel_iommu *iommu = info->iommu; 3966 unsigned long flags; 3967 3968 if (!dev_is_real_dma_subdevice(info->dev)) { 3969 if (dev_is_pci(info->dev) && sm_supported(iommu)) 3970 intel_pasid_tear_down_entry(iommu, info->dev, 3971 PASID_RID2PASID, false); 3972 3973 iommu_disable_pci_caps(info); 3974 domain_context_clear(info); 3975 } 3976 3977 spin_lock_irqsave(&domain->lock, flags); 3978 list_del(&info->link); 3979 spin_unlock_irqrestore(&domain->lock, flags); 3980 3981 domain_detach_iommu(domain, iommu); 3982 info->domain = NULL; 3983 } 3984 3985 /* 3986 * Clear the page table pointer in context or pasid table entries so that 3987 * all DMA requests without PASID from the device are blocked. If the page 3988 * table has been set, clean up the data structures. 
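 * This is the path taken, for example, when a device is attached to the
 * blocking domain or when an attach operation fails part way through.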
3989 */ 3990 static void device_block_translation(struct device *dev) 3991 { 3992 struct device_domain_info *info = dev_iommu_priv_get(dev); 3993 struct intel_iommu *iommu = info->iommu; 3994 unsigned long flags; 3995 3996 iommu_disable_pci_caps(info); 3997 if (!dev_is_real_dma_subdevice(dev)) { 3998 if (sm_supported(iommu)) 3999 intel_pasid_tear_down_entry(iommu, dev, 4000 PASID_RID2PASID, false); 4001 else 4002 domain_context_clear(info); 4003 } 4004 4005 if (!info->domain) 4006 return; 4007 4008 spin_lock_irqsave(&info->domain->lock, flags); 4009 list_del(&info->link); 4010 spin_unlock_irqrestore(&info->domain->lock, flags); 4011 4012 domain_detach_iommu(info->domain, iommu); 4013 info->domain = NULL; 4014 } 4015 4016 static int md_domain_init(struct dmar_domain *domain, int guest_width) 4017 { 4018 int adjust_width; 4019 4020 /* calculate AGAW */ 4021 domain->gaw = guest_width; 4022 adjust_width = guestwidth_to_adjustwidth(guest_width); 4023 domain->agaw = width_to_agaw(adjust_width); 4024 4025 domain->iommu_coherency = false; 4026 domain->iommu_superpage = 0; 4027 domain->max_addr = 0; 4028 4029 /* always allocate the top pgd */ 4030 domain->pgd = alloc_pgtable_page(domain->nid, GFP_ATOMIC); 4031 if (!domain->pgd) 4032 return -ENOMEM; 4033 domain_flush_cache(domain, domain->pgd, PAGE_SIZE); 4034 return 0; 4035 } 4036 4037 static int blocking_domain_attach_dev(struct iommu_domain *domain, 4038 struct device *dev) 4039 { 4040 device_block_translation(dev); 4041 return 0; 4042 } 4043 4044 static struct iommu_domain blocking_domain = { 4045 .ops = &(const struct iommu_domain_ops) { 4046 .attach_dev = blocking_domain_attach_dev, 4047 .free = intel_iommu_domain_free 4048 } 4049 }; 4050 4051 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type) 4052 { 4053 struct dmar_domain *dmar_domain; 4054 struct iommu_domain *domain; 4055 4056 switch (type) { 4057 case IOMMU_DOMAIN_BLOCKED: 4058 return &blocking_domain; 4059 case IOMMU_DOMAIN_DMA: 4060 case IOMMU_DOMAIN_UNMANAGED: 4061 dmar_domain = alloc_domain(type); 4062 if (!dmar_domain) { 4063 pr_err("Can't allocate dmar_domain\n"); 4064 return NULL; 4065 } 4066 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { 4067 pr_err("Domain initialization failed\n"); 4068 domain_exit(dmar_domain); 4069 return NULL; 4070 } 4071 4072 domain = &dmar_domain->domain; 4073 domain->geometry.aperture_start = 0; 4074 domain->geometry.aperture_end = 4075 __DOMAIN_MAX_ADDR(dmar_domain->gaw); 4076 domain->geometry.force_aperture = true; 4077 4078 return domain; 4079 case IOMMU_DOMAIN_IDENTITY: 4080 return &si_domain->domain; 4081 case IOMMU_DOMAIN_SVA: 4082 return intel_svm_domain_alloc(); 4083 default: 4084 return NULL; 4085 } 4086 4087 return NULL; 4088 } 4089 4090 static void intel_iommu_domain_free(struct iommu_domain *domain) 4091 { 4092 if (domain != &si_domain->domain && domain != &blocking_domain) 4093 domain_exit(to_dmar_domain(domain)); 4094 } 4095 4096 static int prepare_domain_attach_device(struct iommu_domain *domain, 4097 struct device *dev) 4098 { 4099 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4100 struct intel_iommu *iommu; 4101 int addr_width; 4102 4103 iommu = device_to_iommu(dev, NULL, NULL); 4104 if (!iommu) 4105 return -ENODEV; 4106 4107 if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap)) 4108 return -EINVAL; 4109 4110 /* check if this iommu agaw is sufficient for max mapped address */ 4111 addr_width = agaw_to_width(iommu->agaw); 4112 if (addr_width > cap_mgaw(iommu->cap)) 4113 addr_width = 
cap_mgaw(iommu->cap); 4114 4115 if (dmar_domain->max_addr > (1LL << addr_width)) 4116 return -EINVAL; 4117 dmar_domain->gaw = addr_width; 4118 4119 /* 4120 * Knock out extra levels of page tables if necessary 4121 */ 4122 while (iommu->agaw < dmar_domain->agaw) { 4123 struct dma_pte *pte; 4124 4125 pte = dmar_domain->pgd; 4126 if (dma_pte_present(pte)) { 4127 dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte)); 4128 free_pgtable_page(pte); 4129 } 4130 dmar_domain->agaw--; 4131 } 4132 4133 return 0; 4134 } 4135 4136 static int intel_iommu_attach_device(struct iommu_domain *domain, 4137 struct device *dev) 4138 { 4139 struct device_domain_info *info = dev_iommu_priv_get(dev); 4140 int ret; 4141 4142 if (domain->type == IOMMU_DOMAIN_UNMANAGED && 4143 device_is_rmrr_locked(dev)) { 4144 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n"); 4145 return -EPERM; 4146 } 4147 4148 if (info->domain) 4149 device_block_translation(dev); 4150 4151 ret = prepare_domain_attach_device(domain, dev); 4152 if (ret) 4153 return ret; 4154 4155 return dmar_domain_attach_device(to_dmar_domain(domain), dev); 4156 } 4157 4158 static int intel_iommu_map(struct iommu_domain *domain, 4159 unsigned long iova, phys_addr_t hpa, 4160 size_t size, int iommu_prot, gfp_t gfp) 4161 { 4162 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4163 u64 max_addr; 4164 int prot = 0; 4165 4166 if (iommu_prot & IOMMU_READ) 4167 prot |= DMA_PTE_READ; 4168 if (iommu_prot & IOMMU_WRITE) 4169 prot |= DMA_PTE_WRITE; 4170 if (dmar_domain->set_pte_snp) 4171 prot |= DMA_PTE_SNP; 4172 4173 max_addr = iova + size; 4174 if (dmar_domain->max_addr < max_addr) { 4175 u64 end; 4176 4177 /* check if minimum agaw is sufficient for mapped address */ 4178 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1; 4179 if (end < max_addr) { 4180 pr_err("%s: iommu width (%d) is not " 4181 "sufficient for the mapped address (%llx)\n", 4182 __func__, dmar_domain->gaw, max_addr); 4183 return -EFAULT; 4184 } 4185 dmar_domain->max_addr = max_addr; 4186 } 4187 /* Round up size to next multiple of PAGE_SIZE, if it and 4188 the low bits of hpa would take us onto the next page */ 4189 size = aligned_nrpages(hpa, size); 4190 return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT, 4191 hpa >> VTD_PAGE_SHIFT, size, prot, gfp); 4192 } 4193 4194 static int intel_iommu_map_pages(struct iommu_domain *domain, 4195 unsigned long iova, phys_addr_t paddr, 4196 size_t pgsize, size_t pgcount, 4197 int prot, gfp_t gfp, size_t *mapped) 4198 { 4199 unsigned long pgshift = __ffs(pgsize); 4200 size_t size = pgcount << pgshift; 4201 int ret; 4202 4203 if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G) 4204 return -EINVAL; 4205 4206 if (!IS_ALIGNED(iova | paddr, pgsize)) 4207 return -EINVAL; 4208 4209 ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp); 4210 if (!ret && mapped) 4211 *mapped = size; 4212 4213 return ret; 4214 } 4215 4216 static size_t intel_iommu_unmap(struct iommu_domain *domain, 4217 unsigned long iova, size_t size, 4218 struct iommu_iotlb_gather *gather) 4219 { 4220 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4221 unsigned long start_pfn, last_pfn; 4222 int level = 0; 4223 4224 /* Cope with horrid API which requires us to unmap more than the 4225 size argument if it happens to be a large-page mapping. 
*/ 4226 if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, 4227 &level, GFP_ATOMIC))) 4228 return 0; 4229 4230 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level)) 4231 size = VTD_PAGE_SIZE << level_to_offset_bits(level); 4232 4233 start_pfn = iova >> VTD_PAGE_SHIFT; 4234 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT; 4235 4236 domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist); 4237 4238 if (dmar_domain->max_addr == iova + size) 4239 dmar_domain->max_addr = iova; 4240 4241 /* 4242 * We do not use page-selective IOTLB invalidation in flush queue, 4243 * so there is no need to track page and sync iotlb. 4244 */ 4245 if (!iommu_iotlb_gather_queued(gather)) 4246 iommu_iotlb_gather_add_page(domain, gather, iova, size); 4247 4248 return size; 4249 } 4250 4251 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain, 4252 unsigned long iova, 4253 size_t pgsize, size_t pgcount, 4254 struct iommu_iotlb_gather *gather) 4255 { 4256 unsigned long pgshift = __ffs(pgsize); 4257 size_t size = pgcount << pgshift; 4258 4259 return intel_iommu_unmap(domain, iova, size, gather); 4260 } 4261 4262 static void intel_iommu_tlb_sync(struct iommu_domain *domain, 4263 struct iommu_iotlb_gather *gather) 4264 { 4265 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4266 unsigned long iova_pfn = IOVA_PFN(gather->start); 4267 size_t size = gather->end - gather->start; 4268 struct iommu_domain_info *info; 4269 unsigned long start_pfn; 4270 unsigned long nrpages; 4271 unsigned long i; 4272 4273 nrpages = aligned_nrpages(gather->start, size); 4274 start_pfn = mm_to_dma_pfn(iova_pfn); 4275 4276 xa_for_each(&dmar_domain->iommu_array, i, info) 4277 iommu_flush_iotlb_psi(info->iommu, dmar_domain, 4278 start_pfn, nrpages, 4279 list_empty(&gather->freelist), 0); 4280 4281 put_pages_list(&gather->freelist); 4282 } 4283 4284 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain, 4285 dma_addr_t iova) 4286 { 4287 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4288 struct dma_pte *pte; 4289 int level = 0; 4290 u64 phys = 0; 4291 4292 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level, 4293 GFP_ATOMIC); 4294 if (pte && dma_pte_present(pte)) 4295 phys = dma_pte_addr(pte) + 4296 (iova & (BIT_MASK(level_to_offset_bits(level) + 4297 VTD_PAGE_SHIFT) - 1)); 4298 4299 return phys; 4300 } 4301 4302 static bool domain_support_force_snooping(struct dmar_domain *domain) 4303 { 4304 struct device_domain_info *info; 4305 bool support = true; 4306 4307 assert_spin_locked(&domain->lock); 4308 list_for_each_entry(info, &domain->devices, link) { 4309 if (!ecap_sc_support(info->iommu->ecap)) { 4310 support = false; 4311 break; 4312 } 4313 } 4314 4315 return support; 4316 } 4317 4318 static void domain_set_force_snooping(struct dmar_domain *domain) 4319 { 4320 struct device_domain_info *info; 4321 4322 assert_spin_locked(&domain->lock); 4323 /* 4324 * Second level page table supports per-PTE snoop control. The 4325 * iommu_map() interface will handle this by setting SNP bit. 
4326 */ 4327 if (!domain->use_first_level) { 4328 domain->set_pte_snp = true; 4329 return; 4330 } 4331 4332 list_for_each_entry(info, &domain->devices, link) 4333 intel_pasid_setup_page_snoop_control(info->iommu, info->dev, 4334 PASID_RID2PASID); 4335 } 4336 4337 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain) 4338 { 4339 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4340 unsigned long flags; 4341 4342 if (dmar_domain->force_snooping) 4343 return true; 4344 4345 spin_lock_irqsave(&dmar_domain->lock, flags); 4346 if (!domain_support_force_snooping(dmar_domain)) { 4347 spin_unlock_irqrestore(&dmar_domain->lock, flags); 4348 return false; 4349 } 4350 4351 domain_set_force_snooping(dmar_domain); 4352 dmar_domain->force_snooping = true; 4353 spin_unlock_irqrestore(&dmar_domain->lock, flags); 4354 4355 return true; 4356 } 4357 4358 static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap) 4359 { 4360 struct device_domain_info *info = dev_iommu_priv_get(dev); 4361 4362 switch (cap) { 4363 case IOMMU_CAP_CACHE_COHERENCY: 4364 case IOMMU_CAP_DEFERRED_FLUSH: 4365 return true; 4366 case IOMMU_CAP_PRE_BOOT_PROTECTION: 4367 return dmar_platform_optin(); 4368 case IOMMU_CAP_ENFORCE_CACHE_COHERENCY: 4369 return ecap_sc_support(info->iommu->ecap); 4370 default: 4371 return false; 4372 } 4373 } 4374 4375 static struct iommu_device *intel_iommu_probe_device(struct device *dev) 4376 { 4377 struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL; 4378 struct device_domain_info *info; 4379 struct intel_iommu *iommu; 4380 u8 bus, devfn; 4381 int ret; 4382 4383 iommu = device_to_iommu(dev, &bus, &devfn); 4384 if (!iommu || !iommu->iommu.ops) 4385 return ERR_PTR(-ENODEV); 4386 4387 info = kzalloc(sizeof(*info), GFP_KERNEL); 4388 if (!info) 4389 return ERR_PTR(-ENOMEM); 4390 4391 if (dev_is_real_dma_subdevice(dev)) { 4392 info->bus = pdev->bus->number; 4393 info->devfn = pdev->devfn; 4394 info->segment = pci_domain_nr(pdev->bus); 4395 } else { 4396 info->bus = bus; 4397 info->devfn = devfn; 4398 info->segment = iommu->segment; 4399 } 4400 4401 info->dev = dev; 4402 info->iommu = iommu; 4403 if (dev_is_pci(dev)) { 4404 if (ecap_dev_iotlb_support(iommu->ecap) && 4405 pci_ats_supported(pdev) && 4406 dmar_ats_supported(pdev, iommu)) { 4407 info->ats_supported = 1; 4408 info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev); 4409 4410 /* 4411 * For IOMMU that supports device IOTLB throttling 4412 * (DIT), we assign PFSID to the invalidation desc 4413 * of a VF such that IOMMU HW can gauge queue depth 4414 * at PF level. If DIT is not set, PFSID will be 4415 * treated as reserved, which should be set to 0. 
4416 */ 4417 if (ecap_dit(iommu->ecap)) 4418 info->pfsid = pci_dev_id(pci_physfn(pdev)); 4419 info->ats_qdep = pci_ats_queue_depth(pdev); 4420 } 4421 if (sm_supported(iommu)) { 4422 if (pasid_supported(iommu)) { 4423 int features = pci_pasid_features(pdev); 4424 4425 if (features >= 0) 4426 info->pasid_supported = features | 1; 4427 } 4428 4429 if (info->ats_supported && ecap_prs(iommu->ecap) && 4430 pci_pri_supported(pdev)) 4431 info->pri_supported = 1; 4432 } 4433 } 4434 4435 dev_iommu_priv_set(dev, info); 4436 4437 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) { 4438 ret = intel_pasid_alloc_table(dev); 4439 if (ret) { 4440 dev_err(dev, "PASID table allocation failed\n"); 4441 dev_iommu_priv_set(dev, NULL); 4442 kfree(info); 4443 return ERR_PTR(ret); 4444 } 4445 } 4446 4447 return &iommu->iommu; 4448 } 4449 4450 static void intel_iommu_release_device(struct device *dev) 4451 { 4452 struct device_domain_info *info = dev_iommu_priv_get(dev); 4453 4454 dmar_remove_one_dev_info(dev); 4455 intel_pasid_free_table(dev); 4456 dev_iommu_priv_set(dev, NULL); 4457 kfree(info); 4458 set_dma_ops(dev, NULL); 4459 } 4460 4461 static void intel_iommu_probe_finalize(struct device *dev) 4462 { 4463 set_dma_ops(dev, NULL); 4464 iommu_setup_dma_ops(dev, 0, U64_MAX); 4465 } 4466 4467 static void intel_iommu_get_resv_regions(struct device *device, 4468 struct list_head *head) 4469 { 4470 int prot = DMA_PTE_READ | DMA_PTE_WRITE; 4471 struct iommu_resv_region *reg; 4472 struct dmar_rmrr_unit *rmrr; 4473 struct device *i_dev; 4474 int i; 4475 4476 rcu_read_lock(); 4477 for_each_rmrr_units(rmrr) { 4478 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt, 4479 i, i_dev) { 4480 struct iommu_resv_region *resv; 4481 enum iommu_resv_type type; 4482 size_t length; 4483 4484 if (i_dev != device && 4485 !is_downstream_to_pci_bridge(device, i_dev)) 4486 continue; 4487 4488 length = rmrr->end_address - rmrr->base_address + 1; 4489 4490 type = device_rmrr_is_relaxable(device) ? 
4491 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT; 4492 4493 resv = iommu_alloc_resv_region(rmrr->base_address, 4494 length, prot, type, 4495 GFP_ATOMIC); 4496 if (!resv) 4497 break; 4498 4499 list_add_tail(&resv->list, head); 4500 } 4501 } 4502 rcu_read_unlock(); 4503 4504 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA 4505 if (dev_is_pci(device)) { 4506 struct pci_dev *pdev = to_pci_dev(device); 4507 4508 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) { 4509 reg = iommu_alloc_resv_region(0, 1UL << 24, prot, 4510 IOMMU_RESV_DIRECT_RELAXABLE, 4511 GFP_KERNEL); 4512 if (reg) 4513 list_add_tail(&reg->list, head); 4514 } 4515 } 4516 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */ 4517 4518 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START, 4519 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1, 4520 0, IOMMU_RESV_MSI, GFP_KERNEL); 4521 if (!reg) 4522 return; 4523 list_add_tail(&reg->list, head); 4524 } 4525 4526 static struct iommu_group *intel_iommu_device_group(struct device *dev) 4527 { 4528 if (dev_is_pci(dev)) 4529 return pci_device_group(dev); 4530 return generic_device_group(dev); 4531 } 4532 4533 static int intel_iommu_enable_sva(struct device *dev) 4534 { 4535 struct device_domain_info *info = dev_iommu_priv_get(dev); 4536 struct intel_iommu *iommu; 4537 4538 if (!info || dmar_disabled) 4539 return -EINVAL; 4540 4541 iommu = info->iommu; 4542 if (!iommu) 4543 return -EINVAL; 4544 4545 if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE)) 4546 return -ENODEV; 4547 4548 if (!info->pasid_enabled || !info->ats_enabled) 4549 return -EINVAL; 4550 4551 /* 4552 * Devices having device-specific I/O fault handling should not 4553 * support PCI/PRI. The IOMMU side has no means to check the 4554 * capability of device-specific IOPF. Therefore, the IOMMU driver can 4555 * only assume that if the device driver enables SVA on a non-PRI 4556 * device, it will handle IOPF in its own way. 4557 */ 4558 if (!info->pri_supported) 4559 return 0; 4560 4561 /* Devices supporting PRI should have it enabled. */ 4562 if (!info->pri_enabled) 4563 return -EINVAL; 4564 4565 return 0; 4566 } 4567 4568 static int intel_iommu_enable_iopf(struct device *dev) 4569 { 4570 struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL; 4571 struct device_domain_info *info = dev_iommu_priv_get(dev); 4572 struct intel_iommu *iommu; 4573 int ret; 4574 4575 if (!pdev || !info || !info->ats_enabled || !info->pri_supported) 4576 return -ENODEV; 4577 4578 if (info->pri_enabled) 4579 return -EBUSY; 4580 4581 iommu = info->iommu; 4582 if (!iommu) 4583 return -EINVAL; 4584 4585 /* PASID is required in PRG Response Message.
*/ 4586 if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev)) 4587 return -EINVAL; 4588 4589 ret = pci_reset_pri(pdev); 4590 if (ret) 4591 return ret; 4592 4593 ret = iopf_queue_add_device(iommu->iopf_queue, dev); 4594 if (ret) 4595 return ret; 4596 4597 ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev); 4598 if (ret) 4599 goto iopf_remove_device; 4600 4601 ret = pci_enable_pri(pdev, PRQ_DEPTH); 4602 if (ret) 4603 goto iopf_unregister_handler; 4604 info->pri_enabled = 1; 4605 4606 return 0; 4607 4608 iopf_unregister_handler: 4609 iommu_unregister_device_fault_handler(dev); 4610 iopf_remove_device: 4611 iopf_queue_remove_device(iommu->iopf_queue, dev); 4612 4613 return ret; 4614 } 4615 4616 static int intel_iommu_disable_iopf(struct device *dev) 4617 { 4618 struct device_domain_info *info = dev_iommu_priv_get(dev); 4619 struct intel_iommu *iommu = info->iommu; 4620 4621 if (!info->pri_enabled) 4622 return -EINVAL; 4623 4624 /* 4625 * The PCIe spec states that by clearing the PRI enable bit, the Page 4626 * Request Interface will not issue new page requests, but may still 4627 * have outstanding page requests that have been transmitted or are 4628 * queued for transmission. This is supposed to be called after 4629 * the device driver has stopped DMA, all PASIDs have been 4630 * unbound and the outstanding PRQs have been drained. 4631 */ 4632 pci_disable_pri(to_pci_dev(dev)); 4633 info->pri_enabled = 0; 4634 4635 /* 4636 * With PRI disabled and outstanding PRQs drained, unregistering the 4637 * fault handler and removing the device from the iopf queue should 4638 * never fail. 4639 */ 4640 WARN_ON(iommu_unregister_device_fault_handler(dev)); 4641 WARN_ON(iopf_queue_remove_device(iommu->iopf_queue, dev)); 4642 4643 return 0; 4644 } 4645 4646 static int 4647 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat) 4648 { 4649 switch (feat) { 4650 case IOMMU_DEV_FEAT_IOPF: 4651 return intel_iommu_enable_iopf(dev); 4652 4653 case IOMMU_DEV_FEAT_SVA: 4654 return intel_iommu_enable_sva(dev); 4655 4656 default: 4657 return -ENODEV; 4658 } 4659 } 4660 4661 static int 4662 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat) 4663 { 4664 switch (feat) { 4665 case IOMMU_DEV_FEAT_IOPF: 4666 return intel_iommu_disable_iopf(dev); 4667 4668 case IOMMU_DEV_FEAT_SVA: 4669 return 0; 4670 4671 default: 4672 return -ENODEV; 4673 } 4674 } 4675 4676 static bool intel_iommu_is_attach_deferred(struct device *dev) 4677 { 4678 struct device_domain_info *info = dev_iommu_priv_get(dev); 4679 4680 return translation_pre_enabled(info->iommu) && !info->domain; 4681 } 4682 4683 /* 4684 * Check that the device does not live on an external-facing PCI port that is 4685 * marked as untrusted. Such devices should not be able to apply quirks and 4686 * thus not be able to bypass the IOMMU restrictions.
4687 */ 4688 static bool risky_device(struct pci_dev *pdev) 4689 { 4690 if (pdev->untrusted) { 4691 pci_info(pdev, 4692 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n", 4693 pdev->vendor, pdev->device); 4694 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n"); 4695 return true; 4696 } 4697 return false; 4698 } 4699 4700 static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain, 4701 unsigned long iova, size_t size) 4702 { 4703 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4704 unsigned long pages = aligned_nrpages(iova, size); 4705 unsigned long pfn = iova >> VTD_PAGE_SHIFT; 4706 struct iommu_domain_info *info; 4707 unsigned long i; 4708 4709 xa_for_each(&dmar_domain->iommu_array, i, info) 4710 __mapping_notify_one(info->iommu, dmar_domain, pfn, pages); 4711 } 4712 4713 static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid) 4714 { 4715 struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL); 4716 struct iommu_domain *domain; 4717 4718 /* Domain type specific cleanup: */ 4719 domain = iommu_get_domain_for_dev_pasid(dev, pasid, 0); 4720 if (domain) { 4721 switch (domain->type) { 4722 case IOMMU_DOMAIN_SVA: 4723 intel_svm_remove_dev_pasid(dev, pasid); 4724 break; 4725 default: 4726 /* should never reach here */ 4727 WARN_ON(1); 4728 break; 4729 } 4730 } 4731 4732 intel_pasid_tear_down_entry(iommu, dev, pasid, false); 4733 } 4734 4735 const struct iommu_ops intel_iommu_ops = { 4736 .capable = intel_iommu_capable, 4737 .domain_alloc = intel_iommu_domain_alloc, 4738 .probe_device = intel_iommu_probe_device, 4739 .probe_finalize = intel_iommu_probe_finalize, 4740 .release_device = intel_iommu_release_device, 4741 .get_resv_regions = intel_iommu_get_resv_regions, 4742 .device_group = intel_iommu_device_group, 4743 .dev_enable_feat = intel_iommu_dev_enable_feat, 4744 .dev_disable_feat = intel_iommu_dev_disable_feat, 4745 .is_attach_deferred = intel_iommu_is_attach_deferred, 4746 .def_domain_type = device_def_domain_type, 4747 .remove_dev_pasid = intel_iommu_remove_dev_pasid, 4748 .pgsize_bitmap = SZ_4K, 4749 #ifdef CONFIG_INTEL_IOMMU_SVM 4750 .page_response = intel_svm_page_response, 4751 #endif 4752 .default_domain_ops = &(const struct iommu_domain_ops) { 4753 .attach_dev = intel_iommu_attach_device, 4754 .map_pages = intel_iommu_map_pages, 4755 .unmap_pages = intel_iommu_unmap_pages, 4756 .iotlb_sync_map = intel_iommu_iotlb_sync_map, 4757 .flush_iotlb_all = intel_flush_iotlb_all, 4758 .iotlb_sync = intel_iommu_tlb_sync, 4759 .iova_to_phys = intel_iommu_iova_to_phys, 4760 .free = intel_iommu_domain_free, 4761 .enforce_cache_coherency = intel_iommu_enforce_cache_coherency, 4762 } 4763 }; 4764 4765 static void quirk_iommu_igfx(struct pci_dev *dev) 4766 { 4767 if (risky_device(dev)) 4768 return; 4769 4770 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n"); 4771 dmar_map_gfx = 0; 4772 } 4773 4774 /* G4x/GM45 integrated gfx dmar support is totally busted. 
*/ 4775 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx); 4776 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx); 4777 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx); 4778 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx); 4779 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx); 4780 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx); 4781 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx); 4782 4783 /* Broadwell igfx malfunctions with dmar */ 4784 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx); 4785 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx); 4786 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx); 4787 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx); 4788 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx); 4789 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx); 4790 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx); 4791 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx); 4792 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx); 4793 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx); 4794 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx); 4795 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx); 4796 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx); 4797 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx); 4798 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx); 4799 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx); 4800 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx); 4801 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx); 4802 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx); 4803 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx); 4804 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx); 4805 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx); 4806 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx); 4807 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx); 4808 4809 static void quirk_iommu_rwbf(struct pci_dev *dev) 4810 { 4811 if (risky_device(dev)) 4812 return; 4813 4814 /* 4815 * Mobile 4 Series Chipset neglects to set RWBF capability, 4816 * but needs it. Same seems to hold for the desktop versions. 
4817 */ 4818 pci_info(dev, "Forcing write-buffer flush capability\n"); 4819 rwbf_quirk = 1; 4820 } 4821 4822 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf); 4823 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf); 4824 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf); 4825 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf); 4826 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf); 4827 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf); 4828 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf); 4829 4830 #define GGC 0x52 4831 #define GGC_MEMORY_SIZE_MASK (0xf << 8) 4832 #define GGC_MEMORY_SIZE_NONE (0x0 << 8) 4833 #define GGC_MEMORY_SIZE_1M (0x1 << 8) 4834 #define GGC_MEMORY_SIZE_2M (0x3 << 8) 4835 #define GGC_MEMORY_VT_ENABLED (0x8 << 8) 4836 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8) 4837 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8) 4838 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8) 4839 4840 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev) 4841 { 4842 unsigned short ggc; 4843 4844 if (risky_device(dev)) 4845 return; 4846 4847 if (pci_read_config_word(dev, GGC, &ggc)) 4848 return; 4849 4850 if (!(ggc & GGC_MEMORY_VT_ENABLED)) { 4851 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n"); 4852 dmar_map_gfx = 0; 4853 } else if (dmar_map_gfx) { 4854 /* we have to ensure the gfx device is idle before we flush */ 4855 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n"); 4856 iommu_set_dma_strict(); 4857 } 4858 } 4859 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt); 4860 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt); 4861 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt); 4862 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt); 4863 4864 static void quirk_igfx_skip_te_disable(struct pci_dev *dev) 4865 { 4866 unsigned short ver; 4867 4868 if (!IS_GFX_DEVICE(dev)) 4869 return; 4870 4871 ver = (dev->device >> 8) & 0xff; 4872 if (ver != 0x45 && ver != 0x46 && ver != 0x4c && 4873 ver != 0x4e && ver != 0x8a && ver != 0x98 && 4874 ver != 0x9a && ver != 0xa7) 4875 return; 4876 4877 if (risky_device(dev)) 4878 return; 4879 4880 pci_info(dev, "Skip IOMMU disabling for graphics\n"); 4881 iommu_skip_te_disable = 1; 4882 } 4883 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable); 4884 4885 /* On Tylersburg chipsets, some BIOSes have been known to enable the 4886 ISOCH DMAR unit for the Azalia sound device, but not give it any 4887 TLB entries, which causes it to deadlock. Check for that. We do 4888 this in a function called from init_dmars(), instead of in a PCI 4889 quirk, because we don't want to print the obnoxious "BIOS broken" 4890 message if VT-d is actually disabled. 4891 */ 4892 static void __init check_tylersburg_isoch(void) 4893 { 4894 struct pci_dev *pdev; 4895 uint32_t vtisochctrl; 4896 4897 /* If there's no Azalia in the system anyway, forget it. */ 4898 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL); 4899 if (!pdev) 4900 return; 4901 4902 if (risky_device(pdev)) { 4903 pci_dev_put(pdev); 4904 return; 4905 } 4906 4907 pci_dev_put(pdev); 4908 4909 /* System Management Registers. Might be hidden, in which case 4910 we can't do the sanity check. But that's OK, because the 4911 known-broken BIOSes _don't_ actually hide it, so far. 
*/ 4912 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL); 4913 if (!pdev) 4914 return; 4915 4916 if (risky_device(pdev)) { 4917 pci_dev_put(pdev); 4918 return; 4919 } 4920 4921 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) { 4922 pci_dev_put(pdev); 4923 return; 4924 } 4925 4926 pci_dev_put(pdev); 4927 4928 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */ 4929 if (vtisochctrl & 1) 4930 return; 4931 4932 /* Drop all bits other than the number of TLB entries */ 4933 vtisochctrl &= 0x1c; 4934 4935 /* If we have the recommended number of TLB entries (16), fine. */ 4936 if (vtisochctrl == 0x10) 4937 return; 4938 4939 /* Zero TLB entries? You get to ride the short bus to school. */ 4940 if (!vtisochctrl) { 4941 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n" 4942 "BIOS vendor: %s; Ver: %s; Product Version: %s\n", 4943 dmi_get_system_info(DMI_BIOS_VENDOR), 4944 dmi_get_system_info(DMI_BIOS_VERSION), 4945 dmi_get_system_info(DMI_PRODUCT_VERSION)); 4946 iommu_identity_mapping |= IDENTMAP_AZALIA; 4947 return; 4948 } 4949 4950 pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n", 4951 vtisochctrl); 4952 } 4953 4954 /* 4955 * Here we deal with a device TLB defect where device may inadvertently issue ATS 4956 * invalidation completion before posted writes initiated with translated address 4957 * that utilized translations matching the invalidation address range, violating 4958 * the invalidation completion ordering. 4959 * Therefore, any use cases that cannot guarantee DMA is stopped before unmap is 4960 * vulnerable to this defect. In other words, any dTLB invalidation initiated not 4961 * under the control of the trusted/privileged host device driver must use this 4962 * quirk. 4963 * Device TLBs are invalidated under the following six conditions: 4964 * 1. Device driver does DMA API unmap IOVA 4965 * 2. Device driver unbind a PASID from a process, sva_unbind_device() 4966 * 3. PASID is torn down, after PASID cache is flushed. e.g. process 4967 * exit_mmap() due to crash 4968 * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where 4969 * VM has to free pages that were unmapped 4970 * 5. Userspace driver unmaps a DMA buffer 4971 * 6. Cache invalidation in vSVA usage (upcoming) 4972 * 4973 * For #1 and #2, device drivers are responsible for stopping DMA traffic 4974 * before unmap/unbind. For #3, iommu driver gets mmu_notifier to 4975 * invalidate TLB the same way as normal user unmap which will use this quirk. 4976 * The dTLB invalidation after PASID cache flush does not need this quirk. 4977 * 4978 * As a reminder, #6 will *NEED* this quirk as we enable nested translation. 4979 */ 4980 void quirk_extra_dev_tlb_flush(struct device_domain_info *info, 4981 unsigned long address, unsigned long mask, 4982 u32 pasid, u16 qdep) 4983 { 4984 u16 sid; 4985 4986 if (likely(!info->dtlb_extra_inval)) 4987 return; 4988 4989 sid = PCI_DEVID(info->bus, info->devfn); 4990 if (pasid == PASID_RID2PASID) { 4991 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid, 4992 qdep, address, mask); 4993 } else { 4994 qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid, 4995 pasid, qdep, address, mask); 4996 } 4997 } 4998 4999 #define ecmd_get_status_code(res) (((res) & 0xff) >> 1) 5000 5001 /* 5002 * Function to submit a command to the enhanced command interface. The 5003 * valid enhanced command descriptions are defined in Table 47 of the 5004 * VT-d spec. 
The VT-d hardware implementation may support some but not 5005 * all commands, which can be determined by checking the Enhanced 5006 * Command Capability Register. 5007 * 5008 * Return values: 5009 * - 0: Command successful without any error; 5010 * - Negative: software error value; 5011 * - Nonzero positive: failure status code defined in Table 48. 5012 */ 5013 int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob) 5014 { 5015 unsigned long flags; 5016 u64 res; 5017 int ret; 5018 5019 if (!cap_ecmds(iommu->cap)) 5020 return -ENODEV; 5021 5022 raw_spin_lock_irqsave(&iommu->register_lock, flags); 5023 5024 res = dmar_readq(iommu->reg + DMAR_ECRSP_REG); 5025 if (res & DMA_ECMD_ECRSP_IP) { 5026 ret = -EBUSY; 5027 goto err; 5028 } 5029 5030 /* 5031 * Unconditionally write the operand B, because 5032 * - There is no side effect if an ecmd doesn't require an 5033 * operand B, but we set the register to some value. 5034 * - It's not invoked in any critical path. The extra MMIO 5035 * write doesn't bring any performance concerns. 5036 */ 5037 dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob); 5038 dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT)); 5039 5040 IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq, 5041 !(res & DMA_ECMD_ECRSP_IP), res); 5042 5043 if (res & DMA_ECMD_ECRSP_IP) { 5044 ret = -ETIMEDOUT; 5045 goto err; 5046 } 5047 5048 ret = ecmd_get_status_code(res); 5049 err: 5050 raw_spin_unlock_irqrestore(&iommu->register_lock, flags); 5051 5052 return ret; 5053 } 5054
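
/*
 * Illustrative sketch only, not part of the driver: a minimal example of how
 * a caller might consume the three-way return convention of ecmd_submit_sync()
 * described above (0 on success, a negative errno on a software error, and a
 * positive status code from Table 48 on a hardware-reported failure). The
 * DMA_ECMD_ENABLE opcode, the bare operands and the function name used here
 * are assumptions made purely for the sake of the example.
 */
static int __maybe_unused ecmd_enable_example(struct intel_iommu *iommu, u64 oa)
{
	int ret;

	/* Submit the command and wait for the response register to settle. */
	ret = ecmd_submit_sync(iommu, DMA_ECMD_ENABLE, oa, 0);
	if (ret < 0)		/* software error: -ENODEV, -EBUSY or -ETIMEDOUT */
		return ret;
	if (ret > 0) {		/* failure status code reported by the hardware */
		pr_warn("ecmd failed, status code %d\n", ret);
		return -EIO;
	}

	return 0;		/* command completed without error */
}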