// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright © 2006-2014 Intel Corporation.
 *
 * Authors: David Woodhouse <dwmw2@infradead.org>,
 *          Ashok Raj <ashok.raj@intel.com>,
 *          Shaohua Li <shaohua.li@intel.com>,
 *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
 *          Fenghua Yu <fenghua.yu@intel.com>
 *          Joerg Roedel <jroedel@suse.de>
 */

#define pr_fmt(fmt)     "DMAR: " fmt
#define dev_fmt(fmt)    pr_fmt(fmt)

#include <linux/crash_dump.h>
#include <linux/dma-direct.h>
#include <linux/dmi.h>
#include <linux/memory.h>
#include <linux/pci.h>
#include <linux/pci-ats.h>
#include <linux/spinlock.h>
#include <linux/syscore_ops.h>
#include <linux/tboot.h>
#include <uapi/linux/iommufd.h>

#include "iommu.h"
#include "../dma-iommu.h"
#include "../irq_remapping.h"
#include "../iommu-sva.h"
#include "pasid.h"
#include "cap_audit.h"
#include "perfmon.h"

/* Root and context tables each occupy exactly one VT-d page. */
#define ROOT_SIZE               VTD_PAGE_SIZE
#define CONTEXT_SIZE            VTD_PAGE_SIZE

/* PCI class/ID tests used by quirks below. */
#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
#define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)

/* The x86 IOAPIC MSI window must never be used as IOVA space. */
#define IOAPIC_RANGE_START      (0xfee00000)
#define IOAPIC_RANGE_END        (0xfeefffff)
#define IOVA_START_ADDR         (0x1000)

#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57

#define MAX_AGAW_WIDTH 64
#define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)

#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)

/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
   to match. That way, we can use 'unsigned long' for PFNs with impunity.
 */
#define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
                                __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
#define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)

/* IO virtual address start page frame number */
#define IOVA_START_PFN          (1)

#define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)

/* page table handling */
#define LEVEL_STRIDE            (9)
#define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)

/* An AGAW value of N encodes N + 2 page-table levels. */
static inline int agaw_to_level(int agaw)
{
        return agaw + 2;
}

/*
 * Address width covered by an AGAW: 30 bits for the 2-level base case,
 * plus 9 bits per additional level, capped at MAX_AGAW_WIDTH.
 */
static inline int agaw_to_width(int agaw)
{
        return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
}

/* Inverse of agaw_to_width(): smallest AGAW covering @width bits. */
static inline int width_to_agaw(int width)
{
        return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
}

/* Bit position within a DMA PFN where the index for @level starts. */
static inline unsigned int level_to_offset_bits(int level)
{
        return (level - 1) * LEVEL_STRIDE;
}

/* Index into the @level page table for @pfn. */
static inline int pfn_level_offset(u64 pfn, int level)
{
        return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
}

static inline u64 level_mask(int level)
{
        return -1ULL << level_to_offset_bits(level);
}

/* Number of DMA pages covered by one entry at @level. */
static inline u64 level_size(int level)
{
        return 1ULL << level_to_offset_bits(level);
}

static inline u64 align_to_level(u64 pfn, int level)
{
        return (pfn + level_size(level) - 1) & level_mask(level);
}

static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
{
        return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
}

/* VT-d pages must always be _smaller_ than MM pages. Otherwise things
   are never going to work.
 */
static inline unsigned long mm_to_dma_pfn_start(unsigned long mm_pfn)
{
        return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
}
/* Last DMA PFN inside MM page @mm_pfn (inclusive). */
static inline unsigned long mm_to_dma_pfn_end(unsigned long mm_pfn)
{
        return ((mm_pfn + 1) << (PAGE_SHIFT - VTD_PAGE_SHIFT)) - 1;
}
static inline unsigned long page_to_dma_pfn(struct page *pg)
{
        return mm_to_dma_pfn_start(page_to_pfn(pg));
}
static inline unsigned long virt_to_dma_pfn(void *p)
{
        return page_to_dma_pfn(virt_to_page(p));
}

static void __init check_tylersburg_isoch(void);
static int rwbf_quirk;

/*
 * set to 1 to panic kernel if can't successfully enable VT-d
 * (used when kernel is launched w/ TXT)
 */
static int force_on = 0;
static int intel_iommu_tboot_noforce;
static int no_platform_optin;

#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))

/*
 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
 * if marked present.
 */
static phys_addr_t root_entry_lctp(struct root_entry *re)
{
        /* bit 0 of the low qword is the present bit */
        if (!(re->lo & 1))
                return 0;

        return re->lo & VTD_PAGE_MASK;
}

/*
 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
 * if marked present.
 */
static phys_addr_t root_entry_uctp(struct root_entry *re)
{
        /* bit 0 of the high qword is the present bit */
        if (!(re->hi & 1))
                return 0;

        return re->hi & VTD_PAGE_MASK;
}

/* Set the present bit (bit 0 of the low qword) in a context entry. */
static inline void context_set_present(struct context_entry *context)
{
        context->lo |= 1;
}

/* Clear the fault-processing-disable bit (bit 1), keep the present bit. */
static inline void context_set_fault_enable(struct context_entry *context)
{
        context->lo &= (((u64)-1) << 2) | 1;
}

/* Translation type lives in bits 3:2 of the low qword. */
static inline void context_set_translation_type(struct context_entry *context,
                                                unsigned long value)
{
        context->lo &= (((u64)-1) << 4) | 3;
        context->lo |= (value & 3) << 2;
}

/* Second-level page-table root pointer (page-aligned physical address). */
static inline void context_set_address_root(struct context_entry *context,
                                            unsigned long value)
{
        context->lo &= ~VTD_PAGE_MASK;
        context->lo |= value & VTD_PAGE_MASK;
}

/* AGAW field: bits 2:0 of the high qword. */
static inline void context_set_address_width(struct context_entry *context,
                                             unsigned long value)
{
        context->hi |= value & 7;
}

/* 16-bit domain ID: bits 23:8 of the high qword. */
static inline void context_set_domain_id(struct context_entry *context,
                                         unsigned long value)
{
        context->hi |= (value & ((1 << 16) - 1)) << 8;
}

static inline void context_set_pasid(struct context_entry *context)
{
        context->lo |= CONTEXT_PASIDE;
}

static inline int context_domain_id(struct context_entry *c)
{
        return((c->hi >> 8) & 0xffff);
}

static inline void context_clear_entry(struct context_entry *context)
{
        context->lo = 0;
        context->hi = 0;
}

/*
 * Test whether the context entry for bus/devfn was copied from a previous
 * kernel (kdump). The bitmap only exists when tables were inherited.
 */
static inline bool context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
{
        if (!iommu->copied_tables)
                return false;

        return test_bit(((long)bus << 8) | devfn, iommu->copied_tables);
}

static inline void
set_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
{
        set_bit(((long)bus << 8) | devfn, iommu->copied_tables);
}

static inline void
clear_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
{
        clear_bit(((long)bus << 8) | devfn, iommu->copied_tables);
}

/*
 * This domain is a statically identity mapping domain.
 *      1. This domain creates a static 1:1 mapping to all usable memory.
 *      2. It maps to each iommu if successful.
 *      3. Each iommu maps to this domain if successful.
 */
static struct dmar_domain *si_domain;
static int hw_pass_through = 1;

struct dmar_rmrr_unit {
        struct list_head list;          /* list of rmrr units   */
        struct acpi_dmar_header *hdr;   /* ACPI header          */
        u64     base_address;           /* reserved base address*/
        u64     end_address;            /* reserved end address */
        struct dmar_dev_scope *devices; /* target devices */
        int     devices_cnt;            /* target device count */
};

struct dmar_atsr_unit {
        struct list_head list;          /* list of ATSR units */
        struct acpi_dmar_header *hdr;   /* ACPI header */
        struct dmar_dev_scope *devices; /* target devices */
        int devices_cnt;                /* target device count */
        u8 include_all:1;               /* include all ports */
};

struct dmar_satc_unit {
        struct list_head list;          /* list of SATC units */
        struct acpi_dmar_header *hdr;   /* ACPI header */
        struct dmar_dev_scope *devices; /* target devices */
        struct intel_iommu *iommu;      /* the corresponding iommu */
        int devices_cnt;                /* target device count */
        u8 atc_required:1;              /* ATS is required */
};

static LIST_HEAD(dmar_atsr_units);
static LIST_HEAD(dmar_rmrr_units);
static LIST_HEAD(dmar_satc_units);

#define for_each_rmrr_units(rmrr) \
        list_for_each_entry(rmrr, &dmar_rmrr_units, list)

static void device_block_translation(struct device *dev);
static void intel_iommu_domain_free(struct iommu_domain *domain);

int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);

int intel_iommu_enabled = 0;
EXPORT_SYMBOL_GPL(intel_iommu_enabled);

static int dmar_map_gfx = 1;
static int intel_iommu_superpage = 1;
static int iommu_identity_mapping;
static int iommu_skip_te_disable;

#define IDENTMAP_GFX            2
#define IDENTMAP_AZALIA         4

const struct iommu_ops intel_iommu_ops;

static bool translation_pre_enabled(struct intel_iommu *iommu)
{
        return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
}

static void clear_translation_pre_enabled(struct intel_iommu *iommu)
{
        iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
}

/* Record whether firmware left DMA translation enabled on this unit. */
static void init_translation_status(struct intel_iommu *iommu)
{
        u32 gsts;

        gsts = readl(iommu->reg + DMAR_GSTS_REG);
        if (gsts & DMA_GSTS_TES)
                iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
}

/* Parse the "intel_iommu=" kernel command-line parameter. */
static int __init intel_iommu_setup(char *str)
{
        if (!str)
                return -EINVAL;

        while (*str) {
                if (!strncmp(str, "on", 2)) {
                        dmar_disabled = 0;
                        pr_info("IOMMU enabled\n");
                } else if (!strncmp(str, "off", 3)) {
                        dmar_disabled = 1;
                        no_platform_optin = 1;
                        pr_info("IOMMU disabled\n");
                } else if (!strncmp(str, "igfx_off", 8)) {
                        dmar_map_gfx = 0;
                        pr_info("Disable GFX device mapping\n");
                } else if (!strncmp(str, "forcedac", 8)) {
                        pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
                        iommu_dma_forcedac = true;
                } else if (!strncmp(str, "strict", 6)) {
                        pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
                        iommu_set_dma_strict();
                } else if (!strncmp(str, "sp_off", 6)) {
                        pr_info("Disable supported super page\n");
                        intel_iommu_superpage = 0;
                } else if (!strncmp(str, "sm_on", 5)) {
                        pr_info("Enable scalable mode if hardware supports\n");
                        intel_iommu_sm = 1;
                } else if (!strncmp(str, "sm_off", 6)) {
                        pr_info("Scalable mode is disallowed\n");
                        intel_iommu_sm = 0;
                } else if (!strncmp(str, "tboot_noforce", 13)) {
                        pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
                        intel_iommu_tboot_noforce = 1;
                } else {
                        pr_notice("Unknown option - '%s'\n", str);
                }

                /* advance past this token and any trailing commas */
                str += strcspn(str, ",");
                while (*str == ',')
                        str++;
        }

        return 1;
}
__setup("intel_iommu=", intel_iommu_setup);

/* Allocate one zeroed page on @node for use as a page-table page. */
void *alloc_pgtable_page(int node, gfp_t gfp)
{
        struct page *page;
        void *vaddr = NULL;

        page = alloc_pages_node(node, gfp | __GFP_ZERO, 0);
        if (page)
                vaddr = page_address(page);
        return vaddr;
}

void free_pgtable_page(void *vaddr)
{
        free_page((unsigned long)vaddr);
}

static inline int domain_type_is_si(struct dmar_domain *domain)
{
        return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
}

/* Whether @pfn fits inside the domain's addressable IOVA range. */
static inline int domain_pfn_supported(struct dmar_domain *domain,
                                       unsigned long pfn)
{
        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;

        return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
}

/*
 * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
 * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
 * the returned SAGAW.
 */
static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
{
        unsigned long fl_sagaw, sl_sagaw;

        /* First level always supports 4-level; 5-level if FL5LP is set. */
        fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
        sl_sagaw = cap_sagaw(iommu->cap);

        /* Second level only. */
        if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
                return sl_sagaw;

        /* First level only.
 */
        if (!ecap_slts(iommu->ecap))
                return fl_sagaw;

        return fl_sagaw & sl_sagaw;
}

/* Largest supported AGAW not exceeding @max_gaw, or -1 if none. */
static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
{
        unsigned long sagaw;
        int agaw;

        sagaw = __iommu_calculate_sagaw(iommu);
        for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
                if (test_bit(agaw, &sagaw))
                        break;
        }

        return agaw;
}

/*
 * Calculate max SAGAW for each iommu.
 */
int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
{
        return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
}

/*
 * calculate agaw for each iommu.
 * "SAGAW" may be different across iommus, use a default agaw, and
 * get a supported less agaw for iommus that don't support the default agaw.
 */
int iommu_calculate_agaw(struct intel_iommu *iommu)
{
        return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
}

/* Page-walk coherency: SMPWC in scalable mode, C otherwise. */
static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
{
        return sm_supported(iommu) ?
                        ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
}

static void domain_update_iommu_coherency(struct dmar_domain *domain)
{
        struct iommu_domain_info *info;
        struct dmar_drhd_unit *drhd;
        struct intel_iommu *iommu;
        bool found = false;
        unsigned long i;

        domain->iommu_coherency = true;
        /* Coherent only if every attached IOMMU walks tables coherently. */
        xa_for_each(&domain->iommu_array, i, info) {
                found = true;
                if (!iommu_paging_structure_coherency(info->iommu)) {
                        domain->iommu_coherency = false;
                        break;
                }
        }
        if (found)
                return;

        /* No hardware attached; use lowest common denominator */
        rcu_read_lock();
        for_each_active_iommu(iommu, drhd) {
                if (!iommu_paging_structure_coherency(iommu)) {
                        domain->iommu_coherency = false;
                        break;
                }
        }
        rcu_read_unlock();
}

/*
 * Return fls() of the superpage mask common to all active IOMMUs except
 * @skip: 0 = none, 1 = 2MiB, 2 = 1GiB (first level caps at 1GiB support).
 */
static int domain_update_iommu_superpage(struct dmar_domain *domain,
                                         struct intel_iommu *skip)
{
        struct dmar_drhd_unit *drhd;
        struct intel_iommu *iommu;
        int mask = 0x3;

        if (!intel_iommu_superpage)
                return 0;

        /* set iommu_superpage to the smallest common denominator */
        rcu_read_lock();
        for_each_active_iommu(iommu, drhd) {
                if (iommu != skip) {
                        if (domain && domain->use_first_level) {
                                if (!cap_fl1gp_support(iommu->cap))
                                        mask = 0x1;
                        } else {
                                mask &= cap_super_page_val(iommu->cap);
                        }

                        if (!mask)
                                break;
                }
        }
        rcu_read_unlock();

        return fls(mask);
}

static int domain_update_device_node(struct dmar_domain *domain)
{
        struct device_domain_info *info;
        int nid = NUMA_NO_NODE;
        unsigned long flags;

        spin_lock_irqsave(&domain->lock, flags);
        list_for_each_entry(info, &domain->devices, link) {
                /*
                 * There could possibly be multiple device numa nodes as devices
                 * within the same domain may sit behind different IOMMUs. There
                 * isn't perfect answer in such situation, so we select first
                 * come first served policy.
                 */
                nid = dev_to_node(info->dev);
                if (nid != NUMA_NO_NODE)
                        break;
        }
        spin_unlock_irqrestore(&domain->lock, flags);

        return nid;
}

static void domain_update_iotlb(struct dmar_domain *domain);

/* Return the super pagesize bitmap if supported. */
static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
{
        unsigned long bitmap = 0;

        /*
         * 1-level super page supports page size of 2MiB, 2-level super page
         * supports page size of both 2MiB and 1GiB.
         */
        if (domain->iommu_superpage == 1)
                bitmap |= SZ_2M;
        else if (domain->iommu_superpage == 2)
                bitmap |= SZ_2M | SZ_1G;

        return bitmap;
}

/* Some capabilities may be different across iommus */
static void domain_update_iommu_cap(struct dmar_domain *domain)
{
        domain_update_iommu_coherency(domain);
        domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);

        /*
         * If RHSA is missing, we should default to the device numa domain
         * as fall back.
         */
        if (domain->nid == NUMA_NO_NODE)
                domain->nid = domain_update_device_node(domain);

        /*
         * First-level translation restricts the input-address to a
         * canonical address (i.e., address bits 63:N have the same
         * value as address bit [N-1], where N is 48-bits with 4-level
         * paging and 57-bits with 5-level paging). Hence, skip bit
         * [N-1].
         */
        if (domain->use_first_level)
                domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
        else
                domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);

        domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
        domain_update_iotlb(domain);
}

/*
 * Look up (and optionally allocate) the context entry for bus/devfn.
 * In scalable mode each root entry covers two context tables (lo/hi),
 * each context entry occupying two slots.
 */
struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
                                         u8 devfn, int alloc)
{
        struct root_entry *root = &iommu->root_entry[bus];
        struct context_entry *context;
        u64 *entry;

        /*
         * Except that the caller requested to allocate a new entry,
         * returning a copied context entry makes no sense.
         */
        if (!alloc && context_copied(iommu, bus, devfn))
                return NULL;

        entry = &root->lo;
        if (sm_supported(iommu)) {
                if (devfn >= 0x80) {
                        devfn -= 0x80;
                        entry = &root->hi;
                }
                devfn *= 2;
        }
        if (*entry & 1)
                context = phys_to_virt(*entry & VTD_PAGE_MASK);
        else {
                unsigned long phy_addr;
                if (!alloc)
                        return NULL;

                context = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
                if (!context)
                        return NULL;

                __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
                phy_addr = virt_to_phys((void *)context);
                *entry = phy_addr | 1;
                __iommu_flush_cache(iommu, entry, sizeof(*entry));
        }
        return &context[devfn];
}

/**
 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
 *                               sub-hierarchy of a candidate PCI-PCI bridge
 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
 * @bridge: the candidate PCI-PCI bridge
 *
 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
 */
static bool
is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
{
        struct pci_dev *pdev, *pbridge;

        if (!dev_is_pci(dev) || !dev_is_pci(bridge))
                return false;

        pdev = to_pci_dev(dev);
        pbridge = to_pci_dev(bridge);

        /* @dev is downstream if its bus number lies in the bridge's range. */
        if (pbridge->subordinate &&
            pbridge->subordinate->number <= pdev->bus->number &&
            pbridge->subordinate->busn_res.end >= pdev->bus->number)
                return true;

        return false;
}

static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
{
        struct dmar_drhd_unit *drhd;
        u32 vtbar;
        int rc;

        /* We know that this device on this chipset has its own IOMMU.
         * If we find it under a different IOMMU, then the BIOS is lying
         * to us. Hope that the IOMMU for this device is actually
         * disabled, and it needs no translation...
         */
        rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
        if (rc) {
                /* "can't" happen */
                dev_info(&pdev->dev, "failed to run vt-d quirk\n");
                return false;
        }
        vtbar &= 0xffff0000;

        /* we know that this iommu should be at offset 0xa000 from vtbar */
        drhd = dmar_find_matched_drhd_unit(pdev);
        if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
                pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
                add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
                return true;
        }

        return false;
}

static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
{
        if (!iommu || iommu->drhd->ignored)
                return true;

        if (dev_is_pci(dev)) {
                struct pci_dev *pdev = to_pci_dev(dev);

                if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
                    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
                    quirk_ioat_snb_local_iommu(pdev))
                        return true;
        }

        return false;
}

/*
 * Find the IOMMU unit covering @dev by walking the DMAR device scopes.
 * On success, optionally returns the scope's bus/devfn via @bus/@devfn.
 * Returns NULL when no (usable) unit covers the device.
 */
struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
{
        struct dmar_drhd_unit *drhd = NULL;
        struct pci_dev *pdev = NULL;
        struct intel_iommu *iommu;
        struct device *tmp;
        u16 segment = 0;
        int i;

        if (!dev)
                return NULL;

        if (dev_is_pci(dev)) {
                struct pci_dev *pf_pdev;

                pdev = pci_real_dma_dev(to_pci_dev(dev));

                /* VFs aren't listed in scope tables; we need to look up
                 * the PF instead to find the IOMMU. */
                pf_pdev = pci_physfn(pdev);
                dev = &pf_pdev->dev;
                segment = pci_domain_nr(pdev->bus);
        } else if (has_acpi_companion(dev))
                dev = &ACPI_COMPANION(dev)->dev;

        rcu_read_lock();
        for_each_iommu(iommu, drhd) {
                if (pdev && segment != drhd->segment)
                        continue;

                for_each_active_dev_scope(drhd->devices,
                                          drhd->devices_cnt, i, tmp) {
                        if (tmp == dev) {
                                /* For a VF use its original BDF# not that of the PF
                                 * which we used for the IOMMU lookup. Strictly speaking
                                 * we could do this for all PCI devices; we only need to
                                 * get the BDF# from the scope table for ACPI matches.
                                 */
                                if (pdev && pdev->is_virtfn)
                                        goto got_pdev;

                                if (bus && devfn) {
                                        *bus = drhd->devices[i].bus;
                                        *devfn = drhd->devices[i].devfn;
                                }
                                goto out;
                        }

                        if (is_downstream_to_pci_bridge(dev, tmp))
                                goto got_pdev;
                }

                if (pdev && drhd->include_all) {
got_pdev:
                        if (bus && devfn) {
                                *bus = pdev->bus->number;
                                *devfn = pdev->devfn;
                        }
                        goto out;
                }
        }
        iommu = NULL;
out:
        if (iommu_is_dummy(iommu, dev))
                iommu = NULL;

        rcu_read_unlock();

        return iommu;
}

/* Flush CPU caches for table memory the IOMMU reads incoherently. */
static void domain_flush_cache(struct dmar_domain *domain,
                               void *addr, int size)
{
        if (!domain->iommu_coherency)
                clflush_cache_range(addr, size);
}

static void free_context_table(struct intel_iommu *iommu)
{
        struct context_entry *context;
        int i;

        if (!iommu->root_entry)
                return;

        for (i = 0; i < ROOT_ENTRY_NR; i++) {
                context = iommu_context_addr(iommu, i, 0, 0);
                if (context)
                        free_pgtable_page(context);

                if (!sm_supported(iommu))
                        continue;

                /* scalable mode has a second context table per root entry */
                context = iommu_context_addr(iommu, i, 0x80, 0);
                if (context)
                        free_pgtable_page(context);
        }

        free_pgtable_page(iommu->root_entry);
        iommu->root_entry = NULL;
}

#ifdef CONFIG_DMAR_DEBUG
static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
                         u8 bus, u8 devfn, struct dma_pte *parent, int level)
{
        struct dma_pte *pte;
        int offset;

        while (1) {
                offset = pfn_level_offset(pfn, level);
                pte = &parent[offset];
                if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
                        pr_info("PTE not present at level %d\n", level);
                        break;
                }

                pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);

                if (level == 1)
                        break;

                parent = phys_to_virt(dma_pte_addr(pte));
                level--;
        }
}

/* Dump root/context/PASID/page-table entries for a faulting address. */
void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
                          unsigned long long addr, u32 pasid)
{
        struct pasid_dir_entry *dir, *pde;
        struct pasid_entry *entries, *pte;
        struct context_entry *ctx_entry;
        struct root_entry *rt_entry;
        int i, dir_index, index, level;
        u8 devfn = source_id & 0xff;
        u8 bus = source_id >> 8;
        struct dma_pte *pgtable;

        pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);

        /* root entry dump */
        rt_entry = &iommu->root_entry[bus];
        if (!rt_entry) {
                pr_info("root table entry is not present\n");
                return;
        }

        if (sm_supported(iommu))
                pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
                        rt_entry->hi, rt_entry->lo);
        else
                pr_info("root entry: 0x%016llx", rt_entry->lo);

        /* context entry dump */
        ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
        if (!ctx_entry) {
                pr_info("context table entry is not present\n");
                return;
        }

        pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
                ctx_entry->hi, ctx_entry->lo);

        /* legacy mode does not require PASID entries */
        if (!sm_supported(iommu)) {
                level = agaw_to_level(ctx_entry->hi & 7);
                pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
                goto pgtable_walk;
        }

        /* get the pointer to pasid directory entry */
        dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
        if (!dir) {
                pr_info("pasid directory entry is not present\n");
                return;
        }
        /* For request-without-pasid, get the pasid from context entry */
        if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID)
                pasid = IOMMU_NO_PASID;

        dir_index = pasid >> PASID_PDE_SHIFT;
        pde = &dir[dir_index];
        pr_info("pasid dir entry: 0x%016llx\n", pde->val);

        /* get the pointer to the pasid table entry */
        entries = get_pasid_table_from_pde(pde);
        if (!entries) {
                pr_info("pasid table entry is not present\n");
                return;
        }
        index = pasid & PASID_PTE_MASK;
        pte = &entries[index];
        for (i = 0; i < ARRAY_SIZE(pte->val); i++)
                pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);

        if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
                level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
                pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
        } else {
                level = agaw_to_level((pte->val[0] >> 2) & 0x7);
                pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
        }

pgtable_walk:
        pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
}
#endif

/*
 * Walk (and, if @gfp allows, grow) the page table down to *target_level
 * for @pfn; returns the PTE, or NULL if the PFN is out of range or an
 * allocation fails. *target_level of 0 means "stop at the first leaf".
 */
static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
                                      unsigned long pfn, int *target_level,
                                      gfp_t gfp)
{
        struct dma_pte *parent, *pte;
        int level = agaw_to_level(domain->agaw);
        int offset;

        if (!domain_pfn_supported(domain, pfn))
                /* Address beyond IOMMU's addressing capabilities. */
                return NULL;

        parent = domain->pgd;

        while (1) {
                void *tmp_page;

                offset = pfn_level_offset(pfn, level);
                pte = &parent[offset];
                if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
                        break;
                if (level == *target_level)
                        break;

                if (!dma_pte_present(pte)) {
                        uint64_t pteval;

                        tmp_page = alloc_pgtable_page(domain->nid, gfp);

                        if (!tmp_page)
                                return NULL;

                        domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
                        pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
                        if (domain->use_first_level)
                                pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;

                        if (cmpxchg64(&pte->val, 0ULL, pteval))
                                /* Someone else set it while we were thinking; use theirs.
*/ 954 free_pgtable_page(tmp_page); 955 else 956 domain_flush_cache(domain, pte, sizeof(*pte)); 957 } 958 if (level == 1) 959 break; 960 961 parent = phys_to_virt(dma_pte_addr(pte)); 962 level--; 963 } 964 965 if (!*target_level) 966 *target_level = level; 967 968 return pte; 969 } 970 971 /* return address's pte at specific level */ 972 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain, 973 unsigned long pfn, 974 int level, int *large_page) 975 { 976 struct dma_pte *parent, *pte; 977 int total = agaw_to_level(domain->agaw); 978 int offset; 979 980 parent = domain->pgd; 981 while (level <= total) { 982 offset = pfn_level_offset(pfn, total); 983 pte = &parent[offset]; 984 if (level == total) 985 return pte; 986 987 if (!dma_pte_present(pte)) { 988 *large_page = total; 989 break; 990 } 991 992 if (dma_pte_superpage(pte)) { 993 *large_page = total; 994 return pte; 995 } 996 997 parent = phys_to_virt(dma_pte_addr(pte)); 998 total--; 999 } 1000 return NULL; 1001 } 1002 1003 /* clear last level pte, a tlb flush should be followed */ 1004 static void dma_pte_clear_range(struct dmar_domain *domain, 1005 unsigned long start_pfn, 1006 unsigned long last_pfn) 1007 { 1008 unsigned int large_page; 1009 struct dma_pte *first_pte, *pte; 1010 1011 if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) || 1012 WARN_ON(start_pfn > last_pfn)) 1013 return; 1014 1015 /* we don't need lock here; nobody else touches the iova range */ 1016 do { 1017 large_page = 1; 1018 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page); 1019 if (!pte) { 1020 start_pfn = align_to_level(start_pfn + 1, large_page + 1); 1021 continue; 1022 } 1023 do { 1024 dma_clear_pte(pte); 1025 start_pfn += lvl_to_nr_pages(large_page); 1026 pte++; 1027 } while (start_pfn <= last_pfn && !first_pte_in_page(pte)); 1028 1029 domain_flush_cache(domain, first_pte, 1030 (void *)pte - (void *)first_pte); 1031 1032 } while (start_pfn && start_pfn <= last_pfn); 1033 } 1034 1035 static void 
dma_pte_free_level(struct dmar_domain *domain, int level,
		   int retain_level, struct dma_pte *pte,
		   unsigned long pfn, unsigned long start_pfn,
		   unsigned long last_pfn)
{
	pfn = max(start_pfn, pfn);
	pte = &pte[pfn_level_offset(pfn, level)];

	do {
		unsigned long level_pfn;
		struct dma_pte *level_pte;

		/* Nothing below a non-present or superpage entry to free */
		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
			goto next;

		level_pfn = pfn & level_mask(level);
		level_pte = phys_to_virt(dma_pte_addr(pte));

		/* Depth-first: free lower-level tables before this one */
		if (level > 2) {
			dma_pte_free_level(domain, level - 1, retain_level,
					   level_pte, level_pfn, start_pfn,
					   last_pfn);
		}

		/*
		 * Free the page table if we're below the level we want to
		 * retain and the range covers the entire table.
		 */
		if (level < retain_level && !(start_pfn > level_pfn ||
		      last_pfn < level_pfn + level_size(level) - 1)) {
			dma_clear_pte(pte);
			domain_flush_cache(domain, pte, sizeof(*pte));
			free_pgtable_page(level_pte);
		}
next:
		pfn += level_size(level);
	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
}

/*
 * clear last level (leaf) ptes and free page table pages below the
 * level we wish to keep intact.
 */
static void dma_pte_free_pagetable(struct dmar_domain *domain,
				   unsigned long start_pfn,
				   unsigned long last_pfn,
				   int retain_level)
{
	dma_pte_clear_range(domain, start_pfn, last_pfn);

	/* We don't need lock here; nobody else touches the iova range */
	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
			   domain->pgd, 0, start_pfn, last_pfn);

	/* free pgd if the entire address space was covered */
	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
		free_pgtable_page(domain->pgd);
		domain->pgd = NULL;
	}
}

/* When a page at a given level is being unlinked from its parent, we don't
   need to *modify* it at all. All we need to do is make a list of all the
   pages which can be freed just as soon as we've flushed the IOTLB and we
   know the hardware page-walk will no longer touch them.
   The 'pte' argument is the *parent* PTE, pointing to the page that is to
   be freed. */
static void dma_pte_list_pagetables(struct dmar_domain *domain,
				    int level, struct dma_pte *pte,
				    struct list_head *freelist)
{
	struct page *pg;

	/* Queue the page table page itself for deferred freeing */
	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
	list_add_tail(&pg->lru, freelist);

	if (level == 1)
		return;

	/* Recursively queue every present, non-superpage child table */
	pte = page_address(pg);
	do {
		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
			dma_pte_list_pagetables(domain, level - 1, pte, freelist);
		pte++;
	} while (!first_pte_in_page(pte));
}

/*
 * Clear the PTEs covering [start_pfn, last_pfn] at this level, collecting
 * fully-covered page table pages onto @freelist for freeing after the
 * IOTLB flush. Flushes the CPU cache once over the modified PTE span.
 */
static void dma_pte_clear_level(struct dmar_domain *domain, int level,
				struct dma_pte *pte, unsigned long pfn,
				unsigned long start_pfn, unsigned long last_pfn,
				struct list_head *freelist)
{
	struct dma_pte *first_pte = NULL, *last_pte = NULL;

	pfn = max(start_pfn, pfn);
	pte = &pte[pfn_level_offset(pfn, level)];

	do {
		unsigned long level_pfn = pfn & level_mask(level);

		if (!dma_pte_present(pte))
			goto next;

		/* If range covers entire pagetable, free it */
		if (start_pfn <= level_pfn &&
		    last_pfn >= level_pfn + level_size(level) - 1) {
			/* These subordinate page tables are going away entirely. Don't
			   bother to clear them; we're just going to *free* them. */
			if (level > 1 && !dma_pte_superpage(pte))
				dma_pte_list_pagetables(domain, level - 1, pte, freelist);

			dma_clear_pte(pte);
			if (!first_pte)
				first_pte = pte;
			last_pte = pte;
		} else if (level > 1) {
			/* Recurse down into a level that isn't *entirely* obsolete */
			dma_pte_clear_level(domain, level - 1,
					    phys_to_virt(dma_pte_addr(pte)),
					    level_pfn, start_pfn, last_pfn,
					    freelist);
		}
next:
		pfn = level_pfn + level_size(level);
	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);

	/* One cache flush for the whole contiguous run of cleared PTEs */
	if (first_pte)
		domain_flush_cache(domain, first_pte,
				   (void *)++last_pte - (void *)first_pte);
}

/* We can't just free the pages because the IOMMU may still be walking
   the page tables, and may have cached the intermediate levels. The
   pages can only be freed after the IOTLB flush has been done. */
static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
			 unsigned long last_pfn, struct list_head *freelist)
{
	if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
	    WARN_ON(start_pfn > last_pfn))
		return;

	/* we don't need lock here; nobody else touches the iova range */
	dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
			    domain->pgd, 0, start_pfn, last_pfn, freelist);

	/* free pgd: whole address space unmapped, so queue the root too */
	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
		struct page *pgd_page = virt_to_page(domain->pgd);
		list_add_tail(&pgd_page->lru, freelist);
		domain->pgd = NULL;
	}
}

/* iommu handling */

/*
 * Allocate and cache-flush the root entry table for @iommu.
 * Returns 0 on success, -ENOMEM if the page allocation fails.
 */
static int iommu_alloc_root_entry(struct intel_iommu *iommu)
{
	struct root_entry *root;

	root = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
	if (!root) {
		pr_err("Allocating root entry for %s failed\n",
			iommu->name);
		return -ENOMEM;
	}

	/* Make sure hardware sees the new table before we point at it */
	__iommu_flush_cache(iommu, root, ROOT_SIZE);
	iommu->root_entry = root;

	return 0;
}

/*
 * Program the root table address register and issue Set Root Table
 * Pointer (SRTP), then flush the translation caches unless the hardware
 * does that itself as part of the SRTP flow (ESRTPS capability).
 */
static void iommu_set_root_entry(struct intel_iommu *iommu)
{
	u64 addr;
	u32 sts;
	unsigned long flag;

	addr = virt_to_phys(iommu->root_entry);
	if (sm_supported(iommu))
		addr |= DMA_RTADDR_SMT;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);

	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (sts & DMA_GSTS_RTPS), sts);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);

	/*
	 * Hardware invalidates all DMA remapping hardware translation
	 * caches as part of SRTP flow.
	 */
	if (cap_esrtps(iommu->cap))
		return;

	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
	if (sm_supported(iommu))
		qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
}

/*
 * Flush the IOMMU's internal write buffer, needed on hardware with the
 * RWBF capability (or the rwbf quirk) before translations can observe
 * recently written page table entries.
 */
void iommu_flush_write_buffer(struct intel_iommu *iommu)
{
	u32 val;
	unsigned long flag;

	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
		return;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (!(val & DMA_GSTS_WBFS)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}

/*
 * Issue a context-cache invalidation of the given @type (global, domain-
 * selective or device-selective) through the CCMD register and wait for
 * hardware to complete it. Unknown types are rejected with a warning.
 * NOTE(review): returns void — any write-buffer flush is the caller's
 * responsibility despite what an older comment here claimed.
 */
static void __iommu_flush_context(struct intel_iommu *iommu,
				  u16 did, u16 source_id, u8 function_mask,
				  u64 type)
{
	u64 val = 0;
	unsigned long flag;

	switch (type) {
	case DMA_CCMD_GLOBAL_INVL:
		val = DMA_CCMD_GLOBAL_INVL;
		break;
	case DMA_CCMD_DOMAIN_INVL:
		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
		break;
	case DMA_CCMD_DEVICE_INVL:
		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
		break;
	default:
		pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n",
			iommu->name, type);
		return;
	}
	val |= DMA_CCMD_ICC;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
		      dmar_readq, (!(val & DMA_CCMD_ICC)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}

/*
 * Issue an IOTLB invalidation of the given @type (global, domain-
 * selective or page-selective) via the register-based interface and wait
 * for completion, then report if hardware fell back to a coarser
 * granularity than requested.
 * NOTE(review): returns void — the stale "return value" comment that
 * used to sit here did not match the signature.
 */
static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
				u64 addr, unsigned int size_order, u64 type)
{
	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
	u64 val = 0, val_iva = 0;
	unsigned long flag;

	switch (type) {
	case DMA_TLB_GLOBAL_FLUSH:
		/* global flush doesn't need set IVA_REG */
		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
		break;
	case DMA_TLB_DSI_FLUSH:
		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
		break;
	case DMA_TLB_PSI_FLUSH:
		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
		/* IH bit is passed in as part of address */
		val_iva = size_order | addr;
		break;
	default:
		pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n",
			iommu->name, type);
		return;
	}

	if (cap_write_drain(iommu->cap))
		val |= DMA_TLB_WRITE_DRAIN;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	/* Note: Only uses first TLB reg currently */
	if (val_iva)
		dmar_writeq(iommu->reg + tlb_offset, val_iva);
	dmar_writeq(iommu->reg + tlb_offset + 8, val);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
		      dmar_readq, (!(val & DMA_TLB_IVT)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);

	/* check IOTLB invalidation granularity */
	if (DMA_TLB_IAIG(val) == 0)
		pr_err("Flush IOTLB failed\n");
	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
		pr_debug("TLB flush request %Lx, actual %Lx\n",
			(unsigned long long)DMA_TLB_IIRG(type),
			(unsigned long long)DMA_TLB_IAIG(val));
}

/*
 * Find the device_domain_info attached to @domain that matches the
 * given iommu/bus/devfn triple, or NULL if none is attached.
 */
static struct device_domain_info *
domain_lookup_dev_info(struct dmar_domain *domain,
		       struct intel_iommu *iommu, u8 bus, u8 devfn)
{
	struct device_domain_info *info;
	unsigned long flags;

	spin_lock_irqsave(&domain->lock, flags);
	list_for_each_entry(info, &domain->devices, link) {
		if (info->iommu == iommu && info->bus == bus &&
		    info->devfn == devfn) {
			spin_unlock_irqrestore(&domain->lock, flags);
			return info;
		}
	}
	spin_unlock_irqrestore(&domain->lock, flags);

	return NULL;
}

/*
 * Recompute domain->has_iotlb_device: true if any attached device (or
 * any device with an attached PASID) currently has ATS enabled, so
 * unmap paths know whether device-TLB flushes are required.
 */
static void domain_update_iotlb(struct dmar_domain *domain)
{
	struct dev_pasid_info *dev_pasid;
	struct device_domain_info *info;
	bool has_iotlb_device = false;
	unsigned long flags;

	spin_lock_irqsave(&domain->lock, flags);
	list_for_each_entry(info, &domain->devices, link) {
		if (info->ats_enabled) {
			has_iotlb_device = true;
			break;
		}
	}

	list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) {
		info = dev_iommu_priv_get(dev_pasid->dev);
		if (info->ats_enabled) {
			has_iotlb_device = true;
			break;
		}
	}
	domain->has_iotlb_device = has_iotlb_device;
	spin_unlock_irqrestore(&domain->lock, flags);
}

/*
 * The extra devTLB flush quirk impacts those QAT devices with PCI device
 * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device()
 * check because it applies only to the built-in QAT devices and it doesn't
 * grant additional privileges.
 */
#define BUGGY_QAT_DEVID_MASK 0x4940
static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
{
	if (pdev->vendor != PCI_VENDOR_ID_INTEL)
		return false;

	/* Matches device IDs 0x4940-0x4943 (low two bits masked off) */
	if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
		return false;

	return true;
}

/*
 * Enable PASID and ATS on @info's device where supported. PASID must be
 * enabled before ATS (see comment below); enabling ATS updates the
 * domain's has_iotlb_device state.
 */
static void iommu_enable_pci_caps(struct device_domain_info *info)
{
	struct pci_dev *pdev;

	if (!dev_is_pci(info->dev))
		return;

	pdev = to_pci_dev(info->dev);

	/* The PCIe spec, in its wisdom, declares that the behaviour of
	   the device if you enable PASID support after ATS support is
	   undefined. So always enable PASID support on devices which
	   have it, even if we can't yet know if we're ever going to
	   use it. */
	if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
		info->pasid_enabled = 1;

	if (info->ats_supported && pci_ats_page_aligned(pdev) &&
	    !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
		info->ats_enabled = 1;
		domain_update_iotlb(info->domain);
	}
}

/* Reverse of iommu_enable_pci_caps(): ATS first, then PASID. */
static void iommu_disable_pci_caps(struct device_domain_info *info)
{
	struct pci_dev *pdev;

	if (!dev_is_pci(info->dev))
		return;

	pdev = to_pci_dev(info->dev);

	if (info->ats_enabled) {
		pci_disable_ats(pdev);
		info->ats_enabled = 0;
		domain_update_iotlb(info->domain);
	}

	if (info->pasid_enabled) {
		pci_disable_pasid(pdev);
		info->pasid_enabled = 0;
	}
}

/*
 * Flush the device IOTLB (ATS translation cache) for one device, plus
 * the extra flush some QAT devices need. No-op if ATS isn't enabled.
 */
static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
				    u64 addr, unsigned int mask)
{
	u16 sid, qdep;

	if (!info || !info->ats_enabled)
		return;

	sid = info->bus << 8 | info->devfn;
	qdep = info->ats_qdep;
	qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
			   qdep, addr, mask);
	quirk_extra_dev_tlb_flush(info, addr, mask, IOMMU_NO_PASID, qdep);
}

static void
iommu_flush_dev_iotlb(struct dmar_domain *domain,
		      u64 addr, unsigned mask)
{
	struct dev_pasid_info *dev_pasid;
	struct device_domain_info *info;
	unsigned long flags;

	/* Flush device IOTLBs on every ATS-enabled device of @domain,
	 * including per-PASID attachments. Skipped entirely when no
	 * attached device has ATS enabled. */
	if (!domain->has_iotlb_device)
		return;

	spin_lock_irqsave(&domain->lock, flags);
	list_for_each_entry(info, &domain->devices, link)
		__iommu_flush_dev_iotlb(info, addr, mask);

	list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) {
		info = dev_iommu_priv_get(dev_pasid->dev);

		if (!info->ats_enabled)
			continue;

		qi_flush_dev_iotlb_pasid(info->iommu,
					 PCI_DEVID(info->bus, info->devfn),
					 info->pfsid, dev_pasid->pasid,
					 info->ats_qdep, addr,
					 mask);
	}
	spin_unlock_irqrestore(&domain->lock, flags);
}

/*
 * First-level (PASID-tagged) IOTLB flush for @domain on @iommu: flush
 * each attached PASID, plus the no-PASID entry when plain devices are
 * attached.
 */
static void domain_flush_pasid_iotlb(struct intel_iommu *iommu,
				     struct dmar_domain *domain, u64 addr,
				     unsigned long npages, bool ih)
{
	u16 did = domain_id_iommu(domain, iommu);
	struct dev_pasid_info *dev_pasid;
	unsigned long flags;

	spin_lock_irqsave(&domain->lock, flags);
	list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain)
		qi_flush_piotlb(iommu, did, dev_pasid->pasid, addr, npages, ih);

	if (!list_empty(&domain->devices))
		qi_flush_piotlb(iommu, did, IOMMU_NO_PASID, addr, npages, ih);
	spin_unlock_irqrestore(&domain->lock, flags);
}

/*
 * Page-selective IOTLB flush of [pfn, pfn + pages) for @domain on
 * @iommu. @ih hints that only leaf entries changed; @map indicates a
 * non-present-to-present change (device IOTLBs then need no flush in
 * caching mode).
 */
static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
				  struct dmar_domain *domain,
				  unsigned long pfn, unsigned int pages,
				  int ih, int map)
{
	unsigned int aligned_pages = __roundup_pow_of_two(pages);
	unsigned int mask = ilog2(aligned_pages);
	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
	u16 did = domain_id_iommu(domain, iommu);

	if (WARN_ON(!pages))
		return;

	/* IH (invalidation hint) lives at bit 6 of the IVA address */
	if (ih)
		ih = 1 << 6;

	if (domain->use_first_level) {
		domain_flush_pasid_iotlb(iommu, domain, addr,
					 pages, ih);
	} else {
		unsigned long bitmask = aligned_pages - 1;

		/*
		 * PSI masks the low order bits of the base address. If the
		 * address isn't aligned to the mask, then compute a mask value
		 * needed to ensure the target range is flushed.
		 */
		if (unlikely(bitmask & pfn)) {
			unsigned long end_pfn = pfn + pages - 1, shared_bits;

			/*
			 * Since end_pfn <= pfn + bitmask, the only way bits
			 * higher than bitmask can differ in pfn and end_pfn is
			 * by carrying. This means after masking out bitmask,
			 * high bits starting with the first set bit in
			 * shared_bits are all equal in both pfn and end_pfn.
			 */
			shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
			mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
		}

		/*
		 * Fallback to domain selective flush if no PSI support or
		 * the size is too big.
		 */
		if (!cap_pgsel_inv(iommu->cap) ||
		    mask > cap_max_amask_val(iommu->cap))
			iommu->flush.flush_iotlb(iommu, did, 0, 0,
						 DMA_TLB_DSI_FLUSH);
		else
			iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
						 DMA_TLB_PSI_FLUSH);
	}

	/*
	 * In caching mode, changes of pages from non-present to present require
	 * flush. However, device IOTLB doesn't need to be flushed in this case.
	 */
	if (!cap_caching_mode(iommu->cap) || !map)
		iommu_flush_dev_iotlb(domain, addr, mask);
}

/* Notification for newly created mappings */
static inline void __mapping_notify_one(struct intel_iommu *iommu,
					struct dmar_domain *domain,
					unsigned long pfn, unsigned int pages)
{
	/*
	 * It's a non-present to present mapping. Only flush if caching mode
	 * and second level.
	 */
	if (cap_caching_mode(iommu->cap) && !domain->use_first_level)
		iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
	else
		iommu_flush_write_buffer(iommu);
}

/*
 * Flush the whole IOTLB (and device IOTLBs where needed) for a domain,
 * on every IOMMU the domain is attached to.
 */
static void intel_flush_iotlb_all(struct iommu_domain *domain)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	struct iommu_domain_info *info;
	unsigned long idx;

	xa_for_each(&dmar_domain->iommu_array, idx, info) {
		struct intel_iommu *iommu = info->iommu;
		u16 did = domain_id_iommu(dmar_domain, iommu);

		if (dmar_domain->use_first_level)
			domain_flush_pasid_iotlb(iommu, dmar_domain, 0, -1, 0);
		else
			iommu->flush.flush_iotlb(iommu, did, 0, 0,
						 DMA_TLB_DSI_FLUSH);

		/* Caching-mode IOTLB flush above already covers devTLBs */
		if (!cap_caching_mode(iommu->cap))
			iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH);
	}
}

/*
 * Clear the Enable Protected Memory bit so the protected low/high
 * memory regions no longer block DMA.
 */
static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
{
	u32 pmen;
	unsigned long flags;

	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
		return;

	raw_spin_lock_irqsave(&iommu->register_lock, flags);
	pmen = readl(iommu->reg + DMAR_PMEN_REG);
	pmen &= ~DMA_PMEN_EPM;
	writel(pmen, iommu->reg + DMAR_PMEN_REG);

	/* wait for the protected region status bit to clear */
	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
		      readl, !(pmen & DMA_PMEN_PRS), pmen);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
}

/* Set Translation Enable in GCMD and wait for hardware to confirm. */
static void iommu_enable_translation(struct intel_iommu *iommu)
{
	u32 sts;
	unsigned long flags;

	raw_spin_lock_irqsave(&iommu->register_lock, flags);
	iommu->gcmd |= DMA_GCMD_TE;
	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (sts & DMA_GSTS_TES), sts);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
}

static void
iommu_disable_translation(struct intel_iommu *iommu)
{
	u32 sts;
	unsigned long flag;

	/* Skip disabling on units dedicated to graphics when the
	 * iommu_skip_te_disable quirk is in effect and draining is
	 * supported (see gfx_dedicated / drain capability checks). */
	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
		return;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	iommu->gcmd &= ~DMA_GCMD_TE;
	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (!(sts & DMA_GSTS_TES)), sts);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}

/*
 * Allocate the domain-ID bitmap for @iommu and reserve the IDs that
 * must never be handed out to a real domain.
 * Returns 0 on success, -ENOMEM on allocation failure.
 */
static int iommu_init_domains(struct intel_iommu *iommu)
{
	u32 ndomains;

	ndomains = cap_ndoms(iommu->cap);
	pr_debug("%s: Number of Domains supported <%d>\n",
		 iommu->name, ndomains);

	spin_lock_init(&iommu->lock);

	iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
	if (!iommu->domain_ids)
		return -ENOMEM;

	/*
	 * If Caching mode is set, then invalid translations are tagged
	 * with domain-id 0, hence we need to pre-allocate it. We also
	 * use domain-id 0 as a marker for non-allocated domain-id, so
	 * make sure it is not used for a real domain.
	 */
	set_bit(0, iommu->domain_ids);

	/*
	 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
	 * entry for first-level or pass-through translation modes should
	 * be programmed with a domain id different from those used for
	 * second-level or nested translation. We reserve a domain id for
	 * this purpose.
	 */
	if (sm_supported(iommu))
		set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);

	return 0;
}

/*
 * Quiesce translation on @iommu before teardown. Bails out (with a
 * warning) if any non-reserved domain IDs are still allocated.
 */
static void disable_dmar_iommu(struct intel_iommu *iommu)
{
	if (!iommu->domain_ids)
		return;

	/*
	 * All iommu domains must have been detached from the devices,
	 * hence there should be no domain IDs in use.
	 */
	if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
		    > NUM_RESERVED_DID))
		return;

	if (iommu->gcmd & DMA_GCMD_TE)
		iommu_disable_translation(iommu);
}

/* Release all per-iommu allocations: ID bitmaps, context table, PRQ. */
static void free_dmar_iommu(struct intel_iommu *iommu)
{
	if (iommu->domain_ids) {
		bitmap_free(iommu->domain_ids);
		iommu->domain_ids = NULL;
	}

	if (iommu->copied_tables) {
		bitmap_free(iommu->copied_tables);
		iommu->copied_tables = NULL;
	}

	/* free context mapping */
	free_context_table(iommu);

#ifdef CONFIG_INTEL_IOMMU_SVM
	if (pasid_supported(iommu)) {
		if (ecap_prs(iommu->ecap))
			intel_svm_finish_prq(iommu);
	}
#endif
}

/*
 * Check and return whether first level is used by default for
 * DMA translation.
 */
static bool first_level_by_default(unsigned int type)
{
	/* Only SL is available in legacy mode */
	if (!scalable_mode_support())
		return false;

	/* Only level (either FL or SL) is available, just use it */
	if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
		return intel_cap_flts_sanity();

	/* Both levels are available, decide it based on domain type */
	return type != IOMMU_DOMAIN_UNMANAGED;
}

/*
 * Allocate and initialise an empty dmar_domain of the given iommu
 * domain @type. Returns NULL on allocation failure.
 */
static struct dmar_domain *alloc_domain(unsigned int type)
{
	struct dmar_domain *domain;

	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
	if (!domain)
		return NULL;

	domain->nid = NUMA_NO_NODE;
	if (first_level_by_default(type))
		domain->use_first_level = true;
	domain->has_iotlb_device = false;
	INIT_LIST_HEAD(&domain->devices);
	INIT_LIST_HEAD(&domain->dev_pasids);
	spin_lock_init(&domain->lock);
	xa_init(&domain->iommu_array);

	return domain;
}

/*
 * Take a reference on @iommu for @domain, allocating a domain ID and an
 * iommu_domain_info entry on first attach. Returns 0 on success,
 * -ENOMEM / -ENOSPC / xarray error on failure.
 */
static int domain_attach_iommu(struct dmar_domain *domain,
			       struct intel_iommu *iommu)
{
	struct iommu_domain_info *info, *curr;
	unsigned long ndomains;
	int num, ret = -ENOSPC;

	/* Allocate outside the lock; freed below if it turns out the
	 * domain is already attached to this iommu. */
	info = kzalloc(sizeof(*info), GFP_KERNEL);
	if (!info)
		return -ENOMEM;

	spin_lock(&iommu->lock);
	curr = xa_load(&domain->iommu_array, iommu->seq_id);
	if (curr) {
		curr->refcnt++;
		spin_unlock(&iommu->lock);
		kfree(info);
		return 0;
	}

	ndomains = cap_ndoms(iommu->cap);
	num = find_first_zero_bit(iommu->domain_ids, ndomains);
	if (num >= ndomains) {
		pr_err("%s: No free domain ids\n", iommu->name);
		goto err_unlock;
	}

	set_bit(num, iommu->domain_ids);
	info->refcnt	= 1;
	info->did	= num;
	info->iommu	= iommu;
	curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
			  NULL, info, GFP_ATOMIC);
	if (curr) {
		ret = xa_err(curr) ? : -EBUSY;
		goto err_clear;
	}
	domain_update_iommu_cap(domain);

	spin_unlock(&iommu->lock);
	return 0;

err_clear:
	clear_bit(info->did, iommu->domain_ids);
err_unlock:
	spin_unlock(&iommu->lock);
	kfree(info);
	return ret;
}

/*
 * Drop a reference taken by domain_attach_iommu(); on the last drop the
 * domain ID is released and the info entry freed.
 */
static void domain_detach_iommu(struct dmar_domain *domain,
				struct intel_iommu *iommu)
{
	struct iommu_domain_info *info;

	spin_lock(&iommu->lock);
	info = xa_load(&domain->iommu_array, iommu->seq_id);
	if (--info->refcnt == 0) {
		clear_bit(info->did, iommu->domain_ids);
		xa_erase(&domain->iommu_array, iommu->seq_id);
		domain->nid = NUMA_NO_NODE;
		domain_update_iommu_cap(domain);
		kfree(info);
	}
	spin_unlock(&iommu->lock);
}

/*
 * Round a guest address width up to the next value the page table
 * levels can express (12 + multiple of 9 bits), capped at 64.
 */
static inline int guestwidth_to_adjustwidth(int gaw)
{
	int agaw;
	int r = (gaw - 12) % 9;

	if (r == 0)
		agaw = gaw;
	else
		agaw = gaw + 9 - r;
	if (agaw > 64)
		agaw = 64;
	return agaw;
}

/* Tear down a domain: unmap everything, free page tables, free it. */
static void domain_exit(struct dmar_domain *domain)
{
	if (domain->pgd) {
		LIST_HEAD(freelist);

		domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
		put_pages_list(&freelist);
	}

	if (WARN_ON(!list_empty(&domain->devices)))
		return;

	kfree(domain);
}

/*
 * Get the PASID directory size for scalable mode context entry.
 * Value of X in the PDTS field of a scalable mode context entry
 * indicates PASID directory with 2^(X + 7) entries.
 */
static inline unsigned long context_get_sm_pds(struct pasid_table *table)
{
	unsigned long pds, max_pde;

	max_pde = table->max_pasid >> PASID_PDE_SHIFT;
	pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS);
	if (pds < 7)
		return 0;

	return pds - 7;
}

/*
 * Set the RID_PASID field of a scalable mode context entry. The
 * IOMMU hardware will use the PASID value set in this field for
 * DMA translations of DMA requests without PASID.
 */
static inline void
context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
{
	/* RID_PASID is a 20-bit field */
	context->hi |= pasid & ((1 << 20) - 1);
}

/*
 * Set the DTE(Device-TLB Enable) field of a scalable mode context
 * entry.
 */
static inline void context_set_sm_dte(struct context_entry *context)
{
	context->lo |= BIT_ULL(2);
}

/*
 * Set the PRE(Page Request Enable) field of a scalable mode context
 * entry.
 */
static inline void context_set_sm_pre(struct context_entry *context)
{
	context->lo |= BIT_ULL(4);
}

/* Convert value to context PASID directory size field coding. */
#define context_pdts(pds)	(((pds) & 0x7) << 9)

/*
 * Program one context entry (bus/devfn on @iommu) to point at @domain's
 * translation structures: the PASID table in scalable mode, or the
 * second-level page tables (or pass-through) in legacy mode.
 * Returns 0 on success, -ENOMEM if the context/page-table walk fails.
 */
static int domain_context_mapping_one(struct dmar_domain *domain,
				      struct intel_iommu *iommu,
				      struct pasid_table *table,
				      u8 bus, u8 devfn)
{
	struct device_domain_info *info =
			domain_lookup_dev_info(domain, iommu, bus, devfn);
	u16 did = domain_id_iommu(domain, iommu);
	int translation = CONTEXT_TT_MULTI_LEVEL;
	struct context_entry *context;
	int ret;

	if (hw_pass_through && domain_type_is_si(domain))
		translation = CONTEXT_TT_PASS_THROUGH;

	pr_debug("Set context mapping for %02x:%02x.%d\n",
		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));

	spin_lock(&iommu->lock);
	ret = -ENOMEM;
	context = iommu_context_addr(iommu, bus, devfn, 1);
	if (!context)
		goto out_unlock;

	/* Already programmed (and not a kdump-copied entry): nothing to do */
	ret = 0;
	if (context_present(context) && !context_copied(iommu, bus, devfn))
		goto out_unlock;

	/*
	 * For kdump cases, old valid entries may be cached due to the
	 * in-flight DMA and copied pgtable, but there is no unmapping
	 * behaviour for them, thus we need an explicit cache flush for
	 * the newly-mapped device. For kdump, at this point, the device
	 * is supposed to finish reset at its driver probe stage, so no
	 * in-flight DMA will exist, and we don't need to worry anymore
	 * hereafter.
	 */
	if (context_copied(iommu, bus, devfn)) {
		u16 did_old = context_domain_id(context);

		if (did_old < cap_ndoms(iommu->cap)) {
			iommu->flush.flush_context(iommu, did_old,
						   (((u16)bus) << 8) | devfn,
						   DMA_CCMD_MASK_NOBIT,
						   DMA_CCMD_DEVICE_INVL);
			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
						 DMA_TLB_DSI_FLUSH);
		}

		clear_context_copied(iommu, bus, devfn);
	}

	context_clear_entry(context);

	if (sm_supported(iommu)) {
		unsigned long pds;

		/* Setup the PASID DIR pointer: */
		pds = context_get_sm_pds(table);
		context->lo = (u64)virt_to_phys(table->table) |
				context_pdts(pds);

		/* Setup the RID_PASID field: */
		context_set_sm_rid2pasid(context, IOMMU_NO_PASID);

		/*
		 * Setup the Device-TLB enable bit and Page request
		 * Enable bit:
		 */
		if (info && info->ats_supported)
			context_set_sm_dte(context);
		if (info && info->pri_supported)
			context_set_sm_pre(context);
		if (info && info->pasid_supported)
			context_set_pasid(context);
	} else {
		struct dma_pte *pgd = domain->pgd;
		int agaw;

		context_set_domain_id(context, did);

		if (translation != CONTEXT_TT_PASS_THROUGH) {
			/*
			 * Skip top levels of page tables for iommu which has
			 * less agaw than default. Unnecessary for PT mode.
			 */
			for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
				ret = -ENOMEM;
				pgd = phys_to_virt(dma_pte_addr(pgd));
				if (!dma_pte_present(pgd))
					goto out_unlock;
			}

			if (info && info->ats_supported)
				translation = CONTEXT_TT_DEV_IOTLB;
			else
				translation = CONTEXT_TT_MULTI_LEVEL;

			context_set_address_root(context, virt_to_phys(pgd));
			context_set_address_width(context, agaw);
		} else {
			/*
			 * In pass through mode, AW must be programmed to
			 * indicate the largest AGAW value supported by
			 * hardware.
			 * And ASR is ignored by hardware.
			 */
			context_set_address_width(context, iommu->msagaw);
		}

		context_set_translation_type(context, translation);
	}

	context_set_fault_enable(context);
	context_set_present(context);
	if (!ecap_coherent(iommu->ecap))
		clflush_cache_range(context, sizeof(*context));

	/*
	 * It's a non-present to present mapping. If hardware doesn't cache
	 * non-present entry we only need to flush the write-buffer. If the
	 * _does_ cache non-present entries, then it does so in the special
	 * domain #0, which we have to flush:
	 */
	if (cap_caching_mode(iommu->cap)) {
		iommu->flush.flush_context(iommu, 0,
					   (((u16)bus) << 8) | devfn,
					   DMA_CCMD_MASK_NOBIT,
					   DMA_CCMD_DEVICE_INVL);
		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
	} else {
		iommu_flush_write_buffer(iommu);
	}

	ret = 0;

out_unlock:
	spin_unlock(&iommu->lock);

	return ret;
}

/* Carries the arguments of domain_context_mapping_one() through
 * pci_for_each_dma_alias()'s single opaque pointer. */
struct domain_context_mapping_data {
	struct dmar_domain *domain;
	struct intel_iommu *iommu;
	struct pasid_table *table;
};

/* pci_for_each_dma_alias() callback: map one alias's context entry. */
static int domain_context_mapping_cb(struct pci_dev *pdev,
				     u16 alias, void *opaque)
{
	struct domain_context_mapping_data *data = opaque;

	return domain_context_mapping_one(data->domain, data->iommu,
					  data->table, PCI_BUS_NUM(alias),
					  alias & 0xff);
}

/*
 * Set up context entries for @dev (and, for PCI devices, every DMA
 * alias of it) pointing at @domain. Returns 0 or a negative errno.
 */
static int
domain_context_mapping(struct dmar_domain *domain, struct device *dev)
{
	struct domain_context_mapping_data data;
	struct pasid_table *table;
	struct intel_iommu *iommu;
	u8 bus, devfn;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return -ENODEV;

	table = intel_pasid_get_table(dev);

	if (!dev_is_pci(dev))
		return domain_context_mapping_one(domain, iommu, table,
						  bus, devfn);

	data.domain = domain;
	data.iommu = iommu;
	data.table = table;

	return pci_for_each_dma_alias(to_pci_dev(dev),
				      &domain_context_mapping_cb, &data);
}

/* Returns a number of VTD pages, but aligned to MM page size */
static inline unsigned long aligned_nrpages(unsigned long host_addr,
					    size_t size)
{
	host_addr &= ~PAGE_MASK;
	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
}

/* Return largest possible superpage level for a given mapping */
static inline int hardware_largepage_caps(struct dmar_domain *domain,
					  unsigned long iov_pfn,
					  unsigned long phy_pfn,
					  unsigned long pages)
{
	int support, level = 1;
	unsigned long pfnmerge;

	support = domain->iommu_superpage;

	/* To use a large page, the virtual *and* physical addresses
	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
	   of them will mean we have to use smaller pages. So just
	   merge them and check both at once. */
	pfnmerge = iov_pfn | phy_pfn;

	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
		pages >>= VTD_STRIDE_SHIFT;
		if (!pages)
			break;
		pfnmerge >>= VTD_STRIDE_SHIFT;
		level++;
		support--;
	}
	return level;
}

/*
 * Ensure that old small page tables are removed to make room for superpage(s).
 * We're going to add new large pages, so make sure we don't remove their parent
 * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
 */
static void switch_to_super_page(struct dmar_domain *domain,
				 unsigned long start_pfn,
				 unsigned long end_pfn, int level)
{
	unsigned long lvl_pages = lvl_to_nr_pages(level);
	struct iommu_domain_info *info;
	struct dma_pte *pte = NULL;
	unsigned long i;

	while (start_pfn <= end_pfn) {
		if (!pte)
			pte = pfn_to_dma_pte(domain, start_pfn, &level,
					     GFP_ATOMIC);

		if (dma_pte_present(pte)) {
			/* Free the sub-tables below (retain level + 1) ... */
			dma_pte_free_pagetable(domain, start_pfn,
					       start_pfn + lvl_pages - 1,
					       level + 1);

			/* ... and flush the cleared PDE/PTEs everywhere */
			xa_for_each(&domain->iommu_array, i, info)
				iommu_flush_iotlb_psi(info->iommu, domain,
						      start_pfn, lvl_pages,
						      0, 0);
		}

		pte++;
		start_pfn += lvl_pages;
		if (first_pte_in_page(pte))
			pte = NULL;
	}
}

/*
 * Map [iov_pfn, iov_pfn + nr_pages) to [phys_pfn, ...) in @domain's page
 * tables with protection @prot, using superpages where alignment and
 * hardware support allow. Returns 0 or a negative errno.
 */
static int
__domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
		 unsigned long phys_pfn, unsigned long nr_pages, int prot,
		 gfp_t gfp)
{
	struct dma_pte *first_pte = NULL, *pte = NULL;
	unsigned int largepage_lvl = 0;
	unsigned long lvl_pages = 0;
	phys_addr_t pteval;
	u64 attr;

	if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)))
		return -EINVAL;

	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
		return -EINVAL;

	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
	attr |= DMA_FL_PTE_PRESENT;
	if (domain->use_first_level) {
		attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
		if (prot & DMA_PTE_WRITE)
			attr |= DMA_FL_PTE_DIRTY;
	}

	pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;

	while (nr_pages > 0) {
		uint64_t tmp;

		if (!pte) {
			largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
					phys_pfn, nr_pages);

			pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl,
					     gfp);
			if (!pte)
				return -ENOMEM;
			first_pte = pte;

			lvl_pages =
lvl_to_nr_pages(largepage_lvl); 2223 2224 /* It is large page*/ 2225 if (largepage_lvl > 1) { 2226 unsigned long end_pfn; 2227 unsigned long pages_to_remove; 2228 2229 pteval |= DMA_PTE_LARGE_PAGE; 2230 pages_to_remove = min_t(unsigned long, nr_pages, 2231 nr_pte_to_next_page(pte) * lvl_pages); 2232 end_pfn = iov_pfn + pages_to_remove - 1; 2233 switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl); 2234 } else { 2235 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE; 2236 } 2237 2238 } 2239 /* We don't need lock here, nobody else 2240 * touches the iova range 2241 */ 2242 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval); 2243 if (tmp) { 2244 static int dumps = 5; 2245 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n", 2246 iov_pfn, tmp, (unsigned long long)pteval); 2247 if (dumps) { 2248 dumps--; 2249 debug_dma_dump_mappings(NULL); 2250 } 2251 WARN_ON(1); 2252 } 2253 2254 nr_pages -= lvl_pages; 2255 iov_pfn += lvl_pages; 2256 phys_pfn += lvl_pages; 2257 pteval += lvl_pages * VTD_PAGE_SIZE; 2258 2259 /* If the next PTE would be the first in a new page, then we 2260 * need to flush the cache on the entries we've just written. 2261 * And then we'll need to recalculate 'pte', so clear it and 2262 * let it get set again in the if (!pte) block above. 2263 * 2264 * If we're done (!nr_pages) we need to flush the cache too. 2265 * 2266 * Also if we've been setting superpages, we may need to 2267 * recalculate 'pte' and switch back to smaller pages for the 2268 * end of the mapping, if the trailing size is not enough to 2269 * use another superpage (i.e. nr_pages < lvl_pages). 
2270 */ 2271 pte++; 2272 if (!nr_pages || first_pte_in_page(pte) || 2273 (largepage_lvl > 1 && nr_pages < lvl_pages)) { 2274 domain_flush_cache(domain, first_pte, 2275 (void *)pte - (void *)first_pte); 2276 pte = NULL; 2277 } 2278 } 2279 2280 return 0; 2281 } 2282 2283 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn) 2284 { 2285 struct intel_iommu *iommu = info->iommu; 2286 struct context_entry *context; 2287 u16 did_old; 2288 2289 if (!iommu) 2290 return; 2291 2292 spin_lock(&iommu->lock); 2293 context = iommu_context_addr(iommu, bus, devfn, 0); 2294 if (!context) { 2295 spin_unlock(&iommu->lock); 2296 return; 2297 } 2298 2299 if (sm_supported(iommu)) { 2300 if (hw_pass_through && domain_type_is_si(info->domain)) 2301 did_old = FLPT_DEFAULT_DID; 2302 else 2303 did_old = domain_id_iommu(info->domain, iommu); 2304 } else { 2305 did_old = context_domain_id(context); 2306 } 2307 2308 context_clear_entry(context); 2309 __iommu_flush_cache(iommu, context, sizeof(*context)); 2310 spin_unlock(&iommu->lock); 2311 iommu->flush.flush_context(iommu, 2312 did_old, 2313 (((u16)bus) << 8) | devfn, 2314 DMA_CCMD_MASK_NOBIT, 2315 DMA_CCMD_DEVICE_INVL); 2316 2317 if (sm_supported(iommu)) 2318 qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0); 2319 2320 iommu->flush.flush_iotlb(iommu, 2321 did_old, 2322 0, 2323 0, 2324 DMA_TLB_DSI_FLUSH); 2325 2326 __iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH); 2327 } 2328 2329 static int domain_setup_first_level(struct intel_iommu *iommu, 2330 struct dmar_domain *domain, 2331 struct device *dev, 2332 u32 pasid) 2333 { 2334 struct dma_pte *pgd = domain->pgd; 2335 int agaw, level; 2336 int flags = 0; 2337 2338 /* 2339 * Skip top levels of page tables for iommu which has 2340 * less agaw than default. Unnecessary for PT mode. 
2341 */ 2342 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { 2343 pgd = phys_to_virt(dma_pte_addr(pgd)); 2344 if (!dma_pte_present(pgd)) 2345 return -ENOMEM; 2346 } 2347 2348 level = agaw_to_level(agaw); 2349 if (level != 4 && level != 5) 2350 return -EINVAL; 2351 2352 if (level == 5) 2353 flags |= PASID_FLAG_FL5LP; 2354 2355 if (domain->force_snooping) 2356 flags |= PASID_FLAG_PAGE_SNOOP; 2357 2358 return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid, 2359 domain_id_iommu(domain, iommu), 2360 flags); 2361 } 2362 2363 static bool dev_is_real_dma_subdevice(struct device *dev) 2364 { 2365 return dev && dev_is_pci(dev) && 2366 pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev); 2367 } 2368 2369 static int iommu_domain_identity_map(struct dmar_domain *domain, 2370 unsigned long first_vpfn, 2371 unsigned long last_vpfn) 2372 { 2373 /* 2374 * RMRR range might have overlap with physical memory range, 2375 * clear it first 2376 */ 2377 dma_pte_clear_range(domain, first_vpfn, last_vpfn); 2378 2379 return __domain_mapping(domain, first_vpfn, 2380 first_vpfn, last_vpfn - first_vpfn + 1, 2381 DMA_PTE_READ|DMA_PTE_WRITE, GFP_KERNEL); 2382 } 2383 2384 static int md_domain_init(struct dmar_domain *domain, int guest_width); 2385 2386 static int __init si_domain_init(int hw) 2387 { 2388 struct dmar_rmrr_unit *rmrr; 2389 struct device *dev; 2390 int i, nid, ret; 2391 2392 si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY); 2393 if (!si_domain) 2394 return -EFAULT; 2395 2396 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { 2397 domain_exit(si_domain); 2398 si_domain = NULL; 2399 return -EFAULT; 2400 } 2401 2402 if (hw) 2403 return 0; 2404 2405 for_each_online_node(nid) { 2406 unsigned long start_pfn, end_pfn; 2407 int i; 2408 2409 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 2410 ret = iommu_domain_identity_map(si_domain, 2411 mm_to_dma_pfn_start(start_pfn), 2412 mm_to_dma_pfn_end(end_pfn)); 2413 if (ret) 2414 return ret; 2415 } 
2416 } 2417 2418 /* 2419 * Identity map the RMRRs so that devices with RMRRs could also use 2420 * the si_domain. 2421 */ 2422 for_each_rmrr_units(rmrr) { 2423 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt, 2424 i, dev) { 2425 unsigned long long start = rmrr->base_address; 2426 unsigned long long end = rmrr->end_address; 2427 2428 if (WARN_ON(end < start || 2429 end >> agaw_to_width(si_domain->agaw))) 2430 continue; 2431 2432 ret = iommu_domain_identity_map(si_domain, 2433 mm_to_dma_pfn_start(start >> PAGE_SHIFT), 2434 mm_to_dma_pfn_end(end >> PAGE_SHIFT)); 2435 if (ret) 2436 return ret; 2437 } 2438 } 2439 2440 return 0; 2441 } 2442 2443 static int dmar_domain_attach_device(struct dmar_domain *domain, 2444 struct device *dev) 2445 { 2446 struct device_domain_info *info = dev_iommu_priv_get(dev); 2447 struct intel_iommu *iommu; 2448 unsigned long flags; 2449 u8 bus, devfn; 2450 int ret; 2451 2452 iommu = device_to_iommu(dev, &bus, &devfn); 2453 if (!iommu) 2454 return -ENODEV; 2455 2456 ret = domain_attach_iommu(domain, iommu); 2457 if (ret) 2458 return ret; 2459 info->domain = domain; 2460 spin_lock_irqsave(&domain->lock, flags); 2461 list_add(&info->link, &domain->devices); 2462 spin_unlock_irqrestore(&domain->lock, flags); 2463 2464 /* PASID table is mandatory for a PCI device in scalable mode. 
*/ 2465 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) { 2466 /* Setup the PASID entry for requests without PASID: */ 2467 if (hw_pass_through && domain_type_is_si(domain)) 2468 ret = intel_pasid_setup_pass_through(iommu, domain, 2469 dev, IOMMU_NO_PASID); 2470 else if (domain->use_first_level) 2471 ret = domain_setup_first_level(iommu, domain, dev, 2472 IOMMU_NO_PASID); 2473 else 2474 ret = intel_pasid_setup_second_level(iommu, domain, 2475 dev, IOMMU_NO_PASID); 2476 if (ret) { 2477 dev_err(dev, "Setup RID2PASID failed\n"); 2478 device_block_translation(dev); 2479 return ret; 2480 } 2481 } 2482 2483 ret = domain_context_mapping(domain, dev); 2484 if (ret) { 2485 dev_err(dev, "Domain context map failed\n"); 2486 device_block_translation(dev); 2487 return ret; 2488 } 2489 2490 iommu_enable_pci_caps(info); 2491 2492 return 0; 2493 } 2494 2495 /** 2496 * device_rmrr_is_relaxable - Test whether the RMRR of this device 2497 * is relaxable (ie. is allowed to be not enforced under some conditions) 2498 * @dev: device handle 2499 * 2500 * We assume that PCI USB devices with RMRRs have them largely 2501 * for historical reasons and that the RMRR space is not actively used post 2502 * boot. This exclusion may change if vendors begin to abuse it. 2503 * 2504 * The same exception is made for graphics devices, with the requirement that 2505 * any use of the RMRR regions will be torn down before assigning the device 2506 * to a guest. 2507 * 2508 * Return: true if the RMRR is relaxable, false otherwise 2509 */ 2510 static bool device_rmrr_is_relaxable(struct device *dev) 2511 { 2512 struct pci_dev *pdev; 2513 2514 if (!dev_is_pci(dev)) 2515 return false; 2516 2517 pdev = to_pci_dev(dev); 2518 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev)) 2519 return true; 2520 else 2521 return false; 2522 } 2523 2524 /* 2525 * Return the required default domain type for a specific device. 
2526 * 2527 * @dev: the device in query 2528 * @startup: true if this is during early boot 2529 * 2530 * Returns: 2531 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain 2532 * - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain 2533 * - 0: both identity and dynamic domains work for this device 2534 */ 2535 static int device_def_domain_type(struct device *dev) 2536 { 2537 if (dev_is_pci(dev)) { 2538 struct pci_dev *pdev = to_pci_dev(dev); 2539 2540 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev)) 2541 return IOMMU_DOMAIN_IDENTITY; 2542 2543 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev)) 2544 return IOMMU_DOMAIN_IDENTITY; 2545 } 2546 2547 return 0; 2548 } 2549 2550 static void intel_iommu_init_qi(struct intel_iommu *iommu) 2551 { 2552 /* 2553 * Start from the sane iommu hardware state. 2554 * If the queued invalidation is already initialized by us 2555 * (for example, while enabling interrupt-remapping) then 2556 * we got the things already rolling from a sane state. 2557 */ 2558 if (!iommu->qi) { 2559 /* 2560 * Clear any previous faults. 2561 */ 2562 dmar_fault(-1, iommu); 2563 /* 2564 * Disable queued invalidation if supported and already enabled 2565 * before OS handover. 
2566 */ 2567 dmar_disable_qi(iommu); 2568 } 2569 2570 if (dmar_enable_qi(iommu)) { 2571 /* 2572 * Queued Invalidate not enabled, use Register Based Invalidate 2573 */ 2574 iommu->flush.flush_context = __iommu_flush_context; 2575 iommu->flush.flush_iotlb = __iommu_flush_iotlb; 2576 pr_info("%s: Using Register based invalidation\n", 2577 iommu->name); 2578 } else { 2579 iommu->flush.flush_context = qi_flush_context; 2580 iommu->flush.flush_iotlb = qi_flush_iotlb; 2581 pr_info("%s: Using Queued invalidation\n", iommu->name); 2582 } 2583 } 2584 2585 static int copy_context_table(struct intel_iommu *iommu, 2586 struct root_entry *old_re, 2587 struct context_entry **tbl, 2588 int bus, bool ext) 2589 { 2590 int tbl_idx, pos = 0, idx, devfn, ret = 0, did; 2591 struct context_entry *new_ce = NULL, ce; 2592 struct context_entry *old_ce = NULL; 2593 struct root_entry re; 2594 phys_addr_t old_ce_phys; 2595 2596 tbl_idx = ext ? bus * 2 : bus; 2597 memcpy(&re, old_re, sizeof(re)); 2598 2599 for (devfn = 0; devfn < 256; devfn++) { 2600 /* First calculate the correct index */ 2601 idx = (ext ? 
devfn * 2 : devfn) % 256; 2602 2603 if (idx == 0) { 2604 /* First save what we may have and clean up */ 2605 if (new_ce) { 2606 tbl[tbl_idx] = new_ce; 2607 __iommu_flush_cache(iommu, new_ce, 2608 VTD_PAGE_SIZE); 2609 pos = 1; 2610 } 2611 2612 if (old_ce) 2613 memunmap(old_ce); 2614 2615 ret = 0; 2616 if (devfn < 0x80) 2617 old_ce_phys = root_entry_lctp(&re); 2618 else 2619 old_ce_phys = root_entry_uctp(&re); 2620 2621 if (!old_ce_phys) { 2622 if (ext && devfn == 0) { 2623 /* No LCTP, try UCTP */ 2624 devfn = 0x7f; 2625 continue; 2626 } else { 2627 goto out; 2628 } 2629 } 2630 2631 ret = -ENOMEM; 2632 old_ce = memremap(old_ce_phys, PAGE_SIZE, 2633 MEMREMAP_WB); 2634 if (!old_ce) 2635 goto out; 2636 2637 new_ce = alloc_pgtable_page(iommu->node, GFP_KERNEL); 2638 if (!new_ce) 2639 goto out_unmap; 2640 2641 ret = 0; 2642 } 2643 2644 /* Now copy the context entry */ 2645 memcpy(&ce, old_ce + idx, sizeof(ce)); 2646 2647 if (!context_present(&ce)) 2648 continue; 2649 2650 did = context_domain_id(&ce); 2651 if (did >= 0 && did < cap_ndoms(iommu->cap)) 2652 set_bit(did, iommu->domain_ids); 2653 2654 set_context_copied(iommu, bus, devfn); 2655 new_ce[idx] = ce; 2656 } 2657 2658 tbl[tbl_idx + pos] = new_ce; 2659 2660 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE); 2661 2662 out_unmap: 2663 memunmap(old_ce); 2664 2665 out: 2666 return ret; 2667 } 2668 2669 static int copy_translation_tables(struct intel_iommu *iommu) 2670 { 2671 struct context_entry **ctxt_tbls; 2672 struct root_entry *old_rt; 2673 phys_addr_t old_rt_phys; 2674 int ctxt_table_entries; 2675 u64 rtaddr_reg; 2676 int bus, ret; 2677 bool new_ext, ext; 2678 2679 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG); 2680 ext = !!(rtaddr_reg & DMA_RTADDR_SMT); 2681 new_ext = !!sm_supported(iommu); 2682 2683 /* 2684 * The RTT bit can only be changed when translation is disabled, 2685 * but disabling translation means to open a window for data 2686 * corruption. 
So bail out and don't copy anything if we would 2687 * have to change the bit. 2688 */ 2689 if (new_ext != ext) 2690 return -EINVAL; 2691 2692 iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL); 2693 if (!iommu->copied_tables) 2694 return -ENOMEM; 2695 2696 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK; 2697 if (!old_rt_phys) 2698 return -EINVAL; 2699 2700 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB); 2701 if (!old_rt) 2702 return -ENOMEM; 2703 2704 /* This is too big for the stack - allocate it from slab */ 2705 ctxt_table_entries = ext ? 512 : 256; 2706 ret = -ENOMEM; 2707 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL); 2708 if (!ctxt_tbls) 2709 goto out_unmap; 2710 2711 for (bus = 0; bus < 256; bus++) { 2712 ret = copy_context_table(iommu, &old_rt[bus], 2713 ctxt_tbls, bus, ext); 2714 if (ret) { 2715 pr_err("%s: Failed to copy context table for bus %d\n", 2716 iommu->name, bus); 2717 continue; 2718 } 2719 } 2720 2721 spin_lock(&iommu->lock); 2722 2723 /* Context tables are copied, now write them to the root_entry table */ 2724 for (bus = 0; bus < 256; bus++) { 2725 int idx = ext ? 
bus * 2 : bus; 2726 u64 val; 2727 2728 if (ctxt_tbls[idx]) { 2729 val = virt_to_phys(ctxt_tbls[idx]) | 1; 2730 iommu->root_entry[bus].lo = val; 2731 } 2732 2733 if (!ext || !ctxt_tbls[idx + 1]) 2734 continue; 2735 2736 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1; 2737 iommu->root_entry[bus].hi = val; 2738 } 2739 2740 spin_unlock(&iommu->lock); 2741 2742 kfree(ctxt_tbls); 2743 2744 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE); 2745 2746 ret = 0; 2747 2748 out_unmap: 2749 memunmap(old_rt); 2750 2751 return ret; 2752 } 2753 2754 static int __init init_dmars(void) 2755 { 2756 struct dmar_drhd_unit *drhd; 2757 struct intel_iommu *iommu; 2758 int ret; 2759 2760 ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL); 2761 if (ret) 2762 goto free_iommu; 2763 2764 for_each_iommu(iommu, drhd) { 2765 if (drhd->ignored) { 2766 iommu_disable_translation(iommu); 2767 continue; 2768 } 2769 2770 /* 2771 * Find the max pasid size of all IOMMU's in the system. 2772 * We need to ensure the system pasid table is no bigger 2773 * than the smallest supported. 2774 */ 2775 if (pasid_supported(iommu)) { 2776 u32 temp = 2 << ecap_pss(iommu->ecap); 2777 2778 intel_pasid_max_id = min_t(u32, temp, 2779 intel_pasid_max_id); 2780 } 2781 2782 intel_iommu_init_qi(iommu); 2783 2784 ret = iommu_init_domains(iommu); 2785 if (ret) 2786 goto free_iommu; 2787 2788 init_translation_status(iommu); 2789 2790 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) { 2791 iommu_disable_translation(iommu); 2792 clear_translation_pre_enabled(iommu); 2793 pr_warn("Translation was enabled for %s but we are not in kdump mode\n", 2794 iommu->name); 2795 } 2796 2797 /* 2798 * TBD: 2799 * we could share the same root & context tables 2800 * among all IOMMU's. Need to Split it later. 
2801 */ 2802 ret = iommu_alloc_root_entry(iommu); 2803 if (ret) 2804 goto free_iommu; 2805 2806 if (translation_pre_enabled(iommu)) { 2807 pr_info("Translation already enabled - trying to copy translation structures\n"); 2808 2809 ret = copy_translation_tables(iommu); 2810 if (ret) { 2811 /* 2812 * We found the IOMMU with translation 2813 * enabled - but failed to copy over the 2814 * old root-entry table. Try to proceed 2815 * by disabling translation now and 2816 * allocating a clean root-entry table. 2817 * This might cause DMAR faults, but 2818 * probably the dump will still succeed. 2819 */ 2820 pr_err("Failed to copy translation tables from previous kernel for %s\n", 2821 iommu->name); 2822 iommu_disable_translation(iommu); 2823 clear_translation_pre_enabled(iommu); 2824 } else { 2825 pr_info("Copied translation tables from previous kernel for %s\n", 2826 iommu->name); 2827 } 2828 } 2829 2830 if (!ecap_pass_through(iommu->ecap)) 2831 hw_pass_through = 0; 2832 intel_svm_check(iommu); 2833 } 2834 2835 /* 2836 * Now that qi is enabled on all iommus, set the root entry and flush 2837 * caches. This is required on some Intel X58 chipsets, otherwise the 2838 * flush_context function will loop forever and the boot hangs. 
2839 */ 2840 for_each_active_iommu(iommu, drhd) { 2841 iommu_flush_write_buffer(iommu); 2842 iommu_set_root_entry(iommu); 2843 } 2844 2845 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA 2846 dmar_map_gfx = 0; 2847 #endif 2848 2849 if (!dmar_map_gfx) 2850 iommu_identity_mapping |= IDENTMAP_GFX; 2851 2852 check_tylersburg_isoch(); 2853 2854 ret = si_domain_init(hw_pass_through); 2855 if (ret) 2856 goto free_iommu; 2857 2858 /* 2859 * for each drhd 2860 * enable fault log 2861 * global invalidate context cache 2862 * global invalidate iotlb 2863 * enable translation 2864 */ 2865 for_each_iommu(iommu, drhd) { 2866 if (drhd->ignored) { 2867 /* 2868 * we always have to disable PMRs or DMA may fail on 2869 * this device 2870 */ 2871 if (force_on) 2872 iommu_disable_protect_mem_regions(iommu); 2873 continue; 2874 } 2875 2876 iommu_flush_write_buffer(iommu); 2877 2878 #ifdef CONFIG_INTEL_IOMMU_SVM 2879 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) { 2880 /* 2881 * Call dmar_alloc_hwirq() with dmar_global_lock held, 2882 * could cause possible lock race condition. 
2883 */ 2884 up_write(&dmar_global_lock); 2885 ret = intel_svm_enable_prq(iommu); 2886 down_write(&dmar_global_lock); 2887 if (ret) 2888 goto free_iommu; 2889 } 2890 #endif 2891 ret = dmar_set_interrupt(iommu); 2892 if (ret) 2893 goto free_iommu; 2894 } 2895 2896 return 0; 2897 2898 free_iommu: 2899 for_each_active_iommu(iommu, drhd) { 2900 disable_dmar_iommu(iommu); 2901 free_dmar_iommu(iommu); 2902 } 2903 if (si_domain) { 2904 domain_exit(si_domain); 2905 si_domain = NULL; 2906 } 2907 2908 return ret; 2909 } 2910 2911 static void __init init_no_remapping_devices(void) 2912 { 2913 struct dmar_drhd_unit *drhd; 2914 struct device *dev; 2915 int i; 2916 2917 for_each_drhd_unit(drhd) { 2918 if (!drhd->include_all) { 2919 for_each_active_dev_scope(drhd->devices, 2920 drhd->devices_cnt, i, dev) 2921 break; 2922 /* ignore DMAR unit if no devices exist */ 2923 if (i == drhd->devices_cnt) 2924 drhd->ignored = 1; 2925 } 2926 } 2927 2928 for_each_active_drhd_unit(drhd) { 2929 if (drhd->include_all) 2930 continue; 2931 2932 for_each_active_dev_scope(drhd->devices, 2933 drhd->devices_cnt, i, dev) 2934 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev))) 2935 break; 2936 if (i < drhd->devices_cnt) 2937 continue; 2938 2939 /* This IOMMU has *only* gfx devices. 
		   Either bypass it or
		   set the gfx_mapped flag, as appropriate */
		drhd->gfx_dedicated = 1;
		if (!dmar_map_gfx)
			drhd->ignored = 1;
	}
}

#ifdef CONFIG_SUSPEND
/*
 * Re-enable queued invalidation (if it was in use) and translation on
 * every IOMMU after a suspend/resume cycle.
 */
static int init_iommu_hw(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu = NULL;
	int ret;

	for_each_active_iommu(iommu, drhd) {
		if (iommu->qi) {
			ret = dmar_reenable_qi(iommu);
			if (ret)
				return ret;
		}
	}

	for_each_iommu(iommu, drhd) {
		if (drhd->ignored) {
			/*
			 * we always have to disable PMRs or DMA may fail on
			 * this device
			 */
			if (force_on)
				iommu_disable_protect_mem_regions(iommu);
			continue;
		}

		iommu_flush_write_buffer(iommu);
		iommu_set_root_entry(iommu);
		iommu_enable_translation(iommu);
		iommu_disable_protect_mem_regions(iommu);
	}

	return 0;
}

/* Globally invalidate the context cache and IOTLB on every active IOMMU */
static void iommu_flush_all(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;

	for_each_active_iommu(iommu, drhd) {
		iommu->flush.flush_context(iommu, 0, 0, 0,
					   DMA_CCMD_GLOBAL_INVL);
		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
					 DMA_TLB_GLOBAL_FLUSH);
	}
}

/*
 * Syscore suspend hook: flush all caches, disable translation and save the
 * fault-event registers of every active IOMMU so iommu_resume() can
 * restore them.
 */
static int iommu_suspend(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu = NULL;
	unsigned long flag;

	for_each_active_iommu(iommu, drhd) {
		iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
					     GFP_KERNEL);
		if (!iommu->iommu_state)
			goto nomem;
	}

	iommu_flush_all();

	for_each_active_iommu(iommu, drhd) {
		iommu_disable_translation(iommu);

		raw_spin_lock_irqsave(&iommu->register_lock, flag);

		iommu->iommu_state[SR_DMAR_FECTL_REG] =
			readl(iommu->reg + DMAR_FECTL_REG);
		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
			readl(iommu->reg + DMAR_FEDATA_REG);
		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
			readl(iommu->reg + DMAR_FEADDR_REG);
		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
			readl(iommu->reg + DMAR_FEUADDR_REG);

		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
	}
	return 0;

nomem:
	/*
	 * NOTE(review): this unwind kfrees iommu_state for *all* active
	 * IOMMUs, including ones the allocation loop never reached —
	 * presumably those pointers are NULL so kfree() is a no-op, but
	 * they are not reset to NULL here; verify a repeated failed
	 * suspend cannot double-free.
	 */
	for_each_active_iommu(iommu, drhd)
		kfree(iommu->iommu_state);

	return -ENOMEM;
}

/*
 * Syscore resume hook: re-initialize the hardware and restore the
 * fault-event registers saved by iommu_suspend().
 */
static void iommu_resume(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu = NULL;
	unsigned long flag;

	if (init_iommu_hw()) {
		if (force_on)
			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
		else
			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
		return;
	}

	for_each_active_iommu(iommu, drhd) {

		raw_spin_lock_irqsave(&iommu->register_lock, flag);

		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
			iommu->reg + DMAR_FECTL_REG);
		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
			iommu->reg + DMAR_FEDATA_REG);
		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
			iommu->reg + DMAR_FEADDR_REG);
		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
			iommu->reg + DMAR_FEUADDR_REG);

		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
	}

	for_each_active_iommu(iommu, drhd)
		kfree(iommu->iommu_state);
}

static struct syscore_ops iommu_syscore_ops = {
	.resume		= iommu_resume,
	.suspend	= iommu_suspend,
};

static void __init init_iommu_pm_ops(void)
{
	register_syscore_ops(&iommu_syscore_ops);
}

#else
static inline void init_iommu_pm_ops(void) {}
#endif	/* CONFIG_PM */

/*
 * Validate an ACPI RMRR entry: page-aligned, non-empty range that also
 * passes the architecture-specific check.
 */
static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
{
	if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
	    !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
	    rmrr->end_address <= rmrr->base_address ||
	    arch_rmrr_sanity_check(rmrr))
		return -EINVAL;

	return 0;
}

int __init
dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
{
	struct acpi_dmar_reserved_memory *rmrr;
	struct dmar_rmrr_unit *rmrru;

	rmrr = (struct acpi_dmar_reserved_memory *)header;
	if (rmrr_sanity_check(rmrr)) {
		/* Broken firmware: warn and taint, but keep the entry */
		pr_warn(FW_BUG
			   "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
			   rmrr->base_address, rmrr->end_address,
			   dmi_get_system_info(DMI_BIOS_VENDOR),
			   dmi_get_system_info(DMI_BIOS_VERSION),
			   dmi_get_system_info(DMI_PRODUCT_VERSION));
		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
	}

	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
	if (!rmrru)
		goto out;

	rmrru->hdr = header;

	rmrru->base_address = rmrr->base_address;
	rmrru->end_address = rmrr->end_address;

	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
				((void *)rmrr) + rmrr->header.length,
				&rmrru->devices_cnt);
	if (rmrru->devices_cnt && rmrru->devices == NULL)
		goto free_rmrru;

	list_add(&rmrru->list, &dmar_rmrr_units);

	return 0;
free_rmrru:
	kfree(rmrru);
out:
	return -ENOMEM;
}

/* Find an already-registered ATSR unit matching @atsr, or NULL */
static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
{
	struct dmar_atsr_unit *atsru;
	struct acpi_dmar_atsr *tmp;

	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
				dmar_rcu_check()) {
		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
		if (atsr->segment != tmp->segment)
			continue;
		if (atsr->header.length != tmp->header.length)
			continue;
		if (memcmp(atsr, tmp, atsr->header.length) == 0)
			return atsru;
	}

	return NULL;
}

/* Parse one ACPI ATSR entry and register it (ignoring duplicates) */
int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
{
	struct acpi_dmar_atsr *atsr;
	struct dmar_atsr_unit *atsru;

	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
		return 0;

	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
	atsru = dmar_find_atsr(atsr);
	if (atsru)
		return 0;

	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
	if (!atsru)
		return -ENOMEM;

	/*
	 * If memory is allocated from slab by ACPI _DSM method, we need to
	 * copy the memory content because the memory buffer will be freed
	 * on return.
	 */
	atsru->hdr = (void *)(atsru + 1);
	memcpy(atsru->hdr, hdr, hdr->length);
	atsru->include_all = atsr->flags & 0x1;
	if (!atsru->include_all) {
		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
				(void *)atsr + atsr->header.length,
				&atsru->devices_cnt);
		if (atsru->devices_cnt && atsru->devices == NULL) {
			kfree(atsru);
			return -ENOMEM;
		}
	}

	list_add_rcu(&atsru->list, &dmar_atsr_units);

	return 0;
}

static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
{
	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
	kfree(atsru);
}

/* Unregister and free the ATSR unit matching @hdr, if any */
int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
{
	struct acpi_dmar_atsr *atsr;
	struct dmar_atsr_unit *atsru;

	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
	atsru = dmar_find_atsr(atsr);
	if (atsru) {
		list_del_rcu(&atsru->list);
		synchronize_rcu();
		intel_iommu_free_atsr(atsru);
	}

	return 0;
}

/* Return -EBUSY if the ATSR matching @hdr still has active devices */
int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
{
	int i;
	struct device *dev;
	struct acpi_dmar_atsr *atsr;
	struct dmar_atsr_unit *atsru;

	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
	atsru = dmar_find_atsr(atsr);
	if (!atsru)
		return 0;

	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
					  i, dev)
			return -EBUSY;
	}

	return 0;
}
/* Find an already-registered SATC unit matching @satc, or NULL */
static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
{
	struct dmar_satc_unit *satcu;
	struct acpi_dmar_satc *tmp;

	list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
				dmar_rcu_check()) {
		tmp = (struct acpi_dmar_satc *)satcu->hdr;
		if (satc->segment != tmp->segment)
			continue;
		if (satc->header.length != tmp->header.length)
			continue;
		if (memcmp(satc, tmp, satc->header.length) == 0)
			return satcu;
	}

	return NULL;
}

/* Parse one ACPI SATC entry and register it (ignoring duplicates) */
int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
{
	struct acpi_dmar_satc *satc;
	struct dmar_satc_unit *satcu;

	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
		return 0;

	satc = container_of(hdr, struct acpi_dmar_satc, header);
	satcu = dmar_find_satc(satc);
	if (satcu)
		return 0;

	satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
	if (!satcu)
		return -ENOMEM;

	/* Copy the header: the ACPI buffer may be freed on return */
	satcu->hdr = (void *)(satcu + 1);
	memcpy(satcu->hdr, hdr, hdr->length);
	satcu->atc_required = satc->flags & 0x1;
	satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
					      (void *)satc + satc->header.length,
					      &satcu->devices_cnt);
	if (satcu->devices_cnt && !satcu->devices) {
		kfree(satcu);
		return -ENOMEM;
	}
	list_add_rcu(&satcu->list, &dmar_satc_units);

	return 0;
}

/*
 * Bring up a hot-added DMAR unit: audit capabilities, allocate domain and
 * root-entry structures, enable invalidation, fault interrupt and
 * translation.  Returns 0 or a negative errno.
 */
static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
{
	int sp, ret;
	struct intel_iommu *iommu = dmaru->iommu;

	ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
	if (ret)
		goto out;

	if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
		pr_warn("%s: Doesn't support hardware pass through.\n",
			iommu->name);
		return -ENXIO;
	}

	sp = domain_update_iommu_superpage(NULL, iommu) - 1;
	if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
		pr_warn("%s: Doesn't support large page.\n",
			iommu->name);
		return -ENXIO;
	}

	/*
	 * Disable translation if already enabled prior to OS handover.
	 */
	if (iommu->gcmd & DMA_GCMD_TE)
		iommu_disable_translation(iommu);

	ret = iommu_init_domains(iommu);
	if (ret == 0)
		ret = iommu_alloc_root_entry(iommu);
	if (ret)
		goto out;

	intel_svm_check(iommu);

	if (dmaru->ignored) {
		/*
		 * we always have to disable PMRs or DMA may fail on this device
		 */
		if (force_on)
			iommu_disable_protect_mem_regions(iommu);
		return 0;
	}

	intel_iommu_init_qi(iommu);
	iommu_flush_write_buffer(iommu);

#ifdef CONFIG_INTEL_IOMMU_SVM
	if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
		ret = intel_svm_enable_prq(iommu);
		if (ret)
			goto disable_iommu;
	}
#endif
	ret = dmar_set_interrupt(iommu);
	if (ret)
		goto disable_iommu;

	iommu_set_root_entry(iommu);
	iommu_enable_translation(iommu);

	iommu_disable_protect_mem_regions(iommu);
	return 0;

disable_iommu:
	disable_dmar_iommu(iommu);
out:
	free_dmar_iommu(iommu);
	return ret;
}

/* Hot-(un)plug entry point: insert brings the unit up, remove tears it down */
int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
{
	int ret = 0;
	struct intel_iommu *iommu = dmaru->iommu;

	if (!intel_iommu_enabled)
		return 0;
	if (iommu == NULL)
		return -EINVAL;

	if (insert) {
		ret = intel_iommu_add(dmaru);
	} else {
		disable_dmar_iommu(iommu);
		free_dmar_iommu(iommu);
	}

	return ret;
}

/* Free every registered RMRR, ATSR and SATC unit */
static void intel_iommu_free_dmars(void)
{
	struct dmar_rmrr_unit *rmrru, *rmrr_n;
	struct dmar_atsr_unit *atsru, *atsr_n;
	struct dmar_satc_unit *satcu, *satc_n;

	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
		list_del(&rmrru->list);
		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
		kfree(rmrru);
	}

	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
		list_del(&atsru->list);
		intel_iommu_free_atsr(atsru);
	}
	list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
		list_del(&satcu->list);
		dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
		kfree(satcu);
	}
}

/* Find the SATC unit whose device scope covers @dev (physfn), or NULL */
static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
{
	struct dmar_satc_unit *satcu;
	struct acpi_dmar_satc *satc;
	struct device *tmp;
	int i;

	dev = pci_physfn(dev);
	rcu_read_lock();

	list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
		if (satc->segment != pci_domain_nr(dev->bus))
			continue;
		for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
			if (to_pci_dev(tmp) == dev)
				goto out;
	}
	satcu = NULL;
out:
	rcu_read_unlock();
	return satcu;
}

/*
 * Decide whether the OS may enable ATS for @dev on @iommu, based on the
 * SATC table, the device's position in the PCIe hierarchy and the ATSR
 * entries for its root port.  Returns 1 if ATS may be enabled, else 0.
 */
static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
{
	int i, ret = 1;
	struct pci_bus *bus;
	struct pci_dev *bridge = NULL;
	struct device *tmp;
	struct acpi_dmar_atsr *atsr;
	struct dmar_atsr_unit *atsru;
	struct dmar_satc_unit *satcu;

	dev = pci_physfn(dev);
	satcu = dmar_find_matched_satc_unit(dev);
	if (satcu)
		/*
		 * This device supports ATS as it is in SATC table.
		 * When IOMMU is in legacy mode, enabling ATS is done
		 * automatically by HW for the device that requires
		 * ATS, hence OS should not enable this device ATS
		 * to avoid duplicated TLB invalidation.
3446 */ 3447 return !(satcu->atc_required && !sm_supported(iommu)); 3448 3449 for (bus = dev->bus; bus; bus = bus->parent) { 3450 bridge = bus->self; 3451 /* If it's an integrated device, allow ATS */ 3452 if (!bridge) 3453 return 1; 3454 /* Connected via non-PCIe: no ATS */ 3455 if (!pci_is_pcie(bridge) || 3456 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE) 3457 return 0; 3458 /* If we found the root port, look it up in the ATSR */ 3459 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT) 3460 break; 3461 } 3462 3463 rcu_read_lock(); 3464 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) { 3465 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header); 3466 if (atsr->segment != pci_domain_nr(dev->bus)) 3467 continue; 3468 3469 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp) 3470 if (tmp == &bridge->dev) 3471 goto out; 3472 3473 if (atsru->include_all) 3474 goto out; 3475 } 3476 ret = 0; 3477 out: 3478 rcu_read_unlock(); 3479 3480 return ret; 3481 } 3482 3483 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info) 3484 { 3485 int ret; 3486 struct dmar_rmrr_unit *rmrru; 3487 struct dmar_atsr_unit *atsru; 3488 struct dmar_satc_unit *satcu; 3489 struct acpi_dmar_atsr *atsr; 3490 struct acpi_dmar_reserved_memory *rmrr; 3491 struct acpi_dmar_satc *satc; 3492 3493 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING) 3494 return 0; 3495 3496 list_for_each_entry(rmrru, &dmar_rmrr_units, list) { 3497 rmrr = container_of(rmrru->hdr, 3498 struct acpi_dmar_reserved_memory, header); 3499 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 3500 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1), 3501 ((void *)rmrr) + rmrr->header.length, 3502 rmrr->segment, rmrru->devices, 3503 rmrru->devices_cnt); 3504 if (ret < 0) 3505 return ret; 3506 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 3507 dmar_remove_dev_scope(info, rmrr->segment, 3508 rmrru->devices, rmrru->devices_cnt); 3509 } 3510 } 3511 3512 list_for_each_entry(atsru, 
&dmar_atsr_units, list) { 3513 if (atsru->include_all) 3514 continue; 3515 3516 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header); 3517 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 3518 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1), 3519 (void *)atsr + atsr->header.length, 3520 atsr->segment, atsru->devices, 3521 atsru->devices_cnt); 3522 if (ret > 0) 3523 break; 3524 else if (ret < 0) 3525 return ret; 3526 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 3527 if (dmar_remove_dev_scope(info, atsr->segment, 3528 atsru->devices, atsru->devices_cnt)) 3529 break; 3530 } 3531 } 3532 list_for_each_entry(satcu, &dmar_satc_units, list) { 3533 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header); 3534 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 3535 ret = dmar_insert_dev_scope(info, (void *)(satc + 1), 3536 (void *)satc + satc->header.length, 3537 satc->segment, satcu->devices, 3538 satcu->devices_cnt); 3539 if (ret > 0) 3540 break; 3541 else if (ret < 0) 3542 return ret; 3543 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 3544 if (dmar_remove_dev_scope(info, satc->segment, 3545 satcu->devices, satcu->devices_cnt)) 3546 break; 3547 } 3548 } 3549 3550 return 0; 3551 } 3552 3553 static int intel_iommu_memory_notifier(struct notifier_block *nb, 3554 unsigned long val, void *v) 3555 { 3556 struct memory_notify *mhp = v; 3557 unsigned long start_vpfn = mm_to_dma_pfn_start(mhp->start_pfn); 3558 unsigned long last_vpfn = mm_to_dma_pfn_end(mhp->start_pfn + 3559 mhp->nr_pages - 1); 3560 3561 switch (val) { 3562 case MEM_GOING_ONLINE: 3563 if (iommu_domain_identity_map(si_domain, 3564 start_vpfn, last_vpfn)) { 3565 pr_warn("Failed to build identity map for [%lx-%lx]\n", 3566 start_vpfn, last_vpfn); 3567 return NOTIFY_BAD; 3568 } 3569 break; 3570 3571 case MEM_OFFLINE: 3572 case MEM_CANCEL_ONLINE: 3573 { 3574 struct dmar_drhd_unit *drhd; 3575 struct intel_iommu *iommu; 3576 LIST_HEAD(freelist); 3577 3578 domain_unmap(si_domain, start_vpfn, 
last_vpfn, &freelist); 3579 3580 rcu_read_lock(); 3581 for_each_active_iommu(iommu, drhd) 3582 iommu_flush_iotlb_psi(iommu, si_domain, 3583 start_vpfn, mhp->nr_pages, 3584 list_empty(&freelist), 0); 3585 rcu_read_unlock(); 3586 put_pages_list(&freelist); 3587 } 3588 break; 3589 } 3590 3591 return NOTIFY_OK; 3592 } 3593 3594 static struct notifier_block intel_iommu_memory_nb = { 3595 .notifier_call = intel_iommu_memory_notifier, 3596 .priority = 0 3597 }; 3598 3599 static void intel_disable_iommus(void) 3600 { 3601 struct intel_iommu *iommu = NULL; 3602 struct dmar_drhd_unit *drhd; 3603 3604 for_each_iommu(iommu, drhd) 3605 iommu_disable_translation(iommu); 3606 } 3607 3608 void intel_iommu_shutdown(void) 3609 { 3610 struct dmar_drhd_unit *drhd; 3611 struct intel_iommu *iommu = NULL; 3612 3613 if (no_iommu || dmar_disabled) 3614 return; 3615 3616 down_write(&dmar_global_lock); 3617 3618 /* Disable PMRs explicitly here. */ 3619 for_each_iommu(iommu, drhd) 3620 iommu_disable_protect_mem_regions(iommu); 3621 3622 /* Make sure the IOMMUs are switched off */ 3623 intel_disable_iommus(); 3624 3625 up_write(&dmar_global_lock); 3626 } 3627 3628 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev) 3629 { 3630 struct iommu_device *iommu_dev = dev_to_iommu_device(dev); 3631 3632 return container_of(iommu_dev, struct intel_iommu, iommu); 3633 } 3634 3635 static ssize_t version_show(struct device *dev, 3636 struct device_attribute *attr, char *buf) 3637 { 3638 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3639 u32 ver = readl(iommu->reg + DMAR_VER_REG); 3640 return sysfs_emit(buf, "%d:%d\n", 3641 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver)); 3642 } 3643 static DEVICE_ATTR_RO(version); 3644 3645 static ssize_t address_show(struct device *dev, 3646 struct device_attribute *attr, char *buf) 3647 { 3648 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3649 return sysfs_emit(buf, "%llx\n", iommu->reg_phys); 3650 } 3651 static DEVICE_ATTR_RO(address); 3652 
3653 static ssize_t cap_show(struct device *dev, 3654 struct device_attribute *attr, char *buf) 3655 { 3656 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3657 return sysfs_emit(buf, "%llx\n", iommu->cap); 3658 } 3659 static DEVICE_ATTR_RO(cap); 3660 3661 static ssize_t ecap_show(struct device *dev, 3662 struct device_attribute *attr, char *buf) 3663 { 3664 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3665 return sysfs_emit(buf, "%llx\n", iommu->ecap); 3666 } 3667 static DEVICE_ATTR_RO(ecap); 3668 3669 static ssize_t domains_supported_show(struct device *dev, 3670 struct device_attribute *attr, char *buf) 3671 { 3672 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3673 return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap)); 3674 } 3675 static DEVICE_ATTR_RO(domains_supported); 3676 3677 static ssize_t domains_used_show(struct device *dev, 3678 struct device_attribute *attr, char *buf) 3679 { 3680 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3681 return sysfs_emit(buf, "%d\n", 3682 bitmap_weight(iommu->domain_ids, 3683 cap_ndoms(iommu->cap))); 3684 } 3685 static DEVICE_ATTR_RO(domains_used); 3686 3687 static struct attribute *intel_iommu_attrs[] = { 3688 &dev_attr_version.attr, 3689 &dev_attr_address.attr, 3690 &dev_attr_cap.attr, 3691 &dev_attr_ecap.attr, 3692 &dev_attr_domains_supported.attr, 3693 &dev_attr_domains_used.attr, 3694 NULL, 3695 }; 3696 3697 static struct attribute_group intel_iommu_group = { 3698 .name = "intel-iommu", 3699 .attrs = intel_iommu_attrs, 3700 }; 3701 3702 const struct attribute_group *intel_iommu_groups[] = { 3703 &intel_iommu_group, 3704 NULL, 3705 }; 3706 3707 static inline bool has_external_pci(void) 3708 { 3709 struct pci_dev *pdev = NULL; 3710 3711 for_each_pci_dev(pdev) 3712 if (pdev->external_facing) { 3713 pci_dev_put(pdev); 3714 return true; 3715 } 3716 3717 return false; 3718 } 3719 3720 static int __init platform_optin_force_iommu(void) 3721 { 3722 if (!dmar_platform_optin() || no_platform_optin || 
!has_external_pci()) 3723 return 0; 3724 3725 if (no_iommu || dmar_disabled) 3726 pr_info("Intel-IOMMU force enabled due to platform opt in\n"); 3727 3728 /* 3729 * If Intel-IOMMU is disabled by default, we will apply identity 3730 * map for all devices except those marked as being untrusted. 3731 */ 3732 if (dmar_disabled) 3733 iommu_set_default_passthrough(false); 3734 3735 dmar_disabled = 0; 3736 no_iommu = 0; 3737 3738 return 1; 3739 } 3740 3741 static int __init probe_acpi_namespace_devices(void) 3742 { 3743 struct dmar_drhd_unit *drhd; 3744 /* To avoid a -Wunused-but-set-variable warning. */ 3745 struct intel_iommu *iommu __maybe_unused; 3746 struct device *dev; 3747 int i, ret = 0; 3748 3749 for_each_active_iommu(iommu, drhd) { 3750 for_each_active_dev_scope(drhd->devices, 3751 drhd->devices_cnt, i, dev) { 3752 struct acpi_device_physical_node *pn; 3753 struct acpi_device *adev; 3754 3755 if (dev->bus != &acpi_bus_type) 3756 continue; 3757 3758 adev = to_acpi_device(dev); 3759 mutex_lock(&adev->physical_node_lock); 3760 list_for_each_entry(pn, 3761 &adev->physical_node_list, node) { 3762 ret = iommu_probe_device(pn->dev); 3763 if (ret) 3764 break; 3765 } 3766 mutex_unlock(&adev->physical_node_lock); 3767 3768 if (ret) 3769 return ret; 3770 } 3771 } 3772 3773 return 0; 3774 } 3775 3776 static __init int tboot_force_iommu(void) 3777 { 3778 if (!tboot_enabled()) 3779 return 0; 3780 3781 if (no_iommu || dmar_disabled) 3782 pr_warn("Forcing Intel-IOMMU to enabled\n"); 3783 3784 dmar_disabled = 0; 3785 no_iommu = 0; 3786 3787 return 1; 3788 } 3789 3790 int __init intel_iommu_init(void) 3791 { 3792 int ret = -ENODEV; 3793 struct dmar_drhd_unit *drhd; 3794 struct intel_iommu *iommu; 3795 3796 /* 3797 * Intel IOMMU is required for a TXT/tboot launch or platform 3798 * opt in, so enforce that. 
3799 */ 3800 force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) || 3801 platform_optin_force_iommu(); 3802 3803 down_write(&dmar_global_lock); 3804 if (dmar_table_init()) { 3805 if (force_on) 3806 panic("tboot: Failed to initialize DMAR table\n"); 3807 goto out_free_dmar; 3808 } 3809 3810 if (dmar_dev_scope_init() < 0) { 3811 if (force_on) 3812 panic("tboot: Failed to initialize DMAR device scope\n"); 3813 goto out_free_dmar; 3814 } 3815 3816 up_write(&dmar_global_lock); 3817 3818 /* 3819 * The bus notifier takes the dmar_global_lock, so lockdep will 3820 * complain later when we register it under the lock. 3821 */ 3822 dmar_register_bus_notifier(); 3823 3824 down_write(&dmar_global_lock); 3825 3826 if (!no_iommu) 3827 intel_iommu_debugfs_init(); 3828 3829 if (no_iommu || dmar_disabled) { 3830 /* 3831 * We exit the function here to ensure IOMMU's remapping and 3832 * mempool aren't setup, which means that the IOMMU's PMRs 3833 * won't be disabled via the call to init_dmars(). So disable 3834 * it explicitly here. The PMRs were setup by tboot prior to 3835 * calling SENTER, but the kernel is expected to reset/tear 3836 * down the PMRs. 
3837 */ 3838 if (intel_iommu_tboot_noforce) { 3839 for_each_iommu(iommu, drhd) 3840 iommu_disable_protect_mem_regions(iommu); 3841 } 3842 3843 /* 3844 * Make sure the IOMMUs are switched off, even when we 3845 * boot into a kexec kernel and the previous kernel left 3846 * them enabled 3847 */ 3848 intel_disable_iommus(); 3849 goto out_free_dmar; 3850 } 3851 3852 if (list_empty(&dmar_rmrr_units)) 3853 pr_info("No RMRR found\n"); 3854 3855 if (list_empty(&dmar_atsr_units)) 3856 pr_info("No ATSR found\n"); 3857 3858 if (list_empty(&dmar_satc_units)) 3859 pr_info("No SATC found\n"); 3860 3861 init_no_remapping_devices(); 3862 3863 ret = init_dmars(); 3864 if (ret) { 3865 if (force_on) 3866 panic("tboot: Failed to initialize DMARs\n"); 3867 pr_err("Initialization failed\n"); 3868 goto out_free_dmar; 3869 } 3870 up_write(&dmar_global_lock); 3871 3872 init_iommu_pm_ops(); 3873 3874 down_read(&dmar_global_lock); 3875 for_each_active_iommu(iommu, drhd) { 3876 /* 3877 * The flush queue implementation does not perform 3878 * page-selective invalidations that are required for efficient 3879 * TLB flushes in virtual environments. The benefit of batching 3880 * is likely to be much lower than the overhead of synchronizing 3881 * the virtual and physical IOMMU page-tables. 
3882 */ 3883 if (cap_caching_mode(iommu->cap) && 3884 !first_level_by_default(IOMMU_DOMAIN_DMA)) { 3885 pr_info_once("IOMMU batching disallowed due to virtualization\n"); 3886 iommu_set_dma_strict(); 3887 } 3888 iommu_device_sysfs_add(&iommu->iommu, NULL, 3889 intel_iommu_groups, 3890 "%s", iommu->name); 3891 iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL); 3892 3893 iommu_pmu_register(iommu); 3894 } 3895 up_read(&dmar_global_lock); 3896 3897 if (si_domain && !hw_pass_through) 3898 register_memory_notifier(&intel_iommu_memory_nb); 3899 3900 down_read(&dmar_global_lock); 3901 if (probe_acpi_namespace_devices()) 3902 pr_warn("ACPI name space devices didn't probe correctly\n"); 3903 3904 /* Finally, we enable the DMA remapping hardware. */ 3905 for_each_iommu(iommu, drhd) { 3906 if (!drhd->ignored && !translation_pre_enabled(iommu)) 3907 iommu_enable_translation(iommu); 3908 3909 iommu_disable_protect_mem_regions(iommu); 3910 } 3911 up_read(&dmar_global_lock); 3912 3913 pr_info("Intel(R) Virtualization Technology for Directed I/O\n"); 3914 3915 intel_iommu_enabled = 1; 3916 3917 return 0; 3918 3919 out_free_dmar: 3920 intel_iommu_free_dmars(); 3921 up_write(&dmar_global_lock); 3922 return ret; 3923 } 3924 3925 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque) 3926 { 3927 struct device_domain_info *info = opaque; 3928 3929 domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff); 3930 return 0; 3931 } 3932 3933 /* 3934 * NB - intel-iommu lacks any sort of reference counting for the users of 3935 * dependent devices. If multiple endpoints have intersecting dependent 3936 * devices, unbinding the driver from any one of them will possibly leave 3937 * the others unable to operate. 
3938 */ 3939 static void domain_context_clear(struct device_domain_info *info) 3940 { 3941 if (!info->iommu || !info->dev || !dev_is_pci(info->dev)) 3942 return; 3943 3944 pci_for_each_dma_alias(to_pci_dev(info->dev), 3945 &domain_context_clear_one_cb, info); 3946 } 3947 3948 static void dmar_remove_one_dev_info(struct device *dev) 3949 { 3950 struct device_domain_info *info = dev_iommu_priv_get(dev); 3951 struct dmar_domain *domain = info->domain; 3952 struct intel_iommu *iommu = info->iommu; 3953 unsigned long flags; 3954 3955 if (!dev_is_real_dma_subdevice(info->dev)) { 3956 if (dev_is_pci(info->dev) && sm_supported(iommu)) 3957 intel_pasid_tear_down_entry(iommu, info->dev, 3958 IOMMU_NO_PASID, false); 3959 3960 iommu_disable_pci_caps(info); 3961 domain_context_clear(info); 3962 } 3963 3964 spin_lock_irqsave(&domain->lock, flags); 3965 list_del(&info->link); 3966 spin_unlock_irqrestore(&domain->lock, flags); 3967 3968 domain_detach_iommu(domain, iommu); 3969 info->domain = NULL; 3970 } 3971 3972 /* 3973 * Clear the page table pointer in context or pasid table entries so that 3974 * all DMA requests without PASID from the device are blocked. If the page 3975 * table has been set, clean up the data structures. 
3976 */ 3977 static void device_block_translation(struct device *dev) 3978 { 3979 struct device_domain_info *info = dev_iommu_priv_get(dev); 3980 struct intel_iommu *iommu = info->iommu; 3981 unsigned long flags; 3982 3983 iommu_disable_pci_caps(info); 3984 if (!dev_is_real_dma_subdevice(dev)) { 3985 if (sm_supported(iommu)) 3986 intel_pasid_tear_down_entry(iommu, dev, 3987 IOMMU_NO_PASID, false); 3988 else 3989 domain_context_clear(info); 3990 } 3991 3992 if (!info->domain) 3993 return; 3994 3995 spin_lock_irqsave(&info->domain->lock, flags); 3996 list_del(&info->link); 3997 spin_unlock_irqrestore(&info->domain->lock, flags); 3998 3999 domain_detach_iommu(info->domain, iommu); 4000 info->domain = NULL; 4001 } 4002 4003 static int md_domain_init(struct dmar_domain *domain, int guest_width) 4004 { 4005 int adjust_width; 4006 4007 /* calculate AGAW */ 4008 domain->gaw = guest_width; 4009 adjust_width = guestwidth_to_adjustwidth(guest_width); 4010 domain->agaw = width_to_agaw(adjust_width); 4011 4012 domain->iommu_coherency = false; 4013 domain->iommu_superpage = 0; 4014 domain->max_addr = 0; 4015 4016 /* always allocate the top pgd */ 4017 domain->pgd = alloc_pgtable_page(domain->nid, GFP_ATOMIC); 4018 if (!domain->pgd) 4019 return -ENOMEM; 4020 domain_flush_cache(domain, domain->pgd, PAGE_SIZE); 4021 return 0; 4022 } 4023 4024 static int blocking_domain_attach_dev(struct iommu_domain *domain, 4025 struct device *dev) 4026 { 4027 device_block_translation(dev); 4028 return 0; 4029 } 4030 4031 static struct iommu_domain blocking_domain = { 4032 .ops = &(const struct iommu_domain_ops) { 4033 .attach_dev = blocking_domain_attach_dev, 4034 .free = intel_iommu_domain_free 4035 } 4036 }; 4037 4038 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type) 4039 { 4040 struct dmar_domain *dmar_domain; 4041 struct iommu_domain *domain; 4042 4043 switch (type) { 4044 case IOMMU_DOMAIN_BLOCKED: 4045 return &blocking_domain; 4046 case IOMMU_DOMAIN_DMA: 4047 case 
IOMMU_DOMAIN_UNMANAGED: 4048 dmar_domain = alloc_domain(type); 4049 if (!dmar_domain) { 4050 pr_err("Can't allocate dmar_domain\n"); 4051 return NULL; 4052 } 4053 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { 4054 pr_err("Domain initialization failed\n"); 4055 domain_exit(dmar_domain); 4056 return NULL; 4057 } 4058 4059 domain = &dmar_domain->domain; 4060 domain->geometry.aperture_start = 0; 4061 domain->geometry.aperture_end = 4062 __DOMAIN_MAX_ADDR(dmar_domain->gaw); 4063 domain->geometry.force_aperture = true; 4064 4065 return domain; 4066 case IOMMU_DOMAIN_IDENTITY: 4067 return &si_domain->domain; 4068 case IOMMU_DOMAIN_SVA: 4069 return intel_svm_domain_alloc(); 4070 default: 4071 return NULL; 4072 } 4073 4074 return NULL; 4075 } 4076 4077 static void intel_iommu_domain_free(struct iommu_domain *domain) 4078 { 4079 if (domain != &si_domain->domain && domain != &blocking_domain) 4080 domain_exit(to_dmar_domain(domain)); 4081 } 4082 4083 static int prepare_domain_attach_device(struct iommu_domain *domain, 4084 struct device *dev) 4085 { 4086 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4087 struct intel_iommu *iommu; 4088 int addr_width; 4089 4090 iommu = device_to_iommu(dev, NULL, NULL); 4091 if (!iommu) 4092 return -ENODEV; 4093 4094 if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap)) 4095 return -EINVAL; 4096 4097 /* check if this iommu agaw is sufficient for max mapped address */ 4098 addr_width = agaw_to_width(iommu->agaw); 4099 if (addr_width > cap_mgaw(iommu->cap)) 4100 addr_width = cap_mgaw(iommu->cap); 4101 4102 if (dmar_domain->max_addr > (1LL << addr_width)) 4103 return -EINVAL; 4104 dmar_domain->gaw = addr_width; 4105 4106 /* 4107 * Knock out extra levels of page tables if necessary 4108 */ 4109 while (iommu->agaw < dmar_domain->agaw) { 4110 struct dma_pte *pte; 4111 4112 pte = dmar_domain->pgd; 4113 if (dma_pte_present(pte)) { 4114 dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte)); 4115 
free_pgtable_page(pte); 4116 } 4117 dmar_domain->agaw--; 4118 } 4119 4120 return 0; 4121 } 4122 4123 static int intel_iommu_attach_device(struct iommu_domain *domain, 4124 struct device *dev) 4125 { 4126 struct device_domain_info *info = dev_iommu_priv_get(dev); 4127 int ret; 4128 4129 if (info->domain) 4130 device_block_translation(dev); 4131 4132 ret = prepare_domain_attach_device(domain, dev); 4133 if (ret) 4134 return ret; 4135 4136 return dmar_domain_attach_device(to_dmar_domain(domain), dev); 4137 } 4138 4139 static int intel_iommu_map(struct iommu_domain *domain, 4140 unsigned long iova, phys_addr_t hpa, 4141 size_t size, int iommu_prot, gfp_t gfp) 4142 { 4143 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4144 u64 max_addr; 4145 int prot = 0; 4146 4147 if (iommu_prot & IOMMU_READ) 4148 prot |= DMA_PTE_READ; 4149 if (iommu_prot & IOMMU_WRITE) 4150 prot |= DMA_PTE_WRITE; 4151 if (dmar_domain->set_pte_snp) 4152 prot |= DMA_PTE_SNP; 4153 4154 max_addr = iova + size; 4155 if (dmar_domain->max_addr < max_addr) { 4156 u64 end; 4157 4158 /* check if minimum agaw is sufficient for mapped address */ 4159 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1; 4160 if (end < max_addr) { 4161 pr_err("%s: iommu width (%d) is not " 4162 "sufficient for the mapped address (%llx)\n", 4163 __func__, dmar_domain->gaw, max_addr); 4164 return -EFAULT; 4165 } 4166 dmar_domain->max_addr = max_addr; 4167 } 4168 /* Round up size to next multiple of PAGE_SIZE, if it and 4169 the low bits of hpa would take us onto the next page */ 4170 size = aligned_nrpages(hpa, size); 4171 return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT, 4172 hpa >> VTD_PAGE_SHIFT, size, prot, gfp); 4173 } 4174 4175 static int intel_iommu_map_pages(struct iommu_domain *domain, 4176 unsigned long iova, phys_addr_t paddr, 4177 size_t pgsize, size_t pgcount, 4178 int prot, gfp_t gfp, size_t *mapped) 4179 { 4180 unsigned long pgshift = __ffs(pgsize); 4181 size_t size = pgcount << pgshift; 4182 int ret; 
4183 4184 if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G) 4185 return -EINVAL; 4186 4187 if (!IS_ALIGNED(iova | paddr, pgsize)) 4188 return -EINVAL; 4189 4190 ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp); 4191 if (!ret && mapped) 4192 *mapped = size; 4193 4194 return ret; 4195 } 4196 4197 static size_t intel_iommu_unmap(struct iommu_domain *domain, 4198 unsigned long iova, size_t size, 4199 struct iommu_iotlb_gather *gather) 4200 { 4201 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4202 unsigned long start_pfn, last_pfn; 4203 int level = 0; 4204 4205 /* Cope with horrid API which requires us to unmap more than the 4206 size argument if it happens to be a large-page mapping. */ 4207 if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, 4208 &level, GFP_ATOMIC))) 4209 return 0; 4210 4211 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level)) 4212 size = VTD_PAGE_SIZE << level_to_offset_bits(level); 4213 4214 start_pfn = iova >> VTD_PAGE_SHIFT; 4215 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT; 4216 4217 domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist); 4218 4219 if (dmar_domain->max_addr == iova + size) 4220 dmar_domain->max_addr = iova; 4221 4222 /* 4223 * We do not use page-selective IOTLB invalidation in flush queue, 4224 * so there is no need to track page and sync iotlb. 
4225 */ 4226 if (!iommu_iotlb_gather_queued(gather)) 4227 iommu_iotlb_gather_add_page(domain, gather, iova, size); 4228 4229 return size; 4230 } 4231 4232 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain, 4233 unsigned long iova, 4234 size_t pgsize, size_t pgcount, 4235 struct iommu_iotlb_gather *gather) 4236 { 4237 unsigned long pgshift = __ffs(pgsize); 4238 size_t size = pgcount << pgshift; 4239 4240 return intel_iommu_unmap(domain, iova, size, gather); 4241 } 4242 4243 static void intel_iommu_tlb_sync(struct iommu_domain *domain, 4244 struct iommu_iotlb_gather *gather) 4245 { 4246 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4247 unsigned long iova_pfn = IOVA_PFN(gather->start); 4248 size_t size = gather->end - gather->start; 4249 struct iommu_domain_info *info; 4250 unsigned long start_pfn; 4251 unsigned long nrpages; 4252 unsigned long i; 4253 4254 nrpages = aligned_nrpages(gather->start, size); 4255 start_pfn = mm_to_dma_pfn_start(iova_pfn); 4256 4257 xa_for_each(&dmar_domain->iommu_array, i, info) 4258 iommu_flush_iotlb_psi(info->iommu, dmar_domain, 4259 start_pfn, nrpages, 4260 list_empty(&gather->freelist), 0); 4261 4262 put_pages_list(&gather->freelist); 4263 } 4264 4265 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain, 4266 dma_addr_t iova) 4267 { 4268 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4269 struct dma_pte *pte; 4270 int level = 0; 4271 u64 phys = 0; 4272 4273 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level, 4274 GFP_ATOMIC); 4275 if (pte && dma_pte_present(pte)) 4276 phys = dma_pte_addr(pte) + 4277 (iova & (BIT_MASK(level_to_offset_bits(level) + 4278 VTD_PAGE_SHIFT) - 1)); 4279 4280 return phys; 4281 } 4282 4283 static bool domain_support_force_snooping(struct dmar_domain *domain) 4284 { 4285 struct device_domain_info *info; 4286 bool support = true; 4287 4288 assert_spin_locked(&domain->lock); 4289 list_for_each_entry(info, &domain->devices, link) { 4290 if 
(!ecap_sc_support(info->iommu->ecap)) { 4291 support = false; 4292 break; 4293 } 4294 } 4295 4296 return support; 4297 } 4298 4299 static void domain_set_force_snooping(struct dmar_domain *domain) 4300 { 4301 struct device_domain_info *info; 4302 4303 assert_spin_locked(&domain->lock); 4304 /* 4305 * Second level page table supports per-PTE snoop control. The 4306 * iommu_map() interface will handle this by setting SNP bit. 4307 */ 4308 if (!domain->use_first_level) { 4309 domain->set_pte_snp = true; 4310 return; 4311 } 4312 4313 list_for_each_entry(info, &domain->devices, link) 4314 intel_pasid_setup_page_snoop_control(info->iommu, info->dev, 4315 IOMMU_NO_PASID); 4316 } 4317 4318 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain) 4319 { 4320 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4321 unsigned long flags; 4322 4323 if (dmar_domain->force_snooping) 4324 return true; 4325 4326 spin_lock_irqsave(&dmar_domain->lock, flags); 4327 if (!domain_support_force_snooping(dmar_domain)) { 4328 spin_unlock_irqrestore(&dmar_domain->lock, flags); 4329 return false; 4330 } 4331 4332 domain_set_force_snooping(dmar_domain); 4333 dmar_domain->force_snooping = true; 4334 spin_unlock_irqrestore(&dmar_domain->lock, flags); 4335 4336 return true; 4337 } 4338 4339 static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap) 4340 { 4341 struct device_domain_info *info = dev_iommu_priv_get(dev); 4342 4343 switch (cap) { 4344 case IOMMU_CAP_CACHE_COHERENCY: 4345 case IOMMU_CAP_DEFERRED_FLUSH: 4346 return true; 4347 case IOMMU_CAP_PRE_BOOT_PROTECTION: 4348 return dmar_platform_optin(); 4349 case IOMMU_CAP_ENFORCE_CACHE_COHERENCY: 4350 return ecap_sc_support(info->iommu->ecap); 4351 default: 4352 return false; 4353 } 4354 } 4355 4356 static struct iommu_device *intel_iommu_probe_device(struct device *dev) 4357 { 4358 struct pci_dev *pdev = dev_is_pci(dev) ? 
to_pci_dev(dev) : NULL; 4359 struct device_domain_info *info; 4360 struct intel_iommu *iommu; 4361 u8 bus, devfn; 4362 int ret; 4363 4364 iommu = device_to_iommu(dev, &bus, &devfn); 4365 if (!iommu || !iommu->iommu.ops) 4366 return ERR_PTR(-ENODEV); 4367 4368 info = kzalloc(sizeof(*info), GFP_KERNEL); 4369 if (!info) 4370 return ERR_PTR(-ENOMEM); 4371 4372 if (dev_is_real_dma_subdevice(dev)) { 4373 info->bus = pdev->bus->number; 4374 info->devfn = pdev->devfn; 4375 info->segment = pci_domain_nr(pdev->bus); 4376 } else { 4377 info->bus = bus; 4378 info->devfn = devfn; 4379 info->segment = iommu->segment; 4380 } 4381 4382 info->dev = dev; 4383 info->iommu = iommu; 4384 if (dev_is_pci(dev)) { 4385 if (ecap_dev_iotlb_support(iommu->ecap) && 4386 pci_ats_supported(pdev) && 4387 dmar_ats_supported(pdev, iommu)) { 4388 info->ats_supported = 1; 4389 info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev); 4390 4391 /* 4392 * For IOMMU that supports device IOTLB throttling 4393 * (DIT), we assign PFSID to the invalidation desc 4394 * of a VF such that IOMMU HW can gauge queue depth 4395 * at PF level. If DIT is not set, PFSID will be 4396 * treated as reserved, which should be set to 0. 
4397 */ 4398 if (ecap_dit(iommu->ecap)) 4399 info->pfsid = pci_dev_id(pci_physfn(pdev)); 4400 info->ats_qdep = pci_ats_queue_depth(pdev); 4401 } 4402 if (sm_supported(iommu)) { 4403 if (pasid_supported(iommu)) { 4404 int features = pci_pasid_features(pdev); 4405 4406 if (features >= 0) 4407 info->pasid_supported = features | 1; 4408 } 4409 4410 if (info->ats_supported && ecap_prs(iommu->ecap) && 4411 pci_pri_supported(pdev)) 4412 info->pri_supported = 1; 4413 } 4414 } 4415 4416 dev_iommu_priv_set(dev, info); 4417 4418 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) { 4419 ret = intel_pasid_alloc_table(dev); 4420 if (ret) { 4421 dev_err(dev, "PASID table allocation failed\n"); 4422 dev_iommu_priv_set(dev, NULL); 4423 kfree(info); 4424 return ERR_PTR(ret); 4425 } 4426 } 4427 4428 return &iommu->iommu; 4429 } 4430 4431 static void intel_iommu_release_device(struct device *dev) 4432 { 4433 struct device_domain_info *info = dev_iommu_priv_get(dev); 4434 4435 dmar_remove_one_dev_info(dev); 4436 intel_pasid_free_table(dev); 4437 dev_iommu_priv_set(dev, NULL); 4438 kfree(info); 4439 set_dma_ops(dev, NULL); 4440 } 4441 4442 static void intel_iommu_probe_finalize(struct device *dev) 4443 { 4444 set_dma_ops(dev, NULL); 4445 iommu_setup_dma_ops(dev, 0, U64_MAX); 4446 } 4447 4448 static void intel_iommu_get_resv_regions(struct device *device, 4449 struct list_head *head) 4450 { 4451 int prot = DMA_PTE_READ | DMA_PTE_WRITE; 4452 struct iommu_resv_region *reg; 4453 struct dmar_rmrr_unit *rmrr; 4454 struct device *i_dev; 4455 int i; 4456 4457 rcu_read_lock(); 4458 for_each_rmrr_units(rmrr) { 4459 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt, 4460 i, i_dev) { 4461 struct iommu_resv_region *resv; 4462 enum iommu_resv_type type; 4463 size_t length; 4464 4465 if (i_dev != device && 4466 !is_downstream_to_pci_bridge(device, i_dev)) 4467 continue; 4468 4469 length = rmrr->end_address - rmrr->base_address + 1; 4470 4471 type = device_rmrr_is_relaxable(device) ? 
				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;

			resv = iommu_alloc_resv_region(rmrr->base_address,
						       length, prot, type,
						       GFP_ATOMIC);
			if (!resv)
				break;

			list_add_tail(&resv->list, head);
		}
	}
	rcu_read_unlock();

#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
	/*
	 * Devices behind an ISA bridge get a relaxable direct mapping of
	 * the first 16MB so legacy (floppy) DMA keeps working. A failed
	 * allocation is tolerated here: the region is simply not added.
	 */
	if (dev_is_pci(device)) {
		struct pci_dev *pdev = to_pci_dev(device);

		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
						      IOMMU_RESV_DIRECT_RELAXABLE,
						      GFP_KERNEL);
			if (reg)
				list_add_tail(&reg->list, head);
		}
	}
#endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */

	/* Reserve the IOAPIC MMIO window as an MSI region. */
	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
				      0, IOMMU_RESV_MSI, GFP_KERNEL);
	if (!reg)
		return;
	list_add_tail(&reg->list, head);
}

/* Group PCI devices by topology; everything else gets its own group. */
static struct iommu_group *intel_iommu_device_group(struct device *dev)
{
	if (dev_is_pci(dev))
		return pci_device_group(dev);
	return generic_device_group(dev);
}

/*
 * Check whether Shared Virtual Addressing can be used with @dev: the
 * IOMMU must be SVM capable and the device must already have PASID and
 * ATS enabled. Returns 0 on success, negative errno otherwise.
 */
static int intel_iommu_enable_sva(struct device *dev)
{
	struct device_domain_info *info = dev_iommu_priv_get(dev);
	struct intel_iommu *iommu;

	if (!info || dmar_disabled)
		return -EINVAL;

	iommu = info->iommu;
	if (!iommu)
		return -EINVAL;

	if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
		return -ENODEV;

	if (!info->pasid_enabled || !info->ats_enabled)
		return -EINVAL;

	/*
	 * Devices having device-specific I/O fault handling should not
	 * support PCI/PRI. The IOMMU side has no means to check the
	 * capability of device-specific IOPF. Therefore, IOMMU can only
	 * default that if the device driver enables SVA on a non-PRI
	 * device, it will handle IOPF in its own way.
	 */
	if (!info->pri_supported)
		return 0;

	/* Devices supporting PRI should have it enabled. */
	if (!info->pri_enabled)
		return -EINVAL;

	return 0;
}

/*
 * Enable I/O page fault handling for @dev: reset PRI, add the device to
 * the IOMMU's IOPF queue, register the fault handler, and only then turn
 * PRI on. Error paths unwind in reverse order of setup.
 */
static int intel_iommu_enable_iopf(struct device *dev)
{
	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
	struct device_domain_info *info = dev_iommu_priv_get(dev);
	struct intel_iommu *iommu;
	int ret;

	if (!pdev || !info || !info->ats_enabled || !info->pri_supported)
		return -ENODEV;

	if (info->pri_enabled)
		return -EBUSY;

	iommu = info->iommu;
	if (!iommu)
		return -EINVAL;

	/* PASID is required in PRG Response Message. */
	if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev))
		return -EINVAL;

	ret = pci_reset_pri(pdev);
	if (ret)
		return ret;

	ret = iopf_queue_add_device(iommu->iopf_queue, dev);
	if (ret)
		return ret;

	ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
	if (ret)
		goto iopf_remove_device;

	/* Enable PRI only once the fault plumbing above is in place. */
	ret = pci_enable_pri(pdev, PRQ_DEPTH);
	if (ret)
		goto iopf_unregister_handler;
	info->pri_enabled = 1;

	return 0;

iopf_unregister_handler:
	iommu_unregister_device_fault_handler(dev);
iopf_remove_device:
	iopf_queue_remove_device(iommu->iopf_queue, dev);

	return ret;
}

/* Reverse of intel_iommu_enable_iopf(): disable PRI, then tear down. */
static int intel_iommu_disable_iopf(struct device *dev)
{
	struct device_domain_info *info = dev_iommu_priv_get(dev);
	struct intel_iommu *iommu = info->iommu;

	if (!info->pri_enabled)
		return -EINVAL;

	/*
	 * PCIe spec states that by clearing PRI enable bit, the Page
	 * Request Interface will not issue new page requests, but has
	 * outstanding page requests that have been transmitted or are
	 * queued for transmission. This is supposed to be called after
	 * the device driver has stopped DMA, all PASIDs have been
	 * unbound and the outstanding PRQs have been drained.
	 */
	pci_disable_pri(to_pci_dev(dev));
	info->pri_enabled = 0;

	/*
	 * With PRI disabled and outstanding PRQs drained, unregistering
	 * fault handler and removing device from iopf queue should never
	 * fail.
	 */
	WARN_ON(iommu_unregister_device_fault_handler(dev));
	WARN_ON(iopf_queue_remove_device(iommu->iopf_queue, dev));

	return 0;
}

/* Dispatch iommu-core feature enabling to the VT-d helpers above. */
static int
intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
{
	switch (feat) {
	case IOMMU_DEV_FEAT_IOPF:
		return intel_iommu_enable_iopf(dev);

	case IOMMU_DEV_FEAT_SVA:
		return intel_iommu_enable_sva(dev);

	default:
		return -ENODEV;
	}
}

/* Counterpart of intel_iommu_dev_enable_feat(); SVA needs no teardown. */
static int
intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
{
	switch (feat) {
	case IOMMU_DEV_FEAT_IOPF:
		return intel_iommu_disable_iopf(dev);

	case IOMMU_DEV_FEAT_SVA:
		return 0;

	default:
		return -ENODEV;
	}
}

/*
 * Defer attach while the IOMMU came up with translation already enabled
 * (pre-enabled by firmware/previous kernel) and no domain is set yet.
 */
static bool intel_iommu_is_attach_deferred(struct device *dev)
{
	struct device_domain_info *info = dev_iommu_priv_get(dev);

	return translation_pre_enabled(info->iommu) && !info->domain;
}

/*
 * Check that the device does not live on an external facing PCI port that is
 * marked as untrusted. Such devices should not be able to apply quirks and
 * thus not be able to bypass the IOMMU restrictions.
 */
static bool risky_device(struct pci_dev *pdev)
{
	if (pdev->untrusted) {
		pci_info(pdev,
			 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
			 pdev->vendor, pdev->device);
		pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
		return true;
	}
	return false;
}

/*
 * Notify every IOMMU serving this domain about a newly mapped IOVA
 * range so per-IOMMU state can be brought up to date.
 */
static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
				       unsigned long iova, size_t size)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	unsigned long pages = aligned_nrpages(iova, size);
	unsigned long pfn = iova >> VTD_PAGE_SHIFT;
	struct iommu_domain_info *info;
	unsigned long i;

	xa_for_each(&dmar_domain->iommu_array, i, info)
		__mapping_notify_one(info->iommu, dmar_domain, pfn, pages);
}

/*
 * Undo intel_iommu_set_dev_pasid(): unlink the pasid from the domain's
 * dev_pasids list, then unconditionally tear down the pasid table entry
 * and drain in-flight page requests for it.
 */
static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid)
{
	struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
	struct dev_pasid_info *curr, *dev_pasid = NULL;
	struct dmar_domain *dmar_domain;
	struct iommu_domain *domain;
	unsigned long flags;

	domain = iommu_get_domain_for_dev_pasid(dev, pasid, 0);
	if (WARN_ON_ONCE(!domain))
		goto out_tear_down;

	/*
	 * The SVA implementation needs to handle its own stuffs like the mm
	 * notification. Before consolidating that code into iommu core, let
	 * the intel sva code handle it.
	 */
	if (domain->type == IOMMU_DOMAIN_SVA) {
		intel_svm_remove_dev_pasid(dev, pasid);
		goto out_tear_down;
	}

	dmar_domain = to_dmar_domain(domain);
	spin_lock_irqsave(&dmar_domain->lock, flags);
	list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) {
		if (curr->dev == dev && curr->pasid == pasid) {
			list_del(&curr->link_domain);
			dev_pasid = curr;
			break;
		}
	}
	WARN_ON_ONCE(!dev_pasid);
	spin_unlock_irqrestore(&dmar_domain->lock, flags);

	domain_detach_iommu(dmar_domain, iommu);
	kfree(dev_pasid);
out_tear_down:
	intel_pasid_tear_down_entry(iommu, dev, pasid, false);
	intel_drain_pasid_prq(dev, pasid);
}

/*
 * Attach @domain to @dev on @pasid. Selects the pasid-table setup
 * variant matching the domain type (pass-through, first-level or
 * second-level) and records the binding on the domain's dev_pasids
 * list for later removal.
 */
static int intel_iommu_set_dev_pasid(struct iommu_domain *domain,
				     struct device *dev, ioasid_t pasid)
{
	struct device_domain_info *info = dev_iommu_priv_get(dev);
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	struct intel_iommu *iommu = info->iommu;
	struct dev_pasid_info *dev_pasid;
	unsigned long flags;
	int ret;

	if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
		return -EOPNOTSUPP;

	if (context_copied(iommu, info->bus, info->devfn))
		return -EBUSY;

	ret = prepare_domain_attach_device(domain, dev);
	if (ret)
		return ret;

	dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL);
	if (!dev_pasid)
		return -ENOMEM;

	ret = domain_attach_iommu(dmar_domain, iommu);
	if (ret)
		goto out_free;

	if (domain_type_is_si(dmar_domain))
		ret = intel_pasid_setup_pass_through(iommu, dmar_domain,
						     dev, pasid);
	else if (dmar_domain->use_first_level)
		ret = domain_setup_first_level(iommu, dmar_domain,
					       dev, pasid);
	else
		ret = intel_pasid_setup_second_level(iommu, dmar_domain,
						     dev, pasid);
	if (ret)
		goto out_detach_iommu;

	dev_pasid->dev = dev;
	dev_pasid->pasid = pasid;
	spin_lock_irqsave(&dmar_domain->lock, flags);
	list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids);
	spin_unlock_irqrestore(&dmar_domain->lock, flags);

	return 0;
out_detach_iommu:
	domain_detach_iommu(dmar_domain, iommu);
out_free:
	kfree(dev_pasid);
	return ret;
}

/*
 * Report the raw capability/extended-capability registers for @dev's
 * IOMMU. Ownership of the returned buffer passes to the caller, which
 * is expected to free it.
 */
static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
{
	struct device_domain_info *info = dev_iommu_priv_get(dev);
	struct intel_iommu *iommu = info->iommu;
	struct iommu_hw_info_vtd *vtd;

	vtd = kzalloc(sizeof(*vtd), GFP_KERNEL);
	if (!vtd)
		return ERR_PTR(-ENOMEM);

	vtd->cap_reg = iommu->cap;
	vtd->ecap_reg = iommu->ecap;
	*length = sizeof(*vtd);
	*type = IOMMU_HW_INFO_TYPE_INTEL_VTD;
	return vtd;
}

const struct iommu_ops intel_iommu_ops = {
	.capable		= intel_iommu_capable,
	.hw_info		= intel_iommu_hw_info,
	.domain_alloc		= intel_iommu_domain_alloc,
	.probe_device		= intel_iommu_probe_device,
	.probe_finalize		= intel_iommu_probe_finalize,
	.release_device		= intel_iommu_release_device,
	.get_resv_regions	= intel_iommu_get_resv_regions,
	.device_group		= intel_iommu_device_group,
	.dev_enable_feat	= intel_iommu_dev_enable_feat,
	.dev_disable_feat	= intel_iommu_dev_disable_feat,
	.is_attach_deferred	= intel_iommu_is_attach_deferred,
	.def_domain_type	= device_def_domain_type,
	.remove_dev_pasid	= intel_iommu_remove_dev_pasid,
	.pgsize_bitmap		= SZ_4K,
#ifdef CONFIG_INTEL_IOMMU_SVM
	.page_response		= intel_svm_page_response,
#endif
	.default_domain_ops = &(const struct iommu_domain_ops) {
		.attach_dev		= intel_iommu_attach_device,
		.set_dev_pasid		= intel_iommu_set_dev_pasid,
		.map_pages		= intel_iommu_map_pages,
		.unmap_pages		= intel_iommu_unmap_pages,
		.iotlb_sync_map		= intel_iommu_iotlb_sync_map,
		.flush_iotlb_all	= intel_flush_iotlb_all,
		.iotlb_sync		= intel_iommu_tlb_sync,
		.iova_to_phys		= intel_iommu_iova_to_phys,
		.free			= intel_iommu_domain_free,
		.enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
	}
};

/* PCI quirk: turn off DMA remapping for integrated graphics entirely. */
static void quirk_iommu_igfx(struct pci_dev *dev)
{
	if (risky_device(dev))
		return;

	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
	dmar_map_gfx = 0;
}

/* G4x/GM45 integrated gfx dmar support is totally busted. */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);

/* Broadwell igfx malfunctions with dmar */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);

static void quirk_iommu_rwbf(struct pci_dev *dev)
{
	if (risky_device(dev))
		return;

	/*
	 * Mobile 4 Series Chipset neglects to set RWBF capability,
	 * but needs it. Same seems to hold for the desktop versions.
	 */
	pci_info(dev, "Forcing write-buffer flush capability\n");
	rwbf_quirk = 1;
}

DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);

/* Graphics GTT stolen-memory control register and its size/VT fields. */
#define GGC 0x52
#define GGC_MEMORY_SIZE_MASK	(0xf << 8)
#define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
#define GGC_MEMORY_SIZE_1M	(0x1 << 8)
#define GGC_MEMORY_SIZE_2M	(0x3 << 8)
#define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
#define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
#define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
#define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)

/*
 * If the BIOS left no shadow GTT space for VT (GGC_MEMORY_VT_ENABLED
 * clear), disable IOMMU for graphics; otherwise force strict
 * (unbatched) IOTLB flushing on Ironlake.
 */
static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
{
	unsigned short ggc;

	if (risky_device(dev))
		return;

	if (pci_read_config_word(dev, GGC, &ggc))
		return;

	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
		dmar_map_gfx = 0;
	} else if (dmar_map_gfx) {
		/* we have to ensure the gfx device is idle before we flush */
		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
		iommu_set_dma_strict();
	}
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);

static
void quirk_igfx_skip_te_disable(struct pci_dev *dev)
{
	unsigned short ver;

	if (!IS_GFX_DEVICE(dev))
		return;

	/* Only selected integrated-graphics device generations apply. */
	ver = (dev->device >> 8) & 0xff;
	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
	    ver != 0x9a && ver != 0xa7)
		return;

	if (risky_device(dev))
		return;

	pci_info(dev, "Skip IOMMU disabling for graphics\n");
	iommu_skip_te_disable = 1;
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);

/* On Tylersburg chipsets, some BIOSes have been known to enable the
   ISOCH DMAR unit for the Azalia sound device, but not give it any
   TLB entries, which causes it to deadlock. Check for that. We do
   this in a function called from init_dmars(), instead of in a PCI
   quirk, because we don't want to print the obnoxious "BIOS broken"
   message if VT-d is actually disabled.
*/
static void __init check_tylersburg_isoch(void)
{
	struct pci_dev *pdev;
	uint32_t vtisochctrl;

	/* If there's no Azalia in the system anyway, forget it. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
	if (!pdev)
		return;

	if (risky_device(pdev)) {
		pci_dev_put(pdev);
		return;
	}

	pci_dev_put(pdev);

	/* System Management Registers. Might be hidden, in which case
	   we can't do the sanity check. But that's OK, because the
	   known-broken BIOSes _don't_ actually hide it, so far. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
	if (!pdev)
		return;

	if (risky_device(pdev)) {
		pci_dev_put(pdev);
		return;
	}

	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
		pci_dev_put(pdev);
		return;
	}

	pci_dev_put(pdev);

	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
	if (vtisochctrl & 1)
		return;

	/* Drop all bits other than the number of TLB entries */
	vtisochctrl &= 0x1c;

	/* If we have the recommended number of TLB entries (16), fine. */
	if (vtisochctrl == 0x10)
		return;

	/* Zero TLB entries? You get to ride the short bus to school. */
	if (!vtisochctrl) {
		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
		     dmi_get_system_info(DMI_BIOS_VENDOR),
		     dmi_get_system_info(DMI_BIOS_VERSION),
		     dmi_get_system_info(DMI_PRODUCT_VERSION));
		iommu_identity_mapping |= IDENTMAP_AZALIA;
		return;
	}

	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
		vtisochctrl);
}

/*
 * Here we deal with a device TLB defect where device may inadvertently issue ATS
 * invalidation completion before posted writes initiated with translated address
 * that utilized translations matching the invalidation address range, violating
 * the invalidation completion ordering.
 * Therefore, any use cases that cannot guarantee DMA is stopped before unmap is
 * vulnerable to this defect. In other words, any dTLB invalidation initiated not
 * under the control of the trusted/privileged host device driver must use this
 * quirk.
 * Device TLBs are invalidated under the following six conditions:
 * 1. Device driver does DMA API unmap IOVA
 * 2. Device driver unbind a PASID from a process, sva_unbind_device()
 * 3. PASID is torn down, after PASID cache is flushed. e.g. process
 *    exit_mmap() due to crash
 * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where
 *    VM has to free pages that were unmapped
 * 5. Userspace driver unmaps a DMA buffer
 * 6. Cache invalidation in vSVA usage (upcoming)
 *
 * For #1 and #2, device drivers are responsible for stopping DMA traffic
 * before unmap/unbind. For #3, iommu driver gets mmu_notifier to
 * invalidate TLB the same way as normal user unmap which will use this quirk.
 * The dTLB invalidation after PASID cache flush does not need this quirk.
 *
 * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
 */
void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
			       unsigned long address, unsigned long mask,
			       u32 pasid, u16 qdep)
{
	u16 sid;

	/* Fast path: device does not need the extra invalidation. */
	if (likely(!info->dtlb_extra_inval))
		return;

	sid = PCI_DEVID(info->bus, info->devfn);
	if (pasid == IOMMU_NO_PASID) {
		qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
				   qdep, address, mask);
	} else {
		qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
					 pasid, qdep, address, mask);
	}
}

/* Extract the status code field (bits 7:1) from an ECRSP value. */
#define ecmd_get_status_code(res)	(((res) & 0xff) >> 1)

/*
 * Function to submit a command to the enhanced command interface. The
 * valid enhanced command descriptions are defined in Table 47 of the
 * VT-d spec. The VT-d hardware implementation may support some but not
 * all commands, which can be determined by checking the Enhanced
 * Command Capability Register.
 *
 * Return values:
 *  - 0: Command successful without any error;
 *  - Negative: software error value;
 *  - Nonzero positive: failure status code defined in Table 48.
 */
int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob)
{
	unsigned long flags;
	u64 res;
	int ret;

	if (!cap_ecmds(iommu->cap))
		return -ENODEV;

	raw_spin_lock_irqsave(&iommu->register_lock, flags);

	/* A previous command still marked in-progress means we're busy. */
	res = dmar_readq(iommu->reg + DMAR_ECRSP_REG);
	if (res & DMA_ECMD_ECRSP_IP) {
		ret = -EBUSY;
		goto err;
	}

	/*
	 * Unconditionally write the operand B, because
	 * - There is no side effect if an ecmd doesn't require an
	 *   operand B, but we set the register to some value.
	 * - It's not invoked in any critical path. The extra MMIO
	 *   write doesn't bring any performance concerns.
	 */
	dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);
	dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT));

	/* Poll until hardware clears the IP bit, or time out. */
	IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq,
		      !(res & DMA_ECMD_ECRSP_IP), res);

	if (res & DMA_ECMD_ECRSP_IP) {
		ret = -ETIMEDOUT;
		goto err;
	}

	ret = ecmd_get_status_code(res);
err:
	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);

	return ret;
}