1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright © 2006-2014 Intel Corporation. 4 * 5 * Authors: David Woodhouse <dwmw2@infradead.org>, 6 * Ashok Raj <ashok.raj@intel.com>, 7 * Shaohua Li <shaohua.li@intel.com>, 8 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>, 9 * Fenghua Yu <fenghua.yu@intel.com> 10 * Joerg Roedel <jroedel@suse.de> 11 */ 12 13 #define pr_fmt(fmt) "DMAR: " fmt 14 #define dev_fmt(fmt) pr_fmt(fmt) 15 16 #include <linux/crash_dump.h> 17 #include <linux/dma-direct.h> 18 #include <linux/dmi.h> 19 #include <linux/memory.h> 20 #include <linux/pci.h> 21 #include <linux/pci-ats.h> 22 #include <linux/spinlock.h> 23 #include <linux/syscore_ops.h> 24 #include <linux/tboot.h> 25 #include <uapi/linux/iommufd.h> 26 27 #include "iommu.h" 28 #include "../dma-iommu.h" 29 #include "../irq_remapping.h" 30 #include "../iommu-sva.h" 31 #include "pasid.h" 32 #include "cap_audit.h" 33 #include "perfmon.h" 34 35 #define ROOT_SIZE VTD_PAGE_SIZE 36 #define CONTEXT_SIZE VTD_PAGE_SIZE 37 38 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY) 39 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB) 40 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) 41 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e) 42 43 #define IOAPIC_RANGE_START (0xfee00000) 44 #define IOAPIC_RANGE_END (0xfeefffff) 45 #define IOVA_START_ADDR (0x1000) 46 47 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57 48 49 #define MAX_AGAW_WIDTH 64 50 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT) 51 52 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1) 53 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1) 54 55 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR 56 to match. That way, we can use 'unsigned long' for PFNs with impunity. */ 57 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \ 58 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1)) 59 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT) 60 61 /* IO virtual address start page frame number */ 62 #define IOVA_START_PFN (1) 63 64 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT) 65 66 /* page table handling */ 67 #define LEVEL_STRIDE (9) 68 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1) 69 70 static inline int agaw_to_level(int agaw) 71 { 72 return agaw + 2; 73 } 74 75 static inline int agaw_to_width(int agaw) 76 { 77 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH); 78 } 79 80 static inline int width_to_agaw(int width) 81 { 82 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE); 83 } 84 85 static inline unsigned int level_to_offset_bits(int level) 86 { 87 return (level - 1) * LEVEL_STRIDE; 88 } 89 90 static inline int pfn_level_offset(u64 pfn, int level) 91 { 92 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK; 93 } 94 95 static inline u64 level_mask(int level) 96 { 97 return -1ULL << level_to_offset_bits(level); 98 } 99 100 static inline u64 level_size(int level) 101 { 102 return 1ULL << level_to_offset_bits(level); 103 } 104 105 static inline u64 align_to_level(u64 pfn, int level) 106 { 107 return (pfn + level_size(level) - 1) & level_mask(level); 108 } 109 110 static inline unsigned long lvl_to_nr_pages(unsigned int lvl) 111 { 112 return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH); 113 } 114 115 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things 116 are never going to work. 
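   In other words PAGE_SHIFT >= VTD_PAGE_SHIFT, so the mm-to-DMA pfn
   helpers below only ever shift left, by PAGE_SHIFT - VTD_PAGE_SHIFT >= 0.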
 */
static inline unsigned long mm_to_dma_pfn_start(unsigned long mm_pfn)
{
	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
}
static inline unsigned long mm_to_dma_pfn_end(unsigned long mm_pfn)
{
	return ((mm_pfn + 1) << (PAGE_SHIFT - VTD_PAGE_SHIFT)) - 1;
}
static inline unsigned long page_to_dma_pfn(struct page *pg)
{
	return mm_to_dma_pfn_start(page_to_pfn(pg));
}
static inline unsigned long virt_to_dma_pfn(void *p)
{
	return page_to_dma_pfn(virt_to_page(p));
}

static void __init check_tylersburg_isoch(void);
static int rwbf_quirk;

/*
 * Set to 1 to panic the kernel if VT-d can't be enabled successfully
 * (used when the kernel is launched with TXT).
 */
static int force_on = 0;
static int intel_iommu_tboot_noforce;
static int no_platform_optin;

#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))

/*
 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
 * if marked present.
 */
static phys_addr_t root_entry_lctp(struct root_entry *re)
{
	if (!(re->lo & 1))
		return 0;

	return re->lo & VTD_PAGE_MASK;
}

/*
 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
 * if marked present.
 */
static phys_addr_t root_entry_uctp(struct root_entry *re)
{
	if (!(re->hi & 1))
		return 0;

	return re->hi & VTD_PAGE_MASK;
}

static inline void context_set_present(struct context_entry *context)
{
	context->lo |= 1;
}

static inline void context_set_fault_enable(struct context_entry *context)
{
	context->lo &= (((u64)-1) << 2) | 1;
}

static inline void context_set_translation_type(struct context_entry *context,
						unsigned long value)
{
	context->lo &= (((u64)-1) << 4) | 3;
	context->lo |= (value & 3) << 2;
}

static inline void context_set_address_root(struct context_entry *context,
					    unsigned long value)
{
	context->lo &= ~VTD_PAGE_MASK;
	context->lo |= value & VTD_PAGE_MASK;
}

static inline void context_set_address_width(struct context_entry *context,
					     unsigned long value)
{
	context->hi |= value & 7;
}

static inline void context_set_domain_id(struct context_entry *context,
					 unsigned long value)
{
	context->hi |= (value & ((1 << 16) - 1)) << 8;
}

static inline void context_set_pasid(struct context_entry *context)
{
	context->lo |= CONTEXT_PASIDE;
}

static inline int context_domain_id(struct context_entry *c)
{
	return (c->hi >> 8) & 0xffff;
}

static inline void context_clear_entry(struct context_entry *context)
{
	context->lo = 0;
	context->hi = 0;
}

static inline bool context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
{
	if (!iommu->copied_tables)
		return false;

	return test_bit(((long)bus << 8) | devfn, iommu->copied_tables);
}

static inline void
set_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
{
	set_bit(((long)bus << 8) | devfn, iommu->copied_tables);
}

static inline void
clear_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
{
	clear_bit(((long)bus << 8) | devfn, iommu->copied_tables);
}

/*
 * This domain is a static identity mapping domain.
 *	1. This domain creates a static 1:1 mapping to all usable memory.
 *	2. It maps to each iommu if successful.
 *	3. Each iommu maps to this domain if successful.
 */
static struct dmar_domain *si_domain;
static int hw_pass_through = 1;

struct dmar_rmrr_unit {
	struct list_head list;		/* list of rmrr units */
	struct acpi_dmar_header *hdr;	/* ACPI header */
	u64 base_address;		/* reserved base address */
	u64 end_address;		/* reserved end address */
	struct dmar_dev_scope *devices;	/* target devices */
	int devices_cnt;		/* target device count */
};

struct dmar_atsr_unit {
	struct list_head list;		/* list of ATSR units */
	struct acpi_dmar_header *hdr;	/* ACPI header */
	struct dmar_dev_scope *devices;	/* target devices */
	int devices_cnt;		/* target device count */
	u8 include_all:1;		/* include all ports */
};

struct dmar_satc_unit {
	struct list_head list;		/* list of SATC units */
	struct acpi_dmar_header *hdr;	/* ACPI header */
	struct dmar_dev_scope *devices;	/* target devices */
	struct intel_iommu *iommu;	/* the corresponding iommu */
	int devices_cnt;		/* target device count */
	u8 atc_required:1;		/* ATS is required */
};

static LIST_HEAD(dmar_atsr_units);
static LIST_HEAD(dmar_rmrr_units);
static LIST_HEAD(dmar_satc_units);

#define for_each_rmrr_units(rmrr) \
	list_for_each_entry(rmrr, &dmar_rmrr_units, list)

static void intel_iommu_domain_free(struct iommu_domain *domain);

int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);

int intel_iommu_enabled = 0;
EXPORT_SYMBOL_GPL(intel_iommu_enabled);

static int dmar_map_gfx = 1;
static int intel_iommu_superpage = 1;
static int iommu_identity_mapping;
static int iommu_skip_te_disable;

#define IDENTMAP_GFX		2
#define IDENTMAP_AZALIA		4

const struct iommu_ops intel_iommu_ops;
static const struct iommu_dirty_ops intel_dirty_ops;

static bool translation_pre_enabled(struct intel_iommu *iommu)
{
	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
}

static void clear_translation_pre_enabled(struct intel_iommu *iommu)
{
	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
}

static void init_translation_status(struct intel_iommu *iommu)
{
	u32 gsts;

	gsts = readl(iommu->reg + DMAR_GSTS_REG);
	if (gsts & DMA_GSTS_TES)
		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
}

static int __init intel_iommu_setup(char *str)
{
	if (!str)
		return -EINVAL;

	while (*str) {
		if (!strncmp(str, "on", 2)) {
			dmar_disabled = 0;
			pr_info("IOMMU enabled\n");
		} else if (!strncmp(str, "off", 3)) {
			dmar_disabled = 1;
			no_platform_optin = 1;
			pr_info("IOMMU disabled\n");
		} else if (!strncmp(str, "igfx_off", 8)) {
			dmar_map_gfx = 0;
			pr_info("Disable GFX device mapping\n");
		} else if (!strncmp(str, "forcedac", 8)) {
			pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
			iommu_dma_forcedac = true;
		} else if (!strncmp(str, "strict", 6)) {
			pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
			iommu_set_dma_strict();
		} else if (!strncmp(str, "sp_off", 6)) {
			pr_info("Disable supported super page\n");
			intel_iommu_superpage = 0;
		} else if (!strncmp(str, "sm_on", 5)) {
			pr_info("Enable scalable mode if hardware supports\n");
			intel_iommu_sm = 1;
		} else if (!strncmp(str, "sm_off", 6)) {
			pr_info("Scalable mode is
disallowed\n"); 353 intel_iommu_sm = 0; 354 } else if (!strncmp(str, "tboot_noforce", 13)) { 355 pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n"); 356 intel_iommu_tboot_noforce = 1; 357 } else { 358 pr_notice("Unknown option - '%s'\n", str); 359 } 360 361 str += strcspn(str, ","); 362 while (*str == ',') 363 str++; 364 } 365 366 return 1; 367 } 368 __setup("intel_iommu=", intel_iommu_setup); 369 370 void *alloc_pgtable_page(int node, gfp_t gfp) 371 { 372 struct page *page; 373 void *vaddr = NULL; 374 375 page = alloc_pages_node(node, gfp | __GFP_ZERO, 0); 376 if (page) 377 vaddr = page_address(page); 378 return vaddr; 379 } 380 381 void free_pgtable_page(void *vaddr) 382 { 383 free_page((unsigned long)vaddr); 384 } 385 386 static inline int domain_type_is_si(struct dmar_domain *domain) 387 { 388 return domain->domain.type == IOMMU_DOMAIN_IDENTITY; 389 } 390 391 static inline int domain_pfn_supported(struct dmar_domain *domain, 392 unsigned long pfn) 393 { 394 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT; 395 396 return !(addr_width < BITS_PER_LONG && pfn >> addr_width); 397 } 398 399 /* 400 * Calculate the Supported Adjusted Guest Address Widths of an IOMMU. 401 * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of 402 * the returned SAGAW. 403 */ 404 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu) 405 { 406 unsigned long fl_sagaw, sl_sagaw; 407 408 fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0); 409 sl_sagaw = cap_sagaw(iommu->cap); 410 411 /* Second level only. */ 412 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) 413 return sl_sagaw; 414 415 /* First level only. */ 416 if (!ecap_slts(iommu->ecap)) 417 return fl_sagaw; 418 419 return fl_sagaw & sl_sagaw; 420 } 421 422 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw) 423 { 424 unsigned long sagaw; 425 int agaw; 426 427 sagaw = __iommu_calculate_sagaw(iommu); 428 for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) { 429 if (test_bit(agaw, &sagaw)) 430 break; 431 } 432 433 return agaw; 434 } 435 436 /* 437 * Calculate max SAGAW for each iommu. 438 */ 439 int iommu_calculate_max_sagaw(struct intel_iommu *iommu) 440 { 441 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH); 442 } 443 444 /* 445 * calculate agaw for each iommu. 446 * "SAGAW" may be different across iommus, use a default agaw, and 447 * get a supported less agaw for iommus that don't support the default agaw. 448 */ 449 int iommu_calculate_agaw(struct intel_iommu *iommu) 450 { 451 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH); 452 } 453 454 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu) 455 { 456 return sm_supported(iommu) ? 
457 ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap); 458 } 459 460 static void domain_update_iommu_coherency(struct dmar_domain *domain) 461 { 462 struct iommu_domain_info *info; 463 struct dmar_drhd_unit *drhd; 464 struct intel_iommu *iommu; 465 bool found = false; 466 unsigned long i; 467 468 domain->iommu_coherency = true; 469 xa_for_each(&domain->iommu_array, i, info) { 470 found = true; 471 if (!iommu_paging_structure_coherency(info->iommu)) { 472 domain->iommu_coherency = false; 473 break; 474 } 475 } 476 if (found) 477 return; 478 479 /* No hardware attached; use lowest common denominator */ 480 rcu_read_lock(); 481 for_each_active_iommu(iommu, drhd) { 482 if (!iommu_paging_structure_coherency(iommu)) { 483 domain->iommu_coherency = false; 484 break; 485 } 486 } 487 rcu_read_unlock(); 488 } 489 490 static int domain_update_iommu_superpage(struct dmar_domain *domain, 491 struct intel_iommu *skip) 492 { 493 struct dmar_drhd_unit *drhd; 494 struct intel_iommu *iommu; 495 int mask = 0x3; 496 497 if (!intel_iommu_superpage) 498 return 0; 499 500 /* set iommu_superpage to the smallest common denominator */ 501 rcu_read_lock(); 502 for_each_active_iommu(iommu, drhd) { 503 if (iommu != skip) { 504 if (domain && domain->use_first_level) { 505 if (!cap_fl1gp_support(iommu->cap)) 506 mask = 0x1; 507 } else { 508 mask &= cap_super_page_val(iommu->cap); 509 } 510 511 if (!mask) 512 break; 513 } 514 } 515 rcu_read_unlock(); 516 517 return fls(mask); 518 } 519 520 static int domain_update_device_node(struct dmar_domain *domain) 521 { 522 struct device_domain_info *info; 523 int nid = NUMA_NO_NODE; 524 unsigned long flags; 525 526 spin_lock_irqsave(&domain->lock, flags); 527 list_for_each_entry(info, &domain->devices, link) { 528 /* 529 * There could possibly be multiple device numa nodes as devices 530 * within the same domain may sit behind different IOMMUs. There 531 * isn't perfect answer in such situation, so we select first 532 * come first served policy. 533 */ 534 nid = dev_to_node(info->dev); 535 if (nid != NUMA_NO_NODE) 536 break; 537 } 538 spin_unlock_irqrestore(&domain->lock, flags); 539 540 return nid; 541 } 542 543 static void domain_update_iotlb(struct dmar_domain *domain); 544 545 /* Return the super pagesize bitmap if supported. */ 546 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain) 547 { 548 unsigned long bitmap = 0; 549 550 /* 551 * 1-level super page supports page size of 2MiB, 2-level super page 552 * supports page size of both 2MiB and 1GiB. 553 */ 554 if (domain->iommu_superpage == 1) 555 bitmap |= SZ_2M; 556 else if (domain->iommu_superpage == 2) 557 bitmap |= SZ_2M | SZ_1G; 558 559 return bitmap; 560 } 561 562 /* Some capabilities may be different across iommus */ 563 void domain_update_iommu_cap(struct dmar_domain *domain) 564 { 565 domain_update_iommu_coherency(domain); 566 domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL); 567 568 /* 569 * If RHSA is missing, we should default to the device numa domain 570 * as fall back. 571 */ 572 if (domain->nid == NUMA_NO_NODE) 573 domain->nid = domain_update_device_node(domain); 574 575 /* 576 * First-level translation restricts the input-address to a 577 * canonical address (i.e., address bits 63:N have the same 578 * value as address bit [N-1], where N is 48-bits with 4-level 579 * paging and 57-bits with 5-level paging). Hence, skip bit 580 * [N-1]. 
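	 * For example, a first-level domain with gaw == 48 gets an aperture
	 * ending at __DOMAIN_MAX_ADDR(47) == (1ULL << 47) - 1, i.e. only the
	 * lower canonical half of the 48-bit address space.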
581 */ 582 if (domain->use_first_level) 583 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1); 584 else 585 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw); 586 587 domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain); 588 domain_update_iotlb(domain); 589 } 590 591 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus, 592 u8 devfn, int alloc) 593 { 594 struct root_entry *root = &iommu->root_entry[bus]; 595 struct context_entry *context; 596 u64 *entry; 597 598 /* 599 * Except that the caller requested to allocate a new entry, 600 * returning a copied context entry makes no sense. 601 */ 602 if (!alloc && context_copied(iommu, bus, devfn)) 603 return NULL; 604 605 entry = &root->lo; 606 if (sm_supported(iommu)) { 607 if (devfn >= 0x80) { 608 devfn -= 0x80; 609 entry = &root->hi; 610 } 611 devfn *= 2; 612 } 613 if (*entry & 1) 614 context = phys_to_virt(*entry & VTD_PAGE_MASK); 615 else { 616 unsigned long phy_addr; 617 if (!alloc) 618 return NULL; 619 620 context = alloc_pgtable_page(iommu->node, GFP_ATOMIC); 621 if (!context) 622 return NULL; 623 624 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE); 625 phy_addr = virt_to_phys((void *)context); 626 *entry = phy_addr | 1; 627 __iommu_flush_cache(iommu, entry, sizeof(*entry)); 628 } 629 return &context[devfn]; 630 } 631 632 /** 633 * is_downstream_to_pci_bridge - test if a device belongs to the PCI 634 * sub-hierarchy of a candidate PCI-PCI bridge 635 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy 636 * @bridge: the candidate PCI-PCI bridge 637 * 638 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false. 639 */ 640 static bool 641 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge) 642 { 643 struct pci_dev *pdev, *pbridge; 644 645 if (!dev_is_pci(dev) || !dev_is_pci(bridge)) 646 return false; 647 648 pdev = to_pci_dev(dev); 649 pbridge = to_pci_dev(bridge); 650 651 if (pbridge->subordinate && 652 pbridge->subordinate->number <= pdev->bus->number && 653 pbridge->subordinate->busn_res.end >= pdev->bus->number) 654 return true; 655 656 return false; 657 } 658 659 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev) 660 { 661 struct dmar_drhd_unit *drhd; 662 u32 vtbar; 663 int rc; 664 665 /* We know that this device on this chipset has its own IOMMU. 666 * If we find it under a different IOMMU, then the BIOS is lying 667 * to us. Hope that the IOMMU for this device is actually 668 * disabled, and it needs no translation... 
669 */ 670 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar); 671 if (rc) { 672 /* "can't" happen */ 673 dev_info(&pdev->dev, "failed to run vt-d quirk\n"); 674 return false; 675 } 676 vtbar &= 0xffff0000; 677 678 /* we know that the this iommu should be at offset 0xa000 from vtbar */ 679 drhd = dmar_find_matched_drhd_unit(pdev); 680 if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) { 681 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"); 682 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); 683 return true; 684 } 685 686 return false; 687 } 688 689 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev) 690 { 691 if (!iommu || iommu->drhd->ignored) 692 return true; 693 694 if (dev_is_pci(dev)) { 695 struct pci_dev *pdev = to_pci_dev(dev); 696 697 if (pdev->vendor == PCI_VENDOR_ID_INTEL && 698 pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB && 699 quirk_ioat_snb_local_iommu(pdev)) 700 return true; 701 } 702 703 return false; 704 } 705 706 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn) 707 { 708 struct dmar_drhd_unit *drhd = NULL; 709 struct pci_dev *pdev = NULL; 710 struct intel_iommu *iommu; 711 struct device *tmp; 712 u16 segment = 0; 713 int i; 714 715 if (!dev) 716 return NULL; 717 718 if (dev_is_pci(dev)) { 719 struct pci_dev *pf_pdev; 720 721 pdev = pci_real_dma_dev(to_pci_dev(dev)); 722 723 /* VFs aren't listed in scope tables; we need to look up 724 * the PF instead to find the IOMMU. */ 725 pf_pdev = pci_physfn(pdev); 726 dev = &pf_pdev->dev; 727 segment = pci_domain_nr(pdev->bus); 728 } else if (has_acpi_companion(dev)) 729 dev = &ACPI_COMPANION(dev)->dev; 730 731 rcu_read_lock(); 732 for_each_iommu(iommu, drhd) { 733 if (pdev && segment != drhd->segment) 734 continue; 735 736 for_each_active_dev_scope(drhd->devices, 737 drhd->devices_cnt, i, tmp) { 738 if (tmp == dev) { 739 /* For a VF use its original BDF# not that of the PF 740 * which we used for the IOMMU lookup. Strictly speaking 741 * we could do this for all PCI devices; we only need to 742 * get the BDF# from the scope table for ACPI matches. 
*/ 743 if (pdev && pdev->is_virtfn) 744 goto got_pdev; 745 746 if (bus && devfn) { 747 *bus = drhd->devices[i].bus; 748 *devfn = drhd->devices[i].devfn; 749 } 750 goto out; 751 } 752 753 if (is_downstream_to_pci_bridge(dev, tmp)) 754 goto got_pdev; 755 } 756 757 if (pdev && drhd->include_all) { 758 got_pdev: 759 if (bus && devfn) { 760 *bus = pdev->bus->number; 761 *devfn = pdev->devfn; 762 } 763 goto out; 764 } 765 } 766 iommu = NULL; 767 out: 768 if (iommu_is_dummy(iommu, dev)) 769 iommu = NULL; 770 771 rcu_read_unlock(); 772 773 return iommu; 774 } 775 776 static void domain_flush_cache(struct dmar_domain *domain, 777 void *addr, int size) 778 { 779 if (!domain->iommu_coherency) 780 clflush_cache_range(addr, size); 781 } 782 783 static void free_context_table(struct intel_iommu *iommu) 784 { 785 struct context_entry *context; 786 int i; 787 788 if (!iommu->root_entry) 789 return; 790 791 for (i = 0; i < ROOT_ENTRY_NR; i++) { 792 context = iommu_context_addr(iommu, i, 0, 0); 793 if (context) 794 free_pgtable_page(context); 795 796 if (!sm_supported(iommu)) 797 continue; 798 799 context = iommu_context_addr(iommu, i, 0x80, 0); 800 if (context) 801 free_pgtable_page(context); 802 } 803 804 free_pgtable_page(iommu->root_entry); 805 iommu->root_entry = NULL; 806 } 807 808 #ifdef CONFIG_DMAR_DEBUG 809 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn, 810 u8 bus, u8 devfn, struct dma_pte *parent, int level) 811 { 812 struct dma_pte *pte; 813 int offset; 814 815 while (1) { 816 offset = pfn_level_offset(pfn, level); 817 pte = &parent[offset]; 818 if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) { 819 pr_info("PTE not present at level %d\n", level); 820 break; 821 } 822 823 pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val); 824 825 if (level == 1) 826 break; 827 828 parent = phys_to_virt(dma_pte_addr(pte)); 829 level--; 830 } 831 } 832 833 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id, 834 unsigned long long addr, u32 pasid) 835 { 836 struct pasid_dir_entry *dir, *pde; 837 struct pasid_entry *entries, *pte; 838 struct context_entry *ctx_entry; 839 struct root_entry *rt_entry; 840 int i, dir_index, index, level; 841 u8 devfn = source_id & 0xff; 842 u8 bus = source_id >> 8; 843 struct dma_pte *pgtable; 844 845 pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr); 846 847 /* root entry dump */ 848 rt_entry = &iommu->root_entry[bus]; 849 if (!rt_entry) { 850 pr_info("root table entry is not present\n"); 851 return; 852 } 853 854 if (sm_supported(iommu)) 855 pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n", 856 rt_entry->hi, rt_entry->lo); 857 else 858 pr_info("root entry: 0x%016llx", rt_entry->lo); 859 860 /* context entry dump */ 861 ctx_entry = iommu_context_addr(iommu, bus, devfn, 0); 862 if (!ctx_entry) { 863 pr_info("context table entry is not present\n"); 864 return; 865 } 866 867 pr_info("context entry: hi 0x%016llx, low 0x%016llx\n", 868 ctx_entry->hi, ctx_entry->lo); 869 870 /* legacy mode does not require PASID entries */ 871 if (!sm_supported(iommu)) { 872 level = agaw_to_level(ctx_entry->hi & 7); 873 pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK); 874 goto pgtable_walk; 875 } 876 877 /* get the pointer to pasid directory entry */ 878 dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK); 879 if (!dir) { 880 pr_info("pasid directory entry is not present\n"); 881 return; 882 } 883 /* For request-without-pasid, get the pasid from context entry */ 884 if (intel_iommu_sm && pasid == 
IOMMU_PASID_INVALID) 885 pasid = IOMMU_NO_PASID; 886 887 dir_index = pasid >> PASID_PDE_SHIFT; 888 pde = &dir[dir_index]; 889 pr_info("pasid dir entry: 0x%016llx\n", pde->val); 890 891 /* get the pointer to the pasid table entry */ 892 entries = get_pasid_table_from_pde(pde); 893 if (!entries) { 894 pr_info("pasid table entry is not present\n"); 895 return; 896 } 897 index = pasid & PASID_PTE_MASK; 898 pte = &entries[index]; 899 for (i = 0; i < ARRAY_SIZE(pte->val); i++) 900 pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]); 901 902 if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) { 903 level = pte->val[2] & BIT_ULL(2) ? 5 : 4; 904 pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK); 905 } else { 906 level = agaw_to_level((pte->val[0] >> 2) & 0x7); 907 pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK); 908 } 909 910 pgtable_walk: 911 pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level); 912 } 913 #endif 914 915 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain, 916 unsigned long pfn, int *target_level, 917 gfp_t gfp) 918 { 919 struct dma_pte *parent, *pte; 920 int level = agaw_to_level(domain->agaw); 921 int offset; 922 923 if (!domain_pfn_supported(domain, pfn)) 924 /* Address beyond IOMMU's addressing capabilities. */ 925 return NULL; 926 927 parent = domain->pgd; 928 929 while (1) { 930 void *tmp_page; 931 932 offset = pfn_level_offset(pfn, level); 933 pte = &parent[offset]; 934 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte))) 935 break; 936 if (level == *target_level) 937 break; 938 939 if (!dma_pte_present(pte)) { 940 uint64_t pteval; 941 942 tmp_page = alloc_pgtable_page(domain->nid, gfp); 943 944 if (!tmp_page) 945 return NULL; 946 947 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE); 948 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE; 949 if (domain->use_first_level) 950 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS; 951 952 if (cmpxchg64(&pte->val, 0ULL, pteval)) 953 /* Someone else set it while we were thinking; use theirs. 
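			   cmpxchg64() returns the previous value, so a non-zero result
			   means another thread already installed a table here and the
			   page we just allocated is not needed.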
*/ 954 free_pgtable_page(tmp_page); 955 else 956 domain_flush_cache(domain, pte, sizeof(*pte)); 957 } 958 if (level == 1) 959 break; 960 961 parent = phys_to_virt(dma_pte_addr(pte)); 962 level--; 963 } 964 965 if (!*target_level) 966 *target_level = level; 967 968 return pte; 969 } 970 971 /* return address's pte at specific level */ 972 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain, 973 unsigned long pfn, 974 int level, int *large_page) 975 { 976 struct dma_pte *parent, *pte; 977 int total = agaw_to_level(domain->agaw); 978 int offset; 979 980 parent = domain->pgd; 981 while (level <= total) { 982 offset = pfn_level_offset(pfn, total); 983 pte = &parent[offset]; 984 if (level == total) 985 return pte; 986 987 if (!dma_pte_present(pte)) { 988 *large_page = total; 989 break; 990 } 991 992 if (dma_pte_superpage(pte)) { 993 *large_page = total; 994 return pte; 995 } 996 997 parent = phys_to_virt(dma_pte_addr(pte)); 998 total--; 999 } 1000 return NULL; 1001 } 1002 1003 /* clear last level pte, a tlb flush should be followed */ 1004 static void dma_pte_clear_range(struct dmar_domain *domain, 1005 unsigned long start_pfn, 1006 unsigned long last_pfn) 1007 { 1008 unsigned int large_page; 1009 struct dma_pte *first_pte, *pte; 1010 1011 if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) || 1012 WARN_ON(start_pfn > last_pfn)) 1013 return; 1014 1015 /* we don't need lock here; nobody else touches the iova range */ 1016 do { 1017 large_page = 1; 1018 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page); 1019 if (!pte) { 1020 start_pfn = align_to_level(start_pfn + 1, large_page + 1); 1021 continue; 1022 } 1023 do { 1024 dma_clear_pte(pte); 1025 start_pfn += lvl_to_nr_pages(large_page); 1026 pte++; 1027 } while (start_pfn <= last_pfn && !first_pte_in_page(pte)); 1028 1029 domain_flush_cache(domain, first_pte, 1030 (void *)pte - (void *)first_pte); 1031 1032 } while (start_pfn && start_pfn <= last_pfn); 1033 } 1034 1035 static void dma_pte_free_level(struct dmar_domain *domain, int level, 1036 int retain_level, struct dma_pte *pte, 1037 unsigned long pfn, unsigned long start_pfn, 1038 unsigned long last_pfn) 1039 { 1040 pfn = max(start_pfn, pfn); 1041 pte = &pte[pfn_level_offset(pfn, level)]; 1042 1043 do { 1044 unsigned long level_pfn; 1045 struct dma_pte *level_pte; 1046 1047 if (!dma_pte_present(pte) || dma_pte_superpage(pte)) 1048 goto next; 1049 1050 level_pfn = pfn & level_mask(level); 1051 level_pte = phys_to_virt(dma_pte_addr(pte)); 1052 1053 if (level > 2) { 1054 dma_pte_free_level(domain, level - 1, retain_level, 1055 level_pte, level_pfn, start_pfn, 1056 last_pfn); 1057 } 1058 1059 /* 1060 * Free the page table if we're below the level we want to 1061 * retain and the range covers the entire table. 1062 */ 1063 if (level < retain_level && !(start_pfn > level_pfn || 1064 last_pfn < level_pfn + level_size(level) - 1)) { 1065 dma_clear_pte(pte); 1066 domain_flush_cache(domain, pte, sizeof(*pte)); 1067 free_pgtable_page(level_pte); 1068 } 1069 next: 1070 pfn += level_size(level); 1071 } while (!first_pte_in_page(++pte) && pfn <= last_pfn); 1072 } 1073 1074 /* 1075 * clear last level (leaf) ptes and free page table pages below the 1076 * level we wish to keep intact. 
1077 */ 1078 static void dma_pte_free_pagetable(struct dmar_domain *domain, 1079 unsigned long start_pfn, 1080 unsigned long last_pfn, 1081 int retain_level) 1082 { 1083 dma_pte_clear_range(domain, start_pfn, last_pfn); 1084 1085 /* We don't need lock here; nobody else touches the iova range */ 1086 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level, 1087 domain->pgd, 0, start_pfn, last_pfn); 1088 1089 /* free pgd */ 1090 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) { 1091 free_pgtable_page(domain->pgd); 1092 domain->pgd = NULL; 1093 } 1094 } 1095 1096 /* When a page at a given level is being unlinked from its parent, we don't 1097 need to *modify* it at all. All we need to do is make a list of all the 1098 pages which can be freed just as soon as we've flushed the IOTLB and we 1099 know the hardware page-walk will no longer touch them. 1100 The 'pte' argument is the *parent* PTE, pointing to the page that is to 1101 be freed. */ 1102 static void dma_pte_list_pagetables(struct dmar_domain *domain, 1103 int level, struct dma_pte *pte, 1104 struct list_head *freelist) 1105 { 1106 struct page *pg; 1107 1108 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT); 1109 list_add_tail(&pg->lru, freelist); 1110 1111 if (level == 1) 1112 return; 1113 1114 pte = page_address(pg); 1115 do { 1116 if (dma_pte_present(pte) && !dma_pte_superpage(pte)) 1117 dma_pte_list_pagetables(domain, level - 1, pte, freelist); 1118 pte++; 1119 } while (!first_pte_in_page(pte)); 1120 } 1121 1122 static void dma_pte_clear_level(struct dmar_domain *domain, int level, 1123 struct dma_pte *pte, unsigned long pfn, 1124 unsigned long start_pfn, unsigned long last_pfn, 1125 struct list_head *freelist) 1126 { 1127 struct dma_pte *first_pte = NULL, *last_pte = NULL; 1128 1129 pfn = max(start_pfn, pfn); 1130 pte = &pte[pfn_level_offset(pfn, level)]; 1131 1132 do { 1133 unsigned long level_pfn = pfn & level_mask(level); 1134 1135 if (!dma_pte_present(pte)) 1136 goto next; 1137 1138 /* If range covers entire pagetable, free it */ 1139 if (start_pfn <= level_pfn && 1140 last_pfn >= level_pfn + level_size(level) - 1) { 1141 /* These suborbinate page tables are going away entirely. Don't 1142 bother to clear them; we're just going to *free* them. */ 1143 if (level > 1 && !dma_pte_superpage(pte)) 1144 dma_pte_list_pagetables(domain, level - 1, pte, freelist); 1145 1146 dma_clear_pte(pte); 1147 if (!first_pte) 1148 first_pte = pte; 1149 last_pte = pte; 1150 } else if (level > 1) { 1151 /* Recurse down into a level that isn't *entirely* obsolete */ 1152 dma_pte_clear_level(domain, level - 1, 1153 phys_to_virt(dma_pte_addr(pte)), 1154 level_pfn, start_pfn, last_pfn, 1155 freelist); 1156 } 1157 next: 1158 pfn = level_pfn + level_size(level); 1159 } while (!first_pte_in_page(++pte) && pfn <= last_pfn); 1160 1161 if (first_pte) 1162 domain_flush_cache(domain, first_pte, 1163 (void *)++last_pte - (void *)first_pte); 1164 } 1165 1166 /* We can't just free the pages because the IOMMU may still be walking 1167 the page tables, and may have cached the intermediate levels. The 1168 pages can only be freed after the IOTLB flush has been done. 
*/ 1169 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn, 1170 unsigned long last_pfn, struct list_head *freelist) 1171 { 1172 if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) || 1173 WARN_ON(start_pfn > last_pfn)) 1174 return; 1175 1176 /* we don't need lock here; nobody else touches the iova range */ 1177 dma_pte_clear_level(domain, agaw_to_level(domain->agaw), 1178 domain->pgd, 0, start_pfn, last_pfn, freelist); 1179 1180 /* free pgd */ 1181 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) { 1182 struct page *pgd_page = virt_to_page(domain->pgd); 1183 list_add_tail(&pgd_page->lru, freelist); 1184 domain->pgd = NULL; 1185 } 1186 } 1187 1188 /* iommu handling */ 1189 static int iommu_alloc_root_entry(struct intel_iommu *iommu) 1190 { 1191 struct root_entry *root; 1192 1193 root = alloc_pgtable_page(iommu->node, GFP_ATOMIC); 1194 if (!root) { 1195 pr_err("Allocating root entry for %s failed\n", 1196 iommu->name); 1197 return -ENOMEM; 1198 } 1199 1200 __iommu_flush_cache(iommu, root, ROOT_SIZE); 1201 iommu->root_entry = root; 1202 1203 return 0; 1204 } 1205 1206 static void iommu_set_root_entry(struct intel_iommu *iommu) 1207 { 1208 u64 addr; 1209 u32 sts; 1210 unsigned long flag; 1211 1212 addr = virt_to_phys(iommu->root_entry); 1213 if (sm_supported(iommu)) 1214 addr |= DMA_RTADDR_SMT; 1215 1216 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1217 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr); 1218 1219 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG); 1220 1221 /* Make sure hardware complete it */ 1222 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1223 readl, (sts & DMA_GSTS_RTPS), sts); 1224 1225 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1226 1227 /* 1228 * Hardware invalidates all DMA remapping hardware translation 1229 * caches as part of SRTP flow. 
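	 * This is advertised by the ESRTPS capability checked below; without
	 * it, the context, PASID and IOTLB caches are flushed explicitly.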
1230 */ 1231 if (cap_esrtps(iommu->cap)) 1232 return; 1233 1234 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL); 1235 if (sm_supported(iommu)) 1236 qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0); 1237 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); 1238 } 1239 1240 void iommu_flush_write_buffer(struct intel_iommu *iommu) 1241 { 1242 u32 val; 1243 unsigned long flag; 1244 1245 if (!rwbf_quirk && !cap_rwbf(iommu->cap)) 1246 return; 1247 1248 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1249 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG); 1250 1251 /* Make sure hardware complete it */ 1252 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1253 readl, (!(val & DMA_GSTS_WBFS)), val); 1254 1255 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1256 } 1257 1258 /* return value determine if we need a write buffer flush */ 1259 static void __iommu_flush_context(struct intel_iommu *iommu, 1260 u16 did, u16 source_id, u8 function_mask, 1261 u64 type) 1262 { 1263 u64 val = 0; 1264 unsigned long flag; 1265 1266 switch (type) { 1267 case DMA_CCMD_GLOBAL_INVL: 1268 val = DMA_CCMD_GLOBAL_INVL; 1269 break; 1270 case DMA_CCMD_DOMAIN_INVL: 1271 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did); 1272 break; 1273 case DMA_CCMD_DEVICE_INVL: 1274 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did) 1275 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask); 1276 break; 1277 default: 1278 pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n", 1279 iommu->name, type); 1280 return; 1281 } 1282 val |= DMA_CCMD_ICC; 1283 1284 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1285 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val); 1286 1287 /* Make sure hardware complete it */ 1288 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG, 1289 dmar_readq, (!(val & DMA_CCMD_ICC)), val); 1290 1291 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1292 } 1293 1294 /* return value determine if we need a write buffer flush */ 1295 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, 1296 u64 addr, unsigned int size_order, u64 type) 1297 { 1298 int tlb_offset = ecap_iotlb_offset(iommu->ecap); 1299 u64 val = 0, val_iva = 0; 1300 unsigned long flag; 1301 1302 switch (type) { 1303 case DMA_TLB_GLOBAL_FLUSH: 1304 /* global flush doesn't need set IVA_REG */ 1305 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT; 1306 break; 1307 case DMA_TLB_DSI_FLUSH: 1308 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did); 1309 break; 1310 case DMA_TLB_PSI_FLUSH: 1311 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did); 1312 /* IH bit is passed in as part of address */ 1313 val_iva = size_order | addr; 1314 break; 1315 default: 1316 pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n", 1317 iommu->name, type); 1318 return; 1319 } 1320 1321 if (cap_write_drain(iommu->cap)) 1322 val |= DMA_TLB_WRITE_DRAIN; 1323 1324 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1325 /* Note: Only uses first TLB reg currently */ 1326 if (val_iva) 1327 dmar_writeq(iommu->reg + tlb_offset, val_iva); 1328 dmar_writeq(iommu->reg + tlb_offset + 8, val); 1329 1330 /* Make sure hardware complete it */ 1331 IOMMU_WAIT_OP(iommu, tlb_offset + 8, 1332 dmar_readq, (!(val & DMA_TLB_IVT)), val); 1333 1334 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1335 1336 /* check IOTLB invalidation granularity */ 1337 if (DMA_TLB_IAIG(val) == 0) 1338 pr_err("Flush IOTLB failed\n"); 1339 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type)) 1340 pr_debug("TLB flush request %Lx, actual %Lx\n", 1341 (unsigned long long)DMA_TLB_IIRG(type), 
1342 (unsigned long long)DMA_TLB_IAIG(val)); 1343 } 1344 1345 static struct device_domain_info * 1346 domain_lookup_dev_info(struct dmar_domain *domain, 1347 struct intel_iommu *iommu, u8 bus, u8 devfn) 1348 { 1349 struct device_domain_info *info; 1350 unsigned long flags; 1351 1352 spin_lock_irqsave(&domain->lock, flags); 1353 list_for_each_entry(info, &domain->devices, link) { 1354 if (info->iommu == iommu && info->bus == bus && 1355 info->devfn == devfn) { 1356 spin_unlock_irqrestore(&domain->lock, flags); 1357 return info; 1358 } 1359 } 1360 spin_unlock_irqrestore(&domain->lock, flags); 1361 1362 return NULL; 1363 } 1364 1365 static void domain_update_iotlb(struct dmar_domain *domain) 1366 { 1367 struct dev_pasid_info *dev_pasid; 1368 struct device_domain_info *info; 1369 bool has_iotlb_device = false; 1370 unsigned long flags; 1371 1372 spin_lock_irqsave(&domain->lock, flags); 1373 list_for_each_entry(info, &domain->devices, link) { 1374 if (info->ats_enabled) { 1375 has_iotlb_device = true; 1376 break; 1377 } 1378 } 1379 1380 list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) { 1381 info = dev_iommu_priv_get(dev_pasid->dev); 1382 if (info->ats_enabled) { 1383 has_iotlb_device = true; 1384 break; 1385 } 1386 } 1387 domain->has_iotlb_device = has_iotlb_device; 1388 spin_unlock_irqrestore(&domain->lock, flags); 1389 } 1390 1391 /* 1392 * The extra devTLB flush quirk impacts those QAT devices with PCI device 1393 * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device() 1394 * check because it applies only to the built-in QAT devices and it doesn't 1395 * grant additional privileges. 1396 */ 1397 #define BUGGY_QAT_DEVID_MASK 0x4940 1398 static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev) 1399 { 1400 if (pdev->vendor != PCI_VENDOR_ID_INTEL) 1401 return false; 1402 1403 if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK) 1404 return false; 1405 1406 return true; 1407 } 1408 1409 static void iommu_enable_pci_caps(struct device_domain_info *info) 1410 { 1411 struct pci_dev *pdev; 1412 1413 if (!dev_is_pci(info->dev)) 1414 return; 1415 1416 pdev = to_pci_dev(info->dev); 1417 1418 /* The PCIe spec, in its wisdom, declares that the behaviour of 1419 the device if you enable PASID support after ATS support is 1420 undefined. So always enable PASID support on devices which 1421 have it, even if we can't yet know if we're ever going to 1422 use it. 
*/ 1423 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1)) 1424 info->pasid_enabled = 1; 1425 1426 if (info->ats_supported && pci_ats_page_aligned(pdev) && 1427 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) { 1428 info->ats_enabled = 1; 1429 domain_update_iotlb(info->domain); 1430 } 1431 } 1432 1433 static void iommu_disable_pci_caps(struct device_domain_info *info) 1434 { 1435 struct pci_dev *pdev; 1436 1437 if (!dev_is_pci(info->dev)) 1438 return; 1439 1440 pdev = to_pci_dev(info->dev); 1441 1442 if (info->ats_enabled) { 1443 pci_disable_ats(pdev); 1444 info->ats_enabled = 0; 1445 domain_update_iotlb(info->domain); 1446 } 1447 1448 if (info->pasid_enabled) { 1449 pci_disable_pasid(pdev); 1450 info->pasid_enabled = 0; 1451 } 1452 } 1453 1454 static void __iommu_flush_dev_iotlb(struct device_domain_info *info, 1455 u64 addr, unsigned int mask) 1456 { 1457 u16 sid, qdep; 1458 1459 if (!info || !info->ats_enabled) 1460 return; 1461 1462 sid = info->bus << 8 | info->devfn; 1463 qdep = info->ats_qdep; 1464 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid, 1465 qdep, addr, mask); 1466 quirk_extra_dev_tlb_flush(info, addr, mask, IOMMU_NO_PASID, qdep); 1467 } 1468 1469 static void iommu_flush_dev_iotlb(struct dmar_domain *domain, 1470 u64 addr, unsigned mask) 1471 { 1472 struct dev_pasid_info *dev_pasid; 1473 struct device_domain_info *info; 1474 unsigned long flags; 1475 1476 if (!domain->has_iotlb_device) 1477 return; 1478 1479 spin_lock_irqsave(&domain->lock, flags); 1480 list_for_each_entry(info, &domain->devices, link) 1481 __iommu_flush_dev_iotlb(info, addr, mask); 1482 1483 list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) { 1484 info = dev_iommu_priv_get(dev_pasid->dev); 1485 1486 if (!info->ats_enabled) 1487 continue; 1488 1489 qi_flush_dev_iotlb_pasid(info->iommu, 1490 PCI_DEVID(info->bus, info->devfn), 1491 info->pfsid, dev_pasid->pasid, 1492 info->ats_qdep, addr, 1493 mask); 1494 } 1495 spin_unlock_irqrestore(&domain->lock, flags); 1496 } 1497 1498 static void domain_flush_pasid_iotlb(struct intel_iommu *iommu, 1499 struct dmar_domain *domain, u64 addr, 1500 unsigned long npages, bool ih) 1501 { 1502 u16 did = domain_id_iommu(domain, iommu); 1503 struct dev_pasid_info *dev_pasid; 1504 unsigned long flags; 1505 1506 spin_lock_irqsave(&domain->lock, flags); 1507 list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) 1508 qi_flush_piotlb(iommu, did, dev_pasid->pasid, addr, npages, ih); 1509 1510 if (!list_empty(&domain->devices)) 1511 qi_flush_piotlb(iommu, did, IOMMU_NO_PASID, addr, npages, ih); 1512 spin_unlock_irqrestore(&domain->lock, flags); 1513 } 1514 1515 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, 1516 struct dmar_domain *domain, 1517 unsigned long pfn, unsigned int pages, 1518 int ih, int map) 1519 { 1520 unsigned int aligned_pages = __roundup_pow_of_two(pages); 1521 unsigned int mask = ilog2(aligned_pages); 1522 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT; 1523 u16 did = domain_id_iommu(domain, iommu); 1524 1525 if (WARN_ON(!pages)) 1526 return; 1527 1528 if (ih) 1529 ih = 1 << 6; 1530 1531 if (domain->use_first_level) { 1532 domain_flush_pasid_iotlb(iommu, domain, addr, pages, ih); 1533 } else { 1534 unsigned long bitmask = aligned_pages - 1; 1535 1536 /* 1537 * PSI masks the low order bits of the base address. If the 1538 * address isn't aligned to the mask, then compute a mask value 1539 * needed to ensure the target range is flushed. 
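		 * e.g. pfn == 0x3 with pages == 2 gives end_pfn == 0x4; the lowest
		 * bit above the alignment bits on which pfn and end_pfn agree is
		 * bit 3, so the flush uses mask == 3 and covers pfns 0x0-0x7.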
1540 */ 1541 if (unlikely(bitmask & pfn)) { 1542 unsigned long end_pfn = pfn + pages - 1, shared_bits; 1543 1544 /* 1545 * Since end_pfn <= pfn + bitmask, the only way bits 1546 * higher than bitmask can differ in pfn and end_pfn is 1547 * by carrying. This means after masking out bitmask, 1548 * high bits starting with the first set bit in 1549 * shared_bits are all equal in both pfn and end_pfn. 1550 */ 1551 shared_bits = ~(pfn ^ end_pfn) & ~bitmask; 1552 mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG; 1553 } 1554 1555 /* 1556 * Fallback to domain selective flush if no PSI support or 1557 * the size is too big. 1558 */ 1559 if (!cap_pgsel_inv(iommu->cap) || 1560 mask > cap_max_amask_val(iommu->cap)) 1561 iommu->flush.flush_iotlb(iommu, did, 0, 0, 1562 DMA_TLB_DSI_FLUSH); 1563 else 1564 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask, 1565 DMA_TLB_PSI_FLUSH); 1566 } 1567 1568 /* 1569 * In caching mode, changes of pages from non-present to present require 1570 * flush. However, device IOTLB doesn't need to be flushed in this case. 1571 */ 1572 if (!cap_caching_mode(iommu->cap) || !map) 1573 iommu_flush_dev_iotlb(domain, addr, mask); 1574 } 1575 1576 /* Notification for newly created mappings */ 1577 static inline void __mapping_notify_one(struct intel_iommu *iommu, 1578 struct dmar_domain *domain, 1579 unsigned long pfn, unsigned int pages) 1580 { 1581 /* 1582 * It's a non-present to present mapping. Only flush if caching mode 1583 * and second level. 1584 */ 1585 if (cap_caching_mode(iommu->cap) && !domain->use_first_level) 1586 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1); 1587 else 1588 iommu_flush_write_buffer(iommu); 1589 } 1590 1591 static void intel_flush_iotlb_all(struct iommu_domain *domain) 1592 { 1593 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 1594 struct iommu_domain_info *info; 1595 unsigned long idx; 1596 1597 xa_for_each(&dmar_domain->iommu_array, idx, info) { 1598 struct intel_iommu *iommu = info->iommu; 1599 u16 did = domain_id_iommu(dmar_domain, iommu); 1600 1601 if (dmar_domain->use_first_level) 1602 domain_flush_pasid_iotlb(iommu, dmar_domain, 0, -1, 0); 1603 else 1604 iommu->flush.flush_iotlb(iommu, did, 0, 0, 1605 DMA_TLB_DSI_FLUSH); 1606 1607 if (!cap_caching_mode(iommu->cap)) 1608 iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH); 1609 } 1610 } 1611 1612 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu) 1613 { 1614 u32 pmen; 1615 unsigned long flags; 1616 1617 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap)) 1618 return; 1619 1620 raw_spin_lock_irqsave(&iommu->register_lock, flags); 1621 pmen = readl(iommu->reg + DMAR_PMEN_REG); 1622 pmen &= ~DMA_PMEN_EPM; 1623 writel(pmen, iommu->reg + DMAR_PMEN_REG); 1624 1625 /* wait for the protected region status bit to clear */ 1626 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG, 1627 readl, !(pmen & DMA_PMEN_PRS), pmen); 1628 1629 raw_spin_unlock_irqrestore(&iommu->register_lock, flags); 1630 } 1631 1632 static void iommu_enable_translation(struct intel_iommu *iommu) 1633 { 1634 u32 sts; 1635 unsigned long flags; 1636 1637 raw_spin_lock_irqsave(&iommu->register_lock, flags); 1638 iommu->gcmd |= DMA_GCMD_TE; 1639 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); 1640 1641 /* Make sure hardware complete it */ 1642 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1643 readl, (sts & DMA_GSTS_TES), sts); 1644 1645 raw_spin_unlock_irqrestore(&iommu->register_lock, flags); 1646 } 1647 1648 static void iommu_disable_translation(struct intel_iommu *iommu) 1649 { 1650 u32 sts; 1651 
unsigned long flag; 1652 1653 if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated && 1654 (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap))) 1655 return; 1656 1657 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1658 iommu->gcmd &= ~DMA_GCMD_TE; 1659 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); 1660 1661 /* Make sure hardware complete it */ 1662 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1663 readl, (!(sts & DMA_GSTS_TES)), sts); 1664 1665 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1666 } 1667 1668 static int iommu_init_domains(struct intel_iommu *iommu) 1669 { 1670 u32 ndomains; 1671 1672 ndomains = cap_ndoms(iommu->cap); 1673 pr_debug("%s: Number of Domains supported <%d>\n", 1674 iommu->name, ndomains); 1675 1676 spin_lock_init(&iommu->lock); 1677 1678 iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL); 1679 if (!iommu->domain_ids) 1680 return -ENOMEM; 1681 1682 /* 1683 * If Caching mode is set, then invalid translations are tagged 1684 * with domain-id 0, hence we need to pre-allocate it. We also 1685 * use domain-id 0 as a marker for non-allocated domain-id, so 1686 * make sure it is not used for a real domain. 1687 */ 1688 set_bit(0, iommu->domain_ids); 1689 1690 /* 1691 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid 1692 * entry for first-level or pass-through translation modes should 1693 * be programmed with a domain id different from those used for 1694 * second-level or nested translation. We reserve a domain id for 1695 * this purpose. 1696 */ 1697 if (sm_supported(iommu)) 1698 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids); 1699 1700 return 0; 1701 } 1702 1703 static void disable_dmar_iommu(struct intel_iommu *iommu) 1704 { 1705 if (!iommu->domain_ids) 1706 return; 1707 1708 /* 1709 * All iommu domains must have been detached from the devices, 1710 * hence there should be no domain IDs in use. 1711 */ 1712 if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap)) 1713 > NUM_RESERVED_DID)) 1714 return; 1715 1716 if (iommu->gcmd & DMA_GCMD_TE) 1717 iommu_disable_translation(iommu); 1718 } 1719 1720 static void free_dmar_iommu(struct intel_iommu *iommu) 1721 { 1722 if (iommu->domain_ids) { 1723 bitmap_free(iommu->domain_ids); 1724 iommu->domain_ids = NULL; 1725 } 1726 1727 if (iommu->copied_tables) { 1728 bitmap_free(iommu->copied_tables); 1729 iommu->copied_tables = NULL; 1730 } 1731 1732 /* free context mapping */ 1733 free_context_table(iommu); 1734 1735 #ifdef CONFIG_INTEL_IOMMU_SVM 1736 if (pasid_supported(iommu)) { 1737 if (ecap_prs(iommu->ecap)) 1738 intel_svm_finish_prq(iommu); 1739 } 1740 #endif 1741 } 1742 1743 /* 1744 * Check and return whether first level is used by default for 1745 * DMA translation. 
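 * First level requires scalable mode. When both levels are usable,
 * second level is kept for IOMMU_DOMAIN_UNMANAGED domains and first
 * level is used for everything else.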
1746 */ 1747 static bool first_level_by_default(unsigned int type) 1748 { 1749 /* Only SL is available in legacy mode */ 1750 if (!scalable_mode_support()) 1751 return false; 1752 1753 /* Only level (either FL or SL) is available, just use it */ 1754 if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity()) 1755 return intel_cap_flts_sanity(); 1756 1757 /* Both levels are available, decide it based on domain type */ 1758 return type != IOMMU_DOMAIN_UNMANAGED; 1759 } 1760 1761 static struct dmar_domain *alloc_domain(unsigned int type) 1762 { 1763 struct dmar_domain *domain; 1764 1765 domain = kzalloc(sizeof(*domain), GFP_KERNEL); 1766 if (!domain) 1767 return NULL; 1768 1769 domain->nid = NUMA_NO_NODE; 1770 if (first_level_by_default(type)) 1771 domain->use_first_level = true; 1772 domain->has_iotlb_device = false; 1773 INIT_LIST_HEAD(&domain->devices); 1774 INIT_LIST_HEAD(&domain->dev_pasids); 1775 spin_lock_init(&domain->lock); 1776 xa_init(&domain->iommu_array); 1777 1778 return domain; 1779 } 1780 1781 int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) 1782 { 1783 struct iommu_domain_info *info, *curr; 1784 unsigned long ndomains; 1785 int num, ret = -ENOSPC; 1786 1787 info = kzalloc(sizeof(*info), GFP_KERNEL); 1788 if (!info) 1789 return -ENOMEM; 1790 1791 spin_lock(&iommu->lock); 1792 curr = xa_load(&domain->iommu_array, iommu->seq_id); 1793 if (curr) { 1794 curr->refcnt++; 1795 spin_unlock(&iommu->lock); 1796 kfree(info); 1797 return 0; 1798 } 1799 1800 ndomains = cap_ndoms(iommu->cap); 1801 num = find_first_zero_bit(iommu->domain_ids, ndomains); 1802 if (num >= ndomains) { 1803 pr_err("%s: No free domain ids\n", iommu->name); 1804 goto err_unlock; 1805 } 1806 1807 set_bit(num, iommu->domain_ids); 1808 info->refcnt = 1; 1809 info->did = num; 1810 info->iommu = iommu; 1811 curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id, 1812 NULL, info, GFP_ATOMIC); 1813 if (curr) { 1814 ret = xa_err(curr) ? : -EBUSY; 1815 goto err_clear; 1816 } 1817 domain_update_iommu_cap(domain); 1818 1819 spin_unlock(&iommu->lock); 1820 return 0; 1821 1822 err_clear: 1823 clear_bit(info->did, iommu->domain_ids); 1824 err_unlock: 1825 spin_unlock(&iommu->lock); 1826 kfree(info); 1827 return ret; 1828 } 1829 1830 void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) 1831 { 1832 struct iommu_domain_info *info; 1833 1834 spin_lock(&iommu->lock); 1835 info = xa_load(&domain->iommu_array, iommu->seq_id); 1836 if (--info->refcnt == 0) { 1837 clear_bit(info->did, iommu->domain_ids); 1838 xa_erase(&domain->iommu_array, iommu->seq_id); 1839 domain->nid = NUMA_NO_NODE; 1840 domain_update_iommu_cap(domain); 1841 kfree(info); 1842 } 1843 spin_unlock(&iommu->lock); 1844 } 1845 1846 static inline int guestwidth_to_adjustwidth(int gaw) 1847 { 1848 int agaw; 1849 int r = (gaw - 12) % 9; 1850 1851 if (r == 0) 1852 agaw = gaw; 1853 else 1854 agaw = gaw + 9 - r; 1855 if (agaw > 64) 1856 agaw = 64; 1857 return agaw; 1858 } 1859 1860 static void domain_exit(struct dmar_domain *domain) 1861 { 1862 if (domain->pgd) { 1863 LIST_HEAD(freelist); 1864 1865 domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist); 1866 put_pages_list(&freelist); 1867 } 1868 1869 if (WARN_ON(!list_empty(&domain->devices))) 1870 return; 1871 1872 kfree(domain); 1873 } 1874 1875 /* 1876 * Get the PASID directory size for scalable mode context entry. 1877 * Value of X in the PDTS field of a scalable mode context entry 1878 * indicates PASID directory with 2^(X + 7) entries. 
1879 */ 1880 static inline unsigned long context_get_sm_pds(struct pasid_table *table) 1881 { 1882 unsigned long pds, max_pde; 1883 1884 max_pde = table->max_pasid >> PASID_PDE_SHIFT; 1885 pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS); 1886 if (pds < 7) 1887 return 0; 1888 1889 return pds - 7; 1890 } 1891 1892 /* 1893 * Set the RID_PASID field of a scalable mode context entry. The 1894 * IOMMU hardware will use the PASID value set in this field for 1895 * DMA translations of DMA requests without PASID. 1896 */ 1897 static inline void 1898 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid) 1899 { 1900 context->hi |= pasid & ((1 << 20) - 1); 1901 } 1902 1903 /* 1904 * Set the DTE(Device-TLB Enable) field of a scalable mode context 1905 * entry. 1906 */ 1907 static inline void context_set_sm_dte(struct context_entry *context) 1908 { 1909 context->lo |= BIT_ULL(2); 1910 } 1911 1912 /* 1913 * Set the PRE(Page Request Enable) field of a scalable mode context 1914 * entry. 1915 */ 1916 static inline void context_set_sm_pre(struct context_entry *context) 1917 { 1918 context->lo |= BIT_ULL(4); 1919 } 1920 1921 /* Convert value to context PASID directory size field coding. */ 1922 #define context_pdts(pds) (((pds) & 0x7) << 9) 1923 1924 static int domain_context_mapping_one(struct dmar_domain *domain, 1925 struct intel_iommu *iommu, 1926 struct pasid_table *table, 1927 u8 bus, u8 devfn) 1928 { 1929 struct device_domain_info *info = 1930 domain_lookup_dev_info(domain, iommu, bus, devfn); 1931 u16 did = domain_id_iommu(domain, iommu); 1932 int translation = CONTEXT_TT_MULTI_LEVEL; 1933 struct context_entry *context; 1934 int ret; 1935 1936 if (hw_pass_through && domain_type_is_si(domain)) 1937 translation = CONTEXT_TT_PASS_THROUGH; 1938 1939 pr_debug("Set context mapping for %02x:%02x.%d\n", 1940 bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); 1941 1942 spin_lock(&iommu->lock); 1943 ret = -ENOMEM; 1944 context = iommu_context_addr(iommu, bus, devfn, 1); 1945 if (!context) 1946 goto out_unlock; 1947 1948 ret = 0; 1949 if (context_present(context) && !context_copied(iommu, bus, devfn)) 1950 goto out_unlock; 1951 1952 /* 1953 * For kdump cases, old valid entries may be cached due to the 1954 * in-flight DMA and copied pgtable, but there is no unmapping 1955 * behaviour for them, thus we need an explicit cache flush for 1956 * the newly-mapped device. For kdump, at this point, the device 1957 * is supposed to finish reset at its driver probe stage, so no 1958 * in-flight DMA will exist, and we don't need to worry anymore 1959 * hereafter. 
1960 */ 1961 if (context_copied(iommu, bus, devfn)) { 1962 u16 did_old = context_domain_id(context); 1963 1964 if (did_old < cap_ndoms(iommu->cap)) { 1965 iommu->flush.flush_context(iommu, did_old, 1966 (((u16)bus) << 8) | devfn, 1967 DMA_CCMD_MASK_NOBIT, 1968 DMA_CCMD_DEVICE_INVL); 1969 iommu->flush.flush_iotlb(iommu, did_old, 0, 0, 1970 DMA_TLB_DSI_FLUSH); 1971 } 1972 1973 clear_context_copied(iommu, bus, devfn); 1974 } 1975 1976 context_clear_entry(context); 1977 1978 if (sm_supported(iommu)) { 1979 unsigned long pds; 1980 1981 /* Setup the PASID DIR pointer: */ 1982 pds = context_get_sm_pds(table); 1983 context->lo = (u64)virt_to_phys(table->table) | 1984 context_pdts(pds); 1985 1986 /* Setup the RID_PASID field: */ 1987 context_set_sm_rid2pasid(context, IOMMU_NO_PASID); 1988 1989 /* 1990 * Setup the Device-TLB enable bit and Page request 1991 * Enable bit: 1992 */ 1993 if (info && info->ats_supported) 1994 context_set_sm_dte(context); 1995 if (info && info->pri_supported) 1996 context_set_sm_pre(context); 1997 if (info && info->pasid_supported) 1998 context_set_pasid(context); 1999 } else { 2000 struct dma_pte *pgd = domain->pgd; 2001 int agaw; 2002 2003 context_set_domain_id(context, did); 2004 2005 if (translation != CONTEXT_TT_PASS_THROUGH) { 2006 /* 2007 * Skip top levels of page tables for iommu which has 2008 * less agaw than default. Unnecessary for PT mode. 2009 */ 2010 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { 2011 ret = -ENOMEM; 2012 pgd = phys_to_virt(dma_pte_addr(pgd)); 2013 if (!dma_pte_present(pgd)) 2014 goto out_unlock; 2015 } 2016 2017 if (info && info->ats_supported) 2018 translation = CONTEXT_TT_DEV_IOTLB; 2019 else 2020 translation = CONTEXT_TT_MULTI_LEVEL; 2021 2022 context_set_address_root(context, virt_to_phys(pgd)); 2023 context_set_address_width(context, agaw); 2024 } else { 2025 /* 2026 * In pass through mode, AW must be programmed to 2027 * indicate the largest AGAW value supported by 2028 * hardware. And ASR is ignored by hardware. 2029 */ 2030 context_set_address_width(context, iommu->msagaw); 2031 } 2032 2033 context_set_translation_type(context, translation); 2034 } 2035 2036 context_set_fault_enable(context); 2037 context_set_present(context); 2038 if (!ecap_coherent(iommu->ecap)) 2039 clflush_cache_range(context, sizeof(*context)); 2040 2041 /* 2042 * It's a non-present to present mapping. If hardware doesn't cache 2043 * non-present entry we only need to flush the write-buffer. 
If the 2044 * _does_ cache non-present entries, then it does so in the special 2045 * domain #0, which we have to flush: 2046 */ 2047 if (cap_caching_mode(iommu->cap)) { 2048 iommu->flush.flush_context(iommu, 0, 2049 (((u16)bus) << 8) | devfn, 2050 DMA_CCMD_MASK_NOBIT, 2051 DMA_CCMD_DEVICE_INVL); 2052 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); 2053 } else { 2054 iommu_flush_write_buffer(iommu); 2055 } 2056 2057 ret = 0; 2058 2059 out_unlock: 2060 spin_unlock(&iommu->lock); 2061 2062 return ret; 2063 } 2064 2065 struct domain_context_mapping_data { 2066 struct dmar_domain *domain; 2067 struct intel_iommu *iommu; 2068 struct pasid_table *table; 2069 }; 2070 2071 static int domain_context_mapping_cb(struct pci_dev *pdev, 2072 u16 alias, void *opaque) 2073 { 2074 struct domain_context_mapping_data *data = opaque; 2075 2076 return domain_context_mapping_one(data->domain, data->iommu, 2077 data->table, PCI_BUS_NUM(alias), 2078 alias & 0xff); 2079 } 2080 2081 static int 2082 domain_context_mapping(struct dmar_domain *domain, struct device *dev) 2083 { 2084 struct domain_context_mapping_data data; 2085 struct pasid_table *table; 2086 struct intel_iommu *iommu; 2087 u8 bus, devfn; 2088 2089 iommu = device_to_iommu(dev, &bus, &devfn); 2090 if (!iommu) 2091 return -ENODEV; 2092 2093 table = intel_pasid_get_table(dev); 2094 2095 if (!dev_is_pci(dev)) 2096 return domain_context_mapping_one(domain, iommu, table, 2097 bus, devfn); 2098 2099 data.domain = domain; 2100 data.iommu = iommu; 2101 data.table = table; 2102 2103 return pci_for_each_dma_alias(to_pci_dev(dev), 2104 &domain_context_mapping_cb, &data); 2105 } 2106 2107 /* Returns a number of VTD pages, but aligned to MM page size */ 2108 static inline unsigned long aligned_nrpages(unsigned long host_addr, 2109 size_t size) 2110 { 2111 host_addr &= ~PAGE_MASK; 2112 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT; 2113 } 2114 2115 /* Return largest possible superpage level for a given mapping */ 2116 static inline int hardware_largepage_caps(struct dmar_domain *domain, 2117 unsigned long iov_pfn, 2118 unsigned long phy_pfn, 2119 unsigned long pages) 2120 { 2121 int support, level = 1; 2122 unsigned long pfnmerge; 2123 2124 support = domain->iommu_superpage; 2125 2126 /* To use a large page, the virtual *and* physical addresses 2127 must be aligned to 2MiB/1GiB/etc. Lower bits set in either 2128 of them will mean we have to use smaller pages. So just 2129 merge them and check both at once. */ 2130 pfnmerge = iov_pfn | phy_pfn; 2131 2132 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) { 2133 pages >>= VTD_STRIDE_SHIFT; 2134 if (!pages) 2135 break; 2136 pfnmerge >>= VTD_STRIDE_SHIFT; 2137 level++; 2138 support--; 2139 } 2140 return level; 2141 } 2142 2143 /* 2144 * Ensure that old small page tables are removed to make room for superpage(s). 2145 * We're going to add new large pages, so make sure we don't remove their parent 2146 * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared. 
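* 
* For example, remapping a 2MiB-aligned range that was previously
* covered by 4KiB PTEs as a single superpage frees the leaf table
* that held those 512 small PTEs and issues a page-selective IOTLB
* flush for the range before the caller writes the large PTE.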
2147 */ 2148 static void switch_to_super_page(struct dmar_domain *domain, 2149 unsigned long start_pfn, 2150 unsigned long end_pfn, int level) 2151 { 2152 unsigned long lvl_pages = lvl_to_nr_pages(level); 2153 struct iommu_domain_info *info; 2154 struct dma_pte *pte = NULL; 2155 unsigned long i; 2156 2157 while (start_pfn <= end_pfn) { 2158 if (!pte) 2159 pte = pfn_to_dma_pte(domain, start_pfn, &level, 2160 GFP_ATOMIC); 2161 2162 if (dma_pte_present(pte)) { 2163 dma_pte_free_pagetable(domain, start_pfn, 2164 start_pfn + lvl_pages - 1, 2165 level + 1); 2166 2167 xa_for_each(&domain->iommu_array, i, info) 2168 iommu_flush_iotlb_psi(info->iommu, domain, 2169 start_pfn, lvl_pages, 2170 0, 0); 2171 } 2172 2173 pte++; 2174 start_pfn += lvl_pages; 2175 if (first_pte_in_page(pte)) 2176 pte = NULL; 2177 } 2178 } 2179 2180 static int 2181 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, 2182 unsigned long phys_pfn, unsigned long nr_pages, int prot, 2183 gfp_t gfp) 2184 { 2185 struct dma_pte *first_pte = NULL, *pte = NULL; 2186 unsigned int largepage_lvl = 0; 2187 unsigned long lvl_pages = 0; 2188 phys_addr_t pteval; 2189 u64 attr; 2190 2191 if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1))) 2192 return -EINVAL; 2193 2194 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0) 2195 return -EINVAL; 2196 2197 if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) { 2198 pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n"); 2199 return -EINVAL; 2200 } 2201 2202 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP); 2203 attr |= DMA_FL_PTE_PRESENT; 2204 if (domain->use_first_level) { 2205 attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS; 2206 if (prot & DMA_PTE_WRITE) 2207 attr |= DMA_FL_PTE_DIRTY; 2208 } 2209 2210 domain->has_mappings = true; 2211 2212 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr; 2213 2214 while (nr_pages > 0) { 2215 uint64_t tmp; 2216 2217 if (!pte) { 2218 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, 2219 phys_pfn, nr_pages); 2220 2221 pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl, 2222 gfp); 2223 if (!pte) 2224 return -ENOMEM; 2225 first_pte = pte; 2226 2227 lvl_pages = lvl_to_nr_pages(largepage_lvl); 2228 2229 /* It is large page*/ 2230 if (largepage_lvl > 1) { 2231 unsigned long end_pfn; 2232 unsigned long pages_to_remove; 2233 2234 pteval |= DMA_PTE_LARGE_PAGE; 2235 pages_to_remove = min_t(unsigned long, nr_pages, 2236 nr_pte_to_next_page(pte) * lvl_pages); 2237 end_pfn = iov_pfn + pages_to_remove - 1; 2238 switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl); 2239 } else { 2240 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE; 2241 } 2242 2243 } 2244 /* We don't need lock here, nobody else 2245 * touches the iova range 2246 */ 2247 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval); 2248 if (tmp) { 2249 static int dumps = 5; 2250 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n", 2251 iov_pfn, tmp, (unsigned long long)pteval); 2252 if (dumps) { 2253 dumps--; 2254 debug_dma_dump_mappings(NULL); 2255 } 2256 WARN_ON(1); 2257 } 2258 2259 nr_pages -= lvl_pages; 2260 iov_pfn += lvl_pages; 2261 phys_pfn += lvl_pages; 2262 pteval += lvl_pages * VTD_PAGE_SIZE; 2263 2264 /* If the next PTE would be the first in a new page, then we 2265 * need to flush the cache on the entries we've just written. 
2266 * And then we'll need to recalculate 'pte', so clear it and 2267 * let it get set again in the if (!pte) block above. 2268 * 2269 * If we're done (!nr_pages) we need to flush the cache too. 2270 * 2271 * Also if we've been setting superpages, we may need to 2272 * recalculate 'pte' and switch back to smaller pages for the 2273 * end of the mapping, if the trailing size is not enough to 2274 * use another superpage (i.e. nr_pages < lvl_pages). 2275 */ 2276 pte++; 2277 if (!nr_pages || first_pte_in_page(pte) || 2278 (largepage_lvl > 1 && nr_pages < lvl_pages)) { 2279 domain_flush_cache(domain, first_pte, 2280 (void *)pte - (void *)first_pte); 2281 pte = NULL; 2282 } 2283 } 2284 2285 return 0; 2286 } 2287 2288 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn) 2289 { 2290 struct intel_iommu *iommu = info->iommu; 2291 struct context_entry *context; 2292 u16 did_old; 2293 2294 if (!iommu) 2295 return; 2296 2297 spin_lock(&iommu->lock); 2298 context = iommu_context_addr(iommu, bus, devfn, 0); 2299 if (!context) { 2300 spin_unlock(&iommu->lock); 2301 return; 2302 } 2303 2304 if (sm_supported(iommu)) { 2305 if (hw_pass_through && domain_type_is_si(info->domain)) 2306 did_old = FLPT_DEFAULT_DID; 2307 else 2308 did_old = domain_id_iommu(info->domain, iommu); 2309 } else { 2310 did_old = context_domain_id(context); 2311 } 2312 2313 context_clear_entry(context); 2314 __iommu_flush_cache(iommu, context, sizeof(*context)); 2315 spin_unlock(&iommu->lock); 2316 iommu->flush.flush_context(iommu, 2317 did_old, 2318 (((u16)bus) << 8) | devfn, 2319 DMA_CCMD_MASK_NOBIT, 2320 DMA_CCMD_DEVICE_INVL); 2321 2322 if (sm_supported(iommu)) 2323 qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0); 2324 2325 iommu->flush.flush_iotlb(iommu, 2326 did_old, 2327 0, 2328 0, 2329 DMA_TLB_DSI_FLUSH); 2330 2331 __iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH); 2332 } 2333 2334 static int domain_setup_first_level(struct intel_iommu *iommu, 2335 struct dmar_domain *domain, 2336 struct device *dev, 2337 u32 pasid) 2338 { 2339 struct dma_pte *pgd = domain->pgd; 2340 int agaw, level; 2341 int flags = 0; 2342 2343 /* 2344 * Skip top levels of page tables for iommu which has 2345 * less agaw than default. Unnecessary for PT mode. 
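* 
* For example, if the domain was built with one more page-table
* level than this IOMMU can walk, one top-level table is skipped
* here and the remaining 4- or 5-level table is what ends up in the
* PASID entry as the first-level page-table root.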
2346 */ 2347 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { 2348 pgd = phys_to_virt(dma_pte_addr(pgd)); 2349 if (!dma_pte_present(pgd)) 2350 return -ENOMEM; 2351 } 2352 2353 level = agaw_to_level(agaw); 2354 if (level != 4 && level != 5) 2355 return -EINVAL; 2356 2357 if (level == 5) 2358 flags |= PASID_FLAG_FL5LP; 2359 2360 if (domain->force_snooping) 2361 flags |= PASID_FLAG_PAGE_SNOOP; 2362 2363 return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid, 2364 domain_id_iommu(domain, iommu), 2365 flags); 2366 } 2367 2368 static bool dev_is_real_dma_subdevice(struct device *dev) 2369 { 2370 return dev && dev_is_pci(dev) && 2371 pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev); 2372 } 2373 2374 static int iommu_domain_identity_map(struct dmar_domain *domain, 2375 unsigned long first_vpfn, 2376 unsigned long last_vpfn) 2377 { 2378 /* 2379 * RMRR range might have overlap with physical memory range, 2380 * clear it first 2381 */ 2382 dma_pte_clear_range(domain, first_vpfn, last_vpfn); 2383 2384 return __domain_mapping(domain, first_vpfn, 2385 first_vpfn, last_vpfn - first_vpfn + 1, 2386 DMA_PTE_READ|DMA_PTE_WRITE, GFP_KERNEL); 2387 } 2388 2389 static int md_domain_init(struct dmar_domain *domain, int guest_width); 2390 2391 static int __init si_domain_init(int hw) 2392 { 2393 struct dmar_rmrr_unit *rmrr; 2394 struct device *dev; 2395 int i, nid, ret; 2396 2397 si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY); 2398 if (!si_domain) 2399 return -EFAULT; 2400 2401 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { 2402 domain_exit(si_domain); 2403 si_domain = NULL; 2404 return -EFAULT; 2405 } 2406 2407 if (hw) 2408 return 0; 2409 2410 for_each_online_node(nid) { 2411 unsigned long start_pfn, end_pfn; 2412 int i; 2413 2414 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 2415 ret = iommu_domain_identity_map(si_domain, 2416 mm_to_dma_pfn_start(start_pfn), 2417 mm_to_dma_pfn_end(end_pfn)); 2418 if (ret) 2419 return ret; 2420 } 2421 } 2422 2423 /* 2424 * Identity map the RMRRs so that devices with RMRRs could also use 2425 * the si_domain. 2426 */ 2427 for_each_rmrr_units(rmrr) { 2428 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt, 2429 i, dev) { 2430 unsigned long long start = rmrr->base_address; 2431 unsigned long long end = rmrr->end_address; 2432 2433 if (WARN_ON(end < start || 2434 end >> agaw_to_width(si_domain->agaw))) 2435 continue; 2436 2437 ret = iommu_domain_identity_map(si_domain, 2438 mm_to_dma_pfn_start(start >> PAGE_SHIFT), 2439 mm_to_dma_pfn_end(end >> PAGE_SHIFT)); 2440 if (ret) 2441 return ret; 2442 } 2443 } 2444 2445 return 0; 2446 } 2447 2448 static int dmar_domain_attach_device(struct dmar_domain *domain, 2449 struct device *dev) 2450 { 2451 struct device_domain_info *info = dev_iommu_priv_get(dev); 2452 struct intel_iommu *iommu; 2453 unsigned long flags; 2454 u8 bus, devfn; 2455 int ret; 2456 2457 iommu = device_to_iommu(dev, &bus, &devfn); 2458 if (!iommu) 2459 return -ENODEV; 2460 2461 ret = domain_attach_iommu(domain, iommu); 2462 if (ret) 2463 return ret; 2464 info->domain = domain; 2465 spin_lock_irqsave(&domain->lock, flags); 2466 list_add(&info->link, &domain->devices); 2467 spin_unlock_irqrestore(&domain->lock, flags); 2468 2469 /* PASID table is mandatory for a PCI device in scalable mode. 
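* 
* Which entry gets installed below depends on the domain type: a
* pass-through entry for the identity domain, a first-level entry
* when the domain translates through first-level tables, otherwise
* a second-level entry; in all cases it is the IOMMU_NO_PASID entry
* used for DMA requests that carry no PASID.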
*/ 2470 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) { 2471 /* Setup the PASID entry for requests without PASID: */ 2472 if (hw_pass_through && domain_type_is_si(domain)) 2473 ret = intel_pasid_setup_pass_through(iommu, domain, 2474 dev, IOMMU_NO_PASID); 2475 else if (domain->use_first_level) 2476 ret = domain_setup_first_level(iommu, domain, dev, 2477 IOMMU_NO_PASID); 2478 else 2479 ret = intel_pasid_setup_second_level(iommu, domain, 2480 dev, IOMMU_NO_PASID); 2481 if (ret) { 2482 dev_err(dev, "Setup RID2PASID failed\n"); 2483 device_block_translation(dev); 2484 return ret; 2485 } 2486 } 2487 2488 ret = domain_context_mapping(domain, dev); 2489 if (ret) { 2490 dev_err(dev, "Domain context map failed\n"); 2491 device_block_translation(dev); 2492 return ret; 2493 } 2494 2495 if (sm_supported(info->iommu) || !domain_type_is_si(info->domain)) 2496 iommu_enable_pci_caps(info); 2497 2498 return 0; 2499 } 2500 2501 /** 2502 * device_rmrr_is_relaxable - Test whether the RMRR of this device 2503 * is relaxable (ie. is allowed to be not enforced under some conditions) 2504 * @dev: device handle 2505 * 2506 * We assume that PCI USB devices with RMRRs have them largely 2507 * for historical reasons and that the RMRR space is not actively used post 2508 * boot. This exclusion may change if vendors begin to abuse it. 2509 * 2510 * The same exception is made for graphics devices, with the requirement that 2511 * any use of the RMRR regions will be torn down before assigning the device 2512 * to a guest. 2513 * 2514 * Return: true if the RMRR is relaxable, false otherwise 2515 */ 2516 static bool device_rmrr_is_relaxable(struct device *dev) 2517 { 2518 struct pci_dev *pdev; 2519 2520 if (!dev_is_pci(dev)) 2521 return false; 2522 2523 pdev = to_pci_dev(dev); 2524 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev)) 2525 return true; 2526 else 2527 return false; 2528 } 2529 2530 /* 2531 * Return the required default domain type for a specific device. 2532 * 2533 * @dev: the device in query 2534 * @startup: true if this is during early boot 2535 * 2536 * Returns: 2537 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain 2538 * - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain 2539 * - 0: both identity and dynamic domains work for this device 2540 */ 2541 static int device_def_domain_type(struct device *dev) 2542 { 2543 if (dev_is_pci(dev)) { 2544 struct pci_dev *pdev = to_pci_dev(dev); 2545 2546 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev)) 2547 return IOMMU_DOMAIN_IDENTITY; 2548 2549 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev)) 2550 return IOMMU_DOMAIN_IDENTITY; 2551 } 2552 2553 return 0; 2554 } 2555 2556 static void intel_iommu_init_qi(struct intel_iommu *iommu) 2557 { 2558 /* 2559 * Start from the sane iommu hardware state. 2560 * If the queued invalidation is already initialized by us 2561 * (for example, while enabling interrupt-remapping) then 2562 * we got the things already rolling from a sane state. 2563 */ 2564 if (!iommu->qi) { 2565 /* 2566 * Clear any previous faults. 2567 */ 2568 dmar_fault(-1, iommu); 2569 /* 2570 * Disable queued invalidation if supported and already enabled 2571 * before OS handover. 
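* 
* Once the hardware is back in a known state, dmar_enable_qi()
* below picks the invalidation method: on failure the flush hooks
* fall back to the register-based __iommu_flush_context() and
* __iommu_flush_iotlb(), otherwise the queued qi_flush_context()
* and qi_flush_iotlb() variants are used.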
2572 */ 2573 dmar_disable_qi(iommu); 2574 } 2575 2576 if (dmar_enable_qi(iommu)) { 2577 /* 2578 * Queued Invalidate not enabled, use Register Based Invalidate 2579 */ 2580 iommu->flush.flush_context = __iommu_flush_context; 2581 iommu->flush.flush_iotlb = __iommu_flush_iotlb; 2582 pr_info("%s: Using Register based invalidation\n", 2583 iommu->name); 2584 } else { 2585 iommu->flush.flush_context = qi_flush_context; 2586 iommu->flush.flush_iotlb = qi_flush_iotlb; 2587 pr_info("%s: Using Queued invalidation\n", iommu->name); 2588 } 2589 } 2590 2591 static int copy_context_table(struct intel_iommu *iommu, 2592 struct root_entry *old_re, 2593 struct context_entry **tbl, 2594 int bus, bool ext) 2595 { 2596 int tbl_idx, pos = 0, idx, devfn, ret = 0, did; 2597 struct context_entry *new_ce = NULL, ce; 2598 struct context_entry *old_ce = NULL; 2599 struct root_entry re; 2600 phys_addr_t old_ce_phys; 2601 2602 tbl_idx = ext ? bus * 2 : bus; 2603 memcpy(&re, old_re, sizeof(re)); 2604 2605 for (devfn = 0; devfn < 256; devfn++) { 2606 /* First calculate the correct index */ 2607 idx = (ext ? devfn * 2 : devfn) % 256; 2608 2609 if (idx == 0) { 2610 /* First save what we may have and clean up */ 2611 if (new_ce) { 2612 tbl[tbl_idx] = new_ce; 2613 __iommu_flush_cache(iommu, new_ce, 2614 VTD_PAGE_SIZE); 2615 pos = 1; 2616 } 2617 2618 if (old_ce) 2619 memunmap(old_ce); 2620 2621 ret = 0; 2622 if (devfn < 0x80) 2623 old_ce_phys = root_entry_lctp(&re); 2624 else 2625 old_ce_phys = root_entry_uctp(&re); 2626 2627 if (!old_ce_phys) { 2628 if (ext && devfn == 0) { 2629 /* No LCTP, try UCTP */ 2630 devfn = 0x7f; 2631 continue; 2632 } else { 2633 goto out; 2634 } 2635 } 2636 2637 ret = -ENOMEM; 2638 old_ce = memremap(old_ce_phys, PAGE_SIZE, 2639 MEMREMAP_WB); 2640 if (!old_ce) 2641 goto out; 2642 2643 new_ce = alloc_pgtable_page(iommu->node, GFP_KERNEL); 2644 if (!new_ce) 2645 goto out_unmap; 2646 2647 ret = 0; 2648 } 2649 2650 /* Now copy the context entry */ 2651 memcpy(&ce, old_ce + idx, sizeof(ce)); 2652 2653 if (!context_present(&ce)) 2654 continue; 2655 2656 did = context_domain_id(&ce); 2657 if (did >= 0 && did < cap_ndoms(iommu->cap)) 2658 set_bit(did, iommu->domain_ids); 2659 2660 set_context_copied(iommu, bus, devfn); 2661 new_ce[idx] = ce; 2662 } 2663 2664 tbl[tbl_idx + pos] = new_ce; 2665 2666 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE); 2667 2668 out_unmap: 2669 memunmap(old_ce); 2670 2671 out: 2672 return ret; 2673 } 2674 2675 static int copy_translation_tables(struct intel_iommu *iommu) 2676 { 2677 struct context_entry **ctxt_tbls; 2678 struct root_entry *old_rt; 2679 phys_addr_t old_rt_phys; 2680 int ctxt_table_entries; 2681 u64 rtaddr_reg; 2682 int bus, ret; 2683 bool new_ext, ext; 2684 2685 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG); 2686 ext = !!(rtaddr_reg & DMA_RTADDR_SMT); 2687 new_ext = !!sm_supported(iommu); 2688 2689 /* 2690 * The RTT bit can only be changed when translation is disabled, 2691 * but disabling translation means to open a window for data 2692 * corruption. So bail out and don't copy anything if we would 2693 * have to change the bit. 
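* 
* In practice the mismatch means a scalable-mode kernel cannot
* reuse legacy-format tables left by the previous kernel (or vice
* versa); the caller then falls back to disabling translation and
* starting from the clean root table it already allocated.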
2694 */ 2695 if (new_ext != ext) 2696 return -EINVAL; 2697 2698 iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL); 2699 if (!iommu->copied_tables) 2700 return -ENOMEM; 2701 2702 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK; 2703 if (!old_rt_phys) 2704 return -EINVAL; 2705 2706 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB); 2707 if (!old_rt) 2708 return -ENOMEM; 2709 2710 /* This is too big for the stack - allocate it from slab */ 2711 ctxt_table_entries = ext ? 512 : 256; 2712 ret = -ENOMEM; 2713 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL); 2714 if (!ctxt_tbls) 2715 goto out_unmap; 2716 2717 for (bus = 0; bus < 256; bus++) { 2718 ret = copy_context_table(iommu, &old_rt[bus], 2719 ctxt_tbls, bus, ext); 2720 if (ret) { 2721 pr_err("%s: Failed to copy context table for bus %d\n", 2722 iommu->name, bus); 2723 continue; 2724 } 2725 } 2726 2727 spin_lock(&iommu->lock); 2728 2729 /* Context tables are copied, now write them to the root_entry table */ 2730 for (bus = 0; bus < 256; bus++) { 2731 int idx = ext ? bus * 2 : bus; 2732 u64 val; 2733 2734 if (ctxt_tbls[idx]) { 2735 val = virt_to_phys(ctxt_tbls[idx]) | 1; 2736 iommu->root_entry[bus].lo = val; 2737 } 2738 2739 if (!ext || !ctxt_tbls[idx + 1]) 2740 continue; 2741 2742 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1; 2743 iommu->root_entry[bus].hi = val; 2744 } 2745 2746 spin_unlock(&iommu->lock); 2747 2748 kfree(ctxt_tbls); 2749 2750 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE); 2751 2752 ret = 0; 2753 2754 out_unmap: 2755 memunmap(old_rt); 2756 2757 return ret; 2758 } 2759 2760 static int __init init_dmars(void) 2761 { 2762 struct dmar_drhd_unit *drhd; 2763 struct intel_iommu *iommu; 2764 int ret; 2765 2766 ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL); 2767 if (ret) 2768 goto free_iommu; 2769 2770 for_each_iommu(iommu, drhd) { 2771 if (drhd->ignored) { 2772 iommu_disable_translation(iommu); 2773 continue; 2774 } 2775 2776 /* 2777 * Find the max pasid size of all IOMMU's in the system. 2778 * We need to ensure the system pasid table is no bigger 2779 * than the smallest supported. 2780 */ 2781 if (pasid_supported(iommu)) { 2782 u32 temp = 2 << ecap_pss(iommu->ecap); 2783 2784 intel_pasid_max_id = min_t(u32, temp, 2785 intel_pasid_max_id); 2786 } 2787 2788 intel_iommu_init_qi(iommu); 2789 2790 ret = iommu_init_domains(iommu); 2791 if (ret) 2792 goto free_iommu; 2793 2794 init_translation_status(iommu); 2795 2796 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) { 2797 iommu_disable_translation(iommu); 2798 clear_translation_pre_enabled(iommu); 2799 pr_warn("Translation was enabled for %s but we are not in kdump mode\n", 2800 iommu->name); 2801 } 2802 2803 /* 2804 * TBD: 2805 * we could share the same root & context tables 2806 * among all IOMMU's. Need to Split it later. 2807 */ 2808 ret = iommu_alloc_root_entry(iommu); 2809 if (ret) 2810 goto free_iommu; 2811 2812 if (translation_pre_enabled(iommu)) { 2813 pr_info("Translation already enabled - trying to copy translation structures\n"); 2814 2815 ret = copy_translation_tables(iommu); 2816 if (ret) { 2817 /* 2818 * We found the IOMMU with translation 2819 * enabled - but failed to copy over the 2820 * old root-entry table. Try to proceed 2821 * by disabling translation now and 2822 * allocating a clean root-entry table. 2823 * This might cause DMAR faults, but 2824 * probably the dump will still succeed. 
2825 */ 2826 pr_err("Failed to copy translation tables from previous kernel for %s\n", 2827 iommu->name); 2828 iommu_disable_translation(iommu); 2829 clear_translation_pre_enabled(iommu); 2830 } else { 2831 pr_info("Copied translation tables from previous kernel for %s\n", 2832 iommu->name); 2833 } 2834 } 2835 2836 if (!ecap_pass_through(iommu->ecap)) 2837 hw_pass_through = 0; 2838 intel_svm_check(iommu); 2839 } 2840 2841 /* 2842 * Now that qi is enabled on all iommus, set the root entry and flush 2843 * caches. This is required on some Intel X58 chipsets, otherwise the 2844 * flush_context function will loop forever and the boot hangs. 2845 */ 2846 for_each_active_iommu(iommu, drhd) { 2847 iommu_flush_write_buffer(iommu); 2848 iommu_set_root_entry(iommu); 2849 } 2850 2851 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA 2852 dmar_map_gfx = 0; 2853 #endif 2854 2855 if (!dmar_map_gfx) 2856 iommu_identity_mapping |= IDENTMAP_GFX; 2857 2858 check_tylersburg_isoch(); 2859 2860 ret = si_domain_init(hw_pass_through); 2861 if (ret) 2862 goto free_iommu; 2863 2864 /* 2865 * for each drhd 2866 * enable fault log 2867 * global invalidate context cache 2868 * global invalidate iotlb 2869 * enable translation 2870 */ 2871 for_each_iommu(iommu, drhd) { 2872 if (drhd->ignored) { 2873 /* 2874 * we always have to disable PMRs or DMA may fail on 2875 * this device 2876 */ 2877 if (force_on) 2878 iommu_disable_protect_mem_regions(iommu); 2879 continue; 2880 } 2881 2882 iommu_flush_write_buffer(iommu); 2883 2884 #ifdef CONFIG_INTEL_IOMMU_SVM 2885 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) { 2886 /* 2887 * Call dmar_alloc_hwirq() with dmar_global_lock held, 2888 * could cause possible lock race condition. 2889 */ 2890 up_write(&dmar_global_lock); 2891 ret = intel_svm_enable_prq(iommu); 2892 down_write(&dmar_global_lock); 2893 if (ret) 2894 goto free_iommu; 2895 } 2896 #endif 2897 ret = dmar_set_interrupt(iommu); 2898 if (ret) 2899 goto free_iommu; 2900 } 2901 2902 return 0; 2903 2904 free_iommu: 2905 for_each_active_iommu(iommu, drhd) { 2906 disable_dmar_iommu(iommu); 2907 free_dmar_iommu(iommu); 2908 } 2909 if (si_domain) { 2910 domain_exit(si_domain); 2911 si_domain = NULL; 2912 } 2913 2914 return ret; 2915 } 2916 2917 static void __init init_no_remapping_devices(void) 2918 { 2919 struct dmar_drhd_unit *drhd; 2920 struct device *dev; 2921 int i; 2922 2923 for_each_drhd_unit(drhd) { 2924 if (!drhd->include_all) { 2925 for_each_active_dev_scope(drhd->devices, 2926 drhd->devices_cnt, i, dev) 2927 break; 2928 /* ignore DMAR unit if no devices exist */ 2929 if (i == drhd->devices_cnt) 2930 drhd->ignored = 1; 2931 } 2932 } 2933 2934 for_each_active_drhd_unit(drhd) { 2935 if (drhd->include_all) 2936 continue; 2937 2938 for_each_active_dev_scope(drhd->devices, 2939 drhd->devices_cnt, i, dev) 2940 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev))) 2941 break; 2942 if (i < drhd->devices_cnt) 2943 continue; 2944 2945 /* This IOMMU has *only* gfx devices. 
Either bypass it or 2946 set the gfx_mapped flag, as appropriate */ 2947 drhd->gfx_dedicated = 1; 2948 if (!dmar_map_gfx) 2949 drhd->ignored = 1; 2950 } 2951 } 2952 2953 #ifdef CONFIG_SUSPEND 2954 static int init_iommu_hw(void) 2955 { 2956 struct dmar_drhd_unit *drhd; 2957 struct intel_iommu *iommu = NULL; 2958 int ret; 2959 2960 for_each_active_iommu(iommu, drhd) { 2961 if (iommu->qi) { 2962 ret = dmar_reenable_qi(iommu); 2963 if (ret) 2964 return ret; 2965 } 2966 } 2967 2968 for_each_iommu(iommu, drhd) { 2969 if (drhd->ignored) { 2970 /* 2971 * we always have to disable PMRs or DMA may fail on 2972 * this device 2973 */ 2974 if (force_on) 2975 iommu_disable_protect_mem_regions(iommu); 2976 continue; 2977 } 2978 2979 iommu_flush_write_buffer(iommu); 2980 iommu_set_root_entry(iommu); 2981 iommu_enable_translation(iommu); 2982 iommu_disable_protect_mem_regions(iommu); 2983 } 2984 2985 return 0; 2986 } 2987 2988 static void iommu_flush_all(void) 2989 { 2990 struct dmar_drhd_unit *drhd; 2991 struct intel_iommu *iommu; 2992 2993 for_each_active_iommu(iommu, drhd) { 2994 iommu->flush.flush_context(iommu, 0, 0, 0, 2995 DMA_CCMD_GLOBAL_INVL); 2996 iommu->flush.flush_iotlb(iommu, 0, 0, 0, 2997 DMA_TLB_GLOBAL_FLUSH); 2998 } 2999 } 3000 3001 static int iommu_suspend(void) 3002 { 3003 struct dmar_drhd_unit *drhd; 3004 struct intel_iommu *iommu = NULL; 3005 unsigned long flag; 3006 3007 iommu_flush_all(); 3008 3009 for_each_active_iommu(iommu, drhd) { 3010 iommu_disable_translation(iommu); 3011 3012 raw_spin_lock_irqsave(&iommu->register_lock, flag); 3013 3014 iommu->iommu_state[SR_DMAR_FECTL_REG] = 3015 readl(iommu->reg + DMAR_FECTL_REG); 3016 iommu->iommu_state[SR_DMAR_FEDATA_REG] = 3017 readl(iommu->reg + DMAR_FEDATA_REG); 3018 iommu->iommu_state[SR_DMAR_FEADDR_REG] = 3019 readl(iommu->reg + DMAR_FEADDR_REG); 3020 iommu->iommu_state[SR_DMAR_FEUADDR_REG] = 3021 readl(iommu->reg + DMAR_FEUADDR_REG); 3022 3023 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 3024 } 3025 return 0; 3026 } 3027 3028 static void iommu_resume(void) 3029 { 3030 struct dmar_drhd_unit *drhd; 3031 struct intel_iommu *iommu = NULL; 3032 unsigned long flag; 3033 3034 if (init_iommu_hw()) { 3035 if (force_on) 3036 panic("tboot: IOMMU setup failed, DMAR can not resume!\n"); 3037 else 3038 WARN(1, "IOMMU setup failed, DMAR can not resume!\n"); 3039 return; 3040 } 3041 3042 for_each_active_iommu(iommu, drhd) { 3043 3044 raw_spin_lock_irqsave(&iommu->register_lock, flag); 3045 3046 writel(iommu->iommu_state[SR_DMAR_FECTL_REG], 3047 iommu->reg + DMAR_FECTL_REG); 3048 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG], 3049 iommu->reg + DMAR_FEDATA_REG); 3050 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG], 3051 iommu->reg + DMAR_FEADDR_REG); 3052 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG], 3053 iommu->reg + DMAR_FEUADDR_REG); 3054 3055 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 3056 } 3057 } 3058 3059 static struct syscore_ops iommu_syscore_ops = { 3060 .resume = iommu_resume, 3061 .suspend = iommu_suspend, 3062 }; 3063 3064 static void __init init_iommu_pm_ops(void) 3065 { 3066 register_syscore_ops(&iommu_syscore_ops); 3067 } 3068 3069 #else 3070 static inline void init_iommu_pm_ops(void) {} 3071 #endif /* CONFIG_PM */ 3072 3073 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr) 3074 { 3075 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) || 3076 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) || 3077 rmrr->end_address <= rmrr->base_address || 3078 arch_rmrr_sanity_check(rmrr)) 3079 return 
-EINVAL; 3080 3081 return 0; 3082 } 3083 3084 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg) 3085 { 3086 struct acpi_dmar_reserved_memory *rmrr; 3087 struct dmar_rmrr_unit *rmrru; 3088 3089 rmrr = (struct acpi_dmar_reserved_memory *)header; 3090 if (rmrr_sanity_check(rmrr)) { 3091 pr_warn(FW_BUG 3092 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n" 3093 "BIOS vendor: %s; Ver: %s; Product Version: %s\n", 3094 rmrr->base_address, rmrr->end_address, 3095 dmi_get_system_info(DMI_BIOS_VENDOR), 3096 dmi_get_system_info(DMI_BIOS_VERSION), 3097 dmi_get_system_info(DMI_PRODUCT_VERSION)); 3098 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); 3099 } 3100 3101 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL); 3102 if (!rmrru) 3103 goto out; 3104 3105 rmrru->hdr = header; 3106 3107 rmrru->base_address = rmrr->base_address; 3108 rmrru->end_address = rmrr->end_address; 3109 3110 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1), 3111 ((void *)rmrr) + rmrr->header.length, 3112 &rmrru->devices_cnt); 3113 if (rmrru->devices_cnt && rmrru->devices == NULL) 3114 goto free_rmrru; 3115 3116 list_add(&rmrru->list, &dmar_rmrr_units); 3117 3118 return 0; 3119 free_rmrru: 3120 kfree(rmrru); 3121 out: 3122 return -ENOMEM; 3123 } 3124 3125 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr) 3126 { 3127 struct dmar_atsr_unit *atsru; 3128 struct acpi_dmar_atsr *tmp; 3129 3130 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list, 3131 dmar_rcu_check()) { 3132 tmp = (struct acpi_dmar_atsr *)atsru->hdr; 3133 if (atsr->segment != tmp->segment) 3134 continue; 3135 if (atsr->header.length != tmp->header.length) 3136 continue; 3137 if (memcmp(atsr, tmp, atsr->header.length) == 0) 3138 return atsru; 3139 } 3140 3141 return NULL; 3142 } 3143 3144 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg) 3145 { 3146 struct acpi_dmar_atsr *atsr; 3147 struct dmar_atsr_unit *atsru; 3148 3149 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled) 3150 return 0; 3151 3152 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 3153 atsru = dmar_find_atsr(atsr); 3154 if (atsru) 3155 return 0; 3156 3157 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL); 3158 if (!atsru) 3159 return -ENOMEM; 3160 3161 /* 3162 * If memory is allocated from slab by ACPI _DSM method, we need to 3163 * copy the memory content because the memory buffer will be freed 3164 * on return. 
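* 
* The copy shares the allocation with the dmar_atsr_unit itself: the
* kzalloc() above reserved sizeof(*atsru) + hdr->length bytes, so
* atsru->hdr is simply pointed at the tail of that buffer before the
* ACPI header is copied into place.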
3165 */ 3166 atsru->hdr = (void *)(atsru + 1); 3167 memcpy(atsru->hdr, hdr, hdr->length); 3168 atsru->include_all = atsr->flags & 0x1; 3169 if (!atsru->include_all) { 3170 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1), 3171 (void *)atsr + atsr->header.length, 3172 &atsru->devices_cnt); 3173 if (atsru->devices_cnt && atsru->devices == NULL) { 3174 kfree(atsru); 3175 return -ENOMEM; 3176 } 3177 } 3178 3179 list_add_rcu(&atsru->list, &dmar_atsr_units); 3180 3181 return 0; 3182 } 3183 3184 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru) 3185 { 3186 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt); 3187 kfree(atsru); 3188 } 3189 3190 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg) 3191 { 3192 struct acpi_dmar_atsr *atsr; 3193 struct dmar_atsr_unit *atsru; 3194 3195 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 3196 atsru = dmar_find_atsr(atsr); 3197 if (atsru) { 3198 list_del_rcu(&atsru->list); 3199 synchronize_rcu(); 3200 intel_iommu_free_atsr(atsru); 3201 } 3202 3203 return 0; 3204 } 3205 3206 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg) 3207 { 3208 int i; 3209 struct device *dev; 3210 struct acpi_dmar_atsr *atsr; 3211 struct dmar_atsr_unit *atsru; 3212 3213 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 3214 atsru = dmar_find_atsr(atsr); 3215 if (!atsru) 3216 return 0; 3217 3218 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) { 3219 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt, 3220 i, dev) 3221 return -EBUSY; 3222 } 3223 3224 return 0; 3225 } 3226 3227 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc) 3228 { 3229 struct dmar_satc_unit *satcu; 3230 struct acpi_dmar_satc *tmp; 3231 3232 list_for_each_entry_rcu(satcu, &dmar_satc_units, list, 3233 dmar_rcu_check()) { 3234 tmp = (struct acpi_dmar_satc *)satcu->hdr; 3235 if (satc->segment != tmp->segment) 3236 continue; 3237 if (satc->header.length != tmp->header.length) 3238 continue; 3239 if (memcmp(satc, tmp, satc->header.length) == 0) 3240 return satcu; 3241 } 3242 3243 return NULL; 3244 } 3245 3246 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg) 3247 { 3248 struct acpi_dmar_satc *satc; 3249 struct dmar_satc_unit *satcu; 3250 3251 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled) 3252 return 0; 3253 3254 satc = container_of(hdr, struct acpi_dmar_satc, header); 3255 satcu = dmar_find_satc(satc); 3256 if (satcu) 3257 return 0; 3258 3259 satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL); 3260 if (!satcu) 3261 return -ENOMEM; 3262 3263 satcu->hdr = (void *)(satcu + 1); 3264 memcpy(satcu->hdr, hdr, hdr->length); 3265 satcu->atc_required = satc->flags & 0x1; 3266 satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1), 3267 (void *)satc + satc->header.length, 3268 &satcu->devices_cnt); 3269 if (satcu->devices_cnt && !satcu->devices) { 3270 kfree(satcu); 3271 return -ENOMEM; 3272 } 3273 list_add_rcu(&satcu->list, &dmar_satc_units); 3274 3275 return 0; 3276 } 3277 3278 static int intel_iommu_add(struct dmar_drhd_unit *dmaru) 3279 { 3280 int sp, ret; 3281 struct intel_iommu *iommu = dmaru->iommu; 3282 3283 ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu); 3284 if (ret) 3285 goto out; 3286 3287 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) { 3288 pr_warn("%s: Doesn't support hardware pass through.\n", 3289 iommu->name); 3290 return -ENXIO; 3291 } 3292 3293 sp = domain_update_iommu_superpage(NULL, iommu) - 1; 3294 if (sp >= 0 && 
!(cap_super_page_val(iommu->cap) & (1 << sp))) { 3295 pr_warn("%s: Doesn't support large page.\n", 3296 iommu->name); 3297 return -ENXIO; 3298 } 3299 3300 /* 3301 * Disable translation if already enabled prior to OS handover. 3302 */ 3303 if (iommu->gcmd & DMA_GCMD_TE) 3304 iommu_disable_translation(iommu); 3305 3306 ret = iommu_init_domains(iommu); 3307 if (ret == 0) 3308 ret = iommu_alloc_root_entry(iommu); 3309 if (ret) 3310 goto out; 3311 3312 intel_svm_check(iommu); 3313 3314 if (dmaru->ignored) { 3315 /* 3316 * we always have to disable PMRs or DMA may fail on this device 3317 */ 3318 if (force_on) 3319 iommu_disable_protect_mem_regions(iommu); 3320 return 0; 3321 } 3322 3323 intel_iommu_init_qi(iommu); 3324 iommu_flush_write_buffer(iommu); 3325 3326 #ifdef CONFIG_INTEL_IOMMU_SVM 3327 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) { 3328 ret = intel_svm_enable_prq(iommu); 3329 if (ret) 3330 goto disable_iommu; 3331 } 3332 #endif 3333 ret = dmar_set_interrupt(iommu); 3334 if (ret) 3335 goto disable_iommu; 3336 3337 iommu_set_root_entry(iommu); 3338 iommu_enable_translation(iommu); 3339 3340 iommu_disable_protect_mem_regions(iommu); 3341 return 0; 3342 3343 disable_iommu: 3344 disable_dmar_iommu(iommu); 3345 out: 3346 free_dmar_iommu(iommu); 3347 return ret; 3348 } 3349 3350 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert) 3351 { 3352 int ret = 0; 3353 struct intel_iommu *iommu = dmaru->iommu; 3354 3355 if (!intel_iommu_enabled) 3356 return 0; 3357 if (iommu == NULL) 3358 return -EINVAL; 3359 3360 if (insert) { 3361 ret = intel_iommu_add(dmaru); 3362 } else { 3363 disable_dmar_iommu(iommu); 3364 free_dmar_iommu(iommu); 3365 } 3366 3367 return ret; 3368 } 3369 3370 static void intel_iommu_free_dmars(void) 3371 { 3372 struct dmar_rmrr_unit *rmrru, *rmrr_n; 3373 struct dmar_atsr_unit *atsru, *atsr_n; 3374 struct dmar_satc_unit *satcu, *satc_n; 3375 3376 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) { 3377 list_del(&rmrru->list); 3378 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt); 3379 kfree(rmrru); 3380 } 3381 3382 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) { 3383 list_del(&atsru->list); 3384 intel_iommu_free_atsr(atsru); 3385 } 3386 list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) { 3387 list_del(&satcu->list); 3388 dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt); 3389 kfree(satcu); 3390 } 3391 } 3392 3393 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev) 3394 { 3395 struct dmar_satc_unit *satcu; 3396 struct acpi_dmar_satc *satc; 3397 struct device *tmp; 3398 int i; 3399 3400 dev = pci_physfn(dev); 3401 rcu_read_lock(); 3402 3403 list_for_each_entry_rcu(satcu, &dmar_satc_units, list) { 3404 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header); 3405 if (satc->segment != pci_domain_nr(dev->bus)) 3406 continue; 3407 for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp) 3408 if (to_pci_dev(tmp) == dev) 3409 goto out; 3410 } 3411 satcu = NULL; 3412 out: 3413 rcu_read_unlock(); 3414 return satcu; 3415 } 3416 3417 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu) 3418 { 3419 int i, ret = 1; 3420 struct pci_bus *bus; 3421 struct pci_dev *bridge = NULL; 3422 struct device *tmp; 3423 struct acpi_dmar_atsr *atsr; 3424 struct dmar_atsr_unit *atsru; 3425 struct dmar_satc_unit *satcu; 3426 3427 dev = pci_physfn(dev); 3428 satcu = dmar_find_matched_satc_unit(dev); 3429 if (satcu) 3430 /* 3431 * This device supports ATS as it is in 
SATC table. 3432 * When IOMMU is in legacy mode, enabling ATS is done 3433 * automatically by HW for the device that requires 3434 * ATS, hence OS should not enable this device ATS 3435 * to avoid duplicated TLB invalidation. 3436 */ 3437 return !(satcu->atc_required && !sm_supported(iommu)); 3438 3439 for (bus = dev->bus; bus; bus = bus->parent) { 3440 bridge = bus->self; 3441 /* If it's an integrated device, allow ATS */ 3442 if (!bridge) 3443 return 1; 3444 /* Connected via non-PCIe: no ATS */ 3445 if (!pci_is_pcie(bridge) || 3446 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE) 3447 return 0; 3448 /* If we found the root port, look it up in the ATSR */ 3449 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT) 3450 break; 3451 } 3452 3453 rcu_read_lock(); 3454 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) { 3455 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header); 3456 if (atsr->segment != pci_domain_nr(dev->bus)) 3457 continue; 3458 3459 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp) 3460 if (tmp == &bridge->dev) 3461 goto out; 3462 3463 if (atsru->include_all) 3464 goto out; 3465 } 3466 ret = 0; 3467 out: 3468 rcu_read_unlock(); 3469 3470 return ret; 3471 } 3472 3473 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info) 3474 { 3475 int ret; 3476 struct dmar_rmrr_unit *rmrru; 3477 struct dmar_atsr_unit *atsru; 3478 struct dmar_satc_unit *satcu; 3479 struct acpi_dmar_atsr *atsr; 3480 struct acpi_dmar_reserved_memory *rmrr; 3481 struct acpi_dmar_satc *satc; 3482 3483 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING) 3484 return 0; 3485 3486 list_for_each_entry(rmrru, &dmar_rmrr_units, list) { 3487 rmrr = container_of(rmrru->hdr, 3488 struct acpi_dmar_reserved_memory, header); 3489 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 3490 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1), 3491 ((void *)rmrr) + rmrr->header.length, 3492 rmrr->segment, rmrru->devices, 3493 rmrru->devices_cnt); 3494 if (ret < 0) 3495 return ret; 3496 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 3497 dmar_remove_dev_scope(info, rmrr->segment, 3498 rmrru->devices, rmrru->devices_cnt); 3499 } 3500 } 3501 3502 list_for_each_entry(atsru, &dmar_atsr_units, list) { 3503 if (atsru->include_all) 3504 continue; 3505 3506 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header); 3507 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 3508 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1), 3509 (void *)atsr + atsr->header.length, 3510 atsr->segment, atsru->devices, 3511 atsru->devices_cnt); 3512 if (ret > 0) 3513 break; 3514 else if (ret < 0) 3515 return ret; 3516 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 3517 if (dmar_remove_dev_scope(info, atsr->segment, 3518 atsru->devices, atsru->devices_cnt)) 3519 break; 3520 } 3521 } 3522 list_for_each_entry(satcu, &dmar_satc_units, list) { 3523 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header); 3524 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 3525 ret = dmar_insert_dev_scope(info, (void *)(satc + 1), 3526 (void *)satc + satc->header.length, 3527 satc->segment, satcu->devices, 3528 satcu->devices_cnt); 3529 if (ret > 0) 3530 break; 3531 else if (ret < 0) 3532 return ret; 3533 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 3534 if (dmar_remove_dev_scope(info, satc->segment, 3535 satcu->devices, satcu->devices_cnt)) 3536 break; 3537 } 3538 } 3539 3540 return 0; 3541 } 3542 3543 static int intel_iommu_memory_notifier(struct notifier_block *nb, 3544 unsigned long val, void *v) 3545 { 
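/*
 * Keep the static identity domain in sync with memory hotplug: when a
 * range is about to come online it is identity-mapped into si_domain,
 * and on MEM_OFFLINE/MEM_CANCEL_ONLINE the range is unmapped again and
 * the IOTLB of every active IOMMU is flushed before the pages go away.
 */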
3546 struct memory_notify *mhp = v; 3547 unsigned long start_vpfn = mm_to_dma_pfn_start(mhp->start_pfn); 3548 unsigned long last_vpfn = mm_to_dma_pfn_end(mhp->start_pfn + 3549 mhp->nr_pages - 1); 3550 3551 switch (val) { 3552 case MEM_GOING_ONLINE: 3553 if (iommu_domain_identity_map(si_domain, 3554 start_vpfn, last_vpfn)) { 3555 pr_warn("Failed to build identity map for [%lx-%lx]\n", 3556 start_vpfn, last_vpfn); 3557 return NOTIFY_BAD; 3558 } 3559 break; 3560 3561 case MEM_OFFLINE: 3562 case MEM_CANCEL_ONLINE: 3563 { 3564 struct dmar_drhd_unit *drhd; 3565 struct intel_iommu *iommu; 3566 LIST_HEAD(freelist); 3567 3568 domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist); 3569 3570 rcu_read_lock(); 3571 for_each_active_iommu(iommu, drhd) 3572 iommu_flush_iotlb_psi(iommu, si_domain, 3573 start_vpfn, mhp->nr_pages, 3574 list_empty(&freelist), 0); 3575 rcu_read_unlock(); 3576 put_pages_list(&freelist); 3577 } 3578 break; 3579 } 3580 3581 return NOTIFY_OK; 3582 } 3583 3584 static struct notifier_block intel_iommu_memory_nb = { 3585 .notifier_call = intel_iommu_memory_notifier, 3586 .priority = 0 3587 }; 3588 3589 static void intel_disable_iommus(void) 3590 { 3591 struct intel_iommu *iommu = NULL; 3592 struct dmar_drhd_unit *drhd; 3593 3594 for_each_iommu(iommu, drhd) 3595 iommu_disable_translation(iommu); 3596 } 3597 3598 void intel_iommu_shutdown(void) 3599 { 3600 struct dmar_drhd_unit *drhd; 3601 struct intel_iommu *iommu = NULL; 3602 3603 if (no_iommu || dmar_disabled) 3604 return; 3605 3606 down_write(&dmar_global_lock); 3607 3608 /* Disable PMRs explicitly here. */ 3609 for_each_iommu(iommu, drhd) 3610 iommu_disable_protect_mem_regions(iommu); 3611 3612 /* Make sure the IOMMUs are switched off */ 3613 intel_disable_iommus(); 3614 3615 up_write(&dmar_global_lock); 3616 } 3617 3618 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev) 3619 { 3620 struct iommu_device *iommu_dev = dev_to_iommu_device(dev); 3621 3622 return container_of(iommu_dev, struct intel_iommu, iommu); 3623 } 3624 3625 static ssize_t version_show(struct device *dev, 3626 struct device_attribute *attr, char *buf) 3627 { 3628 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3629 u32 ver = readl(iommu->reg + DMAR_VER_REG); 3630 return sysfs_emit(buf, "%d:%d\n", 3631 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver)); 3632 } 3633 static DEVICE_ATTR_RO(version); 3634 3635 static ssize_t address_show(struct device *dev, 3636 struct device_attribute *attr, char *buf) 3637 { 3638 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3639 return sysfs_emit(buf, "%llx\n", iommu->reg_phys); 3640 } 3641 static DEVICE_ATTR_RO(address); 3642 3643 static ssize_t cap_show(struct device *dev, 3644 struct device_attribute *attr, char *buf) 3645 { 3646 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3647 return sysfs_emit(buf, "%llx\n", iommu->cap); 3648 } 3649 static DEVICE_ATTR_RO(cap); 3650 3651 static ssize_t ecap_show(struct device *dev, 3652 struct device_attribute *attr, char *buf) 3653 { 3654 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3655 return sysfs_emit(buf, "%llx\n", iommu->ecap); 3656 } 3657 static DEVICE_ATTR_RO(ecap); 3658 3659 static ssize_t domains_supported_show(struct device *dev, 3660 struct device_attribute *attr, char *buf) 3661 { 3662 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3663 return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap)); 3664 } 3665 static DEVICE_ATTR_RO(domains_supported); 3666 3667 static ssize_t domains_used_show(struct device *dev, 3668 struct 
device_attribute *attr, char *buf) 3669 { 3670 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3671 return sysfs_emit(buf, "%d\n", 3672 bitmap_weight(iommu->domain_ids, 3673 cap_ndoms(iommu->cap))); 3674 } 3675 static DEVICE_ATTR_RO(domains_used); 3676 3677 static struct attribute *intel_iommu_attrs[] = { 3678 &dev_attr_version.attr, 3679 &dev_attr_address.attr, 3680 &dev_attr_cap.attr, 3681 &dev_attr_ecap.attr, 3682 &dev_attr_domains_supported.attr, 3683 &dev_attr_domains_used.attr, 3684 NULL, 3685 }; 3686 3687 static struct attribute_group intel_iommu_group = { 3688 .name = "intel-iommu", 3689 .attrs = intel_iommu_attrs, 3690 }; 3691 3692 const struct attribute_group *intel_iommu_groups[] = { 3693 &intel_iommu_group, 3694 NULL, 3695 }; 3696 3697 static inline bool has_external_pci(void) 3698 { 3699 struct pci_dev *pdev = NULL; 3700 3701 for_each_pci_dev(pdev) 3702 if (pdev->external_facing) { 3703 pci_dev_put(pdev); 3704 return true; 3705 } 3706 3707 return false; 3708 } 3709 3710 static int __init platform_optin_force_iommu(void) 3711 { 3712 if (!dmar_platform_optin() || no_platform_optin || !has_external_pci()) 3713 return 0; 3714 3715 if (no_iommu || dmar_disabled) 3716 pr_info("Intel-IOMMU force enabled due to platform opt in\n"); 3717 3718 /* 3719 * If Intel-IOMMU is disabled by default, we will apply identity 3720 * map for all devices except those marked as being untrusted. 3721 */ 3722 if (dmar_disabled) 3723 iommu_set_default_passthrough(false); 3724 3725 dmar_disabled = 0; 3726 no_iommu = 0; 3727 3728 return 1; 3729 } 3730 3731 static int __init probe_acpi_namespace_devices(void) 3732 { 3733 struct dmar_drhd_unit *drhd; 3734 /* To avoid a -Wunused-but-set-variable warning. */ 3735 struct intel_iommu *iommu __maybe_unused; 3736 struct device *dev; 3737 int i, ret = 0; 3738 3739 for_each_active_iommu(iommu, drhd) { 3740 for_each_active_dev_scope(drhd->devices, 3741 drhd->devices_cnt, i, dev) { 3742 struct acpi_device_physical_node *pn; 3743 struct acpi_device *adev; 3744 3745 if (dev->bus != &acpi_bus_type) 3746 continue; 3747 3748 adev = to_acpi_device(dev); 3749 mutex_lock(&adev->physical_node_lock); 3750 list_for_each_entry(pn, 3751 &adev->physical_node_list, node) { 3752 ret = iommu_probe_device(pn->dev); 3753 if (ret) 3754 break; 3755 } 3756 mutex_unlock(&adev->physical_node_lock); 3757 3758 if (ret) 3759 return ret; 3760 } 3761 } 3762 3763 return 0; 3764 } 3765 3766 static __init int tboot_force_iommu(void) 3767 { 3768 if (!tboot_enabled()) 3769 return 0; 3770 3771 if (no_iommu || dmar_disabled) 3772 pr_warn("Forcing Intel-IOMMU to enabled\n"); 3773 3774 dmar_disabled = 0; 3775 no_iommu = 0; 3776 3777 return 1; 3778 } 3779 3780 int __init intel_iommu_init(void) 3781 { 3782 int ret = -ENODEV; 3783 struct dmar_drhd_unit *drhd; 3784 struct intel_iommu *iommu; 3785 3786 /* 3787 * Intel IOMMU is required for a TXT/tboot launch or platform 3788 * opt in, so enforce that. 
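* 
* force_on is set either by an active tboot launch (unless
* intel_iommu_tboot_noforce was given) or by the platform opt-in
* flag when an external-facing PCI device is present; once set,
* failures in the DMAR setup below panic instead of silently falling
* back to running without an IOMMU.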
3789 */ 3790 force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) || 3791 platform_optin_force_iommu(); 3792 3793 down_write(&dmar_global_lock); 3794 if (dmar_table_init()) { 3795 if (force_on) 3796 panic("tboot: Failed to initialize DMAR table\n"); 3797 goto out_free_dmar; 3798 } 3799 3800 if (dmar_dev_scope_init() < 0) { 3801 if (force_on) 3802 panic("tboot: Failed to initialize DMAR device scope\n"); 3803 goto out_free_dmar; 3804 } 3805 3806 up_write(&dmar_global_lock); 3807 3808 /* 3809 * The bus notifier takes the dmar_global_lock, so lockdep will 3810 * complain later when we register it under the lock. 3811 */ 3812 dmar_register_bus_notifier(); 3813 3814 down_write(&dmar_global_lock); 3815 3816 if (!no_iommu) 3817 intel_iommu_debugfs_init(); 3818 3819 if (no_iommu || dmar_disabled) { 3820 /* 3821 * We exit the function here to ensure IOMMU's remapping and 3822 * mempool aren't setup, which means that the IOMMU's PMRs 3823 * won't be disabled via the call to init_dmars(). So disable 3824 * it explicitly here. The PMRs were setup by tboot prior to 3825 * calling SENTER, but the kernel is expected to reset/tear 3826 * down the PMRs. 3827 */ 3828 if (intel_iommu_tboot_noforce) { 3829 for_each_iommu(iommu, drhd) 3830 iommu_disable_protect_mem_regions(iommu); 3831 } 3832 3833 /* 3834 * Make sure the IOMMUs are switched off, even when we 3835 * boot into a kexec kernel and the previous kernel left 3836 * them enabled 3837 */ 3838 intel_disable_iommus(); 3839 goto out_free_dmar; 3840 } 3841 3842 if (list_empty(&dmar_rmrr_units)) 3843 pr_info("No RMRR found\n"); 3844 3845 if (list_empty(&dmar_atsr_units)) 3846 pr_info("No ATSR found\n"); 3847 3848 if (list_empty(&dmar_satc_units)) 3849 pr_info("No SATC found\n"); 3850 3851 init_no_remapping_devices(); 3852 3853 ret = init_dmars(); 3854 if (ret) { 3855 if (force_on) 3856 panic("tboot: Failed to initialize DMARs\n"); 3857 pr_err("Initialization failed\n"); 3858 goto out_free_dmar; 3859 } 3860 up_write(&dmar_global_lock); 3861 3862 init_iommu_pm_ops(); 3863 3864 down_read(&dmar_global_lock); 3865 for_each_active_iommu(iommu, drhd) { 3866 /* 3867 * The flush queue implementation does not perform 3868 * page-selective invalidations that are required for efficient 3869 * TLB flushes in virtual environments. The benefit of batching 3870 * is likely to be much lower than the overhead of synchronizing 3871 * the virtual and physical IOMMU page-tables. 3872 */ 3873 if (cap_caching_mode(iommu->cap) && 3874 !first_level_by_default(IOMMU_DOMAIN_DMA)) { 3875 pr_info_once("IOMMU batching disallowed due to virtualization\n"); 3876 iommu_set_dma_strict(); 3877 } 3878 iommu_device_sysfs_add(&iommu->iommu, NULL, 3879 intel_iommu_groups, 3880 "%s", iommu->name); 3881 iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL); 3882 3883 iommu_pmu_register(iommu); 3884 } 3885 up_read(&dmar_global_lock); 3886 3887 if (si_domain && !hw_pass_through) 3888 register_memory_notifier(&intel_iommu_memory_nb); 3889 3890 down_read(&dmar_global_lock); 3891 if (probe_acpi_namespace_devices()) 3892 pr_warn("ACPI name space devices didn't probe correctly\n"); 3893 3894 /* Finally, we enable the DMA remapping hardware. 
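* 
* Translation is switched on only for units that are not ignored and
* were not already left enabled by the previous kernel; protected
* memory regions are disabled on every unit afterwards.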
*/ 3895 for_each_iommu(iommu, drhd) { 3896 if (!drhd->ignored && !translation_pre_enabled(iommu)) 3897 iommu_enable_translation(iommu); 3898 3899 iommu_disable_protect_mem_regions(iommu); 3900 } 3901 up_read(&dmar_global_lock); 3902 3903 pr_info("Intel(R) Virtualization Technology for Directed I/O\n"); 3904 3905 intel_iommu_enabled = 1; 3906 3907 return 0; 3908 3909 out_free_dmar: 3910 intel_iommu_free_dmars(); 3911 up_write(&dmar_global_lock); 3912 return ret; 3913 } 3914 3915 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque) 3916 { 3917 struct device_domain_info *info = opaque; 3918 3919 domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff); 3920 return 0; 3921 } 3922 3923 /* 3924 * NB - intel-iommu lacks any sort of reference counting for the users of 3925 * dependent devices. If multiple endpoints have intersecting dependent 3926 * devices, unbinding the driver from any one of them will possibly leave 3927 * the others unable to operate. 3928 */ 3929 static void domain_context_clear(struct device_domain_info *info) 3930 { 3931 if (!dev_is_pci(info->dev)) 3932 domain_context_clear_one(info, info->bus, info->devfn); 3933 3934 pci_for_each_dma_alias(to_pci_dev(info->dev), 3935 &domain_context_clear_one_cb, info); 3936 } 3937 3938 static void dmar_remove_one_dev_info(struct device *dev) 3939 { 3940 struct device_domain_info *info = dev_iommu_priv_get(dev); 3941 struct dmar_domain *domain = info->domain; 3942 struct intel_iommu *iommu = info->iommu; 3943 unsigned long flags; 3944 3945 if (!dev_is_real_dma_subdevice(info->dev)) { 3946 if (dev_is_pci(info->dev) && sm_supported(iommu)) 3947 intel_pasid_tear_down_entry(iommu, info->dev, 3948 IOMMU_NO_PASID, false); 3949 3950 iommu_disable_pci_caps(info); 3951 domain_context_clear(info); 3952 } 3953 3954 spin_lock_irqsave(&domain->lock, flags); 3955 list_del(&info->link); 3956 spin_unlock_irqrestore(&domain->lock, flags); 3957 3958 domain_detach_iommu(domain, iommu); 3959 info->domain = NULL; 3960 } 3961 3962 /* 3963 * Clear the page table pointer in context or pasid table entries so that 3964 * all DMA requests without PASID from the device are blocked. If the page 3965 * table has been set, clean up the data structures. 
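* 
* On scalable-mode hardware this means tearing down the IOMMU_NO_PASID
* pasid entry; in legacy mode the context entry itself is cleared.
* Either way the device is then detached from its current domain.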
3966 */ 3967 void device_block_translation(struct device *dev) 3968 { 3969 struct device_domain_info *info = dev_iommu_priv_get(dev); 3970 struct intel_iommu *iommu = info->iommu; 3971 unsigned long flags; 3972 3973 iommu_disable_pci_caps(info); 3974 if (!dev_is_real_dma_subdevice(dev)) { 3975 if (sm_supported(iommu)) 3976 intel_pasid_tear_down_entry(iommu, dev, 3977 IOMMU_NO_PASID, false); 3978 else 3979 domain_context_clear(info); 3980 } 3981 3982 if (!info->domain) 3983 return; 3984 3985 spin_lock_irqsave(&info->domain->lock, flags); 3986 list_del(&info->link); 3987 spin_unlock_irqrestore(&info->domain->lock, flags); 3988 3989 domain_detach_iommu(info->domain, iommu); 3990 info->domain = NULL; 3991 } 3992 3993 static int md_domain_init(struct dmar_domain *domain, int guest_width) 3994 { 3995 int adjust_width; 3996 3997 /* calculate AGAW */ 3998 domain->gaw = guest_width; 3999 adjust_width = guestwidth_to_adjustwidth(guest_width); 4000 domain->agaw = width_to_agaw(adjust_width); 4001 4002 domain->iommu_coherency = false; 4003 domain->iommu_superpage = 0; 4004 domain->max_addr = 0; 4005 4006 /* always allocate the top pgd */ 4007 domain->pgd = alloc_pgtable_page(domain->nid, GFP_ATOMIC); 4008 if (!domain->pgd) 4009 return -ENOMEM; 4010 domain_flush_cache(domain, domain->pgd, PAGE_SIZE); 4011 return 0; 4012 } 4013 4014 static int blocking_domain_attach_dev(struct iommu_domain *domain, 4015 struct device *dev) 4016 { 4017 device_block_translation(dev); 4018 return 0; 4019 } 4020 4021 static struct iommu_domain blocking_domain = { 4022 .type = IOMMU_DOMAIN_BLOCKED, 4023 .ops = &(const struct iommu_domain_ops) { 4024 .attach_dev = blocking_domain_attach_dev, 4025 } 4026 }; 4027 4028 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type) 4029 { 4030 struct dmar_domain *dmar_domain; 4031 struct iommu_domain *domain; 4032 4033 switch (type) { 4034 case IOMMU_DOMAIN_DMA: 4035 case IOMMU_DOMAIN_UNMANAGED: 4036 dmar_domain = alloc_domain(type); 4037 if (!dmar_domain) { 4038 pr_err("Can't allocate dmar_domain\n"); 4039 return NULL; 4040 } 4041 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { 4042 pr_err("Domain initialization failed\n"); 4043 domain_exit(dmar_domain); 4044 return NULL; 4045 } 4046 4047 domain = &dmar_domain->domain; 4048 domain->geometry.aperture_start = 0; 4049 domain->geometry.aperture_end = 4050 __DOMAIN_MAX_ADDR(dmar_domain->gaw); 4051 domain->geometry.force_aperture = true; 4052 4053 return domain; 4054 case IOMMU_DOMAIN_IDENTITY: 4055 return &si_domain->domain; 4056 case IOMMU_DOMAIN_SVA: 4057 return intel_svm_domain_alloc(); 4058 default: 4059 return NULL; 4060 } 4061 4062 return NULL; 4063 } 4064 4065 static struct iommu_domain * 4066 intel_iommu_domain_alloc_user(struct device *dev, u32 flags, 4067 struct iommu_domain *parent, 4068 const struct iommu_user_data *user_data) 4069 { 4070 struct device_domain_info *info = dev_iommu_priv_get(dev); 4071 bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; 4072 bool nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT; 4073 struct intel_iommu *iommu = info->iommu; 4074 struct iommu_domain *domain; 4075 4076 /* Must be NESTING domain */ 4077 if (parent) { 4078 if (!nested_supported(iommu) || flags) 4079 return ERR_PTR(-EOPNOTSUPP); 4080 return intel_nested_domain_alloc(parent, user_data); 4081 } 4082 4083 if (flags & 4084 (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING))) 4085 return ERR_PTR(-EOPNOTSUPP); 4086 if (nested_parent && !nested_supported(iommu)) 4087 return 
ERR_PTR(-EOPNOTSUPP); 4088 if (user_data || (dirty_tracking && !ssads_supported(iommu))) 4089 return ERR_PTR(-EOPNOTSUPP); 4090 4091 /* 4092 * domain_alloc_user op needs to fully initialize a domain before 4093 * return, so use iommu_domain_alloc() here for simplicity. 4094 */ 4095 domain = iommu_domain_alloc(dev->bus); 4096 if (!domain) 4097 return ERR_PTR(-ENOMEM); 4098 4099 if (nested_parent) 4100 to_dmar_domain(domain)->nested_parent = true; 4101 4102 if (dirty_tracking) { 4103 if (to_dmar_domain(domain)->use_first_level) { 4104 iommu_domain_free(domain); 4105 return ERR_PTR(-EOPNOTSUPP); 4106 } 4107 domain->dirty_ops = &intel_dirty_ops; 4108 } 4109 4110 return domain; 4111 } 4112 4113 static void intel_iommu_domain_free(struct iommu_domain *domain) 4114 { 4115 if (domain != &si_domain->domain) 4116 domain_exit(to_dmar_domain(domain)); 4117 } 4118 4119 int prepare_domain_attach_device(struct iommu_domain *domain, 4120 struct device *dev) 4121 { 4122 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4123 struct intel_iommu *iommu; 4124 int addr_width; 4125 4126 iommu = device_to_iommu(dev, NULL, NULL); 4127 if (!iommu) 4128 return -ENODEV; 4129 4130 if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap)) 4131 return -EINVAL; 4132 4133 if (domain->dirty_ops && !ssads_supported(iommu)) 4134 return -EINVAL; 4135 4136 /* check if this iommu agaw is sufficient for max mapped address */ 4137 addr_width = agaw_to_width(iommu->agaw); 4138 if (addr_width > cap_mgaw(iommu->cap)) 4139 addr_width = cap_mgaw(iommu->cap); 4140 4141 if (dmar_domain->max_addr > (1LL << addr_width)) 4142 return -EINVAL; 4143 dmar_domain->gaw = addr_width; 4144 4145 /* 4146 * Knock out extra levels of page tables if necessary 4147 */ 4148 while (iommu->agaw < dmar_domain->agaw) { 4149 struct dma_pte *pte; 4150 4151 pte = dmar_domain->pgd; 4152 if (dma_pte_present(pte)) { 4153 dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte)); 4154 free_pgtable_page(pte); 4155 } 4156 dmar_domain->agaw--; 4157 } 4158 4159 return 0; 4160 } 4161 4162 static int intel_iommu_attach_device(struct iommu_domain *domain, 4163 struct device *dev) 4164 { 4165 struct device_domain_info *info = dev_iommu_priv_get(dev); 4166 int ret; 4167 4168 if (info->domain) 4169 device_block_translation(dev); 4170 4171 ret = prepare_domain_attach_device(domain, dev); 4172 if (ret) 4173 return ret; 4174 4175 return dmar_domain_attach_device(to_dmar_domain(domain), dev); 4176 } 4177 4178 static int intel_iommu_map(struct iommu_domain *domain, 4179 unsigned long iova, phys_addr_t hpa, 4180 size_t size, int iommu_prot, gfp_t gfp) 4181 { 4182 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4183 u64 max_addr; 4184 int prot = 0; 4185 4186 if (iommu_prot & IOMMU_READ) 4187 prot |= DMA_PTE_READ; 4188 if (iommu_prot & IOMMU_WRITE) 4189 prot |= DMA_PTE_WRITE; 4190 if (dmar_domain->set_pte_snp) 4191 prot |= DMA_PTE_SNP; 4192 4193 max_addr = iova + size; 4194 if (dmar_domain->max_addr < max_addr) { 4195 u64 end; 4196 4197 /* check if minimum agaw is sufficient for mapped address */ 4198 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1; 4199 if (end < max_addr) { 4200 pr_err("%s: iommu width (%d) is not " 4201 "sufficient for the mapped address (%llx)\n", 4202 __func__, dmar_domain->gaw, max_addr); 4203 return -EFAULT; 4204 } 4205 dmar_domain->max_addr = max_addr; 4206 } 4207 /* Round up size to next multiple of PAGE_SIZE, if it and 4208 the low bits of hpa would take us onto the next page */ 4209 size = aligned_nrpages(hpa, size); 4210 return
__domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT, 4211 hpa >> VTD_PAGE_SHIFT, size, prot, gfp); 4212 } 4213 4214 static int intel_iommu_map_pages(struct iommu_domain *domain, 4215 unsigned long iova, phys_addr_t paddr, 4216 size_t pgsize, size_t pgcount, 4217 int prot, gfp_t gfp, size_t *mapped) 4218 { 4219 unsigned long pgshift = __ffs(pgsize); 4220 size_t size = pgcount << pgshift; 4221 int ret; 4222 4223 if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G) 4224 return -EINVAL; 4225 4226 if (!IS_ALIGNED(iova | paddr, pgsize)) 4227 return -EINVAL; 4228 4229 ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp); 4230 if (!ret && mapped) 4231 *mapped = size; 4232 4233 return ret; 4234 } 4235 4236 static size_t intel_iommu_unmap(struct iommu_domain *domain, 4237 unsigned long iova, size_t size, 4238 struct iommu_iotlb_gather *gather) 4239 { 4240 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4241 unsigned long start_pfn, last_pfn; 4242 int level = 0; 4243 4244 /* Cope with horrid API which requires us to unmap more than the 4245 size argument if it happens to be a large-page mapping. */ 4246 if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, 4247 &level, GFP_ATOMIC))) 4248 return 0; 4249 4250 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level)) 4251 size = VTD_PAGE_SIZE << level_to_offset_bits(level); 4252 4253 start_pfn = iova >> VTD_PAGE_SHIFT; 4254 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT; 4255 4256 domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist); 4257 4258 if (dmar_domain->max_addr == iova + size) 4259 dmar_domain->max_addr = iova; 4260 4261 /* 4262 * We do not use page-selective IOTLB invalidation in flush queue, 4263 * so there is no need to track page and sync iotlb. 4264 */ 4265 if (!iommu_iotlb_gather_queued(gather)) 4266 iommu_iotlb_gather_add_page(domain, gather, iova, size); 4267 4268 return size; 4269 } 4270 4271 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain, 4272 unsigned long iova, 4273 size_t pgsize, size_t pgcount, 4274 struct iommu_iotlb_gather *gather) 4275 { 4276 unsigned long pgshift = __ffs(pgsize); 4277 size_t size = pgcount << pgshift; 4278 4279 return intel_iommu_unmap(domain, iova, size, gather); 4280 } 4281 4282 static void intel_iommu_tlb_sync(struct iommu_domain *domain, 4283 struct iommu_iotlb_gather *gather) 4284 { 4285 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4286 unsigned long iova_pfn = IOVA_PFN(gather->start); 4287 size_t size = gather->end - gather->start; 4288 struct iommu_domain_info *info; 4289 unsigned long start_pfn; 4290 unsigned long nrpages; 4291 unsigned long i; 4292 4293 nrpages = aligned_nrpages(gather->start, size); 4294 start_pfn = mm_to_dma_pfn_start(iova_pfn); 4295 4296 xa_for_each(&dmar_domain->iommu_array, i, info) 4297 iommu_flush_iotlb_psi(info->iommu, dmar_domain, 4298 start_pfn, nrpages, 4299 list_empty(&gather->freelist), 0); 4300 4301 put_pages_list(&gather->freelist); 4302 } 4303 4304 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain, 4305 dma_addr_t iova) 4306 { 4307 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4308 struct dma_pte *pte; 4309 int level = 0; 4310 u64 phys = 0; 4311 4312 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level, 4313 GFP_ATOMIC); 4314 if (pte && dma_pte_present(pte)) 4315 phys = dma_pte_addr(pte) + 4316 (iova & (BIT_MASK(level_to_offset_bits(level) + 4317 VTD_PAGE_SHIFT) - 1)); 4318 4319 return phys; 4320 } 4321 4322 static bool 
domain_support_force_snooping(struct dmar_domain *domain) 4323 { 4324 struct device_domain_info *info; 4325 bool support = true; 4326 4327 assert_spin_locked(&domain->lock); 4328 list_for_each_entry(info, &domain->devices, link) { 4329 if (!ecap_sc_support(info->iommu->ecap)) { 4330 support = false; 4331 break; 4332 } 4333 } 4334 4335 return support; 4336 } 4337 4338 static void domain_set_force_snooping(struct dmar_domain *domain) 4339 { 4340 struct device_domain_info *info; 4341 4342 assert_spin_locked(&domain->lock); 4343 /* 4344 * Second level page table supports per-PTE snoop control. The 4345 * iommu_map() interface will handle this by setting SNP bit. 4346 */ 4347 if (!domain->use_first_level) { 4348 domain->set_pte_snp = true; 4349 return; 4350 } 4351 4352 list_for_each_entry(info, &domain->devices, link) 4353 intel_pasid_setup_page_snoop_control(info->iommu, info->dev, 4354 IOMMU_NO_PASID); 4355 } 4356 4357 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain) 4358 { 4359 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4360 unsigned long flags; 4361 4362 if (dmar_domain->force_snooping) 4363 return true; 4364 4365 spin_lock_irqsave(&dmar_domain->lock, flags); 4366 if (!domain_support_force_snooping(dmar_domain) || 4367 (!dmar_domain->use_first_level && dmar_domain->has_mappings)) { 4368 spin_unlock_irqrestore(&dmar_domain->lock, flags); 4369 return false; 4370 } 4371 4372 domain_set_force_snooping(dmar_domain); 4373 dmar_domain->force_snooping = true; 4374 spin_unlock_irqrestore(&dmar_domain->lock, flags); 4375 4376 return true; 4377 } 4378 4379 static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap) 4380 { 4381 struct device_domain_info *info = dev_iommu_priv_get(dev); 4382 4383 switch (cap) { 4384 case IOMMU_CAP_CACHE_COHERENCY: 4385 case IOMMU_CAP_DEFERRED_FLUSH: 4386 return true; 4387 case IOMMU_CAP_PRE_BOOT_PROTECTION: 4388 return dmar_platform_optin(); 4389 case IOMMU_CAP_ENFORCE_CACHE_COHERENCY: 4390 return ecap_sc_support(info->iommu->ecap); 4391 case IOMMU_CAP_DIRTY_TRACKING: 4392 return ssads_supported(info->iommu); 4393 default: 4394 return false; 4395 } 4396 } 4397 4398 static struct iommu_device *intel_iommu_probe_device(struct device *dev) 4399 { 4400 struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL; 4401 struct device_domain_info *info; 4402 struct intel_iommu *iommu; 4403 u8 bus, devfn; 4404 int ret; 4405 4406 iommu = device_to_iommu(dev, &bus, &devfn); 4407 if (!iommu || !iommu->iommu.ops) 4408 return ERR_PTR(-ENODEV); 4409 4410 info = kzalloc(sizeof(*info), GFP_KERNEL); 4411 if (!info) 4412 return ERR_PTR(-ENOMEM); 4413 4414 if (dev_is_real_dma_subdevice(dev)) { 4415 info->bus = pdev->bus->number; 4416 info->devfn = pdev->devfn; 4417 info->segment = pci_domain_nr(pdev->bus); 4418 } else { 4419 info->bus = bus; 4420 info->devfn = devfn; 4421 info->segment = iommu->segment; 4422 } 4423 4424 info->dev = dev; 4425 info->iommu = iommu; 4426 if (dev_is_pci(dev)) { 4427 if (ecap_dev_iotlb_support(iommu->ecap) && 4428 pci_ats_supported(pdev) && 4429 dmar_ats_supported(pdev, iommu)) { 4430 info->ats_supported = 1; 4431 info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev); 4432 4433 /* 4434 * For IOMMU that supports device IOTLB throttling 4435 * (DIT), we assign PFSID to the invalidation desc 4436 * of a VF such that IOMMU HW can gauge queue depth 4437 * at PF level. If DIT is not set, PFSID will be 4438 * treated as reserved, which should be set to 0. 
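 * Since info was zero-allocated above, leaving pfsid untouched when DIT
 * is not supported keeps it at the required reserved value of 0.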
4439 */ 4440 if (ecap_dit(iommu->ecap)) 4441 info->pfsid = pci_dev_id(pci_physfn(pdev)); 4442 info->ats_qdep = pci_ats_queue_depth(pdev); 4443 } 4444 if (sm_supported(iommu)) { 4445 if (pasid_supported(iommu)) { 4446 int features = pci_pasid_features(pdev); 4447 4448 if (features >= 0) 4449 info->pasid_supported = features | 1; 4450 } 4451 4452 if (info->ats_supported && ecap_prs(iommu->ecap) && 4453 pci_pri_supported(pdev)) 4454 info->pri_supported = 1; 4455 } 4456 } 4457 4458 dev_iommu_priv_set(dev, info); 4459 4460 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) { 4461 ret = intel_pasid_alloc_table(dev); 4462 if (ret) { 4463 dev_err(dev, "PASID table allocation failed\n"); 4464 dev_iommu_priv_set(dev, NULL); 4465 kfree(info); 4466 return ERR_PTR(ret); 4467 } 4468 } 4469 4470 intel_iommu_debugfs_create_dev(info); 4471 4472 return &iommu->iommu; 4473 } 4474 4475 static void intel_iommu_release_device(struct device *dev) 4476 { 4477 struct device_domain_info *info = dev_iommu_priv_get(dev); 4478 4479 dmar_remove_one_dev_info(dev); 4480 intel_pasid_free_table(dev); 4481 intel_iommu_debugfs_remove_dev(info); 4482 dev_iommu_priv_set(dev, NULL); 4483 kfree(info); 4484 set_dma_ops(dev, NULL); 4485 } 4486 4487 static void intel_iommu_probe_finalize(struct device *dev) 4488 { 4489 set_dma_ops(dev, NULL); 4490 iommu_setup_dma_ops(dev, 0, U64_MAX); 4491 } 4492 4493 static void intel_iommu_get_resv_regions(struct device *device, 4494 struct list_head *head) 4495 { 4496 int prot = DMA_PTE_READ | DMA_PTE_WRITE; 4497 struct iommu_resv_region *reg; 4498 struct dmar_rmrr_unit *rmrr; 4499 struct device *i_dev; 4500 int i; 4501 4502 rcu_read_lock(); 4503 for_each_rmrr_units(rmrr) { 4504 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt, 4505 i, i_dev) { 4506 struct iommu_resv_region *resv; 4507 enum iommu_resv_type type; 4508 size_t length; 4509 4510 if (i_dev != device && 4511 !is_downstream_to_pci_bridge(device, i_dev)) 4512 continue; 4513 4514 length = rmrr->end_address - rmrr->base_address + 1; 4515 4516 type = device_rmrr_is_relaxable(device) ? 
4517 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT; 4518 4519 resv = iommu_alloc_resv_region(rmrr->base_address, 4520 length, prot, type, 4521 GFP_ATOMIC); 4522 if (!resv) 4523 break; 4524 4525 list_add_tail(&resv->list, head); 4526 } 4527 } 4528 rcu_read_unlock(); 4529 4530 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA 4531 if (dev_is_pci(device)) { 4532 struct pci_dev *pdev = to_pci_dev(device); 4533 4534 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) { 4535 reg = iommu_alloc_resv_region(0, 1UL << 24, prot, 4536 IOMMU_RESV_DIRECT_RELAXABLE, 4537 GFP_KERNEL); 4538 if (reg) 4539 list_add_tail(&reg->list, head); 4540 } 4541 } 4542 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */ 4543 4544 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START, 4545 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1, 4546 0, IOMMU_RESV_MSI, GFP_KERNEL); 4547 if (!reg) 4548 return; 4549 list_add_tail(&reg->list, head); 4550 } 4551 4552 static struct iommu_group *intel_iommu_device_group(struct device *dev) 4553 { 4554 if (dev_is_pci(dev)) 4555 return pci_device_group(dev); 4556 return generic_device_group(dev); 4557 } 4558 4559 static int intel_iommu_enable_sva(struct device *dev) 4560 { 4561 struct device_domain_info *info = dev_iommu_priv_get(dev); 4562 struct intel_iommu *iommu; 4563 4564 if (!info || dmar_disabled) 4565 return -EINVAL; 4566 4567 iommu = info->iommu; 4568 if (!iommu) 4569 return -EINVAL; 4570 4571 if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE)) 4572 return -ENODEV; 4573 4574 if (!info->pasid_enabled || !info->ats_enabled) 4575 return -EINVAL; 4576 4577 /* 4578 * Devices having device-specific I/O fault handling should not 4579 * support PCI/PRI. The IOMMU side has no means to check the 4580 * capability of device-specific IOPF. Therefore, the IOMMU can only 4581 * assume that if the device driver enables SVA on a non-PRI 4582 * device, it will handle IOPF in its own way. 4583 */ 4584 if (!info->pri_supported) 4585 return 0; 4586 4587 /* Devices supporting PRI should have it enabled. */ 4588 if (!info->pri_enabled) 4589 return -EINVAL; 4590 4591 return 0; 4592 } 4593 4594 static int intel_iommu_enable_iopf(struct device *dev) 4595 { 4596 struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL; 4597 struct device_domain_info *info = dev_iommu_priv_get(dev); 4598 struct intel_iommu *iommu; 4599 int ret; 4600 4601 if (!pdev || !info || !info->ats_enabled || !info->pri_supported) 4602 return -ENODEV; 4603 4604 if (info->pri_enabled) 4605 return -EBUSY; 4606 4607 iommu = info->iommu; 4608 if (!iommu) 4609 return -EINVAL; 4610 4611 /* PASID is required in PRG Response Message.
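If the device has PASID enabled but does not expect a PASID in PRG Response messages (pci_prg_resp_pasid_required() is false), responses to PASID-tagged page requests may not be handled correctly, so enabling IOPF is refused below.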
*/ 4612 if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev)) 4613 return -EINVAL; 4614 4615 ret = pci_reset_pri(pdev); 4616 if (ret) 4617 return ret; 4618 4619 ret = iopf_queue_add_device(iommu->iopf_queue, dev); 4620 if (ret) 4621 return ret; 4622 4623 ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev); 4624 if (ret) 4625 goto iopf_remove_device; 4626 4627 ret = pci_enable_pri(pdev, PRQ_DEPTH); 4628 if (ret) 4629 goto iopf_unregister_handler; 4630 info->pri_enabled = 1; 4631 4632 return 0; 4633 4634 iopf_unregister_handler: 4635 iommu_unregister_device_fault_handler(dev); 4636 iopf_remove_device: 4637 iopf_queue_remove_device(iommu->iopf_queue, dev); 4638 4639 return ret; 4640 } 4641 4642 static int intel_iommu_disable_iopf(struct device *dev) 4643 { 4644 struct device_domain_info *info = dev_iommu_priv_get(dev); 4645 struct intel_iommu *iommu = info->iommu; 4646 4647 if (!info->pri_enabled) 4648 return -EINVAL; 4649 4650 /* 4651 * PCIe spec states that by clearing the PRI enable bit, the Page 4652 * Request Interface will not issue new page requests, but it may 4653 * still have outstanding page requests that have been transmitted 4654 * or are queued for transmission. This is supposed to be called 4655 * after the device driver has stopped DMA, all PASIDs have been 4656 * unbound and the outstanding PRQs have been drained. 4657 */ 4658 pci_disable_pri(to_pci_dev(dev)); 4659 info->pri_enabled = 0; 4660 4661 /* 4662 * With PRI disabled and outstanding PRQs drained, unregistering 4663 * fault handler and removing device from iopf queue should never 4664 * fail. 4665 */ 4666 WARN_ON(iommu_unregister_device_fault_handler(dev)); 4667 WARN_ON(iopf_queue_remove_device(iommu->iopf_queue, dev)); 4668 4669 return 0; 4670 } 4671 4672 static int 4673 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat) 4674 { 4675 switch (feat) { 4676 case IOMMU_DEV_FEAT_IOPF: 4677 return intel_iommu_enable_iopf(dev); 4678 4679 case IOMMU_DEV_FEAT_SVA: 4680 return intel_iommu_enable_sva(dev); 4681 4682 default: 4683 return -ENODEV; 4684 } 4685 } 4686 4687 static int 4688 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat) 4689 { 4690 switch (feat) { 4691 case IOMMU_DEV_FEAT_IOPF: 4692 return intel_iommu_disable_iopf(dev); 4693 4694 case IOMMU_DEV_FEAT_SVA: 4695 return 0; 4696 4697 default: 4698 return -ENODEV; 4699 } 4700 } 4701 4702 static bool intel_iommu_is_attach_deferred(struct device *dev) 4703 { 4704 struct device_domain_info *info = dev_iommu_priv_get(dev); 4705 4706 return translation_pre_enabled(info->iommu) && !info->domain; 4707 } 4708 4709 /* 4710 * Check that the device does not live on an external facing PCI port that is 4711 * marked as untrusted. Such devices should not be able to apply quirks and 4712 * thus not be able to bypass the IOMMU restrictions.
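 * The quirk handlers below call this helper and bail out early for such
 * devices, so an untrusted device cannot, for instance, force rwbf_quirk
 * or turn off DMA remapping for graphics.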
4713 */ 4714 static bool risky_device(struct pci_dev *pdev) 4715 { 4716 if (pdev->untrusted) { 4717 pci_info(pdev, 4718 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n", 4719 pdev->vendor, pdev->device); 4720 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n"); 4721 return true; 4722 } 4723 return false; 4724 } 4725 4726 static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain, 4727 unsigned long iova, size_t size) 4728 { 4729 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4730 unsigned long pages = aligned_nrpages(iova, size); 4731 unsigned long pfn = iova >> VTD_PAGE_SHIFT; 4732 struct iommu_domain_info *info; 4733 unsigned long i; 4734 4735 xa_for_each(&dmar_domain->iommu_array, i, info) 4736 __mapping_notify_one(info->iommu, dmar_domain, pfn, pages); 4737 return 0; 4738 } 4739 4740 static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid) 4741 { 4742 struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL); 4743 struct dev_pasid_info *curr, *dev_pasid = NULL; 4744 struct dmar_domain *dmar_domain; 4745 struct iommu_domain *domain; 4746 unsigned long flags; 4747 4748 domain = iommu_get_domain_for_dev_pasid(dev, pasid, 0); 4749 if (WARN_ON_ONCE(!domain)) 4750 goto out_tear_down; 4751 4752 /* 4753 * The SVA implementation needs to handle its own bookkeeping, such as 4754 * the mm notification. Before consolidating that code into the iommu 4755 * core, let the intel sva code handle it. 4756 */ 4757 if (domain->type == IOMMU_DOMAIN_SVA) { 4758 intel_svm_remove_dev_pasid(dev, pasid); 4759 goto out_tear_down; 4760 } 4761 4762 dmar_domain = to_dmar_domain(domain); 4763 spin_lock_irqsave(&dmar_domain->lock, flags); 4764 list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) { 4765 if (curr->dev == dev && curr->pasid == pasid) { 4766 list_del(&curr->link_domain); 4767 dev_pasid = curr; 4768 break; 4769 } 4770 } 4771 WARN_ON_ONCE(!dev_pasid); 4772 spin_unlock_irqrestore(&dmar_domain->lock, flags); 4773 4774 domain_detach_iommu(dmar_domain, iommu); 4775 intel_iommu_debugfs_remove_dev_pasid(dev_pasid); 4776 kfree(dev_pasid); 4777 out_tear_down: 4778 intel_pasid_tear_down_entry(iommu, dev, pasid, false); 4779 intel_drain_pasid_prq(dev, pasid); 4780 } 4781 4782 static int intel_iommu_set_dev_pasid(struct iommu_domain *domain, 4783 struct device *dev, ioasid_t pasid) 4784 { 4785 struct device_domain_info *info = dev_iommu_priv_get(dev); 4786 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4787 struct intel_iommu *iommu = info->iommu; 4788 struct dev_pasid_info *dev_pasid; 4789 unsigned long flags; 4790 int ret; 4791 4792 if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev)) 4793 return -EOPNOTSUPP; 4794 4795 if (domain->dirty_ops) 4796 return -EINVAL; 4797 4798 if (context_copied(iommu, info->bus, info->devfn)) 4799 return -EBUSY; 4800 4801 ret = prepare_domain_attach_device(domain, dev); 4802 if (ret) 4803 return ret; 4804 4805 dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL); 4806 if (!dev_pasid) 4807 return -ENOMEM; 4808 4809 ret = domain_attach_iommu(dmar_domain, iommu); 4810 if (ret) 4811 goto out_free; 4812 4813 if (domain_type_is_si(dmar_domain)) 4814 ret = intel_pasid_setup_pass_through(iommu, dmar_domain, 4815 dev, pasid); 4816 else if (dmar_domain->use_first_level) 4817 ret = domain_setup_first_level(iommu, dmar_domain, 4818 dev, pasid); 4819 else 4820 ret = intel_pasid_setup_second_level(iommu, dmar_domain, 4821 dev, pasid); 4822 if (ret) 4823 goto out_detach_iommu; 4824 4825 dev_pasid->dev
= dev; 4826 dev_pasid->pasid = pasid; 4827 spin_lock_irqsave(&dmar_domain->lock, flags); 4828 list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids); 4829 spin_unlock_irqrestore(&dmar_domain->lock, flags); 4830 4831 if (domain->type & __IOMMU_DOMAIN_PAGING) 4832 intel_iommu_debugfs_create_dev_pasid(dev_pasid); 4833 4834 return 0; 4835 out_detach_iommu: 4836 domain_detach_iommu(dmar_domain, iommu); 4837 out_free: 4838 kfree(dev_pasid); 4839 return ret; 4840 } 4841 4842 static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type) 4843 { 4844 struct device_domain_info *info = dev_iommu_priv_get(dev); 4845 struct intel_iommu *iommu = info->iommu; 4846 struct iommu_hw_info_vtd *vtd; 4847 4848 vtd = kzalloc(sizeof(*vtd), GFP_KERNEL); 4849 if (!vtd) 4850 return ERR_PTR(-ENOMEM); 4851 4852 vtd->flags = IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17; 4853 vtd->cap_reg = iommu->cap; 4854 vtd->ecap_reg = iommu->ecap; 4855 *length = sizeof(*vtd); 4856 *type = IOMMU_HW_INFO_TYPE_INTEL_VTD; 4857 return vtd; 4858 } 4859 4860 static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain, 4861 bool enable) 4862 { 4863 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4864 struct device_domain_info *info; 4865 int ret; 4866 4867 spin_lock(&dmar_domain->lock); 4868 if (dmar_domain->dirty_tracking == enable) 4869 goto out_unlock; 4870 4871 list_for_each_entry(info, &dmar_domain->devices, link) { 4872 ret = intel_pasid_setup_dirty_tracking(info->iommu, 4873 info->domain, info->dev, 4874 IOMMU_NO_PASID, enable); 4875 if (ret) 4876 goto err_unwind; 4877 } 4878 4879 dmar_domain->dirty_tracking = enable; 4880 out_unlock: 4881 spin_unlock(&dmar_domain->lock); 4882 4883 return 0; 4884 4885 err_unwind: 4886 list_for_each_entry(info, &dmar_domain->devices, link) 4887 intel_pasid_setup_dirty_tracking(info->iommu, dmar_domain, 4888 info->dev, IOMMU_NO_PASID, 4889 dmar_domain->dirty_tracking); 4890 spin_unlock(&dmar_domain->lock); 4891 return ret; 4892 } 4893 4894 static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain, 4895 unsigned long iova, size_t size, 4896 unsigned long flags, 4897 struct iommu_dirty_bitmap *dirty) 4898 { 4899 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4900 unsigned long end = iova + size - 1; 4901 unsigned long pgsize; 4902 4903 /* 4904 * IOMMUFD core calls into a dirty tracking disabled domain without an 4905 * IOVA bitmap set in order to clean dirty bits in all PTEs that might 4906 * have occurred when we stopped dirty tracking. This ensures that we 4907 * never inherit dirtied bits from a previous cycle. 
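 * Hence the check below rejects a caller that supplies a bitmap while
 * dirty tracking is disabled.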
4908 */ 4909 if (!dmar_domain->dirty_tracking && dirty->bitmap) 4910 return -EINVAL; 4911 4912 do { 4913 struct dma_pte *pte; 4914 int lvl = 0; 4915 4916 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl, 4917 GFP_ATOMIC); 4918 pgsize = level_size(lvl) << VTD_PAGE_SHIFT; 4919 if (!pte || !dma_pte_present(pte)) { 4920 iova += pgsize; 4921 continue; 4922 } 4923 4924 if (dma_sl_pte_test_and_clear_dirty(pte, flags)) 4925 iommu_dirty_bitmap_record(dirty, iova, pgsize); 4926 iova += pgsize; 4927 } while (iova < end); 4928 4929 return 0; 4930 } 4931 4932 static const struct iommu_dirty_ops intel_dirty_ops = { 4933 .set_dirty_tracking = intel_iommu_set_dirty_tracking, 4934 .read_and_clear_dirty = intel_iommu_read_and_clear_dirty, 4935 }; 4936 4937 const struct iommu_ops intel_iommu_ops = { 4938 .blocked_domain = &blocking_domain, 4939 .capable = intel_iommu_capable, 4940 .hw_info = intel_iommu_hw_info, 4941 .domain_alloc = intel_iommu_domain_alloc, 4942 .domain_alloc_user = intel_iommu_domain_alloc_user, 4943 .probe_device = intel_iommu_probe_device, 4944 .probe_finalize = intel_iommu_probe_finalize, 4945 .release_device = intel_iommu_release_device, 4946 .get_resv_regions = intel_iommu_get_resv_regions, 4947 .device_group = intel_iommu_device_group, 4948 .dev_enable_feat = intel_iommu_dev_enable_feat, 4949 .dev_disable_feat = intel_iommu_dev_disable_feat, 4950 .is_attach_deferred = intel_iommu_is_attach_deferred, 4951 .def_domain_type = device_def_domain_type, 4952 .remove_dev_pasid = intel_iommu_remove_dev_pasid, 4953 .pgsize_bitmap = SZ_4K, 4954 #ifdef CONFIG_INTEL_IOMMU_SVM 4955 .page_response = intel_svm_page_response, 4956 #endif 4957 .default_domain_ops = &(const struct iommu_domain_ops) { 4958 .attach_dev = intel_iommu_attach_device, 4959 .set_dev_pasid = intel_iommu_set_dev_pasid, 4960 .map_pages = intel_iommu_map_pages, 4961 .unmap_pages = intel_iommu_unmap_pages, 4962 .iotlb_sync_map = intel_iommu_iotlb_sync_map, 4963 .flush_iotlb_all = intel_flush_iotlb_all, 4964 .iotlb_sync = intel_iommu_tlb_sync, 4965 .iova_to_phys = intel_iommu_iova_to_phys, 4966 .free = intel_iommu_domain_free, 4967 .enforce_cache_coherency = intel_iommu_enforce_cache_coherency, 4968 } 4969 }; 4970 4971 static void quirk_iommu_igfx(struct pci_dev *dev) 4972 { 4973 if (risky_device(dev)) 4974 return; 4975 4976 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n"); 4977 dmar_map_gfx = 0; 4978 } 4979 4980 /* G4x/GM45 integrated gfx dmar support is totally busted. 
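The fixups below therefore clear dmar_map_gfx so that DMA remapping is not used for the affected integrated graphics devices.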
*/ 4981 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx); 4982 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx); 4983 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx); 4984 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx); 4985 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx); 4986 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx); 4987 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx); 4988 4989 /* Broadwell igfx malfunctions with dmar */ 4990 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx); 4991 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx); 4992 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx); 4993 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx); 4994 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx); 4995 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx); 4996 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx); 4997 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx); 4998 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx); 4999 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx); 5000 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx); 5001 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx); 5002 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx); 5003 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx); 5004 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx); 5005 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx); 5006 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx); 5007 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx); 5008 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx); 5009 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx); 5010 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx); 5011 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx); 5012 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx); 5013 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx); 5014 5015 static void quirk_iommu_rwbf(struct pci_dev *dev) 5016 { 5017 if (risky_device(dev)) 5018 return; 5019 5020 /* 5021 * Mobile 4 Series Chipset neglects to set RWBF capability, 5022 * but needs it. Same seems to hold for the desktop versions. 
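 * Setting rwbf_quirk below makes the driver issue write-buffer flushes
 * even though the capability bit reads as clear.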
5023 */ 5024 pci_info(dev, "Forcing write-buffer flush capability\n"); 5025 rwbf_quirk = 1; 5026 } 5027 5028 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf); 5029 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf); 5030 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf); 5031 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf); 5032 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf); 5033 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf); 5034 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf); 5035 5036 #define GGC 0x52 5037 #define GGC_MEMORY_SIZE_MASK (0xf << 8) 5038 #define GGC_MEMORY_SIZE_NONE (0x0 << 8) 5039 #define GGC_MEMORY_SIZE_1M (0x1 << 8) 5040 #define GGC_MEMORY_SIZE_2M (0x3 << 8) 5041 #define GGC_MEMORY_VT_ENABLED (0x8 << 8) 5042 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8) 5043 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8) 5044 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8) 5045 5046 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev) 5047 { 5048 unsigned short ggc; 5049 5050 if (risky_device(dev)) 5051 return; 5052 5053 if (pci_read_config_word(dev, GGC, &ggc)) 5054 return; 5055 5056 if (!(ggc & GGC_MEMORY_VT_ENABLED)) { 5057 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n"); 5058 dmar_map_gfx = 0; 5059 } else if (dmar_map_gfx) { 5060 /* we have to ensure the gfx device is idle before we flush */ 5061 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n"); 5062 iommu_set_dma_strict(); 5063 } 5064 } 5065 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt); 5066 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt); 5067 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt); 5068 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt); 5069 5070 static void quirk_igfx_skip_te_disable(struct pci_dev *dev) 5071 { 5072 unsigned short ver; 5073 5074 if (!IS_GFX_DEVICE(dev)) 5075 return; 5076 5077 ver = (dev->device >> 8) & 0xff; 5078 if (ver != 0x45 && ver != 0x46 && ver != 0x4c && 5079 ver != 0x4e && ver != 0x8a && ver != 0x98 && 5080 ver != 0x9a && ver != 0xa7 && ver != 0x7d) 5081 return; 5082 5083 if (risky_device(dev)) 5084 return; 5085 5086 pci_info(dev, "Skip IOMMU disabling for graphics\n"); 5087 iommu_skip_te_disable = 1; 5088 } 5089 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable); 5090 5091 /* On Tylersburg chipsets, some BIOSes have been known to enable the 5092 ISOCH DMAR unit for the Azalia sound device, but not give it any 5093 TLB entries, which causes it to deadlock. Check for that. We do 5094 this in a function called from init_dmars(), instead of in a PCI 5095 quirk, because we don't want to print the obnoxious "BIOS broken" 5096 message if VT-d is actually disabled. 5097 */ 5098 static void __init check_tylersburg_isoch(void) 5099 { 5100 struct pci_dev *pdev; 5101 uint32_t vtisochctrl; 5102 5103 /* If there's no Azalia in the system anyway, forget it. */ 5104 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL); 5105 if (!pdev) 5106 return; 5107 5108 if (risky_device(pdev)) { 5109 pci_dev_put(pdev); 5110 return; 5111 } 5112 5113 pci_dev_put(pdev); 5114 5115 /* System Management Registers. Might be hidden, in which case 5116 we can't do the sanity check. 
But that's OK, because the 5117 known-broken BIOSes _don't_ actually hide it, so far. */ 5118 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL); 5119 if (!pdev) 5120 return; 5121 5122 if (risky_device(pdev)) { 5123 pci_dev_put(pdev); 5124 return; 5125 } 5126 5127 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) { 5128 pci_dev_put(pdev); 5129 return; 5130 } 5131 5132 pci_dev_put(pdev); 5133 5134 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */ 5135 if (vtisochctrl & 1) 5136 return; 5137 5138 /* Drop all bits other than the number of TLB entries */ 5139 vtisochctrl &= 0x1c; 5140 5141 /* If we have the recommended number of TLB entries (16), fine. */ 5142 if (vtisochctrl == 0x10) 5143 return; 5144 5145 /* Zero TLB entries? You get to ride the short bus to school. */ 5146 if (!vtisochctrl) { 5147 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n" 5148 "BIOS vendor: %s; Ver: %s; Product Version: %s\n", 5149 dmi_get_system_info(DMI_BIOS_VENDOR), 5150 dmi_get_system_info(DMI_BIOS_VERSION), 5151 dmi_get_system_info(DMI_PRODUCT_VERSION)); 5152 iommu_identity_mapping |= IDENTMAP_AZALIA; 5153 return; 5154 } 5155 5156 pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n", 5157 vtisochctrl); 5158 } 5159 5160 /* 5161 * Here we deal with a device TLB defect where device may inadvertently issue ATS 5162 * invalidation completion before posted writes initiated with translated address 5163 * that utilized translations matching the invalidation address range, violating 5164 * the invalidation completion ordering. 5165 * Therefore, any use cases that cannot guarantee DMA is stopped before unmap is 5166 * vulnerable to this defect. In other words, any dTLB invalidation initiated not 5167 * under the control of the trusted/privileged host device driver must use this 5168 * quirk. 5169 * Device TLBs are invalidated under the following six conditions: 5170 * 1. Device driver does DMA API unmap IOVA 5171 * 2. Device driver unbind a PASID from a process, sva_unbind_device() 5172 * 3. PASID is torn down, after PASID cache is flushed. e.g. process 5173 * exit_mmap() due to crash 5174 * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where 5175 * VM has to free pages that were unmapped 5176 * 5. Userspace driver unmaps a DMA buffer 5177 * 6. Cache invalidation in vSVA usage (upcoming) 5178 * 5179 * For #1 and #2, device drivers are responsible for stopping DMA traffic 5180 * before unmap/unbind. For #3, iommu driver gets mmu_notifier to 5181 * invalidate TLB the same way as normal user unmap which will use this quirk. 5182 * The dTLB invalidation after PASID cache flush does not need this quirk. 5183 * 5184 * As a reminder, #6 will *NEED* this quirk as we enable nested translation. 5185 */ 5186 void quirk_extra_dev_tlb_flush(struct device_domain_info *info, 5187 unsigned long address, unsigned long mask, 5188 u32 pasid, u16 qdep) 5189 { 5190 u16 sid; 5191 5192 if (likely(!info->dtlb_extra_inval)) 5193 return; 5194 5195 sid = PCI_DEVID(info->bus, info->devfn); 5196 if (pasid == IOMMU_NO_PASID) { 5197 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid, 5198 qdep, address, mask); 5199 } else { 5200 qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid, 5201 pasid, qdep, address, mask); 5202 } 5203 } 5204 5205 #define ecmd_get_status_code(res) (((res) & 0xff) >> 1) 5206 5207 /* 5208 * Function to submit a command to the enhanced command interface. 
The 5209 * valid enhanced command descriptions are defined in Table 47 of the 5210 * VT-d spec. The VT-d hardware implementation may support some but not 5211 * all commands, which can be determined by checking the Enhanced 5212 * Command Capability Register. 5213 * 5214 * Return values: 5215 * - 0: Command successful without any error; 5216 * - Negative: software error value; 5217 * - Nonzero positive: failure status code defined in Table 48. 5218 */ 5219 int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob) 5220 { 5221 unsigned long flags; 5222 u64 res; 5223 int ret; 5224 5225 if (!cap_ecmds(iommu->cap)) 5226 return -ENODEV; 5227 5228 raw_spin_lock_irqsave(&iommu->register_lock, flags); 5229 5230 res = dmar_readq(iommu->reg + DMAR_ECRSP_REG); 5231 if (res & DMA_ECMD_ECRSP_IP) { 5232 ret = -EBUSY; 5233 goto err; 5234 } 5235 5236 /* 5237 * Unconditionally write the operand B, because 5238 * - There is no side effect if an ecmd doesn't require an 5239 * operand B, but we set the register to some value. 5240 * - It's not invoked in any critical path. The extra MMIO 5241 * write doesn't bring any performance concerns. 5242 */ 5243 dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob); 5244 dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT)); 5245 5246 IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq, 5247 !(res & DMA_ECMD_ECRSP_IP), res); 5248 5249 if (res & DMA_ECMD_ECRSP_IP) { 5250 ret = -ETIMEDOUT; 5251 goto err; 5252 } 5253 5254 ret = ecmd_get_status_code(res); 5255 err: 5256 raw_spin_unlock_irqrestore(&iommu->register_lock, flags); 5257 5258 return ret; 5259 } 5260
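/*
 * Illustrative caller sketch for ecmd_submit_sync(), not taken from this
 * file: ECMD_OPCODE is a placeholder for a valid opcode from Table 47 that
 * the hardware advertises as supported, and operand_a/operand_b stand for
 * the operand A/B values that opcode requires.
 *
 *	int ret = ecmd_submit_sync(iommu, ECMD_OPCODE, operand_a, operand_b);
 *
 *	if (ret == -ENODEV)
 *		// enhanced command interface not implemented by this IOMMU
 *	else if (ret < 0)
 *		// software error: interface busy or response timed out
 *	else if (ret)
 *		// hardware failure status code as defined in Table 48
 *	else
 *		// command completed successfully
 */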