1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright © 2006-2014 Intel Corporation. 4 * 5 * Authors: David Woodhouse <dwmw2@infradead.org>, 6 * Ashok Raj <ashok.raj@intel.com>, 7 * Shaohua Li <shaohua.li@intel.com>, 8 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>, 9 * Fenghua Yu <fenghua.yu@intel.com> 10 * Joerg Roedel <jroedel@suse.de> 11 */ 12 13 #define pr_fmt(fmt) "DMAR: " fmt 14 #define dev_fmt(fmt) pr_fmt(fmt) 15 16 #include <linux/crash_dump.h> 17 #include <linux/dma-direct.h> 18 #include <linux/dma-iommu.h> 19 #include <linux/dmi.h> 20 #include <linux/intel-iommu.h> 21 #include <linux/intel-svm.h> 22 #include <linux/memory.h> 23 #include <linux/pci.h> 24 #include <linux/pci-ats.h> 25 #include <linux/spinlock.h> 26 #include <linux/syscore_ops.h> 27 #include <linux/tboot.h> 28 29 #include "../irq_remapping.h" 30 #include "../iommu-sva-lib.h" 31 #include "pasid.h" 32 #include "cap_audit.h" 33 34 #define ROOT_SIZE VTD_PAGE_SIZE 35 #define CONTEXT_SIZE VTD_PAGE_SIZE 36 37 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY) 38 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB) 39 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) 40 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e) 41 42 #define IOAPIC_RANGE_START (0xfee00000) 43 #define IOAPIC_RANGE_END (0xfeefffff) 44 #define IOVA_START_ADDR (0x1000) 45 46 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57 47 48 #define MAX_AGAW_WIDTH 64 49 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT) 50 51 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1) 52 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1) 53 54 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR 55 to match. That way, we can use 'unsigned long' for PFNs with impunity. */ 56 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \ 57 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1)) 58 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT) 59 60 /* IO virtual address start page frame number */ 61 #define IOVA_START_PFN (1) 62 63 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT) 64 65 /* page table handling */ 66 #define LEVEL_STRIDE (9) 67 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1) 68 69 static inline int agaw_to_level(int agaw) 70 { 71 return agaw + 2; 72 } 73 74 static inline int agaw_to_width(int agaw) 75 { 76 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH); 77 } 78 79 static inline int width_to_agaw(int width) 80 { 81 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE); 82 } 83 84 static inline unsigned int level_to_offset_bits(int level) 85 { 86 return (level - 1) * LEVEL_STRIDE; 87 } 88 89 static inline int pfn_level_offset(u64 pfn, int level) 90 { 91 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK; 92 } 93 94 static inline u64 level_mask(int level) 95 { 96 return -1ULL << level_to_offset_bits(level); 97 } 98 99 static inline u64 level_size(int level) 100 { 101 return 1ULL << level_to_offset_bits(level); 102 } 103 104 static inline u64 align_to_level(u64 pfn, int level) 105 { 106 return (pfn + level_size(level) - 1) & level_mask(level); 107 } 108 109 static inline unsigned long lvl_to_nr_pages(unsigned int lvl) 110 { 111 return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH); 112 } 113 114 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things 115 are never going to work. 
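   Illustrative example (page sizes assumed here, not dictated by this file):
   with 4KiB MM pages, PAGE_SHIFT == VTD_PAGE_SHIFT == 12, so the shift in
   mm_to_dma_pfn() below is zero and the conversion is an identity; with
   hypothetical 64KiB MM pages (PAGE_SHIFT == 16) each MM pfn would cover
   sixteen VT-d pfns, e.g. mm_to_dma_pfn(0x1234) == 0x12340.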
*/ 116 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn) 117 { 118 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT); 119 } 120 static inline unsigned long page_to_dma_pfn(struct page *pg) 121 { 122 return mm_to_dma_pfn(page_to_pfn(pg)); 123 } 124 static inline unsigned long virt_to_dma_pfn(void *p) 125 { 126 return page_to_dma_pfn(virt_to_page(p)); 127 } 128 129 /* global iommu list, set NULL for ignored DMAR units */ 130 static struct intel_iommu **g_iommus; 131 132 static void __init check_tylersburg_isoch(void); 133 static int rwbf_quirk; 134 static inline struct device_domain_info * 135 dmar_search_domain_by_dev_info(int segment, int bus, int devfn); 136 137 /* 138 * set to 1 to panic kernel if can't successfully enable VT-d 139 * (used when kernel is launched w/ TXT) 140 */ 141 static int force_on = 0; 142 static int intel_iommu_tboot_noforce; 143 static int no_platform_optin; 144 145 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry)) 146 147 /* 148 * Take a root_entry and return the Lower Context Table Pointer (LCTP) 149 * if marked present. 150 */ 151 static phys_addr_t root_entry_lctp(struct root_entry *re) 152 { 153 if (!(re->lo & 1)) 154 return 0; 155 156 return re->lo & VTD_PAGE_MASK; 157 } 158 159 /* 160 * Take a root_entry and return the Upper Context Table Pointer (UCTP) 161 * if marked present. 162 */ 163 static phys_addr_t root_entry_uctp(struct root_entry *re) 164 { 165 if (!(re->hi & 1)) 166 return 0; 167 168 return re->hi & VTD_PAGE_MASK; 169 } 170 171 static inline void context_clear_pasid_enable(struct context_entry *context) 172 { 173 context->lo &= ~(1ULL << 11); 174 } 175 176 static inline bool context_pasid_enabled(struct context_entry *context) 177 { 178 return !!(context->lo & (1ULL << 11)); 179 } 180 181 static inline void context_set_copied(struct context_entry *context) 182 { 183 context->hi |= (1ull << 3); 184 } 185 186 static inline bool context_copied(struct context_entry *context) 187 { 188 return !!(context->hi & (1ULL << 3)); 189 } 190 191 static inline bool __context_present(struct context_entry *context) 192 { 193 return (context->lo & 1); 194 } 195 196 bool context_present(struct context_entry *context) 197 { 198 return context_pasid_enabled(context) ? 
199 __context_present(context) : 200 __context_present(context) && !context_copied(context); 201 } 202 203 static inline void context_set_present(struct context_entry *context) 204 { 205 context->lo |= 1; 206 } 207 208 static inline void context_set_fault_enable(struct context_entry *context) 209 { 210 context->lo &= (((u64)-1) << 2) | 1; 211 } 212 213 static inline void context_set_translation_type(struct context_entry *context, 214 unsigned long value) 215 { 216 context->lo &= (((u64)-1) << 4) | 3; 217 context->lo |= (value & 3) << 2; 218 } 219 220 static inline void context_set_address_root(struct context_entry *context, 221 unsigned long value) 222 { 223 context->lo &= ~VTD_PAGE_MASK; 224 context->lo |= value & VTD_PAGE_MASK; 225 } 226 227 static inline void context_set_address_width(struct context_entry *context, 228 unsigned long value) 229 { 230 context->hi |= value & 7; 231 } 232 233 static inline void context_set_domain_id(struct context_entry *context, 234 unsigned long value) 235 { 236 context->hi |= (value & ((1 << 16) - 1)) << 8; 237 } 238 239 static inline int context_domain_id(struct context_entry *c) 240 { 241 return((c->hi >> 8) & 0xffff); 242 } 243 244 static inline void context_clear_entry(struct context_entry *context) 245 { 246 context->lo = 0; 247 context->hi = 0; 248 } 249 250 /* 251 * This domain is a statically identity mapping domain. 252 * 1. This domain creates a static 1:1 mapping to all usable memory. 253 * 2. It maps to each iommu if successful. 254 * 3. Each iommu maps to this domain if successful. 255 */ 256 static struct dmar_domain *si_domain; 257 static int hw_pass_through = 1; 258 259 #define for_each_domain_iommu(idx, domain) \ 260 for (idx = 0; idx < g_num_of_iommus; idx++) \ 261 if (domain->iommu_refcnt[idx]) 262 263 struct dmar_rmrr_unit { 264 struct list_head list; /* list of rmrr units */ 265 struct acpi_dmar_header *hdr; /* ACPI header */ 266 u64 base_address; /* reserved base address*/ 267 u64 end_address; /* reserved end address */ 268 struct dmar_dev_scope *devices; /* target devices */ 269 int devices_cnt; /* target device count */ 270 }; 271 272 struct dmar_atsr_unit { 273 struct list_head list; /* list of ATSR units */ 274 struct acpi_dmar_header *hdr; /* ACPI header */ 275 struct dmar_dev_scope *devices; /* target devices */ 276 int devices_cnt; /* target device count */ 277 u8 include_all:1; /* include all ports */ 278 }; 279 280 struct dmar_satc_unit { 281 struct list_head list; /* list of SATC units */ 282 struct acpi_dmar_header *hdr; /* ACPI header */ 283 struct dmar_dev_scope *devices; /* target devices */ 284 struct intel_iommu *iommu; /* the corresponding iommu */ 285 int devices_cnt; /* target device count */ 286 u8 atc_required:1; /* ATS is required */ 287 }; 288 289 static LIST_HEAD(dmar_atsr_units); 290 static LIST_HEAD(dmar_rmrr_units); 291 static LIST_HEAD(dmar_satc_units); 292 293 #define for_each_rmrr_units(rmrr) \ 294 list_for_each_entry(rmrr, &dmar_rmrr_units, list) 295 296 /* number of DMAR units; sizes the g_iommus array */ 297 static int g_num_of_iommus; 298 299 static void domain_remove_dev_info(struct dmar_domain *domain); 300 static void dmar_remove_one_dev_info(struct device *dev); 301 static void __dmar_remove_one_dev_info(struct device_domain_info *info); 302 303 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON); 304 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON); 305 306 int intel_iommu_enabled = 0; 307 EXPORT_SYMBOL_GPL(intel_iommu_enabled); 308 309 static int dmar_map_gfx = 1; 310
static int intel_iommu_superpage = 1; 311 static int iommu_identity_mapping; 312 static int iommu_skip_te_disable; 313 314 #define IDENTMAP_GFX 2 315 #define IDENTMAP_AZALIA 4 316 317 int intel_iommu_gfx_mapped; 318 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped); 319 320 DEFINE_SPINLOCK(device_domain_lock); 321 static LIST_HEAD(device_domain_list); 322 323 /* 324 * Iterate over elements in device_domain_list and call the specified 325 * callback @fn against each element. 326 */ 327 int for_each_device_domain(int (*fn)(struct device_domain_info *info, 328 void *data), void *data) 329 { 330 int ret = 0; 331 unsigned long flags; 332 struct device_domain_info *info; 333 334 spin_lock_irqsave(&device_domain_lock, flags); 335 list_for_each_entry(info, &device_domain_list, global) { 336 ret = fn(info, data); 337 if (ret) { 338 spin_unlock_irqrestore(&device_domain_lock, flags); 339 return ret; 340 } 341 } 342 spin_unlock_irqrestore(&device_domain_lock, flags); 343 344 return 0; 345 } 346 347 const struct iommu_ops intel_iommu_ops; 348 349 static bool translation_pre_enabled(struct intel_iommu *iommu) 350 { 351 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED); 352 } 353 354 static void clear_translation_pre_enabled(struct intel_iommu *iommu) 355 { 356 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED; 357 } 358 359 static void init_translation_status(struct intel_iommu *iommu) 360 { 361 u32 gsts; 362 363 gsts = readl(iommu->reg + DMAR_GSTS_REG); 364 if (gsts & DMA_GSTS_TES) 365 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED; 366 } 367 368 static int __init intel_iommu_setup(char *str) 369 { 370 if (!str) 371 return -EINVAL; 372 373 while (*str) { 374 if (!strncmp(str, "on", 2)) { 375 dmar_disabled = 0; 376 pr_info("IOMMU enabled\n"); 377 } else if (!strncmp(str, "off", 3)) { 378 dmar_disabled = 1; 379 no_platform_optin = 1; 380 pr_info("IOMMU disabled\n"); 381 } else if (!strncmp(str, "igfx_off", 8)) { 382 dmar_map_gfx = 0; 383 pr_info("Disable GFX device mapping\n"); 384 } else if (!strncmp(str, "forcedac", 8)) { 385 pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n"); 386 iommu_dma_forcedac = true; 387 } else if (!strncmp(str, "strict", 6)) { 388 pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n"); 389 iommu_set_dma_strict(); 390 } else if (!strncmp(str, "sp_off", 6)) { 391 pr_info("Disable supported super page\n"); 392 intel_iommu_superpage = 0; 393 } else if (!strncmp(str, "sm_on", 5)) { 394 pr_info("Enable scalable mode if hardware supports\n"); 395 intel_iommu_sm = 1; 396 } else if (!strncmp(str, "sm_off", 6)) { 397 pr_info("Scalable mode is disallowed\n"); 398 intel_iommu_sm = 0; 399 } else if (!strncmp(str, "tboot_noforce", 13)) { 400 pr_info("Intel-IOMMU: not forcing on after tboot. 
This could expose security risk for tboot\n"); 401 intel_iommu_tboot_noforce = 1; 402 } else { 403 pr_notice("Unknown option - '%s'\n", str); 404 } 405 406 str += strcspn(str, ","); 407 while (*str == ',') 408 str++; 409 } 410 411 return 1; 412 } 413 __setup("intel_iommu=", intel_iommu_setup); 414 415 void *alloc_pgtable_page(int node) 416 { 417 struct page *page; 418 void *vaddr = NULL; 419 420 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0); 421 if (page) 422 vaddr = page_address(page); 423 return vaddr; 424 } 425 426 void free_pgtable_page(void *vaddr) 427 { 428 free_page((unsigned long)vaddr); 429 } 430 431 static inline int domain_type_is_si(struct dmar_domain *domain) 432 { 433 return domain->domain.type == IOMMU_DOMAIN_IDENTITY; 434 } 435 436 static inline bool domain_use_first_level(struct dmar_domain *domain) 437 { 438 return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL; 439 } 440 441 static inline int domain_pfn_supported(struct dmar_domain *domain, 442 unsigned long pfn) 443 { 444 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT; 445 446 return !(addr_width < BITS_PER_LONG && pfn >> addr_width); 447 } 448 449 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw) 450 { 451 unsigned long sagaw; 452 int agaw; 453 454 sagaw = cap_sagaw(iommu->cap); 455 for (agaw = width_to_agaw(max_gaw); 456 agaw >= 0; agaw--) { 457 if (test_bit(agaw, &sagaw)) 458 break; 459 } 460 461 return agaw; 462 } 463 464 /* 465 * Calculate max SAGAW for each iommu. 466 */ 467 int iommu_calculate_max_sagaw(struct intel_iommu *iommu) 468 { 469 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH); 470 } 471 472 /* 473 * Calculate the agaw for each iommu. 474 * "SAGAW" may differ across iommus; start from the default agaw and fall 475 * back to a smaller supported agaw for iommus that don't support the default. 476 */ 477 int iommu_calculate_agaw(struct intel_iommu *iommu) 478 { 479 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH); 480 } 481 482 /* This function only returns a single iommu in a domain */ 483 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain) 484 { 485 int iommu_id; 486 487 /* si_domain and vm domain should not get here. */ 488 if (WARN_ON(!iommu_is_dma_domain(&domain->domain))) 489 return NULL; 490 491 for_each_domain_iommu(iommu_id, domain) 492 break; 493 494 if (iommu_id < 0 || iommu_id >= g_num_of_iommus) 495 return NULL; 496 497 return g_iommus[iommu_id]; 498 } 499 500 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu) 501 { 502 return sm_supported(iommu) ?
503 ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap); 504 } 505 506 static void domain_update_iommu_coherency(struct dmar_domain *domain) 507 { 508 struct dmar_drhd_unit *drhd; 509 struct intel_iommu *iommu; 510 bool found = false; 511 int i; 512 513 domain->iommu_coherency = true; 514 515 for_each_domain_iommu(i, domain) { 516 found = true; 517 if (!iommu_paging_structure_coherency(g_iommus[i])) { 518 domain->iommu_coherency = false; 519 break; 520 } 521 } 522 if (found) 523 return; 524 525 /* No hardware attached; use lowest common denominator */ 526 rcu_read_lock(); 527 for_each_active_iommu(iommu, drhd) { 528 if (!iommu_paging_structure_coherency(iommu)) { 529 domain->iommu_coherency = false; 530 break; 531 } 532 } 533 rcu_read_unlock(); 534 } 535 536 static bool domain_update_iommu_snooping(struct intel_iommu *skip) 537 { 538 struct dmar_drhd_unit *drhd; 539 struct intel_iommu *iommu; 540 bool ret = true; 541 542 rcu_read_lock(); 543 for_each_active_iommu(iommu, drhd) { 544 if (iommu != skip) { 545 /* 546 * If the hardware is operating in the scalable mode, 547 * the snooping control is always supported since we 548 * always set PASID-table-entry.PGSNP bit if the domain 549 * is managed outside (UNMANAGED). 550 */ 551 if (!sm_supported(iommu) && 552 !ecap_sc_support(iommu->ecap)) { 553 ret = false; 554 break; 555 } 556 } 557 } 558 rcu_read_unlock(); 559 560 return ret; 561 } 562 563 static int domain_update_iommu_superpage(struct dmar_domain *domain, 564 struct intel_iommu *skip) 565 { 566 struct dmar_drhd_unit *drhd; 567 struct intel_iommu *iommu; 568 int mask = 0x3; 569 570 if (!intel_iommu_superpage) 571 return 0; 572 573 /* set iommu_superpage to the smallest common denominator */ 574 rcu_read_lock(); 575 for_each_active_iommu(iommu, drhd) { 576 if (iommu != skip) { 577 if (domain && domain_use_first_level(domain)) { 578 if (!cap_fl1gp_support(iommu->cap)) 579 mask = 0x1; 580 } else { 581 mask &= cap_super_page_val(iommu->cap); 582 } 583 584 if (!mask) 585 break; 586 } 587 } 588 rcu_read_unlock(); 589 590 return fls(mask); 591 } 592 593 static int domain_update_device_node(struct dmar_domain *domain) 594 { 595 struct device_domain_info *info; 596 int nid = NUMA_NO_NODE; 597 598 assert_spin_locked(&device_domain_lock); 599 600 if (list_empty(&domain->devices)) 601 return NUMA_NO_NODE; 602 603 list_for_each_entry(info, &domain->devices, link) { 604 if (!info->dev) 605 continue; 606 607 /* 608 * There could possibly be multiple device numa nodes as devices 609 * within the same domain may sit behind different IOMMUs. There 610 * isn't perfect answer in such situation, so we select first 611 * come first served policy. 612 */ 613 nid = dev_to_node(info->dev); 614 if (nid != NUMA_NO_NODE) 615 break; 616 } 617 618 return nid; 619 } 620 621 static void domain_update_iotlb(struct dmar_domain *domain); 622 623 /* Return the super pagesize bitmap if supported. */ 624 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain) 625 { 626 unsigned long bitmap = 0; 627 628 /* 629 * 1-level super page supports page size of 2MiB, 2-level super page 630 * supports page size of both 2MiB and 1GiB. 
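 *
 * Worked example (illustrative, not new behaviour): for a second-level
 * domain, domain_update_iommu_superpage() above starts from mask 0x3 and
 * ANDs in each IOMMU's cap_super_page_val(); if every unit advertises both
 * 2MiB and 1GiB pages the result is fls(0x3) == 2 and the bitmap below
 * becomes SZ_2M | SZ_1G, while units advertising only 2MiB give
 * fls(0x1) == 1 and just SZ_2M.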
631 */ 632 if (domain->iommu_superpage == 1) 633 bitmap |= SZ_2M; 634 else if (domain->iommu_superpage == 2) 635 bitmap |= SZ_2M | SZ_1G; 636 637 return bitmap; 638 } 639 640 /* Some capabilities may be different across iommus */ 641 static void domain_update_iommu_cap(struct dmar_domain *domain) 642 { 643 domain_update_iommu_coherency(domain); 644 domain->iommu_snooping = domain_update_iommu_snooping(NULL); 645 domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL); 646 647 /* 648 * If RHSA is missing, we should default to the device numa domain 649 * as fall back. 650 */ 651 if (domain->nid == NUMA_NO_NODE) 652 domain->nid = domain_update_device_node(domain); 653 654 /* 655 * First-level translation restricts the input-address to a 656 * canonical address (i.e., address bits 63:N have the same 657 * value as address bit [N-1], where N is 48-bits with 4-level 658 * paging and 57-bits with 5-level paging). Hence, skip bit 659 * [N-1]. 660 */ 661 if (domain_use_first_level(domain)) 662 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1); 663 else 664 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw); 665 666 domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain); 667 domain_update_iotlb(domain); 668 } 669 670 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus, 671 u8 devfn, int alloc) 672 { 673 struct root_entry *root = &iommu->root_entry[bus]; 674 struct context_entry *context; 675 u64 *entry; 676 677 entry = &root->lo; 678 if (sm_supported(iommu)) { 679 if (devfn >= 0x80) { 680 devfn -= 0x80; 681 entry = &root->hi; 682 } 683 devfn *= 2; 684 } 685 if (*entry & 1) 686 context = phys_to_virt(*entry & VTD_PAGE_MASK); 687 else { 688 unsigned long phy_addr; 689 if (!alloc) 690 return NULL; 691 692 context = alloc_pgtable_page(iommu->node); 693 if (!context) 694 return NULL; 695 696 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE); 697 phy_addr = virt_to_phys((void *)context); 698 *entry = phy_addr | 1; 699 __iommu_flush_cache(iommu, entry, sizeof(*entry)); 700 } 701 return &context[devfn]; 702 } 703 704 /** 705 * is_downstream_to_pci_bridge - test if a device belongs to the PCI 706 * sub-hierarchy of a candidate PCI-PCI bridge 707 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy 708 * @bridge: the candidate PCI-PCI bridge 709 * 710 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false. 711 */ 712 static bool 713 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge) 714 { 715 struct pci_dev *pdev, *pbridge; 716 717 if (!dev_is_pci(dev) || !dev_is_pci(bridge)) 718 return false; 719 720 pdev = to_pci_dev(dev); 721 pbridge = to_pci_dev(bridge); 722 723 if (pbridge->subordinate && 724 pbridge->subordinate->number <= pdev->bus->number && 725 pbridge->subordinate->busn_res.end >= pdev->bus->number) 726 return true; 727 728 return false; 729 } 730 731 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev) 732 { 733 struct dmar_drhd_unit *drhd; 734 u32 vtbar; 735 int rc; 736 737 /* We know that this device on this chipset has its own IOMMU. 738 * If we find it under a different IOMMU, then the BIOS is lying 739 * to us. Hope that the IOMMU for this device is actually 740 * disabled, and it needs no translation... 
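 * Concretely (a summary of the check below, not new behaviour): VTBAR is
 * read from config offset 0xb0 of device 00.0 on this device's bus, and the
 * DRHD that dmar_find_matched_drhd_unit() returns for the IOAT device is
 * expected to live at VTBAR + 0xa000; any other placement means the scope
 * tables are bogus, so the kernel is tainted and the unit treated as a dummy.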
741 */ 742 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar); 743 if (rc) { 744 /* "can't" happen */ 745 dev_info(&pdev->dev, "failed to run vt-d quirk\n"); 746 return false; 747 } 748 vtbar &= 0xffff0000; 749 750 /* we know that this iommu should be at offset 0xa000 from vtbar */ 751 drhd = dmar_find_matched_drhd_unit(pdev); 752 if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) { 753 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"); 754 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); 755 return true; 756 } 757 758 return false; 759 } 760 761 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev) 762 { 763 if (!iommu || iommu->drhd->ignored) 764 return true; 765 766 if (dev_is_pci(dev)) { 767 struct pci_dev *pdev = to_pci_dev(dev); 768 769 if (pdev->vendor == PCI_VENDOR_ID_INTEL && 770 pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB && 771 quirk_ioat_snb_local_iommu(pdev)) 772 return true; 773 } 774 775 return false; 776 } 777 778 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn) 779 { 780 struct dmar_drhd_unit *drhd = NULL; 781 struct pci_dev *pdev = NULL; 782 struct intel_iommu *iommu; 783 struct device *tmp; 784 u16 segment = 0; 785 int i; 786 787 if (!dev) 788 return NULL; 789 790 if (dev_is_pci(dev)) { 791 struct pci_dev *pf_pdev; 792 793 pdev = pci_real_dma_dev(to_pci_dev(dev)); 794 795 /* VFs aren't listed in scope tables; we need to look up 796 * the PF instead to find the IOMMU. */ 797 pf_pdev = pci_physfn(pdev); 798 dev = &pf_pdev->dev; 799 segment = pci_domain_nr(pdev->bus); 800 } else if (has_acpi_companion(dev)) 801 dev = &ACPI_COMPANION(dev)->dev; 802 803 rcu_read_lock(); 804 for_each_iommu(iommu, drhd) { 805 if (pdev && segment != drhd->segment) 806 continue; 807 808 for_each_active_dev_scope(drhd->devices, 809 drhd->devices_cnt, i, tmp) { 810 if (tmp == dev) { 811 /* For a VF use its original BDF# not that of the PF 812 * which we used for the IOMMU lookup. Strictly speaking 813 * we could do this for all PCI devices; we only need to 814 * get the BDF# from the scope table for ACPI matches.
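 * Hypothetical example (addresses assumed): for VF 0000:3b:00.2 whose PF
 * 0000:3b:00.0 is what the scope tables list, the lookup above matches the
 * PF, but the caller still gets the VF's own RID back (*bus == 0x3b,
 * *devfn == 0x02) via the got_pdev path.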
*/ 815 if (pdev && pdev->is_virtfn) 816 goto got_pdev; 817 818 if (bus && devfn) { 819 *bus = drhd->devices[i].bus; 820 *devfn = drhd->devices[i].devfn; 821 } 822 goto out; 823 } 824 825 if (is_downstream_to_pci_bridge(dev, tmp)) 826 goto got_pdev; 827 } 828 829 if (pdev && drhd->include_all) { 830 got_pdev: 831 if (bus && devfn) { 832 *bus = pdev->bus->number; 833 *devfn = pdev->devfn; 834 } 835 goto out; 836 } 837 } 838 iommu = NULL; 839 out: 840 if (iommu_is_dummy(iommu, dev)) 841 iommu = NULL; 842 843 rcu_read_unlock(); 844 845 return iommu; 846 } 847 848 static void domain_flush_cache(struct dmar_domain *domain, 849 void *addr, int size) 850 { 851 if (!domain->iommu_coherency) 852 clflush_cache_range(addr, size); 853 } 854 855 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn) 856 { 857 struct context_entry *context; 858 int ret = 0; 859 unsigned long flags; 860 861 spin_lock_irqsave(&iommu->lock, flags); 862 context = iommu_context_addr(iommu, bus, devfn, 0); 863 if (context) 864 ret = context_present(context); 865 spin_unlock_irqrestore(&iommu->lock, flags); 866 return ret; 867 } 868 869 static void free_context_table(struct intel_iommu *iommu) 870 { 871 int i; 872 unsigned long flags; 873 struct context_entry *context; 874 875 spin_lock_irqsave(&iommu->lock, flags); 876 if (!iommu->root_entry) { 877 goto out; 878 } 879 for (i = 0; i < ROOT_ENTRY_NR; i++) { 880 context = iommu_context_addr(iommu, i, 0, 0); 881 if (context) 882 free_pgtable_page(context); 883 884 if (!sm_supported(iommu)) 885 continue; 886 887 context = iommu_context_addr(iommu, i, 0x80, 0); 888 if (context) 889 free_pgtable_page(context); 890 891 } 892 free_pgtable_page(iommu->root_entry); 893 iommu->root_entry = NULL; 894 out: 895 spin_unlock_irqrestore(&iommu->lock, flags); 896 } 897 898 #ifdef CONFIG_DMAR_DEBUG 899 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn, u8 bus, u8 devfn) 900 { 901 struct device_domain_info *info; 902 struct dma_pte *parent, *pte; 903 struct dmar_domain *domain; 904 int offset, level; 905 906 info = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn); 907 if (!info || !info->domain) { 908 pr_info("device [%02x:%02x.%d] not probed\n", 909 bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); 910 return; 911 } 912 913 domain = info->domain; 914 level = agaw_to_level(domain->agaw); 915 parent = domain->pgd; 916 if (!parent) { 917 pr_info("no page table setup\n"); 918 return; 919 } 920 921 while (1) { 922 offset = pfn_level_offset(pfn, level); 923 pte = &parent[offset]; 924 if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) { 925 pr_info("PTE not present at level %d\n", level); 926 break; 927 } 928 929 pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val); 930 931 if (level == 1) 932 break; 933 934 parent = phys_to_virt(dma_pte_addr(pte)); 935 level--; 936 } 937 } 938 939 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id, 940 unsigned long long addr, u32 pasid) 941 { 942 struct pasid_dir_entry *dir, *pde; 943 struct pasid_entry *entries, *pte; 944 struct context_entry *ctx_entry; 945 struct root_entry *rt_entry; 946 u8 devfn = source_id & 0xff; 947 u8 bus = source_id >> 8; 948 int i, dir_index, index; 949 950 pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr); 951 952 /* root entry dump */ 953 rt_entry = &iommu->root_entry[bus]; 954 if (!rt_entry) { 955 pr_info("root table entry is not present\n"); 956 return; 957 } 958 959 if (sm_supported(iommu)) 960 pr_info("scalable mode root entry: hi 
0x%016llx, low 0x%016llx\n", 961 rt_entry->hi, rt_entry->lo); 962 else 963 pr_info("root entry: 0x%016llx", rt_entry->lo); 964 965 /* context entry dump */ 966 ctx_entry = iommu_context_addr(iommu, bus, devfn, 0); 967 if (!ctx_entry) { 968 pr_info("context table entry is not present\n"); 969 return; 970 } 971 972 pr_info("context entry: hi 0x%016llx, low 0x%016llx\n", 973 ctx_entry->hi, ctx_entry->lo); 974 975 /* legacy mode does not require PASID entries */ 976 if (!sm_supported(iommu)) 977 goto pgtable_walk; 978 979 /* get the pointer to pasid directory entry */ 980 dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK); 981 if (!dir) { 982 pr_info("pasid directory entry is not present\n"); 983 return; 984 } 985 /* For request-without-pasid, get the pasid from context entry */ 986 if (intel_iommu_sm && pasid == INVALID_IOASID) 987 pasid = PASID_RID2PASID; 988 989 dir_index = pasid >> PASID_PDE_SHIFT; 990 pde = &dir[dir_index]; 991 pr_info("pasid dir entry: 0x%016llx\n", pde->val); 992 993 /* get the pointer to the pasid table entry */ 994 entries = get_pasid_table_from_pde(pde); 995 if (!entries) { 996 pr_info("pasid table entry is not present\n"); 997 return; 998 } 999 index = pasid & PASID_PTE_MASK; 1000 pte = &entries[index]; 1001 for (i = 0; i < ARRAY_SIZE(pte->val); i++) 1002 pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]); 1003 1004 pgtable_walk: 1005 pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn); 1006 } 1007 #endif 1008 1009 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain, 1010 unsigned long pfn, int *target_level) 1011 { 1012 struct dma_pte *parent, *pte; 1013 int level = agaw_to_level(domain->agaw); 1014 int offset; 1015 1016 BUG_ON(!domain->pgd); 1017 1018 if (!domain_pfn_supported(domain, pfn)) 1019 /* Address beyond IOMMU's addressing capabilities. */ 1020 return NULL; 1021 1022 parent = domain->pgd; 1023 1024 while (1) { 1025 void *tmp_page; 1026 1027 offset = pfn_level_offset(pfn, level); 1028 pte = &parent[offset]; 1029 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte))) 1030 break; 1031 if (level == *target_level) 1032 break; 1033 1034 if (!dma_pte_present(pte)) { 1035 uint64_t pteval; 1036 1037 tmp_page = alloc_pgtable_page(domain->nid); 1038 1039 if (!tmp_page) 1040 return NULL; 1041 1042 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE); 1043 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE; 1044 if (domain_use_first_level(domain)) { 1045 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US; 1046 if (iommu_is_dma_domain(&domain->domain)) 1047 pteval |= DMA_FL_PTE_ACCESS; 1048 } 1049 if (cmpxchg64(&pte->val, 0ULL, pteval)) 1050 /* Someone else set it while we were thinking; use theirs. 
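(cmpxchg64() only installs our newly allocated table if the slot is still clear; when the race is lost, the winner's entry is kept and the page allocated above is freed just below.)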
*/ 1051 free_pgtable_page(tmp_page); 1052 else 1053 domain_flush_cache(domain, pte, sizeof(*pte)); 1054 } 1055 if (level == 1) 1056 break; 1057 1058 parent = phys_to_virt(dma_pte_addr(pte)); 1059 level--; 1060 } 1061 1062 if (!*target_level) 1063 *target_level = level; 1064 1065 return pte; 1066 } 1067 1068 /* return address's pte at specific level */ 1069 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain, 1070 unsigned long pfn, 1071 int level, int *large_page) 1072 { 1073 struct dma_pte *parent, *pte; 1074 int total = agaw_to_level(domain->agaw); 1075 int offset; 1076 1077 parent = domain->pgd; 1078 while (level <= total) { 1079 offset = pfn_level_offset(pfn, total); 1080 pte = &parent[offset]; 1081 if (level == total) 1082 return pte; 1083 1084 if (!dma_pte_present(pte)) { 1085 *large_page = total; 1086 break; 1087 } 1088 1089 if (dma_pte_superpage(pte)) { 1090 *large_page = total; 1091 return pte; 1092 } 1093 1094 parent = phys_to_virt(dma_pte_addr(pte)); 1095 total--; 1096 } 1097 return NULL; 1098 } 1099 1100 /* clear last level pte, a tlb flush should be followed */ 1101 static void dma_pte_clear_range(struct dmar_domain *domain, 1102 unsigned long start_pfn, 1103 unsigned long last_pfn) 1104 { 1105 unsigned int large_page; 1106 struct dma_pte *first_pte, *pte; 1107 1108 BUG_ON(!domain_pfn_supported(domain, start_pfn)); 1109 BUG_ON(!domain_pfn_supported(domain, last_pfn)); 1110 BUG_ON(start_pfn > last_pfn); 1111 1112 /* we don't need lock here; nobody else touches the iova range */ 1113 do { 1114 large_page = 1; 1115 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page); 1116 if (!pte) { 1117 start_pfn = align_to_level(start_pfn + 1, large_page + 1); 1118 continue; 1119 } 1120 do { 1121 dma_clear_pte(pte); 1122 start_pfn += lvl_to_nr_pages(large_page); 1123 pte++; 1124 } while (start_pfn <= last_pfn && !first_pte_in_page(pte)); 1125 1126 domain_flush_cache(domain, first_pte, 1127 (void *)pte - (void *)first_pte); 1128 1129 } while (start_pfn && start_pfn <= last_pfn); 1130 } 1131 1132 static void dma_pte_free_level(struct dmar_domain *domain, int level, 1133 int retain_level, struct dma_pte *pte, 1134 unsigned long pfn, unsigned long start_pfn, 1135 unsigned long last_pfn) 1136 { 1137 pfn = max(start_pfn, pfn); 1138 pte = &pte[pfn_level_offset(pfn, level)]; 1139 1140 do { 1141 unsigned long level_pfn; 1142 struct dma_pte *level_pte; 1143 1144 if (!dma_pte_present(pte) || dma_pte_superpage(pte)) 1145 goto next; 1146 1147 level_pfn = pfn & level_mask(level); 1148 level_pte = phys_to_virt(dma_pte_addr(pte)); 1149 1150 if (level > 2) { 1151 dma_pte_free_level(domain, level - 1, retain_level, 1152 level_pte, level_pfn, start_pfn, 1153 last_pfn); 1154 } 1155 1156 /* 1157 * Free the page table if we're below the level we want to 1158 * retain and the range covers the entire table. 1159 */ 1160 if (level < retain_level && !(start_pfn > level_pfn || 1161 last_pfn < level_pfn + level_size(level) - 1)) { 1162 dma_clear_pte(pte); 1163 domain_flush_cache(domain, pte, sizeof(*pte)); 1164 free_pgtable_page(level_pte); 1165 } 1166 next: 1167 pfn += level_size(level); 1168 } while (!first_pte_in_page(++pte) && pfn <= last_pfn); 1169 } 1170 1171 /* 1172 * clear last level (leaf) ptes and free page table pages below the 1173 * level we wish to keep intact. 
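 *
 * Illustrative example (call values assumed): with a 4-level page table,
 * passing retain_level == 3 clears the leaf PTEs in [start_pfn, last_pfn]
 * and frees the lowest-level page-table pages that the range fully covers,
 * e.g. so that a 2MiB superpage PTE can be installed at level 2, while the
 * level-2 and higher tables are left in place.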
1174 */ 1175 static void dma_pte_free_pagetable(struct dmar_domain *domain, 1176 unsigned long start_pfn, 1177 unsigned long last_pfn, 1178 int retain_level) 1179 { 1180 dma_pte_clear_range(domain, start_pfn, last_pfn); 1181 1182 /* We don't need lock here; nobody else touches the iova range */ 1183 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level, 1184 domain->pgd, 0, start_pfn, last_pfn); 1185 1186 /* free pgd */ 1187 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) { 1188 free_pgtable_page(domain->pgd); 1189 domain->pgd = NULL; 1190 } 1191 } 1192 1193 /* When a page at a given level is being unlinked from its parent, we don't 1194 need to *modify* it at all. All we need to do is make a list of all the 1195 pages which can be freed just as soon as we've flushed the IOTLB and we 1196 know the hardware page-walk will no longer touch them. 1197 The 'pte' argument is the *parent* PTE, pointing to the page that is to 1198 be freed. */ 1199 static void dma_pte_list_pagetables(struct dmar_domain *domain, 1200 int level, struct dma_pte *pte, 1201 struct list_head *freelist) 1202 { 1203 struct page *pg; 1204 1205 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT); 1206 list_add_tail(&pg->lru, freelist); 1207 1208 if (level == 1) 1209 return; 1210 1211 pte = page_address(pg); 1212 do { 1213 if (dma_pte_present(pte) && !dma_pte_superpage(pte)) 1214 dma_pte_list_pagetables(domain, level - 1, pte, freelist); 1215 pte++; 1216 } while (!first_pte_in_page(pte)); 1217 } 1218 1219 static void dma_pte_clear_level(struct dmar_domain *domain, int level, 1220 struct dma_pte *pte, unsigned long pfn, 1221 unsigned long start_pfn, unsigned long last_pfn, 1222 struct list_head *freelist) 1223 { 1224 struct dma_pte *first_pte = NULL, *last_pte = NULL; 1225 1226 pfn = max(start_pfn, pfn); 1227 pte = &pte[pfn_level_offset(pfn, level)]; 1228 1229 do { 1230 unsigned long level_pfn = pfn & level_mask(level); 1231 1232 if (!dma_pte_present(pte)) 1233 goto next; 1234 1235 /* If range covers entire pagetable, free it */ 1236 if (start_pfn <= level_pfn && 1237 last_pfn >= level_pfn + level_size(level) - 1) { 1238 /* These subordinate page tables are going away entirely. Don't 1239 bother to clear them; we're just going to *free* them. */ 1240 if (level > 1 && !dma_pte_superpage(pte)) 1241 dma_pte_list_pagetables(domain, level - 1, pte, freelist); 1242 1243 dma_clear_pte(pte); 1244 if (!first_pte) 1245 first_pte = pte; 1246 last_pte = pte; 1247 } else if (level > 1) { 1248 /* Recurse down into a level that isn't *entirely* obsolete */ 1249 dma_pte_clear_level(domain, level - 1, 1250 phys_to_virt(dma_pte_addr(pte)), 1251 level_pfn, start_pfn, last_pfn, 1252 freelist); 1253 } 1254 next: 1255 pfn = level_pfn + level_size(level); 1256 } while (!first_pte_in_page(++pte) && pfn <= last_pfn); 1257 1258 if (first_pte) 1259 domain_flush_cache(domain, first_pte, 1260 (void *)++last_pte - (void *)first_pte); 1261 } 1262 1263 /* We can't just free the pages because the IOMMU may still be walking 1264 the page tables, and may have cached the intermediate levels. The 1265 pages can only be freed after the IOTLB flush has been done.
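   Callers therefore collect the pages on @freelist and hand them back with
   put_pages_list() only once it is safe to do so (domain_exit() further down
   is one example).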
*/ 1266 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn, 1267 unsigned long last_pfn, struct list_head *freelist) 1268 { 1269 BUG_ON(!domain_pfn_supported(domain, start_pfn)); 1270 BUG_ON(!domain_pfn_supported(domain, last_pfn)); 1271 BUG_ON(start_pfn > last_pfn); 1272 1273 /* we don't need lock here; nobody else touches the iova range */ 1274 dma_pte_clear_level(domain, agaw_to_level(domain->agaw), 1275 domain->pgd, 0, start_pfn, last_pfn, freelist); 1276 1277 /* free pgd */ 1278 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) { 1279 struct page *pgd_page = virt_to_page(domain->pgd); 1280 list_add_tail(&pgd_page->lru, freelist); 1281 domain->pgd = NULL; 1282 } 1283 } 1284 1285 /* iommu handling */ 1286 static int iommu_alloc_root_entry(struct intel_iommu *iommu) 1287 { 1288 struct root_entry *root; 1289 unsigned long flags; 1290 1291 root = (struct root_entry *)alloc_pgtable_page(iommu->node); 1292 if (!root) { 1293 pr_err("Allocating root entry for %s failed\n", 1294 iommu->name); 1295 return -ENOMEM; 1296 } 1297 1298 __iommu_flush_cache(iommu, root, ROOT_SIZE); 1299 1300 spin_lock_irqsave(&iommu->lock, flags); 1301 iommu->root_entry = root; 1302 spin_unlock_irqrestore(&iommu->lock, flags); 1303 1304 return 0; 1305 } 1306 1307 static void iommu_set_root_entry(struct intel_iommu *iommu) 1308 { 1309 u64 addr; 1310 u32 sts; 1311 unsigned long flag; 1312 1313 addr = virt_to_phys(iommu->root_entry); 1314 if (sm_supported(iommu)) 1315 addr |= DMA_RTADDR_SMT; 1316 1317 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1318 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr); 1319 1320 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG); 1321 1322 /* Make sure hardware complete it */ 1323 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1324 readl, (sts & DMA_GSTS_RTPS), sts); 1325 1326 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1327 1328 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL); 1329 if (sm_supported(iommu)) 1330 qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0); 1331 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); 1332 } 1333 1334 void iommu_flush_write_buffer(struct intel_iommu *iommu) 1335 { 1336 u32 val; 1337 unsigned long flag; 1338 1339 if (!rwbf_quirk && !cap_rwbf(iommu->cap)) 1340 return; 1341 1342 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1343 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG); 1344 1345 /* Make sure hardware complete it */ 1346 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1347 readl, (!(val & DMA_GSTS_WBFS)), val); 1348 1349 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1350 } 1351 1352 /* return value determine if we need a write buffer flush */ 1353 static void __iommu_flush_context(struct intel_iommu *iommu, 1354 u16 did, u16 source_id, u8 function_mask, 1355 u64 type) 1356 { 1357 u64 val = 0; 1358 unsigned long flag; 1359 1360 switch (type) { 1361 case DMA_CCMD_GLOBAL_INVL: 1362 val = DMA_CCMD_GLOBAL_INVL; 1363 break; 1364 case DMA_CCMD_DOMAIN_INVL: 1365 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did); 1366 break; 1367 case DMA_CCMD_DEVICE_INVL: 1368 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did) 1369 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask); 1370 break; 1371 default: 1372 BUG(); 1373 } 1374 val |= DMA_CCMD_ICC; 1375 1376 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1377 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val); 1378 1379 /* Make sure hardware complete it */ 1380 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG, 1381 dmar_readq, (!(val & 
DMA_CCMD_ICC)), val); 1382 1383 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1384 } 1385 1386 /* return value determine if we need a write buffer flush */ 1387 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, 1388 u64 addr, unsigned int size_order, u64 type) 1389 { 1390 int tlb_offset = ecap_iotlb_offset(iommu->ecap); 1391 u64 val = 0, val_iva = 0; 1392 unsigned long flag; 1393 1394 switch (type) { 1395 case DMA_TLB_GLOBAL_FLUSH: 1396 /* global flush doesn't need set IVA_REG */ 1397 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT; 1398 break; 1399 case DMA_TLB_DSI_FLUSH: 1400 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did); 1401 break; 1402 case DMA_TLB_PSI_FLUSH: 1403 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did); 1404 /* IH bit is passed in as part of address */ 1405 val_iva = size_order | addr; 1406 break; 1407 default: 1408 BUG(); 1409 } 1410 /* Note: set drain read/write */ 1411 #if 0 1412 /* 1413 * This is probably to be super secure.. Looks like we can 1414 * ignore it without any impact. 1415 */ 1416 if (cap_read_drain(iommu->cap)) 1417 val |= DMA_TLB_READ_DRAIN; 1418 #endif 1419 if (cap_write_drain(iommu->cap)) 1420 val |= DMA_TLB_WRITE_DRAIN; 1421 1422 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1423 /* Note: Only uses first TLB reg currently */ 1424 if (val_iva) 1425 dmar_writeq(iommu->reg + tlb_offset, val_iva); 1426 dmar_writeq(iommu->reg + tlb_offset + 8, val); 1427 1428 /* Make sure hardware complete it */ 1429 IOMMU_WAIT_OP(iommu, tlb_offset + 8, 1430 dmar_readq, (!(val & DMA_TLB_IVT)), val); 1431 1432 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1433 1434 /* check IOTLB invalidation granularity */ 1435 if (DMA_TLB_IAIG(val) == 0) 1436 pr_err("Flush IOTLB failed\n"); 1437 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type)) 1438 pr_debug("TLB flush request %Lx, actual %Lx\n", 1439 (unsigned long long)DMA_TLB_IIRG(type), 1440 (unsigned long long)DMA_TLB_IAIG(val)); 1441 } 1442 1443 static struct device_domain_info * 1444 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu, 1445 u8 bus, u8 devfn) 1446 { 1447 struct device_domain_info *info; 1448 1449 assert_spin_locked(&device_domain_lock); 1450 1451 if (!iommu->qi) 1452 return NULL; 1453 1454 list_for_each_entry(info, &domain->devices, link) 1455 if (info->iommu == iommu && info->bus == bus && 1456 info->devfn == devfn) { 1457 if (info->ats_supported && info->dev) 1458 return info; 1459 break; 1460 } 1461 1462 return NULL; 1463 } 1464 1465 static void domain_update_iotlb(struct dmar_domain *domain) 1466 { 1467 struct device_domain_info *info; 1468 bool has_iotlb_device = false; 1469 1470 assert_spin_locked(&device_domain_lock); 1471 1472 list_for_each_entry(info, &domain->devices, link) 1473 if (info->ats_enabled) { 1474 has_iotlb_device = true; 1475 break; 1476 } 1477 1478 domain->has_iotlb_device = has_iotlb_device; 1479 } 1480 1481 static void iommu_enable_dev_iotlb(struct device_domain_info *info) 1482 { 1483 struct pci_dev *pdev; 1484 1485 assert_spin_locked(&device_domain_lock); 1486 1487 if (!info || !dev_is_pci(info->dev)) 1488 return; 1489 1490 pdev = to_pci_dev(info->dev); 1491 /* For IOMMU that supports device IOTLB throttling (DIT), we assign 1492 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge 1493 * queue depth at PF level. If DIT is not set, PFSID will be treated as 1494 * reserved, which should be set to 0. 
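 *
 * Illustrative example (device addresses assumed): for a VF at 0000:3b:00.2
 * whose PF is 0000:3b:00.0, pci_physfn() returns the PF and pci_dev_id()
 * packs its RID, so info->pfsid == (0x3b << 8) | 0x00 == 0x3b00; for a
 * non-VF device pci_physfn() returns the device itself and pfsid is simply
 * its own RID.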
1495 */ 1496 if (!ecap_dit(info->iommu->ecap)) 1497 info->pfsid = 0; 1498 else { 1499 struct pci_dev *pf_pdev; 1500 1501 /* pdev will be returned if device is not a vf */ 1502 pf_pdev = pci_physfn(pdev); 1503 info->pfsid = pci_dev_id(pf_pdev); 1504 } 1505 1506 #ifdef CONFIG_INTEL_IOMMU_SVM 1507 /* The PCIe spec, in its wisdom, declares that the behaviour of 1508 the device if you enable PASID support after ATS support is 1509 undefined. So always enable PASID support on devices which 1510 have it, even if we can't yet know if we're ever going to 1511 use it. */ 1512 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1)) 1513 info->pasid_enabled = 1; 1514 1515 if (info->pri_supported && 1516 (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1) && 1517 !pci_reset_pri(pdev) && !pci_enable_pri(pdev, PRQ_DEPTH)) 1518 info->pri_enabled = 1; 1519 #endif 1520 if (info->ats_supported && pci_ats_page_aligned(pdev) && 1521 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) { 1522 info->ats_enabled = 1; 1523 domain_update_iotlb(info->domain); 1524 info->ats_qdep = pci_ats_queue_depth(pdev); 1525 } 1526 } 1527 1528 static void iommu_disable_dev_iotlb(struct device_domain_info *info) 1529 { 1530 struct pci_dev *pdev; 1531 1532 assert_spin_locked(&device_domain_lock); 1533 1534 if (!dev_is_pci(info->dev)) 1535 return; 1536 1537 pdev = to_pci_dev(info->dev); 1538 1539 if (info->ats_enabled) { 1540 pci_disable_ats(pdev); 1541 info->ats_enabled = 0; 1542 domain_update_iotlb(info->domain); 1543 } 1544 #ifdef CONFIG_INTEL_IOMMU_SVM 1545 if (info->pri_enabled) { 1546 pci_disable_pri(pdev); 1547 info->pri_enabled = 0; 1548 } 1549 if (info->pasid_enabled) { 1550 pci_disable_pasid(pdev); 1551 info->pasid_enabled = 0; 1552 } 1553 #endif 1554 } 1555 1556 static void __iommu_flush_dev_iotlb(struct device_domain_info *info, 1557 u64 addr, unsigned int mask) 1558 { 1559 u16 sid, qdep; 1560 1561 if (!info || !info->ats_enabled) 1562 return; 1563 1564 sid = info->bus << 8 | info->devfn; 1565 qdep = info->ats_qdep; 1566 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid, 1567 qdep, addr, mask); 1568 } 1569 1570 static void iommu_flush_dev_iotlb(struct dmar_domain *domain, 1571 u64 addr, unsigned mask) 1572 { 1573 unsigned long flags; 1574 struct device_domain_info *info; 1575 1576 if (!domain->has_iotlb_device) 1577 return; 1578 1579 spin_lock_irqsave(&device_domain_lock, flags); 1580 list_for_each_entry(info, &domain->devices, link) 1581 __iommu_flush_dev_iotlb(info, addr, mask); 1582 1583 spin_unlock_irqrestore(&device_domain_lock, flags); 1584 } 1585 1586 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, 1587 struct dmar_domain *domain, 1588 unsigned long pfn, unsigned int pages, 1589 int ih, int map) 1590 { 1591 unsigned int aligned_pages = __roundup_pow_of_two(pages); 1592 unsigned int mask = ilog2(aligned_pages); 1593 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT; 1594 u16 did = domain->iommu_did[iommu->seq_id]; 1595 1596 BUG_ON(pages == 0); 1597 1598 if (ih) 1599 ih = 1 << 6; 1600 1601 if (domain_use_first_level(domain)) { 1602 qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, pages, ih); 1603 } else { 1604 unsigned long bitmask = aligned_pages - 1; 1605 1606 /* 1607 * PSI masks the low order bits of the base address. If the 1608 * address isn't aligned to the mask, then compute a mask value 1609 * needed to ensure the target range is flushed. 
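 *
 * Worked example (values assumed): pfn == 0x3 and pages == 2 give
 * aligned_pages == 2 and bitmask == 0x1, but 0x3 is not 2-aligned; then
 * end_pfn == 0x4, pfn ^ end_pfn == 0x7, shared_bits == ~0x7 & ~0x1, and
 * mask == __ffs(shared_bits) == 3, so the flush covers pfns 0x0-0x7, which
 * safely includes 0x3-0x4.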
1610 */ 1611 if (unlikely(bitmask & pfn)) { 1612 unsigned long end_pfn = pfn + pages - 1, shared_bits; 1613 1614 /* 1615 * Since end_pfn <= pfn + bitmask, the only way bits 1616 * higher than bitmask can differ in pfn and end_pfn is 1617 * by carrying. This means after masking out bitmask, 1618 * high bits starting with the first set bit in 1619 * shared_bits are all equal in both pfn and end_pfn. 1620 */ 1621 shared_bits = ~(pfn ^ end_pfn) & ~bitmask; 1622 mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG; 1623 } 1624 1625 /* 1626 * Fallback to domain selective flush if no PSI support or 1627 * the size is too big. 1628 */ 1629 if (!cap_pgsel_inv(iommu->cap) || 1630 mask > cap_max_amask_val(iommu->cap)) 1631 iommu->flush.flush_iotlb(iommu, did, 0, 0, 1632 DMA_TLB_DSI_FLUSH); 1633 else 1634 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask, 1635 DMA_TLB_PSI_FLUSH); 1636 } 1637 1638 /* 1639 * In caching mode, changes of pages from non-present to present require 1640 * flush. However, device IOTLB doesn't need to be flushed in this case. 1641 */ 1642 if (!cap_caching_mode(iommu->cap) || !map) 1643 iommu_flush_dev_iotlb(domain, addr, mask); 1644 } 1645 1646 /* Notification for newly created mappings */ 1647 static inline void __mapping_notify_one(struct intel_iommu *iommu, 1648 struct dmar_domain *domain, 1649 unsigned long pfn, unsigned int pages) 1650 { 1651 /* 1652 * It's a non-present to present mapping. Only flush if caching mode 1653 * and second level. 1654 */ 1655 if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain)) 1656 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1); 1657 else 1658 iommu_flush_write_buffer(iommu); 1659 } 1660 1661 static void intel_flush_iotlb_all(struct iommu_domain *domain) 1662 { 1663 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 1664 int idx; 1665 1666 for_each_domain_iommu(idx, dmar_domain) { 1667 struct intel_iommu *iommu = g_iommus[idx]; 1668 u16 did = dmar_domain->iommu_did[iommu->seq_id]; 1669 1670 if (domain_use_first_level(dmar_domain)) 1671 qi_flush_piotlb(iommu, did, PASID_RID2PASID, 0, -1, 0); 1672 else 1673 iommu->flush.flush_iotlb(iommu, did, 0, 0, 1674 DMA_TLB_DSI_FLUSH); 1675 1676 if (!cap_caching_mode(iommu->cap)) 1677 iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH); 1678 } 1679 } 1680 1681 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu) 1682 { 1683 u32 pmen; 1684 unsigned long flags; 1685 1686 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap)) 1687 return; 1688 1689 raw_spin_lock_irqsave(&iommu->register_lock, flags); 1690 pmen = readl(iommu->reg + DMAR_PMEN_REG); 1691 pmen &= ~DMA_PMEN_EPM; 1692 writel(pmen, iommu->reg + DMAR_PMEN_REG); 1693 1694 /* wait for the protected region status bit to clear */ 1695 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG, 1696 readl, !(pmen & DMA_PMEN_PRS), pmen); 1697 1698 raw_spin_unlock_irqrestore(&iommu->register_lock, flags); 1699 } 1700 1701 static void iommu_enable_translation(struct intel_iommu *iommu) 1702 { 1703 u32 sts; 1704 unsigned long flags; 1705 1706 raw_spin_lock_irqsave(&iommu->register_lock, flags); 1707 iommu->gcmd |= DMA_GCMD_TE; 1708 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); 1709 1710 /* Make sure hardware complete it */ 1711 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1712 readl, (sts & DMA_GSTS_TES), sts); 1713 1714 raw_spin_unlock_irqrestore(&iommu->register_lock, flags); 1715 } 1716 1717 static void iommu_disable_translation(struct intel_iommu *iommu) 1718 { 1719 u32 sts; 1720 unsigned long flag; 1721 1722 if 
(iommu_skip_te_disable && iommu->drhd->gfx_dedicated && 1723 (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap))) 1724 return; 1725 1726 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1727 iommu->gcmd &= ~DMA_GCMD_TE; 1728 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); 1729 1730 /* Make sure hardware complete it */ 1731 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1732 readl, (!(sts & DMA_GSTS_TES)), sts); 1733 1734 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1735 } 1736 1737 static int iommu_init_domains(struct intel_iommu *iommu) 1738 { 1739 u32 ndomains; 1740 1741 ndomains = cap_ndoms(iommu->cap); 1742 pr_debug("%s: Number of Domains supported <%d>\n", 1743 iommu->name, ndomains); 1744 1745 spin_lock_init(&iommu->lock); 1746 1747 iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL); 1748 if (!iommu->domain_ids) 1749 return -ENOMEM; 1750 1751 /* 1752 * If Caching mode is set, then invalid translations are tagged 1753 * with domain-id 0, hence we need to pre-allocate it. We also 1754 * use domain-id 0 as a marker for non-allocated domain-id, so 1755 * make sure it is not used for a real domain. 1756 */ 1757 set_bit(0, iommu->domain_ids); 1758 1759 /* 1760 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid 1761 * entry for first-level or pass-through translation modes should 1762 * be programmed with a domain id different from those used for 1763 * second-level or nested translation. We reserve a domain id for 1764 * this purpose. 1765 */ 1766 if (sm_supported(iommu)) 1767 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids); 1768 1769 return 0; 1770 } 1771 1772 static void disable_dmar_iommu(struct intel_iommu *iommu) 1773 { 1774 struct device_domain_info *info, *tmp; 1775 unsigned long flags; 1776 1777 if (!iommu->domain_ids) 1778 return; 1779 1780 spin_lock_irqsave(&device_domain_lock, flags); 1781 list_for_each_entry_safe(info, tmp, &device_domain_list, global) { 1782 if (info->iommu != iommu) 1783 continue; 1784 1785 if (!info->dev || !info->domain) 1786 continue; 1787 1788 __dmar_remove_one_dev_info(info); 1789 } 1790 spin_unlock_irqrestore(&device_domain_lock, flags); 1791 1792 if (iommu->gcmd & DMA_GCMD_TE) 1793 iommu_disable_translation(iommu); 1794 } 1795 1796 static void free_dmar_iommu(struct intel_iommu *iommu) 1797 { 1798 if (iommu->domain_ids) { 1799 bitmap_free(iommu->domain_ids); 1800 iommu->domain_ids = NULL; 1801 } 1802 1803 g_iommus[iommu->seq_id] = NULL; 1804 1805 /* free context mapping */ 1806 free_context_table(iommu); 1807 1808 #ifdef CONFIG_INTEL_IOMMU_SVM 1809 if (pasid_supported(iommu)) { 1810 if (ecap_prs(iommu->ecap)) 1811 intel_svm_finish_prq(iommu); 1812 } 1813 if (vccap_pasid(iommu->vccap)) 1814 ioasid_unregister_allocator(&iommu->pasid_allocator); 1815 1816 #endif 1817 } 1818 1819 /* 1820 * Check and return whether first level is used by default for 1821 * DMA translation. 
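 *
 * In short (summarizing the checks below): legacy (non-scalable) mode can
 * only use second-level; if the hardware supports exactly one of FL/SL,
 * that one is used; if both are available, first-level is preferred for
 * every domain type except IOMMU_DOMAIN_UNMANAGED.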
1822 */ 1823 static bool first_level_by_default(unsigned int type) 1824 { 1825 /* Only SL is available in legacy mode */ 1826 if (!scalable_mode_support()) 1827 return false; 1828 1829 /* Only level (either FL or SL) is available, just use it */ 1830 if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity()) 1831 return intel_cap_flts_sanity(); 1832 1833 /* Both levels are available, decide it based on domain type */ 1834 return type != IOMMU_DOMAIN_UNMANAGED; 1835 } 1836 1837 static struct dmar_domain *alloc_domain(unsigned int type) 1838 { 1839 struct dmar_domain *domain; 1840 1841 domain = kzalloc(sizeof(*domain), GFP_KERNEL); 1842 if (!domain) 1843 return NULL; 1844 1845 domain->nid = NUMA_NO_NODE; 1846 if (first_level_by_default(type)) 1847 domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL; 1848 domain->has_iotlb_device = false; 1849 INIT_LIST_HEAD(&domain->devices); 1850 1851 return domain; 1852 } 1853 1854 /* Must be called with iommu->lock */ 1855 static int domain_attach_iommu(struct dmar_domain *domain, 1856 struct intel_iommu *iommu) 1857 { 1858 unsigned long ndomains; 1859 int num; 1860 1861 assert_spin_locked(&device_domain_lock); 1862 assert_spin_locked(&iommu->lock); 1863 1864 domain->iommu_refcnt[iommu->seq_id] += 1; 1865 if (domain->iommu_refcnt[iommu->seq_id] == 1) { 1866 ndomains = cap_ndoms(iommu->cap); 1867 num = find_first_zero_bit(iommu->domain_ids, ndomains); 1868 1869 if (num >= ndomains) { 1870 pr_err("%s: No free domain ids\n", iommu->name); 1871 domain->iommu_refcnt[iommu->seq_id] -= 1; 1872 return -ENOSPC; 1873 } 1874 1875 set_bit(num, iommu->domain_ids); 1876 domain->iommu_did[iommu->seq_id] = num; 1877 domain->nid = iommu->node; 1878 domain_update_iommu_cap(domain); 1879 } 1880 1881 return 0; 1882 } 1883 1884 static void domain_detach_iommu(struct dmar_domain *domain, 1885 struct intel_iommu *iommu) 1886 { 1887 int num; 1888 1889 assert_spin_locked(&device_domain_lock); 1890 assert_spin_locked(&iommu->lock); 1891 1892 domain->iommu_refcnt[iommu->seq_id] -= 1; 1893 if (domain->iommu_refcnt[iommu->seq_id] == 0) { 1894 num = domain->iommu_did[iommu->seq_id]; 1895 clear_bit(num, iommu->domain_ids); 1896 domain_update_iommu_cap(domain); 1897 domain->iommu_did[iommu->seq_id] = 0; 1898 } 1899 } 1900 1901 static inline int guestwidth_to_adjustwidth(int gaw) 1902 { 1903 int agaw; 1904 int r = (gaw - 12) % 9; 1905 1906 if (r == 0) 1907 agaw = gaw; 1908 else 1909 agaw = gaw + 9 - r; 1910 if (agaw > 64) 1911 agaw = 64; 1912 return agaw; 1913 } 1914 1915 static void domain_exit(struct dmar_domain *domain) 1916 { 1917 1918 /* Remove associated devices and clear attached or cached domains */ 1919 domain_remove_dev_info(domain); 1920 1921 if (domain->pgd) { 1922 LIST_HEAD(freelist); 1923 1924 domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist); 1925 put_pages_list(&freelist); 1926 } 1927 1928 kfree(domain); 1929 } 1930 1931 /* 1932 * Get the PASID directory size for scalable mode context entry. 1933 * Value of X in the PDTS field of a scalable mode context entry 1934 * indicates PASID directory with 2^(X + 7) entries. 1935 */ 1936 static inline unsigned long context_get_sm_pds(struct pasid_table *table) 1937 { 1938 unsigned long pds, max_pde; 1939 1940 max_pde = table->max_pasid >> PASID_PDE_SHIFT; 1941 pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS); 1942 if (pds < 7) 1943 return 0; 1944 1945 return pds - 7; 1946 } 1947 1948 /* 1949 * Set the RID_PASID field of a scalable mode context entry. 
The 1950 * IOMMU hardware will use the PASID value set in this field for 1951 * DMA translations of DMA requests without PASID. 1952 */ 1953 static inline void 1954 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid) 1955 { 1956 context->hi |= pasid & ((1 << 20) - 1); 1957 } 1958 1959 /* 1960 * Set the DTE(Device-TLB Enable) field of a scalable mode context 1961 * entry. 1962 */ 1963 static inline void context_set_sm_dte(struct context_entry *context) 1964 { 1965 context->lo |= (1 << 2); 1966 } 1967 1968 /* 1969 * Set the PRE(Page Request Enable) field of a scalable mode context 1970 * entry. 1971 */ 1972 static inline void context_set_sm_pre(struct context_entry *context) 1973 { 1974 context->lo |= (1 << 4); 1975 } 1976 1977 /* Convert value to context PASID directory size field coding. */ 1978 #define context_pdts(pds) (((pds) & 0x7) << 9) 1979 1980 static int domain_context_mapping_one(struct dmar_domain *domain, 1981 struct intel_iommu *iommu, 1982 struct pasid_table *table, 1983 u8 bus, u8 devfn) 1984 { 1985 u16 did = domain->iommu_did[iommu->seq_id]; 1986 int translation = CONTEXT_TT_MULTI_LEVEL; 1987 struct device_domain_info *info = NULL; 1988 struct context_entry *context; 1989 unsigned long flags; 1990 int ret; 1991 1992 WARN_ON(did == 0); 1993 1994 if (hw_pass_through && domain_type_is_si(domain)) 1995 translation = CONTEXT_TT_PASS_THROUGH; 1996 1997 pr_debug("Set context mapping for %02x:%02x.%d\n", 1998 bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); 1999 2000 BUG_ON(!domain->pgd); 2001 2002 spin_lock_irqsave(&device_domain_lock, flags); 2003 spin_lock(&iommu->lock); 2004 2005 ret = -ENOMEM; 2006 context = iommu_context_addr(iommu, bus, devfn, 1); 2007 if (!context) 2008 goto out_unlock; 2009 2010 ret = 0; 2011 if (context_present(context)) 2012 goto out_unlock; 2013 2014 /* 2015 * For kdump cases, old valid entries may be cached due to the 2016 * in-flight DMA and copied pgtable, but there is no unmapping 2017 * behaviour for them, thus we need an explicit cache flush for 2018 * the newly-mapped device. For kdump, at this point, the device 2019 * is supposed to finish reset at its driver probe stage, so no 2020 * in-flight DMA will exist, and we don't need to worry anymore 2021 * hereafter. 
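 * Concretely, the code below checks context_copied(): if the inherited
 * entry carries a valid old domain-id, the context-cache entry for this
 * device and the IOTLB tagged with that old domain-id are invalidated
 * before the entry is reprogrammed.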
2022 */ 2023 if (context_copied(context)) { 2024 u16 did_old = context_domain_id(context); 2025 2026 if (did_old < cap_ndoms(iommu->cap)) { 2027 iommu->flush.flush_context(iommu, did_old, 2028 (((u16)bus) << 8) | devfn, 2029 DMA_CCMD_MASK_NOBIT, 2030 DMA_CCMD_DEVICE_INVL); 2031 iommu->flush.flush_iotlb(iommu, did_old, 0, 0, 2032 DMA_TLB_DSI_FLUSH); 2033 } 2034 } 2035 2036 context_clear_entry(context); 2037 2038 if (sm_supported(iommu)) { 2039 unsigned long pds; 2040 2041 WARN_ON(!table); 2042 2043 /* Setup the PASID DIR pointer: */ 2044 pds = context_get_sm_pds(table); 2045 context->lo = (u64)virt_to_phys(table->table) | 2046 context_pdts(pds); 2047 2048 /* Setup the RID_PASID field: */ 2049 context_set_sm_rid2pasid(context, PASID_RID2PASID); 2050 2051 /* 2052 * Setup the Device-TLB enable bit and Page request 2053 * Enable bit: 2054 */ 2055 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn); 2056 if (info && info->ats_supported) 2057 context_set_sm_dte(context); 2058 if (info && info->pri_supported) 2059 context_set_sm_pre(context); 2060 } else { 2061 struct dma_pte *pgd = domain->pgd; 2062 int agaw; 2063 2064 context_set_domain_id(context, did); 2065 2066 if (translation != CONTEXT_TT_PASS_THROUGH) { 2067 /* 2068 * Skip top levels of page tables for iommu which has 2069 * less agaw than default. Unnecessary for PT mode. 2070 */ 2071 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { 2072 ret = -ENOMEM; 2073 pgd = phys_to_virt(dma_pte_addr(pgd)); 2074 if (!dma_pte_present(pgd)) 2075 goto out_unlock; 2076 } 2077 2078 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn); 2079 if (info && info->ats_supported) 2080 translation = CONTEXT_TT_DEV_IOTLB; 2081 else 2082 translation = CONTEXT_TT_MULTI_LEVEL; 2083 2084 context_set_address_root(context, virt_to_phys(pgd)); 2085 context_set_address_width(context, agaw); 2086 } else { 2087 /* 2088 * In pass through mode, AW must be programmed to 2089 * indicate the largest AGAW value supported by 2090 * hardware. And ASR is ignored by hardware. 2091 */ 2092 context_set_address_width(context, iommu->msagaw); 2093 } 2094 2095 context_set_translation_type(context, translation); 2096 } 2097 2098 context_set_fault_enable(context); 2099 context_set_present(context); 2100 if (!ecap_coherent(iommu->ecap)) 2101 clflush_cache_range(context, sizeof(*context)); 2102 2103 /* 2104 * It's a non-present to present mapping. If hardware doesn't cache 2105 * non-present entry we only need to flush the write-buffer. 
If it 2106 * _does_ cache non-present entries, then it does so in the special 2107 * domain #0, which we have to flush: 2108 */ 2109 if (cap_caching_mode(iommu->cap)) { 2110 iommu->flush.flush_context(iommu, 0, 2111 (((u16)bus) << 8) | devfn, 2112 DMA_CCMD_MASK_NOBIT, 2113 DMA_CCMD_DEVICE_INVL); 2114 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); 2115 } else { 2116 iommu_flush_write_buffer(iommu); 2117 } 2118 iommu_enable_dev_iotlb(info); 2119 2120 ret = 0; 2121 2122 out_unlock: 2123 spin_unlock(&iommu->lock); 2124 spin_unlock_irqrestore(&device_domain_lock, flags); 2125 2126 return ret; 2127 } 2128 2129 struct domain_context_mapping_data { 2130 struct dmar_domain *domain; 2131 struct intel_iommu *iommu; 2132 struct pasid_table *table; 2133 }; 2134 2135 static int domain_context_mapping_cb(struct pci_dev *pdev, 2136 u16 alias, void *opaque) 2137 { 2138 struct domain_context_mapping_data *data = opaque; 2139 2140 return domain_context_mapping_one(data->domain, data->iommu, 2141 data->table, PCI_BUS_NUM(alias), 2142 alias & 0xff); 2143 } 2144 2145 static int 2146 domain_context_mapping(struct dmar_domain *domain, struct device *dev) 2147 { 2148 struct domain_context_mapping_data data; 2149 struct pasid_table *table; 2150 struct intel_iommu *iommu; 2151 u8 bus, devfn; 2152 2153 iommu = device_to_iommu(dev, &bus, &devfn); 2154 if (!iommu) 2155 return -ENODEV; 2156 2157 table = intel_pasid_get_table(dev); 2158 2159 if (!dev_is_pci(dev)) 2160 return domain_context_mapping_one(domain, iommu, table, 2161 bus, devfn); 2162 2163 data.domain = domain; 2164 data.iommu = iommu; 2165 data.table = table; 2166 2167 return pci_for_each_dma_alias(to_pci_dev(dev), 2168 &domain_context_mapping_cb, &data); 2169 } 2170 2171 static int domain_context_mapped_cb(struct pci_dev *pdev, 2172 u16 alias, void *opaque) 2173 { 2174 struct intel_iommu *iommu = opaque; 2175 2176 return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff); 2177 } 2178 2179 static int domain_context_mapped(struct device *dev) 2180 { 2181 struct intel_iommu *iommu; 2182 u8 bus, devfn; 2183 2184 iommu = device_to_iommu(dev, &bus, &devfn); 2185 if (!iommu) 2186 return -ENODEV; 2187 2188 if (!dev_is_pci(dev)) 2189 return device_context_mapped(iommu, bus, devfn); 2190 2191 return !pci_for_each_dma_alias(to_pci_dev(dev), 2192 domain_context_mapped_cb, iommu); 2193 } 2194 2195 /* Returns the number of VTD pages, but aligned to MM page size */ 2196 static inline unsigned long aligned_nrpages(unsigned long host_addr, 2197 size_t size) 2198 { 2199 host_addr &= ~PAGE_MASK; 2200 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT; 2201 } 2202 2203 /* Return largest possible superpage level for a given mapping */ 2204 static inline int hardware_largepage_caps(struct dmar_domain *domain, 2205 unsigned long iov_pfn, 2206 unsigned long phy_pfn, 2207 unsigned long pages) 2208 { 2209 int support, level = 1; 2210 unsigned long pfnmerge; 2211 2212 support = domain->iommu_superpage; 2213 2214 /* To use a large page, the virtual *and* physical addresses 2215 must be aligned to 2MiB/1GiB/etc. Lower bits set in either 2216 of them will mean we have to use smaller pages. So just 2217 merge them and check both at once.
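For example, with 4KiB VT-d pages and the 9-bit stride, level 2 covers 2MiB and level 3 covers 1GiB: a 2MiB superpage is only considered when the merged PFN has its low nine bits clear and at least 512 pages remain to be mapped.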
*/ 2218 pfnmerge = iov_pfn | phy_pfn; 2219 2220 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) { 2221 pages >>= VTD_STRIDE_SHIFT; 2222 if (!pages) 2223 break; 2224 pfnmerge >>= VTD_STRIDE_SHIFT; 2225 level++; 2226 support--; 2227 } 2228 return level; 2229 } 2230 2231 /* 2232 * Ensure that old small page tables are removed to make room for superpage(s). 2233 * We're going to add new large pages, so make sure we don't remove their parent 2234 * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared. 2235 */ 2236 static void switch_to_super_page(struct dmar_domain *domain, 2237 unsigned long start_pfn, 2238 unsigned long end_pfn, int level) 2239 { 2240 unsigned long lvl_pages = lvl_to_nr_pages(level); 2241 struct dma_pte *pte = NULL; 2242 int i; 2243 2244 while (start_pfn <= end_pfn) { 2245 if (!pte) 2246 pte = pfn_to_dma_pte(domain, start_pfn, &level); 2247 2248 if (dma_pte_present(pte)) { 2249 dma_pte_free_pagetable(domain, start_pfn, 2250 start_pfn + lvl_pages - 1, 2251 level + 1); 2252 2253 for_each_domain_iommu(i, domain) 2254 iommu_flush_iotlb_psi(g_iommus[i], domain, 2255 start_pfn, lvl_pages, 2256 0, 0); 2257 } 2258 2259 pte++; 2260 start_pfn += lvl_pages; 2261 if (first_pte_in_page(pte)) 2262 pte = NULL; 2263 } 2264 } 2265 2266 static int 2267 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, 2268 unsigned long phys_pfn, unsigned long nr_pages, int prot) 2269 { 2270 struct dma_pte *first_pte = NULL, *pte = NULL; 2271 unsigned int largepage_lvl = 0; 2272 unsigned long lvl_pages = 0; 2273 phys_addr_t pteval; 2274 u64 attr; 2275 2276 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)); 2277 2278 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0) 2279 return -EINVAL; 2280 2281 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP); 2282 attr |= DMA_FL_PTE_PRESENT; 2283 if (domain_use_first_level(domain)) { 2284 attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS; 2285 if (prot & DMA_PTE_WRITE) 2286 attr |= DMA_FL_PTE_DIRTY; 2287 } 2288 2289 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr; 2290 2291 while (nr_pages > 0) { 2292 uint64_t tmp; 2293 2294 if (!pte) { 2295 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, 2296 phys_pfn, nr_pages); 2297 2298 pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl); 2299 if (!pte) 2300 return -ENOMEM; 2301 first_pte = pte; 2302 2303 lvl_pages = lvl_to_nr_pages(largepage_lvl); 2304 2305 /* It is large page*/ 2306 if (largepage_lvl > 1) { 2307 unsigned long end_pfn; 2308 unsigned long pages_to_remove; 2309 2310 pteval |= DMA_PTE_LARGE_PAGE; 2311 pages_to_remove = min_t(unsigned long, nr_pages, 2312 nr_pte_to_next_page(pte) * lvl_pages); 2313 end_pfn = iov_pfn + pages_to_remove - 1; 2314 switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl); 2315 } else { 2316 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE; 2317 } 2318 2319 } 2320 /* We don't need lock here, nobody else 2321 * touches the iova range 2322 */ 2323 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval); 2324 if (tmp) { 2325 static int dumps = 5; 2326 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n", 2327 iov_pfn, tmp, (unsigned long long)pteval); 2328 if (dumps) { 2329 dumps--; 2330 debug_dma_dump_mappings(NULL); 2331 } 2332 WARN_ON(1); 2333 } 2334 2335 nr_pages -= lvl_pages; 2336 iov_pfn += lvl_pages; 2337 phys_pfn += lvl_pages; 2338 pteval += lvl_pages * VTD_PAGE_SIZE; 2339 2340 /* If the next PTE would be the first in a new page, then we 2341 * need to flush the cache on the entries we've just 
written. 2342 * And then we'll need to recalculate 'pte', so clear it and 2343 * let it get set again in the if (!pte) block above. 2344 * 2345 * If we're done (!nr_pages) we need to flush the cache too. 2346 * 2347 * Also if we've been setting superpages, we may need to 2348 * recalculate 'pte' and switch back to smaller pages for the 2349 * end of the mapping, if the trailing size is not enough to 2350 * use another superpage (i.e. nr_pages < lvl_pages). 2351 */ 2352 pte++; 2353 if (!nr_pages || first_pte_in_page(pte) || 2354 (largepage_lvl > 1 && nr_pages < lvl_pages)) { 2355 domain_flush_cache(domain, first_pte, 2356 (void *)pte - (void *)first_pte); 2357 pte = NULL; 2358 } 2359 } 2360 2361 return 0; 2362 } 2363 2364 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn) 2365 { 2366 struct intel_iommu *iommu = info->iommu; 2367 struct context_entry *context; 2368 unsigned long flags; 2369 u16 did_old; 2370 2371 if (!iommu) 2372 return; 2373 2374 spin_lock_irqsave(&iommu->lock, flags); 2375 context = iommu_context_addr(iommu, bus, devfn, 0); 2376 if (!context) { 2377 spin_unlock_irqrestore(&iommu->lock, flags); 2378 return; 2379 } 2380 2381 if (sm_supported(iommu)) { 2382 if (hw_pass_through && domain_type_is_si(info->domain)) 2383 did_old = FLPT_DEFAULT_DID; 2384 else 2385 did_old = info->domain->iommu_did[iommu->seq_id]; 2386 } else { 2387 did_old = context_domain_id(context); 2388 } 2389 2390 context_clear_entry(context); 2391 __iommu_flush_cache(iommu, context, sizeof(*context)); 2392 spin_unlock_irqrestore(&iommu->lock, flags); 2393 iommu->flush.flush_context(iommu, 2394 did_old, 2395 (((u16)bus) << 8) | devfn, 2396 DMA_CCMD_MASK_NOBIT, 2397 DMA_CCMD_DEVICE_INVL); 2398 2399 if (sm_supported(iommu)) 2400 qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0); 2401 2402 iommu->flush.flush_iotlb(iommu, 2403 did_old, 2404 0, 2405 0, 2406 DMA_TLB_DSI_FLUSH); 2407 2408 __iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH); 2409 } 2410 2411 static void domain_remove_dev_info(struct dmar_domain *domain) 2412 { 2413 struct device_domain_info *info, *tmp; 2414 unsigned long flags; 2415 2416 spin_lock_irqsave(&device_domain_lock, flags); 2417 list_for_each_entry_safe(info, tmp, &domain->devices, link) 2418 __dmar_remove_one_dev_info(info); 2419 spin_unlock_irqrestore(&device_domain_lock, flags); 2420 } 2421 2422 static inline struct device_domain_info * 2423 dmar_search_domain_by_dev_info(int segment, int bus, int devfn) 2424 { 2425 struct device_domain_info *info; 2426 2427 list_for_each_entry(info, &device_domain_list, global) 2428 if (info->segment == segment && info->bus == bus && 2429 info->devfn == devfn) 2430 return info; 2431 2432 return NULL; 2433 } 2434 2435 static int domain_setup_first_level(struct intel_iommu *iommu, 2436 struct dmar_domain *domain, 2437 struct device *dev, 2438 u32 pasid) 2439 { 2440 struct dma_pte *pgd = domain->pgd; 2441 int agaw, level; 2442 int flags = 0; 2443 2444 /* 2445 * Skip top levels of page tables for iommu which has 2446 * less agaw than default. Unnecessary for PT mode. 
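* Each iteration of the loop below steps down one page-table level (one 9-bit stride), so page tables built for a wider address width can still be used with an IOMMU that supports a smaller AGAW, provided the intermediate directory entries are present.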
2447 */ 2448 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { 2449 pgd = phys_to_virt(dma_pte_addr(pgd)); 2450 if (!dma_pte_present(pgd)) 2451 return -ENOMEM; 2452 } 2453 2454 level = agaw_to_level(agaw); 2455 if (level != 4 && level != 5) 2456 return -EINVAL; 2457 2458 if (pasid != PASID_RID2PASID) 2459 flags |= PASID_FLAG_SUPERVISOR_MODE; 2460 if (level == 5) 2461 flags |= PASID_FLAG_FL5LP; 2462 2463 if (domain->domain.type == IOMMU_DOMAIN_UNMANAGED) 2464 flags |= PASID_FLAG_PAGE_SNOOP; 2465 2466 return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid, 2467 domain->iommu_did[iommu->seq_id], 2468 flags); 2469 } 2470 2471 static bool dev_is_real_dma_subdevice(struct device *dev) 2472 { 2473 return dev && dev_is_pci(dev) && 2474 pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev); 2475 } 2476 2477 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu, 2478 int bus, int devfn, 2479 struct device *dev, 2480 struct dmar_domain *domain) 2481 { 2482 struct device_domain_info *info = dev_iommu_priv_get(dev); 2483 unsigned long flags; 2484 int ret; 2485 2486 spin_lock_irqsave(&device_domain_lock, flags); 2487 info->domain = domain; 2488 spin_lock(&iommu->lock); 2489 ret = domain_attach_iommu(domain, iommu); 2490 spin_unlock(&iommu->lock); 2491 if (ret) { 2492 spin_unlock_irqrestore(&device_domain_lock, flags); 2493 return NULL; 2494 } 2495 list_add(&info->link, &domain->devices); 2496 spin_unlock_irqrestore(&device_domain_lock, flags); 2497 2498 /* PASID table is mandatory for a PCI device in scalable mode. */ 2499 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) { 2500 ret = intel_pasid_alloc_table(dev); 2501 if (ret) { 2502 dev_err(dev, "PASID table allocation failed\n"); 2503 dmar_remove_one_dev_info(dev); 2504 return NULL; 2505 } 2506 2507 /* Setup the PASID entry for requests without PASID: */ 2508 spin_lock_irqsave(&iommu->lock, flags); 2509 if (hw_pass_through && domain_type_is_si(domain)) 2510 ret = intel_pasid_setup_pass_through(iommu, domain, 2511 dev, PASID_RID2PASID); 2512 else if (domain_use_first_level(domain)) 2513 ret = domain_setup_first_level(iommu, domain, dev, 2514 PASID_RID2PASID); 2515 else 2516 ret = intel_pasid_setup_second_level(iommu, domain, 2517 dev, PASID_RID2PASID); 2518 spin_unlock_irqrestore(&iommu->lock, flags); 2519 if (ret) { 2520 dev_err(dev, "Setup RID2PASID failed\n"); 2521 dmar_remove_one_dev_info(dev); 2522 return NULL; 2523 } 2524 } 2525 2526 if (dev && domain_context_mapping(domain, dev)) { 2527 dev_err(dev, "Domain context map failed\n"); 2528 dmar_remove_one_dev_info(dev); 2529 return NULL; 2530 } 2531 2532 return domain; 2533 } 2534 2535 static int iommu_domain_identity_map(struct dmar_domain *domain, 2536 unsigned long first_vpfn, 2537 unsigned long last_vpfn) 2538 { 2539 /* 2540 * RMRR range might have overlap with physical memory range, 2541 * clear it first 2542 */ 2543 dma_pte_clear_range(domain, first_vpfn, last_vpfn); 2544 2545 return __domain_mapping(domain, first_vpfn, 2546 first_vpfn, last_vpfn - first_vpfn + 1, 2547 DMA_PTE_READ|DMA_PTE_WRITE); 2548 } 2549 2550 static int md_domain_init(struct dmar_domain *domain, int guest_width); 2551 2552 static int __init si_domain_init(int hw) 2553 { 2554 struct dmar_rmrr_unit *rmrr; 2555 struct device *dev; 2556 int i, nid, ret; 2557 2558 si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY); 2559 if (!si_domain) 2560 return -EFAULT; 2561 2562 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { 2563 domain_exit(si_domain); 2564 return -EFAULT; 
2565 } 2566 2567 if (hw) 2568 return 0; 2569 2570 for_each_online_node(nid) { 2571 unsigned long start_pfn, end_pfn; 2572 int i; 2573 2574 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 2575 ret = iommu_domain_identity_map(si_domain, 2576 mm_to_dma_pfn(start_pfn), 2577 mm_to_dma_pfn(end_pfn)); 2578 if (ret) 2579 return ret; 2580 } 2581 } 2582 2583 /* 2584 * Identity map the RMRRs so that devices with RMRRs could also use 2585 * the si_domain. 2586 */ 2587 for_each_rmrr_units(rmrr) { 2588 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt, 2589 i, dev) { 2590 unsigned long long start = rmrr->base_address; 2591 unsigned long long end = rmrr->end_address; 2592 2593 if (WARN_ON(end < start || 2594 end >> agaw_to_width(si_domain->agaw))) 2595 continue; 2596 2597 ret = iommu_domain_identity_map(si_domain, 2598 mm_to_dma_pfn(start >> PAGE_SHIFT), 2599 mm_to_dma_pfn(end >> PAGE_SHIFT)); 2600 if (ret) 2601 return ret; 2602 } 2603 } 2604 2605 return 0; 2606 } 2607 2608 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev) 2609 { 2610 struct dmar_domain *ndomain; 2611 struct intel_iommu *iommu; 2612 u8 bus, devfn; 2613 2614 iommu = device_to_iommu(dev, &bus, &devfn); 2615 if (!iommu) 2616 return -ENODEV; 2617 2618 ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain); 2619 if (ndomain != domain) 2620 return -EBUSY; 2621 2622 return 0; 2623 } 2624 2625 static bool device_has_rmrr(struct device *dev) 2626 { 2627 struct dmar_rmrr_unit *rmrr; 2628 struct device *tmp; 2629 int i; 2630 2631 rcu_read_lock(); 2632 for_each_rmrr_units(rmrr) { 2633 /* 2634 * Return TRUE if this RMRR contains the device that 2635 * is passed in. 2636 */ 2637 for_each_active_dev_scope(rmrr->devices, 2638 rmrr->devices_cnt, i, tmp) 2639 if (tmp == dev || 2640 is_downstream_to_pci_bridge(dev, tmp)) { 2641 rcu_read_unlock(); 2642 return true; 2643 } 2644 } 2645 rcu_read_unlock(); 2646 return false; 2647 } 2648 2649 /** 2650 * device_rmrr_is_relaxable - Test whether the RMRR of this device 2651 * is relaxable (ie. is allowed to be not enforced under some conditions) 2652 * @dev: device handle 2653 * 2654 * We assume that PCI USB devices with RMRRs have them largely 2655 * for historical reasons and that the RMRR space is not actively used post 2656 * boot. This exclusion may change if vendors begin to abuse it. 2657 * 2658 * The same exception is made for graphics devices, with the requirement that 2659 * any use of the RMRR regions will be torn down before assigning the device 2660 * to a guest. 2661 * 2662 * Return: true if the RMRR is relaxable, false otherwise 2663 */ 2664 static bool device_rmrr_is_relaxable(struct device *dev) 2665 { 2666 struct pci_dev *pdev; 2667 2668 if (!dev_is_pci(dev)) 2669 return false; 2670 2671 pdev = to_pci_dev(dev); 2672 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev)) 2673 return true; 2674 else 2675 return false; 2676 } 2677 2678 /* 2679 * There are a couple cases where we need to restrict the functionality of 2680 * devices associated with RMRRs. The first is when evaluating a device for 2681 * identity mapping because problems exist when devices are moved in and out 2682 * of domains and their respective RMRR information is lost. This means that 2683 * a device with associated RMRRs will never be in a "passthrough" domain. 2684 * The second is use of the device through the IOMMU API. This interface 2685 * expects to have full control of the IOVA space for the device. 
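* An RMRR, by contrast, is a firmware-reserved range that must stay identity-mapped for as long as the platform or device relies on it.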
We cannot 2686 * satisfy both the requirement that RMRR access is maintained and have an 2687 * unencumbered IOVA space. We also have no ability to quiesce the device's 2688 * use of the RMRR space or even inform the IOMMU API user of the restriction. 2689 * We therefore prevent devices associated with an RMRR from participating in 2690 * the IOMMU API, which eliminates them from device assignment. 2691 * 2692 * In both cases, devices which have relaxable RMRRs are not affected by this 2693 * restriction. See device_rmrr_is_relaxable comment. 2694 */ 2695 static bool device_is_rmrr_locked(struct device *dev) 2696 { 2697 if (!device_has_rmrr(dev)) 2698 return false; 2699 2700 if (device_rmrr_is_relaxable(dev)) 2701 return false; 2702 2703 return true; 2704 } 2705 2706 /* 2707 * Return the required default domain type for a specific device. 2708 * 2709 * @dev: the device in query 2710 * 2711 * 2712 * Returns: 2713 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain 2714 * - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain 2715 * - 0: both identity and dynamic domains work for this device 2716 */ 2717 static int device_def_domain_type(struct device *dev) 2718 { 2719 if (dev_is_pci(dev)) { 2720 struct pci_dev *pdev = to_pci_dev(dev); 2721 2722 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev)) 2723 return IOMMU_DOMAIN_IDENTITY; 2724 2725 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev)) 2726 return IOMMU_DOMAIN_IDENTITY; 2727 } 2728 2729 return 0; 2730 } 2731 2732 static void intel_iommu_init_qi(struct intel_iommu *iommu) 2733 { 2734 /* 2735 * Start from a sane iommu hardware state. 2736 * If the queued invalidation is already initialized by us 2737 * (for example, while enabling interrupt-remapping) then 2738 * we already have things rolling from a sane state. 2739 */ 2740 if (!iommu->qi) { 2741 /* 2742 * Clear any previous faults. 2743 */ 2744 dmar_fault(-1, iommu); 2745 /* 2746 * Disable queued invalidation if supported and already enabled 2747 * before OS handover. 2748 */ 2749 dmar_disable_qi(iommu); 2750 } 2751 2752 if (dmar_enable_qi(iommu)) { 2753 /* 2754 * Queued Invalidate not enabled, use Register Based Invalidate 2755 */ 2756 iommu->flush.flush_context = __iommu_flush_context; 2757 iommu->flush.flush_iotlb = __iommu_flush_iotlb; 2758 pr_info("%s: Using Register based invalidation\n", 2759 iommu->name); 2760 } else { 2761 iommu->flush.flush_context = qi_flush_context; 2762 iommu->flush.flush_iotlb = qi_flush_iotlb; 2763 pr_info("%s: Using Queued invalidation\n", iommu->name); 2764 } 2765 } 2766 2767 static int copy_context_table(struct intel_iommu *iommu, 2768 struct root_entry *old_re, 2769 struct context_entry **tbl, 2770 int bus, bool ext) 2771 { 2772 int tbl_idx, pos = 0, idx, devfn, ret = 0, did; 2773 struct context_entry *new_ce = NULL, ce; 2774 struct context_entry *old_ce = NULL; 2775 struct root_entry re; 2776 phys_addr_t old_ce_phys; 2777 2778 tbl_idx = ext ? bus * 2 : bus; 2779 memcpy(&re, old_re, sizeof(re)); 2780 2781 for (devfn = 0; devfn < 256; devfn++) { 2782 /* First calculate the correct index */ 2783 idx = (ext ?
devfn * 2 : devfn) % 256; 2784 2785 if (idx == 0) { 2786 /* First save what we may have and clean up */ 2787 if (new_ce) { 2788 tbl[tbl_idx] = new_ce; 2789 __iommu_flush_cache(iommu, new_ce, 2790 VTD_PAGE_SIZE); 2791 pos = 1; 2792 } 2793 2794 if (old_ce) 2795 memunmap(old_ce); 2796 2797 ret = 0; 2798 if (devfn < 0x80) 2799 old_ce_phys = root_entry_lctp(&re); 2800 else 2801 old_ce_phys = root_entry_uctp(&re); 2802 2803 if (!old_ce_phys) { 2804 if (ext && devfn == 0) { 2805 /* No LCTP, try UCTP */ 2806 devfn = 0x7f; 2807 continue; 2808 } else { 2809 goto out; 2810 } 2811 } 2812 2813 ret = -ENOMEM; 2814 old_ce = memremap(old_ce_phys, PAGE_SIZE, 2815 MEMREMAP_WB); 2816 if (!old_ce) 2817 goto out; 2818 2819 new_ce = alloc_pgtable_page(iommu->node); 2820 if (!new_ce) 2821 goto out_unmap; 2822 2823 ret = 0; 2824 } 2825 2826 /* Now copy the context entry */ 2827 memcpy(&ce, old_ce + idx, sizeof(ce)); 2828 2829 if (!__context_present(&ce)) 2830 continue; 2831 2832 did = context_domain_id(&ce); 2833 if (did >= 0 && did < cap_ndoms(iommu->cap)) 2834 set_bit(did, iommu->domain_ids); 2835 2836 /* 2837 * We need a marker for copied context entries. This 2838 * marker needs to work for the old format as well as 2839 * for extended context entries. 2840 * 2841 * Bit 67 of the context entry is used. In the old 2842 * format this bit is available to software, in the 2843 * extended format it is the PGE bit, but PGE is ignored 2844 * by HW if PASIDs are disabled (and thus still 2845 * available). 2846 * 2847 * So disable PASIDs first and then mark the entry 2848 * copied. This means that we don't copy PASID 2849 * translations from the old kernel, but this is fine as 2850 * faults there are not fatal. 2851 */ 2852 context_clear_pasid_enable(&ce); 2853 context_set_copied(&ce); 2854 2855 new_ce[idx] = ce; 2856 } 2857 2858 tbl[tbl_idx + pos] = new_ce; 2859 2860 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE); 2861 2862 out_unmap: 2863 memunmap(old_ce); 2864 2865 out: 2866 return ret; 2867 } 2868 2869 static int copy_translation_tables(struct intel_iommu *iommu) 2870 { 2871 struct context_entry **ctxt_tbls; 2872 struct root_entry *old_rt; 2873 phys_addr_t old_rt_phys; 2874 int ctxt_table_entries; 2875 unsigned long flags; 2876 u64 rtaddr_reg; 2877 int bus, ret; 2878 bool new_ext, ext; 2879 2880 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG); 2881 ext = !!(rtaddr_reg & DMA_RTADDR_RTT); 2882 new_ext = !!ecap_ecs(iommu->ecap); 2883 2884 /* 2885 * The RTT bit can only be changed when translation is disabled, 2886 * but disabling translation means to open a window for data 2887 * corruption. So bail out and don't copy anything if we would 2888 * have to change the bit. 2889 */ 2890 if (new_ext != ext) 2891 return -EINVAL; 2892 2893 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK; 2894 if (!old_rt_phys) 2895 return -EINVAL; 2896 2897 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB); 2898 if (!old_rt) 2899 return -ENOMEM; 2900 2901 /* This is too big for the stack - allocate it from slab */ 2902 ctxt_table_entries = ext ? 
512 : 256; 2903 ret = -ENOMEM; 2904 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL); 2905 if (!ctxt_tbls) 2906 goto out_unmap; 2907 2908 for (bus = 0; bus < 256; bus++) { 2909 ret = copy_context_table(iommu, &old_rt[bus], 2910 ctxt_tbls, bus, ext); 2911 if (ret) { 2912 pr_err("%s: Failed to copy context table for bus %d\n", 2913 iommu->name, bus); 2914 continue; 2915 } 2916 } 2917 2918 spin_lock_irqsave(&iommu->lock, flags); 2919 2920 /* Context tables are copied, now write them to the root_entry table */ 2921 for (bus = 0; bus < 256; bus++) { 2922 int idx = ext ? bus * 2 : bus; 2923 u64 val; 2924 2925 if (ctxt_tbls[idx]) { 2926 val = virt_to_phys(ctxt_tbls[idx]) | 1; 2927 iommu->root_entry[bus].lo = val; 2928 } 2929 2930 if (!ext || !ctxt_tbls[idx + 1]) 2931 continue; 2932 2933 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1; 2934 iommu->root_entry[bus].hi = val; 2935 } 2936 2937 spin_unlock_irqrestore(&iommu->lock, flags); 2938 2939 kfree(ctxt_tbls); 2940 2941 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE); 2942 2943 ret = 0; 2944 2945 out_unmap: 2946 memunmap(old_rt); 2947 2948 return ret; 2949 } 2950 2951 #ifdef CONFIG_INTEL_IOMMU_SVM 2952 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data) 2953 { 2954 struct intel_iommu *iommu = data; 2955 ioasid_t ioasid; 2956 2957 if (!iommu) 2958 return INVALID_IOASID; 2959 /* 2960 * VT-d virtual command interface always uses the full 20 bit 2961 * PASID range. Host can partition guest PASID range based on 2962 * policies but it is out of guest's control. 2963 */ 2964 if (min < PASID_MIN || max > intel_pasid_max_id) 2965 return INVALID_IOASID; 2966 2967 if (vcmd_alloc_pasid(iommu, &ioasid)) 2968 return INVALID_IOASID; 2969 2970 return ioasid; 2971 } 2972 2973 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data) 2974 { 2975 struct intel_iommu *iommu = data; 2976 2977 if (!iommu) 2978 return; 2979 /* 2980 * Sanity check the ioasid owner is done at upper layer, e.g. VFIO 2981 * We can only free the PASID when all the devices are unbound. 2982 */ 2983 if (ioasid_find(NULL, ioasid, NULL)) { 2984 pr_alert("Cannot free active IOASID %d\n", ioasid); 2985 return; 2986 } 2987 vcmd_free_pasid(iommu, ioasid); 2988 } 2989 2990 static void register_pasid_allocator(struct intel_iommu *iommu) 2991 { 2992 /* 2993 * If we are running in the host, no need for custom allocator 2994 * in that PASIDs are allocated from the host system-wide. 2995 */ 2996 if (!cap_caching_mode(iommu->cap)) 2997 return; 2998 2999 if (!sm_supported(iommu)) { 3000 pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n"); 3001 return; 3002 } 3003 3004 /* 3005 * Register a custom PASID allocator if we are running in a guest, 3006 * guest PASID must be obtained via virtual command interface. 3007 * There can be multiple vIOMMUs in each guest but only one allocator 3008 * is active. All vIOMMU allocators will eventually be calling the same 3009 * host allocator. 3010 */ 3011 if (!vccap_pasid(iommu->vccap)) 3012 return; 3013 3014 pr_info("Register custom PASID allocator\n"); 3015 iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc; 3016 iommu->pasid_allocator.free = intel_vcmd_ioasid_free; 3017 iommu->pasid_allocator.pdata = (void *)iommu; 3018 if (ioasid_register_allocator(&iommu->pasid_allocator)) { 3019 pr_warn("Custom PASID allocator failed, scalable mode disabled\n"); 3020 /* 3021 * Disable scalable mode on this IOMMU if there 3022 * is no custom allocator. 
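* Falling back to the default IOASID allocator would hand out PASIDs that are not backed by the host through the virtual command interface.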
Mixing SM capable vIOMMU 3023 * and non-SM vIOMMU are not supported. 3024 */ 3025 intel_iommu_sm = 0; 3026 } 3027 } 3028 #endif 3029 3030 static int __init init_dmars(void) 3031 { 3032 struct dmar_drhd_unit *drhd; 3033 struct intel_iommu *iommu; 3034 int ret; 3035 3036 /* 3037 * for each drhd 3038 * allocate root 3039 * initialize and program root entry to not present 3040 * endfor 3041 */ 3042 for_each_drhd_unit(drhd) { 3043 /* 3044 * lock not needed as this is only incremented in the single 3045 * threaded kernel __init code path all other access are read 3046 * only 3047 */ 3048 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) { 3049 g_num_of_iommus++; 3050 continue; 3051 } 3052 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED); 3053 } 3054 3055 /* Preallocate enough resources for IOMMU hot-addition */ 3056 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) 3057 g_num_of_iommus = DMAR_UNITS_SUPPORTED; 3058 3059 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *), 3060 GFP_KERNEL); 3061 if (!g_iommus) { 3062 ret = -ENOMEM; 3063 goto error; 3064 } 3065 3066 ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL); 3067 if (ret) 3068 goto free_iommu; 3069 3070 for_each_iommu(iommu, drhd) { 3071 if (drhd->ignored) { 3072 iommu_disable_translation(iommu); 3073 continue; 3074 } 3075 3076 /* 3077 * Find the max pasid size of all IOMMU's in the system. 3078 * We need to ensure the system pasid table is no bigger 3079 * than the smallest supported. 3080 */ 3081 if (pasid_supported(iommu)) { 3082 u32 temp = 2 << ecap_pss(iommu->ecap); 3083 3084 intel_pasid_max_id = min_t(u32, temp, 3085 intel_pasid_max_id); 3086 } 3087 3088 g_iommus[iommu->seq_id] = iommu; 3089 3090 intel_iommu_init_qi(iommu); 3091 3092 ret = iommu_init_domains(iommu); 3093 if (ret) 3094 goto free_iommu; 3095 3096 init_translation_status(iommu); 3097 3098 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) { 3099 iommu_disable_translation(iommu); 3100 clear_translation_pre_enabled(iommu); 3101 pr_warn("Translation was enabled for %s but we are not in kdump mode\n", 3102 iommu->name); 3103 } 3104 3105 /* 3106 * TBD: 3107 * we could share the same root & context tables 3108 * among all IOMMU's. Need to Split it later. 3109 */ 3110 ret = iommu_alloc_root_entry(iommu); 3111 if (ret) 3112 goto free_iommu; 3113 3114 if (translation_pre_enabled(iommu)) { 3115 pr_info("Translation already enabled - trying to copy translation structures\n"); 3116 3117 ret = copy_translation_tables(iommu); 3118 if (ret) { 3119 /* 3120 * We found the IOMMU with translation 3121 * enabled - but failed to copy over the 3122 * old root-entry table. Try to proceed 3123 * by disabling translation now and 3124 * allocating a clean root-entry table. 3125 * This might cause DMAR faults, but 3126 * probably the dump will still succeed. 3127 */ 3128 pr_err("Failed to copy translation tables from previous kernel for %s\n", 3129 iommu->name); 3130 iommu_disable_translation(iommu); 3131 clear_translation_pre_enabled(iommu); 3132 } else { 3133 pr_info("Copied translation tables from previous kernel for %s\n", 3134 iommu->name); 3135 } 3136 } 3137 3138 if (!ecap_pass_through(iommu->ecap)) 3139 hw_pass_through = 0; 3140 intel_svm_check(iommu); 3141 } 3142 3143 /* 3144 * Now that qi is enabled on all iommus, set the root entry and flush 3145 * caches. This is required on some Intel X58 chipsets, otherwise the 3146 * flush_context function will loop forever and the boot hangs. 
3147 */ 3148 for_each_active_iommu(iommu, drhd) { 3149 iommu_flush_write_buffer(iommu); 3150 #ifdef CONFIG_INTEL_IOMMU_SVM 3151 register_pasid_allocator(iommu); 3152 #endif 3153 iommu_set_root_entry(iommu); 3154 } 3155 3156 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA 3157 dmar_map_gfx = 0; 3158 #endif 3159 3160 if (!dmar_map_gfx) 3161 iommu_identity_mapping |= IDENTMAP_GFX; 3162 3163 check_tylersburg_isoch(); 3164 3165 ret = si_domain_init(hw_pass_through); 3166 if (ret) 3167 goto free_iommu; 3168 3169 /* 3170 * for each drhd 3171 * enable fault log 3172 * global invalidate context cache 3173 * global invalidate iotlb 3174 * enable translation 3175 */ 3176 for_each_iommu(iommu, drhd) { 3177 if (drhd->ignored) { 3178 /* 3179 * we always have to disable PMRs or DMA may fail on 3180 * this device 3181 */ 3182 if (force_on) 3183 iommu_disable_protect_mem_regions(iommu); 3184 continue; 3185 } 3186 3187 iommu_flush_write_buffer(iommu); 3188 3189 #ifdef CONFIG_INTEL_IOMMU_SVM 3190 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) { 3191 /* 3192 * Call dmar_alloc_hwirq() with dmar_global_lock held, 3193 * could cause possible lock race condition. 3194 */ 3195 up_write(&dmar_global_lock); 3196 ret = intel_svm_enable_prq(iommu); 3197 down_write(&dmar_global_lock); 3198 if (ret) 3199 goto free_iommu; 3200 } 3201 #endif 3202 ret = dmar_set_interrupt(iommu); 3203 if (ret) 3204 goto free_iommu; 3205 } 3206 3207 return 0; 3208 3209 free_iommu: 3210 for_each_active_iommu(iommu, drhd) { 3211 disable_dmar_iommu(iommu); 3212 free_dmar_iommu(iommu); 3213 } 3214 3215 kfree(g_iommus); 3216 3217 error: 3218 return ret; 3219 } 3220 3221 static void __init init_no_remapping_devices(void) 3222 { 3223 struct dmar_drhd_unit *drhd; 3224 struct device *dev; 3225 int i; 3226 3227 for_each_drhd_unit(drhd) { 3228 if (!drhd->include_all) { 3229 for_each_active_dev_scope(drhd->devices, 3230 drhd->devices_cnt, i, dev) 3231 break; 3232 /* ignore DMAR unit if no devices exist */ 3233 if (i == drhd->devices_cnt) 3234 drhd->ignored = 1; 3235 } 3236 } 3237 3238 for_each_active_drhd_unit(drhd) { 3239 if (drhd->include_all) 3240 continue; 3241 3242 for_each_active_dev_scope(drhd->devices, 3243 drhd->devices_cnt, i, dev) 3244 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev))) 3245 break; 3246 if (i < drhd->devices_cnt) 3247 continue; 3248 3249 /* This IOMMU has *only* gfx devices. 
Either bypass it or 3250 set the gfx_mapped flag, as appropriate */ 3251 drhd->gfx_dedicated = 1; 3252 if (!dmar_map_gfx) 3253 drhd->ignored = 1; 3254 } 3255 } 3256 3257 #ifdef CONFIG_SUSPEND 3258 static int init_iommu_hw(void) 3259 { 3260 struct dmar_drhd_unit *drhd; 3261 struct intel_iommu *iommu = NULL; 3262 3263 for_each_active_iommu(iommu, drhd) 3264 if (iommu->qi) 3265 dmar_reenable_qi(iommu); 3266 3267 for_each_iommu(iommu, drhd) { 3268 if (drhd->ignored) { 3269 /* 3270 * we always have to disable PMRs or DMA may fail on 3271 * this device 3272 */ 3273 if (force_on) 3274 iommu_disable_protect_mem_regions(iommu); 3275 continue; 3276 } 3277 3278 iommu_flush_write_buffer(iommu); 3279 iommu_set_root_entry(iommu); 3280 iommu_enable_translation(iommu); 3281 iommu_disable_protect_mem_regions(iommu); 3282 } 3283 3284 return 0; 3285 } 3286 3287 static void iommu_flush_all(void) 3288 { 3289 struct dmar_drhd_unit *drhd; 3290 struct intel_iommu *iommu; 3291 3292 for_each_active_iommu(iommu, drhd) { 3293 iommu->flush.flush_context(iommu, 0, 0, 0, 3294 DMA_CCMD_GLOBAL_INVL); 3295 iommu->flush.flush_iotlb(iommu, 0, 0, 0, 3296 DMA_TLB_GLOBAL_FLUSH); 3297 } 3298 } 3299 3300 static int iommu_suspend(void) 3301 { 3302 struct dmar_drhd_unit *drhd; 3303 struct intel_iommu *iommu = NULL; 3304 unsigned long flag; 3305 3306 for_each_active_iommu(iommu, drhd) { 3307 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32), 3308 GFP_KERNEL); 3309 if (!iommu->iommu_state) 3310 goto nomem; 3311 } 3312 3313 iommu_flush_all(); 3314 3315 for_each_active_iommu(iommu, drhd) { 3316 iommu_disable_translation(iommu); 3317 3318 raw_spin_lock_irqsave(&iommu->register_lock, flag); 3319 3320 iommu->iommu_state[SR_DMAR_FECTL_REG] = 3321 readl(iommu->reg + DMAR_FECTL_REG); 3322 iommu->iommu_state[SR_DMAR_FEDATA_REG] = 3323 readl(iommu->reg + DMAR_FEDATA_REG); 3324 iommu->iommu_state[SR_DMAR_FEADDR_REG] = 3325 readl(iommu->reg + DMAR_FEADDR_REG); 3326 iommu->iommu_state[SR_DMAR_FEUADDR_REG] = 3327 readl(iommu->reg + DMAR_FEUADDR_REG); 3328 3329 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 3330 } 3331 return 0; 3332 3333 nomem: 3334 for_each_active_iommu(iommu, drhd) 3335 kfree(iommu->iommu_state); 3336 3337 return -ENOMEM; 3338 } 3339 3340 static void iommu_resume(void) 3341 { 3342 struct dmar_drhd_unit *drhd; 3343 struct intel_iommu *iommu = NULL; 3344 unsigned long flag; 3345 3346 if (init_iommu_hw()) { 3347 if (force_on) 3348 panic("tboot: IOMMU setup failed, DMAR can not resume!\n"); 3349 else 3350 WARN(1, "IOMMU setup failed, DMAR can not resume!\n"); 3351 return; 3352 } 3353 3354 for_each_active_iommu(iommu, drhd) { 3355 3356 raw_spin_lock_irqsave(&iommu->register_lock, flag); 3357 3358 writel(iommu->iommu_state[SR_DMAR_FECTL_REG], 3359 iommu->reg + DMAR_FECTL_REG); 3360 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG], 3361 iommu->reg + DMAR_FEDATA_REG); 3362 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG], 3363 iommu->reg + DMAR_FEADDR_REG); 3364 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG], 3365 iommu->reg + DMAR_FEUADDR_REG); 3366 3367 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 3368 } 3369 3370 for_each_active_iommu(iommu, drhd) 3371 kfree(iommu->iommu_state); 3372 } 3373 3374 static struct syscore_ops iommu_syscore_ops = { 3375 .resume = iommu_resume, 3376 .suspend = iommu_suspend, 3377 }; 3378 3379 static void __init init_iommu_pm_ops(void) 3380 { 3381 register_syscore_ops(&iommu_syscore_ops); 3382 } 3383 3384 #else 3385 static inline void init_iommu_pm_ops(void) {} 3386 #endif /* 
CONFIG_PM */ 3387 3388 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr) 3389 { 3390 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) || 3391 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) || 3392 rmrr->end_address <= rmrr->base_address || 3393 arch_rmrr_sanity_check(rmrr)) 3394 return -EINVAL; 3395 3396 return 0; 3397 } 3398 3399 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg) 3400 { 3401 struct acpi_dmar_reserved_memory *rmrr; 3402 struct dmar_rmrr_unit *rmrru; 3403 3404 rmrr = (struct acpi_dmar_reserved_memory *)header; 3405 if (rmrr_sanity_check(rmrr)) { 3406 pr_warn(FW_BUG 3407 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n" 3408 "BIOS vendor: %s; Ver: %s; Product Version: %s\n", 3409 rmrr->base_address, rmrr->end_address, 3410 dmi_get_system_info(DMI_BIOS_VENDOR), 3411 dmi_get_system_info(DMI_BIOS_VERSION), 3412 dmi_get_system_info(DMI_PRODUCT_VERSION)); 3413 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); 3414 } 3415 3416 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL); 3417 if (!rmrru) 3418 goto out; 3419 3420 rmrru->hdr = header; 3421 3422 rmrru->base_address = rmrr->base_address; 3423 rmrru->end_address = rmrr->end_address; 3424 3425 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1), 3426 ((void *)rmrr) + rmrr->header.length, 3427 &rmrru->devices_cnt); 3428 if (rmrru->devices_cnt && rmrru->devices == NULL) 3429 goto free_rmrru; 3430 3431 list_add(&rmrru->list, &dmar_rmrr_units); 3432 3433 return 0; 3434 free_rmrru: 3435 kfree(rmrru); 3436 out: 3437 return -ENOMEM; 3438 } 3439 3440 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr) 3441 { 3442 struct dmar_atsr_unit *atsru; 3443 struct acpi_dmar_atsr *tmp; 3444 3445 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list, 3446 dmar_rcu_check()) { 3447 tmp = (struct acpi_dmar_atsr *)atsru->hdr; 3448 if (atsr->segment != tmp->segment) 3449 continue; 3450 if (atsr->header.length != tmp->header.length) 3451 continue; 3452 if (memcmp(atsr, tmp, atsr->header.length) == 0) 3453 return atsru; 3454 } 3455 3456 return NULL; 3457 } 3458 3459 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg) 3460 { 3461 struct acpi_dmar_atsr *atsr; 3462 struct dmar_atsr_unit *atsru; 3463 3464 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled) 3465 return 0; 3466 3467 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 3468 atsru = dmar_find_atsr(atsr); 3469 if (atsru) 3470 return 0; 3471 3472 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL); 3473 if (!atsru) 3474 return -ENOMEM; 3475 3476 /* 3477 * If memory is allocated from slab by ACPI _DSM method, we need to 3478 * copy the memory content because the memory buffer will be freed 3479 * on return. 
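* The copy lives in the same allocation, immediately after the dmar_atsr_unit itself, which is why hdr->length was added to the allocation size above.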
3480 */ 3481 atsru->hdr = (void *)(atsru + 1); 3482 memcpy(atsru->hdr, hdr, hdr->length); 3483 atsru->include_all = atsr->flags & 0x1; 3484 if (!atsru->include_all) { 3485 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1), 3486 (void *)atsr + atsr->header.length, 3487 &atsru->devices_cnt); 3488 if (atsru->devices_cnt && atsru->devices == NULL) { 3489 kfree(atsru); 3490 return -ENOMEM; 3491 } 3492 } 3493 3494 list_add_rcu(&atsru->list, &dmar_atsr_units); 3495 3496 return 0; 3497 } 3498 3499 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru) 3500 { 3501 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt); 3502 kfree(atsru); 3503 } 3504 3505 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg) 3506 { 3507 struct acpi_dmar_atsr *atsr; 3508 struct dmar_atsr_unit *atsru; 3509 3510 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 3511 atsru = dmar_find_atsr(atsr); 3512 if (atsru) { 3513 list_del_rcu(&atsru->list); 3514 synchronize_rcu(); 3515 intel_iommu_free_atsr(atsru); 3516 } 3517 3518 return 0; 3519 } 3520 3521 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg) 3522 { 3523 int i; 3524 struct device *dev; 3525 struct acpi_dmar_atsr *atsr; 3526 struct dmar_atsr_unit *atsru; 3527 3528 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 3529 atsru = dmar_find_atsr(atsr); 3530 if (!atsru) 3531 return 0; 3532 3533 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) { 3534 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt, 3535 i, dev) 3536 return -EBUSY; 3537 } 3538 3539 return 0; 3540 } 3541 3542 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc) 3543 { 3544 struct dmar_satc_unit *satcu; 3545 struct acpi_dmar_satc *tmp; 3546 3547 list_for_each_entry_rcu(satcu, &dmar_satc_units, list, 3548 dmar_rcu_check()) { 3549 tmp = (struct acpi_dmar_satc *)satcu->hdr; 3550 if (satc->segment != tmp->segment) 3551 continue; 3552 if (satc->header.length != tmp->header.length) 3553 continue; 3554 if (memcmp(satc, tmp, satc->header.length) == 0) 3555 return satcu; 3556 } 3557 3558 return NULL; 3559 } 3560 3561 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg) 3562 { 3563 struct acpi_dmar_satc *satc; 3564 struct dmar_satc_unit *satcu; 3565 3566 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled) 3567 return 0; 3568 3569 satc = container_of(hdr, struct acpi_dmar_satc, header); 3570 satcu = dmar_find_satc(satc); 3571 if (satcu) 3572 return 0; 3573 3574 satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL); 3575 if (!satcu) 3576 return -ENOMEM; 3577 3578 satcu->hdr = (void *)(satcu + 1); 3579 memcpy(satcu->hdr, hdr, hdr->length); 3580 satcu->atc_required = satc->flags & 0x1; 3581 satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1), 3582 (void *)satc + satc->header.length, 3583 &satcu->devices_cnt); 3584 if (satcu->devices_cnt && !satcu->devices) { 3585 kfree(satcu); 3586 return -ENOMEM; 3587 } 3588 list_add_rcu(&satcu->list, &dmar_satc_units); 3589 3590 return 0; 3591 } 3592 3593 static int intel_iommu_add(struct dmar_drhd_unit *dmaru) 3594 { 3595 int sp, ret; 3596 struct intel_iommu *iommu = dmaru->iommu; 3597 3598 if (g_iommus[iommu->seq_id]) 3599 return 0; 3600 3601 ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu); 3602 if (ret) 3603 goto out; 3604 3605 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) { 3606 pr_warn("%s: Doesn't support hardware pass through.\n", 3607 iommu->name); 3608 return -ENXIO; 3609 } 3610 if (!ecap_sc_support(iommu->ecap) && 3611 
domain_update_iommu_snooping(iommu)) { 3612 pr_warn("%s: Doesn't support snooping.\n", 3613 iommu->name); 3614 return -ENXIO; 3615 } 3616 sp = domain_update_iommu_superpage(NULL, iommu) - 1; 3617 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) { 3618 pr_warn("%s: Doesn't support large page.\n", 3619 iommu->name); 3620 return -ENXIO; 3621 } 3622 3623 /* 3624 * Disable translation if already enabled prior to OS handover. 3625 */ 3626 if (iommu->gcmd & DMA_GCMD_TE) 3627 iommu_disable_translation(iommu); 3628 3629 g_iommus[iommu->seq_id] = iommu; 3630 ret = iommu_init_domains(iommu); 3631 if (ret == 0) 3632 ret = iommu_alloc_root_entry(iommu); 3633 if (ret) 3634 goto out; 3635 3636 intel_svm_check(iommu); 3637 3638 if (dmaru->ignored) { 3639 /* 3640 * we always have to disable PMRs or DMA may fail on this device 3641 */ 3642 if (force_on) 3643 iommu_disable_protect_mem_regions(iommu); 3644 return 0; 3645 } 3646 3647 intel_iommu_init_qi(iommu); 3648 iommu_flush_write_buffer(iommu); 3649 3650 #ifdef CONFIG_INTEL_IOMMU_SVM 3651 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) { 3652 ret = intel_svm_enable_prq(iommu); 3653 if (ret) 3654 goto disable_iommu; 3655 } 3656 #endif 3657 ret = dmar_set_interrupt(iommu); 3658 if (ret) 3659 goto disable_iommu; 3660 3661 iommu_set_root_entry(iommu); 3662 iommu_enable_translation(iommu); 3663 3664 iommu_disable_protect_mem_regions(iommu); 3665 return 0; 3666 3667 disable_iommu: 3668 disable_dmar_iommu(iommu); 3669 out: 3670 free_dmar_iommu(iommu); 3671 return ret; 3672 } 3673 3674 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert) 3675 { 3676 int ret = 0; 3677 struct intel_iommu *iommu = dmaru->iommu; 3678 3679 if (!intel_iommu_enabled) 3680 return 0; 3681 if (iommu == NULL) 3682 return -EINVAL; 3683 3684 if (insert) { 3685 ret = intel_iommu_add(dmaru); 3686 } else { 3687 disable_dmar_iommu(iommu); 3688 free_dmar_iommu(iommu); 3689 } 3690 3691 return ret; 3692 } 3693 3694 static void intel_iommu_free_dmars(void) 3695 { 3696 struct dmar_rmrr_unit *rmrru, *rmrr_n; 3697 struct dmar_atsr_unit *atsru, *atsr_n; 3698 struct dmar_satc_unit *satcu, *satc_n; 3699 3700 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) { 3701 list_del(&rmrru->list); 3702 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt); 3703 kfree(rmrru); 3704 } 3705 3706 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) { 3707 list_del(&atsru->list); 3708 intel_iommu_free_atsr(atsru); 3709 } 3710 list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) { 3711 list_del(&satcu->list); 3712 dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt); 3713 kfree(satcu); 3714 } 3715 } 3716 3717 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev) 3718 { 3719 struct dmar_satc_unit *satcu; 3720 struct acpi_dmar_satc *satc; 3721 struct device *tmp; 3722 int i; 3723 3724 dev = pci_physfn(dev); 3725 rcu_read_lock(); 3726 3727 list_for_each_entry_rcu(satcu, &dmar_satc_units, list) { 3728 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header); 3729 if (satc->segment != pci_domain_nr(dev->bus)) 3730 continue; 3731 for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp) 3732 if (to_pci_dev(tmp) == dev) 3733 goto out; 3734 } 3735 satcu = NULL; 3736 out: 3737 rcu_read_unlock(); 3738 return satcu; 3739 } 3740 3741 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu) 3742 { 3743 int i, ret = 1; 3744 struct pci_bus *bus; 3745 struct pci_dev *bridge = NULL; 3746 struct device *tmp; 
3747 struct acpi_dmar_atsr *atsr; 3748 struct dmar_atsr_unit *atsru; 3749 struct dmar_satc_unit *satcu; 3750 3751 dev = pci_physfn(dev); 3752 satcu = dmar_find_matched_satc_unit(dev); 3753 if (satcu) 3754 /* 3755 * This device supports ATS as it is in SATC table. 3756 * When IOMMU is in legacy mode, enabling ATS is done 3757 * automatically by HW for the device that requires 3758 * ATS, hence OS should not enable this device ATS 3759 * to avoid duplicated TLB invalidation. 3760 */ 3761 return !(satcu->atc_required && !sm_supported(iommu)); 3762 3763 for (bus = dev->bus; bus; bus = bus->parent) { 3764 bridge = bus->self; 3765 /* If it's an integrated device, allow ATS */ 3766 if (!bridge) 3767 return 1; 3768 /* Connected via non-PCIe: no ATS */ 3769 if (!pci_is_pcie(bridge) || 3770 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE) 3771 return 0; 3772 /* If we found the root port, look it up in the ATSR */ 3773 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT) 3774 break; 3775 } 3776 3777 rcu_read_lock(); 3778 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) { 3779 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header); 3780 if (atsr->segment != pci_domain_nr(dev->bus)) 3781 continue; 3782 3783 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp) 3784 if (tmp == &bridge->dev) 3785 goto out; 3786 3787 if (atsru->include_all) 3788 goto out; 3789 } 3790 ret = 0; 3791 out: 3792 rcu_read_unlock(); 3793 3794 return ret; 3795 } 3796 3797 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info) 3798 { 3799 int ret; 3800 struct dmar_rmrr_unit *rmrru; 3801 struct dmar_atsr_unit *atsru; 3802 struct dmar_satc_unit *satcu; 3803 struct acpi_dmar_atsr *atsr; 3804 struct acpi_dmar_reserved_memory *rmrr; 3805 struct acpi_dmar_satc *satc; 3806 3807 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING) 3808 return 0; 3809 3810 list_for_each_entry(rmrru, &dmar_rmrr_units, list) { 3811 rmrr = container_of(rmrru->hdr, 3812 struct acpi_dmar_reserved_memory, header); 3813 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 3814 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1), 3815 ((void *)rmrr) + rmrr->header.length, 3816 rmrr->segment, rmrru->devices, 3817 rmrru->devices_cnt); 3818 if (ret < 0) 3819 return ret; 3820 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 3821 dmar_remove_dev_scope(info, rmrr->segment, 3822 rmrru->devices, rmrru->devices_cnt); 3823 } 3824 } 3825 3826 list_for_each_entry(atsru, &dmar_atsr_units, list) { 3827 if (atsru->include_all) 3828 continue; 3829 3830 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header); 3831 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 3832 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1), 3833 (void *)atsr + atsr->header.length, 3834 atsr->segment, atsru->devices, 3835 atsru->devices_cnt); 3836 if (ret > 0) 3837 break; 3838 else if (ret < 0) 3839 return ret; 3840 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 3841 if (dmar_remove_dev_scope(info, atsr->segment, 3842 atsru->devices, atsru->devices_cnt)) 3843 break; 3844 } 3845 } 3846 list_for_each_entry(satcu, &dmar_satc_units, list) { 3847 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header); 3848 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 3849 ret = dmar_insert_dev_scope(info, (void *)(satc + 1), 3850 (void *)satc + satc->header.length, 3851 satc->segment, satcu->devices, 3852 satcu->devices_cnt); 3853 if (ret > 0) 3854 break; 3855 else if (ret < 0) 3856 return ret; 3857 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 3858 if 
(dmar_remove_dev_scope(info, satc->segment, 3859 satcu->devices, satcu->devices_cnt)) 3860 break; 3861 } 3862 } 3863 3864 return 0; 3865 } 3866 3867 static int intel_iommu_memory_notifier(struct notifier_block *nb, 3868 unsigned long val, void *v) 3869 { 3870 struct memory_notify *mhp = v; 3871 unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn); 3872 unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn + 3873 mhp->nr_pages - 1); 3874 3875 switch (val) { 3876 case MEM_GOING_ONLINE: 3877 if (iommu_domain_identity_map(si_domain, 3878 start_vpfn, last_vpfn)) { 3879 pr_warn("Failed to build identity map for [%lx-%lx]\n", 3880 start_vpfn, last_vpfn); 3881 return NOTIFY_BAD; 3882 } 3883 break; 3884 3885 case MEM_OFFLINE: 3886 case MEM_CANCEL_ONLINE: 3887 { 3888 struct dmar_drhd_unit *drhd; 3889 struct intel_iommu *iommu; 3890 LIST_HEAD(freelist); 3891 3892 domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist); 3893 3894 rcu_read_lock(); 3895 for_each_active_iommu(iommu, drhd) 3896 iommu_flush_iotlb_psi(iommu, si_domain, 3897 start_vpfn, mhp->nr_pages, 3898 list_empty(&freelist), 0); 3899 rcu_read_unlock(); 3900 put_pages_list(&freelist); 3901 } 3902 break; 3903 } 3904 3905 return NOTIFY_OK; 3906 } 3907 3908 static struct notifier_block intel_iommu_memory_nb = { 3909 .notifier_call = intel_iommu_memory_notifier, 3910 .priority = 0 3911 }; 3912 3913 static void intel_disable_iommus(void) 3914 { 3915 struct intel_iommu *iommu = NULL; 3916 struct dmar_drhd_unit *drhd; 3917 3918 for_each_iommu(iommu, drhd) 3919 iommu_disable_translation(iommu); 3920 } 3921 3922 void intel_iommu_shutdown(void) 3923 { 3924 struct dmar_drhd_unit *drhd; 3925 struct intel_iommu *iommu = NULL; 3926 3927 if (no_iommu || dmar_disabled) 3928 return; 3929 3930 down_write(&dmar_global_lock); 3931 3932 /* Disable PMRs explicitly here. 
*/ 3933 for_each_iommu(iommu, drhd) 3934 iommu_disable_protect_mem_regions(iommu); 3935 3936 /* Make sure the IOMMUs are switched off */ 3937 intel_disable_iommus(); 3938 3939 up_write(&dmar_global_lock); 3940 } 3941 3942 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev) 3943 { 3944 struct iommu_device *iommu_dev = dev_to_iommu_device(dev); 3945 3946 return container_of(iommu_dev, struct intel_iommu, iommu); 3947 } 3948 3949 static ssize_t version_show(struct device *dev, 3950 struct device_attribute *attr, char *buf) 3951 { 3952 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3953 u32 ver = readl(iommu->reg + DMAR_VER_REG); 3954 return sprintf(buf, "%d:%d\n", 3955 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver)); 3956 } 3957 static DEVICE_ATTR_RO(version); 3958 3959 static ssize_t address_show(struct device *dev, 3960 struct device_attribute *attr, char *buf) 3961 { 3962 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3963 return sprintf(buf, "%llx\n", iommu->reg_phys); 3964 } 3965 static DEVICE_ATTR_RO(address); 3966 3967 static ssize_t cap_show(struct device *dev, 3968 struct device_attribute *attr, char *buf) 3969 { 3970 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3971 return sprintf(buf, "%llx\n", iommu->cap); 3972 } 3973 static DEVICE_ATTR_RO(cap); 3974 3975 static ssize_t ecap_show(struct device *dev, 3976 struct device_attribute *attr, char *buf) 3977 { 3978 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3979 return sprintf(buf, "%llx\n", iommu->ecap); 3980 } 3981 static DEVICE_ATTR_RO(ecap); 3982 3983 static ssize_t domains_supported_show(struct device *dev, 3984 struct device_attribute *attr, char *buf) 3985 { 3986 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3987 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap)); 3988 } 3989 static DEVICE_ATTR_RO(domains_supported); 3990 3991 static ssize_t domains_used_show(struct device *dev, 3992 struct device_attribute *attr, char *buf) 3993 { 3994 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3995 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids, 3996 cap_ndoms(iommu->cap))); 3997 } 3998 static DEVICE_ATTR_RO(domains_used); 3999 4000 static struct attribute *intel_iommu_attrs[] = { 4001 &dev_attr_version.attr, 4002 &dev_attr_address.attr, 4003 &dev_attr_cap.attr, 4004 &dev_attr_ecap.attr, 4005 &dev_attr_domains_supported.attr, 4006 &dev_attr_domains_used.attr, 4007 NULL, 4008 }; 4009 4010 static struct attribute_group intel_iommu_group = { 4011 .name = "intel-iommu", 4012 .attrs = intel_iommu_attrs, 4013 }; 4014 4015 const struct attribute_group *intel_iommu_groups[] = { 4016 &intel_iommu_group, 4017 NULL, 4018 }; 4019 4020 static inline bool has_external_pci(void) 4021 { 4022 struct pci_dev *pdev = NULL; 4023 4024 for_each_pci_dev(pdev) 4025 if (pdev->external_facing) 4026 return true; 4027 4028 return false; 4029 } 4030 4031 static int __init platform_optin_force_iommu(void) 4032 { 4033 if (!dmar_platform_optin() || no_platform_optin || !has_external_pci()) 4034 return 0; 4035 4036 if (no_iommu || dmar_disabled) 4037 pr_info("Intel-IOMMU force enabled due to platform opt in\n"); 4038 4039 /* 4040 * If Intel-IOMMU is disabled by default, we will apply identity 4041 * map for all devices except those marked as being untrusted. 
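* Defaulting to passthrough keeps those devices working as they did with the IOMMU disabled, while untrusted (external-facing) devices still get isolated DMA domains.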
4042 */ 4043 if (dmar_disabled) 4044 iommu_set_default_passthrough(false); 4045 4046 dmar_disabled = 0; 4047 no_iommu = 0; 4048 4049 return 1; 4050 } 4051 4052 static int __init probe_acpi_namespace_devices(void) 4053 { 4054 struct dmar_drhd_unit *drhd; 4055 /* To avoid a -Wunused-but-set-variable warning. */ 4056 struct intel_iommu *iommu __maybe_unused; 4057 struct device *dev; 4058 int i, ret = 0; 4059 4060 for_each_active_iommu(iommu, drhd) { 4061 for_each_active_dev_scope(drhd->devices, 4062 drhd->devices_cnt, i, dev) { 4063 struct acpi_device_physical_node *pn; 4064 struct iommu_group *group; 4065 struct acpi_device *adev; 4066 4067 if (dev->bus != &acpi_bus_type) 4068 continue; 4069 4070 adev = to_acpi_device(dev); 4071 mutex_lock(&adev->physical_node_lock); 4072 list_for_each_entry(pn, 4073 &adev->physical_node_list, node) { 4074 group = iommu_group_get(pn->dev); 4075 if (group) { 4076 iommu_group_put(group); 4077 continue; 4078 } 4079 4080 pn->dev->bus->iommu_ops = &intel_iommu_ops; 4081 ret = iommu_probe_device(pn->dev); 4082 if (ret) 4083 break; 4084 } 4085 mutex_unlock(&adev->physical_node_lock); 4086 4087 if (ret) 4088 return ret; 4089 } 4090 } 4091 4092 return 0; 4093 } 4094 4095 int __init intel_iommu_init(void) 4096 { 4097 int ret = -ENODEV; 4098 struct dmar_drhd_unit *drhd; 4099 struct intel_iommu *iommu; 4100 4101 /* 4102 * Intel IOMMU is required for a TXT/tboot launch or platform 4103 * opt in, so enforce that. 4104 */ 4105 force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) || 4106 platform_optin_force_iommu(); 4107 4108 down_write(&dmar_global_lock); 4109 if (dmar_table_init()) { 4110 if (force_on) 4111 panic("tboot: Failed to initialize DMAR table\n"); 4112 goto out_free_dmar; 4113 } 4114 4115 if (dmar_dev_scope_init() < 0) { 4116 if (force_on) 4117 panic("tboot: Failed to initialize DMAR device scope\n"); 4118 goto out_free_dmar; 4119 } 4120 4121 up_write(&dmar_global_lock); 4122 4123 /* 4124 * The bus notifier takes the dmar_global_lock, so lockdep will 4125 * complain later when we register it under the lock. 4126 */ 4127 dmar_register_bus_notifier(); 4128 4129 down_write(&dmar_global_lock); 4130 4131 if (!no_iommu) 4132 intel_iommu_debugfs_init(); 4133 4134 if (no_iommu || dmar_disabled) { 4135 /* 4136 * We exit the function here to ensure IOMMU's remapping and 4137 * mempool aren't setup, which means that the IOMMU's PMRs 4138 * won't be disabled via the call to init_dmars(). So disable 4139 * it explicitly here. The PMRs were setup by tboot prior to 4140 * calling SENTER, but the kernel is expected to reset/tear 4141 * down the PMRs. 
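 * (The intel_iommu_tboot_noforce check below limits this explicit
 * teardown to the tboot_noforce configuration, where tboot may have
 * programmed PMRs that init_dmars() will never get a chance to clear.)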
4142 */ 4143 if (intel_iommu_tboot_noforce) { 4144 for_each_iommu(iommu, drhd) 4145 iommu_disable_protect_mem_regions(iommu); 4146 } 4147 4148 /* 4149 * Make sure the IOMMUs are switched off, even when we 4150 * boot into a kexec kernel and the previous kernel left 4151 * them enabled 4152 */ 4153 intel_disable_iommus(); 4154 goto out_free_dmar; 4155 } 4156 4157 if (list_empty(&dmar_rmrr_units)) 4158 pr_info("No RMRR found\n"); 4159 4160 if (list_empty(&dmar_atsr_units)) 4161 pr_info("No ATSR found\n"); 4162 4163 if (list_empty(&dmar_satc_units)) 4164 pr_info("No SATC found\n"); 4165 4166 if (dmar_map_gfx) 4167 intel_iommu_gfx_mapped = 1; 4168 4169 init_no_remapping_devices(); 4170 4171 ret = init_dmars(); 4172 if (ret) { 4173 if (force_on) 4174 panic("tboot: Failed to initialize DMARs\n"); 4175 pr_err("Initialization failed\n"); 4176 goto out_free_dmar; 4177 } 4178 up_write(&dmar_global_lock); 4179 4180 init_iommu_pm_ops(); 4181 4182 down_read(&dmar_global_lock); 4183 for_each_active_iommu(iommu, drhd) { 4184 /* 4185 * The flush queue implementation does not perform 4186 * page-selective invalidations that are required for efficient 4187 * TLB flushes in virtual environments. The benefit of batching 4188 * is likely to be much lower than the overhead of synchronizing 4189 * the virtual and physical IOMMU page-tables. 4190 */ 4191 if (cap_caching_mode(iommu->cap)) { 4192 pr_info_once("IOMMU batching disallowed due to virtualization\n"); 4193 iommu_set_dma_strict(); 4194 } 4195 iommu_device_sysfs_add(&iommu->iommu, NULL, 4196 intel_iommu_groups, 4197 "%s", iommu->name); 4198 iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL); 4199 } 4200 up_read(&dmar_global_lock); 4201 4202 bus_set_iommu(&pci_bus_type, &intel_iommu_ops); 4203 if (si_domain && !hw_pass_through) 4204 register_memory_notifier(&intel_iommu_memory_nb); 4205 4206 down_read(&dmar_global_lock); 4207 if (probe_acpi_namespace_devices()) 4208 pr_warn("ACPI name space devices didn't probe correctly\n"); 4209 4210 /* Finally, we enable the DMA remapping hardware. */ 4211 for_each_iommu(iommu, drhd) { 4212 if (!drhd->ignored && !translation_pre_enabled(iommu)) 4213 iommu_enable_translation(iommu); 4214 4215 iommu_disable_protect_mem_regions(iommu); 4216 } 4217 up_read(&dmar_global_lock); 4218 4219 pr_info("Intel(R) Virtualization Technology for Directed I/O\n"); 4220 4221 intel_iommu_enabled = 1; 4222 4223 return 0; 4224 4225 out_free_dmar: 4226 intel_iommu_free_dmars(); 4227 up_write(&dmar_global_lock); 4228 return ret; 4229 } 4230 4231 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque) 4232 { 4233 struct device_domain_info *info = opaque; 4234 4235 domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff); 4236 return 0; 4237 } 4238 4239 /* 4240 * NB - intel-iommu lacks any sort of reference counting for the users of 4241 * dependent devices. If multiple endpoints have intersecting dependent 4242 * devices, unbinding the driver from any one of them will possibly leave 4243 * the others unable to operate. 
4244 */ 4245 static void domain_context_clear(struct device_domain_info *info) 4246 { 4247 if (!info->iommu || !info->dev || !dev_is_pci(info->dev)) 4248 return; 4249 4250 pci_for_each_dma_alias(to_pci_dev(info->dev), 4251 &domain_context_clear_one_cb, info); 4252 } 4253 4254 static void __dmar_remove_one_dev_info(struct device_domain_info *info) 4255 { 4256 struct dmar_domain *domain; 4257 struct intel_iommu *iommu; 4258 unsigned long flags; 4259 4260 assert_spin_locked(&device_domain_lock); 4261 4262 if (WARN_ON(!info)) 4263 return; 4264 4265 iommu = info->iommu; 4266 domain = info->domain; 4267 4268 if (info->dev && !dev_is_real_dma_subdevice(info->dev)) { 4269 if (dev_is_pci(info->dev) && sm_supported(iommu)) 4270 intel_pasid_tear_down_entry(iommu, info->dev, 4271 PASID_RID2PASID, false); 4272 4273 iommu_disable_dev_iotlb(info); 4274 domain_context_clear(info); 4275 intel_pasid_free_table(info->dev); 4276 } 4277 4278 list_del(&info->link); 4279 4280 spin_lock_irqsave(&iommu->lock, flags); 4281 domain_detach_iommu(domain, iommu); 4282 spin_unlock_irqrestore(&iommu->lock, flags); 4283 } 4284 4285 static void dmar_remove_one_dev_info(struct device *dev) 4286 { 4287 struct device_domain_info *info; 4288 unsigned long flags; 4289 4290 spin_lock_irqsave(&device_domain_lock, flags); 4291 info = dev_iommu_priv_get(dev); 4292 if (info) 4293 __dmar_remove_one_dev_info(info); 4294 spin_unlock_irqrestore(&device_domain_lock, flags); 4295 } 4296 4297 static int md_domain_init(struct dmar_domain *domain, int guest_width) 4298 { 4299 int adjust_width; 4300 4301 /* calculate AGAW */ 4302 domain->gaw = guest_width; 4303 adjust_width = guestwidth_to_adjustwidth(guest_width); 4304 domain->agaw = width_to_agaw(adjust_width); 4305 4306 domain->iommu_coherency = false; 4307 domain->iommu_snooping = false; 4308 domain->iommu_superpage = 0; 4309 domain->max_addr = 0; 4310 4311 /* always allocate the top pgd */ 4312 domain->pgd = alloc_pgtable_page(domain->nid); 4313 if (!domain->pgd) 4314 return -ENOMEM; 4315 domain_flush_cache(domain, domain->pgd, PAGE_SIZE); 4316 return 0; 4317 } 4318 4319 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type) 4320 { 4321 struct dmar_domain *dmar_domain; 4322 struct iommu_domain *domain; 4323 4324 switch (type) { 4325 case IOMMU_DOMAIN_DMA: 4326 case IOMMU_DOMAIN_DMA_FQ: 4327 case IOMMU_DOMAIN_UNMANAGED: 4328 dmar_domain = alloc_domain(type); 4329 if (!dmar_domain) { 4330 pr_err("Can't allocate dmar_domain\n"); 4331 return NULL; 4332 } 4333 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { 4334 pr_err("Domain initialization failed\n"); 4335 domain_exit(dmar_domain); 4336 return NULL; 4337 } 4338 4339 domain = &dmar_domain->domain; 4340 domain->geometry.aperture_start = 0; 4341 domain->geometry.aperture_end = 4342 __DOMAIN_MAX_ADDR(dmar_domain->gaw); 4343 domain->geometry.force_aperture = true; 4344 4345 return domain; 4346 case IOMMU_DOMAIN_IDENTITY: 4347 return &si_domain->domain; 4348 default: 4349 return NULL; 4350 } 4351 4352 return NULL; 4353 } 4354 4355 static void intel_iommu_domain_free(struct iommu_domain *domain) 4356 { 4357 if (domain != &si_domain->domain) 4358 domain_exit(to_dmar_domain(domain)); 4359 } 4360 4361 static int prepare_domain_attach_device(struct iommu_domain *domain, 4362 struct device *dev) 4363 { 4364 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4365 struct intel_iommu *iommu; 4366 int addr_width; 4367 4368 iommu = device_to_iommu(dev, NULL, NULL); 4369 if (!iommu) 4370 return -ENODEV; 4371 4372 /* check 
if this iommu agaw is sufficient for max mapped address */ 4373 addr_width = agaw_to_width(iommu->agaw); 4374 if (addr_width > cap_mgaw(iommu->cap)) 4375 addr_width = cap_mgaw(iommu->cap); 4376 4377 if (dmar_domain->max_addr > (1LL << addr_width)) { 4378 dev_err(dev, "%s: iommu width (%d) is not " 4379 "sufficient for the mapped address (%llx)\n", 4380 __func__, addr_width, dmar_domain->max_addr); 4381 return -EFAULT; 4382 } 4383 dmar_domain->gaw = addr_width; 4384 4385 /* 4386 * Knock out extra levels of page tables if necessary 4387 */ 4388 while (iommu->agaw < dmar_domain->agaw) { 4389 struct dma_pte *pte; 4390 4391 pte = dmar_domain->pgd; 4392 if (dma_pte_present(pte)) { 4393 dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte)); 4394 free_pgtable_page(pte); 4395 } 4396 dmar_domain->agaw--; 4397 } 4398 4399 return 0; 4400 } 4401 4402 static int intel_iommu_attach_device(struct iommu_domain *domain, 4403 struct device *dev) 4404 { 4405 int ret; 4406 4407 if (domain->type == IOMMU_DOMAIN_UNMANAGED && 4408 device_is_rmrr_locked(dev)) { 4409 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n"); 4410 return -EPERM; 4411 } 4412 4413 /* normally dev is not mapped */ 4414 if (unlikely(domain_context_mapped(dev))) { 4415 struct device_domain_info *info = dev_iommu_priv_get(dev); 4416 4417 if (info->domain) 4418 dmar_remove_one_dev_info(dev); 4419 } 4420 4421 ret = prepare_domain_attach_device(domain, dev); 4422 if (ret) 4423 return ret; 4424 4425 return domain_add_dev_info(to_dmar_domain(domain), dev); 4426 } 4427 4428 static void intel_iommu_detach_device(struct iommu_domain *domain, 4429 struct device *dev) 4430 { 4431 dmar_remove_one_dev_info(dev); 4432 } 4433 4434 static int intel_iommu_map(struct iommu_domain *domain, 4435 unsigned long iova, phys_addr_t hpa, 4436 size_t size, int iommu_prot, gfp_t gfp) 4437 { 4438 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4439 u64 max_addr; 4440 int prot = 0; 4441 4442 if (iommu_prot & IOMMU_READ) 4443 prot |= DMA_PTE_READ; 4444 if (iommu_prot & IOMMU_WRITE) 4445 prot |= DMA_PTE_WRITE; 4446 if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping) 4447 prot |= DMA_PTE_SNP; 4448 4449 max_addr = iova + size; 4450 if (dmar_domain->max_addr < max_addr) { 4451 u64 end; 4452 4453 /* check if minimum agaw is sufficient for mapped address */ 4454 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1; 4455 if (end < max_addr) { 4456 pr_err("%s: iommu width (%d) is not " 4457 "sufficient for the mapped address (%llx)\n", 4458 __func__, dmar_domain->gaw, max_addr); 4459 return -EFAULT; 4460 } 4461 dmar_domain->max_addr = max_addr; 4462 } 4463 /* Round up size to next multiple of PAGE_SIZE, if it and 4464 the low bits of hpa would take us onto the next page */ 4465 size = aligned_nrpages(hpa, size); 4466 return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT, 4467 hpa >> VTD_PAGE_SHIFT, size, prot); 4468 } 4469 4470 static int intel_iommu_map_pages(struct iommu_domain *domain, 4471 unsigned long iova, phys_addr_t paddr, 4472 size_t pgsize, size_t pgcount, 4473 int prot, gfp_t gfp, size_t *mapped) 4474 { 4475 unsigned long pgshift = __ffs(pgsize); 4476 size_t size = pgcount << pgshift; 4477 int ret; 4478 4479 if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G) 4480 return -EINVAL; 4481 4482 if (!IS_ALIGNED(iova | paddr, pgsize)) 4483 return -EINVAL; 4484 4485 ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp); 4486 if (!ret && mapped) 4487 *mapped = size; 4488 
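/*
 * On success the single intel_iommu_map() call above covered the whole
 * pgcount * pgsize range, so the full byte count is reported back
 * through *mapped.
 */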
4489 return ret; 4490 } 4491 4492 static size_t intel_iommu_unmap(struct iommu_domain *domain, 4493 unsigned long iova, size_t size, 4494 struct iommu_iotlb_gather *gather) 4495 { 4496 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4497 unsigned long start_pfn, last_pfn; 4498 int level = 0; 4499 4500 /* Cope with horrid API which requires us to unmap more than the 4501 size argument if it happens to be a large-page mapping. */ 4502 BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level)); 4503 4504 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level)) 4505 size = VTD_PAGE_SIZE << level_to_offset_bits(level); 4506 4507 start_pfn = iova >> VTD_PAGE_SHIFT; 4508 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT; 4509 4510 domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist); 4511 4512 if (dmar_domain->max_addr == iova + size) 4513 dmar_domain->max_addr = iova; 4514 4515 iommu_iotlb_gather_add_page(domain, gather, iova, size); 4516 4517 return size; 4518 } 4519 4520 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain, 4521 unsigned long iova, 4522 size_t pgsize, size_t pgcount, 4523 struct iommu_iotlb_gather *gather) 4524 { 4525 unsigned long pgshift = __ffs(pgsize); 4526 size_t size = pgcount << pgshift; 4527 4528 return intel_iommu_unmap(domain, iova, size, gather); 4529 } 4530 4531 static void intel_iommu_tlb_sync(struct iommu_domain *domain, 4532 struct iommu_iotlb_gather *gather) 4533 { 4534 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4535 unsigned long iova_pfn = IOVA_PFN(gather->start); 4536 size_t size = gather->end - gather->start; 4537 unsigned long start_pfn; 4538 unsigned long nrpages; 4539 int iommu_id; 4540 4541 nrpages = aligned_nrpages(gather->start, size); 4542 start_pfn = mm_to_dma_pfn(iova_pfn); 4543 4544 for_each_domain_iommu(iommu_id, dmar_domain) 4545 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain, 4546 start_pfn, nrpages, 4547 list_empty(&gather->freelist), 0); 4548 4549 put_pages_list(&gather->freelist); 4550 } 4551 4552 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain, 4553 dma_addr_t iova) 4554 { 4555 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4556 struct dma_pte *pte; 4557 int level = 0; 4558 u64 phys = 0; 4559 4560 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level); 4561 if (pte && dma_pte_present(pte)) 4562 phys = dma_pte_addr(pte) + 4563 (iova & (BIT_MASK(level_to_offset_bits(level) + 4564 VTD_PAGE_SHIFT) - 1)); 4565 4566 return phys; 4567 } 4568 4569 static bool intel_iommu_capable(enum iommu_cap cap) 4570 { 4571 if (cap == IOMMU_CAP_CACHE_COHERENCY) 4572 return domain_update_iommu_snooping(NULL); 4573 if (cap == IOMMU_CAP_INTR_REMAP) 4574 return irq_remapping_enabled == 1; 4575 4576 return false; 4577 } 4578 4579 static struct iommu_device *intel_iommu_probe_device(struct device *dev) 4580 { 4581 struct pci_dev *pdev = dev_is_pci(dev) ? 
to_pci_dev(dev) : NULL; 4582 struct device_domain_info *info; 4583 struct intel_iommu *iommu; 4584 unsigned long flags; 4585 u8 bus, devfn; 4586 4587 iommu = device_to_iommu(dev, &bus, &devfn); 4588 if (!iommu) 4589 return ERR_PTR(-ENODEV); 4590 4591 info = kzalloc(sizeof(*info), GFP_KERNEL); 4592 if (!info) 4593 return ERR_PTR(-ENOMEM); 4594 4595 if (dev_is_real_dma_subdevice(dev)) { 4596 info->bus = pdev->bus->number; 4597 info->devfn = pdev->devfn; 4598 info->segment = pci_domain_nr(pdev->bus); 4599 } else { 4600 info->bus = bus; 4601 info->devfn = devfn; 4602 info->segment = iommu->segment; 4603 } 4604 4605 info->dev = dev; 4606 info->iommu = iommu; 4607 if (dev_is_pci(dev)) { 4608 if (ecap_dev_iotlb_support(iommu->ecap) && 4609 pci_ats_supported(pdev) && 4610 dmar_ats_supported(pdev, iommu)) 4611 info->ats_supported = 1; 4612 4613 if (sm_supported(iommu)) { 4614 if (pasid_supported(iommu)) { 4615 int features = pci_pasid_features(pdev); 4616 4617 if (features >= 0) 4618 info->pasid_supported = features | 1; 4619 } 4620 4621 if (info->ats_supported && ecap_prs(iommu->ecap) && 4622 pci_pri_supported(pdev)) 4623 info->pri_supported = 1; 4624 } 4625 } 4626 4627 spin_lock_irqsave(&device_domain_lock, flags); 4628 list_add(&info->global, &device_domain_list); 4629 dev_iommu_priv_set(dev, info); 4630 spin_unlock_irqrestore(&device_domain_lock, flags); 4631 4632 return &iommu->iommu; 4633 } 4634 4635 static void intel_iommu_release_device(struct device *dev) 4636 { 4637 struct device_domain_info *info = dev_iommu_priv_get(dev); 4638 unsigned long flags; 4639 4640 dmar_remove_one_dev_info(dev); 4641 4642 spin_lock_irqsave(&device_domain_lock, flags); 4643 dev_iommu_priv_set(dev, NULL); 4644 list_del(&info->global); 4645 spin_unlock_irqrestore(&device_domain_lock, flags); 4646 4647 kfree(info); 4648 set_dma_ops(dev, NULL); 4649 } 4650 4651 static void intel_iommu_probe_finalize(struct device *dev) 4652 { 4653 set_dma_ops(dev, NULL); 4654 iommu_setup_dma_ops(dev, 0, U64_MAX); 4655 } 4656 4657 static void intel_iommu_get_resv_regions(struct device *device, 4658 struct list_head *head) 4659 { 4660 int prot = DMA_PTE_READ | DMA_PTE_WRITE; 4661 struct iommu_resv_region *reg; 4662 struct dmar_rmrr_unit *rmrr; 4663 struct device *i_dev; 4664 int i; 4665 4666 down_read(&dmar_global_lock); 4667 for_each_rmrr_units(rmrr) { 4668 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt, 4669 i, i_dev) { 4670 struct iommu_resv_region *resv; 4671 enum iommu_resv_type type; 4672 size_t length; 4673 4674 if (i_dev != device && 4675 !is_downstream_to_pci_bridge(device, i_dev)) 4676 continue; 4677 4678 length = rmrr->end_address - rmrr->base_address + 1; 4679 4680 type = device_rmrr_is_relaxable(device) ? 
IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT; 4682 4683 resv = iommu_alloc_resv_region(rmrr->base_address, 4684 length, prot, type); 4685 if (!resv) 4686 break; 4687 4688 list_add_tail(&resv->list, head); 4689 } 4690 } 4691 up_read(&dmar_global_lock); 4692 4693 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA 4694 if (dev_is_pci(device)) { 4695 struct pci_dev *pdev = to_pci_dev(device); 4696 4697 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) { 4698 reg = iommu_alloc_resv_region(0, 1UL << 24, prot, 4699 IOMMU_RESV_DIRECT_RELAXABLE); 4700 if (reg) 4701 list_add_tail(&reg->list, head); 4702 } 4703 } 4704 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */ 4705 4706 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START, 4707 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1, 4708 0, IOMMU_RESV_MSI); 4709 if (!reg) 4710 return; 4711 list_add_tail(&reg->list, head); 4712 } 4713 4714 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev) 4715 { 4716 struct device_domain_info *info = dev_iommu_priv_get(dev); 4717 struct context_entry *context; 4718 struct dmar_domain *domain; 4719 unsigned long flags; 4720 u64 ctx_lo; 4721 int ret; 4722 4723 domain = info->domain; 4724 if (!domain) 4725 return -EINVAL; 4726 4727 spin_lock_irqsave(&device_domain_lock, flags); 4728 spin_lock(&iommu->lock); 4729 4730 ret = -EINVAL; 4731 if (!info->pasid_supported) 4732 goto out; 4733 4734 context = iommu_context_addr(iommu, info->bus, info->devfn, 0); 4735 if (WARN_ON(!context)) 4736 goto out; 4737 4738 ctx_lo = context[0].lo; 4739 4740 if (!(ctx_lo & CONTEXT_PASIDE)) { 4741 ctx_lo |= CONTEXT_PASIDE; 4742 context[0].lo = ctx_lo; 4743 wmb(); 4744 iommu->flush.flush_context(iommu, 4745 domain->iommu_did[iommu->seq_id], 4746 PCI_DEVID(info->bus, info->devfn), 4747 DMA_CCMD_MASK_NOBIT, 4748 DMA_CCMD_DEVICE_INVL); 4749 } 4750 4751 /* Enable PASID support in the device, if it wasn't already */ 4752 if (!info->pasid_enabled) 4753 iommu_enable_dev_iotlb(info); 4754 4755 ret = 0; 4756 4757 out: 4758 spin_unlock(&iommu->lock); 4759 spin_unlock_irqrestore(&device_domain_lock, flags); 4760 4761 return ret; 4762 } 4763 4764 static struct iommu_group *intel_iommu_device_group(struct device *dev) 4765 { 4766 if (dev_is_pci(dev)) 4767 return pci_device_group(dev); 4768 return generic_device_group(dev); 4769 } 4770 4771 static int intel_iommu_enable_sva(struct device *dev) 4772 { 4773 struct device_domain_info *info = dev_iommu_priv_get(dev); 4774 struct intel_iommu *iommu; 4775 int ret; 4776 4777 if (!info || dmar_disabled) 4778 return -EINVAL; 4779 4780 iommu = info->iommu; 4781 if (!iommu) 4782 return -EINVAL; 4783 4784 if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE)) 4785 return -ENODEV; 4786 4787 if (intel_iommu_enable_pasid(iommu, dev)) 4788 return -ENODEV; 4789 4790 if (!info->pasid_enabled || !info->pri_enabled || !info->ats_enabled) 4791 return -EINVAL; 4792 4793 ret = iopf_queue_add_device(iommu->iopf_queue, dev); 4794 if (!ret) 4795 ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev); 4796 4797 return ret; 4798 } 4799 4800 static int intel_iommu_disable_sva(struct device *dev) 4801 { 4802 struct device_domain_info *info = dev_iommu_priv_get(dev); 4803 struct intel_iommu *iommu = info->iommu; 4804 int ret; 4805 4806 ret = iommu_unregister_device_fault_handler(dev); 4807 if (!ret) 4808 ret = iopf_queue_remove_device(iommu->iopf_queue, dev); 4809 4810 return ret; 4811 } 4812 4813 static int intel_iommu_enable_iopf(struct device *dev) 4814 { 4815 struct device_domain_info *info = dev_iommu_priv_get(dev); 4816 4817 if (info
&& info->pri_supported) 4818 return 0; 4819 4820 return -ENODEV; 4821 } 4822 4823 static int 4824 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat) 4825 { 4826 switch (feat) { 4827 case IOMMU_DEV_FEAT_IOPF: 4828 return intel_iommu_enable_iopf(dev); 4829 4830 case IOMMU_DEV_FEAT_SVA: 4831 return intel_iommu_enable_sva(dev); 4832 4833 default: 4834 return -ENODEV; 4835 } 4836 } 4837 4838 static int 4839 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat) 4840 { 4841 switch (feat) { 4842 case IOMMU_DEV_FEAT_IOPF: 4843 return 0; 4844 4845 case IOMMU_DEV_FEAT_SVA: 4846 return intel_iommu_disable_sva(dev); 4847 4848 default: 4849 return -ENODEV; 4850 } 4851 } 4852 4853 static bool intel_iommu_is_attach_deferred(struct device *dev) 4854 { 4855 struct device_domain_info *info = dev_iommu_priv_get(dev); 4856 4857 return translation_pre_enabled(info->iommu) && !info->domain; 4858 } 4859 4860 /* 4861 * Check that the device does not live on an external facing PCI port that is 4862 * marked as untrusted. Such devices should not be able to apply quirks and 4863 * thus not be able to bypass the IOMMU restrictions. 4864 */ 4865 static bool risky_device(struct pci_dev *pdev) 4866 { 4867 if (pdev->untrusted) { 4868 pci_info(pdev, 4869 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n", 4870 pdev->vendor, pdev->device); 4871 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n"); 4872 return true; 4873 } 4874 return false; 4875 } 4876 4877 static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain, 4878 unsigned long iova, size_t size) 4879 { 4880 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4881 unsigned long pages = aligned_nrpages(iova, size); 4882 unsigned long pfn = iova >> VTD_PAGE_SHIFT; 4883 struct intel_iommu *iommu; 4884 int iommu_id; 4885 4886 for_each_domain_iommu(iommu_id, dmar_domain) { 4887 iommu = g_iommus[iommu_id]; 4888 __mapping_notify_one(iommu, dmar_domain, pfn, pages); 4889 } 4890 } 4891 4892 const struct iommu_ops intel_iommu_ops = { 4893 .capable = intel_iommu_capable, 4894 .domain_alloc = intel_iommu_domain_alloc, 4895 .probe_device = intel_iommu_probe_device, 4896 .probe_finalize = intel_iommu_probe_finalize, 4897 .release_device = intel_iommu_release_device, 4898 .get_resv_regions = intel_iommu_get_resv_regions, 4899 .put_resv_regions = generic_iommu_put_resv_regions, 4900 .device_group = intel_iommu_device_group, 4901 .dev_enable_feat = intel_iommu_dev_enable_feat, 4902 .dev_disable_feat = intel_iommu_dev_disable_feat, 4903 .is_attach_deferred = intel_iommu_is_attach_deferred, 4904 .def_domain_type = device_def_domain_type, 4905 .pgsize_bitmap = SZ_4K, 4906 #ifdef CONFIG_INTEL_IOMMU_SVM 4907 .sva_bind = intel_svm_bind, 4908 .sva_unbind = intel_svm_unbind, 4909 .sva_get_pasid = intel_svm_get_pasid, 4910 .page_response = intel_svm_page_response, 4911 #endif 4912 .default_domain_ops = &(const struct iommu_domain_ops) { 4913 .attach_dev = intel_iommu_attach_device, 4914 .detach_dev = intel_iommu_detach_device, 4915 .map_pages = intel_iommu_map_pages, 4916 .unmap_pages = intel_iommu_unmap_pages, 4917 .iotlb_sync_map = intel_iommu_iotlb_sync_map, 4918 .flush_iotlb_all = intel_flush_iotlb_all, 4919 .iotlb_sync = intel_iommu_tlb_sync, 4920 .iova_to_phys = intel_iommu_iova_to_phys, 4921 .free = intel_iommu_domain_free, 4922 } 4923 }; 4924 4925 static void quirk_iommu_igfx(struct pci_dev *dev) 4926 { 4927 if (risky_device(dev)) 4928 return; 4929 4930 pci_info(dev, "Disabling 
IOMMU for graphics on this chipset\n"); 4931 dmar_map_gfx = 0; 4932 } 4933 4934 /* G4x/GM45 integrated gfx dmar support is totally busted. */ 4935 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx); 4936 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx); 4937 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx); 4938 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx); 4939 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx); 4940 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx); 4941 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx); 4942 4943 /* Broadwell igfx malfunctions with dmar */ 4944 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx); 4945 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx); 4946 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx); 4947 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx); 4948 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx); 4949 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx); 4950 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx); 4951 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx); 4952 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx); 4953 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx); 4954 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx); 4955 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx); 4956 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx); 4957 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx); 4958 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx); 4959 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx); 4960 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx); 4961 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx); 4962 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx); 4963 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx); 4964 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx); 4965 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx); 4966 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx); 4967 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx); 4968 4969 static void quirk_iommu_rwbf(struct pci_dev *dev) 4970 { 4971 if (risky_device(dev)) 4972 return; 4973 4974 /* 4975 * Mobile 4 Series Chipset neglects to set RWBF capability, 4976 * but needs it. Same seems to hold for the desktop versions. 
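 * Setting rwbf_quirk makes iommu_flush_write_buffer() behave as if
 * cap_rwbf() were set, so updates to the in-memory remapping
 * structures are explicitly flushed out to the hardware on these
 * chipsets.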
4977 */ 4978 pci_info(dev, "Forcing write-buffer flush capability\n"); 4979 rwbf_quirk = 1; 4980 } 4981 4982 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf); 4983 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf); 4984 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf); 4985 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf); 4986 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf); 4987 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf); 4988 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf); 4989 4990 #define GGC 0x52 4991 #define GGC_MEMORY_SIZE_MASK (0xf << 8) 4992 #define GGC_MEMORY_SIZE_NONE (0x0 << 8) 4993 #define GGC_MEMORY_SIZE_1M (0x1 << 8) 4994 #define GGC_MEMORY_SIZE_2M (0x3 << 8) 4995 #define GGC_MEMORY_VT_ENABLED (0x8 << 8) 4996 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8) 4997 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8) 4998 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8) 4999 5000 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev) 5001 { 5002 unsigned short ggc; 5003 5004 if (risky_device(dev)) 5005 return; 5006 5007 if (pci_read_config_word(dev, GGC, &ggc)) 5008 return; 5009 5010 if (!(ggc & GGC_MEMORY_VT_ENABLED)) { 5011 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n"); 5012 dmar_map_gfx = 0; 5013 } else if (dmar_map_gfx) { 5014 /* we have to ensure the gfx device is idle before we flush */ 5015 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n"); 5016 iommu_set_dma_strict(); 5017 } 5018 } 5019 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt); 5020 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt); 5021 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt); 5022 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt); 5023 5024 static void quirk_igfx_skip_te_disable(struct pci_dev *dev) 5025 { 5026 unsigned short ver; 5027 5028 if (!IS_GFX_DEVICE(dev)) 5029 return; 5030 5031 ver = (dev->device >> 8) & 0xff; 5032 if (ver != 0x45 && ver != 0x46 && ver != 0x4c && 5033 ver != 0x4e && ver != 0x8a && ver != 0x98 && 5034 ver != 0x9a) 5035 return; 5036 5037 if (risky_device(dev)) 5038 return; 5039 5040 pci_info(dev, "Skip IOMMU disabling for graphics\n"); 5041 iommu_skip_te_disable = 1; 5042 } 5043 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable); 5044 5045 /* On Tylersburg chipsets, some BIOSes have been known to enable the 5046 ISOCH DMAR unit for the Azalia sound device, but not give it any 5047 TLB entries, which causes it to deadlock. Check for that. We do 5048 this in a function called from init_dmars(), instead of in a PCI 5049 quirk, because we don't want to print the obnoxious "BIOS broken" 5050 message if VT-d is actually disabled. 5051 */ 5052 static void __init check_tylersburg_isoch(void) 5053 { 5054 struct pci_dev *pdev; 5055 uint32_t vtisochctrl; 5056 5057 /* If there's no Azalia in the system anyway, forget it. */ 5058 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL); 5059 if (!pdev) 5060 return; 5061 5062 if (risky_device(pdev)) { 5063 pci_dev_put(pdev); 5064 return; 5065 } 5066 5067 pci_dev_put(pdev); 5068 5069 /* System Management Registers. Might be hidden, in which case 5070 we can't do the sanity check. But that's OK, because the 5071 known-broken BIOSes _don't_ actually hide it, so far. 
*/ 5072 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL); 5073 if (!pdev) 5074 return; 5075 5076 if (risky_device(pdev)) { 5077 pci_dev_put(pdev); 5078 return; 5079 } 5080 5081 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) { 5082 pci_dev_put(pdev); 5083 return; 5084 } 5085 5086 pci_dev_put(pdev); 5087 5088 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */ 5089 if (vtisochctrl & 1) 5090 return; 5091 5092 /* Drop all bits other than the number of TLB entries */ 5093 vtisochctrl &= 0x1c; 5094 5095 /* If we have the recommended number of TLB entries (16), fine. */ 5096 if (vtisochctrl == 0x10) 5097 return; 5098 5099 /* Zero TLB entries? You get to ride the short bus to school. */ 5100 if (!vtisochctrl) { 5101 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n" 5102 "BIOS vendor: %s; Ver: %s; Product Version: %s\n", 5103 dmi_get_system_info(DMI_BIOS_VENDOR), 5104 dmi_get_system_info(DMI_BIOS_VERSION), 5105 dmi_get_system_info(DMI_PRODUCT_VERSION)); 5106 iommu_identity_mapping |= IDENTMAP_AZALIA; 5107 return; 5108 } 5109 5110 pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n", 5111 vtisochctrl); 5112 } 5113