// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright © 2006-2014 Intel Corporation.
 *
 * Authors: David Woodhouse <dwmw2@infradead.org>,
 *          Ashok Raj <ashok.raj@intel.com>,
 *          Shaohua Li <shaohua.li@intel.com>,
 *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
 *          Fenghua Yu <fenghua.yu@intel.com>
 *          Joerg Roedel <jroedel@suse.de>
 */

#define pr_fmt(fmt)	"DMAR: " fmt
#define dev_fmt(fmt)	pr_fmt(fmt)

#include <linux/init.h>
#include <linux/bitmap.h>
#include <linux/debugfs.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/irq.h>
#include <linux/interrupt.h>
#include <linux/spinlock.h>
#include <linux/pci.h>
#include <linux/dmar.h>
#include <linux/dma-map-ops.h>
#include <linux/mempool.h>
#include <linux/memory.h>
#include <linux/cpu.h>
#include <linux/timer.h>
#include <linux/io.h>
#include <linux/iova.h>
#include <linux/iommu.h>
#include <linux/dma-iommu.h>
#include <linux/intel-iommu.h>
#include <linux/intel-svm.h>
#include <linux/syscore_ops.h>
#include <linux/tboot.h>
#include <linux/dmi.h>
#include <linux/pci-ats.h>
#include <linux/memblock.h>
#include <linux/dma-direct.h>
#include <linux/crash_dump.h>
#include <linux/numa.h>
#include <asm/irq_remapping.h>
#include <asm/cacheflush.h>
#include <asm/iommu.h>

#include "../irq_remapping.h"
#include "../iommu-sva-lib.h"
#include "pasid.h"
#include "cap_audit.h"

#define ROOT_SIZE		VTD_PAGE_SIZE
#define CONTEXT_SIZE		VTD_PAGE_SIZE

#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
#define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)

#define IOAPIC_RANGE_START	(0xfee00000)
#define IOAPIC_RANGE_END	(0xfeefffff)
#define IOVA_START_ADDR		(0x1000)

#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57

#define MAX_AGAW_WIDTH 64
#define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)

#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)

/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
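/* Illustrative example (added, not in the original source): with gaw == 48
   and 4KiB VT-d pages (VTD_PAGE_SHIFT == 12), __DOMAIN_MAX_PFN(48) is
   2^36 - 1 and __DOMAIN_MAX_ADDR(48) is 2^48 - 1. On 64-bit both fit in an
   unsigned long, so the min_t() below only matters for 32-bit builds. */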
#define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
#define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)

/* IO virtual address start page frame number */
#define IOVA_START_PFN		(1)

#define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)

/* page table handling */
#define LEVEL_STRIDE		(9)
#define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)

static inline int agaw_to_level(int agaw)
{
	return agaw + 2;
}

static inline int agaw_to_width(int agaw)
{
	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
}

static inline int width_to_agaw(int width)
{
	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
}

static inline unsigned int level_to_offset_bits(int level)
{
	return (level - 1) * LEVEL_STRIDE;
}

static inline int pfn_level_offset(u64 pfn, int level)
{
	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
}

static inline u64 level_mask(int level)
{
	return -1ULL << level_to_offset_bits(level);
}

static inline u64 level_size(int level)
{
	return 1ULL << level_to_offset_bits(level);
}

static inline u64 align_to_level(u64 pfn, int level)
{
	return (pfn + level_size(level) - 1) & level_mask(level);
}

static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
{
	return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
}

/* VT-d pages must always be _smaller_ than MM pages. Otherwise things
   are never going to work. */
static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
{
	return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
}

static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
{
	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
}

static inline unsigned long page_to_dma_pfn(struct page *pg)
{
	return mm_to_dma_pfn(page_to_pfn(pg));
}

static inline unsigned long virt_to_dma_pfn(void *p)
{
	return page_to_dma_pfn(virt_to_page(p));
}

/* global iommu list, set NULL for ignored DMAR units */
static struct intel_iommu **g_iommus;

static void __init check_tylersburg_isoch(void);
static int rwbf_quirk;

/*
 * set to 1 to panic kernel if can't successfully enable VT-d
 * (used when kernel is launched w/ TXT)
 */
static int force_on = 0;
static int intel_iommu_tboot_noforce;
static int no_platform_optin;

#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))

/*
 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
 * if marked present.
 */
static phys_addr_t root_entry_lctp(struct root_entry *re)
{
	if (!(re->lo & 1))
		return 0;

	return re->lo & VTD_PAGE_MASK;
}

/*
 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
 * if marked present.
 */
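/*
 * Added note (not in the original source): in scalable mode a root entry is
 * split in two halves -- the low 64 bits point to the context table used for
 * devfns 0x00-0x7f and the high 64 bits to the one used for devfns 0x80-0xff
 * (see iommu_context_addr() below), which is why both an LCTP and a UCTP
 * accessor exist.
 */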
static phys_addr_t root_entry_uctp(struct root_entry *re)
{
	if (!(re->hi & 1))
		return 0;

	return re->hi & VTD_PAGE_MASK;
}

static inline void context_clear_pasid_enable(struct context_entry *context)
{
	context->lo &= ~(1ULL << 11);
}

static inline bool context_pasid_enabled(struct context_entry *context)
{
	return !!(context->lo & (1ULL << 11));
}

static inline void context_set_copied(struct context_entry *context)
{
	context->hi |= (1ull << 3);
}

static inline bool context_copied(struct context_entry *context)
{
	return !!(context->hi & (1ULL << 3));
}

static inline bool __context_present(struct context_entry *context)
{
	return (context->lo & 1);
}

bool context_present(struct context_entry *context)
{
	return context_pasid_enabled(context) ?
	     __context_present(context) :
	     __context_present(context) && !context_copied(context);
}

static inline void context_set_present(struct context_entry *context)
{
	context->lo |= 1;
}

static inline void context_set_fault_enable(struct context_entry *context)
{
	context->lo &= (((u64)-1) << 2) | 1;
}

static inline void context_set_translation_type(struct context_entry *context,
						unsigned long value)
{
	context->lo &= (((u64)-1) << 4) | 3;
	context->lo |= (value & 3) << 2;
}

static inline void context_set_address_root(struct context_entry *context,
					    unsigned long value)
{
	context->lo &= ~VTD_PAGE_MASK;
	context->lo |= value & VTD_PAGE_MASK;
}

static inline void context_set_address_width(struct context_entry *context,
					     unsigned long value)
{
	context->hi |= value & 7;
}

static inline void context_set_domain_id(struct context_entry *context,
					 unsigned long value)
{
	context->hi |= (value & ((1 << 16) - 1)) << 8;
}

static inline int context_domain_id(struct context_entry *c)
{
	return((c->hi >> 8) & 0xffff);
}

static inline void context_clear_entry(struct context_entry *context)
{
	context->lo = 0;
	context->hi = 0;
}

/*
 * This domain is a statically identity mapping domain.
 *	1. This domain creates a static 1:1 mapping to all usable memory.
 *	2. It maps to each iommu if successful.
 *	3. Each iommu maps to this domain if successful.
 */
static struct dmar_domain *si_domain;
static int hw_pass_through = 1;

#define for_each_domain_iommu(idx, domain)			\
	for (idx = 0; idx < g_num_of_iommus; idx++)		\
		if (domain->iommu_refcnt[idx])

struct dmar_rmrr_unit {
	struct list_head list;		/* list of rmrr units	*/
	struct acpi_dmar_header *hdr;	/* ACPI header		*/
	u64	base_address;		/* reserved base address*/
	u64	end_address;		/* reserved end address */
	struct dmar_dev_scope *devices;	/* target devices */
	int	devices_cnt;		/* target device count */
};

struct dmar_atsr_unit {
	struct list_head list;		/* list of ATSR units */
	struct acpi_dmar_header *hdr;	/* ACPI header */
	struct dmar_dev_scope *devices;	/* target devices */
	int devices_cnt;		/* target device count */
	u8 include_all:1;		/* include all ports */
};

struct dmar_satc_unit {
	struct list_head list;		/* list of SATC units */
	struct acpi_dmar_header *hdr;	/* ACPI header */
	struct dmar_dev_scope *devices;	/* target devices */
	struct intel_iommu *iommu;	/* the corresponding iommu */
	int devices_cnt;		/* target device count */
	u8 atc_required:1;		/* ATS is required */
};

static LIST_HEAD(dmar_atsr_units);
static LIST_HEAD(dmar_rmrr_units);
static LIST_HEAD(dmar_satc_units);

#define for_each_rmrr_units(rmrr) \
	list_for_each_entry(rmrr, &dmar_rmrr_units, list)

/* bitmap for indexing intel_iommus */
static int g_num_of_iommus;

static void domain_exit(struct dmar_domain *domain);
static void domain_remove_dev_info(struct dmar_domain *domain);
static void dmar_remove_one_dev_info(struct device *dev);
static void __dmar_remove_one_dev_info(struct device_domain_info *info);
static int intel_iommu_attach_device(struct iommu_domain *domain,
				     struct device *dev);
static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
					    dma_addr_t iova);

int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);

int intel_iommu_enabled = 0;
EXPORT_SYMBOL_GPL(intel_iommu_enabled);

static int dmar_map_gfx = 1;
static int intel_iommu_superpage = 1;
static int iommu_identity_mapping;
static int iommu_skip_te_disable;

#define IDENTMAP_GFX		2
#define IDENTMAP_AZALIA		4

int intel_iommu_gfx_mapped;
EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);

#define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
struct device_domain_info *get_domain_info(struct device *dev)
{
	struct device_domain_info *info;

	if (!dev)
		return NULL;

	info = dev_iommu_priv_get(dev);
	if (unlikely(info == DEFER_DEVICE_DOMAIN_INFO))
		return NULL;

	return info;
}

DEFINE_SPINLOCK(device_domain_lock);
static LIST_HEAD(device_domain_list);

/*
 * Iterate over elements in device_domain_list and call the specified
 * callback @fn against each element.
 */
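/*
 * Illustrative usage sketch (hypothetical helper, not part of the original
 * source):
 *
 *	static int count_ats_devices(struct device_domain_info *info, void *data)
 *	{
 *		int *count = data;
 *
 *		if (info->ats_enabled)
 *			(*count)++;
 *		return 0;	// zero means "keep iterating"
 *	}
 *
 *	int count = 0;
 *	for_each_device_domain(count_ats_devices, &count);
 *
 * A non-zero return value from the callback stops the walk and is passed
 * back to the caller.
 */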
int for_each_device_domain(int (*fn)(struct device_domain_info *info,
				     void *data), void *data)
{
	int ret = 0;
	unsigned long flags;
	struct device_domain_info *info;

	spin_lock_irqsave(&device_domain_lock, flags);
	list_for_each_entry(info, &device_domain_list, global) {
		ret = fn(info, data);
		if (ret) {
			spin_unlock_irqrestore(&device_domain_lock, flags);
			return ret;
		}
	}
	spin_unlock_irqrestore(&device_domain_lock, flags);

	return 0;
}

const struct iommu_ops intel_iommu_ops;

static bool translation_pre_enabled(struct intel_iommu *iommu)
{
	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
}

static void clear_translation_pre_enabled(struct intel_iommu *iommu)
{
	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
}

static void init_translation_status(struct intel_iommu *iommu)
{
	u32 gsts;

	gsts = readl(iommu->reg + DMAR_GSTS_REG);
	if (gsts & DMA_GSTS_TES)
		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
}

static int __init intel_iommu_setup(char *str)
{
	if (!str)
		return -EINVAL;
	while (*str) {
		if (!strncmp(str, "on", 2)) {
			dmar_disabled = 0;
			pr_info("IOMMU enabled\n");
		} else if (!strncmp(str, "off", 3)) {
			dmar_disabled = 1;
			no_platform_optin = 1;
			pr_info("IOMMU disabled\n");
		} else if (!strncmp(str, "igfx_off", 8)) {
			dmar_map_gfx = 0;
			pr_info("Disable GFX device mapping\n");
		} else if (!strncmp(str, "forcedac", 8)) {
			pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
			iommu_dma_forcedac = true;
		} else if (!strncmp(str, "strict", 6)) {
			pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
			iommu_set_dma_strict();
		} else if (!strncmp(str, "sp_off", 6)) {
			pr_info("Disable supported super page\n");
			intel_iommu_superpage = 0;
		} else if (!strncmp(str, "sm_on", 5)) {
			pr_info("Enable scalable mode if hardware supports\n");
			intel_iommu_sm = 1;
		} else if (!strncmp(str, "sm_off", 6)) {
			pr_info("Scalable mode is disallowed\n");
			intel_iommu_sm = 0;
		} else if (!strncmp(str, "tboot_noforce", 13)) {
			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
			intel_iommu_tboot_noforce = 1;
		}

		str += strcspn(str, ",");
		while (*str == ',')
			str++;
	}
	return 0;
}
__setup("intel_iommu=", intel_iommu_setup);

static struct kmem_cache *iommu_domain_cache;
static struct kmem_cache *iommu_devinfo_cache;

static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
{
	struct dmar_domain **domains;
	int idx = did >> 8;

	domains = iommu->domains[idx];
	if (!domains)
		return NULL;

	return domains[did & 0xff];
}

static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
			     struct dmar_domain *domain)
{
	struct dmar_domain **domains;
	int idx = did >> 8;

	if (!iommu->domains[idx]) {
		size_t size = 256 * sizeof(struct dmar_domain *);
		iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
	}

	domains = iommu->domains[idx];
	if (WARN_ON(!domains))
		return;
	else
		domains[did & 0xff] = domain;
}

void *alloc_pgtable_page(int node)
{
	struct page *page;
	void *vaddr = NULL;

	page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
	if (page)
		vaddr = page_address(page);
	return vaddr;
}

void free_pgtable_page(void *vaddr)
{
	free_page((unsigned long)vaddr);
}

static inline void *alloc_domain_mem(void)
{
	return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
}

static void free_domain_mem(void *vaddr)
{
	kmem_cache_free(iommu_domain_cache, vaddr);
}

static inline void *alloc_devinfo_mem(void)
{
	return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
}

static inline void free_devinfo_mem(void *vaddr)
{
	kmem_cache_free(iommu_devinfo_cache, vaddr);
}

static inline int domain_type_is_si(struct dmar_domain *domain)
{
	return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
}

static inline bool domain_use_first_level(struct dmar_domain *domain)
{
	return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
}

static inline int domain_pfn_supported(struct dmar_domain *domain,
				       unsigned long pfn)
{
	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;

	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
}

static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
{
	unsigned long sagaw;
	int agaw;

	sagaw = cap_sagaw(iommu->cap);
	for (agaw = width_to_agaw(max_gaw);
	     agaw >= 0; agaw--) {
		if (test_bit(agaw, &sagaw))
			break;
	}

	return agaw;
}

/*
 * Calculate max SAGAW for each iommu.
 */
int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
{
	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
}

/*
 * Calculate agaw for each iommu.
 * "SAGAW" may be different across iommus; use a default agaw, and
 * fall back to a smaller supported agaw for iommus that don't support
 * the default agaw.
 */
int iommu_calculate_agaw(struct intel_iommu *iommu)
{
	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
}
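/*
 * Worked example (added for illustration): DEFAULT_DOMAIN_ADDRESS_WIDTH is
 * 57, so width_to_agaw(57) = DIV_ROUND_UP(57 - 30, 9) = 3, i.e. 5-level
 * paging. If SAGAW does not advertise that width, the loop above falls back
 * to agaw 2 (48 bits, 4-level) and then agaw 1 (39 bits, 3-level).
 */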
/* This function only returns a single iommu in a domain */
struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
{
	int iommu_id;

	/* si_domain and vm domain should not get here. */
	if (WARN_ON(!iommu_is_dma_domain(&domain->domain)))
		return NULL;

	for_each_domain_iommu(iommu_id, domain)
		break;

	if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
		return NULL;

	return g_iommus[iommu_id];
}

static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
{
	return sm_supported(iommu) ?
			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
}

static void domain_update_iommu_coherency(struct dmar_domain *domain)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	bool found = false;
	int i;

	domain->iommu_coherency = true;

	for_each_domain_iommu(i, domain) {
		found = true;
		if (!iommu_paging_structure_coherency(g_iommus[i])) {
			domain->iommu_coherency = false;
			break;
		}
	}
	if (found)
		return;

	/* No hardware attached; use lowest common denominator */
	rcu_read_lock();
	for_each_active_iommu(iommu, drhd) {
		if (!iommu_paging_structure_coherency(iommu)) {
			domain->iommu_coherency = false;
			break;
		}
	}
	rcu_read_unlock();
}

static bool domain_update_iommu_snooping(struct intel_iommu *skip)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	bool ret = true;

	rcu_read_lock();
	for_each_active_iommu(iommu, drhd) {
		if (iommu != skip) {
			/*
			 * If the hardware is operating in the scalable mode,
			 * the snooping control is always supported since we
			 * always set PASID-table-entry.PGSNP bit if the domain
			 * is managed outside (UNMANAGED).
			 */
			if (!sm_supported(iommu) &&
			    !ecap_sc_support(iommu->ecap)) {
				ret = false;
				break;
			}
		}
	}
	rcu_read_unlock();

	return ret;
}

static int domain_update_iommu_superpage(struct dmar_domain *domain,
					 struct intel_iommu *skip)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	int mask = 0x3;

	if (!intel_iommu_superpage)
		return 0;

	/* set iommu_superpage to the smallest common denominator */
	rcu_read_lock();
	for_each_active_iommu(iommu, drhd) {
		if (iommu != skip) {
			if (domain && domain_use_first_level(domain)) {
				if (!cap_fl1gp_support(iommu->cap))
					mask = 0x1;
			} else {
				mask &= cap_super_page_val(iommu->cap);
			}

			if (!mask)
				break;
		}
	}
	rcu_read_unlock();

	return fls(mask);
}

static int domain_update_device_node(struct dmar_domain *domain)
{
	struct device_domain_info *info;
	int nid = NUMA_NO_NODE;

	assert_spin_locked(&device_domain_lock);

	if (list_empty(&domain->devices))
		return NUMA_NO_NODE;

	list_for_each_entry(info, &domain->devices, link) {
		if (!info->dev)
			continue;

		/*
		 * There could possibly be multiple device numa nodes as devices
		 * within the same domain may sit behind different IOMMUs. There
		 * is no perfect answer in such a situation, so we pick the node
		 * of the first device we find (first come, first served).
		 */
		nid = dev_to_node(info->dev);
		if (nid != NUMA_NO_NODE)
			break;
	}

	return nid;
}

static void domain_update_iotlb(struct dmar_domain *domain);

/* Return the super pagesize bitmap if supported. */
static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
{
	unsigned long bitmap = 0;

	/*
	 * 1-level super page supports page size of 2MiB, 2-level super page
	 * supports page size of both 2MiB and 1GiB.
	 */
	if (domain->iommu_superpage == 1)
		bitmap |= SZ_2M;
	else if (domain->iommu_superpage == 2)
		bitmap |= SZ_2M | SZ_1G;

	return bitmap;
}

/* Some capabilities may be different across iommus */
static void domain_update_iommu_cap(struct dmar_domain *domain)
{
	domain_update_iommu_coherency(domain);
	domain->iommu_snooping = domain_update_iommu_snooping(NULL);
	domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);

	/*
	 * If RHSA is missing, we should default to the device numa domain
	 * as fall back.
	 */
	if (domain->nid == NUMA_NO_NODE)
		domain->nid = domain_update_device_node(domain);

	/*
	 * First-level translation restricts the input-address to a
	 * canonical address (i.e., address bits 63:N have the same
	 * value as address bit [N-1], where N is 48-bits with 4-level
	 * paging and 57-bits with 5-level paging). Hence, skip bit
	 * [N-1].
	 */
	if (domain_use_first_level(domain))
		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
	else
		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);

	domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
	domain_update_iotlb(domain);
}

struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
					 u8 devfn, int alloc)
{
	struct root_entry *root = &iommu->root_entry[bus];
	struct context_entry *context;
	u64 *entry;

	entry = &root->lo;
	if (sm_supported(iommu)) {
		if (devfn >= 0x80) {
			devfn -= 0x80;
			entry = &root->hi;
		}
		devfn *= 2;
	}
	if (*entry & 1)
		context = phys_to_virt(*entry & VTD_PAGE_MASK);
	else {
		unsigned long phy_addr;
		if (!alloc)
			return NULL;

		context = alloc_pgtable_page(iommu->node);
		if (!context)
			return NULL;

		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
		phy_addr = virt_to_phys((void *)context);
		*entry = phy_addr | 1;
		__iommu_flush_cache(iommu, entry, sizeof(*entry));
	}
	return &context[devfn];
}

static bool attach_deferred(struct device *dev)
{
	return dev_iommu_priv_get(dev) == DEFER_DEVICE_DOMAIN_INFO;
}

/**
 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
 *				 sub-hierarchy of a candidate PCI-PCI bridge
 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
 * @bridge: the candidate PCI-PCI bridge
 *
 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
 */
static bool
is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
{
	struct pci_dev *pdev, *pbridge;

	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
		return false;

	pdev = to_pci_dev(dev);
	pbridge = to_pci_dev(bridge);

	if (pbridge->subordinate &&
	    pbridge->subordinate->number <= pdev->bus->number &&
	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
		return true;

	return false;
}

static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
{
	struct dmar_drhd_unit *drhd;
	u32 vtbar;
	int rc;

	/* We know that this device on this chipset has its own IOMMU.
	 * If we find it under a different IOMMU, then the BIOS is lying
	 * to us. Hope that the IOMMU for this device is actually
	 * disabled, and it needs no translation...
	 */
	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
	if (rc) {
		/* "can't" happen */
		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
		return false;
	}
	vtbar &= 0xffff0000;

	/* we know that this iommu should be at offset 0xa000 from vtbar */
	drhd = dmar_find_matched_drhd_unit(pdev);
	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
		return true;
	}

	return false;
}

static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
{
	if (!iommu || iommu->drhd->ignored)
		return true;

	if (dev_is_pci(dev)) {
		struct pci_dev *pdev = to_pci_dev(dev);

		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
		    quirk_ioat_snb_local_iommu(pdev))
			return true;
	}

	return false;
}

struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
{
	struct dmar_drhd_unit *drhd = NULL;
	struct pci_dev *pdev = NULL;
	struct intel_iommu *iommu;
	struct device *tmp;
	u16 segment = 0;
	int i;

	if (!dev)
		return NULL;

	if (dev_is_pci(dev)) {
		struct pci_dev *pf_pdev;

		pdev = pci_real_dma_dev(to_pci_dev(dev));

		/* VFs aren't listed in scope tables; we need to look up
		 * the PF instead to find the IOMMU. */
		pf_pdev = pci_physfn(pdev);
		dev = &pf_pdev->dev;
		segment = pci_domain_nr(pdev->bus);
	} else if (has_acpi_companion(dev))
		dev = &ACPI_COMPANION(dev)->dev;

	rcu_read_lock();
	for_each_iommu(iommu, drhd) {
		if (pdev && segment != drhd->segment)
			continue;

		for_each_active_dev_scope(drhd->devices,
					  drhd->devices_cnt, i, tmp) {
			if (tmp == dev) {
				/* For a VF use its original BDF# not that of the PF
				 * which we used for the IOMMU lookup. Strictly speaking
				 * we could do this for all PCI devices; we only need to
				 * get the BDF# from the scope table for ACPI matches.
				 */
				if (pdev && pdev->is_virtfn)
					goto got_pdev;

				if (bus && devfn) {
					*bus = drhd->devices[i].bus;
					*devfn = drhd->devices[i].devfn;
				}
				goto out;
			}

			if (is_downstream_to_pci_bridge(dev, tmp))
				goto got_pdev;
		}

		if (pdev && drhd->include_all) {
got_pdev:
			if (bus && devfn) {
				*bus = pdev->bus->number;
				*devfn = pdev->devfn;
			}
			goto out;
		}
	}
	iommu = NULL;
out:
	if (iommu_is_dummy(iommu, dev))
		iommu = NULL;

	rcu_read_unlock();

	return iommu;
}

static void domain_flush_cache(struct dmar_domain *domain,
			       void *addr, int size)
{
	if (!domain->iommu_coherency)
		clflush_cache_range(addr, size);
}

static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
{
	struct context_entry *context;
	int ret = 0;
	unsigned long flags;

	spin_lock_irqsave(&iommu->lock, flags);
	context = iommu_context_addr(iommu, bus, devfn, 0);
	if (context)
		ret = context_present(context);
	spin_unlock_irqrestore(&iommu->lock, flags);
	return ret;
}

static void free_context_table(struct intel_iommu *iommu)
{
	int i;
	unsigned long flags;
	struct context_entry *context;

	spin_lock_irqsave(&iommu->lock, flags);
	if (!iommu->root_entry)
		goto out;

	for (i = 0; i < ROOT_ENTRY_NR; i++) {
		context = iommu_context_addr(iommu, i, 0, 0);
		if (context)
			free_pgtable_page(context);

		if (!sm_supported(iommu))
			continue;

		context = iommu_context_addr(iommu, i, 0x80, 0);
		if (context)
			free_pgtable_page(context);
	}
	free_pgtable_page(iommu->root_entry);
	iommu->root_entry = NULL;
out:
	spin_unlock_irqrestore(&iommu->lock, flags);
}

static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
				      unsigned long pfn, int *target_level)
{
	struct dma_pte *parent, *pte;
	int level = agaw_to_level(domain->agaw);
	int offset;

	BUG_ON(!domain->pgd);

	if (!domain_pfn_supported(domain, pfn))
		/* Address beyond IOMMU's addressing capabilities. */
		return NULL;

	parent = domain->pgd;

	while (1) {
		void *tmp_page;

		offset = pfn_level_offset(pfn, level);
		pte = &parent[offset];
		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
			break;
		if (level == *target_level)
			break;

		if (!dma_pte_present(pte)) {
			uint64_t pteval;

			tmp_page = alloc_pgtable_page(domain->nid);

			if (!tmp_page)
				return NULL;

			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
			if (domain_use_first_level(domain)) {
				pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
				if (iommu_is_dma_domain(&domain->domain))
					pteval |= DMA_FL_PTE_ACCESS;
			}
			if (cmpxchg64(&pte->val, 0ULL, pteval))
				/* Someone else set it while we were thinking; use theirs. */
				free_pgtable_page(tmp_page);
			else
				domain_flush_cache(domain, pte, sizeof(*pte));
		}
		if (level == 1)
			break;

		parent = phys_to_virt(dma_pte_addr(pte));
		level--;
	}

	if (!*target_level)
		*target_level = level;

	return pte;
}

/* return address's pte at specific level */
static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
					 unsigned long pfn,
					 int level, int *large_page)
{
	struct dma_pte *parent, *pte;
	int total = agaw_to_level(domain->agaw);
	int offset;

	parent = domain->pgd;
	while (level <= total) {
		offset = pfn_level_offset(pfn, total);
		pte = &parent[offset];
		if (level == total)
			return pte;

		if (!dma_pte_present(pte)) {
			*large_page = total;
			break;
		}

		if (dma_pte_superpage(pte)) {
			*large_page = total;
			return pte;
		}

		parent = phys_to_virt(dma_pte_addr(pte));
		total--;
	}
	return NULL;
}

/* clear last level pte; a tlb flush should follow */
static void dma_pte_clear_range(struct dmar_domain *domain,
				unsigned long start_pfn,
				unsigned long last_pfn)
{
	unsigned int large_page;
	struct dma_pte *first_pte, *pte;

	BUG_ON(!domain_pfn_supported(domain, start_pfn));
	BUG_ON(!domain_pfn_supported(domain, last_pfn));
	BUG_ON(start_pfn > last_pfn);

	/* we don't need lock here; nobody else touches the iova range */
	do {
		large_page = 1;
		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
		if (!pte) {
			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
			continue;
		}
		do {
			dma_clear_pte(pte);
			start_pfn += lvl_to_nr_pages(large_page);
			pte++;
		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));

		domain_flush_cache(domain, first_pte,
				   (void *)pte - (void *)first_pte);

	} while (start_pfn && start_pfn <= last_pfn);
}

static void dma_pte_free_level(struct dmar_domain *domain, int level,
			       int retain_level, struct dma_pte *pte,
			       unsigned long pfn, unsigned long start_pfn,
			       unsigned long last_pfn)
{
	pfn = max(start_pfn, pfn);
	pte = &pte[pfn_level_offset(pfn, level)];

	do {
		unsigned long level_pfn;
		struct dma_pte *level_pte;

		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
			goto next;

		level_pfn = pfn & level_mask(level);
		level_pte = phys_to_virt(dma_pte_addr(pte));

		if (level > 2) {
			dma_pte_free_level(domain, level - 1, retain_level,
					   level_pte, level_pfn, start_pfn,
					   last_pfn);
		}

		/*
		 * Free the page table if we're below the level we want to
		 * retain and the range covers the entire table.
		 */
		if (level < retain_level && !(start_pfn > level_pfn ||
		      last_pfn < level_pfn + level_size(level) - 1)) {
			dma_clear_pte(pte);
			domain_flush_cache(domain, pte, sizeof(*pte));
			free_pgtable_page(level_pte);
		}
next:
		pfn += level_size(level);
	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
}

/*
 * clear last level (leaf) ptes and free page table pages below the
 * level we wish to keep intact.
 */
static void dma_pte_free_pagetable(struct dmar_domain *domain,
				   unsigned long start_pfn,
				   unsigned long last_pfn,
				   int retain_level)
{
	BUG_ON(!domain_pfn_supported(domain, start_pfn));
	BUG_ON(!domain_pfn_supported(domain, last_pfn));
	BUG_ON(start_pfn > last_pfn);

	dma_pte_clear_range(domain, start_pfn, last_pfn);

	/* We don't need lock here; nobody else touches the iova range */
	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
			   domain->pgd, 0, start_pfn, last_pfn);

	/* free pgd */
	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
		free_pgtable_page(domain->pgd);
		domain->pgd = NULL;
	}
}

/* When a page at a given level is being unlinked from its parent, we don't
   need to *modify* it at all. All we need to do is make a list of all the
   pages which can be freed just as soon as we've flushed the IOTLB and we
   know the hardware page-walk will no longer touch them.
   The 'pte' argument is the *parent* PTE, pointing to the page that is to
   be freed. */
static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
					    int level, struct dma_pte *pte,
					    struct page *freelist)
{
	struct page *pg;

	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
	pg->freelist = freelist;
	freelist = pg;

	if (level == 1)
		return freelist;

	pte = page_address(pg);
	do {
		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
			freelist = dma_pte_list_pagetables(domain, level - 1,
							   pte, freelist);
		pte++;
	} while (!first_pte_in_page(pte));

	return freelist;
}

static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
					struct dma_pte *pte, unsigned long pfn,
					unsigned long start_pfn,
					unsigned long last_pfn,
					struct page *freelist)
{
	struct dma_pte *first_pte = NULL, *last_pte = NULL;

	pfn = max(start_pfn, pfn);
	pte = &pte[pfn_level_offset(pfn, level)];

	do {
		unsigned long level_pfn;

		if (!dma_pte_present(pte))
			goto next;

		level_pfn = pfn & level_mask(level);

		/* If range covers entire pagetable, free it */
		if (start_pfn <= level_pfn &&
		    last_pfn >= level_pfn + level_size(level) - 1) {
			/* These subordinate page tables are going away entirely. Don't
			   bother to clear them; we're just going to *free* them. */
			if (level > 1 && !dma_pte_superpage(pte))
				freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);

			dma_clear_pte(pte);
			if (!first_pte)
				first_pte = pte;
			last_pte = pte;
		} else if (level > 1) {
			/* Recurse down into a level that isn't *entirely* obsolete */
			freelist = dma_pte_clear_level(domain, level - 1,
						       phys_to_virt(dma_pte_addr(pte)),
						       level_pfn, start_pfn, last_pfn,
						       freelist);
		}
next:
		pfn += level_size(level);
	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);

	if (first_pte)
		domain_flush_cache(domain, first_pte,
				   (void *)++last_pte - (void *)first_pte);

	return freelist;
}

/* We can't just free the pages because the IOMMU may still be walking
   the page tables, and may have cached the intermediate levels. The
   pages can only be freed after the IOTLB flush has been done. */
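/*
 * Added note (not in the original source): domain_unmap() below returns the
 * page-table pages it has unlinked as a chain threaded through
 * page->freelist. The caller is expected to flush the IOTLB first and only
 * then hand the chain to dma_free_pagelist(), so the hardware can never walk
 * a page that has already been freed.
 */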
static struct page *domain_unmap(struct dmar_domain *domain,
				 unsigned long start_pfn,
				 unsigned long last_pfn,
				 struct page *freelist)
{
	BUG_ON(!domain_pfn_supported(domain, start_pfn));
	BUG_ON(!domain_pfn_supported(domain, last_pfn));
	BUG_ON(start_pfn > last_pfn);

	/* we don't need lock here; nobody else touches the iova range */
	freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
				       domain->pgd, 0, start_pfn, last_pfn,
				       freelist);

	/* free pgd */
	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
		struct page *pgd_page = virt_to_page(domain->pgd);
		pgd_page->freelist = freelist;
		freelist = pgd_page;

		domain->pgd = NULL;
	}

	return freelist;
}

static void dma_free_pagelist(struct page *freelist)
{
	struct page *pg;

	while ((pg = freelist)) {
		freelist = pg->freelist;
		free_pgtable_page(page_address(pg));
	}
}

/* iommu handling */
static int iommu_alloc_root_entry(struct intel_iommu *iommu)
{
	struct root_entry *root;
	unsigned long flags;

	root = (struct root_entry *)alloc_pgtable_page(iommu->node);
	if (!root) {
		pr_err("Allocating root entry for %s failed\n",
			iommu->name);
		return -ENOMEM;
	}

	__iommu_flush_cache(iommu, root, ROOT_SIZE);

	spin_lock_irqsave(&iommu->lock, flags);
	iommu->root_entry = root;
	spin_unlock_irqrestore(&iommu->lock, flags);

	return 0;
}

static void iommu_set_root_entry(struct intel_iommu *iommu)
{
	u64 addr;
	u32 sts;
	unsigned long flag;

	addr = virt_to_phys(iommu->root_entry);
	if (sm_supported(iommu))
		addr |= DMA_RTADDR_SMT;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);

	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (sts & DMA_GSTS_RTPS), sts);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);

	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
	if (sm_supported(iommu))
		qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
}

void iommu_flush_write_buffer(struct intel_iommu *iommu)
{
	u32 val;
	unsigned long flag;

	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
		return;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (!(val & DMA_GSTS_WBFS)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}

/* return value determines if we need a write buffer flush */
static void __iommu_flush_context(struct intel_iommu *iommu,
				  u16 did, u16 source_id, u8 function_mask,
				  u64 type)
{
	u64 val = 0;
	unsigned long flag;

	switch (type) {
	case DMA_CCMD_GLOBAL_INVL:
		val = DMA_CCMD_GLOBAL_INVL;
		break;
	case DMA_CCMD_DOMAIN_INVL:
		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
		break;
	case DMA_CCMD_DEVICE_INVL:
		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
		break;
	default:
		BUG();
	}
	val |= DMA_CCMD_ICC;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
		      dmar_readq, (!(val & DMA_CCMD_ICC)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}

/* return value determines if we need a write buffer flush */
static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
				u64 addr, unsigned int size_order, u64 type)
{
	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
	u64 val = 0, val_iva = 0;
	unsigned long flag;

	switch (type) {
	case DMA_TLB_GLOBAL_FLUSH:
		/* global flush doesn't need set IVA_REG */
		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
		break;
	case DMA_TLB_DSI_FLUSH:
		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
		break;
	case DMA_TLB_PSI_FLUSH:
		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
		/* IH bit is passed in as part of address */
		val_iva = size_order | addr;
		break;
	default:
		BUG();
	}
	/* Note: set drain read/write */
#if 0
	/*
	 * This is probably meant to be extra safe. Looks like we can
	 * ignore it without any impact.
	 */
	if (cap_read_drain(iommu->cap))
		val |= DMA_TLB_READ_DRAIN;
#endif
	if (cap_write_drain(iommu->cap))
		val |= DMA_TLB_WRITE_DRAIN;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	/* Note: Only uses first TLB reg currently */
	if (val_iva)
		dmar_writeq(iommu->reg + tlb_offset, val_iva);
	dmar_writeq(iommu->reg + tlb_offset + 8, val);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
		      dmar_readq, (!(val & DMA_TLB_IVT)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);

	/* check IOTLB invalidation granularity */
	if (DMA_TLB_IAIG(val) == 0)
		pr_err("Flush IOTLB failed\n");
	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
		pr_debug("TLB flush request %Lx, actual %Lx\n",
			(unsigned long long)DMA_TLB_IIRG(type),
			(unsigned long long)DMA_TLB_IAIG(val));
}

static struct device_domain_info *
iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
			u8 bus, u8 devfn)
{
	struct device_domain_info *info;

	assert_spin_locked(&device_domain_lock);

	if (!iommu->qi)
		return NULL;

	list_for_each_entry(info, &domain->devices, link)
		if (info->iommu == iommu && info->bus == bus &&
		    info->devfn == devfn) {
			if (info->ats_supported && info->dev)
				return info;
			break;
		}

	return NULL;
}

static void domain_update_iotlb(struct dmar_domain *domain)
{
	struct device_domain_info *info;
	bool has_iotlb_device = false;

	assert_spin_locked(&device_domain_lock);

	list_for_each_entry(info, &domain->devices, link)
		if (info->ats_enabled) {
			has_iotlb_device = true;
			break;
		}

	if (!has_iotlb_device) {
		struct subdev_domain_info *sinfo;

		list_for_each_entry(sinfo, &domain->subdevices, link_domain) {
			info = get_domain_info(sinfo->pdev);
			if (info && info->ats_enabled) {
				has_iotlb_device = true;
				break;
			}
		}
	}

	domain->has_iotlb_device = has_iotlb_device;
}
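/*
 * Added note (not in the original source): the enable sequence below is
 * ordered PASID, then PRI, then ATS, following the PCIe requirement quoted
 * in the comment inside the function that PASID must be enabled before ATS;
 * iommu_disable_dev_iotlb() further down reverses that order.
 */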
static void iommu_enable_dev_iotlb(struct device_domain_info *info)
{
	struct pci_dev *pdev;

	assert_spin_locked(&device_domain_lock);

	if (!info || !dev_is_pci(info->dev))
		return;

	pdev = to_pci_dev(info->dev);
	/* For IOMMU that supports device IOTLB throttling (DIT), we assign
	 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
	 * queue depth at PF level. If DIT is not set, PFSID will be treated as
	 * reserved, which should be set to 0.
	 */
	if (!ecap_dit(info->iommu->ecap))
		info->pfsid = 0;
	else {
		struct pci_dev *pf_pdev;

		/* pdev will be returned if device is not a vf */
		pf_pdev = pci_physfn(pdev);
		info->pfsid = pci_dev_id(pf_pdev);
	}

#ifdef CONFIG_INTEL_IOMMU_SVM
	/* The PCIe spec, in its wisdom, declares that the behaviour of
	   the device if you enable PASID support after ATS support is
	   undefined. So always enable PASID support on devices which
	   have it, even if we can't yet know if we're ever going to
	   use it. */
	if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
		info->pasid_enabled = 1;

	if (info->pri_supported &&
	    (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1) &&
	    !pci_reset_pri(pdev) && !pci_enable_pri(pdev, PRQ_DEPTH))
		info->pri_enabled = 1;
#endif
	if (info->ats_supported && pci_ats_page_aligned(pdev) &&
	    !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
		info->ats_enabled = 1;
		domain_update_iotlb(info->domain);
		info->ats_qdep = pci_ats_queue_depth(pdev);
	}
}

static void iommu_disable_dev_iotlb(struct device_domain_info *info)
{
	struct pci_dev *pdev;

	assert_spin_locked(&device_domain_lock);

	if (!dev_is_pci(info->dev))
		return;

	pdev = to_pci_dev(info->dev);

	if (info->ats_enabled) {
		pci_disable_ats(pdev);
		info->ats_enabled = 0;
		domain_update_iotlb(info->domain);
	}
#ifdef CONFIG_INTEL_IOMMU_SVM
	if (info->pri_enabled) {
		pci_disable_pri(pdev);
		info->pri_enabled = 0;
	}
	if (info->pasid_enabled) {
		pci_disable_pasid(pdev);
		info->pasid_enabled = 0;
	}
#endif
}

static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
				    u64 addr, unsigned int mask)
{
	u16 sid, qdep;

	if (!info || !info->ats_enabled)
		return;

	sid = info->bus << 8 | info->devfn;
	qdep = info->ats_qdep;
	qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
			   qdep, addr, mask);
}

static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
				  u64 addr, unsigned mask)
{
	unsigned long flags;
	struct device_domain_info *info;
	struct subdev_domain_info *sinfo;

	if (!domain->has_iotlb_device)
		return;

	spin_lock_irqsave(&device_domain_lock, flags);
	list_for_each_entry(info, &domain->devices, link)
		__iommu_flush_dev_iotlb(info, addr, mask);

	list_for_each_entry(sinfo, &domain->subdevices, link_domain) {
		info = get_domain_info(sinfo->pdev);
		__iommu_flush_dev_iotlb(info, addr, mask);
	}
	spin_unlock_irqrestore(&device_domain_lock, flags);
}
static void domain_flush_piotlb(struct intel_iommu *iommu,
				struct dmar_domain *domain,
				u64 addr, unsigned long npages, bool ih)
{
	u16 did = domain->iommu_did[iommu->seq_id];

	if (domain->default_pasid)
		qi_flush_piotlb(iommu, did, domain->default_pasid,
				addr, npages, ih);

	if (!list_empty(&domain->devices))
		qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
}

static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
				  struct dmar_domain *domain,
				  unsigned long pfn, unsigned int pages,
				  int ih, int map)
{
	unsigned int mask = ilog2(__roundup_pow_of_two(pages));
	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
	u16 did = domain->iommu_did[iommu->seq_id];

	BUG_ON(pages == 0);

	if (ih)
		ih = 1 << 6;

	if (domain_use_first_level(domain)) {
		domain_flush_piotlb(iommu, domain, addr, pages, ih);
	} else {
		/*
		 * Fallback to domain selective flush if no PSI support or
		 * the size is too big. PSI requires page size to be 2 ^ x,
		 * and the base address is naturally aligned to the size.
		 */
		if (!cap_pgsel_inv(iommu->cap) ||
		    mask > cap_max_amask_val(iommu->cap))
			iommu->flush.flush_iotlb(iommu, did, 0, 0,
						 DMA_TLB_DSI_FLUSH);
		else
			iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
						 DMA_TLB_PSI_FLUSH);
	}

	/*
	 * In caching mode, changes of pages from non-present to present require
	 * flush. However, device IOTLB doesn't need to be flushed in this case.
	 */
	if (!cap_caching_mode(iommu->cap) || !map)
		iommu_flush_dev_iotlb(domain, addr, mask);
}

/* Notification for newly created mappings */
static inline void __mapping_notify_one(struct intel_iommu *iommu,
					struct dmar_domain *domain,
					unsigned long pfn, unsigned int pages)
{
	/*
	 * It's a non-present to present mapping. Only flush if caching mode
	 * and second level.
	 */
	if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
		iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
	else
		iommu_flush_write_buffer(iommu);
}

static void intel_flush_iotlb_all(struct iommu_domain *domain)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	int idx;

	for_each_domain_iommu(idx, dmar_domain) {
		struct intel_iommu *iommu = g_iommus[idx];
		u16 did = dmar_domain->iommu_did[iommu->seq_id];

		if (domain_use_first_level(dmar_domain))
			domain_flush_piotlb(iommu, dmar_domain, 0, -1, 0);
		else
			iommu->flush.flush_iotlb(iommu, did, 0, 0,
						 DMA_TLB_DSI_FLUSH);

		if (!cap_caching_mode(iommu->cap))
			iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
					      0, MAX_AGAW_PFN_WIDTH);
	}
}

static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
{
	u32 pmen;
	unsigned long flags;

	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
		return;

	raw_spin_lock_irqsave(&iommu->register_lock, flags);
	pmen = readl(iommu->reg + DMAR_PMEN_REG);
	pmen &= ~DMA_PMEN_EPM;
	writel(pmen, iommu->reg + DMAR_PMEN_REG);

	/* wait for the protected region status bit to clear */
	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
		      readl, !(pmen & DMA_PMEN_PRS), pmen);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
}

static void iommu_enable_translation(struct intel_iommu *iommu)
{
	u32 sts;
	unsigned long flags;

	raw_spin_lock_irqsave(&iommu->register_lock, flags);
	iommu->gcmd |= DMA_GCMD_TE;
	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (sts & DMA_GSTS_TES), sts);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
}

static void iommu_disable_translation(struct intel_iommu *iommu)
{
	u32 sts;
	unsigned long flag;

	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
		return;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	iommu->gcmd &= ~DMA_GCMD_TE;
	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (!(sts & DMA_GSTS_TES)), sts);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}

static int iommu_init_domains(struct intel_iommu *iommu)
{
	u32 ndomains, nlongs;
	size_t size;

	ndomains = cap_ndoms(iommu->cap);
	pr_debug("%s: Number of Domains supported <%d>\n",
		 iommu->name, ndomains);
	nlongs = BITS_TO_LONGS(ndomains);

	spin_lock_init(&iommu->lock);

	iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
	if (!iommu->domain_ids)
		return -ENOMEM;

	size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
	iommu->domains = kzalloc(size, GFP_KERNEL);

	if (iommu->domains) {
		size = 256 * sizeof(struct dmar_domain *);
		iommu->domains[0] = kzalloc(size, GFP_KERNEL);
	}

	if (!iommu->domains || !iommu->domains[0]) {
		pr_err("%s: Allocating domain array failed\n",
		       iommu->name);
		kfree(iommu->domain_ids);
		kfree(iommu->domains);
		iommu->domain_ids = NULL;
		iommu->domains = NULL;
		return -ENOMEM;
	}

	/*
	 * If Caching mode is set, then invalid translations are tagged
	 * with domain-id 0, hence we need to pre-allocate it. We also
	 * use domain-id 0 as a marker for non-allocated domain-id, so
	 * make sure it is not used for a real domain.
	 */
	set_bit(0, iommu->domain_ids);

	/*
	 * VT-d spec rev 3.0 (section 6.2.3.1) requires that each pasid
	 * entry for first-level or pass-through translation modes should
	 * be programmed with a domain id different from those used for
	 * second-level or nested translation. We reserve a domain id for
	 * this purpose.
	 */
	if (sm_supported(iommu))
		set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);

	return 0;
}

static void disable_dmar_iommu(struct intel_iommu *iommu)
{
	struct device_domain_info *info, *tmp;
	unsigned long flags;

	if (!iommu->domains || !iommu->domain_ids)
		return;

	spin_lock_irqsave(&device_domain_lock, flags);
	list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
		if (info->iommu != iommu)
			continue;

		if (!info->dev || !info->domain)
			continue;

		__dmar_remove_one_dev_info(info);
	}
	spin_unlock_irqrestore(&device_domain_lock, flags);

	if (iommu->gcmd & DMA_GCMD_TE)
		iommu_disable_translation(iommu);
}

static void free_dmar_iommu(struct intel_iommu *iommu)
{
	if ((iommu->domains) && (iommu->domain_ids)) {
		int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
		int i;

		for (i = 0; i < elems; i++)
			kfree(iommu->domains[i]);
		kfree(iommu->domains);
		kfree(iommu->domain_ids);
		iommu->domains = NULL;
		iommu->domain_ids = NULL;
	}

	g_iommus[iommu->seq_id] = NULL;

	/* free context mapping */
	free_context_table(iommu);

#ifdef CONFIG_INTEL_IOMMU_SVM
	if (pasid_supported(iommu)) {
		if (ecap_prs(iommu->ecap))
			intel_svm_finish_prq(iommu);
	}
	if (vccap_pasid(iommu->vccap))
		ioasid_unregister_allocator(&iommu->pasid_allocator);

#endif
}

/*
 * Check and return whether first level is used by default for
 * DMA translation.
 */
static bool first_level_by_default(void)
{
	return scalable_mode_support() && intel_cap_flts_sanity();
}

static struct dmar_domain *alloc_domain(int flags)
{
	struct dmar_domain *domain;

	domain = alloc_domain_mem();
	if (!domain)
		return NULL;

	memset(domain, 0, sizeof(*domain));
	domain->nid = NUMA_NO_NODE;
	domain->flags = flags;
	if (first_level_by_default())
		domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
	domain->has_iotlb_device = false;
	INIT_LIST_HEAD(&domain->devices);
	INIT_LIST_HEAD(&domain->subdevices);

	return domain;
}

/* Must be called with iommu->lock */
static int domain_attach_iommu(struct dmar_domain *domain,
			       struct intel_iommu *iommu)
{
	unsigned long ndomains;
	int num;

	assert_spin_locked(&device_domain_lock);
	assert_spin_locked(&iommu->lock);

	domain->iommu_refcnt[iommu->seq_id] += 1;
	if (domain->iommu_refcnt[iommu->seq_id] == 1) {
		ndomains = cap_ndoms(iommu->cap);
		num = find_first_zero_bit(iommu->domain_ids, ndomains);

		if (num >= ndomains) {
			pr_err("%s: No free domain ids\n", iommu->name);
			domain->iommu_refcnt[iommu->seq_id] -= 1;
			return -ENOSPC;
		}

		set_bit(num, iommu->domain_ids);
		set_iommu_domain(iommu, num, domain);

		domain->iommu_did[iommu->seq_id] = num;
		domain->nid = iommu->node;

		domain_update_iommu_cap(domain);
	}

	return 0;
}

static void domain_detach_iommu(struct dmar_domain *domain,
				struct intel_iommu *iommu)
{
	int num;

	assert_spin_locked(&device_domain_lock);
	assert_spin_locked(&iommu->lock);

	domain->iommu_refcnt[iommu->seq_id] -= 1;
	if (domain->iommu_refcnt[iommu->seq_id] == 0) {
		num = domain->iommu_did[iommu->seq_id];
		clear_bit(num, iommu->domain_ids);
		set_iommu_domain(iommu, num, NULL);

		domain_update_iommu_cap(domain);
		domain->iommu_did[iommu->seq_id] = 0;
	}
}

static inline int guestwidth_to_adjustwidth(int gaw)
{
	int agaw;
	int r = (gaw - 12) % 9;

	if (r == 0)
		agaw = gaw;
	else
		agaw = gaw + 9 - r;
	if (agaw > 64)
		agaw = 64;
	return agaw;
}

static void domain_exit(struct dmar_domain *domain)
{
	/* Remove associated devices and clear attached or cached domains */
	domain_remove_dev_info(domain);

	if (domain->pgd) {
		struct page *freelist;

		freelist = domain_unmap(domain, 0,
					DOMAIN_MAX_PFN(domain->gaw), NULL);
		dma_free_pagelist(freelist);
	}

	free_domain_mem(domain);
}

/*
 * Get the PASID directory size for scalable mode context entry.
 * Value of X in the PDTS field of a scalable mode context entry
 * indicates PASID directory with 2^(X + 7) entries.
 */
static inline unsigned long context_get_sm_pds(struct pasid_table *table)
{
	int pds, max_pde;

	max_pde = table->max_pasid >> PASID_PDE_SHIFT;
	pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
	if (pds < 7)
		return 0;

	return pds - 7;
}

/*
 * Set the RID_PASID field of a scalable mode context entry. The
 * IOMMU hardware will use the PASID value set in this field for
 * DMA translations of DMA requests without PASID.
 */
static inline void
context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
{
	context->hi |= pasid & ((1 << 20) - 1);
}

/*
 * Set the DTE(Device-TLB Enable) field of a scalable mode context
 * entry.
 */
static inline void context_set_sm_dte(struct context_entry *context)
{
	context->lo |= (1 << 2);
}

/*
 * Set the PRE(Page Request Enable) field of a scalable mode context
 * entry.
 */
static inline void context_set_sm_pre(struct context_entry *context)
{
	context->lo |= (1 << 4);
}

/* Convert value to context PASID directory size field coding. */
#define context_pdts(pds)	(((pds) & 0x7) << 9)

static int domain_context_mapping_one(struct dmar_domain *domain,
				      struct intel_iommu *iommu,
				      struct pasid_table *table,
				      u8 bus, u8 devfn)
{
	u16 did = domain->iommu_did[iommu->seq_id];
	int translation = CONTEXT_TT_MULTI_LEVEL;
	struct device_domain_info *info = NULL;
	struct context_entry *context;
	unsigned long flags;
	int ret;

	WARN_ON(did == 0);

	if (hw_pass_through && domain_type_is_si(domain))
		translation = CONTEXT_TT_PASS_THROUGH;

	pr_debug("Set context mapping for %02x:%02x.%d\n",
		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));

	BUG_ON(!domain->pgd);

	spin_lock_irqsave(&device_domain_lock, flags);
	spin_lock(&iommu->lock);

	ret = -ENOMEM;
	context = iommu_context_addr(iommu, bus, devfn, 1);
	if (!context)
		goto out_unlock;

	ret = 0;
	if (context_present(context))
		goto out_unlock;

	/*
	 * For kdump cases, old valid entries may be cached due to the
	 * in-flight DMA and copied pgtable, but there is no unmapping
	 * behaviour for them, thus we need an explicit cache flush for
	 * the newly-mapped device.
For kdump, at this point, the device 2073 * is supposed to finish reset at its driver probe stage, so no 2074 * in-flight DMA will exist, and we don't need to worry anymore 2075 * hereafter. 2076 */ 2077 if (context_copied(context)) { 2078 u16 did_old = context_domain_id(context); 2079 2080 if (did_old < cap_ndoms(iommu->cap)) { 2081 iommu->flush.flush_context(iommu, did_old, 2082 (((u16)bus) << 8) | devfn, 2083 DMA_CCMD_MASK_NOBIT, 2084 DMA_CCMD_DEVICE_INVL); 2085 iommu->flush.flush_iotlb(iommu, did_old, 0, 0, 2086 DMA_TLB_DSI_FLUSH); 2087 } 2088 } 2089 2090 context_clear_entry(context); 2091 2092 if (sm_supported(iommu)) { 2093 unsigned long pds; 2094 2095 WARN_ON(!table); 2096 2097 /* Setup the PASID DIR pointer: */ 2098 pds = context_get_sm_pds(table); 2099 context->lo = (u64)virt_to_phys(table->table) | 2100 context_pdts(pds); 2101 2102 /* Setup the RID_PASID field: */ 2103 context_set_sm_rid2pasid(context, PASID_RID2PASID); 2104 2105 /* 2106 * Setup the Device-TLB enable bit and Page request 2107 * Enable bit: 2108 */ 2109 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn); 2110 if (info && info->ats_supported) 2111 context_set_sm_dte(context); 2112 if (info && info->pri_supported) 2113 context_set_sm_pre(context); 2114 } else { 2115 struct dma_pte *pgd = domain->pgd; 2116 int agaw; 2117 2118 context_set_domain_id(context, did); 2119 2120 if (translation != CONTEXT_TT_PASS_THROUGH) { 2121 /* 2122 * Skip top levels of page tables for iommu which has 2123 * less agaw than default. Unnecessary for PT mode. 2124 */ 2125 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { 2126 ret = -ENOMEM; 2127 pgd = phys_to_virt(dma_pte_addr(pgd)); 2128 if (!dma_pte_present(pgd)) 2129 goto out_unlock; 2130 } 2131 2132 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn); 2133 if (info && info->ats_supported) 2134 translation = CONTEXT_TT_DEV_IOTLB; 2135 else 2136 translation = CONTEXT_TT_MULTI_LEVEL; 2137 2138 context_set_address_root(context, virt_to_phys(pgd)); 2139 context_set_address_width(context, agaw); 2140 } else { 2141 /* 2142 * In pass through mode, AW must be programmed to 2143 * indicate the largest AGAW value supported by 2144 * hardware. And ASR is ignored by hardware. 2145 */ 2146 context_set_address_width(context, iommu->msagaw); 2147 } 2148 2149 context_set_translation_type(context, translation); 2150 } 2151 2152 context_set_fault_enable(context); 2153 context_set_present(context); 2154 if (!ecap_coherent(iommu->ecap)) 2155 clflush_cache_range(context, sizeof(*context)); 2156 2157 /* 2158 * It's a non-present to present mapping. If hardware doesn't cache 2159 * non-present entry we only need to flush the write-buffer. 
If the 2160 * _does_ cache non-present entries, then it does so in the special 2161 * domain #0, which we have to flush: 2162 */ 2163 if (cap_caching_mode(iommu->cap)) { 2164 iommu->flush.flush_context(iommu, 0, 2165 (((u16)bus) << 8) | devfn, 2166 DMA_CCMD_MASK_NOBIT, 2167 DMA_CCMD_DEVICE_INVL); 2168 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); 2169 } else { 2170 iommu_flush_write_buffer(iommu); 2171 } 2172 iommu_enable_dev_iotlb(info); 2173 2174 ret = 0; 2175 2176 out_unlock: 2177 spin_unlock(&iommu->lock); 2178 spin_unlock_irqrestore(&device_domain_lock, flags); 2179 2180 return ret; 2181 } 2182 2183 struct domain_context_mapping_data { 2184 struct dmar_domain *domain; 2185 struct intel_iommu *iommu; 2186 struct pasid_table *table; 2187 }; 2188 2189 static int domain_context_mapping_cb(struct pci_dev *pdev, 2190 u16 alias, void *opaque) 2191 { 2192 struct domain_context_mapping_data *data = opaque; 2193 2194 return domain_context_mapping_one(data->domain, data->iommu, 2195 data->table, PCI_BUS_NUM(alias), 2196 alias & 0xff); 2197 } 2198 2199 static int 2200 domain_context_mapping(struct dmar_domain *domain, struct device *dev) 2201 { 2202 struct domain_context_mapping_data data; 2203 struct pasid_table *table; 2204 struct intel_iommu *iommu; 2205 u8 bus, devfn; 2206 2207 iommu = device_to_iommu(dev, &bus, &devfn); 2208 if (!iommu) 2209 return -ENODEV; 2210 2211 table = intel_pasid_get_table(dev); 2212 2213 if (!dev_is_pci(dev)) 2214 return domain_context_mapping_one(domain, iommu, table, 2215 bus, devfn); 2216 2217 data.domain = domain; 2218 data.iommu = iommu; 2219 data.table = table; 2220 2221 return pci_for_each_dma_alias(to_pci_dev(dev), 2222 &domain_context_mapping_cb, &data); 2223 } 2224 2225 static int domain_context_mapped_cb(struct pci_dev *pdev, 2226 u16 alias, void *opaque) 2227 { 2228 struct intel_iommu *iommu = opaque; 2229 2230 return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff); 2231 } 2232 2233 static int domain_context_mapped(struct device *dev) 2234 { 2235 struct intel_iommu *iommu; 2236 u8 bus, devfn; 2237 2238 iommu = device_to_iommu(dev, &bus, &devfn); 2239 if (!iommu) 2240 return -ENODEV; 2241 2242 if (!dev_is_pci(dev)) 2243 return device_context_mapped(iommu, bus, devfn); 2244 2245 return !pci_for_each_dma_alias(to_pci_dev(dev), 2246 domain_context_mapped_cb, iommu); 2247 } 2248 2249 /* Returns a number of VTD pages, but aligned to MM page size */ 2250 static inline unsigned long aligned_nrpages(unsigned long host_addr, 2251 size_t size) 2252 { 2253 host_addr &= ~PAGE_MASK; 2254 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT; 2255 } 2256 2257 /* Return largest possible superpage level for a given mapping */ 2258 static inline int hardware_largepage_caps(struct dmar_domain *domain, 2259 unsigned long iov_pfn, 2260 unsigned long phy_pfn, 2261 unsigned long pages) 2262 { 2263 int support, level = 1; 2264 unsigned long pfnmerge; 2265 2266 support = domain->iommu_superpage; 2267 2268 /* To use a large page, the virtual *and* physical addresses 2269 must be aligned to 2MiB/1GiB/etc. Lower bits set in either 2270 of them will mean we have to use smaller pages. So just 2271 merge them and check both at once. 
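	   An illustrative example (values picked only for illustration):
	   with VTD_STRIDE_SHIFT == 9, iov_pfn == 0x1200 and phy_pfn ==
	   0x3400 merge to pfnmerge == 0x3600, whose low nine bits are all
	   clear, so a 2MiB (level 2) superpage is possible as long as the
	   hardware supports it and at least 512 pages are being mapped;
	   any low bit set in either PFN keeps the mapping at 4KiB pages.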
*/ 2272 pfnmerge = iov_pfn | phy_pfn; 2273 2274 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) { 2275 pages >>= VTD_STRIDE_SHIFT; 2276 if (!pages) 2277 break; 2278 pfnmerge >>= VTD_STRIDE_SHIFT; 2279 level++; 2280 support--; 2281 } 2282 return level; 2283 } 2284 2285 /* 2286 * Ensure that old small page tables are removed to make room for superpage(s). 2287 * We're going to add new large pages, so make sure we don't remove their parent 2288 * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared. 2289 */ 2290 static void switch_to_super_page(struct dmar_domain *domain, 2291 unsigned long start_pfn, 2292 unsigned long end_pfn, int level) 2293 { 2294 unsigned long lvl_pages = lvl_to_nr_pages(level); 2295 struct dma_pte *pte = NULL; 2296 int i; 2297 2298 while (start_pfn <= end_pfn) { 2299 if (!pte) 2300 pte = pfn_to_dma_pte(domain, start_pfn, &level); 2301 2302 if (dma_pte_present(pte)) { 2303 dma_pte_free_pagetable(domain, start_pfn, 2304 start_pfn + lvl_pages - 1, 2305 level + 1); 2306 2307 for_each_domain_iommu(i, domain) 2308 iommu_flush_iotlb_psi(g_iommus[i], domain, 2309 start_pfn, lvl_pages, 2310 0, 0); 2311 } 2312 2313 pte++; 2314 start_pfn += lvl_pages; 2315 if (first_pte_in_page(pte)) 2316 pte = NULL; 2317 } 2318 } 2319 2320 static int 2321 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, 2322 unsigned long phys_pfn, unsigned long nr_pages, int prot) 2323 { 2324 struct dma_pte *first_pte = NULL, *pte = NULL; 2325 unsigned int largepage_lvl = 0; 2326 unsigned long lvl_pages = 0; 2327 phys_addr_t pteval; 2328 u64 attr; 2329 2330 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)); 2331 2332 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0) 2333 return -EINVAL; 2334 2335 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP); 2336 attr |= DMA_FL_PTE_PRESENT; 2337 if (domain_use_first_level(domain)) { 2338 attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS; 2339 if (prot & DMA_PTE_WRITE) 2340 attr |= DMA_FL_PTE_DIRTY; 2341 } 2342 2343 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr; 2344 2345 while (nr_pages > 0) { 2346 uint64_t tmp; 2347 2348 if (!pte) { 2349 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, 2350 phys_pfn, nr_pages); 2351 2352 pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl); 2353 if (!pte) 2354 return -ENOMEM; 2355 first_pte = pte; 2356 2357 /* It is large page*/ 2358 if (largepage_lvl > 1) { 2359 unsigned long end_pfn; 2360 2361 pteval |= DMA_PTE_LARGE_PAGE; 2362 end_pfn = ((iov_pfn + nr_pages) & level_mask(largepage_lvl)) - 1; 2363 switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl); 2364 } else { 2365 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE; 2366 } 2367 2368 } 2369 /* We don't need lock here, nobody else 2370 * touches the iova range 2371 */ 2372 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval); 2373 if (tmp) { 2374 static int dumps = 5; 2375 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n", 2376 iov_pfn, tmp, (unsigned long long)pteval); 2377 if (dumps) { 2378 dumps--; 2379 debug_dma_dump_mappings(NULL); 2380 } 2381 WARN_ON(1); 2382 } 2383 2384 lvl_pages = lvl_to_nr_pages(largepage_lvl); 2385 2386 BUG_ON(nr_pages < lvl_pages); 2387 2388 nr_pages -= lvl_pages; 2389 iov_pfn += lvl_pages; 2390 phys_pfn += lvl_pages; 2391 pteval += lvl_pages * VTD_PAGE_SIZE; 2392 2393 /* If the next PTE would be the first in a new page, then we 2394 * need to flush the cache on the entries we've just written. 
2395 * And then we'll need to recalculate 'pte', so clear it and 2396 * let it get set again in the if (!pte) block above. 2397 * 2398 * If we're done (!nr_pages) we need to flush the cache too. 2399 * 2400 * Also if we've been setting superpages, we may need to 2401 * recalculate 'pte' and switch back to smaller pages for the 2402 * end of the mapping, if the trailing size is not enough to 2403 * use another superpage (i.e. nr_pages < lvl_pages). 2404 */ 2405 pte++; 2406 if (!nr_pages || first_pte_in_page(pte) || 2407 (largepage_lvl > 1 && nr_pages < lvl_pages)) { 2408 domain_flush_cache(domain, first_pte, 2409 (void *)pte - (void *)first_pte); 2410 pte = NULL; 2411 } 2412 } 2413 2414 return 0; 2415 } 2416 2417 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn) 2418 { 2419 struct intel_iommu *iommu = info->iommu; 2420 struct context_entry *context; 2421 unsigned long flags; 2422 u16 did_old; 2423 2424 if (!iommu) 2425 return; 2426 2427 spin_lock_irqsave(&iommu->lock, flags); 2428 context = iommu_context_addr(iommu, bus, devfn, 0); 2429 if (!context) { 2430 spin_unlock_irqrestore(&iommu->lock, flags); 2431 return; 2432 } 2433 2434 if (sm_supported(iommu)) { 2435 if (hw_pass_through && domain_type_is_si(info->domain)) 2436 did_old = FLPT_DEFAULT_DID; 2437 else 2438 did_old = info->domain->iommu_did[iommu->seq_id]; 2439 } else { 2440 did_old = context_domain_id(context); 2441 } 2442 2443 context_clear_entry(context); 2444 __iommu_flush_cache(iommu, context, sizeof(*context)); 2445 spin_unlock_irqrestore(&iommu->lock, flags); 2446 iommu->flush.flush_context(iommu, 2447 did_old, 2448 (((u16)bus) << 8) | devfn, 2449 DMA_CCMD_MASK_NOBIT, 2450 DMA_CCMD_DEVICE_INVL); 2451 2452 if (sm_supported(iommu)) 2453 qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0); 2454 2455 iommu->flush.flush_iotlb(iommu, 2456 did_old, 2457 0, 2458 0, 2459 DMA_TLB_DSI_FLUSH); 2460 2461 __iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH); 2462 } 2463 2464 static inline void unlink_domain_info(struct device_domain_info *info) 2465 { 2466 assert_spin_locked(&device_domain_lock); 2467 list_del(&info->link); 2468 list_del(&info->global); 2469 if (info->dev) 2470 dev_iommu_priv_set(info->dev, NULL); 2471 } 2472 2473 static void domain_remove_dev_info(struct dmar_domain *domain) 2474 { 2475 struct device_domain_info *info, *tmp; 2476 unsigned long flags; 2477 2478 spin_lock_irqsave(&device_domain_lock, flags); 2479 list_for_each_entry_safe(info, tmp, &domain->devices, link) 2480 __dmar_remove_one_dev_info(info); 2481 spin_unlock_irqrestore(&device_domain_lock, flags); 2482 } 2483 2484 struct dmar_domain *find_domain(struct device *dev) 2485 { 2486 struct device_domain_info *info; 2487 2488 if (unlikely(!dev || !dev->iommu)) 2489 return NULL; 2490 2491 if (unlikely(attach_deferred(dev))) 2492 return NULL; 2493 2494 /* No lock here, assumes no domain exit in normal case */ 2495 info = get_domain_info(dev); 2496 if (likely(info)) 2497 return info->domain; 2498 2499 return NULL; 2500 } 2501 2502 static inline struct device_domain_info * 2503 dmar_search_domain_by_dev_info(int segment, int bus, int devfn) 2504 { 2505 struct device_domain_info *info; 2506 2507 list_for_each_entry(info, &device_domain_list, global) 2508 if (info->segment == segment && info->bus == bus && 2509 info->devfn == devfn) 2510 return info; 2511 2512 return NULL; 2513 } 2514 2515 static int domain_setup_first_level(struct intel_iommu *iommu, 2516 struct dmar_domain *domain, 2517 struct device *dev, 2518 u32 pasid) 
2519 { 2520 struct dma_pte *pgd = domain->pgd; 2521 int agaw, level; 2522 int flags = 0; 2523 2524 /* 2525 * Skip top levels of page tables for iommu which has 2526 * less agaw than default. Unnecessary for PT mode. 2527 */ 2528 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { 2529 pgd = phys_to_virt(dma_pte_addr(pgd)); 2530 if (!dma_pte_present(pgd)) 2531 return -ENOMEM; 2532 } 2533 2534 level = agaw_to_level(agaw); 2535 if (level != 4 && level != 5) 2536 return -EINVAL; 2537 2538 if (pasid != PASID_RID2PASID) 2539 flags |= PASID_FLAG_SUPERVISOR_MODE; 2540 if (level == 5) 2541 flags |= PASID_FLAG_FL5LP; 2542 2543 if (domain->domain.type == IOMMU_DOMAIN_UNMANAGED) 2544 flags |= PASID_FLAG_PAGE_SNOOP; 2545 2546 return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid, 2547 domain->iommu_did[iommu->seq_id], 2548 flags); 2549 } 2550 2551 static bool dev_is_real_dma_subdevice(struct device *dev) 2552 { 2553 return dev && dev_is_pci(dev) && 2554 pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev); 2555 } 2556 2557 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu, 2558 int bus, int devfn, 2559 struct device *dev, 2560 struct dmar_domain *domain) 2561 { 2562 struct dmar_domain *found = NULL; 2563 struct device_domain_info *info; 2564 unsigned long flags; 2565 int ret; 2566 2567 info = alloc_devinfo_mem(); 2568 if (!info) 2569 return NULL; 2570 2571 if (!dev_is_real_dma_subdevice(dev)) { 2572 info->bus = bus; 2573 info->devfn = devfn; 2574 info->segment = iommu->segment; 2575 } else { 2576 struct pci_dev *pdev = to_pci_dev(dev); 2577 2578 info->bus = pdev->bus->number; 2579 info->devfn = pdev->devfn; 2580 info->segment = pci_domain_nr(pdev->bus); 2581 } 2582 2583 info->ats_supported = info->pasid_supported = info->pri_supported = 0; 2584 info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0; 2585 info->ats_qdep = 0; 2586 info->dev = dev; 2587 info->domain = domain; 2588 info->iommu = iommu; 2589 info->pasid_table = NULL; 2590 info->auxd_enabled = 0; 2591 INIT_LIST_HEAD(&info->subdevices); 2592 2593 if (dev && dev_is_pci(dev)) { 2594 struct pci_dev *pdev = to_pci_dev(info->dev); 2595 2596 if (ecap_dev_iotlb_support(iommu->ecap) && 2597 pci_ats_supported(pdev) && 2598 dmar_find_matched_atsr_unit(pdev)) 2599 info->ats_supported = 1; 2600 2601 if (sm_supported(iommu)) { 2602 if (pasid_supported(iommu)) { 2603 int features = pci_pasid_features(pdev); 2604 if (features >= 0) 2605 info->pasid_supported = features | 1; 2606 } 2607 2608 if (info->ats_supported && ecap_prs(iommu->ecap) && 2609 pci_pri_supported(pdev)) 2610 info->pri_supported = 1; 2611 } 2612 } 2613 2614 spin_lock_irqsave(&device_domain_lock, flags); 2615 if (dev) 2616 found = find_domain(dev); 2617 2618 if (!found) { 2619 struct device_domain_info *info2; 2620 info2 = dmar_search_domain_by_dev_info(info->segment, info->bus, 2621 info->devfn); 2622 if (info2) { 2623 found = info2->domain; 2624 info2->dev = dev; 2625 } 2626 } 2627 2628 if (found) { 2629 spin_unlock_irqrestore(&device_domain_lock, flags); 2630 free_devinfo_mem(info); 2631 /* Caller must free the original domain */ 2632 return found; 2633 } 2634 2635 spin_lock(&iommu->lock); 2636 ret = domain_attach_iommu(domain, iommu); 2637 spin_unlock(&iommu->lock); 2638 2639 if (ret) { 2640 spin_unlock_irqrestore(&device_domain_lock, flags); 2641 free_devinfo_mem(info); 2642 return NULL; 2643 } 2644 2645 list_add(&info->link, &domain->devices); 2646 list_add(&info->global, &device_domain_list); 2647 if (dev) 2648 
dev_iommu_priv_set(dev, info); 2649 spin_unlock_irqrestore(&device_domain_lock, flags); 2650 2651 /* PASID table is mandatory for a PCI device in scalable mode. */ 2652 if (dev && dev_is_pci(dev) && sm_supported(iommu)) { 2653 ret = intel_pasid_alloc_table(dev); 2654 if (ret) { 2655 dev_err(dev, "PASID table allocation failed\n"); 2656 dmar_remove_one_dev_info(dev); 2657 return NULL; 2658 } 2659 2660 /* Setup the PASID entry for requests without PASID: */ 2661 spin_lock_irqsave(&iommu->lock, flags); 2662 if (hw_pass_through && domain_type_is_si(domain)) 2663 ret = intel_pasid_setup_pass_through(iommu, domain, 2664 dev, PASID_RID2PASID); 2665 else if (domain_use_first_level(domain)) 2666 ret = domain_setup_first_level(iommu, domain, dev, 2667 PASID_RID2PASID); 2668 else 2669 ret = intel_pasid_setup_second_level(iommu, domain, 2670 dev, PASID_RID2PASID); 2671 spin_unlock_irqrestore(&iommu->lock, flags); 2672 if (ret) { 2673 dev_err(dev, "Setup RID2PASID failed\n"); 2674 dmar_remove_one_dev_info(dev); 2675 return NULL; 2676 } 2677 } 2678 2679 if (dev && domain_context_mapping(domain, dev)) { 2680 dev_err(dev, "Domain context map failed\n"); 2681 dmar_remove_one_dev_info(dev); 2682 return NULL; 2683 } 2684 2685 return domain; 2686 } 2687 2688 static int iommu_domain_identity_map(struct dmar_domain *domain, 2689 unsigned long first_vpfn, 2690 unsigned long last_vpfn) 2691 { 2692 /* 2693 * RMRR range might have overlap with physical memory range, 2694 * clear it first 2695 */ 2696 dma_pte_clear_range(domain, first_vpfn, last_vpfn); 2697 2698 return __domain_mapping(domain, first_vpfn, 2699 first_vpfn, last_vpfn - first_vpfn + 1, 2700 DMA_PTE_READ|DMA_PTE_WRITE); 2701 } 2702 2703 static int md_domain_init(struct dmar_domain *domain, int guest_width); 2704 2705 static int __init si_domain_init(int hw) 2706 { 2707 struct dmar_rmrr_unit *rmrr; 2708 struct device *dev; 2709 int i, nid, ret; 2710 2711 si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY); 2712 if (!si_domain) 2713 return -EFAULT; 2714 2715 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { 2716 domain_exit(si_domain); 2717 return -EFAULT; 2718 } 2719 2720 if (hw) 2721 return 0; 2722 2723 for_each_online_node(nid) { 2724 unsigned long start_pfn, end_pfn; 2725 int i; 2726 2727 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 2728 ret = iommu_domain_identity_map(si_domain, 2729 mm_to_dma_pfn(start_pfn), 2730 mm_to_dma_pfn(end_pfn)); 2731 if (ret) 2732 return ret; 2733 } 2734 } 2735 2736 /* 2737 * Identity map the RMRRs so that devices with RMRRs could also use 2738 * the si_domain. 
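	 * The mapping is 1:1: for a (purely illustrative) RMRR covering
	 * 0xab800000-0xabffffff, the same physical range becomes reachable
	 * at IOVA 0xab800000-0xabffffff once the byte addresses have been
	 * converted to VT-d page frame numbers below.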
2739 */ 2740 for_each_rmrr_units(rmrr) { 2741 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt, 2742 i, dev) { 2743 unsigned long long start = rmrr->base_address; 2744 unsigned long long end = rmrr->end_address; 2745 2746 if (WARN_ON(end < start || 2747 end >> agaw_to_width(si_domain->agaw))) 2748 continue; 2749 2750 ret = iommu_domain_identity_map(si_domain, 2751 mm_to_dma_pfn(start >> PAGE_SHIFT), 2752 mm_to_dma_pfn(end >> PAGE_SHIFT)); 2753 if (ret) 2754 return ret; 2755 } 2756 } 2757 2758 return 0; 2759 } 2760 2761 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev) 2762 { 2763 struct dmar_domain *ndomain; 2764 struct intel_iommu *iommu; 2765 u8 bus, devfn; 2766 2767 iommu = device_to_iommu(dev, &bus, &devfn); 2768 if (!iommu) 2769 return -ENODEV; 2770 2771 ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain); 2772 if (ndomain != domain) 2773 return -EBUSY; 2774 2775 return 0; 2776 } 2777 2778 static bool device_has_rmrr(struct device *dev) 2779 { 2780 struct dmar_rmrr_unit *rmrr; 2781 struct device *tmp; 2782 int i; 2783 2784 rcu_read_lock(); 2785 for_each_rmrr_units(rmrr) { 2786 /* 2787 * Return TRUE if this RMRR contains the device that 2788 * is passed in. 2789 */ 2790 for_each_active_dev_scope(rmrr->devices, 2791 rmrr->devices_cnt, i, tmp) 2792 if (tmp == dev || 2793 is_downstream_to_pci_bridge(dev, tmp)) { 2794 rcu_read_unlock(); 2795 return true; 2796 } 2797 } 2798 rcu_read_unlock(); 2799 return false; 2800 } 2801 2802 /** 2803 * device_rmrr_is_relaxable - Test whether the RMRR of this device 2804 * is relaxable (ie. is allowed to be not enforced under some conditions) 2805 * @dev: device handle 2806 * 2807 * We assume that PCI USB devices with RMRRs have them largely 2808 * for historical reasons and that the RMRR space is not actively used post 2809 * boot. This exclusion may change if vendors begin to abuse it. 2810 * 2811 * The same exception is made for graphics devices, with the requirement that 2812 * any use of the RMRR regions will be torn down before assigning the device 2813 * to a guest. 2814 * 2815 * Return: true if the RMRR is relaxable, false otherwise 2816 */ 2817 static bool device_rmrr_is_relaxable(struct device *dev) 2818 { 2819 struct pci_dev *pdev; 2820 2821 if (!dev_is_pci(dev)) 2822 return false; 2823 2824 pdev = to_pci_dev(dev); 2825 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev)) 2826 return true; 2827 else 2828 return false; 2829 } 2830 2831 /* 2832 * There are a couple cases where we need to restrict the functionality of 2833 * devices associated with RMRRs. The first is when evaluating a device for 2834 * identity mapping because problems exist when devices are moved in and out 2835 * of domains and their respective RMRR information is lost. This means that 2836 * a device with associated RMRRs will never be in a "passthrough" domain. 2837 * The second is use of the device through the IOMMU API. This interface 2838 * expects to have full control of the IOVA space for the device. We cannot 2839 * satisfy both the requirement that RMRR access is maintained and have an 2840 * unencumbered IOVA space. We also have no ability to quiesce the device's 2841 * use of the RMRR space or even inform the IOMMU API user of the restriction. 2842 * We therefore prevent devices associated with an RMRR from participating in 2843 * the IOMMU API, which eliminates them from device assignment. 2844 * 2845 * In both cases, devices which have relaxable RMRRs are not concerned by this 2846 * restriction. 
See device_rmrr_is_relaxable comment. 2847 */ 2848 static bool device_is_rmrr_locked(struct device *dev) 2849 { 2850 if (!device_has_rmrr(dev)) 2851 return false; 2852 2853 if (device_rmrr_is_relaxable(dev)) 2854 return false; 2855 2856 return true; 2857 } 2858 2859 /* 2860 * Return the required default domain type for a specific device. 2861 * 2862 * @dev: the device in query 2863 * @startup: true if this is during early boot 2864 * 2865 * Returns: 2866 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain 2867 * - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain 2868 * - 0: both identity and dynamic domains work for this device 2869 */ 2870 static int device_def_domain_type(struct device *dev) 2871 { 2872 if (dev_is_pci(dev)) { 2873 struct pci_dev *pdev = to_pci_dev(dev); 2874 2875 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev)) 2876 return IOMMU_DOMAIN_IDENTITY; 2877 2878 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev)) 2879 return IOMMU_DOMAIN_IDENTITY; 2880 } 2881 2882 return 0; 2883 } 2884 2885 static void intel_iommu_init_qi(struct intel_iommu *iommu) 2886 { 2887 /* 2888 * Start from the sane iommu hardware state. 2889 * If the queued invalidation is already initialized by us 2890 * (for example, while enabling interrupt-remapping) then 2891 * we got the things already rolling from a sane state. 2892 */ 2893 if (!iommu->qi) { 2894 /* 2895 * Clear any previous faults. 2896 */ 2897 dmar_fault(-1, iommu); 2898 /* 2899 * Disable queued invalidation if supported and already enabled 2900 * before OS handover. 2901 */ 2902 dmar_disable_qi(iommu); 2903 } 2904 2905 if (dmar_enable_qi(iommu)) { 2906 /* 2907 * Queued Invalidate not enabled, use Register Based Invalidate 2908 */ 2909 iommu->flush.flush_context = __iommu_flush_context; 2910 iommu->flush.flush_iotlb = __iommu_flush_iotlb; 2911 pr_info("%s: Using Register based invalidation\n", 2912 iommu->name); 2913 } else { 2914 iommu->flush.flush_context = qi_flush_context; 2915 iommu->flush.flush_iotlb = qi_flush_iotlb; 2916 pr_info("%s: Using Queued invalidation\n", iommu->name); 2917 } 2918 } 2919 2920 static int copy_context_table(struct intel_iommu *iommu, 2921 struct root_entry *old_re, 2922 struct context_entry **tbl, 2923 int bus, bool ext) 2924 { 2925 int tbl_idx, pos = 0, idx, devfn, ret = 0, did; 2926 struct context_entry *new_ce = NULL, ce; 2927 struct context_entry *old_ce = NULL; 2928 struct root_entry re; 2929 phys_addr_t old_ce_phys; 2930 2931 tbl_idx = ext ? bus * 2 : bus; 2932 memcpy(&re, old_re, sizeof(re)); 2933 2934 for (devfn = 0; devfn < 256; devfn++) { 2935 /* First calculate the correct index */ 2936 idx = (ext ? 
devfn * 2 : devfn) % 256; 2937 2938 if (idx == 0) { 2939 /* First save what we may have and clean up */ 2940 if (new_ce) { 2941 tbl[tbl_idx] = new_ce; 2942 __iommu_flush_cache(iommu, new_ce, 2943 VTD_PAGE_SIZE); 2944 pos = 1; 2945 } 2946 2947 if (old_ce) 2948 memunmap(old_ce); 2949 2950 ret = 0; 2951 if (devfn < 0x80) 2952 old_ce_phys = root_entry_lctp(&re); 2953 else 2954 old_ce_phys = root_entry_uctp(&re); 2955 2956 if (!old_ce_phys) { 2957 if (ext && devfn == 0) { 2958 /* No LCTP, try UCTP */ 2959 devfn = 0x7f; 2960 continue; 2961 } else { 2962 goto out; 2963 } 2964 } 2965 2966 ret = -ENOMEM; 2967 old_ce = memremap(old_ce_phys, PAGE_SIZE, 2968 MEMREMAP_WB); 2969 if (!old_ce) 2970 goto out; 2971 2972 new_ce = alloc_pgtable_page(iommu->node); 2973 if (!new_ce) 2974 goto out_unmap; 2975 2976 ret = 0; 2977 } 2978 2979 /* Now copy the context entry */ 2980 memcpy(&ce, old_ce + idx, sizeof(ce)); 2981 2982 if (!__context_present(&ce)) 2983 continue; 2984 2985 did = context_domain_id(&ce); 2986 if (did >= 0 && did < cap_ndoms(iommu->cap)) 2987 set_bit(did, iommu->domain_ids); 2988 2989 /* 2990 * We need a marker for copied context entries. This 2991 * marker needs to work for the old format as well as 2992 * for extended context entries. 2993 * 2994 * Bit 67 of the context entry is used. In the old 2995 * format this bit is available to software, in the 2996 * extended format it is the PGE bit, but PGE is ignored 2997 * by HW if PASIDs are disabled (and thus still 2998 * available). 2999 * 3000 * So disable PASIDs first and then mark the entry 3001 * copied. This means that we don't copy PASID 3002 * translations from the old kernel, but this is fine as 3003 * faults there are not fatal. 3004 */ 3005 context_clear_pasid_enable(&ce); 3006 context_set_copied(&ce); 3007 3008 new_ce[idx] = ce; 3009 } 3010 3011 tbl[tbl_idx + pos] = new_ce; 3012 3013 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE); 3014 3015 out_unmap: 3016 memunmap(old_ce); 3017 3018 out: 3019 return ret; 3020 } 3021 3022 static int copy_translation_tables(struct intel_iommu *iommu) 3023 { 3024 struct context_entry **ctxt_tbls; 3025 struct root_entry *old_rt; 3026 phys_addr_t old_rt_phys; 3027 int ctxt_table_entries; 3028 unsigned long flags; 3029 u64 rtaddr_reg; 3030 int bus, ret; 3031 bool new_ext, ext; 3032 3033 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG); 3034 ext = !!(rtaddr_reg & DMA_RTADDR_RTT); 3035 new_ext = !!ecap_ecs(iommu->ecap); 3036 3037 /* 3038 * The RTT bit can only be changed when translation is disabled, 3039 * but disabling translation means to open a window for data 3040 * corruption. So bail out and don't copy anything if we would 3041 * have to change the bit. 3042 */ 3043 if (new_ext != ext) 3044 return -EINVAL; 3045 3046 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK; 3047 if (!old_rt_phys) 3048 return -EINVAL; 3049 3050 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB); 3051 if (!old_rt) 3052 return -ENOMEM; 3053 3054 /* This is too big for the stack - allocate it from slab */ 3055 ctxt_table_entries = ext ? 
512 : 256; 3056 ret = -ENOMEM; 3057 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL); 3058 if (!ctxt_tbls) 3059 goto out_unmap; 3060 3061 for (bus = 0; bus < 256; bus++) { 3062 ret = copy_context_table(iommu, &old_rt[bus], 3063 ctxt_tbls, bus, ext); 3064 if (ret) { 3065 pr_err("%s: Failed to copy context table for bus %d\n", 3066 iommu->name, bus); 3067 continue; 3068 } 3069 } 3070 3071 spin_lock_irqsave(&iommu->lock, flags); 3072 3073 /* Context tables are copied, now write them to the root_entry table */ 3074 for (bus = 0; bus < 256; bus++) { 3075 int idx = ext ? bus * 2 : bus; 3076 u64 val; 3077 3078 if (ctxt_tbls[idx]) { 3079 val = virt_to_phys(ctxt_tbls[idx]) | 1; 3080 iommu->root_entry[bus].lo = val; 3081 } 3082 3083 if (!ext || !ctxt_tbls[idx + 1]) 3084 continue; 3085 3086 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1; 3087 iommu->root_entry[bus].hi = val; 3088 } 3089 3090 spin_unlock_irqrestore(&iommu->lock, flags); 3091 3092 kfree(ctxt_tbls); 3093 3094 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE); 3095 3096 ret = 0; 3097 3098 out_unmap: 3099 memunmap(old_rt); 3100 3101 return ret; 3102 } 3103 3104 #ifdef CONFIG_INTEL_IOMMU_SVM 3105 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data) 3106 { 3107 struct intel_iommu *iommu = data; 3108 ioasid_t ioasid; 3109 3110 if (!iommu) 3111 return INVALID_IOASID; 3112 /* 3113 * VT-d virtual command interface always uses the full 20 bit 3114 * PASID range. Host can partition guest PASID range based on 3115 * policies but it is out of guest's control. 3116 */ 3117 if (min < PASID_MIN || max > intel_pasid_max_id) 3118 return INVALID_IOASID; 3119 3120 if (vcmd_alloc_pasid(iommu, &ioasid)) 3121 return INVALID_IOASID; 3122 3123 return ioasid; 3124 } 3125 3126 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data) 3127 { 3128 struct intel_iommu *iommu = data; 3129 3130 if (!iommu) 3131 return; 3132 /* 3133 * Sanity check the ioasid owner is done at upper layer, e.g. VFIO 3134 * We can only free the PASID when all the devices are unbound. 3135 */ 3136 if (ioasid_find(NULL, ioasid, NULL)) { 3137 pr_alert("Cannot free active IOASID %d\n", ioasid); 3138 return; 3139 } 3140 vcmd_free_pasid(iommu, ioasid); 3141 } 3142 3143 static void register_pasid_allocator(struct intel_iommu *iommu) 3144 { 3145 /* 3146 * If we are running in the host, no need for custom allocator 3147 * in that PASIDs are allocated from the host system-wide. 3148 */ 3149 if (!cap_caching_mode(iommu->cap)) 3150 return; 3151 3152 if (!sm_supported(iommu)) { 3153 pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n"); 3154 return; 3155 } 3156 3157 /* 3158 * Register a custom PASID allocator if we are running in a guest, 3159 * guest PASID must be obtained via virtual command interface. 3160 * There can be multiple vIOMMUs in each guest but only one allocator 3161 * is active. All vIOMMU allocators will eventually be calling the same 3162 * host allocator. 3163 */ 3164 if (!vccap_pasid(iommu->vccap)) 3165 return; 3166 3167 pr_info("Register custom PASID allocator\n"); 3168 iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc; 3169 iommu->pasid_allocator.free = intel_vcmd_ioasid_free; 3170 iommu->pasid_allocator.pdata = (void *)iommu; 3171 if (ioasid_register_allocator(&iommu->pasid_allocator)) { 3172 pr_warn("Custom PASID allocator failed, scalable mode disabled\n"); 3173 /* 3174 * Disable scalable mode on this IOMMU if there 3175 * is no custom allocator. 
Mixing SM capable vIOMMU 3176 * and non-SM vIOMMU are not supported. 3177 */ 3178 intel_iommu_sm = 0; 3179 } 3180 } 3181 #endif 3182 3183 static int __init init_dmars(void) 3184 { 3185 struct dmar_drhd_unit *drhd; 3186 struct intel_iommu *iommu; 3187 int ret; 3188 3189 /* 3190 * for each drhd 3191 * allocate root 3192 * initialize and program root entry to not present 3193 * endfor 3194 */ 3195 for_each_drhd_unit(drhd) { 3196 /* 3197 * lock not needed as this is only incremented in the single 3198 * threaded kernel __init code path all other access are read 3199 * only 3200 */ 3201 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) { 3202 g_num_of_iommus++; 3203 continue; 3204 } 3205 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED); 3206 } 3207 3208 /* Preallocate enough resources for IOMMU hot-addition */ 3209 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) 3210 g_num_of_iommus = DMAR_UNITS_SUPPORTED; 3211 3212 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *), 3213 GFP_KERNEL); 3214 if (!g_iommus) { 3215 ret = -ENOMEM; 3216 goto error; 3217 } 3218 3219 ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL); 3220 if (ret) 3221 goto free_iommu; 3222 3223 for_each_iommu(iommu, drhd) { 3224 if (drhd->ignored) { 3225 iommu_disable_translation(iommu); 3226 continue; 3227 } 3228 3229 /* 3230 * Find the max pasid size of all IOMMU's in the system. 3231 * We need to ensure the system pasid table is no bigger 3232 * than the smallest supported. 3233 */ 3234 if (pasid_supported(iommu)) { 3235 u32 temp = 2 << ecap_pss(iommu->ecap); 3236 3237 intel_pasid_max_id = min_t(u32, temp, 3238 intel_pasid_max_id); 3239 } 3240 3241 g_iommus[iommu->seq_id] = iommu; 3242 3243 intel_iommu_init_qi(iommu); 3244 3245 ret = iommu_init_domains(iommu); 3246 if (ret) 3247 goto free_iommu; 3248 3249 init_translation_status(iommu); 3250 3251 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) { 3252 iommu_disable_translation(iommu); 3253 clear_translation_pre_enabled(iommu); 3254 pr_warn("Translation was enabled for %s but we are not in kdump mode\n", 3255 iommu->name); 3256 } 3257 3258 /* 3259 * TBD: 3260 * we could share the same root & context tables 3261 * among all IOMMU's. Need to Split it later. 3262 */ 3263 ret = iommu_alloc_root_entry(iommu); 3264 if (ret) 3265 goto free_iommu; 3266 3267 if (translation_pre_enabled(iommu)) { 3268 pr_info("Translation already enabled - trying to copy translation structures\n"); 3269 3270 ret = copy_translation_tables(iommu); 3271 if (ret) { 3272 /* 3273 * We found the IOMMU with translation 3274 * enabled - but failed to copy over the 3275 * old root-entry table. Try to proceed 3276 * by disabling translation now and 3277 * allocating a clean root-entry table. 3278 * This might cause DMAR faults, but 3279 * probably the dump will still succeed. 3280 */ 3281 pr_err("Failed to copy translation tables from previous kernel for %s\n", 3282 iommu->name); 3283 iommu_disable_translation(iommu); 3284 clear_translation_pre_enabled(iommu); 3285 } else { 3286 pr_info("Copied translation tables from previous kernel for %s\n", 3287 iommu->name); 3288 } 3289 } 3290 3291 if (!ecap_pass_through(iommu->ecap)) 3292 hw_pass_through = 0; 3293 intel_svm_check(iommu); 3294 } 3295 3296 /* 3297 * Now that qi is enabled on all iommus, set the root entry and flush 3298 * caches. This is required on some Intel X58 chipsets, otherwise the 3299 * flush_context function will loop forever and the boot hangs. 
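	 * (Hence the loop below issues iommu_flush_write_buffer() and
	 * iommu_set_root_entry() for every active IOMMU before translation
	 * is enabled anywhere.)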
3300 */ 3301 for_each_active_iommu(iommu, drhd) { 3302 iommu_flush_write_buffer(iommu); 3303 #ifdef CONFIG_INTEL_IOMMU_SVM 3304 register_pasid_allocator(iommu); 3305 #endif 3306 iommu_set_root_entry(iommu); 3307 } 3308 3309 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA 3310 dmar_map_gfx = 0; 3311 #endif 3312 3313 if (!dmar_map_gfx) 3314 iommu_identity_mapping |= IDENTMAP_GFX; 3315 3316 check_tylersburg_isoch(); 3317 3318 ret = si_domain_init(hw_pass_through); 3319 if (ret) 3320 goto free_iommu; 3321 3322 /* 3323 * for each drhd 3324 * enable fault log 3325 * global invalidate context cache 3326 * global invalidate iotlb 3327 * enable translation 3328 */ 3329 for_each_iommu(iommu, drhd) { 3330 if (drhd->ignored) { 3331 /* 3332 * we always have to disable PMRs or DMA may fail on 3333 * this device 3334 */ 3335 if (force_on) 3336 iommu_disable_protect_mem_regions(iommu); 3337 continue; 3338 } 3339 3340 iommu_flush_write_buffer(iommu); 3341 3342 #ifdef CONFIG_INTEL_IOMMU_SVM 3343 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) { 3344 /* 3345 * Call dmar_alloc_hwirq() with dmar_global_lock held, 3346 * could cause possible lock race condition. 3347 */ 3348 up_write(&dmar_global_lock); 3349 ret = intel_svm_enable_prq(iommu); 3350 down_write(&dmar_global_lock); 3351 if (ret) 3352 goto free_iommu; 3353 } 3354 #endif 3355 ret = dmar_set_interrupt(iommu); 3356 if (ret) 3357 goto free_iommu; 3358 } 3359 3360 return 0; 3361 3362 free_iommu: 3363 for_each_active_iommu(iommu, drhd) { 3364 disable_dmar_iommu(iommu); 3365 free_dmar_iommu(iommu); 3366 } 3367 3368 kfree(g_iommus); 3369 3370 error: 3371 return ret; 3372 } 3373 3374 static inline int iommu_domain_cache_init(void) 3375 { 3376 int ret = 0; 3377 3378 iommu_domain_cache = kmem_cache_create("iommu_domain", 3379 sizeof(struct dmar_domain), 3380 0, 3381 SLAB_HWCACHE_ALIGN, 3382 3383 NULL); 3384 if (!iommu_domain_cache) { 3385 pr_err("Couldn't create iommu_domain cache\n"); 3386 ret = -ENOMEM; 3387 } 3388 3389 return ret; 3390 } 3391 3392 static inline int iommu_devinfo_cache_init(void) 3393 { 3394 int ret = 0; 3395 3396 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo", 3397 sizeof(struct device_domain_info), 3398 0, 3399 SLAB_HWCACHE_ALIGN, 3400 NULL); 3401 if (!iommu_devinfo_cache) { 3402 pr_err("Couldn't create devinfo cache\n"); 3403 ret = -ENOMEM; 3404 } 3405 3406 return ret; 3407 } 3408 3409 static int __init iommu_init_mempool(void) 3410 { 3411 int ret; 3412 ret = iova_cache_get(); 3413 if (ret) 3414 return ret; 3415 3416 ret = iommu_domain_cache_init(); 3417 if (ret) 3418 goto domain_error; 3419 3420 ret = iommu_devinfo_cache_init(); 3421 if (!ret) 3422 return ret; 3423 3424 kmem_cache_destroy(iommu_domain_cache); 3425 domain_error: 3426 iova_cache_put(); 3427 3428 return -ENOMEM; 3429 } 3430 3431 static void __init iommu_exit_mempool(void) 3432 { 3433 kmem_cache_destroy(iommu_devinfo_cache); 3434 kmem_cache_destroy(iommu_domain_cache); 3435 iova_cache_put(); 3436 } 3437 3438 static void __init init_no_remapping_devices(void) 3439 { 3440 struct dmar_drhd_unit *drhd; 3441 struct device *dev; 3442 int i; 3443 3444 for_each_drhd_unit(drhd) { 3445 if (!drhd->include_all) { 3446 for_each_active_dev_scope(drhd->devices, 3447 drhd->devices_cnt, i, dev) 3448 break; 3449 /* ignore DMAR unit if no devices exist */ 3450 if (i == drhd->devices_cnt) 3451 drhd->ignored = 1; 3452 } 3453 } 3454 3455 for_each_active_drhd_unit(drhd) { 3456 if (drhd->include_all) 3457 continue; 3458 3459 for_each_active_dev_scope(drhd->devices, 3460 drhd->devices_cnt, i, 
dev) 3461 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev))) 3462 break; 3463 if (i < drhd->devices_cnt) 3464 continue; 3465 3466 /* This IOMMU has *only* gfx devices. Either bypass it or 3467 set the gfx_mapped flag, as appropriate */ 3468 drhd->gfx_dedicated = 1; 3469 if (!dmar_map_gfx) 3470 drhd->ignored = 1; 3471 } 3472 } 3473 3474 #ifdef CONFIG_SUSPEND 3475 static int init_iommu_hw(void) 3476 { 3477 struct dmar_drhd_unit *drhd; 3478 struct intel_iommu *iommu = NULL; 3479 3480 for_each_active_iommu(iommu, drhd) 3481 if (iommu->qi) 3482 dmar_reenable_qi(iommu); 3483 3484 for_each_iommu(iommu, drhd) { 3485 if (drhd->ignored) { 3486 /* 3487 * we always have to disable PMRs or DMA may fail on 3488 * this device 3489 */ 3490 if (force_on) 3491 iommu_disable_protect_mem_regions(iommu); 3492 continue; 3493 } 3494 3495 iommu_flush_write_buffer(iommu); 3496 iommu_set_root_entry(iommu); 3497 iommu_enable_translation(iommu); 3498 iommu_disable_protect_mem_regions(iommu); 3499 } 3500 3501 return 0; 3502 } 3503 3504 static void iommu_flush_all(void) 3505 { 3506 struct dmar_drhd_unit *drhd; 3507 struct intel_iommu *iommu; 3508 3509 for_each_active_iommu(iommu, drhd) { 3510 iommu->flush.flush_context(iommu, 0, 0, 0, 3511 DMA_CCMD_GLOBAL_INVL); 3512 iommu->flush.flush_iotlb(iommu, 0, 0, 0, 3513 DMA_TLB_GLOBAL_FLUSH); 3514 } 3515 } 3516 3517 static int iommu_suspend(void) 3518 { 3519 struct dmar_drhd_unit *drhd; 3520 struct intel_iommu *iommu = NULL; 3521 unsigned long flag; 3522 3523 for_each_active_iommu(iommu, drhd) { 3524 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32), 3525 GFP_KERNEL); 3526 if (!iommu->iommu_state) 3527 goto nomem; 3528 } 3529 3530 iommu_flush_all(); 3531 3532 for_each_active_iommu(iommu, drhd) { 3533 iommu_disable_translation(iommu); 3534 3535 raw_spin_lock_irqsave(&iommu->register_lock, flag); 3536 3537 iommu->iommu_state[SR_DMAR_FECTL_REG] = 3538 readl(iommu->reg + DMAR_FECTL_REG); 3539 iommu->iommu_state[SR_DMAR_FEDATA_REG] = 3540 readl(iommu->reg + DMAR_FEDATA_REG); 3541 iommu->iommu_state[SR_DMAR_FEADDR_REG] = 3542 readl(iommu->reg + DMAR_FEADDR_REG); 3543 iommu->iommu_state[SR_DMAR_FEUADDR_REG] = 3544 readl(iommu->reg + DMAR_FEUADDR_REG); 3545 3546 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 3547 } 3548 return 0; 3549 3550 nomem: 3551 for_each_active_iommu(iommu, drhd) 3552 kfree(iommu->iommu_state); 3553 3554 return -ENOMEM; 3555 } 3556 3557 static void iommu_resume(void) 3558 { 3559 struct dmar_drhd_unit *drhd; 3560 struct intel_iommu *iommu = NULL; 3561 unsigned long flag; 3562 3563 if (init_iommu_hw()) { 3564 if (force_on) 3565 panic("tboot: IOMMU setup failed, DMAR can not resume!\n"); 3566 else 3567 WARN(1, "IOMMU setup failed, DMAR can not resume!\n"); 3568 return; 3569 } 3570 3571 for_each_active_iommu(iommu, drhd) { 3572 3573 raw_spin_lock_irqsave(&iommu->register_lock, flag); 3574 3575 writel(iommu->iommu_state[SR_DMAR_FECTL_REG], 3576 iommu->reg + DMAR_FECTL_REG); 3577 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG], 3578 iommu->reg + DMAR_FEDATA_REG); 3579 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG], 3580 iommu->reg + DMAR_FEADDR_REG); 3581 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG], 3582 iommu->reg + DMAR_FEUADDR_REG); 3583 3584 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 3585 } 3586 3587 for_each_active_iommu(iommu, drhd) 3588 kfree(iommu->iommu_state); 3589 } 3590 3591 static struct syscore_ops iommu_syscore_ops = { 3592 .resume = iommu_resume, 3593 .suspend = iommu_suspend, 3594 }; 3595 3596 static void __init 
init_iommu_pm_ops(void) 3597 { 3598 register_syscore_ops(&iommu_syscore_ops); 3599 } 3600 3601 #else 3602 static inline void init_iommu_pm_ops(void) {} 3603 #endif /* CONFIG_PM */ 3604 3605 static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr) 3606 { 3607 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) || 3608 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) || 3609 rmrr->end_address <= rmrr->base_address || 3610 arch_rmrr_sanity_check(rmrr)) 3611 return -EINVAL; 3612 3613 return 0; 3614 } 3615 3616 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg) 3617 { 3618 struct acpi_dmar_reserved_memory *rmrr; 3619 struct dmar_rmrr_unit *rmrru; 3620 3621 rmrr = (struct acpi_dmar_reserved_memory *)header; 3622 if (rmrr_sanity_check(rmrr)) { 3623 pr_warn(FW_BUG 3624 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n" 3625 "BIOS vendor: %s; Ver: %s; Product Version: %s\n", 3626 rmrr->base_address, rmrr->end_address, 3627 dmi_get_system_info(DMI_BIOS_VENDOR), 3628 dmi_get_system_info(DMI_BIOS_VERSION), 3629 dmi_get_system_info(DMI_PRODUCT_VERSION)); 3630 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); 3631 } 3632 3633 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL); 3634 if (!rmrru) 3635 goto out; 3636 3637 rmrru->hdr = header; 3638 3639 rmrru->base_address = rmrr->base_address; 3640 rmrru->end_address = rmrr->end_address; 3641 3642 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1), 3643 ((void *)rmrr) + rmrr->header.length, 3644 &rmrru->devices_cnt); 3645 if (rmrru->devices_cnt && rmrru->devices == NULL) 3646 goto free_rmrru; 3647 3648 list_add(&rmrru->list, &dmar_rmrr_units); 3649 3650 return 0; 3651 free_rmrru: 3652 kfree(rmrru); 3653 out: 3654 return -ENOMEM; 3655 } 3656 3657 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr) 3658 { 3659 struct dmar_atsr_unit *atsru; 3660 struct acpi_dmar_atsr *tmp; 3661 3662 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list, 3663 dmar_rcu_check()) { 3664 tmp = (struct acpi_dmar_atsr *)atsru->hdr; 3665 if (atsr->segment != tmp->segment) 3666 continue; 3667 if (atsr->header.length != tmp->header.length) 3668 continue; 3669 if (memcmp(atsr, tmp, atsr->header.length) == 0) 3670 return atsru; 3671 } 3672 3673 return NULL; 3674 } 3675 3676 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg) 3677 { 3678 struct acpi_dmar_atsr *atsr; 3679 struct dmar_atsr_unit *atsru; 3680 3681 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled) 3682 return 0; 3683 3684 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 3685 atsru = dmar_find_atsr(atsr); 3686 if (atsru) 3687 return 0; 3688 3689 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL); 3690 if (!atsru) 3691 return -ENOMEM; 3692 3693 /* 3694 * If memory is allocated from slab by ACPI _DSM method, we need to 3695 * copy the memory content because the memory buffer will be freed 3696 * on return. 
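	 * The copy is placed in the tail of the atsru allocation (at
	 * atsru + 1), which was sized as sizeof(*atsru) + hdr->length
	 * above, so the header lives exactly as long as the atsru itself.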
3697 */ 3698 atsru->hdr = (void *)(atsru + 1); 3699 memcpy(atsru->hdr, hdr, hdr->length); 3700 atsru->include_all = atsr->flags & 0x1; 3701 if (!atsru->include_all) { 3702 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1), 3703 (void *)atsr + atsr->header.length, 3704 &atsru->devices_cnt); 3705 if (atsru->devices_cnt && atsru->devices == NULL) { 3706 kfree(atsru); 3707 return -ENOMEM; 3708 } 3709 } 3710 3711 list_add_rcu(&atsru->list, &dmar_atsr_units); 3712 3713 return 0; 3714 } 3715 3716 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru) 3717 { 3718 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt); 3719 kfree(atsru); 3720 } 3721 3722 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg) 3723 { 3724 struct acpi_dmar_atsr *atsr; 3725 struct dmar_atsr_unit *atsru; 3726 3727 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 3728 atsru = dmar_find_atsr(atsr); 3729 if (atsru) { 3730 list_del_rcu(&atsru->list); 3731 synchronize_rcu(); 3732 intel_iommu_free_atsr(atsru); 3733 } 3734 3735 return 0; 3736 } 3737 3738 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg) 3739 { 3740 int i; 3741 struct device *dev; 3742 struct acpi_dmar_atsr *atsr; 3743 struct dmar_atsr_unit *atsru; 3744 3745 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 3746 atsru = dmar_find_atsr(atsr); 3747 if (!atsru) 3748 return 0; 3749 3750 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) { 3751 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt, 3752 i, dev) 3753 return -EBUSY; 3754 } 3755 3756 return 0; 3757 } 3758 3759 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc) 3760 { 3761 struct dmar_satc_unit *satcu; 3762 struct acpi_dmar_satc *tmp; 3763 3764 list_for_each_entry_rcu(satcu, &dmar_satc_units, list, 3765 dmar_rcu_check()) { 3766 tmp = (struct acpi_dmar_satc *)satcu->hdr; 3767 if (satc->segment != tmp->segment) 3768 continue; 3769 if (satc->header.length != tmp->header.length) 3770 continue; 3771 if (memcmp(satc, tmp, satc->header.length) == 0) 3772 return satcu; 3773 } 3774 3775 return NULL; 3776 } 3777 3778 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg) 3779 { 3780 struct acpi_dmar_satc *satc; 3781 struct dmar_satc_unit *satcu; 3782 3783 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled) 3784 return 0; 3785 3786 satc = container_of(hdr, struct acpi_dmar_satc, header); 3787 satcu = dmar_find_satc(satc); 3788 if (satcu) 3789 return 0; 3790 3791 satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL); 3792 if (!satcu) 3793 return -ENOMEM; 3794 3795 satcu->hdr = (void *)(satcu + 1); 3796 memcpy(satcu->hdr, hdr, hdr->length); 3797 satcu->atc_required = satc->flags & 0x1; 3798 satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1), 3799 (void *)satc + satc->header.length, 3800 &satcu->devices_cnt); 3801 if (satcu->devices_cnt && !satcu->devices) { 3802 kfree(satcu); 3803 return -ENOMEM; 3804 } 3805 list_add_rcu(&satcu->list, &dmar_satc_units); 3806 3807 return 0; 3808 } 3809 3810 static int intel_iommu_add(struct dmar_drhd_unit *dmaru) 3811 { 3812 int sp, ret; 3813 struct intel_iommu *iommu = dmaru->iommu; 3814 3815 if (g_iommus[iommu->seq_id]) 3816 return 0; 3817 3818 ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu); 3819 if (ret) 3820 goto out; 3821 3822 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) { 3823 pr_warn("%s: Doesn't support hardware pass through.\n", 3824 iommu->name); 3825 return -ENXIO; 3826 } 3827 if (!ecap_sc_support(iommu->ecap) && 3828 
domain_update_iommu_snooping(iommu)) { 3829 pr_warn("%s: Doesn't support snooping.\n", 3830 iommu->name); 3831 return -ENXIO; 3832 } 3833 sp = domain_update_iommu_superpage(NULL, iommu) - 1; 3834 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) { 3835 pr_warn("%s: Doesn't support large page.\n", 3836 iommu->name); 3837 return -ENXIO; 3838 } 3839 3840 /* 3841 * Disable translation if already enabled prior to OS handover. 3842 */ 3843 if (iommu->gcmd & DMA_GCMD_TE) 3844 iommu_disable_translation(iommu); 3845 3846 g_iommus[iommu->seq_id] = iommu; 3847 ret = iommu_init_domains(iommu); 3848 if (ret == 0) 3849 ret = iommu_alloc_root_entry(iommu); 3850 if (ret) 3851 goto out; 3852 3853 intel_svm_check(iommu); 3854 3855 if (dmaru->ignored) { 3856 /* 3857 * we always have to disable PMRs or DMA may fail on this device 3858 */ 3859 if (force_on) 3860 iommu_disable_protect_mem_regions(iommu); 3861 return 0; 3862 } 3863 3864 intel_iommu_init_qi(iommu); 3865 iommu_flush_write_buffer(iommu); 3866 3867 #ifdef CONFIG_INTEL_IOMMU_SVM 3868 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) { 3869 ret = intel_svm_enable_prq(iommu); 3870 if (ret) 3871 goto disable_iommu; 3872 } 3873 #endif 3874 ret = dmar_set_interrupt(iommu); 3875 if (ret) 3876 goto disable_iommu; 3877 3878 iommu_set_root_entry(iommu); 3879 iommu_enable_translation(iommu); 3880 3881 iommu_disable_protect_mem_regions(iommu); 3882 return 0; 3883 3884 disable_iommu: 3885 disable_dmar_iommu(iommu); 3886 out: 3887 free_dmar_iommu(iommu); 3888 return ret; 3889 } 3890 3891 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert) 3892 { 3893 int ret = 0; 3894 struct intel_iommu *iommu = dmaru->iommu; 3895 3896 if (!intel_iommu_enabled) 3897 return 0; 3898 if (iommu == NULL) 3899 return -EINVAL; 3900 3901 if (insert) { 3902 ret = intel_iommu_add(dmaru); 3903 } else { 3904 disable_dmar_iommu(iommu); 3905 free_dmar_iommu(iommu); 3906 } 3907 3908 return ret; 3909 } 3910 3911 static void intel_iommu_free_dmars(void) 3912 { 3913 struct dmar_rmrr_unit *rmrru, *rmrr_n; 3914 struct dmar_atsr_unit *atsru, *atsr_n; 3915 struct dmar_satc_unit *satcu, *satc_n; 3916 3917 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) { 3918 list_del(&rmrru->list); 3919 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt); 3920 kfree(rmrru); 3921 } 3922 3923 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) { 3924 list_del(&atsru->list); 3925 intel_iommu_free_atsr(atsru); 3926 } 3927 list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) { 3928 list_del(&satcu->list); 3929 dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt); 3930 kfree(satcu); 3931 } 3932 } 3933 3934 int dmar_find_matched_atsr_unit(struct pci_dev *dev) 3935 { 3936 int i, ret = 1; 3937 struct pci_bus *bus; 3938 struct pci_dev *bridge = NULL; 3939 struct device *tmp; 3940 struct acpi_dmar_atsr *atsr; 3941 struct dmar_atsr_unit *atsru; 3942 3943 dev = pci_physfn(dev); 3944 for (bus = dev->bus; bus; bus = bus->parent) { 3945 bridge = bus->self; 3946 /* If it's an integrated device, allow ATS */ 3947 if (!bridge) 3948 return 1; 3949 /* Connected via non-PCIe: no ATS */ 3950 if (!pci_is_pcie(bridge) || 3951 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE) 3952 return 0; 3953 /* If we found the root port, look it up in the ATSR */ 3954 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT) 3955 break; 3956 } 3957 3958 rcu_read_lock(); 3959 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) { 3960 atsr = container_of(atsru->hdr, struct 
acpi_dmar_atsr, header); 3961 if (atsr->segment != pci_domain_nr(dev->bus)) 3962 continue; 3963 3964 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp) 3965 if (tmp == &bridge->dev) 3966 goto out; 3967 3968 if (atsru->include_all) 3969 goto out; 3970 } 3971 ret = 0; 3972 out: 3973 rcu_read_unlock(); 3974 3975 return ret; 3976 } 3977 3978 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info) 3979 { 3980 int ret; 3981 struct dmar_rmrr_unit *rmrru; 3982 struct dmar_atsr_unit *atsru; 3983 struct dmar_satc_unit *satcu; 3984 struct acpi_dmar_atsr *atsr; 3985 struct acpi_dmar_reserved_memory *rmrr; 3986 struct acpi_dmar_satc *satc; 3987 3988 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING) 3989 return 0; 3990 3991 list_for_each_entry(rmrru, &dmar_rmrr_units, list) { 3992 rmrr = container_of(rmrru->hdr, 3993 struct acpi_dmar_reserved_memory, header); 3994 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 3995 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1), 3996 ((void *)rmrr) + rmrr->header.length, 3997 rmrr->segment, rmrru->devices, 3998 rmrru->devices_cnt); 3999 if (ret < 0) 4000 return ret; 4001 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 4002 dmar_remove_dev_scope(info, rmrr->segment, 4003 rmrru->devices, rmrru->devices_cnt); 4004 } 4005 } 4006 4007 list_for_each_entry(atsru, &dmar_atsr_units, list) { 4008 if (atsru->include_all) 4009 continue; 4010 4011 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header); 4012 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 4013 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1), 4014 (void *)atsr + atsr->header.length, 4015 atsr->segment, atsru->devices, 4016 atsru->devices_cnt); 4017 if (ret > 0) 4018 break; 4019 else if (ret < 0) 4020 return ret; 4021 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 4022 if (dmar_remove_dev_scope(info, atsr->segment, 4023 atsru->devices, atsru->devices_cnt)) 4024 break; 4025 } 4026 } 4027 list_for_each_entry(satcu, &dmar_satc_units, list) { 4028 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header); 4029 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 4030 ret = dmar_insert_dev_scope(info, (void *)(satc + 1), 4031 (void *)satc + satc->header.length, 4032 satc->segment, satcu->devices, 4033 satcu->devices_cnt); 4034 if (ret > 0) 4035 break; 4036 else if (ret < 0) 4037 return ret; 4038 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 4039 if (dmar_remove_dev_scope(info, satc->segment, 4040 satcu->devices, satcu->devices_cnt)) 4041 break; 4042 } 4043 } 4044 4045 return 0; 4046 } 4047 4048 static int intel_iommu_memory_notifier(struct notifier_block *nb, 4049 unsigned long val, void *v) 4050 { 4051 struct memory_notify *mhp = v; 4052 unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn); 4053 unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn + 4054 mhp->nr_pages - 1); 4055 4056 switch (val) { 4057 case MEM_GOING_ONLINE: 4058 if (iommu_domain_identity_map(si_domain, 4059 start_vpfn, last_vpfn)) { 4060 pr_warn("Failed to build identity map for [%lx-%lx]\n", 4061 start_vpfn, last_vpfn); 4062 return NOTIFY_BAD; 4063 } 4064 break; 4065 4066 case MEM_OFFLINE: 4067 case MEM_CANCEL_ONLINE: 4068 { 4069 struct dmar_drhd_unit *drhd; 4070 struct intel_iommu *iommu; 4071 struct page *freelist; 4072 4073 freelist = domain_unmap(si_domain, 4074 start_vpfn, last_vpfn, 4075 NULL); 4076 4077 rcu_read_lock(); 4078 for_each_active_iommu(iommu, drhd) 4079 iommu_flush_iotlb_psi(iommu, si_domain, 4080 start_vpfn, mhp->nr_pages, 4081 !freelist, 0); 4082 
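			/*
			 * The page-table pages collected in 'freelist' by
			 * domain_unmap() are only handed back to the
			 * allocator below, after every active IOMMU has
			 * flushed its IOTLB for the offlined range.
			 */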
rcu_read_unlock(); 4083 dma_free_pagelist(freelist); 4084 } 4085 break; 4086 } 4087 4088 return NOTIFY_OK; 4089 } 4090 4091 static struct notifier_block intel_iommu_memory_nb = { 4092 .notifier_call = intel_iommu_memory_notifier, 4093 .priority = 0 4094 }; 4095 4096 static void intel_disable_iommus(void) 4097 { 4098 struct intel_iommu *iommu = NULL; 4099 struct dmar_drhd_unit *drhd; 4100 4101 for_each_iommu(iommu, drhd) 4102 iommu_disable_translation(iommu); 4103 } 4104 4105 void intel_iommu_shutdown(void) 4106 { 4107 struct dmar_drhd_unit *drhd; 4108 struct intel_iommu *iommu = NULL; 4109 4110 if (no_iommu || dmar_disabled) 4111 return; 4112 4113 down_write(&dmar_global_lock); 4114 4115 /* Disable PMRs explicitly here. */ 4116 for_each_iommu(iommu, drhd) 4117 iommu_disable_protect_mem_regions(iommu); 4118 4119 /* Make sure the IOMMUs are switched off */ 4120 intel_disable_iommus(); 4121 4122 up_write(&dmar_global_lock); 4123 } 4124 4125 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev) 4126 { 4127 struct iommu_device *iommu_dev = dev_to_iommu_device(dev); 4128 4129 return container_of(iommu_dev, struct intel_iommu, iommu); 4130 } 4131 4132 static ssize_t version_show(struct device *dev, 4133 struct device_attribute *attr, char *buf) 4134 { 4135 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 4136 u32 ver = readl(iommu->reg + DMAR_VER_REG); 4137 return sprintf(buf, "%d:%d\n", 4138 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver)); 4139 } 4140 static DEVICE_ATTR_RO(version); 4141 4142 static ssize_t address_show(struct device *dev, 4143 struct device_attribute *attr, char *buf) 4144 { 4145 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 4146 return sprintf(buf, "%llx\n", iommu->reg_phys); 4147 } 4148 static DEVICE_ATTR_RO(address); 4149 4150 static ssize_t cap_show(struct device *dev, 4151 struct device_attribute *attr, char *buf) 4152 { 4153 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 4154 return sprintf(buf, "%llx\n", iommu->cap); 4155 } 4156 static DEVICE_ATTR_RO(cap); 4157 4158 static ssize_t ecap_show(struct device *dev, 4159 struct device_attribute *attr, char *buf) 4160 { 4161 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 4162 return sprintf(buf, "%llx\n", iommu->ecap); 4163 } 4164 static DEVICE_ATTR_RO(ecap); 4165 4166 static ssize_t domains_supported_show(struct device *dev, 4167 struct device_attribute *attr, char *buf) 4168 { 4169 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 4170 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap)); 4171 } 4172 static DEVICE_ATTR_RO(domains_supported); 4173 4174 static ssize_t domains_used_show(struct device *dev, 4175 struct device_attribute *attr, char *buf) 4176 { 4177 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 4178 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids, 4179 cap_ndoms(iommu->cap))); 4180 } 4181 static DEVICE_ATTR_RO(domains_used); 4182 4183 static struct attribute *intel_iommu_attrs[] = { 4184 &dev_attr_version.attr, 4185 &dev_attr_address.attr, 4186 &dev_attr_cap.attr, 4187 &dev_attr_ecap.attr, 4188 &dev_attr_domains_supported.attr, 4189 &dev_attr_domains_used.attr, 4190 NULL, 4191 }; 4192 4193 static struct attribute_group intel_iommu_group = { 4194 .name = "intel-iommu", 4195 .attrs = intel_iommu_attrs, 4196 }; 4197 4198 const struct attribute_group *intel_iommu_groups[] = { 4199 &intel_iommu_group, 4200 NULL, 4201 }; 4202 4203 static inline bool has_external_pci(void) 4204 { 4205 struct pci_dev *pdev = NULL; 4206 4207 for_each_pci_dev(pdev) 4208 if 
(pdev->external_facing) 4209 return true; 4210 4211 return false; 4212 } 4213 4214 static int __init platform_optin_force_iommu(void) 4215 { 4216 if (!dmar_platform_optin() || no_platform_optin || !has_external_pci()) 4217 return 0; 4218 4219 if (no_iommu || dmar_disabled) 4220 pr_info("Intel-IOMMU force enabled due to platform opt in\n"); 4221 4222 /* 4223 * If Intel-IOMMU is disabled by default, we will apply identity 4224 * map for all devices except those marked as being untrusted. 4225 */ 4226 if (dmar_disabled) 4227 iommu_set_default_passthrough(false); 4228 4229 dmar_disabled = 0; 4230 no_iommu = 0; 4231 4232 return 1; 4233 } 4234 4235 static int __init probe_acpi_namespace_devices(void) 4236 { 4237 struct dmar_drhd_unit *drhd; 4238 /* To avoid a -Wunused-but-set-variable warning. */ 4239 struct intel_iommu *iommu __maybe_unused; 4240 struct device *dev; 4241 int i, ret = 0; 4242 4243 for_each_active_iommu(iommu, drhd) { 4244 for_each_active_dev_scope(drhd->devices, 4245 drhd->devices_cnt, i, dev) { 4246 struct acpi_device_physical_node *pn; 4247 struct iommu_group *group; 4248 struct acpi_device *adev; 4249 4250 if (dev->bus != &acpi_bus_type) 4251 continue; 4252 4253 adev = to_acpi_device(dev); 4254 mutex_lock(&adev->physical_node_lock); 4255 list_for_each_entry(pn, 4256 &adev->physical_node_list, node) { 4257 group = iommu_group_get(pn->dev); 4258 if (group) { 4259 iommu_group_put(group); 4260 continue; 4261 } 4262 4263 pn->dev->bus->iommu_ops = &intel_iommu_ops; 4264 ret = iommu_probe_device(pn->dev); 4265 if (ret) 4266 break; 4267 } 4268 mutex_unlock(&adev->physical_node_lock); 4269 4270 if (ret) 4271 return ret; 4272 } 4273 } 4274 4275 return 0; 4276 } 4277 4278 int __init intel_iommu_init(void) 4279 { 4280 int ret = -ENODEV; 4281 struct dmar_drhd_unit *drhd; 4282 struct intel_iommu *iommu; 4283 4284 /* 4285 * Intel IOMMU is required for a TXT/tboot launch or platform 4286 * opt in, so enforce that. 4287 */ 4288 force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) || 4289 platform_optin_force_iommu(); 4290 4291 if (iommu_init_mempool()) { 4292 if (force_on) 4293 panic("tboot: Failed to initialize iommu memory\n"); 4294 return -ENOMEM; 4295 } 4296 4297 down_write(&dmar_global_lock); 4298 if (dmar_table_init()) { 4299 if (force_on) 4300 panic("tboot: Failed to initialize DMAR table\n"); 4301 goto out_free_dmar; 4302 } 4303 4304 if (dmar_dev_scope_init() < 0) { 4305 if (force_on) 4306 panic("tboot: Failed to initialize DMAR device scope\n"); 4307 goto out_free_dmar; 4308 } 4309 4310 up_write(&dmar_global_lock); 4311 4312 /* 4313 * The bus notifier takes the dmar_global_lock, so lockdep will 4314 * complain later when we register it under the lock. 4315 */ 4316 dmar_register_bus_notifier(); 4317 4318 down_write(&dmar_global_lock); 4319 4320 if (!no_iommu) 4321 intel_iommu_debugfs_init(); 4322 4323 if (no_iommu || dmar_disabled) { 4324 /* 4325 * We exit the function here to ensure IOMMU's remapping and 4326 * mempool aren't setup, which means that the IOMMU's PMRs 4327 * won't be disabled via the call to init_dmars(). So disable 4328 * it explicitly here. The PMRs were setup by tboot prior to 4329 * calling SENTER, but the kernel is expected to reset/tear 4330 * down the PMRs. 
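		 * (The PMRs are the VT-d Protected Memory Regions that tboot
		 * programs before handing control to the kernel.)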
4331 */ 4332 if (intel_iommu_tboot_noforce) { 4333 for_each_iommu(iommu, drhd) 4334 iommu_disable_protect_mem_regions(iommu); 4335 } 4336 4337 /* 4338 * Make sure the IOMMUs are switched off, even when we 4339 * boot into a kexec kernel and the previous kernel left 4340 * them enabled 4341 */ 4342 intel_disable_iommus(); 4343 goto out_free_dmar; 4344 } 4345 4346 if (list_empty(&dmar_rmrr_units)) 4347 pr_info("No RMRR found\n"); 4348 4349 if (list_empty(&dmar_atsr_units)) 4350 pr_info("No ATSR found\n"); 4351 4352 if (list_empty(&dmar_satc_units)) 4353 pr_info("No SATC found\n"); 4354 4355 if (dmar_map_gfx) 4356 intel_iommu_gfx_mapped = 1; 4357 4358 init_no_remapping_devices(); 4359 4360 ret = init_dmars(); 4361 if (ret) { 4362 if (force_on) 4363 panic("tboot: Failed to initialize DMARs\n"); 4364 pr_err("Initialization failed\n"); 4365 goto out_free_dmar; 4366 } 4367 up_write(&dmar_global_lock); 4368 4369 init_iommu_pm_ops(); 4370 4371 down_read(&dmar_global_lock); 4372 for_each_active_iommu(iommu, drhd) { 4373 /* 4374 * The flush queue implementation does not perform 4375 * page-selective invalidations that are required for efficient 4376 * TLB flushes in virtual environments. The benefit of batching 4377 * is likely to be much lower than the overhead of synchronizing 4378 * the virtual and physical IOMMU page-tables. 4379 */ 4380 if (cap_caching_mode(iommu->cap)) { 4381 pr_info_once("IOMMU batching disallowed due to virtualization\n"); 4382 iommu_set_dma_strict(); 4383 } 4384 iommu_device_sysfs_add(&iommu->iommu, NULL, 4385 intel_iommu_groups, 4386 "%s", iommu->name); 4387 iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL); 4388 } 4389 up_read(&dmar_global_lock); 4390 4391 bus_set_iommu(&pci_bus_type, &intel_iommu_ops); 4392 if (si_domain && !hw_pass_through) 4393 register_memory_notifier(&intel_iommu_memory_nb); 4394 4395 down_read(&dmar_global_lock); 4396 if (probe_acpi_namespace_devices()) 4397 pr_warn("ACPI name space devices didn't probe correctly\n"); 4398 4399 /* Finally, we enable the DMA remapping hardware. */ 4400 for_each_iommu(iommu, drhd) { 4401 if (!drhd->ignored && !translation_pre_enabled(iommu)) 4402 iommu_enable_translation(iommu); 4403 4404 iommu_disable_protect_mem_regions(iommu); 4405 } 4406 up_read(&dmar_global_lock); 4407 4408 pr_info("Intel(R) Virtualization Technology for Directed I/O\n"); 4409 4410 intel_iommu_enabled = 1; 4411 4412 return 0; 4413 4414 out_free_dmar: 4415 intel_iommu_free_dmars(); 4416 up_write(&dmar_global_lock); 4417 iommu_exit_mempool(); 4418 return ret; 4419 } 4420 4421 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque) 4422 { 4423 struct device_domain_info *info = opaque; 4424 4425 domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff); 4426 return 0; 4427 } 4428 4429 /* 4430 * NB - intel-iommu lacks any sort of reference counting for the users of 4431 * dependent devices. If multiple endpoints have intersecting dependent 4432 * devices, unbinding the driver from any one of them will possibly leave 4433 * the others unable to operate. 
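 * ("Dependent devices" here are essentially the DMA aliases that
 * pci_for_each_dma_alias() walks in domain_context_clear() below.)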
4434 */ 4435 static void domain_context_clear(struct device_domain_info *info) 4436 { 4437 if (!info->iommu || !info->dev || !dev_is_pci(info->dev)) 4438 return; 4439 4440 pci_for_each_dma_alias(to_pci_dev(info->dev), 4441 &domain_context_clear_one_cb, info); 4442 } 4443 4444 static void __dmar_remove_one_dev_info(struct device_domain_info *info) 4445 { 4446 struct dmar_domain *domain; 4447 struct intel_iommu *iommu; 4448 unsigned long flags; 4449 4450 assert_spin_locked(&device_domain_lock); 4451 4452 if (WARN_ON(!info)) 4453 return; 4454 4455 iommu = info->iommu; 4456 domain = info->domain; 4457 4458 if (info->dev && !dev_is_real_dma_subdevice(info->dev)) { 4459 if (dev_is_pci(info->dev) && sm_supported(iommu)) 4460 intel_pasid_tear_down_entry(iommu, info->dev, 4461 PASID_RID2PASID, false); 4462 4463 iommu_disable_dev_iotlb(info); 4464 domain_context_clear(info); 4465 intel_pasid_free_table(info->dev); 4466 } 4467 4468 unlink_domain_info(info); 4469 4470 spin_lock_irqsave(&iommu->lock, flags); 4471 domain_detach_iommu(domain, iommu); 4472 spin_unlock_irqrestore(&iommu->lock, flags); 4473 4474 free_devinfo_mem(info); 4475 } 4476 4477 static void dmar_remove_one_dev_info(struct device *dev) 4478 { 4479 struct device_domain_info *info; 4480 unsigned long flags; 4481 4482 spin_lock_irqsave(&device_domain_lock, flags); 4483 info = get_domain_info(dev); 4484 if (info) 4485 __dmar_remove_one_dev_info(info); 4486 spin_unlock_irqrestore(&device_domain_lock, flags); 4487 } 4488 4489 static int md_domain_init(struct dmar_domain *domain, int guest_width) 4490 { 4491 int adjust_width; 4492 4493 /* calculate AGAW */ 4494 domain->gaw = guest_width; 4495 adjust_width = guestwidth_to_adjustwidth(guest_width); 4496 domain->agaw = width_to_agaw(adjust_width); 4497 4498 domain->iommu_coherency = false; 4499 domain->iommu_snooping = false; 4500 domain->iommu_superpage = 0; 4501 domain->max_addr = 0; 4502 4503 /* always allocate the top pgd */ 4504 domain->pgd = alloc_pgtable_page(domain->nid); 4505 if (!domain->pgd) 4506 return -ENOMEM; 4507 domain_flush_cache(domain, domain->pgd, PAGE_SIZE); 4508 return 0; 4509 } 4510 4511 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type) 4512 { 4513 struct dmar_domain *dmar_domain; 4514 struct iommu_domain *domain; 4515 4516 switch (type) { 4517 case IOMMU_DOMAIN_DMA: 4518 case IOMMU_DOMAIN_DMA_FQ: 4519 case IOMMU_DOMAIN_UNMANAGED: 4520 dmar_domain = alloc_domain(0); 4521 if (!dmar_domain) { 4522 pr_err("Can't allocate dmar_domain\n"); 4523 return NULL; 4524 } 4525 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { 4526 pr_err("Domain initialization failed\n"); 4527 domain_exit(dmar_domain); 4528 return NULL; 4529 } 4530 4531 domain = &dmar_domain->domain; 4532 domain->geometry.aperture_start = 0; 4533 domain->geometry.aperture_end = 4534 __DOMAIN_MAX_ADDR(dmar_domain->gaw); 4535 domain->geometry.force_aperture = true; 4536 4537 return domain; 4538 case IOMMU_DOMAIN_IDENTITY: 4539 return &si_domain->domain; 4540 default: 4541 return NULL; 4542 } 4543 4544 return NULL; 4545 } 4546 4547 static void intel_iommu_domain_free(struct iommu_domain *domain) 4548 { 4549 if (domain != &si_domain->domain) 4550 domain_exit(to_dmar_domain(domain)); 4551 } 4552 4553 /* 4554 * Check whether a @domain could be attached to the @dev through the 4555 * aux-domain attach/detach APIs. 
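 * This is only true if the device has auxiliary domain support
 * enabled (info->auxd_enabled) and the domain itself is unmanaged.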
4556 */ 4557 static inline bool 4558 is_aux_domain(struct device *dev, struct iommu_domain *domain) 4559 { 4560 struct device_domain_info *info = get_domain_info(dev); 4561 4562 return info && info->auxd_enabled && 4563 domain->type == IOMMU_DOMAIN_UNMANAGED; 4564 } 4565 4566 static inline struct subdev_domain_info * 4567 lookup_subdev_info(struct dmar_domain *domain, struct device *dev) 4568 { 4569 struct subdev_domain_info *sinfo; 4570 4571 if (!list_empty(&domain->subdevices)) { 4572 list_for_each_entry(sinfo, &domain->subdevices, link_domain) { 4573 if (sinfo->pdev == dev) 4574 return sinfo; 4575 } 4576 } 4577 4578 return NULL; 4579 } 4580 4581 static int auxiliary_link_device(struct dmar_domain *domain, 4582 struct device *dev) 4583 { 4584 struct device_domain_info *info = get_domain_info(dev); 4585 struct subdev_domain_info *sinfo = lookup_subdev_info(domain, dev); 4586 4587 assert_spin_locked(&device_domain_lock); 4588 if (WARN_ON(!info)) 4589 return -EINVAL; 4590 4591 if (!sinfo) { 4592 sinfo = kzalloc(sizeof(*sinfo), GFP_ATOMIC); 4593 if (!sinfo) 4594 return -ENOMEM; 4595 sinfo->domain = domain; 4596 sinfo->pdev = dev; 4597 list_add(&sinfo->link_phys, &info->subdevices); 4598 list_add(&sinfo->link_domain, &domain->subdevices); 4599 } 4600 4601 return ++sinfo->users; 4602 } 4603 4604 static int auxiliary_unlink_device(struct dmar_domain *domain, 4605 struct device *dev) 4606 { 4607 struct device_domain_info *info = get_domain_info(dev); 4608 struct subdev_domain_info *sinfo = lookup_subdev_info(domain, dev); 4609 int ret; 4610 4611 assert_spin_locked(&device_domain_lock); 4612 if (WARN_ON(!info || !sinfo || sinfo->users <= 0)) 4613 return -EINVAL; 4614 4615 ret = --sinfo->users; 4616 if (!ret) { 4617 list_del(&sinfo->link_phys); 4618 list_del(&sinfo->link_domain); 4619 kfree(sinfo); 4620 } 4621 4622 return ret; 4623 } 4624 4625 static int aux_domain_add_dev(struct dmar_domain *domain, 4626 struct device *dev) 4627 { 4628 int ret; 4629 unsigned long flags; 4630 struct intel_iommu *iommu; 4631 4632 iommu = device_to_iommu(dev, NULL, NULL); 4633 if (!iommu) 4634 return -ENODEV; 4635 4636 if (domain->default_pasid <= 0) { 4637 u32 pasid; 4638 4639 /* No private data needed for the default pasid */ 4640 pasid = ioasid_alloc(NULL, PASID_MIN, 4641 pci_max_pasids(to_pci_dev(dev)) - 1, 4642 NULL); 4643 if (pasid == INVALID_IOASID) { 4644 pr_err("Can't allocate default pasid\n"); 4645 return -ENODEV; 4646 } 4647 domain->default_pasid = pasid; 4648 } 4649 4650 spin_lock_irqsave(&device_domain_lock, flags); 4651 ret = auxiliary_link_device(domain, dev); 4652 if (ret <= 0) 4653 goto link_failed; 4654 4655 /* 4656 * Subdevices from the same physical device can be attached to the 4657 * same domain. For such cases, only the first subdevice attachment 4658 * needs to go through the full steps in this function. So if ret > 4659 * 1, just goto out. 4660 */ 4661 if (ret > 1) 4662 goto out; 4663 4664 /* 4665 * iommu->lock must be held to attach domain to iommu and setup the 4666 * pasid entry for second level translation. 
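	 * device_domain_lock is already held at this point, so iommu->lock
	 * nests inside it.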
4667 */ 4668 spin_lock(&iommu->lock); 4669 ret = domain_attach_iommu(domain, iommu); 4670 if (ret) 4671 goto attach_failed; 4672 4673 /* Setup the PASID entry for mediated devices: */ 4674 if (domain_use_first_level(domain)) 4675 ret = domain_setup_first_level(iommu, domain, dev, 4676 domain->default_pasid); 4677 else 4678 ret = intel_pasid_setup_second_level(iommu, domain, dev, 4679 domain->default_pasid); 4680 if (ret) 4681 goto table_failed; 4682 4683 spin_unlock(&iommu->lock); 4684 out: 4685 spin_unlock_irqrestore(&device_domain_lock, flags); 4686 4687 return 0; 4688 4689 table_failed: 4690 domain_detach_iommu(domain, iommu); 4691 attach_failed: 4692 spin_unlock(&iommu->lock); 4693 auxiliary_unlink_device(domain, dev); 4694 link_failed: 4695 spin_unlock_irqrestore(&device_domain_lock, flags); 4696 if (list_empty(&domain->subdevices) && domain->default_pasid > 0) 4697 ioasid_put(domain->default_pasid); 4698 4699 return ret; 4700 } 4701 4702 static void aux_domain_remove_dev(struct dmar_domain *domain, 4703 struct device *dev) 4704 { 4705 struct device_domain_info *info; 4706 struct intel_iommu *iommu; 4707 unsigned long flags; 4708 4709 if (!is_aux_domain(dev, &domain->domain)) 4710 return; 4711 4712 spin_lock_irqsave(&device_domain_lock, flags); 4713 info = get_domain_info(dev); 4714 iommu = info->iommu; 4715 4716 if (!auxiliary_unlink_device(domain, dev)) { 4717 spin_lock(&iommu->lock); 4718 intel_pasid_tear_down_entry(iommu, dev, 4719 domain->default_pasid, false); 4720 domain_detach_iommu(domain, iommu); 4721 spin_unlock(&iommu->lock); 4722 } 4723 4724 spin_unlock_irqrestore(&device_domain_lock, flags); 4725 4726 if (list_empty(&domain->subdevices) && domain->default_pasid > 0) 4727 ioasid_put(domain->default_pasid); 4728 } 4729 4730 static int prepare_domain_attach_device(struct iommu_domain *domain, 4731 struct device *dev) 4732 { 4733 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4734 struct intel_iommu *iommu; 4735 int addr_width; 4736 4737 iommu = device_to_iommu(dev, NULL, NULL); 4738 if (!iommu) 4739 return -ENODEV; 4740 4741 if ((dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE) && 4742 !ecap_nest(iommu->ecap)) { 4743 dev_err(dev, "%s: iommu not support nested translation\n", 4744 iommu->name); 4745 return -EINVAL; 4746 } 4747 4748 /* check if this iommu agaw is sufficient for max mapped address */ 4749 addr_width = agaw_to_width(iommu->agaw); 4750 if (addr_width > cap_mgaw(iommu->cap)) 4751 addr_width = cap_mgaw(iommu->cap); 4752 4753 if (dmar_domain->max_addr > (1LL << addr_width)) { 4754 dev_err(dev, "%s: iommu width (%d) is not " 4755 "sufficient for the mapped address (%llx)\n", 4756 __func__, addr_width, dmar_domain->max_addr); 4757 return -EFAULT; 4758 } 4759 dmar_domain->gaw = addr_width; 4760 4761 /* 4762 * Knock out extra levels of page tables if necessary 4763 */ 4764 while (iommu->agaw < dmar_domain->agaw) { 4765 struct dma_pte *pte; 4766 4767 pte = dmar_domain->pgd; 4768 if (dma_pte_present(pte)) { 4769 dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte)); 4770 free_pgtable_page(pte); 4771 } 4772 dmar_domain->agaw--; 4773 } 4774 4775 return 0; 4776 } 4777 4778 static int intel_iommu_attach_device(struct iommu_domain *domain, 4779 struct device *dev) 4780 { 4781 int ret; 4782 4783 if (domain->type == IOMMU_DOMAIN_UNMANAGED && 4784 device_is_rmrr_locked(dev)) { 4785 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. 
Contact your platform vendor.\n"); 4786 return -EPERM; 4787 } 4788 4789 if (is_aux_domain(dev, domain)) 4790 return -EPERM; 4791 4792 /* normally dev is not mapped */ 4793 if (unlikely(domain_context_mapped(dev))) { 4794 struct dmar_domain *old_domain; 4795 4796 old_domain = find_domain(dev); 4797 if (old_domain) 4798 dmar_remove_one_dev_info(dev); 4799 } 4800 4801 ret = prepare_domain_attach_device(domain, dev); 4802 if (ret) 4803 return ret; 4804 4805 return domain_add_dev_info(to_dmar_domain(domain), dev); 4806 } 4807 4808 static int intel_iommu_aux_attach_device(struct iommu_domain *domain, 4809 struct device *dev) 4810 { 4811 int ret; 4812 4813 if (!is_aux_domain(dev, domain)) 4814 return -EPERM; 4815 4816 ret = prepare_domain_attach_device(domain, dev); 4817 if (ret) 4818 return ret; 4819 4820 return aux_domain_add_dev(to_dmar_domain(domain), dev); 4821 } 4822 4823 static void intel_iommu_detach_device(struct iommu_domain *domain, 4824 struct device *dev) 4825 { 4826 dmar_remove_one_dev_info(dev); 4827 } 4828 4829 static void intel_iommu_aux_detach_device(struct iommu_domain *domain, 4830 struct device *dev) 4831 { 4832 aux_domain_remove_dev(to_dmar_domain(domain), dev); 4833 } 4834 4835 #ifdef CONFIG_INTEL_IOMMU_SVM 4836 /* 4837 * 2D array for converting and sanitizing IOMMU generic TLB granularity to 4838 * VT-d granularity. Invalidation is typically included in the unmap operation 4839 * as a result of DMA or VFIO unmap. However, for assigned devices guest 4840 * owns the first level page tables. Invalidations of translation caches in the 4841 * guest are trapped and passed down to the host. 4842 * 4843 * vIOMMU in the guest will only expose first level page tables, therefore 4844 * we do not support IOTLB granularity for request without PASID (second level). 4845 * 4846 * For example, to find the VT-d granularity encoding for IOTLB 4847 * type and page selective granularity within PASID: 4848 * X: indexed by iommu cache type 4849 * Y: indexed by enum iommu_inv_granularity 4850 * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR] 4851 */ 4852 4853 static const int 4854 inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = { 4855 /* 4856 * PASID based IOTLB invalidation: PASID selective (per PASID), 4857 * page selective (address granularity) 4858 */ 4859 {-EINVAL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID}, 4860 /* PASID based dev TLBs */ 4861 {-EINVAL, -EINVAL, QI_DEV_IOTLB_GRAN_PASID_SEL}, 4862 /* PASID cache */ 4863 {-EINVAL, -EINVAL, -EINVAL} 4864 }; 4865 4866 static inline int to_vtd_granularity(int type, int granu) 4867 { 4868 return inv_type_granu_table[type][granu]; 4869 } 4870 4871 static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules) 4872 { 4873 u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT; 4874 4875 /* VT-d size is encoded as 2^size of 4K pages, 0 for 4k, 9 for 2MB, etc. 4876 * IOMMU cache invalidate API passes granu_size in bytes, and number of 4877 * granu size in contiguous memory. 
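	 * e.g. granu_size = 4KiB with nr_granules = 512 covers 2MB and is
	 * encoded as size order 9.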
4878 */ 4879 return order_base_2(nr_pages); 4880 } 4881 4882 static int 4883 intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev, 4884 struct iommu_cache_invalidate_info *inv_info) 4885 { 4886 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4887 struct device_domain_info *info; 4888 struct intel_iommu *iommu; 4889 unsigned long flags; 4890 int cache_type; 4891 u8 bus, devfn; 4892 u16 did, sid; 4893 int ret = 0; 4894 u64 size = 0; 4895 4896 if (!inv_info || !dmar_domain) 4897 return -EINVAL; 4898 4899 if (!dev || !dev_is_pci(dev)) 4900 return -ENODEV; 4901 4902 iommu = device_to_iommu(dev, &bus, &devfn); 4903 if (!iommu) 4904 return -ENODEV; 4905 4906 if (!(dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE)) 4907 return -EINVAL; 4908 4909 spin_lock_irqsave(&device_domain_lock, flags); 4910 spin_lock(&iommu->lock); 4911 info = get_domain_info(dev); 4912 if (!info) { 4913 ret = -EINVAL; 4914 goto out_unlock; 4915 } 4916 did = dmar_domain->iommu_did[iommu->seq_id]; 4917 sid = PCI_DEVID(bus, devfn); 4918 4919 /* Size is only valid in address selective invalidation */ 4920 if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) 4921 size = to_vtd_size(inv_info->granu.addr_info.granule_size, 4922 inv_info->granu.addr_info.nb_granules); 4923 4924 for_each_set_bit(cache_type, 4925 (unsigned long *)&inv_info->cache, 4926 IOMMU_CACHE_INV_TYPE_NR) { 4927 int granu = 0; 4928 u64 pasid = 0; 4929 u64 addr = 0; 4930 4931 granu = to_vtd_granularity(cache_type, inv_info->granularity); 4932 if (granu == -EINVAL) { 4933 pr_err_ratelimited("Invalid cache type and granu combination %d/%d\n", 4934 cache_type, inv_info->granularity); 4935 break; 4936 } 4937 4938 /* 4939 * PASID is stored in different locations based on the 4940 * granularity. 4941 */ 4942 if (inv_info->granularity == IOMMU_INV_GRANU_PASID && 4943 (inv_info->granu.pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID)) 4944 pasid = inv_info->granu.pasid_info.pasid; 4945 else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR && 4946 (inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID)) 4947 pasid = inv_info->granu.addr_info.pasid; 4948 4949 switch (BIT(cache_type)) { 4950 case IOMMU_CACHE_INV_TYPE_IOTLB: 4951 /* HW will ignore LSB bits based on address mask */ 4952 if (inv_info->granularity == IOMMU_INV_GRANU_ADDR && 4953 size && 4954 (inv_info->granu.addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) { 4955 pr_err_ratelimited("User address not aligned, 0x%llx, size order %llu\n", 4956 inv_info->granu.addr_info.addr, size); 4957 } 4958 4959 /* 4960 * If granu is PASID-selective, address is ignored. 4961 * We use npages = -1 to indicate that. 4962 */ 4963 qi_flush_piotlb(iommu, did, pasid, 4964 mm_to_dma_pfn(inv_info->granu.addr_info.addr), 4965 (granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size, 4966 inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF); 4967 4968 if (!info->ats_enabled) 4969 break; 4970 /* 4971 * Always flush device IOTLB if ATS is enabled. vIOMMU 4972 * in the guest may assume IOTLB flush is inclusive, 4973 * which is more efficient. 4974 */ 4975 fallthrough; 4976 case IOMMU_CACHE_INV_TYPE_DEV_IOTLB: 4977 /* 4978 * PASID based device TLB invalidation does not support 4979 * IOMMU_INV_GRANU_PASID granularity but only supports 4980 * IOMMU_INV_GRANU_ADDR. 4981 * The equivalent of that is we set the size to be the 4982 * entire range of 64 bit. User only provides PASID info 4983 * without address info. So we set addr to 0. 
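			 * (size = 64 - VTD_PAGE_SHIFT below expresses the full 64-bit
			 * address space as an order of 4KiB pages.)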
4984 */ 4985 if (inv_info->granularity == IOMMU_INV_GRANU_PASID) { 4986 size = 64 - VTD_PAGE_SHIFT; 4987 addr = 0; 4988 } else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) { 4989 addr = inv_info->granu.addr_info.addr; 4990 } 4991 4992 if (info->ats_enabled) 4993 qi_flush_dev_iotlb_pasid(iommu, sid, 4994 info->pfsid, pasid, 4995 info->ats_qdep, addr, 4996 size); 4997 else 4998 pr_warn_ratelimited("Passdown device IOTLB flush w/o ATS!\n"); 4999 break; 5000 default: 5001 dev_err_ratelimited(dev, "Unsupported IOMMU invalidation type %d\n", 5002 cache_type); 5003 ret = -EINVAL; 5004 } 5005 } 5006 out_unlock: 5007 spin_unlock(&iommu->lock); 5008 spin_unlock_irqrestore(&device_domain_lock, flags); 5009 5010 return ret; 5011 } 5012 #endif 5013 5014 static int intel_iommu_map(struct iommu_domain *domain, 5015 unsigned long iova, phys_addr_t hpa, 5016 size_t size, int iommu_prot, gfp_t gfp) 5017 { 5018 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 5019 u64 max_addr; 5020 int prot = 0; 5021 5022 if (iommu_prot & IOMMU_READ) 5023 prot |= DMA_PTE_READ; 5024 if (iommu_prot & IOMMU_WRITE) 5025 prot |= DMA_PTE_WRITE; 5026 if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping) 5027 prot |= DMA_PTE_SNP; 5028 5029 max_addr = iova + size; 5030 if (dmar_domain->max_addr < max_addr) { 5031 u64 end; 5032 5033 /* check if minimum agaw is sufficient for mapped address */ 5034 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1; 5035 if (end < max_addr) { 5036 pr_err("%s: iommu width (%d) is not " 5037 "sufficient for the mapped address (%llx)\n", 5038 __func__, dmar_domain->gaw, max_addr); 5039 return -EFAULT; 5040 } 5041 dmar_domain->max_addr = max_addr; 5042 } 5043 /* Round up size to next multiple of PAGE_SIZE, if it and 5044 the low bits of hpa would take us onto the next page */ 5045 size = aligned_nrpages(hpa, size); 5046 return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT, 5047 hpa >> VTD_PAGE_SHIFT, size, prot); 5048 } 5049 5050 static int intel_iommu_map_pages(struct iommu_domain *domain, 5051 unsigned long iova, phys_addr_t paddr, 5052 size_t pgsize, size_t pgcount, 5053 int prot, gfp_t gfp, size_t *mapped) 5054 { 5055 unsigned long pgshift = __ffs(pgsize); 5056 size_t size = pgcount << pgshift; 5057 int ret; 5058 5059 if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G) 5060 return -EINVAL; 5061 5062 if (!IS_ALIGNED(iova | paddr, pgsize)) 5063 return -EINVAL; 5064 5065 ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp); 5066 if (!ret && mapped) 5067 *mapped = size; 5068 5069 return ret; 5070 } 5071 5072 static size_t intel_iommu_unmap(struct iommu_domain *domain, 5073 unsigned long iova, size_t size, 5074 struct iommu_iotlb_gather *gather) 5075 { 5076 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 5077 unsigned long start_pfn, last_pfn; 5078 int level = 0; 5079 5080 /* Cope with horrid API which requires us to unmap more than the 5081 size argument if it happens to be a large-page mapping. 
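	   For example, a 4KiB unmap request that lands inside a 2MiB superpage
	   tears down the whole 2MiB mapping, and that larger size is reported
	   back to the caller.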
*/ 5082 BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level)); 5083 5084 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level)) 5085 size = VTD_PAGE_SIZE << level_to_offset_bits(level); 5086 5087 start_pfn = iova >> VTD_PAGE_SHIFT; 5088 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT; 5089 5090 gather->freelist = domain_unmap(dmar_domain, start_pfn, 5091 last_pfn, gather->freelist); 5092 5093 if (dmar_domain->max_addr == iova + size) 5094 dmar_domain->max_addr = iova; 5095 5096 iommu_iotlb_gather_add_page(domain, gather, iova, size); 5097 5098 return size; 5099 } 5100 5101 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain, 5102 unsigned long iova, 5103 size_t pgsize, size_t pgcount, 5104 struct iommu_iotlb_gather *gather) 5105 { 5106 unsigned long pgshift = __ffs(pgsize); 5107 size_t size = pgcount << pgshift; 5108 5109 return intel_iommu_unmap(domain, iova, size, gather); 5110 } 5111 5112 static void intel_iommu_tlb_sync(struct iommu_domain *domain, 5113 struct iommu_iotlb_gather *gather) 5114 { 5115 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 5116 unsigned long iova_pfn = IOVA_PFN(gather->start); 5117 size_t size = gather->end - gather->start; 5118 unsigned long start_pfn; 5119 unsigned long nrpages; 5120 int iommu_id; 5121 5122 nrpages = aligned_nrpages(gather->start, size); 5123 start_pfn = mm_to_dma_pfn(iova_pfn); 5124 5125 for_each_domain_iommu(iommu_id, dmar_domain) 5126 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain, 5127 start_pfn, nrpages, !gather->freelist, 0); 5128 5129 dma_free_pagelist(gather->freelist); 5130 } 5131 5132 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain, 5133 dma_addr_t iova) 5134 { 5135 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 5136 struct dma_pte *pte; 5137 int level = 0; 5138 u64 phys = 0; 5139 5140 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level); 5141 if (pte && dma_pte_present(pte)) 5142 phys = dma_pte_addr(pte) + 5143 (iova & (BIT_MASK(level_to_offset_bits(level) + 5144 VTD_PAGE_SHIFT) - 1)); 5145 5146 return phys; 5147 } 5148 5149 static bool intel_iommu_capable(enum iommu_cap cap) 5150 { 5151 if (cap == IOMMU_CAP_CACHE_COHERENCY) 5152 return domain_update_iommu_snooping(NULL); 5153 if (cap == IOMMU_CAP_INTR_REMAP) 5154 return irq_remapping_enabled == 1; 5155 5156 return false; 5157 } 5158 5159 static struct iommu_device *intel_iommu_probe_device(struct device *dev) 5160 { 5161 struct intel_iommu *iommu; 5162 5163 iommu = device_to_iommu(dev, NULL, NULL); 5164 if (!iommu) 5165 return ERR_PTR(-ENODEV); 5166 5167 if (translation_pre_enabled(iommu)) 5168 dev_iommu_priv_set(dev, DEFER_DEVICE_DOMAIN_INFO); 5169 5170 return &iommu->iommu; 5171 } 5172 5173 static void intel_iommu_release_device(struct device *dev) 5174 { 5175 struct intel_iommu *iommu; 5176 5177 iommu = device_to_iommu(dev, NULL, NULL); 5178 if (!iommu) 5179 return; 5180 5181 dmar_remove_one_dev_info(dev); 5182 5183 set_dma_ops(dev, NULL); 5184 } 5185 5186 static void intel_iommu_probe_finalize(struct device *dev) 5187 { 5188 set_dma_ops(dev, NULL); 5189 iommu_setup_dma_ops(dev, 0, U64_MAX); 5190 } 5191 5192 static void intel_iommu_get_resv_regions(struct device *device, 5193 struct list_head *head) 5194 { 5195 int prot = DMA_PTE_READ | DMA_PTE_WRITE; 5196 struct iommu_resv_region *reg; 5197 struct dmar_rmrr_unit *rmrr; 5198 struct device *i_dev; 5199 int i; 5200 5201 down_read(&dmar_global_lock); 5202 for_each_rmrr_units(rmrr) { 5203 for_each_active_dev_scope(rmrr->devices, 
rmrr->devices_cnt, 5204 i, i_dev) { 5205 struct iommu_resv_region *resv; 5206 enum iommu_resv_type type; 5207 size_t length; 5208 5209 if (i_dev != device && 5210 !is_downstream_to_pci_bridge(device, i_dev)) 5211 continue; 5212 5213 length = rmrr->end_address - rmrr->base_address + 1; 5214 5215 type = device_rmrr_is_relaxable(device) ? 5216 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT; 5217 5218 resv = iommu_alloc_resv_region(rmrr->base_address, 5219 length, prot, type); 5220 if (!resv) 5221 break; 5222 5223 list_add_tail(&resv->list, head); 5224 } 5225 } 5226 up_read(&dmar_global_lock); 5227 5228 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA 5229 if (dev_is_pci(device)) { 5230 struct pci_dev *pdev = to_pci_dev(device); 5231 5232 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) { 5233 reg = iommu_alloc_resv_region(0, 1UL << 24, prot, 5234 IOMMU_RESV_DIRECT_RELAXABLE); 5235 if (reg) 5236 list_add_tail(®->list, head); 5237 } 5238 } 5239 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */ 5240 5241 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START, 5242 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1, 5243 0, IOMMU_RESV_MSI); 5244 if (!reg) 5245 return; 5246 list_add_tail(®->list, head); 5247 } 5248 5249 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev) 5250 { 5251 struct device_domain_info *info; 5252 struct context_entry *context; 5253 struct dmar_domain *domain; 5254 unsigned long flags; 5255 u64 ctx_lo; 5256 int ret; 5257 5258 domain = find_domain(dev); 5259 if (!domain) 5260 return -EINVAL; 5261 5262 spin_lock_irqsave(&device_domain_lock, flags); 5263 spin_lock(&iommu->lock); 5264 5265 ret = -EINVAL; 5266 info = get_domain_info(dev); 5267 if (!info || !info->pasid_supported) 5268 goto out; 5269 5270 context = iommu_context_addr(iommu, info->bus, info->devfn, 0); 5271 if (WARN_ON(!context)) 5272 goto out; 5273 5274 ctx_lo = context[0].lo; 5275 5276 if (!(ctx_lo & CONTEXT_PASIDE)) { 5277 ctx_lo |= CONTEXT_PASIDE; 5278 context[0].lo = ctx_lo; 5279 wmb(); 5280 iommu->flush.flush_context(iommu, 5281 domain->iommu_did[iommu->seq_id], 5282 PCI_DEVID(info->bus, info->devfn), 5283 DMA_CCMD_MASK_NOBIT, 5284 DMA_CCMD_DEVICE_INVL); 5285 } 5286 5287 /* Enable PASID support in the device, if it wasn't already */ 5288 if (!info->pasid_enabled) 5289 iommu_enable_dev_iotlb(info); 5290 5291 ret = 0; 5292 5293 out: 5294 spin_unlock(&iommu->lock); 5295 spin_unlock_irqrestore(&device_domain_lock, flags); 5296 5297 return ret; 5298 } 5299 5300 static struct iommu_group *intel_iommu_device_group(struct device *dev) 5301 { 5302 if (dev_is_pci(dev)) 5303 return pci_device_group(dev); 5304 return generic_device_group(dev); 5305 } 5306 5307 static int intel_iommu_enable_auxd(struct device *dev) 5308 { 5309 struct device_domain_info *info; 5310 struct intel_iommu *iommu; 5311 unsigned long flags; 5312 int ret; 5313 5314 iommu = device_to_iommu(dev, NULL, NULL); 5315 if (!iommu || dmar_disabled) 5316 return -EINVAL; 5317 5318 if (!sm_supported(iommu) || !pasid_supported(iommu)) 5319 return -EINVAL; 5320 5321 ret = intel_iommu_enable_pasid(iommu, dev); 5322 if (ret) 5323 return -ENODEV; 5324 5325 spin_lock_irqsave(&device_domain_lock, flags); 5326 info = get_domain_info(dev); 5327 info->auxd_enabled = 1; 5328 spin_unlock_irqrestore(&device_domain_lock, flags); 5329 5330 return 0; 5331 } 5332 5333 static int intel_iommu_disable_auxd(struct device *dev) 5334 { 5335 struct device_domain_info *info; 5336 unsigned long flags; 5337 5338 spin_lock_irqsave(&device_domain_lock, flags); 5339 info = get_domain_info(dev); 5340 
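	/* get_domain_info() can return NULL here; WARN in that case and
	 * only clear the flag while the info is still present. */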
if (!WARN_ON(!info)) 5341 info->auxd_enabled = 0; 5342 spin_unlock_irqrestore(&device_domain_lock, flags); 5343 5344 return 0; 5345 } 5346 5347 static int intel_iommu_enable_sva(struct device *dev) 5348 { 5349 struct device_domain_info *info = get_domain_info(dev); 5350 struct intel_iommu *iommu; 5351 int ret; 5352 5353 if (!info || dmar_disabled) 5354 return -EINVAL; 5355 5356 iommu = info->iommu; 5357 if (!iommu) 5358 return -EINVAL; 5359 5360 if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE)) 5361 return -ENODEV; 5362 5363 if (intel_iommu_enable_pasid(iommu, dev)) 5364 return -ENODEV; 5365 5366 if (!info->pasid_enabled || !info->pri_enabled || !info->ats_enabled) 5367 return -EINVAL; 5368 5369 ret = iopf_queue_add_device(iommu->iopf_queue, dev); 5370 if (!ret) 5371 ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev); 5372 5373 return ret; 5374 } 5375 5376 static int intel_iommu_disable_sva(struct device *dev) 5377 { 5378 struct device_domain_info *info = get_domain_info(dev); 5379 struct intel_iommu *iommu = info->iommu; 5380 int ret; 5381 5382 ret = iommu_unregister_device_fault_handler(dev); 5383 if (!ret) 5384 ret = iopf_queue_remove_device(iommu->iopf_queue, dev); 5385 5386 return ret; 5387 } 5388 5389 /* 5390 * A PCI express designated vendor specific extended capability is defined 5391 * in the section 3.7 of Intel scalable I/O virtualization technical spec 5392 * for system software and tools to detect endpoint devices supporting the 5393 * Intel scalable IO virtualization without host driver dependency. 5394 * 5395 * Returns the address of the matching extended capability structure within 5396 * the device's PCI configuration space or 0 if the device does not support 5397 * it. 5398 */ 5399 static int siov_find_pci_dvsec(struct pci_dev *pdev) 5400 { 5401 int pos; 5402 u16 vendor, id; 5403 5404 pos = pci_find_next_ext_capability(pdev, 0, 0x23); 5405 while (pos) { 5406 pci_read_config_word(pdev, pos + 4, &vendor); 5407 pci_read_config_word(pdev, pos + 8, &id); 5408 if (vendor == PCI_VENDOR_ID_INTEL && id == 5) 5409 return pos; 5410 5411 pos = pci_find_next_ext_capability(pdev, pos, 0x23); 5412 } 5413 5414 return 0; 5415 } 5416 5417 static bool 5418 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat) 5419 { 5420 struct device_domain_info *info = get_domain_info(dev); 5421 5422 if (feat == IOMMU_DEV_FEAT_AUX) { 5423 int ret; 5424 5425 if (!dev_is_pci(dev) || dmar_disabled || 5426 !scalable_mode_support() || !pasid_mode_support()) 5427 return false; 5428 5429 ret = pci_pasid_features(to_pci_dev(dev)); 5430 if (ret < 0) 5431 return false; 5432 5433 return !!siov_find_pci_dvsec(to_pci_dev(dev)); 5434 } 5435 5436 if (feat == IOMMU_DEV_FEAT_IOPF) 5437 return info && info->pri_supported; 5438 5439 if (feat == IOMMU_DEV_FEAT_SVA) 5440 return info && (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) && 5441 info->pasid_supported && info->pri_supported && 5442 info->ats_supported; 5443 5444 return false; 5445 } 5446 5447 static int 5448 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat) 5449 { 5450 switch (feat) { 5451 case IOMMU_DEV_FEAT_AUX: 5452 return intel_iommu_enable_auxd(dev); 5453 5454 case IOMMU_DEV_FEAT_IOPF: 5455 return intel_iommu_dev_has_feat(dev, feat) ? 
0 : -ENODEV; 5456 5457 case IOMMU_DEV_FEAT_SVA: 5458 return intel_iommu_enable_sva(dev); 5459 5460 default: 5461 return -ENODEV; 5462 } 5463 } 5464 5465 static int 5466 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat) 5467 { 5468 switch (feat) { 5469 case IOMMU_DEV_FEAT_AUX: 5470 return intel_iommu_disable_auxd(dev); 5471 5472 case IOMMU_DEV_FEAT_IOPF: 5473 return 0; 5474 5475 case IOMMU_DEV_FEAT_SVA: 5476 return intel_iommu_disable_sva(dev); 5477 5478 default: 5479 return -ENODEV; 5480 } 5481 } 5482 5483 static bool 5484 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat) 5485 { 5486 struct device_domain_info *info = get_domain_info(dev); 5487 5488 if (feat == IOMMU_DEV_FEAT_AUX) 5489 return scalable_mode_support() && info && info->auxd_enabled; 5490 5491 return false; 5492 } 5493 5494 static int 5495 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev) 5496 { 5497 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 5498 5499 return dmar_domain->default_pasid > 0 ? 5500 dmar_domain->default_pasid : -EINVAL; 5501 } 5502 5503 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain, 5504 struct device *dev) 5505 { 5506 return attach_deferred(dev); 5507 } 5508 5509 static int 5510 intel_iommu_enable_nesting(struct iommu_domain *domain) 5511 { 5512 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 5513 unsigned long flags; 5514 int ret = -ENODEV; 5515 5516 spin_lock_irqsave(&device_domain_lock, flags); 5517 if (list_empty(&dmar_domain->devices)) { 5518 dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE; 5519 dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL; 5520 ret = 0; 5521 } 5522 spin_unlock_irqrestore(&device_domain_lock, flags); 5523 5524 return ret; 5525 } 5526 5527 /* 5528 * Check that the device does not live on an external facing PCI port that is 5529 * marked as untrusted. Such devices should not be able to apply quirks and 5530 * thus not be able to bypass the IOMMU restrictions. 
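 * (pdev->untrusted is normally set by the PCI core for devices that
 * sit below such an external-facing port.)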
5531 */ 5532 static bool risky_device(struct pci_dev *pdev) 5533 { 5534 if (pdev->untrusted) { 5535 pci_info(pdev, 5536 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n", 5537 pdev->vendor, pdev->device); 5538 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n"); 5539 return true; 5540 } 5541 return false; 5542 } 5543 5544 static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain, 5545 unsigned long iova, size_t size) 5546 { 5547 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 5548 unsigned long pages = aligned_nrpages(iova, size); 5549 unsigned long pfn = iova >> VTD_PAGE_SHIFT; 5550 struct intel_iommu *iommu; 5551 int iommu_id; 5552 5553 for_each_domain_iommu(iommu_id, dmar_domain) { 5554 iommu = g_iommus[iommu_id]; 5555 __mapping_notify_one(iommu, dmar_domain, pfn, pages); 5556 } 5557 } 5558 5559 const struct iommu_ops intel_iommu_ops = { 5560 .capable = intel_iommu_capable, 5561 .domain_alloc = intel_iommu_domain_alloc, 5562 .domain_free = intel_iommu_domain_free, 5563 .enable_nesting = intel_iommu_enable_nesting, 5564 .attach_dev = intel_iommu_attach_device, 5565 .detach_dev = intel_iommu_detach_device, 5566 .aux_attach_dev = intel_iommu_aux_attach_device, 5567 .aux_detach_dev = intel_iommu_aux_detach_device, 5568 .aux_get_pasid = intel_iommu_aux_get_pasid, 5569 .map_pages = intel_iommu_map_pages, 5570 .unmap_pages = intel_iommu_unmap_pages, 5571 .iotlb_sync_map = intel_iommu_iotlb_sync_map, 5572 .flush_iotlb_all = intel_flush_iotlb_all, 5573 .iotlb_sync = intel_iommu_tlb_sync, 5574 .iova_to_phys = intel_iommu_iova_to_phys, 5575 .probe_device = intel_iommu_probe_device, 5576 .probe_finalize = intel_iommu_probe_finalize, 5577 .release_device = intel_iommu_release_device, 5578 .get_resv_regions = intel_iommu_get_resv_regions, 5579 .put_resv_regions = generic_iommu_put_resv_regions, 5580 .device_group = intel_iommu_device_group, 5581 .dev_has_feat = intel_iommu_dev_has_feat, 5582 .dev_feat_enabled = intel_iommu_dev_feat_enabled, 5583 .dev_enable_feat = intel_iommu_dev_enable_feat, 5584 .dev_disable_feat = intel_iommu_dev_disable_feat, 5585 .is_attach_deferred = intel_iommu_is_attach_deferred, 5586 .def_domain_type = device_def_domain_type, 5587 .pgsize_bitmap = SZ_4K, 5588 #ifdef CONFIG_INTEL_IOMMU_SVM 5589 .cache_invalidate = intel_iommu_sva_invalidate, 5590 .sva_bind_gpasid = intel_svm_bind_gpasid, 5591 .sva_unbind_gpasid = intel_svm_unbind_gpasid, 5592 .sva_bind = intel_svm_bind, 5593 .sva_unbind = intel_svm_unbind, 5594 .sva_get_pasid = intel_svm_get_pasid, 5595 .page_response = intel_svm_page_response, 5596 #endif 5597 }; 5598 5599 static void quirk_iommu_igfx(struct pci_dev *dev) 5600 { 5601 if (risky_device(dev)) 5602 return; 5603 5604 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n"); 5605 dmar_map_gfx = 0; 5606 } 5607 5608 /* G4x/GM45 integrated gfx dmar support is totally busted. 
*/ 5609 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx); 5610 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx); 5611 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx); 5612 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx); 5613 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx); 5614 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx); 5615 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx); 5616 5617 /* Broadwell igfx malfunctions with dmar */ 5618 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx); 5619 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx); 5620 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx); 5621 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx); 5622 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx); 5623 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx); 5624 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx); 5625 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx); 5626 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx); 5627 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx); 5628 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx); 5629 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx); 5630 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx); 5631 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx); 5632 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx); 5633 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx); 5634 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx); 5635 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx); 5636 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx); 5637 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx); 5638 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx); 5639 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx); 5640 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx); 5641 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx); 5642 5643 static void quirk_iommu_rwbf(struct pci_dev *dev) 5644 { 5645 if (risky_device(dev)) 5646 return; 5647 5648 /* 5649 * Mobile 4 Series Chipset neglects to set RWBF capability, 5650 * but needs it. Same seems to hold for the desktop versions. 
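	 * (RWBF is the Required Write-Buffer Flushing capability in the
	 * VT-d capability register; setting rwbf_quirk below makes the
	 * driver behave as if that capability were present.)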
5651 */ 5652 pci_info(dev, "Forcing write-buffer flush capability\n"); 5653 rwbf_quirk = 1; 5654 } 5655 5656 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf); 5657 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf); 5658 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf); 5659 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf); 5660 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf); 5661 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf); 5662 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf); 5663 5664 #define GGC 0x52 5665 #define GGC_MEMORY_SIZE_MASK (0xf << 8) 5666 #define GGC_MEMORY_SIZE_NONE (0x0 << 8) 5667 #define GGC_MEMORY_SIZE_1M (0x1 << 8) 5668 #define GGC_MEMORY_SIZE_2M (0x3 << 8) 5669 #define GGC_MEMORY_VT_ENABLED (0x8 << 8) 5670 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8) 5671 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8) 5672 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8) 5673 5674 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev) 5675 { 5676 unsigned short ggc; 5677 5678 if (risky_device(dev)) 5679 return; 5680 5681 if (pci_read_config_word(dev, GGC, &ggc)) 5682 return; 5683 5684 if (!(ggc & GGC_MEMORY_VT_ENABLED)) { 5685 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n"); 5686 dmar_map_gfx = 0; 5687 } else if (dmar_map_gfx) { 5688 /* we have to ensure the gfx device is idle before we flush */ 5689 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n"); 5690 iommu_set_dma_strict(); 5691 } 5692 } 5693 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt); 5694 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt); 5695 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt); 5696 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt); 5697 5698 static void quirk_igfx_skip_te_disable(struct pci_dev *dev) 5699 { 5700 unsigned short ver; 5701 5702 if (!IS_GFX_DEVICE(dev)) 5703 return; 5704 5705 ver = (dev->device >> 8) & 0xff; 5706 if (ver != 0x45 && ver != 0x46 && ver != 0x4c && 5707 ver != 0x4e && ver != 0x8a && ver != 0x98 && 5708 ver != 0x9a) 5709 return; 5710 5711 if (risky_device(dev)) 5712 return; 5713 5714 pci_info(dev, "Skip IOMMU disabling for graphics\n"); 5715 iommu_skip_te_disable = 1; 5716 } 5717 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable); 5718 5719 /* On Tylersburg chipsets, some BIOSes have been known to enable the 5720 ISOCH DMAR unit for the Azalia sound device, but not give it any 5721 TLB entries, which causes it to deadlock. Check for that. We do 5722 this in a function called from init_dmars(), instead of in a PCI 5723 quirk, because we don't want to print the obnoxious "BIOS broken" 5724 message if VT-d is actually disabled. 5725 */ 5726 static void __init check_tylersburg_isoch(void) 5727 { 5728 struct pci_dev *pdev; 5729 uint32_t vtisochctrl; 5730 5731 /* If there's no Azalia in the system anyway, forget it. */ 5732 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL); 5733 if (!pdev) 5734 return; 5735 5736 if (risky_device(pdev)) { 5737 pci_dev_put(pdev); 5738 return; 5739 } 5740 5741 pci_dev_put(pdev); 5742 5743 /* System Management Registers. Might be hidden, in which case 5744 we can't do the sanity check. But that's OK, because the 5745 known-broken BIOSes _don't_ actually hide it, so far. 
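	   (The 0x342e device looked up below is the chipset function that
	   exposes these registers.)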
*/ 5746 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL); 5747 if (!pdev) 5748 return; 5749 5750 if (risky_device(pdev)) { 5751 pci_dev_put(pdev); 5752 return; 5753 } 5754 5755 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) { 5756 pci_dev_put(pdev); 5757 return; 5758 } 5759 5760 pci_dev_put(pdev); 5761 5762 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */ 5763 if (vtisochctrl & 1) 5764 return; 5765 5766 /* Drop all bits other than the number of TLB entries. */ 5767 vtisochctrl &= 0x1c; 5768 5769 /* If we have the recommended number of TLB entries (16), fine. */ 5770 if (vtisochctrl == 0x10) 5771 return; 5772 5773 /* Zero TLB entries? You get to ride the short bus to school. */ 5774 if (!vtisochctrl) { 5775 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n" 5776 "BIOS vendor: %s; Ver: %s; Product Version: %s\n", 5777 dmi_get_system_info(DMI_BIOS_VENDOR), 5778 dmi_get_system_info(DMI_BIOS_VERSION), 5779 dmi_get_system_info(DMI_PRODUCT_VERSION)); 5780 iommu_identity_mapping |= IDENTMAP_AZALIA; 5781 return; 5782 } 5783 5784 pr_warn("Recommended TLB entries for ISOCH unit are 16; your BIOS set %d\n", 5785 vtisochctrl); 5786 } 5787