// SPDX-License-Identifier: GPL-2.0
/*
 * intel-pasid.c - PASID idr, table and entry manipulation
 *
 * Copyright (C) 2018 Intel Corporation
 *
 * Author: Lu Baolu <baolu.lu@linux.intel.com>
 */

#define pr_fmt(fmt)	"DMAR: " fmt

#include <linux/bitops.h>
#include <linux/cpufeature.h>
#include <linux/dmar.h>
#include <linux/iommu.h>
#include <linux/memory.h>
#include <linux/pci.h>
#include <linux/pci-ats.h>
#include <linux/spinlock.h>

#include "iommu.h"
#include "pasid.h"

/*
 * Intel IOMMU system wide PASID name space:
 */
u32 intel_pasid_max_id = PASID_MAX;

/*
 * Per device pasid table management:
 */

/*
 * Allocate a pasid table for @dev. It should be called in a
 * single-thread context.
 */
int intel_pasid_alloc_table(struct device *dev)
{
	struct device_domain_info *info;
	struct pasid_table *pasid_table;
	struct page *pages;
	u32 max_pasid = 0;
	int order, size;

	might_sleep();
	info = dev_iommu_priv_get(dev);
	if (WARN_ON(!info || !dev_is_pci(dev)))
		return -ENODEV;
	if (WARN_ON(info->pasid_table))
		return -EEXIST;

	pasid_table = kzalloc(sizeof(*pasid_table), GFP_KERNEL);
	if (!pasid_table)
		return -ENOMEM;

	if (info->pasid_supported)
		max_pasid = min_t(u32, pci_max_pasids(to_pci_dev(dev)),
				  intel_pasid_max_id);

	size = max_pasid >> (PASID_PDE_SHIFT - 3);
	order = size ? get_order(size) : 0;
	pages = alloc_pages_node(info->iommu->node,
				 GFP_KERNEL | __GFP_ZERO, order);
	if (!pages) {
		kfree(pasid_table);
		return -ENOMEM;
	}

	pasid_table->table = page_address(pages);
	pasid_table->order = order;
	pasid_table->max_pasid = 1 << (order + PAGE_SHIFT + 3);
	info->pasid_table = pasid_table;

	if (!ecap_coherent(info->iommu->ecap))
		clflush_cache_range(pasid_table->table, (1 << order) * PAGE_SIZE);

	return 0;
}

void intel_pasid_free_table(struct device *dev)
{
	struct device_domain_info *info;
	struct pasid_table *pasid_table;
	struct pasid_dir_entry *dir;
	struct pasid_entry *table;
	int i, max_pde;

	info = dev_iommu_priv_get(dev);
	if (!info || !dev_is_pci(dev) || !info->pasid_table)
		return;

	pasid_table = info->pasid_table;
	info->pasid_table = NULL;

	/* Free scalable mode PASID directory tables: */
	dir = pasid_table->table;
	max_pde = pasid_table->max_pasid >> PASID_PDE_SHIFT;
	for (i = 0; i < max_pde; i++) {
		table = get_pasid_table_from_pde(&dir[i]);
		free_pgtable_page(table);
	}

	free_pages((unsigned long)pasid_table->table, pasid_table->order);
	kfree(pasid_table);
}

struct pasid_table *intel_pasid_get_table(struct device *dev)
{
	struct device_domain_info *info;

	info = dev_iommu_priv_get(dev);
	if (!info)
		return NULL;

	return info->pasid_table;
}

static int intel_pasid_get_dev_max_id(struct device *dev)
{
	struct device_domain_info *info;

	info = dev_iommu_priv_get(dev);
	if (!info || !info->pasid_table)
		return 0;

	return info->pasid_table->max_pasid;
}
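
/*
 * Look up the PASID table entry for @pasid, allocating the leaf page of
 * entries on first use. The table is two-level: the directory page set
 * up in intel_pasid_alloc_table() holds PASID directory entries, each
 * pointing to a page of PASID table entries; @pasid is split into a
 * directory slot (pasid >> PASID_PDE_SHIFT) and an index within that
 * page (pasid & PASID_PTE_MASK). Callers typically hold iommu->lock,
 * hence the GFP_ATOMIC allocation below.
 */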
static struct pasid_entry *intel_pasid_get_entry(struct device *dev, u32 pasid)
{
	struct device_domain_info *info;
	struct pasid_table *pasid_table;
	struct pasid_dir_entry *dir;
	struct pasid_entry *entries;
	int dir_index, index;

	pasid_table = intel_pasid_get_table(dev);
	if (WARN_ON(!pasid_table || pasid >= intel_pasid_get_dev_max_id(dev)))
		return NULL;

	dir = pasid_table->table;
	info = dev_iommu_priv_get(dev);
	dir_index = pasid >> PASID_PDE_SHIFT;
	index = pasid & PASID_PTE_MASK;

retry:
	entries = get_pasid_table_from_pde(&dir[dir_index]);
	if (!entries) {
		entries = alloc_pgtable_page(info->iommu->node, GFP_ATOMIC);
		if (!entries)
			return NULL;

		/*
		 * The pasid directory table entry won't be freed after
		 * allocation. No worry about the race with free and
		 * clear. However, this entry might be populated by others
		 * while we are preparing it. Use theirs with a retry.
		 */
		if (cmpxchg64(&dir[dir_index].val, 0ULL,
			      (u64)virt_to_phys(entries) | PASID_PTE_PRESENT)) {
			free_pgtable_page(entries);
			goto retry;
		}
		if (!ecap_coherent(info->iommu->ecap)) {
			clflush_cache_range(entries, VTD_PAGE_SIZE);
			clflush_cache_range(&dir[dir_index].val, sizeof(*dir));
		}
	}

	return &entries[index];
}

/*
 * Interfaces for PASID table entry manipulation:
 */
static void
intel_pasid_clear_entry(struct device *dev, u32 pasid, bool fault_ignore)
{
	struct pasid_entry *pe;

	pe = intel_pasid_get_entry(dev, pasid);
	if (WARN_ON(!pe))
		return;

	if (fault_ignore && pasid_pte_is_present(pe))
		pasid_clear_entry_with_fpd(pe);
	else
		pasid_clear_entry(pe);
}

static void
pasid_cache_invalidation_with_pasid(struct intel_iommu *iommu,
				    u16 did, u32 pasid)
{
	struct qi_desc desc;

	desc.qw0 = QI_PC_DID(did) | QI_PC_GRAN(QI_PC_PASID_SEL) |
		QI_PC_PASID(pasid) | QI_PC_TYPE;
	desc.qw1 = 0;
	desc.qw2 = 0;
	desc.qw3 = 0;

	qi_submit_sync(iommu, &desc, 1, 0);
}

static void
devtlb_invalidation_with_pasid(struct intel_iommu *iommu,
			       struct device *dev, u32 pasid)
{
	struct device_domain_info *info;
	u16 sid, qdep, pfsid;

	info = dev_iommu_priv_get(dev);
	if (!info || !info->ats_enabled)
		return;

	sid = info->bus << 8 | info->devfn;
	qdep = info->ats_qdep;
	pfsid = info->pfsid;

	/*
	 * When PASID 0 is used, it indicates RID2PASID (a DMA request
	 * without PASID), so a devTLB flush without PASID should be used.
	 * For a non-zero PASID under SVA usage, the device could do DMA
	 * with multiple PASIDs; it is more efficient to flush the devTLB
	 * specific to the PASID.
	 */
	if (pasid == IOMMU_NO_PASID)
		qi_flush_dev_iotlb(iommu, sid, pfsid, qdep, 0, 64 - VTD_PAGE_SHIFT);
	else
		qi_flush_dev_iotlb_pasid(iommu, sid, pfsid, pasid, qdep, 0, 64 - VTD_PAGE_SHIFT);
}
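
/*
 * Tear down the PASID entry for @pasid on @dev: clear the entry under
 * iommu->lock, then invalidate whatever may still hold a stale copy of
 * it - the PASID cache, the IOTLB (PASID-based for first-level and
 * pass-through entries, domain-selective otherwise) and, unless the
 * IOMMU is in caching mode, the device TLB.
 */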
void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev,
				 u32 pasid, bool fault_ignore)
{
	struct pasid_entry *pte;
	u16 did, pgtt;

	spin_lock(&iommu->lock);
	pte = intel_pasid_get_entry(dev, pasid);
	if (WARN_ON(!pte) || !pasid_pte_is_present(pte)) {
		spin_unlock(&iommu->lock);
		return;
	}

	did = pasid_get_domain_id(pte);
	pgtt = pasid_pte_get_pgtt(pte);
	intel_pasid_clear_entry(dev, pasid, fault_ignore);
	spin_unlock(&iommu->lock);

	if (!ecap_coherent(iommu->ecap))
		clflush_cache_range(pte, sizeof(*pte));

	pasid_cache_invalidation_with_pasid(iommu, did, pasid);

	if (pgtt == PASID_ENTRY_PGTT_PT || pgtt == PASID_ENTRY_PGTT_FL_ONLY)
		qi_flush_piotlb(iommu, did, pasid, 0, -1, 0);
	else
		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);

	/* Device IOTLB doesn't need to be flushed in caching mode. */
	if (!cap_caching_mode(iommu->cap))
		devtlb_invalidation_with_pasid(iommu, dev, pasid);
}

/*
 * This function flushes the cache for a newly set-up pasid table entry.
 * Callers must not modify in-use pasid table entries.
 */
static void pasid_flush_caches(struct intel_iommu *iommu,
			       struct pasid_entry *pte,
			       u32 pasid, u16 did)
{
	if (!ecap_coherent(iommu->ecap))
		clflush_cache_range(pte, sizeof(*pte));

	if (cap_caching_mode(iommu->cap)) {
		pasid_cache_invalidation_with_pasid(iommu, did, pasid);
		qi_flush_piotlb(iommu, did, pasid, 0, -1, 0);
	} else {
		iommu_flush_write_buffer(iommu);
	}
}

/*
 * Set up the scalable mode pasid table entry for first level only
 * translation type.
 */
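/*
 * A first-level entry points the IOMMU at a CPU-style page table via
 * FLPTR (set to __pa(pgd) below); callers such as the SVA code are
 * expected to pass the page table they want the device to share, for
 * example an mm's pgd. PASID_FLAG_FL5LP selects 5-level paging and is
 * only valid when the hardware advertises cap_fl5lp_support().
 */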
int intel_pasid_setup_first_level(struct intel_iommu *iommu,
				  struct device *dev, pgd_t *pgd,
				  u32 pasid, u16 did, int flags)
{
	struct pasid_entry *pte;

	if (!ecap_flts(iommu->ecap)) {
		pr_err("No first level translation support on %s\n",
		       iommu->name);
		return -EINVAL;
	}

	if ((flags & PASID_FLAG_FL5LP) && !cap_fl5lp_support(iommu->cap)) {
		pr_err("No 5-level paging support for first-level on %s\n",
		       iommu->name);
		return -EINVAL;
	}

	spin_lock(&iommu->lock);
	pte = intel_pasid_get_entry(dev, pasid);
	if (!pte) {
		spin_unlock(&iommu->lock);
		return -ENODEV;
	}

	if (pasid_pte_is_present(pte)) {
		spin_unlock(&iommu->lock);
		return -EBUSY;
	}

	pasid_clear_entry(pte);

	/* Setup the first level page table pointer: */
	pasid_set_flptr(pte, (u64)__pa(pgd));

	if (flags & PASID_FLAG_FL5LP)
		pasid_set_flpm(pte, 1);

	if (flags & PASID_FLAG_PAGE_SNOOP)
		pasid_set_pgsnp(pte);

	pasid_set_domain_id(pte, did);
	pasid_set_address_width(pte, iommu->agaw);
	pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap));
	pasid_set_nxe(pte);

	/* Setup Present and PASID Granular Transfer Type: */
	pasid_set_translation_type(pte, PASID_ENTRY_PGTT_FL_ONLY);
	pasid_set_present(pte);
	spin_unlock(&iommu->lock);

	pasid_flush_caches(iommu, pte, pasid, did);

	return 0;
}

/*
 * Skip the top levels of the page tables for an iommu which has a
 * smaller agaw than the domain's. Unnecessary for PT mode.
 */
static int iommu_skip_agaw(struct dmar_domain *domain,
			   struct intel_iommu *iommu,
			   struct dma_pte **pgd)
{
	int agaw;

	for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
		*pgd = phys_to_virt(dma_pte_addr(*pgd));
		if (!dma_pte_present(*pgd))
			return -EINVAL;
	}

	return agaw;
}

/*
 * Set up the scalable mode pasid entry for second level only
 * translation type.
 */
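/*
 * A second-level entry reuses the domain's DMA remapping page table:
 * SLPTR points at domain->pgd (after iommu_skip_agaw() has dropped any
 * top levels this IOMMU cannot walk) and the address width is the
 * resulting agaw, so requests tagged with this PASID are translated
 * through the same page tables as the rest of the domain.
 */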
int intel_pasid_setup_second_level(struct intel_iommu *iommu,
				   struct dmar_domain *domain,
				   struct device *dev, u32 pasid)
{
	struct pasid_entry *pte;
	struct dma_pte *pgd;
	u64 pgd_val;
	int agaw;
	u16 did;

	/*
	 * If hardware advertises no support for second level
	 * translation, return directly.
	 */
	if (!ecap_slts(iommu->ecap)) {
		pr_err("No second level translation support on %s\n",
		       iommu->name);
		return -EINVAL;
	}

	pgd = domain->pgd;
	agaw = iommu_skip_agaw(domain, iommu, &pgd);
	if (agaw < 0) {
		dev_err(dev, "Invalid domain page table\n");
		return -EINVAL;
	}

	pgd_val = virt_to_phys(pgd);
	did = domain_id_iommu(domain, iommu);

	spin_lock(&iommu->lock);
	pte = intel_pasid_get_entry(dev, pasid);
	if (!pte) {
		spin_unlock(&iommu->lock);
		return -ENODEV;
	}

	if (pasid_pte_is_present(pte)) {
		spin_unlock(&iommu->lock);
		return -EBUSY;
	}

	pasid_clear_entry(pte);
	pasid_set_domain_id(pte, did);
	pasid_set_slptr(pte, pgd_val);
	pasid_set_address_width(pte, agaw);
	pasid_set_translation_type(pte, PASID_ENTRY_PGTT_SL_ONLY);
	pasid_set_fault_enable(pte);
	pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap));
	if (domain->dirty_tracking)
		pasid_set_ssade(pte);

	pasid_set_present(pte);
	spin_unlock(&iommu->lock);

	pasid_flush_caches(iommu, pte, pasid, did);

	return 0;
}

/*
 * Set up dirty tracking on a second level only or nested translation type.
 */
int intel_pasid_setup_dirty_tracking(struct intel_iommu *iommu,
				     struct device *dev, u32 pasid,
				     bool enabled)
{
	struct pasid_entry *pte;
	u16 did, pgtt;

	spin_lock(&iommu->lock);

	pte = intel_pasid_get_entry(dev, pasid);
	if (!pte) {
		spin_unlock(&iommu->lock);
		dev_err_ratelimited(
			dev, "Failed to get pasid entry of PASID %d\n", pasid);
		return -ENODEV;
	}

	did = pasid_get_domain_id(pte);
	pgtt = pasid_pte_get_pgtt(pte);
	if (pgtt != PASID_ENTRY_PGTT_SL_ONLY &&
	    pgtt != PASID_ENTRY_PGTT_NESTED) {
		spin_unlock(&iommu->lock);
		dev_err_ratelimited(
			dev,
			"Dirty tracking not supported on translation type %d\n",
			pgtt);
		return -EOPNOTSUPP;
	}

	if (pasid_get_ssade(pte) == enabled) {
		spin_unlock(&iommu->lock);
		return 0;
	}

	if (enabled)
		pasid_set_ssade(pte);
	else
		pasid_clear_ssade(pte);
	spin_unlock(&iommu->lock);

	if (!ecap_coherent(iommu->ecap))
		clflush_cache_range(pte, sizeof(*pte));

	/*
	 * From VT-d spec table 25 "Guidance to Software for Invalidations":
	 *
	 * - PASID-selective-within-Domain PASID-cache invalidation
	 *   If (PGTT=SS or Nested)
	 *    - Domain-selective IOTLB invalidation
	 *   Else
	 *    - PASID-selective PASID-based IOTLB invalidation
	 * - If (pasid is RID_PASID)
	 *    - Global Device-TLB invalidation to affected functions
	 *   Else
	 *    - PASID-based Device-TLB invalidation (with S=1 and
	 *      Addr[63:12]=0x7FFFFFFF_FFFFF) to affected functions
	 */
	pasid_cache_invalidation_with_pasid(iommu, did, pasid);

	iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);

	/* Device IOTLB doesn't need to be flushed in caching mode. */
	if (!cap_caching_mode(iommu->cap))
		devtlb_invalidation_with_pasid(iommu, dev, pasid);

	return 0;
}

/*
 * Set up the scalable mode pasid entry for passthrough translation type.
 */
int intel_pasid_setup_pass_through(struct intel_iommu *iommu,
				   struct device *dev, u32 pasid)
{
	u16 did = FLPT_DEFAULT_DID;
	struct pasid_entry *pte;

	spin_lock(&iommu->lock);
	pte = intel_pasid_get_entry(dev, pasid);
	if (!pte) {
		spin_unlock(&iommu->lock);
		return -ENODEV;
	}

	if (pasid_pte_is_present(pte)) {
		spin_unlock(&iommu->lock);
		return -EBUSY;
	}

	pasid_clear_entry(pte);
	pasid_set_domain_id(pte, did);
	pasid_set_address_width(pte, iommu->agaw);
	pasid_set_translation_type(pte, PASID_ENTRY_PGTT_PT);
	pasid_set_fault_enable(pte);
	pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap));
	pasid_set_present(pte);
	spin_unlock(&iommu->lock);

	pasid_flush_caches(iommu, pte, pasid, did);

	return 0;
}

/*
 * Set the page snoop control for a pasid entry which has been set up.
 */
void intel_pasid_setup_page_snoop_control(struct intel_iommu *iommu,
					  struct device *dev, u32 pasid)
{
	struct pasid_entry *pte;
	u16 did;

	spin_lock(&iommu->lock);
	pte = intel_pasid_get_entry(dev, pasid);
	if (WARN_ON(!pte || !pasid_pte_is_present(pte))) {
		spin_unlock(&iommu->lock);
		return;
	}

	pasid_set_pgsnp(pte);
	did = pasid_get_domain_id(pte);
	spin_unlock(&iommu->lock);

	if (!ecap_coherent(iommu->ecap))
		clflush_cache_range(pte, sizeof(*pte));

	/*
	 * VT-d spec, section 3.4, table 23 gives guidance for cache
	 * invalidation:
	 *
	 * - PASID-selective-within-Domain PASID-cache invalidation
	 * - PASID-selective PASID-based IOTLB invalidation
	 * - If (pasid is RID_PASID)
	 *    - Global Device-TLB invalidation to affected functions
	 *   Else
	 *    - PASID-based Device-TLB invalidation (with S=1 and
	 *      Addr[63:12]=0x7FFFFFFF_FFFFF) to affected functions
	 */
	pasid_cache_invalidation_with_pasid(iommu, did, pasid);
	qi_flush_piotlb(iommu, did, pasid, 0, -1, 0);

	/* Device IOTLB doesn't need to be flushed in caching mode. */
	if (!cap_caching_mode(iommu->cap))
		devtlb_invalidation_with_pasid(iommu, dev, pasid);
}
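
/*
 * In nested mode the first-stage page tables come from user space (for
 * example a guest), while the parent s2_domain supplies the second
 * stage; hardware translates first-stage pointers and accesses through
 * the second stage, which is why both FLPTR and SLPTR are programmed
 * below.
 */
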
/**
 * intel_pasid_setup_nested() - Set up PASID entry for nested translation.
 * @iommu: IOMMU which the device belongs to
 * @dev: Device to be set up for translation
 * @pasid: PASID to be programmed in the device PASID table
 * @domain: User stage-1 domain nested on a stage-2 domain
 *
 * This is used for nested translation. The input domain should be of
 * nested type, nested on a parent domain with the 'is_nested_parent'
 * flag set.
 */
int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev,
			     u32 pasid, struct dmar_domain *domain)
{
	struct iommu_hwpt_vtd_s1 *s1_cfg = &domain->s1_cfg;
	pgd_t *s1_gpgd = (pgd_t *)(uintptr_t)domain->s1_pgtbl;
	struct dmar_domain *s2_domain = domain->s2_domain;
	u16 did = domain_id_iommu(domain, iommu);
	struct dma_pte *pgd = s2_domain->pgd;
	struct pasid_entry *pte;

	/* Address width should match the address width supported by hardware */
	switch (s1_cfg->addr_width) {
	case ADDR_WIDTH_4LEVEL:
		break;
	case ADDR_WIDTH_5LEVEL:
		if (!cap_fl5lp_support(iommu->cap)) {
			dev_err_ratelimited(dev,
					    "5-level paging not supported\n");
			return -EINVAL;
		}
		break;
	default:
		dev_err_ratelimited(dev, "Invalid stage-1 address width %d\n",
				    s1_cfg->addr_width);
		return -EINVAL;
	}

	if ((s1_cfg->flags & IOMMU_VTD_S1_SRE) && !ecap_srs(iommu->ecap)) {
		pr_err_ratelimited("No supervisor request support on %s\n",
				   iommu->name);
		return -EINVAL;
	}

	if ((s1_cfg->flags & IOMMU_VTD_S1_EAFE) && !ecap_eafs(iommu->ecap)) {
		pr_err_ratelimited("No extended access flag support on %s\n",
				   iommu->name);
		return -EINVAL;
	}

	spin_lock(&iommu->lock);
	pte = intel_pasid_get_entry(dev, pasid);
	if (!pte) {
		spin_unlock(&iommu->lock);
		return -ENODEV;
	}
	if (pasid_pte_is_present(pte)) {
		spin_unlock(&iommu->lock);
		return -EBUSY;
	}

	pasid_clear_entry(pte);

	if (s1_cfg->addr_width == ADDR_WIDTH_5LEVEL)
		pasid_set_flpm(pte, 1);

	pasid_set_flptr(pte, (uintptr_t)s1_gpgd);

	if (s1_cfg->flags & IOMMU_VTD_S1_SRE) {
		pasid_set_sre(pte);
		if (s1_cfg->flags & IOMMU_VTD_S1_WPE)
			pasid_set_wpe(pte);
	}

	if (s1_cfg->flags & IOMMU_VTD_S1_EAFE)
		pasid_set_eafe(pte);

	if (s2_domain->force_snooping)
		pasid_set_pgsnp(pte);

	pasid_set_slptr(pte, virt_to_phys(pgd));
	pasid_set_fault_enable(pte);
	pasid_set_domain_id(pte, did);
	pasid_set_address_width(pte, s2_domain->agaw);
	pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap));
	if (s2_domain->dirty_tracking)
		pasid_set_ssade(pte);
	pasid_set_translation_type(pte, PASID_ENTRY_PGTT_NESTED);
	pasid_set_present(pte);
	spin_unlock(&iommu->lock);

	pasid_flush_caches(iommu, pte, pasid, did);

	return 0;
}