// SPDX-License-Identifier: GPL-2.0
/*
 * intel-pasid.c - PASID idr, table and entry manipulation
 *
 * Copyright (C) 2018 Intel Corporation
 *
 * Author: Lu Baolu <baolu.lu@linux.intel.com>
 */

#define pr_fmt(fmt)	"DMAR: " fmt

#include <linux/bitops.h>
#include <linux/cpufeature.h>
#include <linux/dmar.h>
#include <linux/iommu.h>
#include <linux/memory.h>
#include <linux/pci.h>
#include <linux/pci-ats.h>
#include <linux/spinlock.h>

#include "iommu.h"
#include "pasid.h"

/*
 * Intel IOMMU system wide PASID name space:
 */
u32 intel_pasid_max_id = PASID_MAX;

/*
 * Per device pasid table management:
 */

/*
 * Allocate a pasid table for @dev. It should be called in a
 * single-thread context.
 */
int intel_pasid_alloc_table(struct device *dev)
{
	struct device_domain_info *info;
	struct pasid_table *pasid_table;
	struct page *pages;
	u32 max_pasid = 0;
	int order, size;

	might_sleep();
	info = dev_iommu_priv_get(dev);
	if (WARN_ON(!info || !dev_is_pci(dev)))
		return -ENODEV;
	if (WARN_ON(info->pasid_table))
		return -EEXIST;

	pasid_table = kzalloc(sizeof(*pasid_table), GFP_KERNEL);
	if (!pasid_table)
		return -ENOMEM;

	if (info->pasid_supported)
		max_pasid = min_t(u32, pci_max_pasids(to_pci_dev(dev)),
				  intel_pasid_max_id);

	size = max_pasid >> (PASID_PDE_SHIFT - 3);
	order = size ? get_order(size) : 0;
	pages = alloc_pages_node(info->iommu->node,
				 GFP_KERNEL | __GFP_ZERO, order);
	if (!pages) {
		kfree(pasid_table);
		return -ENOMEM;
	}

	pasid_table->table = page_address(pages);
	pasid_table->order = order;
	pasid_table->max_pasid = 1 << (order + PAGE_SHIFT + 3);
	info->pasid_table = pasid_table;

	if (!ecap_coherent(info->iommu->ecap))
		clflush_cache_range(pasid_table->table, (1 << order) * PAGE_SIZE);

	return 0;
}

void intel_pasid_free_table(struct device *dev)
{
	struct device_domain_info *info;
	struct pasid_table *pasid_table;
	struct pasid_dir_entry *dir;
	struct pasid_entry *table;
	int i, max_pde;

	info = dev_iommu_priv_get(dev);
	if (!info || !dev_is_pci(dev) || !info->pasid_table)
		return;

	pasid_table = info->pasid_table;
	info->pasid_table = NULL;

	/* Free scalable mode PASID directory tables: */
	dir = pasid_table->table;
	max_pde = pasid_table->max_pasid >> PASID_PDE_SHIFT;
	for (i = 0; i < max_pde; i++) {
		table = get_pasid_table_from_pde(&dir[i]);
		free_pgtable_page(table);
	}

	free_pages((unsigned long)pasid_table->table, pasid_table->order);
	kfree(pasid_table);
}

struct pasid_table *intel_pasid_get_table(struct device *dev)
{
	struct device_domain_info *info;

	info = dev_iommu_priv_get(dev);
	if (!info)
		return NULL;

	return info->pasid_table;
}

static int intel_pasid_get_dev_max_id(struct device *dev)
{
	struct device_domain_info *info;

	info = dev_iommu_priv_get(dev);
	if (!info || !info->pasid_table)
		return 0;

	return info->pasid_table->max_pasid;
}
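/*
 * Editor's worked example (illustrative only, not used by the code): how a
 * PASID value is split for the two-level lookup in intel_pasid_get_entry()
 * below. The real constants live in pasid.h; the values assumed here
 * (PASID_PDE_SHIFT == 6, PASID_PTE_MASK == 0x3f, i.e. one directory entry
 * covers 64 PASID table entries) are assumptions made only for this example.
 *
 *	pasid     = 0x1234;
 *	dir_index = pasid >> PASID_PDE_SHIFT;	// 0x48: slot in the directory
 *	index     = pasid & PASID_PTE_MASK;	// 0x34: entry within the table
 *						// that the directory slot
 *						// points to
 */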
static struct pasid_entry *intel_pasid_get_entry(struct device *dev, u32 pasid)
{
	struct device_domain_info *info;
	struct pasid_table *pasid_table;
	struct pasid_dir_entry *dir;
	struct pasid_entry *entries;
	int dir_index, index;

	pasid_table = intel_pasid_get_table(dev);
	if (WARN_ON(!pasid_table || pasid >= intel_pasid_get_dev_max_id(dev)))
		return NULL;

	dir = pasid_table->table;
	info = dev_iommu_priv_get(dev);
	dir_index = pasid >> PASID_PDE_SHIFT;
	index = pasid & PASID_PTE_MASK;

retry:
	entries = get_pasid_table_from_pde(&dir[dir_index]);
	if (!entries) {
		entries = alloc_pgtable_page(info->iommu->node, GFP_ATOMIC);
		if (!entries)
			return NULL;

		/*
		 * The pasid directory table entry won't be freed after
		 * allocation, so there is no race with free and clear to
		 * worry about. However, another thread might populate this
		 * entry while we are preparing it. If so, drop our
		 * allocation and retry so we use theirs.
		 */
		if (cmpxchg64(&dir[dir_index].val, 0ULL,
			      (u64)virt_to_phys(entries) | PASID_PTE_PRESENT)) {
			free_pgtable_page(entries);
			goto retry;
		}
		if (!ecap_coherent(info->iommu->ecap)) {
			clflush_cache_range(entries, VTD_PAGE_SIZE);
			clflush_cache_range(&dir[dir_index].val, sizeof(*dir));
		}
	}

	return &entries[index];
}

/*
 * Interfaces for PASID table entry manipulation:
 */
static void
intel_pasid_clear_entry(struct device *dev, u32 pasid, bool fault_ignore)
{
	struct pasid_entry *pe;

	pe = intel_pasid_get_entry(dev, pasid);
	if (WARN_ON(!pe))
		return;

	if (fault_ignore && pasid_pte_is_present(pe))
		pasid_clear_entry_with_fpd(pe);
	else
		pasid_clear_entry(pe);
}

static void
pasid_cache_invalidation_with_pasid(struct intel_iommu *iommu,
				    u16 did, u32 pasid)
{
	struct qi_desc desc;

	desc.qw0 = QI_PC_DID(did) | QI_PC_GRAN(QI_PC_PASID_SEL) |
		QI_PC_PASID(pasid) | QI_PC_TYPE;
	desc.qw1 = 0;
	desc.qw2 = 0;
	desc.qw3 = 0;

	qi_submit_sync(iommu, &desc, 1, 0);
}

static void
devtlb_invalidation_with_pasid(struct intel_iommu *iommu,
			       struct device *dev, u32 pasid)
{
	struct device_domain_info *info;
	u16 sid, qdep, pfsid;

	info = dev_iommu_priv_get(dev);
	if (!info || !info->ats_enabled)
		return;

	sid = info->bus << 8 | info->devfn;
	qdep = info->ats_qdep;
	pfsid = info->pfsid;

	/*
	 * PASID 0 is RID2PASID, i.e. a DMA request without PASID, so a
	 * devTLB flush without PASID is used. For a non-zero PASID under
	 * SVA usage, the device may do DMA with multiple PASIDs, so it is
	 * more efficient to flush the devTLB entries specific to that PASID.
	 */
	if (pasid == IOMMU_NO_PASID)
		qi_flush_dev_iotlb(iommu, sid, pfsid, qdep, 0, 64 - VTD_PAGE_SHIFT);
	else
		qi_flush_dev_iotlb_pasid(iommu, sid, pfsid, pasid, qdep, 0, 64 - VTD_PAGE_SHIFT);
}
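/*
 * Editor's note on the helper above: passing address 0 with a size order of
 * (64 - VTD_PAGE_SHIFT) asks the queued-invalidation helpers to encode the
 * largest possible span, i.e. S=1 with Addr[63:12]=0x7FFFFFFF_FFFFF as cited
 * in the invalidation guidance comments later in this file. The whole device
 * TLB for the function (or for the given PASID) is flushed, rather than a
 * single page at address zero.
 */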
void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev,
				 u32 pasid, bool fault_ignore)
{
	struct pasid_entry *pte;
	u16 did, pgtt;

	spin_lock(&iommu->lock);
	pte = intel_pasid_get_entry(dev, pasid);
	if (WARN_ON(!pte) || !pasid_pte_is_present(pte)) {
		spin_unlock(&iommu->lock);
		return;
	}

	did = pasid_get_domain_id(pte);
	pgtt = pasid_pte_get_pgtt(pte);
	intel_pasid_clear_entry(dev, pasid, fault_ignore);
	spin_unlock(&iommu->lock);

	if (!ecap_coherent(iommu->ecap))
		clflush_cache_range(pte, sizeof(*pte));

	pasid_cache_invalidation_with_pasid(iommu, did, pasid);

	if (pgtt == PASID_ENTRY_PGTT_PT || pgtt == PASID_ENTRY_PGTT_FL_ONLY)
		qi_flush_piotlb(iommu, did, pasid, 0, -1, 0);
	else
		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);

	/* Device IOTLB doesn't need to be flushed in caching mode. */
	if (!cap_caching_mode(iommu->cap))
		devtlb_invalidation_with_pasid(iommu, dev, pasid);
}

/*
 * This function flushes cache for a newly setup pasid table entry.
 * Callers must not modify in-use pasid table entries.
 */
static void pasid_flush_caches(struct intel_iommu *iommu,
			       struct pasid_entry *pte,
			       u32 pasid, u16 did)
{
	if (!ecap_coherent(iommu->ecap))
		clflush_cache_range(pte, sizeof(*pte));

	if (cap_caching_mode(iommu->cap)) {
		pasid_cache_invalidation_with_pasid(iommu, did, pasid);
		qi_flush_piotlb(iommu, did, pasid, 0, -1, 0);
	} else {
		iommu_flush_write_buffer(iommu);
	}
}

/*
 * Set up the scalable mode pasid table entry for the first-level-only
 * translation type.
 */
int intel_pasid_setup_first_level(struct intel_iommu *iommu,
				  struct device *dev, pgd_t *pgd,
				  u32 pasid, u16 did, int flags)
{
	struct pasid_entry *pte;

	if (!ecap_flts(iommu->ecap)) {
		pr_err("No first level translation support on %s\n",
		       iommu->name);
		return -EINVAL;
	}

	if ((flags & PASID_FLAG_FL5LP) && !cap_fl5lp_support(iommu->cap)) {
		pr_err("No 5-level paging support for first-level on %s\n",
		       iommu->name);
		return -EINVAL;
	}

	spin_lock(&iommu->lock);
	pte = intel_pasid_get_entry(dev, pasid);
	if (!pte) {
		spin_unlock(&iommu->lock);
		return -ENODEV;
	}

	if (pasid_pte_is_present(pte)) {
		spin_unlock(&iommu->lock);
		return -EBUSY;
	}

	pasid_clear_entry(pte);

	/* Setup the first level page table pointer: */
	pasid_set_flptr(pte, (u64)__pa(pgd));

	if (flags & PASID_FLAG_FL5LP)
		pasid_set_flpm(pte, 1);

	if (flags & PASID_FLAG_PAGE_SNOOP)
		pasid_set_pgsnp(pte);

	pasid_set_domain_id(pte, did);
	pasid_set_address_width(pte, iommu->agaw);
	pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap));
	pasid_set_nxe(pte);

	/* Setup Present and PASID Granular Transfer Type: */
	pasid_set_translation_type(pte, PASID_ENTRY_PGTT_FL_ONLY);
	pasid_set_present(pte);
	spin_unlock(&iommu->lock);

	pasid_flush_caches(iommu, pte, pasid, did);

	return 0;
}

/*
 * Skip the top levels of the page table for an iommu whose agaw is
 * smaller than the domain's. Unnecessary for PT mode.
 */
static int iommu_skip_agaw(struct dmar_domain *domain,
			   struct intel_iommu *iommu,
			   struct dma_pte **pgd)
{
	int agaw;

	for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
		*pgd = phys_to_virt(dma_pte_addr(*pgd));
		if (!dma_pte_present(*pgd))
			return -EINVAL;
	}

	return agaw;
}
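/*
 * Editor's worked example for iommu_skip_agaw() above (illustrative; the
 * agaw encoding is defined elsewhere in the driver, and the level mapping
 * used here is an assumption for the example): if the domain was built with
 * agaw 3 (a 4-level table) but this iommu only supports agaw 2 (3 levels),
 * the loop runs once, descending through the first (lowest-address) entry
 * of the top-level table so that *pgd ends up pointing at a 3-level table
 * the iommu can walk, and 2 is returned. If that entry is not present,
 * -EINVAL is returned instead.
 */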
/*
 * Set up the scalable mode pasid entry for the second-level-only
 * translation type.
 */
int intel_pasid_setup_second_level(struct intel_iommu *iommu,
				   struct dmar_domain *domain,
				   struct device *dev, u32 pasid)
{
	struct pasid_entry *pte;
	struct dma_pte *pgd;
	u64 pgd_val;
	int agaw;
	u16 did;

	/*
	 * If hardware advertises no support for second level
	 * translation, return directly.
	 */
	if (!ecap_slts(iommu->ecap)) {
		pr_err("No second level translation support on %s\n",
		       iommu->name);
		return -EINVAL;
	}

	pgd = domain->pgd;
	agaw = iommu_skip_agaw(domain, iommu, &pgd);
	if (agaw < 0) {
		dev_err(dev, "Invalid domain page table\n");
		return -EINVAL;
	}

	pgd_val = virt_to_phys(pgd);
	did = domain_id_iommu(domain, iommu);

	spin_lock(&iommu->lock);
	pte = intel_pasid_get_entry(dev, pasid);
	if (!pte) {
		spin_unlock(&iommu->lock);
		return -ENODEV;
	}

	if (pasid_pte_is_present(pte)) {
		spin_unlock(&iommu->lock);
		return -EBUSY;
	}

	pasid_clear_entry(pte);
	pasid_set_domain_id(pte, did);
	pasid_set_slptr(pte, pgd_val);
	pasid_set_address_width(pte, agaw);
	pasid_set_translation_type(pte, PASID_ENTRY_PGTT_SL_ONLY);
	pasid_set_fault_enable(pte);
	pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap));
	if (domain->dirty_tracking)
		pasid_set_ssade(pte);

	pasid_set_present(pte);
	spin_unlock(&iommu->lock);

	pasid_flush_caches(iommu, pte, pasid, did);

	return 0;
}

/*
 * Set up dirty tracking on a second-level-only or nested translation type.
 */
int intel_pasid_setup_dirty_tracking(struct intel_iommu *iommu,
				     struct dmar_domain *domain,
				     struct device *dev, u32 pasid,
				     bool enabled)
{
	struct pasid_entry *pte;
	u16 did, pgtt;

	spin_lock(&iommu->lock);

	pte = intel_pasid_get_entry(dev, pasid);
	if (!pte) {
		spin_unlock(&iommu->lock);
		dev_err_ratelimited(
			dev, "Failed to get pasid entry of PASID %d\n", pasid);
		return -ENODEV;
	}

	did = domain_id_iommu(domain, iommu);
	pgtt = pasid_pte_get_pgtt(pte);
	if (pgtt != PASID_ENTRY_PGTT_SL_ONLY &&
	    pgtt != PASID_ENTRY_PGTT_NESTED) {
		spin_unlock(&iommu->lock);
		dev_err_ratelimited(
			dev,
			"Dirty tracking not supported on translation type %d\n",
			pgtt);
		return -EOPNOTSUPP;
	}

	if (pasid_get_ssade(pte) == enabled) {
		spin_unlock(&iommu->lock);
		return 0;
	}

	if (enabled)
		pasid_set_ssade(pte);
	else
		pasid_clear_ssade(pte);
	spin_unlock(&iommu->lock);

	if (!ecap_coherent(iommu->ecap))
		clflush_cache_range(pte, sizeof(*pte));

	/*
	 * From VT-d spec table 25 "Guidance to Software for Invalidations":
	 *
	 * - PASID-selective-within-Domain PASID-cache invalidation
	 *   If (PGTT=SS or Nested)
	 *    - Domain-selective IOTLB invalidation
	 *   Else
	 *    - PASID-selective PASID-based IOTLB invalidation
	 * - If (pasid is RID_PASID)
	 *    - Global Device-TLB invalidation to affected functions
	 *   Else
	 *    - PASID-based Device-TLB invalidation (with S=1 and
	 *      Addr[63:12]=0x7FFFFFFF_FFFFF) to affected functions
	 */
	pasid_cache_invalidation_with_pasid(iommu, did, pasid);

	iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);

	/* Device IOTLB doesn't need to be flushed in caching mode. */
	if (!cap_caching_mode(iommu->cap))
		devtlb_invalidation_with_pasid(iommu, dev, pasid);

	return 0;
}
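/*
 * Editor's call sketch for intel_pasid_setup_dirty_tracking() above (the
 * caller shown is hypothetical, not part of this file): a dirty-tracking
 * domain op would be expected to walk every {device, pasid} attached to the
 * domain and toggle SSADE through the helper, e.g.
 *
 *	ret = intel_pasid_setup_dirty_tracking(iommu, dmar_domain, dev,
 *					       IOMMU_NO_PASID, enable);
 *
 * The helper returns 0 without flushing anything when SSADE already matches
 * @enabled, so such a walk may call it unconditionally.
 */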
/*
 * Set up the scalable mode pasid entry for passthrough translation type.
 */
int intel_pasid_setup_pass_through(struct intel_iommu *iommu,
				   struct device *dev, u32 pasid)
{
	u16 did = FLPT_DEFAULT_DID;
	struct pasid_entry *pte;

	spin_lock(&iommu->lock);
	pte = intel_pasid_get_entry(dev, pasid);
	if (!pte) {
		spin_unlock(&iommu->lock);
		return -ENODEV;
	}

	if (pasid_pte_is_present(pte)) {
		spin_unlock(&iommu->lock);
		return -EBUSY;
	}

	pasid_clear_entry(pte);
	pasid_set_domain_id(pte, did);
	pasid_set_address_width(pte, iommu->agaw);
	pasid_set_translation_type(pte, PASID_ENTRY_PGTT_PT);
	pasid_set_fault_enable(pte);
	pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap));
	pasid_set_present(pte);
	spin_unlock(&iommu->lock);

	pasid_flush_caches(iommu, pte, pasid, did);

	return 0;
}

/*
 * Set the page snoop control for a pasid entry which has been set up.
 */
void intel_pasid_setup_page_snoop_control(struct intel_iommu *iommu,
					  struct device *dev, u32 pasid)
{
	struct pasid_entry *pte;
	u16 did;

	spin_lock(&iommu->lock);
	pte = intel_pasid_get_entry(dev, pasid);
	if (WARN_ON(!pte || !pasid_pte_is_present(pte))) {
		spin_unlock(&iommu->lock);
		return;
	}

	pasid_set_pgsnp(pte);
	did = pasid_get_domain_id(pte);
	spin_unlock(&iommu->lock);

	if (!ecap_coherent(iommu->ecap))
		clflush_cache_range(pte, sizeof(*pte));

	/*
	 * VT-d spec section 3.4, table 23, gives the guidance for cache
	 * invalidation:
	 *
	 * - PASID-selective-within-Domain PASID-cache invalidation
	 * - PASID-selective PASID-based IOTLB invalidation
	 * - If (pasid is RID_PASID)
	 *    - Global Device-TLB invalidation to affected functions
	 *   Else
	 *    - PASID-based Device-TLB invalidation (with S=1 and
	 *      Addr[63:12]=0x7FFFFFFF_FFFFF) to affected functions
	 */
	pasid_cache_invalidation_with_pasid(iommu, did, pasid);
	qi_flush_piotlb(iommu, did, pasid, 0, -1, 0);

	/* Device IOTLB doesn't need to be flushed in caching mode. */
	if (!cap_caching_mode(iommu->cap))
		devtlb_invalidation_with_pasid(iommu, dev, pasid);
}
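/*
 * Editor's note (an assumption about callers, not stated in this file):
 * intel_pasid_setup_page_snoop_control() above is meant for entries that
 * were programmed before their domain began enforcing snooping; compare
 * the s2_domain->force_snooping check in intel_pasid_setup_nested() below,
 * which sets PGSNP at setup time when snooping is already being enforced.
 */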
/**
 * intel_pasid_setup_nested() - Set up PASID entry for nested translation.
 * @iommu: IOMMU which the device belongs to
 * @dev: Device to be set up for translation
 * @pasid: PASID to be programmed in the device PASID table
 * @domain: User stage-1 domain nested on a stage-2 domain
 *
 * This is used for nested translation. The input domain should be of
 * nested type, nested on a parent domain with the 'is_nested_parent'
 * flag set.
 */
int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev,
			     u32 pasid, struct dmar_domain *domain)
{
	struct iommu_hwpt_vtd_s1 *s1_cfg = &domain->s1_cfg;
	pgd_t *s1_gpgd = (pgd_t *)(uintptr_t)domain->s1_pgtbl;
	struct dmar_domain *s2_domain = domain->s2_domain;
	u16 did = domain_id_iommu(domain, iommu);
	struct dma_pte *pgd = s2_domain->pgd;
	struct pasid_entry *pte;

	/* Address width should match the address width supported by hardware */
	switch (s1_cfg->addr_width) {
	case ADDR_WIDTH_4LEVEL:
		break;
	case ADDR_WIDTH_5LEVEL:
		if (!cap_fl5lp_support(iommu->cap)) {
			dev_err_ratelimited(dev,
					    "5-level paging not supported\n");
			return -EINVAL;
		}
		break;
	default:
		dev_err_ratelimited(dev, "Invalid stage-1 address width %d\n",
				    s1_cfg->addr_width);
		return -EINVAL;
	}

	if ((s1_cfg->flags & IOMMU_VTD_S1_SRE) && !ecap_srs(iommu->ecap)) {
		pr_err_ratelimited("No supervisor request support on %s\n",
				   iommu->name);
		return -EINVAL;
	}

	if ((s1_cfg->flags & IOMMU_VTD_S1_EAFE) && !ecap_eafs(iommu->ecap)) {
		pr_err_ratelimited("No extended access flag support on %s\n",
				   iommu->name);
		return -EINVAL;
	}

	spin_lock(&iommu->lock);
	pte = intel_pasid_get_entry(dev, pasid);
	if (!pte) {
		spin_unlock(&iommu->lock);
		return -ENODEV;
	}
	if (pasid_pte_is_present(pte)) {
		spin_unlock(&iommu->lock);
		return -EBUSY;
	}

	pasid_clear_entry(pte);

	if (s1_cfg->addr_width == ADDR_WIDTH_5LEVEL)
		pasid_set_flpm(pte, 1);

	pasid_set_flptr(pte, (uintptr_t)s1_gpgd);

	if (s1_cfg->flags & IOMMU_VTD_S1_SRE) {
		pasid_set_sre(pte);
		if (s1_cfg->flags & IOMMU_VTD_S1_WPE)
			pasid_set_wpe(pte);
	}

	if (s1_cfg->flags & IOMMU_VTD_S1_EAFE)
		pasid_set_eafe(pte);

	if (s2_domain->force_snooping)
		pasid_set_pgsnp(pte);

	pasid_set_slptr(pte, virt_to_phys(pgd));
	pasid_set_fault_enable(pte);
	pasid_set_domain_id(pte, did);
	pasid_set_address_width(pte, s2_domain->agaw);
	pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap));
	pasid_set_translation_type(pte, PASID_ENTRY_PGTT_NESTED);
	pasid_set_present(pte);
	spin_unlock(&iommu->lock);

	pasid_flush_caches(iommu, pte, pasid, did);

	return 0;
}
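/*
 * Editor's lifecycle sketch (a summary, not driver code; the phase names on
 * the left are hypothetical): a typical PASID table lifecycle built from the
 * helpers in this file would look like
 *
 *	probe:   intel_pasid_alloc_table(dev);		// single-thread context
 *	attach:  intel_pasid_setup_first_level() /
 *		 intel_pasid_setup_second_level() /
 *		 intel_pasid_setup_pass_through() /
 *		 intel_pasid_setup_nested();		// -EBUSY if the entry
 *							// is already present
 *	detach:  intel_pasid_tear_down_entry(iommu, dev, pasid, false);
 *	release: intel_pasid_free_table(dev);
 */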