// SPDX-License-Identifier: GPL-2.0-only
/*
 * CPU-agnostic AMD IO page table allocator.
 *
 * Copyright (C) 2020 Advanced Micro Devices, Inc.
 * Author: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
 */

#define pr_fmt(fmt)	"AMD-Vi: " fmt
#define dev_fmt(fmt)	pr_fmt(fmt)

#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/io-pgtable.h>
#include <linux/kernel.h>
#include <linux/sizes.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/dma-mapping.h>

#include <asm/barrier.h>

#include "amd_iommu_types.h"
#include "amd_iommu.h"
#include "../iommu-pages.h"

static void v1_tlb_flush_all(void *cookie)
{
}

static void v1_tlb_flush_walk(unsigned long iova, size_t size,
			      size_t granule, void *cookie)
{
}

static void v1_tlb_add_page(struct iommu_iotlb_gather *gather,
			    unsigned long iova, size_t granule,
			    void *cookie)
{
}

static const struct iommu_flush_ops v1_flush_ops = {
	.tlb_flush_all	= v1_tlb_flush_all,
	.tlb_flush_walk	= v1_tlb_flush_walk,
	.tlb_add_page	= v1_tlb_add_page,
};

/*
 * Helper function to get the first pte of a large mapping
 */
static u64 *first_pte_l7(u64 *pte, unsigned long *page_size,
			 unsigned long *count)
{
	unsigned long pte_mask, pg_size, cnt;
	u64 *fpte;

	pg_size  = PTE_PAGE_SIZE(*pte);
	cnt      = PAGE_SIZE_PTE_COUNT(pg_size);
	pte_mask = ~((cnt << 3) - 1);
	fpte     = (u64 *)(((unsigned long)pte) & pte_mask);

	if (page_size)
		*page_size = pg_size;

	if (count)
		*count = cnt;

	return fpte;
}

/****************************************************************************
 *
 * The functions below are used to create the page table mappings for
 * unity mapped regions.
 *
 ****************************************************************************/

static void free_pt_page(u64 *pt, struct list_head *freelist)
{
	struct page *p = virt_to_page(pt);

	list_add_tail(&p->lru, freelist);
}

static void free_pt_lvl(u64 *pt, struct list_head *freelist, int lvl)
{
	u64 *p;
	int i;

	for (i = 0; i < 512; ++i) {
		/* PTE present? */
		if (!IOMMU_PTE_PRESENT(pt[i]))
			continue;

		/* Large PTE? */
		if (PM_PTE_LEVEL(pt[i]) == 0 ||
		    PM_PTE_LEVEL(pt[i]) == 7)
			continue;

		/*
		 * Free the next level. No need to look at l1 tables here since
		 * they can only contain leaf PTEs; just free them directly.
		 */
		p = IOMMU_PTE_PAGE(pt[i]);
		if (lvl > 2)
			free_pt_lvl(p, freelist, lvl - 1);
		else
			free_pt_page(p, freelist);
	}

	free_pt_page(pt, freelist);
}

static void free_sub_pt(u64 *root, int mode, struct list_head *freelist)
{
	switch (mode) {
	case PAGE_MODE_NONE:
	case PAGE_MODE_7_LEVEL:
		break;
	case PAGE_MODE_1_LEVEL:
		free_pt_page(root, freelist);
		break;
	case PAGE_MODE_2_LEVEL:
	case PAGE_MODE_3_LEVEL:
	case PAGE_MODE_4_LEVEL:
	case PAGE_MODE_5_LEVEL:
	case PAGE_MODE_6_LEVEL:
		free_pt_lvl(root, freelist, mode);
		break;
	default:
		BUG();
	}
}

void amd_iommu_domain_set_pgtable(struct protection_domain *domain,
				  u64 *root, int mode)
{
	u64 pt_root;

	/* lowest 3 bits encode pgtable mode */
	pt_root = mode & 7;
	pt_root |= (u64)root;

	amd_iommu_domain_set_pt_root(domain, pt_root);
}
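/*
 * Background note on page-table modes, assuming the usual
 * PM_LEVEL_SHIFT(x) == 12 + 9 * (x) definition from amd_iommu_types.h: each
 * level resolves 9 IOVA bits (512 entries of 8 bytes in one 4 KiB table), so
 * every extra level multiplies the reachable IOVA range by 512:
 *
 *	mode 1 -> IOVA bits [20:0]	(  2 MiB)
 *	mode 2 -> IOVA bits [29:0]	(  1 GiB)
 *	mode 3 -> IOVA bits [38:0]	(512 GiB)
 *	mode 4 -> IOVA bits [47:0]	(256 TiB)
 *	mode 5 -> IOVA bits [56:0]	(128 PiB)
 *	mode 6 -> full 64-bit IOVA space
 *
 * increase_address_space() below moves a domain from one row of this table
 * to the next by installing a new root table that points at the old root.
 */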
/*
 * This function is used to add another level to an IO page table. Adding
 * another level increases the size of the address space by 9 bits, up to a
 * maximum of 64 bits.
 */
static bool increase_address_space(struct protection_domain *domain,
				   unsigned long address,
				   gfp_t gfp)
{
	unsigned long flags;
	bool ret = true;
	u64 *pte;

	pte = iommu_alloc_page_node(domain->nid, gfp);
	if (!pte)
		return false;

	spin_lock_irqsave(&domain->lock, flags);

	if (address <= PM_LEVEL_SIZE(domain->iop.mode))
		goto out;

	ret = false;
	if (WARN_ON_ONCE(domain->iop.mode == PAGE_MODE_6_LEVEL))
		goto out;

	*pte = PM_LEVEL_PDE(domain->iop.mode, iommu_virt_to_phys(domain->iop.root));

	domain->iop.root  = pte;
	domain->iop.mode += 1;
	amd_iommu_update_and_flush_device_table(domain);
	amd_iommu_domain_flush_complete(domain);

	/*
	 * Device Table needs to be updated and flushed before the new root can
	 * be published.
	 */
	amd_iommu_domain_set_pgtable(domain, pte, domain->iop.mode);

	pte = NULL;
	ret = true;

out:
	spin_unlock_irqrestore(&domain->lock, flags);
	iommu_free_page(pte);

	return ret;
}

static u64 *alloc_pte(struct protection_domain *domain,
		      unsigned long address,
		      unsigned long page_size,
		      u64 **pte_page,
		      gfp_t gfp,
		      bool *updated)
{
	int level, end_lvl;
	u64 *pte, *page;

	BUG_ON(!is_power_of_2(page_size));

	while (address > PM_LEVEL_SIZE(domain->iop.mode)) {
		/*
		 * Return an error if there is no memory to update the
		 * page-table.
		 */
		if (!increase_address_space(domain, address, gfp))
			return NULL;
	}

	level   = domain->iop.mode - 1;
	pte     = &domain->iop.root[PM_LEVEL_INDEX(level, address)];
	address = PAGE_SIZE_ALIGN(address, page_size);
	end_lvl = PAGE_SIZE_LEVEL(page_size);

	while (level > end_lvl) {
		u64 __pte, __npte;
		int pte_level;

		__pte     = *pte;
		pte_level = PM_PTE_LEVEL(__pte);

		/*
		 * If we replace a series of large PTEs, we need
		 * to tear down all of them.
		 */
		if (IOMMU_PTE_PRESENT(__pte) &&
		    pte_level == PAGE_MODE_7_LEVEL) {
			unsigned long count, i;
			u64 *lpte;

			lpte = first_pte_l7(pte, NULL, &count);

			/*
			 * Unmap the replicated PTEs that still match the
			 * original large mapping.
			 */
			for (i = 0; i < count; ++i)
				cmpxchg64(&lpte[i], __pte, 0ULL);

			*updated = true;
			continue;
		}

		if (!IOMMU_PTE_PRESENT(__pte) ||
		    pte_level == PAGE_MODE_NONE) {
			page = iommu_alloc_page_node(domain->nid, gfp);

			if (!page)
				return NULL;

			__npte = PM_LEVEL_PDE(level, iommu_virt_to_phys(page));

			/* pte could have been changed somewhere. */
			if (!try_cmpxchg64(pte, &__pte, __npte))
				iommu_free_page(page);
			else if (IOMMU_PTE_PRESENT(__pte))
				*updated = true;

			continue;
		}

		/* No level skipping support yet */
		if (pte_level != level)
			return NULL;

		level -= 1;

		pte = IOMMU_PTE_PAGE(__pte);

		if (pte_page && level == end_lvl)
			*pte_page = pte;

		pte = &pte[PM_LEVEL_INDEX(level, address)];
	}

	return pte;
}
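/*
 * Worked example for the large-page handling above, assuming the
 * PAGE_SIZE_LEVEL() and PAGE_SIZE_PTE_COUNT() definitions from
 * amd_iommu_types.h: a 64 KiB mapping has PAGE_SIZE_LEVEL(SZ_64K) == 0 and
 * PAGE_SIZE_PTE_COUNT(SZ_64K) == 16, so the map path writes 16 consecutive
 * level-0 PTEs, each carrying next-level encoding 7 and the page size
 * encoded in the low address bits. first_pte_l7() simply masks a PTE pointer
 * down to the first entry of such a replicated group (16 entries * 8 bytes
 * -> a 128-byte aligned block in this example), which is what the teardown
 * loop in alloc_pte() relies on.
 */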
/*
 * This function checks if there is a PTE for a given dma address. If
 * there is one, it returns the pointer to it.
 */
static u64 *fetch_pte(struct amd_io_pgtable *pgtable,
		      unsigned long address,
		      unsigned long *page_size)
{
	int level;
	u64 *pte;

	*page_size = 0;

	if (address > PM_LEVEL_SIZE(pgtable->mode))
		return NULL;

	level      = pgtable->mode - 1;
	pte        = &pgtable->root[PM_LEVEL_INDEX(level, address)];
	*page_size = PTE_LEVEL_PAGE_SIZE(level);

	while (level > 0) {

		/* Not Present */
		if (!IOMMU_PTE_PRESENT(*pte))
			return NULL;

		/* Large PTE */
		if (PM_PTE_LEVEL(*pte) == PAGE_MODE_7_LEVEL ||
		    PM_PTE_LEVEL(*pte) == PAGE_MODE_NONE)
			break;

		/* No level skipping support yet */
		if (PM_PTE_LEVEL(*pte) != level)
			return NULL;

		level -= 1;

		/* Walk to the next level */
		pte        = IOMMU_PTE_PAGE(*pte);
		pte        = &pte[PM_LEVEL_INDEX(level, address)];
		*page_size = PTE_LEVEL_PAGE_SIZE(level);
	}

	/*
	 * If we have a series of large PTEs, make
	 * sure to return a pointer to the first one.
	 */
	if (PM_PTE_LEVEL(*pte) == PAGE_MODE_7_LEVEL)
		pte = first_pte_l7(pte, page_size, NULL);

	return pte;
}

static void free_clear_pte(u64 *pte, u64 pteval, struct list_head *freelist)
{
	u64 *pt;
	int mode;

	/* pr_fmt() already prepends the "AMD-Vi: " prefix */
	while (!try_cmpxchg64(pte, &pteval, 0))
		pr_warn("IOMMU pte changed since we read it\n");

	if (!IOMMU_PTE_PRESENT(pteval))
		return;

	pt   = IOMMU_PTE_PAGE(pteval);
	mode = IOMMU_PTE_MODE(pteval);

	free_sub_pt(pt, mode, freelist);
}
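/*
 * Note on the freelist pattern used by free_clear_pte() and the map path
 * below: sub-tables torn down while (re)mapping are only queued on a local
 * list here and are handed to iommu_put_pages_list() after the IOTLB flush
 * at the end of iommu_v1_map_pages(), so the hardware can never walk into a
 * table page that has already been returned to the allocator.
 */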
/*
 * Generic mapping function. It maps a physical address into a DMA
 * address space and allocates the page table pages if necessary.
 * In the future it can be extended to a generic mapping function
 * supporting all features of AMD IOMMU page tables like level skipping
 * and full 64-bit address spaces.
 */
static int iommu_v1_map_pages(struct io_pgtable_ops *ops, unsigned long iova,
			      phys_addr_t paddr, size_t pgsize, size_t pgcount,
			      int prot, gfp_t gfp, size_t *mapped)
{
	struct protection_domain *dom = io_pgtable_ops_to_domain(ops);
	LIST_HEAD(freelist);
	bool updated = false;
	u64 __pte, *pte;
	int ret, i, count;
	size_t size = pgcount << __ffs(pgsize);
	unsigned long o_iova = iova;

	BUG_ON(!IS_ALIGNED(iova, pgsize));
	BUG_ON(!IS_ALIGNED(paddr, pgsize));

	ret = -EINVAL;
	if (!(prot & IOMMU_PROT_MASK))
		goto out;

	while (pgcount > 0) {
		count = PAGE_SIZE_PTE_COUNT(pgsize);
		pte   = alloc_pte(dom, iova, pgsize, NULL, gfp, &updated);

		ret = -ENOMEM;
		if (!pte)
			goto out;

		for (i = 0; i < count; ++i)
			free_clear_pte(&pte[i], pte[i], &freelist);

		if (!list_empty(&freelist))
			updated = true;

		if (count > 1) {
			__pte = PAGE_SIZE_PTE(__sme_set(paddr), pgsize);
			__pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_PR | IOMMU_PTE_FC;
		} else {
			__pte = __sme_set(paddr) | IOMMU_PTE_PR | IOMMU_PTE_FC;
		}

		if (prot & IOMMU_PROT_IR)
			__pte |= IOMMU_PTE_IR;
		if (prot & IOMMU_PROT_IW)
			__pte |= IOMMU_PTE_IW;

		for (i = 0; i < count; ++i)
			pte[i] = __pte;

		iova  += pgsize;
		paddr += pgsize;
		pgcount--;
		if (mapped)
			*mapped += pgsize;
	}

	ret = 0;

out:
	if (updated) {
		unsigned long flags;

		spin_lock_irqsave(&dom->lock, flags);
		/*
		 * Flush domain TLB(s) and wait for completion. Any Device-Table
		 * Updates and flushing already happened in
		 * increase_address_space().
		 */
		amd_iommu_domain_flush_pages(dom, o_iova, size);
		spin_unlock_irqrestore(&dom->lock, flags);
	}

	/* Everything flushed out, free pages now */
	iommu_put_pages_list(&freelist);

	return ret;
}

static unsigned long iommu_v1_unmap_pages(struct io_pgtable_ops *ops,
					  unsigned long iova,
					  size_t pgsize, size_t pgcount,
					  struct iommu_iotlb_gather *gather)
{
	struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
	unsigned long long unmapped;
	unsigned long unmap_size;
	u64 *pte;
	size_t size = pgcount << __ffs(pgsize);

	BUG_ON(!is_power_of_2(pgsize));

	unmapped = 0;

	while (unmapped < size) {
		pte = fetch_pte(pgtable, iova, &unmap_size);
		if (pte) {
			int i, count;

			count = PAGE_SIZE_PTE_COUNT(unmap_size);
			for (i = 0; i < count; i++)
				pte[i] = 0ULL;
		} else {
			return unmapped;
		}

		iova = (iova & ~(unmap_size - 1)) + unmap_size;
		unmapped += unmap_size;
	}

	return unmapped;
}

static phys_addr_t iommu_v1_iova_to_phys(struct io_pgtable_ops *ops, unsigned long iova)
{
	struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
	unsigned long offset_mask, pte_pgsize;
	u64 *pte, __pte;

	pte = fetch_pte(pgtable, iova, &pte_pgsize);

	if (!pte || !IOMMU_PTE_PRESENT(*pte))
		return 0;

	offset_mask = pte_pgsize - 1;
	__pte       = __sme_clr(*pte & PM_ADDR_MASK);

	return (__pte & ~offset_mask) | (iova & offset_mask);
}

static bool pte_test_and_clear_dirty(u64 *ptep, unsigned long size,
				     unsigned long flags)
{
	bool test_only = flags & IOMMU_DIRTY_NO_CLEAR;
	bool dirty = false;
	int i, count;

	/*
	 * 2.2.3.2 Host Dirty Support
	 * When a non-default page size is used, software must OR the
	 * Dirty bits in all of the replicated host PTEs used to map
	 * the page. The IOMMU does not guarantee the Dirty bits are
	 * set in all of the replicated PTEs. Any portion of the page
	 * may have been written even if the Dirty bit is set in only
	 * one of the replicated PTEs.
	 */
	count = PAGE_SIZE_PTE_COUNT(size);
	for (i = 0; i < count && test_only; i++) {
		if (test_bit(IOMMU_PTE_HD_BIT, (unsigned long *)&ptep[i])) {
			dirty = true;
			break;
		}
	}

	for (i = 0; i < count && !test_only; i++) {
		if (test_and_clear_bit(IOMMU_PTE_HD_BIT,
				       (unsigned long *)&ptep[i])) {
			dirty = true;
		}
	}

	return dirty;
}
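/*
 * Example of the two modes above, assuming the IOMMU_DIRTY_NO_CLEAR flag from
 * <linux/iommu.h>: a dirty-bitmap read that also resets the tracking state
 * passes flags == 0, so every replicated PTE gets its Dirty bit cleared and
 * the range will report dirty again only if the device writes to it later.
 * Passing IOMMU_DIRTY_NO_CLEAR only peeks at the bits, which is what the
 * "test_only" early-exit loop implements.
 */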
static int iommu_v1_read_and_clear_dirty(struct io_pgtable_ops *ops,
					 unsigned long iova, size_t size,
					 unsigned long flags,
					 struct iommu_dirty_bitmap *dirty)
{
	struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
	unsigned long end = iova + size - 1;

	do {
		unsigned long pgsize = 0;
		u64 *ptep, pte;

		ptep = fetch_pte(pgtable, iova, &pgsize);
		if (ptep)
			pte = READ_ONCE(*ptep);
		if (!ptep || !IOMMU_PTE_PRESENT(pte)) {
			pgsize = pgsize ?: PTE_LEVEL_PAGE_SIZE(0);
			iova += pgsize;
			continue;
		}

		/*
		 * Mark the whole IOVA range as dirty even if only one of
		 * the replicated PTEs was marked dirty.
		 */
		if (pte_test_and_clear_dirty(ptep, pgsize, flags))
			iommu_dirty_bitmap_record(dirty, iova, pgsize);
		iova += pgsize;
	} while (iova < end);

	return 0;
}

/*
 * ----------------------------------------------------
 */
static void v1_free_pgtable(struct io_pgtable *iop)
{
	struct amd_io_pgtable *pgtable = container_of(iop, struct amd_io_pgtable, iop);
	struct protection_domain *dom;
	LIST_HEAD(freelist);

	if (pgtable->mode == PAGE_MODE_NONE)
		return;

	dom = container_of(pgtable, struct protection_domain, iop);

	/* Page-table is not visible to IOMMU anymore, so free it */
	BUG_ON(pgtable->mode < PAGE_MODE_NONE ||
	       pgtable->mode > PAGE_MODE_6_LEVEL);

	free_sub_pt(pgtable->root, pgtable->mode, &freelist);

	/* Update data structure */
	amd_iommu_domain_clr_pt_root(dom);

	/* Make changes visible to IOMMUs */
	amd_iommu_domain_update(dom);

	iommu_put_pages_list(&freelist);
}

static struct io_pgtable *v1_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie)
{
	struct amd_io_pgtable *pgtable = io_pgtable_cfg_to_data(cfg);

	cfg->pgsize_bitmap = AMD_IOMMU_PGSIZES;
	cfg->ias           = IOMMU_IN_ADDR_BIT_SIZE;
	cfg->oas           = IOMMU_OUT_ADDR_BIT_SIZE;
	cfg->tlb           = &v1_flush_ops;

	pgtable->iop.ops.map_pages    = iommu_v1_map_pages;
	pgtable->iop.ops.unmap_pages  = iommu_v1_unmap_pages;
	pgtable->iop.ops.iova_to_phys = iommu_v1_iova_to_phys;
	pgtable->iop.ops.read_and_clear_dirty = iommu_v1_read_and_clear_dirty;

	return &pgtable->iop;
}

struct io_pgtable_init_fns io_pgtable_amd_iommu_v1_init_fns = {
	.alloc	= v1_alloc_pgtable,
	.free	= v1_free_pgtable,
};
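/*
 * Usage sketch (for reference, not compiled here): the init table above is
 * registered with the generic io-pgtable layer under the AMD_IOMMU_V1
 * format, so the domain-setup code obtains and releases the ops roughly like
 * this (the exact cfg field name is an assumption taken from
 * struct amd_io_pgtable):
 *
 *	struct io_pgtable_ops *ops;
 *
 *	ops = alloc_io_pgtable_ops(AMD_IOMMU_V1, &domain->iop.pgtbl_cfg,
 *				   domain);	// ends up in v1_alloc_pgtable()
 *	...
 *	free_io_pgtable_ops(ops);		// ends up in v1_free_pgtable()
 */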