// SPDX-License-Identifier: GPL-2.0-only
/*
 * CPU-agnostic AMD IO page table allocator.
 *
 * Copyright (C) 2020 Advanced Micro Devices, Inc.
 * Author: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
 */

#define pr_fmt(fmt)	"AMD-Vi: " fmt
#define dev_fmt(fmt)	pr_fmt(fmt)

#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/io-pgtable.h>
#include <linux/kernel.h>
#include <linux/sizes.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/dma-mapping.h>

#include <asm/barrier.h>

#include "amd_iommu_types.h"
#include "amd_iommu.h"

static void v1_tlb_flush_all(void *cookie)
{
}

static void v1_tlb_flush_walk(unsigned long iova, size_t size,
			      size_t granule, void *cookie)
{
}

static void v1_tlb_add_page(struct iommu_iotlb_gather *gather,
			    unsigned long iova, size_t granule,
			    void *cookie)
{
}

static const struct iommu_flush_ops v1_flush_ops = {
	.tlb_flush_all	= v1_tlb_flush_all,
	.tlb_flush_walk	= v1_tlb_flush_walk,
	.tlb_add_page	= v1_tlb_add_page,
};

/*
 * Helper function to get the first pte of a large mapping
 */
static u64 *first_pte_l7(u64 *pte, unsigned long *page_size,
			 unsigned long *count)
{
	unsigned long pte_mask, pg_size, cnt;
	u64 *fpte;

	pg_size = PTE_PAGE_SIZE(*pte);
	cnt = PAGE_SIZE_PTE_COUNT(pg_size);
	pte_mask = ~((cnt << 3) - 1);
	fpte = (u64 *)(((unsigned long)pte) & pte_mask);

	if (page_size)
		*page_size = pg_size;

	if (count)
		*count = cnt;

	return fpte;
}

/****************************************************************************
 *
 * The functions below are used to create the page table mappings for
 * unity mapped regions.
 *
 ****************************************************************************/

static void free_pt_page(u64 *pt, struct list_head *freelist)
{
	struct page *p = virt_to_page(pt);

	list_add_tail(&p->lru, freelist);
}

static void free_pt_lvl(u64 *pt, struct list_head *freelist, int lvl)
{
	u64 *p;
	int i;

	for (i = 0; i < 512; ++i) {
		/* PTE present? */
		if (!IOMMU_PTE_PRESENT(pt[i]))
			continue;

		/* Large PTE? */
		if (PM_PTE_LEVEL(pt[i]) == 0 ||
		    PM_PTE_LEVEL(pt[i]) == 7)
			continue;

		/*
		 * Free the next level. No need to look at l1 tables here since
		 * they can only contain leaf PTEs; just free them directly.
		 */
		p = IOMMU_PTE_PAGE(pt[i]);
		if (lvl > 2)
			free_pt_lvl(p, freelist, lvl - 1);
		else
			free_pt_page(p, freelist);
	}

	free_pt_page(pt, freelist);
}

static void free_sub_pt(u64 *root, int mode, struct list_head *freelist)
{
	switch (mode) {
	case PAGE_MODE_NONE:
	case PAGE_MODE_7_LEVEL:
		break;
	case PAGE_MODE_1_LEVEL:
		free_pt_page(root, freelist);
		break;
	case PAGE_MODE_2_LEVEL:
	case PAGE_MODE_3_LEVEL:
	case PAGE_MODE_4_LEVEL:
	case PAGE_MODE_5_LEVEL:
	case PAGE_MODE_6_LEVEL:
		free_pt_lvl(root, freelist, mode);
		break;
	default:
		BUG();
	}
}

void amd_iommu_domain_set_pgtable(struct protection_domain *domain,
				  u64 *root, int mode)
{
	u64 pt_root;

	/* lowest 3 bits encode pgtable mode */
	pt_root = mode & 7;
	pt_root |= (u64)root;

	amd_iommu_domain_set_pt_root(domain, pt_root);
}
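
/*
 * Illustrative sketch only (not used by the driver): how the packed pt_root
 * value built above could be decoded again. The lowest 3 bits carry the
 * paging mode, the remaining bits the page-aligned virtual address of the
 * root table. The helper names below are hypothetical; the real decode
 * helpers live in the AMD IOMMU core.
 */
static inline int example_pt_root_to_mode(u64 pt_root)
{
	/* Paging mode (number of levels) is kept in the low 3 bits. */
	return (int)(pt_root & 7);
}

static inline u64 *example_pt_root_to_root(u64 pt_root)
{
	/* Remaining bits are the page-aligned root table address. */
	return (u64 *)(unsigned long)(pt_root & ~7ULL);
}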

/*
 * This function is used to add another level to an IO page table. Adding
 * another level increases the size of the address space by 9 bits to a size up
 * to 64 bits.
 */
static bool increase_address_space(struct protection_domain *domain,
				   unsigned long address,
				   gfp_t gfp)
{
	unsigned long flags;
	bool ret = true;
	u64 *pte;

	pte = alloc_pgtable_page(domain->nid, gfp);
	if (!pte)
		return false;

	spin_lock_irqsave(&domain->lock, flags);

	if (address <= PM_LEVEL_SIZE(domain->iop.mode))
		goto out;

	ret = false;
	if (WARN_ON_ONCE(domain->iop.mode == PAGE_MODE_6_LEVEL))
		goto out;

	*pte = PM_LEVEL_PDE(domain->iop.mode, iommu_virt_to_phys(domain->iop.root));

	domain->iop.root = pte;
	domain->iop.mode += 1;
	amd_iommu_update_and_flush_device_table(domain);
	amd_iommu_domain_flush_complete(domain);

	/*
	 * Device Table needs to be updated and flushed before the new root can
	 * be published.
	 */
	amd_iommu_domain_set_pgtable(domain, pte, domain->iop.mode);

	pte = NULL;
	ret = true;

out:
	spin_unlock_irqrestore(&domain->lock, flags);
	free_page((unsigned long)pte);

	return ret;
}

static u64 *alloc_pte(struct protection_domain *domain,
		      unsigned long address,
		      unsigned long page_size,
		      u64 **pte_page,
		      gfp_t gfp,
		      bool *updated)
{
	int level, end_lvl;
	u64 *pte, *page;

	BUG_ON(!is_power_of_2(page_size));

	while (address > PM_LEVEL_SIZE(domain->iop.mode)) {
		/*
		 * Return an error if there is no memory to update the
		 * page-table.
		 */
		if (!increase_address_space(domain, address, gfp))
			return NULL;
	}

	level = domain->iop.mode - 1;
	pte = &domain->iop.root[PM_LEVEL_INDEX(level, address)];
	address = PAGE_SIZE_ALIGN(address, page_size);
	end_lvl = PAGE_SIZE_LEVEL(page_size);

	while (level > end_lvl) {
		u64 __pte, __npte;
		int pte_level;

		__pte = *pte;
		pte_level = PM_PTE_LEVEL(__pte);

		/*
		 * If we replace a series of large PTEs, we need
		 * to tear down all of them.
		 */
		if (IOMMU_PTE_PRESENT(__pte) &&
		    pte_level == PAGE_MODE_7_LEVEL) {
			unsigned long count, i;
			u64 *lpte;

			lpte = first_pte_l7(pte, NULL, &count);

			/*
			 * Unmap the replicated PTEs that still match the
			 * original large mapping
			 */
			for (i = 0; i < count; ++i)
				cmpxchg64(&lpte[i], __pte, 0ULL);

			*updated = true;
			continue;
		}

		if (!IOMMU_PTE_PRESENT(__pte) ||
		    pte_level == PAGE_MODE_NONE) {
			page = alloc_pgtable_page(domain->nid, gfp);

			if (!page)
				return NULL;

			__npte = PM_LEVEL_PDE(level, iommu_virt_to_phys(page));

			/* pte could have been changed somewhere. */
			if (!try_cmpxchg64(pte, &__pte, __npte))
				free_page((unsigned long)page);
			else if (IOMMU_PTE_PRESENT(__pte))
				*updated = true;

			continue;
		}

		/* No level skipping support yet */
		if (pte_level != level)
			return NULL;

		level -= 1;

		pte = IOMMU_PTE_PAGE(__pte);

		if (pte_page && level == end_lvl)
			*pte_page = pte;

		pte = &pte[PM_LEVEL_INDEX(level, address)];
	}

	return pte;
}
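
/*
 * Illustrative sketch only: the arithmetic behind the replicated
 * PAGE_MODE_7_LEVEL PTEs that alloc_pte() tears down and first_pte_l7()
 * walks back from. A non-default page size is backed by a power-of-two
 * number of identical 8-byte PTEs; for example, a 32 KiB page is encoded as
 * 8 replicated PTEs, the series spans 64 bytes, and its first entry is the
 * current PTE aligned down to a 64-byte boundary. The helper below is
 * hypothetical and simply restates that calculation for a given page size.
 */
static inline u64 *example_first_pte_of_series(u64 *pte, unsigned long pgsize)
{
	/* Number of replicated PTEs backing a power-of-two page size. */
	unsigned long cnt = PAGE_SIZE_PTE_COUNT(pgsize);

	/* Each PTE is 8 bytes; align down to the start of the series. */
	return (u64 *)((unsigned long)pte & ~((cnt << 3) - 1));
}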

/*
 * This function checks if there is a PTE for a given dma address. If
 * there is one, it returns the pointer to it.
 */
static u64 *fetch_pte(struct amd_io_pgtable *pgtable,
		      unsigned long address,
		      unsigned long *page_size)
{
	int level;
	u64 *pte;

	*page_size = 0;

	if (address > PM_LEVEL_SIZE(pgtable->mode))
		return NULL;

	level = pgtable->mode - 1;
	pte = &pgtable->root[PM_LEVEL_INDEX(level, address)];
	*page_size = PTE_LEVEL_PAGE_SIZE(level);

	while (level > 0) {

		/* Not Present */
		if (!IOMMU_PTE_PRESENT(*pte))
			return NULL;

		/* Large PTE */
		if (PM_PTE_LEVEL(*pte) == PAGE_MODE_7_LEVEL ||
		    PM_PTE_LEVEL(*pte) == PAGE_MODE_NONE)
			break;

		/* No level skipping support yet */
		if (PM_PTE_LEVEL(*pte) != level)
			return NULL;

		level -= 1;

		/* Walk to the next level */
		pte = IOMMU_PTE_PAGE(*pte);
		pte = &pte[PM_LEVEL_INDEX(level, address)];
		*page_size = PTE_LEVEL_PAGE_SIZE(level);
	}

	/*
	 * If we have a series of large PTEs, make
	 * sure to return a pointer to the first one.
	 */
	if (PM_PTE_LEVEL(*pte) == PAGE_MODE_7_LEVEL)
		pte = first_pte_l7(pte, page_size, NULL);

	return pte;
}

static void free_clear_pte(u64 *pte, u64 pteval, struct list_head *freelist)
{
	u64 *pt;
	int mode;

	while (!try_cmpxchg64(pte, &pteval, 0))
		pr_warn("AMD-Vi: IOMMU pte changed since we read it\n");

	if (!IOMMU_PTE_PRESENT(pteval))
		return;

	pt = IOMMU_PTE_PAGE(pteval);
	mode = IOMMU_PTE_MODE(pteval);

	free_sub_pt(pt, mode, freelist);
}
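
/*
 * Note on page-table freeing: free_clear_pte() only detaches a sub-table and
 * queues its pages on the caller-provided freelist. The pages are handed back
 * to the allocator with put_pages_list() only after the IOTLB has been
 * flushed, so the IOMMU can never walk into a page that was already freed.
 */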

/*
 * Generic mapping function. It maps a physical address into a DMA
 * address space. It allocates the page table pages if necessary.
 * In the future it can be extended to a generic mapping function
 * supporting all features of AMD IOMMU page tables like level skipping
 * and full 64 bit address spaces.
 */
static int iommu_v1_map_pages(struct io_pgtable_ops *ops, unsigned long iova,
			      phys_addr_t paddr, size_t pgsize, size_t pgcount,
			      int prot, gfp_t gfp, size_t *mapped)
{
	struct protection_domain *dom = io_pgtable_ops_to_domain(ops);
	LIST_HEAD(freelist);
	bool updated = false;
	u64 __pte, *pte;
	int ret, i, count;
	size_t size = pgcount << __ffs(pgsize);
	unsigned long o_iova = iova;

	BUG_ON(!IS_ALIGNED(iova, pgsize));
	BUG_ON(!IS_ALIGNED(paddr, pgsize));

	ret = -EINVAL;
	if (!(prot & IOMMU_PROT_MASK))
		goto out;

	while (pgcount > 0) {
		count = PAGE_SIZE_PTE_COUNT(pgsize);
		pte = alloc_pte(dom, iova, pgsize, NULL, gfp, &updated);

		ret = -ENOMEM;
		if (!pte)
			goto out;

		for (i = 0; i < count; ++i)
			free_clear_pte(&pte[i], pte[i], &freelist);

		if (!list_empty(&freelist))
			updated = true;

		if (count > 1) {
			__pte = PAGE_SIZE_PTE(__sme_set(paddr), pgsize);
			__pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_PR | IOMMU_PTE_FC;
		} else {
			__pte = __sme_set(paddr) | IOMMU_PTE_PR | IOMMU_PTE_FC;
		}

		if (prot & IOMMU_PROT_IR)
			__pte |= IOMMU_PTE_IR;
		if (prot & IOMMU_PROT_IW)
			__pte |= IOMMU_PTE_IW;

		for (i = 0; i < count; ++i)
			pte[i] = __pte;

		iova += pgsize;
		paddr += pgsize;
		pgcount--;
		if (mapped)
			*mapped += pgsize;
	}

	ret = 0;

out:
	if (updated) {
		unsigned long flags;

		spin_lock_irqsave(&dom->lock, flags);
		/*
		 * Flush domain TLB(s) and wait for completion. Any Device-Table
		 * Updates and flushing already happened in
		 * increase_address_space().
		 */
		amd_iommu_domain_flush_pages(dom, o_iova, size);
		spin_unlock_irqrestore(&dom->lock, flags);
	}

	/* Everything flushed out, free pages now */
	put_pages_list(&freelist);

	return ret;
}

static unsigned long iommu_v1_unmap_pages(struct io_pgtable_ops *ops,
					  unsigned long iova,
					  size_t pgsize, size_t pgcount,
					  struct iommu_iotlb_gather *gather)
{
	struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
	unsigned long long unmapped;
	unsigned long unmap_size;
	u64 *pte;
	size_t size = pgcount << __ffs(pgsize);

	BUG_ON(!is_power_of_2(pgsize));

	unmapped = 0;

	while (unmapped < size) {
		pte = fetch_pte(pgtable, iova, &unmap_size);
		if (pte) {
			int i, count;

			count = PAGE_SIZE_PTE_COUNT(unmap_size);
			for (i = 0; i < count; i++)
				pte[i] = 0ULL;
		} else {
			return unmapped;
		}

		iova = (iova & ~(unmap_size - 1)) + unmap_size;
		unmapped += unmap_size;
	}

	return unmapped;
}

static phys_addr_t iommu_v1_iova_to_phys(struct io_pgtable_ops *ops, unsigned long iova)
{
	struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
	unsigned long offset_mask, pte_pgsize;
	u64 *pte, __pte;

	pte = fetch_pte(pgtable, iova, &pte_pgsize);

	if (!pte || !IOMMU_PTE_PRESENT(*pte))
		return 0;

	offset_mask = pte_pgsize - 1;
	__pte = __sme_clr(*pte & PM_ADDR_MASK);

	return (__pte & ~offset_mask) | (iova & offset_mask);
}
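
/*
 * Illustrative sketch only: the address composition used by
 * iommu_v1_iova_to_phys() above, with concrete numbers. For an IOVA backed
 * by a 2 MiB mapping, pte_pgsize is SZ_2M, offset_mask is 0x1fffff, and the
 * result combines the 2 MiB-aligned physical address from the PTE with the
 * low 21 bits of the IOVA. The helper name is hypothetical.
 */
static inline phys_addr_t example_compose_phys(u64 pte_paddr, unsigned long iova,
					       unsigned long pgsize)
{
	unsigned long offset_mask = pgsize - 1;

	/* Page-aligned address from the PTE plus the in-page offset. */
	return (pte_paddr & ~(u64)offset_mask) | (iova & offset_mask);
}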

static bool pte_test_and_clear_dirty(u64 *ptep, unsigned long size,
				     unsigned long flags)
{
	bool test_only = flags & IOMMU_DIRTY_NO_CLEAR;
	bool dirty = false;
	int i, count;

	/*
	 * 2.2.3.2 Host Dirty Support
	 * When a non-default page size is used, software must OR the
	 * Dirty bits in all of the replicated host PTEs used to map
	 * the page. The IOMMU does not guarantee the Dirty bits are
	 * set in all of the replicated PTEs. Any portion of the page
	 * may have been written even if the Dirty bit is set in only
	 * one of the replicated PTEs.
	 */
	count = PAGE_SIZE_PTE_COUNT(size);
	for (i = 0; i < count && test_only; i++) {
		if (test_bit(IOMMU_PTE_HD_BIT, (unsigned long *)&ptep[i])) {
			dirty = true;
			break;
		}
	}

	for (i = 0; i < count && !test_only; i++) {
		if (test_and_clear_bit(IOMMU_PTE_HD_BIT,
				       (unsigned long *)&ptep[i])) {
			dirty = true;
		}
	}

	return dirty;
}

static int iommu_v1_read_and_clear_dirty(struct io_pgtable_ops *ops,
					 unsigned long iova, size_t size,
					 unsigned long flags,
					 struct iommu_dirty_bitmap *dirty)
{
	struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
	unsigned long end = iova + size - 1;

	do {
		unsigned long pgsize = 0;
		u64 *ptep, pte;

		ptep = fetch_pte(pgtable, iova, &pgsize);
		if (ptep)
			pte = READ_ONCE(*ptep);
		if (!ptep || !IOMMU_PTE_PRESENT(pte)) {
			pgsize = pgsize ?: PTE_LEVEL_PAGE_SIZE(0);
			iova += pgsize;
			continue;
		}

		/*
		 * Mark the whole IOVA range as dirty even if only one of
		 * the replicated PTEs was marked dirty.
		 */
		if (pte_test_and_clear_dirty(ptep, pgsize, flags))
			iommu_dirty_bitmap_record(dirty, iova, pgsize);
		iova += pgsize;
	} while (iova < end);

	return 0;
}

/*
 * ----------------------------------------------------
 */
static void v1_free_pgtable(struct io_pgtable *iop)
{
	struct amd_io_pgtable *pgtable = container_of(iop, struct amd_io_pgtable, iop);
	struct protection_domain *dom;
	LIST_HEAD(freelist);

	if (pgtable->mode == PAGE_MODE_NONE)
		return;

	dom = container_of(pgtable, struct protection_domain, iop);

	/* Page-table is not visible to IOMMU anymore, so free it */
	BUG_ON(pgtable->mode < PAGE_MODE_NONE ||
	       pgtable->mode > PAGE_MODE_6_LEVEL);

	free_sub_pt(pgtable->root, pgtable->mode, &freelist);

	/* Update data structure */
	amd_iommu_domain_clr_pt_root(dom);

	/* Make changes visible to IOMMUs */
	amd_iommu_domain_update(dom);

	put_pages_list(&freelist);
}

static struct io_pgtable *v1_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie)
{
	struct amd_io_pgtable *pgtable = io_pgtable_cfg_to_data(cfg);

	cfg->pgsize_bitmap = AMD_IOMMU_PGSIZES;
	cfg->ias = IOMMU_IN_ADDR_BIT_SIZE;
	cfg->oas = IOMMU_OUT_ADDR_BIT_SIZE;
	cfg->tlb = &v1_flush_ops;

	pgtable->iop.ops.map_pages = iommu_v1_map_pages;
	pgtable->iop.ops.unmap_pages = iommu_v1_unmap_pages;
	pgtable->iop.ops.iova_to_phys = iommu_v1_iova_to_phys;
	pgtable->iop.ops.read_and_clear_dirty = iommu_v1_read_and_clear_dirty;

	return &pgtable->iop;
}

struct io_pgtable_init_fns io_pgtable_amd_iommu_v1_init_fns = {
	.alloc	= v1_alloc_pgtable,
	.free	= v1_free_pgtable,
};
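
/*
 * Illustrative sketch only: how a caller would obtain and use the ops
 * registered above through the generic io-pgtable framework. The cfg must be
 * the one embedded in the domain's amd_io_pgtable, as the container_of()
 * helpers in this file assume; in the real driver this wiring is done by the
 * AMD IOMMU core when a protection domain is set up. The function name and
 * the example addresses are hypothetical.
 */
static inline int example_map_one_page(struct io_pgtable_cfg *cfg, void *cookie)
{
	struct io_pgtable_ops *ops;
	size_t mapped = 0;

	ops = alloc_io_pgtable_ops(AMD_IOMMU_V1, cfg, cookie);
	if (!ops)
		return -ENOMEM;

	/*
	 * Map one 4 KiB page, readable and writable, at IOVA 0x100000.
	 * The table would later be torn down with free_io_pgtable_ops(ops).
	 */
	return ops->map_pages(ops, 0x100000, 0x40000000, SZ_4K, 1,
			      IOMMU_PROT_IR | IOMMU_PROT_IW, GFP_KERNEL,
			      &mapped);
}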