// SPDX-License-Identifier: GPL-2.0-only
/*
 * CPU-agnostic AMD IO page table allocator.
 *
 * Copyright (C) 2020 Advanced Micro Devices, Inc.
 * Author: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
 */

#define pr_fmt(fmt)	"AMD-Vi: " fmt
#define dev_fmt(fmt)	pr_fmt(fmt)

#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/io-pgtable.h>
#include <linux/kernel.h>
#include <linux/sizes.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/dma-mapping.h>
#include <linux/seqlock.h>

#include <asm/barrier.h>

#include "amd_iommu_types.h"
#include "amd_iommu.h"
#include "../iommu-pages.h"

/*
 * Helper function to get the first pte of a large mapping
 */
static u64 *first_pte_l7(u64 *pte, unsigned long *page_size,
			 unsigned long *count)
{
	unsigned long pte_mask, pg_size, cnt;
	u64 *fpte;

	pg_size = PTE_PAGE_SIZE(*pte);
	cnt = PAGE_SIZE_PTE_COUNT(pg_size);
	pte_mask = ~((cnt << 3) - 1);
	fpte = (u64 *)(((unsigned long)pte) & pte_mask);

	if (page_size)
		*page_size = pg_size;

	if (count)
		*count = cnt;

	return fpte;
}

static void free_pt_lvl(u64 *pt, struct iommu_pages_list *freelist, int lvl)
{
	u64 *p;
	int i;

	for (i = 0; i < 512; ++i) {
		/* PTE present? */
		if (!IOMMU_PTE_PRESENT(pt[i]))
			continue;

		/* Large PTE? */
		if (PM_PTE_LEVEL(pt[i]) == 0 ||
		    PM_PTE_LEVEL(pt[i]) == 7)
			continue;

		/*
		 * Free the next level. No need to look at l1 tables here since
		 * they can only contain leaf PTEs; just free them directly.
		 */
		p = IOMMU_PTE_PAGE(pt[i]);
		if (lvl > 2)
			free_pt_lvl(p, freelist, lvl - 1);
		else
			iommu_pages_list_add(freelist, p);
	}

	iommu_pages_list_add(freelist, pt);
}

static void free_sub_pt(u64 *root, int mode, struct iommu_pages_list *freelist)
{
	switch (mode) {
	case PAGE_MODE_NONE:
	case PAGE_MODE_7_LEVEL:
		break;
	case PAGE_MODE_1_LEVEL:
		iommu_pages_list_add(freelist, root);
		break;
	case PAGE_MODE_2_LEVEL:
	case PAGE_MODE_3_LEVEL:
	case PAGE_MODE_4_LEVEL:
	case PAGE_MODE_5_LEVEL:
	case PAGE_MODE_6_LEVEL:
		free_pt_lvl(root, freelist, mode);
		break;
	default:
		BUG();
	}
}

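/*
 * Illustrative sketch only (not used by the driver): the pointer arithmetic
 * behind first_pte_l7() for an assumed 64 KiB mapping. A 64 KiB page is
 * encoded as 16 replicated 8-byte PTEs, so the series always starts at a
 * 128-byte boundary within the page-table page, which is what masking the
 * PTE address recovers.
 */
static inline u64 *example_first_pte_of_64k_series(u64 *pte)
{
	unsigned long cnt = SZ_64K / SZ_4K;		/* 16 replicated PTEs */
	unsigned long pte_mask = ~((cnt << 3) - 1);	/* ~(16 * 8 - 1) = ~127 */

	/* Round the PTE pointer down to the first entry of the series */
	return (u64 *)(((unsigned long)pte) & pte_mask);
}
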
/*
 * This function is used to add another level to an IO page table. Adding
 * another level increases the size of the address space by 9 bits to a size up
 * to 64 bits.
 */
static bool increase_address_space(struct amd_io_pgtable *pgtable,
				   unsigned long address,
				   unsigned int page_size_level,
				   gfp_t gfp)
{
	struct io_pgtable_cfg *cfg = &pgtable->pgtbl.cfg;
	struct protection_domain *domain =
		container_of(pgtable, struct protection_domain, iop);
	unsigned long flags;
	bool ret = true;
	u64 *pte;

	pte = iommu_alloc_pages_node_sz(cfg->amd.nid, gfp, SZ_4K);
	if (!pte)
		return false;

	spin_lock_irqsave(&domain->lock, flags);

	if (address <= PM_LEVEL_SIZE(pgtable->mode) &&
	    pgtable->mode - 1 >= page_size_level)
		goto out;

	ret = false;
	if (WARN_ON_ONCE(pgtable->mode == amd_iommu_hpt_level))
		goto out;

	*pte = PM_LEVEL_PDE(pgtable->mode, iommu_virt_to_phys(pgtable->root));

	write_seqcount_begin(&pgtable->seqcount);
	pgtable->root = pte;
	pgtable->mode += 1;
	write_seqcount_end(&pgtable->seqcount);

	amd_iommu_update_and_flush_device_table(domain);

	pte = NULL;
	ret = true;

out:
	spin_unlock_irqrestore(&domain->lock, flags);
	iommu_free_pages(pte);

	return ret;
}

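/*
 * Illustrative sketch only (not used by the driver): how many page-table
 * levels a given IOVA needs, assuming 4 KiB leaf pages and 9 address bits
 * resolved per level (so mode 3 covers a 39-bit space, mode 4 a 48-bit
 * space, and so on). alloc_pte() below keeps calling
 * increase_address_space() until the live table is at least this deep and
 * deep enough for the requested page size.
 */
static inline int example_levels_needed(u64 iova)
{
	int levels = 1;

	/* Each extra level resolves 9 more IOVA bits above the 12-bit offset */
	while (levels < 6 && (iova >> (12 + 9 * levels)) != 0)
		levels++;

	return levels;
}
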
static u64 *alloc_pte(struct amd_io_pgtable *pgtable,
		      unsigned long address,
		      unsigned long page_size,
		      u64 **pte_page,
		      gfp_t gfp,
		      bool *updated)
{
	unsigned long last_addr = address + (page_size - 1);
	struct io_pgtable_cfg *cfg = &pgtable->pgtbl.cfg;
	unsigned int seqcount;
	int level, end_lvl;
	u64 *pte, *page;

	BUG_ON(!is_power_of_2(page_size));

	while (last_addr > PM_LEVEL_SIZE(pgtable->mode) ||
	       pgtable->mode - 1 < PAGE_SIZE_LEVEL(page_size)) {
		/*
		 * Return an error if there is no memory to update the
		 * page-table.
		 */
		if (!increase_address_space(pgtable, last_addr,
					    PAGE_SIZE_LEVEL(page_size), gfp))
			return NULL;
	}

	do {
		seqcount = read_seqcount_begin(&pgtable->seqcount);

		level = pgtable->mode - 1;
		pte = &pgtable->root[PM_LEVEL_INDEX(level, address)];
	} while (read_seqcount_retry(&pgtable->seqcount, seqcount));

	address = PAGE_SIZE_ALIGN(address, page_size);
	end_lvl = PAGE_SIZE_LEVEL(page_size);

	while (level > end_lvl) {
		u64 __pte, __npte;
		int pte_level;

		__pte = *pte;
		pte_level = PM_PTE_LEVEL(__pte);

		/*
		 * If we replace a series of large PTEs, we need
		 * to tear down all of them.
		 */
		if (IOMMU_PTE_PRESENT(__pte) &&
		    pte_level == PAGE_MODE_7_LEVEL) {
			unsigned long count, i;
			u64 *lpte;

			lpte = first_pte_l7(pte, NULL, &count);

			/*
			 * Unmap the replicated PTEs that still match the
			 * original large mapping
			 */
			for (i = 0; i < count; ++i)
				cmpxchg64(&lpte[i], __pte, 0ULL);

			*updated = true;
			continue;
		}

		if (!IOMMU_PTE_PRESENT(__pte) ||
		    pte_level == PAGE_MODE_NONE) {
			page = iommu_alloc_pages_node_sz(cfg->amd.nid, gfp,
							 SZ_4K);

			if (!page)
				return NULL;

			__npte = PM_LEVEL_PDE(level, iommu_virt_to_phys(page));

			/* pte could have been changed somewhere. */
			if (!try_cmpxchg64(pte, &__pte, __npte))
				iommu_free_pages(page);
			else if (IOMMU_PTE_PRESENT(__pte))
				*updated = true;

			continue;
		}

		/* No level skipping support yet */
		if (pte_level != level)
			return NULL;

		level -= 1;

		pte = IOMMU_PTE_PAGE(__pte);

		if (pte_page && level == end_lvl)
			*pte_page = pte;

		pte = &pte[PM_LEVEL_INDEX(level, address)];
	}

	return pte;
}

/*
 * This function checks if there is a PTE for a given dma address. If
 * there is one, it returns the pointer to it.
 */
static u64 *fetch_pte(struct amd_io_pgtable *pgtable,
		      unsigned long address,
		      unsigned long *page_size)
{
	int level;
	unsigned int seqcount;
	u64 *pte;

	*page_size = 0;

	if (address > PM_LEVEL_SIZE(pgtable->mode))
		return NULL;

	do {
		seqcount = read_seqcount_begin(&pgtable->seqcount);
		level = pgtable->mode - 1;
		pte = &pgtable->root[PM_LEVEL_INDEX(level, address)];
	} while (read_seqcount_retry(&pgtable->seqcount, seqcount));

	*page_size = PTE_LEVEL_PAGE_SIZE(level);

	while (level > 0) {

		/* Not Present */
		if (!IOMMU_PTE_PRESENT(*pte))
			return NULL;

		/* Large PTE */
		if (PM_PTE_LEVEL(*pte) == PAGE_MODE_7_LEVEL ||
		    PM_PTE_LEVEL(*pte) == PAGE_MODE_NONE)
			break;

		/* No level skipping support yet */
		if (PM_PTE_LEVEL(*pte) != level)
			return NULL;

		level -= 1;

		/* Walk to the next level */
		pte = IOMMU_PTE_PAGE(*pte);
		pte = &pte[PM_LEVEL_INDEX(level, address)];
		*page_size = PTE_LEVEL_PAGE_SIZE(level);
	}

	/*
	 * If we have a series of large PTEs, make
	 * sure to return a pointer to the first one.
	 */
	if (PM_PTE_LEVEL(*pte) == PAGE_MODE_7_LEVEL)
		pte = first_pte_l7(pte, page_size, NULL);

	return pte;
}

static void free_clear_pte(u64 *pte, u64 pteval,
			   struct iommu_pages_list *freelist)
{
	u64 *pt;
	int mode;

	while (!try_cmpxchg64(pte, &pteval, 0))
		pr_warn("IOMMU pte changed since we read it\n");

	if (!IOMMU_PTE_PRESENT(pteval))
		return;

	pt = IOMMU_PTE_PAGE(pteval);
	mode = IOMMU_PTE_MODE(pteval);

	free_sub_pt(pt, mode, freelist);
}

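/*
 * Illustrative sketch only (not used by the driver): the replication factor
 * that iommu_v1_map_pages() below derives from a power-of-two page size,
 * mirroring the arithmetic PAGE_SIZE_PTE_COUNT() is used for. Sizes that
 * match a level's default (4 KiB, 2 MiB, 1 GiB, ...) take a single PTE;
 * anything in between is written as a replicated series within one table
 * level, e.g. 64 KiB -> 16 PTEs, 4 MiB -> 2 PTEs.
 */
static inline unsigned long example_replicated_pte_count(unsigned long pgsize)
{
	/* Assumes pgsize is a power of two of at least 4 KiB */
	return 1UL << ((__ffs(pgsize) - 12) % 9);
}
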
/*
 * Generic mapping functions. It maps a physical address into a DMA
 * address space. It allocates the page table pages if necessary.
 * In the future it can be extended to a generic mapping function
 * supporting all features of AMD IOMMU page tables like level skipping
 * and full 64 bit address spaces.
 */
static int iommu_v1_map_pages(struct io_pgtable_ops *ops, unsigned long iova,
			      phys_addr_t paddr, size_t pgsize, size_t pgcount,
			      int prot, gfp_t gfp, size_t *mapped)
{
	struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
	struct iommu_pages_list freelist = IOMMU_PAGES_LIST_INIT(freelist);
	bool updated = false;
	u64 __pte, *pte;
	int ret, i, count;
	size_t size = pgcount << __ffs(pgsize);
	unsigned long o_iova = iova;

	BUG_ON(!IS_ALIGNED(iova, pgsize));
	BUG_ON(!IS_ALIGNED(paddr, pgsize));

	ret = -EINVAL;
	if (!(prot & IOMMU_PROT_MASK))
		goto out;

	while (pgcount > 0) {
		count = PAGE_SIZE_PTE_COUNT(pgsize);
		pte = alloc_pte(pgtable, iova, pgsize, NULL, gfp, &updated);

		ret = -ENOMEM;
		if (!pte)
			goto out;

		for (i = 0; i < count; ++i)
			free_clear_pte(&pte[i], pte[i], &freelist);

		if (!iommu_pages_list_empty(&freelist))
			updated = true;

		if (count > 1) {
			__pte = PAGE_SIZE_PTE(__sme_set(paddr), pgsize);
			__pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_PR | IOMMU_PTE_FC;
		} else
			__pte = __sme_set(paddr) | IOMMU_PTE_PR | IOMMU_PTE_FC;

		if (prot & IOMMU_PROT_IR)
			__pte |= IOMMU_PTE_IR;
		if (prot & IOMMU_PROT_IW)
			__pte |= IOMMU_PTE_IW;

		for (i = 0; i < count; ++i)
			pte[i] = __pte;

		iova += pgsize;
		paddr += pgsize;
		pgcount--;
		if (mapped)
			*mapped += pgsize;
	}

	ret = 0;

out:
	if (updated) {
		struct protection_domain *dom = io_pgtable_ops_to_domain(ops);
		unsigned long flags;

		spin_lock_irqsave(&dom->lock, flags);
		/*
		 * Flush domain TLB(s) and wait for completion. Any Device-Table
		 * Updates and flushing already happened in
		 * increase_address_space().
		 */
		amd_iommu_domain_flush_pages(dom, o_iova, size);
		spin_unlock_irqrestore(&dom->lock, flags);
	}

	/* Everything flushed out, free pages now */
	iommu_put_pages_list(&freelist);

	return ret;
}

static unsigned long iommu_v1_unmap_pages(struct io_pgtable_ops *ops,
					  unsigned long iova,
					  size_t pgsize, size_t pgcount,
					  struct iommu_iotlb_gather *gather)
{
	struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
	unsigned long long unmapped;
	unsigned long unmap_size;
	u64 *pte;
	size_t size = pgcount << __ffs(pgsize);

	BUG_ON(!is_power_of_2(pgsize));

	unmapped = 0;

	while (unmapped < size) {
		pte = fetch_pte(pgtable, iova, &unmap_size);
		if (pte) {
			int i, count;

			count = PAGE_SIZE_PTE_COUNT(unmap_size);
			for (i = 0; i < count; i++)
				pte[i] = 0ULL;
		} else {
			return unmapped;
		}

		iova = (iova & ~(unmap_size - 1)) + unmap_size;
		unmapped += unmap_size;
	}

	return unmapped;
}

static phys_addr_t iommu_v1_iova_to_phys(struct io_pgtable_ops *ops, unsigned long iova)
{
	struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
	unsigned long offset_mask, pte_pgsize;
	u64 *pte, __pte;

	pte = fetch_pte(pgtable, iova, &pte_pgsize);

	if (!pte || !IOMMU_PTE_PRESENT(*pte))
		return 0;

	offset_mask = pte_pgsize - 1;
	__pte = __sme_clr(*pte & PM_ADDR_MASK);

	return (__pte & ~offset_mask) | (iova & offset_mask);
}

static bool pte_test_and_clear_dirty(u64 *ptep, unsigned long size,
				     unsigned long flags)
{
	bool test_only = flags & IOMMU_DIRTY_NO_CLEAR;
	bool dirty = false;
	int i, count;

	/*
	 * 2.2.3.2 Host Dirty Support
	 * When a non-default page size is used, software must OR the
	 * Dirty bits in all of the replicated host PTEs used to map
	 * the page. The IOMMU does not guarantee the Dirty bits are
	 * set in all of the replicated PTEs. Any portion of the page
	 * may have been written even if the Dirty bit is set in only
	 * one of the replicated PTEs.
	 */
	count = PAGE_SIZE_PTE_COUNT(size);
	for (i = 0; i < count && test_only; i++) {
		if (test_bit(IOMMU_PTE_HD_BIT, (unsigned long *)&ptep[i])) {
			dirty = true;
			break;
		}
	}

	for (i = 0; i < count && !test_only; i++) {
		if (test_and_clear_bit(IOMMU_PTE_HD_BIT,
				       (unsigned long *)&ptep[i])) {
			dirty = true;
		}
	}

	return dirty;
}

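/*
 * Illustrative sketch only (not used by the driver): the "OR across the
 * replicated PTEs" rule quoted above, reduced to its test-only form. The
 * range counts as dirty as soon as any one replicated PTE has the
 * host-dirty bit set.
 */
static inline bool example_any_replicated_pte_dirty(const u64 *ptep,
						    unsigned long count)
{
	unsigned long i;

	for (i = 0; i < count; i++)
		if (ptep[i] & BIT_ULL(IOMMU_PTE_HD_BIT))
			return true;

	return false;
}
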
static int iommu_v1_read_and_clear_dirty(struct io_pgtable_ops *ops,
					 unsigned long iova, size_t size,
					 unsigned long flags,
					 struct iommu_dirty_bitmap *dirty)
{
	struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
	unsigned long end = iova + size - 1;

	do {
		unsigned long pgsize = 0;
		u64 *ptep, pte;

		ptep = fetch_pte(pgtable, iova, &pgsize);
		if (ptep)
			pte = READ_ONCE(*ptep);
		if (!ptep || !IOMMU_PTE_PRESENT(pte)) {
			pgsize = pgsize ?: PTE_LEVEL_PAGE_SIZE(0);
			iova += pgsize;
			continue;
		}

		/*
		 * Mark the whole IOVA range as dirty even if only one of
		 * the replicated PTEs is marked dirty.
		 */
		if (pte_test_and_clear_dirty(ptep, pgsize, flags))
			iommu_dirty_bitmap_record(dirty, iova, pgsize);
		iova += pgsize;
	} while (iova < end);

	return 0;
}

/*
 * ----------------------------------------------------
 */
static void v1_free_pgtable(struct io_pgtable *iop)
{
	struct amd_io_pgtable *pgtable = container_of(iop, struct amd_io_pgtable, pgtbl);
	struct iommu_pages_list freelist = IOMMU_PAGES_LIST_INIT(freelist);

	if (pgtable->mode == PAGE_MODE_NONE)
		return;

	/* Page-table is not visible to IOMMU anymore, so free it */
	BUG_ON(pgtable->mode < PAGE_MODE_NONE ||
	       pgtable->mode > amd_iommu_hpt_level);

	free_sub_pt(pgtable->root, pgtable->mode, &freelist);
	iommu_put_pages_list(&freelist);
}

static struct io_pgtable *v1_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie)
{
	struct amd_io_pgtable *pgtable = io_pgtable_cfg_to_data(cfg);

	pgtable->root =
		iommu_alloc_pages_node_sz(cfg->amd.nid, GFP_KERNEL, SZ_4K);
	if (!pgtable->root)
		return NULL;
	pgtable->mode = PAGE_MODE_3_LEVEL;
	seqcount_init(&pgtable->seqcount);

	cfg->pgsize_bitmap = amd_iommu_pgsize_bitmap;
	cfg->ias = IOMMU_IN_ADDR_BIT_SIZE;
	cfg->oas = IOMMU_OUT_ADDR_BIT_SIZE;

	pgtable->pgtbl.ops.map_pages = iommu_v1_map_pages;
	pgtable->pgtbl.ops.unmap_pages = iommu_v1_unmap_pages;
	pgtable->pgtbl.ops.iova_to_phys = iommu_v1_iova_to_phys;
	pgtable->pgtbl.ops.read_and_clear_dirty = iommu_v1_read_and_clear_dirty;

	return &pgtable->pgtbl;
}

struct io_pgtable_init_fns io_pgtable_amd_iommu_v1_init_fns = {
	.alloc	= v1_alloc_pgtable,
	.free	= v1_free_pgtable,
};
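
/*
 * Illustrative sketch only (not part of this file): how a caller would reach
 * v1_alloc_pgtable() through the generic io-pgtable layer. The helper name
 * and the exact cfg fields set here are assumptions for illustration; the
 * real setup lives in the AMD IOMMU driver proper.
 */
static inline struct io_pgtable_ops *example_alloc_v1_ops(struct protection_domain *domain,
							  int nid)
{
	struct io_pgtable_cfg *cfg = &domain->iop.pgtbl.cfg;

	/* NUMA node used for the page-table page allocations in this file */
	cfg->amd.nid = nid;

	/* Dispatches to v1_alloc_pgtable() via io_pgtable_amd_iommu_v1_init_fns */
	return alloc_io_pgtable_ops(AMD_IOMMU_V1, cfg, domain);
}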