// SPDX-License-Identifier: GPL-2.0-only
/*
 * CPU-agnostic AMD IO page table allocator.
 *
 * Copyright (C) 2020 Advanced Micro Devices, Inc.
 * Author: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
 */

#define pr_fmt(fmt)	"AMD-Vi: " fmt
#define dev_fmt(fmt)	pr_fmt(fmt)

#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/io-pgtable.h>
#include <linux/kernel.h>
#include <linux/sizes.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/dma-mapping.h>

#include <asm/barrier.h>

#include "amd_iommu_types.h"
#include "amd_iommu.h"
#include "../iommu-pages.h"

/*
 * Helper function to get the first pte of a large mapping
 */
static u64 *first_pte_l7(u64 *pte, unsigned long *page_size,
			 unsigned long *count)
{
	unsigned long pte_mask, pg_size, cnt;
	u64 *fpte;

	pg_size = PTE_PAGE_SIZE(*pte);
	cnt = PAGE_SIZE_PTE_COUNT(pg_size);
	pte_mask = ~((cnt << 3) - 1);
	fpte = (u64 *)(((unsigned long)pte) & pte_mask);

	if (page_size)
		*page_size = pg_size;

	if (count)
		*count = cnt;

	return fpte;
}
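
/*
 * Worked example (values assume the PTE_PAGE_SIZE()/PAGE_SIZE_PTE_COUNT()
 * definitions in amd_iommu_types.h): a 64 KiB mapping is not a default page
 * size for any level, so it is encoded as 16 replicated PTEs
 * (PM_PTE_LEVEL() == 7) in a level-0 table. For any one of those entries,
 * cnt is 16, cnt << 3 is the 128-byte span of the group, and pte_mask
 * rounds the pointer down to the first replicated entry, which is what
 * first_pte_l7() returns.
 */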

/****************************************************************************
 *
 * The functions below are used to create the page table mappings for
 * unity mapped regions.
 *
 ****************************************************************************/

static void free_pt_page(u64 *pt, struct list_head *freelist)
{
	struct page *p = virt_to_page(pt);

	list_add_tail(&p->lru, freelist);
}

static void free_pt_lvl(u64 *pt, struct list_head *freelist, int lvl)
{
	u64 *p;
	int i;

	for (i = 0; i < 512; ++i) {
		/* PTE present? */
		if (!IOMMU_PTE_PRESENT(pt[i]))
			continue;

		/* Large PTE? */
		if (PM_PTE_LEVEL(pt[i]) == 0 ||
		    PM_PTE_LEVEL(pt[i]) == 7)
			continue;

		/*
		 * Free the next level. No need to look at l1 tables here since
		 * they can only contain leaf PTEs; just free them directly.
		 */
		p = IOMMU_PTE_PAGE(pt[i]);
		if (lvl > 2)
			free_pt_lvl(p, freelist, lvl - 1);
		else
			free_pt_page(p, freelist);
	}

	free_pt_page(pt, freelist);
}

static void free_sub_pt(u64 *root, int mode, struct list_head *freelist)
{
	switch (mode) {
	case PAGE_MODE_NONE:
	case PAGE_MODE_7_LEVEL:
		break;
	case PAGE_MODE_1_LEVEL:
		free_pt_page(root, freelist);
		break;
	case PAGE_MODE_2_LEVEL:
	case PAGE_MODE_3_LEVEL:
	case PAGE_MODE_4_LEVEL:
	case PAGE_MODE_5_LEVEL:
	case PAGE_MODE_6_LEVEL:
		free_pt_lvl(root, freelist, mode);
		break;
	default:
		BUG();
	}
}

/*
 * This function is used to add another level to an IO page table. Adding
 * another level increases the size of the address space by 9 bits to a size up
 * to 64 bits.
 */
static bool increase_address_space(struct amd_io_pgtable *pgtable,
				   unsigned long address,
				   unsigned int page_size_level,
				   gfp_t gfp)
{
	struct io_pgtable_cfg *cfg = &pgtable->pgtbl.cfg;
	struct protection_domain *domain =
		container_of(pgtable, struct protection_domain, iop);
	unsigned long flags;
	bool ret = true;
	u64 *pte;

	pte = iommu_alloc_page_node(cfg->amd.nid, gfp);
	if (!pte)
		return false;

	spin_lock_irqsave(&domain->lock, flags);

	if (address <= PM_LEVEL_SIZE(pgtable->mode) &&
	    pgtable->mode - 1 >= page_size_level)
		goto out;

	ret = false;
	if (WARN_ON_ONCE(pgtable->mode == PAGE_MODE_6_LEVEL))
		goto out;

	*pte = PM_LEVEL_PDE(pgtable->mode, iommu_virt_to_phys(pgtable->root));

	pgtable->root = pte;
	pgtable->mode += 1;
	amd_iommu_update_and_flush_device_table(domain);

	pte = NULL;
	ret = true;

out:
	spin_unlock_irqrestore(&domain->lock, flags);
	iommu_free_page(pte);

	return ret;
}
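
/*
 * Worked example (assuming the PM_LEVEL_SIZE() definition in
 * amd_iommu_types.h): with the default PAGE_MODE_3_LEVEL a page table
 * covers IOVAs up to PM_LEVEL_SIZE(3) = 2^39 - 1, i.e. 512 GiB. Mapping an
 * IOVA beyond that bumps the mode to 4 (2^48 - 1) and, if needed, further
 * up to PAGE_MODE_6_LEVEL; the WARN_ON_ONCE() above refuses to grow past
 * that.
 */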

static u64 *alloc_pte(struct amd_io_pgtable *pgtable,
		      unsigned long address,
		      unsigned long page_size,
		      u64 **pte_page,
		      gfp_t gfp,
		      bool *updated)
{
	unsigned long last_addr = address + (page_size - 1);
	struct io_pgtable_cfg *cfg = &pgtable->pgtbl.cfg;
	int level, end_lvl;
	u64 *pte, *page;

	BUG_ON(!is_power_of_2(page_size));

	while (last_addr > PM_LEVEL_SIZE(pgtable->mode) ||
	       pgtable->mode - 1 < PAGE_SIZE_LEVEL(page_size)) {
		/*
		 * Return an error if there is no memory to update the
		 * page-table.
		 */
		if (!increase_address_space(pgtable, last_addr,
					    PAGE_SIZE_LEVEL(page_size), gfp))
			return NULL;
	}

	level = pgtable->mode - 1;
	pte = &pgtable->root[PM_LEVEL_INDEX(level, address)];
	address = PAGE_SIZE_ALIGN(address, page_size);
	end_lvl = PAGE_SIZE_LEVEL(page_size);

	while (level > end_lvl) {
		u64 __pte, __npte;
		int pte_level;

		__pte = *pte;
		pte_level = PM_PTE_LEVEL(__pte);

		/*
		 * If we replace a series of large PTEs, we need
		 * to tear down all of them.
		 */
		if (IOMMU_PTE_PRESENT(__pte) &&
		    pte_level == PAGE_MODE_7_LEVEL) {
			unsigned long count, i;
			u64 *lpte;

			lpte = first_pte_l7(pte, NULL, &count);

			/*
			 * Unmap the replicated PTEs that still match the
			 * original large mapping
			 */
			for (i = 0; i < count; ++i)
				cmpxchg64(&lpte[i], __pte, 0ULL);

			*updated = true;
			continue;
		}

		if (!IOMMU_PTE_PRESENT(__pte) ||
		    pte_level == PAGE_MODE_NONE) {
			page = iommu_alloc_page_node(cfg->amd.nid, gfp);

			if (!page)
				return NULL;

			__npte = PM_LEVEL_PDE(level, iommu_virt_to_phys(page));

			/* pte could have been changed somewhere. */
			if (!try_cmpxchg64(pte, &__pte, __npte))
				iommu_free_page(page);
			else if (IOMMU_PTE_PRESENT(__pte))
				*updated = true;

			continue;
		}

		/* No level skipping support yet */
		if (pte_level != level)
			return NULL;

		level -= 1;

		pte = IOMMU_PTE_PAGE(__pte);

		if (pte_page && level == end_lvl)
			*pte_page = pte;

		pte = &pte[PM_LEVEL_INDEX(level, address)];
	}

	return pte;
}
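
/*
 * Worked example (macro values assumed from amd_iommu_types.h): mapping a
 * 2 MiB page with a 3-level table gives end_lvl = PAGE_SIZE_LEVEL(SZ_2M) = 1.
 * The walk starts at level = mode - 1 = 2, descends through one page
 * directory and then stops, returning a pointer to the level-1 entry that
 * the caller fills in as a single 2 MiB leaf PTE.
 */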

/*
 * This function checks if there is a PTE for a given dma address. If
 * there is one, it returns the pointer to it.
 */
static u64 *fetch_pte(struct amd_io_pgtable *pgtable,
		      unsigned long address,
		      unsigned long *page_size)
{
	int level;
	u64 *pte;

	*page_size = 0;

	if (address > PM_LEVEL_SIZE(pgtable->mode))
		return NULL;

	level = pgtable->mode - 1;
	pte = &pgtable->root[PM_LEVEL_INDEX(level, address)];
	*page_size = PTE_LEVEL_PAGE_SIZE(level);

	while (level > 0) {

		/* Not Present */
		if (!IOMMU_PTE_PRESENT(*pte))
			return NULL;

		/* Large PTE */
		if (PM_PTE_LEVEL(*pte) == PAGE_MODE_7_LEVEL ||
		    PM_PTE_LEVEL(*pte) == PAGE_MODE_NONE)
			break;

		/* No level skipping support yet */
		if (PM_PTE_LEVEL(*pte) != level)
			return NULL;

		level -= 1;

		/* Walk to the next level */
		pte = IOMMU_PTE_PAGE(*pte);
		pte = &pte[PM_LEVEL_INDEX(level, address)];
		*page_size = PTE_LEVEL_PAGE_SIZE(level);
	}

	/*
	 * If we have a series of large PTEs, make
	 * sure to return a pointer to the first one.
	 */
	if (PM_PTE_LEVEL(*pte) == PAGE_MODE_7_LEVEL)
		pte = first_pte_l7(pte, page_size, NULL);

	return pte;
}

static void free_clear_pte(u64 *pte, u64 pteval, struct list_head *freelist)
{
	u64 *pt;
	int mode;

	while (!try_cmpxchg64(pte, &pteval, 0))
		pr_warn("AMD-Vi: IOMMU pte changed since we read it\n");

	if (!IOMMU_PTE_PRESENT(pteval))
		return;

	pt = IOMMU_PTE_PAGE(pteval);
	mode = IOMMU_PTE_MODE(pteval);

	free_sub_pt(pt, mode, freelist);
}

/*
 * Generic mapping function. It maps a physical address into a DMA
 * address space. It allocates the page table pages if necessary.
 * In the future it can be extended to a generic mapping function
 * supporting all features of AMD IOMMU page tables like level skipping
 * and full 64 bit address spaces.
 */
static int iommu_v1_map_pages(struct io_pgtable_ops *ops, unsigned long iova,
			      phys_addr_t paddr, size_t pgsize, size_t pgcount,
			      int prot, gfp_t gfp, size_t *mapped)
{
	struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
	LIST_HEAD(freelist);
	bool updated = false;
	u64 __pte, *pte;
	int ret, i, count;
	size_t size = pgcount << __ffs(pgsize);
	unsigned long o_iova = iova;

	BUG_ON(!IS_ALIGNED(iova, pgsize));
	BUG_ON(!IS_ALIGNED(paddr, pgsize));

	ret = -EINVAL;
	if (!(prot & IOMMU_PROT_MASK))
		goto out;

	while (pgcount > 0) {
		count = PAGE_SIZE_PTE_COUNT(pgsize);
		pte = alloc_pte(pgtable, iova, pgsize, NULL, gfp, &updated);

		ret = -ENOMEM;
		if (!pte)
			goto out;

		for (i = 0; i < count; ++i)
			free_clear_pte(&pte[i], pte[i], &freelist);

		if (!list_empty(&freelist))
			updated = true;

		if (count > 1) {
			__pte = PAGE_SIZE_PTE(__sme_set(paddr), pgsize);
			__pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_PR | IOMMU_PTE_FC;
		} else
			__pte = __sme_set(paddr) | IOMMU_PTE_PR | IOMMU_PTE_FC;

		if (prot & IOMMU_PROT_IR)
			__pte |= IOMMU_PTE_IR;
		if (prot & IOMMU_PROT_IW)
			__pte |= IOMMU_PTE_IW;

		for (i = 0; i < count; ++i)
			pte[i] = __pte;

		iova += pgsize;
		paddr += pgsize;
		pgcount--;
		if (mapped)
			*mapped += pgsize;
	}

	ret = 0;

out:
	if (updated) {
		struct protection_domain *dom = io_pgtable_ops_to_domain(ops);
		unsigned long flags;

		spin_lock_irqsave(&dom->lock, flags);
		/*
		 * Flush domain TLB(s) and wait for completion. Any Device-Table
		 * Updates and flushing already happened in
		 * increase_address_space().
		 */
		amd_iommu_domain_flush_pages(dom, o_iova, size);
		spin_unlock_irqrestore(&dom->lock, flags);
	}

	/* Everything flushed out, free pages now */
	iommu_put_pages_list(&freelist);

	return ret;
}

static unsigned long iommu_v1_unmap_pages(struct io_pgtable_ops *ops,
					  unsigned long iova,
					  size_t pgsize, size_t pgcount,
					  struct iommu_iotlb_gather *gather)
{
	struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
	unsigned long long unmapped;
	unsigned long unmap_size;
	u64 *pte;
	size_t size = pgcount << __ffs(pgsize);

	BUG_ON(!is_power_of_2(pgsize));

	unmapped = 0;

	while (unmapped < size) {
		pte = fetch_pte(pgtable, iova, &unmap_size);
		if (pte) {
			int i, count;

			count = PAGE_SIZE_PTE_COUNT(unmap_size);
			for (i = 0; i < count; i++)
				pte[i] = 0ULL;
		} else {
			return unmapped;
		}

		iova = (iova & ~(unmap_size - 1)) + unmap_size;
		unmapped += unmap_size;
	}

	return unmapped;
}

static phys_addr_t iommu_v1_iova_to_phys(struct io_pgtable_ops *ops, unsigned long iova)
{
	struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
	unsigned long offset_mask, pte_pgsize;
	u64 *pte, __pte;

	pte = fetch_pte(pgtable, iova, &pte_pgsize);

	if (!pte || !IOMMU_PTE_PRESENT(*pte))
		return 0;

	offset_mask = pte_pgsize - 1;
	__pte = __sme_clr(*pte & PM_ADDR_MASK);

	return (__pte & ~offset_mask) | (iova & offset_mask);
}
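
/*
 * Worked example: if the IOVA is covered by a 2 MiB mapping, fetch_pte()
 * reports pte_pgsize = SZ_2M, offset_mask is 0x1fffff, and the returned
 * physical address combines the frame address from the PTE with the low
 * 21 bits of the IOVA.
 */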

static bool pte_test_and_clear_dirty(u64 *ptep, unsigned long size,
				     unsigned long flags)
{
	bool test_only = flags & IOMMU_DIRTY_NO_CLEAR;
	bool dirty = false;
	int i, count;

	/*
	 * 2.2.3.2 Host Dirty Support
	 * When a non-default page size is used, software must OR the
	 * Dirty bits in all of the replicated host PTEs used to map
	 * the page. The IOMMU does not guarantee the Dirty bits are
	 * set in all of the replicated PTEs. Any portion of the page
	 * may have been written even if the Dirty bit is set in only
	 * one of the replicated PTEs.
	 */
	count = PAGE_SIZE_PTE_COUNT(size);
	for (i = 0; i < count && test_only; i++) {
		if (test_bit(IOMMU_PTE_HD_BIT, (unsigned long *)&ptep[i])) {
			dirty = true;
			break;
		}
	}

	for (i = 0; i < count && !test_only; i++) {
		if (test_and_clear_bit(IOMMU_PTE_HD_BIT,
				       (unsigned long *)&ptep[i])) {
			dirty = true;
		}
	}

	return dirty;
}

static int iommu_v1_read_and_clear_dirty(struct io_pgtable_ops *ops,
					 unsigned long iova, size_t size,
					 unsigned long flags,
					 struct iommu_dirty_bitmap *dirty)
{
	struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
	unsigned long end = iova + size - 1;

	do {
		unsigned long pgsize = 0;
		u64 *ptep, pte;

		ptep = fetch_pte(pgtable, iova, &pgsize);
		if (ptep)
			pte = READ_ONCE(*ptep);
		if (!ptep || !IOMMU_PTE_PRESENT(pte)) {
			pgsize = pgsize ?: PTE_LEVEL_PAGE_SIZE(0);
			iova += pgsize;
			continue;
		}

		/*
		 * Mark the whole IOVA range as dirty even if only one of
		 * the replicated PTEs was marked dirty.
		 */
		if (pte_test_and_clear_dirty(ptep, pgsize, flags))
			iommu_dirty_bitmap_record(dirty, iova, pgsize);
		iova += pgsize;
	} while (iova < end);

	return 0;
}

/*
 * ----------------------------------------------------
 */
static void v1_free_pgtable(struct io_pgtable *iop)
{
	struct amd_io_pgtable *pgtable = container_of(iop, struct amd_io_pgtable, pgtbl);
	LIST_HEAD(freelist);

	if (pgtable->mode == PAGE_MODE_NONE)
		return;

	/* Page-table is not visible to IOMMU anymore, so free it */
	BUG_ON(pgtable->mode < PAGE_MODE_NONE ||
	       pgtable->mode > PAGE_MODE_6_LEVEL);

	free_sub_pt(pgtable->root, pgtable->mode, &freelist);
	iommu_put_pages_list(&freelist);
}

static struct io_pgtable *v1_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie)
{
	struct amd_io_pgtable *pgtable = io_pgtable_cfg_to_data(cfg);

	pgtable->root = iommu_alloc_page_node(cfg->amd.nid, GFP_KERNEL);
	if (!pgtable->root)
		return NULL;
	pgtable->mode = PAGE_MODE_3_LEVEL;

	cfg->pgsize_bitmap = amd_iommu_pgsize_bitmap;
	cfg->ias = IOMMU_IN_ADDR_BIT_SIZE;
	cfg->oas = IOMMU_OUT_ADDR_BIT_SIZE;

	pgtable->pgtbl.ops.map_pages = iommu_v1_map_pages;
	pgtable->pgtbl.ops.unmap_pages = iommu_v1_unmap_pages;
	pgtable->pgtbl.ops.iova_to_phys = iommu_v1_iova_to_phys;
	pgtable->pgtbl.ops.read_and_clear_dirty = iommu_v1_read_and_clear_dirty;

	return &pgtable->pgtbl;
}

struct io_pgtable_init_fns io_pgtable_amd_iommu_v1_init_fns = {
	.alloc	= v1_alloc_pgtable,
	.free	= v1_free_pgtable,
};
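
/*
 * Usage sketch (illustrative only; the real call sites live elsewhere in
 * the AMD IOMMU driver): the io-pgtable core looks up these init-fns by
 * format, so a caller obtains the ops above roughly like this, where cfg
 * is the domain's struct io_pgtable_cfg and cookie is driver-private data:
 *
 *	struct io_pgtable_ops *ops;
 *	size_t mapped = 0;
 *
 *	ops = alloc_io_pgtable_ops(AMD_IOMMU_V1, cfg, cookie);
 *	if (!ops)
 *		return -ENOMEM;
 *	ops->map_pages(ops, iova, paddr, SZ_2M, 1,
 *		       IOMMU_PROT_IR | IOMMU_PROT_IW, GFP_KERNEL, &mapped);
 *
 * which dispatches to v1_alloc_pgtable() and then to the iommu_v1_*()
 * callbacks installed above.
 */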