// SPDX-License-Identifier: GPL-2.0-only
/*
 * CPU-agnostic AMD IO page table allocator.
 *
 * Copyright (C) 2020 Advanced Micro Devices, Inc.
 * Author: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
 */

#define pr_fmt(fmt)	"AMD-Vi: " fmt
#define dev_fmt(fmt)	pr_fmt(fmt)

#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/io-pgtable.h>
#include <linux/kernel.h>
#include <linux/sizes.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/dma-mapping.h>

#include <asm/barrier.h>

#include "amd_iommu_types.h"
#include "amd_iommu.h"
#include "../iommu-pages.h"

/*
 * Helper function to get the first pte of a large mapping
 */
static u64 *first_pte_l7(u64 *pte, unsigned long *page_size,
			 unsigned long *count)
{
	unsigned long pte_mask, pg_size, cnt;
	u64 *fpte;

	pg_size = PTE_PAGE_SIZE(*pte);
	cnt = PAGE_SIZE_PTE_COUNT(pg_size);
	pte_mask = ~((cnt << 3) - 1);
	fpte = (u64 *)(((unsigned long)pte) & pte_mask);

	if (page_size)
		*page_size = pg_size;

	if (count)
		*count = cnt;

	return fpte;
}

/****************************************************************************
 *
 * The functions below are used to create the page table mappings for
 * unity mapped regions.
 *
 ****************************************************************************/

static void free_pt_page(u64 *pt, struct list_head *freelist)
{
	struct page *p = virt_to_page(pt);

	list_add_tail(&p->lru, freelist);
}

static void free_pt_lvl(u64 *pt, struct list_head *freelist, int lvl)
{
	u64 *p;
	int i;

	for (i = 0; i < 512; ++i) {
		/* PTE present? */
		if (!IOMMU_PTE_PRESENT(pt[i]))
			continue;

		/* Large PTE? */
		if (PM_PTE_LEVEL(pt[i]) == 0 ||
		    PM_PTE_LEVEL(pt[i]) == 7)
			continue;

		/*
		 * Free the next level. No need to look at l1 tables here since
		 * they can only contain leaf PTEs; just free them directly.
		 */
		p = IOMMU_PTE_PAGE(pt[i]);
		if (lvl > 2)
			free_pt_lvl(p, freelist, lvl - 1);
		else
			free_pt_page(p, freelist);
	}

	free_pt_page(pt, freelist);
}

static void free_sub_pt(u64 *root, int mode, struct list_head *freelist)
{
	switch (mode) {
	case PAGE_MODE_NONE:
	case PAGE_MODE_7_LEVEL:
		break;
	case PAGE_MODE_1_LEVEL:
		free_pt_page(root, freelist);
		break;
	case PAGE_MODE_2_LEVEL:
	case PAGE_MODE_3_LEVEL:
	case PAGE_MODE_4_LEVEL:
	case PAGE_MODE_5_LEVEL:
	case PAGE_MODE_6_LEVEL:
		free_pt_lvl(root, freelist, mode);
		break;
	default:
		BUG();
	}
}

/*
 * This function is used to add another level to an IO page table. Adding
 * another level increases the size of the address space by 9 bits to a size up
 * to 64 bits.
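 *
 * The new top level is installed under the domain lock: the old root becomes
 * the first entry of the freshly allocated level, the device table is updated
 * and flushed so the IOMMU sees the new root, and the allocation is discarded
 * if another path already grew the address space in the meantime.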
 */
static bool increase_address_space(struct amd_io_pgtable *pgtable,
				   unsigned long address,
				   gfp_t gfp)
{
	struct io_pgtable_cfg *cfg = &pgtable->pgtbl.cfg;
	struct protection_domain *domain =
		container_of(pgtable, struct protection_domain, iop);
	unsigned long flags;
	bool ret = true;
	u64 *pte;

	pte = iommu_alloc_page_node(cfg->amd.nid, gfp);
	if (!pte)
		return false;

	spin_lock_irqsave(&domain->lock, flags);

	if (address <= PM_LEVEL_SIZE(pgtable->mode))
		goto out;

	ret = false;
	if (WARN_ON_ONCE(pgtable->mode == PAGE_MODE_6_LEVEL))
		goto out;

	*pte = PM_LEVEL_PDE(pgtable->mode, iommu_virt_to_phys(pgtable->root));

	pgtable->root = pte;
	pgtable->mode += 1;
	amd_iommu_update_and_flush_device_table(domain);

	pte = NULL;
	ret = true;

out:
	spin_unlock_irqrestore(&domain->lock, flags);
	iommu_free_page(pte);

	return ret;
}

static u64 *alloc_pte(struct amd_io_pgtable *pgtable,
		      unsigned long address,
		      unsigned long page_size,
		      u64 **pte_page,
		      gfp_t gfp,
		      bool *updated)
{
	struct io_pgtable_cfg *cfg = &pgtable->pgtbl.cfg;
	int level, end_lvl;
	u64 *pte, *page;

	BUG_ON(!is_power_of_2(page_size));

	while (address > PM_LEVEL_SIZE(pgtable->mode)) {
		/*
		 * Return an error if there is no memory to update the
		 * page-table.
		 */
		if (!increase_address_space(pgtable, address, gfp))
			return NULL;
	}

	level = pgtable->mode - 1;
	pte = &pgtable->root[PM_LEVEL_INDEX(level, address)];
	address = PAGE_SIZE_ALIGN(address, page_size);
	end_lvl = PAGE_SIZE_LEVEL(page_size);

	while (level > end_lvl) {
		u64 __pte, __npte;
		int pte_level;

		__pte = *pte;
		pte_level = PM_PTE_LEVEL(__pte);

		/*
		 * If we replace a series of large PTEs, we need
		 * to tear down all of them.
		 */
		if (IOMMU_PTE_PRESENT(__pte) &&
		    pte_level == PAGE_MODE_7_LEVEL) {
			unsigned long count, i;
			u64 *lpte;

			lpte = first_pte_l7(pte, NULL, &count);

			/*
			 * Unmap the replicated PTEs that still match the
			 * original large mapping
			 */
			for (i = 0; i < count; ++i)
				cmpxchg64(&lpte[i], __pte, 0ULL);

			*updated = true;
			continue;
		}

		if (!IOMMU_PTE_PRESENT(__pte) ||
		    pte_level == PAGE_MODE_NONE) {
			page = iommu_alloc_page_node(cfg->amd.nid, gfp);

			if (!page)
				return NULL;

			__npte = PM_LEVEL_PDE(level, iommu_virt_to_phys(page));

			/* pte could have been changed somewhere. */
			if (!try_cmpxchg64(pte, &__pte, __npte))
				iommu_free_page(page);
			else if (IOMMU_PTE_PRESENT(__pte))
				*updated = true;

			continue;
		}

		/* No level skipping support yet */
		if (pte_level != level)
			return NULL;

		level -= 1;

		pte = IOMMU_PTE_PAGE(__pte);

		if (pte_page && level == end_lvl)
			*pte_page = pte;

		pte = &pte[PM_LEVEL_INDEX(level, address)];
	}

	return pte;
}

/*
 * This function checks if there is a PTE for a given dma address. If
 * there is one, it returns the pointer to it.
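 * On return, *page_size holds the page size mapped at that address; for a
 * series of replicated large PTEs, a pointer to the first PTE of the series
 * is returned.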
 */
static u64 *fetch_pte(struct amd_io_pgtable *pgtable,
		      unsigned long address,
		      unsigned long *page_size)
{
	int level;
	u64 *pte;

	*page_size = 0;

	if (address > PM_LEVEL_SIZE(pgtable->mode))
		return NULL;

	level = pgtable->mode - 1;
	pte = &pgtable->root[PM_LEVEL_INDEX(level, address)];
	*page_size = PTE_LEVEL_PAGE_SIZE(level);

	while (level > 0) {

		/* Not Present */
		if (!IOMMU_PTE_PRESENT(*pte))
			return NULL;

		/* Large PTE */
		if (PM_PTE_LEVEL(*pte) == PAGE_MODE_7_LEVEL ||
		    PM_PTE_LEVEL(*pte) == PAGE_MODE_NONE)
			break;

		/* No level skipping support yet */
		if (PM_PTE_LEVEL(*pte) != level)
			return NULL;

		level -= 1;

		/* Walk to the next level */
		pte = IOMMU_PTE_PAGE(*pte);
		pte = &pte[PM_LEVEL_INDEX(level, address)];
		*page_size = PTE_LEVEL_PAGE_SIZE(level);
	}

	/*
	 * If we have a series of large PTEs, make
	 * sure to return a pointer to the first one.
	 */
	if (PM_PTE_LEVEL(*pte) == PAGE_MODE_7_LEVEL)
		pte = first_pte_l7(pte, page_size, NULL);

	return pte;
}

static void free_clear_pte(u64 *pte, u64 pteval, struct list_head *freelist)
{
	u64 *pt;
	int mode;

	while (!try_cmpxchg64(pte, &pteval, 0))
		pr_warn("AMD-Vi: IOMMU pte changed since we read it\n");

	if (!IOMMU_PTE_PRESENT(pteval))
		return;

	pt = IOMMU_PTE_PAGE(pteval);
	mode = IOMMU_PTE_MODE(pteval);

	free_sub_pt(pt, mode, freelist);
}

/*
 * Generic mapping function. It maps a physical address into a DMA
 * address space. It allocates the page table pages if necessary.
 * In the future it can be extended to a generic mapping function
 * supporting all features of AMD IOMMU page tables like level skipping
 * and full 64 bit address spaces.
 */
static int iommu_v1_map_pages(struct io_pgtable_ops *ops, unsigned long iova,
			      phys_addr_t paddr, size_t pgsize, size_t pgcount,
			      int prot, gfp_t gfp, size_t *mapped)
{
	struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
	LIST_HEAD(freelist);
	bool updated = false;
	u64 __pte, *pte;
	int ret, i, count;
	size_t size = pgcount << __ffs(pgsize);
	unsigned long o_iova = iova;

	BUG_ON(!IS_ALIGNED(iova, pgsize));
	BUG_ON(!IS_ALIGNED(paddr, pgsize));

	ret = -EINVAL;
	if (!(prot & IOMMU_PROT_MASK))
		goto out;

	while (pgcount > 0) {
		count = PAGE_SIZE_PTE_COUNT(pgsize);
		pte = alloc_pte(pgtable, iova, pgsize, NULL, gfp, &updated);

		ret = -ENOMEM;
		if (!pte)
			goto out;

		for (i = 0; i < count; ++i)
			free_clear_pte(&pte[i], pte[i], &freelist);

		if (!list_empty(&freelist))
			updated = true;

		if (count > 1) {
			__pte = PAGE_SIZE_PTE(__sme_set(paddr), pgsize);
			__pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_PR | IOMMU_PTE_FC;
		} else
			__pte = __sme_set(paddr) | IOMMU_PTE_PR | IOMMU_PTE_FC;

		if (prot & IOMMU_PROT_IR)
			__pte |= IOMMU_PTE_IR;
		if (prot & IOMMU_PROT_IW)
			__pte |= IOMMU_PTE_IW;

		for (i = 0; i < count; ++i)
			pte[i] = __pte;

		iova += pgsize;
		paddr += pgsize;
		pgcount--;
		if (mapped)
			*mapped += pgsize;
	}

	ret = 0;

out:
	if (updated) {
		struct protection_domain *dom = io_pgtable_ops_to_domain(ops);
		unsigned long flags;

		spin_lock_irqsave(&dom->lock, flags);
		/*
		 * Flush domain TLB(s) and wait for completion.
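		 * The flush covers the whole [o_iova, o_iova + size) range
		 * that was (re)mapped above.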
		 * Any Device-Table Updates and flushing already happened in
		 * increase_address_space().
		 */
		amd_iommu_domain_flush_pages(dom, o_iova, size);
		spin_unlock_irqrestore(&dom->lock, flags);
	}

	/* Everything flushed out, free pages now */
	iommu_put_pages_list(&freelist);

	return ret;
}

static unsigned long iommu_v1_unmap_pages(struct io_pgtable_ops *ops,
					  unsigned long iova,
					  size_t pgsize, size_t pgcount,
					  struct iommu_iotlb_gather *gather)
{
	struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
	unsigned long long unmapped;
	unsigned long unmap_size;
	u64 *pte;
	size_t size = pgcount << __ffs(pgsize);

	BUG_ON(!is_power_of_2(pgsize));

	unmapped = 0;

	while (unmapped < size) {
		pte = fetch_pte(pgtable, iova, &unmap_size);
		if (pte) {
			int i, count;

			count = PAGE_SIZE_PTE_COUNT(unmap_size);
			for (i = 0; i < count; i++)
				pte[i] = 0ULL;
		} else {
			return unmapped;
		}

		iova = (iova & ~(unmap_size - 1)) + unmap_size;
		unmapped += unmap_size;
	}

	return unmapped;
}

static phys_addr_t iommu_v1_iova_to_phys(struct io_pgtable_ops *ops, unsigned long iova)
{
	struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
	unsigned long offset_mask, pte_pgsize;
	u64 *pte, __pte;

	pte = fetch_pte(pgtable, iova, &pte_pgsize);

	if (!pte || !IOMMU_PTE_PRESENT(*pte))
		return 0;

	offset_mask = pte_pgsize - 1;
	__pte = __sme_clr(*pte & PM_ADDR_MASK);

	return (__pte & ~offset_mask) | (iova & offset_mask);
}

static bool pte_test_and_clear_dirty(u64 *ptep, unsigned long size,
				     unsigned long flags)
{
	bool test_only = flags & IOMMU_DIRTY_NO_CLEAR;
	bool dirty = false;
	int i, count;

	/*
	 * 2.2.3.2 Host Dirty Support
	 * When a non-default page size is used, software must OR the
	 * Dirty bits in all of the replicated host PTEs used to map
	 * the page. The IOMMU does not guarantee the Dirty bits are
	 * set in all of the replicated PTEs. Any portion of the page
	 * may have been written even if the Dirty bit is set in only
	 * one of the replicated PTEs.
	 */
	count = PAGE_SIZE_PTE_COUNT(size);
	for (i = 0; i < count && test_only; i++) {
		if (test_bit(IOMMU_PTE_HD_BIT, (unsigned long *)&ptep[i])) {
			dirty = true;
			break;
		}
	}

	for (i = 0; i < count && !test_only; i++) {
		if (test_and_clear_bit(IOMMU_PTE_HD_BIT,
				       (unsigned long *)&ptep[i])) {
			dirty = true;
		}
	}

	return dirty;
}

static int iommu_v1_read_and_clear_dirty(struct io_pgtable_ops *ops,
					 unsigned long iova, size_t size,
					 unsigned long flags,
					 struct iommu_dirty_bitmap *dirty)
{
	struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
	unsigned long end = iova + size - 1;

	do {
		unsigned long pgsize = 0;
		u64 *ptep, pte;

		ptep = fetch_pte(pgtable, iova, &pgsize);
		if (ptep)
			pte = READ_ONCE(*ptep);
		if (!ptep || !IOMMU_PTE_PRESENT(pte)) {
			pgsize = pgsize ?: PTE_LEVEL_PAGE_SIZE(0);
			iova += pgsize;
			continue;
		}

		/*
		 * Mark the whole IOVA range as dirty even if only one of
		 * the replicated PTEs was marked dirty.
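		 * This follows the Host Dirty Support rule quoted in
		 * pte_test_and_clear_dirty(): a Dirty bit set in any of the
		 * replicated entries may stand for writes anywhere in the page.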
		 */
		if (pte_test_and_clear_dirty(ptep, pgsize, flags))
			iommu_dirty_bitmap_record(dirty, iova, pgsize);
		iova += pgsize;
	} while (iova < end);

	return 0;
}

/*
 * ----------------------------------------------------
 */
static void v1_free_pgtable(struct io_pgtable *iop)
{
	struct amd_io_pgtable *pgtable = container_of(iop, struct amd_io_pgtable, pgtbl);
	LIST_HEAD(freelist);

	if (pgtable->mode == PAGE_MODE_NONE)
		return;

	/* Page-table is not visible to IOMMU anymore, so free it */
	BUG_ON(pgtable->mode < PAGE_MODE_NONE ||
	       pgtable->mode > PAGE_MODE_6_LEVEL);

	free_sub_pt(pgtable->root, pgtable->mode, &freelist);
	iommu_put_pages_list(&freelist);
}

static struct io_pgtable *v1_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie)
{
	struct amd_io_pgtable *pgtable = io_pgtable_cfg_to_data(cfg);

	pgtable->root = iommu_alloc_page_node(cfg->amd.nid, GFP_KERNEL);
	if (!pgtable->root)
		return NULL;
	pgtable->mode = PAGE_MODE_3_LEVEL;

	cfg->pgsize_bitmap = amd_iommu_pgsize_bitmap;
	cfg->ias = IOMMU_IN_ADDR_BIT_SIZE;
	cfg->oas = IOMMU_OUT_ADDR_BIT_SIZE;

	pgtable->pgtbl.ops.map_pages = iommu_v1_map_pages;
	pgtable->pgtbl.ops.unmap_pages = iommu_v1_unmap_pages;
	pgtable->pgtbl.ops.iova_to_phys = iommu_v1_iova_to_phys;
	pgtable->pgtbl.ops.read_and_clear_dirty = iommu_v1_read_and_clear_dirty;

	return &pgtable->pgtbl;
}

struct io_pgtable_init_fns io_pgtable_amd_iommu_v1_init_fns = {
	.alloc	= v1_alloc_pgtable,
	.free	= v1_free_pgtable,
};