// SPDX-License-Identifier: GPL-2.0-only
/*
 * CPU-agnostic AMD IO page table allocator.
 *
 * Copyright (C) 2020 Advanced Micro Devices, Inc.
 * Author: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
 */

#define pr_fmt(fmt)	"AMD-Vi: " fmt
#define dev_fmt(fmt)	pr_fmt(fmt)

#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/io-pgtable.h>
#include <linux/kernel.h>
#include <linux/sizes.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/dma-mapping.h>

#include <asm/barrier.h>

#include "amd_iommu_types.h"
#include "amd_iommu.h"
#include "../iommu-pages.h"

/*
 * Helper function to get the first pte of a large mapping
 */
static u64 *first_pte_l7(u64 *pte, unsigned long *page_size,
			 unsigned long *count)
{
	unsigned long pte_mask, pg_size, cnt;
	u64 *fpte;

	pg_size = PTE_PAGE_SIZE(*pte);
	cnt = PAGE_SIZE_PTE_COUNT(pg_size);
	pte_mask = ~((cnt << 3) - 1);
	fpte = (u64 *)(((unsigned long)pte) & pte_mask);

	if (page_size)
		*page_size = pg_size;

	if (count)
		*count = cnt;

	return fpte;
}

static void free_pt_page(u64 *pt, struct list_head *freelist)
{
	struct page *p = virt_to_page(pt);

	list_add_tail(&p->lru, freelist);
}

static void free_pt_lvl(u64 *pt, struct list_head *freelist, int lvl)
{
	u64 *p;
	int i;

	for (i = 0; i < 512; ++i) {
		/* PTE present? */
		if (!IOMMU_PTE_PRESENT(pt[i]))
			continue;

		/* Large PTE? */
		if (PM_PTE_LEVEL(pt[i]) == 0 ||
		    PM_PTE_LEVEL(pt[i]) == 7)
			continue;

		/*
		 * Free the next level. No need to look at l1 tables here since
		 * they can only contain leaf PTEs; just free them directly.
		 */
		p = IOMMU_PTE_PAGE(pt[i]);
		if (lvl > 2)
			free_pt_lvl(p, freelist, lvl - 1);
		else
			free_pt_page(p, freelist);
	}

	free_pt_page(pt, freelist);
}

static void free_sub_pt(u64 *root, int mode, struct list_head *freelist)
{
	switch (mode) {
	case PAGE_MODE_NONE:
	case PAGE_MODE_7_LEVEL:
		break;
	case PAGE_MODE_1_LEVEL:
		free_pt_page(root, freelist);
		break;
	case PAGE_MODE_2_LEVEL:
	case PAGE_MODE_3_LEVEL:
	case PAGE_MODE_4_LEVEL:
	case PAGE_MODE_5_LEVEL:
	case PAGE_MODE_6_LEVEL:
		free_pt_lvl(root, freelist, mode);
		break;
	default:
		BUG();
	}
}
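
/*
 * For orientation (the authoritative numbers are the PM_LEVEL_* helpers
 * used throughout this file): each page-table level resolves 9 bits of
 * IOVA on top of the 4 KiB page offset. A 1-level table therefore covers
 * 2 MiB, the default 3-level table covers a 39-bit (512 GiB) IOVA space,
 * 4 levels cover 48 bits, 5 levels cover 57 bits, and 6 levels are
 * treated as covering the full 64-bit space.
 */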

/*
 * This function adds another level to an IO page table. Each additional
 * level increases the size of the address space by 9 bits, up to a
 * maximum of 64 bits.
 */
static bool increase_address_space(struct amd_io_pgtable *pgtable,
				   unsigned long address,
				   unsigned int page_size_level,
				   gfp_t gfp)
{
	struct io_pgtable_cfg *cfg = &pgtable->pgtbl.cfg;
	struct protection_domain *domain =
		container_of(pgtable, struct protection_domain, iop);
	unsigned long flags;
	bool ret = true;
	u64 *pte;

	pte = iommu_alloc_page_node(cfg->amd.nid, gfp);
	if (!pte)
		return false;

	spin_lock_irqsave(&domain->lock, flags);

	if (address <= PM_LEVEL_SIZE(pgtable->mode) &&
	    pgtable->mode - 1 >= page_size_level)
		goto out;

	ret = false;
	if (WARN_ON_ONCE(pgtable->mode == PAGE_MODE_6_LEVEL))
		goto out;

	*pte = PM_LEVEL_PDE(pgtable->mode, iommu_virt_to_phys(pgtable->root));

	pgtable->root = pte;
	pgtable->mode += 1;
	amd_iommu_update_and_flush_device_table(domain);

	pte = NULL;
	ret = true;

out:
	spin_unlock_irqrestore(&domain->lock, flags);
	iommu_free_page(pte);

	return ret;
}

static u64 *alloc_pte(struct amd_io_pgtable *pgtable,
		      unsigned long address,
		      unsigned long page_size,
		      u64 **pte_page,
		      gfp_t gfp,
		      bool *updated)
{
	unsigned long last_addr = address + (page_size - 1);
	struct io_pgtable_cfg *cfg = &pgtable->pgtbl.cfg;
	int level, end_lvl;
	u64 *pte, *page;

	BUG_ON(!is_power_of_2(page_size));

	while (last_addr > PM_LEVEL_SIZE(pgtable->mode) ||
	       pgtable->mode - 1 < PAGE_SIZE_LEVEL(page_size)) {
		/*
		 * Return an error if there is no memory to update the
		 * page-table.
		 */
		if (!increase_address_space(pgtable, last_addr,
					    PAGE_SIZE_LEVEL(page_size), gfp))
			return NULL;
	}

	level = pgtable->mode - 1;
	pte = &pgtable->root[PM_LEVEL_INDEX(level, address)];
	address = PAGE_SIZE_ALIGN(address, page_size);
	end_lvl = PAGE_SIZE_LEVEL(page_size);

	while (level > end_lvl) {
		u64 __pte, __npte;
		int pte_level;

		__pte = *pte;
		pte_level = PM_PTE_LEVEL(__pte);

		/*
		 * If we replace a series of large PTEs, we need
		 * to tear down all of them.
		 */
		if (IOMMU_PTE_PRESENT(__pte) &&
		    pte_level == PAGE_MODE_7_LEVEL) {
			unsigned long count, i;
			u64 *lpte;

			lpte = first_pte_l7(pte, NULL, &count);

			/*
			 * Unmap the replicated PTEs that still match the
			 * original large mapping
			 */
			for (i = 0; i < count; ++i)
				cmpxchg64(&lpte[i], __pte, 0ULL);

			*updated = true;
			continue;
		}

		if (!IOMMU_PTE_PRESENT(__pte) ||
		    pte_level == PAGE_MODE_NONE) {
			page = iommu_alloc_page_node(cfg->amd.nid, gfp);

			if (!page)
				return NULL;

			__npte = PM_LEVEL_PDE(level, iommu_virt_to_phys(page));

			/* pte could have been changed somewhere. */
			if (!try_cmpxchg64(pte, &__pte, __npte))
				iommu_free_page(page);
			else if (IOMMU_PTE_PRESENT(__pte))
				*updated = true;

			continue;
		}

		/* No level skipping support yet */
		if (pte_level != level)
			return NULL;

		level -= 1;

		pte = IOMMU_PTE_PAGE(__pte);

		if (pte_page && level == end_lvl)
			*pte_page = pte;

		pte = &pte[PM_LEVEL_INDEX(level, address)];
	}

	return pte;
}
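
/*
 * Note on the PAGE_MODE_7_LEVEL handling in alloc_pte() above and in
 * fetch_pte() below (illustrative numbers, derived from
 * PAGE_SIZE_PTE_COUNT()): page sizes that are not a native level size
 * (4 KiB, 2 MiB, 1 GiB, ...) are encoded as a run of identical "level 7"
 * PTEs. A 32 KiB mapping, for example, occupies 8 consecutive level-0
 * entries carrying the same address and size bits, and first_pte_l7()
 * masks the entry pointer down to the resulting 8 * 8 = 64 byte boundary
 * to find the first entry of the run.
 */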

/*
 * This function checks if there is a PTE for a given DMA address.
 * If there is one, it returns the pointer to it.
 */
static u64 *fetch_pte(struct amd_io_pgtable *pgtable,
		      unsigned long address,
		      unsigned long *page_size)
{
	int level;
	u64 *pte;

	*page_size = 0;

	if (address > PM_LEVEL_SIZE(pgtable->mode))
		return NULL;

	level = pgtable->mode - 1;
	pte = &pgtable->root[PM_LEVEL_INDEX(level, address)];
	*page_size = PTE_LEVEL_PAGE_SIZE(level);

	while (level > 0) {

		/* Not Present */
		if (!IOMMU_PTE_PRESENT(*pte))
			return NULL;

		/* Large PTE */
		if (PM_PTE_LEVEL(*pte) == PAGE_MODE_7_LEVEL ||
		    PM_PTE_LEVEL(*pte) == PAGE_MODE_NONE)
			break;

		/* No level skipping support yet */
		if (PM_PTE_LEVEL(*pte) != level)
			return NULL;

		level -= 1;

		/* Walk to the next level */
		pte = IOMMU_PTE_PAGE(*pte);
		pte = &pte[PM_LEVEL_INDEX(level, address)];
		*page_size = PTE_LEVEL_PAGE_SIZE(level);
	}

	/*
	 * If we have a series of large PTEs, make
	 * sure to return a pointer to the first one.
	 */
	if (PM_PTE_LEVEL(*pte) == PAGE_MODE_7_LEVEL)
		pte = first_pte_l7(pte, page_size, NULL);

	return pte;
}

static void free_clear_pte(u64 *pte, u64 pteval, struct list_head *freelist)
{
	u64 *pt;
	int mode;

	while (!try_cmpxchg64(pte, &pteval, 0))
		pr_warn("IOMMU pte changed since we read it\n");

	if (!IOMMU_PTE_PRESENT(pteval))
		return;

	pt = IOMMU_PTE_PAGE(pteval);
	mode = IOMMU_PTE_MODE(pteval);

	free_sub_pt(pt, mode, freelist);
}
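
/*
 * Page-table pages collected on a freelist by free_clear_pte() and
 * free_sub_pt() are deliberately not freed right away: the map path below
 * only hands them to iommu_put_pages_list() after the IOTLB has been
 * flushed, so the hardware can never walk into a table page that has
 * already been reused.
 */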

/*
 * Generic mapping function. It maps a physical address into a DMA
 * address space. It allocates the page table pages if necessary.
 * In the future it can be extended to a generic mapping function
 * supporting all features of AMD IOMMU page tables like level skipping
 * and full 64 bit address spaces.
 */
static int iommu_v1_map_pages(struct io_pgtable_ops *ops, unsigned long iova,
			      phys_addr_t paddr, size_t pgsize, size_t pgcount,
			      int prot, gfp_t gfp, size_t *mapped)
{
	struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
	LIST_HEAD(freelist);
	bool updated = false;
	u64 __pte, *pte;
	int ret, i, count;
	size_t size = pgcount << __ffs(pgsize);
	unsigned long o_iova = iova;

	BUG_ON(!IS_ALIGNED(iova, pgsize));
	BUG_ON(!IS_ALIGNED(paddr, pgsize));

	ret = -EINVAL;
	if (!(prot & IOMMU_PROT_MASK))
		goto out;

	while (pgcount > 0) {
		count = PAGE_SIZE_PTE_COUNT(pgsize);
		pte = alloc_pte(pgtable, iova, pgsize, NULL, gfp, &updated);

		ret = -ENOMEM;
		if (!pte)
			goto out;

		for (i = 0; i < count; ++i)
			free_clear_pte(&pte[i], pte[i], &freelist);

		if (!list_empty(&freelist))
			updated = true;

		if (count > 1) {
			__pte = PAGE_SIZE_PTE(__sme_set(paddr), pgsize);
			__pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_PR | IOMMU_PTE_FC;
		} else
			__pte = __sme_set(paddr) | IOMMU_PTE_PR | IOMMU_PTE_FC;

		if (prot & IOMMU_PROT_IR)
			__pte |= IOMMU_PTE_IR;
		if (prot & IOMMU_PROT_IW)
			__pte |= IOMMU_PTE_IW;

		for (i = 0; i < count; ++i)
			pte[i] = __pte;

		iova += pgsize;
		paddr += pgsize;
		pgcount--;
		if (mapped)
			*mapped += pgsize;
	}

	ret = 0;

out:
	if (updated) {
		struct protection_domain *dom = io_pgtable_ops_to_domain(ops);
		unsigned long flags;

		spin_lock_irqsave(&dom->lock, flags);
		/*
		 * Flush domain TLB(s) and wait for completion. Any Device-Table
		 * Updates and flushing already happened in
		 * increase_address_space().
		 */
		amd_iommu_domain_flush_pages(dom, o_iova, size);
		spin_unlock_irqrestore(&dom->lock, flags);
	}

	/* Everything flushed out, free pages now */
	iommu_put_pages_list(&freelist);

	return ret;
}

static unsigned long iommu_v1_unmap_pages(struct io_pgtable_ops *ops,
					  unsigned long iova,
					  size_t pgsize, size_t pgcount,
					  struct iommu_iotlb_gather *gather)
{
	struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
	unsigned long long unmapped;
	unsigned long unmap_size;
	u64 *pte;
	size_t size = pgcount << __ffs(pgsize);

	BUG_ON(!is_power_of_2(pgsize));

	unmapped = 0;

	while (unmapped < size) {
		pte = fetch_pte(pgtable, iova, &unmap_size);
		if (pte) {
			int i, count;

			count = PAGE_SIZE_PTE_COUNT(unmap_size);
			for (i = 0; i < count; i++)
				pte[i] = 0ULL;
		} else {
			return unmapped;
		}

		iova = (iova & ~(unmap_size - 1)) + unmap_size;
		unmapped += unmap_size;
	}

	return unmapped;
}

static phys_addr_t iommu_v1_iova_to_phys(struct io_pgtable_ops *ops, unsigned long iova)
{
	struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
	unsigned long offset_mask, pte_pgsize;
	u64 *pte, __pte;

	pte = fetch_pte(pgtable, iova, &pte_pgsize);

	if (!pte || !IOMMU_PTE_PRESENT(*pte))
		return 0;

	offset_mask = pte_pgsize - 1;
	__pte = __sme_clr(*pte & PM_ADDR_MASK);

	return (__pte & ~offset_mask) | (iova & offset_mask);
}

static bool pte_test_and_clear_dirty(u64 *ptep, unsigned long size,
				     unsigned long flags)
{
	bool test_only = flags & IOMMU_DIRTY_NO_CLEAR;
	bool dirty = false;
	int i, count;

	/*
	 * 2.2.3.2 Host Dirty Support
	 * When a non-default page size is used, software must OR the
	 * Dirty bits in all of the replicated host PTEs used to map
	 * the page. The IOMMU does not guarantee the Dirty bits are
	 * set in all of the replicated PTEs. Any portion of the page
	 * may have been written even if the Dirty bit is set in only
	 * one of the replicated PTEs.
	 */
	count = PAGE_SIZE_PTE_COUNT(size);
	for (i = 0; i < count && test_only; i++) {
		if (test_bit(IOMMU_PTE_HD_BIT, (unsigned long *)&ptep[i])) {
			dirty = true;
			break;
		}
	}

	for (i = 0; i < count && !test_only; i++) {
		if (test_and_clear_bit(IOMMU_PTE_HD_BIT,
				       (unsigned long *)&ptep[i])) {
			dirty = true;
		}
	}

	return dirty;
}
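
/*
 * pte_test_and_clear_dirty() runs two separate loops on purpose: the
 * IOMMU_DIRTY_NO_CLEAR (test-only) path may stop at the first replicated
 * PTE that has the host-dirty bit (IOMMU_PTE_HD_BIT) set, while the
 * clearing path has to visit every replicated PTE so that no dirty bit
 * survives the scan. For a 64 KiB mapping that means looking at up to
 * 16 entries.
 */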

static int iommu_v1_read_and_clear_dirty(struct io_pgtable_ops *ops,
					 unsigned long iova, size_t size,
					 unsigned long flags,
					 struct iommu_dirty_bitmap *dirty)
{
	struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
	unsigned long end = iova + size - 1;

	do {
		unsigned long pgsize = 0;
		u64 *ptep, pte;

		ptep = fetch_pte(pgtable, iova, &pgsize);
		if (ptep)
			pte = READ_ONCE(*ptep);
		if (!ptep || !IOMMU_PTE_PRESENT(pte)) {
			pgsize = pgsize ?: PTE_LEVEL_PAGE_SIZE(0);
			iova += pgsize;
			continue;
		}

		/*
		 * Mark the whole IOVA range as dirty even if only one of
		 * the replicated PTEs was marked dirty.
		 */
		if (pte_test_and_clear_dirty(ptep, pgsize, flags))
			iommu_dirty_bitmap_record(dirty, iova, pgsize);
		iova += pgsize;
	} while (iova < end);

	return 0;
}

/*
 * ----------------------------------------------------
 */
static void v1_free_pgtable(struct io_pgtable *iop)
{
	struct amd_io_pgtable *pgtable = container_of(iop, struct amd_io_pgtable, pgtbl);
	LIST_HEAD(freelist);

	if (pgtable->mode == PAGE_MODE_NONE)
		return;

	/* Page-table is not visible to IOMMU anymore, so free it */
	BUG_ON(pgtable->mode < PAGE_MODE_NONE ||
	       pgtable->mode > PAGE_MODE_6_LEVEL);

	free_sub_pt(pgtable->root, pgtable->mode, &freelist);
	iommu_put_pages_list(&freelist);
}

static struct io_pgtable *v1_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie)
{
	struct amd_io_pgtable *pgtable = io_pgtable_cfg_to_data(cfg);

	pgtable->root = iommu_alloc_page_node(cfg->amd.nid, GFP_KERNEL);
	if (!pgtable->root)
		return NULL;
	pgtable->mode = PAGE_MODE_3_LEVEL;

	cfg->pgsize_bitmap = amd_iommu_pgsize_bitmap;
	cfg->ias = IOMMU_IN_ADDR_BIT_SIZE;
	cfg->oas = IOMMU_OUT_ADDR_BIT_SIZE;

	pgtable->pgtbl.ops.map_pages = iommu_v1_map_pages;
	pgtable->pgtbl.ops.unmap_pages = iommu_v1_unmap_pages;
	pgtable->pgtbl.ops.iova_to_phys = iommu_v1_iova_to_phys;
	pgtable->pgtbl.ops.read_and_clear_dirty = iommu_v1_read_and_clear_dirty;

	return &pgtable->pgtbl;
}

struct io_pgtable_init_fns io_pgtable_amd_iommu_v1_init_fns = {
	.alloc = v1_alloc_pgtable,
	.free  = v1_free_pgtable,
};
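
/*
 * Rough usage sketch (for illustration only; the real wiring lives in the
 * AMD IOMMU driver and the generic io-pgtable code, and "pdom" below just
 * stands in for the owning struct protection_domain): a caller obtains the
 * ops implemented in this file with something like
 *
 *	struct io_pgtable_ops *ops;
 *
 *	ops = alloc_io_pgtable_ops(AMD_IOMMU_V1, &pdom->iop.pgtbl.cfg, pdom);
 *	if (!ops)
 *		return -ENOMEM;
 *
 * and then maps and unmaps through ops->map_pages() and ops->unmap_pages()
 * as defined above.
 */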