// SPDX-License-Identifier: GPL-2.0-only
/*
 * CPU-agnostic AMD IO page table allocator.
 *
 * Copyright (C) 2020 Advanced Micro Devices, Inc.
 * Author: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
 */

#define pr_fmt(fmt) "AMD-Vi: " fmt
#define dev_fmt(fmt) pr_fmt(fmt)

#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/io-pgtable.h>
#include <linux/kernel.h>
#include <linux/sizes.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/dma-mapping.h>

#include <asm/barrier.h>

#include "amd_iommu_types.h"
#include "amd_iommu.h"
#include "../iommu-pages.h"

/*
 * Helper function to get the first pte of a large mapping
 */
static u64 *first_pte_l7(u64 *pte, unsigned long *page_size,
                         unsigned long *count)
{
        unsigned long pte_mask, pg_size, cnt;
        u64 *fpte;

        pg_size = PTE_PAGE_SIZE(*pte);
        cnt = PAGE_SIZE_PTE_COUNT(pg_size);
        pte_mask = ~((cnt << 3) - 1);
        fpte = (u64 *)(((unsigned long)pte) & pte_mask);

        if (page_size)
                *page_size = pg_size;

        if (count)
                *count = cnt;

        return fpte;
}

static void free_pt_lvl(u64 *pt, struct iommu_pages_list *freelist, int lvl)
{
        u64 *p;
        int i;

        for (i = 0; i < 512; ++i) {
                /* PTE present? */
                if (!IOMMU_PTE_PRESENT(pt[i]))
                        continue;

                /* Large PTE? */
                if (PM_PTE_LEVEL(pt[i]) == 0 ||
                    PM_PTE_LEVEL(pt[i]) == 7)
                        continue;

                /*
                 * Free the next level. No need to look at l1 tables here since
                 * they can only contain leaf PTEs; just free them directly.
                 */
                p = IOMMU_PTE_PAGE(pt[i]);
                if (lvl > 2)
                        free_pt_lvl(p, freelist, lvl - 1);
                else
                        iommu_pages_list_add(freelist, p);
        }

        iommu_pages_list_add(freelist, pt);
}

static void free_sub_pt(u64 *root, int mode, struct iommu_pages_list *freelist)
{
        switch (mode) {
        case PAGE_MODE_NONE:
        case PAGE_MODE_7_LEVEL:
                break;
        case PAGE_MODE_1_LEVEL:
                iommu_pages_list_add(freelist, root);
                break;
        case PAGE_MODE_2_LEVEL:
        case PAGE_MODE_3_LEVEL:
        case PAGE_MODE_4_LEVEL:
        case PAGE_MODE_5_LEVEL:
        case PAGE_MODE_6_LEVEL:
                free_pt_lvl(root, freelist, mode);
                break;
        default:
                BUG();
        }
}

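/*
 * Worked example for the helpers above (a sketch, assuming the
 * PAGE_SIZE_PTE_COUNT() and PTE_PAGE_SIZE() definitions in
 * amd_iommu_types.h): a 32K mapping is built from 8 replicated 4K-level
 * PTEs, so first_pte_l7() computes cnt == 8 and pte_mask == ~0x3fUL,
 * which rounds the incoming PTE pointer down to the 64-byte-aligned
 * start of that group of eight 8-byte entries.
 */
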
/*
 * This function is used to add another level to an IO page table. Adding
 * another level increases the size of the address space by 9 bits to a size up
 * to 64 bits.
 */
static bool increase_address_space(struct amd_io_pgtable *pgtable,
                                   unsigned long address,
                                   unsigned int page_size_level,
                                   gfp_t gfp)
{
        struct io_pgtable_cfg *cfg = &pgtable->pgtbl.cfg;
        struct protection_domain *domain =
                container_of(pgtable, struct protection_domain, iop);
        unsigned long flags;
        bool ret = true;
        u64 *pte;

        pte = iommu_alloc_pages_node_sz(cfg->amd.nid, gfp, SZ_4K);
        if (!pte)
                return false;

        spin_lock_irqsave(&domain->lock, flags);

        if (address <= PM_LEVEL_SIZE(pgtable->mode) &&
            pgtable->mode - 1 >= page_size_level)
                goto out;

        ret = false;
        if (WARN_ON_ONCE(pgtable->mode == PAGE_MODE_6_LEVEL))
                goto out;

        *pte = PM_LEVEL_PDE(pgtable->mode, iommu_virt_to_phys(pgtable->root));

        pgtable->root = pte;
        pgtable->mode += 1;
        amd_iommu_update_and_flush_device_table(domain);

        pte = NULL;
        ret = true;

out:
        spin_unlock_irqrestore(&domain->lock, flags);
        iommu_free_pages(pte);

        return ret;
}

static u64 *alloc_pte(struct amd_io_pgtable *pgtable,
                      unsigned long address,
                      unsigned long page_size,
                      u64 **pte_page,
                      gfp_t gfp,
                      bool *updated)
{
        unsigned long last_addr = address + (page_size - 1);
        struct io_pgtable_cfg *cfg = &pgtable->pgtbl.cfg;
        int level, end_lvl;
        u64 *pte, *page;

        BUG_ON(!is_power_of_2(page_size));

        while (last_addr > PM_LEVEL_SIZE(pgtable->mode) ||
               pgtable->mode - 1 < PAGE_SIZE_LEVEL(page_size)) {
                /*
                 * Return an error if there is no memory to update the
                 * page-table.
                 */
                if (!increase_address_space(pgtable, last_addr,
                                            PAGE_SIZE_LEVEL(page_size), gfp))
                        return NULL;
        }

        level = pgtable->mode - 1;
        pte = &pgtable->root[PM_LEVEL_INDEX(level, address)];
        address = PAGE_SIZE_ALIGN(address, page_size);
        end_lvl = PAGE_SIZE_LEVEL(page_size);

        while (level > end_lvl) {
                u64 __pte, __npte;
                int pte_level;

                __pte = *pte;
                pte_level = PM_PTE_LEVEL(__pte);

                /*
                 * If we replace a series of large PTEs, we need
                 * to tear down all of them.
                 */
                if (IOMMU_PTE_PRESENT(__pte) &&
                    pte_level == PAGE_MODE_7_LEVEL) {
                        unsigned long count, i;
                        u64 *lpte;

                        lpte = first_pte_l7(pte, NULL, &count);

                        /*
                         * Unmap the replicated PTEs that still match the
                         * original large mapping.
                         */
                        for (i = 0; i < count; ++i)
                                cmpxchg64(&lpte[i], __pte, 0ULL);

                        *updated = true;
                        continue;
                }

                if (!IOMMU_PTE_PRESENT(__pte) ||
                    pte_level == PAGE_MODE_NONE) {
                        page = iommu_alloc_pages_node_sz(cfg->amd.nid, gfp,
                                                         SZ_4K);

                        if (!page)
                                return NULL;

                        __npte = PM_LEVEL_PDE(level, iommu_virt_to_phys(page));

                        /* pte could have been changed somewhere. */
                        if (!try_cmpxchg64(pte, &__pte, __npte))
                                iommu_free_pages(page);
                        else if (IOMMU_PTE_PRESENT(__pte))
                                *updated = true;

                        continue;
                }

                /* No level skipping support yet */
                if (pte_level != level)
                        return NULL;

                level -= 1;

                pte = IOMMU_PTE_PAGE(__pte);

                if (pte_page && level == end_lvl)
                        *pte_page = pte;

                pte = &pte[PM_LEVEL_INDEX(level, address)];
        }

        return pte;
}

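/*
 * Rough worked example of the level arithmetic used above (assuming the
 * PM_LEVEL_SIZE() and PAGE_SIZE_LEVEL() definitions in amd_iommu_types.h):
 * PAGE_MODE_3_LEVEL covers IOVAs up to PM_LEVEL_SIZE(3), i.e. a 39-bit
 * address space, and each increase_address_space() call adds 9 bits.
 * PAGE_SIZE_LEVEL(SZ_2M) == 1, so a 2M mapping stops the walk at level 1
 * and the caller writes a single leaf PTE there, while
 * PAGE_SIZE_LEVEL(SZ_32K) == 0 and the caller writes eight replicated
 * level-0 PTEs instead.
 */
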
/*
 * This function checks if there is a PTE for a given DMA address. If
 * there is one, it returns the pointer to it.
 */
static u64 *fetch_pte(struct amd_io_pgtable *pgtable,
                      unsigned long address,
                      unsigned long *page_size)
{
        int level;
        u64 *pte;

        *page_size = 0;

        if (address > PM_LEVEL_SIZE(pgtable->mode))
                return NULL;

        level = pgtable->mode - 1;
        pte = &pgtable->root[PM_LEVEL_INDEX(level, address)];
        *page_size = PTE_LEVEL_PAGE_SIZE(level);

        while (level > 0) {

                /* Not Present */
                if (!IOMMU_PTE_PRESENT(*pte))
                        return NULL;

                /* Large PTE */
                if (PM_PTE_LEVEL(*pte) == PAGE_MODE_7_LEVEL ||
                    PM_PTE_LEVEL(*pte) == PAGE_MODE_NONE)
                        break;

                /* No level skipping support yet */
                if (PM_PTE_LEVEL(*pte) != level)
                        return NULL;

                level -= 1;

                /* Walk to the next level */
                pte = IOMMU_PTE_PAGE(*pte);
                pte = &pte[PM_LEVEL_INDEX(level, address)];
                *page_size = PTE_LEVEL_PAGE_SIZE(level);
        }

        /*
         * If we have a series of large PTEs, make
         * sure to return a pointer to the first one.
         */
        if (PM_PTE_LEVEL(*pte) == PAGE_MODE_7_LEVEL)
                pte = first_pte_l7(pte, page_size, NULL);

        return pte;
}

static void free_clear_pte(u64 *pte, u64 pteval,
                           struct iommu_pages_list *freelist)
{
        u64 *pt;
        int mode;

        while (!try_cmpxchg64(pte, &pteval, 0))
                pr_warn("AMD-Vi: IOMMU pte changed since we read it\n");

        if (!IOMMU_PTE_PRESENT(pteval))
                return;

        pt = IOMMU_PTE_PAGE(pteval);
        mode = IOMMU_PTE_MODE(pteval);

        free_sub_pt(pt, mode, freelist);
}

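/*
 * How the map path below encodes page sizes (a sketch, assuming the
 * PAGE_SIZE_PTE()/PM_LEVEL_ENC() macros in amd_iommu_types.h): for sizes
 * that land exactly on a level boundary (4K, 2M, 1G, ...)
 * PAGE_SIZE_PTE_COUNT() is 1 and a plain leaf PTE is written at that
 * level. For any other power-of-two size the PTE is replicated count
 * times with next-level 7, and the size is encoded in the low bits of the
 * address field so that PTE_PAGE_SIZE()/first_pte_l7() can recover it
 * when the mapping is later walked or torn down.
 */
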
/*
 * Generic mapping function. It maps a physical address into a DMA
 * address space. It allocates the page table pages if necessary.
 * In the future it can be extended to a generic mapping function
 * supporting all features of AMD IOMMU page tables like level skipping
 * and full 64-bit address spaces.
 */
static int iommu_v1_map_pages(struct io_pgtable_ops *ops, unsigned long iova,
                              phys_addr_t paddr, size_t pgsize, size_t pgcount,
                              int prot, gfp_t gfp, size_t *mapped)
{
        struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
        struct iommu_pages_list freelist = IOMMU_PAGES_LIST_INIT(freelist);
        bool updated = false;
        u64 __pte, *pte;
        int ret, i, count;
        size_t size = pgcount << __ffs(pgsize);
        unsigned long o_iova = iova;

        BUG_ON(!IS_ALIGNED(iova, pgsize));
        BUG_ON(!IS_ALIGNED(paddr, pgsize));

        ret = -EINVAL;
        if (!(prot & IOMMU_PROT_MASK))
                goto out;

        while (pgcount > 0) {
                count = PAGE_SIZE_PTE_COUNT(pgsize);
                pte = alloc_pte(pgtable, iova, pgsize, NULL, gfp, &updated);

                ret = -ENOMEM;
                if (!pte)
                        goto out;

                for (i = 0; i < count; ++i)
                        free_clear_pte(&pte[i], pte[i], &freelist);

                if (!iommu_pages_list_empty(&freelist))
                        updated = true;

                if (count > 1) {
                        __pte = PAGE_SIZE_PTE(__sme_set(paddr), pgsize);
                        __pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_PR | IOMMU_PTE_FC;
                } else
                        __pte = __sme_set(paddr) | IOMMU_PTE_PR | IOMMU_PTE_FC;

                if (prot & IOMMU_PROT_IR)
                        __pte |= IOMMU_PTE_IR;
                if (prot & IOMMU_PROT_IW)
                        __pte |= IOMMU_PTE_IW;

                for (i = 0; i < count; ++i)
                        pte[i] = __pte;

                iova += pgsize;
                paddr += pgsize;
                pgcount--;
                if (mapped)
                        *mapped += pgsize;
        }

        ret = 0;

out:
        if (updated) {
                struct protection_domain *dom = io_pgtable_ops_to_domain(ops);
                unsigned long flags;

                spin_lock_irqsave(&dom->lock, flags);
                /*
                 * Flush domain TLB(s) and wait for completion. Any Device-Table
                 * Updates and flushing already happened in
                 * increase_address_space().
                 */
                amd_iommu_domain_flush_pages(dom, o_iova, size);
                spin_unlock_irqrestore(&dom->lock, flags);
        }

        /* Everything flushed out, free pages now */
        iommu_put_pages_list(&freelist);

        return ret;
}

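/*
 * Note on callers (based on the generic IOMMU core behaviour rather than
 * anything in this file): the core is expected to split map/unmap requests
 * into (pgsize, pgcount) chunks permitted by cfg->pgsize_bitmap before
 * invoking ->map_pages()/->unmap_pages(), so pgsize here should always be
 * a power-of-two size from amd_iommu_pgsize_bitmap, which is what the
 * alignment BUG_ON()s above rely on.
 */
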
static unsigned long iommu_v1_unmap_pages(struct io_pgtable_ops *ops,
                                          unsigned long iova,
                                          size_t pgsize, size_t pgcount,
                                          struct iommu_iotlb_gather *gather)
{
        struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
        unsigned long long unmapped;
        unsigned long unmap_size;
        u64 *pte;
        size_t size = pgcount << __ffs(pgsize);

        BUG_ON(!is_power_of_2(pgsize));

        unmapped = 0;

        while (unmapped < size) {
                pte = fetch_pte(pgtable, iova, &unmap_size);
                if (pte) {
                        int i, count;

                        count = PAGE_SIZE_PTE_COUNT(unmap_size);
                        for (i = 0; i < count; i++)
                                pte[i] = 0ULL;
                } else {
                        return unmapped;
                }

                iova = (iova & ~(unmap_size - 1)) + unmap_size;
                unmapped += unmap_size;
        }

        return unmapped;
}

static phys_addr_t iommu_v1_iova_to_phys(struct io_pgtable_ops *ops, unsigned long iova)
{
        struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
        unsigned long offset_mask, pte_pgsize;
        u64 *pte, __pte;

        pte = fetch_pte(pgtable, iova, &pte_pgsize);

        if (!pte || !IOMMU_PTE_PRESENT(*pte))
                return 0;

        offset_mask = pte_pgsize - 1;
        __pte = __sme_clr(*pte & PM_ADDR_MASK);

        return (__pte & ~offset_mask) | (iova & offset_mask);
}

static bool pte_test_and_clear_dirty(u64 *ptep, unsigned long size,
                                     unsigned long flags)
{
        bool test_only = flags & IOMMU_DIRTY_NO_CLEAR;
        bool dirty = false;
        int i, count;

        /*
         * 2.2.3.2 Host Dirty Support
         * When a non-default page size is used, software must OR the
         * Dirty bits in all of the replicated host PTEs used to map
         * the page. The IOMMU does not guarantee the Dirty bits are
         * set in all of the replicated PTEs. Any portion of the page
         * may have been written even if the Dirty bit is set in only
         * one of the replicated PTEs.
         */
        count = PAGE_SIZE_PTE_COUNT(size);
        for (i = 0; i < count && test_only; i++) {
                if (test_bit(IOMMU_PTE_HD_BIT, (unsigned long *)&ptep[i])) {
                        dirty = true;
                        break;
                }
        }

        for (i = 0; i < count && !test_only; i++) {
                if (test_and_clear_bit(IOMMU_PTE_HD_BIT,
                                       (unsigned long *)&ptep[i])) {
                        dirty = true;
                }
        }

        return dirty;
}

static int iommu_v1_read_and_clear_dirty(struct io_pgtable_ops *ops,
                                         unsigned long iova, size_t size,
                                         unsigned long flags,
                                         struct iommu_dirty_bitmap *dirty)
{
        struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
        unsigned long end = iova + size - 1;

        do {
                unsigned long pgsize = 0;
                u64 *ptep, pte;

                ptep = fetch_pte(pgtable, iova, &pgsize);
                if (ptep)
                        pte = READ_ONCE(*ptep);
                if (!ptep || !IOMMU_PTE_PRESENT(pte)) {
                        pgsize = pgsize ?: PTE_LEVEL_PAGE_SIZE(0);
                        iova += pgsize;
                        continue;
                }

                /*
                 * Mark the whole IOVA range as dirty even if only one of
                 * the replicated PTEs was marked dirty.
                 */
                if (pte_test_and_clear_dirty(ptep, pgsize, flags))
                        iommu_dirty_bitmap_record(dirty, iova, pgsize);
                iova += pgsize;
        } while (iova < end);

        return 0;
}

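/*
 * Example of the dirty-tracking rule above (assuming PAGE_SIZE_PTE_COUNT()
 * from amd_iommu_types.h): a 16K mapping is backed by four replicated
 * PTEs; if any one of them has IOMMU_PTE_HD_BIT set, the whole 16K IOVA
 * range is recorded as dirty, and with IOMMU_DIRTY_NO_CLEAR the bits are
 * only tested, leaving them set for a later clearing pass.
 */
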
/*
 * ----------------------------------------------------
 */
static void v1_free_pgtable(struct io_pgtable *iop)
{
        struct amd_io_pgtable *pgtable = container_of(iop, struct amd_io_pgtable, pgtbl);
        struct iommu_pages_list freelist = IOMMU_PAGES_LIST_INIT(freelist);

        if (pgtable->mode == PAGE_MODE_NONE)
                return;

        /* Page-table is not visible to IOMMU anymore, so free it */
        BUG_ON(pgtable->mode < PAGE_MODE_NONE ||
               pgtable->mode > PAGE_MODE_6_LEVEL);

        free_sub_pt(pgtable->root, pgtable->mode, &freelist);
        iommu_put_pages_list(&freelist);
}

static struct io_pgtable *v1_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie)
{
        struct amd_io_pgtable *pgtable = io_pgtable_cfg_to_data(cfg);

        pgtable->root =
                iommu_alloc_pages_node_sz(cfg->amd.nid, GFP_KERNEL, SZ_4K);
        if (!pgtable->root)
                return NULL;
        pgtable->mode = PAGE_MODE_3_LEVEL;

        cfg->pgsize_bitmap = amd_iommu_pgsize_bitmap;
        cfg->ias = IOMMU_IN_ADDR_BIT_SIZE;
        cfg->oas = IOMMU_OUT_ADDR_BIT_SIZE;

        pgtable->pgtbl.ops.map_pages = iommu_v1_map_pages;
        pgtable->pgtbl.ops.unmap_pages = iommu_v1_unmap_pages;
        pgtable->pgtbl.ops.iova_to_phys = iommu_v1_iova_to_phys;
        pgtable->pgtbl.ops.read_and_clear_dirty = iommu_v1_read_and_clear_dirty;

        return &pgtable->pgtbl;
}

struct io_pgtable_init_fns io_pgtable_amd_iommu_v1_init_fns = {
        .alloc = v1_alloc_pgtable,
        .free  = v1_free_pgtable,
};
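
/*
 * Usage sketch (not part of this file): the generic io-pgtable code keys
 * this init_fns table off AMD_IOMMU_V1, so a caller would typically
 * instantiate the table with something like
 *
 *      ops = alloc_io_pgtable_ops(AMD_IOMMU_V1, &cfg, cookie);
 *
 * which reaches v1_alloc_pgtable() above and returns the
 * map/unmap/iova_to_phys/read_and_clear_dirty ops installed there, while
 * free_io_pgtable_ops() tears everything down via v1_free_pgtable().
 */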