1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * CPU-agnostic AMD IO page table allocator. 4 * 5 * Copyright (C) 2020 Advanced Micro Devices, Inc. 6 * Author: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com> 7 */ 8 9 #define pr_fmt(fmt) "AMD-Vi: " fmt 10 #define dev_fmt(fmt) pr_fmt(fmt) 11 12 #include <linux/atomic.h> 13 #include <linux/bitops.h> 14 #include <linux/io-pgtable.h> 15 #include <linux/kernel.h> 16 #include <linux/sizes.h> 17 #include <linux/slab.h> 18 #include <linux/types.h> 19 #include <linux/dma-mapping.h> 20 21 #include <asm/barrier.h> 22 23 #include "amd_iommu_types.h" 24 #include "amd_iommu.h" 25 26 static void v1_tlb_flush_all(void *cookie) 27 { 28 } 29 30 static void v1_tlb_flush_walk(unsigned long iova, size_t size, 31 size_t granule, void *cookie) 32 { 33 } 34 35 static void v1_tlb_add_page(struct iommu_iotlb_gather *gather, 36 unsigned long iova, size_t granule, 37 void *cookie) 38 { 39 } 40 41 static const struct iommu_flush_ops v1_flush_ops = { 42 .tlb_flush_all = v1_tlb_flush_all, 43 .tlb_flush_walk = v1_tlb_flush_walk, 44 .tlb_add_page = v1_tlb_add_page, 45 }; 46 47 /* 48 * Helper function to get the first pte of a large mapping 49 */ 50 static u64 *first_pte_l7(u64 *pte, unsigned long *page_size, 51 unsigned long *count) 52 { 53 unsigned long pte_mask, pg_size, cnt; 54 u64 *fpte; 55 56 pg_size = PTE_PAGE_SIZE(*pte); 57 cnt = PAGE_SIZE_PTE_COUNT(pg_size); 58 pte_mask = ~((cnt << 3) - 1); 59 fpte = (u64 *)(((unsigned long)pte) & pte_mask); 60 61 if (page_size) 62 *page_size = pg_size; 63 64 if (count) 65 *count = cnt; 66 67 return fpte; 68 } 69 70 /**************************************************************************** 71 * 72 * The functions below are used the create the page table mappings for 73 * unity mapped regions. 74 * 75 ****************************************************************************/ 76 77 static void free_pt_page(u64 *pt, struct list_head *freelist) 78 { 79 struct page *p = virt_to_page(pt); 80 81 list_add_tail(&p->lru, freelist); 82 } 83 84 static void free_pt_lvl(u64 *pt, struct list_head *freelist, int lvl) 85 { 86 u64 *p; 87 int i; 88 89 for (i = 0; i < 512; ++i) { 90 /* PTE present? */ 91 if (!IOMMU_PTE_PRESENT(pt[i])) 92 continue; 93 94 /* Large PTE? */ 95 if (PM_PTE_LEVEL(pt[i]) == 0 || 96 PM_PTE_LEVEL(pt[i]) == 7) 97 continue; 98 99 /* 100 * Free the next level. No need to look at l1 tables here since 101 * they can only contain leaf PTEs; just free them directly. 102 */ 103 p = IOMMU_PTE_PAGE(pt[i]); 104 if (lvl > 2) 105 free_pt_lvl(p, freelist, lvl - 1); 106 else 107 free_pt_page(p, freelist); 108 } 109 110 free_pt_page(pt, freelist); 111 } 112 113 static void free_sub_pt(u64 *root, int mode, struct list_head *freelist) 114 { 115 switch (mode) { 116 case PAGE_MODE_NONE: 117 case PAGE_MODE_7_LEVEL: 118 break; 119 case PAGE_MODE_1_LEVEL: 120 free_pt_page(root, freelist); 121 break; 122 case PAGE_MODE_2_LEVEL: 123 case PAGE_MODE_3_LEVEL: 124 case PAGE_MODE_4_LEVEL: 125 case PAGE_MODE_5_LEVEL: 126 case PAGE_MODE_6_LEVEL: 127 free_pt_lvl(root, freelist, mode); 128 break; 129 default: 130 BUG(); 131 } 132 } 133 134 void amd_iommu_domain_set_pgtable(struct protection_domain *domain, 135 u64 *root, int mode) 136 { 137 u64 pt_root; 138 139 /* lowest 3 bits encode pgtable mode */ 140 pt_root = mode & 7; 141 pt_root |= (u64)root; 142 143 amd_iommu_domain_set_pt_root(domain, pt_root); 144 } 145 146 /* 147 * This function is used to add another level to an IO page table. Adding 148 * another level increases the size of the address space by 9 bits to a size up 149 * to 64 bits. 150 */ 151 static bool increase_address_space(struct protection_domain *domain, 152 unsigned long address, 153 gfp_t gfp) 154 { 155 unsigned long flags; 156 bool ret = true; 157 u64 *pte; 158 159 pte = alloc_pgtable_page(domain->nid, gfp); 160 if (!pte) 161 return false; 162 163 spin_lock_irqsave(&domain->lock, flags); 164 165 if (address <= PM_LEVEL_SIZE(domain->iop.mode)) 166 goto out; 167 168 ret = false; 169 if (WARN_ON_ONCE(domain->iop.mode == PAGE_MODE_6_LEVEL)) 170 goto out; 171 172 *pte = PM_LEVEL_PDE(domain->iop.mode, iommu_virt_to_phys(domain->iop.root)); 173 174 domain->iop.root = pte; 175 domain->iop.mode += 1; 176 amd_iommu_update_and_flush_device_table(domain); 177 amd_iommu_domain_flush_complete(domain); 178 179 /* 180 * Device Table needs to be updated and flushed before the new root can 181 * be published. 182 */ 183 amd_iommu_domain_set_pgtable(domain, pte, domain->iop.mode); 184 185 pte = NULL; 186 ret = true; 187 188 out: 189 spin_unlock_irqrestore(&domain->lock, flags); 190 free_page((unsigned long)pte); 191 192 return ret; 193 } 194 195 static u64 *alloc_pte(struct protection_domain *domain, 196 unsigned long address, 197 unsigned long page_size, 198 u64 **pte_page, 199 gfp_t gfp, 200 bool *updated) 201 { 202 int level, end_lvl; 203 u64 *pte, *page; 204 205 BUG_ON(!is_power_of_2(page_size)); 206 207 while (address > PM_LEVEL_SIZE(domain->iop.mode)) { 208 /* 209 * Return an error if there is no memory to update the 210 * page-table. 211 */ 212 if (!increase_address_space(domain, address, gfp)) 213 return NULL; 214 } 215 216 217 level = domain->iop.mode - 1; 218 pte = &domain->iop.root[PM_LEVEL_INDEX(level, address)]; 219 address = PAGE_SIZE_ALIGN(address, page_size); 220 end_lvl = PAGE_SIZE_LEVEL(page_size); 221 222 while (level > end_lvl) { 223 u64 __pte, __npte; 224 int pte_level; 225 226 __pte = *pte; 227 pte_level = PM_PTE_LEVEL(__pte); 228 229 /* 230 * If we replace a series of large PTEs, we need 231 * to tear down all of them. 232 */ 233 if (IOMMU_PTE_PRESENT(__pte) && 234 pte_level == PAGE_MODE_7_LEVEL) { 235 unsigned long count, i; 236 u64 *lpte; 237 238 lpte = first_pte_l7(pte, NULL, &count); 239 240 /* 241 * Unmap the replicated PTEs that still match the 242 * original large mapping 243 */ 244 for (i = 0; i < count; ++i) 245 cmpxchg64(&lpte[i], __pte, 0ULL); 246 247 *updated = true; 248 continue; 249 } 250 251 if (!IOMMU_PTE_PRESENT(__pte) || 252 pte_level == PAGE_MODE_NONE) { 253 page = alloc_pgtable_page(domain->nid, gfp); 254 255 if (!page) 256 return NULL; 257 258 __npte = PM_LEVEL_PDE(level, iommu_virt_to_phys(page)); 259 260 /* pte could have been changed somewhere. */ 261 if (!try_cmpxchg64(pte, &__pte, __npte)) 262 free_page((unsigned long)page); 263 else if (IOMMU_PTE_PRESENT(__pte)) 264 *updated = true; 265 266 continue; 267 } 268 269 /* No level skipping support yet */ 270 if (pte_level != level) 271 return NULL; 272 273 level -= 1; 274 275 pte = IOMMU_PTE_PAGE(__pte); 276 277 if (pte_page && level == end_lvl) 278 *pte_page = pte; 279 280 pte = &pte[PM_LEVEL_INDEX(level, address)]; 281 } 282 283 return pte; 284 } 285 286 /* 287 * This function checks if there is a PTE for a given dma address. If 288 * there is one, it returns the pointer to it. 289 */ 290 static u64 *fetch_pte(struct amd_io_pgtable *pgtable, 291 unsigned long address, 292 unsigned long *page_size) 293 { 294 int level; 295 u64 *pte; 296 297 *page_size = 0; 298 299 if (address > PM_LEVEL_SIZE(pgtable->mode)) 300 return NULL; 301 302 level = pgtable->mode - 1; 303 pte = &pgtable->root[PM_LEVEL_INDEX(level, address)]; 304 *page_size = PTE_LEVEL_PAGE_SIZE(level); 305 306 while (level > 0) { 307 308 /* Not Present */ 309 if (!IOMMU_PTE_PRESENT(*pte)) 310 return NULL; 311 312 /* Large PTE */ 313 if (PM_PTE_LEVEL(*pte) == PAGE_MODE_7_LEVEL || 314 PM_PTE_LEVEL(*pte) == PAGE_MODE_NONE) 315 break; 316 317 /* No level skipping support yet */ 318 if (PM_PTE_LEVEL(*pte) != level) 319 return NULL; 320 321 level -= 1; 322 323 /* Walk to the next level */ 324 pte = IOMMU_PTE_PAGE(*pte); 325 pte = &pte[PM_LEVEL_INDEX(level, address)]; 326 *page_size = PTE_LEVEL_PAGE_SIZE(level); 327 } 328 329 /* 330 * If we have a series of large PTEs, make 331 * sure to return a pointer to the first one. 332 */ 333 if (PM_PTE_LEVEL(*pte) == PAGE_MODE_7_LEVEL) 334 pte = first_pte_l7(pte, page_size, NULL); 335 336 return pte; 337 } 338 339 static void free_clear_pte(u64 *pte, u64 pteval, struct list_head *freelist) 340 { 341 u64 *pt; 342 int mode; 343 344 while (!try_cmpxchg64(pte, &pteval, 0)) 345 pr_warn("AMD-Vi: IOMMU pte changed since we read it\n"); 346 347 if (!IOMMU_PTE_PRESENT(pteval)) 348 return; 349 350 pt = IOMMU_PTE_PAGE(pteval); 351 mode = IOMMU_PTE_MODE(pteval); 352 353 free_sub_pt(pt, mode, freelist); 354 } 355 356 /* 357 * Generic mapping functions. It maps a physical address into a DMA 358 * address space. It allocates the page table pages if necessary. 359 * In the future it can be extended to a generic mapping function 360 * supporting all features of AMD IOMMU page tables like level skipping 361 * and full 64 bit address spaces. 362 */ 363 static int iommu_v1_map_pages(struct io_pgtable_ops *ops, unsigned long iova, 364 phys_addr_t paddr, size_t pgsize, size_t pgcount, 365 int prot, gfp_t gfp, size_t *mapped) 366 { 367 struct protection_domain *dom = io_pgtable_ops_to_domain(ops); 368 LIST_HEAD(freelist); 369 bool updated = false; 370 u64 __pte, *pte; 371 int ret, i, count; 372 373 BUG_ON(!IS_ALIGNED(iova, pgsize)); 374 BUG_ON(!IS_ALIGNED(paddr, pgsize)); 375 376 ret = -EINVAL; 377 if (!(prot & IOMMU_PROT_MASK)) 378 goto out; 379 380 while (pgcount > 0) { 381 count = PAGE_SIZE_PTE_COUNT(pgsize); 382 pte = alloc_pte(dom, iova, pgsize, NULL, gfp, &updated); 383 384 ret = -ENOMEM; 385 if (!pte) 386 goto out; 387 388 for (i = 0; i < count; ++i) 389 free_clear_pte(&pte[i], pte[i], &freelist); 390 391 if (!list_empty(&freelist)) 392 updated = true; 393 394 if (count > 1) { 395 __pte = PAGE_SIZE_PTE(__sme_set(paddr), pgsize); 396 __pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_PR | IOMMU_PTE_FC; 397 } else 398 __pte = __sme_set(paddr) | IOMMU_PTE_PR | IOMMU_PTE_FC; 399 400 if (prot & IOMMU_PROT_IR) 401 __pte |= IOMMU_PTE_IR; 402 if (prot & IOMMU_PROT_IW) 403 __pte |= IOMMU_PTE_IW; 404 405 for (i = 0; i < count; ++i) 406 pte[i] = __pte; 407 408 iova += pgsize; 409 paddr += pgsize; 410 pgcount--; 411 if (mapped) 412 *mapped += pgsize; 413 } 414 415 ret = 0; 416 417 out: 418 if (updated) { 419 unsigned long flags; 420 421 spin_lock_irqsave(&dom->lock, flags); 422 /* 423 * Flush domain TLB(s) and wait for completion. Any Device-Table 424 * Updates and flushing already happened in 425 * increase_address_space(). 426 */ 427 amd_iommu_domain_flush_tlb_pde(dom); 428 amd_iommu_domain_flush_complete(dom); 429 spin_unlock_irqrestore(&dom->lock, flags); 430 } 431 432 /* Everything flushed out, free pages now */ 433 put_pages_list(&freelist); 434 435 return ret; 436 } 437 438 static unsigned long iommu_v1_unmap_pages(struct io_pgtable_ops *ops, 439 unsigned long iova, 440 size_t pgsize, size_t pgcount, 441 struct iommu_iotlb_gather *gather) 442 { 443 struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops); 444 unsigned long long unmapped; 445 unsigned long unmap_size; 446 u64 *pte; 447 size_t size = pgcount << __ffs(pgsize); 448 449 BUG_ON(!is_power_of_2(pgsize)); 450 451 unmapped = 0; 452 453 while (unmapped < size) { 454 pte = fetch_pte(pgtable, iova, &unmap_size); 455 if (pte) { 456 int i, count; 457 458 count = PAGE_SIZE_PTE_COUNT(unmap_size); 459 for (i = 0; i < count; i++) 460 pte[i] = 0ULL; 461 } else { 462 return unmapped; 463 } 464 465 iova = (iova & ~(unmap_size - 1)) + unmap_size; 466 unmapped += unmap_size; 467 } 468 469 return unmapped; 470 } 471 472 static phys_addr_t iommu_v1_iova_to_phys(struct io_pgtable_ops *ops, unsigned long iova) 473 { 474 struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops); 475 unsigned long offset_mask, pte_pgsize; 476 u64 *pte, __pte; 477 478 pte = fetch_pte(pgtable, iova, &pte_pgsize); 479 480 if (!pte || !IOMMU_PTE_PRESENT(*pte)) 481 return 0; 482 483 offset_mask = pte_pgsize - 1; 484 __pte = __sme_clr(*pte & PM_ADDR_MASK); 485 486 return (__pte & ~offset_mask) | (iova & offset_mask); 487 } 488 489 /* 490 * ---------------------------------------------------- 491 */ 492 static void v1_free_pgtable(struct io_pgtable *iop) 493 { 494 struct amd_io_pgtable *pgtable = container_of(iop, struct amd_io_pgtable, iop); 495 struct protection_domain *dom; 496 LIST_HEAD(freelist); 497 498 if (pgtable->mode == PAGE_MODE_NONE) 499 return; 500 501 dom = container_of(pgtable, struct protection_domain, iop); 502 503 /* Page-table is not visible to IOMMU anymore, so free it */ 504 BUG_ON(pgtable->mode < PAGE_MODE_NONE || 505 pgtable->mode > PAGE_MODE_6_LEVEL); 506 507 free_sub_pt(pgtable->root, pgtable->mode, &freelist); 508 509 /* Update data structure */ 510 amd_iommu_domain_clr_pt_root(dom); 511 512 /* Make changes visible to IOMMUs */ 513 amd_iommu_domain_update(dom); 514 515 put_pages_list(&freelist); 516 } 517 518 static struct io_pgtable *v1_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie) 519 { 520 struct amd_io_pgtable *pgtable = io_pgtable_cfg_to_data(cfg); 521 522 cfg->pgsize_bitmap = AMD_IOMMU_PGSIZES, 523 cfg->ias = IOMMU_IN_ADDR_BIT_SIZE, 524 cfg->oas = IOMMU_OUT_ADDR_BIT_SIZE, 525 cfg->tlb = &v1_flush_ops; 526 527 pgtable->iop.ops.map_pages = iommu_v1_map_pages; 528 pgtable->iop.ops.unmap_pages = iommu_v1_unmap_pages; 529 pgtable->iop.ops.iova_to_phys = iommu_v1_iova_to_phys; 530 531 return &pgtable->iop; 532 } 533 534 struct io_pgtable_init_fns io_pgtable_amd_iommu_v1_init_fns = { 535 .alloc = v1_alloc_pgtable, 536 .free = v1_free_pgtable, 537 }; 538