xref: /linux/drivers/iommu/amd/io_pgtable.c (revision 90d32e92011eaae8e70a9169b4e7acf4ca8f9d3a)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * CPU-agnostic AMD IO page table allocator.
4  *
5  * Copyright (C) 2020 Advanced Micro Devices, Inc.
6  * Author: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
7  */
8 
9 #define pr_fmt(fmt)     "AMD-Vi: " fmt
10 #define dev_fmt(fmt)    pr_fmt(fmt)
11 
12 #include <linux/atomic.h>
13 #include <linux/bitops.h>
14 #include <linux/io-pgtable.h>
15 #include <linux/kernel.h>
16 #include <linux/sizes.h>
17 #include <linux/slab.h>
18 #include <linux/types.h>
19 #include <linux/dma-mapping.h>
20 
21 #include <asm/barrier.h>
22 
23 #include "amd_iommu_types.h"
24 #include "amd_iommu.h"
25 #include "../iommu-pages.h"
26 
27 static void v1_tlb_flush_all(void *cookie)
28 {
29 }
30 
31 static void v1_tlb_flush_walk(unsigned long iova, size_t size,
32 				  size_t granule, void *cookie)
33 {
34 }
35 
36 static void v1_tlb_add_page(struct iommu_iotlb_gather *gather,
37 					 unsigned long iova, size_t granule,
38 					 void *cookie)
39 {
40 }
41 
42 static const struct iommu_flush_ops v1_flush_ops = {
43 	.tlb_flush_all	= v1_tlb_flush_all,
44 	.tlb_flush_walk = v1_tlb_flush_walk,
45 	.tlb_add_page	= v1_tlb_add_page,
46 };
47 
48 /*
49  * Helper function to get the first pte of a large mapping
50  */
51 static u64 *first_pte_l7(u64 *pte, unsigned long *page_size,
52 			 unsigned long *count)
53 {
54 	unsigned long pte_mask, pg_size, cnt;
55 	u64 *fpte;
56 
57 	pg_size  = PTE_PAGE_SIZE(*pte);
58 	cnt      = PAGE_SIZE_PTE_COUNT(pg_size);
59 	pte_mask = ~((cnt << 3) - 1);
60 	fpte     = (u64 *)(((unsigned long)pte) & pte_mask);
61 
62 	if (page_size)
63 		*page_size = pg_size;
64 
65 	if (count)
66 		*count = cnt;
67 
68 	return fpte;
69 }
70 
71 /****************************************************************************
72  *
73  * The functions below are used the create the page table mappings for
74  * unity mapped regions.
75  *
76  ****************************************************************************/
77 
78 static void free_pt_page(u64 *pt, struct list_head *freelist)
79 {
80 	struct page *p = virt_to_page(pt);
81 
82 	list_add_tail(&p->lru, freelist);
83 }
84 
85 static void free_pt_lvl(u64 *pt, struct list_head *freelist, int lvl)
86 {
87 	u64 *p;
88 	int i;
89 
90 	for (i = 0; i < 512; ++i) {
91 		/* PTE present? */
92 		if (!IOMMU_PTE_PRESENT(pt[i]))
93 			continue;
94 
95 		/* Large PTE? */
96 		if (PM_PTE_LEVEL(pt[i]) == 0 ||
97 		    PM_PTE_LEVEL(pt[i]) == 7)
98 			continue;
99 
100 		/*
101 		 * Free the next level. No need to look at l1 tables here since
102 		 * they can only contain leaf PTEs; just free them directly.
103 		 */
104 		p = IOMMU_PTE_PAGE(pt[i]);
105 		if (lvl > 2)
106 			free_pt_lvl(p, freelist, lvl - 1);
107 		else
108 			free_pt_page(p, freelist);
109 	}
110 
111 	free_pt_page(pt, freelist);
112 }
113 
114 static void free_sub_pt(u64 *root, int mode, struct list_head *freelist)
115 {
116 	switch (mode) {
117 	case PAGE_MODE_NONE:
118 	case PAGE_MODE_7_LEVEL:
119 		break;
120 	case PAGE_MODE_1_LEVEL:
121 		free_pt_page(root, freelist);
122 		break;
123 	case PAGE_MODE_2_LEVEL:
124 	case PAGE_MODE_3_LEVEL:
125 	case PAGE_MODE_4_LEVEL:
126 	case PAGE_MODE_5_LEVEL:
127 	case PAGE_MODE_6_LEVEL:
128 		free_pt_lvl(root, freelist, mode);
129 		break;
130 	default:
131 		BUG();
132 	}
133 }
134 
135 void amd_iommu_domain_set_pgtable(struct protection_domain *domain,
136 				  u64 *root, int mode)
137 {
138 	u64 pt_root;
139 
140 	/* lowest 3 bits encode pgtable mode */
141 	pt_root = mode & 7;
142 	pt_root |= (u64)root;
143 
144 	amd_iommu_domain_set_pt_root(domain, pt_root);
145 }
146 
147 /*
148  * This function is used to add another level to an IO page table. Adding
149  * another level increases the size of the address space by 9 bits to a size up
150  * to 64 bits.
151  */
152 static bool increase_address_space(struct protection_domain *domain,
153 				   unsigned long address,
154 				   gfp_t gfp)
155 {
156 	unsigned long flags;
157 	bool ret = true;
158 	u64 *pte;
159 
160 	pte = iommu_alloc_page_node(domain->nid, gfp);
161 	if (!pte)
162 		return false;
163 
164 	spin_lock_irqsave(&domain->lock, flags);
165 
166 	if (address <= PM_LEVEL_SIZE(domain->iop.mode))
167 		goto out;
168 
169 	ret = false;
170 	if (WARN_ON_ONCE(domain->iop.mode == PAGE_MODE_6_LEVEL))
171 		goto out;
172 
173 	*pte = PM_LEVEL_PDE(domain->iop.mode, iommu_virt_to_phys(domain->iop.root));
174 
175 	domain->iop.root  = pte;
176 	domain->iop.mode += 1;
177 	amd_iommu_update_and_flush_device_table(domain);
178 	amd_iommu_domain_flush_complete(domain);
179 
180 	/*
181 	 * Device Table needs to be updated and flushed before the new root can
182 	 * be published.
183 	 */
184 	amd_iommu_domain_set_pgtable(domain, pte, domain->iop.mode);
185 
186 	pte = NULL;
187 	ret = true;
188 
189 out:
190 	spin_unlock_irqrestore(&domain->lock, flags);
191 	iommu_free_page(pte);
192 
193 	return ret;
194 }
195 
196 static u64 *alloc_pte(struct protection_domain *domain,
197 		      unsigned long address,
198 		      unsigned long page_size,
199 		      u64 **pte_page,
200 		      gfp_t gfp,
201 		      bool *updated)
202 {
203 	int level, end_lvl;
204 	u64 *pte, *page;
205 
206 	BUG_ON(!is_power_of_2(page_size));
207 
208 	while (address > PM_LEVEL_SIZE(domain->iop.mode)) {
209 		/*
210 		 * Return an error if there is no memory to update the
211 		 * page-table.
212 		 */
213 		if (!increase_address_space(domain, address, gfp))
214 			return NULL;
215 	}
216 
217 
218 	level   = domain->iop.mode - 1;
219 	pte     = &domain->iop.root[PM_LEVEL_INDEX(level, address)];
220 	address = PAGE_SIZE_ALIGN(address, page_size);
221 	end_lvl = PAGE_SIZE_LEVEL(page_size);
222 
223 	while (level > end_lvl) {
224 		u64 __pte, __npte;
225 		int pte_level;
226 
227 		__pte     = *pte;
228 		pte_level = PM_PTE_LEVEL(__pte);
229 
230 		/*
231 		 * If we replace a series of large PTEs, we need
232 		 * to tear down all of them.
233 		 */
234 		if (IOMMU_PTE_PRESENT(__pte) &&
235 		    pte_level == PAGE_MODE_7_LEVEL) {
236 			unsigned long count, i;
237 			u64 *lpte;
238 
239 			lpte = first_pte_l7(pte, NULL, &count);
240 
241 			/*
242 			 * Unmap the replicated PTEs that still match the
243 			 * original large mapping
244 			 */
245 			for (i = 0; i < count; ++i)
246 				cmpxchg64(&lpte[i], __pte, 0ULL);
247 
248 			*updated = true;
249 			continue;
250 		}
251 
252 		if (!IOMMU_PTE_PRESENT(__pte) ||
253 		    pte_level == PAGE_MODE_NONE) {
254 			page = iommu_alloc_page_node(domain->nid, gfp);
255 
256 			if (!page)
257 				return NULL;
258 
259 			__npte = PM_LEVEL_PDE(level, iommu_virt_to_phys(page));
260 
261 			/* pte could have been changed somewhere. */
262 			if (!try_cmpxchg64(pte, &__pte, __npte))
263 				iommu_free_page(page);
264 			else if (IOMMU_PTE_PRESENT(__pte))
265 				*updated = true;
266 
267 			continue;
268 		}
269 
270 		/* No level skipping support yet */
271 		if (pte_level != level)
272 			return NULL;
273 
274 		level -= 1;
275 
276 		pte = IOMMU_PTE_PAGE(__pte);
277 
278 		if (pte_page && level == end_lvl)
279 			*pte_page = pte;
280 
281 		pte = &pte[PM_LEVEL_INDEX(level, address)];
282 	}
283 
284 	return pte;
285 }
286 
287 /*
288  * This function checks if there is a PTE for a given dma address. If
289  * there is one, it returns the pointer to it.
290  */
291 static u64 *fetch_pte(struct amd_io_pgtable *pgtable,
292 		      unsigned long address,
293 		      unsigned long *page_size)
294 {
295 	int level;
296 	u64 *pte;
297 
298 	*page_size = 0;
299 
300 	if (address > PM_LEVEL_SIZE(pgtable->mode))
301 		return NULL;
302 
303 	level	   =  pgtable->mode - 1;
304 	pte	   = &pgtable->root[PM_LEVEL_INDEX(level, address)];
305 	*page_size =  PTE_LEVEL_PAGE_SIZE(level);
306 
307 	while (level > 0) {
308 
309 		/* Not Present */
310 		if (!IOMMU_PTE_PRESENT(*pte))
311 			return NULL;
312 
313 		/* Large PTE */
314 		if (PM_PTE_LEVEL(*pte) == PAGE_MODE_7_LEVEL ||
315 		    PM_PTE_LEVEL(*pte) == PAGE_MODE_NONE)
316 			break;
317 
318 		/* No level skipping support yet */
319 		if (PM_PTE_LEVEL(*pte) != level)
320 			return NULL;
321 
322 		level -= 1;
323 
324 		/* Walk to the next level */
325 		pte	   = IOMMU_PTE_PAGE(*pte);
326 		pte	   = &pte[PM_LEVEL_INDEX(level, address)];
327 		*page_size = PTE_LEVEL_PAGE_SIZE(level);
328 	}
329 
330 	/*
331 	 * If we have a series of large PTEs, make
332 	 * sure to return a pointer to the first one.
333 	 */
334 	if (PM_PTE_LEVEL(*pte) == PAGE_MODE_7_LEVEL)
335 		pte = first_pte_l7(pte, page_size, NULL);
336 
337 	return pte;
338 }
339 
340 static void free_clear_pte(u64 *pte, u64 pteval, struct list_head *freelist)
341 {
342 	u64 *pt;
343 	int mode;
344 
345 	while (!try_cmpxchg64(pte, &pteval, 0))
346 		pr_warn("AMD-Vi: IOMMU pte changed since we read it\n");
347 
348 	if (!IOMMU_PTE_PRESENT(pteval))
349 		return;
350 
351 	pt   = IOMMU_PTE_PAGE(pteval);
352 	mode = IOMMU_PTE_MODE(pteval);
353 
354 	free_sub_pt(pt, mode, freelist);
355 }
356 
357 /*
358  * Generic mapping functions. It maps a physical address into a DMA
359  * address space. It allocates the page table pages if necessary.
360  * In the future it can be extended to a generic mapping function
361  * supporting all features of AMD IOMMU page tables like level skipping
362  * and full 64 bit address spaces.
363  */
364 static int iommu_v1_map_pages(struct io_pgtable_ops *ops, unsigned long iova,
365 			      phys_addr_t paddr, size_t pgsize, size_t pgcount,
366 			      int prot, gfp_t gfp, size_t *mapped)
367 {
368 	struct protection_domain *dom = io_pgtable_ops_to_domain(ops);
369 	LIST_HEAD(freelist);
370 	bool updated = false;
371 	u64 __pte, *pte;
372 	int ret, i, count;
373 	size_t size = pgcount << __ffs(pgsize);
374 	unsigned long o_iova = iova;
375 
376 	BUG_ON(!IS_ALIGNED(iova, pgsize));
377 	BUG_ON(!IS_ALIGNED(paddr, pgsize));
378 
379 	ret = -EINVAL;
380 	if (!(prot & IOMMU_PROT_MASK))
381 		goto out;
382 
383 	while (pgcount > 0) {
384 		count = PAGE_SIZE_PTE_COUNT(pgsize);
385 		pte   = alloc_pte(dom, iova, pgsize, NULL, gfp, &updated);
386 
387 		ret = -ENOMEM;
388 		if (!pte)
389 			goto out;
390 
391 		for (i = 0; i < count; ++i)
392 			free_clear_pte(&pte[i], pte[i], &freelist);
393 
394 		if (!list_empty(&freelist))
395 			updated = true;
396 
397 		if (count > 1) {
398 			__pte = PAGE_SIZE_PTE(__sme_set(paddr), pgsize);
399 			__pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_PR | IOMMU_PTE_FC;
400 		} else
401 			__pte = __sme_set(paddr) | IOMMU_PTE_PR | IOMMU_PTE_FC;
402 
403 		if (prot & IOMMU_PROT_IR)
404 			__pte |= IOMMU_PTE_IR;
405 		if (prot & IOMMU_PROT_IW)
406 			__pte |= IOMMU_PTE_IW;
407 
408 		for (i = 0; i < count; ++i)
409 			pte[i] = __pte;
410 
411 		iova  += pgsize;
412 		paddr += pgsize;
413 		pgcount--;
414 		if (mapped)
415 			*mapped += pgsize;
416 	}
417 
418 	ret = 0;
419 
420 out:
421 	if (updated) {
422 		unsigned long flags;
423 
424 		spin_lock_irqsave(&dom->lock, flags);
425 		/*
426 		 * Flush domain TLB(s) and wait for completion. Any Device-Table
427 		 * Updates and flushing already happened in
428 		 * increase_address_space().
429 		 */
430 		amd_iommu_domain_flush_pages(dom, o_iova, size);
431 		spin_unlock_irqrestore(&dom->lock, flags);
432 	}
433 
434 	/* Everything flushed out, free pages now */
435 	iommu_put_pages_list(&freelist);
436 
437 	return ret;
438 }
439 
440 static unsigned long iommu_v1_unmap_pages(struct io_pgtable_ops *ops,
441 					  unsigned long iova,
442 					  size_t pgsize, size_t pgcount,
443 					  struct iommu_iotlb_gather *gather)
444 {
445 	struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
446 	unsigned long long unmapped;
447 	unsigned long unmap_size;
448 	u64 *pte;
449 	size_t size = pgcount << __ffs(pgsize);
450 
451 	BUG_ON(!is_power_of_2(pgsize));
452 
453 	unmapped = 0;
454 
455 	while (unmapped < size) {
456 		pte = fetch_pte(pgtable, iova, &unmap_size);
457 		if (pte) {
458 			int i, count;
459 
460 			count = PAGE_SIZE_PTE_COUNT(unmap_size);
461 			for (i = 0; i < count; i++)
462 				pte[i] = 0ULL;
463 		} else {
464 			return unmapped;
465 		}
466 
467 		iova = (iova & ~(unmap_size - 1)) + unmap_size;
468 		unmapped += unmap_size;
469 	}
470 
471 	return unmapped;
472 }
473 
474 static phys_addr_t iommu_v1_iova_to_phys(struct io_pgtable_ops *ops, unsigned long iova)
475 {
476 	struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
477 	unsigned long offset_mask, pte_pgsize;
478 	u64 *pte, __pte;
479 
480 	pte = fetch_pte(pgtable, iova, &pte_pgsize);
481 
482 	if (!pte || !IOMMU_PTE_PRESENT(*pte))
483 		return 0;
484 
485 	offset_mask = pte_pgsize - 1;
486 	__pte	    = __sme_clr(*pte & PM_ADDR_MASK);
487 
488 	return (__pte & ~offset_mask) | (iova & offset_mask);
489 }
490 
491 static bool pte_test_and_clear_dirty(u64 *ptep, unsigned long size,
492 				     unsigned long flags)
493 {
494 	bool test_only = flags & IOMMU_DIRTY_NO_CLEAR;
495 	bool dirty = false;
496 	int i, count;
497 
498 	/*
499 	 * 2.2.3.2 Host Dirty Support
500 	 * When a non-default page size is used , software must OR the
501 	 * Dirty bits in all of the replicated host PTEs used to map
502 	 * the page. The IOMMU does not guarantee the Dirty bits are
503 	 * set in all of the replicated PTEs. Any portion of the page
504 	 * may have been written even if the Dirty bit is set in only
505 	 * one of the replicated PTEs.
506 	 */
507 	count = PAGE_SIZE_PTE_COUNT(size);
508 	for (i = 0; i < count && test_only; i++) {
509 		if (test_bit(IOMMU_PTE_HD_BIT, (unsigned long *)&ptep[i])) {
510 			dirty = true;
511 			break;
512 		}
513 	}
514 
515 	for (i = 0; i < count && !test_only; i++) {
516 		if (test_and_clear_bit(IOMMU_PTE_HD_BIT,
517 				       (unsigned long *)&ptep[i])) {
518 			dirty = true;
519 		}
520 	}
521 
522 	return dirty;
523 }
524 
525 static int iommu_v1_read_and_clear_dirty(struct io_pgtable_ops *ops,
526 					 unsigned long iova, size_t size,
527 					 unsigned long flags,
528 					 struct iommu_dirty_bitmap *dirty)
529 {
530 	struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
531 	unsigned long end = iova + size - 1;
532 
533 	do {
534 		unsigned long pgsize = 0;
535 		u64 *ptep, pte;
536 
537 		ptep = fetch_pte(pgtable, iova, &pgsize);
538 		if (ptep)
539 			pte = READ_ONCE(*ptep);
540 		if (!ptep || !IOMMU_PTE_PRESENT(pte)) {
541 			pgsize = pgsize ?: PTE_LEVEL_PAGE_SIZE(0);
542 			iova += pgsize;
543 			continue;
544 		}
545 
546 		/*
547 		 * Mark the whole IOVA range as dirty even if only one of
548 		 * the replicated PTEs were marked dirty.
549 		 */
550 		if (pte_test_and_clear_dirty(ptep, pgsize, flags))
551 			iommu_dirty_bitmap_record(dirty, iova, pgsize);
552 		iova += pgsize;
553 	} while (iova < end);
554 
555 	return 0;
556 }
557 
558 /*
559  * ----------------------------------------------------
560  */
561 static void v1_free_pgtable(struct io_pgtable *iop)
562 {
563 	struct amd_io_pgtable *pgtable = container_of(iop, struct amd_io_pgtable, iop);
564 	struct protection_domain *dom;
565 	LIST_HEAD(freelist);
566 
567 	if (pgtable->mode == PAGE_MODE_NONE)
568 		return;
569 
570 	dom = container_of(pgtable, struct protection_domain, iop);
571 
572 	/* Page-table is not visible to IOMMU anymore, so free it */
573 	BUG_ON(pgtable->mode < PAGE_MODE_NONE ||
574 	       pgtable->mode > PAGE_MODE_6_LEVEL);
575 
576 	free_sub_pt(pgtable->root, pgtable->mode, &freelist);
577 
578 	/* Update data structure */
579 	amd_iommu_domain_clr_pt_root(dom);
580 
581 	/* Make changes visible to IOMMUs */
582 	amd_iommu_domain_update(dom);
583 
584 	iommu_put_pages_list(&freelist);
585 }
586 
587 static struct io_pgtable *v1_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie)
588 {
589 	struct amd_io_pgtable *pgtable = io_pgtable_cfg_to_data(cfg);
590 
591 	cfg->pgsize_bitmap  = AMD_IOMMU_PGSIZES,
592 	cfg->ias            = IOMMU_IN_ADDR_BIT_SIZE,
593 	cfg->oas            = IOMMU_OUT_ADDR_BIT_SIZE,
594 	cfg->tlb            = &v1_flush_ops;
595 
596 	pgtable->iop.ops.map_pages    = iommu_v1_map_pages;
597 	pgtable->iop.ops.unmap_pages  = iommu_v1_unmap_pages;
598 	pgtable->iop.ops.iova_to_phys = iommu_v1_iova_to_phys;
599 	pgtable->iop.ops.read_and_clear_dirty = iommu_v1_read_and_clear_dirty;
600 
601 	return &pgtable->iop;
602 }
603 
604 struct io_pgtable_init_fns io_pgtable_amd_iommu_v1_init_fns = {
605 	.alloc	= v1_alloc_pgtable,
606 	.free	= v1_free_pgtable,
607 };
608