xref: /linux/drivers/iommu/amd/io_pgtable.c (revision 336b4dae6dfecc9aa53a3a68c71b9c1c1d466388)
// SPDX-License-Identifier: GPL-2.0-only
/*
 * CPU-agnostic AMD IO page table allocator.
 *
 * Copyright (C) 2020 Advanced Micro Devices, Inc.
 * Author: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
 */

#define pr_fmt(fmt)     "AMD-Vi: " fmt
#define dev_fmt(fmt)    pr_fmt(fmt)

#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/io-pgtable.h>
#include <linux/kernel.h>
#include <linux/sizes.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/dma-mapping.h>

#include <asm/barrier.h>

#include "amd_iommu_types.h"
#include "amd_iommu.h"
#include "../iommu-pages.h"

/*
 * Helper function to get the first pte of a large mapping
 */
static u64 *first_pte_l7(u64 *pte, unsigned long *page_size,
			 unsigned long *count)
{
	unsigned long pte_mask, pg_size, cnt;
	u64 *fpte;

	pg_size  = PTE_PAGE_SIZE(*pte);
	cnt      = PAGE_SIZE_PTE_COUNT(pg_size);
	pte_mask = ~((cnt << 3) - 1);
	fpte     = (u64 *)(((unsigned long)pte) & pte_mask);

	if (page_size)
		*page_size = pg_size;

	if (count)
		*count = cnt;

	return fpte;
}

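/*
 * Queue a page-table page on the freelist. The pages are released later
 * via iommu_put_pages_list() once they are no longer reachable by the
 * IOMMU.
 */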
static void free_pt_page(u64 *pt, struct list_head *freelist)
{
	struct page *p = virt_to_page(pt);

	list_add_tail(&p->lru, freelist);
}

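/*
 * Recursively walk a page-table level and queue every lower-level table,
 * including @pt itself, on the freelist.
 */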
static void free_pt_lvl(u64 *pt, struct list_head *freelist, int lvl)
{
	u64 *p;
	int i;

	for (i = 0; i < 512; ++i) {
		/* PTE present? */
		if (!IOMMU_PTE_PRESENT(pt[i]))
			continue;

		/* Large PTE? */
		if (PM_PTE_LEVEL(pt[i]) == 0 ||
		    PM_PTE_LEVEL(pt[i]) == 7)
			continue;

		/*
		 * Free the next level. No need to look at l1 tables here since
		 * they can only contain leaf PTEs; just free them directly.
		 */
		p = IOMMU_PTE_PAGE(pt[i]);
		if (lvl > 2)
			free_pt_lvl(p, freelist, lvl - 1);
		else
			free_pt_page(p, freelist);
	}

	free_pt_page(pt, freelist);
}

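/*
 * Queue all pages of a sub-page-table for freeing, depending on the page
 * mode stored in its parent PTE. Nothing is queued for PAGE_MODE_NONE and
 * PAGE_MODE_7_LEVEL because those PTEs do not reference a lower-level
 * table.
 */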
static void free_sub_pt(u64 *root, int mode, struct list_head *freelist)
{
	switch (mode) {
	case PAGE_MODE_NONE:
	case PAGE_MODE_7_LEVEL:
		break;
	case PAGE_MODE_1_LEVEL:
		free_pt_page(root, freelist);
		break;
	case PAGE_MODE_2_LEVEL:
	case PAGE_MODE_3_LEVEL:
	case PAGE_MODE_4_LEVEL:
	case PAGE_MODE_5_LEVEL:
	case PAGE_MODE_6_LEVEL:
		free_pt_lvl(root, freelist, mode);
		break;
	default:
		BUG();
	}
}

/*
 * This function is used to add another level to an IO page table. Adding
 * another level increases the size of the address space by 9 bits to a size up
 * to 64 bits.
 */
static bool increase_address_space(struct amd_io_pgtable *pgtable,
				   unsigned long address,
				   unsigned int page_size_level,
				   gfp_t gfp)
{
	struct io_pgtable_cfg *cfg = &pgtable->pgtbl.cfg;
	struct protection_domain *domain =
		container_of(pgtable, struct protection_domain, iop);
	unsigned long flags;
	bool ret = true;
	u64 *pte;

	pte = iommu_alloc_page_node(cfg->amd.nid, gfp);
	if (!pte)
		return false;

	spin_lock_irqsave(&domain->lock, flags);

	if (address <= PM_LEVEL_SIZE(pgtable->mode) &&
	    pgtable->mode - 1 >= page_size_level)
		goto out;

	ret = false;
	if (WARN_ON_ONCE(pgtable->mode == PAGE_MODE_6_LEVEL))
		goto out;

	*pte = PM_LEVEL_PDE(pgtable->mode, iommu_virt_to_phys(pgtable->root));

	pgtable->root  = pte;
	pgtable->mode += 1;
	amd_iommu_update_and_flush_device_table(domain);

	pte = NULL;
	ret = true;

out:
	spin_unlock_irqrestore(&domain->lock, flags);
	iommu_free_page(pte);

	return ret;
}

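/*
 * Walk the page table for @address down to the level required by
 * @page_size, growing the address space and allocating intermediate
 * page-table pages as needed, and return a pointer to the PTE. Existing
 * large mappings in the way are torn down; *updated is set whenever the
 * caller must flush the IOTLB afterwards.
 */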
static u64 *alloc_pte(struct amd_io_pgtable *pgtable,
		      unsigned long address,
		      unsigned long page_size,
		      u64 **pte_page,
		      gfp_t gfp,
		      bool *updated)
{
	unsigned long last_addr = address + (page_size - 1);
	struct io_pgtable_cfg *cfg = &pgtable->pgtbl.cfg;
	int level, end_lvl;
	u64 *pte, *page;

	BUG_ON(!is_power_of_2(page_size));

	while (last_addr > PM_LEVEL_SIZE(pgtable->mode) ||
	       pgtable->mode - 1 < PAGE_SIZE_LEVEL(page_size)) {
		/*
		 * Return an error if there is no memory to update the
		 * page-table.
		 */
		if (!increase_address_space(pgtable, last_addr,
					    PAGE_SIZE_LEVEL(page_size), gfp))
			return NULL;
	}


	level   = pgtable->mode - 1;
	pte     = &pgtable->root[PM_LEVEL_INDEX(level, address)];
	address = PAGE_SIZE_ALIGN(address, page_size);
	end_lvl = PAGE_SIZE_LEVEL(page_size);

	while (level > end_lvl) {
		u64 __pte, __npte;
		int pte_level;

		__pte     = *pte;
		pte_level = PM_PTE_LEVEL(__pte);

		/*
		 * If we replace a series of large PTEs, we need
		 * to tear down all of them.
		 */
		if (IOMMU_PTE_PRESENT(__pte) &&
		    pte_level == PAGE_MODE_7_LEVEL) {
			unsigned long count, i;
			u64 *lpte;

			lpte = first_pte_l7(pte, NULL, &count);

			/*
			 * Unmap the replicated PTEs that still match the
			 * original large mapping
			 */
			for (i = 0; i < count; ++i)
				cmpxchg64(&lpte[i], __pte, 0ULL);

			*updated = true;
			continue;
		}

		if (!IOMMU_PTE_PRESENT(__pte) ||
		    pte_level == PAGE_MODE_NONE) {
			page = iommu_alloc_page_node(cfg->amd.nid, gfp);

			if (!page)
				return NULL;

			__npte = PM_LEVEL_PDE(level, iommu_virt_to_phys(page));

			/* pte could have been changed somewhere. */
			if (!try_cmpxchg64(pte, &__pte, __npte))
				iommu_free_page(page);
			else if (IOMMU_PTE_PRESENT(__pte))
				*updated = true;

			continue;
		}

		/* No level skipping support yet */
		if (pte_level != level)
			return NULL;

		level -= 1;

		pte = IOMMU_PTE_PAGE(__pte);

		if (pte_page && level == end_lvl)
			*pte_page = pte;

		pte = &pte[PM_LEVEL_INDEX(level, address)];
	}

	return pte;
}

/*
 * This function checks if there is a PTE for a given dma address. If
 * there is one, it returns the pointer to it.
 */
static u64 *fetch_pte(struct amd_io_pgtable *pgtable,
		      unsigned long address,
		      unsigned long *page_size)
{
	int level;
	u64 *pte;

	*page_size = 0;

	if (address > PM_LEVEL_SIZE(pgtable->mode))
		return NULL;

	level	   =  pgtable->mode - 1;
	pte	   = &pgtable->root[PM_LEVEL_INDEX(level, address)];
	*page_size =  PTE_LEVEL_PAGE_SIZE(level);

	while (level > 0) {

		/* Not Present */
		if (!IOMMU_PTE_PRESENT(*pte))
			return NULL;

		/* Large PTE */
		if (PM_PTE_LEVEL(*pte) == PAGE_MODE_7_LEVEL ||
		    PM_PTE_LEVEL(*pte) == PAGE_MODE_NONE)
			break;

		/* No level skipping support yet */
		if (PM_PTE_LEVEL(*pte) != level)
			return NULL;

		level -= 1;

		/* Walk to the next level */
		pte	   = IOMMU_PTE_PAGE(*pte);
		pte	   = &pte[PM_LEVEL_INDEX(level, address)];
		*page_size = PTE_LEVEL_PAGE_SIZE(level);
	}

	/*
	 * If we have a series of large PTEs, make
	 * sure to return a pointer to the first one.
	 */
	if (PM_PTE_LEVEL(*pte) == PAGE_MODE_7_LEVEL)
		pte = first_pte_l7(pte, page_size, NULL);

	return pte;
}

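/*
 * Atomically clear a PTE and, if it referenced a lower-level page table,
 * queue that sub-table on the freelist.
 */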
static void free_clear_pte(u64 *pte, u64 pteval, struct list_head *freelist)
{
	u64 *pt;
	int mode;

	while (!try_cmpxchg64(pte, &pteval, 0))
		pr_warn("AMD-Vi: IOMMU pte changed since we read it\n");

	if (!IOMMU_PTE_PRESENT(pteval))
		return;

	pt   = IOMMU_PTE_PAGE(pteval);
	mode = IOMMU_PTE_MODE(pteval);

	free_sub_pt(pt, mode, freelist);
}

/*
 * Generic mapping function. It maps a physical address into a DMA
 * address space and allocates the page-table pages if necessary.
 * In the future it can be extended to support all features of AMD IOMMU
 * page tables, like level skipping and full 64-bit address spaces.
 */
static int iommu_v1_map_pages(struct io_pgtable_ops *ops, unsigned long iova,
			      phys_addr_t paddr, size_t pgsize, size_t pgcount,
			      int prot, gfp_t gfp, size_t *mapped)
{
	struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
	LIST_HEAD(freelist);
	bool updated = false;
	u64 __pte, *pte;
	int ret, i, count;
	size_t size = pgcount << __ffs(pgsize);
	unsigned long o_iova = iova;

	BUG_ON(!IS_ALIGNED(iova, pgsize));
	BUG_ON(!IS_ALIGNED(paddr, pgsize));

	ret = -EINVAL;
	if (!(prot & IOMMU_PROT_MASK))
		goto out;

	while (pgcount > 0) {
		count = PAGE_SIZE_PTE_COUNT(pgsize);
		pte   = alloc_pte(pgtable, iova, pgsize, NULL, gfp, &updated);

		ret = -ENOMEM;
		if (!pte)
			goto out;

		for (i = 0; i < count; ++i)
			free_clear_pte(&pte[i], pte[i], &freelist);

		if (!list_empty(&freelist))
			updated = true;

		if (count > 1) {
			__pte = PAGE_SIZE_PTE(__sme_set(paddr), pgsize);
			__pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_PR | IOMMU_PTE_FC;
		} else
			__pte = __sme_set(paddr) | IOMMU_PTE_PR | IOMMU_PTE_FC;

		if (prot & IOMMU_PROT_IR)
			__pte |= IOMMU_PTE_IR;
		if (prot & IOMMU_PROT_IW)
			__pte |= IOMMU_PTE_IW;

		for (i = 0; i < count; ++i)
			pte[i] = __pte;

		iova  += pgsize;
		paddr += pgsize;
		pgcount--;
		if (mapped)
			*mapped += pgsize;
	}

	ret = 0;

out:
	if (updated) {
		struct protection_domain *dom = io_pgtable_ops_to_domain(ops);
		unsigned long flags;

		spin_lock_irqsave(&dom->lock, flags);
		/*
		 * Flush domain TLB(s) and wait for completion. Any Device-Table
		 * Updates and flushing already happened in
		 * increase_address_space().
		 */
		amd_iommu_domain_flush_pages(dom, o_iova, size);
		spin_unlock_irqrestore(&dom->lock, flags);
	}

	/* Everything flushed out, free pages now */
	iommu_put_pages_list(&freelist);

	return ret;
}

static unsigned long iommu_v1_unmap_pages(struct io_pgtable_ops *ops,
					  unsigned long iova,
					  size_t pgsize, size_t pgcount,
					  struct iommu_iotlb_gather *gather)
{
	struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
	unsigned long long unmapped;
	unsigned long unmap_size;
	u64 *pte;
	size_t size = pgcount << __ffs(pgsize);

	BUG_ON(!is_power_of_2(pgsize));

	unmapped = 0;

	while (unmapped < size) {
		pte = fetch_pte(pgtable, iova, &unmap_size);
		if (pte) {
			int i, count;

			count = PAGE_SIZE_PTE_COUNT(unmap_size);
			for (i = 0; i < count; i++)
				pte[i] = 0ULL;
		} else {
			return unmapped;
		}

		iova = (iova & ~(unmap_size - 1)) + unmap_size;
		unmapped += unmap_size;
	}

	return unmapped;
}

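/*
 * Translate an IOVA into the physical address stored in the page table,
 * or return 0 if no present mapping exists.
 */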
static phys_addr_t iommu_v1_iova_to_phys(struct io_pgtable_ops *ops, unsigned long iova)
{
	struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
	unsigned long offset_mask, pte_pgsize;
	u64 *pte, __pte;

	pte = fetch_pte(pgtable, iova, &pte_pgsize);

	if (!pte || !IOMMU_PTE_PRESENT(*pte))
		return 0;

	offset_mask = pte_pgsize - 1;
	__pte	    = __sme_clr(*pte & PM_ADDR_MASK);

	return (__pte & ~offset_mask) | (iova & offset_mask);
}

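/*
 * Check (and, unless IOMMU_DIRTY_NO_CLEAR is set, clear) the Dirty bit in
 * all replicated PTEs of a mapping. Returns true if any of them was dirty.
 */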
static bool pte_test_and_clear_dirty(u64 *ptep, unsigned long size,
				     unsigned long flags)
{
	bool test_only = flags & IOMMU_DIRTY_NO_CLEAR;
	bool dirty = false;
	int i, count;

	/*
	 * 2.2.3.2 Host Dirty Support
	 * When a non-default page size is used, software must OR the
	 * Dirty bits in all of the replicated host PTEs used to map
	 * the page. The IOMMU does not guarantee the Dirty bits are
	 * set in all of the replicated PTEs. Any portion of the page
	 * may have been written even if the Dirty bit is set in only
	 * one of the replicated PTEs.
	 */
	count = PAGE_SIZE_PTE_COUNT(size);
	for (i = 0; i < count && test_only; i++) {
		if (test_bit(IOMMU_PTE_HD_BIT, (unsigned long *)&ptep[i])) {
			dirty = true;
			break;
		}
	}

	for (i = 0; i < count && !test_only; i++) {
		if (test_and_clear_bit(IOMMU_PTE_HD_BIT,
				       (unsigned long *)&ptep[i])) {
			dirty = true;
		}
	}

	return dirty;
}

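/*
 * Walk the IOVA range, record dirty mappings in the dirty bitmap and,
 * unless IOMMU_DIRTY_NO_CLEAR is set, clear the Dirty bits again.
 */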
static int iommu_v1_read_and_clear_dirty(struct io_pgtable_ops *ops,
					 unsigned long iova, size_t size,
					 unsigned long flags,
					 struct iommu_dirty_bitmap *dirty)
{
	struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
	unsigned long end = iova + size - 1;

	do {
		unsigned long pgsize = 0;
		u64 *ptep, pte;

		ptep = fetch_pte(pgtable, iova, &pgsize);
		if (ptep)
			pte = READ_ONCE(*ptep);
		if (!ptep || !IOMMU_PTE_PRESENT(pte)) {
			pgsize = pgsize ?: PTE_LEVEL_PAGE_SIZE(0);
			iova += pgsize;
			continue;
		}

		/*
		 * Mark the whole IOVA range as dirty even if only one of
		 * the replicated PTEs was marked dirty.
		 */
		if (pte_test_and_clear_dirty(ptep, pgsize, flags))
			iommu_dirty_bitmap_record(dirty, iova, pgsize);
		iova += pgsize;
	} while (iova < end);

	return 0;
}

/*
 * ----------------------------------------------------
 */
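/*
 * Tear down a v1 page table. The table is no longer visible to the IOMMU
 * at this point, so all page-table pages can be queued and freed directly.
 */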
static void v1_free_pgtable(struct io_pgtable *iop)
{
	struct amd_io_pgtable *pgtable = container_of(iop, struct amd_io_pgtable, pgtbl);
	LIST_HEAD(freelist);

	if (pgtable->mode == PAGE_MODE_NONE)
		return;

	/* Page-table is not visible to IOMMU anymore, so free it */
	BUG_ON(pgtable->mode < PAGE_MODE_NONE ||
	       pgtable->mode > PAGE_MODE_6_LEVEL);

	free_sub_pt(pgtable->root, pgtable->mode, &freelist);
	iommu_put_pages_list(&freelist);
}

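/*
 * Allocate and initialize a v1 page table: a 3-level table by default,
 * grown on demand by increase_address_space().
 */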
static struct io_pgtable *v1_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie)
{
	struct amd_io_pgtable *pgtable = io_pgtable_cfg_to_data(cfg);

	pgtable->root = iommu_alloc_page_node(cfg->amd.nid, GFP_KERNEL);
	if (!pgtable->root)
		return NULL;
	pgtable->mode = PAGE_MODE_3_LEVEL;

	cfg->pgsize_bitmap  = amd_iommu_pgsize_bitmap;
	cfg->ias            = IOMMU_IN_ADDR_BIT_SIZE;
	cfg->oas            = IOMMU_OUT_ADDR_BIT_SIZE;

	pgtable->pgtbl.ops.map_pages    = iommu_v1_map_pages;
	pgtable->pgtbl.ops.unmap_pages  = iommu_v1_unmap_pages;
	pgtable->pgtbl.ops.iova_to_phys = iommu_v1_iova_to_phys;
	pgtable->pgtbl.ops.read_and_clear_dirty = iommu_v1_read_and_clear_dirty;

	return &pgtable->pgtbl;
}

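/* io-pgtable init functions for the AMD IOMMU v1 page-table format */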
struct io_pgtable_init_fns io_pgtable_amd_iommu_v1_init_fns = {
	.alloc	= v1_alloc_pgtable,
	.free	= v1_free_pgtable,
};