/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
 *
 * AMD IOMMU v1 page table
 *
 * This is described in Section "2.2.3 I/O Page Tables for Host Translations"
 * of the "AMD I/O Virtualization Technology (IOMMU) Specification"
 *
 * Note the level numbering here matches the core code, so level 0 is the same
 * as mode 1.
 */
#ifndef __GENERIC_PT_FMT_AMDV1_H
#define __GENERIC_PT_FMT_AMDV1_H

#include "defs_amdv1.h"
#include "../pt_defs.h"

#include <asm/page.h>
#include <linux/bitfield.h>
#include <linux/container_of.h>
#include <linux/mem_encrypt.h>
#include <linux/minmax.h>
#include <linux/sizes.h>
#include <linux/string.h>

enum {
	PT_ITEM_WORD_SIZE = sizeof(u64),
	/*
	 * The IOMMUFD selftest uses the AMDv1 format with some alterations. It
	 * uses a 2k page size to test cases where the CPU page size is not the
	 * same.
	 */
#ifdef AMDV1_IOMMUFD_SELFTEST
	PT_MAX_VA_ADDRESS_LG2 = 56,
	PT_MAX_OUTPUT_ADDRESS_LG2 = 51,
	PT_MAX_TOP_LEVEL = 4,
	PT_GRANULE_LG2SZ = 11,
#else
	PT_MAX_VA_ADDRESS_LG2 = 64,
	PT_MAX_OUTPUT_ADDRESS_LG2 = 52,
	PT_MAX_TOP_LEVEL = 5,
	PT_GRANULE_LG2SZ = 12,
#endif
	PT_TABLEMEM_LG2SZ = 12,

	/* The DTE only has these bits for the top physical address */
	PT_TOP_PHYS_MASK = GENMASK_ULL(51, 12),
};

/* PTE bits */
enum {
	AMDV1PT_FMT_PR = BIT(0),
	AMDV1PT_FMT_D = BIT(6),
	AMDV1PT_FMT_NEXT_LEVEL = GENMASK_ULL(11, 9),
	AMDV1PT_FMT_OA = GENMASK_ULL(51, 12),
	AMDV1PT_FMT_FC = BIT_ULL(60),
	AMDV1PT_FMT_IR = BIT_ULL(61),
	AMDV1PT_FMT_IW = BIT_ULL(62),
};

/*
 * gcc 13 has a bug where it thinks the output of FIELD_GET() is an enum;
 * make these defines to avoid it.
 */
#define AMDV1PT_FMT_NL_DEFAULT 0
#define AMDV1PT_FMT_NL_SIZE 7

static inline pt_oaddr_t amdv1pt_table_pa(const struct pt_state *pts)
{
	u64 entry = pts->entry;

	if (pts_feature(pts, PT_FEAT_AMDV1_ENCRYPT_TABLES))
		entry = __sme_clr(entry);
	return oalog2_mul(FIELD_GET(AMDV1PT_FMT_OA, entry), PT_GRANULE_LG2SZ);
}
#define pt_table_pa amdv1pt_table_pa

/* Returns the oa for the start of the contiguous entry */
static inline pt_oaddr_t amdv1pt_entry_oa(const struct pt_state *pts)
{
	u64 entry = pts->entry;
	pt_oaddr_t oa;

	if (pts_feature(pts, PT_FEAT_AMDV1_ENCRYPT_TABLES))
		entry = __sme_clr(entry);
	oa = FIELD_GET(AMDV1PT_FMT_OA, entry);

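	/*
	 * An NL_SIZE entry encodes its size as a string of 1s in the low OA
	 * bits. For example, assuming the 4k granule, a 64K contiguous
	 * mapping has the low 3 OA bits set, so oaffz() returns 3 and
	 * clearing those bits recovers the 64K aligned base address.
	 */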
	if (FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, entry) == AMDV1PT_FMT_NL_SIZE) {
		unsigned int sz_bits = oaffz(oa);

		oa = oalog2_set_mod(oa, 0, sz_bits);
	} else if (PT_WARN_ON(FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, entry) !=
			      AMDV1PT_FMT_NL_DEFAULT))
		return 0;
	return oalog2_mul(oa, PT_GRANULE_LG2SZ);
}
#define pt_entry_oa amdv1pt_entry_oa

static inline bool amdv1pt_can_have_leaf(const struct pt_state *pts)
{
	/*
	 * Table 15: Page Table Level Parameters
	 * The topmost level cannot have translation entries
	 */
	return pts->level < PT_MAX_TOP_LEVEL;
}
#define pt_can_have_leaf amdv1pt_can_have_leaf

/* Body in pt_fmt_defaults.h */
static inline unsigned int pt_table_item_lg2sz(const struct pt_state *pts);

static inline unsigned int
amdv1pt_entry_num_contig_lg2(const struct pt_state *pts)
{
	u32 code;

	if (FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, pts->entry) ==
	    AMDV1PT_FMT_NL_DEFAULT)
		return ilog2(1);

	PT_WARN_ON(FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, pts->entry) !=
		   AMDV1PT_FMT_NL_SIZE);

	/*
	 * The contiguous size is encoded in the length of a string of 1's in
	 * the low bits of the OA. Reverse the equation:
	 *  code = log2_to_int(num_contig_lg2 + item_lg2sz -
	 *              PT_GRANULE_LG2SZ - 1) - 1
	 * Which can be expressed as:
	 *  num_contig_lg2 = oalog2_ffz(code) + 1 -
	 *              item_lg2sz - PT_GRANULE_LG2SZ
	 *
	 * Assume the bit layout is correct and remove the masking. Reorganize
	 * the equation to move all the arithmetic before the ffz.
	 */
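	/*
	 * Worked example, assuming the 4k granule and a level 0 table
	 * (item_lg2sz == PT_GRANULE_LG2SZ): a 64K mapping has
	 * NEXT_LEVEL == 7 and the low 3 OA bits set. The shift below is
	 * then __bf_shf() - 1 == 11, so code ends in four 1s (bit 11 of
	 * NEXT_LEVEL plus the three OA bits) and ffz_t() returns 4, ie 16
	 * contiguous 4k items.
	 */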
	code = pts->entry >> (__bf_shf(AMDV1PT_FMT_OA) - 1 +
			      pt_table_item_lg2sz(pts) - PT_GRANULE_LG2SZ);
	return ffz_t(u32, code);
}
#define pt_entry_num_contig_lg2 amdv1pt_entry_num_contig_lg2

static inline unsigned int amdv1pt_num_items_lg2(const struct pt_state *pts)
{
	/*
	 * Top entry covers bits [63:57] only; this is handled through
	 * max_vasz_lg2.
	 */
	if (PT_WARN_ON(pts->level == 5))
		return 7;
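	/* All other levels are full tables: 4k of 8 byte items, 512 entries */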
	return PT_TABLEMEM_LG2SZ - ilog2(sizeof(u64));
}
#define pt_num_items_lg2 amdv1pt_num_items_lg2

static inline pt_vaddr_t amdv1pt_possible_sizes(const struct pt_state *pts)
{
	unsigned int isz_lg2 = pt_table_item_lg2sz(pts);

	if (!amdv1pt_can_have_leaf(pts))
		return 0;

	/*
	 * Table 14: Example Page Size Encodings
	 * Address bits 51:32 can be used to encode page sizes greater than 4
	 * Gbytes. Address bits 63:52 are zero-extended.
	 *
	 * 512GB Pages are not supported due to a hardware bug.
	 * Otherwise every power of two size is supported.
	 */
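	/*
	 * For example, a level 0 table (4k items, 512 entries) yields the
	 * bitmap of sizes 4k through 1M; the full 2M span is instead a
	 * single NL_DEFAULT leaf in the level 1 table above it.
	 */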
	return GENMASK_ULL(min(51, isz_lg2 + amdv1pt_num_items_lg2(pts) - 1),
			   isz_lg2) & ~SZ_512G;
}
#define pt_possible_sizes amdv1pt_possible_sizes

static inline enum pt_entry_type amdv1pt_load_entry_raw(struct pt_state *pts)
{
	const u64 *tablep = pt_cur_table(pts, u64) + pts->index;
	unsigned int next_level;
	u64 entry;

	pts->entry = entry = READ_ONCE(*tablep);
	if (!(entry & AMDV1PT_FMT_PR))
		return PT_ENTRY_EMPTY;

	next_level = FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, pts->entry);
	if (pts->level == 0 || next_level == AMDV1PT_FMT_NL_DEFAULT ||
	    next_level == AMDV1PT_FMT_NL_SIZE)
		return PT_ENTRY_OA;
	return PT_ENTRY_TABLE;
}
#define pt_load_entry_raw amdv1pt_load_entry_raw

static inline void
amdv1pt_install_leaf_entry(struct pt_state *pts, pt_oaddr_t oa,
			   unsigned int oasz_lg2,
			   const struct pt_write_attrs *attrs)
{
	unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
	u64 *tablep = pt_cur_table(pts, u64) + pts->index;
	u64 entry;

	if (!pt_check_install_leaf_args(pts, oa, oasz_lg2))
		return;

	entry = AMDV1PT_FMT_PR |
		FIELD_PREP(AMDV1PT_FMT_OA, log2_div(oa, PT_GRANULE_LG2SZ)) |
		attrs->descriptor_bits;

	if (oasz_lg2 == isz_lg2) {
		entry |= FIELD_PREP(AMDV1PT_FMT_NEXT_LEVEL,
				    AMDV1PT_FMT_NL_DEFAULT);
		WRITE_ONCE(*tablep, entry);
	} else {
		unsigned int num_contig_lg2 = oasz_lg2 - isz_lg2;
		u64 *end = tablep + log2_to_int(num_contig_lg2);

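		/*
		 * Encode the size as a string of 1s in the low OA bits:
		 * 2^(oasz_lg2 - granule_lg2 - 1) - 1. Eg a 64K mapping
		 * (oasz_lg2 == 16, 4k granule) sets the low 3 OA bits.
		 */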
		entry |= FIELD_PREP(AMDV1PT_FMT_NEXT_LEVEL,
				    AMDV1PT_FMT_NL_SIZE) |
			 FIELD_PREP(AMDV1PT_FMT_OA,
				    oalog2_to_int(oasz_lg2 - PT_GRANULE_LG2SZ -
						  1) -
					    1);

		/* See amdv1pt_clear_entries() */
		if (num_contig_lg2 <= ilog2(32)) {
			for (; tablep != end; tablep++)
				WRITE_ONCE(*tablep, entry);
		} else {
			memset64(tablep, entry, log2_to_int(num_contig_lg2));
		}
	}
	pts->entry = entry;
}
#define pt_install_leaf_entry amdv1pt_install_leaf_entry

static inline bool amdv1pt_install_table(struct pt_state *pts,
					 pt_oaddr_t table_pa,
					 const struct pt_write_attrs *attrs)
{
	u64 entry;

	/*
	 * IR and IW are ANDed from the table levels along with the PTE. We
	 * always control permissions from the PTE, so always set IR and IW for
	 * tables.
	 */
	entry = AMDV1PT_FMT_PR |
		FIELD_PREP(AMDV1PT_FMT_NEXT_LEVEL, pts->level) |
		FIELD_PREP(AMDV1PT_FMT_OA,
			   log2_div(table_pa, PT_GRANULE_LG2SZ)) |
		AMDV1PT_FMT_IR | AMDV1PT_FMT_IW;
	if (pts_feature(pts, PT_FEAT_AMDV1_ENCRYPT_TABLES))
		entry = __sme_set(entry);
	return pt_table_install64(pts, entry);
}
#define pt_install_table amdv1pt_install_table

static inline void amdv1pt_attr_from_entry(const struct pt_state *pts,
					   struct pt_write_attrs *attrs)
{
	attrs->descriptor_bits =
		pts->entry & (AMDV1PT_FMT_FC | AMDV1PT_FMT_IR | AMDV1PT_FMT_IW);
}
#define pt_attr_from_entry amdv1pt_attr_from_entry

static inline void amdv1pt_clear_entries(struct pt_state *pts,
					 unsigned int num_contig_lg2)
{
	u64 *tablep = pt_cur_table(pts, u64) + pts->index;
	u64 *end = tablep + log2_to_int(num_contig_lg2);

	/*
	 * gcc generates rep stos for the io-pgtable code, and this difference
	 * can show up in microbenchmarks with larger contiguous page sizes.
	 * rep is slower for small cases.
	 */
	if (num_contig_lg2 <= ilog2(32)) {
		for (; tablep != end; tablep++)
			WRITE_ONCE(*tablep, 0);
	} else {
		memset64(tablep, 0, log2_to_int(num_contig_lg2));
	}
}
#define pt_clear_entries amdv1pt_clear_entries

static inline bool amdv1pt_entry_is_write_dirty(const struct pt_state *pts)
{
	unsigned int num_contig_lg2 = amdv1pt_entry_num_contig_lg2(pts);
	u64 *tablep = pt_cur_table(pts, u64) +
		      log2_set_mod(pts->index, 0, num_contig_lg2);
	u64 *end = tablep + log2_to_int(num_contig_lg2);

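	/*
	 * HW can set D on any entry of a contiguous group, so the group is
	 * write dirty if any member is. Scan the whole group starting from
	 * the group aligned index.
	 */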
	for (; tablep != end; tablep++)
		if (READ_ONCE(*tablep) & AMDV1PT_FMT_D)
			return true;
	return false;
}
#define pt_entry_is_write_dirty amdv1pt_entry_is_write_dirty

static inline void amdv1pt_entry_make_write_clean(struct pt_state *pts)
{
	unsigned int num_contig_lg2 = amdv1pt_entry_num_contig_lg2(pts);
	u64 *tablep = pt_cur_table(pts, u64) +
		      log2_set_mod(pts->index, 0, num_contig_lg2);
	u64 *end = tablep + log2_to_int(num_contig_lg2);

	for (; tablep != end; tablep++)
		WRITE_ONCE(*tablep, READ_ONCE(*tablep) & ~(u64)AMDV1PT_FMT_D);
}
#define pt_entry_make_write_clean amdv1pt_entry_make_write_clean

static inline bool amdv1pt_entry_make_write_dirty(struct pt_state *pts)
{
	u64 *tablep = pt_cur_table(pts, u64) + pts->index;
	u64 new = pts->entry | AMDV1PT_FMT_D;

	return try_cmpxchg64(tablep, &pts->entry, new);
}
#define pt_entry_make_write_dirty amdv1pt_entry_make_write_dirty

/* --- iommu */
#include <linux/generic_pt/iommu.h>
#include <linux/iommu.h>

#define pt_iommu_table pt_iommu_amdv1

/* The common struct is in the per-format common struct */
static inline struct pt_common *common_from_iommu(struct pt_iommu *iommu_table)
{
	return &container_of(iommu_table, struct pt_iommu_amdv1, iommu)
			->amdpt.common;
}

static inline struct pt_iommu *iommu_from_common(struct pt_common *common)
{
	return &container_of(common, struct pt_iommu_amdv1, amdpt.common)->iommu;
}

static inline int amdv1pt_iommu_set_prot(struct pt_common *common,
					 struct pt_write_attrs *attrs,
					 unsigned int iommu_prot)
{
	u64 pte = 0;

	if (pt_feature(common, PT_FEAT_AMDV1_FORCE_COHERENCE))
		pte |= AMDV1PT_FMT_FC;
	if (iommu_prot & IOMMU_READ)
		pte |= AMDV1PT_FMT_IR;
	if (iommu_prot & IOMMU_WRITE)
		pte |= AMDV1PT_FMT_IW;

	/*
	 * Ideally we'd have an IOMMU_ENCRYPTED flag set by higher levels to
	 * control this. For now if the tables use sme_set then so do the ptes.
	 */
	if (pt_feature(common, PT_FEAT_AMDV1_ENCRYPT_TABLES))
		pte = __sme_set(pte);

	attrs->descriptor_bits = pte;
	return 0;
}
#define pt_iommu_set_prot amdv1pt_iommu_set_prot

static inline int amdv1pt_iommu_fmt_init(struct pt_iommu_amdv1 *iommu_table,
					 const struct pt_iommu_amdv1_cfg *cfg)
{
	struct pt_amdv1 *table = &iommu_table->amdpt;
	unsigned int max_vasz_lg2 = PT_MAX_VA_ADDRESS_LG2;

	if (cfg->starting_level == 0 || cfg->starting_level > PT_MAX_TOP_LEVEL)
		return -EINVAL;

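	/*
	 * With a fixed top the VA space is limited by the starting level:
	 * granule bits plus 9 index bits per level. Eg with the 4k granule,
	 * starting_level == 2 (a 3 level table, mode 3) gives a
	 * 12 + 9 * 3 == 39 bit VA space.
	 */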
	if (!pt_feature(&table->common, PT_FEAT_DYNAMIC_TOP) &&
	    cfg->starting_level != PT_MAX_TOP_LEVEL)
		max_vasz_lg2 = PT_GRANULE_LG2SZ +
			       (PT_TABLEMEM_LG2SZ - ilog2(sizeof(u64))) *
				       (cfg->starting_level + 1);

	table->common.max_vasz_lg2 =
		min(max_vasz_lg2, cfg->common.hw_max_vasz_lg2);
	table->common.max_oasz_lg2 =
		min(PT_MAX_OUTPUT_ADDRESS_LG2, cfg->common.hw_max_oasz_lg2);
	pt_top_set_level(&table->common, cfg->starting_level);
	return 0;
}
#define pt_iommu_fmt_init amdv1pt_iommu_fmt_init

#ifndef PT_FMT_VARIANT
static inline void
amdv1pt_iommu_fmt_hw_info(struct pt_iommu_amdv1 *table,
			  const struct pt_range *top_range,
			  struct pt_iommu_amdv1_hw_info *info)
{
	info->host_pt_root = virt_to_phys(top_range->top_table);
	PT_WARN_ON(info->host_pt_root & ~PT_TOP_PHYS_MASK);
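	/* The DTE mode is one above the core level numbering, see header */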
	info->mode = top_range->top_level + 1;
}
#define pt_iommu_fmt_hw_info amdv1pt_iommu_fmt_hw_info
#endif

#if defined(GENERIC_PT_KUNIT)
static const struct pt_iommu_amdv1_cfg amdv1_kunit_fmt_cfgs[] = {
	/* Matches what io_pgtable does */
	[0] = { .starting_level = 2 },
};
#define kunit_fmt_cfgs amdv1_kunit_fmt_cfgs
enum { KUNIT_FMT_FEATURES = 0 };
#endif

#endif