/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
 *
 * AMD IOMMU v1 page table
 *
 * This is described in Section "2.2.3 I/O Page Tables for Host Translations"
 * of the "AMD I/O Virtualization Technology (IOMMU) Specification"
 *
 * Note the level numbering here matches the core code, so level 0 is the same
 * as mode 1.
 */
#ifndef __GENERIC_PT_FMT_AMDV1_H
#define __GENERIC_PT_FMT_AMDV1_H

#include "defs_amdv1.h"
#include "../pt_defs.h"

#include <asm/page.h>
#include <linux/bitfield.h>
#include <linux/container_of.h>
#include <linux/mem_encrypt.h>
#include <linux/minmax.h>
#include <linux/sizes.h>
#include <linux/string.h>

enum {
	PT_ITEM_WORD_SIZE = sizeof(u64),
	/*
	 * The IOMMUFD selftest uses the AMDv1 format with some alterations.
	 * It uses a 2k page size to test cases where the CPU page size is
	 * not the same.
	 */
#ifdef AMDV1_IOMMUFD_SELFTEST
	PT_MAX_VA_ADDRESS_LG2 = 56,
	PT_MAX_OUTPUT_ADDRESS_LG2 = 51,
	PT_MAX_TOP_LEVEL = 4,
	PT_GRANULE_LG2SZ = 11,
#else
	PT_MAX_VA_ADDRESS_LG2 = 64,
	PT_MAX_OUTPUT_ADDRESS_LG2 = 52,
	PT_MAX_TOP_LEVEL = 5,
	PT_GRANULE_LG2SZ = 12,
#endif
	PT_TABLEMEM_LG2SZ = 12,

	/* The DTE only has these bits for the top physical address */
	PT_TOP_PHYS_MASK = GENMASK_ULL(51, 12),
};
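
/*
 * For reference, with the non-selftest parameters (4 KiB granule, 512 item
 * tables) an item at level n covers 12 + 9 * n address bits, so a level 0
 * item maps one 4 KiB page and a level 5 top table is indexed by bits
 * [63:57].
 */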

/* PTE bits */
enum {
	AMDV1PT_FMT_PR = BIT(0),
	AMDV1PT_FMT_D = BIT(6),
	AMDV1PT_FMT_NEXT_LEVEL = GENMASK_ULL(11, 9),
	AMDV1PT_FMT_OA = GENMASK_ULL(51, 12),
	AMDV1PT_FMT_FC = BIT_ULL(60),
	AMDV1PT_FMT_IR = BIT_ULL(61),
	AMDV1PT_FMT_IW = BIT_ULL(62),
};

/*
 * gcc 13 has a bug where it thinks the output of FIELD_GET() is an enum, so
 * use defines for these values to avoid it.
 */
#define AMDV1PT_FMT_NL_DEFAULT 0
#define AMDV1PT_FMT_NL_SIZE 7
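
/*
 * NEXT_LEVEL selects how an entry decodes: AMDV1PT_FMT_NL_DEFAULT marks a
 * leaf of the level's default size, AMDV1PT_FMT_NL_SIZE marks a leaf whose
 * size is encoded in the low OA bits (Table 14), and values 1 to 6 mark a
 * pointer to a next level table.
 */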

static inline pt_oaddr_t amdv1pt_table_pa(const struct pt_state *pts)
{
	u64 entry = pts->entry;

	if (pts_feature(pts, PT_FEAT_AMDV1_ENCRYPT_TABLES))
		entry = __sme_clr(entry);
	return oalog2_mul(FIELD_GET(AMDV1PT_FMT_OA, entry), PT_GRANULE_LG2SZ);
}
#define pt_table_pa amdv1pt_table_pa

/* Returns the oa for the start of the contiguous entry */
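/*
 * For illustration: a 32 KiB mapping made of 4 KiB items stores
 * NEXT_LEVEL = 7 and sets the low OA bits to 011b (Table 14). The first
 * zero bit, at position 2, encodes the size, and clearing the bits below
 * it recovers the 32 KiB aligned start address.
 */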
static inline pt_oaddr_t amdv1pt_entry_oa(const struct pt_state *pts)
{
	u64 entry = pts->entry;
	pt_oaddr_t oa;

	if (pts_feature(pts, PT_FEAT_AMDV1_ENCRYPT_TABLES))
		entry = __sme_clr(entry);
	oa = FIELD_GET(AMDV1PT_FMT_OA, entry);

	if (FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, entry) == AMDV1PT_FMT_NL_SIZE) {
		unsigned int sz_bits = oaffz(oa);

		oa = oalog2_set_mod(oa, 0, sz_bits);
	} else if (PT_WARN_ON(FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, entry) !=
			      AMDV1PT_FMT_NL_DEFAULT))
		return 0;
	return oalog2_mul(oa, PT_GRANULE_LG2SZ);
}
#define pt_entry_oa amdv1pt_entry_oa

static inline bool amdv1pt_can_have_leaf(const struct pt_state *pts)
{
	/*
	 * Table 15: Page Table Level Parameters
	 * The topmost level cannot have translation entries
	 */
	return pts->level < PT_MAX_TOP_LEVEL;
}
#define pt_can_have_leaf amdv1pt_can_have_leaf

/* Body in pt_fmt_defaults.h */
static inline unsigned int pt_table_item_lg2sz(const struct pt_state *pts);

static inline unsigned int
amdv1pt_entry_num_contig_lg2(const struct pt_state *pts)
{
	u32 code;

	if (FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, pts->entry) ==
	    AMDV1PT_FMT_NL_DEFAULT)
		return ilog2(1);

	PT_WARN_ON(FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, pts->entry) !=
		   AMDV1PT_FMT_NL_SIZE);

	/*
	 * The contiguous size is encoded in the length of a string of 1's in
	 * the low bits of the OA. Reverse the encoding:
	 *   code = log2_to_int(num_contig_lg2 + item_lg2sz -
	 *                      PT_GRANULE_LG2SZ - 1) - 1
	 * Which can be expressed as:
	 *   num_contig_lg2 = oalog2_ffz(code) + 1 -
	 *                    item_lg2sz + PT_GRANULE_LG2SZ
	 *
	 * Assume the bit layout is correct and remove the masking. Reorganize
	 * the equation to move all the arithmetic before the ffz.
	 */
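	/*
	 * For illustration, at level 0 (item_lg2sz == PT_GRANULE_LG2SZ) a
	 * 32 KiB run has OA low bits 011b. The shift below places the
	 * always-set high bit of NEXT_LEVEL at bit 0, so code is 0111b and
	 * ffz() returns 3, ie 2^3 contiguous items.
	 */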
	code = pts->entry >> (__bf_shf(AMDV1PT_FMT_OA) - 1 +
			      pt_table_item_lg2sz(pts) - PT_GRANULE_LG2SZ);
	return ffz_t(u32, code);
}
#define pt_entry_num_contig_lg2 amdv1pt_entry_num_contig_lg2

static inline unsigned int amdv1pt_num_items_lg2(const struct pt_state *pts)
{
	/*
	 * Top entry covers bits [63:57] only, this is handled through
	 * max_vasz_lg2.
	 */
	if (PT_WARN_ON(pts->level == 5))
		return 7;
	return PT_TABLEMEM_LG2SZ - ilog2(sizeof(u64));
}
#define pt_num_items_lg2 amdv1pt_num_items_lg2

static inline pt_vaddr_t amdv1pt_possible_sizes(const struct pt_state *pts)
{
	unsigned int isz_lg2 = pt_table_item_lg2sz(pts);

	if (!amdv1pt_can_have_leaf(pts))
		return 0;

	/*
	 * Table 14: Example Page Size Encodings
	 * Address bits 51:32 can be used to encode page sizes greater than 4
	 * Gbytes. Address bits 63:52 are zero-extended.
	 *
	 * 512GB Pages are not supported due to a hardware bug.
	 * Otherwise every power of two size is supported.
	 */
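	/*
	 * e.g. at level 0 (isz_lg2 == 12) this yields every power of two from
	 * 4 KiB to 1 MiB; a run filling a whole table is instead expressed as
	 * a single item at the next level up.
	 */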
	return GENMASK_ULL(min(51, isz_lg2 + amdv1pt_num_items_lg2(pts) - 1),
			   isz_lg2) & ~SZ_512G;
}
#define pt_possible_sizes amdv1pt_possible_sizes

static inline enum pt_entry_type amdv1pt_load_entry_raw(struct pt_state *pts)
{
	const u64 *tablep = pt_cur_table(pts, u64) + pts->index;
	unsigned int next_level;
	u64 entry;

	pts->entry = entry = READ_ONCE(*tablep);
	if (!(entry & AMDV1PT_FMT_PR))
		return PT_ENTRY_EMPTY;

	next_level = FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, pts->entry);
	if (pts->level == 0 || next_level == AMDV1PT_FMT_NL_DEFAULT ||
	    next_level == AMDV1PT_FMT_NL_SIZE)
		return PT_ENTRY_OA;
	return PT_ENTRY_TABLE;
}
#define pt_load_entry_raw amdv1pt_load_entry_raw

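/*
 * Worked example of the size encoding below: installing a 16 KiB leaf at
 * level 0 gives oasz_lg2 = 14, so the OA low bits get
 * log2_to_int(14 - 12 - 1) - 1 = 01b and 4 identical entries are written
 * with NEXT_LEVEL = AMDV1PT_FMT_NL_SIZE.
 */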
static inline void
amdv1pt_install_leaf_entry(struct pt_state *pts, pt_oaddr_t oa,
			   unsigned int oasz_lg2,
			   const struct pt_write_attrs *attrs)
{
	unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
	u64 *tablep = pt_cur_table(pts, u64) + pts->index;
	u64 entry;

	if (!pt_check_install_leaf_args(pts, oa, oasz_lg2))
		return;

	entry = AMDV1PT_FMT_PR |
		FIELD_PREP(AMDV1PT_FMT_OA, log2_div(oa, PT_GRANULE_LG2SZ)) |
		attrs->descriptor_bits;

	if (oasz_lg2 == isz_lg2) {
		entry |= FIELD_PREP(AMDV1PT_FMT_NEXT_LEVEL,
				    AMDV1PT_FMT_NL_DEFAULT);
		WRITE_ONCE(*tablep, entry);
	} else {
		unsigned int num_contig_lg2 = oasz_lg2 - isz_lg2;
		u64 *end = tablep + log2_to_int(num_contig_lg2);

		entry |= FIELD_PREP(AMDV1PT_FMT_NEXT_LEVEL,
				    AMDV1PT_FMT_NL_SIZE) |
			 FIELD_PREP(AMDV1PT_FMT_OA,
				    oalog2_to_int(oasz_lg2 - PT_GRANULE_LG2SZ -
						  1) -
					    1);

		/* See amdv1pt_clear_entries() */
		if (num_contig_lg2 <= ilog2(32)) {
			for (; tablep != end; tablep++)
				WRITE_ONCE(*tablep, entry);
		} else {
			memset64(tablep, entry, log2_to_int(num_contig_lg2));
		}
	}
	pts->entry = entry;
}
#define pt_install_leaf_entry amdv1pt_install_leaf_entry

static inline bool amdv1pt_install_table(struct pt_state *pts,
					 pt_oaddr_t table_pa,
					 const struct pt_write_attrs *attrs)
{
	u64 entry;

	/*
	 * IR and IW are ANDed from the table levels along with the PTE. We
	 * always control permissions from the PTE, so always set IR and IW
	 * for tables.
	 */
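	/*
	 * Note that NEXT_LEVEL here carries the mode number of the table this
	 * entry points to; since mode is level + 1 and the new table sits one
	 * level below this entry, that value is pts->level.
	 */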
	entry = AMDV1PT_FMT_PR |
		FIELD_PREP(AMDV1PT_FMT_NEXT_LEVEL, pts->level) |
		FIELD_PREP(AMDV1PT_FMT_OA,
			   log2_div(table_pa, PT_GRANULE_LG2SZ)) |
		AMDV1PT_FMT_IR | AMDV1PT_FMT_IW;
	if (pts_feature(pts, PT_FEAT_AMDV1_ENCRYPT_TABLES))
		entry = __sme_set(entry);
	return pt_table_install64(pts, entry);
}
#define pt_install_table amdv1pt_install_table

static inline void amdv1pt_attr_from_entry(const struct pt_state *pts,
					   struct pt_write_attrs *attrs)
{
	attrs->descriptor_bits =
		pts->entry & (AMDV1PT_FMT_FC | AMDV1PT_FMT_IR | AMDV1PT_FMT_IW);
}
#define pt_attr_from_entry amdv1pt_attr_from_entry

static inline void amdv1pt_clear_entries(struct pt_state *pts,
					 unsigned int num_contig_lg2)
{
	u64 *tablep = pt_cur_table(pts, u64) + pts->index;
	u64 *end = tablep + log2_to_int(num_contig_lg2);

	/*
	 * gcc generates 'rep stos' for this loop in the io-pgtable code, and
	 * the difference shows in microbenchmarks with larger contiguous page
	 * sizes. 'rep' is slower for small counts, so open code the small
	 * cases and only use memset64() for larger ones.
	 */
	if (num_contig_lg2 <= ilog2(32)) {
		for (; tablep != end; tablep++)
			WRITE_ONCE(*tablep, 0);
	} else {
		memset64(tablep, 0, log2_to_int(num_contig_lg2));
	}
}
#define pt_clear_entries amdv1pt_clear_entries

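/*
 * Hardware may set the Dirty bit in any one of the replicated entries of a
 * contiguous mapping, depending on which address it translated, so the whole
 * run has to be checked.
 */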
static inline bool amdv1pt_entry_is_write_dirty(const struct pt_state *pts)
{
	unsigned int num_contig_lg2 = amdv1pt_entry_num_contig_lg2(pts);
	u64 *tablep = pt_cur_table(pts, u64) +
		      log2_set_mod(pts->index, 0, num_contig_lg2);
	u64 *end = tablep + log2_to_int(num_contig_lg2);

	for (; tablep != end; tablep++)
		if (READ_ONCE(*tablep) & AMDV1PT_FMT_D)
			return true;
	return false;
}
#define pt_entry_is_write_dirty amdv1pt_entry_is_write_dirty

static inline void amdv1pt_entry_make_write_clean(struct pt_state *pts)
{
	unsigned int num_contig_lg2 = amdv1pt_entry_num_contig_lg2(pts);
	u64 *tablep = pt_cur_table(pts, u64) +
		      log2_set_mod(pts->index, 0, num_contig_lg2);
	u64 *end = tablep + log2_to_int(num_contig_lg2);

	for (; tablep != end; tablep++)
		WRITE_ONCE(*tablep, READ_ONCE(*tablep) & ~(u64)AMDV1PT_FMT_D);
}
#define pt_entry_make_write_clean amdv1pt_entry_make_write_clean

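/*
 * Setting Dirty on just this one entry is enough because
 * amdv1pt_entry_is_write_dirty() scans the whole contiguous run. The cmpxchg
 * fails rather than clobber a concurrent update of the entry.
 */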
static inline bool amdv1pt_entry_make_write_dirty(struct pt_state *pts)
{
	u64 *tablep = pt_cur_table(pts, u64) + pts->index;
	u64 new = pts->entry | AMDV1PT_FMT_D;

	return try_cmpxchg64(tablep, &pts->entry, new);
}
#define pt_entry_make_write_dirty amdv1pt_entry_make_write_dirty

/* --- iommu */
#include <linux/generic_pt/iommu.h>
#include <linux/iommu.h>

#define pt_iommu_table pt_iommu_amdv1

/* The common struct is in the per-format common struct */
static inline struct pt_common *common_from_iommu(struct pt_iommu *iommu_table)
{
	return &container_of(iommu_table, struct pt_iommu_amdv1, iommu)
			->amdpt.common;
}

static inline struct pt_iommu *iommu_from_common(struct pt_common *common)
{
	return &container_of(common, struct pt_iommu_amdv1, amdpt.common)->iommu;
}

static inline int amdv1pt_iommu_set_prot(struct pt_common *common,
					 struct pt_write_attrs *attrs,
					 unsigned int iommu_prot)
{
	u64 pte = 0;

	if (pt_feature(common, PT_FEAT_AMDV1_FORCE_COHERENCE))
		pte |= AMDV1PT_FMT_FC;
	if (iommu_prot & IOMMU_READ)
		pte |= AMDV1PT_FMT_IR;
	if (iommu_prot & IOMMU_WRITE)
		pte |= AMDV1PT_FMT_IW;

	/*
	 * Ideally we'd have an IOMMU_ENCRYPTED flag set by higher levels to
	 * control this. For now, if the tables use sme_set() then so do the
	 * PTEs.
	 */
	if (pt_feature(common, PT_FEAT_AMDV1_ENCRYPT_TABLES))
		pte = __sme_set(pte);

	attrs->descriptor_bits = pte;
	return 0;
}
#define pt_iommu_set_prot amdv1pt_iommu_set_prot

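/*
 * For illustration, with PT_FEAT_DYNAMIC_TOP disabled and starting_level = 2
 * (a 3 level, mode 3 table) the fixed VA span below works out to
 * 12 + 9 * (2 + 1) = 39 bits.
 */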
static inline int amdv1pt_iommu_fmt_init(struct pt_iommu_amdv1 *iommu_table,
					 const struct pt_iommu_amdv1_cfg *cfg)
{
	struct pt_amdv1 *table = &iommu_table->amdpt;
	unsigned int max_vasz_lg2 = PT_MAX_VA_ADDRESS_LG2;

	if (cfg->starting_level == 0 || cfg->starting_level > PT_MAX_TOP_LEVEL)
		return -EINVAL;

	if (!pt_feature(&table->common, PT_FEAT_DYNAMIC_TOP) &&
	    cfg->starting_level != PT_MAX_TOP_LEVEL)
		max_vasz_lg2 = PT_GRANULE_LG2SZ +
			       (PT_TABLEMEM_LG2SZ - ilog2(sizeof(u64))) *
				       (cfg->starting_level + 1);

	table->common.max_vasz_lg2 =
		min(max_vasz_lg2, cfg->common.hw_max_vasz_lg2);
	table->common.max_oasz_lg2 =
		min(PT_MAX_OUTPUT_ADDRESS_LG2, cfg->common.hw_max_oasz_lg2);
	pt_top_set_level(&table->common, cfg->starting_level);
	return 0;
}
#define pt_iommu_fmt_init amdv1pt_iommu_fmt_init

#ifndef PT_FMT_VARIANT
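/*
 * The DTE's mode field uses the spec's numbering, one greater than the core
 * code's level (see the note at the top of this file).
 */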
static inline void
amdv1pt_iommu_fmt_hw_info(struct pt_iommu_amdv1 *table,
			  const struct pt_range *top_range,
			  struct pt_iommu_amdv1_hw_info *info)
{
	info->host_pt_root = virt_to_phys(top_range->top_table);
	PT_WARN_ON(info->host_pt_root & ~PT_TOP_PHYS_MASK);
	info->mode = top_range->top_level + 1;
}
#define pt_iommu_fmt_hw_info amdv1pt_iommu_fmt_hw_info
#endif

#if defined(GENERIC_PT_KUNIT)
static const struct pt_iommu_amdv1_cfg amdv1_kunit_fmt_cfgs[] = {
	/* Matches what io_pgtable does */
	[0] = { .starting_level = 2 },
};
#define kunit_fmt_cfgs amdv1_kunit_fmt_cfgs
enum { KUNIT_FMT_FEATURES = 0 };
#endif

#endif