/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
 *
 * AMD IOMMU v1 page table
 *
 * This is described in Section "2.2.3 I/O Page Tables for Host Translations"
 * of the "AMD I/O Virtualization Technology (IOMMU) Specification"
 *
 * Note the level numbering here matches the core code, so level 0 is the same
 * as mode 1.
 *
 */
#ifndef __GENERIC_PT_FMT_AMDV1_H
#define __GENERIC_PT_FMT_AMDV1_H

#include "defs_amdv1.h"
#include "../pt_defs.h"

#include <asm/page.h>
#include <linux/bitfield.h>
#include <linux/container_of.h>
#include <linux/mem_encrypt.h>
#include <linux/minmax.h>
#include <linux/sizes.h>
#include <linux/string.h>

enum {
	PT_ITEM_WORD_SIZE = sizeof(u64),
	/*
	 * The IOMMUFD selftest uses the AMDv1 format with some alterations. It
	 * uses a 2k page size to test cases where the CPU page size is not the
	 * same.
	 */
#ifdef AMDV1_IOMMUFD_SELFTEST
	PT_MAX_VA_ADDRESS_LG2 = 56,
	PT_MAX_OUTPUT_ADDRESS_LG2 = 51,
	PT_MAX_TOP_LEVEL = 4,
	PT_GRANULE_LG2SZ = 11,
#else
	PT_MAX_VA_ADDRESS_LG2 = 64,
	PT_MAX_OUTPUT_ADDRESS_LG2 = 52,
	PT_MAX_TOP_LEVEL = 5,
	PT_GRANULE_LG2SZ = 12,
#endif
	PT_TABLEMEM_LG2SZ = 12,

	/* The DTE only has these bits for the top physical address */
	PT_TOP_PHYS_MASK = GENMASK_ULL(51, 12),
};

/* PTE bits */
enum {
	AMDV1PT_FMT_PR = BIT(0),
	AMDV1PT_FMT_D = BIT(6),
	AMDV1PT_FMT_NEXT_LEVEL = GENMASK_ULL(11, 9),
	AMDV1PT_FMT_OA = GENMASK_ULL(51, 12),
	AMDV1PT_FMT_FC = BIT_ULL(60),
	AMDV1PT_FMT_IR = BIT_ULL(61),
	AMDV1PT_FMT_IW = BIT_ULL(62),
};

/*
 * gcc 13 has a bug where it thinks the output of FIELD_GET() is an enum, make
 * these defines to avoid it.
 */
#define AMDV1PT_FMT_NL_DEFAULT 0
#define AMDV1PT_FMT_NL_SIZE 7

/*
 * Returns the physical address of the lower table pointed to by a table
 * entry. When PT_FEAT_AMDV1_ENCRYPT_TABLES is on, table pointers carry the
 * SME encryption bit, which must be stripped before the OA field is decoded.
 */
static inline pt_oaddr_t amdv1pt_table_pa(const struct pt_state *pts)
{
	u64 entry = pts->entry;

	if (pts_feature(pts, PT_FEAT_AMDV1_ENCRYPT_TABLES))
		entry = __sme_clr(entry);
	return oalog2_mul(FIELD_GET(AMDV1PT_FMT_OA, entry), PT_GRANULE_LG2SZ);
}
#define pt_table_pa amdv1pt_table_pa

/* Returns the oa for the start of the contiguous entry */
static inline pt_oaddr_t amdv1pt_entry_oa(const struct pt_state *pts)
{
	u64 entry = pts->entry;
	pt_oaddr_t oa;

	if (pts_feature(pts, PT_FEAT_AMDV1_ENCRYPT_TABLES))
		entry = __sme_clr(entry);
	oa = FIELD_GET(AMDV1PT_FMT_OA, entry);

	/*
	 * NL_SIZE entries encode the page size as a run of 1's in the low
	 * bits of the OA; zero those bits to recover the aligned start.
	 */
	if (FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, entry) == AMDV1PT_FMT_NL_SIZE) {
		unsigned int sz_bits = oaffz(oa);

		oa = oalog2_set_mod(oa, 0, sz_bits);
	} else if (PT_WARN_ON(FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, entry) !=
			      AMDV1PT_FMT_NL_DEFAULT))
		return 0;
	return oalog2_mul(oa, PT_GRANULE_LG2SZ);
}
#define pt_entry_oa amdv1pt_entry_oa

static inline bool amdv1pt_can_have_leaf(const struct pt_state *pts)
{
	/*
	 * Table 15: Page Table Level Parameters
	 * The top most level cannot have translation entries
	 */
	return pts->level < PT_MAX_TOP_LEVEL;
}
#define pt_can_have_leaf amdv1pt_can_have_leaf

/* Body in pt_fmt_defaults.h */
static inline unsigned int pt_table_item_lg2sz(const struct pt_state *pts);

/*
 * Returns log2 of the number of items this contiguous (NL_SIZE) entry spans,
 * or ilog2(1) for a normal single-item entry.
 */
static inline unsigned int
amdv1pt_entry_num_contig_lg2(const struct pt_state *pts)
{
	u32 code;

	if (FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, pts->entry) ==
	    AMDV1PT_FMT_NL_DEFAULT)
		return ilog2(1);

	PT_WARN_ON(FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, pts->entry) !=
		   AMDV1PT_FMT_NL_SIZE);

	/*
	 * The contiguous size is encoded in the length of a string of 1's in
	 * the low bits of the OA. Reverse the equation:
	 *   code = log2_to_int(num_contig_lg2 + item_lg2sz -
	 *                      PT_GRANULE_LG2SZ - 1) - 1
	 * Which can be expressed as:
	 *   num_contig_lg2 = oalog2_ffz(code) + 1 -
	 *                    item_lg2sz - PT_GRANULE_LG2SZ
	 *
	 * Assume the bit layout is correct and remove the masking. Reorganize
	 * the equation to move all the arithmetic before the ffz.
	 */
	code = pts->entry >> (__bf_shf(AMDV1PT_FMT_OA) - 1 +
			      pt_table_item_lg2sz(pts) - PT_GRANULE_LG2SZ);
	return ffz_t(u32, code);
}
#define pt_entry_num_contig_lg2 amdv1pt_entry_num_contig_lg2

/* Returns log2 of the number of items in a table at this level */
static inline unsigned int amdv1pt_num_items_lg2(const struct pt_state *pts)
{
	/*
	 * Top entry covers bits [63:57] only, this is handled through
	 * max_vasz_lg2.
	 */
	if (PT_WARN_ON(pts->level == 5))
		return 7;
	return PT_TABLEMEM_LG2SZ - ilog2(sizeof(u64));
}
#define pt_num_items_lg2 amdv1pt_num_items_lg2

/*
 * Returns a bitmask of the leaf page sizes (as lg2 bit positions) that can be
 * installed at this level; 0 if the level cannot hold translation entries.
 */
static inline pt_vaddr_t amdv1pt_possible_sizes(const struct pt_state *pts)
{
	unsigned int isz_lg2 = pt_table_item_lg2sz(pts);

	if (!amdv1pt_can_have_leaf(pts))
		return 0;

	/*
	 * Table 14: Example Page Size Encodings
	 * Address bits 51:32 can be used to encode page sizes greater than 4
	 * Gbytes. Address bits 63:52 are zero-extended.
	 *
	 * 512GB Pages are not supported due to a hardware bug.
	 * Otherwise every power of two size is supported.
	 */
	return GENMASK_ULL(min(51, isz_lg2 + amdv1pt_num_items_lg2(pts) - 1),
			   isz_lg2) &
	       ~(pt_vaddr_t)SZ_512G;
}
#define pt_possible_sizes amdv1pt_possible_sizes

/* Reads the entry at pts->index into pts->entry and classifies it */
static inline enum pt_entry_type amdv1pt_load_entry_raw(struct pt_state *pts)
{
	const u64 *tablep = pt_cur_table(pts, u64) + pts->index;
	unsigned int next_level;
	u64 entry;

	pts->entry = entry = READ_ONCE(*tablep);
	if (!(entry & AMDV1PT_FMT_PR))
		return PT_ENTRY_EMPTY;

	/* Level 0 entries are always OAs; NL_DEFAULT/NL_SIZE mark leaves */
	next_level = FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, pts->entry);
	if (pts->level == 0 || next_level == AMDV1PT_FMT_NL_DEFAULT ||
	    next_level == AMDV1PT_FMT_NL_SIZE)
		return PT_ENTRY_OA;
	return PT_ENTRY_TABLE;
}
#define pt_load_entry_raw amdv1pt_load_entry_raw

/*
 * Writes a leaf entry for output address oa of size 2^oasz_lg2 bytes. An
 * entry larger than the item size is replicated across all the items it
 * covers, using the NL_SIZE run-of-1's size encoding in the OA field.
 */
static inline void
amdv1pt_install_leaf_entry(struct pt_state *pts, pt_oaddr_t oa,
			   unsigned int oasz_lg2,
			   const struct pt_write_attrs *attrs)
{
	unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
	u64 *tablep = pt_cur_table(pts, u64) + pts->index;
	u64 entry;

	if (!pt_check_install_leaf_args(pts, oa, oasz_lg2))
		return;

	entry = AMDV1PT_FMT_PR |
		FIELD_PREP(AMDV1PT_FMT_OA, log2_div(oa, PT_GRANULE_LG2SZ)) |
		attrs->descriptor_bits;

	if (oasz_lg2 == isz_lg2) {
		entry |= FIELD_PREP(AMDV1PT_FMT_NEXT_LEVEL,
				    AMDV1PT_FMT_NL_DEFAULT);
		WRITE_ONCE(*tablep, entry);
	} else {
		unsigned int num_contig_lg2 = oasz_lg2 - isz_lg2;
		u64 *end = tablep + log2_to_int(num_contig_lg2);

		/* Size encoding: (oasz_lg2 - PT_GRANULE_LG2SZ - 1) 1's */
		entry |= FIELD_PREP(AMDV1PT_FMT_NEXT_LEVEL,
				    AMDV1PT_FMT_NL_SIZE) |
			 FIELD_PREP(AMDV1PT_FMT_OA,
				    oalog2_to_int(oasz_lg2 - PT_GRANULE_LG2SZ -
						  1) -
					    1);

		/* See amdv1pt_clear_entries() */
		if (num_contig_lg2 <= ilog2(32)) {
			for (; tablep != end; tablep++)
				WRITE_ONCE(*tablep, entry);
		} else {
			memset64(tablep, entry, log2_to_int(num_contig_lg2));
		}
	}
	pts->entry = entry;
}
#define pt_install_leaf_entry amdv1pt_install_leaf_entry

/*
 * Atomically installs a next-level table pointer at pts->index. Returns false
 * if another thread raced and installed an entry first.
 */
static inline bool amdv1pt_install_table(struct pt_state *pts,
					 pt_oaddr_t table_pa,
					 const struct pt_write_attrs *attrs)
{
	u64 entry;

	/*
	 * IR and IW are ANDed from the table levels along with the PTE. We
	 * always control permissions from the PTE, so always set IR and IW for
	 * tables.
	 */
	entry = AMDV1PT_FMT_PR |
		FIELD_PREP(AMDV1PT_FMT_NEXT_LEVEL, pts->level) |
		FIELD_PREP(AMDV1PT_FMT_OA,
			   log2_div(table_pa, PT_GRANULE_LG2SZ)) |
		AMDV1PT_FMT_IR | AMDV1PT_FMT_IW;
	if (pts_feature(pts, PT_FEAT_AMDV1_ENCRYPT_TABLES))
		entry = __sme_set(entry);
	return pt_table_install64(pts, entry);
}
#define pt_install_table amdv1pt_install_table

/* Recovers the writable attribute bits (FC/IR/IW) from an existing entry */
static inline void amdv1pt_attr_from_entry(const struct pt_state *pts,
					   struct pt_write_attrs *attrs)
{
	attrs->descriptor_bits =
		pts->entry & (AMDV1PT_FMT_FC | AMDV1PT_FMT_IR | AMDV1PT_FMT_IW);
}
#define pt_attr_from_entry amdv1pt_attr_from_entry

/* Zeroes 2^num_contig_lg2 items starting at pts->index */
static inline void amdv1pt_clear_entries(struct pt_state *pts,
					 unsigned int num_contig_lg2)
{
	u64 *tablep = pt_cur_table(pts, u64) + pts->index;
	u64 *end = tablep + log2_to_int(num_contig_lg2);

	/*
	 * gcc generates rep stos for the io-pgtable code, and this difference
	 * can show in microbenchmarks with larger contiguous page sizes.
	 * rep is slower for small cases.
	 */
	if (num_contig_lg2 <= ilog2(32)) {
		for (; tablep != end; tablep++)
			WRITE_ONCE(*tablep, 0);
	} else {
		memset64(tablep, 0, log2_to_int(num_contig_lg2));
	}
}
#define pt_clear_entries amdv1pt_clear_entries

/*
 * Returns true if any item of the (possibly contiguous) entry has the Dirty
 * bit set. Scans from the entry's first item, aligned via log2_set_mod().
 */
static inline bool amdv1pt_entry_is_write_dirty(const struct pt_state *pts)
{
	unsigned int num_contig_lg2 = amdv1pt_entry_num_contig_lg2(pts);
	u64 *tablep = pt_cur_table(pts, u64) +
		      log2_set_mod(pts->index, 0, num_contig_lg2);
	u64 *end = tablep + log2_to_int(num_contig_lg2);

	for (; tablep != end; tablep++)
		if (READ_ONCE(*tablep) & AMDV1PT_FMT_D)
			return true;
	return false;
}
#define pt_entry_is_write_dirty amdv1pt_entry_is_write_dirty

/* Clears the Dirty bit on every item of the (possibly contiguous) entry */
static inline void amdv1pt_entry_make_write_clean(struct pt_state *pts)
{
	unsigned int num_contig_lg2 = amdv1pt_entry_num_contig_lg2(pts);
	u64 *tablep = pt_cur_table(pts, u64) +
		      log2_set_mod(pts->index, 0, num_contig_lg2);
	u64 *end = tablep + log2_to_int(num_contig_lg2);

	for (; tablep != end; tablep++)
		WRITE_ONCE(*tablep, READ_ONCE(*tablep) & ~(u64)AMDV1PT_FMT_D);
}
#define pt_entry_make_write_clean amdv1pt_entry_make_write_clean

/*
 * Atomically sets the Dirty bit on the item at pts->index. Returns false if
 * the entry changed under us (pts->entry is refreshed by try_cmpxchg64).
 */
static inline bool amdv1pt_entry_make_write_dirty(struct pt_state *pts)
{
	u64 *tablep = pt_cur_table(pts, u64) + pts->index;
	u64 new = pts->entry | AMDV1PT_FMT_D;

	return try_cmpxchg64(tablep, &pts->entry, new);
}
#define pt_entry_make_write_dirty amdv1pt_entry_make_write_dirty

/* --- iommu */
#include <linux/generic_pt/iommu.h>
#include <linux/iommu.h>

#define pt_iommu_table pt_iommu_amdv1

/* The common struct is in the per-format common struct */
static inline struct pt_common *common_from_iommu(struct pt_iommu *iommu_table)
{
	return &container_of(iommu_table, struct pt_iommu_amdv1, iommu)
			->amdpt.common;
}

static inline struct pt_iommu *iommu_from_common(struct pt_common *common)
{
	return &container_of(common, struct pt_iommu_amdv1, amdpt.common)->iommu;
}

/* Translates IOMMU_READ/IOMMU_WRITE prot flags into PTE descriptor bits */
static inline int amdv1pt_iommu_set_prot(struct pt_common *common,
					 struct pt_write_attrs *attrs,
					 unsigned int iommu_prot)
{
	u64 pte = 0;

	if (pt_feature(common, PT_FEAT_AMDV1_FORCE_COHERENCE))
		pte |= AMDV1PT_FMT_FC;
	if (iommu_prot & IOMMU_READ)
		pte |= AMDV1PT_FMT_IR;
	if (iommu_prot & IOMMU_WRITE)
		pte |= AMDV1PT_FMT_IW;

	/*
	 * Ideally we'd have an IOMMU_ENCRYPTED flag set by higher levels to
	 * control this. For now if the tables use sme_set then so do the ptes.
	 */
	if (pt_feature(common, PT_FEAT_AMDV1_ENCRYPT_TABLES))
		pte = __sme_set(pte);

	attrs->descriptor_bits = pte;
	return 0;
}
#define pt_iommu_set_prot amdv1pt_iommu_set_prot

/*
 * Validates the requested starting level and computes the VA/OA size limits
 * for the new table. Without PT_FEAT_DYNAMIC_TOP a non-maximal starting
 * level caps max_vasz_lg2 at what that fixed top level can address.
 */
static inline int amdv1pt_iommu_fmt_init(struct pt_iommu_amdv1 *iommu_table,
					 const struct pt_iommu_amdv1_cfg *cfg)
{
	struct pt_amdv1 *table = &iommu_table->amdpt;
	unsigned int max_vasz_lg2 = PT_MAX_VA_ADDRESS_LG2;

	if (cfg->starting_level == 0 || cfg->starting_level > PT_MAX_TOP_LEVEL)
		return -EINVAL;

	if (!pt_feature(&table->common, PT_FEAT_DYNAMIC_TOP) &&
	    cfg->starting_level != PT_MAX_TOP_LEVEL)
		max_vasz_lg2 = PT_GRANULE_LG2SZ +
			       (PT_TABLEMEM_LG2SZ - ilog2(sizeof(u64))) *
				       (cfg->starting_level + 1);

	table->common.max_vasz_lg2 =
		min(max_vasz_lg2, cfg->common.hw_max_vasz_lg2);
	table->common.max_oasz_lg2 =
		min(PT_MAX_OUTPUT_ADDRESS_LG2, cfg->common.hw_max_oasz_lg2);
	pt_top_set_level(&table->common, cfg->starting_level);
	return 0;
}
#define pt_iommu_fmt_init amdv1pt_iommu_fmt_init

#ifndef PT_FMT_VARIANT
/*
 * Reports the table root physical address and DTE mode (level + 1, per the
 * level/mode note in the file header) for programming the hardware.
 */
static inline void
amdv1pt_iommu_fmt_hw_info(struct pt_iommu_amdv1 *table,
			  const struct pt_range *top_range,
			  struct pt_iommu_amdv1_hw_info *info)
{
	info->host_pt_root = virt_to_phys(top_range->top_table);
	PT_WARN_ON(info->host_pt_root & ~PT_TOP_PHYS_MASK);
	info->mode = top_range->top_level + 1;
}
#define pt_iommu_fmt_hw_info amdv1pt_iommu_fmt_hw_info
#endif

#if defined(GENERIC_PT_KUNIT)
static const struct pt_iommu_amdv1_cfg amdv1_kunit_fmt_cfgs[] = {
	/* Matches what io_pgtable does */
	[0] = { .starting_level = 2 },
};
#define kunit_fmt_cfgs amdv1_kunit_fmt_cfgs
enum { KUNIT_FMT_FEATURES = 0 };
#endif

#endif