1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * AMD Address Translation Library 4 * 5 * umc.c : Unified Memory Controller (UMC) topology helpers 6 * 7 * Copyright (c) 2023, Advanced Micro Devices, Inc. 8 * All Rights Reserved. 9 * 10 * Author: Yazen Ghannam <Yazen.Ghannam@amd.com> 11 */ 12 13 #include "internal.h" 14 15 /* 16 * MI300 has a fixed, model-specific mapping between a UMC instance and 17 * its related Data Fabric Coherent Station instance. 18 * 19 * The MCA_IPID_UMC[InstanceId] field holds a unique identifier for the 20 * UMC instance within a Node. Use this to find the appropriate Coherent 21 * Station ID. 22 * 23 * Redundant bits were removed from the map below. 24 */ 25 static const u16 umc_coh_st_map[32] = { 26 0x393, 0x293, 0x193, 0x093, 27 0x392, 0x292, 0x192, 0x092, 28 0x391, 0x291, 0x191, 0x091, 29 0x390, 0x290, 0x190, 0x090, 30 0x793, 0x693, 0x593, 0x493, 31 0x792, 0x692, 0x592, 0x492, 32 0x791, 0x691, 0x591, 0x491, 33 0x790, 0x690, 0x590, 0x490, 34 }; 35 36 #define UMC_ID_MI300 GENMASK(23, 12) 37 static u8 get_coh_st_inst_id_mi300(struct atl_err *err) 38 { 39 u16 umc_id = FIELD_GET(UMC_ID_MI300, err->ipid); 40 u8 i; 41 42 for (i = 0; i < ARRAY_SIZE(umc_coh_st_map); i++) { 43 if (umc_id == umc_coh_st_map[i]) 44 break; 45 } 46 47 WARN_ON_ONCE(i >= ARRAY_SIZE(umc_coh_st_map)); 48 49 return i; 50 } 51 52 /* XOR the bits in @val. */ 53 static u16 bitwise_xor_bits(u16 val) 54 { 55 u16 tmp = 0; 56 u8 i; 57 58 for (i = 0; i < 16; i++) 59 tmp ^= (val >> i) & 0x1; 60 61 return tmp; 62 } 63 64 struct xor_bits { 65 bool xor_enable; 66 u16 col_xor; 67 u32 row_xor; 68 }; 69 70 #define NUM_BANK_BITS 4 71 72 static struct { 73 /* UMC::CH::AddrHashBank */ 74 struct xor_bits bank[NUM_BANK_BITS]; 75 76 /* UMC::CH::AddrHashPC */ 77 struct xor_bits pc; 78 79 /* UMC::CH::AddrHashPC2 */ 80 u8 bank_xor; 81 } addr_hash; 82 83 #define MI300_UMC_CH_BASE 0x90000 84 #define MI300_ADDR_HASH_BANK0 (MI300_UMC_CH_BASE + 0xC8) 85 #define MI300_ADDR_HASH_PC (MI300_UMC_CH_BASE + 0xE0) 86 #define MI300_ADDR_HASH_PC2 (MI300_UMC_CH_BASE + 0xE4) 87 88 #define ADDR_HASH_XOR_EN BIT(0) 89 #define ADDR_HASH_COL_XOR GENMASK(13, 1) 90 #define ADDR_HASH_ROW_XOR GENMASK(31, 14) 91 #define ADDR_HASH_BANK_XOR GENMASK(5, 0) 92 93 /* 94 * Read UMC::CH::AddrHash{Bank,PC,PC2} registers to get XOR bits used 95 * for hashing. Do this during module init, since the values will not 96 * change during run time. 97 * 98 * These registers are instantiated for each UMC across each AMD Node. 99 * However, they should be identically programmed due to the fixed hardware 100 * design of MI300 systems. So read the values from Node 0 UMC 0 and keep a 101 * single global structure for simplicity. 102 */ 103 int get_addr_hash_mi300(void) 104 { 105 u32 temp; 106 int ret; 107 u8 i; 108 109 for (i = 0; i < NUM_BANK_BITS; i++) { 110 ret = amd_smn_read(0, MI300_ADDR_HASH_BANK0 + (i * 4), &temp); 111 if (ret) 112 return ret; 113 114 addr_hash.bank[i].xor_enable = FIELD_GET(ADDR_HASH_XOR_EN, temp); 115 addr_hash.bank[i].col_xor = FIELD_GET(ADDR_HASH_COL_XOR, temp); 116 addr_hash.bank[i].row_xor = FIELD_GET(ADDR_HASH_ROW_XOR, temp); 117 } 118 119 ret = amd_smn_read(0, MI300_ADDR_HASH_PC, &temp); 120 if (ret) 121 return ret; 122 123 addr_hash.pc.xor_enable = FIELD_GET(ADDR_HASH_XOR_EN, temp); 124 addr_hash.pc.col_xor = FIELD_GET(ADDR_HASH_COL_XOR, temp); 125 addr_hash.pc.row_xor = FIELD_GET(ADDR_HASH_ROW_XOR, temp); 126 127 ret = amd_smn_read(0, MI300_ADDR_HASH_PC2, &temp); 128 if (ret) 129 return ret; 130 131 addr_hash.bank_xor = FIELD_GET(ADDR_HASH_BANK_XOR, temp); 132 133 return 0; 134 } 135 136 /* 137 * MI300 systems report a DRAM address in MCA_ADDR for DRAM ECC errors. This must 138 * be converted to the intermediate normalized address (NA) before translating to a 139 * system physical address. 140 * 141 * The DRAM address includes bank, row, and column. Also included are bits for 142 * pseudochannel (PC) and stack ID (SID). 143 * 144 * Abbreviations: (S)tack ID, (P)seudochannel, (R)ow, (B)ank, (C)olumn, (Z)ero 145 * 146 * The MCA address format is as follows: 147 * MCA_ADDR[27:0] = {S[1:0], P[0], R[14:0], B[3:0], C[4:0], Z[0]} 148 * 149 * The normalized address format is fixed in hardware and is as follows: 150 * NA[30:0] = {S[1:0], R[13:0], C4, B[1:0], B[3:2], C[3:2], P, C[1:0], Z[4:0]} 151 * 152 * Additionally, the PC and Bank bits may be hashed. This must be accounted for before 153 * reconstructing the normalized address. 154 */ 155 #define MI300_UMC_MCA_COL GENMASK(5, 1) 156 #define MI300_UMC_MCA_BANK GENMASK(9, 6) 157 #define MI300_UMC_MCA_ROW GENMASK(24, 10) 158 #define MI300_UMC_MCA_PC BIT(25) 159 #define MI300_UMC_MCA_SID GENMASK(27, 26) 160 161 #define MI300_NA_COL_1_0 GENMASK(6, 5) 162 #define MI300_NA_PC BIT(7) 163 #define MI300_NA_COL_3_2 GENMASK(9, 8) 164 #define MI300_NA_BANK_3_2 GENMASK(11, 10) 165 #define MI300_NA_BANK_1_0 GENMASK(13, 12) 166 #define MI300_NA_COL_4 BIT(14) 167 #define MI300_NA_ROW GENMASK(28, 15) 168 #define MI300_NA_SID GENMASK(30, 29) 169 170 static unsigned long convert_dram_to_norm_addr_mi300(unsigned long addr) 171 { 172 u16 i, col, row, bank, pc, sid, temp; 173 174 col = FIELD_GET(MI300_UMC_MCA_COL, addr); 175 bank = FIELD_GET(MI300_UMC_MCA_BANK, addr); 176 row = FIELD_GET(MI300_UMC_MCA_ROW, addr); 177 pc = FIELD_GET(MI300_UMC_MCA_PC, addr); 178 sid = FIELD_GET(MI300_UMC_MCA_SID, addr); 179 180 /* Calculate hash for each Bank bit. */ 181 for (i = 0; i < NUM_BANK_BITS; i++) { 182 if (!addr_hash.bank[i].xor_enable) 183 continue; 184 185 temp = bitwise_xor_bits(col & addr_hash.bank[i].col_xor); 186 temp ^= bitwise_xor_bits(row & addr_hash.bank[i].row_xor); 187 bank ^= temp << i; 188 } 189 190 /* Calculate hash for PC bit. */ 191 if (addr_hash.pc.xor_enable) { 192 /* Bits SID[1:0] act as Bank[6:5] for PC hash, so apply them here. */ 193 bank |= sid << 5; 194 195 temp = bitwise_xor_bits(col & addr_hash.pc.col_xor); 196 temp ^= bitwise_xor_bits(row & addr_hash.pc.row_xor); 197 temp ^= bitwise_xor_bits(bank & addr_hash.bank_xor); 198 pc ^= temp; 199 200 /* Drop SID bits for the sake of debug printing later. */ 201 bank &= 0x1F; 202 } 203 204 /* Reconstruct the normalized address starting with NA[4:0] = 0 */ 205 addr = 0; 206 207 /* NA[6:5] = Column[1:0] */ 208 temp = col & 0x3; 209 addr |= FIELD_PREP(MI300_NA_COL_1_0, temp); 210 211 /* NA[7] = PC */ 212 addr |= FIELD_PREP(MI300_NA_PC, pc); 213 214 /* NA[9:8] = Column[3:2] */ 215 temp = (col >> 2) & 0x3; 216 addr |= FIELD_PREP(MI300_NA_COL_3_2, temp); 217 218 /* NA[11:10] = Bank[3:2] */ 219 temp = (bank >> 2) & 0x3; 220 addr |= FIELD_PREP(MI300_NA_BANK_3_2, temp); 221 222 /* NA[13:12] = Bank[1:0] */ 223 temp = bank & 0x3; 224 addr |= FIELD_PREP(MI300_NA_BANK_1_0, temp); 225 226 /* NA[14] = Column[4] */ 227 temp = (col >> 4) & 0x1; 228 addr |= FIELD_PREP(MI300_NA_COL_4, temp); 229 230 /* NA[28:15] = Row[13:0] */ 231 addr |= FIELD_PREP(MI300_NA_ROW, row); 232 233 /* NA[30:29] = SID[1:0] */ 234 addr |= FIELD_PREP(MI300_NA_SID, sid); 235 236 pr_debug("Addr=0x%016lx", addr); 237 pr_debug("Bank=%u Row=%u Column=%u PC=%u SID=%u", bank, row, col, pc, sid); 238 239 return addr; 240 } 241 242 /* 243 * When a DRAM ECC error occurs on MI300 systems, it is recommended to retire 244 * all memory within that DRAM row. This applies to the memory with a DRAM 245 * bank. 246 * 247 * To find the memory addresses, loop through permutations of the DRAM column 248 * bits and find the System Physical address of each. The column bits are used 249 * to calculate the intermediate Normalized address, so all permutations should 250 * be checked. 251 * 252 * See amd_atl::convert_dram_to_norm_addr_mi300() for MI300 address formats. 253 */ 254 #define MI300_NUM_COL BIT(HWEIGHT(MI300_UMC_MCA_COL)) 255 static void retire_row_mi300(struct atl_err *a_err) 256 { 257 unsigned long addr; 258 struct page *p; 259 u8 col; 260 261 for (col = 0; col < MI300_NUM_COL; col++) { 262 a_err->addr &= ~MI300_UMC_MCA_COL; 263 a_err->addr |= FIELD_PREP(MI300_UMC_MCA_COL, col); 264 265 addr = amd_convert_umc_mca_addr_to_sys_addr(a_err); 266 if (IS_ERR_VALUE(addr)) 267 continue; 268 269 addr = PHYS_PFN(addr); 270 271 /* 272 * Skip invalid or already poisoned pages to avoid unnecessary 273 * error messages from memory_failure(). 274 */ 275 p = pfn_to_online_page(addr); 276 if (!p) 277 continue; 278 279 if (PageHWPoison(p)) 280 continue; 281 282 memory_failure(addr, 0); 283 } 284 } 285 286 void amd_retire_dram_row(struct atl_err *a_err) 287 { 288 if (df_cfg.rev == DF4p5 && df_cfg.flags.heterogeneous) 289 return retire_row_mi300(a_err); 290 } 291 EXPORT_SYMBOL_GPL(amd_retire_dram_row); 292 293 static unsigned long get_addr(unsigned long addr) 294 { 295 if (df_cfg.rev == DF4p5 && df_cfg.flags.heterogeneous) 296 return convert_dram_to_norm_addr_mi300(addr); 297 298 return addr; 299 } 300 301 #define MCA_IPID_INST_ID_HI GENMASK_ULL(47, 44) 302 static u8 get_die_id(struct atl_err *err) 303 { 304 /* 305 * AMD Node ID is provided in MCA_IPID[InstanceIdHi], and this 306 * needs to be divided by 4 to get the internal Die ID. 307 */ 308 if (df_cfg.rev == DF4p5 && df_cfg.flags.heterogeneous) { 309 u8 node_id = FIELD_GET(MCA_IPID_INST_ID_HI, err->ipid); 310 311 return node_id >> 2; 312 } 313 314 /* 315 * For CPUs, this is the AMD Node ID modulo the number 316 * of AMD Nodes per socket. 317 */ 318 return topology_amd_node_id(err->cpu) % topology_amd_nodes_per_pkg(); 319 } 320 321 #define UMC_CHANNEL_NUM GENMASK(31, 20) 322 static u8 get_coh_st_inst_id(struct atl_err *err) 323 { 324 if (df_cfg.rev == DF4p5 && df_cfg.flags.heterogeneous) 325 return get_coh_st_inst_id_mi300(err); 326 327 return FIELD_GET(UMC_CHANNEL_NUM, err->ipid); 328 } 329 330 unsigned long convert_umc_mca_addr_to_sys_addr(struct atl_err *err) 331 { 332 u8 socket_id = topology_physical_package_id(err->cpu); 333 u8 coh_st_inst_id = get_coh_st_inst_id(err); 334 unsigned long addr = get_addr(err->addr); 335 u8 die_id = get_die_id(err); 336 337 pr_debug("socket_id=0x%x die_id=0x%x coh_st_inst_id=0x%x addr=0x%016lx", 338 socket_id, die_id, coh_st_inst_id, addr); 339 340 return norm_to_sys_addr(socket_id, die_id, coh_st_inst_id, addr); 341 } 342