amd64_edac.c (cf79f291f985662150363b4a93d16f88f12643bc) | amd64_edac.c (6c9058f49084569d1d816e87185e0a4f9ab1a321) |
---|---|
1// SPDX-License-Identifier: GPL-2.0-only | 1// SPDX-License-Identifier: GPL-2.0-only |
2#include <linux/ras.h> |
|
2#include "amd64_edac.h" 3#include <asm/amd_nb.h> 4 5static struct edac_pci_ctl_info *pci_ctl; 6 7/* 8 * Set by command line parameter. If BIOS has enabled the ECC, this override is 9 * cleared to prevent re-enabling the hardware by this driver. --- 1036 unchanged lines hidden (view full) --- 1046 /* Nodes below the GPU base node are CPU nodes and don't need a fixup. */ 1047 if (nid < gpu_node_map.base_node_id) 1048 return node_id; 1049 1050 /* Convert the hardware-provided AMD Node ID to a Linux logical one. */ 1051 return nid - gpu_node_map.base_node_id + 1; 1052} 1053 | 3#include "amd64_edac.h" 4#include <asm/amd_nb.h> 5 6static struct edac_pci_ctl_info *pci_ctl; 7 8/* 9 * Set by command line parameter. If BIOS has enabled the ECC, this override is 10 * cleared to prevent re-enabling the hardware by this driver. --- 1036 unchanged lines hidden (view full) --- 1047 /* Nodes below the GPU base node are CPU nodes and don't need a fixup. */ 1048 if (nid < gpu_node_map.base_node_id) 1049 return node_id; 1050 1051 /* Convert the hardware-provided AMD Node ID to a Linux logical one. */ 1052 return nid - gpu_node_map.base_node_id + 1; 1053} 1054 |
1054/* Protect the PCI config register pairs used for DF indirect access. */ 1055static DEFINE_MUTEX(df_indirect_mutex); 1056 1057/* 1058 * Data Fabric Indirect Access uses FICAA/FICAD. 1059 * 1060 * Fabric Indirect Configuration Access Address (FICAA): Constructed based 1061 * on the device's Instance Id and the PCI function and register offset of 1062 * the desired register. 1063 * 1064 * Fabric Indirect Configuration Access Data (FICAD): There are FICAD LO 1065 * and FICAD HI registers but so far we only need the LO register. 1066 * 1067 * Use Instance Id 0xFF to indicate a broadcast read. 1068 */ 1069#define DF_BROADCAST 0xFF 1070static int __df_indirect_read(u16 node, u8 func, u16 reg, u8 instance_id, u32 *lo) 1071{ 1072 struct pci_dev *F4; 1073 u32 ficaa; 1074 int err = -ENODEV; 1075 1076 if (node >= amd_nb_num()) 1077 goto out; 1078 1079 F4 = node_to_amd_nb(node)->link; 1080 if (!F4) 1081 goto out; 1082 1083 ficaa = (instance_id == DF_BROADCAST) ? 0 : 1; 1084 ficaa |= reg & 0x3FC; 1085 ficaa |= (func & 0x7) << 11; 1086 ficaa |= instance_id << 16; 1087 1088 mutex_lock(&df_indirect_mutex); 1089 1090 err = pci_write_config_dword(F4, 0x5C, ficaa); 1091 if (err) { 1092 pr_warn("Error writing DF Indirect FICAA, FICAA=0x%x\n", ficaa); 1093 goto out_unlock; 1094 } 1095 1096 err = pci_read_config_dword(F4, 0x98, lo); 1097 if (err) 1098 pr_warn("Error reading DF Indirect FICAD LO, FICAA=0x%x.\n", ficaa); 1099 1100out_unlock: 1101 mutex_unlock(&df_indirect_mutex); 1102 1103out: 1104 return err; 1105} 1106 1107static int df_indirect_read_instance(u16 node, u8 func, u16 reg, u8 instance_id, u32 *lo) 1108{ 1109 return __df_indirect_read(node, func, reg, instance_id, lo); 1110} 1111 1112static int df_indirect_read_broadcast(u16 node, u8 func, u16 reg, u32 *lo) 1113{ 1114 return __df_indirect_read(node, func, reg, DF_BROADCAST, lo); 1115} 1116 1117struct addr_ctx { 1118 u64 ret_addr; 1119 u32 tmp; 1120 u16 nid; 1121 u8 inst_id; 1122}; 1123 1124static int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr) 1125{ 1126 u64 dram_base_addr, dram_limit_addr, dram_hole_base; 1127 1128 u8 die_id_shift, die_id_mask, socket_id_shift, socket_id_mask; 1129 u8 intlv_num_dies, intlv_num_chan, intlv_num_sockets; 1130 u8 intlv_addr_sel, intlv_addr_bit; 1131 u8 num_intlv_bits, hashed_bit; 1132 u8 lgcy_mmio_hole_en, base = 0; 1133 u8 cs_mask, cs_id = 0; 1134 bool hash_enabled = false; 1135 1136 struct addr_ctx ctx; 1137 1138 memset(&ctx, 0, sizeof(ctx)); 1139 1140 /* Start from the normalized address */ 1141 ctx.ret_addr = norm_addr; 1142 1143 ctx.nid = nid; 1144 ctx.inst_id = umc; 1145 1146 /* Read D18F0x1B4 (DramOffset), check if base 1 is used. */ 1147 if (df_indirect_read_instance(nid, 0, 0x1B4, umc, &ctx.tmp)) 1148 goto out_err; 1149 1150 /* Remove HiAddrOffset from normalized address, if enabled: */ 1151 if (ctx.tmp & BIT(0)) { 1152 u64 hi_addr_offset = (ctx.tmp & GENMASK_ULL(31, 20)) << 8; 1153 1154 if (norm_addr >= hi_addr_offset) { 1155 ctx.ret_addr -= hi_addr_offset; 1156 base = 1; 1157 } 1158 } 1159 1160 /* Read D18F0x110 (DramBaseAddress). */ 1161 if (df_indirect_read_instance(nid, 0, 0x110 + (8 * base), umc, &ctx.tmp)) 1162 goto out_err; 1163 1164 /* Check if address range is valid. */ 1165 if (!(ctx.tmp & BIT(0))) { 1166 pr_err("%s: Invalid DramBaseAddress range: 0x%x.\n", 1167 __func__, ctx.tmp); 1168 goto out_err; 1169 } 1170 1171 lgcy_mmio_hole_en = ctx.tmp & BIT(1); 1172 intlv_num_chan = (ctx.tmp >> 4) & 0xF; 1173 intlv_addr_sel = (ctx.tmp >> 8) & 0x7; 1174 dram_base_addr = (ctx.tmp & GENMASK_ULL(31, 12)) << 16; 1175 1176 /* {0, 1, 2, 3} map to address bits {8, 9, 10, 11} respectively */ 1177 if (intlv_addr_sel > 3) { 1178 pr_err("%s: Invalid interleave address select %d.\n", 1179 __func__, intlv_addr_sel); 1180 goto out_err; 1181 } 1182 1183 /* Read D18F0x114 (DramLimitAddress). */ 1184 if (df_indirect_read_instance(nid, 0, 0x114 + (8 * base), umc, &ctx.tmp)) 1185 goto out_err; 1186 1187 intlv_num_sockets = (ctx.tmp >> 8) & 0x1; 1188 intlv_num_dies = (ctx.tmp >> 10) & 0x3; 1189 dram_limit_addr = ((ctx.tmp & GENMASK_ULL(31, 12)) << 16) | GENMASK_ULL(27, 0); 1190 1191 intlv_addr_bit = intlv_addr_sel + 8; 1192 1193 /* Re-use intlv_num_chan by setting it equal to log2(#channels) */ 1194 switch (intlv_num_chan) { 1195 case 0: intlv_num_chan = 0; break; 1196 case 1: intlv_num_chan = 1; break; 1197 case 3: intlv_num_chan = 2; break; 1198 case 5: intlv_num_chan = 3; break; 1199 case 7: intlv_num_chan = 4; break; 1200 1201 case 8: intlv_num_chan = 1; 1202 hash_enabled = true; 1203 break; 1204 default: 1205 pr_err("%s: Invalid number of interleaved channels %d.\n", 1206 __func__, intlv_num_chan); 1207 goto out_err; 1208 } 1209 1210 num_intlv_bits = intlv_num_chan; 1211 1212 if (intlv_num_dies > 2) { 1213 pr_err("%s: Invalid number of interleaved nodes/dies %d.\n", 1214 __func__, intlv_num_dies); 1215 goto out_err; 1216 } 1217 1218 num_intlv_bits += intlv_num_dies; 1219 1220 /* Add a bit if sockets are interleaved. */ 1221 num_intlv_bits += intlv_num_sockets; 1222 1223 /* Assert num_intlv_bits <= 4 */ 1224 if (num_intlv_bits > 4) { 1225 pr_err("%s: Invalid interleave bits %d.\n", 1226 __func__, num_intlv_bits); 1227 goto out_err; 1228 } 1229 1230 if (num_intlv_bits > 0) { 1231 u64 temp_addr_x, temp_addr_i, temp_addr_y; 1232 u8 die_id_bit, sock_id_bit, cs_fabric_id; 1233 1234 /* 1235 * Read FabricBlockInstanceInformation3_CS[BlockFabricID]. 1236 * This is the fabric id for this coherent slave. Use 1237 * umc/channel# as instance id of the coherent slave 1238 * for FICAA. 1239 */ 1240 if (df_indirect_read_instance(nid, 0, 0x50, umc, &ctx.tmp)) 1241 goto out_err; 1242 1243 cs_fabric_id = (ctx.tmp >> 8) & 0xFF; 1244 die_id_bit = 0; 1245 1246 /* If interleaved over more than 1 channel: */ 1247 if (intlv_num_chan) { 1248 die_id_bit = intlv_num_chan; 1249 cs_mask = (1 << die_id_bit) - 1; 1250 cs_id = cs_fabric_id & cs_mask; 1251 } 1252 1253 sock_id_bit = die_id_bit; 1254 1255 /* Read D18F1x208 (SystemFabricIdMask). */ 1256 if (intlv_num_dies || intlv_num_sockets) 1257 if (df_indirect_read_broadcast(nid, 1, 0x208, &ctx.tmp)) 1258 goto out_err; 1259 1260 /* If interleaved over more than 1 die. */ 1261 if (intlv_num_dies) { 1262 sock_id_bit = die_id_bit + intlv_num_dies; 1263 die_id_shift = (ctx.tmp >> 24) & 0xF; 1264 die_id_mask = (ctx.tmp >> 8) & 0xFF; 1265 1266 cs_id |= ((cs_fabric_id & die_id_mask) >> die_id_shift) << die_id_bit; 1267 } 1268 1269 /* If interleaved over more than 1 socket. */ 1270 if (intlv_num_sockets) { 1271 socket_id_shift = (ctx.tmp >> 28) & 0xF; 1272 socket_id_mask = (ctx.tmp >> 16) & 0xFF; 1273 1274 cs_id |= ((cs_fabric_id & socket_id_mask) >> socket_id_shift) << sock_id_bit; 1275 } 1276 1277 /* 1278 * The pre-interleaved address consists of XXXXXXIIIYYYYY 1279 * where III is the ID for this CS, and XXXXXXYYYYY are the 1280 * address bits from the post-interleaved address. 1281 * "num_intlv_bits" has been calculated to tell us how many "I" 1282 * bits there are. "intlv_addr_bit" tells us how many "Y" bits 1283 * there are (where "I" starts). 1284 */ 1285 temp_addr_y = ctx.ret_addr & GENMASK_ULL(intlv_addr_bit - 1, 0); 1286 temp_addr_i = (cs_id << intlv_addr_bit); 1287 temp_addr_x = (ctx.ret_addr & GENMASK_ULL(63, intlv_addr_bit)) << num_intlv_bits; 1288 ctx.ret_addr = temp_addr_x | temp_addr_i | temp_addr_y; 1289 } 1290 1291 /* Add dram base address */ 1292 ctx.ret_addr += dram_base_addr; 1293 1294 /* If legacy MMIO hole enabled */ 1295 if (lgcy_mmio_hole_en) { 1296 if (df_indirect_read_broadcast(nid, 0, 0x104, &ctx.tmp)) 1297 goto out_err; 1298 1299 dram_hole_base = ctx.tmp & GENMASK(31, 24); 1300 if (ctx.ret_addr >= dram_hole_base) 1301 ctx.ret_addr += (BIT_ULL(32) - dram_hole_base); 1302 } 1303 1304 if (hash_enabled) { 1305 /* Save some parentheses and grab ls-bit at the end. */ 1306 hashed_bit = (ctx.ret_addr >> 12) ^ 1307 (ctx.ret_addr >> 18) ^ 1308 (ctx.ret_addr >> 21) ^ 1309 (ctx.ret_addr >> 30) ^ 1310 cs_id; 1311 1312 hashed_bit &= BIT(0); 1313 1314 if (hashed_bit != ((ctx.ret_addr >> intlv_addr_bit) & BIT(0))) 1315 ctx.ret_addr ^= BIT(intlv_addr_bit); 1316 } 1317 1318 /* Is calculated system address is above DRAM limit address? */ 1319 if (ctx.ret_addr > dram_limit_addr) 1320 goto out_err; 1321 1322 *sys_addr = ctx.ret_addr; 1323 return 0; 1324 1325out_err: 1326 return -EINVAL; 1327} 1328 | |
1329static int get_channel_from_ecc_syndrome(struct mem_ctl_info *, u16); 1330 1331/* 1332 * Determine if the DIMMs have ECC enabled. ECC is enabled ONLY if all the DIMMs 1333 * are ECC capable. 1334 */ 1335static unsigned long dct_determine_edac_cap(struct amd64_pvt *pvt) 1336{ --- 1731 unchanged lines hidden (view full) --- 3068 err->channel = (m->ipid & GENMASK(31, 0)) >> 20; 3069 err->csrow = m->synd & 0x7; 3070} 3071 3072static void decode_umc_error(int node_id, struct mce *m) 3073{ 3074 u8 ecc_type = (m->status >> 45) & 0x3; 3075 struct mem_ctl_info *mci; | 1055static int get_channel_from_ecc_syndrome(struct mem_ctl_info *, u16); 1056 1057/* 1058 * Determine if the DIMMs have ECC enabled. ECC is enabled ONLY if all the DIMMs 1059 * are ECC capable. 1060 */ 1061static unsigned long dct_determine_edac_cap(struct amd64_pvt *pvt) 1062{ --- 1731 unchanged lines hidden (view full) --- 2794 err->channel = (m->ipid & GENMASK(31, 0)) >> 20; 2795 err->csrow = m->synd & 0x7; 2796} 2797 2798static void decode_umc_error(int node_id, struct mce *m) 2799{ 2800 u8 ecc_type = (m->status >> 45) & 0x3; 2801 struct mem_ctl_info *mci; |
2802 unsigned long sys_addr; |
|
3076 struct amd64_pvt *pvt; | 2803 struct amd64_pvt *pvt; |
2804 struct atl_err a_err; |
|
3077 struct err_info err; | 2805 struct err_info err; |
3078 u64 sys_addr; | |
3079 3080 node_id = fixup_node_id(node_id, m); 3081 3082 mci = edac_mc_find(node_id); 3083 if (!mci) 3084 return; 3085 3086 pvt = mci->pvt_info; --- 14 unchanged lines hidden (view full) --- 3101 if (length) 3102 err.syndrome = (m->synd >> 32) & GENMASK(length - 1, 0); 3103 else 3104 err.err_code = ERR_CHANNEL; 3105 } 3106 3107 pvt->ops->get_err_info(m, &err); 3108 | 2806 2807 node_id = fixup_node_id(node_id, m); 2808 2809 mci = edac_mc_find(node_id); 2810 if (!mci) 2811 return; 2812 2813 pvt = mci->pvt_info; --- 14 unchanged lines hidden (view full) --- 2828 if (length) 2829 err.syndrome = (m->synd >> 32) & GENMASK(length - 1, 0); 2830 else 2831 err.err_code = ERR_CHANNEL; 2832 } 2833 2834 pvt->ops->get_err_info(m, &err); 2835 |
3109 if (umc_normaddr_to_sysaddr(m->addr, pvt->mc_node_id, err.channel, &sys_addr)) { | 2836 a_err.addr = m->addr; 2837 a_err.ipid = m->ipid; 2838 a_err.cpu = m->extcpu; 2839 2840 sys_addr = amd_convert_umc_mca_addr_to_sys_addr(&a_err); 2841 if (IS_ERR_VALUE(sys_addr)) { |
3110 err.err_code = ERR_NORM_ADDR; 3111 goto log_error; 3112 } 3113 3114 error_address_to_page_and_offset(sys_addr, &err); 3115 3116log_error: 3117 __log_ecc_error(mci, &err, ecc_type); --- 1385 unchanged lines hidden --- | 2842 err.err_code = ERR_NORM_ADDR; 2843 goto log_error; 2844 } 2845 2846 error_address_to_page_and_offset(sys_addr, &err); 2847 2848log_error: 2849 __log_ecc_error(mci, &err, ecc_type); --- 1385 unchanged lines hidden --- |