amd64_edac.c (cf79f291f985662150363b4a93d16f88f12643bc) amd64_edac.c (6c9058f49084569d1d816e87185e0a4f9ab1a321)
1// SPDX-License-Identifier: GPL-2.0-only
1// SPDX-License-Identifier: GPL-2.0-only
2#include <linux/ras.h>
2#include "amd64_edac.h"
3#include <asm/amd_nb.h>
4
5static struct edac_pci_ctl_info *pci_ctl;
6
7/*
8 * Set by command line parameter. If BIOS has enabled the ECC, this override is
9 * cleared to prevent re-enabling the hardware by this driver.

--- 1036 unchanged lines hidden (view full) ---

1046 /* Nodes below the GPU base node are CPU nodes and don't need a fixup. */
1047 if (nid < gpu_node_map.base_node_id)
1048 return node_id;
1049
1050 /* Convert the hardware-provided AMD Node ID to a Linux logical one. */
1051 return nid - gpu_node_map.base_node_id + 1;
1052}
1053
3#include "amd64_edac.h"
4#include <asm/amd_nb.h>
5
6static struct edac_pci_ctl_info *pci_ctl;
7
8/*
9 * Set by command line parameter. If BIOS has enabled the ECC, this override is
10 * cleared to prevent re-enabling the hardware by this driver.

--- 1036 unchanged lines hidden (view full) ---

1047 /* Nodes below the GPU base node are CPU nodes and don't need a fixup. */
1048 if (nid < gpu_node_map.base_node_id)
1049 return node_id;
1050
1051 /* Convert the hardware-provided AMD Node ID to a Linux logical one. */
1052 return nid - gpu_node_map.base_node_id + 1;
1053}
1054
/*
 * Serializes DF indirect accesses: the FICAA write and the FICAD read that
 * follows it form one transaction on a shared pair of PCI config registers.
 */
static DEFINE_MUTEX(df_indirect_mutex);
1056
1057/*
1058 * Data Fabric Indirect Access uses FICAA/FICAD.
1059 *
1060 * Fabric Indirect Configuration Access Address (FICAA): Constructed based
1061 * on the device's Instance Id and the PCI function and register offset of
1062 * the desired register.
1063 *
1064 * Fabric Indirect Configuration Access Data (FICAD): There are FICAD LO
1065 * and FICAD HI registers but so far we only need the LO register.
1066 *
1067 * Use Instance Id 0xFF to indicate a broadcast read.
1068 */
1069#define DF_BROADCAST 0xFF
1070static int __df_indirect_read(u16 node, u8 func, u16 reg, u8 instance_id, u32 *lo)
1071{
1072 struct pci_dev *F4;
1073 u32 ficaa;
1074 int err = -ENODEV;
1075
1076 if (node >= amd_nb_num())
1077 goto out;
1078
1079 F4 = node_to_amd_nb(node)->link;
1080 if (!F4)
1081 goto out;
1082
1083 ficaa = (instance_id == DF_BROADCAST) ? 0 : 1;
1084 ficaa |= reg & 0x3FC;
1085 ficaa |= (func & 0x7) << 11;
1086 ficaa |= instance_id << 16;
1087
1088 mutex_lock(&df_indirect_mutex);
1089
1090 err = pci_write_config_dword(F4, 0x5C, ficaa);
1091 if (err) {
1092 pr_warn("Error writing DF Indirect FICAA, FICAA=0x%x\n", ficaa);
1093 goto out_unlock;
1094 }
1095
1096 err = pci_read_config_dword(F4, 0x98, lo);
1097 if (err)
1098 pr_warn("Error reading DF Indirect FICAD LO, FICAA=0x%x.\n", ficaa);
1099
1100out_unlock:
1101 mutex_unlock(&df_indirect_mutex);
1102
1103out:
1104 return err;
1105}
1106
1107static int df_indirect_read_instance(u16 node, u8 func, u16 reg, u8 instance_id, u32 *lo)
1108{
1109 return __df_indirect_read(node, func, reg, instance_id, lo);
1110}
1111
1112static int df_indirect_read_broadcast(u16 node, u8 func, u16 reg, u32 *lo)
1113{
1114 return __df_indirect_read(node, func, reg, DF_BROADCAST, lo);
1115}
1116
1117struct addr_ctx {
1118 u64 ret_addr;
1119 u32 tmp;
1120 u16 nid;
1121 u8 inst_id;
1122};
1123
1124static int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr)
1125{
1126 u64 dram_base_addr, dram_limit_addr, dram_hole_base;
1127
1128 u8 die_id_shift, die_id_mask, socket_id_shift, socket_id_mask;
1129 u8 intlv_num_dies, intlv_num_chan, intlv_num_sockets;
1130 u8 intlv_addr_sel, intlv_addr_bit;
1131 u8 num_intlv_bits, hashed_bit;
1132 u8 lgcy_mmio_hole_en, base = 0;
1133 u8 cs_mask, cs_id = 0;
1134 bool hash_enabled = false;
1135
1136 struct addr_ctx ctx;
1137
1138 memset(&ctx, 0, sizeof(ctx));
1139
1140 /* Start from the normalized address */
1141 ctx.ret_addr = norm_addr;
1142
1143 ctx.nid = nid;
1144 ctx.inst_id = umc;
1145
1146 /* Read D18F0x1B4 (DramOffset), check if base 1 is used. */
1147 if (df_indirect_read_instance(nid, 0, 0x1B4, umc, &ctx.tmp))
1148 goto out_err;
1149
1150 /* Remove HiAddrOffset from normalized address, if enabled: */
1151 if (ctx.tmp & BIT(0)) {
1152 u64 hi_addr_offset = (ctx.tmp & GENMASK_ULL(31, 20)) << 8;
1153
1154 if (norm_addr >= hi_addr_offset) {
1155 ctx.ret_addr -= hi_addr_offset;
1156 base = 1;
1157 }
1158 }
1159
1160 /* Read D18F0x110 (DramBaseAddress). */
1161 if (df_indirect_read_instance(nid, 0, 0x110 + (8 * base), umc, &ctx.tmp))
1162 goto out_err;
1163
1164 /* Check if address range is valid. */
1165 if (!(ctx.tmp & BIT(0))) {
1166 pr_err("%s: Invalid DramBaseAddress range: 0x%x.\n",
1167 __func__, ctx.tmp);
1168 goto out_err;
1169 }
1170
1171 lgcy_mmio_hole_en = ctx.tmp & BIT(1);
1172 intlv_num_chan = (ctx.tmp >> 4) & 0xF;
1173 intlv_addr_sel = (ctx.tmp >> 8) & 0x7;
1174 dram_base_addr = (ctx.tmp & GENMASK_ULL(31, 12)) << 16;
1175
1176 /* {0, 1, 2, 3} map to address bits {8, 9, 10, 11} respectively */
1177 if (intlv_addr_sel > 3) {
1178 pr_err("%s: Invalid interleave address select %d.\n",
1179 __func__, intlv_addr_sel);
1180 goto out_err;
1181 }
1182
1183 /* Read D18F0x114 (DramLimitAddress). */
1184 if (df_indirect_read_instance(nid, 0, 0x114 + (8 * base), umc, &ctx.tmp))
1185 goto out_err;
1186
1187 intlv_num_sockets = (ctx.tmp >> 8) & 0x1;
1188 intlv_num_dies = (ctx.tmp >> 10) & 0x3;
1189 dram_limit_addr = ((ctx.tmp & GENMASK_ULL(31, 12)) << 16) | GENMASK_ULL(27, 0);
1190
1191 intlv_addr_bit = intlv_addr_sel + 8;
1192
1193 /* Re-use intlv_num_chan by setting it equal to log2(#channels) */
1194 switch (intlv_num_chan) {
1195 case 0: intlv_num_chan = 0; break;
1196 case 1: intlv_num_chan = 1; break;
1197 case 3: intlv_num_chan = 2; break;
1198 case 5: intlv_num_chan = 3; break;
1199 case 7: intlv_num_chan = 4; break;
1200
1201 case 8: intlv_num_chan = 1;
1202 hash_enabled = true;
1203 break;
1204 default:
1205 pr_err("%s: Invalid number of interleaved channels %d.\n",
1206 __func__, intlv_num_chan);
1207 goto out_err;
1208 }
1209
1210 num_intlv_bits = intlv_num_chan;
1211
1212 if (intlv_num_dies > 2) {
1213 pr_err("%s: Invalid number of interleaved nodes/dies %d.\n",
1214 __func__, intlv_num_dies);
1215 goto out_err;
1216 }
1217
1218 num_intlv_bits += intlv_num_dies;
1219
1220 /* Add a bit if sockets are interleaved. */
1221 num_intlv_bits += intlv_num_sockets;
1222
1223 /* Assert num_intlv_bits <= 4 */
1224 if (num_intlv_bits > 4) {
1225 pr_err("%s: Invalid interleave bits %d.\n",
1226 __func__, num_intlv_bits);
1227 goto out_err;
1228 }
1229
1230 if (num_intlv_bits > 0) {
1231 u64 temp_addr_x, temp_addr_i, temp_addr_y;
1232 u8 die_id_bit, sock_id_bit, cs_fabric_id;
1233
1234 /*
1235 * Read FabricBlockInstanceInformation3_CS[BlockFabricID].
1236 * This is the fabric id for this coherent slave. Use
1237 * umc/channel# as instance id of the coherent slave
1238 * for FICAA.
1239 */
1240 if (df_indirect_read_instance(nid, 0, 0x50, umc, &ctx.tmp))
1241 goto out_err;
1242
1243 cs_fabric_id = (ctx.tmp >> 8) & 0xFF;
1244 die_id_bit = 0;
1245
1246 /* If interleaved over more than 1 channel: */
1247 if (intlv_num_chan) {
1248 die_id_bit = intlv_num_chan;
1249 cs_mask = (1 << die_id_bit) - 1;
1250 cs_id = cs_fabric_id & cs_mask;
1251 }
1252
1253 sock_id_bit = die_id_bit;
1254
1255 /* Read D18F1x208 (SystemFabricIdMask). */
1256 if (intlv_num_dies || intlv_num_sockets)
1257 if (df_indirect_read_broadcast(nid, 1, 0x208, &ctx.tmp))
1258 goto out_err;
1259
1260 /* If interleaved over more than 1 die. */
1261 if (intlv_num_dies) {
1262 sock_id_bit = die_id_bit + intlv_num_dies;
1263 die_id_shift = (ctx.tmp >> 24) & 0xF;
1264 die_id_mask = (ctx.tmp >> 8) & 0xFF;
1265
1266 cs_id |= ((cs_fabric_id & die_id_mask) >> die_id_shift) << die_id_bit;
1267 }
1268
1269 /* If interleaved over more than 1 socket. */
1270 if (intlv_num_sockets) {
1271 socket_id_shift = (ctx.tmp >> 28) & 0xF;
1272 socket_id_mask = (ctx.tmp >> 16) & 0xFF;
1273
1274 cs_id |= ((cs_fabric_id & socket_id_mask) >> socket_id_shift) << sock_id_bit;
1275 }
1276
1277 /*
1278 * The pre-interleaved address consists of XXXXXXIIIYYYYY
1279 * where III is the ID for this CS, and XXXXXXYYYYY are the
1280 * address bits from the post-interleaved address.
1281 * "num_intlv_bits" has been calculated to tell us how many "I"
1282 * bits there are. "intlv_addr_bit" tells us how many "Y" bits
1283 * there are (where "I" starts).
1284 */
1285 temp_addr_y = ctx.ret_addr & GENMASK_ULL(intlv_addr_bit - 1, 0);
1286 temp_addr_i = (cs_id << intlv_addr_bit);
1287 temp_addr_x = (ctx.ret_addr & GENMASK_ULL(63, intlv_addr_bit)) << num_intlv_bits;
1288 ctx.ret_addr = temp_addr_x | temp_addr_i | temp_addr_y;
1289 }
1290
1291 /* Add dram base address */
1292 ctx.ret_addr += dram_base_addr;
1293
1294 /* If legacy MMIO hole enabled */
1295 if (lgcy_mmio_hole_en) {
1296 if (df_indirect_read_broadcast(nid, 0, 0x104, &ctx.tmp))
1297 goto out_err;
1298
1299 dram_hole_base = ctx.tmp & GENMASK(31, 24);
1300 if (ctx.ret_addr >= dram_hole_base)
1301 ctx.ret_addr += (BIT_ULL(32) - dram_hole_base);
1302 }
1303
1304 if (hash_enabled) {
1305 /* Save some parentheses and grab ls-bit at the end. */
1306 hashed_bit = (ctx.ret_addr >> 12) ^
1307 (ctx.ret_addr >> 18) ^
1308 (ctx.ret_addr >> 21) ^
1309 (ctx.ret_addr >> 30) ^
1310 cs_id;
1311
1312 hashed_bit &= BIT(0);
1313
1314 if (hashed_bit != ((ctx.ret_addr >> intlv_addr_bit) & BIT(0)))
1315 ctx.ret_addr ^= BIT(intlv_addr_bit);
1316 }
1317
1318 /* Is calculated system address is above DRAM limit address? */
1319 if (ctx.ret_addr > dram_limit_addr)
1320 goto out_err;
1321
1322 *sys_addr = ctx.ret_addr;
1323 return 0;
1324
1325out_err:
1326 return -EINVAL;
1327}
1328
1329static int get_channel_from_ecc_syndrome(struct mem_ctl_info *, u16);
1330
1331/*
1332 * Determine if the DIMMs have ECC enabled. ECC is enabled ONLY if all the DIMMs
1333 * are ECC capable.
1334 */
1335static unsigned long dct_determine_edac_cap(struct amd64_pvt *pvt)
1336{

--- 1731 unchanged lines hidden (view full) ---

3068 err->channel = (m->ipid & GENMASK(31, 0)) >> 20;
3069 err->csrow = m->synd & 0x7;
3070}
3071
3072static void decode_umc_error(int node_id, struct mce *m)
3073{
3074 u8 ecc_type = (m->status >> 45) & 0x3;
3075 struct mem_ctl_info *mci;
1055static int get_channel_from_ecc_syndrome(struct mem_ctl_info *, u16);
1056
1057/*
1058 * Determine if the DIMMs have ECC enabled. ECC is enabled ONLY if all the DIMMs
1059 * are ECC capable.
1060 */
1061static unsigned long dct_determine_edac_cap(struct amd64_pvt *pvt)
1062{

--- 1731 unchanged lines hidden (view full) ---

2794 err->channel = (m->ipid & GENMASK(31, 0)) >> 20;
2795 err->csrow = m->synd & 0x7;
2796}
2797
2798static void decode_umc_error(int node_id, struct mce *m)
2799{
2800 u8 ecc_type = (m->status >> 45) & 0x3;
2801 struct mem_ctl_info *mci;
2802 unsigned long sys_addr;
3076 struct amd64_pvt *pvt;
2803 struct amd64_pvt *pvt;
2804 struct atl_err a_err;
3077 struct err_info err;
2805 struct err_info err;
3078 u64 sys_addr;
3079
3080 node_id = fixup_node_id(node_id, m);
3081
3082 mci = edac_mc_find(node_id);
3083 if (!mci)
3084 return;
3085
3086 pvt = mci->pvt_info;

--- 14 unchanged lines hidden (view full) ---

3101 if (length)
3102 err.syndrome = (m->synd >> 32) & GENMASK(length - 1, 0);
3103 else
3104 err.err_code = ERR_CHANNEL;
3105 }
3106
3107 pvt->ops->get_err_info(m, &err);
3108
2806
2807 node_id = fixup_node_id(node_id, m);
2808
2809 mci = edac_mc_find(node_id);
2810 if (!mci)
2811 return;
2812
2813 pvt = mci->pvt_info;

--- 14 unchanged lines hidden (view full) ---

2828 if (length)
2829 err.syndrome = (m->synd >> 32) & GENMASK(length - 1, 0);
2830 else
2831 err.err_code = ERR_CHANNEL;
2832 }
2833
2834 pvt->ops->get_err_info(m, &err);
2835
3109 if (umc_normaddr_to_sysaddr(m->addr, pvt->mc_node_id, err.channel, &sys_addr)) {
2836 a_err.addr = m->addr;
2837 a_err.ipid = m->ipid;
2838 a_err.cpu = m->extcpu;
2839
2840 sys_addr = amd_convert_umc_mca_addr_to_sys_addr(&a_err);
2841 if (IS_ERR_VALUE(sys_addr)) {
3110 err.err_code = ERR_NORM_ADDR;
3111 goto log_error;
3112 }
3113
3114 error_address_to_page_and_offset(sys_addr, &err);
3115
3116log_error:
3117 __log_ecc_error(mci, &err, ecc_type);

--- 1385 unchanged lines hidden ---
2842 err.err_code = ERR_NORM_ADDR;
2843 goto log_error;
2844 }
2845
2846 error_address_to_page_and_offset(sys_addr, &err);
2847
2848log_error:
2849 __log_ecc_error(mci, &err, ecc_type);

--- 1385 unchanged lines hidden ---