xref: /linux/drivers/ras/amd/atl/umc.c (revision eb01fe7abbe2d0b38824d2a93fdb4cc3eaf2ccc1)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * AMD Address Translation Library
4  *
5  * umc.c : Unified Memory Controller (UMC) topology helpers
6  *
7  * Copyright (c) 2023, Advanced Micro Devices, Inc.
8  * All Rights Reserved.
9  *
10  * Author: Yazen Ghannam <Yazen.Ghannam@amd.com>
11  */
12 
13 #include "internal.h"
14 
/*
 * MI300 UMC-to-Coherent-Station lookup table.
 *
 * On MI300 the relationship between a UMC instance and its Data Fabric
 * Coherent Station instance is fixed and model-specific. Each entry holds
 * the identifier found in MCA_IPID_UMC[InstanceId], which is unique for a
 * UMC instance within a Node, with the redundant bits removed. The position
 * of a matching entry in this table is what the lookup code in this file
 * reports as the Coherent Station instance ID.
 */
static const u16 umc_coh_st_map[32] = {
	0x393, 0x293, 0x193, 0x093, 0x392, 0x292, 0x192, 0x092,
	0x391, 0x291, 0x191, 0x091, 0x390, 0x290, 0x190, 0x090,
	0x793, 0x693, 0x593, 0x493, 0x792, 0x692, 0x592, 0x492,
	0x791, 0x691, 0x591, 0x491, 0x790, 0x690, 0x590, 0x490,
};
35 
36 #define UMC_ID_MI300 GENMASK(23, 12)
37 static u8 get_coh_st_inst_id_mi300(struct atl_err *err)
38 {
39 	u16 umc_id = FIELD_GET(UMC_ID_MI300, err->ipid);
40 	u8 i;
41 
42 	for (i = 0; i < ARRAY_SIZE(umc_coh_st_map); i++) {
43 		if (umc_id == umc_coh_st_map[i])
44 			break;
45 	}
46 
47 	WARN_ON_ONCE(i >= ARRAY_SIZE(umc_coh_st_map));
48 
49 	return i;
50 }
51 
/*
 * XOR all 16 bits of @val together.
 *
 * Return: 1 if an odd number of bits are set in @val, otherwise 0.
 */
static u16 bitwise_xor_bits(u16 val)
{
	/* Fold the upper half onto the lower half until one bit remains. */
	val ^= val >> 8;
	val ^= val >> 4;
	val ^= val >> 2;
	val ^= val >> 1;

	return val & 0x1;
}
63 
/* Hash settings decoded from a single UMC::CH::AddrHash{Bank,PC} register. */
struct xor_bits {
	bool	xor_enable;	/* Hashing is enabled for this bit. */
	u16	col_xor;	/* Mask of column bits fed into the hash. */
	u32	row_xor;	/* Mask of row bits fed into the hash. */
};

#define NUM_BANK_BITS	4

/*
 * Cached copy of the address hash configuration, filled in by
 * get_addr_hash_mi300() and consumed when translating DRAM addresses.
 */
static struct {
	struct xor_bits	bank[NUM_BANK_BITS];	/* UMC::CH::AddrHashBank, one per bank bit */
	struct xor_bits	pc;			/* UMC::CH::AddrHashPC */
	u8		bank_xor;		/* UMC::CH::AddrHashPC2: mask of bank bits for the PC hash */
} addr_hash;
82 
83 #define MI300_UMC_CH_BASE	0x90000
84 #define MI300_ADDR_HASH_BANK0	(MI300_UMC_CH_BASE + 0xC8)
85 #define MI300_ADDR_HASH_PC	(MI300_UMC_CH_BASE + 0xE0)
86 #define MI300_ADDR_HASH_PC2	(MI300_UMC_CH_BASE + 0xE4)
87 
88 #define ADDR_HASH_XOR_EN	BIT(0)
89 #define ADDR_HASH_COL_XOR	GENMASK(13, 1)
90 #define ADDR_HASH_ROW_XOR	GENMASK(31, 14)
91 #define ADDR_HASH_BANK_XOR	GENMASK(5, 0)
92 
93 /*
94  * Read UMC::CH::AddrHash{Bank,PC,PC2} registers to get XOR bits used
95  * for hashing. Do this during module init, since the values will not
96  * change during run time.
97  *
98  * These registers are instantiated for each UMC across each AMD Node.
99  * However, they should be identically programmed due to the fixed hardware
100  * design of MI300 systems. So read the values from Node 0 UMC 0 and keep a
101  * single global structure for simplicity.
102  */
103 int get_addr_hash_mi300(void)
104 {
105 	u32 temp;
106 	int ret;
107 	u8 i;
108 
109 	for (i = 0; i < NUM_BANK_BITS; i++) {
110 		ret = amd_smn_read(0, MI300_ADDR_HASH_BANK0 + (i * 4), &temp);
111 		if (ret)
112 			return ret;
113 
114 		addr_hash.bank[i].xor_enable = FIELD_GET(ADDR_HASH_XOR_EN,  temp);
115 		addr_hash.bank[i].col_xor    = FIELD_GET(ADDR_HASH_COL_XOR, temp);
116 		addr_hash.bank[i].row_xor    = FIELD_GET(ADDR_HASH_ROW_XOR, temp);
117 	}
118 
119 	ret = amd_smn_read(0, MI300_ADDR_HASH_PC, &temp);
120 	if (ret)
121 		return ret;
122 
123 	addr_hash.pc.xor_enable = FIELD_GET(ADDR_HASH_XOR_EN,  temp);
124 	addr_hash.pc.col_xor    = FIELD_GET(ADDR_HASH_COL_XOR, temp);
125 	addr_hash.pc.row_xor    = FIELD_GET(ADDR_HASH_ROW_XOR, temp);
126 
127 	ret = amd_smn_read(0, MI300_ADDR_HASH_PC2, &temp);
128 	if (ret)
129 		return ret;
130 
131 	addr_hash.bank_xor = FIELD_GET(ADDR_HASH_BANK_XOR, temp);
132 
133 	return 0;
134 }
135 
136 /*
137  * MI300 systems report a DRAM address in MCA_ADDR for DRAM ECC errors. This must
138  * be converted to the intermediate normalized address (NA) before translating to a
139  * system physical address.
140  *
141  * The DRAM address includes bank, row, and column. Also included are bits for
142  * pseudochannel (PC) and stack ID (SID).
143  *
144  * Abbreviations: (S)tack ID, (P)seudochannel, (R)ow, (B)ank, (C)olumn, (Z)ero
145  *
146  * The MCA address format is as follows:
147  *	MCA_ADDR[27:0] = {S[1:0], P[0], R[14:0], B[3:0], C[4:0], Z[0]}
148  *
149  * The normalized address format is fixed in hardware and is as follows:
150  *	NA[30:0] = {S[1:0], R[13:0], C4, B[1:0], B[3:2], C[3:2], P, C[1:0], Z[4:0]}
151  *
152  * Additionally, the PC and Bank bits may be hashed. This must be accounted for before
153  * reconstructing the normalized address.
154  */
155 #define MI300_UMC_MCA_COL	GENMASK(5, 1)
156 #define MI300_UMC_MCA_BANK	GENMASK(9, 6)
157 #define MI300_UMC_MCA_ROW	GENMASK(24, 10)
158 #define MI300_UMC_MCA_PC	BIT(25)
159 #define MI300_UMC_MCA_SID	GENMASK(27, 26)
160 
161 #define MI300_NA_COL_1_0	GENMASK(6, 5)
162 #define MI300_NA_PC		BIT(7)
163 #define MI300_NA_COL_3_2	GENMASK(9, 8)
164 #define MI300_NA_BANK_3_2	GENMASK(11, 10)
165 #define MI300_NA_BANK_1_0	GENMASK(13, 12)
166 #define MI300_NA_COL_4		BIT(14)
167 #define MI300_NA_ROW		GENMASK(28, 15)
168 #define MI300_NA_SID		GENMASK(30, 29)
169 
170 static unsigned long convert_dram_to_norm_addr_mi300(unsigned long addr)
171 {
172 	u16 i, col, row, bank, pc, sid, temp;
173 
174 	col  = FIELD_GET(MI300_UMC_MCA_COL,  addr);
175 	bank = FIELD_GET(MI300_UMC_MCA_BANK, addr);
176 	row  = FIELD_GET(MI300_UMC_MCA_ROW,  addr);
177 	pc   = FIELD_GET(MI300_UMC_MCA_PC,   addr);
178 	sid  = FIELD_GET(MI300_UMC_MCA_SID,  addr);
179 
180 	/* Calculate hash for each Bank bit. */
181 	for (i = 0; i < NUM_BANK_BITS; i++) {
182 		if (!addr_hash.bank[i].xor_enable)
183 			continue;
184 
185 		temp  = bitwise_xor_bits(col & addr_hash.bank[i].col_xor);
186 		temp ^= bitwise_xor_bits(row & addr_hash.bank[i].row_xor);
187 		bank ^= temp << i;
188 	}
189 
190 	/* Calculate hash for PC bit. */
191 	if (addr_hash.pc.xor_enable) {
192 		/* Bits SID[1:0] act as Bank[6:5] for PC hash, so apply them here. */
193 		bank |= sid << 5;
194 
195 		temp  = bitwise_xor_bits(col  & addr_hash.pc.col_xor);
196 		temp ^= bitwise_xor_bits(row  & addr_hash.pc.row_xor);
197 		temp ^= bitwise_xor_bits(bank & addr_hash.bank_xor);
198 		pc   ^= temp;
199 
200 		/* Drop SID bits for the sake of debug printing later. */
201 		bank &= 0x1F;
202 	}
203 
204 	/* Reconstruct the normalized address starting with NA[4:0] = 0 */
205 	addr  = 0;
206 
207 	/* NA[6:5] = Column[1:0] */
208 	temp  = col & 0x3;
209 	addr |= FIELD_PREP(MI300_NA_COL_1_0, temp);
210 
211 	/* NA[7] = PC */
212 	addr |= FIELD_PREP(MI300_NA_PC, pc);
213 
214 	/* NA[9:8] = Column[3:2] */
215 	temp  = (col >> 2) & 0x3;
216 	addr |= FIELD_PREP(MI300_NA_COL_3_2, temp);
217 
218 	/* NA[11:10] = Bank[3:2] */
219 	temp  = (bank >> 2) & 0x3;
220 	addr |= FIELD_PREP(MI300_NA_BANK_3_2, temp);
221 
222 	/* NA[13:12] = Bank[1:0] */
223 	temp  = bank & 0x3;
224 	addr |= FIELD_PREP(MI300_NA_BANK_1_0, temp);
225 
226 	/* NA[14] = Column[4] */
227 	temp  = (col >> 4) & 0x1;
228 	addr |= FIELD_PREP(MI300_NA_COL_4, temp);
229 
230 	/* NA[28:15] = Row[13:0] */
231 	addr |= FIELD_PREP(MI300_NA_ROW, row);
232 
233 	/* NA[30:29] = SID[1:0] */
234 	addr |= FIELD_PREP(MI300_NA_SID, sid);
235 
236 	pr_debug("Addr=0x%016lx", addr);
237 	pr_debug("Bank=%u Row=%u Column=%u PC=%u SID=%u", bank, row, col, pc, sid);
238 
239 	return addr;
240 }
241 
242 /*
243  * When a DRAM ECC error occurs on MI300 systems, it is recommended to retire
244  * all memory within that DRAM row. This applies to the memory with a DRAM
245  * bank.
246  *
247  * To find the memory addresses, loop through permutations of the DRAM column
248  * bits and find the System Physical address of each. The column bits are used
249  * to calculate the intermediate Normalized address, so all permutations should
250  * be checked.
251  *
252  * See amd_atl::convert_dram_to_norm_addr_mi300() for MI300 address formats.
253  */
254 #define MI300_NUM_COL		BIT(HWEIGHT(MI300_UMC_MCA_COL))
255 static void retire_row_mi300(struct atl_err *a_err)
256 {
257 	unsigned long addr;
258 	struct page *p;
259 	u8 col;
260 
261 	for (col = 0; col < MI300_NUM_COL; col++) {
262 		a_err->addr &= ~MI300_UMC_MCA_COL;
263 		a_err->addr |= FIELD_PREP(MI300_UMC_MCA_COL, col);
264 
265 		addr = amd_convert_umc_mca_addr_to_sys_addr(a_err);
266 		if (IS_ERR_VALUE(addr))
267 			continue;
268 
269 		addr = PHYS_PFN(addr);
270 
271 		/*
272 		 * Skip invalid or already poisoned pages to avoid unnecessary
273 		 * error messages from memory_failure().
274 		 */
275 		p = pfn_to_online_page(addr);
276 		if (!p)
277 			continue;
278 
279 		if (PageHWPoison(p))
280 			continue;
281 
282 		memory_failure(addr, 0);
283 	}
284 }
285 
286 void amd_retire_dram_row(struct atl_err *a_err)
287 {
288 	if (df_cfg.rev == DF4p5 && df_cfg.flags.heterogeneous)
289 		return retire_row_mi300(a_err);
290 }
291 EXPORT_SYMBOL_GPL(amd_retire_dram_row);
292 
293 static unsigned long get_addr(unsigned long addr)
294 {
295 	if (df_cfg.rev == DF4p5 && df_cfg.flags.heterogeneous)
296 		return convert_dram_to_norm_addr_mi300(addr);
297 
298 	return addr;
299 }
300 
301 #define MCA_IPID_INST_ID_HI	GENMASK_ULL(47, 44)
302 static u8 get_die_id(struct atl_err *err)
303 {
304 	/*
305 	 * AMD Node ID is provided in MCA_IPID[InstanceIdHi], and this
306 	 * needs to be divided by 4 to get the internal Die ID.
307 	 */
308 	if (df_cfg.rev == DF4p5 && df_cfg.flags.heterogeneous) {
309 		u8 node_id = FIELD_GET(MCA_IPID_INST_ID_HI, err->ipid);
310 
311 		return node_id >> 2;
312 	}
313 
314 	/*
315 	 * For CPUs, this is the AMD Node ID modulo the number
316 	 * of AMD Nodes per socket.
317 	 */
318 	return topology_amd_node_id(err->cpu) % topology_amd_nodes_per_pkg();
319 }
320 
321 #define UMC_CHANNEL_NUM	GENMASK(31, 20)
322 static u8 get_coh_st_inst_id(struct atl_err *err)
323 {
324 	if (df_cfg.rev == DF4p5 && df_cfg.flags.heterogeneous)
325 		return get_coh_st_inst_id_mi300(err);
326 
327 	return FIELD_GET(UMC_CHANNEL_NUM, err->ipid);
328 }
329 
330 unsigned long convert_umc_mca_addr_to_sys_addr(struct atl_err *err)
331 {
332 	u8 socket_id = topology_physical_package_id(err->cpu);
333 	u8 coh_st_inst_id = get_coh_st_inst_id(err);
334 	unsigned long addr = get_addr(err->addr);
335 	u8 die_id = get_die_id(err);
336 
337 	pr_debug("socket_id=0x%x die_id=0x%x coh_st_inst_id=0x%x addr=0x%016lx",
338 		 socket_id, die_id, coh_st_inst_id, addr);
339 
340 	return norm_to_sys_addr(socket_id, die_id, coh_st_inst_id, addr);
341 }
342