xref: /linux/drivers/edac/versalnet_edac.c (revision 87751e715e23ede7386fb57a1a8593aa9830b21f)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * AMD Versal NET memory controller driver
4  * Copyright (C) 2025 Advanced Micro Devices, Inc.
5  */
6 
7 #include <linux/cdx/edac_cdx_pcol.h>
8 #include <linux/edac.h>
9 #include <linux/module.h>
10 #include <linux/of_device.h>
11 #include <linux/ras.h>
12 #include <linux/remoteproc.h>
13 #include <linux/rpmsg.h>
14 #include <linux/sizes.h>
15 #include <ras/ras_event.h>
16 
17 #include "edac_module.h"
18 
19 /* Granularity of reported error in bytes */
20 #define MC5_ERR_GRAIN			1
21 #define MC_GET_DDR_CONFIG_IN_LEN	4
22 
23 #define MC5_IRQ_CE_MASK			GENMASK(18, 15)
24 #define MC5_IRQ_UE_MASK			GENMASK(14, 11)
25 
26 #define MC5_RANK_1_MASK			GENMASK(11, 6)
27 #define MASK_24				GENMASK(29, 24)
28 #define MASK_0				GENMASK(5, 0)
29 
30 #define MC5_LRANK_1_MASK		GENMASK(11, 6)
31 #define MC5_LRANK_2_MASK		GENMASK(17, 12)
32 #define MC5_BANK1_MASK			GENMASK(11, 6)
33 #define MC5_GRP_0_MASK			GENMASK(17, 12)
34 #define MC5_GRP_1_MASK			GENMASK(23, 18)
35 
36 #define MC5_REGHI_ROW			7
37 #define MC5_EACHBIT			1
38 #define MC5_ERR_TYPE_CE			0
39 #define MC5_ERR_TYPE_UE			1
40 #define MC5_HIGH_MEM_EN			BIT(20)
41 #define MC5_MEM_MASK			GENMASK(19, 0)
42 #define MC5_X16_BASE			256
43 #define MC5_X16_ECC			32
44 #define MC5_X16_SIZE			(MC5_X16_BASE + MC5_X16_ECC)
45 #define MC5_X32_SIZE			576
46 #define MC5_HIMEM_BASE			(256 * SZ_1M)
47 #define MC5_ILC_HIMEM_EN		BIT(28)
48 #define MC5_ILC_MEM			GENMASK(27, 0)
49 #define MC5_INTERLEAVE_SEL		GENMASK(3, 0)
50 #define MC5_BUS_WIDTH_MASK		GENMASK(19, 18)
51 #define MC5_NUM_CHANS_MASK		BIT(17)
52 #define MC5_RANK_MASK			GENMASK(15, 14)
53 
54 #define ERROR_LEVEL			2
55 #define ERROR_ID			3
56 #define TOTAL_ERR_LENGTH		5
57 #define MSG_ERR_OFFSET			8
58 #define MSG_ERR_LENGTH			9
59 #define ERROR_DATA			10
60 #define MCDI_RESPONSE			0xFF
61 
62 #define REG_MAX				152
63 #define ADEC_MAX			152
64 #define NUM_CONTROLLERS			8
65 #define REGS_PER_CONTROLLER		19
66 #define ADEC_NUM			19
67 #define BUFFER_SZ			80
68 
69 #define XDDR5_BUS_WIDTH_64		0
70 #define XDDR5_BUS_WIDTH_32		1
71 #define XDDR5_BUS_WIDTH_16		2
72 
73 /**
74  * union ecc_error_info - ECC error log information.
75  * @burstpos:		Burst position.
76  * @lrank:		Logical Rank number.
77  * @rank:		Rank number.
78  * @group:		Group number.
79  * @bank:		Bank number.
80  * @col:		Column number.
81  * @row:		Row number.
82  * @rowhi:		Row number higher bits.
83  * @i:			Combined ECC error vector containing encoded values of burst position,
84  *			rank, bank, column, and row information.
85  */
86 union ecc_error_info {
87 	struct {
88 		u32 burstpos:3;
89 		u32 lrank:4;
90 		u32 rank:2;
91 		u32 group:3;
92 		u32 bank:2;
93 		u32 col:11;
94 		u32 row:7;
95 		u32 rowhi;
96 	};
97 	u64 i;
98 } __packed;
99 
100 /* Row and column bit positions in the address decoder (ADEC) registers. */
101 union row_col_mapping {
102 	struct {
103 		u32 row0:6;
104 		u32 row1:6;
105 		u32 row2:6;
106 		u32 row3:6;
107 		u32 row4:6;
108 		u32 reserved:2;
109 	};
110 	struct {
111 		u32 col1:6;
112 		u32 col2:6;
113 		u32 col3:6;
114 		u32 col4:6;
115 		u32 col5:6;
116 		u32 reservedcol:2;
117 	};
118 	u32 i;
119 } __packed;
120 
121 /**
122  * struct ecc_status - ECC status information to report.
123  * @ceinfo:	Correctable errors.
124  * @ueinfo:	Uncorrected errors.
125  * @channel:	Channel number.
126  * @error_type:	Error type.
127  */
128 struct ecc_status {
129 	union ecc_error_info ceinfo[2];
130 	union ecc_error_info ueinfo[2];
131 	u8 channel;
132 	u8 error_type;
133 };
134 
135 /**
136  * struct mc_priv - DDR memory controller private instance data.
137  * @message:		Buffer for framing the event specific info.
138  * @stat:		ECC status information.
139  * @error_id:		The error id.
140  * @error_level:	The error level.
141  * @dwidth:		Width of data bus excluding ECC bits.
142  * @part_len:		Length of the partial message data received so far.
143  * @regs:		The registers sent on the rpmsg.
144  * @adec:		Address decode registers.
145  * @mci:		Memory controller interface.
146  * @ept:		rpmsg endpoint.
147  * @mcdi:		The mcdi handle.
148  */
149 struct mc_priv {
150 	char message[256];
151 	struct ecc_status stat;
152 	u32 error_id;
153 	u32 error_level;
154 	u32 dwidth;
155 	u32 part_len;
156 	u32 regs[REG_MAX];
157 	u32 adec[ADEC_MAX];
158 	struct mem_ctl_info *mci[NUM_CONTROLLERS];
159 	struct rpmsg_endpoint *ept;
160 	struct cdx_mcdi *mcdi;
161 };
162 
163 /*
164  * Address decoder (ADEC) registers to match the order in which the register
165  * information is received from the firmware.
166  */
167 enum adec_info {
168 	CONF = 0,
169 	ADEC0,
170 	ADEC1,
171 	ADEC2,
172 	ADEC3,
173 	ADEC4,
174 	ADEC5,
175 	ADEC6,
176 	ADEC7,
177 	ADEC8,
178 	ADEC9,
179 	ADEC10,
180 	ADEC11,
181 	ADEC12,
182 	ADEC13,
183 	ADEC14,
184 	ADEC15,
185 	ADEC16,
186 	ADECILC,
187 };
188 
189 enum reg_info {
190 	ISR = 0,
191 	IMR,
192 	ECCR0_ERR_STATUS,
193 	ECCR0_ADDR_LO,
194 	ECCR0_ADDR_HI,
195 	ECCR0_DATA_LO,
196 	ECCR0_DATA_HI,
197 	ECCR0_PAR,
198 	ECCR1_ERR_STATUS,
199 	ECCR1_ADDR_LO,
200 	ECCR1_ADDR_HI,
201 	ECCR1_DATA_LO,
202 	ECCR1_DATA_HI,
203 	ECCR1_PAR,
204 	XMPU_ERR,
205 	XMPU_ERR_ADDR_L0,
206 	XMPU_ERR_ADDR_HI,
207 	XMPU_ERR_AXI_ID,
208 	ADEC_CHK_ERR_LOG,
209 };
210 
get_ddr_info(u32 * error_data,struct mc_priv * priv)211 static bool get_ddr_info(u32 *error_data, struct mc_priv *priv)
212 {
213 	u32 reglo, reghi, parity, eccr0_val, eccr1_val, isr;
214 	struct ecc_status *p;
215 
216 	isr = error_data[ISR];
217 
218 	if (!(isr & (MC5_IRQ_UE_MASK | MC5_IRQ_CE_MASK)))
219 		return false;
220 
221 	eccr0_val = error_data[ECCR0_ERR_STATUS];
222 	eccr1_val = error_data[ECCR1_ERR_STATUS];
223 
224 	if (!eccr0_val && !eccr1_val)
225 		return false;
226 
227 	p = &priv->stat;
228 
229 	if (!eccr0_val)
230 		p->channel = 1;
231 	else
232 		p->channel = 0;
233 
234 	reglo = error_data[ECCR0_ADDR_LO];
235 	reghi = error_data[ECCR0_ADDR_HI];
236 	if (isr & MC5_IRQ_CE_MASK)
237 		p->ceinfo[0].i = reglo | (u64)reghi << 32;
238 	else if (isr & MC5_IRQ_UE_MASK)
239 		p->ueinfo[0].i = reglo | (u64)reghi << 32;
240 
241 	parity = error_data[ECCR0_PAR];
242 	edac_dbg(2, "ERR DATA: 0x%08X%08X PARITY: 0x%08X\n",
243 		 reghi, reglo, parity);
244 
245 	reglo = error_data[ECCR1_ADDR_LO];
246 	reghi = error_data[ECCR1_ADDR_HI];
247 	if (isr & MC5_IRQ_CE_MASK)
248 		p->ceinfo[1].i = reglo | (u64)reghi << 32;
249 	else if (isr & MC5_IRQ_UE_MASK)
250 		p->ueinfo[1].i = reglo | (u64)reghi << 32;
251 
252 	parity = error_data[ECCR1_PAR];
253 	edac_dbg(2, "ERR DATA: 0x%08X%08X PARITY: 0x%08X\n",
254 		 reghi, reglo, parity);
255 
256 	return true;
257 }
258 
259 /**
260  * convert_to_physical - Convert @error_data to a physical address.
261  * @priv:	DDR memory controller private instance data.
262  * @pinf:	ECC error info structure.
263  * @controller:	Controller number of the MC5
264  * @error_data:	the DDRMC5 ADEC address decoder register data
265  *
266  * Return: physical address of the DDR memory.
267  */
convert_to_physical(struct mc_priv * priv,union ecc_error_info pinf,int controller,int * error_data)268 static unsigned long convert_to_physical(struct mc_priv *priv,
269 					 union ecc_error_info pinf,
270 					 int controller, int *error_data)
271 {
272 	u32 row, blk, rsh_req_addr, interleave, ilc_base_ctrl_add, ilc_himem_en, reg, offset;
273 	u64 high_mem_base, high_mem_offset, low_mem_offset, ilcmem_base;
274 	unsigned long err_addr = 0, addr;
275 	union row_col_mapping cols;
276 	union row_col_mapping rows;
277 	u32 col_bit_0;
278 
279 	row = pinf.rowhi << MC5_REGHI_ROW | pinf.row;
280 	offset = controller * ADEC_NUM;
281 
282 	reg = error_data[ADEC6];
283 	rows.i = reg;
284 	err_addr |= (row & BIT(0)) << rows.row0;
285 	row >>= MC5_EACHBIT;
286 	err_addr |= (row & BIT(0)) << rows.row1;
287 	row >>= MC5_EACHBIT;
288 	err_addr |= (row & BIT(0)) << rows.row2;
289 	row >>= MC5_EACHBIT;
290 	err_addr |= (row & BIT(0)) << rows.row3;
291 	row >>= MC5_EACHBIT;
292 	err_addr |= (row & BIT(0)) << rows.row4;
293 	row >>= MC5_EACHBIT;
294 
295 	reg = error_data[ADEC7];
296 	rows.i = reg;
297 	err_addr |= (row & BIT(0)) << rows.row0;
298 	row >>= MC5_EACHBIT;
299 	err_addr |= (row & BIT(0)) << rows.row1;
300 	row >>= MC5_EACHBIT;
301 	err_addr |= (row & BIT(0)) << rows.row2;
302 	row >>= MC5_EACHBIT;
303 	err_addr |= (row & BIT(0)) << rows.row3;
304 	row >>= MC5_EACHBIT;
305 	err_addr |= (row & BIT(0)) << rows.row4;
306 	row >>= MC5_EACHBIT;
307 
308 	reg = error_data[ADEC8];
309 	rows.i = reg;
310 	err_addr |= (row & BIT(0)) << rows.row0;
311 	row >>= MC5_EACHBIT;
312 	err_addr |= (row & BIT(0)) << rows.row1;
313 	row >>= MC5_EACHBIT;
314 	err_addr |= (row & BIT(0)) << rows.row2;
315 	row >>= MC5_EACHBIT;
316 	err_addr |= (row & BIT(0)) << rows.row3;
317 	row >>= MC5_EACHBIT;
318 	err_addr |= (row & BIT(0)) << rows.row4;
319 
320 	reg = error_data[ADEC9];
321 	rows.i = reg;
322 
323 	err_addr |= (row & BIT(0)) << rows.row0;
324 	row >>= MC5_EACHBIT;
325 	err_addr |= (row & BIT(0)) << rows.row1;
326 	row >>= MC5_EACHBIT;
327 	err_addr |= (row & BIT(0)) << rows.row2;
328 	row >>= MC5_EACHBIT;
329 
330 	col_bit_0 = FIELD_GET(MASK_24, error_data[ADEC9]);
331 	pinf.col >>= 1;
332 	err_addr |= (pinf.col & 1) << col_bit_0;
333 
334 	cols.i = error_data[ADEC10];
335 	err_addr |= (pinf.col & 1) << cols.col1;
336 	pinf.col >>= 1;
337 	err_addr |= (pinf.col & 1) << cols.col2;
338 	pinf.col >>= 1;
339 	err_addr |= (pinf.col & 1) << cols.col3;
340 	pinf.col >>= 1;
341 	err_addr |= (pinf.col & 1) << cols.col4;
342 	pinf.col >>= 1;
343 	err_addr |= (pinf.col & 1) << cols.col5;
344 	pinf.col >>= 1;
345 
346 	cols.i = error_data[ADEC11];
347 	err_addr |= (pinf.col & 1) << cols.col1;
348 	pinf.col >>= 1;
349 	err_addr |= (pinf.col & 1) << cols.col2;
350 	pinf.col >>= 1;
351 	err_addr |= (pinf.col & 1) << cols.col3;
352 	pinf.col >>= 1;
353 	err_addr |= (pinf.col & 1) << cols.col4;
354 	pinf.col >>= 1;
355 	err_addr |= (pinf.col & 1) << cols.col5;
356 	pinf.col >>= 1;
357 
358 	reg = error_data[ADEC12];
359 	err_addr |= (pinf.bank & BIT(0)) << (reg & MASK_0);
360 	pinf.bank >>= MC5_EACHBIT;
361 	err_addr |= (pinf.bank & BIT(0)) << FIELD_GET(MC5_BANK1_MASK, reg);
362 	pinf.bank >>= MC5_EACHBIT;
363 
364 	err_addr |= (pinf.bank & BIT(0)) << FIELD_GET(MC5_GRP_0_MASK, reg);
365 	pinf.group >>= MC5_EACHBIT;
366 	err_addr |= (pinf.bank & BIT(0)) << FIELD_GET(MC5_GRP_1_MASK, reg);
367 	pinf.group >>= MC5_EACHBIT;
368 	err_addr |= (pinf.bank & BIT(0)) << FIELD_GET(MASK_24, reg);
369 	pinf.group >>= MC5_EACHBIT;
370 
371 	reg = error_data[ADEC4];
372 	err_addr |= (pinf.rank & BIT(0)) << (reg & MASK_0);
373 	pinf.rank >>= MC5_EACHBIT;
374 	err_addr |= (pinf.rank & BIT(0)) << FIELD_GET(MC5_RANK_1_MASK, reg);
375 	pinf.rank >>= MC5_EACHBIT;
376 
377 	reg = error_data[ADEC5];
378 	err_addr |= (pinf.lrank & BIT(0)) << (reg & MASK_0);
379 	pinf.lrank >>= MC5_EACHBIT;
380 	err_addr |= (pinf.lrank & BIT(0)) << FIELD_GET(MC5_LRANK_1_MASK, reg);
381 	pinf.lrank >>= MC5_EACHBIT;
382 	err_addr |= (pinf.lrank & BIT(0)) << FIELD_GET(MC5_LRANK_2_MASK, reg);
383 	pinf.lrank >>= MC5_EACHBIT;
384 	err_addr |= (pinf.lrank & BIT(0)) << FIELD_GET(MASK_24, reg);
385 	pinf.lrank >>= MC5_EACHBIT;
386 
387 	high_mem_base = (priv->adec[ADEC2 + offset] & MC5_MEM_MASK) * MC5_HIMEM_BASE;
388 	interleave = priv->adec[ADEC13 + offset] & MC5_INTERLEAVE_SEL;
389 
390 	high_mem_offset = priv->adec[ADEC3 + offset] & MC5_MEM_MASK;
391 	low_mem_offset = priv->adec[ADEC1 + offset] & MC5_MEM_MASK;
392 	reg = priv->adec[ADEC14 + offset];
393 	ilc_himem_en = !!(reg & MC5_ILC_HIMEM_EN);
394 	ilcmem_base = (reg & MC5_ILC_MEM) * SZ_1M;
395 	if (ilc_himem_en)
396 		ilc_base_ctrl_add = ilcmem_base - high_mem_offset;
397 	else
398 		ilc_base_ctrl_add = ilcmem_base - low_mem_offset;
399 
400 	if (priv->dwidth == DEV_X16) {
401 		blk = err_addr / MC5_X16_SIZE;
402 		rsh_req_addr = (blk << 8) + ilc_base_ctrl_add;
403 		err_addr = rsh_req_addr * interleave * 2;
404 	} else {
405 		blk = err_addr / MC5_X32_SIZE;
406 		rsh_req_addr = (blk << 9) + ilc_base_ctrl_add;
407 		err_addr = rsh_req_addr * interleave * 2;
408 	}
409 
410 	if ((priv->adec[ADEC2 + offset] & MC5_HIGH_MEM_EN) && err_addr >= high_mem_base)
411 		addr = err_addr - high_mem_offset;
412 	else
413 		addr = err_addr - low_mem_offset;
414 
415 	return addr;
416 }
417 
418 /**
419  * handle_error - Handle errors.
420  * @priv:	DDR memory controller private instance data.
421  * @stat:	ECC status structure.
422  * @ctl_num:	Controller number of the MC5
423  * @error_data:	the MC5 ADEC address decoder register data
424  *
425  * Handles ECC correctable and uncorrectable errors.
426  */
handle_error(struct mc_priv * priv,struct ecc_status * stat,int ctl_num,int * error_data)427 static void handle_error(struct mc_priv  *priv, struct ecc_status *stat,
428 			 int ctl_num, int *error_data)
429 {
430 	union ecc_error_info pinf;
431 	struct mem_ctl_info *mci;
432 	unsigned long pa;
433 	phys_addr_t pfn;
434 	int err;
435 
436 	if (WARN_ON_ONCE(ctl_num >= NUM_CONTROLLERS))
437 		return;
438 
439 	mci = priv->mci[ctl_num];
440 
441 	if (stat->error_type == MC5_ERR_TYPE_CE) {
442 		pinf = stat->ceinfo[stat->channel];
443 		snprintf(priv->message, sizeof(priv->message),
444 			 "Error type:%s Controller %d Addr at %lx\n",
445 			 "CE", ctl_num, convert_to_physical(priv, pinf, ctl_num, error_data));
446 
447 		edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci,
448 				     1, 0, 0, 0, 0, 0, -1,
449 				     priv->message, "");
450 	}
451 
452 	if (stat->error_type == MC5_ERR_TYPE_UE) {
453 		pinf = stat->ueinfo[stat->channel];
454 		snprintf(priv->message, sizeof(priv->message),
455 			 "Error type:%s controller %d Addr at %lx\n",
456 			 "UE", ctl_num, convert_to_physical(priv, pinf, ctl_num, error_data));
457 
458 		edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci,
459 				     1, 0, 0, 0, 0, 0, -1,
460 				     priv->message, "");
461 		pa = convert_to_physical(priv, pinf, ctl_num, error_data);
462 		pfn = PHYS_PFN(pa);
463 
464 		if (IS_ENABLED(CONFIG_MEMORY_FAILURE)) {
465 			err = memory_failure(pfn, MF_ACTION_REQUIRED);
466 			if (err)
467 				edac_dbg(2, "memory_failure() error: %d", err);
468 			else
469 				edac_dbg(2, "Poison page at PA 0x%lx\n", pa);
470 		}
471 	}
472 }
473 
mc_init(struct mem_ctl_info * mci,struct device * dev)474 static void mc_init(struct mem_ctl_info *mci, struct device *dev)
475 {
476 	struct mc_priv *priv = mci->pvt_info;
477 	struct csrow_info *csi;
478 	struct dimm_info *dimm;
479 	u32 row;
480 	int ch;
481 
482 	/* Initialize controller capabilities and configuration */
483 	mci->mtype_cap = MEM_FLAG_DDR5;
484 	mci->edac_ctl_cap = EDAC_FLAG_NONE | EDAC_FLAG_SECDED;
485 	mci->scrub_cap = SCRUB_HW_SRC;
486 	mci->scrub_mode = SCRUB_NONE;
487 
488 	mci->edac_cap = EDAC_FLAG_SECDED;
489 	mci->ctl_name = "VersalNET DDR5";
490 	mci->dev_name = dev_name(dev);
491 	mci->mod_name = "versalnet_edac";
492 
493 	edac_op_state = EDAC_OPSTATE_INT;
494 
495 	for (row = 0; row < mci->nr_csrows; row++) {
496 		csi = mci->csrows[row];
497 		for (ch = 0; ch < csi->nr_channels; ch++) {
498 			dimm = csi->channels[ch]->dimm;
499 			dimm->edac_mode = EDAC_SECDED;
500 			dimm->mtype = MEM_DDR5;
501 			dimm->grain = MC5_ERR_GRAIN;
502 			dimm->dtype = priv->dwidth;
503 		}
504 	}
505 }
506 
507 #define to_mci(k) container_of(k, struct mem_ctl_info, dev)
508 
mcdi_rpc_timeout(struct cdx_mcdi * cdx,unsigned int cmd)509 static unsigned int mcdi_rpc_timeout(struct cdx_mcdi *cdx, unsigned int cmd)
510 {
511 	return MCDI_RPC_TIMEOUT;
512 }
513 
mcdi_request(struct cdx_mcdi * cdx,const struct cdx_dword * hdr,size_t hdr_len,const struct cdx_dword * sdu,size_t sdu_len)514 static void mcdi_request(struct cdx_mcdi *cdx,
515 			 const struct cdx_dword *hdr, size_t hdr_len,
516 			 const struct cdx_dword *sdu, size_t sdu_len)
517 {
518 	void *send_buf;
519 	int ret;
520 
521 	send_buf = kzalloc(hdr_len + sdu_len, GFP_KERNEL);
522 	if (!send_buf)
523 		return;
524 
525 	memcpy(send_buf, hdr, hdr_len);
526 	memcpy(send_buf + hdr_len, sdu, sdu_len);
527 
528 	ret = rpmsg_send(cdx->ept, send_buf, hdr_len + sdu_len);
529 	if (ret)
530 		dev_err(&cdx->rpdev->dev, "Failed to send rpmsg data: %d\n", ret);
531 
532 	kfree(send_buf);
533 }
534 
535 static const struct cdx_mcdi_ops mcdi_ops = {
536 	.mcdi_rpc_timeout = mcdi_rpc_timeout,
537 	.mcdi_request = mcdi_request,
538 };
539 
get_ddr_config(u32 index,u32 * buffer,struct cdx_mcdi * amd_mcdi)540 static void get_ddr_config(u32 index, u32 *buffer, struct cdx_mcdi *amd_mcdi)
541 {
542 	size_t outlen;
543 	int ret;
544 
545 	MCDI_DECLARE_BUF(inbuf, MC_GET_DDR_CONFIG_IN_LEN);
546 	MCDI_DECLARE_BUF(outbuf, BUFFER_SZ);
547 
548 	MCDI_SET_DWORD(inbuf, EDAC_GET_DDR_CONFIG_IN_CONTROLLER_INDEX, index);
549 
550 	ret = cdx_mcdi_rpc(amd_mcdi, MC_CMD_EDAC_GET_DDR_CONFIG, inbuf, sizeof(inbuf),
551 			   outbuf, sizeof(outbuf), &outlen);
552 	if (!ret)
553 		memcpy(buffer, MCDI_PTR(outbuf, GET_DDR_CONFIG),
554 		       (ADEC_NUM * 4));
555 }
556 
setup_mcdi(struct mc_priv * mc_priv)557 static int setup_mcdi(struct mc_priv *mc_priv)
558 {
559 	struct cdx_mcdi *amd_mcdi;
560 	int ret, i;
561 
562 	amd_mcdi = kzalloc(sizeof(*amd_mcdi), GFP_KERNEL);
563 	if (!amd_mcdi)
564 		return -ENOMEM;
565 
566 	amd_mcdi->mcdi_ops = &mcdi_ops;
567 	ret = cdx_mcdi_init(amd_mcdi);
568 	if (ret) {
569 		kfree(amd_mcdi);
570 		return ret;
571 	}
572 
573 	amd_mcdi->ept = mc_priv->ept;
574 	mc_priv->mcdi = amd_mcdi;
575 
576 	for (i = 0; i < NUM_CONTROLLERS; i++)
577 		get_ddr_config(i, &mc_priv->adec[ADEC_NUM * i], amd_mcdi);
578 
579 	return 0;
580 }
581 
582 static const guid_t amd_versalnet_guid = GUID_INIT(0x82678888, 0xa556, 0x44f2,
583 						 0xb8, 0xb4, 0x45, 0x56, 0x2e,
584 						 0x8c, 0x5b, 0xec);
585 
rpmsg_cb(struct rpmsg_device * rpdev,void * data,int len,void * priv,u32 src)586 static int rpmsg_cb(struct rpmsg_device *rpdev, void *data,
587 		    int len, void *priv, u32 src)
588 {
589 	struct mc_priv *mc_priv = dev_get_drvdata(&rpdev->dev);
590 	const guid_t *sec_type = &guid_null;
591 	u32 length, offset, error_id;
592 	u32 *result = (u32 *)data;
593 	struct ecc_status *p;
594 	int i, j, k, sec_sev;
595 	const char *err_str;
596 	u32 *adec_data;
597 
598 	if (*(u8 *)data == MCDI_RESPONSE) {
599 		cdx_mcdi_process_cmd(mc_priv->mcdi, (struct cdx_dword *)data, len);
600 		return 0;
601 	}
602 
603 	sec_sev = result[ERROR_LEVEL];
604 	error_id = result[ERROR_ID];
605 	length = result[MSG_ERR_LENGTH];
606 	offset = result[MSG_ERR_OFFSET];
607 
608 	/*
609 	 * The data can come in two stretches. Construct the regs from two
610 	 * messages. The offset indicates the offset from which the data is to
611 	 * be taken.
612 	 */
613 	for (i = 0 ; i < length; i++) {
614 		k = offset + i;
615 		j = ERROR_DATA + i;
616 		mc_priv->regs[k] = result[j];
617 	}
618 
619 	if (result[TOTAL_ERR_LENGTH] > length) {
620 		if (!mc_priv->part_len)
621 			mc_priv->part_len = length;
622 		else
623 			mc_priv->part_len += length;
624 
625 		if (mc_priv->part_len < result[TOTAL_ERR_LENGTH])
626 			return 0;
627 		mc_priv->part_len = 0;
628 	}
629 
630 	mc_priv->error_id = error_id;
631 	mc_priv->error_level = result[ERROR_LEVEL];
632 
633 	switch (error_id) {
634 	case 5:		err_str = "General Software Non-Correctable error"; break;
635 	case 6:		err_str = "CFU error"; break;
636 	case 7:		err_str = "CFRAME error"; break;
637 	case 10:	err_str = "DDRMC Microblaze Correctable ECC error"; break;
638 	case 11:	err_str = "DDRMC Microblaze Non-Correctable ECC error"; break;
639 	case 15:	err_str = "MMCM error"; break;
640 	case 16:	err_str = "HNICX Correctable error"; break;
641 	case 17:	err_str = "HNICX Non-Correctable error"; break;
642 
643 	case 18:
644 		p = &mc_priv->stat;
645 		memset(p, 0, sizeof(struct ecc_status));
646 		p->error_type = MC5_ERR_TYPE_CE;
647 		for (i = 0 ; i < NUM_CONTROLLERS; i++) {
648 			if (get_ddr_info(&mc_priv->regs[i * REGS_PER_CONTROLLER], mc_priv)) {
649 				adec_data = mc_priv->adec + ADEC_NUM * i;
650 				handle_error(mc_priv, &mc_priv->stat, i, adec_data);
651 			}
652 		}
653 		return 0;
654 	case 19:
655 		p = &mc_priv->stat;
656 		memset(p, 0, sizeof(struct ecc_status));
657 		p->error_type = MC5_ERR_TYPE_UE;
658 		for (i = 0 ; i < NUM_CONTROLLERS; i++) {
659 			if (get_ddr_info(&mc_priv->regs[i * REGS_PER_CONTROLLER], mc_priv)) {
660 				adec_data = mc_priv->adec + ADEC_NUM * i;
661 				handle_error(mc_priv, &mc_priv->stat, i, adec_data);
662 			}
663 		}
664 		return 0;
665 
666 	case 21:	err_str = "GT Non-Correctable error"; break;
667 	case 22:	err_str = "PL Sysmon Correctable error"; break;
668 	case 23:	err_str = "PL Sysmon Non-Correctable error"; break;
669 	case 111:	err_str = "LPX unexpected dfx activation error"; break;
670 	case 114:	err_str = "INT_LPD Non-Correctable error"; break;
671 	case 116:	err_str = "INT_OCM Non-Correctable error"; break;
672 	case 117:	err_str = "INT_FPD Correctable error"; break;
673 	case 118:	err_str = "INT_FPD Non-Correctable error"; break;
674 	case 120:	err_str = "INT_IOU Non-Correctable error"; break;
675 	case 123:	err_str = "err_int_irq from APU GIC Distributor"; break;
676 	case 124:	err_str = "fault_int_irq from APU GIC Distribute"; break;
677 	case 132 ... 139: err_str = "FPX SPLITTER error"; break;
678 	case 140:	err_str = "APU Cluster 0 error"; break;
679 	case 141:	err_str = "APU Cluster 1 error"; break;
680 	case 142:	err_str = "APU Cluster 2 error"; break;
681 	case 143:	err_str = "APU Cluster 3 error"; break;
682 	case 145:	err_str = "WWDT1 LPX error"; break;
683 	case 147:	err_str = "IPI error"; break;
684 	case 152 ... 153: err_str = "AFIFS error"; break;
685 	case 154 ... 155: err_str = "LPX glitch error"; break;
686 	case 185 ... 186: err_str = "FPX AFIFS error"; break;
687 	case 195 ... 199: err_str = "AFIFM error"; break;
688 	case 108:	err_str = "PSM Correctable error"; break;
689 	case 59:	err_str = "PMC correctable error"; break;
690 	case 60:	err_str = "PMC Un correctable error"; break;
691 	case 43 ... 47:	err_str = "PMC Sysmon error"; break;
692 	case 163 ... 184: err_str = "RPU error"; break;
693 	case 148:	err_str = "OCM0 correctable error"; break;
694 	case 149:	err_str = "OCM1 correctable error"; break;
695 	case 150:	err_str = "OCM0 Un-correctable error"; break;
696 	case 151:	err_str = "OCM1 Un-correctable error"; break;
697 	case 189:	err_str = "PSX_CMN_3 PD block consolidated error"; break;
698 	case 191:	err_str = "FPD_INT_WRAP PD block consolidated error"; break;
699 	case 232:	err_str = "CRAM Un-Correctable error"; break;
700 	default:	err_str = "VERSAL_EDAC_ERR_ID: %d"; break;
701 	}
702 
703 	snprintf(mc_priv->message,
704 		 sizeof(mc_priv->message),
705 		 "[VERSAL_EDAC_ERR_ID: %d] Error type: %s", error_id, err_str);
706 
707 	/* Convert to bytes */
708 	length = result[TOTAL_ERR_LENGTH] * 4;
709 	log_non_standard_event(sec_type, &amd_versalnet_guid, mc_priv->message,
710 			       sec_sev, (void *)&mc_priv->regs, length);
711 
712 	return 0;
713 }
714 
715 static struct rpmsg_device_id amd_rpmsg_id_table[] = {
716 	{ .name = "error_ipc" },
717 	{ },
718 };
719 MODULE_DEVICE_TABLE(rpmsg, amd_rpmsg_id_table);
720 
rpmsg_probe(struct rpmsg_device * rpdev)721 static int rpmsg_probe(struct rpmsg_device *rpdev)
722 {
723 	struct rpmsg_channel_info chinfo;
724 	struct mc_priv *pg;
725 
726 	pg = (struct mc_priv *)amd_rpmsg_id_table[0].driver_data;
727 	chinfo.src = RPMSG_ADDR_ANY;
728 	chinfo.dst = rpdev->dst;
729 	strscpy(chinfo.name, amd_rpmsg_id_table[0].name,
730 		strlen(amd_rpmsg_id_table[0].name));
731 
732 	pg->ept = rpmsg_create_ept(rpdev, rpmsg_cb, NULL, chinfo);
733 	if (!pg->ept)
734 		return dev_err_probe(&rpdev->dev, -ENXIO, "Failed to create ept for channel %s\n",
735 				     chinfo.name);
736 
737 	dev_set_drvdata(&rpdev->dev, pg);
738 
739 	return 0;
740 }
741 
rpmsg_remove(struct rpmsg_device * rpdev)742 static void rpmsg_remove(struct rpmsg_device *rpdev)
743 {
744 	struct mc_priv *mc_priv = dev_get_drvdata(&rpdev->dev);
745 
746 	rpmsg_destroy_ept(mc_priv->ept);
747 	dev_set_drvdata(&rpdev->dev, NULL);
748 }
749 
750 static struct rpmsg_driver amd_rpmsg_driver = {
751 	.drv.name = KBUILD_MODNAME,
752 	.probe = rpmsg_probe,
753 	.remove = rpmsg_remove,
754 	.callback = rpmsg_cb,
755 	.id_table = amd_rpmsg_id_table,
756 };
757 
versal_edac_release(struct device * dev)758 static void versal_edac_release(struct device *dev)
759 {
760 	kfree(dev);
761 }
762 
init_versalnet(struct mc_priv * priv,struct platform_device * pdev)763 static int init_versalnet(struct mc_priv *priv, struct platform_device *pdev)
764 {
765 	u32 num_chans, rank, dwidth, config;
766 	struct edac_mc_layer layers[2];
767 	struct mem_ctl_info *mci;
768 	struct device *dev;
769 	enum dev_type dt;
770 	char *name;
771 	int rc, i;
772 
773 	for (i = 0; i < NUM_CONTROLLERS; i++) {
774 		config = priv->adec[CONF + i * ADEC_NUM];
775 		num_chans = FIELD_GET(MC5_NUM_CHANS_MASK, config);
776 		rank = 1 << FIELD_GET(MC5_RANK_MASK, config);
777 		dwidth = FIELD_GET(MC5_BUS_WIDTH_MASK, config);
778 
779 		switch (dwidth) {
780 		case XDDR5_BUS_WIDTH_16:
781 			dt = DEV_X16;
782 			break;
783 		case XDDR5_BUS_WIDTH_32:
784 			dt = DEV_X32;
785 			break;
786 		case XDDR5_BUS_WIDTH_64:
787 			dt = DEV_X64;
788 			break;
789 		default:
790 			dt = DEV_UNKNOWN;
791 		}
792 
793 		if (dt == DEV_UNKNOWN)
794 			continue;
795 
796 		/* Find the first enabled device and register that one. */
797 		layers[0].type = EDAC_MC_LAYER_CHIP_SELECT;
798 		layers[0].size = rank;
799 		layers[0].is_virt_csrow = true;
800 		layers[1].type = EDAC_MC_LAYER_CHANNEL;
801 		layers[1].size = num_chans;
802 		layers[1].is_virt_csrow = false;
803 
804 		rc = -ENOMEM;
805 		mci = edac_mc_alloc(i, ARRAY_SIZE(layers), layers,
806 				    sizeof(struct mc_priv));
807 		if (!mci) {
808 			edac_printk(KERN_ERR, EDAC_MC, "Failed memory allocation for MC%d\n", i);
809 			goto err_alloc;
810 		}
811 
812 		priv->mci[i] = mci;
813 		priv->dwidth = dt;
814 
815 		dev = kzalloc(sizeof(*dev), GFP_KERNEL);
816 		dev->release = versal_edac_release;
817 		name = kmalloc(32, GFP_KERNEL);
818 		sprintf(name, "versal-net-ddrmc5-edac-%d", i);
819 		dev->init_name = name;
820 		rc = device_register(dev);
821 		if (rc)
822 			goto err_alloc;
823 
824 		mci->pdev = dev;
825 
826 		platform_set_drvdata(pdev, priv);
827 
828 		mc_init(mci, dev);
829 		rc = edac_mc_add_mc(mci);
830 		if (rc) {
831 			edac_printk(KERN_ERR, EDAC_MC, "Failed to register MC%d with EDAC core\n", i);
832 			goto err_alloc;
833 		}
834 	}
835 	return 0;
836 
837 err_alloc:
838 	while (i--) {
839 		mci = priv->mci[i];
840 		if (!mci)
841 			continue;
842 
843 		if (mci->pdev) {
844 			device_unregister(mci->pdev);
845 			edac_mc_del_mc(mci->pdev);
846 		}
847 
848 		edac_mc_free(mci);
849 	}
850 
851 	return rc;
852 }
853 
remove_versalnet(struct mc_priv * priv)854 static void remove_versalnet(struct mc_priv *priv)
855 {
856 	struct mem_ctl_info *mci;
857 	int i;
858 
859 	for (i = 0; i < NUM_CONTROLLERS; i++) {
860 		device_unregister(priv->mci[i]->pdev);
861 		mci = edac_mc_del_mc(priv->mci[i]->pdev);
862 		if (!mci)
863 			return;
864 
865 		edac_mc_free(mci);
866 	}
867 }
868 
mc_probe(struct platform_device * pdev)869 static int mc_probe(struct platform_device *pdev)
870 {
871 	struct device_node *r5_core_node;
872 	struct mc_priv *priv;
873 	struct rproc *rp;
874 	int rc;
875 
876 	r5_core_node = of_parse_phandle(pdev->dev.of_node, "amd,rproc", 0);
877 	if (!r5_core_node) {
878 		dev_err(&pdev->dev, "amd,rproc: invalid phandle\n");
879 		return -EINVAL;
880 	}
881 
882 	rp = rproc_get_by_phandle(r5_core_node->phandle);
883 	if (!rp)
884 		return -EPROBE_DEFER;
885 
886 	rc = rproc_boot(rp);
887 	if (rc) {
888 		dev_err(&pdev->dev, "Failed to attach to remote processor\n");
889 		goto err_rproc_boot;
890 	}
891 
892 	priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL);
893 	if (!priv) {
894 		rc = -ENOMEM;
895 		goto err_alloc;
896 	}
897 
898 	amd_rpmsg_id_table[0].driver_data = (kernel_ulong_t)priv;
899 
900 	rc = register_rpmsg_driver(&amd_rpmsg_driver);
901 	if (rc) {
902 		edac_printk(KERN_ERR, EDAC_MC, "Failed to register RPMsg driver: %d\n", rc);
903 		goto err_alloc;
904 	}
905 
906 	rc = setup_mcdi(priv);
907 	if (rc)
908 		goto err_unreg;
909 
910 	priv->mcdi->r5_rproc = rp;
911 
912 	rc = init_versalnet(priv, pdev);
913 	if (rc)
914 		goto err_init;
915 
916 	return 0;
917 
918 err_init:
919 	cdx_mcdi_finish(priv->mcdi);
920 
921 err_unreg:
922 	unregister_rpmsg_driver(&amd_rpmsg_driver);
923 
924 err_alloc:
925 	rproc_shutdown(rp);
926 
927 err_rproc_boot:
928 	rproc_put(rp);
929 
930 	return rc;
931 }
932 
mc_remove(struct platform_device * pdev)933 static void mc_remove(struct platform_device *pdev)
934 {
935 	struct mc_priv *priv = platform_get_drvdata(pdev);
936 
937 	unregister_rpmsg_driver(&amd_rpmsg_driver);
938 	remove_versalnet(priv);
939 	rproc_shutdown(priv->mcdi->r5_rproc);
940 	cdx_mcdi_finish(priv->mcdi);
941 }
942 
943 static const struct of_device_id amd_edac_match[] = {
944 	{ .compatible = "xlnx,versal-net-ddrmc5", },
945 	{}
946 };
947 MODULE_DEVICE_TABLE(of, amd_edac_match);
948 
949 static struct platform_driver amd_ddr_edac_mc_driver = {
950 	.driver = {
951 		.name = "versal-net-edac",
952 		.of_match_table = amd_edac_match,
953 	},
954 	.probe = mc_probe,
955 	.remove = mc_remove,
956 };
957 
958 module_platform_driver(amd_ddr_edac_mc_driver);
959 
960 MODULE_AUTHOR("AMD Inc");
961 MODULE_DESCRIPTION("Versal NET EDAC driver");
962 MODULE_LICENSE("GPL");
963