xref: /linux/drivers/edac/versalnet_edac.c (revision 03f76ddff5b04a808ae16c06418460151e2fdd4b)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * AMD Versal NET memory controller driver
4  * Copyright (C) 2025 Advanced Micro Devices, Inc.
5  */
6 
7 #include <linux/cdx/edac_cdx_pcol.h>
8 #include <linux/edac.h>
9 #include <linux/module.h>
10 #include <linux/of_device.h>
11 #include <linux/ras.h>
12 #include <linux/remoteproc.h>
13 #include <linux/rpmsg.h>
14 #include <linux/sizes.h>
15 #include <ras/ras_event.h>
16 
17 #include "edac_module.h"
18 
19 /* Granularity of reported error in bytes */
20 #define MC5_ERR_GRAIN			1
21 #define MC_GET_DDR_CONFIG_IN_LEN	4
22 
23 #define MC5_IRQ_CE_MASK			GENMASK(18, 15)
24 #define MC5_IRQ_UE_MASK			GENMASK(14, 11)
25 
26 #define MC5_RANK_1_MASK			GENMASK(11, 6)
27 #define MASK_24				GENMASK(29, 24)
28 #define MASK_0				GENMASK(5, 0)
29 
30 #define MC5_LRANK_1_MASK		GENMASK(11, 6)
31 #define MC5_LRANK_2_MASK		GENMASK(17, 12)
32 #define MC5_BANK1_MASK			GENMASK(11, 6)
33 #define MC5_GRP_0_MASK			GENMASK(17, 12)
34 #define MC5_GRP_1_MASK			GENMASK(23, 18)
35 
36 #define MC5_REGHI_ROW			7
37 #define MC5_EACHBIT			1
38 #define MC5_ERR_TYPE_CE			0
39 #define MC5_ERR_TYPE_UE			1
40 #define MC5_HIGH_MEM_EN			BIT(20)
41 #define MC5_MEM_MASK			GENMASK(19, 0)
42 #define MC5_X16_BASE			256
43 #define MC5_X16_ECC			32
44 #define MC5_X16_SIZE			(MC5_X16_BASE + MC5_X16_ECC)
45 #define MC5_X32_SIZE			576
46 #define MC5_HIMEM_BASE			(256 * SZ_1M)
47 #define MC5_ILC_HIMEM_EN		BIT(28)
48 #define MC5_ILC_MEM			GENMASK(27, 0)
49 #define MC5_INTERLEAVE_SEL		GENMASK(3, 0)
50 #define MC5_BUS_WIDTH_MASK		GENMASK(19, 18)
51 #define MC5_NUM_CHANS_MASK		BIT(17)
52 #define MC5_RANK_MASK			GENMASK(15, 14)
53 
54 #define ERROR_LEVEL			2
55 #define ERROR_ID			3
56 #define TOTAL_ERR_LENGTH		5
57 #define MSG_ERR_OFFSET			8
58 #define MSG_ERR_LENGTH			9
59 #define ERROR_DATA			10
60 #define MCDI_RESPONSE			0xFF
61 
62 #define REG_MAX				152
63 #define ADEC_MAX			152
64 #define NUM_CONTROLLERS			8
65 #define REGS_PER_CONTROLLER		19
66 #define ADEC_NUM			19
67 #define BUFFER_SZ			80
68 
69 #define XDDR5_BUS_WIDTH_64		0
70 #define XDDR5_BUS_WIDTH_32		1
71 #define XDDR5_BUS_WIDTH_16		2
72 
73 /**
74  * struct ecc_error_info - ECC error log information.
75  * @burstpos:		Burst position.
76  * @lrank:		Logical Rank number.
77  * @rank:		Rank number.
78  * @group:		Group number.
79  * @bank:		Bank number.
80  * @col:		Column number.
81  * @row:		Row number.
82  * @rowhi:		Row number higher bits.
83  * @i:			Combined ECC error vector containing encoded values of burst position,
84  *			rank, bank, column, and row information.
85  */
86 union ecc_error_info {
87 	struct {
88 		u32 burstpos:3;
89 		u32 lrank:4;
90 		u32 rank:2;
91 		u32 group:3;
92 		u32 bank:2;
93 		u32 col:11;
94 		u32 row:7;
95 		u32 rowhi;
96 	};
97 	u64 i;
98 } __packed;
99 
/*
 * Row and column bit positions in the address decoder (ADEC) registers.
 * Each 6-bit field gives the system-address bit position for one row or
 * column bit; the two views overlay the same 32-bit register value.
 */
union row_col_mapping {
	struct {
		u32 row0:6;
		u32 row1:6;
		u32 row2:6;
		u32 row3:6;
		u32 row4:6;
		u32 reserved:2;
	};
	struct {
		u32 col1:6;
		u32 col2:6;
		u32 col3:6;
		u32 col4:6;
		u32 col5:6;
		u32 reservedcol:2;
	};
	u32 i;
} __packed;
120 
121 /**
122  * struct ecc_status - ECC status information to report.
123  * @ceinfo:	Correctable errors.
124  * @ueinfo:	Uncorrected errors.
125  * @channel:	Channel number.
126  * @error_type:	Error type.
127  */
128 struct ecc_status {
129 	union ecc_error_info ceinfo[2];
130 	union ecc_error_info ueinfo[2];
131 	u8 channel;
132 	u8 error_type;
133 };
134 
135 /**
136  * struct mc_priv - DDR memory controller private instance data.
137  * @message:		Buffer for framing the event specific info.
138  * @stat:		ECC status information.
139  * @error_id:		The error id.
140  * @error_level:	The error level.
141  * @dwidth:		Width of data bus excluding ECC bits.
142  * @part_len:		The support of the message received.
143  * @regs:		The registers sent on the rpmsg.
144  * @adec:		Address decode registers.
145  * @mci:		Memory controller interface.
146  * @ept:		rpmsg endpoint.
147  * @mcdi:		The mcdi handle.
148  */
149 struct mc_priv {
150 	char message[256];
151 	struct ecc_status stat;
152 	u32 error_id;
153 	u32 error_level;
154 	u32 dwidth;
155 	u32 part_len;
156 	u32 regs[REG_MAX];
157 	u32 adec[ADEC_MAX];
158 	struct mem_ctl_info *mci[NUM_CONTROLLERS];
159 	struct rpmsg_endpoint *ept;
160 	struct cdx_mcdi *mcdi;
161 };
162 
163 /*
164  * Address decoder (ADEC) registers to match the order in which the register
165  * information is received from the firmware.
166  */
167 enum adec_info {
168 	CONF = 0,
169 	ADEC0,
170 	ADEC1,
171 	ADEC2,
172 	ADEC3,
173 	ADEC4,
174 	ADEC5,
175 	ADEC6,
176 	ADEC7,
177 	ADEC8,
178 	ADEC9,
179 	ADEC10,
180 	ADEC11,
181 	ADEC12,
182 	ADEC13,
183 	ADEC14,
184 	ADEC15,
185 	ADEC16,
186 	ADECILC,
187 };
188 
/*
 * Per-controller error registers, in the order they arrive from firmware
 * (REGS_PER_CONTROLLER entries per controller in mc_priv::regs).
 */
enum reg_info {
	ISR = 0,
	IMR,
	ECCR0_ERR_STATUS,
	ECCR0_ADDR_LO,
	ECCR0_ADDR_HI,
	ECCR0_DATA_LO,
	ECCR0_DATA_HI,
	ECCR0_PAR,
	ECCR1_ERR_STATUS,
	ECCR1_ADDR_LO,
	ECCR1_ADDR_HI,
	ECCR1_DATA_LO,
	ECCR1_DATA_HI,
	ECCR1_PAR,
	XMPU_ERR,
	XMPU_ERR_ADDR_L0,
	XMPU_ERR_ADDR_HI,
	XMPU_ERR_AXI_ID,
	ADEC_CHK_ERR_LOG,
};
210 
get_ddr_info(u32 * error_data,struct mc_priv * priv)211 static bool get_ddr_info(u32 *error_data, struct mc_priv *priv)
212 {
213 	u32 reglo, reghi, parity, eccr0_val, eccr1_val, isr;
214 	struct ecc_status *p;
215 
216 	isr = error_data[ISR];
217 
218 	if (!(isr & (MC5_IRQ_UE_MASK | MC5_IRQ_CE_MASK)))
219 		return false;
220 
221 	eccr0_val = error_data[ECCR0_ERR_STATUS];
222 	eccr1_val = error_data[ECCR1_ERR_STATUS];
223 
224 	if (!eccr0_val && !eccr1_val)
225 		return false;
226 
227 	p = &priv->stat;
228 
229 	if (!eccr0_val)
230 		p->channel = 1;
231 	else
232 		p->channel = 0;
233 
234 	reglo = error_data[ECCR0_ADDR_LO];
235 	reghi = error_data[ECCR0_ADDR_HI];
236 	if (isr & MC5_IRQ_CE_MASK)
237 		p->ceinfo[0].i = reglo | (u64)reghi << 32;
238 	else if (isr & MC5_IRQ_UE_MASK)
239 		p->ueinfo[0].i = reglo | (u64)reghi << 32;
240 
241 	parity = error_data[ECCR0_PAR];
242 	edac_dbg(2, "ERR DATA: 0x%08X%08X PARITY: 0x%08X\n",
243 		 reghi, reglo, parity);
244 
245 	reglo = error_data[ECCR1_ADDR_LO];
246 	reghi = error_data[ECCR1_ADDR_HI];
247 	if (isr & MC5_IRQ_CE_MASK)
248 		p->ceinfo[1].i = reglo | (u64)reghi << 32;
249 	else if (isr & MC5_IRQ_UE_MASK)
250 		p->ueinfo[1].i = reglo | (u64)reghi << 32;
251 
252 	parity = error_data[ECCR1_PAR];
253 	edac_dbg(2, "ERR DATA: 0x%08X%08X PARITY: 0x%08X\n",
254 		 reghi, reglo, parity);
255 
256 	return true;
257 }
258 
259 /**
260  * convert_to_physical - Convert @error_data to a physical address.
261  * @priv:	DDR memory controller private instance data.
262  * @pinf:	ECC error info structure.
263  * @controller:	Controller number of the MC5
264  * @error_data:	the DDRMC5 ADEC address decoder register data
265  *
266  * Return: physical address of the DDR memory.
267  */
convert_to_physical(struct mc_priv * priv,union ecc_error_info pinf,int controller,int * error_data)268 static unsigned long convert_to_physical(struct mc_priv *priv,
269 					 union ecc_error_info pinf,
270 					 int controller, int *error_data)
271 {
272 	u32 row, blk, rsh_req_addr, interleave, ilc_base_ctrl_add, ilc_himem_en, reg, offset;
273 	u64 high_mem_base, high_mem_offset, low_mem_offset, ilcmem_base;
274 	unsigned long err_addr = 0, addr;
275 	union row_col_mapping cols;
276 	union row_col_mapping rows;
277 	u32 col_bit_0;
278 
279 	row = pinf.rowhi << MC5_REGHI_ROW | pinf.row;
280 	offset = controller * ADEC_NUM;
281 
282 	reg = error_data[ADEC6];
283 	rows.i = reg;
284 	err_addr |= (row & BIT(0)) << rows.row0;
285 	row >>= MC5_EACHBIT;
286 	err_addr |= (row & BIT(0)) << rows.row1;
287 	row >>= MC5_EACHBIT;
288 	err_addr |= (row & BIT(0)) << rows.row2;
289 	row >>= MC5_EACHBIT;
290 	err_addr |= (row & BIT(0)) << rows.row3;
291 	row >>= MC5_EACHBIT;
292 	err_addr |= (row & BIT(0)) << rows.row4;
293 	row >>= MC5_EACHBIT;
294 
295 	reg = error_data[ADEC7];
296 	rows.i = reg;
297 	err_addr |= (row & BIT(0)) << rows.row0;
298 	row >>= MC5_EACHBIT;
299 	err_addr |= (row & BIT(0)) << rows.row1;
300 	row >>= MC5_EACHBIT;
301 	err_addr |= (row & BIT(0)) << rows.row2;
302 	row >>= MC5_EACHBIT;
303 	err_addr |= (row & BIT(0)) << rows.row3;
304 	row >>= MC5_EACHBIT;
305 	err_addr |= (row & BIT(0)) << rows.row4;
306 	row >>= MC5_EACHBIT;
307 
308 	reg = error_data[ADEC8];
309 	rows.i = reg;
310 	err_addr |= (row & BIT(0)) << rows.row0;
311 	row >>= MC5_EACHBIT;
312 	err_addr |= (row & BIT(0)) << rows.row1;
313 	row >>= MC5_EACHBIT;
314 	err_addr |= (row & BIT(0)) << rows.row2;
315 	row >>= MC5_EACHBIT;
316 	err_addr |= (row & BIT(0)) << rows.row3;
317 	row >>= MC5_EACHBIT;
318 	err_addr |= (row & BIT(0)) << rows.row4;
319 
320 	reg = error_data[ADEC9];
321 	rows.i = reg;
322 
323 	err_addr |= (row & BIT(0)) << rows.row0;
324 	row >>= MC5_EACHBIT;
325 	err_addr |= (row & BIT(0)) << rows.row1;
326 	row >>= MC5_EACHBIT;
327 	err_addr |= (row & BIT(0)) << rows.row2;
328 	row >>= MC5_EACHBIT;
329 
330 	col_bit_0 = FIELD_GET(MASK_24, error_data[ADEC9]);
331 	pinf.col >>= 1;
332 	err_addr |= (pinf.col & 1) << col_bit_0;
333 
334 	cols.i = error_data[ADEC10];
335 	err_addr |= (pinf.col & 1) << cols.col1;
336 	pinf.col >>= 1;
337 	err_addr |= (pinf.col & 1) << cols.col2;
338 	pinf.col >>= 1;
339 	err_addr |= (pinf.col & 1) << cols.col3;
340 	pinf.col >>= 1;
341 	err_addr |= (pinf.col & 1) << cols.col4;
342 	pinf.col >>= 1;
343 	err_addr |= (pinf.col & 1) << cols.col5;
344 	pinf.col >>= 1;
345 
346 	cols.i = error_data[ADEC11];
347 	err_addr |= (pinf.col & 1) << cols.col1;
348 	pinf.col >>= 1;
349 	err_addr |= (pinf.col & 1) << cols.col2;
350 	pinf.col >>= 1;
351 	err_addr |= (pinf.col & 1) << cols.col3;
352 	pinf.col >>= 1;
353 	err_addr |= (pinf.col & 1) << cols.col4;
354 	pinf.col >>= 1;
355 	err_addr |= (pinf.col & 1) << cols.col5;
356 	pinf.col >>= 1;
357 
358 	reg = error_data[ADEC12];
359 	err_addr |= (pinf.bank & BIT(0)) << (reg & MASK_0);
360 	pinf.bank >>= MC5_EACHBIT;
361 	err_addr |= (pinf.bank & BIT(0)) << FIELD_GET(MC5_BANK1_MASK, reg);
362 	pinf.bank >>= MC5_EACHBIT;
363 
364 	err_addr |= (pinf.bank & BIT(0)) << FIELD_GET(MC5_GRP_0_MASK, reg);
365 	pinf.group >>= MC5_EACHBIT;
366 	err_addr |= (pinf.bank & BIT(0)) << FIELD_GET(MC5_GRP_1_MASK, reg);
367 	pinf.group >>= MC5_EACHBIT;
368 	err_addr |= (pinf.bank & BIT(0)) << FIELD_GET(MASK_24, reg);
369 	pinf.group >>= MC5_EACHBIT;
370 
371 	reg = error_data[ADEC4];
372 	err_addr |= (pinf.rank & BIT(0)) << (reg & MASK_0);
373 	pinf.rank >>= MC5_EACHBIT;
374 	err_addr |= (pinf.rank & BIT(0)) << FIELD_GET(MC5_RANK_1_MASK, reg);
375 	pinf.rank >>= MC5_EACHBIT;
376 
377 	reg = error_data[ADEC5];
378 	err_addr |= (pinf.lrank & BIT(0)) << (reg & MASK_0);
379 	pinf.lrank >>= MC5_EACHBIT;
380 	err_addr |= (pinf.lrank & BIT(0)) << FIELD_GET(MC5_LRANK_1_MASK, reg);
381 	pinf.lrank >>= MC5_EACHBIT;
382 	err_addr |= (pinf.lrank & BIT(0)) << FIELD_GET(MC5_LRANK_2_MASK, reg);
383 	pinf.lrank >>= MC5_EACHBIT;
384 	err_addr |= (pinf.lrank & BIT(0)) << FIELD_GET(MASK_24, reg);
385 	pinf.lrank >>= MC5_EACHBIT;
386 
387 	high_mem_base = (priv->adec[ADEC2 + offset] & MC5_MEM_MASK) * MC5_HIMEM_BASE;
388 	interleave = priv->adec[ADEC13 + offset] & MC5_INTERLEAVE_SEL;
389 
390 	high_mem_offset = priv->adec[ADEC3 + offset] & MC5_MEM_MASK;
391 	low_mem_offset = priv->adec[ADEC1 + offset] & MC5_MEM_MASK;
392 	reg = priv->adec[ADEC14 + offset];
393 	ilc_himem_en = !!(reg & MC5_ILC_HIMEM_EN);
394 	ilcmem_base = (reg & MC5_ILC_MEM) * SZ_1M;
395 	if (ilc_himem_en)
396 		ilc_base_ctrl_add = ilcmem_base - high_mem_offset;
397 	else
398 		ilc_base_ctrl_add = ilcmem_base - low_mem_offset;
399 
400 	if (priv->dwidth == DEV_X16) {
401 		blk = err_addr / MC5_X16_SIZE;
402 		rsh_req_addr = (blk << 8) + ilc_base_ctrl_add;
403 		err_addr = rsh_req_addr * interleave * 2;
404 	} else {
405 		blk = err_addr / MC5_X32_SIZE;
406 		rsh_req_addr = (blk << 9) + ilc_base_ctrl_add;
407 		err_addr = rsh_req_addr * interleave * 2;
408 	}
409 
410 	if ((priv->adec[ADEC2 + offset] & MC5_HIGH_MEM_EN) && err_addr >= high_mem_base)
411 		addr = err_addr - high_mem_offset;
412 	else
413 		addr = err_addr - low_mem_offset;
414 
415 	return addr;
416 }
417 
418 /**
419  * handle_error - Handle errors.
420  * @priv:	DDR memory controller private instance data.
421  * @stat:	ECC status structure.
422  * @ctl_num:	Controller number of the MC5
423  * @error_data:	the MC5 ADEC address decoder register data
424  *
425  * Handles ECC correctable and uncorrectable errors.
426  */
handle_error(struct mc_priv * priv,struct ecc_status * stat,int ctl_num,int * error_data)427 static void handle_error(struct mc_priv  *priv, struct ecc_status *stat,
428 			 int ctl_num, int *error_data)
429 {
430 	union ecc_error_info pinf;
431 	struct mem_ctl_info *mci;
432 	unsigned long pa;
433 	phys_addr_t pfn;
434 	int err;
435 
436 	if (WARN_ON_ONCE(ctl_num > NUM_CONTROLLERS))
437 		return;
438 
439 	mci = priv->mci[ctl_num];
440 
441 	if (stat->error_type == MC5_ERR_TYPE_CE) {
442 		pinf = stat->ceinfo[stat->channel];
443 		snprintf(priv->message, sizeof(priv->message),
444 			 "Error type:%s Controller %d Addr at %lx\n",
445 			 "CE", ctl_num, convert_to_physical(priv, pinf, ctl_num, error_data));
446 
447 		edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci,
448 				     1, 0, 0, 0, 0, 0, -1,
449 				     priv->message, "");
450 	}
451 
452 	if (stat->error_type == MC5_ERR_TYPE_UE) {
453 		pinf = stat->ueinfo[stat->channel];
454 		snprintf(priv->message, sizeof(priv->message),
455 			 "Error type:%s controller %d Addr at %lx\n",
456 			 "UE", ctl_num, convert_to_physical(priv, pinf, ctl_num, error_data));
457 
458 		edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci,
459 				     1, 0, 0, 0, 0, 0, -1,
460 				     priv->message, "");
461 		pa = convert_to_physical(priv, pinf, ctl_num, error_data);
462 		pfn = PHYS_PFN(pa);
463 
464 		if (IS_ENABLED(CONFIG_MEMORY_FAILURE)) {
465 			err = memory_failure(pfn, MF_ACTION_REQUIRED);
466 			if (err)
467 				edac_dbg(2, "memory_failure() error: %d", err);
468 			else
469 				edac_dbg(2, "Poison page at PA 0x%lx\n", pa);
470 		}
471 	}
472 }
473 
mc_init(struct mem_ctl_info * mci,struct device * dev)474 static void mc_init(struct mem_ctl_info *mci, struct device *dev)
475 {
476 	struct mc_priv *priv = mci->pvt_info;
477 	struct csrow_info *csi;
478 	struct dimm_info *dimm;
479 	u32 row;
480 	int ch;
481 
482 	/* Initialize controller capabilities and configuration */
483 	mci->mtype_cap = MEM_FLAG_DDR5;
484 	mci->edac_ctl_cap = EDAC_FLAG_NONE | EDAC_FLAG_SECDED;
485 	mci->scrub_cap = SCRUB_HW_SRC;
486 	mci->scrub_mode = SCRUB_NONE;
487 
488 	mci->edac_cap = EDAC_FLAG_SECDED;
489 	mci->ctl_name = "VersalNET DDR5";
490 	mci->dev_name = dev_name(dev);
491 	mci->mod_name = "versalnet_edac";
492 
493 	edac_op_state = EDAC_OPSTATE_INT;
494 
495 	for (row = 0; row < mci->nr_csrows; row++) {
496 		csi = mci->csrows[row];
497 		for (ch = 0; ch < csi->nr_channels; ch++) {
498 			dimm = csi->channels[ch]->dimm;
499 			dimm->edac_mode = EDAC_SECDED;
500 			dimm->mtype = MEM_DDR5;
501 			dimm->grain = MC5_ERR_GRAIN;
502 			dimm->dtype = priv->dwidth;
503 		}
504 	}
505 }
506 
507 #define to_mci(k) container_of(k, struct mem_ctl_info, dev)
508 
mcdi_rpc_timeout(struct cdx_mcdi * cdx,unsigned int cmd)509 static unsigned int mcdi_rpc_timeout(struct cdx_mcdi *cdx, unsigned int cmd)
510 {
511 	return MCDI_RPC_TIMEOUT;
512 }
513 
/*
 * mcdi_request - Transmit one MCDI command over the rpmsg endpoint.
 * @cdx:     MCDI instance holding the endpoint.
 * @hdr:     Command header.
 * @hdr_len: Header length in bytes.
 * @sdu:     Command payload.
 * @sdu_len: Payload length in bytes.
 *
 * Header and payload are concatenated into one temporary buffer because
 * rpmsg_send() takes a single contiguous message.
 */
static void mcdi_request(struct cdx_mcdi *cdx,
			 const struct cdx_dword *hdr, size_t hdr_len,
			 const struct cdx_dword *sdu, size_t sdu_len)
{
	size_t total_len = hdr_len + sdu_len;
	void *msg;
	int rc;

	msg = kzalloc(total_len, GFP_KERNEL);
	if (!msg)
		return;

	memcpy(msg, hdr, hdr_len);
	memcpy(msg + hdr_len, sdu, sdu_len);

	rc = rpmsg_send(cdx->ept, msg, total_len);
	if (rc)
		dev_err(&cdx->rpdev->dev, "Failed to send rpmsg data: %d\n", rc);

	kfree(msg);
}
534 
/* MCDI transport callbacks: send over rpmsg, default timeout. */
static const struct cdx_mcdi_ops mcdi_ops = {
	.mcdi_rpc_timeout = mcdi_rpc_timeout,
	.mcdi_request = mcdi_request,
};
539 
/*
 * get_ddr_config - Fetch one controller's ADEC configuration via MCDI.
 * @index:    Controller index.
 * @buffer:   Destination for ADEC_NUM 32-bit configuration words.
 * @amd_mcdi: MCDI handle to issue the RPC on.
 *
 * On RPC failure @buffer is left untouched (zero-initialized by the
 * caller's allocation); log the failure instead of swallowing it silently.
 */
static void get_ddr_config(u32 index, u32 *buffer, struct cdx_mcdi *amd_mcdi)
{
	size_t outlen;
	int ret;

	MCDI_DECLARE_BUF(inbuf, MC_GET_DDR_CONFIG_IN_LEN);
	MCDI_DECLARE_BUF(outbuf, BUFFER_SZ);

	MCDI_SET_DWORD(inbuf, EDAC_GET_DDR_CONFIG_IN_CONTROLLER_INDEX, index);

	ret = cdx_mcdi_rpc(amd_mcdi, MC_CMD_EDAC_GET_DDR_CONFIG, inbuf, sizeof(inbuf),
			   outbuf, sizeof(outbuf), &outlen);
	if (ret)
		edac_printk(KERN_WARNING, EDAC_MC,
			    "Failed to get DDR config for controller %u: %d\n",
			    index, ret);
	else
		memcpy(buffer, MCDI_PTR(outbuf, GET_DDR_CONFIG),
		       ADEC_NUM * sizeof(u32));
}
556 
setup_mcdi(struct mc_priv * mc_priv)557 static int setup_mcdi(struct mc_priv *mc_priv)
558 {
559 	struct cdx_mcdi *amd_mcdi;
560 	int ret, i;
561 
562 	amd_mcdi = kzalloc(sizeof(*amd_mcdi), GFP_KERNEL);
563 	if (!amd_mcdi)
564 		return -ENOMEM;
565 
566 	amd_mcdi->mcdi_ops = &mcdi_ops;
567 	ret = cdx_mcdi_init(amd_mcdi);
568 	if (ret) {
569 		kfree(amd_mcdi);
570 		return ret;
571 	}
572 
573 	amd_mcdi->ept = mc_priv->ept;
574 	mc_priv->mcdi = amd_mcdi;
575 
576 	for (i = 0; i < NUM_CONTROLLERS; i++)
577 		get_ddr_config(i, &mc_priv->adec[ADEC_NUM * i], amd_mcdi);
578 
579 	return 0;
580 }
581 
/* Section-type GUID used when logging non-standard RAS events below. */
static const guid_t amd_versalnet_guid = GUID_INIT(0x82678888, 0xa556, 0x44f2,
						 0xb8, 0xb4, 0x45, 0x56, 0x2e,
						 0x8c, 0x5b, 0xec);
585 
/*
 * rpmsg_cb - rpmsg receive callback: demultiplex firmware messages.
 * @rpdev: rpmsg device the message arrived on.
 * @data:  Raw message payload.
 * @len:   Payload length in bytes.
 * @priv:  Endpoint private data (unused).
 * @src:   Source address (unused).
 *
 * Messages are either MCDI responses (first byte MCDI_RESPONSE), DDR ECC
 * register dumps (error ids 18/19) that are decoded and reported through
 * the EDAC core, or other platform errors that are logged as non-standard
 * RAS events.
 */
static int rpmsg_cb(struct rpmsg_device *rpdev, void *data,
		    int len, void *priv, u32 src)
{
	struct mc_priv *mc_priv = dev_get_drvdata(&rpdev->dev);
	const guid_t *sec_type = &guid_null;
	u32 length, offset, error_id;
	u32 *result = (u32 *)data;
	struct ecc_status *p;
	int i, j, k, sec_sev;
	const char *err_str;
	u32 *adec_data;

	/* MCDI responses are routed straight back to the MCDI layer. */
	if (*(u8 *)data == MCDI_RESPONSE) {
		cdx_mcdi_process_cmd(mc_priv->mcdi, (struct cdx_dword *)data, len);
		return 0;
	}

	sec_sev = result[ERROR_LEVEL];
	error_id = result[ERROR_ID];
	length = result[MSG_ERR_LENGTH];
	offset = result[MSG_ERR_OFFSET];

	if (result[TOTAL_ERR_LENGTH] > length) {
		if (!mc_priv->part_len)
			mc_priv->part_len = length;
		else
			mc_priv->part_len += length;
		/*
		 * The data can come in 2 stretches. Construct the regs from 2
		 * messages the offset indicates the offset from which the data is to
		 * be taken
		 *
		 * NOTE(review): offset + length is taken from the firmware
		 * message and is not validated against REG_MAX — confirm the
		 * firmware contract bounds it.
		 */
		for (i = 0 ; i < length; i++) {
			k = offset + i;
			j = ERROR_DATA + i;
			mc_priv->regs[k] = result[j];
		}
		/* Wait until the full register dump has been reassembled. */
		if (mc_priv->part_len < result[TOTAL_ERR_LENGTH])
			return 0;
		mc_priv->part_len = 0;
	}

	mc_priv->error_id = error_id;
	mc_priv->error_level = result[ERROR_LEVEL];

	switch (error_id) {
	case 5:		err_str = "General Software Non-Correctable error"; break;
	case 6:		err_str = "CFU error"; break;
	case 7:		err_str = "CFRAME error"; break;
	case 10:	err_str = "DDRMC Microblaze Correctable ECC error"; break;
	case 11:	err_str = "DDRMC Microblaze Non-Correctable ECC error"; break;
	case 15:	err_str = "MMCM error"; break;
	case 16:	err_str = "HNICX Correctable error"; break;
	case 17:	err_str = "HNICX Non-Correctable error"; break;

	/* Error ids 18/19: DDR ECC CE/UE — decode and report via EDAC. */
	case 18:
		p = &mc_priv->stat;
		memset(p, 0, sizeof(struct ecc_status));
		p->error_type = MC5_ERR_TYPE_CE;
		for (i = 0 ; i < NUM_CONTROLLERS; i++) {
			if (get_ddr_info(&mc_priv->regs[i * REGS_PER_CONTROLLER], mc_priv)) {
				adec_data = mc_priv->adec + ADEC_NUM * i;
				handle_error(mc_priv, &mc_priv->stat, i, adec_data);
			}
		}
		return 0;
	case 19:
		p = &mc_priv->stat;
		memset(p, 0, sizeof(struct ecc_status));
		p->error_type = MC5_ERR_TYPE_UE;
		for (i = 0 ; i < NUM_CONTROLLERS; i++) {
			if (get_ddr_info(&mc_priv->regs[i * REGS_PER_CONTROLLER], mc_priv)) {
				adec_data = mc_priv->adec + ADEC_NUM * i;
				handle_error(mc_priv, &mc_priv->stat, i, adec_data);
			}
		}
		return 0;

	case 21:	err_str = "GT Non-Correctable error"; break;
	case 22:	err_str = "PL Sysmon Correctable error"; break;
	case 23:	err_str = "PL Sysmon Non-Correctable error"; break;
	case 111:	err_str = "LPX unexpected dfx activation error"; break;
	case 114:	err_str = "INT_LPD Non-Correctable error"; break;
	case 116:	err_str = "INT_OCM Non-Correctable error"; break;
	case 117:	err_str = "INT_FPD Correctable error"; break;
	case 118:	err_str = "INT_FPD Non-Correctable error"; break;
	case 120:	err_str = "INT_IOU Non-Correctable error"; break;
	case 123:	err_str = "err_int_irq from APU GIC Distributor"; break;
	case 124:	err_str = "fault_int_irq from APU GIC Distribute"; break;
	case 132 ... 139: err_str = "FPX SPLITTER error"; break;
	case 140:	err_str = "APU Cluster 0 error"; break;
	case 141:	err_str = "APU Cluster 1 error"; break;
	case 142:	err_str = "APU Cluster 2 error"; break;
	case 143:	err_str = "APU Cluster 3 error"; break;
	case 145:	err_str = "WWDT1 LPX error"; break;
	case 147:	err_str = "IPI error"; break;
	case 152 ... 153: err_str = "AFIFS error"; break;
	case 154 ... 155: err_str = "LPX glitch error"; break;
	case 185 ... 186: err_str = "FPX AFIFS error"; break;
	case 195 ... 199: err_str = "AFIFM error"; break;
	case 108:	err_str = "PSM Correctable error"; break;
	case 59:	err_str = "PMC correctable error"; break;
	case 60:	err_str = "PMC Un correctable error"; break;
	case 43 ... 47:	err_str = "PMC Sysmon error"; break;
	case 163 ... 184: err_str = "RPU error"; break;
	case 148:	err_str = "OCM0 correctable error"; break;
	case 149:	err_str = "OCM1 correctable error"; break;
	case 150:	err_str = "OCM0 Un-correctable error"; break;
	case 151:	err_str = "OCM1 Un-correctable error"; break;
	case 189:	err_str = "PSX_CMN_3 PD block consolidated error"; break;
	case 191:	err_str = "FPD_INT_WRAP PD block consolidated error"; break;
	case 232:	err_str = "CRAM Un-Correctable error"; break;
	default:	err_str = "VERSAL_EDAC_ERR_ID: %d"; break;
	}

	snprintf(mc_priv->message,
		 sizeof(mc_priv->message),
		 "[VERSAL_EDAC_ERR_ID: %d] Error type: %s", error_id, err_str);

	/* Convert to bytes */
	length = result[TOTAL_ERR_LENGTH] * 4;
	log_non_standard_event(sec_type, &amd_versalnet_guid, mc_priv->message,
			       sec_sev, (void *)&result[ERROR_DATA], length);

	return 0;
}
712 
/*
 * rpmsg channel match table. driver_data is stuffed with the mc_priv
 * pointer in mc_probe() before the rpmsg driver is registered.
 */
static struct rpmsg_device_id amd_rpmsg_id_table[] = {
	{ .name = "error_ipc" },
	{ },
};
MODULE_DEVICE_TABLE(rpmsg, amd_rpmsg_id_table);
718 
rpmsg_probe(struct rpmsg_device * rpdev)719 static int rpmsg_probe(struct rpmsg_device *rpdev)
720 {
721 	struct rpmsg_channel_info chinfo;
722 	struct mc_priv *pg;
723 
724 	pg = (struct mc_priv *)amd_rpmsg_id_table[0].driver_data;
725 	chinfo.src = RPMSG_ADDR_ANY;
726 	chinfo.dst = rpdev->dst;
727 	strscpy(chinfo.name, amd_rpmsg_id_table[0].name,
728 		strlen(amd_rpmsg_id_table[0].name));
729 
730 	pg->ept = rpmsg_create_ept(rpdev, rpmsg_cb, NULL, chinfo);
731 	if (!pg->ept)
732 		return dev_err_probe(&rpdev->dev, -ENXIO, "Failed to create ept for channel %s\n",
733 				     chinfo.name);
734 
735 	dev_set_drvdata(&rpdev->dev, pg);
736 
737 	return 0;
738 }
739 
rpmsg_remove(struct rpmsg_device * rpdev)740 static void rpmsg_remove(struct rpmsg_device *rpdev)
741 {
742 	struct mc_priv *mc_priv = dev_get_drvdata(&rpdev->dev);
743 
744 	rpmsg_destroy_ept(mc_priv->ept);
745 	dev_set_drvdata(&rpdev->dev, NULL);
746 }
747 
/* rpmsg driver receiving firmware error notifications and MCDI responses. */
static struct rpmsg_driver amd_rpmsg_driver = {
	.drv.name = KBUILD_MODNAME,
	.probe = rpmsg_probe,
	.remove = rpmsg_remove,
	.callback = rpmsg_cb,
	.id_table = amd_rpmsg_id_table,
};
755 
/* Release callback for the per-controller devices allocated in init_versalnet(). */
static void versal_edac_release(struct device *dev)
{
	kfree(dev);
}
760 
init_versalnet(struct mc_priv * priv,struct platform_device * pdev)761 static int init_versalnet(struct mc_priv *priv, struct platform_device *pdev)
762 {
763 	u32 num_chans, rank, dwidth, config;
764 	struct edac_mc_layer layers[2];
765 	struct mem_ctl_info *mci;
766 	struct device *dev;
767 	enum dev_type dt;
768 	char *name;
769 	int rc, i;
770 
771 	for (i = 0; i < NUM_CONTROLLERS; i++) {
772 		config = priv->adec[CONF + i * ADEC_NUM];
773 		num_chans = FIELD_GET(MC5_NUM_CHANS_MASK, config);
774 		rank = 1 << FIELD_GET(MC5_RANK_MASK, config);
775 		dwidth = FIELD_GET(MC5_BUS_WIDTH_MASK, config);
776 
777 		switch (dwidth) {
778 		case XDDR5_BUS_WIDTH_16:
779 			dt = DEV_X16;
780 			break;
781 		case XDDR5_BUS_WIDTH_32:
782 			dt = DEV_X32;
783 			break;
784 		case XDDR5_BUS_WIDTH_64:
785 			dt = DEV_X64;
786 			break;
787 		default:
788 			dt = DEV_UNKNOWN;
789 		}
790 
791 		if (dt == DEV_UNKNOWN)
792 			continue;
793 
794 		/* Find the first enabled device and register that one. */
795 		layers[0].type = EDAC_MC_LAYER_CHIP_SELECT;
796 		layers[0].size = rank;
797 		layers[0].is_virt_csrow = true;
798 		layers[1].type = EDAC_MC_LAYER_CHANNEL;
799 		layers[1].size = num_chans;
800 		layers[1].is_virt_csrow = false;
801 
802 		rc = -ENOMEM;
803 		mci = edac_mc_alloc(i, ARRAY_SIZE(layers), layers,
804 				    sizeof(struct mc_priv));
805 		if (!mci) {
806 			edac_printk(KERN_ERR, EDAC_MC, "Failed memory allocation for MC%d\n", i);
807 			goto err_alloc;
808 		}
809 
810 		priv->mci[i] = mci;
811 		priv->dwidth = dt;
812 
813 		dev = kzalloc(sizeof(*dev), GFP_KERNEL);
814 		dev->release = versal_edac_release;
815 		name = kmalloc(32, GFP_KERNEL);
816 		sprintf(name, "versal-net-ddrmc5-edac-%d", i);
817 		dev->init_name = name;
818 		rc = device_register(dev);
819 		if (rc)
820 			goto err_alloc;
821 
822 		mci->pdev = dev;
823 
824 		platform_set_drvdata(pdev, priv);
825 
826 		mc_init(mci, dev);
827 		rc = edac_mc_add_mc(mci);
828 		if (rc) {
829 			edac_printk(KERN_ERR, EDAC_MC, "Failed to register MC%d with EDAC core\n", i);
830 			goto err_alloc;
831 		}
832 	}
833 	return 0;
834 
835 err_alloc:
836 	while (i--) {
837 		mci = priv->mci[i];
838 		if (!mci)
839 			continue;
840 
841 		if (mci->pdev) {
842 			device_unregister(mci->pdev);
843 			edac_mc_del_mc(mci->pdev);
844 		}
845 
846 		edac_mc_free(mci);
847 	}
848 
849 	return rc;
850 }
851 
remove_versalnet(struct mc_priv * priv)852 static void remove_versalnet(struct mc_priv *priv)
853 {
854 	struct mem_ctl_info *mci;
855 	int i;
856 
857 	for (i = 0; i < NUM_CONTROLLERS; i++) {
858 		device_unregister(priv->mci[i]->pdev);
859 		mci = edac_mc_del_mc(priv->mci[i]->pdev);
860 		if (!mci)
861 			return;
862 
863 		edac_mc_free(mci);
864 	}
865 }
866 
mc_probe(struct platform_device * pdev)867 static int mc_probe(struct platform_device *pdev)
868 {
869 	struct device_node *r5_core_node;
870 	struct mc_priv *priv;
871 	struct rproc *rp;
872 	int rc;
873 
874 	r5_core_node = of_parse_phandle(pdev->dev.of_node, "amd,rproc", 0);
875 	if (!r5_core_node) {
876 		dev_err(&pdev->dev, "amd,rproc: invalid phandle\n");
877 		return -EINVAL;
878 	}
879 
880 	rp = rproc_get_by_phandle(r5_core_node->phandle);
881 	if (!rp)
882 		return -EPROBE_DEFER;
883 
884 	rc = rproc_boot(rp);
885 	if (rc) {
886 		dev_err(&pdev->dev, "Failed to attach to remote processor\n");
887 		goto err_rproc_boot;
888 	}
889 
890 	priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL);
891 	if (!priv) {
892 		rc = -ENOMEM;
893 		goto err_alloc;
894 	}
895 
896 	amd_rpmsg_id_table[0].driver_data = (kernel_ulong_t)priv;
897 
898 	rc = register_rpmsg_driver(&amd_rpmsg_driver);
899 	if (rc) {
900 		edac_printk(KERN_ERR, EDAC_MC, "Failed to register RPMsg driver: %d\n", rc);
901 		goto err_alloc;
902 	}
903 
904 	rc = setup_mcdi(priv);
905 	if (rc)
906 		goto err_unreg;
907 
908 	priv->mcdi->r5_rproc = rp;
909 
910 	rc = init_versalnet(priv, pdev);
911 	if (rc)
912 		goto err_init;
913 
914 	return 0;
915 
916 err_init:
917 	cdx_mcdi_finish(priv->mcdi);
918 
919 err_unreg:
920 	unregister_rpmsg_driver(&amd_rpmsg_driver);
921 
922 err_alloc:
923 	rproc_shutdown(rp);
924 
925 err_rproc_boot:
926 	rproc_put(rp);
927 
928 	return rc;
929 }
930 
mc_remove(struct platform_device * pdev)931 static void mc_remove(struct platform_device *pdev)
932 {
933 	struct mc_priv *priv = platform_get_drvdata(pdev);
934 
935 	unregister_rpmsg_driver(&amd_rpmsg_driver);
936 	remove_versalnet(priv);
937 	rproc_shutdown(priv->mcdi->r5_rproc);
938 	cdx_mcdi_finish(priv->mcdi);
939 }
940 
/* Device-tree match table for the Versal NET DDRMC5 memory controller. */
static const struct of_device_id amd_edac_match[] = {
	{ .compatible = "xlnx,versal-net-ddrmc5", },
	{}
};
MODULE_DEVICE_TABLE(of, amd_edac_match);
946 
/* Platform driver glue; probe/remove manage the rpmsg and EDAC lifecycle. */
static struct platform_driver amd_ddr_edac_mc_driver = {
	.driver = {
		.name = "versal-net-edac",
		.of_match_table = amd_edac_match,
	},
	.probe = mc_probe,
	.remove = mc_remove,
};
955 
956 module_platform_driver(amd_ddr_edac_mc_driver);
957 
958 MODULE_AUTHOR("AMD Inc");
959 MODULE_DESCRIPTION("Versal NET EDAC driver");
960 MODULE_LICENSE("GPL");
961