1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * AMD Versal NET memory controller driver
4 * Copyright (C) 2025 Advanced Micro Devices, Inc.
5 */
6
7 #include <linux/cdx/edac_cdx_pcol.h>
8 #include <linux/edac.h>
9 #include <linux/module.h>
10 #include <linux/of_device.h>
11 #include <linux/ras.h>
12 #include <linux/remoteproc.h>
13 #include <linux/rpmsg.h>
14 #include <linux/sizes.h>
15 #include <ras/ras_event.h>
16
17 #include "edac_module.h"
18
19 /* Granularity of reported error in bytes */
20 #define MC5_ERR_GRAIN 1
21 #define MC_GET_DDR_CONFIG_IN_LEN 4
22
23 #define MC5_IRQ_CE_MASK GENMASK(18, 15)
24 #define MC5_IRQ_UE_MASK GENMASK(14, 11)
25
26 #define MC5_RANK_1_MASK GENMASK(11, 6)
27 #define MASK_24 GENMASK(29, 24)
28 #define MASK_0 GENMASK(5, 0)
29
30 #define MC5_LRANK_1_MASK GENMASK(11, 6)
31 #define MC5_LRANK_2_MASK GENMASK(17, 12)
32 #define MC5_BANK1_MASK GENMASK(11, 6)
33 #define MC5_GRP_0_MASK GENMASK(17, 12)
34 #define MC5_GRP_1_MASK GENMASK(23, 18)
35
36 #define MC5_REGHI_ROW 7
37 #define MC5_EACHBIT 1
38 #define MC5_ERR_TYPE_CE 0
39 #define MC5_ERR_TYPE_UE 1
40 #define MC5_HIGH_MEM_EN BIT(20)
41 #define MC5_MEM_MASK GENMASK(19, 0)
42 #define MC5_X16_BASE 256
43 #define MC5_X16_ECC 32
44 #define MC5_X16_SIZE (MC5_X16_BASE + MC5_X16_ECC)
45 #define MC5_X32_SIZE 576
46 #define MC5_HIMEM_BASE (256 * SZ_1M)
47 #define MC5_ILC_HIMEM_EN BIT(28)
48 #define MC5_ILC_MEM GENMASK(27, 0)
49 #define MC5_INTERLEAVE_SEL GENMASK(3, 0)
50 #define MC5_BUS_WIDTH_MASK GENMASK(19, 18)
51 #define MC5_NUM_CHANS_MASK BIT(17)
52 #define MC5_RANK_MASK GENMASK(15, 14)
53
54 #define ERROR_LEVEL 2
55 #define ERROR_ID 3
56 #define TOTAL_ERR_LENGTH 5
57 #define MSG_ERR_OFFSET 8
58 #define MSG_ERR_LENGTH 9
59 #define ERROR_DATA 10
60 #define MCDI_RESPONSE 0xFF
61
62 #define REG_MAX 152
63 #define ADEC_MAX 152
64 #define NUM_CONTROLLERS 8
65 #define REGS_PER_CONTROLLER 19
66 #define ADEC_NUM 19
67 #define BUFFER_SZ 80
68
69 #define XDDR5_BUS_WIDTH_64 0
70 #define XDDR5_BUS_WIDTH_32 1
71 #define XDDR5_BUS_WIDTH_16 2
72
/**
 * union ecc_error_info - ECC error log information.
 * @burstpos: Burst position.
 * @lrank: Logical Rank number.
 * @rank: Rank number.
 * @group: Group number.
 * @bank: Bank number.
 * @col: Column number.
 * @row: Row number (low bits; the rest live in @rowhi).
 * @rowhi: Row number higher bits.
 * @i: Combined ECC error vector containing encoded values of burst position,
 * rank, bank, column, and row information.
 *
 * The bitfield view overlays @i, which is filled directly from the
 * ECCR[01]_ADDR_LO/HI register pair in get_ddr_info().
 */
union ecc_error_info {
	struct {
		u32 burstpos:3;
		u32 lrank:4;
		u32 rank:2;
		u32 group:3;
		u32 bank:2;
		u32 col:11;
		u32 row:7;
		u32 rowhi;
	};
	u64 i;
} __packed;
99
/*
 * Row and column bit positions in the address decoder (ADEC) registers.
 * Each 6-bit field holds the destination bit position within the physical
 * address for one row (or column) bit; both views overlay the same 32-bit
 * register value @i.
 */
union row_col_mapping {
	struct {
		u32 row0:6;
		u32 row1:6;
		u32 row2:6;
		u32 row3:6;
		u32 row4:6;
		u32 reserved:2;
	};
	struct {
		u32 col1:6;
		u32 col2:6;
		u32 col3:6;
		u32 col4:6;
		u32 col5:6;
		u32 reservedcol:2;
	};
	u32 i;
} __packed;
120
/**
 * struct ecc_status - ECC status information to report.
 * @ceinfo: Correctable error log, one entry per ECC region (ECCR0/ECCR1).
 * @ueinfo: Uncorrected error log, one entry per ECC region (ECCR0/ECCR1).
 * @channel: Channel number the error was reported on (0 or 1).
 * @error_type: Error type (MC5_ERR_TYPE_CE or MC5_ERR_TYPE_UE).
 */
struct ecc_status {
	union ecc_error_info ceinfo[2];
	union ecc_error_info ueinfo[2];
	u8 channel;
	u8 error_type;
};
134
/**
 * struct mc_priv - DDR memory controller private instance data.
 * @message: Buffer for framing the event specific info.
 * @stat: ECC status information for the error currently being handled.
 * @error_id: The error id of the last received notification.
 * @error_level: The error level of the last received notification.
 * @dwidth: Width of data bus excluding ECC bits (enum dev_type value).
 * @part_len: Length of the message received so far; notifications may be
 *            split across two rpmsg transfers (see rpmsg_cb()).
 * @regs: The registers sent on the rpmsg (REGS_PER_CONTROLLER per MC).
 * @adec: Address decode registers (ADEC_NUM per controller).
 * @mci: Memory controller interface, one per DDR controller; entries stay
 *       NULL for controllers with an unrecognized configuration.
 * @ept: rpmsg endpoint.
 * @mcdi: The mcdi handle.
 */
struct mc_priv {
	char message[256];
	struct ecc_status stat;
	u32 error_id;
	u32 error_level;
	u32 dwidth;
	u32 part_len;
	u32 regs[REG_MAX];
	u32 adec[ADEC_MAX];
	struct mem_ctl_info *mci[NUM_CONTROLLERS];
	struct rpmsg_endpoint *ept;
	struct cdx_mcdi *mcdi;
};
162
/*
 * Address decoder (ADEC) registers to match the order in which the register
 * information is received from the firmware (ADEC_NUM entries per
 * controller).
 */
enum adec_info {
	CONF = 0,	/* Controller configuration (width/channels/ranks). */
	ADEC0,
	ADEC1,
	ADEC2,
	ADEC3,
	ADEC4,
	ADEC5,
	ADEC6,
	ADEC7,
	ADEC8,
	ADEC9,
	ADEC10,
	ADEC11,
	ADEC12,
	ADEC13,
	ADEC14,
	ADEC15,
	ADEC16,
	ADECILC,	/* Interleave control. */
};
188
/*
 * MC5 status/log registers in the order they arrive from the firmware
 * (REGS_PER_CONTROLLER entries per controller).
 */
enum reg_info {
	ISR = 0,	/* Interrupt status; CE/UE bits checked first. */
	IMR,
	ECCR0_ERR_STATUS,
	ECCR0_ADDR_LO,
	ECCR0_ADDR_HI,
	ECCR0_DATA_LO,
	ECCR0_DATA_HI,
	ECCR0_PAR,
	ECCR1_ERR_STATUS,
	ECCR1_ADDR_LO,
	ECCR1_ADDR_HI,
	ECCR1_DATA_LO,
	ECCR1_DATA_HI,
	ECCR1_PAR,
	XMPU_ERR,
	XMPU_ERR_ADDR_L0,
	XMPU_ERR_ADDR_HI,
	XMPU_ERR_AXI_ID,
	ADEC_CHK_ERR_LOG,
};
210
get_ddr_info(u32 * error_data,struct mc_priv * priv)211 static bool get_ddr_info(u32 *error_data, struct mc_priv *priv)
212 {
213 u32 reglo, reghi, parity, eccr0_val, eccr1_val, isr;
214 struct ecc_status *p;
215
216 isr = error_data[ISR];
217
218 if (!(isr & (MC5_IRQ_UE_MASK | MC5_IRQ_CE_MASK)))
219 return false;
220
221 eccr0_val = error_data[ECCR0_ERR_STATUS];
222 eccr1_val = error_data[ECCR1_ERR_STATUS];
223
224 if (!eccr0_val && !eccr1_val)
225 return false;
226
227 p = &priv->stat;
228
229 if (!eccr0_val)
230 p->channel = 1;
231 else
232 p->channel = 0;
233
234 reglo = error_data[ECCR0_ADDR_LO];
235 reghi = error_data[ECCR0_ADDR_HI];
236 if (isr & MC5_IRQ_CE_MASK)
237 p->ceinfo[0].i = reglo | (u64)reghi << 32;
238 else if (isr & MC5_IRQ_UE_MASK)
239 p->ueinfo[0].i = reglo | (u64)reghi << 32;
240
241 parity = error_data[ECCR0_PAR];
242 edac_dbg(2, "ERR DATA: 0x%08X%08X PARITY: 0x%08X\n",
243 reghi, reglo, parity);
244
245 reglo = error_data[ECCR1_ADDR_LO];
246 reghi = error_data[ECCR1_ADDR_HI];
247 if (isr & MC5_IRQ_CE_MASK)
248 p->ceinfo[1].i = reglo | (u64)reghi << 32;
249 else if (isr & MC5_IRQ_UE_MASK)
250 p->ueinfo[1].i = reglo | (u64)reghi << 32;
251
252 parity = error_data[ECCR1_PAR];
253 edac_dbg(2, "ERR DATA: 0x%08X%08X PARITY: 0x%08X\n",
254 reghi, reglo, parity);
255
256 return true;
257 }
258
259 /**
260 * convert_to_physical - Convert @error_data to a physical address.
261 * @priv: DDR memory controller private instance data.
262 * @pinf: ECC error info structure.
263 * @controller: Controller number of the MC5
264 * @error_data: the DDRMC5 ADEC address decoder register data
265 *
266 * Return: physical address of the DDR memory.
267 */
convert_to_physical(struct mc_priv * priv,union ecc_error_info pinf,int controller,int * error_data)268 static unsigned long convert_to_physical(struct mc_priv *priv,
269 union ecc_error_info pinf,
270 int controller, int *error_data)
271 {
272 u32 row, blk, rsh_req_addr, interleave, ilc_base_ctrl_add, ilc_himem_en, reg, offset;
273 u64 high_mem_base, high_mem_offset, low_mem_offset, ilcmem_base;
274 unsigned long err_addr = 0, addr;
275 union row_col_mapping cols;
276 union row_col_mapping rows;
277 u32 col_bit_0;
278
279 row = pinf.rowhi << MC5_REGHI_ROW | pinf.row;
280 offset = controller * ADEC_NUM;
281
282 reg = error_data[ADEC6];
283 rows.i = reg;
284 err_addr |= (row & BIT(0)) << rows.row0;
285 row >>= MC5_EACHBIT;
286 err_addr |= (row & BIT(0)) << rows.row1;
287 row >>= MC5_EACHBIT;
288 err_addr |= (row & BIT(0)) << rows.row2;
289 row >>= MC5_EACHBIT;
290 err_addr |= (row & BIT(0)) << rows.row3;
291 row >>= MC5_EACHBIT;
292 err_addr |= (row & BIT(0)) << rows.row4;
293 row >>= MC5_EACHBIT;
294
295 reg = error_data[ADEC7];
296 rows.i = reg;
297 err_addr |= (row & BIT(0)) << rows.row0;
298 row >>= MC5_EACHBIT;
299 err_addr |= (row & BIT(0)) << rows.row1;
300 row >>= MC5_EACHBIT;
301 err_addr |= (row & BIT(0)) << rows.row2;
302 row >>= MC5_EACHBIT;
303 err_addr |= (row & BIT(0)) << rows.row3;
304 row >>= MC5_EACHBIT;
305 err_addr |= (row & BIT(0)) << rows.row4;
306 row >>= MC5_EACHBIT;
307
308 reg = error_data[ADEC8];
309 rows.i = reg;
310 err_addr |= (row & BIT(0)) << rows.row0;
311 row >>= MC5_EACHBIT;
312 err_addr |= (row & BIT(0)) << rows.row1;
313 row >>= MC5_EACHBIT;
314 err_addr |= (row & BIT(0)) << rows.row2;
315 row >>= MC5_EACHBIT;
316 err_addr |= (row & BIT(0)) << rows.row3;
317 row >>= MC5_EACHBIT;
318 err_addr |= (row & BIT(0)) << rows.row4;
319
320 reg = error_data[ADEC9];
321 rows.i = reg;
322
323 err_addr |= (row & BIT(0)) << rows.row0;
324 row >>= MC5_EACHBIT;
325 err_addr |= (row & BIT(0)) << rows.row1;
326 row >>= MC5_EACHBIT;
327 err_addr |= (row & BIT(0)) << rows.row2;
328 row >>= MC5_EACHBIT;
329
330 col_bit_0 = FIELD_GET(MASK_24, error_data[ADEC9]);
331 pinf.col >>= 1;
332 err_addr |= (pinf.col & 1) << col_bit_0;
333
334 cols.i = error_data[ADEC10];
335 err_addr |= (pinf.col & 1) << cols.col1;
336 pinf.col >>= 1;
337 err_addr |= (pinf.col & 1) << cols.col2;
338 pinf.col >>= 1;
339 err_addr |= (pinf.col & 1) << cols.col3;
340 pinf.col >>= 1;
341 err_addr |= (pinf.col & 1) << cols.col4;
342 pinf.col >>= 1;
343 err_addr |= (pinf.col & 1) << cols.col5;
344 pinf.col >>= 1;
345
346 cols.i = error_data[ADEC11];
347 err_addr |= (pinf.col & 1) << cols.col1;
348 pinf.col >>= 1;
349 err_addr |= (pinf.col & 1) << cols.col2;
350 pinf.col >>= 1;
351 err_addr |= (pinf.col & 1) << cols.col3;
352 pinf.col >>= 1;
353 err_addr |= (pinf.col & 1) << cols.col4;
354 pinf.col >>= 1;
355 err_addr |= (pinf.col & 1) << cols.col5;
356 pinf.col >>= 1;
357
358 reg = error_data[ADEC12];
359 err_addr |= (pinf.bank & BIT(0)) << (reg & MASK_0);
360 pinf.bank >>= MC5_EACHBIT;
361 err_addr |= (pinf.bank & BIT(0)) << FIELD_GET(MC5_BANK1_MASK, reg);
362 pinf.bank >>= MC5_EACHBIT;
363
364 err_addr |= (pinf.bank & BIT(0)) << FIELD_GET(MC5_GRP_0_MASK, reg);
365 pinf.group >>= MC5_EACHBIT;
366 err_addr |= (pinf.bank & BIT(0)) << FIELD_GET(MC5_GRP_1_MASK, reg);
367 pinf.group >>= MC5_EACHBIT;
368 err_addr |= (pinf.bank & BIT(0)) << FIELD_GET(MASK_24, reg);
369 pinf.group >>= MC5_EACHBIT;
370
371 reg = error_data[ADEC4];
372 err_addr |= (pinf.rank & BIT(0)) << (reg & MASK_0);
373 pinf.rank >>= MC5_EACHBIT;
374 err_addr |= (pinf.rank & BIT(0)) << FIELD_GET(MC5_RANK_1_MASK, reg);
375 pinf.rank >>= MC5_EACHBIT;
376
377 reg = error_data[ADEC5];
378 err_addr |= (pinf.lrank & BIT(0)) << (reg & MASK_0);
379 pinf.lrank >>= MC5_EACHBIT;
380 err_addr |= (pinf.lrank & BIT(0)) << FIELD_GET(MC5_LRANK_1_MASK, reg);
381 pinf.lrank >>= MC5_EACHBIT;
382 err_addr |= (pinf.lrank & BIT(0)) << FIELD_GET(MC5_LRANK_2_MASK, reg);
383 pinf.lrank >>= MC5_EACHBIT;
384 err_addr |= (pinf.lrank & BIT(0)) << FIELD_GET(MASK_24, reg);
385 pinf.lrank >>= MC5_EACHBIT;
386
387 high_mem_base = (priv->adec[ADEC2 + offset] & MC5_MEM_MASK) * MC5_HIMEM_BASE;
388 interleave = priv->adec[ADEC13 + offset] & MC5_INTERLEAVE_SEL;
389
390 high_mem_offset = priv->adec[ADEC3 + offset] & MC5_MEM_MASK;
391 low_mem_offset = priv->adec[ADEC1 + offset] & MC5_MEM_MASK;
392 reg = priv->adec[ADEC14 + offset];
393 ilc_himem_en = !!(reg & MC5_ILC_HIMEM_EN);
394 ilcmem_base = (reg & MC5_ILC_MEM) * SZ_1M;
395 if (ilc_himem_en)
396 ilc_base_ctrl_add = ilcmem_base - high_mem_offset;
397 else
398 ilc_base_ctrl_add = ilcmem_base - low_mem_offset;
399
400 if (priv->dwidth == DEV_X16) {
401 blk = err_addr / MC5_X16_SIZE;
402 rsh_req_addr = (blk << 8) + ilc_base_ctrl_add;
403 err_addr = rsh_req_addr * interleave * 2;
404 } else {
405 blk = err_addr / MC5_X32_SIZE;
406 rsh_req_addr = (blk << 9) + ilc_base_ctrl_add;
407 err_addr = rsh_req_addr * interleave * 2;
408 }
409
410 if ((priv->adec[ADEC2 + offset] & MC5_HIGH_MEM_EN) && err_addr >= high_mem_base)
411 addr = err_addr - high_mem_offset;
412 else
413 addr = err_addr - low_mem_offset;
414
415 return addr;
416 }
417
418 /**
419 * handle_error - Handle errors.
420 * @priv: DDR memory controller private instance data.
421 * @stat: ECC status structure.
422 * @ctl_num: Controller number of the MC5
423 * @error_data: the MC5 ADEC address decoder register data
424 *
425 * Handles ECC correctable and uncorrectable errors.
426 */
handle_error(struct mc_priv * priv,struct ecc_status * stat,int ctl_num,int * error_data)427 static void handle_error(struct mc_priv *priv, struct ecc_status *stat,
428 int ctl_num, int *error_data)
429 {
430 union ecc_error_info pinf;
431 struct mem_ctl_info *mci;
432 unsigned long pa;
433 phys_addr_t pfn;
434 int err;
435
436 if (WARN_ON_ONCE(ctl_num > NUM_CONTROLLERS))
437 return;
438
439 mci = priv->mci[ctl_num];
440
441 if (stat->error_type == MC5_ERR_TYPE_CE) {
442 pinf = stat->ceinfo[stat->channel];
443 snprintf(priv->message, sizeof(priv->message),
444 "Error type:%s Controller %d Addr at %lx\n",
445 "CE", ctl_num, convert_to_physical(priv, pinf, ctl_num, error_data));
446
447 edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci,
448 1, 0, 0, 0, 0, 0, -1,
449 priv->message, "");
450 }
451
452 if (stat->error_type == MC5_ERR_TYPE_UE) {
453 pinf = stat->ueinfo[stat->channel];
454 snprintf(priv->message, sizeof(priv->message),
455 "Error type:%s controller %d Addr at %lx\n",
456 "UE", ctl_num, convert_to_physical(priv, pinf, ctl_num, error_data));
457
458 edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci,
459 1, 0, 0, 0, 0, 0, -1,
460 priv->message, "");
461 pa = convert_to_physical(priv, pinf, ctl_num, error_data);
462 pfn = PHYS_PFN(pa);
463
464 if (IS_ENABLED(CONFIG_MEMORY_FAILURE)) {
465 err = memory_failure(pfn, MF_ACTION_REQUIRED);
466 if (err)
467 edac_dbg(2, "memory_failure() error: %d", err);
468 else
469 edac_dbg(2, "Poison page at PA 0x%lx\n", pa);
470 }
471 }
472 }
473
mc_init(struct mem_ctl_info * mci,struct device * dev)474 static void mc_init(struct mem_ctl_info *mci, struct device *dev)
475 {
476 struct mc_priv *priv = mci->pvt_info;
477 struct csrow_info *csi;
478 struct dimm_info *dimm;
479 u32 row;
480 int ch;
481
482 /* Initialize controller capabilities and configuration */
483 mci->mtype_cap = MEM_FLAG_DDR5;
484 mci->edac_ctl_cap = EDAC_FLAG_NONE | EDAC_FLAG_SECDED;
485 mci->scrub_cap = SCRUB_HW_SRC;
486 mci->scrub_mode = SCRUB_NONE;
487
488 mci->edac_cap = EDAC_FLAG_SECDED;
489 mci->ctl_name = "VersalNET DDR5";
490 mci->dev_name = dev_name(dev);
491 mci->mod_name = "versalnet_edac";
492
493 edac_op_state = EDAC_OPSTATE_INT;
494
495 for (row = 0; row < mci->nr_csrows; row++) {
496 csi = mci->csrows[row];
497 for (ch = 0; ch < csi->nr_channels; ch++) {
498 dimm = csi->channels[ch]->dimm;
499 dimm->edac_mode = EDAC_SECDED;
500 dimm->mtype = MEM_DDR5;
501 dimm->grain = MC5_ERR_GRAIN;
502 dimm->dtype = priv->dwidth;
503 }
504 }
505 }
506
507 #define to_mci(k) container_of(k, struct mem_ctl_info, dev)
508
/* Fixed RPC timeout for every MCDI command; @cmd is intentionally unused. */
static unsigned int mcdi_rpc_timeout(struct cdx_mcdi *cdx, unsigned int cmd)
{
	return MCDI_RPC_TIMEOUT;
}
513
/*
 * mcdi_request - Transmit one MCDI command over the rpmsg endpoint.
 *
 * Concatenates the MCDI header and payload into a single buffer and sends
 * it. Allocation failure is silent (void callback); send failures are
 * logged only.
 */
static void mcdi_request(struct cdx_mcdi *cdx,
			 const struct cdx_dword *hdr, size_t hdr_len,
			 const struct cdx_dword *sdu, size_t sdu_len)
{
	void *send_buf;
	int ret;

	/* The two memcpy()s below overwrite the whole buffer: no zeroing. */
	send_buf = kmalloc(hdr_len + sdu_len, GFP_KERNEL);
	if (!send_buf)
		return;

	memcpy(send_buf, hdr, hdr_len);
	memcpy(send_buf + hdr_len, sdu, sdu_len);

	ret = rpmsg_send(cdx->ept, send_buf, hdr_len + sdu_len);
	if (ret)
		dev_err(&cdx->rpdev->dev, "Failed to send rpmsg data: %d\n", ret);

	kfree(send_buf);
}
534
/* Callbacks the CDX MCDI core uses to time out and transmit commands. */
static const struct cdx_mcdi_ops mcdi_ops = {
	.mcdi_rpc_timeout = mcdi_rpc_timeout,
	.mcdi_request = mcdi_request,
};
539
/*
 * Fetch the ADEC (address decoder) configuration of controller @index from
 * the firmware via MCDI into @buffer (ADEC_NUM 32-bit words). On RPC
 * failure @buffer is left untouched.
 */
static void get_ddr_config(u32 index, u32 *buffer, struct cdx_mcdi *amd_mcdi)
{
	size_t outlen;
	int ret;

	MCDI_DECLARE_BUF(inbuf, MC_GET_DDR_CONFIG_IN_LEN);
	MCDI_DECLARE_BUF(outbuf, BUFFER_SZ);

	MCDI_SET_DWORD(inbuf, EDAC_GET_DDR_CONFIG_IN_CONTROLLER_INDEX, index);

	ret = cdx_mcdi_rpc(amd_mcdi, MC_CMD_EDAC_GET_DDR_CONFIG, inbuf, sizeof(inbuf),
			   outbuf, sizeof(outbuf), &outlen);
	if (!ret)
		/* ADEC_NUM 32-bit registers per controller. */
		memcpy(buffer, MCDI_PTR(outbuf, GET_DDR_CONFIG),
		       (ADEC_NUM * 4));
}
556
setup_mcdi(struct mc_priv * mc_priv)557 static int setup_mcdi(struct mc_priv *mc_priv)
558 {
559 struct cdx_mcdi *amd_mcdi;
560 int ret, i;
561
562 amd_mcdi = kzalloc(sizeof(*amd_mcdi), GFP_KERNEL);
563 if (!amd_mcdi)
564 return -ENOMEM;
565
566 amd_mcdi->mcdi_ops = &mcdi_ops;
567 ret = cdx_mcdi_init(amd_mcdi);
568 if (ret) {
569 kfree(amd_mcdi);
570 return ret;
571 }
572
573 amd_mcdi->ept = mc_priv->ept;
574 mc_priv->mcdi = amd_mcdi;
575
576 for (i = 0; i < NUM_CONTROLLERS; i++)
577 get_ddr_config(i, &mc_priv->adec[ADEC_NUM * i], amd_mcdi);
578
579 return 0;
580 }
581
/* Section-type GUID used when logging non-standard error records. */
static const guid_t amd_versalnet_guid = GUID_INIT(0x82678888, 0xa556, 0x44f2,
						   0xb8, 0xb4, 0x45, 0x56, 0x2e,
						   0x8c, 0x5b, 0xec);
585
/*
 * rpmsg_cb - Receive callback for the "error_ipc" channel.
 *
 * Messages are either MCDI responses (first byte MCDI_RESPONSE), which are
 * forwarded to the MCDI core, or error notifications from the firmware.
 * Error ids 18/19 carry the raw MC5 register dump (possibly split across
 * two transfers) and are decoded into EDAC CE/UE reports; every other id
 * is logged as a non-standard RAS event.
 */
static int rpmsg_cb(struct rpmsg_device *rpdev, void *data,
		    int len, void *priv, u32 src)
{
	struct mc_priv *mc_priv = dev_get_drvdata(&rpdev->dev);
	const guid_t *sec_type = &guid_null;
	u32 length, offset, error_id;
	u32 *result = (u32 *)data;
	struct ecc_status *p;
	int i, sec_sev;
	const char *err_str;
	u32 *adec_data;
	u32 k;

	/* MCDI responses are handled by the MCDI core, not parsed here. */
	if (*(u8 *)data == MCDI_RESPONSE) {
		cdx_mcdi_process_cmd(mc_priv->mcdi, (struct cdx_dword *)data, len);
		return 0;
	}

	sec_sev = result[ERROR_LEVEL];
	error_id = result[ERROR_ID];
	length = result[MSG_ERR_LENGTH];
	offset = result[MSG_ERR_OFFSET];

	if (result[TOTAL_ERR_LENGTH] > length) {
		if (!mc_priv->part_len)
			mc_priv->part_len = length;
		else
			mc_priv->part_len += length;
		/*
		 * The data can come in 2 stretches. Construct the regs from 2
		 * messages the offset indicates the offset from which the data is to
		 * be taken
		 */
		for (i = 0; i < length; i++) {
			k = offset + i;
			/*
			 * offset/length come from the remote processor and
			 * are untrusted: never write outside regs[] or read
			 * past the received message.
			 */
			if (k >= REG_MAX || (ERROR_DATA + i + 1) * 4 > len)
				break;
			mc_priv->regs[k] = result[ERROR_DATA + i];
		}
		if (mc_priv->part_len < result[TOTAL_ERR_LENGTH])
			return 0;
		mc_priv->part_len = 0;
	}

	mc_priv->error_id = error_id;
	mc_priv->error_level = result[ERROR_LEVEL];

	switch (error_id) {
	case 5: err_str = "General Software Non-Correctable error"; break;
	case 6: err_str = "CFU error"; break;
	case 7: err_str = "CFRAME error"; break;
	case 10: err_str = "DDRMC Microblaze Correctable ECC error"; break;
	case 11: err_str = "DDRMC Microblaze Non-Correctable ECC error"; break;
	case 15: err_str = "MMCM error"; break;
	case 16: err_str = "HNICX Correctable error"; break;
	case 17: err_str = "HNICX Non-Correctable error"; break;

	case 18:
		/* DDR CE: decode every controller's register dump. */
		p = &mc_priv->stat;
		memset(p, 0, sizeof(struct ecc_status));
		p->error_type = MC5_ERR_TYPE_CE;
		for (i = 0; i < NUM_CONTROLLERS; i++) {
			if (get_ddr_info(&mc_priv->regs[i * REGS_PER_CONTROLLER], mc_priv)) {
				adec_data = mc_priv->adec + ADEC_NUM * i;
				handle_error(mc_priv, &mc_priv->stat, i, adec_data);
			}
		}
		return 0;
	case 19:
		/* DDR UE: same walk, uncorrectable reporting. */
		p = &mc_priv->stat;
		memset(p, 0, sizeof(struct ecc_status));
		p->error_type = MC5_ERR_TYPE_UE;
		for (i = 0; i < NUM_CONTROLLERS; i++) {
			if (get_ddr_info(&mc_priv->regs[i * REGS_PER_CONTROLLER], mc_priv)) {
				adec_data = mc_priv->adec + ADEC_NUM * i;
				handle_error(mc_priv, &mc_priv->stat, i, adec_data);
			}
		}
		return 0;

	case 21: err_str = "GT Non-Correctable error"; break;
	case 22: err_str = "PL Sysmon Correctable error"; break;
	case 23: err_str = "PL Sysmon Non-Correctable error"; break;
	case 111: err_str = "LPX unexpected dfx activation error"; break;
	case 114: err_str = "INT_LPD Non-Correctable error"; break;
	case 116: err_str = "INT_OCM Non-Correctable error"; break;
	case 117: err_str = "INT_FPD Correctable error"; break;
	case 118: err_str = "INT_FPD Non-Correctable error"; break;
	case 120: err_str = "INT_IOU Non-Correctable error"; break;
	case 123: err_str = "err_int_irq from APU GIC Distributor"; break;
	case 124: err_str = "fault_int_irq from APU GIC Distribute"; break;
	case 132 ... 139: err_str = "FPX SPLITTER error"; break;
	case 140: err_str = "APU Cluster 0 error"; break;
	case 141: err_str = "APU Cluster 1 error"; break;
	case 142: err_str = "APU Cluster 2 error"; break;
	case 143: err_str = "APU Cluster 3 error"; break;
	case 145: err_str = "WWDT1 LPX error"; break;
	case 147: err_str = "IPI error"; break;
	case 152 ... 153: err_str = "AFIFS error"; break;
	case 154 ... 155: err_str = "LPX glitch error"; break;
	case 185 ... 186: err_str = "FPX AFIFS error"; break;
	case 195 ... 199: err_str = "AFIFM error"; break;
	case 108: err_str = "PSM Correctable error"; break;
	case 59: err_str = "PMC correctable error"; break;
	case 60: err_str = "PMC Un correctable error"; break;
	case 43 ... 47: err_str = "PMC Sysmon error"; break;
	case 163 ... 184: err_str = "RPU error"; break;
	case 148: err_str = "OCM0 correctable error"; break;
	case 149: err_str = "OCM1 correctable error"; break;
	case 150: err_str = "OCM0 Un-correctable error"; break;
	case 151: err_str = "OCM1 Un-correctable error"; break;
	case 189: err_str = "PSX_CMN_3 PD block consolidated error"; break;
	case 191: err_str = "FPD_INT_WRAP PD block consolidated error"; break;
	case 232: err_str = "CRAM Un-Correctable error"; break;
	default: err_str = "VERSAL_EDAC_ERR_ID: %d"; break;
	}

	snprintf(mc_priv->message,
		 sizeof(mc_priv->message),
		 "[VERSAL_EDAC_ERR_ID: %d] Error type: %s", error_id, err_str);

	/* Convert to bytes */
	length = result[TOTAL_ERR_LENGTH] * 4;
	log_non_standard_event(sec_type, &amd_versalnet_guid, mc_priv->message,
			       sec_sev, (void *)&result[ERROR_DATA], length);

	return 0;
}
712
/* Match the "error_ipc" rpmsg channel announced by the RPU firmware. */
static struct rpmsg_device_id amd_rpmsg_id_table[] = {
	{ .name = "error_ipc" },
	{ },
};
MODULE_DEVICE_TABLE(rpmsg, amd_rpmsg_id_table);
718
rpmsg_probe(struct rpmsg_device * rpdev)719 static int rpmsg_probe(struct rpmsg_device *rpdev)
720 {
721 struct rpmsg_channel_info chinfo;
722 struct mc_priv *pg;
723
724 pg = (struct mc_priv *)amd_rpmsg_id_table[0].driver_data;
725 chinfo.src = RPMSG_ADDR_ANY;
726 chinfo.dst = rpdev->dst;
727 strscpy(chinfo.name, amd_rpmsg_id_table[0].name,
728 strlen(amd_rpmsg_id_table[0].name));
729
730 pg->ept = rpmsg_create_ept(rpdev, rpmsg_cb, NULL, chinfo);
731 if (!pg->ept)
732 return dev_err_probe(&rpdev->dev, -ENXIO, "Failed to create ept for channel %s\n",
733 chinfo.name);
734
735 dev_set_drvdata(&rpdev->dev, pg);
736
737 return 0;
738 }
739
rpmsg_remove(struct rpmsg_device * rpdev)740 static void rpmsg_remove(struct rpmsg_device *rpdev)
741 {
742 struct mc_priv *mc_priv = dev_get_drvdata(&rpdev->dev);
743
744 rpmsg_destroy_ept(mc_priv->ept);
745 dev_set_drvdata(&rpdev->dev, NULL);
746 }
747
/* rpmsg driver; probe runs once the "error_ipc" channel is announced. */
static struct rpmsg_driver amd_rpmsg_driver = {
	.drv.name = KBUILD_MODNAME,
	.probe = rpmsg_probe,
	.remove = rpmsg_remove,
	.callback = rpmsg_cb,
	.id_table = amd_rpmsg_id_table,
};
755
/*
 * Release callback for the per-controller parent devices allocated in
 * init_versalnet(); frees the struct device itself on the final put.
 */
static void versal_edac_release(struct device *dev)
{
	kfree(dev);
}
760
init_versalnet(struct mc_priv * priv,struct platform_device * pdev)761 static int init_versalnet(struct mc_priv *priv, struct platform_device *pdev)
762 {
763 u32 num_chans, rank, dwidth, config;
764 struct edac_mc_layer layers[2];
765 struct mem_ctl_info *mci;
766 struct device *dev;
767 enum dev_type dt;
768 char *name;
769 int rc, i;
770
771 for (i = 0; i < NUM_CONTROLLERS; i++) {
772 config = priv->adec[CONF + i * ADEC_NUM];
773 num_chans = FIELD_GET(MC5_NUM_CHANS_MASK, config);
774 rank = 1 << FIELD_GET(MC5_RANK_MASK, config);
775 dwidth = FIELD_GET(MC5_BUS_WIDTH_MASK, config);
776
777 switch (dwidth) {
778 case XDDR5_BUS_WIDTH_16:
779 dt = DEV_X16;
780 break;
781 case XDDR5_BUS_WIDTH_32:
782 dt = DEV_X32;
783 break;
784 case XDDR5_BUS_WIDTH_64:
785 dt = DEV_X64;
786 break;
787 default:
788 dt = DEV_UNKNOWN;
789 }
790
791 if (dt == DEV_UNKNOWN)
792 continue;
793
794 /* Find the first enabled device and register that one. */
795 layers[0].type = EDAC_MC_LAYER_CHIP_SELECT;
796 layers[0].size = rank;
797 layers[0].is_virt_csrow = true;
798 layers[1].type = EDAC_MC_LAYER_CHANNEL;
799 layers[1].size = num_chans;
800 layers[1].is_virt_csrow = false;
801
802 rc = -ENOMEM;
803 mci = edac_mc_alloc(i, ARRAY_SIZE(layers), layers,
804 sizeof(struct mc_priv));
805 if (!mci) {
806 edac_printk(KERN_ERR, EDAC_MC, "Failed memory allocation for MC%d\n", i);
807 goto err_alloc;
808 }
809
810 priv->mci[i] = mci;
811 priv->dwidth = dt;
812
813 dev = kzalloc(sizeof(*dev), GFP_KERNEL);
814 dev->release = versal_edac_release;
815 name = kmalloc(32, GFP_KERNEL);
816 sprintf(name, "versal-net-ddrmc5-edac-%d", i);
817 dev->init_name = name;
818 rc = device_register(dev);
819 if (rc)
820 goto err_alloc;
821
822 mci->pdev = dev;
823
824 platform_set_drvdata(pdev, priv);
825
826 mc_init(mci, dev);
827 rc = edac_mc_add_mc(mci);
828 if (rc) {
829 edac_printk(KERN_ERR, EDAC_MC, "Failed to register MC%d with EDAC core\n", i);
830 goto err_alloc;
831 }
832 }
833 return 0;
834
835 err_alloc:
836 while (i--) {
837 mci = priv->mci[i];
838 if (!mci)
839 continue;
840
841 if (mci->pdev) {
842 device_unregister(mci->pdev);
843 edac_mc_del_mc(mci->pdev);
844 }
845
846 edac_mc_free(mci);
847 }
848
849 return rc;
850 }
851
remove_versalnet(struct mc_priv * priv)852 static void remove_versalnet(struct mc_priv *priv)
853 {
854 struct mem_ctl_info *mci;
855 int i;
856
857 for (i = 0; i < NUM_CONTROLLERS; i++) {
858 device_unregister(priv->mci[i]->pdev);
859 mci = edac_mc_del_mc(priv->mci[i]->pdev);
860 if (!mci)
861 return;
862
863 edac_mc_free(mci);
864 }
865 }
866
mc_probe(struct platform_device * pdev)867 static int mc_probe(struct platform_device *pdev)
868 {
869 struct device_node *r5_core_node;
870 struct mc_priv *priv;
871 struct rproc *rp;
872 int rc;
873
874 r5_core_node = of_parse_phandle(pdev->dev.of_node, "amd,rproc", 0);
875 if (!r5_core_node) {
876 dev_err(&pdev->dev, "amd,rproc: invalid phandle\n");
877 return -EINVAL;
878 }
879
880 rp = rproc_get_by_phandle(r5_core_node->phandle);
881 if (!rp)
882 return -EPROBE_DEFER;
883
884 rc = rproc_boot(rp);
885 if (rc) {
886 dev_err(&pdev->dev, "Failed to attach to remote processor\n");
887 goto err_rproc_boot;
888 }
889
890 priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL);
891 if (!priv) {
892 rc = -ENOMEM;
893 goto err_alloc;
894 }
895
896 amd_rpmsg_id_table[0].driver_data = (kernel_ulong_t)priv;
897
898 rc = register_rpmsg_driver(&amd_rpmsg_driver);
899 if (rc) {
900 edac_printk(KERN_ERR, EDAC_MC, "Failed to register RPMsg driver: %d\n", rc);
901 goto err_alloc;
902 }
903
904 rc = setup_mcdi(priv);
905 if (rc)
906 goto err_unreg;
907
908 priv->mcdi->r5_rproc = rp;
909
910 rc = init_versalnet(priv, pdev);
911 if (rc)
912 goto err_init;
913
914 return 0;
915
916 err_init:
917 cdx_mcdi_finish(priv->mcdi);
918
919 err_unreg:
920 unregister_rpmsg_driver(&amd_rpmsg_driver);
921
922 err_alloc:
923 rproc_shutdown(rp);
924
925 err_rproc_boot:
926 rproc_put(rp);
927
928 return rc;
929 }
930
/*
 * mc_remove - Platform device remove.
 *
 * Stops new notifications first (rpmsg driver), then unregisters the EDAC
 * instances, and finally shuts down the remote processor and the MCDI
 * transport.
 */
static void mc_remove(struct platform_device *pdev)
{
	struct mc_priv *priv = platform_get_drvdata(pdev);

	unregister_rpmsg_driver(&amd_rpmsg_driver);
	remove_versalnet(priv);
	rproc_shutdown(priv->mcdi->r5_rproc);
	cdx_mcdi_finish(priv->mcdi);
}
940
/* Device-tree binding for the Versal NET DDRMC5 node. */
static const struct of_device_id amd_edac_match[] = {
	{ .compatible = "xlnx,versal-net-ddrmc5", },
	{}
};
MODULE_DEVICE_TABLE(of, amd_edac_match);

static struct platform_driver amd_ddr_edac_mc_driver = {
	.driver = {
		.name = "versal-net-edac",
		.of_match_table = amd_edac_match,
	},
	.probe = mc_probe,
	.remove = mc_remove,
};

module_platform_driver(amd_ddr_edac_mc_driver);

MODULE_AUTHOR("AMD Inc");
MODULE_DESCRIPTION("Versal NET EDAC driver");
MODULE_LICENSE("GPL");
961