1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * AMD Versal NET memory controller driver
4 * Copyright (C) 2025 Advanced Micro Devices, Inc.
5 */
6
7 #include <linux/cdx/edac_cdx_pcol.h>
8 #include <linux/edac.h>
9 #include <linux/module.h>
10 #include <linux/of_device.h>
11 #include <linux/ras.h>
12 #include <linux/remoteproc.h>
13 #include <linux/rpmsg.h>
14 #include <linux/sizes.h>
15 #include <ras/ras_event.h>
16
17 #include "edac_module.h"
18
/* Granularity of reported error in bytes */
#define MC5_ERR_GRAIN 1
#define MC_GET_DDR_CONFIG_IN_LEN 4

/* Interrupt-status bits flagging correctable/uncorrectable ECC errors */
#define MC5_IRQ_CE_MASK GENMASK(18, 15)
#define MC5_IRQ_UE_MASK GENMASK(14, 11)

/* Generic bit-position fields used when walking the ADEC registers */
#define MC5_RANK_1_MASK GENMASK(11, 6)
#define MASK_24 GENMASK(29, 24)
#define MASK_0 GENMASK(5, 0)

/* Bit-position fields for logical-rank, bank and bank-group decode */
#define MC5_LRANK_1_MASK GENMASK(11, 6)
#define MC5_LRANK_2_MASK GENMASK(17, 12)
#define MC5_BANK1_MASK GENMASK(11, 6)
#define MC5_GRP_0_MASK GENMASK(17, 12)
#define MC5_GRP_1_MASK GENMASK(23, 18)

#define MC5_REGHI_ROW 7		/* Row bits carried in the low error-info word */
#define MC5_EACHBIT 1		/* Step used when shifting error-info bit vectors */
#define MC5_ERR_TYPE_CE 0	/* Correctable error */
#define MC5_ERR_TYPE_UE 1	/* Uncorrectable error */
#define MC5_HIGH_MEM_EN BIT(20)		/* High-memory region enable (ADEC2) */
#define MC5_MEM_MASK GENMASK(19, 0)	/* Memory base/offset field (ADEC1-3) */
#define MC5_X16_BASE 256	/* Data bytes per block for a x16 device */
#define MC5_X16_ECC 32		/* ECC bytes per block for a x16 device */
#define MC5_X16_SIZE (MC5_X16_BASE + MC5_X16_ECC)
#define MC5_X32_SIZE 576	/* Data+ECC bytes per block for a x32 device */
#define MC5_HIMEM_BASE (256 * SZ_1M)
#define MC5_ILC_HIMEM_EN BIT(28)	/* Interleave high-memory enable (ADEC14) */
#define MC5_ILC_MEM GENMASK(27, 0)	/* Interleave base, in MiB (ADEC14) */
#define MC5_INTERLEAVE_SEL GENMASK(3, 0)	/* Interleave select (ADEC13) */
#define MC5_BUS_WIDTH_MASK GENMASK(19, 18)	/* DDR bus width (CONF) */
#define MC5_NUM_CHANS_MASK BIT(17)	/* Number of channels (CONF) */
#define MC5_RANK_MASK GENMASK(15, 14)	/* log2(number of ranks) (CONF) */

/* u32 word indices into the firmware error-notification payload */
#define ERROR_LEVEL 2
#define ERROR_ID 3
#define TOTAL_ERR_LENGTH 5
#define MSG_ERR_OFFSET 8
#define MSG_ERR_LENGTH 9
#define ERROR_DATA 10
#define MCDI_RESPONSE 0xFF	/* First byte identifying an MCDI response */

#define REG_MAX 152		/* Capacity of the staged error-register buffer */
#define ADEC_MAX 152		/* Capacity of the staged ADEC register buffer */
#define NUM_CONTROLLERS 8
#define REGS_PER_CONTROLLER 19
#define ADEC_NUM 19		/* ADEC words fetched per controller */
#define BUFFER_SZ 80

/* CONF register encodings of the DDR5 bus width */
#define XDDR5_BUS_WIDTH_64 0
#define XDDR5_BUS_WIDTH_32 1
#define XDDR5_BUS_WIDTH_16 2
72
/**
 * union ecc_error_info - ECC error log information.
 * @burstpos: Burst position.
 * @lrank: Logical Rank number.
 * @rank: Rank number.
 * @group: Group number.
 * @bank: Bank number.
 * @col: Column number.
 * @row: Row number (low bits; combined with @rowhi by the decoder).
 * @rowhi: Row number higher bits.
 * @i: Combined ECC error vector containing encoded values of burst position,
 * rank, bank, column, and row information.
 *
 * Overlays the decoded bitfields on the raw 64-bit value read from the
 * ECCR ADDR_LO/ADDR_HI register pair.
 */
union ecc_error_info {
	struct {
		u32 burstpos:3;
		u32 lrank:4;
		u32 rank:2;
		u32 group:3;
		u32 bank:2;
		u32 col:11;
		u32 row:7;
		u32 rowhi;
	};
	u64 i;
} __packed;
99
/*
 * Row and column bit positions in the address decoder (ADEC) registers.
 * Each 6-bit field gives the physical-address bit position for one row
 * (first view) or column (second view) bit; both views overlay the same
 * 32-bit register value.
 */
union row_col_mapping {
	struct {
		u32 row0:6;
		u32 row1:6;
		u32 row2:6;
		u32 row3:6;
		u32 row4:6;
		u32 reserved:2;
	};
	struct {
		u32 col1:6;
		u32 col2:6;
		u32 col3:6;
		u32 col4:6;
		u32 col5:6;
		u32 reservedcol:2;
	};
	u32 i;
} __packed;
120
/**
 * struct ecc_status - ECC status information to report.
 * @ceinfo: Correctable error info, one entry per ECC channel.
 * @ueinfo: Uncorrectable error info, one entry per ECC channel.
 * @channel: Channel number the error was detected on.
 * @error_type: Error type (MC5_ERR_TYPE_CE or MC5_ERR_TYPE_UE).
 */
struct ecc_status {
	union ecc_error_info ceinfo[2];
	union ecc_error_info ueinfo[2];
	u8 channel;
	u8 error_type;
};
134
/**
 * struct mc_priv - DDR memory controller private instance data.
 * @message: Buffer for framing the event specific info.
 * @stat: ECC status information.
 * @error_id: The error id.
 * @error_level: The error level.
 * @dwidth: Width of data bus excluding ECC bits.
 * @part_len: Accumulated length of the partial message(s) received so far.
 * @regs: The registers sent on the rpmsg.
 * @adec: Address decode registers, ADEC_NUM words per controller.
 * @mci: Memory controller interface, one per controller.
 * @ept: rpmsg endpoint.
 * @mcdi: The mcdi handle.
 */
struct mc_priv {
	char message[256];
	struct ecc_status stat;
	u32 error_id;
	u32 error_level;
	u32 dwidth;
	u32 part_len;
	u32 regs[REG_MAX];
	u32 adec[ADEC_MAX];
	struct mem_ctl_info *mci[NUM_CONTROLLERS];
	struct rpmsg_endpoint *ept;
	struct cdx_mcdi *mcdi;
};
162
/*
 * Address decoder (ADEC) registers to match the order in which the register
 * information is received from the firmware.
 */
enum adec_info {
	CONF = 0,	/* Controller config: bus width, channels, ranks */
	ADEC0,
	ADEC1,		/* Low-memory offset */
	ADEC2,		/* High-memory base/enable */
	ADEC3,		/* High-memory offset */
	ADEC4,		/* Rank bit positions */
	ADEC5,		/* Logical-rank bit positions */
	ADEC6,		/* Row bit positions 0-4 */
	ADEC7,		/* Row bit positions 5-9 */
	ADEC8,		/* Row bit positions 10-14 */
	ADEC9,		/* Row bit positions 15-17, column bit 0 */
	ADEC10,		/* Column bit positions */
	ADEC11,		/* Column bit positions */
	ADEC12,		/* Bank/bank-group bit positions */
	ADEC13,		/* Interleave selection */
	ADEC14,		/* Interleave memory base */
	ADEC15,
	ADEC16,
	ADECILC,
};
188
/*
 * Per-controller error registers, in the order they arrive in the
 * firmware error notification (REGS_PER_CONTROLLER words each).
 */
enum reg_info {
	ISR = 0,		/* Interrupt status */
	IMR,			/* Interrupt mask */
	ECCR0_ERR_STATUS,	/* Channel 0 ECC error status */
	ECCR0_ADDR_LO,
	ECCR0_ADDR_HI,
	ECCR0_DATA_LO,
	ECCR0_DATA_HI,
	ECCR0_PAR,
	ECCR1_ERR_STATUS,	/* Channel 1 ECC error status */
	ECCR1_ADDR_LO,
	ECCR1_ADDR_HI,
	ECCR1_DATA_LO,
	ECCR1_DATA_HI,
	ECCR1_PAR,
	XMPU_ERR,
	XMPU_ERR_ADDR_L0,
	XMPU_ERR_ADDR_HI,
	XMPU_ERR_AXI_ID,
	ADEC_CHK_ERR_LOG,
};
210
get_ddr_info(u32 * error_data,struct mc_priv * priv)211 static bool get_ddr_info(u32 *error_data, struct mc_priv *priv)
212 {
213 u32 reglo, reghi, parity, eccr0_val, eccr1_val, isr;
214 struct ecc_status *p;
215
216 isr = error_data[ISR];
217
218 if (!(isr & (MC5_IRQ_UE_MASK | MC5_IRQ_CE_MASK)))
219 return false;
220
221 eccr0_val = error_data[ECCR0_ERR_STATUS];
222 eccr1_val = error_data[ECCR1_ERR_STATUS];
223
224 if (!eccr0_val && !eccr1_val)
225 return false;
226
227 p = &priv->stat;
228
229 if (!eccr0_val)
230 p->channel = 1;
231 else
232 p->channel = 0;
233
234 reglo = error_data[ECCR0_ADDR_LO];
235 reghi = error_data[ECCR0_ADDR_HI];
236 if (isr & MC5_IRQ_CE_MASK)
237 p->ceinfo[0].i = reglo | (u64)reghi << 32;
238 else if (isr & MC5_IRQ_UE_MASK)
239 p->ueinfo[0].i = reglo | (u64)reghi << 32;
240
241 parity = error_data[ECCR0_PAR];
242 edac_dbg(2, "ERR DATA: 0x%08X%08X PARITY: 0x%08X\n",
243 reghi, reglo, parity);
244
245 reglo = error_data[ECCR1_ADDR_LO];
246 reghi = error_data[ECCR1_ADDR_HI];
247 if (isr & MC5_IRQ_CE_MASK)
248 p->ceinfo[1].i = reglo | (u64)reghi << 32;
249 else if (isr & MC5_IRQ_UE_MASK)
250 p->ueinfo[1].i = reglo | (u64)reghi << 32;
251
252 parity = error_data[ECCR1_PAR];
253 edac_dbg(2, "ERR DATA: 0x%08X%08X PARITY: 0x%08X\n",
254 reghi, reglo, parity);
255
256 return true;
257 }
258
/**
 * convert_to_physical - Convert @error_data to a physical address.
 * @priv: DDR memory controller private instance data.
 * @pinf: ECC error info structure.
 * @controller: Controller number of the MC5
 * @error_data: the DDRMC5 ADEC address decoder register data
 *
 * Rebuilds the address bit by bit: each 6-bit field of the ADEC registers
 * gives the target address-bit position for one row/column/bank/rank bit of
 * the decoded error info, and that bit is OR'd into place. The interleave
 * and high/low memory windows from the per-controller ADEC config are then
 * applied to produce a system physical address.
 *
 * Return: physical address of the DDR memory.
 */
static unsigned long convert_to_physical(struct mc_priv *priv,
					 union ecc_error_info pinf,
					 int controller, int *error_data)
{
	u32 row, blk, rsh_req_addr, interleave, ilc_base_ctrl_add, ilc_himem_en, reg, offset;
	u64 high_mem_base, high_mem_offset, low_mem_offset, ilcmem_base;
	unsigned long err_addr = 0, addr;
	union row_col_mapping cols;
	union row_col_mapping rows;
	u32 col_bit_0;

	/* Full row number: high bits sit above the 7 bits of pinf.row. */
	row = pinf.rowhi << MC5_REGHI_ROW | pinf.row;
	/* Index of this controller's block within priv->adec[]. */
	offset = controller * ADEC_NUM;

	/* Row bits 0-4 (ADEC6). */
	reg = error_data[ADEC6];
	rows.i = reg;
	err_addr |= (row & BIT(0)) << rows.row0;
	row >>= MC5_EACHBIT;
	err_addr |= (row & BIT(0)) << rows.row1;
	row >>= MC5_EACHBIT;
	err_addr |= (row & BIT(0)) << rows.row2;
	row >>= MC5_EACHBIT;
	err_addr |= (row & BIT(0)) << rows.row3;
	row >>= MC5_EACHBIT;
	err_addr |= (row & BIT(0)) << rows.row4;
	row >>= MC5_EACHBIT;

	/* Row bits 5-9 (ADEC7). */
	reg = error_data[ADEC7];
	rows.i = reg;
	err_addr |= (row & BIT(0)) << rows.row0;
	row >>= MC5_EACHBIT;
	err_addr |= (row & BIT(0)) << rows.row1;
	row >>= MC5_EACHBIT;
	err_addr |= (row & BIT(0)) << rows.row2;
	row >>= MC5_EACHBIT;
	err_addr |= (row & BIT(0)) << rows.row3;
	row >>= MC5_EACHBIT;
	err_addr |= (row & BIT(0)) << rows.row4;
	row >>= MC5_EACHBIT;

	/* Row bits 10-14 (ADEC8). */
	reg = error_data[ADEC8];
	rows.i = reg;
	err_addr |= (row & BIT(0)) << rows.row0;
	row >>= MC5_EACHBIT;
	err_addr |= (row & BIT(0)) << rows.row1;
	row >>= MC5_EACHBIT;
	err_addr |= (row & BIT(0)) << rows.row2;
	row >>= MC5_EACHBIT;
	err_addr |= (row & BIT(0)) << rows.row3;
	row >>= MC5_EACHBIT;
	err_addr |= (row & BIT(0)) << rows.row4;

	/* Row bits 15-17 (ADEC9). */
	reg = error_data[ADEC9];
	rows.i = reg;

	err_addr |= (row & BIT(0)) << rows.row0;
	row >>= MC5_EACHBIT;
	err_addr |= (row & BIT(0)) << rows.row1;
	row >>= MC5_EACHBIT;
	err_addr |= (row & BIT(0)) << rows.row2;
	row >>= MC5_EACHBIT;

	/*
	 * NOTE(review): pinf.col is shifted before its first use, so column
	 * bit 0 never contributes to err_addr — presumably intentional
	 * (burst-level bit), but confirm against the DDRMC5 ADEC spec.
	 */
	col_bit_0 = FIELD_GET(MASK_24, error_data[ADEC9]);
	pinf.col >>= 1;
	err_addr |= (pinf.col & 1) << col_bit_0;

	/* Column bits (ADEC10). */
	cols.i = error_data[ADEC10];
	err_addr |= (pinf.col & 1) << cols.col1;
	pinf.col >>= 1;
	err_addr |= (pinf.col & 1) << cols.col2;
	pinf.col >>= 1;
	err_addr |= (pinf.col & 1) << cols.col3;
	pinf.col >>= 1;
	err_addr |= (pinf.col & 1) << cols.col4;
	pinf.col >>= 1;
	err_addr |= (pinf.col & 1) << cols.col5;
	pinf.col >>= 1;

	/* Column bits (ADEC11). */
	cols.i = error_data[ADEC11];
	err_addr |= (pinf.col & 1) << cols.col1;
	pinf.col >>= 1;
	err_addr |= (pinf.col & 1) << cols.col2;
	pinf.col >>= 1;
	err_addr |= (pinf.col & 1) << cols.col3;
	pinf.col >>= 1;
	err_addr |= (pinf.col & 1) << cols.col4;
	pinf.col >>= 1;
	err_addr |= (pinf.col & 1) << cols.col5;
	pinf.col >>= 1;

	/* Bank and bank-group bits (ADEC12). */
	reg = error_data[ADEC12];
	err_addr |= (pinf.bank & BIT(0)) << (reg & MASK_0);
	pinf.bank >>= MC5_EACHBIT;
	err_addr |= (pinf.bank & BIT(0)) << FIELD_GET(MC5_BANK1_MASK, reg);
	pinf.bank >>= MC5_EACHBIT;

	/*
	 * NOTE(review): the three statements below shift pinf.group but OR in
	 * pinf.bank, which is already exhausted (2-bit field shifted twice),
	 * so the group bits never reach err_addr. Looks like a copy-paste
	 * slip for pinf.group — verify against the ADEC12 field definitions
	 * before changing.
	 */
	err_addr |= (pinf.bank & BIT(0)) << FIELD_GET(MC5_GRP_0_MASK, reg);
	pinf.group >>= MC5_EACHBIT;
	err_addr |= (pinf.bank & BIT(0)) << FIELD_GET(MC5_GRP_1_MASK, reg);
	pinf.group >>= MC5_EACHBIT;
	err_addr |= (pinf.bank & BIT(0)) << FIELD_GET(MASK_24, reg);
	pinf.group >>= MC5_EACHBIT;

	/* Rank bits (ADEC4). */
	reg = error_data[ADEC4];
	err_addr |= (pinf.rank & BIT(0)) << (reg & MASK_0);
	pinf.rank >>= MC5_EACHBIT;
	err_addr |= (pinf.rank & BIT(0)) << FIELD_GET(MC5_RANK_1_MASK, reg);
	pinf.rank >>= MC5_EACHBIT;

	/* Logical-rank bits (ADEC5). */
	reg = error_data[ADEC5];
	err_addr |= (pinf.lrank & BIT(0)) << (reg & MASK_0);
	pinf.lrank >>= MC5_EACHBIT;
	err_addr |= (pinf.lrank & BIT(0)) << FIELD_GET(MC5_LRANK_1_MASK, reg);
	pinf.lrank >>= MC5_EACHBIT;
	err_addr |= (pinf.lrank & BIT(0)) << FIELD_GET(MC5_LRANK_2_MASK, reg);
	pinf.lrank >>= MC5_EACHBIT;
	err_addr |= (pinf.lrank & BIT(0)) << FIELD_GET(MASK_24, reg);
	pinf.lrank >>= MC5_EACHBIT;

	/* Memory window and interleave configuration for this controller. */
	high_mem_base = (priv->adec[ADEC2 + offset] & MC5_MEM_MASK) * MC5_HIMEM_BASE;
	interleave = priv->adec[ADEC13 + offset] & MC5_INTERLEAVE_SEL;

	high_mem_offset = priv->adec[ADEC3 + offset] & MC5_MEM_MASK;
	low_mem_offset = priv->adec[ADEC1 + offset] & MC5_MEM_MASK;
	reg = priv->adec[ADEC14 + offset];
	ilc_himem_en = !!(reg & MC5_ILC_HIMEM_EN);
	ilcmem_base = (reg & MC5_ILC_MEM) * SZ_1M;
	if (ilc_himem_en)
		ilc_base_ctrl_add = ilcmem_base - high_mem_offset;
	else
		ilc_base_ctrl_add = ilcmem_base - low_mem_offset;

	/* Account for ECC bytes interleaved with data in each block. */
	if (priv->dwidth == DEV_X16) {
		blk = err_addr / MC5_X16_SIZE;
		rsh_req_addr = (blk << 8) + ilc_base_ctrl_add;
		err_addr = rsh_req_addr * interleave * 2;
	} else {
		blk = err_addr / MC5_X32_SIZE;
		rsh_req_addr = (blk << 9) + ilc_base_ctrl_add;
		err_addr = rsh_req_addr * interleave * 2;
	}

	/* Apply the high- or low-memory window offset. */
	if ((priv->adec[ADEC2 + offset] & MC5_HIGH_MEM_EN) && err_addr >= high_mem_base)
		addr = err_addr - high_mem_offset;
	else
		addr = err_addr - low_mem_offset;

	return addr;
}
417
418 /**
419 * handle_error - Handle errors.
420 * @priv: DDR memory controller private instance data.
421 * @stat: ECC status structure.
422 * @ctl_num: Controller number of the MC5
423 * @error_data: the MC5 ADEC address decoder register data
424 *
425 * Handles ECC correctable and uncorrectable errors.
426 */
handle_error(struct mc_priv * priv,struct ecc_status * stat,int ctl_num,int * error_data)427 static void handle_error(struct mc_priv *priv, struct ecc_status *stat,
428 int ctl_num, int *error_data)
429 {
430 union ecc_error_info pinf;
431 struct mem_ctl_info *mci;
432 unsigned long pa;
433 phys_addr_t pfn;
434 int err;
435
436 if (WARN_ON_ONCE(ctl_num >= NUM_CONTROLLERS))
437 return;
438
439 mci = priv->mci[ctl_num];
440
441 if (stat->error_type == MC5_ERR_TYPE_CE) {
442 pinf = stat->ceinfo[stat->channel];
443 snprintf(priv->message, sizeof(priv->message),
444 "Error type:%s Controller %d Addr at %lx\n",
445 "CE", ctl_num, convert_to_physical(priv, pinf, ctl_num, error_data));
446
447 edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci,
448 1, 0, 0, 0, 0, 0, -1,
449 priv->message, "");
450 }
451
452 if (stat->error_type == MC5_ERR_TYPE_UE) {
453 pinf = stat->ueinfo[stat->channel];
454 snprintf(priv->message, sizeof(priv->message),
455 "Error type:%s controller %d Addr at %lx\n",
456 "UE", ctl_num, convert_to_physical(priv, pinf, ctl_num, error_data));
457
458 edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci,
459 1, 0, 0, 0, 0, 0, -1,
460 priv->message, "");
461 pa = convert_to_physical(priv, pinf, ctl_num, error_data);
462 pfn = PHYS_PFN(pa);
463
464 if (IS_ENABLED(CONFIG_MEMORY_FAILURE)) {
465 err = memory_failure(pfn, MF_ACTION_REQUIRED);
466 if (err)
467 edac_dbg(2, "memory_failure() error: %d", err);
468 else
469 edac_dbg(2, "Poison page at PA 0x%lx\n", pa);
470 }
471 }
472 }
473
mc_init(struct mem_ctl_info * mci,struct device * dev)474 static void mc_init(struct mem_ctl_info *mci, struct device *dev)
475 {
476 struct mc_priv *priv = mci->pvt_info;
477 struct csrow_info *csi;
478 struct dimm_info *dimm;
479 u32 row;
480 int ch;
481
482 /* Initialize controller capabilities and configuration */
483 mci->mtype_cap = MEM_FLAG_DDR5;
484 mci->edac_ctl_cap = EDAC_FLAG_NONE | EDAC_FLAG_SECDED;
485 mci->scrub_cap = SCRUB_HW_SRC;
486 mci->scrub_mode = SCRUB_NONE;
487
488 mci->edac_cap = EDAC_FLAG_SECDED;
489 mci->ctl_name = "VersalNET DDR5";
490 mci->dev_name = dev_name(dev);
491 mci->mod_name = "versalnet_edac";
492
493 edac_op_state = EDAC_OPSTATE_INT;
494
495 for (row = 0; row < mci->nr_csrows; row++) {
496 csi = mci->csrows[row];
497 for (ch = 0; ch < csi->nr_channels; ch++) {
498 dimm = csi->channels[ch]->dimm;
499 dimm->edac_mode = EDAC_SECDED;
500 dimm->mtype = MEM_DDR5;
501 dimm->grain = MC5_ERR_GRAIN;
502 dimm->dtype = priv->dwidth;
503 }
504 }
505 }
506
507 #define to_mci(k) container_of(k, struct mem_ctl_info, dev)
508
/* Fixed RPC timeout applied to every MCDI command, regardless of @cmd. */
static unsigned int mcdi_rpc_timeout(struct cdx_mcdi *cdx, unsigned int cmd)
{
	return MCDI_RPC_TIMEOUT;
}
513
/*
 * mcdi_request - Transmit an MCDI command over the rpmsg endpoint.
 * @cdx: MCDI handle carrying the rpmsg endpoint
 * @hdr: command header
 * @hdr_len: header length in bytes
 * @sdu: command payload
 * @sdu_len: payload length in bytes
 *
 * Header and payload are concatenated into one temporary buffer and sent
 * in a single rpmsg message. Failures are logged; the MCDI core handles
 * command timeouts.
 */
static void mcdi_request(struct cdx_mcdi *cdx,
			 const struct cdx_dword *hdr, size_t hdr_len,
			 const struct cdx_dword *sdu, size_t sdu_len)
{
	void *send_buf;
	int ret;

	/* Every byte is overwritten below, so no need to zero the buffer. */
	send_buf = kmalloc(hdr_len + sdu_len, GFP_KERNEL);
	if (!send_buf)
		return;

	memcpy(send_buf, hdr, hdr_len);
	memcpy(send_buf + hdr_len, sdu, sdu_len);

	ret = rpmsg_send(cdx->ept, send_buf, hdr_len + sdu_len);
	if (ret)
		dev_err(&cdx->rpdev->dev, "Failed to send rpmsg data: %d\n", ret);

	kfree(send_buf);
}
534
/* MCDI transport callbacks: timeout policy and rpmsg-based transmit. */
static const struct cdx_mcdi_ops mcdi_ops = {
	.mcdi_rpc_timeout = mcdi_rpc_timeout,
	.mcdi_request = mcdi_request,
};
539
/*
 * get_ddr_config - Fetch the ADEC register block of one controller.
 * @index: controller index
 * @buffer: destination for ADEC_NUM u32 words; left untouched on failure
 * @amd_mcdi: MCDI handle to issue the firmware RPC on
 *
 * Issues MC_CMD_EDAC_GET_DDR_CONFIG and copies the returned ADEC words.
 * A failed RPC is logged so a stale/zero config is not silently used.
 */
static void get_ddr_config(u32 index, u32 *buffer, struct cdx_mcdi *amd_mcdi)
{
	size_t outlen;
	int ret;

	MCDI_DECLARE_BUF(inbuf, MC_GET_DDR_CONFIG_IN_LEN);
	MCDI_DECLARE_BUF(outbuf, BUFFER_SZ);

	MCDI_SET_DWORD(inbuf, EDAC_GET_DDR_CONFIG_IN_CONTROLLER_INDEX, index);

	ret = cdx_mcdi_rpc(amd_mcdi, MC_CMD_EDAC_GET_DDR_CONFIG, inbuf, sizeof(inbuf),
			   outbuf, sizeof(outbuf), &outlen);
	if (ret)
		edac_printk(KERN_WARNING, EDAC_MC,
			    "Failed to get DDR config for controller %u: %d\n",
			    index, ret);
	else
		memcpy(buffer, MCDI_PTR(outbuf, GET_DDR_CONFIG),
		       ADEC_NUM * sizeof(u32));
}
556
setup_mcdi(struct mc_priv * mc_priv)557 static int setup_mcdi(struct mc_priv *mc_priv)
558 {
559 struct cdx_mcdi *amd_mcdi;
560 int ret, i;
561
562 amd_mcdi = kzalloc(sizeof(*amd_mcdi), GFP_KERNEL);
563 if (!amd_mcdi)
564 return -ENOMEM;
565
566 amd_mcdi->mcdi_ops = &mcdi_ops;
567 ret = cdx_mcdi_init(amd_mcdi);
568 if (ret) {
569 kfree(amd_mcdi);
570 return ret;
571 }
572
573 amd_mcdi->ept = mc_priv->ept;
574 mc_priv->mcdi = amd_mcdi;
575
576 for (i = 0; i < NUM_CONTROLLERS; i++)
577 get_ddr_config(i, &mc_priv->adec[ADEC_NUM * i], amd_mcdi);
578
579 return 0;
580 }
581
/* Section-type GUID attached to non-standard error records we log. */
static const guid_t amd_versalnet_guid = GUID_INIT(0x82678888, 0xa556, 0x44f2,
						   0xb8, 0xb4, 0x45, 0x56, 0x2e,
						   0x8c, 0x5b, 0xec);
585
/*
 * rpmsg_cb - rpmsg receive callback for firmware error notifications.
 * @rpdev: rpmsg device the message arrived on
 * @data: message payload (MCDI response or u32 error-notification words)
 * @len: payload length in bytes
 * @priv: unused endpoint private pointer
 * @src: source address (unused)
 *
 * MCDI responses are forwarded to the MCDI core. Error notifications are
 * staged into mc_priv->regs (possibly across two messages), DDR ECC errors
 * (ids 18/19) are decoded and reported per controller, and every other id
 * is logged as a non-standard RAS event.
 *
 * Return: 0 always (rpmsg callbacks report errors via logging).
 */
static int rpmsg_cb(struct rpmsg_device *rpdev, void *data,
		    int len, void *priv, u32 src)
{
	struct mc_priv *mc_priv = dev_get_drvdata(&rpdev->dev);
	const guid_t *sec_type = &guid_null;
	u32 length, offset, error_id;
	u32 *result = (u32 *)data;
	struct ecc_status *p;
	int i, j, k, sec_sev;
	const char *err_str;
	u32 *adec_data;

	if (*(u8 *)data == MCDI_RESPONSE) {
		cdx_mcdi_process_cmd(mc_priv->mcdi, (struct cdx_dword *)data, len);
		return 0;
	}

	sec_sev = result[ERROR_LEVEL];
	error_id = result[ERROR_ID];
	length = result[MSG_ERR_LENGTH];
	offset = result[MSG_ERR_OFFSET];

	/*
	 * The data can come in two stretches. Construct the regs from two
	 * messages. The offset indicates the offset from which the data is to
	 * be taken. Both offset and length come from the firmware, so bound
	 * every store to keep a malformed message from overrunning regs[].
	 */
	for (i = 0 ; i < length; i++) {
		k = offset + i;
		j = ERROR_DATA + i;
		if (k >= REG_MAX)
			break;
		mc_priv->regs[k] = result[j];
	}

	/* Wait until all parts of a multi-message notification arrive. */
	if (result[TOTAL_ERR_LENGTH] > length) {
		if (!mc_priv->part_len)
			mc_priv->part_len = length;
		else
			mc_priv->part_len += length;

		if (mc_priv->part_len < result[TOTAL_ERR_LENGTH])
			return 0;
		mc_priv->part_len = 0;
	}

	mc_priv->error_id = error_id;
	mc_priv->error_level = result[ERROR_LEVEL];

	switch (error_id) {
	case 5: err_str = "General Software Non-Correctable error"; break;
	case 6: err_str = "CFU error"; break;
	case 7: err_str = "CFRAME error"; break;
	case 10: err_str = "DDRMC Microblaze Correctable ECC error"; break;
	case 11: err_str = "DDRMC Microblaze Non-Correctable ECC error"; break;
	case 15: err_str = "MMCM error"; break;
	case 16: err_str = "HNICX Correctable error"; break;
	case 17: err_str = "HNICX Non-Correctable error"; break;

	case 18:
		/* DDR correctable ECC: decode and report per controller. */
		p = &mc_priv->stat;
		memset(p, 0, sizeof(struct ecc_status));
		p->error_type = MC5_ERR_TYPE_CE;
		for (i = 0 ; i < NUM_CONTROLLERS; i++) {
			if (get_ddr_info(&mc_priv->regs[i * REGS_PER_CONTROLLER], mc_priv)) {
				adec_data = mc_priv->adec + ADEC_NUM * i;
				handle_error(mc_priv, &mc_priv->stat, i, adec_data);
			}
		}
		return 0;
	case 19:
		/* DDR uncorrectable ECC: decode and report per controller. */
		p = &mc_priv->stat;
		memset(p, 0, sizeof(struct ecc_status));
		p->error_type = MC5_ERR_TYPE_UE;
		for (i = 0 ; i < NUM_CONTROLLERS; i++) {
			if (get_ddr_info(&mc_priv->regs[i * REGS_PER_CONTROLLER], mc_priv)) {
				adec_data = mc_priv->adec + ADEC_NUM * i;
				handle_error(mc_priv, &mc_priv->stat, i, adec_data);
			}
		}
		return 0;

	case 21: err_str = "GT Non-Correctable error"; break;
	case 22: err_str = "PL Sysmon Correctable error"; break;
	case 23: err_str = "PL Sysmon Non-Correctable error"; break;
	case 111: err_str = "LPX unexpected dfx activation error"; break;
	case 114: err_str = "INT_LPD Non-Correctable error"; break;
	case 116: err_str = "INT_OCM Non-Correctable error"; break;
	case 117: err_str = "INT_FPD Correctable error"; break;
	case 118: err_str = "INT_FPD Non-Correctable error"; break;
	case 120: err_str = "INT_IOU Non-Correctable error"; break;
	case 123: err_str = "err_int_irq from APU GIC Distributor"; break;
	case 124: err_str = "fault_int_irq from APU GIC Distribute"; break;
	case 132 ... 139: err_str = "FPX SPLITTER error"; break;
	case 140: err_str = "APU Cluster 0 error"; break;
	case 141: err_str = "APU Cluster 1 error"; break;
	case 142: err_str = "APU Cluster 2 error"; break;
	case 143: err_str = "APU Cluster 3 error"; break;
	case 145: err_str = "WWDT1 LPX error"; break;
	case 147: err_str = "IPI error"; break;
	case 152 ... 153: err_str = "AFIFS error"; break;
	case 154 ... 155: err_str = "LPX glitch error"; break;
	case 185 ... 186: err_str = "FPX AFIFS error"; break;
	case 195 ... 199: err_str = "AFIFM error"; break;
	case 108: err_str = "PSM Correctable error"; break;
	case 59: err_str = "PMC correctable error"; break;
	case 60: err_str = "PMC Un correctable error"; break;
	case 43 ... 47: err_str = "PMC Sysmon error"; break;
	case 163 ... 184: err_str = "RPU error"; break;
	case 148: err_str = "OCM0 correctable error"; break;
	case 149: err_str = "OCM1 correctable error"; break;
	case 150: err_str = "OCM0 Un-correctable error"; break;
	case 151: err_str = "OCM1 Un-correctable error"; break;
	case 189: err_str = "PSX_CMN_3 PD block consolidated error"; break;
	case 191: err_str = "FPD_INT_WRAP PD block consolidated error"; break;
	case 232: err_str = "CRAM Un-Correctable error"; break;
	default: err_str = "VERSAL_EDAC_ERR_ID: %d"; break;
	}

	snprintf(mc_priv->message,
		 sizeof(mc_priv->message),
		 "[VERSAL_EDAC_ERR_ID: %d] Error type: %s", error_id, err_str);

	/* Convert to bytes, clamped to the staging buffer we hand out. */
	length = result[TOTAL_ERR_LENGTH] * 4;
	if (length > sizeof(mc_priv->regs))
		length = sizeof(mc_priv->regs);
	log_non_standard_event(sec_type, &amd_versalnet_guid, mc_priv->message,
			       sec_sev, (void *)&mc_priv->regs, length);

	return 0;
}
714
/* rpmsg channel the firmware publishes its error notifications on. */
static struct rpmsg_device_id amd_rpmsg_id_table[] = {
	{ .name = "error_ipc" },
	{ },
};
MODULE_DEVICE_TABLE(rpmsg, amd_rpmsg_id_table);
720
rpmsg_probe(struct rpmsg_device * rpdev)721 static int rpmsg_probe(struct rpmsg_device *rpdev)
722 {
723 struct rpmsg_channel_info chinfo;
724 struct mc_priv *pg;
725
726 pg = (struct mc_priv *)amd_rpmsg_id_table[0].driver_data;
727 chinfo.src = RPMSG_ADDR_ANY;
728 chinfo.dst = rpdev->dst;
729 strscpy(chinfo.name, amd_rpmsg_id_table[0].name,
730 strlen(amd_rpmsg_id_table[0].name));
731
732 pg->ept = rpmsg_create_ept(rpdev, rpmsg_cb, NULL, chinfo);
733 if (!pg->ept)
734 return dev_err_probe(&rpdev->dev, -ENXIO, "Failed to create ept for channel %s\n",
735 chinfo.name);
736
737 dev_set_drvdata(&rpdev->dev, pg);
738
739 return 0;
740 }
741
rpmsg_remove(struct rpmsg_device * rpdev)742 static void rpmsg_remove(struct rpmsg_device *rpdev)
743 {
744 struct mc_priv *mc_priv = dev_get_drvdata(&rpdev->dev);
745
746 rpmsg_destroy_ept(mc_priv->ept);
747 dev_set_drvdata(&rpdev->dev, NULL);
748 }
749
/* rpmsg driver carrying firmware error notifications and MCDI responses. */
static struct rpmsg_driver amd_rpmsg_driver = {
	.drv.name = KBUILD_MODNAME,
	.probe = rpmsg_probe,
	.remove = rpmsg_remove,
	.callback = rpmsg_cb,
	.id_table = amd_rpmsg_id_table,
};
757
/*
 * Release callback for the dynamically allocated EDAC parent devices
 * created in init_versalnet(); frees the struct device container when
 * the last reference is dropped.
 */
static void versal_edac_release(struct device *dev)
{
	kfree(dev);
}
762
init_versalnet(struct mc_priv * priv,struct platform_device * pdev)763 static int init_versalnet(struct mc_priv *priv, struct platform_device *pdev)
764 {
765 u32 num_chans, rank, dwidth, config;
766 struct edac_mc_layer layers[2];
767 struct mem_ctl_info *mci;
768 struct device *dev;
769 enum dev_type dt;
770 char *name;
771 int rc, i;
772
773 for (i = 0; i < NUM_CONTROLLERS; i++) {
774 config = priv->adec[CONF + i * ADEC_NUM];
775 num_chans = FIELD_GET(MC5_NUM_CHANS_MASK, config);
776 rank = 1 << FIELD_GET(MC5_RANK_MASK, config);
777 dwidth = FIELD_GET(MC5_BUS_WIDTH_MASK, config);
778
779 switch (dwidth) {
780 case XDDR5_BUS_WIDTH_16:
781 dt = DEV_X16;
782 break;
783 case XDDR5_BUS_WIDTH_32:
784 dt = DEV_X32;
785 break;
786 case XDDR5_BUS_WIDTH_64:
787 dt = DEV_X64;
788 break;
789 default:
790 dt = DEV_UNKNOWN;
791 }
792
793 if (dt == DEV_UNKNOWN)
794 continue;
795
796 /* Find the first enabled device and register that one. */
797 layers[0].type = EDAC_MC_LAYER_CHIP_SELECT;
798 layers[0].size = rank;
799 layers[0].is_virt_csrow = true;
800 layers[1].type = EDAC_MC_LAYER_CHANNEL;
801 layers[1].size = num_chans;
802 layers[1].is_virt_csrow = false;
803
804 rc = -ENOMEM;
805 mci = edac_mc_alloc(i, ARRAY_SIZE(layers), layers,
806 sizeof(struct mc_priv));
807 if (!mci) {
808 edac_printk(KERN_ERR, EDAC_MC, "Failed memory allocation for MC%d\n", i);
809 goto err_alloc;
810 }
811
812 priv->mci[i] = mci;
813 priv->dwidth = dt;
814
815 dev = kzalloc(sizeof(*dev), GFP_KERNEL);
816 dev->release = versal_edac_release;
817 name = kmalloc(32, GFP_KERNEL);
818 sprintf(name, "versal-net-ddrmc5-edac-%d", i);
819 dev->init_name = name;
820 rc = device_register(dev);
821 if (rc)
822 goto err_alloc;
823
824 mci->pdev = dev;
825
826 platform_set_drvdata(pdev, priv);
827
828 mc_init(mci, dev);
829 rc = edac_mc_add_mc(mci);
830 if (rc) {
831 edac_printk(KERN_ERR, EDAC_MC, "Failed to register MC%d with EDAC core\n", i);
832 goto err_alloc;
833 }
834 }
835 return 0;
836
837 err_alloc:
838 while (i--) {
839 mci = priv->mci[i];
840 if (!mci)
841 continue;
842
843 if (mci->pdev) {
844 device_unregister(mci->pdev);
845 edac_mc_del_mc(mci->pdev);
846 }
847
848 edac_mc_free(mci);
849 }
850
851 return rc;
852 }
853
remove_versalnet(struct mc_priv * priv)854 static void remove_versalnet(struct mc_priv *priv)
855 {
856 struct mem_ctl_info *mci;
857 int i;
858
859 for (i = 0; i < NUM_CONTROLLERS; i++) {
860 device_unregister(priv->mci[i]->pdev);
861 mci = edac_mc_del_mc(priv->mci[i]->pdev);
862 if (!mci)
863 return;
864
865 edac_mc_free(mci);
866 }
867 }
868
mc_probe(struct platform_device * pdev)869 static int mc_probe(struct platform_device *pdev)
870 {
871 struct device_node *r5_core_node;
872 struct mc_priv *priv;
873 struct rproc *rp;
874 int rc;
875
876 r5_core_node = of_parse_phandle(pdev->dev.of_node, "amd,rproc", 0);
877 if (!r5_core_node) {
878 dev_err(&pdev->dev, "amd,rproc: invalid phandle\n");
879 return -EINVAL;
880 }
881
882 rp = rproc_get_by_phandle(r5_core_node->phandle);
883 if (!rp)
884 return -EPROBE_DEFER;
885
886 rc = rproc_boot(rp);
887 if (rc) {
888 dev_err(&pdev->dev, "Failed to attach to remote processor\n");
889 goto err_rproc_boot;
890 }
891
892 priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL);
893 if (!priv) {
894 rc = -ENOMEM;
895 goto err_alloc;
896 }
897
898 amd_rpmsg_id_table[0].driver_data = (kernel_ulong_t)priv;
899
900 rc = register_rpmsg_driver(&amd_rpmsg_driver);
901 if (rc) {
902 edac_printk(KERN_ERR, EDAC_MC, "Failed to register RPMsg driver: %d\n", rc);
903 goto err_alloc;
904 }
905
906 rc = setup_mcdi(priv);
907 if (rc)
908 goto err_unreg;
909
910 priv->mcdi->r5_rproc = rp;
911
912 rc = init_versalnet(priv, pdev);
913 if (rc)
914 goto err_init;
915
916 return 0;
917
918 err_init:
919 cdx_mcdi_finish(priv->mcdi);
920
921 err_unreg:
922 unregister_rpmsg_driver(&amd_rpmsg_driver);
923
924 err_alloc:
925 rproc_shutdown(rp);
926
927 err_rproc_boot:
928 rproc_put(rp);
929
930 return rc;
931 }
932
mc_remove(struct platform_device * pdev)933 static void mc_remove(struct platform_device *pdev)
934 {
935 struct mc_priv *priv = platform_get_drvdata(pdev);
936
937 unregister_rpmsg_driver(&amd_rpmsg_driver);
938 remove_versalnet(priv);
939 rproc_shutdown(priv->mcdi->r5_rproc);
940 cdx_mcdi_finish(priv->mcdi);
941 }
942
/* Device-tree match: Versal NET DDRMC5 memory controller node. */
static const struct of_device_id amd_edac_match[] = {
	{ .compatible = "xlnx,versal-net-ddrmc5", },
	{}
};
MODULE_DEVICE_TABLE(of, amd_edac_match);
948
/* Platform driver for the Versal NET DDR EDAC memory controller. */
static struct platform_driver amd_ddr_edac_mc_driver = {
	.driver = {
		.name = "versal-net-edac",
		.of_match_table = amd_edac_match,
	},
	.probe = mc_probe,
	.remove = mc_remove,
};
957
958 module_platform_driver(amd_ddr_edac_mc_driver);
959
960 MODULE_AUTHOR("AMD Inc");
961 MODULE_DESCRIPTION("Versal NET EDAC driver");
962 MODULE_LICENSE("GPL");
963