1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Ampere Computing SoC's SMpro Error Monitoring Driver 4 * 5 * Copyright (c) 2022, Ampere Computing LLC 6 * 7 */ 8 9 #include <linux/i2c.h> 10 #include <linux/mod_devicetable.h> 11 #include <linux/module.h> 12 #include <linux/platform_device.h> 13 #include <linux/regmap.h> 14 15 /* GPI RAS Error Registers */ 16 #define GPI_RAS_ERR 0x7E 17 18 /* Core and L2C Error Registers */ 19 #define CORE_CE_ERR_CNT 0x80 20 #define CORE_CE_ERR_LEN 0x81 21 #define CORE_CE_ERR_DATA 0x82 22 #define CORE_UE_ERR_CNT 0x83 23 #define CORE_UE_ERR_LEN 0x84 24 #define CORE_UE_ERR_DATA 0x85 25 26 /* Memory Error Registers */ 27 #define MEM_CE_ERR_CNT 0x90 28 #define MEM_CE_ERR_LEN 0x91 29 #define MEM_CE_ERR_DATA 0x92 30 #define MEM_UE_ERR_CNT 0x93 31 #define MEM_UE_ERR_LEN 0x94 32 #define MEM_UE_ERR_DATA 0x95 33 34 /* RAS Error/Warning Registers */ 35 #define ERR_SMPRO_TYPE 0xA0 36 #define ERR_PMPRO_TYPE 0xA1 37 #define ERR_SMPRO_INFO_LO 0xA2 38 #define ERR_SMPRO_INFO_HI 0xA3 39 #define ERR_SMPRO_DATA_LO 0xA4 40 #define ERR_SMPRO_DATA_HI 0xA5 41 #define WARN_SMPRO_INFO_LO 0xAA 42 #define WARN_SMPRO_INFO_HI 0xAB 43 #define ERR_PMPRO_INFO_LO 0xA6 44 #define ERR_PMPRO_INFO_HI 0xA7 45 #define ERR_PMPRO_DATA_LO 0xA8 46 #define ERR_PMPRO_DATA_HI 0xA9 47 #define WARN_PMPRO_INFO_LO 0xAC 48 #define WARN_PMPRO_INFO_HI 0xAD 49 50 /* PCIE Error Registers */ 51 #define PCIE_CE_ERR_CNT 0xC0 52 #define PCIE_CE_ERR_LEN 0xC1 53 #define PCIE_CE_ERR_DATA 0xC2 54 #define PCIE_UE_ERR_CNT 0xC3 55 #define PCIE_UE_ERR_LEN 0xC4 56 #define PCIE_UE_ERR_DATA 0xC5 57 58 /* Other Error Registers */ 59 #define OTHER_CE_ERR_CNT 0xD0 60 #define OTHER_CE_ERR_LEN 0xD1 61 #define OTHER_CE_ERR_DATA 0xD2 62 #define OTHER_UE_ERR_CNT 0xD8 63 #define OTHER_UE_ERR_LEN 0xD9 64 #define OTHER_UE_ERR_DATA 0xDA 65 66 /* Event Data Registers */ 67 #define VRD_WARN_FAULT_EVENT_DATA 0x78 68 #define VRD_HOT_EVENT_DATA 0x79 69 #define DIMM_HOT_EVENT_DATA 0x7A 70 71 #define MAX_READ_BLOCK_LENGTH 48 72 73 #define RAS_SMPRO_ERR 0 74 #define RAS_PMPRO_ERR 1 75 76 enum RAS_48BYTES_ERR_TYPES { 77 CORE_CE_ERR, 78 CORE_UE_ERR, 79 MEM_CE_ERR, 80 MEM_UE_ERR, 81 PCIE_CE_ERR, 82 PCIE_UE_ERR, 83 OTHER_CE_ERR, 84 OTHER_UE_ERR, 85 NUM_48BYTES_ERR_TYPE, 86 }; 87 88 struct smpro_error_hdr { 89 u8 count; /* Number of the RAS errors */ 90 u8 len; /* Number of data bytes */ 91 u8 data; /* Start of 48-byte data */ 92 u8 max_cnt; /* Max num of errors */ 93 }; 94 95 /* 96 * Included Address of registers to get Count, Length of data and Data 97 * of the 48 bytes error data 98 */ 99 static struct smpro_error_hdr smpro_error_table[] = { 100 [CORE_CE_ERR] = { 101 .count = CORE_CE_ERR_CNT, 102 .len = CORE_CE_ERR_LEN, 103 .data = CORE_CE_ERR_DATA, 104 .max_cnt = 32 105 }, 106 [CORE_UE_ERR] = { 107 .count = CORE_UE_ERR_CNT, 108 .len = CORE_UE_ERR_LEN, 109 .data = CORE_UE_ERR_DATA, 110 .max_cnt = 32 111 }, 112 [MEM_CE_ERR] = { 113 .count = MEM_CE_ERR_CNT, 114 .len = MEM_CE_ERR_LEN, 115 .data = MEM_CE_ERR_DATA, 116 .max_cnt = 16 117 }, 118 [MEM_UE_ERR] = { 119 .count = MEM_UE_ERR_CNT, 120 .len = MEM_UE_ERR_LEN, 121 .data = MEM_UE_ERR_DATA, 122 .max_cnt = 16 123 }, 124 [PCIE_CE_ERR] = { 125 .count = PCIE_CE_ERR_CNT, 126 .len = PCIE_CE_ERR_LEN, 127 .data = PCIE_CE_ERR_DATA, 128 .max_cnt = 96 129 }, 130 [PCIE_UE_ERR] = { 131 .count = PCIE_UE_ERR_CNT, 132 .len = PCIE_UE_ERR_LEN, 133 .data = PCIE_UE_ERR_DATA, 134 .max_cnt = 96 135 }, 136 [OTHER_CE_ERR] = { 137 .count = OTHER_CE_ERR_CNT, 138 .len = OTHER_CE_ERR_LEN, 139 .data = OTHER_CE_ERR_DATA, 140 .max_cnt = 8 141 }, 142 [OTHER_UE_ERR] = { 143 .count = OTHER_UE_ERR_CNT, 144 .len = OTHER_UE_ERR_LEN, 145 .data = OTHER_UE_ERR_DATA, 146 .max_cnt = 8 147 }, 148 }; 149 150 /* 151 * List of SCP registers which are used to get 152 * one type of RAS Internal errors. 153 */ 154 struct smpro_int_error_hdr { 155 u8 type; 156 u8 info_l; 157 u8 info_h; 158 u8 data_l; 159 u8 data_h; 160 u8 warn_l; 161 u8 warn_h; 162 }; 163 164 static struct smpro_int_error_hdr list_smpro_int_error_hdr[] = { 165 [RAS_SMPRO_ERR] = { 166 .type = ERR_SMPRO_TYPE, 167 .info_l = ERR_SMPRO_INFO_LO, 168 .info_h = ERR_SMPRO_INFO_HI, 169 .data_l = ERR_SMPRO_DATA_LO, 170 .data_h = ERR_SMPRO_DATA_HI, 171 .warn_l = WARN_SMPRO_INFO_LO, 172 .warn_h = WARN_SMPRO_INFO_HI, 173 }, 174 [RAS_PMPRO_ERR] = { 175 .type = ERR_PMPRO_TYPE, 176 .info_l = ERR_PMPRO_INFO_LO, 177 .info_h = ERR_PMPRO_INFO_HI, 178 .data_l = ERR_PMPRO_DATA_LO, 179 .data_h = ERR_PMPRO_DATA_HI, 180 .warn_l = WARN_PMPRO_INFO_LO, 181 .warn_h = WARN_PMPRO_INFO_HI, 182 }, 183 }; 184 185 struct smpro_errmon { 186 struct regmap *regmap; 187 }; 188 189 enum EVENT_TYPES { 190 VRD_WARN_FAULT_EVENT, 191 VRD_HOT_EVENT, 192 DIMM_HOT_EVENT, 193 NUM_EVENTS_TYPE, 194 }; 195 196 /* Included Address of event source and data registers */ 197 static u8 smpro_event_table[NUM_EVENTS_TYPE] = { 198 VRD_WARN_FAULT_EVENT_DATA, 199 VRD_HOT_EVENT_DATA, 200 DIMM_HOT_EVENT_DATA, 201 }; 202 203 static ssize_t smpro_event_data_read(struct device *dev, 204 struct device_attribute *da, char *buf, 205 int channel) 206 { 207 struct smpro_errmon *errmon = dev_get_drvdata(dev); 208 s32 event_data; 209 int ret; 210 211 ret = regmap_read(errmon->regmap, smpro_event_table[channel], &event_data); 212 if (ret) 213 return ret; 214 /* Clear event after read */ 215 if (event_data != 0) 216 regmap_write(errmon->regmap, smpro_event_table[channel], event_data); 217 218 return sysfs_emit(buf, "%04x\n", event_data); 219 } 220 221 static ssize_t smpro_overflow_data_read(struct device *dev, struct device_attribute *da, 222 char *buf, int channel) 223 { 224 struct smpro_errmon *errmon = dev_get_drvdata(dev); 225 struct smpro_error_hdr *err_info; 226 s32 err_count; 227 int ret; 228 229 err_info = &smpro_error_table[channel]; 230 231 ret = regmap_read(errmon->regmap, err_info->count, &err_count); 232 if (ret) 233 return ret; 234 235 /* Bit 8 indicates the overflow status */ 236 return sysfs_emit(buf, "%d\n", (err_count & BIT(8)) ? 1 : 0); 237 } 238 239 static ssize_t smpro_error_data_read(struct device *dev, struct device_attribute *da, 240 char *buf, int channel) 241 { 242 struct smpro_errmon *errmon = dev_get_drvdata(dev); 243 unsigned char err_data[MAX_READ_BLOCK_LENGTH]; 244 struct smpro_error_hdr *err_info; 245 s32 err_count, err_length; 246 int ret; 247 248 err_info = &smpro_error_table[channel]; 249 250 ret = regmap_read(errmon->regmap, err_info->count, &err_count); 251 /* Error count is the low byte */ 252 err_count &= 0xff; 253 if (ret || !err_count || err_count > err_info->max_cnt) 254 return ret; 255 256 ret = regmap_read(errmon->regmap, err_info->len, &err_length); 257 if (ret || err_length <= 0) 258 return ret; 259 260 if (err_length > MAX_READ_BLOCK_LENGTH) 261 err_length = MAX_READ_BLOCK_LENGTH; 262 263 memset(err_data, 0x00, MAX_READ_BLOCK_LENGTH); 264 ret = regmap_noinc_read(errmon->regmap, err_info->data, err_data, err_length); 265 if (ret < 0) 266 return ret; 267 268 /* clear the error */ 269 ret = regmap_write(errmon->regmap, err_info->count, 0x100); 270 if (ret) 271 return ret; 272 /* 273 * The output of Core/Memory/PCIe/Others UE/CE errors follows the format 274 * specified in section 5.8.1 CE/UE Error Data record in 275 * Altra SOC BMC Interface specification. 276 */ 277 return sysfs_emit(buf, "%*phN\n", MAX_READ_BLOCK_LENGTH, err_data); 278 } 279 280 /* 281 * Output format: 282 * <4-byte hex value of error info><4-byte hex value of error extensive data> 283 * Where: 284 * + error info : The error information 285 * + error data : Extensive data (32 bits) 286 * Reference to section 5.10 RAS Internal Error Register Definition in 287 * Altra SOC BMC Interface specification 288 */ 289 static ssize_t smpro_internal_err_read(struct device *dev, struct device_attribute *da, 290 char *buf, int channel) 291 { 292 struct smpro_errmon *errmon = dev_get_drvdata(dev); 293 struct smpro_int_error_hdr *err_info; 294 unsigned int err[4] = { 0 }; 295 unsigned int err_type; 296 unsigned int val; 297 int ret; 298 299 /* read error status */ 300 ret = regmap_read(errmon->regmap, GPI_RAS_ERR, &val); 301 if (ret) 302 return ret; 303 304 if ((channel == RAS_SMPRO_ERR && !(val & BIT(0))) || 305 (channel == RAS_PMPRO_ERR && !(val & BIT(1)))) 306 return 0; 307 308 err_info = &list_smpro_int_error_hdr[channel]; 309 ret = regmap_read(errmon->regmap, err_info->type, &val); 310 if (ret) 311 return ret; 312 313 err_type = (val & BIT(1)) ? BIT(1) : 314 (val & BIT(2)) ? BIT(2) : 0; 315 316 if (!err_type) 317 return 0; 318 319 ret = regmap_read(errmon->regmap, err_info->info_l, err + 1); 320 if (ret) 321 return ret; 322 323 ret = regmap_read(errmon->regmap, err_info->info_h, err); 324 if (ret) 325 return ret; 326 327 if (err_type & BIT(2)) { 328 /* Error with data type */ 329 ret = regmap_read(errmon->regmap, err_info->data_l, err + 3); 330 if (ret) 331 return ret; 332 333 ret = regmap_read(errmon->regmap, err_info->data_h, err + 2); 334 if (ret) 335 return ret; 336 } 337 338 /* clear the read errors */ 339 ret = regmap_write(errmon->regmap, err_info->type, err_type); 340 if (ret) 341 return ret; 342 343 return sysfs_emit(buf, "%*phN\n", (int)sizeof(err), err); 344 } 345 346 /* 347 * Output format: 348 * <4-byte hex value of warining info> 349 * Reference to section 5.10 RAS Internal Error Register Definition in 350 * Altra SOC BMC Interface specification 351 */ 352 static ssize_t smpro_internal_warn_read(struct device *dev, struct device_attribute *da, 353 char *buf, int channel) 354 { 355 struct smpro_errmon *errmon = dev_get_drvdata(dev); 356 struct smpro_int_error_hdr *err_info; 357 unsigned int warn[2] = { 0 }; 358 unsigned int val; 359 int ret; 360 361 /* read error status */ 362 ret = regmap_read(errmon->regmap, GPI_RAS_ERR, &val); 363 if (ret) 364 return ret; 365 366 if ((channel == RAS_SMPRO_ERR && !(val & BIT(0))) || 367 (channel == RAS_PMPRO_ERR && !(val & BIT(1)))) 368 return 0; 369 370 err_info = &list_smpro_int_error_hdr[channel]; 371 ret = regmap_read(errmon->regmap, err_info->type, &val); 372 if (ret) 373 return ret; 374 375 if (!(val & BIT(0))) 376 return 0; 377 378 ret = regmap_read(errmon->regmap, err_info->warn_l, warn + 1); 379 if (ret) 380 return ret; 381 382 ret = regmap_read(errmon->regmap, err_info->warn_h, warn); 383 if (ret) 384 return ret; 385 386 /* clear the warning */ 387 ret = regmap_write(errmon->regmap, err_info->type, BIT(0)); 388 if (ret) 389 return ret; 390 391 return sysfs_emit(buf, "%*phN\n", (int)sizeof(warn), warn); 392 } 393 394 #define ERROR_OVERFLOW_RO(_error, _index) \ 395 static ssize_t overflow_##_error##_show(struct device *dev, \ 396 struct device_attribute *da, \ 397 char *buf) \ 398 { \ 399 return smpro_overflow_data_read(dev, da, buf, _index); \ 400 } \ 401 static DEVICE_ATTR_RO(overflow_##_error) 402 403 ERROR_OVERFLOW_RO(core_ce, CORE_CE_ERR); 404 ERROR_OVERFLOW_RO(core_ue, CORE_UE_ERR); 405 ERROR_OVERFLOW_RO(mem_ce, MEM_CE_ERR); 406 ERROR_OVERFLOW_RO(mem_ue, MEM_UE_ERR); 407 ERROR_OVERFLOW_RO(pcie_ce, PCIE_CE_ERR); 408 ERROR_OVERFLOW_RO(pcie_ue, PCIE_UE_ERR); 409 ERROR_OVERFLOW_RO(other_ce, OTHER_CE_ERR); 410 ERROR_OVERFLOW_RO(other_ue, OTHER_UE_ERR); 411 412 #define ERROR_RO(_error, _index) \ 413 static ssize_t error_##_error##_show(struct device *dev, \ 414 struct device_attribute *da, \ 415 char *buf) \ 416 { \ 417 return smpro_error_data_read(dev, da, buf, _index); \ 418 } \ 419 static DEVICE_ATTR_RO(error_##_error) 420 421 ERROR_RO(core_ce, CORE_CE_ERR); 422 ERROR_RO(core_ue, CORE_UE_ERR); 423 ERROR_RO(mem_ce, MEM_CE_ERR); 424 ERROR_RO(mem_ue, MEM_UE_ERR); 425 ERROR_RO(pcie_ce, PCIE_CE_ERR); 426 ERROR_RO(pcie_ue, PCIE_UE_ERR); 427 ERROR_RO(other_ce, OTHER_CE_ERR); 428 ERROR_RO(other_ue, OTHER_UE_ERR); 429 430 static ssize_t error_smpro_show(struct device *dev, struct device_attribute *da, char *buf) 431 { 432 return smpro_internal_err_read(dev, da, buf, RAS_SMPRO_ERR); 433 } 434 static DEVICE_ATTR_RO(error_smpro); 435 436 static ssize_t error_pmpro_show(struct device *dev, struct device_attribute *da, char *buf) 437 { 438 return smpro_internal_err_read(dev, da, buf, RAS_PMPRO_ERR); 439 } 440 static DEVICE_ATTR_RO(error_pmpro); 441 442 static ssize_t warn_smpro_show(struct device *dev, struct device_attribute *da, char *buf) 443 { 444 return smpro_internal_warn_read(dev, da, buf, RAS_SMPRO_ERR); 445 } 446 static DEVICE_ATTR_RO(warn_smpro); 447 448 static ssize_t warn_pmpro_show(struct device *dev, struct device_attribute *da, char *buf) 449 { 450 return smpro_internal_warn_read(dev, da, buf, RAS_PMPRO_ERR); 451 } 452 static DEVICE_ATTR_RO(warn_pmpro); 453 454 #define EVENT_RO(_event, _index) \ 455 static ssize_t event_##_event##_show(struct device *dev, \ 456 struct device_attribute *da, \ 457 char *buf) \ 458 { \ 459 return smpro_event_data_read(dev, da, buf, _index); \ 460 } \ 461 static DEVICE_ATTR_RO(event_##_event) 462 463 EVENT_RO(vrd_warn_fault, VRD_WARN_FAULT_EVENT); 464 EVENT_RO(vrd_hot, VRD_HOT_EVENT); 465 EVENT_RO(dimm_hot, DIMM_HOT_EVENT); 466 467 static struct attribute *smpro_errmon_attrs[] = { 468 &dev_attr_overflow_core_ce.attr, 469 &dev_attr_overflow_core_ue.attr, 470 &dev_attr_overflow_mem_ce.attr, 471 &dev_attr_overflow_mem_ue.attr, 472 &dev_attr_overflow_pcie_ce.attr, 473 &dev_attr_overflow_pcie_ue.attr, 474 &dev_attr_overflow_other_ce.attr, 475 &dev_attr_overflow_other_ue.attr, 476 &dev_attr_error_core_ce.attr, 477 &dev_attr_error_core_ue.attr, 478 &dev_attr_error_mem_ce.attr, 479 &dev_attr_error_mem_ue.attr, 480 &dev_attr_error_pcie_ce.attr, 481 &dev_attr_error_pcie_ue.attr, 482 &dev_attr_error_other_ce.attr, 483 &dev_attr_error_other_ue.attr, 484 &dev_attr_error_smpro.attr, 485 &dev_attr_error_pmpro.attr, 486 &dev_attr_warn_smpro.attr, 487 &dev_attr_warn_pmpro.attr, 488 &dev_attr_event_vrd_warn_fault.attr, 489 &dev_attr_event_vrd_hot.attr, 490 &dev_attr_event_dimm_hot.attr, 491 NULL 492 }; 493 494 ATTRIBUTE_GROUPS(smpro_errmon); 495 496 static int smpro_errmon_probe(struct platform_device *pdev) 497 { 498 struct smpro_errmon *errmon; 499 500 errmon = devm_kzalloc(&pdev->dev, sizeof(struct smpro_errmon), GFP_KERNEL); 501 if (!errmon) 502 return -ENOMEM; 503 504 platform_set_drvdata(pdev, errmon); 505 506 errmon->regmap = dev_get_regmap(pdev->dev.parent, NULL); 507 if (!errmon->regmap) 508 return -ENODEV; 509 510 return 0; 511 } 512 513 static struct platform_driver smpro_errmon_driver = { 514 .probe = smpro_errmon_probe, 515 .driver = { 516 .name = "smpro-errmon", 517 .dev_groups = smpro_errmon_groups, 518 }, 519 }; 520 521 module_platform_driver(smpro_errmon_driver); 522 523 MODULE_AUTHOR("Tung Nguyen <tung.nguyen@amperecomputing.com>"); 524 MODULE_AUTHOR("Thinh Pham <thinh.pham@amperecomputing.com>"); 525 MODULE_AUTHOR("Hoang Nguyen <hnguyen@amperecomputing.com>"); 526 MODULE_AUTHOR("Thu Nguyen <thu@os.amperecomputing.com>"); 527 MODULE_AUTHOR("Quan Nguyen <quan@os.amperecomputing.com>"); 528 MODULE_DESCRIPTION("Ampere Altra SMpro driver"); 529 MODULE_LICENSE("GPL"); 530