1 /* 2 * Copyright 2019 Advanced Micro Devices, Inc. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice shall be included in 12 * all copies or substantial portions of the Software. 13 * 14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 20 * OTHER DEALINGS IN THE SOFTWARE. 21 * 22 */ 23 24 #include "amdgpu_ras_eeprom.h" 25 #include "amdgpu.h" 26 #include "amdgpu_ras.h" 27 #include <linux/bits.h> 28 #include "atom.h" 29 #include "amdgpu_eeprom.h" 30 #include "amdgpu_atomfirmware.h" 31 #include <linux/debugfs.h> 32 #include <linux/uaccess.h> 33 34 #include "amdgpu_reset.h" 35 #include "amdgpu_ras_mgr.h" 36 37 /* These are memory addresses as would be seen by one or more EEPROM 38 * chips strung on the I2C bus, usually by manipulating pins 1-3 of a 39 * set of EEPROM devices. They form a continuous memory space. 40 * 41 * The I2C device address includes the device type identifier, 1010b, 42 * which is a reserved value and indicates that this is an I2C EEPROM 43 * device. It also includes the top 3 bits of the 19 bit EEPROM memory 44 * address, namely bits 18, 17, and 16. 
This makes up the 7 bit
 * address sent on the I2C bus with bit 0 being the direction bit,
 * which is not represented here, and sent by the hardware directly.
 *
 * For instance,
 *   50h = 1010000b => device type identifier 1010b, bits 18:16 = 000b, address 0.
 *   54h = 1010100b => --"--, bits 18:16 = 100b, address 40000h.
 *   56h = 1010110b => --"--, bits 18:16 = 110b, address 60000h.
 * Depending on the size of the I2C EEPROM device(s), bits 18:16 may
 * address memory in a device or a device on the I2C bus, depending on
 * the status of pins 1-3. See top of amdgpu_eeprom.c.
 *
 * The RAS table lives either at address 0 or address 40000h of EEPROM.
 */
#define EEPROM_I2C_MADDR_0      0x0
#define EEPROM_I2C_MADDR_4      0x40000

/*
 * The 2 macros below represent the actual size in bytes that
 * those entities occupy in the EEPROM memory.
 * RAS_TABLE_RECORD_SIZE is different from sizeof(eeprom_table_record),
 * which uses uint64 to store 6-byte fields such as retired_page.
 */
#define RAS_TABLE_HEADER_SIZE   20
#define RAS_TABLE_RECORD_SIZE   24

/* Table hdr is 'AMDR' */
#define RAS_TABLE_HDR_VAL       0x414d4452

/* Bad GPU tag 'BADG' */
#define RAS_TABLE_HDR_BAD       0x42414447

/*
 * EEPROM Table structure v1
 * ---------------------------------
 * |                               |
 * |     EEPROM TABLE HEADER       |
 * |      ( size 20 Bytes )        |
 * |                               |
 * ---------------------------------
 * |                               |
 * |    BAD PAGE RECORD AREA       |
 * |                               |
 * ---------------------------------
 */

/* Assume 2-Mbit size EEPROM and take up the whole space.
 */
#define RAS_TBL_SIZE_BYTES      (256 * 1024)
#define RAS_TABLE_START         0
#define RAS_HDR_START           RAS_TABLE_START
#define RAS_RECORD_START        (RAS_HDR_START + RAS_TABLE_HEADER_SIZE)
#define RAS_MAX_RECORD_COUNT    ((RAS_TBL_SIZE_BYTES - RAS_TABLE_HEADER_SIZE) \
				 / RAS_TABLE_RECORD_SIZE)

/*
 * EEPROM Table structure v2.1
 * ---------------------------------
 * |                               |
 * |     EEPROM TABLE HEADER       |
 * |      ( size 20 Bytes )        |
 * |                               |
 * ---------------------------------
 * |                               |
 * |     EEPROM TABLE RAS INFO     |
 * | (available info size 4 Bytes) |
 * |  ( reserved size 252 Bytes )  |
 * |                               |
 * ---------------------------------
 * |                               |
 * |     BAD PAGE RECORD AREA      |
 * |                               |
 * ---------------------------------
 */

/* EEPROM Table V2_1 */
#define RAS_TABLE_V2_1_INFO_SIZE       256
#define RAS_TABLE_V2_1_INFO_START      RAS_TABLE_HEADER_SIZE
#define RAS_RECORD_START_V2_1          (RAS_HDR_START + RAS_TABLE_HEADER_SIZE + \
					RAS_TABLE_V2_1_INFO_SIZE)
#define RAS_MAX_RECORD_COUNT_V2_1      ((RAS_TBL_SIZE_BYTES - RAS_TABLE_HEADER_SIZE - \
					RAS_TABLE_V2_1_INFO_SIZE) \
					/ RAS_TABLE_RECORD_SIZE)

#define RAS_SMU_MESSAGE_TIMEOUT_MS 1000 /* 1s */

/* Given a zero-based index of an EEPROM RAS record, yields the EEPROM
 * offset off of RAS_TABLE_START.  That is, this is something you can
 * add to control->i2c_address, and then tell I2C layer to read
 * from/write to there. _N is the so called absolute index,
 * because it starts right after the table header.
 */
#define RAS_INDEX_TO_OFFSET(_C, _N) ((_C)->ras_record_offset + \
				     (_N) * RAS_TABLE_RECORD_SIZE)

/* Inverse of RAS_INDEX_TO_OFFSET(): EEPROM offset _O back to the
 * absolute record index.
 */
#define RAS_OFFSET_TO_INDEX(_C, _O) (((_O) - \
				      (_C)->ras_record_offset) / RAS_TABLE_RECORD_SIZE)

/* Given a 0-based relative record index, 0, 1, 2, ..., etc., off
 * of "fri", return the absolute record index off of the end of
 * the table header.
144 */ 145 #define RAS_RI_TO_AI(_C, _I) (((_I) + (_C)->ras_fri) % \ 146 (_C)->ras_max_record_count) 147 148 #define RAS_NUM_RECS(_tbl_hdr) \ 149 (((_tbl_hdr)->tbl_size < RAS_TABLE_HEADER_SIZE) ? 0u : \ 150 (((_tbl_hdr)->tbl_size - RAS_TABLE_HEADER_SIZE) / RAS_TABLE_RECORD_SIZE)) 151 152 #define RAS_NUM_RECS_V2_1(_tbl_hdr) \ 153 (((_tbl_hdr)->tbl_size < RAS_TABLE_HEADER_SIZE + \ 154 RAS_TABLE_V2_1_INFO_SIZE) ? 0u : \ 155 (((_tbl_hdr)->tbl_size - RAS_TABLE_HEADER_SIZE - \ 156 RAS_TABLE_V2_1_INFO_SIZE) / RAS_TABLE_RECORD_SIZE)) 157 158 #define to_amdgpu_device(x) ((container_of(x, struct amdgpu_ras, eeprom_control))->adev) 159 160 static bool __is_ras_eeprom_supported(struct amdgpu_device *adev) 161 { 162 switch (amdgpu_ip_version(adev, MP1_HWIP, 0)) { 163 case IP_VERSION(11, 0, 2): /* VEGA20 and ARCTURUS */ 164 case IP_VERSION(11, 0, 7): /* Sienna cichlid */ 165 case IP_VERSION(13, 0, 0): 166 case IP_VERSION(13, 0, 2): /* Aldebaran */ 167 case IP_VERSION(13, 0, 10): 168 return true; 169 case IP_VERSION(13, 0, 6): 170 case IP_VERSION(13, 0, 12): 171 case IP_VERSION(13, 0, 14): 172 return (adev->gmc.is_app_apu) ? false : true; 173 default: 174 return false; 175 } 176 } 177 178 static bool __get_eeprom_i2c_addr(struct amdgpu_device *adev, 179 struct amdgpu_ras_eeprom_control *control) 180 { 181 struct atom_context *atom_ctx = adev->mode_info.atom_context; 182 u8 i2c_addr; 183 184 if (!control) 185 return false; 186 187 if (adev->bios && amdgpu_atomfirmware_ras_rom_addr(adev, &i2c_addr)) { 188 /* The address given by VBIOS is an 8-bit, wire-format 189 * address, i.e. the most significant byte. 190 * 191 * Normalize it to a 19-bit EEPROM address. Remove the 192 * device type identifier and make it a 7-bit address; 193 * then make it a 19-bit EEPROM address. See top of 194 * amdgpu_eeprom.c. 
195 */ 196 i2c_addr = (i2c_addr & 0x0F) >> 1; 197 control->i2c_address = ((u32) i2c_addr) << 16; 198 199 return true; 200 } 201 202 switch (amdgpu_ip_version(adev, MP1_HWIP, 0)) { 203 case IP_VERSION(11, 0, 2): 204 /* VEGA20 and ARCTURUS */ 205 if (adev->asic_type == CHIP_VEGA20) 206 control->i2c_address = EEPROM_I2C_MADDR_0; 207 else if (strnstr(atom_ctx->vbios_pn, 208 "D342", 209 sizeof(atom_ctx->vbios_pn))) 210 control->i2c_address = EEPROM_I2C_MADDR_0; 211 else 212 control->i2c_address = EEPROM_I2C_MADDR_4; 213 return true; 214 case IP_VERSION(11, 0, 7): 215 control->i2c_address = EEPROM_I2C_MADDR_0; 216 return true; 217 case IP_VERSION(13, 0, 2): 218 if (strnstr(atom_ctx->vbios_pn, "D673", 219 sizeof(atom_ctx->vbios_pn))) 220 control->i2c_address = EEPROM_I2C_MADDR_4; 221 else 222 control->i2c_address = EEPROM_I2C_MADDR_0; 223 return true; 224 case IP_VERSION(13, 0, 0): 225 if (strnstr(atom_ctx->vbios_pn, "D707", 226 sizeof(atom_ctx->vbios_pn))) 227 control->i2c_address = EEPROM_I2C_MADDR_0; 228 else 229 control->i2c_address = EEPROM_I2C_MADDR_4; 230 return true; 231 case IP_VERSION(13, 0, 6): 232 case IP_VERSION(13, 0, 10): 233 case IP_VERSION(13, 0, 12): 234 case IP_VERSION(13, 0, 14): 235 control->i2c_address = EEPROM_I2C_MADDR_4; 236 return true; 237 default: 238 return false; 239 } 240 } 241 242 static void 243 __encode_table_header_to_buf(struct amdgpu_ras_eeprom_table_header *hdr, 244 unsigned char *buf) 245 { 246 u32 *pp = (uint32_t *)buf; 247 248 pp[0] = cpu_to_le32(hdr->header); 249 pp[1] = cpu_to_le32(hdr->version); 250 pp[2] = cpu_to_le32(hdr->first_rec_offset); 251 pp[3] = cpu_to_le32(hdr->tbl_size); 252 pp[4] = cpu_to_le32(hdr->checksum); 253 } 254 255 static void 256 __decode_table_header_from_buf(struct amdgpu_ras_eeprom_table_header *hdr, 257 unsigned char *buf) 258 { 259 u32 *pp = (uint32_t *)buf; 260 261 hdr->header = le32_to_cpu(pp[0]); 262 hdr->version = le32_to_cpu(pp[1]); 263 hdr->first_rec_offset = le32_to_cpu(pp[2]); 264 hdr->tbl_size 
= le32_to_cpu(pp[3]); 265 hdr->checksum = le32_to_cpu(pp[4]); 266 } 267 268 static int __write_table_header(struct amdgpu_ras_eeprom_control *control) 269 { 270 u8 buf[RAS_TABLE_HEADER_SIZE]; 271 struct amdgpu_device *adev = to_amdgpu_device(control); 272 int res; 273 274 memset(buf, 0, sizeof(buf)); 275 __encode_table_header_to_buf(&control->tbl_hdr, buf); 276 277 /* i2c may be unstable in gpu reset */ 278 down_read(&adev->reset_domain->sem); 279 res = amdgpu_eeprom_write(adev->pm.ras_eeprom_i2c_bus, 280 control->i2c_address + 281 control->ras_header_offset, 282 buf, RAS_TABLE_HEADER_SIZE); 283 up_read(&adev->reset_domain->sem); 284 285 if (res < 0) { 286 dev_err(adev->dev, "Failed to write EEPROM table header:%d", 287 res); 288 } else if (res < RAS_TABLE_HEADER_SIZE) { 289 dev_err(adev->dev, "Short write:%d out of %d\n", res, 290 RAS_TABLE_HEADER_SIZE); 291 res = -EIO; 292 } else { 293 res = 0; 294 } 295 296 return res; 297 } 298 299 static void 300 __encode_table_ras_info_to_buf(struct amdgpu_ras_eeprom_table_ras_info *rai, 301 unsigned char *buf) 302 { 303 u32 *pp = (uint32_t *)buf; 304 u32 tmp; 305 306 tmp = ((uint32_t)(rai->rma_status) & 0xFF) | 307 (((uint32_t)(rai->health_percent) << 8) & 0xFF00) | 308 (((uint32_t)(rai->ecc_page_threshold) << 16) & 0xFFFF0000); 309 pp[0] = cpu_to_le32(tmp); 310 } 311 312 static void 313 __decode_table_ras_info_from_buf(struct amdgpu_ras_eeprom_table_ras_info *rai, 314 unsigned char *buf) 315 { 316 u32 *pp = (uint32_t *)buf; 317 u32 tmp; 318 319 tmp = le32_to_cpu(pp[0]); 320 rai->rma_status = tmp & 0xFF; 321 rai->health_percent = (tmp >> 8) & 0xFF; 322 rai->ecc_page_threshold = (tmp >> 16) & 0xFFFF; 323 } 324 325 static int __write_table_ras_info(struct amdgpu_ras_eeprom_control *control) 326 { 327 struct amdgpu_device *adev = to_amdgpu_device(control); 328 u8 *buf; 329 int res; 330 331 buf = kzalloc(RAS_TABLE_V2_1_INFO_SIZE, GFP_KERNEL); 332 if (!buf) { 333 dev_err(adev->dev, 334 "Failed to alloc buf to write table ras 
info\n"); 335 return -ENOMEM; 336 } 337 338 __encode_table_ras_info_to_buf(&control->tbl_rai, buf); 339 340 /* i2c may be unstable in gpu reset */ 341 down_read(&adev->reset_domain->sem); 342 res = amdgpu_eeprom_write(adev->pm.ras_eeprom_i2c_bus, 343 control->i2c_address + 344 control->ras_info_offset, 345 buf, RAS_TABLE_V2_1_INFO_SIZE); 346 up_read(&adev->reset_domain->sem); 347 348 if (res < 0) { 349 dev_err(adev->dev, "Failed to write EEPROM table ras info:%d", 350 res); 351 } else if (res < RAS_TABLE_V2_1_INFO_SIZE) { 352 dev_err(adev->dev, "Short write:%d out of %d\n", res, 353 RAS_TABLE_V2_1_INFO_SIZE); 354 res = -EIO; 355 } else { 356 res = 0; 357 } 358 359 kfree(buf); 360 361 return res; 362 } 363 364 static u8 __calc_hdr_byte_sum(const struct amdgpu_ras_eeprom_control *control) 365 { 366 int ii; 367 u8 *pp, csum; 368 size_t sz; 369 370 /* Header checksum, skip checksum field in the calculation */ 371 sz = sizeof(control->tbl_hdr) - sizeof(control->tbl_hdr.checksum); 372 pp = (u8 *) &control->tbl_hdr; 373 csum = 0; 374 for (ii = 0; ii < sz; ii++, pp++) 375 csum += *pp; 376 377 return csum; 378 } 379 380 static u8 __calc_ras_info_byte_sum(const struct amdgpu_ras_eeprom_control *control) 381 { 382 int ii; 383 u8 *pp, csum; 384 size_t sz; 385 386 sz = sizeof(control->tbl_rai); 387 pp = (u8 *) &control->tbl_rai; 388 csum = 0; 389 for (ii = 0; ii < sz; ii++, pp++) 390 csum += *pp; 391 392 return csum; 393 } 394 395 static int amdgpu_ras_eeprom_correct_header_tag( 396 struct amdgpu_ras_eeprom_control *control, 397 uint32_t header) 398 { 399 struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr; 400 u8 *hh; 401 int res; 402 u8 csum; 403 404 csum = -hdr->checksum; 405 406 hh = (void *) &hdr->header; 407 csum -= (hh[0] + hh[1] + hh[2] + hh[3]); 408 hh = (void *) &header; 409 csum += hh[0] + hh[1] + hh[2] + hh[3]; 410 csum = -csum; 411 mutex_lock(&control->ras_tbl_mutex); 412 hdr->header = header; 413 hdr->checksum = csum; 414 res = 
__write_table_header(control); 415 mutex_unlock(&control->ras_tbl_mutex); 416 417 return res; 418 } 419 420 static void amdgpu_ras_set_eeprom_table_version(struct amdgpu_ras_eeprom_control *control) 421 { 422 struct amdgpu_device *adev = to_amdgpu_device(control); 423 struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr; 424 425 switch (amdgpu_ip_version(adev, UMC_HWIP, 0)) { 426 case IP_VERSION(8, 10, 0): 427 hdr->version = RAS_TABLE_VER_V2_1; 428 return; 429 case IP_VERSION(12, 0, 0): 430 case IP_VERSION(12, 5, 0): 431 hdr->version = RAS_TABLE_VER_V3; 432 return; 433 default: 434 hdr->version = RAS_TABLE_VER_V1; 435 return; 436 } 437 } 438 439 /** 440 * amdgpu_ras_eeprom_reset_table -- Reset the RAS EEPROM table 441 * @control: pointer to control structure 442 * 443 * Reset the contents of the header of the RAS EEPROM table. 444 * Return 0 on success, -errno on error. 445 */ 446 int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control) 447 { 448 struct amdgpu_device *adev = to_amdgpu_device(control); 449 struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr; 450 struct amdgpu_ras_eeprom_table_ras_info *rai = &control->tbl_rai; 451 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 452 u32 erase_res = 0; 453 u8 csum; 454 int res; 455 456 mutex_lock(&control->ras_tbl_mutex); 457 458 if (!amdgpu_ras_smu_eeprom_supported(adev)) { 459 hdr->header = RAS_TABLE_HDR_VAL; 460 amdgpu_ras_set_eeprom_table_version(control); 461 462 if (hdr->version >= RAS_TABLE_VER_V2_1) { 463 hdr->first_rec_offset = RAS_RECORD_START_V2_1; 464 hdr->tbl_size = RAS_TABLE_HEADER_SIZE + 465 RAS_TABLE_V2_1_INFO_SIZE; 466 rai->rma_status = GPU_HEALTH_USABLE; 467 468 control->ras_record_offset = RAS_RECORD_START_V2_1; 469 control->ras_max_record_count = RAS_MAX_RECORD_COUNT_V2_1; 470 /** 471 * GPU health represented as a percentage. 472 * 0 means worst health, 100 means fully health. 
473 */ 474 rai->health_percent = 100; 475 /* ecc_page_threshold = 0 means disable bad page retirement */ 476 rai->ecc_page_threshold = con->bad_page_cnt_threshold; 477 } else { 478 hdr->first_rec_offset = RAS_RECORD_START; 479 hdr->tbl_size = RAS_TABLE_HEADER_SIZE; 480 481 control->ras_record_offset = RAS_RECORD_START; 482 control->ras_max_record_count = RAS_MAX_RECORD_COUNT; 483 } 484 485 csum = __calc_hdr_byte_sum(control); 486 if (hdr->version >= RAS_TABLE_VER_V2_1) 487 csum += __calc_ras_info_byte_sum(control); 488 csum = -csum; 489 hdr->checksum = csum; 490 res = __write_table_header(control); 491 if (!res && hdr->version > RAS_TABLE_VER_V1) 492 res = __write_table_ras_info(control); 493 } else { 494 res = amdgpu_ras_smu_erase_ras_table(adev, &erase_res); 495 if (res || erase_res) { 496 dev_warn(adev->dev, "RAS EEPROM reset failed, res:%d result:%d", 497 res, erase_res); 498 if (!res) 499 res = -EIO; 500 } 501 } 502 503 control->ras_num_recs = 0; 504 control->ras_num_bad_pages = 0; 505 control->ras_num_mca_recs = 0; 506 control->ras_num_pa_recs = 0; 507 control->ras_fri = 0; 508 509 amdgpu_dpm_send_hbm_bad_pages_num(adev, control->ras_num_bad_pages); 510 511 control->bad_channel_bitmap = 0; 512 amdgpu_dpm_send_hbm_bad_channel_flag(adev, control->bad_channel_bitmap); 513 con->update_channel_flag = false; 514 /* there is no record on eeprom now, clear the counter */ 515 if (con->eh_data) 516 con->eh_data->count_saved = 0; 517 518 amdgpu_ras_debugfs_set_ret_size(control); 519 520 mutex_unlock(&control->ras_tbl_mutex); 521 522 return res; 523 } 524 525 static void 526 __encode_table_record_to_buf(struct amdgpu_ras_eeprom_control *control, 527 struct eeprom_table_record *record, 528 unsigned char *buf) 529 { 530 __le64 tmp = 0; 531 int i = 0; 532 533 /* Next are all record fields according to EEPROM page spec in LE foramt */ 534 buf[i++] = record->err_type; 535 536 buf[i++] = record->bank; 537 538 tmp = cpu_to_le64(record->ts); 539 memcpy(buf + i, &tmp, 8); 540 i 
+= 8; 541 542 tmp = cpu_to_le64((record->offset & 0xffffffffffff)); 543 memcpy(buf + i, &tmp, 6); 544 i += 6; 545 546 buf[i++] = record->mem_channel; 547 buf[i++] = record->mcumc_id; 548 549 tmp = cpu_to_le64((record->retired_page & 0xffffffffffff)); 550 memcpy(buf + i, &tmp, 6); 551 } 552 553 static void 554 __decode_table_record_from_buf(struct amdgpu_ras_eeprom_control *control, 555 struct eeprom_table_record *record, 556 unsigned char *buf) 557 { 558 __le64 tmp = 0; 559 int i = 0; 560 561 /* Next are all record fields according to EEPROM page spec in LE foramt */ 562 record->err_type = buf[i++]; 563 564 record->bank = buf[i++]; 565 566 memcpy(&tmp, buf + i, 8); 567 record->ts = le64_to_cpu(tmp); 568 i += 8; 569 570 memcpy(&tmp, buf + i, 6); 571 record->offset = (le64_to_cpu(tmp) & 0xffffffffffff); 572 i += 6; 573 574 record->mem_channel = buf[i++]; 575 record->mcumc_id = buf[i++]; 576 577 memcpy(&tmp, buf + i, 6); 578 record->retired_page = (le64_to_cpu(tmp) & 0xffffffffffff); 579 } 580 581 bool amdgpu_ras_eeprom_check_err_threshold(struct amdgpu_device *adev) 582 { 583 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 584 585 if (amdgpu_uniras_enabled(adev)) 586 return amdgpu_ras_mgr_check_eeprom_safety_watermark(adev); 587 588 if (!__is_ras_eeprom_supported(adev) || 589 !amdgpu_bad_page_threshold) 590 return false; 591 592 /* skip check eeprom table for VEGA20 Gaming */ 593 if (!con) 594 return false; 595 else 596 if (!(con->features & BIT(AMDGPU_RAS_BLOCK__UMC))) 597 return false; 598 599 if (con->eeprom_control.tbl_hdr.header == RAS_TABLE_HDR_BAD) { 600 if (con->eeprom_control.ras_num_bad_pages > con->bad_page_cnt_threshold) 601 dev_warn(adev->dev, "RAS records:%d exceed threshold:%d", 602 con->eeprom_control.ras_num_bad_pages, con->bad_page_cnt_threshold); 603 if ((amdgpu_bad_page_threshold == -1) || 604 (amdgpu_bad_page_threshold == -2)) { 605 dev_warn(adev->dev, 606 "Please consult AMD Service Action Guide (SAG) for appropriate service 
procedures.\n"); 607 return false; 608 } else { 609 dev_warn(adev->dev, 610 "Please consider adjusting the customized threshold.\n"); 611 return true; 612 } 613 } 614 615 return false; 616 } 617 618 /** 619 * __amdgpu_ras_eeprom_write -- write indexed from buffer to EEPROM 620 * @control: pointer to control structure 621 * @buf: pointer to buffer containing data to write 622 * @fri: start writing at this index 623 * @num: number of records to write 624 * 625 * The caller must hold the table mutex in @control. 626 * Return 0 on success, -errno otherwise. 627 */ 628 static int __amdgpu_ras_eeprom_write(struct amdgpu_ras_eeprom_control *control, 629 u8 *buf, const u32 fri, const u32 num) 630 { 631 struct amdgpu_device *adev = to_amdgpu_device(control); 632 u32 buf_size; 633 int res; 634 635 /* i2c may be unstable in gpu reset */ 636 down_read(&adev->reset_domain->sem); 637 buf_size = num * RAS_TABLE_RECORD_SIZE; 638 res = amdgpu_eeprom_write(adev->pm.ras_eeprom_i2c_bus, 639 control->i2c_address + 640 RAS_INDEX_TO_OFFSET(control, fri), 641 buf, buf_size); 642 up_read(&adev->reset_domain->sem); 643 if (res < 0) { 644 dev_err(adev->dev, "Writing %d EEPROM table records error:%d", 645 num, res); 646 } else if (res < buf_size) { 647 /* Short write, return error. 648 */ 649 dev_err(adev->dev, "Wrote %d records out of %d", 650 res / RAS_TABLE_RECORD_SIZE, num); 651 res = -EIO; 652 } else { 653 res = 0; 654 } 655 656 return res; 657 } 658 659 static int 660 amdgpu_ras_eeprom_append_table(struct amdgpu_ras_eeprom_control *control, 661 struct eeprom_table_record *record, 662 const u32 num) 663 { 664 struct amdgpu_ras *con = amdgpu_ras_get_context(to_amdgpu_device(control)); 665 struct amdgpu_device *adev = to_amdgpu_device(control); 666 u32 a, b, i; 667 u8 *buf, *pp; 668 int res; 669 670 buf = kcalloc(num, RAS_TABLE_RECORD_SIZE, GFP_KERNEL); 671 if (!buf) 672 return -ENOMEM; 673 674 /* Encode all of them in one go. 
675 */ 676 pp = buf; 677 for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE) { 678 __encode_table_record_to_buf(control, &record[i], pp); 679 680 /* update bad channel bitmap */ 681 if ((record[i].mem_channel < BITS_PER_TYPE(control->bad_channel_bitmap)) && 682 !(control->bad_channel_bitmap & (1 << record[i].mem_channel))) { 683 control->bad_channel_bitmap |= 1 << record[i].mem_channel; 684 con->update_channel_flag = true; 685 } 686 } 687 688 /* a, first record index to write into. 689 * b, last record index to write into. 690 * a = first index to read (fri) + number of records in the table, 691 * b = a + @num - 1. 692 * Let N = control->ras_max_num_record_count, then we have, 693 * case 0: 0 <= a <= b < N, 694 * just append @num records starting at a; 695 * case 1: 0 <= a < N <= b, 696 * append (N - a) records starting at a, and 697 * append the remainder, b % N + 1, starting at 0. 698 * case 2: 0 <= fri < N <= a <= b, then modulo N we get two subcases, 699 * case 2a: 0 <= a <= b < N 700 * append num records starting at a; and fix fri if b overwrote it, 701 * and since a <= b, if b overwrote it then a must've also, 702 * and if b didn't overwrite it, then a didn't also. 703 * case 2b: 0 <= b < a < N 704 * write num records starting at a, which wraps around 0=N 705 * and overwrite fri unconditionally. Now from case 2a, 706 * this means that b eclipsed fri to overwrite it and wrap 707 * around 0 again, i.e. b = 2N+r pre modulo N, so we unconditionally 708 * set fri = b + 1 (mod N). 709 * Now, since fri is updated in every case, except the trivial case 0, 710 * the number of records present in the table after writing, is, 711 * num_recs - 1 = b - fri (mod N), and we take the positive value, 712 * by adding an arbitrary multiple of N before taking the modulo N 713 * as shown below. 
714 */ 715 a = control->ras_fri + control->ras_num_recs; 716 b = a + num - 1; 717 if (b < control->ras_max_record_count) { 718 res = __amdgpu_ras_eeprom_write(control, buf, a, num); 719 } else if (a < control->ras_max_record_count) { 720 u32 g0, g1; 721 722 g0 = control->ras_max_record_count - a; 723 g1 = b % control->ras_max_record_count + 1; 724 res = __amdgpu_ras_eeprom_write(control, buf, a, g0); 725 if (res) 726 goto Out; 727 res = __amdgpu_ras_eeprom_write(control, 728 buf + g0 * RAS_TABLE_RECORD_SIZE, 729 0, g1); 730 if (res) 731 goto Out; 732 if (g1 > control->ras_fri) 733 control->ras_fri = g1 % control->ras_max_record_count; 734 } else { 735 a %= control->ras_max_record_count; 736 b %= control->ras_max_record_count; 737 738 if (a <= b) { 739 /* Note that, b - a + 1 = num. */ 740 res = __amdgpu_ras_eeprom_write(control, buf, a, num); 741 if (res) 742 goto Out; 743 if (b >= control->ras_fri) 744 control->ras_fri = (b + 1) % control->ras_max_record_count; 745 } else { 746 u32 g0, g1; 747 748 /* b < a, which means, we write from 749 * a to the end of the table, and from 750 * the start of the table to b. 
751 */ 752 g0 = control->ras_max_record_count - a; 753 g1 = b + 1; 754 res = __amdgpu_ras_eeprom_write(control, buf, a, g0); 755 if (res) 756 goto Out; 757 res = __amdgpu_ras_eeprom_write(control, 758 buf + g0 * RAS_TABLE_RECORD_SIZE, 759 0, g1); 760 if (res) 761 goto Out; 762 control->ras_fri = g1 % control->ras_max_record_count; 763 } 764 } 765 control->ras_num_recs = 1 + (control->ras_max_record_count + b 766 - control->ras_fri) 767 % control->ras_max_record_count; 768 769 /*old asics only save pa to eeprom like before*/ 770 if (IP_VERSION_MAJ(amdgpu_ip_version(adev, UMC_HWIP, 0)) < 12) 771 control->ras_num_pa_recs += num; 772 else 773 control->ras_num_mca_recs += num; 774 775 control->ras_num_bad_pages = con->bad_page_num; 776 Out: 777 kfree(buf); 778 return res; 779 } 780 781 static int 782 amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control) 783 { 784 struct amdgpu_device *adev = to_amdgpu_device(control); 785 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 786 u8 *buf, *pp, csum; 787 u32 buf_size; 788 int res; 789 790 /* Modify the header if it exceeds. 
791 */ 792 if (amdgpu_bad_page_threshold != 0 && 793 control->ras_num_bad_pages > ras->bad_page_cnt_threshold) { 794 dev_warn(adev->dev, 795 "Saved bad pages %d reaches threshold value %d\n", 796 control->ras_num_bad_pages, ras->bad_page_cnt_threshold); 797 798 if (adev->cper.enabled && !amdgpu_uniras_enabled(adev) && 799 amdgpu_cper_generate_bp_threshold_record(adev)) 800 dev_warn(adev->dev, "fail to generate bad page threshold cper records\n"); 801 802 if ((amdgpu_bad_page_threshold != -1) && 803 (amdgpu_bad_page_threshold != -2)) { 804 control->tbl_hdr.header = RAS_TABLE_HDR_BAD; 805 if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1) { 806 control->tbl_rai.rma_status = GPU_RETIRED__ECC_REACH_THRESHOLD; 807 control->tbl_rai.health_percent = 0; 808 } 809 ras->is_rma = true; 810 } 811 812 /* ignore the -ENOTSUPP return value */ 813 amdgpu_dpm_send_rma_reason(adev); 814 } 815 816 if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1) 817 control->tbl_hdr.tbl_size = RAS_TABLE_HEADER_SIZE + 818 RAS_TABLE_V2_1_INFO_SIZE + 819 control->ras_num_recs * RAS_TABLE_RECORD_SIZE; 820 else 821 control->tbl_hdr.tbl_size = RAS_TABLE_HEADER_SIZE + 822 control->ras_num_recs * RAS_TABLE_RECORD_SIZE; 823 control->tbl_hdr.checksum = 0; 824 825 buf_size = control->ras_num_recs * RAS_TABLE_RECORD_SIZE; 826 buf = kcalloc(control->ras_num_recs, RAS_TABLE_RECORD_SIZE, GFP_KERNEL); 827 if (!buf) { 828 dev_err(adev->dev, 829 "allocating memory for table of size %d bytes failed\n", 830 control->tbl_hdr.tbl_size); 831 res = -ENOMEM; 832 goto Out; 833 } 834 835 down_read(&adev->reset_domain->sem); 836 res = amdgpu_eeprom_read(adev->pm.ras_eeprom_i2c_bus, 837 control->i2c_address + 838 control->ras_record_offset, 839 buf, buf_size); 840 up_read(&adev->reset_domain->sem); 841 if (res < 0) { 842 dev_err(adev->dev, "EEPROM failed reading records:%d\n", res); 843 goto Out; 844 } else if (res < buf_size) { 845 dev_err(adev->dev, "EEPROM read %d out of %d bytes\n", res, 846 buf_size); 847 res = -EIO; 
848 goto Out; 849 } 850 851 /** 852 * bad page records have been stored in eeprom, 853 * now calculate gpu health percent 854 */ 855 if (amdgpu_bad_page_threshold != 0 && 856 control->tbl_hdr.version >= RAS_TABLE_VER_V2_1 && 857 control->ras_num_bad_pages <= ras->bad_page_cnt_threshold) 858 control->tbl_rai.health_percent = ((ras->bad_page_cnt_threshold - 859 control->ras_num_bad_pages) * 100) / 860 ras->bad_page_cnt_threshold; 861 862 /* Recalc the checksum. 863 */ 864 csum = 0; 865 for (pp = buf; pp < buf + buf_size; pp++) 866 csum += *pp; 867 868 csum += __calc_hdr_byte_sum(control); 869 if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1) 870 csum += __calc_ras_info_byte_sum(control); 871 /* avoid sign extension when assigning to "checksum" */ 872 csum = -csum; 873 control->tbl_hdr.checksum = csum; 874 res = __write_table_header(control); 875 if (!res && control->tbl_hdr.version > RAS_TABLE_VER_V1) 876 res = __write_table_ras_info(control); 877 Out: 878 kfree(buf); 879 return res; 880 } 881 882 int amdgpu_ras_eeprom_update_record_num(struct amdgpu_ras_eeprom_control *control) 883 { 884 struct amdgpu_device *adev = to_amdgpu_device(control); 885 int ret, retry = 20; 886 887 if (!amdgpu_ras_smu_eeprom_supported(adev)) 888 return 0; 889 890 control->ras_num_recs_old = control->ras_num_recs; 891 892 do { 893 /* 1000ms timeout is long enough, smu_get_badpage_count won't 894 * return -EBUSY before timeout. 895 */ 896 ret = amdgpu_ras_smu_get_badpage_count(adev, 897 &(control->ras_num_recs), RAS_SMU_MESSAGE_TIMEOUT_MS); 898 if (!ret && 899 (control->ras_num_recs_old == control->ras_num_recs)) { 900 /* record number update in PMFW needs some time, 901 * smu_get_badpage_count may return immediately without 902 * count update, sleep for a while and retry again. 
903 */ 904 msleep(50); 905 retry--; 906 } else { 907 break; 908 } 909 } while (retry); 910 911 /* no update of record number is not a real failure, 912 * don't print warning here 913 */ 914 if (!ret && (control->ras_num_recs_old == control->ras_num_recs)) 915 ret = -EINVAL; 916 917 return ret; 918 } 919 920 static int amdgpu_ras_smu_eeprom_append(struct amdgpu_ras_eeprom_control *control) 921 { 922 struct amdgpu_device *adev = to_amdgpu_device(control); 923 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 924 925 if (!amdgpu_ras_smu_eeprom_supported(adev) || !con) 926 return 0; 927 928 control->ras_num_bad_pages = con->bad_page_num; 929 930 if (amdgpu_bad_page_threshold != 0 && 931 control->ras_num_bad_pages > con->bad_page_cnt_threshold) { 932 dev_warn(adev->dev, 933 "Saved bad pages %d reaches threshold value %d\n", 934 control->ras_num_bad_pages, con->bad_page_cnt_threshold); 935 936 if (adev->cper.enabled && amdgpu_cper_generate_bp_threshold_record(adev)) 937 dev_warn(adev->dev, "fail to generate bad page threshold cper records\n"); 938 939 if ((amdgpu_bad_page_threshold != -1) && 940 (amdgpu_bad_page_threshold != -2)) 941 con->is_rma = true; 942 } 943 944 return 0; 945 } 946 947 /** 948 * amdgpu_ras_eeprom_append -- append records to the EEPROM RAS table 949 * @control: pointer to control structure 950 * @record: array of records to append 951 * @num: number of records in @record array 952 * 953 * Append @num records to the table, calculate the checksum and write 954 * the table back to EEPROM. The maximum number of records that 955 * can be appended is between 1 and control->ras_max_record_count, 956 * regardless of how many records are already stored in the table. 957 * 958 * Return 0 on success or if EEPROM is not supported, -errno on error. 
959 */ 960 int amdgpu_ras_eeprom_append(struct amdgpu_ras_eeprom_control *control, 961 struct eeprom_table_record *record, 962 const u32 num) 963 { 964 struct amdgpu_device *adev = to_amdgpu_device(control); 965 int res, i; 966 uint64_t nps = AMDGPU_NPS1_PARTITION_MODE; 967 968 if (!__is_ras_eeprom_supported(adev)) 969 return 0; 970 971 if (amdgpu_ras_smu_eeprom_supported(adev)) 972 return amdgpu_ras_smu_eeprom_append(control); 973 974 if (num == 0) { 975 dev_err(adev->dev, "will not append 0 records\n"); 976 return -EINVAL; 977 } else if (num > control->ras_max_record_count) { 978 dev_err(adev->dev, 979 "cannot append %d records than the size of table %d\n", 980 num, control->ras_max_record_count); 981 return -EINVAL; 982 } 983 984 if (adev->gmc.gmc_funcs->query_mem_partition_mode) 985 nps = adev->gmc.gmc_funcs->query_mem_partition_mode(adev); 986 987 /* set the new channel index flag */ 988 for (i = 0; i < num; i++) 989 record[i].retired_page |= (nps << UMC_NPS_SHIFT); 990 991 mutex_lock(&control->ras_tbl_mutex); 992 993 res = amdgpu_ras_eeprom_append_table(control, record, num); 994 if (!res) 995 res = amdgpu_ras_eeprom_update_header(control); 996 if (!res) 997 amdgpu_ras_debugfs_set_ret_size(control); 998 999 mutex_unlock(&control->ras_tbl_mutex); 1000 1001 /* clear channel index flag, the flag is only saved on eeprom */ 1002 for (i = 0; i < num; i++) 1003 record[i].retired_page &= ~(nps << UMC_NPS_SHIFT); 1004 1005 return res; 1006 } 1007 1008 /** 1009 * __amdgpu_ras_eeprom_read -- read indexed from EEPROM into buffer 1010 * @control: pointer to control structure 1011 * @buf: pointer to buffer to read into 1012 * @fri: first record index, start reading at this index, absolute index 1013 * @num: number of records to read 1014 * 1015 * The caller must hold the table mutex in @control. 1016 * Return 0 on success, -errno otherwise. 
1017 */ 1018 static int __amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control, 1019 u8 *buf, const u32 fri, const u32 num) 1020 { 1021 struct amdgpu_device *adev = to_amdgpu_device(control); 1022 u32 buf_size; 1023 int res; 1024 1025 /* i2c may be unstable in gpu reset */ 1026 down_read(&adev->reset_domain->sem); 1027 buf_size = num * RAS_TABLE_RECORD_SIZE; 1028 res = amdgpu_eeprom_read(adev->pm.ras_eeprom_i2c_bus, 1029 control->i2c_address + 1030 RAS_INDEX_TO_OFFSET(control, fri), 1031 buf, buf_size); 1032 up_read(&adev->reset_domain->sem); 1033 if (res < 0) { 1034 dev_err(adev->dev, "Reading %d EEPROM table records error:%d", 1035 num, res); 1036 } else if (res < buf_size) { 1037 /* Short read, return error. 1038 */ 1039 dev_err(adev->dev, "Read %d records out of %d", 1040 res / RAS_TABLE_RECORD_SIZE, num); 1041 res = -EIO; 1042 } else { 1043 res = 0; 1044 } 1045 1046 return res; 1047 } 1048 1049 int amdgpu_ras_eeprom_read_idx(struct amdgpu_ras_eeprom_control *control, 1050 struct eeprom_table_record *record, u32 rec_idx, 1051 const u32 num) 1052 { 1053 struct amdgpu_device *adev = to_amdgpu_device(control); 1054 uint64_t ts, end_idx; 1055 int i, ret; 1056 u64 mca, ipid; 1057 u32 cu, mem_channel, mcumc_id; 1058 1059 if (!amdgpu_ras_smu_eeprom_supported(adev)) 1060 return 0; 1061 1062 if (!adev->umc.ras || !adev->umc.ras->mca_ipid_parse) 1063 return -EOPNOTSUPP; 1064 1065 end_idx = rec_idx + num; 1066 for (i = rec_idx; i < end_idx; i++) { 1067 ret = amdgpu_ras_smu_get_badpage_mca_addr(adev, i, &mca); 1068 if (ret) 1069 return ret; 1070 1071 ret = amdgpu_ras_smu_get_badpage_ipid(adev, i, &ipid); 1072 if (ret) 1073 return ret; 1074 1075 ret = amdgpu_ras_smu_get_timestamp(adev, i, &ts); 1076 if (ret) 1077 return ret; 1078 1079 record[i - rec_idx].address = mca; 1080 /* retired_page (pa) is unused now */ 1081 record[i - rec_idx].retired_page = 0x1ULL; 1082 record[i - rec_idx].ts = ts; 1083 record[i - rec_idx].err_type = 
AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE; 1084 1085 adev->umc.ras->mca_ipid_parse(adev, ipid, 1086 &cu, &mem_channel, &mcumc_id, NULL); 1087 record[i - rec_idx].cu = (u8)cu; 1088 record[i - rec_idx].mem_channel = (u8)mem_channel; 1089 record[i - rec_idx].mcumc_id = (u8)mcumc_id; 1090 } 1091 1092 return 0; 1093 } 1094 1095 /** 1096 * amdgpu_ras_eeprom_read -- read EEPROM 1097 * @control: pointer to control structure 1098 * @record: array of records to read into 1099 * @num: number of records in @record 1100 * 1101 * Reads num records from the RAS table in EEPROM and 1102 * writes the data into @record array. 1103 * 1104 * Returns 0 on success, -errno on error. 1105 */ 1106 int amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control, 1107 struct eeprom_table_record *record, 1108 const u32 num) 1109 { 1110 struct amdgpu_device *adev = to_amdgpu_device(control); 1111 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 1112 int i, res; 1113 u8 *buf, *pp; 1114 u32 g0, g1; 1115 1116 if (amdgpu_ras_smu_eeprom_supported(adev)) 1117 return amdgpu_ras_eeprom_read_idx(control, record, 0, num); 1118 1119 if (!__is_ras_eeprom_supported(adev)) 1120 return 0; 1121 1122 if (num == 0) { 1123 dev_err(adev->dev, "will not read 0 records\n"); 1124 return -EINVAL; 1125 } else if (num > control->ras_num_recs) { 1126 dev_err(adev->dev, "too many records to read:%d available:%d\n", 1127 num, control->ras_num_recs); 1128 return -EINVAL; 1129 } 1130 1131 buf = kcalloc(num, RAS_TABLE_RECORD_SIZE, GFP_KERNEL); 1132 if (!buf) 1133 return -ENOMEM; 1134 1135 /* Determine how many records to read, from the first record 1136 * index, fri, to the end of the table, and from the beginning 1137 * of the table, such that the total number of records is 1138 * @num, and we handle wrap around when fri > 0 and 1139 * fri + num > RAS_MAX_RECORD_COUNT. 
1140 * 1141 * First we compute the index of the last element 1142 * which would be fetched from each region, 1143 * g0 is in [fri, fri + num - 1], and 1144 * g1 is in [0, RAS_MAX_RECORD_COUNT - 1]. 1145 * Then, if g0 < RAS_MAX_RECORD_COUNT, the index of 1146 * the last element to fetch, we set g0 to _the number_ 1147 * of elements to fetch, @num, since we know that the last 1148 * indexed to be fetched does not exceed the table. 1149 * 1150 * If, however, g0 >= RAS_MAX_RECORD_COUNT, then 1151 * we set g0 to the number of elements to read 1152 * until the end of the table, and g1 to the number of 1153 * elements to read from the beginning of the table. 1154 */ 1155 g0 = control->ras_fri + num - 1; 1156 g1 = g0 % control->ras_max_record_count; 1157 if (g0 < control->ras_max_record_count) { 1158 g0 = num; 1159 g1 = 0; 1160 } else { 1161 g0 = control->ras_max_record_count - control->ras_fri; 1162 g1 += 1; 1163 } 1164 1165 mutex_lock(&control->ras_tbl_mutex); 1166 res = __amdgpu_ras_eeprom_read(control, buf, control->ras_fri, g0); 1167 if (res) 1168 goto Out; 1169 if (g1) { 1170 res = __amdgpu_ras_eeprom_read(control, 1171 buf + g0 * RAS_TABLE_RECORD_SIZE, 1172 0, g1); 1173 if (res) 1174 goto Out; 1175 } 1176 1177 res = 0; 1178 1179 /* Read up everything? Then transform. 
1180 */ 1181 pp = buf; 1182 for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE) { 1183 __decode_table_record_from_buf(control, &record[i], pp); 1184 1185 /* update bad channel bitmap */ 1186 if ((record[i].mem_channel < BITS_PER_TYPE(control->bad_channel_bitmap)) && 1187 !(control->bad_channel_bitmap & (1 << record[i].mem_channel))) { 1188 control->bad_channel_bitmap |= 1 << record[i].mem_channel; 1189 con->update_channel_flag = true; 1190 } 1191 } 1192 Out: 1193 kfree(buf); 1194 mutex_unlock(&control->ras_tbl_mutex); 1195 1196 return res; 1197 } 1198 1199 uint32_t amdgpu_ras_eeprom_max_record_count(struct amdgpu_ras_eeprom_control *control) 1200 { 1201 /* get available eeprom table version first before eeprom table init */ 1202 amdgpu_ras_set_eeprom_table_version(control); 1203 1204 if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1) 1205 return RAS_MAX_RECORD_COUNT_V2_1; 1206 else 1207 return RAS_MAX_RECORD_COUNT; 1208 } 1209 1210 static ssize_t 1211 amdgpu_ras_debugfs_eeprom_size_read(struct file *f, char __user *buf, 1212 size_t size, loff_t *pos) 1213 { 1214 struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private; 1215 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 1216 struct amdgpu_ras_eeprom_control *control = ras ? 
&ras->eeprom_control : NULL; 1217 u8 data[50]; 1218 int res; 1219 1220 if (!size) 1221 return size; 1222 1223 if (!ras || !control) { 1224 res = snprintf(data, sizeof(data), "Not supported\n"); 1225 } else { 1226 res = snprintf(data, sizeof(data), "%d bytes or %d records\n", 1227 RAS_TBL_SIZE_BYTES, control->ras_max_record_count); 1228 } 1229 1230 if (*pos >= res) 1231 return 0; 1232 1233 res -= *pos; 1234 res = min_t(size_t, res, size); 1235 1236 if (copy_to_user(buf, &data[*pos], res)) 1237 return -EFAULT; 1238 1239 *pos += res; 1240 1241 return res; 1242 } 1243 1244 const struct file_operations amdgpu_ras_debugfs_eeprom_size_ops = { 1245 .owner = THIS_MODULE, 1246 .read = amdgpu_ras_debugfs_eeprom_size_read, 1247 .write = NULL, 1248 .llseek = default_llseek, 1249 }; 1250 1251 static const char *tbl_hdr_str = " Signature Version FirstOffs Size Checksum\n"; 1252 static const char *tbl_hdr_fmt = "0x%08X 0x%08X 0x%08X 0x%08X 0x%08X\n"; 1253 #define tbl_hdr_fmt_size (5 * (2+8) + 4 + 1) 1254 static const char *rec_hdr_str = "Index Offset ErrType Bank/CU TimeStamp Offs/Addr MemChl MCUMCID RetiredPage\n"; 1255 static const char *rec_hdr_fmt = "%5d 0x%05X %7s 0x%02X 0x%016llX 0x%012llX 0x%02X 0x%02X 0x%012llX\n"; 1256 #define rec_hdr_fmt_size (5 + 1 + 7 + 1 + 7 + 1 + 7 + 1 + 18 + 1 + 14 + 1 + 6 + 1 + 7 + 1 + 14 + 1) 1257 1258 static const char *record_err_type_str[AMDGPU_RAS_EEPROM_ERR_COUNT] = { 1259 "ignore", 1260 "re", 1261 "ue", 1262 }; 1263 1264 static loff_t amdgpu_ras_debugfs_table_size(struct amdgpu_ras_eeprom_control *control) 1265 { 1266 return strlen(tbl_hdr_str) + tbl_hdr_fmt_size + 1267 strlen(rec_hdr_str) + rec_hdr_fmt_size * control->ras_num_recs; 1268 } 1269 1270 void amdgpu_ras_debugfs_set_ret_size(struct amdgpu_ras_eeprom_control *control) 1271 { 1272 struct amdgpu_ras *ras = container_of(control, struct amdgpu_ras, 1273 eeprom_control); 1274 struct dentry *de = ras->de_ras_eeprom_table; 1275 1276 if (de) 1277 d_inode(de)->i_size = 
amdgpu_ras_debugfs_table_size(control); 1278 } 1279 1280 static ssize_t amdgpu_ras_debugfs_table_read(struct file *f, char __user *buf, 1281 size_t size, loff_t *pos) 1282 { 1283 struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private; 1284 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 1285 struct amdgpu_ras_eeprom_control *control = &ras->eeprom_control; 1286 const size_t orig_size = size; 1287 int res = -EFAULT; 1288 size_t data_len; 1289 1290 /* pmfw manages eeprom data by itself */ 1291 if (amdgpu_ras_smu_eeprom_supported(adev)) 1292 return 0; 1293 1294 mutex_lock(&control->ras_tbl_mutex); 1295 1296 /* We want *pos - data_len > 0, which means there's 1297 * bytes to be printed from data. 1298 */ 1299 data_len = strlen(tbl_hdr_str); 1300 if (*pos < data_len) { 1301 data_len -= *pos; 1302 data_len = min_t(size_t, data_len, size); 1303 if (copy_to_user(buf, &tbl_hdr_str[*pos], data_len)) 1304 goto Out; 1305 buf += data_len; 1306 size -= data_len; 1307 *pos += data_len; 1308 } 1309 1310 data_len = strlen(tbl_hdr_str) + tbl_hdr_fmt_size; 1311 if (*pos < data_len && size > 0) { 1312 u8 data[tbl_hdr_fmt_size + 1]; 1313 loff_t lpos; 1314 1315 snprintf(data, sizeof(data), tbl_hdr_fmt, 1316 control->tbl_hdr.header, 1317 control->tbl_hdr.version, 1318 control->tbl_hdr.first_rec_offset, 1319 control->tbl_hdr.tbl_size, 1320 control->tbl_hdr.checksum); 1321 1322 data_len -= *pos; 1323 data_len = min_t(size_t, data_len, size); 1324 lpos = *pos - strlen(tbl_hdr_str); 1325 if (copy_to_user(buf, &data[lpos], data_len)) 1326 goto Out; 1327 buf += data_len; 1328 size -= data_len; 1329 *pos += data_len; 1330 } 1331 1332 data_len = strlen(tbl_hdr_str) + tbl_hdr_fmt_size + strlen(rec_hdr_str); 1333 if (*pos < data_len && size > 0) { 1334 loff_t lpos; 1335 1336 data_len -= *pos; 1337 data_len = min_t(size_t, data_len, size); 1338 lpos = *pos - strlen(tbl_hdr_str) - tbl_hdr_fmt_size; 1339 if (copy_to_user(buf, &rec_hdr_str[lpos], data_len)) 1340 goto Out; 
1341 buf += data_len; 1342 size -= data_len; 1343 *pos += data_len; 1344 } 1345 1346 data_len = amdgpu_ras_debugfs_table_size(control); 1347 if (*pos < data_len && size > 0) { 1348 u8 dare[RAS_TABLE_RECORD_SIZE]; 1349 u8 data[rec_hdr_fmt_size + 1]; 1350 struct eeprom_table_record record; 1351 int s, r; 1352 1353 /* Find the starting record index 1354 */ 1355 s = *pos - strlen(tbl_hdr_str) - tbl_hdr_fmt_size - 1356 strlen(rec_hdr_str); 1357 s = s / rec_hdr_fmt_size; 1358 r = *pos - strlen(tbl_hdr_str) - tbl_hdr_fmt_size - 1359 strlen(rec_hdr_str); 1360 r = r % rec_hdr_fmt_size; 1361 1362 for ( ; size > 0 && s < control->ras_num_recs; s++) { 1363 u32 ai = RAS_RI_TO_AI(control, s); 1364 /* Read a single record 1365 */ 1366 res = __amdgpu_ras_eeprom_read(control, dare, ai, 1); 1367 if (res) 1368 goto Out; 1369 __decode_table_record_from_buf(control, &record, dare); 1370 snprintf(data, sizeof(data), rec_hdr_fmt, 1371 s, 1372 RAS_INDEX_TO_OFFSET(control, ai), 1373 record_err_type_str[record.err_type], 1374 record.bank, 1375 record.ts, 1376 record.offset, 1377 record.mem_channel, 1378 record.mcumc_id, 1379 record.retired_page); 1380 1381 data_len = min_t(size_t, rec_hdr_fmt_size - r, size); 1382 if (copy_to_user(buf, &data[r], data_len)) { 1383 res = -EFAULT; 1384 goto Out; 1385 } 1386 buf += data_len; 1387 size -= data_len; 1388 *pos += data_len; 1389 r = 0; 1390 } 1391 } 1392 res = 0; 1393 Out: 1394 mutex_unlock(&control->ras_tbl_mutex); 1395 return res < 0 ? res : orig_size - size; 1396 } 1397 1398 static ssize_t 1399 amdgpu_ras_debugfs_eeprom_table_read(struct file *f, char __user *buf, 1400 size_t size, loff_t *pos) 1401 { 1402 struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private; 1403 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 1404 struct amdgpu_ras_eeprom_control *control = ras ? 
&ras->eeprom_control : NULL; 1405 u8 data[81]; 1406 int res; 1407 1408 if (!size) 1409 return size; 1410 1411 if (!ras || !control) { 1412 res = snprintf(data, sizeof(data), "Not supported\n"); 1413 if (*pos >= res) 1414 return 0; 1415 1416 res -= *pos; 1417 res = min_t(size_t, res, size); 1418 1419 if (copy_to_user(buf, &data[*pos], res)) 1420 return -EFAULT; 1421 1422 *pos += res; 1423 1424 return res; 1425 } else { 1426 return amdgpu_ras_debugfs_table_read(f, buf, size, pos); 1427 } 1428 } 1429 1430 const struct file_operations amdgpu_ras_debugfs_eeprom_table_ops = { 1431 .owner = THIS_MODULE, 1432 .read = amdgpu_ras_debugfs_eeprom_table_read, 1433 .write = NULL, 1434 .llseek = default_llseek, 1435 }; 1436 1437 /** 1438 * __verify_ras_table_checksum -- verify the RAS EEPROM table checksum 1439 * @control: pointer to control structure 1440 * 1441 * Check the checksum of the stored in EEPROM RAS table. 1442 * 1443 * Return 0 if the checksum is correct, 1444 * positive if it is not correct, and 1445 * -errno on I/O error. 1446 */ 1447 static int __verify_ras_table_checksum(struct amdgpu_ras_eeprom_control *control) 1448 { 1449 struct amdgpu_device *adev = to_amdgpu_device(control); 1450 int buf_size, res; 1451 u8 csum, *buf, *pp; 1452 1453 if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1) 1454 buf_size = RAS_TABLE_HEADER_SIZE + 1455 RAS_TABLE_V2_1_INFO_SIZE + 1456 control->ras_num_recs * RAS_TABLE_RECORD_SIZE; 1457 else 1458 buf_size = RAS_TABLE_HEADER_SIZE + 1459 control->ras_num_recs * RAS_TABLE_RECORD_SIZE; 1460 1461 buf = kzalloc(buf_size, GFP_KERNEL); 1462 if (!buf) { 1463 dev_err(adev->dev, 1464 "Out of memory checking RAS table checksum.\n"); 1465 return -ENOMEM; 1466 } 1467 1468 res = amdgpu_eeprom_read(adev->pm.ras_eeprom_i2c_bus, 1469 control->i2c_address + 1470 control->ras_header_offset, 1471 buf, buf_size); 1472 if (res < buf_size) { 1473 dev_err(adev->dev, "Partial read for checksum, res:%d\n", res); 1474 /* On partial reads, return -EIO. 
1475 */ 1476 if (res >= 0) 1477 res = -EIO; 1478 goto Out; 1479 } 1480 1481 csum = 0; 1482 for (pp = buf; pp < buf + buf_size; pp++) 1483 csum += *pp; 1484 Out: 1485 kfree(buf); 1486 return res < 0 ? res : csum; 1487 } 1488 1489 static int __read_table_ras_info(struct amdgpu_ras_eeprom_control *control) 1490 { 1491 struct amdgpu_ras_eeprom_table_ras_info *rai = &control->tbl_rai; 1492 struct amdgpu_device *adev = to_amdgpu_device(control); 1493 unsigned char *buf; 1494 int res; 1495 1496 buf = kzalloc(RAS_TABLE_V2_1_INFO_SIZE, GFP_KERNEL); 1497 if (!buf) { 1498 dev_err(adev->dev, 1499 "Failed to alloc buf to read EEPROM table ras info\n"); 1500 return -ENOMEM; 1501 } 1502 1503 /** 1504 * EEPROM table V2_1 supports ras info, 1505 * read EEPROM table ras info 1506 */ 1507 res = amdgpu_eeprom_read(adev->pm.ras_eeprom_i2c_bus, 1508 control->i2c_address + control->ras_info_offset, 1509 buf, RAS_TABLE_V2_1_INFO_SIZE); 1510 if (res < RAS_TABLE_V2_1_INFO_SIZE) { 1511 dev_err(adev->dev, 1512 "Failed to read EEPROM table ras info, res:%d", res); 1513 res = res >= 0 ? -EIO : res; 1514 goto Out; 1515 } 1516 1517 __decode_table_ras_info_from_buf(rai, buf); 1518 1519 Out: 1520 kfree(buf); 1521 return res == RAS_TABLE_V2_1_INFO_SIZE ? 
0 : res; 1522 } 1523 1524 static int amdgpu_ras_smu_eeprom_init(struct amdgpu_ras_eeprom_control *control) 1525 { 1526 struct amdgpu_device *adev = to_amdgpu_device(control); 1527 struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr; 1528 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 1529 uint64_t local_time; 1530 int res; 1531 1532 ras->is_rma = false; 1533 1534 if (!__is_ras_eeprom_supported(adev)) 1535 return 0; 1536 mutex_init(&control->ras_tbl_mutex); 1537 1538 res = amdgpu_ras_smu_get_table_version(adev, &(hdr->version)); 1539 if (res) 1540 return res; 1541 1542 res = amdgpu_ras_smu_get_badpage_count(adev, 1543 &(control->ras_num_recs), 100); 1544 if (res) 1545 return res; 1546 1547 local_time = (uint64_t)ktime_get_real_seconds(); 1548 res = amdgpu_ras_smu_set_timestamp(adev, local_time); 1549 if (res) 1550 return res; 1551 1552 control->ras_max_record_count = 4000; 1553 1554 control->ras_num_mca_recs = 0; 1555 control->ras_num_pa_recs = 0; 1556 1557 return 0; 1558 } 1559 1560 int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control) 1561 { 1562 struct amdgpu_device *adev = to_amdgpu_device(control); 1563 unsigned char buf[RAS_TABLE_HEADER_SIZE] = { 0 }; 1564 struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr; 1565 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 1566 int dev_var = adev->pdev->device & 0xF; 1567 uint32_t vram_type = adev->gmc.vram_type; 1568 int res; 1569 1570 if (amdgpu_ras_smu_eeprom_supported(adev)) 1571 return amdgpu_ras_smu_eeprom_init(control); 1572 1573 ras->is_rma = false; 1574 1575 if (!__is_ras_eeprom_supported(adev)) 1576 return 0; 1577 1578 /* Verify i2c adapter is initialized */ 1579 if (!adev->pm.ras_eeprom_i2c_bus || !adev->pm.ras_eeprom_i2c_bus->algo) 1580 return -ENOENT; 1581 1582 if (!__get_eeprom_i2c_addr(adev, control)) 1583 return -EINVAL; 1584 1585 control->ras_header_offset = RAS_HDR_START; 1586 control->ras_info_offset = RAS_TABLE_V2_1_INFO_START; 1587 
mutex_init(&control->ras_tbl_mutex); 1588 1589 /* Read the table header from EEPROM address */ 1590 res = amdgpu_eeprom_read(adev->pm.ras_eeprom_i2c_bus, 1591 control->i2c_address + control->ras_header_offset, 1592 buf, RAS_TABLE_HEADER_SIZE); 1593 if (res < RAS_TABLE_HEADER_SIZE) { 1594 dev_err(adev->dev, "Failed to read EEPROM table header, res:%d", 1595 res); 1596 return res >= 0 ? -EIO : res; 1597 } 1598 1599 __decode_table_header_from_buf(hdr, buf); 1600 1601 if (hdr->header != RAS_TABLE_HDR_VAL && 1602 hdr->header != RAS_TABLE_HDR_BAD) { 1603 dev_info(adev->dev, "Creating a new EEPROM table"); 1604 return amdgpu_ras_eeprom_reset_table(control); 1605 } 1606 1607 if (!(adev->flags & AMD_IS_APU) && (dev_var == 0x5) && 1608 (vram_type == AMDGPU_VRAM_TYPE_HBM3E) && 1609 (hdr->version < RAS_TABLE_VER_V3)) { 1610 return amdgpu_ras_eeprom_reset_table(control); 1611 } 1612 1613 switch (hdr->version) { 1614 case RAS_TABLE_VER_V2_1: 1615 case RAS_TABLE_VER_V3: 1616 if (hdr->tbl_size < RAS_TABLE_HEADER_SIZE + RAS_TABLE_V2_1_INFO_SIZE) { 1617 dev_err(adev->dev, 1618 "RAS header invalid, tbl_size %u smaller than minimum %u, resetting table\n", 1619 hdr->tbl_size, 1620 RAS_TABLE_HEADER_SIZE + RAS_TABLE_V2_1_INFO_SIZE); 1621 return amdgpu_ras_eeprom_reset_table(control); 1622 } 1623 control->ras_num_recs = RAS_NUM_RECS_V2_1(hdr); 1624 control->ras_record_offset = RAS_RECORD_START_V2_1; 1625 control->ras_max_record_count = RAS_MAX_RECORD_COUNT_V2_1; 1626 break; 1627 case RAS_TABLE_VER_V1: 1628 if (hdr->tbl_size < RAS_TABLE_HEADER_SIZE) { 1629 dev_err(adev->dev, 1630 "RAS header invalid, tbl_size %u smaller than minimum %u, resetting table\n", 1631 hdr->tbl_size, RAS_TABLE_HEADER_SIZE); 1632 return amdgpu_ras_eeprom_reset_table(control); 1633 } 1634 control->ras_num_recs = RAS_NUM_RECS(hdr); 1635 control->ras_record_offset = RAS_RECORD_START; 1636 control->ras_max_record_count = RAS_MAX_RECORD_COUNT; 1637 break; 1638 default: 1639 dev_err(adev->dev, 1640 "RAS header invalid, 
unsupported version: %u", 1641 hdr->version); 1642 return -EINVAL; 1643 } 1644 1645 if (control->ras_num_recs > control->ras_max_record_count) { 1646 dev_err(adev->dev, 1647 "RAS header invalid, records in header: %u max allowed :%u", 1648 control->ras_num_recs, control->ras_max_record_count); 1649 return -EINVAL; 1650 } 1651 1652 control->ras_fri = RAS_OFFSET_TO_INDEX(control, hdr->first_rec_offset); 1653 if (hdr->first_rec_offset < control->ras_record_offset || 1654 control->ras_fri >= control->ras_max_record_count) { 1655 dev_err(adev->dev, 1656 "RAS header invalid, ras_fri: %u, first_rec_offset:0x%x", 1657 control->ras_fri, hdr->first_rec_offset); 1658 return -EINVAL; 1659 } 1660 1661 control->ras_num_mca_recs = 0; 1662 control->ras_num_pa_recs = 0; 1663 return 0; 1664 } 1665 1666 static int amdgpu_ras_smu_eeprom_check(struct amdgpu_ras_eeprom_control *control) 1667 { 1668 struct amdgpu_device *adev = to_amdgpu_device(control); 1669 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 1670 1671 if (!__is_ras_eeprom_supported(adev)) 1672 return 0; 1673 1674 control->ras_num_bad_pages = ras->bad_page_num; 1675 1676 if ((ras->bad_page_cnt_threshold < control->ras_num_bad_pages) && 1677 amdgpu_bad_page_threshold != 0) { 1678 dev_warn(adev->dev, 1679 "RAS records:%d exceed threshold:%d\n", 1680 control->ras_num_bad_pages, ras->bad_page_cnt_threshold); 1681 if ((amdgpu_bad_page_threshold == -1) || 1682 (amdgpu_bad_page_threshold == -2)) { 1683 dev_warn(adev->dev, 1684 "Please consult AMD Service Action Guide (SAG) for appropriate service procedures\n"); 1685 } else { 1686 ras->is_rma = true; 1687 dev_warn(adev->dev, 1688 "User defined threshold is set, runtime service will be halt when threshold is reached\n"); 1689 } 1690 1691 return 0; 1692 } 1693 1694 dev_dbg(adev->dev, 1695 "Found existing EEPROM table with %d records", 1696 control->ras_num_bad_pages); 1697 1698 /* Warn if we are at 90% of the threshold or above 1699 */ 1700 if (10 * control->ras_num_bad_pages 
>= 9 * ras->bad_page_cnt_threshold) 1701 dev_warn(adev->dev, "RAS records:%u exceeds 90%% of threshold:%d", 1702 control->ras_num_bad_pages, 1703 ras->bad_page_cnt_threshold); 1704 return 0; 1705 } 1706 1707 int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control) 1708 { 1709 struct amdgpu_device *adev = to_amdgpu_device(control); 1710 struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr; 1711 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 1712 int res = 0; 1713 1714 if (amdgpu_ras_smu_eeprom_supported(adev)) 1715 return amdgpu_ras_smu_eeprom_check(control); 1716 1717 if (!__is_ras_eeprom_supported(adev)) 1718 return 0; 1719 1720 /* Verify i2c adapter is initialized */ 1721 if (!adev->pm.ras_eeprom_i2c_bus || !adev->pm.ras_eeprom_i2c_bus->algo) 1722 return -ENOENT; 1723 1724 if (!__get_eeprom_i2c_addr(adev, control)) 1725 return -EINVAL; 1726 1727 control->ras_num_bad_pages = ras->bad_page_num; 1728 1729 if (hdr->header == RAS_TABLE_HDR_VAL) { 1730 dev_dbg(adev->dev, 1731 "Found existing EEPROM table with %d records", 1732 control->ras_num_bad_pages); 1733 1734 if (hdr->version >= RAS_TABLE_VER_V2_1) { 1735 res = __read_table_ras_info(control); 1736 if (res) 1737 return res; 1738 } 1739 1740 res = __verify_ras_table_checksum(control); 1741 if (res) { 1742 dev_err(adev->dev, 1743 "RAS table incorrect checksum or error:%d\n", 1744 res); 1745 return -EINVAL; 1746 } 1747 1748 /* Warn if we are at 90% of the threshold or above 1749 */ 1750 if (10 * control->ras_num_bad_pages >= 9 * ras->bad_page_cnt_threshold) 1751 dev_warn(adev->dev, "RAS records:%u exceeds 90%% of threshold:%d", 1752 control->ras_num_bad_pages, 1753 ras->bad_page_cnt_threshold); 1754 } else if (hdr->header == RAS_TABLE_HDR_BAD && 1755 amdgpu_bad_page_threshold != 0) { 1756 if (hdr->version >= RAS_TABLE_VER_V2_1) { 1757 res = __read_table_ras_info(control); 1758 if (res) 1759 return res; 1760 } 1761 1762 res = __verify_ras_table_checksum(control); 1763 if (res) { 1764 
dev_err(adev->dev, 1765 "RAS Table incorrect checksum or error:%d\n", 1766 res); 1767 return -EINVAL; 1768 } 1769 if (ras->bad_page_cnt_threshold >= control->ras_num_bad_pages) { 1770 /* This means that, the threshold was increased since 1771 * the last time the system was booted, and now, 1772 * ras->bad_page_cnt_threshold - control->num_recs > 0, 1773 * so that at least one more record can be saved, 1774 * before the page count threshold is reached. 1775 */ 1776 dev_info(adev->dev, 1777 "records:%d threshold:%d, resetting " 1778 "RAS table header signature", 1779 control->ras_num_bad_pages, 1780 ras->bad_page_cnt_threshold); 1781 res = amdgpu_ras_eeprom_correct_header_tag(control, 1782 RAS_TABLE_HDR_VAL); 1783 } else { 1784 dev_warn(adev->dev, 1785 "RAS records:%d exceed threshold:%d\n", 1786 control->ras_num_bad_pages, ras->bad_page_cnt_threshold); 1787 if ((amdgpu_bad_page_threshold == -1) || 1788 (amdgpu_bad_page_threshold == -2)) { 1789 res = 0; 1790 dev_warn(adev->dev, 1791 "Please consult AMD Service Action Guide (SAG) for appropriate service procedures\n"); 1792 } else { 1793 ras->is_rma = true; 1794 dev_warn(adev->dev, 1795 "User defined threshold is set, runtime service will be halt when threshold is reached\n"); 1796 } 1797 } 1798 } 1799 1800 return res < 0 ? 
res : 0; 1801 } 1802 1803 void amdgpu_ras_eeprom_check_and_recover(struct amdgpu_device *adev) 1804 { 1805 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 1806 struct amdgpu_ras_eeprom_control *control; 1807 int res; 1808 1809 if (!__is_ras_eeprom_supported(adev) || !ras || 1810 amdgpu_ras_smu_eeprom_supported(adev)) 1811 return; 1812 control = &ras->eeprom_control; 1813 if (!control->is_eeprom_valid) 1814 return; 1815 res = __verify_ras_table_checksum(control); 1816 if (res) { 1817 dev_warn(adev->dev, 1818 "RAS table incorrect checksum or error:%d, try to recover\n", 1819 res); 1820 if (!amdgpu_ras_eeprom_reset_table(control)) 1821 if (!amdgpu_ras_save_bad_pages(adev, NULL)) 1822 if (!__verify_ras_table_checksum(control)) { 1823 dev_info(adev->dev, "RAS table recovery succeed\n"); 1824 return; 1825 } 1826 dev_err(adev->dev, "RAS table recovery failed\n"); 1827 control->is_eeprom_valid = false; 1828 } 1829 return; 1830 } 1831 1832 static const struct ras_smu_drv *amdgpu_ras_get_smu_ras_drv(struct amdgpu_device *adev) 1833 { 1834 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 1835 1836 if (!ras) 1837 return NULL; 1838 1839 return ras->ras_smu_drv; 1840 } 1841 1842 static uint64_t amdgpu_ras_smu_get_feature_flags(struct amdgpu_device *adev) 1843 { 1844 const struct ras_smu_drv *ras_smu_drv = amdgpu_ras_get_smu_ras_drv(adev); 1845 uint64_t flags = 0ULL; 1846 1847 if (!ras_smu_drv) 1848 goto out; 1849 1850 if (ras_smu_drv->ras_smu_feature_flags) 1851 ras_smu_drv->ras_smu_feature_flags(adev, &flags); 1852 1853 out: 1854 return flags; 1855 } 1856 1857 bool amdgpu_ras_smu_eeprom_supported(struct amdgpu_device *adev) 1858 { 1859 const struct ras_smu_drv *smu_ras_drv = amdgpu_ras_get_smu_ras_drv(adev); 1860 uint64_t flags = 0ULL; 1861 1862 if (!__is_ras_eeprom_supported(adev) || !smu_ras_drv) 1863 return false; 1864 1865 if (!smu_ras_drv->smu_eeprom_funcs) 1866 return false; 1867 1868 flags = amdgpu_ras_smu_get_feature_flags(adev); 1869 1870 return !!(flags & 
RAS_SMU_FEATURE_BIT__RAS_EEPROM); 1871 } 1872 1873 int amdgpu_ras_smu_get_table_version(struct amdgpu_device *adev, 1874 uint32_t *table_version) 1875 { 1876 const struct ras_smu_drv *smu_ras_drv = amdgpu_ras_get_smu_ras_drv(adev); 1877 1878 if (!amdgpu_ras_smu_eeprom_supported(adev)) 1879 return -EOPNOTSUPP; 1880 1881 if (smu_ras_drv->smu_eeprom_funcs->get_ras_table_version) 1882 return smu_ras_drv->smu_eeprom_funcs->get_ras_table_version(adev, 1883 table_version); 1884 return -EOPNOTSUPP; 1885 } 1886 1887 int amdgpu_ras_smu_get_badpage_count(struct amdgpu_device *adev, 1888 uint32_t *count, uint32_t timeout) 1889 { 1890 const struct ras_smu_drv *smu_ras_drv = amdgpu_ras_get_smu_ras_drv(adev); 1891 1892 if (!amdgpu_ras_smu_eeprom_supported(adev)) 1893 return -EOPNOTSUPP; 1894 1895 if (smu_ras_drv->smu_eeprom_funcs->get_badpage_count) 1896 return smu_ras_drv->smu_eeprom_funcs->get_badpage_count(adev, 1897 count, timeout); 1898 return -EOPNOTSUPP; 1899 } 1900 1901 int amdgpu_ras_smu_get_badpage_mca_addr(struct amdgpu_device *adev, 1902 uint16_t index, uint64_t *mca_addr) 1903 { 1904 const struct ras_smu_drv *smu_ras_drv = amdgpu_ras_get_smu_ras_drv(adev); 1905 1906 if (!amdgpu_ras_smu_eeprom_supported(adev)) 1907 return -EOPNOTSUPP; 1908 1909 if (smu_ras_drv->smu_eeprom_funcs->get_badpage_mca_addr) 1910 return smu_ras_drv->smu_eeprom_funcs->get_badpage_mca_addr(adev, 1911 index, mca_addr); 1912 return -EOPNOTSUPP; 1913 } 1914 1915 int amdgpu_ras_smu_set_timestamp(struct amdgpu_device *adev, 1916 uint64_t timestamp) 1917 { 1918 const struct ras_smu_drv *smu_ras_drv = amdgpu_ras_get_smu_ras_drv(adev); 1919 1920 if (!amdgpu_ras_smu_eeprom_supported(adev)) 1921 return -EOPNOTSUPP; 1922 1923 if (smu_ras_drv->smu_eeprom_funcs->set_timestamp) 1924 return smu_ras_drv->smu_eeprom_funcs->set_timestamp(adev, 1925 timestamp); 1926 return -EOPNOTSUPP; 1927 } 1928 1929 int amdgpu_ras_smu_get_timestamp(struct amdgpu_device *adev, 1930 uint16_t index, uint64_t *timestamp) 1931 { 
1932 const struct ras_smu_drv *smu_ras_drv = amdgpu_ras_get_smu_ras_drv(adev); 1933 1934 if (!amdgpu_ras_smu_eeprom_supported(adev)) 1935 return -EOPNOTSUPP; 1936 1937 if (smu_ras_drv->smu_eeprom_funcs->get_timestamp) 1938 return smu_ras_drv->smu_eeprom_funcs->get_timestamp(adev, 1939 index, timestamp); 1940 return -EOPNOTSUPP; 1941 } 1942 1943 int amdgpu_ras_smu_get_badpage_ipid(struct amdgpu_device *adev, 1944 uint16_t index, uint64_t *ipid) 1945 { 1946 const struct ras_smu_drv *smu_ras_drv = amdgpu_ras_get_smu_ras_drv(adev); 1947 1948 if (!amdgpu_ras_smu_eeprom_supported(adev)) 1949 return -EOPNOTSUPP; 1950 1951 if (smu_ras_drv->smu_eeprom_funcs->get_badpage_ipid) 1952 return smu_ras_drv->smu_eeprom_funcs->get_badpage_ipid(adev, 1953 index, ipid); 1954 return -EOPNOTSUPP; 1955 } 1956 1957 int amdgpu_ras_smu_erase_ras_table(struct amdgpu_device *adev, 1958 uint32_t *result) 1959 { 1960 const struct ras_smu_drv *smu_ras_drv = amdgpu_ras_get_smu_ras_drv(adev); 1961 1962 if (!amdgpu_ras_smu_eeprom_supported(adev)) 1963 return -EOPNOTSUPP; 1964 1965 if (smu_ras_drv->smu_eeprom_funcs->erase_ras_table) 1966 return smu_ras_drv->smu_eeprom_funcs->erase_ras_table(adev, 1967 result); 1968 return -EOPNOTSUPP; 1969 } 1970 1971 void amdgpu_ras_check_bad_page_status(struct amdgpu_device *adev) 1972 { 1973 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 1974 struct amdgpu_ras_eeprom_control *control = ras ? 
&ras->eeprom_control : NULL; 1975 1976 if (!control || amdgpu_bad_page_threshold == 0) 1977 return; 1978 1979 if (control->ras_num_bad_pages > ras->bad_page_cnt_threshold) { 1980 if (amdgpu_dpm_send_rma_reason(adev)) 1981 dev_warn(adev->dev, "Unable to send out-of-band RMA CPER"); 1982 else 1983 dev_dbg(adev->dev, "Sent out-of-band RMA CPER"); 1984 1985 if (adev->cper.enabled && !amdgpu_uniras_enabled(adev)) { 1986 if (amdgpu_cper_generate_bp_threshold_record(adev)) 1987 dev_warn(adev->dev, "Unable to send in-band RMA CPER"); 1988 else 1989 dev_dbg(adev->dev, "Sent in-band RMA CPER"); 1990 } 1991 } 1992 } 1993