1 /* 2 * Copyright 2019 Advanced Micro Devices, Inc. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice shall be included in 12 * all copies or substantial portions of the Software. 13 * 14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 20 * OTHER DEALINGS IN THE SOFTWARE. 21 * 22 */ 23 24 #include "amdgpu_ras_eeprom.h" 25 #include "amdgpu.h" 26 #include "amdgpu_ras.h" 27 #include <linux/bits.h> 28 #include "atom.h" 29 #include "amdgpu_eeprom.h" 30 #include "amdgpu_atomfirmware.h" 31 #include <linux/debugfs.h> 32 #include <linux/uaccess.h> 33 34 #include "amdgpu_reset.h" 35 #include "amdgpu_ras_mgr.h" 36 37 /* These are memory addresses as would be seen by one or more EEPROM 38 * chips strung on the I2C bus, usually by manipulating pins 1-3 of a 39 * set of EEPROM devices. They form a continuous memory space. 40 * 41 * The I2C device address includes the device type identifier, 1010b, 42 * which is a reserved value and indicates that this is an I2C EEPROM 43 * device. It also includes the top 3 bits of the 19 bit EEPROM memory 44 * address, namely bits 18, 17, and 16. This makes up the 7 bit 45 * address sent on the I2C bus with bit 0 being the direction bit, 46 * which is not represented here, and sent by the hardware directly. 47 * 48 * For instance, 49 * 50h = 1010000b => device type identifier 1010b, bits 18:16 = 000b, address 0. 50 * 54h = 1010100b => --"--, bits 18:16 = 100b, address 40000h. 51 * 56h = 1010110b => --"--, bits 18:16 = 110b, address 60000h. 52 * Depending on the size of the I2C EEPROM device(s), bits 18:16 may 53 * address memory in a device or a device on the I2C bus, depending on 54 * the status of pins 1-3. See top of amdgpu_eeprom.c. 55 * 56 * The RAS table lives either at address 0 or address 40000h of EEPROM. 57 */ 58 #define EEPROM_I2C_MADDR_0 0x0 59 #define EEPROM_I2C_MADDR_4 0x40000 60 61 /* 62 * The 2 macros below represent the actual size in bytes that 63 * those entities occupy in the EEPROM memory. 64 * RAS_TABLE_RECORD_SIZE is different than sizeof(eeprom_table_record) which 65 * uses uint64 to store 6b fields such as retired_page. 66 */ 67 #define RAS_TABLE_HEADER_SIZE 20 68 #define RAS_TABLE_RECORD_SIZE 24 69 70 /* Table hdr is 'AMDR' */ 71 #define RAS_TABLE_HDR_VAL 0x414d4452 72 73 /* Bad GPU tag ‘BADG’ */ 74 #define RAS_TABLE_HDR_BAD 0x42414447 75 76 /* 77 * EEPROM Table structure v1 78 * --------------------------------- 79 * | | 80 * | EEPROM TABLE HEADER | 81 * | ( size 20 Bytes ) | 82 * | | 83 * --------------------------------- 84 * | | 85 * | BAD PAGE RECORD AREA | 86 * | | 87 * --------------------------------- 88 */ 89 90 /* Assume 2-Mbit size EEPROM and take up the whole space. */ 91 #define RAS_TBL_SIZE_BYTES (256 * 1024) 92 #define RAS_TABLE_START 0 93 #define RAS_HDR_START RAS_TABLE_START 94 #define RAS_RECORD_START (RAS_HDR_START + RAS_TABLE_HEADER_SIZE) 95 #define RAS_MAX_RECORD_COUNT ((RAS_TBL_SIZE_BYTES - RAS_TABLE_HEADER_SIZE) \ 96 / RAS_TABLE_RECORD_SIZE) 97 98 /* 99 * EEPROM Table structrue v2.1 100 * --------------------------------- 101 * | | 102 * | EEPROM TABLE HEADER | 103 * | ( size 20 Bytes ) | 104 * | | 105 * --------------------------------- 106 * | | 107 * | EEPROM TABLE RAS INFO | 108 * | (available info size 4 Bytes) | 109 * | ( reserved size 252 Bytes ) | 110 * | | 111 * --------------------------------- 112 * | | 113 * | BAD PAGE RECORD AREA | 114 * | | 115 * --------------------------------- 116 */ 117 118 /* EEPROM Table V2_1 */ 119 #define RAS_TABLE_V2_1_INFO_SIZE 256 120 #define RAS_TABLE_V2_1_INFO_START RAS_TABLE_HEADER_SIZE 121 #define RAS_RECORD_START_V2_1 (RAS_HDR_START + RAS_TABLE_HEADER_SIZE + \ 122 RAS_TABLE_V2_1_INFO_SIZE) 123 #define RAS_MAX_RECORD_COUNT_V2_1 ((RAS_TBL_SIZE_BYTES - RAS_TABLE_HEADER_SIZE - \ 124 RAS_TABLE_V2_1_INFO_SIZE) \ 125 / RAS_TABLE_RECORD_SIZE) 126 127 #define RAS_SMU_MESSAGE_TIMEOUT_MS 1000 /* 1s */ 128 129 /* Given a zero-based index of an EEPROM RAS record, yields the EEPROM 130 * offset off of RAS_TABLE_START. That is, this is something you can 131 * add to control->i2c_address, and then tell I2C layer to read 132 * from/write to there. _N is the so called absolute index, 133 * because it starts right after the table header. 134 */ 135 #define RAS_INDEX_TO_OFFSET(_C, _N) ((_C)->ras_record_offset + \ 136 (_N) * RAS_TABLE_RECORD_SIZE) 137 138 #define RAS_OFFSET_TO_INDEX(_C, _O) (((_O) - \ 139 (_C)->ras_record_offset) / RAS_TABLE_RECORD_SIZE) 140 141 /* Given a 0-based relative record index, 0, 1, 2, ..., etc., off 142 * of "fri", return the absolute record index off of the end of 143 * the table header. 144 */ 145 #define RAS_RI_TO_AI(_C, _I) (((_I) + (_C)->ras_fri) % \ 146 (_C)->ras_max_record_count) 147 148 #define RAS_NUM_RECS(_tbl_hdr) (((_tbl_hdr)->tbl_size - \ 149 RAS_TABLE_HEADER_SIZE) / RAS_TABLE_RECORD_SIZE) 150 151 #define RAS_NUM_RECS_V2_1(_tbl_hdr) (((_tbl_hdr)->tbl_size - \ 152 RAS_TABLE_HEADER_SIZE - \ 153 RAS_TABLE_V2_1_INFO_SIZE) / RAS_TABLE_RECORD_SIZE) 154 155 #define to_amdgpu_device(x) ((container_of(x, struct amdgpu_ras, eeprom_control))->adev) 156 157 static bool __is_ras_eeprom_supported(struct amdgpu_device *adev) 158 { 159 switch (amdgpu_ip_version(adev, MP1_HWIP, 0)) { 160 case IP_VERSION(11, 0, 2): /* VEGA20 and ARCTURUS */ 161 case IP_VERSION(11, 0, 7): /* Sienna cichlid */ 162 case IP_VERSION(13, 0, 0): 163 case IP_VERSION(13, 0, 2): /* Aldebaran */ 164 case IP_VERSION(13, 0, 10): 165 return true; 166 case IP_VERSION(13, 0, 6): 167 case IP_VERSION(13, 0, 12): 168 case IP_VERSION(13, 0, 14): 169 return (adev->gmc.is_app_apu) ? false : true; 170 default: 171 return false; 172 } 173 } 174 175 static bool __get_eeprom_i2c_addr(struct amdgpu_device *adev, 176 struct amdgpu_ras_eeprom_control *control) 177 { 178 struct atom_context *atom_ctx = adev->mode_info.atom_context; 179 u8 i2c_addr; 180 181 if (!control) 182 return false; 183 184 if (adev->bios && amdgpu_atomfirmware_ras_rom_addr(adev, &i2c_addr)) { 185 /* The address given by VBIOS is an 8-bit, wire-format 186 * address, i.e. the most significant byte. 187 * 188 * Normalize it to a 19-bit EEPROM address. Remove the 189 * device type identifier and make it a 7-bit address; 190 * then make it a 19-bit EEPROM address. See top of 191 * amdgpu_eeprom.c. 192 */ 193 i2c_addr = (i2c_addr & 0x0F) >> 1; 194 control->i2c_address = ((u32) i2c_addr) << 16; 195 196 return true; 197 } 198 199 switch (amdgpu_ip_version(adev, MP1_HWIP, 0)) { 200 case IP_VERSION(11, 0, 2): 201 /* VEGA20 and ARCTURUS */ 202 if (adev->asic_type == CHIP_VEGA20) 203 control->i2c_address = EEPROM_I2C_MADDR_0; 204 else if (strnstr(atom_ctx->vbios_pn, 205 "D342", 206 sizeof(atom_ctx->vbios_pn))) 207 control->i2c_address = EEPROM_I2C_MADDR_0; 208 else 209 control->i2c_address = EEPROM_I2C_MADDR_4; 210 return true; 211 case IP_VERSION(11, 0, 7): 212 control->i2c_address = EEPROM_I2C_MADDR_0; 213 return true; 214 case IP_VERSION(13, 0, 2): 215 if (strnstr(atom_ctx->vbios_pn, "D673", 216 sizeof(atom_ctx->vbios_pn))) 217 control->i2c_address = EEPROM_I2C_MADDR_4; 218 else 219 control->i2c_address = EEPROM_I2C_MADDR_0; 220 return true; 221 case IP_VERSION(13, 0, 0): 222 if (strnstr(atom_ctx->vbios_pn, "D707", 223 sizeof(atom_ctx->vbios_pn))) 224 control->i2c_address = EEPROM_I2C_MADDR_0; 225 else 226 control->i2c_address = EEPROM_I2C_MADDR_4; 227 return true; 228 case IP_VERSION(13, 0, 6): 229 case IP_VERSION(13, 0, 10): 230 case IP_VERSION(13, 0, 12): 231 case IP_VERSION(13, 0, 14): 232 control->i2c_address = EEPROM_I2C_MADDR_4; 233 return true; 234 default: 235 return false; 236 } 237 } 238 239 static void 240 __encode_table_header_to_buf(struct amdgpu_ras_eeprom_table_header *hdr, 241 unsigned char *buf) 242 { 243 u32 *pp = (uint32_t *)buf; 244 245 pp[0] = cpu_to_le32(hdr->header); 246 pp[1] = cpu_to_le32(hdr->version); 247 pp[2] = cpu_to_le32(hdr->first_rec_offset); 248 pp[3] = cpu_to_le32(hdr->tbl_size); 249 pp[4] = cpu_to_le32(hdr->checksum); 250 } 251 252 static void 253 __decode_table_header_from_buf(struct amdgpu_ras_eeprom_table_header *hdr, 254 unsigned char *buf) 255 { 256 u32 *pp = (uint32_t *)buf; 257 258 hdr->header = le32_to_cpu(pp[0]); 259 hdr->version = le32_to_cpu(pp[1]); 260 hdr->first_rec_offset = le32_to_cpu(pp[2]); 261 hdr->tbl_size = le32_to_cpu(pp[3]); 262 hdr->checksum = le32_to_cpu(pp[4]); 263 } 264 265 static int __write_table_header(struct amdgpu_ras_eeprom_control *control) 266 { 267 u8 buf[RAS_TABLE_HEADER_SIZE]; 268 struct amdgpu_device *adev = to_amdgpu_device(control); 269 int res; 270 271 memset(buf, 0, sizeof(buf)); 272 __encode_table_header_to_buf(&control->tbl_hdr, buf); 273 274 /* i2c may be unstable in gpu reset */ 275 down_read(&adev->reset_domain->sem); 276 res = amdgpu_eeprom_write(adev->pm.ras_eeprom_i2c_bus, 277 control->i2c_address + 278 control->ras_header_offset, 279 buf, RAS_TABLE_HEADER_SIZE); 280 up_read(&adev->reset_domain->sem); 281 282 if (res < 0) { 283 dev_err(adev->dev, "Failed to write EEPROM table header:%d", 284 res); 285 } else if (res < RAS_TABLE_HEADER_SIZE) { 286 dev_err(adev->dev, "Short write:%d out of %d\n", res, 287 RAS_TABLE_HEADER_SIZE); 288 res = -EIO; 289 } else { 290 res = 0; 291 } 292 293 return res; 294 } 295 296 static void 297 __encode_table_ras_info_to_buf(struct amdgpu_ras_eeprom_table_ras_info *rai, 298 unsigned char *buf) 299 { 300 u32 *pp = (uint32_t *)buf; 301 u32 tmp; 302 303 tmp = ((uint32_t)(rai->rma_status) & 0xFF) | 304 (((uint32_t)(rai->health_percent) << 8) & 0xFF00) | 305 (((uint32_t)(rai->ecc_page_threshold) << 16) & 0xFFFF0000); 306 pp[0] = cpu_to_le32(tmp); 307 } 308 309 static void 310 __decode_table_ras_info_from_buf(struct amdgpu_ras_eeprom_table_ras_info *rai, 311 unsigned char *buf) 312 { 313 u32 *pp = (uint32_t *)buf; 314 u32 tmp; 315 316 tmp = le32_to_cpu(pp[0]); 317 rai->rma_status = tmp & 0xFF; 318 rai->health_percent = (tmp >> 8) & 0xFF; 319 rai->ecc_page_threshold = (tmp >> 16) & 0xFFFF; 320 } 321 322 static int __write_table_ras_info(struct amdgpu_ras_eeprom_control *control) 323 { 324 struct amdgpu_device *adev = to_amdgpu_device(control); 325 u8 *buf; 326 int res; 327 328 buf = kzalloc(RAS_TABLE_V2_1_INFO_SIZE, GFP_KERNEL); 329 if (!buf) { 330 dev_err(adev->dev, 331 "Failed to alloc buf to write table ras info\n"); 332 return -ENOMEM; 333 } 334 335 __encode_table_ras_info_to_buf(&control->tbl_rai, buf); 336 337 /* i2c may be unstable in gpu reset */ 338 down_read(&adev->reset_domain->sem); 339 res = amdgpu_eeprom_write(adev->pm.ras_eeprom_i2c_bus, 340 control->i2c_address + 341 control->ras_info_offset, 342 buf, RAS_TABLE_V2_1_INFO_SIZE); 343 up_read(&adev->reset_domain->sem); 344 345 if (res < 0) { 346 dev_err(adev->dev, "Failed to write EEPROM table ras info:%d", 347 res); 348 } else if (res < RAS_TABLE_V2_1_INFO_SIZE) { 349 dev_err(adev->dev, "Short write:%d out of %d\n", res, 350 RAS_TABLE_V2_1_INFO_SIZE); 351 res = -EIO; 352 } else { 353 res = 0; 354 } 355 356 kfree(buf); 357 358 return res; 359 } 360 361 static u8 __calc_hdr_byte_sum(const struct amdgpu_ras_eeprom_control *control) 362 { 363 int ii; 364 u8 *pp, csum; 365 size_t sz; 366 367 /* Header checksum, skip checksum field in the calculation */ 368 sz = sizeof(control->tbl_hdr) - sizeof(control->tbl_hdr.checksum); 369 pp = (u8 *) &control->tbl_hdr; 370 csum = 0; 371 for (ii = 0; ii < sz; ii++, pp++) 372 csum += *pp; 373 374 return csum; 375 } 376 377 static u8 __calc_ras_info_byte_sum(const struct amdgpu_ras_eeprom_control *control) 378 { 379 int ii; 380 u8 *pp, csum; 381 size_t sz; 382 383 sz = sizeof(control->tbl_rai); 384 pp = (u8 *) &control->tbl_rai; 385 csum = 0; 386 for (ii = 0; ii < sz; ii++, pp++) 387 csum += *pp; 388 389 return csum; 390 } 391 392 static int amdgpu_ras_eeprom_correct_header_tag( 393 struct amdgpu_ras_eeprom_control *control, 394 uint32_t header) 395 { 396 struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr; 397 u8 *hh; 398 int res; 399 u8 csum; 400 401 csum = -hdr->checksum; 402 403 hh = (void *) &hdr->header; 404 csum -= (hh[0] + hh[1] + hh[2] + hh[3]); 405 hh = (void *) &header; 406 csum += hh[0] + hh[1] + hh[2] + hh[3]; 407 csum = -csum; 408 mutex_lock(&control->ras_tbl_mutex); 409 hdr->header = header; 410 hdr->checksum = csum; 411 res = __write_table_header(control); 412 mutex_unlock(&control->ras_tbl_mutex); 413 414 return res; 415 } 416 417 static void amdgpu_ras_set_eeprom_table_version(struct amdgpu_ras_eeprom_control *control) 418 { 419 struct amdgpu_device *adev = to_amdgpu_device(control); 420 struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr; 421 422 switch (amdgpu_ip_version(adev, UMC_HWIP, 0)) { 423 case IP_VERSION(8, 10, 0): 424 hdr->version = RAS_TABLE_VER_V2_1; 425 return; 426 case IP_VERSION(12, 0, 0): 427 case IP_VERSION(12, 5, 0): 428 hdr->version = RAS_TABLE_VER_V3; 429 return; 430 default: 431 hdr->version = RAS_TABLE_VER_V1; 432 return; 433 } 434 } 435 436 /** 437 * amdgpu_ras_eeprom_reset_table -- Reset the RAS EEPROM table 438 * @control: pointer to control structure 439 * 440 * Reset the contents of the header of the RAS EEPROM table. 441 * Return 0 on success, -errno on error. 442 */ 443 int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control) 444 { 445 struct amdgpu_device *adev = to_amdgpu_device(control); 446 struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr; 447 struct amdgpu_ras_eeprom_table_ras_info *rai = &control->tbl_rai; 448 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 449 u32 erase_res = 0; 450 u8 csum; 451 int res; 452 453 mutex_lock(&control->ras_tbl_mutex); 454 455 if (!amdgpu_ras_smu_eeprom_supported(adev)) { 456 hdr->header = RAS_TABLE_HDR_VAL; 457 amdgpu_ras_set_eeprom_table_version(control); 458 459 if (hdr->version >= RAS_TABLE_VER_V2_1) { 460 hdr->first_rec_offset = RAS_RECORD_START_V2_1; 461 hdr->tbl_size = RAS_TABLE_HEADER_SIZE + 462 RAS_TABLE_V2_1_INFO_SIZE; 463 rai->rma_status = GPU_HEALTH_USABLE; 464 465 control->ras_record_offset = RAS_RECORD_START_V2_1; 466 control->ras_max_record_count = RAS_MAX_RECORD_COUNT_V2_1; 467 /** 468 * GPU health represented as a percentage. 469 * 0 means worst health, 100 means fully health. 470 */ 471 rai->health_percent = 100; 472 /* ecc_page_threshold = 0 means disable bad page retirement */ 473 rai->ecc_page_threshold = con->bad_page_cnt_threshold; 474 } else { 475 hdr->first_rec_offset = RAS_RECORD_START; 476 hdr->tbl_size = RAS_TABLE_HEADER_SIZE; 477 478 control->ras_record_offset = RAS_RECORD_START; 479 control->ras_max_record_count = RAS_MAX_RECORD_COUNT; 480 } 481 482 csum = __calc_hdr_byte_sum(control); 483 if (hdr->version >= RAS_TABLE_VER_V2_1) 484 csum += __calc_ras_info_byte_sum(control); 485 csum = -csum; 486 hdr->checksum = csum; 487 res = __write_table_header(control); 488 if (!res && hdr->version > RAS_TABLE_VER_V1) 489 res = __write_table_ras_info(control); 490 } else { 491 res = amdgpu_ras_smu_erase_ras_table(adev, &erase_res); 492 if (res || erase_res) { 493 dev_warn(adev->dev, "RAS EEPROM reset failed, res:%d result:%d", 494 res, erase_res); 495 if (!res) 496 res = -EIO; 497 } 498 } 499 500 control->ras_num_recs = 0; 501 control->ras_num_bad_pages = 0; 502 control->ras_num_mca_recs = 0; 503 control->ras_num_pa_recs = 0; 504 control->ras_fri = 0; 505 506 amdgpu_dpm_send_hbm_bad_pages_num(adev, control->ras_num_bad_pages); 507 508 control->bad_channel_bitmap = 0; 509 amdgpu_dpm_send_hbm_bad_channel_flag(adev, control->bad_channel_bitmap); 510 con->update_channel_flag = false; 511 512 amdgpu_ras_debugfs_set_ret_size(control); 513 514 mutex_unlock(&control->ras_tbl_mutex); 515 516 return res; 517 } 518 519 static void 520 __encode_table_record_to_buf(struct amdgpu_ras_eeprom_control *control, 521 struct eeprom_table_record *record, 522 unsigned char *buf) 523 { 524 __le64 tmp = 0; 525 int i = 0; 526 527 /* Next are all record fields according to EEPROM page spec in LE foramt */ 528 buf[i++] = record->err_type; 529 530 buf[i++] = record->bank; 531 532 tmp = cpu_to_le64(record->ts); 533 memcpy(buf + i, &tmp, 8); 534 i += 8; 535 536 tmp = cpu_to_le64((record->offset & 0xffffffffffff)); 537 memcpy(buf + i, &tmp, 6); 538 i += 6; 539 540 buf[i++] = record->mem_channel; 541 buf[i++] = record->mcumc_id; 542 543 tmp = cpu_to_le64((record->retired_page & 0xffffffffffff)); 544 memcpy(buf + i, &tmp, 6); 545 } 546 547 static void 548 __decode_table_record_from_buf(struct amdgpu_ras_eeprom_control *control, 549 struct eeprom_table_record *record, 550 unsigned char *buf) 551 { 552 __le64 tmp = 0; 553 int i = 0; 554 555 /* Next are all record fields according to EEPROM page spec in LE foramt */ 556 record->err_type = buf[i++]; 557 558 record->bank = buf[i++]; 559 560 memcpy(&tmp, buf + i, 8); 561 record->ts = le64_to_cpu(tmp); 562 i += 8; 563 564 memcpy(&tmp, buf + i, 6); 565 record->offset = (le64_to_cpu(tmp) & 0xffffffffffff); 566 i += 6; 567 568 record->mem_channel = buf[i++]; 569 record->mcumc_id = buf[i++]; 570 571 memcpy(&tmp, buf + i, 6); 572 record->retired_page = (le64_to_cpu(tmp) & 0xffffffffffff); 573 } 574 575 bool amdgpu_ras_eeprom_check_err_threshold(struct amdgpu_device *adev) 576 { 577 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 578 579 if (amdgpu_uniras_enabled(adev)) 580 return amdgpu_ras_mgr_check_eeprom_safety_watermark(adev); 581 582 if (!__is_ras_eeprom_supported(adev) || 583 !amdgpu_bad_page_threshold) 584 return false; 585 586 /* skip check eeprom table for VEGA20 Gaming */ 587 if (!con) 588 return false; 589 else 590 if (!(con->features & BIT(AMDGPU_RAS_BLOCK__UMC))) 591 return false; 592 593 if (con->eeprom_control.tbl_hdr.header == RAS_TABLE_HDR_BAD) { 594 if (con->eeprom_control.ras_num_bad_pages > con->bad_page_cnt_threshold) 595 dev_warn(adev->dev, "RAS records:%d exceed threshold:%d", 596 con->eeprom_control.ras_num_bad_pages, con->bad_page_cnt_threshold); 597 if ((amdgpu_bad_page_threshold == -1) || 598 (amdgpu_bad_page_threshold == -2)) { 599 dev_warn(adev->dev, 600 "Please consult AMD Service Action Guide (SAG) for appropriate service procedures.\n"); 601 return false; 602 } else { 603 dev_warn(adev->dev, 604 "Please consider adjusting the customized threshold.\n"); 605 return true; 606 } 607 } 608 609 return false; 610 } 611 612 /** 613 * __amdgpu_ras_eeprom_write -- write indexed from buffer to EEPROM 614 * @control: pointer to control structure 615 * @buf: pointer to buffer containing data to write 616 * @fri: start writing at this index 617 * @num: number of records to write 618 * 619 * The caller must hold the table mutex in @control. 620 * Return 0 on success, -errno otherwise. 621 */ 622 static int __amdgpu_ras_eeprom_write(struct amdgpu_ras_eeprom_control *control, 623 u8 *buf, const u32 fri, const u32 num) 624 { 625 struct amdgpu_device *adev = to_amdgpu_device(control); 626 u32 buf_size; 627 int res; 628 629 /* i2c may be unstable in gpu reset */ 630 down_read(&adev->reset_domain->sem); 631 buf_size = num * RAS_TABLE_RECORD_SIZE; 632 res = amdgpu_eeprom_write(adev->pm.ras_eeprom_i2c_bus, 633 control->i2c_address + 634 RAS_INDEX_TO_OFFSET(control, fri), 635 buf, buf_size); 636 up_read(&adev->reset_domain->sem); 637 if (res < 0) { 638 dev_err(adev->dev, "Writing %d EEPROM table records error:%d", 639 num, res); 640 } else if (res < buf_size) { 641 /* Short write, return error. 642 */ 643 dev_err(adev->dev, "Wrote %d records out of %d", 644 res / RAS_TABLE_RECORD_SIZE, num); 645 res = -EIO; 646 } else { 647 res = 0; 648 } 649 650 return res; 651 } 652 653 static int 654 amdgpu_ras_eeprom_append_table(struct amdgpu_ras_eeprom_control *control, 655 struct eeprom_table_record *record, 656 const u32 num) 657 { 658 struct amdgpu_ras *con = amdgpu_ras_get_context(to_amdgpu_device(control)); 659 struct amdgpu_device *adev = to_amdgpu_device(control); 660 u32 a, b, i; 661 u8 *buf, *pp; 662 int res; 663 664 buf = kcalloc(num, RAS_TABLE_RECORD_SIZE, GFP_KERNEL); 665 if (!buf) 666 return -ENOMEM; 667 668 /* Encode all of them in one go. 669 */ 670 pp = buf; 671 for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE) { 672 __encode_table_record_to_buf(control, &record[i], pp); 673 674 /* update bad channel bitmap */ 675 if ((record[i].mem_channel < BITS_PER_TYPE(control->bad_channel_bitmap)) && 676 !(control->bad_channel_bitmap & (1 << record[i].mem_channel))) { 677 control->bad_channel_bitmap |= 1 << record[i].mem_channel; 678 con->update_channel_flag = true; 679 } 680 } 681 682 /* a, first record index to write into. 683 * b, last record index to write into. 684 * a = first index to read (fri) + number of records in the table, 685 * b = a + @num - 1. 686 * Let N = control->ras_max_num_record_count, then we have, 687 * case 0: 0 <= a <= b < N, 688 * just append @num records starting at a; 689 * case 1: 0 <= a < N <= b, 690 * append (N - a) records starting at a, and 691 * append the remainder, b % N + 1, starting at 0. 692 * case 2: 0 <= fri < N <= a <= b, then modulo N we get two subcases, 693 * case 2a: 0 <= a <= b < N 694 * append num records starting at a; and fix fri if b overwrote it, 695 * and since a <= b, if b overwrote it then a must've also, 696 * and if b didn't overwrite it, then a didn't also. 697 * case 2b: 0 <= b < a < N 698 * write num records starting at a, which wraps around 0=N 699 * and overwrite fri unconditionally. Now from case 2a, 700 * this means that b eclipsed fri to overwrite it and wrap 701 * around 0 again, i.e. b = 2N+r pre modulo N, so we unconditionally 702 * set fri = b + 1 (mod N). 703 * Now, since fri is updated in every case, except the trivial case 0, 704 * the number of records present in the table after writing, is, 705 * num_recs - 1 = b - fri (mod N), and we take the positive value, 706 * by adding an arbitrary multiple of N before taking the modulo N 707 * as shown below. 708 */ 709 a = control->ras_fri + control->ras_num_recs; 710 b = a + num - 1; 711 if (b < control->ras_max_record_count) { 712 res = __amdgpu_ras_eeprom_write(control, buf, a, num); 713 } else if (a < control->ras_max_record_count) { 714 u32 g0, g1; 715 716 g0 = control->ras_max_record_count - a; 717 g1 = b % control->ras_max_record_count + 1; 718 res = __amdgpu_ras_eeprom_write(control, buf, a, g0); 719 if (res) 720 goto Out; 721 res = __amdgpu_ras_eeprom_write(control, 722 buf + g0 * RAS_TABLE_RECORD_SIZE, 723 0, g1); 724 if (res) 725 goto Out; 726 if (g1 > control->ras_fri) 727 control->ras_fri = g1 % control->ras_max_record_count; 728 } else { 729 a %= control->ras_max_record_count; 730 b %= control->ras_max_record_count; 731 732 if (a <= b) { 733 /* Note that, b - a + 1 = num. */ 734 res = __amdgpu_ras_eeprom_write(control, buf, a, num); 735 if (res) 736 goto Out; 737 if (b >= control->ras_fri) 738 control->ras_fri = (b + 1) % control->ras_max_record_count; 739 } else { 740 u32 g0, g1; 741 742 /* b < a, which means, we write from 743 * a to the end of the table, and from 744 * the start of the table to b. 745 */ 746 g0 = control->ras_max_record_count - a; 747 g1 = b + 1; 748 res = __amdgpu_ras_eeprom_write(control, buf, a, g0); 749 if (res) 750 goto Out; 751 res = __amdgpu_ras_eeprom_write(control, 752 buf + g0 * RAS_TABLE_RECORD_SIZE, 753 0, g1); 754 if (res) 755 goto Out; 756 control->ras_fri = g1 % control->ras_max_record_count; 757 } 758 } 759 control->ras_num_recs = 1 + (control->ras_max_record_count + b 760 - control->ras_fri) 761 % control->ras_max_record_count; 762 763 /*old asics only save pa to eeprom like before*/ 764 if (IP_VERSION_MAJ(amdgpu_ip_version(adev, UMC_HWIP, 0)) < 12) 765 control->ras_num_pa_recs += num; 766 else 767 control->ras_num_mca_recs += num; 768 769 control->ras_num_bad_pages = con->bad_page_num; 770 Out: 771 kfree(buf); 772 return res; 773 } 774 775 static int 776 amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control) 777 { 778 struct amdgpu_device *adev = to_amdgpu_device(control); 779 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 780 u8 *buf, *pp, csum; 781 u32 buf_size; 782 int res; 783 784 /* Modify the header if it exceeds. 785 */ 786 if (amdgpu_bad_page_threshold != 0 && 787 control->ras_num_bad_pages > ras->bad_page_cnt_threshold) { 788 dev_warn(adev->dev, 789 "Saved bad pages %d reaches threshold value %d\n", 790 control->ras_num_bad_pages, ras->bad_page_cnt_threshold); 791 792 if (adev->cper.enabled && !amdgpu_uniras_enabled(adev) && 793 amdgpu_cper_generate_bp_threshold_record(adev)) 794 dev_warn(adev->dev, "fail to generate bad page threshold cper records\n"); 795 796 if ((amdgpu_bad_page_threshold != -1) && 797 (amdgpu_bad_page_threshold != -2)) { 798 control->tbl_hdr.header = RAS_TABLE_HDR_BAD; 799 if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1) { 800 control->tbl_rai.rma_status = GPU_RETIRED__ECC_REACH_THRESHOLD; 801 control->tbl_rai.health_percent = 0; 802 } 803 ras->is_rma = true; 804 } 805 806 /* ignore the -ENOTSUPP return value */ 807 amdgpu_dpm_send_rma_reason(adev); 808 } 809 810 if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1) 811 control->tbl_hdr.tbl_size = RAS_TABLE_HEADER_SIZE + 812 RAS_TABLE_V2_1_INFO_SIZE + 813 control->ras_num_recs * RAS_TABLE_RECORD_SIZE; 814 else 815 control->tbl_hdr.tbl_size = RAS_TABLE_HEADER_SIZE + 816 control->ras_num_recs * RAS_TABLE_RECORD_SIZE; 817 control->tbl_hdr.checksum = 0; 818 819 buf_size = control->ras_num_recs * RAS_TABLE_RECORD_SIZE; 820 buf = kcalloc(control->ras_num_recs, RAS_TABLE_RECORD_SIZE, GFP_KERNEL); 821 if (!buf) { 822 dev_err(adev->dev, 823 "allocating memory for table of size %d bytes failed\n", 824 control->tbl_hdr.tbl_size); 825 res = -ENOMEM; 826 goto Out; 827 } 828 829 down_read(&adev->reset_domain->sem); 830 res = amdgpu_eeprom_read(adev->pm.ras_eeprom_i2c_bus, 831 control->i2c_address + 832 control->ras_record_offset, 833 buf, buf_size); 834 up_read(&adev->reset_domain->sem); 835 if (res < 0) { 836 dev_err(adev->dev, "EEPROM failed reading records:%d\n", res); 837 goto Out; 838 } else if (res < buf_size) { 839 dev_err(adev->dev, "EEPROM read %d out of %d bytes\n", res, 840 buf_size); 841 res = -EIO; 842 goto Out; 843 } 844 845 /** 846 * bad page records have been stored in eeprom, 847 * now calculate gpu health percent 848 */ 849 if (amdgpu_bad_page_threshold != 0 && 850 control->tbl_hdr.version >= RAS_TABLE_VER_V2_1 && 851 control->ras_num_bad_pages <= ras->bad_page_cnt_threshold) 852 control->tbl_rai.health_percent = ((ras->bad_page_cnt_threshold - 853 control->ras_num_bad_pages) * 100) / 854 ras->bad_page_cnt_threshold; 855 856 /* Recalc the checksum. 857 */ 858 csum = 0; 859 for (pp = buf; pp < buf + buf_size; pp++) 860 csum += *pp; 861 862 csum += __calc_hdr_byte_sum(control); 863 if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1) 864 csum += __calc_ras_info_byte_sum(control); 865 /* avoid sign extension when assigning to "checksum" */ 866 csum = -csum; 867 control->tbl_hdr.checksum = csum; 868 res = __write_table_header(control); 869 if (!res && control->tbl_hdr.version > RAS_TABLE_VER_V1) 870 res = __write_table_ras_info(control); 871 Out: 872 kfree(buf); 873 return res; 874 } 875 876 int amdgpu_ras_eeprom_update_record_num(struct amdgpu_ras_eeprom_control *control) 877 { 878 struct amdgpu_device *adev = to_amdgpu_device(control); 879 int ret, retry = 20; 880 881 if (!amdgpu_ras_smu_eeprom_supported(adev)) 882 return 0; 883 884 control->ras_num_recs_old = control->ras_num_recs; 885 886 do { 887 /* 1000ms timeout is long enough, smu_get_badpage_count won't 888 * return -EBUSY before timeout. 889 */ 890 ret = amdgpu_ras_smu_get_badpage_count(adev, 891 &(control->ras_num_recs), RAS_SMU_MESSAGE_TIMEOUT_MS); 892 if (!ret && 893 (control->ras_num_recs_old == control->ras_num_recs)) { 894 /* record number update in PMFW needs some time, 895 * smu_get_badpage_count may return immediately without 896 * count update, sleep for a while and retry again. 897 */ 898 msleep(50); 899 retry--; 900 } else { 901 break; 902 } 903 } while (retry); 904 905 /* no update of record number is not a real failure, 906 * don't print warning here 907 */ 908 if (!ret && (control->ras_num_recs_old == control->ras_num_recs)) 909 ret = -EINVAL; 910 911 return ret; 912 } 913 914 static int amdgpu_ras_smu_eeprom_append(struct amdgpu_ras_eeprom_control *control) 915 { 916 struct amdgpu_device *adev = to_amdgpu_device(control); 917 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 918 919 if (!amdgpu_ras_smu_eeprom_supported(adev) || !con) 920 return 0; 921 922 control->ras_num_bad_pages = con->bad_page_num; 923 924 if (amdgpu_bad_page_threshold != 0 && 925 control->ras_num_bad_pages > con->bad_page_cnt_threshold) { 926 dev_warn(adev->dev, 927 "Saved bad pages %d reaches threshold value %d\n", 928 control->ras_num_bad_pages, con->bad_page_cnt_threshold); 929 930 if (adev->cper.enabled && amdgpu_cper_generate_bp_threshold_record(adev)) 931 dev_warn(adev->dev, "fail to generate bad page threshold cper records\n"); 932 933 if ((amdgpu_bad_page_threshold != -1) && 934 (amdgpu_bad_page_threshold != -2)) 935 con->is_rma = true; 936 } 937 938 return 0; 939 } 940 941 /** 942 * amdgpu_ras_eeprom_append -- append records to the EEPROM RAS table 943 * @control: pointer to control structure 944 * @record: array of records to append 945 * @num: number of records in @record array 946 * 947 * Append @num records to the table, calculate the checksum and write 948 * the table back to EEPROM. The maximum number of records that 949 * can be appended is between 1 and control->ras_max_record_count, 950 * regardless of how many records are already stored in the table. 951 * 952 * Return 0 on success or if EEPROM is not supported, -errno on error. 953 */ 954 int amdgpu_ras_eeprom_append(struct amdgpu_ras_eeprom_control *control, 955 struct eeprom_table_record *record, 956 const u32 num) 957 { 958 struct amdgpu_device *adev = to_amdgpu_device(control); 959 int res, i; 960 uint64_t nps = AMDGPU_NPS1_PARTITION_MODE; 961 962 if (!__is_ras_eeprom_supported(adev)) 963 return 0; 964 965 if (amdgpu_ras_smu_eeprom_supported(adev)) 966 return amdgpu_ras_smu_eeprom_append(control); 967 968 if (num == 0) { 969 dev_err(adev->dev, "will not append 0 records\n"); 970 return -EINVAL; 971 } else if (num > control->ras_max_record_count) { 972 dev_err(adev->dev, 973 "cannot append %d records than the size of table %d\n", 974 num, control->ras_max_record_count); 975 return -EINVAL; 976 } 977 978 if (adev->gmc.gmc_funcs->query_mem_partition_mode) 979 nps = adev->gmc.gmc_funcs->query_mem_partition_mode(adev); 980 981 /* set the new channel index flag */ 982 for (i = 0; i < num; i++) 983 record[i].retired_page |= (nps << UMC_NPS_SHIFT); 984 985 mutex_lock(&control->ras_tbl_mutex); 986 987 res = amdgpu_ras_eeprom_append_table(control, record, num); 988 if (!res) 989 res = amdgpu_ras_eeprom_update_header(control); 990 if (!res) 991 amdgpu_ras_debugfs_set_ret_size(control); 992 993 mutex_unlock(&control->ras_tbl_mutex); 994 995 /* clear channel index flag, the flag is only saved on eeprom */ 996 for (i = 0; i < num; i++) 997 record[i].retired_page &= ~(nps << UMC_NPS_SHIFT); 998 999 return res; 1000 } 1001 1002 /** 1003 * __amdgpu_ras_eeprom_read -- read indexed from EEPROM into buffer 1004 * @control: pointer to control structure 1005 * @buf: pointer to buffer to read into 1006 * @fri: first record index, start reading at this index, absolute index 1007 * @num: number of records to read 1008 * 1009 * The caller must hold the table mutex in @control. 1010 * Return 0 on success, -errno otherwise. 1011 */ 1012 static int __amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control, 1013 u8 *buf, const u32 fri, const u32 num) 1014 { 1015 struct amdgpu_device *adev = to_amdgpu_device(control); 1016 u32 buf_size; 1017 int res; 1018 1019 /* i2c may be unstable in gpu reset */ 1020 down_read(&adev->reset_domain->sem); 1021 buf_size = num * RAS_TABLE_RECORD_SIZE; 1022 res = amdgpu_eeprom_read(adev->pm.ras_eeprom_i2c_bus, 1023 control->i2c_address + 1024 RAS_INDEX_TO_OFFSET(control, fri), 1025 buf, buf_size); 1026 up_read(&adev->reset_domain->sem); 1027 if (res < 0) { 1028 dev_err(adev->dev, "Reading %d EEPROM table records error:%d", 1029 num, res); 1030 } else if (res < buf_size) { 1031 /* Short read, return error. 1032 */ 1033 dev_err(adev->dev, "Read %d records out of %d", 1034 res / RAS_TABLE_RECORD_SIZE, num); 1035 res = -EIO; 1036 } else { 1037 res = 0; 1038 } 1039 1040 return res; 1041 } 1042 1043 int amdgpu_ras_eeprom_read_idx(struct amdgpu_ras_eeprom_control *control, 1044 struct eeprom_table_record *record, u32 rec_idx, 1045 const u32 num) 1046 { 1047 struct amdgpu_device *adev = to_amdgpu_device(control); 1048 uint64_t ts, end_idx; 1049 int i, ret; 1050 u64 mca, ipid; 1051 1052 if (!amdgpu_ras_smu_eeprom_supported(adev)) 1053 return 0; 1054 1055 if (!adev->umc.ras || !adev->umc.ras->mca_ipid_parse) 1056 return -EOPNOTSUPP; 1057 1058 end_idx = rec_idx + num; 1059 for (i = rec_idx; i < end_idx; i++) { 1060 ret = amdgpu_ras_smu_get_badpage_mca_addr(adev, i, &mca); 1061 if (ret) 1062 return ret; 1063 1064 ret = amdgpu_ras_smu_get_badpage_ipid(adev, i, &ipid); 1065 if (ret) 1066 return ret; 1067 1068 ret = amdgpu_ras_smu_get_timestamp(adev, i, &ts); 1069 if (ret) 1070 return ret; 1071 1072 record[i - rec_idx].address = mca; 1073 /* retired_page (pa) is unused now */ 1074 record[i - rec_idx].retired_page = 0x1ULL; 1075 record[i - rec_idx].ts = ts; 1076 record[i - rec_idx].err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE; 1077 1078 adev->umc.ras->mca_ipid_parse(adev, ipid, 1079 (uint32_t *)&(record[i - rec_idx].cu), 1080 (uint32_t *)&(record[i - rec_idx].mem_channel), 1081 (uint32_t *)&(record[i - rec_idx].mcumc_id), NULL); 1082 } 1083 1084 return 0; 1085 } 1086 1087 /** 1088 * amdgpu_ras_eeprom_read -- read EEPROM 1089 * @control: pointer to control structure 1090 * @record: array of records to read into 1091 * @num: number of records in @record 1092 * 1093 * Reads num records from the RAS table in EEPROM and 1094 * writes the data into @record array. 1095 * 1096 * Returns 0 on success, -errno on error. 1097 */ 1098 int amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control, 1099 struct eeprom_table_record *record, 1100 const u32 num) 1101 { 1102 struct amdgpu_device *adev = to_amdgpu_device(control); 1103 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 1104 int i, res; 1105 u8 *buf, *pp; 1106 u32 g0, g1; 1107 1108 if (amdgpu_ras_smu_eeprom_supported(adev)) 1109 return amdgpu_ras_eeprom_read_idx(control, record, 0, num); 1110 1111 if (!__is_ras_eeprom_supported(adev)) 1112 return 0; 1113 1114 if (num == 0) { 1115 dev_err(adev->dev, "will not read 0 records\n"); 1116 return -EINVAL; 1117 } else if (num > control->ras_num_recs) { 1118 dev_err(adev->dev, "too many records to read:%d available:%d\n", 1119 num, control->ras_num_recs); 1120 return -EINVAL; 1121 } 1122 1123 buf = kcalloc(num, RAS_TABLE_RECORD_SIZE, GFP_KERNEL); 1124 if (!buf) 1125 return -ENOMEM; 1126 1127 /* Determine how many records to read, from the first record 1128 * index, fri, to the end of the table, and from the beginning 1129 * of the table, such that the total number of records is 1130 * @num, and we handle wrap around when fri > 0 and 1131 * fri + num > RAS_MAX_RECORD_COUNT. 1132 * 1133 * First we compute the index of the last element 1134 * which would be fetched from each region, 1135 * g0 is in [fri, fri + num - 1], and 1136 * g1 is in [0, RAS_MAX_RECORD_COUNT - 1]. 1137 * Then, if g0 < RAS_MAX_RECORD_COUNT, the index of 1138 * the last element to fetch, we set g0 to _the number_ 1139 * of elements to fetch, @num, since we know that the last 1140 * indexed to be fetched does not exceed the table. 1141 * 1142 * If, however, g0 >= RAS_MAX_RECORD_COUNT, then 1143 * we set g0 to the number of elements to read 1144 * until the end of the table, and g1 to the number of 1145 * elements to read from the beginning of the table. 1146 */ 1147 g0 = control->ras_fri + num - 1; 1148 g1 = g0 % control->ras_max_record_count; 1149 if (g0 < control->ras_max_record_count) { 1150 g0 = num; 1151 g1 = 0; 1152 } else { 1153 g0 = control->ras_max_record_count - control->ras_fri; 1154 g1 += 1; 1155 } 1156 1157 mutex_lock(&control->ras_tbl_mutex); 1158 res = __amdgpu_ras_eeprom_read(control, buf, control->ras_fri, g0); 1159 if (res) 1160 goto Out; 1161 if (g1) { 1162 res = __amdgpu_ras_eeprom_read(control, 1163 buf + g0 * RAS_TABLE_RECORD_SIZE, 1164 0, g1); 1165 if (res) 1166 goto Out; 1167 } 1168 1169 res = 0; 1170 1171 /* Read up everything? Then transform. 1172 */ 1173 pp = buf; 1174 for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE) { 1175 __decode_table_record_from_buf(control, &record[i], pp); 1176 1177 /* update bad channel bitmap */ 1178 if ((record[i].mem_channel < BITS_PER_TYPE(control->bad_channel_bitmap)) && 1179 !(control->bad_channel_bitmap & (1 << record[i].mem_channel))) { 1180 control->bad_channel_bitmap |= 1 << record[i].mem_channel; 1181 con->update_channel_flag = true; 1182 } 1183 } 1184 Out: 1185 kfree(buf); 1186 mutex_unlock(&control->ras_tbl_mutex); 1187 1188 return res; 1189 } 1190 1191 uint32_t amdgpu_ras_eeprom_max_record_count(struct amdgpu_ras_eeprom_control *control) 1192 { 1193 /* get available eeprom table version first before eeprom table init */ 1194 amdgpu_ras_set_eeprom_table_version(control); 1195 1196 if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1) 1197 return RAS_MAX_RECORD_COUNT_V2_1; 1198 else 1199 return RAS_MAX_RECORD_COUNT; 1200 } 1201 1202 static ssize_t 1203 amdgpu_ras_debugfs_eeprom_size_read(struct file *f, char __user *buf, 1204 size_t size, loff_t *pos) 1205 { 1206 struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private; 1207 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 1208 struct amdgpu_ras_eeprom_control *control = ras ? &ras->eeprom_control : NULL; 1209 u8 data[50]; 1210 int res; 1211 1212 if (!size) 1213 return size; 1214 1215 if (!ras || !control) { 1216 res = snprintf(data, sizeof(data), "Not supported\n"); 1217 } else { 1218 res = snprintf(data, sizeof(data), "%d bytes or %d records\n", 1219 RAS_TBL_SIZE_BYTES, control->ras_max_record_count); 1220 } 1221 1222 if (*pos >= res) 1223 return 0; 1224 1225 res -= *pos; 1226 res = min_t(size_t, res, size); 1227 1228 if (copy_to_user(buf, &data[*pos], res)) 1229 return -EFAULT; 1230 1231 *pos += res; 1232 1233 return res; 1234 } 1235 1236 const struct file_operations amdgpu_ras_debugfs_eeprom_size_ops = { 1237 .owner = THIS_MODULE, 1238 .read = amdgpu_ras_debugfs_eeprom_size_read, 1239 .write = NULL, 1240 .llseek = default_llseek, 1241 }; 1242 1243 static const char *tbl_hdr_str = " Signature Version FirstOffs Size Checksum\n"; 1244 static const char *tbl_hdr_fmt = "0x%08X 0x%08X 0x%08X 0x%08X 0x%08X\n"; 1245 #define tbl_hdr_fmt_size (5 * (2+8) + 4 + 1) 1246 static const char *rec_hdr_str = "Index Offset ErrType Bank/CU TimeStamp Offs/Addr MemChl MCUMCID RetiredPage\n"; 1247 static const char *rec_hdr_fmt = "%5d 0x%05X %7s 0x%02X 0x%016llX 0x%012llX 0x%02X 0x%02X 0x%012llX\n"; 1248 #define rec_hdr_fmt_size (5 + 1 + 7 + 1 + 7 + 1 + 7 + 1 + 18 + 1 + 14 + 1 + 6 + 1 + 7 + 1 + 14 + 1) 1249 1250 static const char *record_err_type_str[AMDGPU_RAS_EEPROM_ERR_COUNT] = { 1251 "ignore", 1252 "re", 1253 "ue", 1254 }; 1255 1256 static loff_t amdgpu_ras_debugfs_table_size(struct amdgpu_ras_eeprom_control *control) 1257 { 1258 return strlen(tbl_hdr_str) + tbl_hdr_fmt_size + 1259 strlen(rec_hdr_str) + rec_hdr_fmt_size * control->ras_num_recs; 1260 } 1261 1262 void amdgpu_ras_debugfs_set_ret_size(struct amdgpu_ras_eeprom_control *control) 1263 { 1264 struct amdgpu_ras *ras = container_of(control, struct amdgpu_ras, 1265 eeprom_control); 1266 struct dentry *de = ras->de_ras_eeprom_table; 1267 1268 if (de) 1269 d_inode(de)->i_size = amdgpu_ras_debugfs_table_size(control); 1270 } 1271 1272 static ssize_t amdgpu_ras_debugfs_table_read(struct file *f, char __user *buf, 1273 size_t size, loff_t *pos) 1274 { 1275 struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private; 1276 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 1277 struct amdgpu_ras_eeprom_control *control = &ras->eeprom_control; 1278 const size_t orig_size = size; 1279 int res = -EFAULT; 1280 size_t data_len; 1281 1282 /* pmfw manages eeprom data by itself */ 1283 if (amdgpu_ras_smu_eeprom_supported(adev)) 1284 return 0; 1285 1286 mutex_lock(&control->ras_tbl_mutex); 1287 1288 /* We want *pos - data_len > 0, which means there's 1289 * bytes to be printed from data. 1290 */ 1291 data_len = strlen(tbl_hdr_str); 1292 if (*pos < data_len) { 1293 data_len -= *pos; 1294 data_len = min_t(size_t, data_len, size); 1295 if (copy_to_user(buf, &tbl_hdr_str[*pos], data_len)) 1296 goto Out; 1297 buf += data_len; 1298 size -= data_len; 1299 *pos += data_len; 1300 } 1301 1302 data_len = strlen(tbl_hdr_str) + tbl_hdr_fmt_size; 1303 if (*pos < data_len && size > 0) { 1304 u8 data[tbl_hdr_fmt_size + 1]; 1305 loff_t lpos; 1306 1307 snprintf(data, sizeof(data), tbl_hdr_fmt, 1308 control->tbl_hdr.header, 1309 control->tbl_hdr.version, 1310 control->tbl_hdr.first_rec_offset, 1311 control->tbl_hdr.tbl_size, 1312 control->tbl_hdr.checksum); 1313 1314 data_len -= *pos; 1315 data_len = min_t(size_t, data_len, size); 1316 lpos = *pos - strlen(tbl_hdr_str); 1317 if (copy_to_user(buf, &data[lpos], data_len)) 1318 goto Out; 1319 buf += data_len; 1320 size -= data_len; 1321 *pos += data_len; 1322 } 1323 1324 data_len = strlen(tbl_hdr_str) + tbl_hdr_fmt_size + strlen(rec_hdr_str); 1325 if (*pos < data_len && size > 0) { 1326 loff_t lpos; 1327 1328 data_len -= *pos; 1329 data_len = min_t(size_t, data_len, size); 1330 lpos = *pos - strlen(tbl_hdr_str) - tbl_hdr_fmt_size; 1331 if (copy_to_user(buf, &rec_hdr_str[lpos], data_len)) 1332 goto Out; 1333 buf += data_len; 1334 size -= data_len; 1335 *pos += data_len; 1336 } 1337 1338 data_len = amdgpu_ras_debugfs_table_size(control); 1339 if (*pos < data_len && size > 0) { 1340 u8 dare[RAS_TABLE_RECORD_SIZE]; 1341 u8 data[rec_hdr_fmt_size + 1]; 1342 struct eeprom_table_record record; 1343 int s, r; 1344 1345 /* Find the starting record index 1346 */ 1347 s = *pos - strlen(tbl_hdr_str) - tbl_hdr_fmt_size - 1348 strlen(rec_hdr_str); 1349 s = s / rec_hdr_fmt_size; 1350 r = *pos - strlen(tbl_hdr_str) - tbl_hdr_fmt_size - 1351 strlen(rec_hdr_str); 1352 r = r % rec_hdr_fmt_size; 1353 1354 for ( ; size > 0 && s < control->ras_num_recs; s++) { 1355 u32 ai = RAS_RI_TO_AI(control, s); 1356 /* Read a single record 1357 */ 1358 res = __amdgpu_ras_eeprom_read(control, dare, ai, 1); 1359 if (res) 1360 goto Out; 1361 __decode_table_record_from_buf(control, &record, dare); 1362 snprintf(data, sizeof(data), rec_hdr_fmt, 1363 s, 1364 RAS_INDEX_TO_OFFSET(control, ai), 1365 record_err_type_str[record.err_type], 1366 record.bank, 1367 record.ts, 1368 record.offset, 1369 record.mem_channel, 1370 record.mcumc_id, 1371 record.retired_page); 1372 1373 data_len = min_t(size_t, rec_hdr_fmt_size - r, size); 1374 if (copy_to_user(buf, &data[r], data_len)) { 1375 res = -EFAULT; 1376 goto Out; 1377 } 1378 buf += data_len; 1379 size -= data_len; 1380 *pos += data_len; 1381 r = 0; 1382 } 1383 } 1384 res = 0; 1385 Out: 1386 mutex_unlock(&control->ras_tbl_mutex); 1387 return res < 0 ? res : orig_size - size; 1388 } 1389 1390 static ssize_t 1391 amdgpu_ras_debugfs_eeprom_table_read(struct file *f, char __user *buf, 1392 size_t size, loff_t *pos) 1393 { 1394 struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private; 1395 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 1396 struct amdgpu_ras_eeprom_control *control = ras ? &ras->eeprom_control : NULL; 1397 u8 data[81]; 1398 int res; 1399 1400 if (!size) 1401 return size; 1402 1403 if (!ras || !control) { 1404 res = snprintf(data, sizeof(data), "Not supported\n"); 1405 if (*pos >= res) 1406 return 0; 1407 1408 res -= *pos; 1409 res = min_t(size_t, res, size); 1410 1411 if (copy_to_user(buf, &data[*pos], res)) 1412 return -EFAULT; 1413 1414 *pos += res; 1415 1416 return res; 1417 } else { 1418 return amdgpu_ras_debugfs_table_read(f, buf, size, pos); 1419 } 1420 } 1421 1422 const struct file_operations amdgpu_ras_debugfs_eeprom_table_ops = { 1423 .owner = THIS_MODULE, 1424 .read = amdgpu_ras_debugfs_eeprom_table_read, 1425 .write = NULL, 1426 .llseek = default_llseek, 1427 }; 1428 1429 /** 1430 * __verify_ras_table_checksum -- verify the RAS EEPROM table checksum 1431 * @control: pointer to control structure 1432 * 1433 * Check the checksum of the stored in EEPROM RAS table. 1434 * 1435 * Return 0 if the checksum is correct, 1436 * positive if it is not correct, and 1437 * -errno on I/O error. 1438 */ 1439 static int __verify_ras_table_checksum(struct amdgpu_ras_eeprom_control *control) 1440 { 1441 struct amdgpu_device *adev = to_amdgpu_device(control); 1442 int buf_size, res; 1443 u8 csum, *buf, *pp; 1444 1445 if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1) 1446 buf_size = RAS_TABLE_HEADER_SIZE + 1447 RAS_TABLE_V2_1_INFO_SIZE + 1448 control->ras_num_recs * RAS_TABLE_RECORD_SIZE; 1449 else 1450 buf_size = RAS_TABLE_HEADER_SIZE + 1451 control->ras_num_recs * RAS_TABLE_RECORD_SIZE; 1452 1453 buf = kzalloc(buf_size, GFP_KERNEL); 1454 if (!buf) { 1455 dev_err(adev->dev, 1456 "Out of memory checking RAS table checksum.\n"); 1457 return -ENOMEM; 1458 } 1459 1460 res = amdgpu_eeprom_read(adev->pm.ras_eeprom_i2c_bus, 1461 control->i2c_address + 1462 control->ras_header_offset, 1463 buf, buf_size); 1464 if (res < buf_size) { 1465 dev_err(adev->dev, "Partial read for checksum, res:%d\n", res); 1466 /* On partial reads, return -EIO. 1467 */ 1468 if (res >= 0) 1469 res = -EIO; 1470 goto Out; 1471 } 1472 1473 csum = 0; 1474 for (pp = buf; pp < buf + buf_size; pp++) 1475 csum += *pp; 1476 Out: 1477 kfree(buf); 1478 return res < 0 ? res : csum; 1479 } 1480 1481 static int __read_table_ras_info(struct amdgpu_ras_eeprom_control *control) 1482 { 1483 struct amdgpu_ras_eeprom_table_ras_info *rai = &control->tbl_rai; 1484 struct amdgpu_device *adev = to_amdgpu_device(control); 1485 unsigned char *buf; 1486 int res; 1487 1488 buf = kzalloc(RAS_TABLE_V2_1_INFO_SIZE, GFP_KERNEL); 1489 if (!buf) { 1490 dev_err(adev->dev, 1491 "Failed to alloc buf to read EEPROM table ras info\n"); 1492 return -ENOMEM; 1493 } 1494 1495 /** 1496 * EEPROM table V2_1 supports ras info, 1497 * read EEPROM table ras info 1498 */ 1499 res = amdgpu_eeprom_read(adev->pm.ras_eeprom_i2c_bus, 1500 control->i2c_address + control->ras_info_offset, 1501 buf, RAS_TABLE_V2_1_INFO_SIZE); 1502 if (res < RAS_TABLE_V2_1_INFO_SIZE) { 1503 dev_err(adev->dev, 1504 "Failed to read EEPROM table ras info, res:%d", res); 1505 res = res >= 0 ? -EIO : res; 1506 goto Out; 1507 } 1508 1509 __decode_table_ras_info_from_buf(rai, buf); 1510 1511 Out: 1512 kfree(buf); 1513 return res == RAS_TABLE_V2_1_INFO_SIZE ? 0 : res; 1514 } 1515 1516 static int amdgpu_ras_smu_eeprom_init(struct amdgpu_ras_eeprom_control *control) 1517 { 1518 struct amdgpu_device *adev = to_amdgpu_device(control); 1519 struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr; 1520 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 1521 uint64_t local_time; 1522 int res; 1523 1524 ras->is_rma = false; 1525 1526 if (!__is_ras_eeprom_supported(adev)) 1527 return 0; 1528 mutex_init(&control->ras_tbl_mutex); 1529 1530 res = amdgpu_ras_smu_get_table_version(adev, &(hdr->version)); 1531 if (res) 1532 return res; 1533 1534 res = amdgpu_ras_smu_get_badpage_count(adev, 1535 &(control->ras_num_recs), 100); 1536 if (res) 1537 return res; 1538 1539 local_time = (uint64_t)ktime_get_real_seconds(); 1540 res = amdgpu_ras_smu_set_timestamp(adev, local_time); 1541 if (res) 1542 return res; 1543 1544 control->ras_max_record_count = 4000; 1545 1546 control->ras_num_mca_recs = 0; 1547 control->ras_num_pa_recs = 0; 1548 1549 return 0; 1550 } 1551 1552 int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control) 1553 { 1554 struct amdgpu_device *adev = to_amdgpu_device(control); 1555 unsigned char buf[RAS_TABLE_HEADER_SIZE] = { 0 }; 1556 struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr; 1557 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 1558 int res; 1559 1560 if (amdgpu_ras_smu_eeprom_supported(adev)) 1561 return amdgpu_ras_smu_eeprom_init(control); 1562 1563 ras->is_rma = false; 1564 1565 if (!__is_ras_eeprom_supported(adev)) 1566 return 0; 1567 1568 /* Verify i2c adapter is initialized */ 1569 if (!adev->pm.ras_eeprom_i2c_bus || !adev->pm.ras_eeprom_i2c_bus->algo) 1570 return -ENOENT; 1571 1572 if (!__get_eeprom_i2c_addr(adev, control)) 1573 return -EINVAL; 1574 1575 control->ras_header_offset = RAS_HDR_START; 1576 control->ras_info_offset = RAS_TABLE_V2_1_INFO_START; 1577 mutex_init(&control->ras_tbl_mutex); 1578 1579 /* Read the table header from EEPROM address */ 1580 res = amdgpu_eeprom_read(adev->pm.ras_eeprom_i2c_bus, 1581 control->i2c_address + control->ras_header_offset, 1582 buf, RAS_TABLE_HEADER_SIZE); 1583 if (res < RAS_TABLE_HEADER_SIZE) { 1584 dev_err(adev->dev, "Failed to read EEPROM table header, res:%d", 1585 res); 1586 return res >= 0 ? -EIO : res; 1587 } 1588 1589 __decode_table_header_from_buf(hdr, buf); 1590 1591 if (hdr->header != RAS_TABLE_HDR_VAL && 1592 hdr->header != RAS_TABLE_HDR_BAD) { 1593 dev_info(adev->dev, "Creating a new EEPROM table"); 1594 return amdgpu_ras_eeprom_reset_table(control); 1595 } 1596 1597 switch (hdr->version) { 1598 case RAS_TABLE_VER_V2_1: 1599 case RAS_TABLE_VER_V3: 1600 control->ras_num_recs = RAS_NUM_RECS_V2_1(hdr); 1601 control->ras_record_offset = RAS_RECORD_START_V2_1; 1602 control->ras_max_record_count = RAS_MAX_RECORD_COUNT_V2_1; 1603 break; 1604 case RAS_TABLE_VER_V1: 1605 control->ras_num_recs = RAS_NUM_RECS(hdr); 1606 control->ras_record_offset = RAS_RECORD_START; 1607 control->ras_max_record_count = RAS_MAX_RECORD_COUNT; 1608 break; 1609 default: 1610 dev_err(adev->dev, 1611 "RAS header invalid, unsupported version: %u", 1612 hdr->version); 1613 return -EINVAL; 1614 } 1615 1616 if (control->ras_num_recs > control->ras_max_record_count) { 1617 dev_err(adev->dev, 1618 "RAS header invalid, records in header: %u max allowed :%u", 1619 control->ras_num_recs, control->ras_max_record_count); 1620 return -EINVAL; 1621 } 1622 1623 control->ras_fri = RAS_OFFSET_TO_INDEX(control, hdr->first_rec_offset); 1624 control->ras_num_mca_recs = 0; 1625 control->ras_num_pa_recs = 0; 1626 return 0; 1627 } 1628 1629 static int amdgpu_ras_smu_eeprom_check(struct amdgpu_ras_eeprom_control *control) 1630 { 1631 struct amdgpu_device *adev = to_amdgpu_device(control); 1632 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 1633 1634 if (!__is_ras_eeprom_supported(adev)) 1635 return 0; 1636 1637 control->ras_num_bad_pages = ras->bad_page_num; 1638 1639 if ((ras->bad_page_cnt_threshold < control->ras_num_bad_pages) && 1640 amdgpu_bad_page_threshold != 0) { 1641 dev_warn(adev->dev, 1642 "RAS records:%d exceed threshold:%d\n", 1643 control->ras_num_bad_pages, ras->bad_page_cnt_threshold); 1644 if ((amdgpu_bad_page_threshold == -1) || 1645 (amdgpu_bad_page_threshold == -2)) { 1646 dev_warn(adev->dev, 1647 "Please consult AMD Service Action Guide (SAG) for appropriate service procedures\n"); 1648 } else { 1649 ras->is_rma = true; 1650 dev_warn(adev->dev, 1651 "User defined threshold is set, runtime service will be halt when threshold is reached\n"); 1652 } 1653 1654 return 0; 1655 } 1656 1657 dev_dbg(adev->dev, 1658 "Found existing EEPROM table with %d records", 1659 control->ras_num_bad_pages); 1660 1661 /* Warn if we are at 90% of the threshold or above 1662 */ 1663 if (10 * control->ras_num_bad_pages >= 9 * ras->bad_page_cnt_threshold) 1664 dev_warn(adev->dev, "RAS records:%u exceeds 90%% of threshold:%d", 1665 control->ras_num_bad_pages, 1666 ras->bad_page_cnt_threshold); 1667 return 0; 1668 } 1669 1670 int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control) 1671 { 1672 struct amdgpu_device *adev = to_amdgpu_device(control); 1673 struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr; 1674 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 1675 int res = 0; 1676 1677 if (amdgpu_ras_smu_eeprom_supported(adev)) 1678 return amdgpu_ras_smu_eeprom_check(control); 1679 1680 if (!__is_ras_eeprom_supported(adev)) 1681 return 0; 1682 1683 /* Verify i2c adapter is initialized */ 1684 if (!adev->pm.ras_eeprom_i2c_bus || !adev->pm.ras_eeprom_i2c_bus->algo) 1685 return -ENOENT; 1686 1687 if (!__get_eeprom_i2c_addr(adev, control)) 1688 return -EINVAL; 1689 1690 control->ras_num_bad_pages = ras->bad_page_num; 1691 1692 if (hdr->header == RAS_TABLE_HDR_VAL) { 1693 dev_dbg(adev->dev, 1694 "Found existing EEPROM table with %d records", 1695 control->ras_num_bad_pages); 1696 1697 if (hdr->version >= RAS_TABLE_VER_V2_1) { 1698 res = __read_table_ras_info(control); 1699 if (res) 1700 return res; 1701 } 1702 1703 res = __verify_ras_table_checksum(control); 1704 if (res) 1705 dev_err(adev->dev, 1706 "RAS table incorrect checksum or error:%d\n", 1707 res); 1708 1709 /* Warn if we are at 90% of the threshold or above 1710 */ 1711 if (10 * control->ras_num_bad_pages >= 9 * ras->bad_page_cnt_threshold) 1712 dev_warn(adev->dev, "RAS records:%u exceeds 90%% of threshold:%d", 1713 control->ras_num_bad_pages, 1714 ras->bad_page_cnt_threshold); 1715 } else if (hdr->header == RAS_TABLE_HDR_BAD && 1716 amdgpu_bad_page_threshold != 0) { 1717 if (hdr->version >= RAS_TABLE_VER_V2_1) { 1718 res = __read_table_ras_info(control); 1719 if (res) 1720 return res; 1721 } 1722 1723 res = __verify_ras_table_checksum(control); 1724 if (res) { 1725 dev_err(adev->dev, 1726 "RAS Table incorrect checksum or error:%d\n", 1727 res); 1728 return -EINVAL; 1729 } 1730 if (ras->bad_page_cnt_threshold >= control->ras_num_bad_pages) { 1731 /* This means that, the threshold was increased since 1732 * the last time the system was booted, and now, 1733 * ras->bad_page_cnt_threshold - control->num_recs > 0, 1734 * so that at least one more record can be saved, 1735 * before the page count threshold is reached. 1736 */ 1737 dev_info(adev->dev, 1738 "records:%d threshold:%d, resetting " 1739 "RAS table header signature", 1740 control->ras_num_bad_pages, 1741 ras->bad_page_cnt_threshold); 1742 res = amdgpu_ras_eeprom_correct_header_tag(control, 1743 RAS_TABLE_HDR_VAL); 1744 } else { 1745 dev_warn(adev->dev, 1746 "RAS records:%d exceed threshold:%d\n", 1747 control->ras_num_bad_pages, ras->bad_page_cnt_threshold); 1748 if ((amdgpu_bad_page_threshold == -1) || 1749 (amdgpu_bad_page_threshold == -2)) { 1750 res = 0; 1751 dev_warn(adev->dev, 1752 "Please consult AMD Service Action Guide (SAG) for appropriate service procedures\n"); 1753 } else { 1754 ras->is_rma = true; 1755 dev_warn(adev->dev, 1756 "User defined threshold is set, runtime service will be halt when threshold is reached\n"); 1757 } 1758 } 1759 } 1760 1761 return res < 0 ? res : 0; 1762 } 1763 1764 void amdgpu_ras_eeprom_check_and_recover(struct amdgpu_device *adev) 1765 { 1766 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 1767 struct amdgpu_ras_eeprom_control *control; 1768 int res; 1769 1770 if (!__is_ras_eeprom_supported(adev) || !ras || 1771 amdgpu_ras_smu_eeprom_supported(adev)) 1772 return; 1773 control = &ras->eeprom_control; 1774 if (!control->is_eeprom_valid) 1775 return; 1776 res = __verify_ras_table_checksum(control); 1777 if (res) { 1778 dev_warn(adev->dev, 1779 "RAS table incorrect checksum or error:%d, try to recover\n", 1780 res); 1781 if (!amdgpu_ras_eeprom_reset_table(control)) 1782 if (!amdgpu_ras_save_bad_pages(adev, NULL)) 1783 if (!__verify_ras_table_checksum(control)) { 1784 dev_info(adev->dev, "RAS table recovery succeed\n"); 1785 return; 1786 } 1787 dev_err(adev->dev, "RAS table recovery failed\n"); 1788 control->is_eeprom_valid = false; 1789 } 1790 return; 1791 } 1792 1793 static const struct ras_smu_drv *amdgpu_ras_get_smu_ras_drv(struct amdgpu_device *adev) 1794 { 1795 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 1796 1797 if (!ras) 1798 return NULL; 1799 1800 return ras->ras_smu_drv; 1801 } 1802 1803 static uint64_t amdgpu_ras_smu_get_feature_flags(struct amdgpu_device *adev) 1804 { 1805 const struct ras_smu_drv *ras_smu_drv = amdgpu_ras_get_smu_ras_drv(adev); 1806 uint64_t flags = 0ULL; 1807 1808 if (!ras_smu_drv) 1809 goto out; 1810 1811 if (ras_smu_drv->ras_smu_feature_flags) 1812 ras_smu_drv->ras_smu_feature_flags(adev, &flags); 1813 1814 out: 1815 return flags; 1816 } 1817 1818 bool amdgpu_ras_smu_eeprom_supported(struct amdgpu_device *adev) 1819 { 1820 const struct ras_smu_drv *smu_ras_drv = amdgpu_ras_get_smu_ras_drv(adev); 1821 uint64_t flags = 0ULL; 1822 1823 if (!__is_ras_eeprom_supported(adev) || !smu_ras_drv) 1824 return false; 1825 1826 if (!smu_ras_drv->smu_eeprom_funcs) 1827 return false; 1828 1829 flags = amdgpu_ras_smu_get_feature_flags(adev); 1830 1831 return !!(flags & RAS_SMU_FEATURE_BIT__RAS_EEPROM); 1832 } 1833 1834 int amdgpu_ras_smu_get_table_version(struct amdgpu_device *adev, 1835 uint32_t *table_version) 1836 { 1837 const struct ras_smu_drv *smu_ras_drv = amdgpu_ras_get_smu_ras_drv(adev); 1838 1839 if (!amdgpu_ras_smu_eeprom_supported(adev)) 1840 return -EOPNOTSUPP; 1841 1842 if (smu_ras_drv->smu_eeprom_funcs->get_ras_table_version) 1843 return smu_ras_drv->smu_eeprom_funcs->get_ras_table_version(adev, 1844 table_version); 1845 return -EOPNOTSUPP; 1846 } 1847 1848 int amdgpu_ras_smu_get_badpage_count(struct amdgpu_device *adev, 1849 uint32_t *count, uint32_t timeout) 1850 { 1851 const struct ras_smu_drv *smu_ras_drv = amdgpu_ras_get_smu_ras_drv(adev); 1852 1853 if (!amdgpu_ras_smu_eeprom_supported(adev)) 1854 return -EOPNOTSUPP; 1855 1856 if (smu_ras_drv->smu_eeprom_funcs->get_badpage_count) 1857 return smu_ras_drv->smu_eeprom_funcs->get_badpage_count(adev, 1858 count, timeout); 1859 return -EOPNOTSUPP; 1860 } 1861 1862 int amdgpu_ras_smu_get_badpage_mca_addr(struct amdgpu_device *adev, 1863 uint16_t index, uint64_t *mca_addr) 1864 { 1865 const struct ras_smu_drv *smu_ras_drv = amdgpu_ras_get_smu_ras_drv(adev); 1866 1867 if (!amdgpu_ras_smu_eeprom_supported(adev)) 1868 return -EOPNOTSUPP; 1869 1870 if (smu_ras_drv->smu_eeprom_funcs->get_badpage_mca_addr) 1871 return smu_ras_drv->smu_eeprom_funcs->get_badpage_mca_addr(adev, 1872 index, mca_addr); 1873 return -EOPNOTSUPP; 1874 } 1875 1876 int amdgpu_ras_smu_set_timestamp(struct amdgpu_device *adev, 1877 uint64_t timestamp) 1878 { 1879 const struct ras_smu_drv *smu_ras_drv = amdgpu_ras_get_smu_ras_drv(adev); 1880 1881 if (!amdgpu_ras_smu_eeprom_supported(adev)) 1882 return -EOPNOTSUPP; 1883 1884 if (smu_ras_drv->smu_eeprom_funcs->set_timestamp) 1885 return smu_ras_drv->smu_eeprom_funcs->set_timestamp(adev, 1886 timestamp); 1887 return -EOPNOTSUPP; 1888 } 1889 1890 int amdgpu_ras_smu_get_timestamp(struct amdgpu_device *adev, 1891 uint16_t index, uint64_t *timestamp) 1892 { 1893 const struct ras_smu_drv *smu_ras_drv = amdgpu_ras_get_smu_ras_drv(adev); 1894 1895 if (!amdgpu_ras_smu_eeprom_supported(adev)) 1896 return -EOPNOTSUPP; 1897 1898 if (smu_ras_drv->smu_eeprom_funcs->get_timestamp) 1899 return smu_ras_drv->smu_eeprom_funcs->get_timestamp(adev, 1900 index, timestamp); 1901 return -EOPNOTSUPP; 1902 } 1903 1904 int amdgpu_ras_smu_get_badpage_ipid(struct amdgpu_device *adev, 1905 uint16_t index, uint64_t *ipid) 1906 { 1907 const struct ras_smu_drv *smu_ras_drv = amdgpu_ras_get_smu_ras_drv(adev); 1908 1909 if (!amdgpu_ras_smu_eeprom_supported(adev)) 1910 return -EOPNOTSUPP; 1911 1912 if (smu_ras_drv->smu_eeprom_funcs->get_badpage_ipid) 1913 return smu_ras_drv->smu_eeprom_funcs->get_badpage_ipid(adev, 1914 index, ipid); 1915 return -EOPNOTSUPP; 1916 } 1917 1918 int amdgpu_ras_smu_erase_ras_table(struct amdgpu_device *adev, 1919 uint32_t *result) 1920 { 1921 const struct ras_smu_drv *smu_ras_drv = amdgpu_ras_get_smu_ras_drv(adev); 1922 1923 if (!amdgpu_ras_smu_eeprom_supported(adev)) 1924 return -EOPNOTSUPP; 1925 1926 if (smu_ras_drv->smu_eeprom_funcs->erase_ras_table) 1927 return smu_ras_drv->smu_eeprom_funcs->erase_ras_table(adev, 1928 result); 1929 return -EOPNOTSUPP; 1930 } 1931