1 // SPDX-License-Identifier: MIT 2 /* 3 * Copyright 2025 Advanced Micro Devices, Inc. 4 * 5 * Permission is hereby granted, free of charge, to any person obtaining a 6 * copy of this software and associated documentation files (the "Software"), 7 * to deal in the Software without restriction, including without limitation 8 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 * and/or sell copies of the Software, and to permit persons to whom the 10 * Software is furnished to do so, subject to the following conditions: 11 * 12 * The above copyright notice and this permission notice shall be included in 13 * all copies or substantial portions of the Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 19 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 20 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 21 * OTHER DEALINGS IN THE SOFTWARE. 22 * 23 */ 24 #include "ras.h" 25 #include "ras_cmd.h" 26 27 #define RAS_CMD_MAJOR_VERSION 6 28 #define RAS_CMD_MINOR_VERSION 0 29 #define RAS_CMD_VERSION (((RAS_CMD_MAJOR_VERSION) << 10) | (RAS_CMD_MINOR_VERSION)) 30 31 static int ras_cmd_add_device(struct ras_core_context *ras_core) 32 { 33 INIT_LIST_HEAD(&ras_core->ras_cmd.head); 34 ras_core->ras_cmd.ras_core = ras_core; 35 ras_core->ras_cmd.dev_handle = (uintptr_t)ras_core ^ RAS_CMD_DEV_HANDLE_MAGIC; 36 return 0; 37 } 38 39 static int ras_cmd_remove_device(struct ras_core_context *ras_core) 40 { 41 memset(&ras_core->ras_cmd, 0, sizeof(ras_core->ras_cmd)); 42 return 0; 43 } 44 45 static int ras_get_block_ecc_info(struct ras_core_context *ras_core, 46 struct ras_cmd_ctx *cmd, void *data) 47 { 48 struct ras_cmd_block_ecc_info_req *input_data = 49 (struct ras_cmd_block_ecc_info_req *)cmd->input_buff_raw; 50 struct ras_cmd_block_ecc_info_rsp *output_data = 51 (struct ras_cmd_block_ecc_info_rsp *)cmd->output_buff_raw; 52 struct ras_ecc_count err_data; 53 int ret; 54 55 if (cmd->input_size != sizeof(struct ras_cmd_block_ecc_info_req)) 56 return RAS_CMD__ERROR_INVALID_INPUT_SIZE; 57 58 memset(&err_data, 0, sizeof(err_data)); 59 ret = ras_aca_get_block_ecc_count(ras_core, input_data->block_id, &err_data); 60 if (ret) 61 return RAS_CMD__ERROR_GENERIC; 62 63 output_data->ce_count = err_data.total_ce_count; 64 output_data->ue_count = err_data.total_ue_count; 65 output_data->de_count = err_data.total_de_count; 66 67 cmd->output_size = sizeof(struct ras_cmd_block_ecc_info_rsp); 68 return RAS_CMD__SUCCESS; 69 } 70 71 static void ras_cmd_update_bad_page_info(struct ras_cmd_bad_page_record *ras_cmd_record, 72 struct eeprom_umc_record *record) 73 { 74 ras_cmd_record->retired_page = record->cur_nps_retired_row_pfn; 75 ras_cmd_record->ts = record->ts; 76 ras_cmd_record->err_type = record->err_type; 77 ras_cmd_record->mem_channel = record->mem_channel; 78 ras_cmd_record->mcumc_id = record->mcumc_id; 79 ras_cmd_record->address = record->address; 80 ras_cmd_record->bank = record->bank; 81 ras_cmd_record->valid = 1; 82 } 83 84 static int ras_cmd_get_group_bad_pages(struct ras_core_context *ras_core, 85 uint32_t group_index, struct ras_cmd_bad_pages_info_rsp *output_data) 86 { 87 struct eeprom_umc_record record; 88 struct ras_cmd_bad_page_record *ras_cmd_record; 89 uint32_t i = 0, bp_cnt = 0, group_cnt = 0; 90 91 output_data->bp_in_group = 0; 92 output_data->group_index = 0; 93 94 bp_cnt = ras_umc_get_badpage_count(ras_core); 95 if (bp_cnt) { 96 output_data->group_index = group_index; 97 group_cnt = bp_cnt / RAS_CMD_MAX_BAD_PAGES_PER_GROUP 98 + ((bp_cnt % RAS_CMD_MAX_BAD_PAGES_PER_GROUP) ? 1 : 0); 99 100 if (group_index >= group_cnt) 101 return RAS_CMD__ERROR_INVALID_INPUT_DATA; 102 103 i = group_index * RAS_CMD_MAX_BAD_PAGES_PER_GROUP; 104 for (; 105 i < bp_cnt && output_data->bp_in_group < RAS_CMD_MAX_BAD_PAGES_PER_GROUP; 106 i++) { 107 if (ras_umc_get_badpage_record(ras_core, i, &record)) 108 return RAS_CMD__ERROR_GENERIC; 109 110 ras_cmd_record = &output_data->records[i % RAS_CMD_MAX_BAD_PAGES_PER_GROUP]; 111 112 memset(ras_cmd_record, 0, sizeof(*ras_cmd_record)); 113 ras_cmd_update_bad_page_info(ras_cmd_record, &record); 114 output_data->bp_in_group++; 115 } 116 } 117 output_data->bp_total_cnt = bp_cnt; 118 return RAS_CMD__SUCCESS; 119 } 120 121 static int ras_cmd_get_bad_pages(struct ras_core_context *ras_core, 122 struct ras_cmd_ctx *cmd, void *data) 123 { 124 struct ras_cmd_bad_pages_info_req *input_data = 125 (struct ras_cmd_bad_pages_info_req *)cmd->input_buff_raw; 126 struct ras_cmd_bad_pages_info_rsp *output_data = 127 (struct ras_cmd_bad_pages_info_rsp *)cmd->output_buff_raw; 128 int ret; 129 130 if (cmd->input_size != sizeof(struct ras_cmd_bad_pages_info_req)) 131 return RAS_CMD__ERROR_INVALID_INPUT_SIZE; 132 133 ret = ras_cmd_get_group_bad_pages(ras_core, input_data->group_index, output_data); 134 if (ret) 135 return RAS_CMD__ERROR_GENERIC; 136 137 output_data->version = 0; 138 139 cmd->output_size = sizeof(struct ras_cmd_bad_pages_info_rsp); 140 return RAS_CMD__SUCCESS; 141 } 142 143 static int ras_cmd_clear_bad_page_info(struct ras_core_context *ras_core, 144 struct ras_cmd_ctx *cmd, void *data) 145 { 146 if (cmd->input_size != sizeof(struct ras_cmd_dev_handle)) 147 return RAS_CMD__ERROR_INVALID_INPUT_SIZE; 148 149 if (ras_eeprom_reset_table(ras_core)) 150 return RAS_CMD__ERROR_GENERIC; 151 152 if (ras_umc_clean_badpage_data(ras_core)) 153 return RAS_CMD__ERROR_GENERIC; 154 155 return RAS_CMD__SUCCESS; 156 } 157 158 static int ras_cmd_reset_all_error_counts(struct ras_core_context *ras_core, 159 struct ras_cmd_ctx *cmd, void *data) 160 { 161 if (cmd->input_size != sizeof(struct ras_cmd_dev_handle)) 162 return RAS_CMD__ERROR_INVALID_INPUT_SIZE; 163 164 if (ras_aca_clear_all_blocks_ecc_count(ras_core)) 165 return RAS_CMD__ERROR_GENERIC; 166 167 if (ras_umc_clear_logged_ecc(ras_core)) 168 return RAS_CMD__ERROR_GENERIC; 169 170 return RAS_CMD__SUCCESS; 171 } 172 173 static int ras_cmd_get_cper_snapshot(struct ras_core_context *ras_core, 174 struct ras_cmd_ctx *cmd, void *data) 175 { 176 struct ras_cmd_cper_snapshot_rsp *output_data = 177 (struct ras_cmd_cper_snapshot_rsp *)cmd->output_buff_raw; 178 struct ras_log_batch_overview overview; 179 180 if (cmd->input_size != sizeof(struct ras_cmd_cper_snapshot_req)) 181 return RAS_CMD__ERROR_INVALID_INPUT_SIZE; 182 183 ras_log_ring_get_batch_overview(ras_core, &overview); 184 185 output_data->total_cper_num = overview.logged_batch_count; 186 output_data->start_cper_id = overview.first_batch_id; 187 output_data->latest_cper_id = overview.last_batch_id; 188 189 output_data->version = 0; 190 191 cmd->output_size = sizeof(struct ras_cmd_cper_snapshot_rsp); 192 return RAS_CMD__SUCCESS; 193 } 194 195 static int ras_cmd_get_cper_records(struct ras_core_context *ras_core, 196 struct ras_cmd_ctx *cmd, void *data) 197 { 198 struct ras_cmd_cper_record_req *req = 199 (struct ras_cmd_cper_record_req *)cmd->input_buff_raw; 200 struct ras_cmd_cper_record_rsp *rsp = 201 (struct ras_cmd_cper_record_rsp *)cmd->output_buff_raw; 202 struct ras_log_info *trace[MAX_RECORD_PER_BATCH] = {0}; 203 struct ras_log_batch_overview overview; 204 uint32_t offset = 0, real_data_len = 0; 205 uint64_t batch_id; 206 uint8_t *buffer; 207 int ret = 0, i, count; 208 209 if (cmd->input_size != sizeof(struct ras_cmd_cper_record_req)) 210 return RAS_CMD__ERROR_INVALID_INPUT_SIZE; 211 212 if (!req->buf_size || !req->buf_ptr || !req->cper_num) 213 return RAS_CMD__ERROR_INVALID_INPUT_DATA; 214 215 buffer = kzalloc(req->buf_size, GFP_KERNEL); 216 if (!buffer) 217 return RAS_CMD__ERROR_GENERIC; 218 219 ras_log_ring_get_batch_overview(ras_core, &overview); 220 for (i = 0; i < req->cper_num; i++) { 221 batch_id = req->cper_start_id + i; 222 if (batch_id >= overview.last_batch_id) 223 break; 224 225 count = ras_log_ring_get_batch_records(ras_core, batch_id, trace, 226 ARRAY_SIZE(trace)); 227 if (count > 0) { 228 ret = ras_cper_generate_cper(ras_core, trace, count, 229 &buffer[offset], req->buf_size - offset, &real_data_len); 230 if (ret) 231 break; 232 233 offset += real_data_len; 234 } 235 } 236 237 if ((ret && (ret != -ENOMEM)) || 238 copy_to_user(u64_to_user_ptr(req->buf_ptr), buffer, offset)) { 239 kfree(buffer); 240 return RAS_CMD__ERROR_GENERIC; 241 } 242 243 rsp->real_data_size = offset; 244 rsp->real_cper_num = i; 245 rsp->remain_num = (ret == -ENOMEM) ? (req->cper_num - i) : 0; 246 rsp->version = 0; 247 248 cmd->output_size = sizeof(struct ras_cmd_cper_record_rsp); 249 250 kfree(buffer); 251 252 return RAS_CMD__SUCCESS; 253 } 254 255 static int ras_cmd_get_batch_trace_snapshot(struct ras_core_context *ras_core, 256 struct ras_cmd_ctx *cmd, void *data) 257 { 258 struct ras_cmd_batch_trace_snapshot_rsp *rsp = 259 (struct ras_cmd_batch_trace_snapshot_rsp *)cmd->output_buff_raw; 260 struct ras_log_batch_overview overview; 261 262 263 if (cmd->input_size != sizeof(struct ras_cmd_batch_trace_snapshot_req)) 264 return RAS_CMD__ERROR_INVALID_INPUT_SIZE; 265 266 ras_log_ring_get_batch_overview(ras_core, &overview); 267 268 rsp->total_batch_num = overview.logged_batch_count; 269 rsp->start_batch_id = overview.first_batch_id; 270 rsp->latest_batch_id = overview.last_batch_id; 271 rsp->version = 0; 272 273 cmd->output_size = sizeof(struct ras_cmd_batch_trace_snapshot_rsp); 274 return RAS_CMD__SUCCESS; 275 } 276 277 static int ras_cmd_get_batch_trace_records(struct ras_core_context *ras_core, 278 struct ras_cmd_ctx *cmd, void *data) 279 { 280 struct ras_cmd_batch_trace_record_req *input_data = 281 (struct ras_cmd_batch_trace_record_req *)cmd->input_buff_raw; 282 struct ras_cmd_batch_trace_record_rsp *output_data = 283 (struct ras_cmd_batch_trace_record_rsp *)cmd->output_buff_raw; 284 struct ras_log_batch_overview overview; 285 struct ras_log_info *trace_arry[MAX_RECORD_PER_BATCH] = {0}; 286 struct ras_log_info *record; 287 int i, j, count = 0, offset = 0; 288 uint64_t id; 289 bool completed = false; 290 291 if (cmd->input_size != sizeof(struct ras_cmd_batch_trace_record_req)) 292 return RAS_CMD__ERROR_INVALID_INPUT_SIZE; 293 294 if ((!input_data->batch_num) || (input_data->batch_num > RAS_CMD_MAX_BATCH_NUM)) 295 return RAS_CMD__ERROR_INVALID_INPUT_DATA; 296 297 ras_log_ring_get_batch_overview(ras_core, &overview); 298 if ((input_data->start_batch_id < overview.first_batch_id) || 299 (input_data->start_batch_id >= overview.last_batch_id)) 300 return RAS_CMD__ERROR_INVALID_INPUT_SIZE; 301 302 for (i = 0; i < input_data->batch_num; i++) { 303 id = input_data->start_batch_id + i; 304 if (id >= overview.last_batch_id) { 305 completed = true; 306 break; 307 } 308 309 count = ras_log_ring_get_batch_records(ras_core, 310 id, trace_arry, ARRAY_SIZE(trace_arry)); 311 if (count > 0) { 312 if ((offset + count) > RAS_CMD_MAX_TRACE_NUM) 313 break; 314 for (j = 0; j < count; j++) { 315 record = &output_data->records[offset + j]; 316 record->seqno = trace_arry[j]->seqno; 317 record->timestamp = trace_arry[j]->timestamp; 318 record->event = trace_arry[j]->event; 319 memcpy(&record->aca_reg, 320 &trace_arry[j]->aca_reg, sizeof(trace_arry[j]->aca_reg)); 321 } 322 } else { 323 count = 0; 324 } 325 326 output_data->batchs[i].batch_id = id; 327 output_data->batchs[i].offset = offset; 328 output_data->batchs[i].trace_num = count; 329 offset += count; 330 } 331 332 output_data->start_batch_id = input_data->start_batch_id; 333 output_data->real_batch_num = i; 334 output_data->remain_num = completed ? 0 : (input_data->batch_num - i); 335 output_data->version = 0; 336 337 cmd->output_size = sizeof(struct ras_cmd_batch_trace_record_rsp); 338 339 return RAS_CMD__SUCCESS; 340 } 341 342 static enum ras_ta_block __get_ras_ta_block(enum ras_block_id block) 343 { 344 switch (block) { 345 case RAS_BLOCK_ID__UMC: 346 return RAS_TA_BLOCK__UMC; 347 case RAS_BLOCK_ID__SDMA: 348 return RAS_TA_BLOCK__SDMA; 349 case RAS_BLOCK_ID__GFX: 350 return RAS_TA_BLOCK__GFX; 351 case RAS_BLOCK_ID__MMHUB: 352 return RAS_TA_BLOCK__MMHUB; 353 case RAS_BLOCK_ID__ATHUB: 354 return RAS_TA_BLOCK__ATHUB; 355 case RAS_BLOCK_ID__PCIE_BIF: 356 return RAS_TA_BLOCK__PCIE_BIF; 357 case RAS_BLOCK_ID__HDP: 358 return RAS_TA_BLOCK__HDP; 359 case RAS_BLOCK_ID__XGMI_WAFL: 360 return RAS_TA_BLOCK__XGMI_WAFL; 361 case RAS_BLOCK_ID__DF: 362 return RAS_TA_BLOCK__DF; 363 case RAS_BLOCK_ID__SMN: 364 return RAS_TA_BLOCK__SMN; 365 case RAS_BLOCK_ID__SEM: 366 return RAS_TA_BLOCK__SEM; 367 case RAS_BLOCK_ID__MP0: 368 return RAS_TA_BLOCK__MP0; 369 case RAS_BLOCK_ID__MP1: 370 return RAS_TA_BLOCK__MP1; 371 case RAS_BLOCK_ID__FUSE: 372 return RAS_TA_BLOCK__FUSE; 373 case RAS_BLOCK_ID__MCA: 374 return RAS_TA_BLOCK__MCA; 375 case RAS_BLOCK_ID__VCN: 376 return RAS_TA_BLOCK__VCN; 377 case RAS_BLOCK_ID__JPEG: 378 return RAS_TA_BLOCK__JPEG; 379 default: 380 return RAS_TA_BLOCK__UMC; 381 } 382 } 383 384 static enum ras_ta_error_type __get_ras_ta_err_type(enum ras_ecc_err_type error) 385 { 386 switch (error) { 387 case RAS_ECC_ERR__NONE: 388 return RAS_TA_ERROR__NONE; 389 case RAS_ECC_ERR__PARITY: 390 return RAS_TA_ERROR__PARITY; 391 case RAS_ECC_ERR__SINGLE_CORRECTABLE: 392 return RAS_TA_ERROR__SINGLE_CORRECTABLE; 393 case RAS_ECC_ERR__MULTI_UNCORRECTABLE: 394 return RAS_TA_ERROR__MULTI_UNCORRECTABLE; 395 case RAS_ECC_ERR__POISON: 396 return RAS_TA_ERROR__POISON; 397 default: 398 return RAS_TA_ERROR__NONE; 399 } 400 } 401 402 static int ras_cmd_inject_error(struct ras_core_context *ras_core, 403 struct ras_cmd_ctx *cmd, void *data) 404 { 405 struct ras_cmd_inject_error_req *req = 406 (struct ras_cmd_inject_error_req *)cmd->input_buff_raw; 407 struct ras_cmd_inject_error_rsp *output_data = 408 (struct ras_cmd_inject_error_rsp *)cmd->output_buff_raw; 409 int ret = 0; 410 struct ras_ta_trigger_error_input block_info = { 411 .block_id = __get_ras_ta_block(req->block_id), 412 .sub_block_index = req->subblock_id, 413 .inject_error_type = __get_ras_ta_err_type(req->error_type), 414 .address = req->address, 415 .value = req->method, 416 }; 417 418 ret = ras_psp_trigger_error(ras_core, &block_info, req->instance_mask); 419 if (!ret) { 420 output_data->version = 0; 421 output_data->address = block_info.address; 422 cmd->output_size = sizeof(struct ras_cmd_inject_error_rsp); 423 } else { 424 RAS_DEV_ERR(ras_core->dev, "ras inject block %u failed %d\n", req->block_id, ret); 425 ret = RAS_CMD__ERROR_ACCESS_DENIED; 426 } 427 428 return ret; 429 } 430 431 static struct ras_cmd_func_map ras_cmd_maps[] = { 432 {RAS_CMD__INJECT_ERROR, ras_cmd_inject_error}, 433 {RAS_CMD__GET_BLOCK_ECC_STATUS, ras_get_block_ecc_info}, 434 {RAS_CMD__GET_BAD_PAGES, ras_cmd_get_bad_pages}, 435 {RAS_CMD__CLEAR_BAD_PAGE_INFO, ras_cmd_clear_bad_page_info}, 436 {RAS_CMD__RESET_ALL_ERROR_COUNTS, ras_cmd_reset_all_error_counts}, 437 {RAS_CMD__GET_CPER_SNAPSHOT, ras_cmd_get_cper_snapshot}, 438 {RAS_CMD__GET_CPER_RECORD, ras_cmd_get_cper_records}, 439 {RAS_CMD__GET_BATCH_TRACE_SNAPSHOT, ras_cmd_get_batch_trace_snapshot}, 440 {RAS_CMD__GET_BATCH_TRACE_RECORD, ras_cmd_get_batch_trace_records}, 441 }; 442 443 int rascore_handle_cmd(struct ras_core_context *ras_core, 444 struct ras_cmd_ctx *cmd, void *data) 445 { 446 struct ras_cmd_func_map *ras_cmd = NULL; 447 int i; 448 449 for (i = 0; i < ARRAY_SIZE(ras_cmd_maps); i++) { 450 if (cmd->cmd_id == ras_cmd_maps[i].cmd_id) { 451 ras_cmd = &ras_cmd_maps[i]; 452 break; 453 } 454 } 455 456 if (!ras_cmd) 457 return RAS_CMD__ERROR_UKNOWN_CMD; 458 459 return ras_cmd->func(ras_core, cmd, data); 460 } 461 462 int ras_cmd_init(struct ras_core_context *ras_core) 463 { 464 return ras_cmd_add_device(ras_core); 465 } 466 467 int ras_cmd_fini(struct ras_core_context *ras_core) 468 { 469 ras_cmd_remove_device(ras_core); 470 return 0; 471 } 472 473 int ras_cmd_query_interface_info(struct ras_core_context *ras_core, 474 struct ras_query_interface_info_rsp *rsp) 475 { 476 rsp->ras_cmd_major_ver = RAS_CMD_MAJOR_VERSION; 477 rsp->ras_cmd_minor_ver = RAS_CMD_MINOR_VERSION; 478 479 return 0; 480 } 481 482 int ras_cmd_translate_soc_pa_to_bank(struct ras_core_context *ras_core, 483 uint64_t soc_pa, struct ras_fb_bank_addr *bank_addr) 484 { 485 struct umc_bank_addr umc_bank = {0}; 486 int ret; 487 488 ret = ras_umc_translate_soc_pa_and_bank(ras_core, &soc_pa, &umc_bank, false); 489 if (ret) 490 return RAS_CMD__ERROR_GENERIC; 491 492 bank_addr->stack_id = umc_bank.stack_id; 493 bank_addr->bank_group = umc_bank.bank_group; 494 bank_addr->bank = umc_bank.bank; 495 bank_addr->row = umc_bank.row; 496 bank_addr->column = umc_bank.column; 497 bank_addr->channel = umc_bank.channel; 498 bank_addr->subchannel = umc_bank.subchannel; 499 500 return 0; 501 } 502 503 int ras_cmd_translate_bank_to_soc_pa(struct ras_core_context *ras_core, 504 struct ras_fb_bank_addr bank_addr, uint64_t *soc_pa) 505 { 506 struct umc_bank_addr umc_bank = {0}; 507 508 umc_bank.stack_id = bank_addr.stack_id; 509 umc_bank.bank_group = bank_addr.bank_group; 510 umc_bank.bank = bank_addr.bank; 511 umc_bank.row = bank_addr.row; 512 umc_bank.column = bank_addr.column; 513 umc_bank.channel = bank_addr.channel; 514 umc_bank.subchannel = bank_addr.subchannel; 515 516 return ras_umc_translate_soc_pa_and_bank(ras_core, soc_pa, &umc_bank, true); 517 } 518 519 uint64_t ras_cmd_get_dev_handle(struct ras_core_context *ras_core) 520 { 521 return ras_core->ras_cmd.dev_handle; 522 } 523