// SPDX-License-Identifier: MIT
/*
 * Copyright 2025 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */
#include "amdgpu.h"
#include "amdgpu_reset.h"
#include "amdgpu_xgmi.h"
#include "ras_sys.h"
#include "amdgpu_ras_mgr.h"
#include "amdgpu_ras_cmd.h"
#include "amdgpu_ras_process.h"
#include "amdgpu_ras_eeprom_i2c.h"
#include "amdgpu_ras_mp1_v13_0.h"
#include "amdgpu_ras_nbio_v7_9.h"

#define MAX_SOCKET_NUM_PER_HIVE 8
#define MAX_AID_NUM_PER_SOCKET 4
#define MAX_XCD_NUM_PER_AID 2

/* typical ECC bad page rate is 1 bad page per 100MB VRAM */
#define TYPICAL_ECC_BAD_PAGE_RATE (100ULL * SZ_1M)

#define COUNT_BAD_PAGE_THRESHOLD(size) (((size) >> 21) << 4)

/* Reserve 8 physical dram rows for possible retirement.
 * In the worst case, it will lose 8 * 2MB of memory in the vram domain.
 */
#define RAS_RESERVED_VRAM_SIZE_DEFAULT (16ULL << 20)
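
/* Sizing example: COUNT_BAD_PAGE_THRESHOLD(size) works out to
 * (size / 2MB) << 4, i.e. presumably 16 bad page records per reserved
 * 2MB dram row. With the default 16MB reservation above, that gives
 * 8 rows * 16 = 128 records.
 */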

static void ras_mgr_init_event_mgr(struct ras_event_manager *mgr)
{
        struct ras_event_state *event_state;
        int i;

        memset(mgr, 0, sizeof(*mgr));
        atomic64_set(&mgr->seqno, 0);

        for (i = 0; i < ARRAY_SIZE(mgr->event_state); i++) {
                event_state = &mgr->event_state[i];
                event_state->last_seqno = RAS_EVENT_INVALID_ID;
                atomic64_set(&event_state->count, 0);
        }
}

static void amdgpu_ras_mgr_init_event_mgr(struct ras_core_context *ras_core)
{
        struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
        struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
        struct ras_event_manager *event_mgr;
        struct amdgpu_hive_info *hive;

        hive = amdgpu_get_xgmi_hive(adev);
        event_mgr = hive ? &hive->event_mgr : &ras_mgr->ras_event_mgr;

        /* init event manager with node 0 on xgmi system */
        if (!amdgpu_reset_in_recovery(adev)) {
                if (!hive || adev->gmc.xgmi.node_id == 0)
                        ras_mgr_init_event_mgr(event_mgr);
        }

        if (hive)
                amdgpu_put_xgmi_hive(hive);
}

static int amdgpu_ras_mgr_init_aca_config(struct amdgpu_device *adev,
                                          struct ras_core_config *config)
{
        struct ras_aca_config *aca_cfg = &config->aca_cfg;

        aca_cfg->socket_num_per_hive = MAX_SOCKET_NUM_PER_HIVE;
        aca_cfg->aid_num_per_socket = MAX_AID_NUM_PER_SOCKET;
        aca_cfg->xcd_num_per_aid = MAX_XCD_NUM_PER_AID;

        return 0;
}

static int amdgpu_ras_mgr_init_eeprom_config(struct amdgpu_device *adev,
                                             struct ras_core_config *config)
{
        struct ras_eeprom_config *eeprom_cfg = &config->eeprom_cfg;

        eeprom_cfg->eeprom_sys_fn = &amdgpu_ras_eeprom_i2c_sys_func;
        eeprom_cfg->eeprom_i2c_adapter = adev->pm.ras_eeprom_i2c_bus;
        if (eeprom_cfg->eeprom_i2c_adapter) {
                const struct i2c_adapter_quirks *quirks =
                        ((struct i2c_adapter *)eeprom_cfg->eeprom_i2c_adapter)->quirks;

                if (quirks) {
                        eeprom_cfg->max_i2c_read_len = quirks->max_read_len;
                        eeprom_cfg->max_i2c_write_len = quirks->max_write_len;
                }
        }

        /*
         * amdgpu_bad_page_threshold is used to config
         * the threshold for the number of bad pages.
         * -1: Threshold is set to default value
         *     Driver will issue a warning message when threshold is reached
         *     and continue runtime services.
         * 0: Disable bad page retirement
         *     Driver will not retire bad pages,
         *     which is intended for debugging purposes.
         * -2: Threshold is determined by a formula
         *     that assumes 1 bad page per 100M of local memory.
         *     Driver will continue runtime services when threshold is reached.
         * 0 < threshold < max number of bad page records in EEPROM:
         *     A user-defined threshold is set.
         *     Driver will halt runtime services when this custom threshold is reached.
         */
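        /*
         * For scale: the formula-based mode works out to mc_vram_size / 100MB
         * records (a hypothetical 64GB board would get 655), while the default
         * mode is the fixed COUNT_BAD_PAGE_THRESHOLD(16MB) = 128 records
         * derived from the reserved-row macros above.
         */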
        if (amdgpu_bad_page_threshold == NONSTOP_OVER_THRESHOLD)
                eeprom_cfg->eeprom_record_threshold_count =
                        div64_u64(adev->gmc.mc_vram_size, TYPICAL_ECC_BAD_PAGE_RATE);
        else if (amdgpu_bad_page_threshold == WARN_NONSTOP_OVER_THRESHOLD)
                eeprom_cfg->eeprom_record_threshold_count =
                        COUNT_BAD_PAGE_THRESHOLD(RAS_RESERVED_VRAM_SIZE_DEFAULT);
        else
                eeprom_cfg->eeprom_record_threshold_count = amdgpu_bad_page_threshold;

        eeprom_cfg->eeprom_record_threshold_config = amdgpu_bad_page_threshold;

        return 0;
}

static int amdgpu_ras_mgr_init_mp1_config(struct amdgpu_device *adev,
                                          struct ras_core_config *config)
{
        struct ras_mp1_config *mp1_cfg = &config->mp1_cfg;
        int ret = 0;

        switch (config->mp1_ip_version) {
        case IP_VERSION(13, 0, 6):
        case IP_VERSION(13, 0, 14):
        case IP_VERSION(13, 0, 12):
                mp1_cfg->mp1_sys_fn = &amdgpu_ras_mp1_sys_func_v13_0;
                break;
        default:
                RAS_DEV_ERR(adev,
                            "The mp1 (0x%x) ras config is not supported!\n",
                            config->mp1_ip_version);
                ret = -EINVAL;
                break;
        }

        return ret;
}

static int amdgpu_ras_mgr_init_nbio_config(struct amdgpu_device *adev,
                                           struct ras_core_config *config)
{
        struct ras_nbio_config *nbio_cfg = &config->nbio_cfg;
        int ret = 0;

        switch (config->nbio_ip_version) {
        case IP_VERSION(7, 9, 0):
        case IP_VERSION(7, 9, 1):
                nbio_cfg->nbio_sys_fn = &amdgpu_ras_nbio_sys_func_v7_9;
                break;
        default:
                RAS_DEV_ERR(adev,
                            "The nbio (0x%x) ras config is not supported!\n",
                            config->nbio_ip_version);
                ret = -EINVAL;
                break;
        }

        return ret;
}

static int amdgpu_ras_mgr_get_ras_psp_system_status(struct ras_core_context *ras_core,
                                                    struct ras_psp_sys_status *status)
{
        struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
        struct ta_context *context = &adev->psp.ras_context.context;

        status->initialized = context->initialized;
        status->session_id = context->session_id;
        status->psp_cmd_mutex = &adev->psp.mutex;

        return 0;
}

static int amdgpu_ras_mgr_get_ras_ta_init_param(struct ras_core_context *ras_core,
                                                struct ras_ta_init_param *ras_ta_param)
{
        struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
        uint32_t nps_mode;

        if (amdgpu_ras_is_poison_mode_supported(adev))
                ras_ta_param->poison_mode_en = 1;

        if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu)
                ras_ta_param->dgpu_mode = 1;

        ras_ta_param->xcc_mask = adev->gfx.xcc_mask;
        ras_ta_param->channel_dis_num = hweight32(adev->gmc.m_half_use) * 2;

        ras_ta_param->active_umc_mask = adev->umc.active_mask;

        if (!amdgpu_ras_mgr_get_curr_nps_mode(adev, &nps_mode))
                ras_ta_param->nps_mode = nps_mode;

        return 0;
}

const struct ras_psp_sys_func amdgpu_ras_psp_sys_func = {
        .get_ras_psp_system_status = amdgpu_ras_mgr_get_ras_psp_system_status,
        .get_ras_ta_init_param = amdgpu_ras_mgr_get_ras_ta_init_param,
};

static int amdgpu_ras_mgr_init_psp_config(struct amdgpu_device *adev,
                                          struct ras_core_config *config)
{
        struct ras_psp_config *psp_cfg = &config->psp_cfg;

        psp_cfg->psp_sys_fn = &amdgpu_ras_psp_sys_func;

        return 0;
}

static int amdgpu_ras_mgr_init_umc_config(struct amdgpu_device *adev,
                                          struct ras_core_config *config)
{
        struct ras_umc_config *umc_cfg = &config->umc_cfg;

        umc_cfg->umc_vram_type = adev->gmc.vram_type;

        return 0;
}

static struct ras_core_context *amdgpu_ras_mgr_create_ras_core(struct amdgpu_device *adev)
{
        struct ras_core_config init_config;

        memset(&init_config, 0, sizeof(init_config));

        init_config.umc_ip_version = amdgpu_ip_version(adev, UMC_HWIP, 0);
        init_config.mp1_ip_version = amdgpu_ip_version(adev, MP1_HWIP, 0);
        init_config.gfx_ip_version = amdgpu_ip_version(adev, GC_HWIP, 0);
        init_config.nbio_ip_version = amdgpu_ip_version(adev, NBIO_HWIP, 0);
        init_config.psp_ip_version = amdgpu_ip_version(adev, MP1_HWIP, 0);

        if (init_config.umc_ip_version == IP_VERSION(12, 0, 0) ||
            init_config.umc_ip_version == IP_VERSION(12, 5, 0))
                init_config.aca_ip_version = IP_VERSION(1, 0, 0);

        init_config.sys_fn = &amdgpu_ras_sys_fn;
        init_config.ras_eeprom_supported = true;
        init_config.poison_supported =
                amdgpu_ras_is_poison_mode_supported(adev);

        amdgpu_ras_mgr_init_aca_config(adev, &init_config);
        amdgpu_ras_mgr_init_eeprom_config(adev, &init_config);
        amdgpu_ras_mgr_init_mp1_config(adev, &init_config);
        amdgpu_ras_mgr_init_nbio_config(adev, &init_config);
        amdgpu_ras_mgr_init_psp_config(adev, &init_config);
        amdgpu_ras_mgr_init_umc_config(adev, &init_config);

        return ras_core_create(&init_config);
}

static int amdgpu_ras_mgr_sw_init(struct amdgpu_ip_block *ip_block)
{
        struct amdgpu_device *adev = ip_block->adev;
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
        struct amdgpu_ras_mgr *ras_mgr;
        int ret = 0;

        /* Disabled by default */
        con->uniras_enabled = false;

        /* Enabled only in debug mode */
        if (adev->debug_enable_ras_aca) {
                con->uniras_enabled = true;
                RAS_DEV_INFO(adev, "Debug amdgpu uniras!");
        }

        if (!con->uniras_enabled)
                return 0;

        ras_mgr = kzalloc(sizeof(*ras_mgr), GFP_KERNEL);
        if (!ras_mgr)
                return -ENOMEM;

        con->ras_mgr = ras_mgr;
        ras_mgr->adev = adev;

        ras_mgr->ras_core = amdgpu_ras_mgr_create_ras_core(adev);
        if (!ras_mgr->ras_core) {
                RAS_DEV_ERR(adev, "Failed to create ras core!\n");
                ret = -EINVAL;
                goto err;
        }

        ras_mgr->ras_core->dev = adev;

        amdgpu_ras_process_init(adev);
        ras_core_sw_init(ras_mgr->ras_core);
        amdgpu_ras_mgr_init_event_mgr(ras_mgr->ras_core);
        return 0;

err:
        con->ras_mgr = NULL;
        kfree(ras_mgr);
        return ret;
}

static int amdgpu_ras_mgr_sw_fini(struct amdgpu_ip_block *ip_block)
{
        struct amdgpu_device *adev = ip_block->adev;
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
        struct amdgpu_ras_mgr *ras_mgr = (struct amdgpu_ras_mgr *)con->ras_mgr;

        if (!con->uniras_enabled)
                return 0;

        if (!ras_mgr)
                return 0;

        amdgpu_ras_process_fini(adev);
        ras_core_sw_fini(ras_mgr->ras_core);
        ras_core_destroy(ras_mgr->ras_core);
        ras_mgr->ras_core = NULL;

        kfree(con->ras_mgr);
        con->ras_mgr = NULL;

        return 0;
}
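
/* The hw_init/hw_fini hooks below gate the "ready" state of the unified RAS
 * manager: hw_init brings up the ras core, marks ras_is_ready and enables
 * uniras, while hw_fini tears the core back down and clears ras_is_ready.
 */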
static int amdgpu_ras_mgr_hw_init(struct amdgpu_ip_block *ip_block)
{
        struct amdgpu_device *adev = ip_block->adev;
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
        struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
        int ret;

        if (!con->uniras_enabled)
                return 0;

        if (!ras_mgr || !ras_mgr->ras_core)
                return -EINVAL;

        ret = ras_core_hw_init(ras_mgr->ras_core);
        if (ret) {
                RAS_DEV_ERR(adev, "Failed to initialize ras core!\n");
                return ret;
        }

        ras_mgr->ras_is_ready = true;

        amdgpu_enable_uniras(adev, true);

        RAS_DEV_INFO(adev, "AMDGPU RAS is ready.\n");
        return 0;
}

static int amdgpu_ras_mgr_hw_fini(struct amdgpu_ip_block *ip_block)
{
        struct amdgpu_device *adev = ip_block->adev;
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
        struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);

        if (!con->uniras_enabled)
                return 0;

        if (!ras_mgr || !ras_mgr->ras_core)
                return -EINVAL;

        ras_core_hw_fini(ras_mgr->ras_core);

        ras_mgr->ras_is_ready = false;

        return 0;
}

struct amdgpu_ras_mgr *amdgpu_ras_mgr_get_context(struct amdgpu_device *adev)
{
        if (!adev || !adev->psp.ras_context.ras)
                return NULL;

        return (struct amdgpu_ras_mgr *)adev->psp.ras_context.ras->ras_mgr;
}

static const struct amd_ip_funcs __maybe_unused ras_v1_0_ip_funcs = {
        .name = "ras_v1_0",
        .sw_init = amdgpu_ras_mgr_sw_init,
        .sw_fini = amdgpu_ras_mgr_sw_fini,
        .hw_init = amdgpu_ras_mgr_hw_init,
        .hw_fini = amdgpu_ras_mgr_hw_fini,
};

const struct amdgpu_ip_block_version ras_v1_0_ip_block = {
        .type = AMD_IP_BLOCK_TYPE_RAS,
        .major = 1,
        .minor = 0,
        .rev = 0,
        .funcs = &ras_v1_0_ip_funcs,
};

int amdgpu_enable_uniras(struct amdgpu_device *adev, bool enable)
{
        struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);

        if (!ras_mgr || !ras_mgr->ras_core)
                return -EPERM;

        if (amdgpu_sriov_vf(adev))
                return -EPERM;

        RAS_DEV_INFO(adev, "%s amdgpu unified ras!",
                     enable ? "Enable" : "Disable");
        return ras_core_set_status(ras_mgr->ras_core, enable);
}

bool amdgpu_uniras_enabled(struct amdgpu_device *adev)
{
        struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);

        if (!ras_mgr || !ras_mgr->ras_core)
                return false;

        if (amdgpu_sriov_vf(adev))
                return false;

        return ras_core_is_enabled(ras_mgr->ras_core);
}

static bool amdgpu_ras_mgr_is_ready(struct amdgpu_device *adev)
{
        struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);

        if (ras_mgr && ras_mgr->ras_core && ras_mgr->ras_is_ready &&
            ras_core_is_ready(ras_mgr->ras_core))
                return true;

        return false;
}

int amdgpu_ras_mgr_handle_fatal_interrupt(struct amdgpu_device *adev, void *data)
{
        struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);

        if (!amdgpu_ras_mgr_is_ready(adev))
                return -EPERM;

        return ras_core_handle_nbio_irq(ras_mgr->ras_core, data);
}

uint64_t amdgpu_ras_mgr_gen_ras_event_seqno(struct amdgpu_device *adev,
                                            enum ras_seqno_type seqno_type)
{
        struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
        int ret;
        uint64_t seq_no;

        if (!amdgpu_ras_mgr_is_ready(adev) ||
            (seqno_type >= RAS_SEQNO_TYPE_COUNT_MAX))
                return 0;

        seq_no = ras_core_gen_seqno(ras_mgr->ras_core, seqno_type);

        if ((seqno_type == RAS_SEQNO_TYPE_DE) ||
            (seqno_type == RAS_SEQNO_TYPE_POISON_CONSUMPTION)) {
                ret = ras_core_put_seqno(ras_mgr->ras_core, seqno_type, seq_no);
                if (ret)
                        RAS_DEV_WARN(adev, "There are too many ras interrupts!");
        }

        return seq_no;
}

int amdgpu_ras_mgr_handle_controller_interrupt(struct amdgpu_device *adev, void *data)
{
        struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
        struct ras_ih_info *ih_info = (struct ras_ih_info *)data;
        uint64_t seq_no = 0;
        int ret = 0;

        if (!amdgpu_ras_mgr_is_ready(adev))
                return -EPERM;

        if (ih_info && (ih_info->block == AMDGPU_RAS_BLOCK__UMC)) {
                if (ras_mgr->ras_core->poison_supported) {
                        seq_no = amdgpu_ras_mgr_gen_ras_event_seqno(adev, RAS_SEQNO_TYPE_DE);
                        RAS_DEV_INFO(adev,
                                     "{%llu} RAS poison is created, no user action is needed.\n",
                                     seq_no);
                }

                ret = amdgpu_ras_process_handle_umc_interrupt(adev, ih_info);
        } else if (ras_mgr->ras_core->poison_supported) {
                ret = amdgpu_ras_process_handle_unexpected_interrupt(adev, ih_info);
        } else {
                RAS_DEV_WARN(adev,
                             "No RAS interrupt handler for non-UMC block with poison disabled.\n");
        }

        return ret;
}

int amdgpu_ras_mgr_handle_consumer_interrupt(struct amdgpu_device *adev, void *data)
{
        if (!amdgpu_ras_mgr_is_ready(adev))
                return -EPERM;

        return amdgpu_ras_process_handle_consumption_interrupt(adev, data);
}

int amdgpu_ras_mgr_update_ras_ecc(struct amdgpu_device *adev)
{
        struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);

        if (!amdgpu_ras_mgr_is_ready(adev))
                return -EPERM;

        return ras_core_update_ecc_info(ras_mgr->ras_core);
}

int amdgpu_ras_mgr_reset_gpu(struct amdgpu_device *adev, uint32_t flags)
{
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

        if (!amdgpu_ras_mgr_is_ready(adev))
                return -EPERM;

        con->gpu_reset_flags |= flags;
        return amdgpu_ras_reset_gpu(adev);
}

bool amdgpu_ras_mgr_check_eeprom_safety_watermark(struct amdgpu_device *adev)
{
        struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);

        if (!amdgpu_ras_mgr_is_ready(adev))
                return false;

        return ras_eeprom_check_safety_watermark(ras_mgr->ras_core);
}

int amdgpu_ras_mgr_get_curr_nps_mode(struct amdgpu_device *adev,
                                     uint32_t *nps_mode)
{
        struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
        uint32_t mode;

        if (!amdgpu_ras_mgr_is_ready(adev))
                return -EINVAL;

        mode = ras_core_get_curr_nps_mode(ras_mgr->ras_core);
        if (!mode || mode > AMDGPU_NPS8_PARTITION_MODE)
                return -EINVAL;

        *nps_mode = mode;

        return 0;
}

bool amdgpu_ras_mgr_check_retired_addr(struct amdgpu_device *adev,
                                       uint64_t addr)
{
        struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);

        if (!amdgpu_ras_mgr_is_ready(adev))
                return false;

        return ras_umc_check_retired_addr(ras_mgr->ras_core, addr);
}

bool amdgpu_ras_mgr_is_rma(struct amdgpu_device *adev)
{
        struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);

        if (!ras_mgr || !ras_mgr->ras_core || !ras_mgr->ras_is_ready)
                return false;

        return ras_core_gpu_is_rma(ras_mgr->ras_core);
}
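
/* The command path below packs each request into a single page: the
 * ras_cmd_ctx header sits at the start, and (as far as this file shows) the
 * raw input/output payloads share the remaining PAGE_SIZE - sizeof(*cmd_ctx)
 * bytes, which is what output_buf_size is set to. Callers are assumed to keep
 * input_size within that budget and to pass out_size equal to the size the
 * command actually produced, since the result is only copied back when
 * out_size == output_size and the command reported success.
 */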
int amdgpu_ras_mgr_handle_ras_cmd(struct amdgpu_device *adev,
                                  uint32_t cmd_id, void *input, uint32_t input_size,
                                  void *output, uint32_t out_size)
{
        struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
        struct ras_cmd_ctx *cmd_ctx;
        uint32_t ctx_buf_size = PAGE_SIZE;
        int ret;

        if (!amdgpu_ras_mgr_is_ready(adev))
                return -EPERM;

        cmd_ctx = kzalloc(ctx_buf_size, GFP_KERNEL);
        if (!cmd_ctx)
                return -ENOMEM;

        cmd_ctx->cmd_id = cmd_id;

        memcpy(cmd_ctx->input_buff_raw, input, input_size);
        cmd_ctx->input_size = input_size;
        cmd_ctx->output_buf_size = ctx_buf_size - sizeof(*cmd_ctx);

        ret = amdgpu_ras_submit_cmd(ras_mgr->ras_core, cmd_ctx);
        if (!ret && !cmd_ctx->cmd_res && output && (out_size == cmd_ctx->output_size))
                memcpy(output, cmd_ctx->output_buff_raw, cmd_ctx->output_size);

        kfree(cmd_ctx);

        return ret;
}

int amdgpu_ras_mgr_pre_reset(struct amdgpu_device *adev)
{
        if (!amdgpu_ras_mgr_is_ready(adev)) {
                RAS_DEV_ERR(adev, "Invalid ras pre reset!\n");
                return -EPERM;
        }

        amdgpu_ras_process_pre_reset(adev);
        return 0;
}

int amdgpu_ras_mgr_post_reset(struct amdgpu_device *adev)
{
        if (!amdgpu_ras_mgr_is_ready(adev)) {
                RAS_DEV_ERR(adev, "Invalid ras post reset!\n");
                return -EPERM;
        }

        amdgpu_ras_process_post_reset(adev);
        return 0;
}