// SPDX-License-Identifier: MIT
/*
 * Copyright 2025 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */
#include "ras.h"
#include "ras_process.h"

/* The event fifo holds up to 128 queued RAS event requests */
#define RAS_EVENT_FIFO_SIZE	(128 * sizeof(struct ras_event_req))

/* Interval (in ms) between ECC polling passes of the worker thread */
#define RAS_POLLING_ECC_TIMEOUT	300

static int ras_process_put_event(struct ras_core_context *ras_core,
				 struct ras_event_req *req)
{
	struct ras_process *ras_proc = &ras_core->ras_proc;
	int ret;

	/* kfifo_in_spinlocked() returns the number of bytes copied in;
	 * zero means the fifo had no room for the whole record.
	 */
	ret = kfifo_in_spinlocked(&ras_proc->event_fifo,
				  req, sizeof(*req), &ras_proc->fifo_spinlock);
	if (!ret) {
		RAS_DEV_ERR(ras_core->dev, "Poison message fifo is full!\n");
		return -ENOSPC;
	}

	return 0;
}

static int ras_process_add_reset_gpu_event(struct ras_core_context *ras_core,
					   uint32_t reset_cause)
{
	struct ras_event_req req = {0};

	req.reset = reset_cause;

	return ras_process_put_event(ras_core, &req);
}

static int ras_process_get_event(struct ras_core_context *ras_core,
				 struct ras_event_req *req)
{
	struct ras_process *ras_proc = &ras_core->ras_proc;

	return kfifo_out_spinlocked(&ras_proc->event_fifo,
				    req, sizeof(*req), &ras_proc->fifo_spinlock);
}

static void ras_process_clear_event_fifo(struct ras_core_context *ras_core)
{
	struct ras_event_req req;
	int ret;

	/* Drain until kfifo_out_spinlocked() reports no more data */
	do {
		ret = ras_process_get_event(ras_core, &req);
	} while (ret);
}

#define AMDGPU_RAS_WAITING_DATA_READY	200
static int ras_process_umc_event(struct ras_core_context *ras_core,
				 uint32_t event_count)
{
	struct ras_ecc_count ecc_data;
	int ret = 0;
	uint32_t timeout = 0;
	uint32_t detected_de_count = 0;

	/* Poll until every signalled UMC interrupt has a matching
	 * deferred-error record, or the wait for data times out.
	 */
	do {
		memset(&ecc_data, 0, sizeof(ecc_data));
		ret = ras_core_update_ecc_info(ras_core);
		if (ret)
			return ret;

		ret = ras_core_query_block_ecc_data(ras_core,
						    RAS_BLOCK_ID__UMC, &ecc_data);
		if (ret)
			return ret;

		if (ecc_data.new_de_count) {
			detected_de_count += ecc_data.new_de_count;
			timeout = 0;
		} else {
			if (!timeout && event_count)
				timeout = AMDGPU_RAS_WAITING_DATA_READY;

			if (timeout) {
				if (!--timeout)
					break;

				msleep(1);
			}
		}
	} while (detected_de_count < event_count);

	if (detected_de_count && ras_core_gpu_is_rma(ras_core))
		ras_process_add_reset_gpu_event(ras_core, GPU_RESET_CAUSE_RMA);

	return 0;
}
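/*
 * Drain all queued non-UMC poison consumption events, notify the
 * registered consumers, and issue a single GPU reset request that
 * aggregates every reset cause collected from the drained events.
 */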
static int ras_process_non_umc_event(struct ras_core_context *ras_core)
{
	struct ras_process *ras_proc = &ras_core->ras_proc;
	struct ras_event_req req;
	uint32_t event_count = kfifo_len(&ras_proc->event_fifo);
	uint32_t reset_flags = 0;
	int ret = 0, i;

	for (i = 0; i < event_count; i++) {
		memset(&req, 0, sizeof(req));
		/* A zero return means the fifo entry could not be read;
		 * do not let the kfifo byte count leak into ret.
		 */
		if (!ras_process_get_event(ras_core, &req))
			continue;

		ras_core_event_notify(ras_core,
				      RAS_EVENT_ID__POISON_CONSUMPTION, &req);

		reset_flags |= req.reset;

		if (req.reset == GPU_RESET_CAUSE_RMA)
			continue;

		if (req.reset)
			RAS_DEV_INFO(ras_core->dev,
				     "{%llu} GPU reset for %s RAS poison consumption is issued!\n",
				     req.seqno, ras_core_get_ras_block_name(req.block));
		else
			RAS_DEV_INFO(ras_core->dev,
				     "{%llu} %s RAS poison consumption is issued!\n",
				     req.seqno, ras_core_get_ras_block_name(req.block));
	}

	if (reset_flags) {
		ret = ras_core_event_notify(ras_core,
					    RAS_EVENT_ID__RESET_GPU, &reset_flags);
		if (!ret && (reset_flags & GPU_RESET_CAUSE_RMA))
			return -RAS_CORE_GPU_IN_MODE1_RESET;
	}

	return ret;
}

int ras_process_handle_ras_event(struct ras_core_context *ras_core)
{
	struct ras_process *ras_proc = &ras_core->ras_proc;
	uint32_t umc_event_count;
	int ret;

	ret = ras_core_event_notify(ras_core,
				    RAS_EVENT_ID__RAS_EVENT_PROC_BEGIN, NULL);
	if (ret)
		return ret;

	ras_aca_clear_fatal_flag(ras_core);
	ras_umc_log_pending_bad_bank(ras_core);

	/* Consume UMC interrupts first; new interrupts may arrive while
	 * the previous batch is being processed, so loop until the
	 * pending count drops to zero.
	 */
	do {
		umc_event_count = atomic_read(&ras_proc->umc_interrupt_count);
		ret = ras_process_umc_event(ras_core, umc_event_count);
		if (ret == -RAS_CORE_GPU_IN_MODE1_RESET)
			break;

		if (umc_event_count)
			atomic_sub(umc_event_count, &ras_proc->umc_interrupt_count);
	} while (atomic_read(&ras_proc->umc_interrupt_count));

	if ((ret != -RAS_CORE_GPU_IN_MODE1_RESET) &&
	    (kfifo_len(&ras_proc->event_fifo)))
		ret = ras_process_non_umc_event(ras_core);

	if (ret == -RAS_CORE_GPU_IN_MODE1_RESET) {
		/* Mode-1 reset wipes device state; drop all stale requests */
		ras_process_clear_event_fifo(ras_core);
		atomic_set(&ras_proc->umc_interrupt_count, 0);
	}

	ras_core_event_notify(ras_core,
			      RAS_EVENT_ID__RAS_EVENT_PROC_END, NULL);
	return ret;
}

static int thread_wait_condition(void *param)
{
	struct ras_process *ras_proc = (struct ras_process *)param;

	return (kthread_should_stop() ||
		atomic_read(&ras_proc->ras_interrupt_req));
}

static int ras_process_thread(void *context)
{
	struct ras_core_context *ras_core = (struct ras_core_context *)context;
	struct ras_process *ras_proc = &ras_core->ras_proc;

	while (!kthread_should_stop()) {
		/* Wake up on an interrupt request, or fall through on the
		 * polling timeout to pick up silently logged ECC errors.
		 */
		ras_wait_event_interruptible_timeout(&ras_proc->ras_process_wq,
				thread_wait_condition, ras_proc,
				msecs_to_jiffies(RAS_POLLING_ECC_TIMEOUT));

		if (kthread_should_stop())
			break;

		if (!ras_core->is_initialized)
			continue;

		atomic_set(&ras_proc->ras_interrupt_req, 0);

		if (ras_core_gpu_in_reset(ras_core))
			continue;

		if (ras_core->sys_fn && ras_core->sys_fn->async_handle_ras_event)
			ras_core->sys_fn->async_handle_ras_event(ras_core, NULL);
		else
			ras_process_handle_ras_event(ras_core);
	}

	return 0;
}
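/*
 * Allocate the event fifo, set up the wait queue and spawn the worker
 * thread. On failure, partially initialized state is torn down through
 * ras_process_fini().
 */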
int ras_process_init(struct ras_core_context *ras_core)
{
	struct ras_process *ras_proc = &ras_core->ras_proc;
	int ret;

	ret = kfifo_alloc(&ras_proc->event_fifo, RAS_EVENT_FIFO_SIZE, GFP_KERNEL);
	if (ret)
		return ret;

	spin_lock_init(&ras_proc->fifo_spinlock);

	init_waitqueue_head(&ras_proc->ras_process_wq);

	/* kthread_run() returns an ERR_PTR() on failure, never NULL */
	ras_proc->ras_process_thread = kthread_run(ras_process_thread,
			(void *)ras_core, "ras_process_thread");
	if (IS_ERR(ras_proc->ras_process_thread)) {
		ret = PTR_ERR(ras_proc->ras_process_thread);
		ras_proc->ras_process_thread = NULL;
		RAS_DEV_ERR(ras_core->dev, "Failed to create ras_process_thread.\n");
		goto err;
	}

	return 0;

err:
	ras_process_fini(ras_core);
	return ret;
}

int ras_process_fini(struct ras_core_context *ras_core)
{
	struct ras_process *ras_proc = &ras_core->ras_proc;

	if (ras_proc->ras_process_thread) {
		kthread_stop(ras_proc->ras_process_thread);
		ras_proc->ras_process_thread = NULL;
	}

	kfifo_free(&ras_proc->event_fifo);

	return 0;
}

static int ras_process_add_umc_interrupt_req(struct ras_core_context *ras_core,
					     struct ras_event_req *req)
{
	struct ras_process *ras_proc = &ras_core->ras_proc;

	/* UMC events are counted rather than queued; the worker thread
	 * derives the details from the ECC data instead of the fifo.
	 */
	atomic_inc(&ras_proc->umc_interrupt_count);
	atomic_inc(&ras_proc->ras_interrupt_req);

	wake_up(&ras_proc->ras_process_wq);
	return 0;
}

static int ras_process_add_non_umc_interrupt_req(struct ras_core_context *ras_core,
						 struct ras_event_req *req)
{
	struct ras_process *ras_proc = &ras_core->ras_proc;
	int ret;

	ret = ras_process_put_event(ras_core, req);
	if (!ret) {
		atomic_inc(&ras_proc->ras_interrupt_req);
		wake_up(&ras_proc->ras_process_wq);
	}

	return ret;
}

int ras_process_add_interrupt_req(struct ras_core_context *ras_core,
				  struct ras_event_req *req, bool is_umc)
{
	int ret;

	if (!ras_core)
		return -EINVAL;

	if (!ras_core->is_initialized)
		return -EPERM;

	if (is_umc)
		ret = ras_process_add_umc_interrupt_req(ras_core, req);
	else
		ret = ras_process_add_non_umc_interrupt_req(ras_core, req);

	return ret;
}