1 // SPDX-License-Identifier: GPL-2.0 OR MIT 2 /* 3 * Copyright 2025 Advanced Micro Devices, Inc. 4 * 5 * Permission is hereby granted, free of charge, to any person obtaining a 6 * copy of this software and associated documentation files (the "Software"), 7 * to deal in the Software without restriction, including without limitation 8 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 * and/or sell copies of the Software, and to permit persons to whom the 10 * Software is furnished to do so, subject to the following conditions: 11 * 12 * The above copyright notice and this permission notice shall be included in 13 * all copies or substantial portions of the Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 19 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 20 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 21 * OTHER DEALINGS IN THE SOFTWARE. 22 */ 23 24 #include "kfd_priv.h" 25 #include "kfd_events.h" 26 #include "soc15_int.h" 27 #include "kfd_device_queue_manager.h" 28 #include "ivsrcid/vmc/irqsrcs_vmc_1_0.h" 29 #include "kfd_smi_events.h" 30 #include "kfd_debug.h" 31 32 /* 33 * GFX12.1 SQ Interrupts 34 * 35 * There are 3 encoding types of interrupts sourced from SQ sent as a 44-bit 36 * packet to the Interrupt Handler: 37 * Auto - Generated by the SQG (various cmd overflows, timestamps etc) 38 * Wave - Generated by S_SENDMSG through a shader program 39 * Error - HW generated errors (Illegal instructions, Memviols, EDC etc) 40 * 41 * The 44-bit packet is mapped as {context_id1[7:0],context_id0[31:0]} plus 42 * 4-bits for VMID (SOC15_VMID_FROM_IH_ENTRY) as such: 43 * 44 * - context_id1[7:6] 45 * Encoding type (0 = Auto, 1 = Wave, 2 = Error) 46 * 47 * - context_id0[26] 48 * PRIV bit indicates that Wave S_SEND or error occurred within trap 49 * 50 * - context_id0[24:0] 51 * 25-bit data with the following layout per encoding type: 52 * Auto - only context_id0[8:0] is used, which reports various interrupts 53 * generated by SQG. The rest is 0. 54 * Wave - user data sent from m0 via S_SENDMSG (context_id0[23:0]) 55 * Error - Error Type (context_id0[24:21]), Error Details (context_id0[20:0]) 56 * 57 * The other context_id bits show coordinates (SE/SH/CU/SIMD/WGP) for wave 58 * S_SENDMSG and Errors. These are 0 for Auto. 59 */ 60 61 enum SQ_INTERRUPT_WORD_ENCODING { 62 SQ_INTERRUPT_WORD_ENCODING_AUTO = 0x0, 63 SQ_INTERRUPT_WORD_ENCODING_INST, 64 SQ_INTERRUPT_WORD_ENCODING_ERROR, 65 }; 66 67 enum SQ_INTERRUPT_ERROR_TYPE { 68 SQ_INTERRUPT_ERROR_TYPE_EDC_FUE = 0x0, 69 SQ_INTERRUPT_ERROR_TYPE_ILLEGAL_INST, 70 SQ_INTERRUPT_ERROR_TYPE_MEMVIOL, 71 SQ_INTERRUPT_ERROR_TYPE_EDC_FED, 72 }; 73 74 /* SQ_INTERRUPT_WORD_AUTO_CTXID */ 75 #define SQ_INTERRUPT_WORD_AUTO_CTXID0__THREAD_TRACE__SHIFT 0 76 #define SQ_INTERRUPT_WORD_AUTO_CTXID0__WLT__SHIFT 1 77 #define SQ_INTERRUPT_WORD_AUTO_CTXID0__THREAD_TRACE_BUF0_FULL__SHIFT 2 78 #define SQ_INTERRUPT_WORD_AUTO_CTXID0__THREAD_TRACE_BUF1_FULL__SHIFT 3 79 #define SQ_INTERRUPT_WORD_AUTO_CTXID0__THREAD_TRACE_UTC_ERROR__SHIFT 8 80 #define SQ_INTERRUPT_WORD_AUTO_CTXID1__ENCODING__SHIFT 6 81 82 #define SQ_INTERRUPT_WORD_AUTO_CTXID0__THREAD_TRACE_MASK 0x00000001 83 #define SQ_INTERRUPT_WORD_AUTO_CTXID0__WLT_MASK 0x00000002 84 #define SQ_INTERRUPT_WORD_AUTO_CTXID0__THREAD_TRACE_BUF0_FULL_MASK 0x00000004 85 #define SQ_INTERRUPT_WORD_AUTO_CTXID0__THREAD_TRACE_BUF1_FULL_MASK 0x00000008 86 #define SQ_INTERRUPT_WORD_AUTO_CTXID0__THREAD_TRACE_UTC_ERROR_MASK 0x00000100 87 #define SQ_INTERRUPT_WORD_AUTO_CTXID1__ENCODING_MASK 0x000000c0 88 89 /* SQ_INTERRUPT_WORD_WAVE_CTXID */ 90 #define SQ_INTERRUPT_WORD_WAVE_CTXID0__DATA__SHIFT 0 91 #define SQ_INTERRUPT_WORD_WAVE_CTXID0__SA_ID__SHIFT 25 92 #define SQ_INTERRUPT_WORD_WAVE_CTXID0__PRIV__SHIFT 26 93 #define SQ_INTERRUPT_WORD_WAVE_CTXID0__WAVE_ID__SHIFT 27 94 #define SQ_INTERRUPT_WORD_WAVE_CTXID1__SIMD_ID__SHIFT 0 95 #define SQ_INTERRUPT_WORD_WAVE_CTXID1__WGP_ID__SHIFT 2 96 #define SQ_INTERRUPT_WORD_WAVE_CTXID1__ENCODING__SHIFT 6 97 98 #define SQ_INTERRUPT_WORD_WAVE_CTXID0__DATA_MASK 0x00ffffff /* [23:0] */ 99 #define SQ_INTERRUPT_WORD_WAVE_CTXID0__SA_ID_MASK 0x02000000 /* [25] */ 100 #define SQ_INTERRUPT_WORD_WAVE_CTXID0__PRIV_MASK 0x04000000 /* [26] */ 101 #define SQ_INTERRUPT_WORD_WAVE_CTXID0__WAVE_ID_MASK 0xf8000000 /* [31:27] */ 102 #define SQ_INTERRUPT_WORD_WAVE_CTXID1__SIMD_ID_MASK 0x00000003 /* [33:32] */ 103 #define SQ_INTERRUPT_WORD_WAVE_CTXID1__WGP_ID_MASK 0x0000003c /* [37:34] */ 104 #define SQ_INTERRUPT_WORD_WAVE_CTXID1__ENCODING_MASK 0x000000c0 /* [39:38] */ 105 106 /* SQ_INTERRUPT_WORD_ERROR_CTXID */ 107 #define SQ_INTERRUPT_WORD_ERROR_CTXID0__DETAIL__SHIFT 0 108 #define SQ_INTERRUPT_WORD_ERROR_CTXID0__MEM_VIOL__SHIFT 19 109 #define SQ_INTERRUPT_WORD_ERROR_CTXID0__TYPE__SHIFT 21 110 #define SQ_INTERRUPT_WORD_ERROR_CTXID0__SA_ID__SHIFT 25 111 #define SQ_INTERRUPT_WORD_ERROR_CTXID0__PRIV__SHIFT 26 112 #define SQ_INTERRUPT_WORD_ERROR_CTXID0__WAVE_ID__SHIFT 27 113 #define SQ_INTERRUPT_WORD_ERROR_CTXID1__SIMD_ID__SHIFT 0 114 #define SQ_INTERRUPT_WORD_ERROR_CTXID1__WGP_ID__SHIFT 2 115 #define SQ_INTERRUPT_WORD_ERROR_CTXID1__ENCODING__SHIFT 6 116 117 #define SQ_INTERRUPT_WORD_ERROR_CTXID0__DETAIL_MASK 0x0007ffff /* [18:0] */ 118 #define SQ_INTERRUPT_WORD_ERROR_CTXID0__MEM_VIOL_MASK 0x00180000 /* [20:19] */ 119 #define SQ_INTERRUPT_WORD_ERROR_CTXID0__TYPE_MASK 0x01e00000 /* [24:21] */ 120 #define SQ_INTERRUPT_WORD_ERROR_CTXID0__SA_ID_MASK 0x02000000 /* [25] */ 121 #define SQ_INTERRUPT_WORD_ERROR_CTXID0__PRIV_MASK 0x04000000 /* [26] */ 122 #define SQ_INTERRUPT_WORD_ERROR_CTXID0__WAVE_ID_MASK 0xf8000000 /* [31:27] */ 123 #define SQ_INTERRUPT_WORD_ERROR_CTXID1__SIMD_ID_MASK 0x00000003 /* [33:32] */ 124 #define SQ_INTERRUPT_WORD_ERROR_CTXID1__WGP_ID_MASK 0x0000003c /* [37:34] */ 125 #define SQ_INTERRUPT_WORD_ERROR_CTXID1__ENCODING_MASK 0x000000c0 /* [39:38] */ 126 127 /* 128 * The debugger will send user data(m0) with PRIV=1 to indicate it requires 129 * notification from the KFD with the following queue id (DOORBELL_ID) and 130 * trap code (TRAP_CODE). 131 */ 132 #define KFD_CTXID0_TRAP_CODE_SHIFT 10 133 #define KFD_CTXID0_TRAP_CODE_MASK 0xfffc00 134 #define KFD_CTXID0_CP_BAD_OP_ECODE_MASK 0x3ffffff 135 #define KFD_CTXID0_DOORBELL_ID_MASK 0x0003ff 136 137 #define KFD_CTXID0_TRAP_CODE(ctxid0) (((ctxid0) & \ 138 KFD_CTXID0_TRAP_CODE_MASK) >> \ 139 KFD_CTXID0_TRAP_CODE_SHIFT) 140 #define KFD_CTXID0_CP_BAD_OP_ECODE(ctxid0) (((ctxid0) & \ 141 KFD_CTXID0_CP_BAD_OP_ECODE_MASK) >> \ 142 KFD_CTXID0_TRAP_CODE_SHIFT) 143 #define KFD_CTXID0_DOORBELL_ID(ctxid0) ((ctxid0) & \ 144 KFD_CTXID0_DOORBELL_ID_MASK) 145 146 static void print_sq_intr_info_auto(uint32_t context_id0, uint32_t context_id1) 147 { 148 pr_debug_ratelimited( 149 "sq_intr: auto, ttrace %d, wlt %d, ttrace_buf0_full %d, ttrace_buf1_full %d ttrace_utc_err %d\n", 150 REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, THREAD_TRACE), 151 REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, WLT), 152 REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, THREAD_TRACE_BUF0_FULL), 153 REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, THREAD_TRACE_BUF1_FULL), 154 REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, THREAD_TRACE_UTC_ERROR)); 155 } 156 157 static void print_sq_intr_info_inst(uint32_t context_id0, uint32_t context_id1) 158 { 159 pr_debug_ratelimited( 160 "sq_intr: inst, data 0x%08x, sh %d, priv %d, wave_id %d, simd_id %d, wgp_id %d\n", 161 REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, DATA), 162 REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, SA_ID), 163 REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, PRIV), 164 REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, WAVE_ID), 165 REG_GET_FIELD(context_id1, SQ_INTERRUPT_WORD_WAVE_CTXID1, SIMD_ID), 166 REG_GET_FIELD(context_id1, SQ_INTERRUPT_WORD_WAVE_CTXID1, WGP_ID)); 167 } 168 169 static void print_sq_intr_info_error(uint32_t context_id0, uint32_t context_id1) 170 { 171 pr_debug_ratelimited( 172 "sq_intr: error, detail 0x%08x, type %d, sh %d, priv %d, wave_id %d, simd_id %d, wgp_id %d\n", 173 REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0, DETAIL), 174 REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0, TYPE), 175 REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0, SA_ID), 176 REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0, PRIV), 177 REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0, WAVE_ID), 178 REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID1, SIMD_ID), 179 REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID1, WGP_ID)); 180 } 181 182 static void event_interrupt_poison_consumption_v12_1(struct kfd_node *node, 183 uint16_t pasid, uint16_t source_id) 184 { 185 enum amdgpu_ras_block block = 0; 186 int ret = -EINVAL; 187 uint32_t reset = 0; 188 struct kfd_process *p = kfd_lookup_process_by_pasid(pasid, NULL); 189 190 if (!p) 191 return; 192 193 /* all queues of a process will be unmapped in one time */ 194 if (atomic_read(&p->poison)) { 195 kfd_unref_process(p); 196 return; 197 } 198 199 atomic_set(&p->poison, 1); 200 kfd_unref_process(p); 201 202 switch (source_id) { 203 case SOC15_INTSRC_SQ_INTERRUPT_MSG: 204 if (node->dqm->ops.reset_queues) 205 ret = node->dqm->ops.reset_queues(node->dqm, pasid); 206 block = AMDGPU_RAS_BLOCK__GFX; 207 if (ret) 208 reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET; 209 break; 210 case SOC21_INTSRC_SDMA_ECC: 211 default: 212 block = AMDGPU_RAS_BLOCK__GFX; 213 reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET; 214 break; 215 } 216 217 kfd_signal_poison_consumed_event(node, pasid); 218 219 /* 220 * resetting queue passes, do page retirement without gpu reset 221 * resetting queue fails, fallback to gpu reset solution 222 */ 223 amdgpu_amdkfd_ras_poison_consumption_handler(node->adev, block, reset); 224 } 225 226 static bool event_interrupt_isr_v12_1(struct kfd_node *node, 227 const uint32_t *ih_ring_entry, 228 uint32_t *patched_ihre, 229 bool *patched_flag) 230 { 231 uint16_t source_id, client_id, pasid, vmid, node_id; 232 const uint32_t *data = ih_ring_entry; 233 uint32_t context_id0; 234 235 node_id = SOC15_NODEID_FROM_IH_ENTRY(ih_ring_entry); 236 vmid = SOC15_VMID_FROM_IH_ENTRY(ih_ring_entry); 237 238 if (!kfd_irq_is_from_node(node, node_id, vmid)) { 239 pr_debug("Interrupt not for Node, node_id: %d, vmid: %d\n", node_id, vmid); 240 return false; 241 } 242 243 source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry); 244 client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry); 245 246 /* Only handle interrupts from KFD VMIDs */ 247 if (!KFD_IRQ_IS_FENCE(client_id, source_id) && 248 (vmid < node->vm_info.first_vmid_kfd || 249 vmid > node->vm_info.last_vmid_kfd)) 250 return false; 251 252 pasid = SOC15_PASID_FROM_IH_ENTRY(ih_ring_entry); 253 context_id0 = SOC15_CONTEXT_ID0_FROM_IH_ENTRY(ih_ring_entry); 254 255 if ((source_id == SOC15_INTSRC_CP_END_OF_PIPE) && 256 (context_id0 & AMDGPU_FENCE_MES_QUEUE_FLAG)) 257 return false; 258 259 pr_debug("client id 0x%x, source id %d, vmid %d, pasid 0x%x. raw data:\n", 260 client_id, source_id, vmid, pasid); 261 pr_debug("%8X, %8X, %8X, %8X, %8X, %8X, %8X, %8X.\n", 262 data[0], data[1], data[2], data[3], 263 data[4], data[5], data[6], data[7]); 264 265 /* If there is no valid PASID, it's likely a bug */ 266 if (WARN_ONCE(pasid == 0, "Bug: No PASID in KFD interrupt")) 267 return false; 268 269 /* Interrupt types we care about: various signals and faults. 270 * They will be forwarded to a work queue (see below). 271 */ 272 return source_id == SOC15_INTSRC_CP_END_OF_PIPE || 273 source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG || 274 source_id == SOC15_INTSRC_CP_BAD_OPCODE || 275 source_id == SOC21_INTSRC_SDMA_TRAP || 276 KFD_IRQ_IS_FENCE(client_id, source_id) || 277 ((client_id == SOC21_IH_CLIENTID_VMC || 278 client_id == SOC21_IH_CLIENTID_UTCL2) && 279 !amdgpu_no_queue_eviction_on_vm_fault); 280 } 281 282 static void event_interrupt_wq_v12_1(struct kfd_node *node, 283 const uint32_t *ih_ring_entry) 284 { 285 uint16_t source_id, client_id, ring_id, pasid, vmid; 286 uint32_t context_id0, context_id1; 287 uint8_t sq_int_enc, sq_int_priv, sq_int_errtype; 288 struct kfd_vm_fault_info info = {0}; 289 struct kfd_hsa_memory_exception_data exception_data; 290 291 source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry); 292 client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry); 293 ring_id = SOC15_RING_ID_FROM_IH_ENTRY(ih_ring_entry); 294 pasid = SOC15_PASID_FROM_IH_ENTRY(ih_ring_entry); 295 vmid = SOC15_VMID_FROM_IH_ENTRY(ih_ring_entry); 296 context_id0 = SOC15_CONTEXT_ID0_FROM_IH_ENTRY(ih_ring_entry); 297 context_id1 = SOC15_CONTEXT_ID1_FROM_IH_ENTRY(ih_ring_entry); 298 299 /* VMC, UTCL2 */ 300 if (client_id == SOC21_IH_CLIENTID_VMC || 301 client_id == SOC21_IH_CLIENTID_UTCL2) { 302 info.vmid = vmid; 303 info.mc_id = client_id; 304 info.page_addr = ih_ring_entry[4] | 305 (uint64_t)(ih_ring_entry[5] & 0xf) << 32; 306 info.prot_valid = ring_id & 0x08; 307 info.prot_read = ring_id & 0x10; 308 info.prot_write = ring_id & 0x20; 309 310 memset(&exception_data, 0, sizeof(exception_data)); 311 exception_data.gpu_id = node->id; 312 exception_data.va = (info.page_addr) << PAGE_SHIFT; 313 exception_data.failure.NotPresent = info.prot_valid ? 1 : 0; 314 exception_data.failure.NoExecute = info.prot_exec ? 1 : 0; 315 exception_data.failure.ReadOnly = info.prot_write ? 1 : 0; 316 exception_data.failure.imprecise = 0; 317 318 kfd_set_dbg_ev_from_interrupt(node, pasid, -1, 319 KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION), 320 &exception_data, sizeof(exception_data)); 321 kfd_smi_event_update_vmfault(node, pasid); 322 323 /* GRBM, SDMA, SE, PMM */ 324 } else if (client_id == SOC21_IH_CLIENTID_GRBM_CP || 325 client_id == SOC21_IH_CLIENTID_GFX) { 326 327 /* CP */ 328 if (source_id == SOC15_INTSRC_CP_END_OF_PIPE) 329 kfd_signal_event_interrupt(pasid, context_id0, 32); 330 else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE && 331 KFD_DBG_EC_TYPE_IS_PACKET(KFD_CTXID0_CP_BAD_OP_ECODE(context_id0))) { 332 u32 doorbell_id = KFD_CTXID0_DOORBELL_ID(context_id0); 333 334 kfd_set_dbg_ev_from_interrupt(node, pasid, doorbell_id, 335 KFD_EC_MASK(KFD_CTXID0_CP_BAD_OP_ECODE(context_id0)), 336 NULL, 0); 337 kfd_dqm_suspend_bad_queue_mes(node, pasid, doorbell_id); 338 } 339 340 /* SDMA */ 341 else if (source_id == SOC21_INTSRC_SDMA_TRAP) 342 kfd_signal_event_interrupt(pasid, context_id0 & 0xfffffff, 28); 343 else if (source_id == SOC21_INTSRC_SDMA_ECC) { 344 event_interrupt_poison_consumption_v12_1(node, pasid, source_id); 345 return; 346 } 347 348 /* SQ */ 349 else if (source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG) { 350 sq_int_enc = REG_GET_FIELD(context_id1, 351 SQ_INTERRUPT_WORD_WAVE_CTXID1, ENCODING); 352 switch (sq_int_enc) { 353 case SQ_INTERRUPT_WORD_ENCODING_AUTO: 354 print_sq_intr_info_auto(context_id0, context_id1); 355 break; 356 case SQ_INTERRUPT_WORD_ENCODING_INST: 357 print_sq_intr_info_inst(context_id0, context_id1); 358 sq_int_priv = REG_GET_FIELD(context_id0, 359 SQ_INTERRUPT_WORD_WAVE_CTXID0, PRIV); 360 if (sq_int_priv && (kfd_set_dbg_ev_from_interrupt(node, pasid, 361 KFD_CTXID0_DOORBELL_ID(context_id0), 362 KFD_CTXID0_TRAP_CODE(context_id0), 363 NULL, 0))) 364 return; 365 break; 366 case SQ_INTERRUPT_WORD_ENCODING_ERROR: 367 print_sq_intr_info_error(context_id0, context_id1); 368 sq_int_errtype = REG_GET_FIELD(context_id0, 369 SQ_INTERRUPT_WORD_ERROR_CTXID0, TYPE); 370 if (sq_int_errtype != SQ_INTERRUPT_ERROR_TYPE_ILLEGAL_INST && 371 sq_int_errtype != SQ_INTERRUPT_ERROR_TYPE_MEMVIOL) { 372 event_interrupt_poison_consumption_v12_1( 373 node, pasid, source_id); 374 return; 375 } 376 break; 377 default: 378 break; 379 } 380 kfd_signal_event_interrupt(pasid, context_id0 & 0xffffff, 24); 381 } 382 383 } else if (KFD_IRQ_IS_FENCE(client_id, source_id)) { 384 kfd_process_close_interrupt_drain(pasid); 385 } 386 } 387 388 const struct kfd_event_interrupt_class event_interrupt_class_v12_1 = { 389 .interrupt_isr = event_interrupt_isr_v12_1, 390 .interrupt_wq = event_interrupt_wq_v12_1, 391 }; 392