// SPDX-License-Identifier: GPL-2.0 OR MIT
/*
 * Copyright 2016-2022 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "kfd_priv.h"
#include "kfd_events.h"
#include "kfd_debug.h"
#include "soc15_int.h"
#include "kfd_device_queue_manager.h"
#include "kfd_smi_events.h"
#include "amdgpu_ras.h"
#include "amdgpu_ras_mgr.h"

/*
 * GFX9 SQ Interrupts
 *
 * There are 3 encoding types of interrupts sourced from SQ sent as a 44-bit
 * packet to the Interrupt Handler:
 * Auto - Generated by the SQG (various cmd overflows, timestamps etc)
 * Wave - Generated by S_SENDMSG through a shader program
 * Error - HW generated errors (Illegal instructions, Memviols, EDC etc)
 *
 * The 44-bit packet is mapped as {context_id1[7:0],context_id0[31:0]} plus
 * 4-bits for VMID (SOC15_VMID_FROM_IH_ENTRY) as such:
 *
 * - context_id0[27:26]
 *   Encoding type (0 = Auto, 1 = Wave, 2 = Error)
 *
 * - context_id0[13]
 *   PRIV bit indicates that Wave S_SEND or error occurred within trap
 *
 * - {context_id1[7:0],context_id0[31:28],context_id0[11:0]}
 *   24-bit data with the following layout per encoding type:
 *   Auto - only context_id0[8:0] is used, which reports various interrupts
 *          generated by SQG. The rest is 0.
 *   Wave - user data sent from m0 via S_SENDMSG
 *   Error - Error type (context_id1[7:4]), Error Details (rest of bits)
 *
 * The other context_id bits show coordinates (SE/SH/CU/SIMD/WAVE) for wave
 * S_SENDMSG and Errors. These are 0 for Auto.
 */
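
/*
 * Worked example of the mapping above (illustrative values only): a wave
 * S_SENDMSG packet with context_id0 = 0x04002345 decodes as encoding
 * (context_id0 >> 26) & 0x3 = 1 (Wave), PRIV = (context_id0 >> 13) & 0x1
 * = 1 (sent from within the trap handler), and user data bits
 * context_id0[11:0] = 0x345.
 */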

enum SQ_INTERRUPT_WORD_ENCODING {
	SQ_INTERRUPT_WORD_ENCODING_AUTO = 0x0,
	SQ_INTERRUPT_WORD_ENCODING_INST,
	SQ_INTERRUPT_WORD_ENCODING_ERROR,
};

enum SQ_INTERRUPT_ERROR_TYPE {
	SQ_INTERRUPT_ERROR_TYPE_EDC_FUE = 0x0,
	SQ_INTERRUPT_ERROR_TYPE_ILLEGAL_INST,
	SQ_INTERRUPT_ERROR_TYPE_MEMVIOL,
	SQ_INTERRUPT_ERROR_TYPE_EDC_FED,
};

/* SQ_INTERRUPT_WORD_AUTO_CTXID */
#define SQ_INTERRUPT_WORD_AUTO_CTXID__THREAD_TRACE__SHIFT 0
#define SQ_INTERRUPT_WORD_AUTO_CTXID__WLT__SHIFT 1
#define SQ_INTERRUPT_WORD_AUTO_CTXID__THREAD_TRACE_BUF_FULL__SHIFT 2
#define SQ_INTERRUPT_WORD_AUTO_CTXID__REG_TIMESTAMP__SHIFT 3
#define SQ_INTERRUPT_WORD_AUTO_CTXID__CMD_TIMESTAMP__SHIFT 4
#define SQ_INTERRUPT_WORD_AUTO_CTXID__HOST_CMD_OVERFLOW__SHIFT 5
#define SQ_INTERRUPT_WORD_AUTO_CTXID__HOST_REG_OVERFLOW__SHIFT 6
#define SQ_INTERRUPT_WORD_AUTO_CTXID__IMMED_OVERFLOW__SHIFT 7
#define SQ_INTERRUPT_WORD_AUTO_CTXID__THREAD_TRACE_UTC_ERROR__SHIFT 8
#define SQ_INTERRUPT_WORD_AUTO_CTXID__SE_ID__SHIFT 24
#define SQ_INTERRUPT_WORD_AUTO_CTXID__ENCODING__SHIFT 26

#define SQ_INTERRUPT_WORD_AUTO_CTXID__THREAD_TRACE_MASK 0x00000001
#define SQ_INTERRUPT_WORD_AUTO_CTXID__WLT_MASK 0x00000002
#define SQ_INTERRUPT_WORD_AUTO_CTXID__THREAD_TRACE_BUF_FULL_MASK 0x00000004
#define SQ_INTERRUPT_WORD_AUTO_CTXID__REG_TIMESTAMP_MASK 0x00000008
#define SQ_INTERRUPT_WORD_AUTO_CTXID__CMD_TIMESTAMP_MASK 0x00000010
#define SQ_INTERRUPT_WORD_AUTO_CTXID__HOST_CMD_OVERFLOW_MASK 0x00000020
#define SQ_INTERRUPT_WORD_AUTO_CTXID__HOST_REG_OVERFLOW_MASK 0x00000040
#define SQ_INTERRUPT_WORD_AUTO_CTXID__IMMED_OVERFLOW_MASK 0x00000080
#define SQ_INTERRUPT_WORD_AUTO_CTXID__THREAD_TRACE_UTC_ERROR_MASK 0x00000100
#define SQ_INTERRUPT_WORD_AUTO_CTXID__SE_ID_MASK 0x03000000
#define SQ_INTERRUPT_WORD_AUTO_CTXID__ENCODING_MASK 0x0c000000

/* SQ_INTERRUPT_WORD_WAVE_CTXID */
#define SQ_INTERRUPT_WORD_WAVE_CTXID__DATA__SHIFT 0
#define SQ_INTERRUPT_WORD_WAVE_CTXID__SH_ID__SHIFT 12
#define SQ_INTERRUPT_WORD_WAVE_CTXID__PRIV__SHIFT 13
#define SQ_INTERRUPT_WORD_WAVE_CTXID__WAVE_ID__SHIFT 14
#define SQ_INTERRUPT_WORD_WAVE_CTXID__SIMD_ID__SHIFT 18
#define SQ_INTERRUPT_WORD_WAVE_CTXID__CU_ID__SHIFT 20
#define SQ_INTERRUPT_WORD_WAVE_CTXID__SE_ID__SHIFT 24
#define SQ_INTERRUPT_WORD_WAVE_CTXID__ENCODING__SHIFT 26

#define SQ_INTERRUPT_WORD_WAVE_CTXID__DATA_MASK 0x00000fff
#define SQ_INTERRUPT_WORD_WAVE_CTXID__SH_ID_MASK 0x00001000
#define SQ_INTERRUPT_WORD_WAVE_CTXID__PRIV_MASK 0x00002000
#define SQ_INTERRUPT_WORD_WAVE_CTXID__WAVE_ID_MASK 0x0003c000
#define SQ_INTERRUPT_WORD_WAVE_CTXID__SIMD_ID_MASK 0x000c0000
#define SQ_INTERRUPT_WORD_WAVE_CTXID__CU_ID_MASK 0x00f00000
#define SQ_INTERRUPT_WORD_WAVE_CTXID__SE_ID_MASK 0x03000000
#define SQ_INTERRUPT_WORD_WAVE_CTXID__ENCODING_MASK 0x0c000000

/* GFX9 SQ interrupt 24-bit data from context_id<0,1> */
#define KFD_CONTEXT_ID_GET_SQ_INT_DATA(ctx0, ctx1) \
	((ctx0 & 0xfff) | ((ctx0 >> 16) & 0xf000) | ((ctx1 << 16) & 0xff0000))

#define KFD_SQ_INT_DATA__ERR_TYPE_MASK 0xF00000
#define KFD_SQ_INT_DATA__ERR_TYPE__SHIFT 20
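
/*
 * Worked example (illustrative values only): with context_id0 = 0xa4001234
 * and context_id1 = 0x5b, KFD_CONTEXT_ID_GET_SQ_INT_DATA() yields 0x5ba234.
 * Bits [11:0] come from context_id0[11:0] (0x234), bits [15:12] from
 * context_id0[31:28] (0xa), and bits [23:16] from context_id1[7:0] (0x5b),
 * matching the 24-bit data layout described at the top of this file.
 */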

/*
 * The debugger will send user data(m0) with PRIV=1 to indicate it requires
 * notification from the KFD with the following queue id (DOORBELL_ID) and
 * trap code (TRAP_CODE).
 */
#define KFD_INT_DATA_DEBUG_DOORBELL_MASK 0x0003ff
#define KFD_INT_DATA_DEBUG_TRAP_CODE_SHIFT 10
#define KFD_INT_DATA_DEBUG_TRAP_CODE_MASK 0x07fc00
#define KFD_DEBUG_DOORBELL_ID(sq_int_data) ((sq_int_data) &	\
				KFD_INT_DATA_DEBUG_DOORBELL_MASK)
#define KFD_DEBUG_TRAP_CODE(sq_int_data) (((sq_int_data) &	\
				KFD_INT_DATA_DEBUG_TRAP_CODE_MASK)	\
				>> KFD_INT_DATA_DEBUG_TRAP_CODE_SHIFT)
#define KFD_DEBUG_CP_BAD_OP_ECODE_MASK 0x3fffc00
#define KFD_DEBUG_CP_BAD_OP_ECODE_SHIFT 10
#define KFD_DEBUG_CP_BAD_OP_ECODE(ctxid0) (((ctxid0) &	\
				KFD_DEBUG_CP_BAD_OP_ECODE_MASK)	\
				>> KFD_DEBUG_CP_BAD_OP_ECODE_SHIFT)
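
/*
 * Worked example (illustrative value only): sq_int_data = 0xc05 decodes to
 * KFD_DEBUG_DOORBELL_ID() = 0x005 (bits [9:0]) and KFD_DEBUG_TRAP_CODE()
 * = 0x3 (bits [18:10]).
 */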

static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
				uint16_t pasid, uint16_t client_id)
{
	enum amdgpu_ras_block block = 0;
	uint32_t reset = 0;
	struct kfd_process *p = kfd_lookup_process_by_pasid(pasid, NULL);
	enum ras_event_type type = RAS_EVENT_TYPE_POISON_CONSUMPTION;
	u64 event_id;
	int old_poison, ret;

	if (!p)
		return;

	/* All queues of the process are unmapped at once */
	old_poison = atomic_cmpxchg(&p->poison, 0, 1);
	kfd_unref_process(p);
	if (old_poison)
		return;

	switch (client_id) {
	case SOC15_IH_CLIENTID_SE0SH:
	case SOC15_IH_CLIENTID_SE1SH:
	case SOC15_IH_CLIENTID_SE2SH:
	case SOC15_IH_CLIENTID_SE3SH:
	case SOC15_IH_CLIENTID_UTCL2:
		block = AMDGPU_RAS_BLOCK__GFX;
		if (amdgpu_ip_version(dev->adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3)) {
			/* driver mode-2 for gfx poison is only supported by
			 * pmfw 0x00557300 and onwards
			 */
			if (dev->adev->pm.fw_version < 0x00557300)
				reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
			else
				reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
		} else if (amdgpu_ip_version(dev->adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) {
			/* driver mode-2 for gfx poison is only supported by
			 * pmfw 0x05550C00 and onwards
			 */
			if (dev->adev->pm.fw_version < 0x05550C00)
				reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
			else
				reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
		} else {
			reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
		}
		amdgpu_ras_set_err_poison(dev->adev, AMDGPU_RAS_BLOCK__GFX);
		break;
	case SOC15_IH_CLIENTID_VMC:
	case SOC15_IH_CLIENTID_VMC1:
		block = AMDGPU_RAS_BLOCK__MMHUB;
		reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
		break;
	case SOC15_IH_CLIENTID_SDMA0:
	case SOC15_IH_CLIENTID_SDMA1:
	case SOC15_IH_CLIENTID_SDMA2:
	case SOC15_IH_CLIENTID_SDMA3:
	case SOC15_IH_CLIENTID_SDMA4:
		block = AMDGPU_RAS_BLOCK__SDMA;
		if (amdgpu_ip_version(dev->adev, SDMA0_HWIP, 0) == IP_VERSION(4, 4, 2)) {
			/* driver mode-2 for sdma poison is only supported by
			 * pmfw 0x00557300 and onwards
			 */
			if (dev->adev->pm.fw_version < 0x00557300)
				reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
			else
				reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
		} else if (amdgpu_ip_version(dev->adev, SDMA0_HWIP, 0) == IP_VERSION(4, 4, 5)) {
			/* driver mode-2 for sdma poison is only supported by
			 * pmfw 0x05550C00 and onwards
			 */
			if (dev->adev->pm.fw_version < 0x05550C00)
				reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
			else
				reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
		} else {
			reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
		}
		amdgpu_ras_set_err_poison(dev->adev, AMDGPU_RAS_BLOCK__SDMA);
		break;
	default:
		dev_warn(dev->adev->dev,
			 "client %d does not support poison consumption\n", client_id);
		return;
	}

	ret = amdgpu_ras_mark_ras_event(dev->adev, type);
	if (ret)
		return;

	kfd_signal_poison_consumed_event(dev, pasid);

	if (amdgpu_uniras_enabled(dev->adev))
		event_id = amdgpu_ras_mgr_gen_ras_event_seqno(dev->adev,
				RAS_SEQNO_TYPE_POISON_CONSUMPTION);
	else
		event_id = amdgpu_ras_acquire_event_id(dev->adev, type);

	RAS_EVENT_LOG(dev->adev, event_id,
		      "poison is consumed by client %d, kick off gpu reset flow\n", client_id);

	amdgpu_amdkfd_ras_pasid_poison_consumption_handler(dev->adev,
		block, pasid, NULL, NULL, reset);
}

static bool context_id_expected(struct kfd_dev *dev)
{
	switch (KFD_GC_VERSION(dev)) {
	case IP_VERSION(9, 0, 1):
		return dev->mec_fw_version >= 0x817a;
	case IP_VERSION(9, 1, 0):
	case IP_VERSION(9, 2, 1):
	case IP_VERSION(9, 2, 2):
	case IP_VERSION(9, 3, 0):
	case IP_VERSION(9, 4, 0):
		return dev->mec_fw_version >= 0x17a;
	default:
		/* Other GFXv9 and later GPUs always send valid context IDs
		 * on legitimate events
		 */
		return KFD_GC_VERSION(dev) >= IP_VERSION(9, 4, 1);
	}
}
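
/*
 * Top half of the KFD interrupt path: runs in interrupt context, filters
 * out IH entries that do not belong to KFD VMIDs or to client/source IDs
 * we handle, and patches in the missing PASID under non-HWS scheduling.
 * Returning true forwards the (possibly patched) entry to the work-queue
 * handler, event_interrupt_wq_v9().
 */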
static bool event_interrupt_isr_v9(struct kfd_node *dev,
					const uint32_t *ih_ring_entry,
					uint32_t *patched_ihre,
					bool *patched_flag)
{
	uint16_t source_id, client_id, pasid, vmid;
	const uint32_t *data = ih_ring_entry;

	source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry);
	client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry);

	/* Only handle interrupts from KFD VMIDs */
	vmid = SOC15_VMID_FROM_IH_ENTRY(ih_ring_entry);
	if (!KFD_IRQ_IS_FENCE(client_id, source_id) &&
	    (vmid < dev->vm_info.first_vmid_kfd ||
	     vmid > dev->vm_info.last_vmid_kfd))
		return false;

	pasid = SOC15_PASID_FROM_IH_ENTRY(ih_ring_entry);

	/* Only handle clients we care about */
	if (client_id != SOC15_IH_CLIENTID_GRBM_CP &&
	    client_id != SOC15_IH_CLIENTID_SDMA0 &&
	    client_id != SOC15_IH_CLIENTID_SDMA1 &&
	    client_id != SOC15_IH_CLIENTID_SDMA2 &&
	    client_id != SOC15_IH_CLIENTID_SDMA3 &&
	    client_id != SOC15_IH_CLIENTID_SDMA4 &&
	    client_id != SOC15_IH_CLIENTID_SDMA5 &&
	    client_id != SOC15_IH_CLIENTID_SDMA6 &&
	    client_id != SOC15_IH_CLIENTID_SDMA7 &&
	    client_id != SOC15_IH_CLIENTID_VMC &&
	    client_id != SOC15_IH_CLIENTID_VMC1 &&
	    client_id != SOC15_IH_CLIENTID_UTCL2 &&
	    client_id != SOC15_IH_CLIENTID_SE0SH &&
	    client_id != SOC15_IH_CLIENTID_SE1SH &&
	    client_id != SOC15_IH_CLIENTID_SE2SH &&
	    client_id != SOC15_IH_CLIENTID_SE3SH &&
	    !KFD_IRQ_IS_FENCE(client_id, source_id))
		return false;

	/* This is a known issue for gfx9. Under non HWS, pasid is not set
	 * in the interrupt payload, so we need to find out the pasid on our
	 * own.
	 */
	if (!pasid && dev->dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) {
		const uint32_t pasid_mask = 0xffff;

		*patched_flag = true;
		memcpy(patched_ihre, ih_ring_entry,
		       dev->kfd->device_info.ih_ring_entry_size);

		pasid = dev->dqm->vmid_pasid[vmid];

		/* Patch the pasid field */
		patched_ihre[3] = cpu_to_le32((le32_to_cpu(patched_ihre[3])
					       & ~pasid_mask) | pasid);
	}

	dev_dbg(dev->adev->dev,
		"client id 0x%x, source id %d, vmid %d, pasid 0x%x. raw data:\n",
		client_id, source_id, vmid, pasid);
	dev_dbg(dev->adev->dev, "%8X, %8X, %8X, %8X, %8X, %8X, %8X, %8X.\n",
		data[0], data[1], data[2], data[3], data[4], data[5], data[6],
		data[7]);

	/* If there is no valid PASID, it's likely a bug */
	if (WARN_ONCE(pasid == 0, "Bug: No PASID in KFD interrupt"))
		return false;

	/* Workaround CP firmware sending bogus signals with 0 context_id.
	 * Those can be safely ignored on hardware and firmware versions that
	 * include a valid context_id on legitimate signals. This avoids the
	 * slow path in kfd_signal_event_interrupt that scans all event slots
	 * for signaled events.
	 */
	if (source_id == SOC15_INTSRC_CP_END_OF_PIPE) {
		uint32_t context_id =
			SOC15_CONTEXT_ID0_FROM_IH_ENTRY(ih_ring_entry);

		if (context_id == 0 && context_id_expected(dev->kfd))
			return false;
	}

	/* Interrupt types we care about: various signals and faults.
	 * They will be forwarded to a work queue (see below).
	 */
	return source_id == SOC15_INTSRC_CP_END_OF_PIPE ||
		source_id == SOC15_INTSRC_SDMA_TRAP ||
		source_id == SOC15_INTSRC_SDMA_ECC ||
		source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG ||
		source_id == SOC15_INTSRC_CP_BAD_OPCODE ||
		KFD_IRQ_IS_FENCE(client_id, source_id) ||
		((client_id == SOC15_IH_CLIENTID_VMC ||
		  client_id == SOC15_IH_CLIENTID_VMC1 ||
		  client_id == SOC15_IH_CLIENTID_UTCL2) &&
		 !amdgpu_no_queue_eviction_on_vm_fault);
}
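
/*
 * Bottom half of the KFD interrupt path: runs in a worker thread and
 * decodes the IH entries forwarded by event_interrupt_isr_v9(),
 * dispatching on client and source ID (CP/SQ signals and debug traps,
 * SDMA traps and ECC errors, VM faults, and process-drain fences).
 */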
static void event_interrupt_wq_v9(struct kfd_node *dev,
					const uint32_t *ih_ring_entry)
{
	uint16_t source_id, client_id, pasid, vmid;
	uint32_t context_id0, context_id1;
	uint32_t sq_intr_err, sq_int_data, encoding;

	source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry);
	client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry);
	pasid = SOC15_PASID_FROM_IH_ENTRY(ih_ring_entry);
	vmid = SOC15_VMID_FROM_IH_ENTRY(ih_ring_entry);
	context_id0 = SOC15_CONTEXT_ID0_FROM_IH_ENTRY(ih_ring_entry);
	context_id1 = SOC15_CONTEXT_ID1_FROM_IH_ENTRY(ih_ring_entry);

	if (client_id == SOC15_IH_CLIENTID_GRBM_CP ||
	    client_id == SOC15_IH_CLIENTID_SE0SH ||
	    client_id == SOC15_IH_CLIENTID_SE1SH ||
	    client_id == SOC15_IH_CLIENTID_SE2SH ||
	    client_id == SOC15_IH_CLIENTID_SE3SH) {
		if (source_id == SOC15_INTSRC_CP_END_OF_PIPE)
			kfd_signal_event_interrupt(pasid, context_id0, 32);
		else if (source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG) {
			sq_int_data = KFD_CONTEXT_ID_GET_SQ_INT_DATA(context_id0, context_id1);
			encoding = REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, ENCODING);
			switch (encoding) {
			case SQ_INTERRUPT_WORD_ENCODING_AUTO:
				dev_dbg_ratelimited(dev->adev->dev,
					"sq_intr: auto, se %d, ttrace %d, wlt %d, ttrac_buf_full %d, reg_tms %d, cmd_tms %d, host_cmd_ovf %d, host_reg_ovf %d, immed_ovf %d, ttrace_utc_err %d\n",
					REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID, SE_ID),
					REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID, THREAD_TRACE),
					REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID, WLT),
					REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID, THREAD_TRACE_BUF_FULL),
					REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID, REG_TIMESTAMP),
					REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID, CMD_TIMESTAMP),
					REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID, HOST_CMD_OVERFLOW),
					REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID, HOST_REG_OVERFLOW),
					REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID, IMMED_OVERFLOW),
					REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID, THREAD_TRACE_UTC_ERROR));
				break;
			case SQ_INTERRUPT_WORD_ENCODING_INST:
				dev_dbg_ratelimited(dev->adev->dev,
					"sq_intr: inst, se %d, data 0x%x, sh %d, priv %d, wave_id %d, simd_id %d, cu_id %d, intr_data 0x%x\n",
					REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, SE_ID),
					REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, DATA),
					REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, SH_ID),
					REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, PRIV),
					REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, WAVE_ID),
					REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, SIMD_ID),
					REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, CU_ID),
					sq_int_data);
				if (context_id0 & SQ_INTERRUPT_WORD_WAVE_CTXID__PRIV_MASK) {
					if (kfd_set_dbg_ev_from_interrupt(dev, pasid,
							KFD_DEBUG_DOORBELL_ID(sq_int_data),
							KFD_DEBUG_TRAP_CODE(sq_int_data),
							NULL, 0))
						return;
				}
				break;
			case SQ_INTERRUPT_WORD_ENCODING_ERROR:
				sq_intr_err = REG_GET_FIELD(sq_int_data, KFD_SQ_INT_DATA, ERR_TYPE);
				dev_warn_ratelimited(dev->adev->dev,
					"sq_intr: error, se %d, data 0x%x, sh %d, priv %d, wave_id %d, simd_id %d, cu_id %d, err_type %d\n",
					REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, SE_ID),
					REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, DATA),
					REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, SH_ID),
					REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, PRIV),
					REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, WAVE_ID),
					REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, SIMD_ID),
					REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, CU_ID),
					sq_intr_err);
				if (sq_intr_err != SQ_INTERRUPT_ERROR_TYPE_ILLEGAL_INST &&
				    sq_intr_err != SQ_INTERRUPT_ERROR_TYPE_MEMVIOL) {
					event_interrupt_poison_consumption_v9(dev, pasid, client_id);
					return;
				}
				break;
			default:
				break;
			}
			kfd_signal_event_interrupt(pasid, sq_int_data, 24);
		} else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE &&
			   KFD_DBG_EC_TYPE_IS_PACKET(KFD_DEBUG_CP_BAD_OP_ECODE(context_id0))) {
			kfd_set_dbg_ev_from_interrupt(dev, pasid,
				KFD_DEBUG_DOORBELL_ID(context_id0),
				KFD_EC_MASK(KFD_DEBUG_CP_BAD_OP_ECODE(context_id0)),
				NULL, 0);
		}
	} else if (client_id == SOC15_IH_CLIENTID_SDMA0 ||
		   client_id == SOC15_IH_CLIENTID_SDMA1 ||
		   client_id == SOC15_IH_CLIENTID_SDMA2 ||
		   client_id == SOC15_IH_CLIENTID_SDMA3 ||
		   client_id == SOC15_IH_CLIENTID_SDMA4 ||
		   client_id == SOC15_IH_CLIENTID_SDMA5 ||
		   client_id == SOC15_IH_CLIENTID_SDMA6 ||
		   client_id == SOC15_IH_CLIENTID_SDMA7) {
		if (source_id == SOC15_INTSRC_SDMA_TRAP) {
			kfd_signal_event_interrupt(pasid, context_id0 & 0xfffffff, 28);
		} else if (source_id == SOC15_INTSRC_SDMA_ECC) {
			event_interrupt_poison_consumption_v9(dev, pasid, client_id);
			return;
		}
	} else if (client_id == SOC15_IH_CLIENTID_VMC ||
		   client_id == SOC15_IH_CLIENTID_VMC1 ||
		   client_id == SOC15_IH_CLIENTID_UTCL2) {
		struct kfd_vm_fault_info info = {0};
		uint16_t ring_id = SOC15_RING_ID_FROM_IH_ENTRY(ih_ring_entry);
		struct kfd_hsa_memory_exception_data exception_data;

		if (source_id == SOC15_INTSRC_VMC_UTCL2_POISON) {
			event_interrupt_poison_consumption_v9(dev, pasid, client_id);
			return;
		}
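
		/* Decode the VM fault: the faulting page address comes from
		 * IH entry dwords 4-5, and the protection-violation status
		 * bits ride in the ring_id field of the entry.
		 */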
		info.vmid = vmid;
		info.mc_id = client_id;
		info.page_addr = ih_ring_entry[4] |
			(uint64_t)(ih_ring_entry[5] & 0xf) << 32;
		info.prot_valid = ring_id & 0x08;
		info.prot_read = ring_id & 0x10;
		info.prot_write = ring_id & 0x20;

		memset(&exception_data, 0, sizeof(exception_data));
		exception_data.gpu_id = dev->id;
		exception_data.va = (info.page_addr) << PAGE_SHIFT;
		exception_data.failure.NotPresent = info.prot_valid ? 1 : 0;
		exception_data.failure.NoExecute = info.prot_exec ? 1 : 0;
		exception_data.failure.ReadOnly = info.prot_write ? 1 : 0;
		exception_data.failure.imprecise = 0;

		kfd_set_dbg_ev_from_interrupt(dev,
					      pasid,
					      -1,
					      KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION),
					      &exception_data,
					      sizeof(exception_data));
		kfd_smi_event_update_vmfault(dev, pasid);
	} else if (KFD_IRQ_IS_FENCE(client_id, source_id)) {
		kfd_process_close_interrupt_drain(pasid);
	}
}

static bool event_interrupt_isr_v9_4_3(struct kfd_node *node,
					const uint32_t *ih_ring_entry,
					uint32_t *patched_ihre,
					bool *patched_flag)
{
	uint16_t node_id, vmid;

	/*
	 * For GFX 9.4.3, process the interrupt if:
	 * - NodeID field in IH entry matches the corresponding bit
	 *   set in interrupt_bitmap Bits 0-15.
	 *   OR
	 * - If partition mode is CPX and interrupt came from
	 *   Node_id 0,4,8,12, then check if the Bit (16 + client id)
	 *   is set in interrupt bitmap Bits 16-31.
	 */
	node_id = SOC15_NODEID_FROM_IH_ENTRY(ih_ring_entry);
	vmid = SOC15_VMID_FROM_IH_ENTRY(ih_ring_entry);
	if (kfd_irq_is_from_node(node, node_id, vmid))
		return event_interrupt_isr_v9(node, ih_ring_entry,
					      patched_ihre, patched_flag);
	return false;
}

const struct kfd_event_interrupt_class event_interrupt_class_v9 = {
	.interrupt_isr = event_interrupt_isr_v9,
	.interrupt_wq = event_interrupt_wq_v9,
};

const struct kfd_event_interrupt_class event_interrupt_class_v9_4_3 = {
	.interrupt_isr = event_interrupt_isr_v9_4_3,
	.interrupt_wq = event_interrupt_wq_v9,
};