1 // SPDX-License-Identifier: MIT 2 /* 3 * Copyright 2024 Advanced Micro Devices, Inc. 4 * 5 * Permission is hereby granted, free of charge, to any person obtaining a 6 * copy of this software and associated documentation files (the "Software"), 7 * to deal in the Software without restriction, including without limitation 8 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 * and/or sell copies of the Software, and to permit persons to whom the 10 * Software is furnished to do so, subject to the following conditions: 11 * 12 * The above copyright notice and this permission notice shall be included in 13 * all copies or substantial portions of the Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 19 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 20 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 21 * OTHER DEALINGS IN THE SOFTWARE. 22 * 23 */ 24 25 #include <generated/utsrelease.h> 26 #include <linux/devcoredump.h> 27 #include "amdgpu_dev_coredump.h" 28 #include "atom.h" 29 30 #ifndef CONFIG_DEV_COREDUMP 31 void amdgpu_coredump(struct amdgpu_device *adev, bool skip_vram_check, 32 bool vram_lost, struct amdgpu_job *job) 33 { 34 } 35 void amdgpu_coredump_init(struct amdgpu_device *adev) 36 { 37 } 38 void amdgpu_coredump_fini(struct amdgpu_device *adev) 39 { 40 } 41 #else 42 43 #define AMDGPU_CORE_DUMP_SIZE_MAX (256 * 1024 * 1024) 44 45 const char *hw_ip_names[MAX_HWIP] = { 46 [GC_HWIP] = "GC", 47 [HDP_HWIP] = "HDP", 48 [SDMA0_HWIP] = "SDMA0", 49 [SDMA1_HWIP] = "SDMA1", 50 [SDMA2_HWIP] = "SDMA2", 51 [SDMA3_HWIP] = "SDMA3", 52 [SDMA4_HWIP] = "SDMA4", 53 [SDMA5_HWIP] = "SDMA5", 54 [SDMA6_HWIP] = "SDMA6", 55 [SDMA7_HWIP] = "SDMA7", 56 [LSDMA_HWIP] = "LSDMA", 57 [MMHUB_HWIP] = "MMHUB", 58 [ATHUB_HWIP] = "ATHUB", 59 [NBIO_HWIP] = "NBIO", 60 [MP0_HWIP] = "MP0", 61 [MP1_HWIP] = "MP1", 62 [UVD_HWIP] = "UVD/JPEG/VCN", 63 [VCN1_HWIP] = "VCN1", 64 [VCE_HWIP] = "VCE", 65 [VPE_HWIP] = "VPE", 66 [DF_HWIP] = "DF", 67 [DCE_HWIP] = "DCE", 68 [OSSSYS_HWIP] = "OSSSYS", 69 [SMUIO_HWIP] = "SMUIO", 70 [PWR_HWIP] = "PWR", 71 [NBIF_HWIP] = "NBIF", 72 [THM_HWIP] = "THM", 73 [CLK_HWIP] = "CLK", 74 [UMC_HWIP] = "UMC", 75 [RSMU_HWIP] = "RSMU", 76 [XGMI_HWIP] = "XGMI", 77 [DCI_HWIP] = "DCI", 78 [PCIE_HWIP] = "PCIE", 79 }; 80 81 static void amdgpu_devcoredump_fw_info(struct amdgpu_device *adev, 82 struct drm_printer *p) 83 { 84 uint32_t version; 85 uint32_t feature; 86 uint8_t smu_program, smu_major, smu_minor, smu_debug; 87 struct atom_context *ctx = adev->mode_info.atom_context; 88 89 drm_printf(p, "VCE feature version: %u, fw version: 0x%08x\n", 90 adev->vce.fb_version, adev->vce.fw_version); 91 drm_printf(p, "UVD feature version: %u, fw version: 0x%08x\n", 0, 92 adev->uvd.fw_version); 93 drm_printf(p, "GMC feature version: %u, fw version: 0x%08x\n", 0, 94 adev->gmc.fw_version); 95 drm_printf(p, "ME feature version: %u, fw version: 0x%08x\n", 96 adev->gfx.me_feature_version, adev->gfx.me_fw_version); 97 drm_printf(p, "PFP feature version: %u, fw version: 0x%08x\n", 98 adev->gfx.pfp_feature_version, adev->gfx.pfp_fw_version); 99 drm_printf(p, "CE feature version: %u, fw version: 0x%08x\n", 100 adev->gfx.ce_feature_version, adev->gfx.ce_fw_version); 101 drm_printf(p, "RLC feature version: %u, fw version: 0x%08x\n", 102 adev->gfx.rlc_feature_version, adev->gfx.rlc_fw_version); 103 104 drm_printf(p, "RLC SRLC feature version: %u, fw version: 0x%08x\n", 105 adev->gfx.rlc_srlc_feature_version, 106 adev->gfx.rlc_srlc_fw_version); 107 drm_printf(p, "RLC SRLG feature version: %u, fw version: 0x%08x\n", 108 adev->gfx.rlc_srlg_feature_version, 109 adev->gfx.rlc_srlg_fw_version); 110 drm_printf(p, "RLC SRLS feature version: %u, fw version: 0x%08x\n", 111 adev->gfx.rlc_srls_feature_version, 112 adev->gfx.rlc_srls_fw_version); 113 drm_printf(p, "RLCP feature version: %u, fw version: 0x%08x\n", 114 adev->gfx.rlcp_ucode_feature_version, 115 adev->gfx.rlcp_ucode_version); 116 drm_printf(p, "RLCV feature version: %u, fw version: 0x%08x\n", 117 adev->gfx.rlcv_ucode_feature_version, 118 adev->gfx.rlcv_ucode_version); 119 drm_printf(p, "MEC feature version: %u, fw version: 0x%08x\n", 120 adev->gfx.mec_feature_version, adev->gfx.mec_fw_version); 121 122 if (adev->gfx.mec2_fw) 123 drm_printf(p, "MEC2 feature version: %u, fw version: 0x%08x\n", 124 adev->gfx.mec2_feature_version, 125 adev->gfx.mec2_fw_version); 126 127 drm_printf(p, "IMU feature version: %u, fw version: 0x%08x\n", 0, 128 adev->gfx.imu_fw_version); 129 drm_printf(p, "PSP SOS feature version: %u, fw version: 0x%08x\n", 130 adev->psp.sos.feature_version, adev->psp.sos.fw_version); 131 drm_printf(p, "PSP ASD feature version: %u, fw version: 0x%08x\n", 132 adev->psp.asd_context.bin_desc.feature_version, 133 adev->psp.asd_context.bin_desc.fw_version); 134 135 drm_printf(p, "TA XGMI feature version: 0x%08x, fw version: 0x%08x\n", 136 adev->psp.xgmi_context.context.bin_desc.feature_version, 137 adev->psp.xgmi_context.context.bin_desc.fw_version); 138 drm_printf(p, "TA RAS feature version: 0x%08x, fw version: 0x%08x\n", 139 adev->psp.ras_context.context.bin_desc.feature_version, 140 adev->psp.ras_context.context.bin_desc.fw_version); 141 drm_printf(p, "TA HDCP feature version: 0x%08x, fw version: 0x%08x\n", 142 adev->psp.hdcp_context.context.bin_desc.feature_version, 143 adev->psp.hdcp_context.context.bin_desc.fw_version); 144 drm_printf(p, "TA DTM feature version: 0x%08x, fw version: 0x%08x\n", 145 adev->psp.dtm_context.context.bin_desc.feature_version, 146 adev->psp.dtm_context.context.bin_desc.fw_version); 147 drm_printf(p, "TA RAP feature version: 0x%08x, fw version: 0x%08x\n", 148 adev->psp.rap_context.context.bin_desc.feature_version, 149 adev->psp.rap_context.context.bin_desc.fw_version); 150 drm_printf(p, 151 "TA SECURE DISPLAY feature version: 0x%08x, fw version: 0x%08x\n", 152 adev->psp.securedisplay_context.context.bin_desc.feature_version, 153 adev->psp.securedisplay_context.context.bin_desc.fw_version); 154 155 /* SMC firmware */ 156 version = adev->pm.fw_version; 157 158 smu_program = (version >> 24) & 0xff; 159 smu_major = (version >> 16) & 0xff; 160 smu_minor = (version >> 8) & 0xff; 161 smu_debug = (version >> 0) & 0xff; 162 drm_printf(p, 163 "SMC feature version: %u, program: %d, fw version: 0x%08x (%d.%d.%d)\n", 164 0, smu_program, version, smu_major, smu_minor, smu_debug); 165 166 /* SDMA firmware */ 167 for (int i = 0; i < adev->sdma.num_instances; i++) { 168 drm_printf(p, 169 "SDMA%d feature version: %u, firmware version: 0x%08x\n", 170 i, adev->sdma.instance[i].feature_version, 171 adev->sdma.instance[i].fw_version); 172 } 173 174 drm_printf(p, "VCN feature version: %u, fw version: 0x%08x\n", 0, 175 adev->vcn.fw_version); 176 drm_printf(p, "DMCU feature version: %u, fw version: 0x%08x\n", 0, 177 adev->dm.dmcu_fw_version); 178 drm_printf(p, "DMCUB feature version: %u, fw version: 0x%08x\n", 0, 179 adev->dm.dmcub_fw_version); 180 drm_printf(p, "PSP TOC feature version: %u, fw version: 0x%08x\n", 181 adev->psp.toc.feature_version, adev->psp.toc.fw_version); 182 183 version = adev->mes.kiq_version & AMDGPU_MES_VERSION_MASK; 184 feature = (adev->mes.kiq_version & AMDGPU_MES_FEAT_VERSION_MASK) >> 185 AMDGPU_MES_FEAT_VERSION_SHIFT; 186 drm_printf(p, "MES_KIQ feature version: %u, fw version: 0x%08x\n", 187 feature, version); 188 189 version = adev->mes.sched_version & AMDGPU_MES_VERSION_MASK; 190 feature = (adev->mes.sched_version & AMDGPU_MES_FEAT_VERSION_MASK) >> 191 AMDGPU_MES_FEAT_VERSION_SHIFT; 192 drm_printf(p, "MES feature version: %u, fw version: 0x%08x\n", feature, 193 version); 194 195 drm_printf(p, "VPE feature version: %u, fw version: 0x%08x\n", 196 adev->vpe.feature_version, adev->vpe.fw_version); 197 198 if (adev->bios) { 199 drm_printf(p, "\nVBIOS Information\n"); 200 drm_printf(p, "vbios name : %s\n", ctx->name); 201 drm_printf(p, "vbios pn : %s\n", ctx->vbios_pn); 202 drm_printf(p, "vbios version : %d\n", ctx->version); 203 drm_printf(p, "vbios ver_str : %s\n", ctx->vbios_ver_str); 204 drm_printf(p, "vbios date : %s\n", ctx->date); 205 }else { 206 drm_printf(p, "\nVBIOS Information: NA\n"); 207 } 208 } 209 210 static ssize_t 211 amdgpu_devcoredump_format(char *buffer, size_t count, struct amdgpu_coredump_info *coredump) 212 { 213 struct amdgpu_device *adev = coredump->adev; 214 struct drm_printer p; 215 struct drm_print_iterator iter; 216 struct amdgpu_vm_fault_info *fault_info; 217 struct amdgpu_bo_va_mapping *mapping; 218 struct amdgpu_ip_block *ip_block; 219 struct amdgpu_res_cursor cursor; 220 struct amdgpu_bo *abo, *root; 221 uint64_t va_start, offset; 222 struct amdgpu_ring *ring; 223 struct amdgpu_vm *vm; 224 u32 *ib_content; 225 uint8_t *kptr; 226 int ver, i, j, r; 227 u32 ring_idx, off; 228 bool sizing_pass; 229 230 sizing_pass = buffer == NULL; 231 iter.data = buffer; 232 iter.offset = 0; 233 iter.remain = count; 234 235 p = drm_coredump_printer(&iter); 236 237 drm_printf(&p, "**** AMDGPU Device Coredump ****\n"); 238 drm_printf(&p, "version: " AMDGPU_COREDUMP_VERSION "\n"); 239 drm_printf(&p, "kernel: " UTS_RELEASE "\n"); 240 drm_printf(&p, "module: " KBUILD_MODNAME "\n"); 241 drm_printf(&p, "time: %ptSp\n", &coredump->reset_time); 242 243 if (coredump->reset_task_info.task.pid) 244 drm_printf(&p, "process_name: %s PID: %d\n", 245 coredump->reset_task_info.process_name, 246 coredump->reset_task_info.task.pid); 247 248 /* SOC Information */ 249 drm_printf(&p, "\nSOC Information\n"); 250 drm_printf(&p, "SOC Device id: %d\n", coredump->adev->pdev->device); 251 drm_printf(&p, "SOC PCI Revision id: %d\n", coredump->adev->pdev->revision); 252 drm_printf(&p, "SOC Family: %d\n", coredump->adev->family); 253 drm_printf(&p, "SOC Revision id: %d\n", coredump->adev->rev_id); 254 drm_printf(&p, "SOC External Revision id: %d\n", coredump->adev->external_rev_id); 255 256 /* Memory Information */ 257 drm_printf(&p, "\nSOC Memory Information\n"); 258 drm_printf(&p, "real vram size: %llu\n", coredump->adev->gmc.real_vram_size); 259 drm_printf(&p, "visible vram size: %llu\n", coredump->adev->gmc.visible_vram_size); 260 drm_printf(&p, "gtt size: %llu\n", coredump->adev->mman.gtt_mgr.manager.size); 261 262 /* GDS Config */ 263 drm_printf(&p, "\nGDS Config\n"); 264 drm_printf(&p, "gds: total size: %d\n", coredump->adev->gds.gds_size); 265 drm_printf(&p, "gds: compute partition size: %d\n", coredump->adev->gds.gds_size); 266 drm_printf(&p, "gds: gws per compute partition: %d\n", coredump->adev->gds.gws_size); 267 drm_printf(&p, "gds: os per compute partition: %d\n", coredump->adev->gds.oa_size); 268 269 /* HWIP Version Information */ 270 drm_printf(&p, "\nHW IP Version Information\n"); 271 for (int i = 1; i < MAX_HWIP; i++) { 272 for (int j = 0; j < HWIP_MAX_INSTANCE; j++) { 273 ver = coredump->adev->ip_versions[i][j]; 274 if (ver) 275 drm_printf(&p, "HWIP: %s[%d][%d]: v%d.%d.%d.%d.%d\n", 276 hw_ip_names[i], i, j, 277 IP_VERSION_MAJ(ver), 278 IP_VERSION_MIN(ver), 279 IP_VERSION_REV(ver), 280 IP_VERSION_VARIANT(ver), 281 IP_VERSION_SUBREV(ver)); 282 } 283 } 284 285 amdgpu_discovery_dump(coredump->adev, &p); 286 287 /* IP firmware information */ 288 drm_printf(&p, "\nIP Firmwares\n"); 289 amdgpu_devcoredump_fw_info(coredump->adev, &p); 290 291 if (coredump->ring) { 292 drm_printf(&p, "\nRing timed out details\n"); 293 drm_printf(&p, "IP Type: %d Ring Name: %s\n", 294 coredump->ring->funcs->type, 295 coredump->ring->name); 296 } 297 298 /* Add page fault information */ 299 fault_info = &coredump->adev->vm_manager.fault_info; 300 drm_printf(&p, "\n[%s] Page fault observed\n", 301 fault_info->vmhub ? "mmhub" : "gfxhub"); 302 drm_printf(&p, "Faulty page starting at address: 0x%016llx\n", fault_info->addr); 303 drm_printf(&p, "Protection fault status register: 0x%x\n\n", fault_info->status); 304 305 /* dump the ip state for each ip */ 306 drm_printf(&p, "IP Dump\n"); 307 for (int i = 0; i < coredump->adev->num_ip_blocks; i++) { 308 ip_block = &coredump->adev->ip_blocks[i]; 309 if (ip_block->version->funcs->print_ip_state) { 310 drm_printf(&p, "IP: %s\n", ip_block->version->funcs->name); 311 ip_block->version->funcs->print_ip_state(ip_block, &p); 312 drm_printf(&p, "\n"); 313 } 314 } 315 316 /* Add ring buffer information */ 317 drm_printf(&p, "Ring buffer information\n"); 318 if (coredump->num_rings) { 319 for (i = 0; i < coredump->num_rings; i++) { 320 ring_idx = coredump->rings[i].ring_index; 321 ring = coredump->adev->rings[ring_idx]; 322 off = coredump->rings[i].offset; 323 324 drm_printf(&p, "ring name: %s\n", ring->name); 325 drm_printf(&p, "Rptr: 0x%llx Wptr: 0x%llx RB mask: %x\n", 326 coredump->rings[i].rptr, 327 coredump->rings[i].wptr, 328 ring->buf_mask); 329 drm_printf(&p, "Ring size in dwords: %d\n", 330 ring->ring_size / 4); 331 drm_printf(&p, "Ring contents\n"); 332 drm_printf(&p, "Offset \t Value\n"); 333 334 for (j = 0; j < ring->ring_size; j += 4) 335 drm_printf(&p, "0x%x \t 0x%x\n", j, 336 coredump->rings_dw[off + j / 4]); 337 } 338 } 339 340 if (coredump->skip_vram_check) 341 drm_printf(&p, "VRAM lost check is skipped!\n"); 342 else if (coredump->reset_vram_lost) 343 drm_printf(&p, "VRAM is lost due to GPU reset!\n"); 344 345 if (coredump->num_ibs) { 346 /* Don't try to lookup the VM or map the BOs when calculating the 347 * size required to store the devcoredump. 348 */ 349 if (sizing_pass) 350 vm = NULL; 351 else 352 vm = amdgpu_vm_lock_by_pasid(adev, &root, coredump->pasid); 353 354 for (int i = 0; i < coredump->num_ibs && (sizing_pass || vm); i++) { 355 ib_content = kvmalloc_array(coredump->ibs[i].ib_size_dw, 4, 356 GFP_KERNEL); 357 if (!ib_content) 358 continue; 359 360 /* vm=NULL can only happen when 'sizing_pass' is true. Skip to the 361 * drm_printf() calls (ib_content doesn't need to be initialized 362 * as its content won't be written anywhere). 363 */ 364 if (!vm) 365 goto output_ib_content; 366 367 va_start = coredump->ibs[i].gpu_addr & AMDGPU_GMC_HOLE_MASK; 368 mapping = amdgpu_vm_bo_lookup_mapping(vm, va_start / AMDGPU_GPU_PAGE_SIZE); 369 if (!mapping) 370 goto free_ib_content; 371 372 offset = va_start - (mapping->start * AMDGPU_GPU_PAGE_SIZE); 373 abo = amdgpu_bo_ref(mapping->bo_va->base.bo); 374 r = amdgpu_bo_reserve(abo, false); 375 if (r) 376 goto free_ib_content; 377 378 if (abo->flags & AMDGPU_GEM_CREATE_NO_CPU_ACCESS) { 379 off = 0; 380 381 if (abo->tbo.resource->mem_type != TTM_PL_VRAM) 382 goto unreserve_abo; 383 384 amdgpu_res_first(abo->tbo.resource, offset, 385 coredump->ibs[i].ib_size_dw * 4, 386 &cursor); 387 while (cursor.remaining) { 388 amdgpu_device_mm_access(adev, cursor.start / 4, 389 &ib_content[off], cursor.size / 4, 390 false); 391 off += cursor.size; 392 amdgpu_res_next(&cursor, cursor.size); 393 } 394 } else { 395 r = ttm_bo_kmap(&abo->tbo, 0, 396 PFN_UP(abo->tbo.base.size), 397 &abo->kmap); 398 if (r) 399 goto unreserve_abo; 400 401 kptr = amdgpu_bo_kptr(abo); 402 kptr += offset; 403 memcpy(ib_content, kptr, 404 coredump->ibs[i].ib_size_dw * 4); 405 406 amdgpu_bo_kunmap(abo); 407 } 408 409 output_ib_content: 410 drm_printf(&p, "\nIB #%d 0x%llx %d dw\n", 411 i, coredump->ibs[i].gpu_addr, coredump->ibs[i].ib_size_dw); 412 for (int j = 0; j < coredump->ibs[i].ib_size_dw; j++) 413 drm_printf(&p, "0x%08x\n", ib_content[j]); 414 unreserve_abo: 415 if (vm) 416 amdgpu_bo_unreserve(abo); 417 free_ib_content: 418 kvfree(ib_content); 419 } 420 if (vm) { 421 amdgpu_bo_unreserve(root); 422 amdgpu_bo_unref(&root); 423 } 424 } 425 426 return count - iter.remain; 427 } 428 429 static ssize_t 430 amdgpu_devcoredump_read(char *buffer, loff_t offset, size_t count, 431 void *data, size_t datalen) 432 { 433 struct amdgpu_coredump_info *coredump = data; 434 ssize_t byte_copied; 435 436 if (!coredump) 437 return -ENODEV; 438 439 if (!coredump->formatted) 440 return -ENODEV; 441 442 if (offset >= coredump->formatted_size) 443 return 0; 444 445 byte_copied = count < coredump->formatted_size - offset ? count : 446 coredump->formatted_size - offset; 447 memcpy(buffer, coredump->formatted + offset, byte_copied); 448 449 return byte_copied; 450 } 451 452 static void amdgpu_devcoredump_free(void *data) 453 { 454 struct amdgpu_coredump_info *coredump = data; 455 456 kvfree(coredump->formatted); 457 kvfree(coredump->rings); 458 kvfree(coredump->rings_dw); 459 kvfree(data); 460 } 461 462 static void amdgpu_devcoredump_deferred_work(struct work_struct *work) 463 { 464 struct amdgpu_device *adev = container_of(work, typeof(*adev), coredump_work); 465 struct amdgpu_coredump_info *coredump = adev->coredump; 466 467 if (!coredump) 468 goto end; 469 470 /* Do a one-time preparation of the coredump output because 471 * repeatingly calling drm_coredump_printer is very slow. 472 */ 473 coredump->formatted_size = amdgpu_devcoredump_format( 474 NULL, AMDGPU_CORE_DUMP_SIZE_MAX, coredump); 475 coredump->formatted = kvzalloc(coredump->formatted_size, GFP_KERNEL); 476 if (!coredump->formatted) { 477 amdgpu_devcoredump_free(coredump); 478 goto end; 479 } 480 481 amdgpu_devcoredump_format(coredump->formatted, coredump->formatted_size, coredump); 482 483 /* If there's an existing coredump for this device, the free function will be 484 * called immediately so coredump might be invalid after the call to dev_coredumpm. 485 */ 486 dev_coredumpm(coredump->adev->dev, THIS_MODULE, coredump, 0, GFP_NOWAIT, 487 amdgpu_devcoredump_read, amdgpu_devcoredump_free); 488 489 end: 490 adev->coredump = NULL; 491 } 492 493 void amdgpu_coredump(struct amdgpu_device *adev, bool skip_vram_check, 494 bool vram_lost, struct amdgpu_job *job) 495 { 496 struct drm_device *dev = adev_to_drm(adev); 497 struct amdgpu_coredump_info *coredump; 498 size_t size = sizeof(*coredump); 499 struct drm_sched_job *s_job; 500 u64 total_ring_size, ring_count; 501 struct amdgpu_ring *ring; 502 int i, off, idx; 503 504 /* No need to generate a new coredump if there's one in progress already. */ 505 if (work_busy(&adev->coredump_work)) 506 return; 507 508 if (job && job->pasid) 509 size += sizeof(struct amdgpu_coredump_ib_info) * job->num_ibs; 510 511 coredump = kzalloc(size, GFP_NOWAIT); 512 if (!coredump) 513 return; 514 515 coredump->skip_vram_check = skip_vram_check; 516 coredump->reset_vram_lost = vram_lost; 517 518 if (job && job->pasid) { 519 struct amdgpu_task_info *ti; 520 521 ti = amdgpu_vm_get_task_info_pasid(adev, job->pasid); 522 if (ti) { 523 coredump->reset_task_info = *ti; 524 amdgpu_vm_put_task_info(ti); 525 } 526 coredump->pasid = job->pasid; 527 coredump->num_ibs = job->num_ibs; 528 for (i = 0; i < job->num_ibs; ++i) { 529 coredump->ibs[i].gpu_addr = job->ibs[i].gpu_addr; 530 coredump->ibs[i].ib_size_dw = job->ibs[i].length_dw; 531 } 532 } 533 534 if (job) { 535 s_job = &job->base; 536 coredump->ring = to_amdgpu_ring(s_job->sched); 537 } 538 539 /* Dump ring content if memory allocation succeeds. */ 540 ring_count = 0; 541 total_ring_size = 0; 542 for (i = 0; i < adev->num_rings; i++) { 543 ring = adev->rings[i]; 544 545 /* Only dump rings with unsignalled fences. */ 546 if (atomic_read(&ring->fence_drv.last_seq) == ring->fence_drv.sync_seq && 547 coredump->ring != ring) 548 continue; 549 550 total_ring_size += ring->ring_size; 551 ring_count++; 552 } 553 coredump->rings_dw = kzalloc(total_ring_size, GFP_NOWAIT); 554 coredump->rings = kcalloc(ring_count, sizeof(struct amdgpu_coredump_ring), GFP_NOWAIT); 555 if (coredump->rings && coredump->rings_dw) { 556 for (i = 0, off = 0, idx = 0; i < adev->num_rings && idx < ring_count; i++) { 557 ring = adev->rings[i]; 558 559 if (atomic_read(&ring->fence_drv.last_seq) == ring->fence_drv.sync_seq && 560 coredump->ring != ring) 561 continue; 562 563 coredump->rings[idx].ring_index = ring->idx; 564 coredump->rings[idx].rptr = amdgpu_ring_get_rptr(ring); 565 coredump->rings[idx].wptr = amdgpu_ring_get_wptr(ring); 566 coredump->rings[idx].offset = off; 567 568 memcpy(&coredump->rings_dw[off], ring->ring, ring->ring_size); 569 off += ring->ring_size / 4; 570 idx++; 571 } 572 coredump->num_rings = idx; 573 } else { 574 kvfree(coredump->rings_dw); 575 kvfree(coredump->rings); 576 coredump->rings_dw = NULL; 577 coredump->rings = NULL; 578 } 579 580 coredump->adev = adev; 581 582 ktime_get_ts64(&coredump->reset_time); 583 584 /* Update the current coredump pointer (no lock needed, this function can only be called 585 * from a single thread) 586 */ 587 adev->coredump = coredump; 588 /* Kick off coredump formatting to a worker thread. */ 589 queue_work(system_dfl_wq, &adev->coredump_work); 590 591 drm_info(dev, "AMDGPU device coredump file has been created\n"); 592 drm_info(dev, "Check your /sys/class/drm/card%d/device/devcoredump/data\n", 593 dev->primary->index); 594 } 595 596 void amdgpu_coredump_init(struct amdgpu_device *adev) 597 { 598 INIT_WORK(&adev->coredump_work, amdgpu_devcoredump_deferred_work); 599 } 600 601 void amdgpu_coredump_fini(struct amdgpu_device *adev) 602 { 603 /* Finish deferred coredump formatting before HW/IP teardown. */ 604 flush_work(&adev->coredump_work); 605 } 606 #endif 607