1 // SPDX-License-Identifier: MIT
2 /*
3 * Copyright 2024 Advanced Micro Devices, Inc.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice shall be included in
13 * all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
19 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
20 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
21 * OTHER DEALINGS IN THE SOFTWARE.
22 *
23 */
24
25 #include <generated/utsrelease.h>
26 #include <linux/devcoredump.h>
27 #include "amdgpu_dev_coredump.h"
28 #include "atom.h"
29
30 #ifndef CONFIG_DEV_COREDUMP
/*
 * Stub implementations used when CONFIG_DEV_COREDUMP is disabled: the
 * public entry points still exist so callers need no #ifdef guards, but
 * no coredump is ever generated.
 */
void amdgpu_coredump(struct amdgpu_device *adev, bool skip_vram_check,
		     bool vram_lost, struct amdgpu_job *job)
{
}
void amdgpu_coredump_init(struct amdgpu_device *adev)
{
}
void amdgpu_coredump_fini(struct amdgpu_device *adev)
{
}
41 #else
42
43 #define AMDGPU_CORE_DUMP_SIZE_MAX (256 * 1024 * 1024)
44
45 const char *hw_ip_names[MAX_HWIP] = {
46 [GC_HWIP] = "GC",
47 [HDP_HWIP] = "HDP",
48 [SDMA0_HWIP] = "SDMA0",
49 [SDMA1_HWIP] = "SDMA1",
50 [SDMA2_HWIP] = "SDMA2",
51 [SDMA3_HWIP] = "SDMA3",
52 [SDMA4_HWIP] = "SDMA4",
53 [SDMA5_HWIP] = "SDMA5",
54 [SDMA6_HWIP] = "SDMA6",
55 [SDMA7_HWIP] = "SDMA7",
56 [LSDMA_HWIP] = "LSDMA",
57 [MMHUB_HWIP] = "MMHUB",
58 [ATHUB_HWIP] = "ATHUB",
59 [NBIO_HWIP] = "NBIO",
60 [MP0_HWIP] = "MP0",
61 [MP1_HWIP] = "MP1",
62 [UVD_HWIP] = "UVD/JPEG/VCN",
63 [VCN1_HWIP] = "VCN1",
64 [VCE_HWIP] = "VCE",
65 [VPE_HWIP] = "VPE",
66 [DF_HWIP] = "DF",
67 [DCE_HWIP] = "DCE",
68 [OSSSYS_HWIP] = "OSSSYS",
69 [SMUIO_HWIP] = "SMUIO",
70 [PWR_HWIP] = "PWR",
71 [NBIF_HWIP] = "NBIF",
72 [THM_HWIP] = "THM",
73 [CLK_HWIP] = "CLK",
74 [UMC_HWIP] = "UMC",
75 [RSMU_HWIP] = "RSMU",
76 [XGMI_HWIP] = "XGMI",
77 [DCI_HWIP] = "DCI",
78 [PCIE_HWIP] = "PCIE",
79 };
80
amdgpu_devcoredump_fw_info(struct amdgpu_device * adev,struct drm_printer * p)81 static void amdgpu_devcoredump_fw_info(struct amdgpu_device *adev,
82 struct drm_printer *p)
83 {
84 uint32_t version;
85 uint32_t feature;
86 uint8_t smu_program, smu_major, smu_minor, smu_debug;
87 struct atom_context *ctx = adev->mode_info.atom_context;
88
89 drm_printf(p, "VCE feature version: %u, fw version: 0x%08x\n",
90 adev->vce.fb_version, adev->vce.fw_version);
91 drm_printf(p, "UVD feature version: %u, fw version: 0x%08x\n", 0,
92 adev->uvd.fw_version);
93 drm_printf(p, "GMC feature version: %u, fw version: 0x%08x\n", 0,
94 adev->gmc.fw_version);
95 drm_printf(p, "ME feature version: %u, fw version: 0x%08x\n",
96 adev->gfx.me_feature_version, adev->gfx.me_fw_version);
97 drm_printf(p, "PFP feature version: %u, fw version: 0x%08x\n",
98 adev->gfx.pfp_feature_version, adev->gfx.pfp_fw_version);
99 drm_printf(p, "CE feature version: %u, fw version: 0x%08x\n",
100 adev->gfx.ce_feature_version, adev->gfx.ce_fw_version);
101 drm_printf(p, "RLC feature version: %u, fw version: 0x%08x\n",
102 adev->gfx.rlc_feature_version, adev->gfx.rlc_fw_version);
103
104 drm_printf(p, "RLC SRLC feature version: %u, fw version: 0x%08x\n",
105 adev->gfx.rlc_srlc_feature_version,
106 adev->gfx.rlc_srlc_fw_version);
107 drm_printf(p, "RLC SRLG feature version: %u, fw version: 0x%08x\n",
108 adev->gfx.rlc_srlg_feature_version,
109 adev->gfx.rlc_srlg_fw_version);
110 drm_printf(p, "RLC SRLS feature version: %u, fw version: 0x%08x\n",
111 adev->gfx.rlc_srls_feature_version,
112 adev->gfx.rlc_srls_fw_version);
113 drm_printf(p, "RLCP feature version: %u, fw version: 0x%08x\n",
114 adev->gfx.rlcp_ucode_feature_version,
115 adev->gfx.rlcp_ucode_version);
116 drm_printf(p, "RLCV feature version: %u, fw version: 0x%08x\n",
117 adev->gfx.rlcv_ucode_feature_version,
118 adev->gfx.rlcv_ucode_version);
119 drm_printf(p, "MEC feature version: %u, fw version: 0x%08x\n",
120 adev->gfx.mec_feature_version, adev->gfx.mec_fw_version);
121
122 if (adev->gfx.mec2_fw)
123 drm_printf(p, "MEC2 feature version: %u, fw version: 0x%08x\n",
124 adev->gfx.mec2_feature_version,
125 adev->gfx.mec2_fw_version);
126
127 drm_printf(p, "IMU feature version: %u, fw version: 0x%08x\n", 0,
128 adev->gfx.imu_fw_version);
129 drm_printf(p, "PSP SOS feature version: %u, fw version: 0x%08x\n",
130 adev->psp.sos.feature_version, adev->psp.sos.fw_version);
131 drm_printf(p, "PSP ASD feature version: %u, fw version: 0x%08x\n",
132 adev->psp.asd_context.bin_desc.feature_version,
133 adev->psp.asd_context.bin_desc.fw_version);
134
135 drm_printf(p, "TA XGMI feature version: 0x%08x, fw version: 0x%08x\n",
136 adev->psp.xgmi_context.context.bin_desc.feature_version,
137 adev->psp.xgmi_context.context.bin_desc.fw_version);
138 drm_printf(p, "TA RAS feature version: 0x%08x, fw version: 0x%08x\n",
139 adev->psp.ras_context.context.bin_desc.feature_version,
140 adev->psp.ras_context.context.bin_desc.fw_version);
141 drm_printf(p, "TA HDCP feature version: 0x%08x, fw version: 0x%08x\n",
142 adev->psp.hdcp_context.context.bin_desc.feature_version,
143 adev->psp.hdcp_context.context.bin_desc.fw_version);
144 drm_printf(p, "TA DTM feature version: 0x%08x, fw version: 0x%08x\n",
145 adev->psp.dtm_context.context.bin_desc.feature_version,
146 adev->psp.dtm_context.context.bin_desc.fw_version);
147 drm_printf(p, "TA RAP feature version: 0x%08x, fw version: 0x%08x\n",
148 adev->psp.rap_context.context.bin_desc.feature_version,
149 adev->psp.rap_context.context.bin_desc.fw_version);
150 drm_printf(p,
151 "TA SECURE DISPLAY feature version: 0x%08x, fw version: 0x%08x\n",
152 adev->psp.securedisplay_context.context.bin_desc.feature_version,
153 adev->psp.securedisplay_context.context.bin_desc.fw_version);
154
155 /* SMC firmware */
156 version = adev->pm.fw_version;
157
158 smu_program = (version >> 24) & 0xff;
159 smu_major = (version >> 16) & 0xff;
160 smu_minor = (version >> 8) & 0xff;
161 smu_debug = (version >> 0) & 0xff;
162 drm_printf(p,
163 "SMC feature version: %u, program: %d, fw version: 0x%08x (%d.%d.%d)\n",
164 0, smu_program, version, smu_major, smu_minor, smu_debug);
165
166 /* SDMA firmware */
167 for (int i = 0; i < adev->sdma.num_instances; i++) {
168 drm_printf(p,
169 "SDMA%d feature version: %u, firmware version: 0x%08x\n",
170 i, adev->sdma.instance[i].feature_version,
171 adev->sdma.instance[i].fw_version);
172 }
173
174 drm_printf(p, "VCN feature version: %u, fw version: 0x%08x\n", 0,
175 adev->vcn.fw_version);
176 drm_printf(p, "DMCU feature version: %u, fw version: 0x%08x\n", 0,
177 adev->dm.dmcu_fw_version);
178 drm_printf(p, "DMCUB feature version: %u, fw version: 0x%08x\n", 0,
179 adev->dm.dmcub_fw_version);
180 drm_printf(p, "PSP TOC feature version: %u, fw version: 0x%08x\n",
181 adev->psp.toc.feature_version, adev->psp.toc.fw_version);
182
183 version = adev->mes.kiq_version & AMDGPU_MES_VERSION_MASK;
184 feature = (adev->mes.kiq_version & AMDGPU_MES_FEAT_VERSION_MASK) >>
185 AMDGPU_MES_FEAT_VERSION_SHIFT;
186 drm_printf(p, "MES_KIQ feature version: %u, fw version: 0x%08x\n",
187 feature, version);
188
189 version = adev->mes.sched_version & AMDGPU_MES_VERSION_MASK;
190 feature = (adev->mes.sched_version & AMDGPU_MES_FEAT_VERSION_MASK) >>
191 AMDGPU_MES_FEAT_VERSION_SHIFT;
192 drm_printf(p, "MES feature version: %u, fw version: 0x%08x\n", feature,
193 version);
194
195 drm_printf(p, "VPE feature version: %u, fw version: 0x%08x\n",
196 adev->vpe.feature_version, adev->vpe.fw_version);
197
198 if (adev->bios) {
199 drm_printf(p, "\nVBIOS Information\n");
200 drm_printf(p, "vbios name : %s\n", ctx->name);
201 drm_printf(p, "vbios pn : %s\n", ctx->vbios_pn);
202 drm_printf(p, "vbios version : %d\n", ctx->version);
203 drm_printf(p, "vbios ver_str : %s\n", ctx->vbios_ver_str);
204 drm_printf(p, "vbios date : %s\n", ctx->date);
205 }else {
206 drm_printf(p, "\nVBIOS Information: NA\n");
207 }
208 }
209
/**
 * amdgpu_devcoredump_format - render the coredump text into a buffer
 * @buffer: destination, or NULL to only compute the required size
 * @count: capacity of @buffer in bytes (upper bound during sizing)
 * @coredump: captured coredump state to render
 *
 * Runs in two modes: a "sizing pass" (@buffer == NULL) that formats
 * everything through the drm coredump printer just to count bytes, and a
 * real pass that writes the same output into @buffer. The VM is only
 * looked up and IB BOs only mapped during the real pass.
 *
 * Returns the number of bytes produced (count - iter.remain).
 */
static ssize_t
amdgpu_devcoredump_format(char *buffer, size_t count, struct amdgpu_coredump_info *coredump)
{
	struct amdgpu_device *adev = coredump->adev;
	struct drm_printer p;
	struct drm_print_iterator iter;
	struct amdgpu_vm_fault_info *fault_info;
	struct amdgpu_bo_va_mapping *mapping;
	struct amdgpu_ip_block *ip_block;
	struct amdgpu_res_cursor cursor;
	struct amdgpu_bo *abo, *root;
	uint64_t va_start, offset;
	struct amdgpu_ring *ring;
	struct amdgpu_vm *vm;
	u32 *ib_content;
	uint8_t *kptr;
	int ver, i, j, r;
	u32 ring_idx, off;
	bool sizing_pass;

	/* NULL buffer means we only measure; the iterator still counts bytes */
	sizing_pass = buffer == NULL;
	iter.data = buffer;
	iter.offset = 0;
	iter.remain = count;

	p = drm_coredump_printer(&iter);

	drm_printf(&p, "**** AMDGPU Device Coredump ****\n");
	drm_printf(&p, "version: " AMDGPU_COREDUMP_VERSION "\n");
	drm_printf(&p, "kernel: " UTS_RELEASE "\n");
	drm_printf(&p, "module: " KBUILD_MODNAME "\n");
	drm_printf(&p, "time: %ptSp\n", &coredump->reset_time);

	/* Process info is only captured when a faulting job/pasid was known */
	if (coredump->reset_task_info.task.pid)
		drm_printf(&p, "process_name: %s PID: %d\n",
			   coredump->reset_task_info.process_name,
			   coredump->reset_task_info.task.pid);

	/* SOC Information */
	drm_printf(&p, "\nSOC Information\n");
	drm_printf(&p, "SOC Device id: %d\n", coredump->adev->pdev->device);
	drm_printf(&p, "SOC PCI Revision id: %d\n", coredump->adev->pdev->revision);
	drm_printf(&p, "SOC Family: %d\n", coredump->adev->family);
	drm_printf(&p, "SOC Revision id: %d\n", coredump->adev->rev_id);
	drm_printf(&p, "SOC External Revision id: %d\n", coredump->adev->external_rev_id);

	/* Memory Information */
	drm_printf(&p, "\nSOC Memory Information\n");
	drm_printf(&p, "real vram size: %llu\n", coredump->adev->gmc.real_vram_size);
	drm_printf(&p, "visible vram size: %llu\n", coredump->adev->gmc.visible_vram_size);
	drm_printf(&p, "gtt size: %llu\n", coredump->adev->mman.gtt_mgr.manager.size);

	/* GDS Config */
	drm_printf(&p, "\nGDS Config\n");
	drm_printf(&p, "gds: total size: %d\n", coredump->adev->gds.gds_size);
	drm_printf(&p, "gds: compute partition size: %d\n", coredump->adev->gds.gds_size);
	drm_printf(&p, "gds: gws per compute partition: %d\n", coredump->adev->gds.gws_size);
	drm_printf(&p, "gds: os per compute partition: %d\n", coredump->adev->gds.oa_size);

	/* HWIP Version Information; index 0 is intentionally skipped */
	drm_printf(&p, "\nHW IP Version Information\n");
	for (int i = 1; i < MAX_HWIP; i++) {
		for (int j = 0; j < HWIP_MAX_INSTANCE; j++) {
			ver = coredump->adev->ip_versions[i][j];
			if (ver)
				drm_printf(&p, "HWIP: %s[%d][%d]: v%d.%d.%d.%d.%d\n",
					   hw_ip_names[i], i, j,
					   IP_VERSION_MAJ(ver),
					   IP_VERSION_MIN(ver),
					   IP_VERSION_REV(ver),
					   IP_VERSION_VARIANT(ver),
					   IP_VERSION_SUBREV(ver));
		}
	}

	amdgpu_discovery_dump(coredump->adev, &p);

	/* IP firmware information */
	drm_printf(&p, "\nIP Firmwares\n");
	amdgpu_devcoredump_fw_info(coredump->adev, &p);

	/* The ring the timed-out job was scheduled on, if any */
	if (coredump->ring) {
		drm_printf(&p, "\nRing timed out details\n");
		drm_printf(&p, "IP Type: %d Ring Name: %s\n",
			   coredump->ring->funcs->type,
			   coredump->ring->name);
	}

	/* Add page fault information */
	fault_info = &coredump->adev->vm_manager.fault_info;
	drm_printf(&p, "\n[%s] Page fault observed\n",
		   fault_info->vmhub ? "mmhub" : "gfxhub");
	drm_printf(&p, "Faulty page starting at address: 0x%016llx\n", fault_info->addr);
	drm_printf(&p, "Protection fault status register: 0x%x\n\n", fault_info->status);

	/* dump the ip state for each ip that implements print_ip_state */
	drm_printf(&p, "IP Dump\n");
	for (int i = 0; i < coredump->adev->num_ip_blocks; i++) {
		ip_block = &coredump->adev->ip_blocks[i];
		if (ip_block->version->funcs->print_ip_state) {
			drm_printf(&p, "IP: %s\n", ip_block->version->funcs->name);
			ip_block->version->funcs->print_ip_state(ip_block, &p);
			drm_printf(&p, "\n");
		}
	}

	/* Add ring buffer information captured earlier into rings_dw */
	drm_printf(&p, "Ring buffer information\n");
	if (coredump->num_rings) {
		for (i = 0; i < coredump->num_rings; i++) {
			ring_idx = coredump->rings[i].ring_index;
			ring = coredump->adev->rings[ring_idx];
			off = coredump->rings[i].offset;

			drm_printf(&p, "ring name: %s\n", ring->name);
			drm_printf(&p, "Rptr: 0x%llx Wptr: 0x%llx RB mask: %x\n",
				   coredump->rings[i].rptr,
				   coredump->rings[i].wptr,
				   ring->buf_mask);
			drm_printf(&p, "Ring size in dwords: %d\n",
				   ring->ring_size / 4);
			drm_printf(&p, "Ring contents\n");
			drm_printf(&p, "Offset \t Value\n");

			/* ring_size is in bytes; rings_dw is indexed in dwords */
			for (j = 0; j < ring->ring_size; j += 4)
				drm_printf(&p, "0x%x \t 0x%x\n", j,
					   coredump->rings_dw[off + j / 4]);
		}
	}

	if (coredump->skip_vram_check)
		drm_printf(&p, "VRAM lost check is skipped!\n");
	else if (coredump->reset_vram_lost)
		drm_printf(&p, "VRAM is lost due to GPU reset!\n");

	if (coredump->num_ibs) {
		/* Don't try to lookup the VM or map the BOs when calculating the
		 * size required to store the devcoredump.
		 */
		if (sizing_pass)
			vm = NULL;
		else
			vm = amdgpu_vm_lock_by_pasid(adev, &root, coredump->pasid);

		/* In the real pass, bail out of the loop if the VM is gone */
		for (int i = 0; i < coredump->num_ibs && (sizing_pass || vm); i++) {
			ib_content = kvmalloc_array(coredump->ibs[i].ib_size_dw, 4,
						    GFP_KERNEL);
			if (!ib_content)
				continue;

			/* vm=NULL can only happen when 'sizing_pass' is true. Skip to the
			 * drm_printf() calls (ib_content doesn't need to be initialized
			 * as its content won't be written anywhere).
			 */
			if (!vm)
				goto output_ib_content;

			/* Resolve the IB's GPU VA to the backing BO and offset */
			va_start = coredump->ibs[i].gpu_addr & AMDGPU_GMC_HOLE_MASK;
			mapping = amdgpu_vm_bo_lookup_mapping(vm, va_start / AMDGPU_GPU_PAGE_SIZE);
			if (!mapping)
				goto free_ib_content;

			offset = va_start - (mapping->start * AMDGPU_GPU_PAGE_SIZE);
			abo = amdgpu_bo_ref(mapping->bo_va->base.bo);
			r = amdgpu_bo_reserve(abo, false);
			if (r)
				goto free_ib_content;

			if (abo->flags & AMDGPU_GEM_CREATE_NO_CPU_ACCESS) {
				off = 0;

				/* Inaccessible non-VRAM placement cannot be read back */
				if (abo->tbo.resource->mem_type != TTM_PL_VRAM)
					goto unreserve_abo;

				/* Walk VRAM chunks and read via MMIO aperture */
				amdgpu_res_first(abo->tbo.resource, offset,
						 coredump->ibs[i].ib_size_dw * 4,
						 &cursor);
				while (cursor.remaining) {
					amdgpu_device_mm_access(adev, cursor.start / 4,
								&ib_content[off], cursor.size / 4,
								false);
					off += cursor.size;
					amdgpu_res_next(&cursor, cursor.size);
				}
			} else {
				/* CPU-visible BO: kmap and copy the IB directly */
				r = ttm_bo_kmap(&abo->tbo, 0,
						PFN_UP(abo->tbo.base.size),
						&abo->kmap);
				if (r)
					goto unreserve_abo;

				kptr = amdgpu_bo_kptr(abo);
				kptr += offset;
				memcpy(ib_content, kptr,
				       coredump->ibs[i].ib_size_dw * 4);

				amdgpu_bo_kunmap(abo);
			}

output_ib_content:
			drm_printf(&p, "\nIB #%d 0x%llx %d dw\n",
				   i, coredump->ibs[i].gpu_addr, coredump->ibs[i].ib_size_dw);
			for (int j = 0; j < coredump->ibs[i].ib_size_dw; j++)
				drm_printf(&p, "0x%08x\n", ib_content[j]);
unreserve_abo:
			/* abo was only reserved when the VM lookup path ran */
			if (vm)
				amdgpu_bo_unreserve(abo);
free_ib_content:
			kvfree(ib_content);
		}
		/* Drop the root PD reservation taken by amdgpu_vm_lock_by_pasid() */
		if (vm) {
			amdgpu_bo_unreserve(root);
			amdgpu_bo_unref(&root);
		}
	}

	return count - iter.remain;
}
428
429 static ssize_t
amdgpu_devcoredump_read(char * buffer,loff_t offset,size_t count,void * data,size_t datalen)430 amdgpu_devcoredump_read(char *buffer, loff_t offset, size_t count,
431 void *data, size_t datalen)
432 {
433 struct amdgpu_coredump_info *coredump = data;
434 ssize_t byte_copied;
435
436 if (!coredump)
437 return -ENODEV;
438
439 if (!coredump->formatted)
440 return -ENODEV;
441
442 if (offset >= coredump->formatted_size)
443 return 0;
444
445 byte_copied = count < coredump->formatted_size - offset ? count :
446 coredump->formatted_size - offset;
447 memcpy(buffer, coredump->formatted + offset, byte_copied);
448
449 return byte_copied;
450 }
451
amdgpu_devcoredump_free(void * data)452 static void amdgpu_devcoredump_free(void *data)
453 {
454 struct amdgpu_coredump_info *coredump = data;
455
456 kvfree(coredump->formatted);
457 kvfree(coredump->rings);
458 kvfree(coredump->rings_dw);
459 kvfree(data);
460 }
461
/**
 * amdgpu_devcoredump_deferred_work - format and publish a pending coredump
 * @work: adev->coredump_work, embedded in the amdgpu device
 *
 * Runs on a workqueue after amdgpu_coredump() captured the raw state.
 * Formats the dump once into coredump->formatted (a sizing pass followed
 * by a fill pass), then hands ownership of @coredump to the devcoredump
 * core via dev_coredumpm(); amdgpu_devcoredump_free() is the registered
 * destructor. adev->coredump is cleared in all cases so a new dump can
 * be started.
 */
static void amdgpu_devcoredump_deferred_work(struct work_struct *work)
{
	struct amdgpu_device *adev = container_of(work, typeof(*adev), coredump_work);
	struct amdgpu_coredump_info *coredump = adev->coredump;

	if (!coredump)
		goto end;

	/* Do a one-time preparation of the coredump output because
	 * repeatingly calling drm_coredump_printer is very slow.
	 */
	coredump->formatted_size = amdgpu_devcoredump_format(
		NULL, AMDGPU_CORE_DUMP_SIZE_MAX, coredump);
	coredump->formatted = kvzalloc(coredump->formatted_size, GFP_KERNEL);
	if (!coredump->formatted) {
		/* Can't publish without the formatted text; drop everything */
		amdgpu_devcoredump_free(coredump);
		goto end;
	}

	amdgpu_devcoredump_format(coredump->formatted, coredump->formatted_size, coredump);

	/* If there's an existing coredump for this device, the free function will be
	 * called immediately so coredump might be invalid after the call to dev_coredumpm.
	 */
	dev_coredumpm(coredump->adev->dev, THIS_MODULE, coredump, 0, GFP_NOWAIT,
		      amdgpu_devcoredump_read, amdgpu_devcoredump_free);

end:
	adev->coredump = NULL;
}
492
/**
 * amdgpu_coredump - capture device state and schedule a devcoredump
 * @adev: amdgpu device that was reset
 * @skip_vram_check: true if the VRAM-lost check was skipped during reset
 * @vram_lost: true if VRAM contents were lost by the reset
 * @job: the timed-out job that triggered the reset, or NULL
 *
 * Snapshots reset context (task info, IB addresses, ring contents with
 * unsignalled fences) into a freshly allocated amdgpu_coredump_info and
 * queues amdgpu_devcoredump_deferred_work() to format and publish it.
 * Uses GFP_NOWAIT throughout since this runs in the reset path; on any
 * allocation failure the corresponding data is simply omitted.
 */
void amdgpu_coredump(struct amdgpu_device *adev, bool skip_vram_check,
		     bool vram_lost, struct amdgpu_job *job)
{
	struct drm_device *dev = adev_to_drm(adev);
	struct amdgpu_coredump_info *coredump;
	size_t size = sizeof(*coredump);
	struct drm_sched_job *s_job;
	u64 total_ring_size, ring_count;
	struct amdgpu_ring *ring;
	int i, off, idx;

	/* No need to generate a new coredump if there's one in progress already. */
	if (work_busy(&adev->coredump_work))
		return;

	/* Extra room for the flexible ibs[] array when IBs will be recorded */
	if (job && job->pasid)
		size += sizeof(struct amdgpu_coredump_ib_info) * job->num_ibs;

	coredump = kzalloc(size, GFP_NOWAIT);
	if (!coredump)
		return;

	coredump->skip_vram_check = skip_vram_check;
	coredump->reset_vram_lost = vram_lost;

	/* Record the offending process and the GPU addresses of its IBs */
	if (job && job->pasid) {
		struct amdgpu_task_info *ti;

		ti = amdgpu_vm_get_task_info_pasid(adev, job->pasid);
		if (ti) {
			coredump->reset_task_info = *ti;
			amdgpu_vm_put_task_info(ti);
		}
		coredump->pasid = job->pasid;
		coredump->num_ibs = job->num_ibs;
		for (i = 0; i < job->num_ibs; ++i) {
			coredump->ibs[i].gpu_addr = job->ibs[i].gpu_addr;
			coredump->ibs[i].ib_size_dw = job->ibs[i].length_dw;
		}
	}

	/* Remember which ring the timed-out job was scheduled on */
	if (job) {
		s_job = &job->base;
		coredump->ring = to_amdgpu_ring(s_job->sched);
	}

	/* Dump ring content if memory allocation succeeds. */
	ring_count = 0;
	total_ring_size = 0;
	/* First pass: size the buffers needed for the rings worth dumping */
	for (i = 0; i < adev->num_rings; i++) {
		ring = adev->rings[i];

		/* Only dump rings with unsignalled fences. */
		if (atomic_read(&ring->fence_drv.last_seq) == ring->fence_drv.sync_seq &&
		    coredump->ring != ring)
			continue;

		total_ring_size += ring->ring_size;
		ring_count++;
	}
	coredump->rings_dw = kzalloc(total_ring_size, GFP_NOWAIT);
	coredump->rings = kcalloc(ring_count, sizeof(struct amdgpu_coredump_ring), GFP_NOWAIT);
	if (coredump->rings && coredump->rings_dw) {
		/* Second pass: copy ring contents and per-ring metadata.
		 * Must use the same selection criteria as the sizing pass above.
		 */
		for (i = 0, off = 0, idx = 0; i < adev->num_rings; i++) {
			ring = adev->rings[i];

			if (atomic_read(&ring->fence_drv.last_seq) == ring->fence_drv.sync_seq &&
			    coredump->ring != ring)
				continue;

			coredump->rings[idx].ring_index = ring->idx;
			coredump->rings[idx].rptr = amdgpu_ring_get_rptr(ring);
			coredump->rings[idx].wptr = amdgpu_ring_get_wptr(ring);
			coredump->rings[idx].offset = off;

			/* off is a dword index into rings_dw; ring_size is bytes */
			memcpy(&coredump->rings_dw[off], ring->ring, ring->ring_size);
			off += ring->ring_size / 4;
			idx++;
		}
		coredump->num_rings = idx;
	} else {
		/* Partial allocation: skip ring dumping entirely */
		kvfree(coredump->rings_dw);
		kvfree(coredump->rings);
		coredump->rings_dw = NULL;
		coredump->rings = NULL;
	}

	coredump->adev = adev;

	ktime_get_ts64(&coredump->reset_time);

	/* Update the current coredump pointer (no lock needed, this function can only be called
	 * from a single thread)
	 */
	adev->coredump = coredump;
	/* Kick off coredump formatting to a worker thread. */
	queue_work(system_unbound_wq, &adev->coredump_work);

	drm_info(dev, "AMDGPU device coredump file has been created\n");
	drm_info(dev, "Check your /sys/class/drm/card%d/device/devcoredump/data\n",
		 dev->primary->index);
}
595
/* Initialize the deferred-formatting work item; called once at device init. */
void amdgpu_coredump_init(struct amdgpu_device *adev)
{
	INIT_WORK(&adev->coredump_work, amdgpu_devcoredump_deferred_work);
}
600
/* Tear down coredump support; called once at device fini. */
void amdgpu_coredump_fini(struct amdgpu_device *adev)
{
	/* Finish deferred coredump formatting before HW/IP teardown. */
	flush_work(&adev->coredump_work);
}
606 #endif
607