xref: /linux/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c (revision 92c4c9fdc838d3b41a996bb700ea64b9e78fc7ea)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright 2024 Advanced Micro Devices, Inc.
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9  * and/or sell copies of the Software, and to permit persons to whom the
10  * Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice shall be included in
13  * all copies or substantial portions of the Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
19  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
20  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
21  * OTHER DEALINGS IN THE SOFTWARE.
22  *
23  */
24 
25 #include <generated/utsrelease.h>
26 #include <linux/devcoredump.h>
27 #include "amdgpu_dev_coredump.h"
28 #include "atom.h"
29 
30 #ifndef CONFIG_DEV_COREDUMP
/* No-op stub used when CONFIG_DEV_COREDUMP is not set: the devcoredump
 * infrastructure is compiled out, so reset paths can call this
 * unconditionally without any #ifdef at the call site.
 */
void amdgpu_coredump(struct amdgpu_device *adev, bool skip_vram_check,
		     bool vram_lost, struct amdgpu_job *job)
{
}
/* No-op stub for CONFIG_DEV_COREDUMP=n builds. */
void amdgpu_coredump_init(struct amdgpu_device *adev)
{
}
/* No-op stub for CONFIG_DEV_COREDUMP=n builds. */
void amdgpu_coredump_fini(struct amdgpu_device *adev)
{
}
41 #else
42 
43 #define AMDGPU_CORE_DUMP_SIZE_MAX (256 * 1024 * 1024)
44 
45 const char *hw_ip_names[MAX_HWIP] = {
46 	[GC_HWIP]		= "GC",
47 	[HDP_HWIP]		= "HDP",
48 	[SDMA0_HWIP]		= "SDMA0",
49 	[SDMA1_HWIP]		= "SDMA1",
50 	[SDMA2_HWIP]		= "SDMA2",
51 	[SDMA3_HWIP]		= "SDMA3",
52 	[SDMA4_HWIP]		= "SDMA4",
53 	[SDMA5_HWIP]		= "SDMA5",
54 	[SDMA6_HWIP]		= "SDMA6",
55 	[SDMA7_HWIP]		= "SDMA7",
56 	[LSDMA_HWIP]		= "LSDMA",
57 	[MMHUB_HWIP]		= "MMHUB",
58 	[ATHUB_HWIP]		= "ATHUB",
59 	[NBIO_HWIP]		= "NBIO",
60 	[MP0_HWIP]		= "MP0",
61 	[MP1_HWIP]		= "MP1",
62 	[UVD_HWIP]		= "UVD/JPEG/VCN",
63 	[VCN1_HWIP]		= "VCN1",
64 	[VCE_HWIP]		= "VCE",
65 	[VPE_HWIP]		= "VPE",
66 	[DF_HWIP]		= "DF",
67 	[DCE_HWIP]		= "DCE",
68 	[OSSSYS_HWIP]		= "OSSSYS",
69 	[SMUIO_HWIP]		= "SMUIO",
70 	[PWR_HWIP]		= "PWR",
71 	[NBIF_HWIP]		= "NBIF",
72 	[THM_HWIP]		= "THM",
73 	[CLK_HWIP]		= "CLK",
74 	[UMC_HWIP]		= "UMC",
75 	[RSMU_HWIP]		= "RSMU",
76 	[XGMI_HWIP]		= "XGMI",
77 	[DCI_HWIP]		= "DCI",
78 	[PCIE_HWIP]		= "PCIE",
79 };
80 
amdgpu_devcoredump_fw_info(struct amdgpu_device * adev,struct drm_printer * p)81 static void amdgpu_devcoredump_fw_info(struct amdgpu_device *adev,
82 				       struct drm_printer *p)
83 {
84 	uint32_t version;
85 	uint32_t feature;
86 	uint8_t smu_program, smu_major, smu_minor, smu_debug;
87 	struct atom_context *ctx = adev->mode_info.atom_context;
88 
89 	drm_printf(p, "VCE feature version: %u, fw version: 0x%08x\n",
90 		   adev->vce.fb_version, adev->vce.fw_version);
91 	drm_printf(p, "UVD feature version: %u, fw version: 0x%08x\n", 0,
92 		   adev->uvd.fw_version);
93 	drm_printf(p, "GMC feature version: %u, fw version: 0x%08x\n", 0,
94 		   adev->gmc.fw_version);
95 	drm_printf(p, "ME feature version: %u, fw version: 0x%08x\n",
96 		   adev->gfx.me_feature_version, adev->gfx.me_fw_version);
97 	drm_printf(p, "PFP feature version: %u, fw version: 0x%08x\n",
98 		   adev->gfx.pfp_feature_version, adev->gfx.pfp_fw_version);
99 	drm_printf(p, "CE feature version: %u, fw version: 0x%08x\n",
100 		   adev->gfx.ce_feature_version, adev->gfx.ce_fw_version);
101 	drm_printf(p, "RLC feature version: %u, fw version: 0x%08x\n",
102 		   adev->gfx.rlc_feature_version, adev->gfx.rlc_fw_version);
103 
104 	drm_printf(p, "RLC SRLC feature version: %u, fw version: 0x%08x\n",
105 		   adev->gfx.rlc_srlc_feature_version,
106 		   adev->gfx.rlc_srlc_fw_version);
107 	drm_printf(p, "RLC SRLG feature version: %u, fw version: 0x%08x\n",
108 		   adev->gfx.rlc_srlg_feature_version,
109 		   adev->gfx.rlc_srlg_fw_version);
110 	drm_printf(p, "RLC SRLS feature version: %u, fw version: 0x%08x\n",
111 		   adev->gfx.rlc_srls_feature_version,
112 		   adev->gfx.rlc_srls_fw_version);
113 	drm_printf(p, "RLCP feature version: %u, fw version: 0x%08x\n",
114 		   adev->gfx.rlcp_ucode_feature_version,
115 		   adev->gfx.rlcp_ucode_version);
116 	drm_printf(p, "RLCV feature version: %u, fw version: 0x%08x\n",
117 		   adev->gfx.rlcv_ucode_feature_version,
118 		   adev->gfx.rlcv_ucode_version);
119 	drm_printf(p, "MEC feature version: %u, fw version: 0x%08x\n",
120 		   adev->gfx.mec_feature_version, adev->gfx.mec_fw_version);
121 
122 	if (adev->gfx.mec2_fw)
123 		drm_printf(p, "MEC2 feature version: %u, fw version: 0x%08x\n",
124 			   adev->gfx.mec2_feature_version,
125 			   adev->gfx.mec2_fw_version);
126 
127 	drm_printf(p, "IMU feature version: %u, fw version: 0x%08x\n", 0,
128 		   adev->gfx.imu_fw_version);
129 	drm_printf(p, "PSP SOS feature version: %u, fw version: 0x%08x\n",
130 		   adev->psp.sos.feature_version, adev->psp.sos.fw_version);
131 	drm_printf(p, "PSP ASD feature version: %u, fw version: 0x%08x\n",
132 		   adev->psp.asd_context.bin_desc.feature_version,
133 		   adev->psp.asd_context.bin_desc.fw_version);
134 
135 	drm_printf(p, "TA XGMI feature version: 0x%08x, fw version: 0x%08x\n",
136 		   adev->psp.xgmi_context.context.bin_desc.feature_version,
137 		   adev->psp.xgmi_context.context.bin_desc.fw_version);
138 	drm_printf(p, "TA RAS feature version: 0x%08x, fw version: 0x%08x\n",
139 		   adev->psp.ras_context.context.bin_desc.feature_version,
140 		   adev->psp.ras_context.context.bin_desc.fw_version);
141 	drm_printf(p, "TA HDCP feature version: 0x%08x, fw version: 0x%08x\n",
142 		   adev->psp.hdcp_context.context.bin_desc.feature_version,
143 		   adev->psp.hdcp_context.context.bin_desc.fw_version);
144 	drm_printf(p, "TA DTM feature version: 0x%08x, fw version: 0x%08x\n",
145 		   adev->psp.dtm_context.context.bin_desc.feature_version,
146 		   adev->psp.dtm_context.context.bin_desc.fw_version);
147 	drm_printf(p, "TA RAP feature version: 0x%08x, fw version: 0x%08x\n",
148 		   adev->psp.rap_context.context.bin_desc.feature_version,
149 		   adev->psp.rap_context.context.bin_desc.fw_version);
150 	drm_printf(p,
151 		   "TA SECURE DISPLAY feature version: 0x%08x, fw version: 0x%08x\n",
152 		   adev->psp.securedisplay_context.context.bin_desc.feature_version,
153 		   adev->psp.securedisplay_context.context.bin_desc.fw_version);
154 
155 	/* SMC firmware */
156 	version = adev->pm.fw_version;
157 
158 	smu_program = (version >> 24) & 0xff;
159 	smu_major = (version >> 16) & 0xff;
160 	smu_minor = (version >> 8) & 0xff;
161 	smu_debug = (version >> 0) & 0xff;
162 	drm_printf(p,
163 		   "SMC feature version: %u, program: %d, fw version: 0x%08x (%d.%d.%d)\n",
164 		   0, smu_program, version, smu_major, smu_minor, smu_debug);
165 
166 	/* SDMA firmware */
167 	for (int i = 0; i < adev->sdma.num_instances; i++) {
168 		drm_printf(p,
169 			   "SDMA%d feature version: %u, firmware version: 0x%08x\n",
170 			   i, adev->sdma.instance[i].feature_version,
171 			   adev->sdma.instance[i].fw_version);
172 	}
173 
174 	drm_printf(p, "VCN feature version: %u, fw version: 0x%08x\n", 0,
175 		   adev->vcn.fw_version);
176 	drm_printf(p, "DMCU feature version: %u, fw version: 0x%08x\n", 0,
177 		   adev->dm.dmcu_fw_version);
178 	drm_printf(p, "DMCUB feature version: %u, fw version: 0x%08x\n", 0,
179 		   adev->dm.dmcub_fw_version);
180 	drm_printf(p, "PSP TOC feature version: %u, fw version: 0x%08x\n",
181 		   adev->psp.toc.feature_version, adev->psp.toc.fw_version);
182 
183 	version = adev->mes.kiq_version & AMDGPU_MES_VERSION_MASK;
184 	feature = (adev->mes.kiq_version & AMDGPU_MES_FEAT_VERSION_MASK) >>
185 		  AMDGPU_MES_FEAT_VERSION_SHIFT;
186 	drm_printf(p, "MES_KIQ feature version: %u, fw version: 0x%08x\n",
187 		   feature, version);
188 
189 	version = adev->mes.sched_version & AMDGPU_MES_VERSION_MASK;
190 	feature = (adev->mes.sched_version & AMDGPU_MES_FEAT_VERSION_MASK) >>
191 		  AMDGPU_MES_FEAT_VERSION_SHIFT;
192 	drm_printf(p, "MES feature version: %u, fw version: 0x%08x\n", feature,
193 		   version);
194 
195 	drm_printf(p, "VPE feature version: %u, fw version: 0x%08x\n",
196 		   adev->vpe.feature_version, adev->vpe.fw_version);
197 
198 	if (adev->bios) {
199 		drm_printf(p, "\nVBIOS Information\n");
200 		drm_printf(p, "vbios name       : %s\n", ctx->name);
201 		drm_printf(p, "vbios pn         : %s\n", ctx->vbios_pn);
202 		drm_printf(p, "vbios version    : %d\n", ctx->version);
203 		drm_printf(p, "vbios ver_str    : %s\n", ctx->vbios_ver_str);
204 		drm_printf(p, "vbios date       : %s\n", ctx->date);
205 	}else {
206 		drm_printf(p, "\nVBIOS Information: NA\n");
207 	}
208 }
209 
/**
 * amdgpu_devcoredump_format - render the coredump snapshot as text
 * @buffer: destination buffer, or NULL to run a sizing pass that only
 *          computes how many bytes the formatted dump needs
 * @count: capacity of @buffer in bytes
 * @coredump: snapshot previously gathered by amdgpu_coredump()
 *
 * Emits the SOC/memory/GDS info, per-IP versions and firmware table, ring
 * timeout details, page fault state, per-IP register dumps, captured ring
 * contents, and finally the IBs of the hung job (read back from their BOs).
 *
 * Return: number of bytes written (or that would be written when @buffer
 * is NULL — drm_coredump_printer() supports a NULL data pointer for sizing).
 */
static ssize_t
amdgpu_devcoredump_format(char *buffer, size_t count, struct amdgpu_coredump_info *coredump)
{
	struct amdgpu_device *adev = coredump->adev;
	struct drm_printer p;
	struct drm_print_iterator iter;
	struct amdgpu_vm_fault_info *fault_info;
	struct amdgpu_bo_va_mapping *mapping;
	struct amdgpu_ip_block *ip_block;
	struct amdgpu_res_cursor cursor;
	struct amdgpu_bo *abo, *root;
	uint64_t va_start, offset;
	struct amdgpu_ring *ring;
	struct amdgpu_vm *vm;
	u32 *ib_content;
	uint8_t *kptr;
	int ver, i, j, r;
	u32 ring_idx, off;
	bool sizing_pass;

	/* NULL buffer means "measure only"; the IB section uses this to avoid
	 * touching the VM/BOs twice.
	 */
	sizing_pass = buffer == NULL;
	iter.data = buffer;
	iter.offset = 0;
	iter.remain = count;

	p = drm_coredump_printer(&iter);

	drm_printf(&p, "**** AMDGPU Device Coredump ****\n");
	drm_printf(&p, "version: " AMDGPU_COREDUMP_VERSION "\n");
	drm_printf(&p, "kernel: " UTS_RELEASE "\n");
	drm_printf(&p, "module: " KBUILD_MODNAME "\n");
	drm_printf(&p, "time: %ptSp\n", &coredump->reset_time);

	/* Task info is only captured when the hung job carried a pasid. */
	if (coredump->reset_task_info.task.pid)
		drm_printf(&p, "process_name: %s PID: %d\n",
			   coredump->reset_task_info.process_name,
			   coredump->reset_task_info.task.pid);

	/* SOC Information */
	drm_printf(&p, "\nSOC Information\n");
	drm_printf(&p, "SOC Device id: %d\n", coredump->adev->pdev->device);
	drm_printf(&p, "SOC PCI Revision id: %d\n", coredump->adev->pdev->revision);
	drm_printf(&p, "SOC Family: %d\n", coredump->adev->family);
	drm_printf(&p, "SOC Revision id: %d\n", coredump->adev->rev_id);
	drm_printf(&p, "SOC External Revision id: %d\n", coredump->adev->external_rev_id);

	/* Memory Information */
	drm_printf(&p, "\nSOC Memory Information\n");
	drm_printf(&p, "real vram size: %llu\n", coredump->adev->gmc.real_vram_size);
	drm_printf(&p, "visible vram size: %llu\n", coredump->adev->gmc.visible_vram_size);
	drm_printf(&p, "gtt size: %llu\n", coredump->adev->mman.gtt_mgr.manager.size);

	/* GDS Config */
	drm_printf(&p, "\nGDS Config\n");
	drm_printf(&p, "gds: total size: %d\n", coredump->adev->gds.gds_size);
	drm_printf(&p, "gds: compute partition size: %d\n", coredump->adev->gds.gds_size);
	drm_printf(&p, "gds: gws per compute partition: %d\n", coredump->adev->gds.gws_size);
	drm_printf(&p, "gds: os per compute partition: %d\n", coredump->adev->gds.oa_size);

	/* HWIP Version Information (index 0 is unused; HWIP enum starts at 1) */
	drm_printf(&p, "\nHW IP Version Information\n");
	for (int i = 1; i < MAX_HWIP; i++) {
		for (int j = 0; j < HWIP_MAX_INSTANCE; j++) {
			ver = coredump->adev->ip_versions[i][j];
			if (ver)
				drm_printf(&p, "HWIP: %s[%d][%d]: v%d.%d.%d.%d.%d\n",
					   hw_ip_names[i], i, j,
					   IP_VERSION_MAJ(ver),
					   IP_VERSION_MIN(ver),
					   IP_VERSION_REV(ver),
					   IP_VERSION_VARIANT(ver),
					   IP_VERSION_SUBREV(ver));
		}
	}

	amdgpu_discovery_dump(coredump->adev, &p);

	/* IP firmware information */
	drm_printf(&p, "\nIP Firmwares\n");
	amdgpu_devcoredump_fw_info(coredump->adev, &p);

	/* Only set when the dump was triggered by a job timeout. */
	if (coredump->ring) {
		drm_printf(&p, "\nRing timed out details\n");
		drm_printf(&p, "IP Type: %d Ring Name: %s\n",
			   coredump->ring->funcs->type,
			   coredump->ring->name);
	}

	/* Add page fault information */
	fault_info = &coredump->adev->vm_manager.fault_info;
	drm_printf(&p, "\n[%s] Page fault observed\n",
		   fault_info->vmhub ? "mmhub" : "gfxhub");
	drm_printf(&p, "Faulty page starting at address: 0x%016llx\n", fault_info->addr);
	drm_printf(&p, "Protection fault status register: 0x%x\n\n", fault_info->status);

	/* dump the ip state for each ip */
	drm_printf(&p, "IP Dump\n");
	for (int i = 0; i < coredump->adev->num_ip_blocks; i++) {
		ip_block = &coredump->adev->ip_blocks[i];
		if (ip_block->version->funcs->print_ip_state) {
			drm_printf(&p, "IP: %s\n", ip_block->version->funcs->name);
			ip_block->version->funcs->print_ip_state(ip_block, &p);
			drm_printf(&p, "\n");
		}
	}

	/* Add ring buffer information captured in amdgpu_coredump():
	 * rings_dw is one flat dword array, each ring at its recorded offset.
	 */
	drm_printf(&p, "Ring buffer information\n");
	if (coredump->num_rings) {
		for (i = 0; i < coredump->num_rings; i++) {
			ring_idx = coredump->rings[i].ring_index;
			ring = coredump->adev->rings[ring_idx];
			off = coredump->rings[i].offset;

			drm_printf(&p, "ring name: %s\n", ring->name);
			drm_printf(&p, "Rptr: 0x%llx Wptr: 0x%llx RB mask: %x\n",
				   coredump->rings[i].rptr,
				   coredump->rings[i].wptr,
				   ring->buf_mask);
			drm_printf(&p, "Ring size in dwords: %d\n",
				ring->ring_size / 4);
			drm_printf(&p, "Ring contents\n");
			drm_printf(&p, "Offset \t Value\n");

			/* j walks byte offsets; rings_dw is indexed in dwords */
			for (j = 0; j < ring->ring_size; j += 4)
				drm_printf(&p, "0x%x \t 0x%x\n", j,
					   coredump->rings_dw[off + j / 4]);
		}
	}

	if (coredump->skip_vram_check)
		drm_printf(&p, "VRAM lost check is skipped!\n");
	else if (coredump->reset_vram_lost)
		drm_printf(&p, "VRAM is lost due to GPU reset!\n");

	if (coredump->num_ibs) {
		/* Don't try to lookup the VM or map the BOs when calculating the
		 * size required to store the devcoredump.
		 */
		if (sizing_pass)
			vm = NULL;
		else
			vm = amdgpu_vm_lock_by_pasid(adev, &root, coredump->pasid);

		for (int i = 0; i < coredump->num_ibs && (sizing_pass || vm); i++) {
			ib_content = kvmalloc_array(coredump->ibs[i].ib_size_dw, 4,
						    GFP_KERNEL);
			if (!ib_content)
				continue;

			/* vm=NULL can only happen when 'sizing_pass' is true. Skip to the
			 * drm_printf() calls (ib_content doesn't need to be initialized
			 * as its content won't be written anywhere).
			 */
			if (!vm)
				goto output_ib_content;

			va_start = coredump->ibs[i].gpu_addr & AMDGPU_GMC_HOLE_MASK;
			mapping = amdgpu_vm_bo_lookup_mapping(vm, va_start / AMDGPU_GPU_PAGE_SIZE);
			if (!mapping)
				goto free_ib_content;

			offset = va_start - (mapping->start * AMDGPU_GPU_PAGE_SIZE);
			abo = amdgpu_bo_ref(mapping->bo_va->base.bo);
			r = amdgpu_bo_reserve(abo, false);
			if (r)
				goto free_ib_content;

			if (abo->flags & AMDGPU_GEM_CREATE_NO_CPU_ACCESS) {
				off = 0;

				/* MMIO read-back path only works for VRAM placements */
				if (abo->tbo.resource->mem_type != TTM_PL_VRAM)
					goto unreserve_abo;

				amdgpu_res_first(abo->tbo.resource, offset,
						 coredump->ibs[i].ib_size_dw * 4,
						 &cursor);
				while (cursor.remaining) {
					/* NOTE(review): cursor.start/cursor.size are byte
					 * quantities, yet they are divided by 4 here while
					 * 'off' (used to index the u32 ib_content array) is
					 * advanced in bytes — verify unit handling against
					 * amdgpu_device_mm_access() semantics.
					 */
					amdgpu_device_mm_access(adev, cursor.start / 4,
								&ib_content[off], cursor.size / 4,
								false);
					off += cursor.size;
					amdgpu_res_next(&cursor, cursor.size);
				}
			} else {
				/* CPU-visible BO: kmap and copy the IB directly */
				r = ttm_bo_kmap(&abo->tbo, 0,
						PFN_UP(abo->tbo.base.size),
						&abo->kmap);
				if (r)
					goto unreserve_abo;

				kptr = amdgpu_bo_kptr(abo);
				kptr += offset;
				memcpy(ib_content, kptr,
				       coredump->ibs[i].ib_size_dw * 4);

				amdgpu_bo_kunmap(abo);
			}

output_ib_content:
			drm_printf(&p, "\nIB #%d 0x%llx %d dw\n",
				   i, coredump->ibs[i].gpu_addr, coredump->ibs[i].ib_size_dw);
			for (int j = 0; j < coredump->ibs[i].ib_size_dw; j++)
				drm_printf(&p, "0x%08x\n", ib_content[j]);
unreserve_abo:
			if (vm)
				amdgpu_bo_unreserve(abo);
free_ib_content:
			kvfree(ib_content);
		}
		/* Drop the VM root reservation taken by amdgpu_vm_lock_by_pasid() */
		if (vm) {
			amdgpu_bo_unreserve(root);
			amdgpu_bo_unref(&root);
		}
	}

	return count - iter.remain;
}
428 
429 static ssize_t
amdgpu_devcoredump_read(char * buffer,loff_t offset,size_t count,void * data,size_t datalen)430 amdgpu_devcoredump_read(char *buffer, loff_t offset, size_t count,
431 			void *data, size_t datalen)
432 {
433 	struct amdgpu_coredump_info *coredump = data;
434 	ssize_t byte_copied;
435 
436 	if (!coredump)
437 		return -ENODEV;
438 
439 	if (!coredump->formatted)
440 		return -ENODEV;
441 
442 	if (offset >= coredump->formatted_size)
443 		return 0;
444 
445 	byte_copied = count < coredump->formatted_size - offset ? count :
446 		coredump->formatted_size - offset;
447 	memcpy(buffer, coredump->formatted + offset, byte_copied);
448 
449 	return byte_copied;
450 }
451 
amdgpu_devcoredump_free(void * data)452 static void amdgpu_devcoredump_free(void *data)
453 {
454 	struct amdgpu_coredump_info *coredump = data;
455 
456 	kvfree(coredump->formatted);
457 	kvfree(coredump->rings);
458 	kvfree(coredump->rings_dw);
459 	kvfree(data);
460 }
461 
/* Deferred worker that formats the snapshot captured by amdgpu_coredump()
 * and registers it with the devcoredump framework. Runs on
 * system_unbound_wq so the (slow) formatting happens outside the reset
 * path. Ownership of adev->coredump transfers to dev_coredumpm() on
 * success, or is freed here on allocation failure.
 */
static void amdgpu_devcoredump_deferred_work(struct work_struct *work)
{
	struct amdgpu_device *adev = container_of(work, typeof(*adev), coredump_work);
	struct amdgpu_coredump_info *coredump = adev->coredump;

	if (!coredump)
		goto end;

	/* Do a one-time preparation of the coredump output because
	 * repeatingly calling drm_coredump_printer is very slow.
	 */
	/* First pass with a NULL buffer only measures the required size. */
	coredump->formatted_size = amdgpu_devcoredump_format(
		NULL, AMDGPU_CORE_DUMP_SIZE_MAX, coredump);
	coredump->formatted = kvzalloc(coredump->formatted_size, GFP_KERNEL);
	if (!coredump->formatted) {
		amdgpu_devcoredump_free(coredump);
		goto end;
	}

	/* Second pass actually writes the text into the buffer. */
	amdgpu_devcoredump_format(coredump->formatted, coredump->formatted_size, coredump);

	/* If there's an existing coredump for this device, the free function will be
	 * called immediately so coredump might be invalid after the call to dev_coredumpm.
	 */
	dev_coredumpm(coredump->adev->dev, THIS_MODULE, coredump, 0, GFP_NOWAIT,
		      amdgpu_devcoredump_read, amdgpu_devcoredump_free);

end:
	adev->coredump = NULL;
}
492 
/**
 * amdgpu_coredump - snapshot GPU state for a device coredump
 * @adev: amdgpu device being reset
 * @skip_vram_check: true when the VRAM-lost check was not performed
 * @vram_lost: result of the VRAM-lost check (ignored if skipped)
 * @job: the timed-out job that triggered the reset, or NULL
 *
 * Called from the reset path, so all allocations use GFP_NOWAIT and
 * failures degrade the dump rather than abort it. Captures the task/IB
 * info of the hung job and a copy of every ring that still has
 * unsignalled fences, then queues the slow text formatting to a worker.
 */
void amdgpu_coredump(struct amdgpu_device *adev, bool skip_vram_check,
		     bool vram_lost, struct amdgpu_job *job)
{
	struct drm_device *dev = adev_to_drm(adev);
	struct amdgpu_coredump_info *coredump;
	size_t size = sizeof(*coredump);
	struct drm_sched_job *s_job;
	u64 total_ring_size, ring_count;
	struct amdgpu_ring *ring;
	int i, off, idx;

	/* No need to generate a new coredump if there's one in progress already. */
	if (work_busy(&adev->coredump_work))
		return;

	/* Space for the trailing flexible ibs[] array, one entry per job IB. */
	if (job && job->pasid)
		size += sizeof(struct amdgpu_coredump_ib_info) * job->num_ibs;

	coredump = kzalloc(size, GFP_NOWAIT);
	if (!coredump)
		return;

	coredump->skip_vram_check = skip_vram_check;
	coredump->reset_vram_lost = vram_lost;

	/* Record the offending process and the GPU addresses/sizes of its IBs
	 * so the worker can read them back later.
	 */
	if (job && job->pasid) {
		struct amdgpu_task_info *ti;

		ti = amdgpu_vm_get_task_info_pasid(adev, job->pasid);
		if (ti) {
			coredump->reset_task_info = *ti;
			amdgpu_vm_put_task_info(ti);
		}
		coredump->pasid = job->pasid;
		coredump->num_ibs = job->num_ibs;
		for (i = 0; i < job->num_ibs; ++i) {
			coredump->ibs[i].gpu_addr = job->ibs[i].gpu_addr;
			coredump->ibs[i].ib_size_dw = job->ibs[i].length_dw;
		}
	}

	if (job) {
		s_job = &job->base;
		coredump->ring = to_amdgpu_ring(s_job->sched);
	}

	/* Dump ring content if memory allocation succeeds. */
	/* First pass: size the combined buffer for all rings of interest. */
	ring_count = 0;
	total_ring_size = 0;
	for (i = 0; i < adev->num_rings; i++) {
		ring = adev->rings[i];

		/* Only dump rings with unsignalled fences. */
		if (atomic_read(&ring->fence_drv.last_seq) == ring->fence_drv.sync_seq &&
		    coredump->ring != ring)
			continue;

		total_ring_size += ring->ring_size;
		ring_count++;
	}
	/* Second pass: snapshot each selected ring into the flat dword buffer.
	 * On any allocation failure the ring dump is simply omitted.
	 */
	coredump->rings_dw = kzalloc(total_ring_size, GFP_NOWAIT);
	coredump->rings = kcalloc(ring_count, sizeof(struct amdgpu_coredump_ring), GFP_NOWAIT);
	if (coredump->rings && coredump->rings_dw) {
		for (i = 0, off = 0, idx = 0; i < adev->num_rings; i++) {
			ring = adev->rings[i];

			if (atomic_read(&ring->fence_drv.last_seq) == ring->fence_drv.sync_seq &&
			    coredump->ring != ring)
				continue;

			coredump->rings[idx].ring_index = ring->idx;
			coredump->rings[idx].rptr = amdgpu_ring_get_rptr(ring);
			coredump->rings[idx].wptr = amdgpu_ring_get_wptr(ring);
			coredump->rings[idx].offset = off;

			memcpy(&coredump->rings_dw[off], ring->ring, ring->ring_size);
			off += ring->ring_size / 4;
			idx++;
		}
		coredump->num_rings = idx;
	} else {
		kvfree(coredump->rings_dw);
		kvfree(coredump->rings);
		coredump->rings_dw = NULL;
		coredump->rings = NULL;
	}

	coredump->adev = adev;

	ktime_get_ts64(&coredump->reset_time);

	/* Update the current coredump pointer (no lock needed, this function can only be called
	 * from a single thread)
	 */
	adev->coredump = coredump;
	/* Kick off coredump formatting to a worker thread. */
	queue_work(system_unbound_wq, &adev->coredump_work);

	drm_info(dev, "AMDGPU device coredump file has been created\n");
	drm_info(dev, "Check your /sys/class/drm/card%d/device/devcoredump/data\n",
		 dev->primary->index);
}
595 
/* Set up the deferred-formatting work item; called once at device init. */
void amdgpu_coredump_init(struct amdgpu_device *adev)
{
	INIT_WORK(&adev->coredump_work, amdgpu_devcoredump_deferred_work);
}
600 
/* Tear-down counterpart of amdgpu_coredump_init(). */
void amdgpu_coredump_fini(struct amdgpu_device *adev)
{
	/* Finish deferred coredump formatting before HW/IP teardown. */
	flush_work(&adev->coredump_work);
}
606 #endif
607