xref: /linux/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c (revision 3a39d672e7f48b8d6b91a09afa4b55352773b4b5)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright 2024 Advanced Micro Devices, Inc.
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9  * and/or sell copies of the Software, and to permit persons to whom the
10  * Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice shall be included in
13  * all copies or substantial portions of the Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
19  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
20  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
21  * OTHER DEALINGS IN THE SOFTWARE.
22  *
23  */
24 
25 #include <generated/utsrelease.h>
26 #include <linux/devcoredump.h>
27 #include "amdgpu_dev_coredump.h"
28 #include "atom.h"
29 
30 #ifndef CONFIG_DEV_COREDUMP
/*
 * Stub used when CONFIG_DEV_COREDUMP is not set: the entry point stays
 * available to callers but captures nothing.
 */
void amdgpu_coredump(struct amdgpu_device *adev,
		     bool skip_vram_check,
		     bool vram_lost,
		     struct amdgpu_job *job)
{
}
35 #else
36 
37 const char *hw_ip_names[MAX_HWIP] = {
38 	[GC_HWIP]		= "GC",
39 	[HDP_HWIP]		= "HDP",
40 	[SDMA0_HWIP]		= "SDMA0",
41 	[SDMA1_HWIP]		= "SDMA1",
42 	[SDMA2_HWIP]		= "SDMA2",
43 	[SDMA3_HWIP]		= "SDMA3",
44 	[SDMA4_HWIP]		= "SDMA4",
45 	[SDMA5_HWIP]		= "SDMA5",
46 	[SDMA6_HWIP]		= "SDMA6",
47 	[SDMA7_HWIP]		= "SDMA7",
48 	[LSDMA_HWIP]		= "LSDMA",
49 	[MMHUB_HWIP]		= "MMHUB",
50 	[ATHUB_HWIP]		= "ATHUB",
51 	[NBIO_HWIP]		= "NBIO",
52 	[MP0_HWIP]		= "MP0",
53 	[MP1_HWIP]		= "MP1",
54 	[UVD_HWIP]		= "UVD/JPEG/VCN",
55 	[VCN1_HWIP]		= "VCN1",
56 	[VCE_HWIP]		= "VCE",
57 	[VPE_HWIP]		= "VPE",
58 	[DF_HWIP]		= "DF",
59 	[DCE_HWIP]		= "DCE",
60 	[OSSSYS_HWIP]		= "OSSSYS",
61 	[SMUIO_HWIP]		= "SMUIO",
62 	[PWR_HWIP]		= "PWR",
63 	[NBIF_HWIP]		= "NBIF",
64 	[THM_HWIP]		= "THM",
65 	[CLK_HWIP]		= "CLK",
66 	[UMC_HWIP]		= "UMC",
67 	[RSMU_HWIP]		= "RSMU",
68 	[XGMI_HWIP]		= "XGMI",
69 	[DCI_HWIP]		= "DCI",
70 	[PCIE_HWIP]		= "PCIE",
71 };
72 
amdgpu_devcoredump_fw_info(struct amdgpu_device * adev,struct drm_printer * p)73 static void amdgpu_devcoredump_fw_info(struct amdgpu_device *adev,
74 				       struct drm_printer *p)
75 {
76 	uint32_t version;
77 	uint32_t feature;
78 	uint8_t smu_program, smu_major, smu_minor, smu_debug;
79 	struct atom_context *ctx = adev->mode_info.atom_context;
80 
81 	drm_printf(p, "VCE feature version: %u, fw version: 0x%08x\n",
82 		   adev->vce.fb_version, adev->vce.fw_version);
83 	drm_printf(p, "UVD feature version: %u, fw version: 0x%08x\n", 0,
84 		   adev->uvd.fw_version);
85 	drm_printf(p, "GMC feature version: %u, fw version: 0x%08x\n", 0,
86 		   adev->gmc.fw_version);
87 	drm_printf(p, "ME feature version: %u, fw version: 0x%08x\n",
88 		   adev->gfx.me_feature_version, adev->gfx.me_fw_version);
89 	drm_printf(p, "PFP feature version: %u, fw version: 0x%08x\n",
90 		   adev->gfx.pfp_feature_version, adev->gfx.pfp_fw_version);
91 	drm_printf(p, "CE feature version: %u, fw version: 0x%08x\n",
92 		   adev->gfx.ce_feature_version, adev->gfx.ce_fw_version);
93 	drm_printf(p, "RLC feature version: %u, fw version: 0x%08x\n",
94 		   adev->gfx.rlc_feature_version, adev->gfx.rlc_fw_version);
95 
96 	drm_printf(p, "RLC SRLC feature version: %u, fw version: 0x%08x\n",
97 		   adev->gfx.rlc_srlc_feature_version,
98 		   adev->gfx.rlc_srlc_fw_version);
99 	drm_printf(p, "RLC SRLG feature version: %u, fw version: 0x%08x\n",
100 		   adev->gfx.rlc_srlg_feature_version,
101 		   adev->gfx.rlc_srlg_fw_version);
102 	drm_printf(p, "RLC SRLS feature version: %u, fw version: 0x%08x\n",
103 		   adev->gfx.rlc_srls_feature_version,
104 		   adev->gfx.rlc_srls_fw_version);
105 	drm_printf(p, "RLCP feature version: %u, fw version: 0x%08x\n",
106 		   adev->gfx.rlcp_ucode_feature_version,
107 		   adev->gfx.rlcp_ucode_version);
108 	drm_printf(p, "RLCV feature version: %u, fw version: 0x%08x\n",
109 		   adev->gfx.rlcv_ucode_feature_version,
110 		   adev->gfx.rlcv_ucode_version);
111 	drm_printf(p, "MEC feature version: %u, fw version: 0x%08x\n",
112 		   adev->gfx.mec_feature_version, adev->gfx.mec_fw_version);
113 
114 	if (adev->gfx.mec2_fw)
115 		drm_printf(p, "MEC2 feature version: %u, fw version: 0x%08x\n",
116 			   adev->gfx.mec2_feature_version,
117 			   adev->gfx.mec2_fw_version);
118 
119 	drm_printf(p, "IMU feature version: %u, fw version: 0x%08x\n", 0,
120 		   adev->gfx.imu_fw_version);
121 	drm_printf(p, "PSP SOS feature version: %u, fw version: 0x%08x\n",
122 		   adev->psp.sos.feature_version, adev->psp.sos.fw_version);
123 	drm_printf(p, "PSP ASD feature version: %u, fw version: 0x%08x\n",
124 		   adev->psp.asd_context.bin_desc.feature_version,
125 		   adev->psp.asd_context.bin_desc.fw_version);
126 
127 	drm_printf(p, "TA XGMI feature version: 0x%08x, fw version: 0x%08x\n",
128 		   adev->psp.xgmi_context.context.bin_desc.feature_version,
129 		   adev->psp.xgmi_context.context.bin_desc.fw_version);
130 	drm_printf(p, "TA RAS feature version: 0x%08x, fw version: 0x%08x\n",
131 		   adev->psp.ras_context.context.bin_desc.feature_version,
132 		   adev->psp.ras_context.context.bin_desc.fw_version);
133 	drm_printf(p, "TA HDCP feature version: 0x%08x, fw version: 0x%08x\n",
134 		   adev->psp.hdcp_context.context.bin_desc.feature_version,
135 		   adev->psp.hdcp_context.context.bin_desc.fw_version);
136 	drm_printf(p, "TA DTM feature version: 0x%08x, fw version: 0x%08x\n",
137 		   adev->psp.dtm_context.context.bin_desc.feature_version,
138 		   adev->psp.dtm_context.context.bin_desc.fw_version);
139 	drm_printf(p, "TA RAP feature version: 0x%08x, fw version: 0x%08x\n",
140 		   adev->psp.rap_context.context.bin_desc.feature_version,
141 		   adev->psp.rap_context.context.bin_desc.fw_version);
142 	drm_printf(p,
143 		   "TA SECURE DISPLAY feature version: 0x%08x, fw version: 0x%08x\n",
144 		   adev->psp.securedisplay_context.context.bin_desc.feature_version,
145 		   adev->psp.securedisplay_context.context.bin_desc.fw_version);
146 
147 	/* SMC firmware */
148 	version = adev->pm.fw_version;
149 
150 	smu_program = (version >> 24) & 0xff;
151 	smu_major = (version >> 16) & 0xff;
152 	smu_minor = (version >> 8) & 0xff;
153 	smu_debug = (version >> 0) & 0xff;
154 	drm_printf(p,
155 		   "SMC feature version: %u, program: %d, fw version: 0x%08x (%d.%d.%d)\n",
156 		   0, smu_program, version, smu_major, smu_minor, smu_debug);
157 
158 	/* SDMA firmware */
159 	for (int i = 0; i < adev->sdma.num_instances; i++) {
160 		drm_printf(p,
161 			   "SDMA%d feature version: %u, firmware version: 0x%08x\n",
162 			   i, adev->sdma.instance[i].feature_version,
163 			   adev->sdma.instance[i].fw_version);
164 	}
165 
166 	drm_printf(p, "VCN feature version: %u, fw version: 0x%08x\n", 0,
167 		   adev->vcn.fw_version);
168 	drm_printf(p, "DMCU feature version: %u, fw version: 0x%08x\n", 0,
169 		   adev->dm.dmcu_fw_version);
170 	drm_printf(p, "DMCUB feature version: %u, fw version: 0x%08x\n", 0,
171 		   adev->dm.dmcub_fw_version);
172 	drm_printf(p, "PSP TOC feature version: %u, fw version: 0x%08x\n",
173 		   adev->psp.toc.feature_version, adev->psp.toc.fw_version);
174 
175 	version = adev->mes.kiq_version & AMDGPU_MES_VERSION_MASK;
176 	feature = (adev->mes.kiq_version & AMDGPU_MES_FEAT_VERSION_MASK) >>
177 		  AMDGPU_MES_FEAT_VERSION_SHIFT;
178 	drm_printf(p, "MES_KIQ feature version: %u, fw version: 0x%08x\n",
179 		   feature, version);
180 
181 	version = adev->mes.sched_version & AMDGPU_MES_VERSION_MASK;
182 	feature = (adev->mes.sched_version & AMDGPU_MES_FEAT_VERSION_MASK) >>
183 		  AMDGPU_MES_FEAT_VERSION_SHIFT;
184 	drm_printf(p, "MES feature version: %u, fw version: 0x%08x\n", feature,
185 		   version);
186 
187 	drm_printf(p, "VPE feature version: %u, fw version: 0x%08x\n",
188 		   adev->vpe.feature_version, adev->vpe.fw_version);
189 
190 	drm_printf(p, "\nVBIOS Information\n");
191 	drm_printf(p, "vbios name       : %s\n", ctx->name);
192 	drm_printf(p, "vbios pn         : %s\n", ctx->vbios_pn);
193 	drm_printf(p, "vbios version    : %d\n", ctx->version);
194 	drm_printf(p, "vbios ver_str    : %s\n", ctx->vbios_ver_str);
195 	drm_printf(p, "vbios date       : %s\n", ctx->date);
196 }
197 
198 static ssize_t
amdgpu_devcoredump_read(char * buffer,loff_t offset,size_t count,void * data,size_t datalen)199 amdgpu_devcoredump_read(char *buffer, loff_t offset, size_t count,
200 			void *data, size_t datalen)
201 {
202 	struct drm_printer p;
203 	struct amdgpu_coredump_info *coredump = data;
204 	struct drm_print_iterator iter;
205 	struct amdgpu_vm_fault_info *fault_info;
206 	int ver;
207 
208 	iter.data = buffer;
209 	iter.offset = 0;
210 	iter.start = offset;
211 	iter.remain = count;
212 
213 	p = drm_coredump_printer(&iter);
214 
215 	drm_printf(&p, "**** AMDGPU Device Coredump ****\n");
216 	drm_printf(&p, "version: " AMDGPU_COREDUMP_VERSION "\n");
217 	drm_printf(&p, "kernel: " UTS_RELEASE "\n");
218 	drm_printf(&p, "module: " KBUILD_MODNAME "\n");
219 	drm_printf(&p, "time: %lld.%09ld\n", coredump->reset_time.tv_sec,
220 		   coredump->reset_time.tv_nsec);
221 
222 	if (coredump->reset_task_info.pid)
223 		drm_printf(&p, "process_name: %s PID: %d\n",
224 			   coredump->reset_task_info.process_name,
225 			   coredump->reset_task_info.pid);
226 
227 	/* SOC Information */
228 	drm_printf(&p, "\nSOC Information\n");
229 	drm_printf(&p, "SOC Device id: %d\n", coredump->adev->pdev->device);
230 	drm_printf(&p, "SOC PCI Revision id: %d\n", coredump->adev->pdev->revision);
231 	drm_printf(&p, "SOC Family: %d\n", coredump->adev->family);
232 	drm_printf(&p, "SOC Revision id: %d\n", coredump->adev->rev_id);
233 	drm_printf(&p, "SOC External Revision id: %d\n", coredump->adev->external_rev_id);
234 
235 	/* Memory Information */
236 	drm_printf(&p, "\nSOC Memory Information\n");
237 	drm_printf(&p, "real vram size: %llu\n", coredump->adev->gmc.real_vram_size);
238 	drm_printf(&p, "visible vram size: %llu\n", coredump->adev->gmc.visible_vram_size);
239 	drm_printf(&p, "gtt size: %llu\n", coredump->adev->mman.gtt_mgr.manager.size);
240 
241 	/* GDS Config */
242 	drm_printf(&p, "\nGDS Config\n");
243 	drm_printf(&p, "gds: total size: %d\n", coredump->adev->gds.gds_size);
244 	drm_printf(&p, "gds: compute partition size: %d\n", coredump->adev->gds.gds_size);
245 	drm_printf(&p, "gds: gws per compute partition: %d\n", coredump->adev->gds.gws_size);
246 	drm_printf(&p, "gds: os per compute partition: %d\n", coredump->adev->gds.oa_size);
247 
248 	/* HWIP Version Information */
249 	drm_printf(&p, "\nHW IP Version Information\n");
250 	for (int i = 1; i < MAX_HWIP; i++) {
251 		for (int j = 0; j < HWIP_MAX_INSTANCE; j++) {
252 			ver = coredump->adev->ip_versions[i][j];
253 			if (ver)
254 				drm_printf(&p, "HWIP: %s[%d][%d]: v%d.%d.%d.%d.%d\n",
255 					   hw_ip_names[i], i, j,
256 					   IP_VERSION_MAJ(ver),
257 					   IP_VERSION_MIN(ver),
258 					   IP_VERSION_REV(ver),
259 					   IP_VERSION_VARIANT(ver),
260 					   IP_VERSION_SUBREV(ver));
261 		}
262 	}
263 
264 	/* IP firmware information */
265 	drm_printf(&p, "\nIP Firmwares\n");
266 	amdgpu_devcoredump_fw_info(coredump->adev, &p);
267 
268 	if (coredump->ring) {
269 		drm_printf(&p, "\nRing timed out details\n");
270 		drm_printf(&p, "IP Type: %d Ring Name: %s\n",
271 			   coredump->ring->funcs->type,
272 			   coredump->ring->name);
273 	}
274 
275 	/* Add page fault information */
276 	fault_info = &coredump->adev->vm_manager.fault_info;
277 	drm_printf(&p, "\n[%s] Page fault observed\n",
278 		   fault_info->vmhub ? "mmhub" : "gfxhub");
279 	drm_printf(&p, "Faulty page starting at address: 0x%016llx\n", fault_info->addr);
280 	drm_printf(&p, "Protection fault status register: 0x%x\n\n", fault_info->status);
281 
282 	/* dump the ip state for each ip */
283 	drm_printf(&p, "IP Dump\n");
284 	for (int i = 0; i < coredump->adev->num_ip_blocks; i++) {
285 		if (coredump->adev->ip_blocks[i].version->funcs->print_ip_state) {
286 			drm_printf(&p, "IP: %s\n",
287 				   coredump->adev->ip_blocks[i]
288 					   .version->funcs->name);
289 			coredump->adev->ip_blocks[i]
290 				.version->funcs->print_ip_state(
291 					(void *)coredump->adev, &p);
292 			drm_printf(&p, "\n");
293 		}
294 	}
295 
296 	/* Add ring buffer information */
297 	drm_printf(&p, "Ring buffer information\n");
298 	for (int i = 0; i < coredump->adev->num_rings; i++) {
299 		int j = 0;
300 		struct amdgpu_ring *ring = coredump->adev->rings[i];
301 
302 		drm_printf(&p, "ring name: %s\n", ring->name);
303 		drm_printf(&p, "Rptr: 0x%llx Wptr: 0x%llx RB mask: %x\n",
304 			   amdgpu_ring_get_rptr(ring),
305 			   amdgpu_ring_get_wptr(ring),
306 			   ring->buf_mask);
307 		drm_printf(&p, "Ring size in dwords: %d\n",
308 			   ring->ring_size / 4);
309 		drm_printf(&p, "Ring contents\n");
310 		drm_printf(&p, "Offset \t Value\n");
311 
312 		while (j < ring->ring_size) {
313 			drm_printf(&p, "0x%x \t 0x%x\n", j, ring->ring[j / 4]);
314 			j += 4;
315 		}
316 	}
317 
318 	if (coredump->skip_vram_check)
319 		drm_printf(&p, "VRAM lost check is skipped!\n");
320 	else if (coredump->reset_vram_lost)
321 		drm_printf(&p, "VRAM is lost due to GPU reset!\n");
322 
323 	return count - iter.remain;
324 }
325 
/*
 * Free callback registered with dev_coredumpm(): releases the
 * amdgpu_coredump_info that amdgpu_coredump() allocated.
 */
static void amdgpu_devcoredump_free(void *data)
{
	struct amdgpu_coredump_info *coredump = data;

	kfree(coredump);
}
330 
/*
 * amdgpu_coredump - record reset context and register a device coredump
 * @adev: device being dumped
 * @skip_vram_check: stored in the dump; selects the "check skipped" message
 * @vram_lost: stored in the dump; selects the "VRAM is lost" message
 * @job: job associated with the reset, may be NULL
 *
 * Allocates the dump descriptor with GFP_NOWAIT, fills in the task info and
 * ring taken from @job (when given), timestamps it and hands it to
 * dev_coredumpm() together with the read and free callbacks.  On allocation
 * failure an error is logged and no dump is registered.
 */
void amdgpu_coredump(struct amdgpu_device *adev, bool skip_vram_check,
		     bool vram_lost, struct amdgpu_job *job)
{
	struct amdgpu_coredump_info *coredump;
	struct amdgpu_task_info *ti;

	coredump = kzalloc(sizeof(*coredump), GFP_NOWAIT);
	if (!coredump) {
		DRM_ERROR("%s: failed to allocate memory for coredump\n", __func__);
		return;
	}

	coredump->adev = adev;
	coredump->skip_vram_check = skip_vram_check;
	coredump->reset_vram_lost = vram_lost;

	if (job) {
		/* Copy the task info by value, then drop the reference. */
		if (job->vm) {
			ti = amdgpu_vm_get_task_info_vm(job->vm);
			if (ti) {
				coredump->reset_task_info = *ti;
				amdgpu_vm_put_task_info(ti);
			}
		}

		/* The ring is derived from the scheduler the job ran on. */
		coredump->ring = to_amdgpu_ring(job->base.sched);
	}

	ktime_get_ts64(&coredump->reset_time);

	dev_coredumpm(adev_to_drm(adev)->dev, THIS_MODULE, coredump, 0,
		      GFP_NOWAIT, amdgpu_devcoredump_read,
		      amdgpu_devcoredump_free);
}
371 #endif
372