xref: /linux/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c (revision 7cc9196675234d4de0e1e19b9da1a8b86ecfeedd)
1 /*
2  * Copyright 2021 Advanced Micro Devices, Inc.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be included in
12  * all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
18  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20  * OTHER DEALINGS IN THE SOFTWARE.
21  *
22  */
23 
24 #include <linux/devcoredump.h>
25 #include <generated/utsrelease.h>
26 
27 #include "amdgpu_reset.h"
28 #include "aldebaran.h"
29 #include "sienna_cichlid.h"
30 #include "smu_v13_0_10.h"
31 
/* Human-readable names for each hardware IP block, indexed by the
 * amd_hw_ip_block_type enum values (GC_HWIP, HDP_HWIP, ...).  Used by
 * the devcoredump writer to label discovered IP versions.
 */
const char *hw_ip_names[MAX_HWIP] = {
	[GC_HWIP]		= "GC",
	[HDP_HWIP]		= "HDP",
	[SDMA0_HWIP]		= "SDMA0",
	[SDMA1_HWIP]		= "SDMA1",
	[SDMA2_HWIP]		= "SDMA2",
	[SDMA3_HWIP]		= "SDMA3",
	[SDMA4_HWIP]		= "SDMA4",
	[SDMA5_HWIP]		= "SDMA5",
	[SDMA6_HWIP]		= "SDMA6",
	[SDMA7_HWIP]		= "SDMA7",
	[LSDMA_HWIP]		= "LSDMA",
	[MMHUB_HWIP]		= "MMHUB",
	[ATHUB_HWIP]		= "ATHUB",
	[NBIO_HWIP]		= "NBIO",
	[MP0_HWIP]		= "MP0",
	[MP1_HWIP]		= "MP1",
	[UVD_HWIP]		= "UVD/JPEG/VCN",
	[VCN1_HWIP]		= "VCN1",
	[VCE_HWIP]		= "VCE",
	[VPE_HWIP]		= "VPE",
	[DF_HWIP]		= "DF",
	[DCE_HWIP]		= "DCE",
	[OSSSYS_HWIP]		= "OSSSYS",
	[SMUIO_HWIP]		= "SMUIO",
	[PWR_HWIP]		= "PWR",
	[NBIF_HWIP]		= "NBIF",
	[THM_HWIP]		= "THM",
	[CLK_HWIP]		= "CLK",
	[UMC_HWIP]		= "UMC",
	[RSMU_HWIP]		= "RSMU",
	[XGMI_HWIP]		= "XGMI",
	[DCI_HWIP]		= "DCI",
	[PCIE_HWIP]		= "PCIE",
};
67 
68 int amdgpu_reset_init(struct amdgpu_device *adev)
69 {
70 	int ret = 0;
71 
72 	switch (amdgpu_ip_version(adev, MP1_HWIP, 0)) {
73 	case IP_VERSION(13, 0, 2):
74 	case IP_VERSION(13, 0, 6):
75 		ret = aldebaran_reset_init(adev);
76 		break;
77 	case IP_VERSION(11, 0, 7):
78 		ret = sienna_cichlid_reset_init(adev);
79 		break;
80 	case IP_VERSION(13, 0, 10):
81 		ret = smu_v13_0_10_reset_init(adev);
82 		break;
83 	default:
84 		break;
85 	}
86 
87 	return ret;
88 }
89 
90 int amdgpu_reset_fini(struct amdgpu_device *adev)
91 {
92 	int ret = 0;
93 
94 	switch (amdgpu_ip_version(adev, MP1_HWIP, 0)) {
95 	case IP_VERSION(13, 0, 2):
96 	case IP_VERSION(13, 0, 6):
97 		ret = aldebaran_reset_fini(adev);
98 		break;
99 	case IP_VERSION(11, 0, 7):
100 		ret = sienna_cichlid_reset_fini(adev);
101 		break;
102 	case IP_VERSION(13, 0, 10):
103 		ret = smu_v13_0_10_reset_fini(adev);
104 		break;
105 	default:
106 		break;
107 	}
108 
109 	return ret;
110 }
111 
112 int amdgpu_reset_prepare_hwcontext(struct amdgpu_device *adev,
113 				   struct amdgpu_reset_context *reset_context)
114 {
115 	struct amdgpu_reset_handler *reset_handler = NULL;
116 
117 	if (adev->reset_cntl && adev->reset_cntl->get_reset_handler)
118 		reset_handler = adev->reset_cntl->get_reset_handler(
119 			adev->reset_cntl, reset_context);
120 	if (!reset_handler)
121 		return -EOPNOTSUPP;
122 
123 	return reset_handler->prepare_hwcontext(adev->reset_cntl,
124 						reset_context);
125 }
126 
127 int amdgpu_reset_perform_reset(struct amdgpu_device *adev,
128 			       struct amdgpu_reset_context *reset_context)
129 {
130 	int ret;
131 	struct amdgpu_reset_handler *reset_handler = NULL;
132 
133 	if (adev->reset_cntl)
134 		reset_handler = adev->reset_cntl->get_reset_handler(
135 			adev->reset_cntl, reset_context);
136 	if (!reset_handler)
137 		return -EOPNOTSUPP;
138 
139 	ret = reset_handler->perform_reset(adev->reset_cntl, reset_context);
140 	if (ret)
141 		return ret;
142 
143 	return reset_handler->restore_hwcontext(adev->reset_cntl,
144 						reset_context);
145 }
146 
147 
148 void amdgpu_reset_destroy_reset_domain(struct kref *ref)
149 {
150 	struct amdgpu_reset_domain *reset_domain = container_of(ref,
151 								struct amdgpu_reset_domain,
152 								refcount);
153 	if (reset_domain->wq)
154 		destroy_workqueue(reset_domain->wq);
155 
156 	kvfree(reset_domain);
157 }
158 
159 struct amdgpu_reset_domain *amdgpu_reset_create_reset_domain(enum amdgpu_reset_domain_type type,
160 							     char *wq_name)
161 {
162 	struct amdgpu_reset_domain *reset_domain;
163 
164 	reset_domain = kvzalloc(sizeof(struct amdgpu_reset_domain), GFP_KERNEL);
165 	if (!reset_domain) {
166 		DRM_ERROR("Failed to allocate amdgpu_reset_domain!");
167 		return NULL;
168 	}
169 
170 	reset_domain->type = type;
171 	kref_init(&reset_domain->refcount);
172 
173 	reset_domain->wq = create_singlethread_workqueue(wq_name);
174 	if (!reset_domain->wq) {
175 		DRM_ERROR("Failed to allocate wq for amdgpu_reset_domain!");
176 		amdgpu_reset_put_reset_domain(reset_domain);
177 		return NULL;
178 
179 	}
180 
181 	atomic_set(&reset_domain->in_gpu_reset, 0);
182 	atomic_set(&reset_domain->reset_res, 0);
183 	init_rwsem(&reset_domain->sem);
184 
185 	return reset_domain;
186 }
187 
/**
 * amdgpu_device_lock_reset_domain - enter exclusive GPU-reset mode
 * @reset_domain: the domain being reset
 *
 * Sets the in_gpu_reset flag before taking the domain's rwsem for
 * writing, so readers polling the flag see the reset as in progress.
 */
void amdgpu_device_lock_reset_domain(struct amdgpu_reset_domain *reset_domain)
{
	atomic_set(&reset_domain->in_gpu_reset, 1);
	down_write(&reset_domain->sem);
}
193 
194 
/**
 * amdgpu_device_unlock_reset_domain - leave exclusive GPU-reset mode
 * @reset_domain: the domain that was being reset
 *
 * Clears the in_gpu_reset flag and releases the write side of the
 * domain's rwsem taken by amdgpu_device_lock_reset_domain().
 */
void amdgpu_device_unlock_reset_domain(struct amdgpu_reset_domain *reset_domain)
{
	atomic_set(&reset_domain->in_gpu_reset, 0);
	up_write(&reset_domain->sem);
}
200 
201 #ifndef CONFIG_DEV_COREDUMP
/* Stub used when the kernel is built without CONFIG_DEV_COREDUMP:
 * GPU resets proceed normally but no coredump is captured.
 */
void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
		     struct amdgpu_reset_context *reset_context)
{
}
206 #else
207 static ssize_t
208 amdgpu_devcoredump_read(char *buffer, loff_t offset, size_t count,
209 			void *data, size_t datalen)
210 {
211 	struct drm_printer p;
212 	struct amdgpu_coredump_info *coredump = data;
213 	struct drm_print_iterator iter;
214 	struct amdgpu_vm_fault_info *fault_info;
215 	int i, ver;
216 
217 	iter.data = buffer;
218 	iter.offset = 0;
219 	iter.start = offset;
220 	iter.remain = count;
221 
222 	p = drm_coredump_printer(&iter);
223 
224 	drm_printf(&p, "**** AMDGPU Device Coredump ****\n");
225 	drm_printf(&p, "version: " AMDGPU_COREDUMP_VERSION "\n");
226 	drm_printf(&p, "kernel: " UTS_RELEASE "\n");
227 	drm_printf(&p, "module: " KBUILD_MODNAME "\n");
228 	drm_printf(&p, "time: %lld.%09ld\n", coredump->reset_time.tv_sec,
229 			coredump->reset_time.tv_nsec);
230 
231 	if (coredump->reset_task_info.pid)
232 		drm_printf(&p, "process_name: %s PID: %d\n",
233 			   coredump->reset_task_info.process_name,
234 			   coredump->reset_task_info.pid);
235 
236 	/* GPU IP's information of the SOC */
237 	drm_printf(&p, "\nIP Information\n");
238 	drm_printf(&p, "SOC Family: %d\n", coredump->adev->family);
239 	drm_printf(&p, "SOC Revision id: %d\n", coredump->adev->rev_id);
240 	drm_printf(&p, "SOC External Revision id: %d\n", coredump->adev->external_rev_id);
241 
242 	for (int i = 1; i < MAX_HWIP; i++) {
243 		for (int j = 0; j < HWIP_MAX_INSTANCE; j++) {
244 			ver = coredump->adev->ip_versions[i][j];
245 			if (ver)
246 				drm_printf(&p, "HWIP: %s[%d][%d]: v%d.%d.%d.%d.%d\n",
247 					   hw_ip_names[i], i, j,
248 					   IP_VERSION_MAJ(ver),
249 					   IP_VERSION_MIN(ver),
250 					   IP_VERSION_REV(ver),
251 					   IP_VERSION_VARIANT(ver),
252 					   IP_VERSION_SUBREV(ver));
253 		}
254 	}
255 
256 	if (coredump->ring) {
257 		drm_printf(&p, "\nRing timed out details\n");
258 		drm_printf(&p, "IP Type: %d Ring Name: %s\n",
259 			   coredump->ring->funcs->type,
260 			   coredump->ring->name);
261 	}
262 
263 	/* Add page fault information */
264 	fault_info = &coredump->adev->vm_manager.fault_info;
265 	drm_printf(&p, "\n[%s] Page fault observed\n",
266 		   fault_info->vmhub ? "mmhub" : "gfxhub");
267 	drm_printf(&p, "Faulty page starting at address: 0x%016llx\n", fault_info->addr);
268 	drm_printf(&p, "Protection fault status register: 0x%x\n\n", fault_info->status);
269 
270 	/* Add ring buffer information */
271 	drm_printf(&p, "Ring buffer information\n");
272 	for (int i = 0; i < coredump->adev->num_rings; i++) {
273 		int j = 0;
274 		struct amdgpu_ring *ring = coredump->adev->rings[i];
275 
276 		drm_printf(&p, "ring name: %s\n", ring->name);
277 		drm_printf(&p, "Rptr: 0x%llx Wptr: 0x%llx RB mask: %x\n",
278 			   amdgpu_ring_get_rptr(ring),
279 			   amdgpu_ring_get_wptr(ring),
280 			   ring->buf_mask);
281 		drm_printf(&p, "Ring size in dwords: %d\n",
282 			   ring->ring_size / 4);
283 		drm_printf(&p, "Ring contents\n");
284 		drm_printf(&p, "Offset \t Value\n");
285 
286 		while (j < ring->ring_size) {
287 			drm_printf(&p, "0x%x \t 0x%x\n", j, ring->ring[j/4]);
288 			j += 4;
289 		}
290 	}
291 
292 	if (coredump->reset_vram_lost)
293 		drm_printf(&p, "VRAM is lost due to GPU reset!\n");
294 	if (coredump->adev->reset_info.num_regs) {
295 		drm_printf(&p, "AMDGPU register dumps:\nOffset:     Value:\n");
296 
297 		for (i = 0; i < coredump->adev->reset_info.num_regs; i++)
298 			drm_printf(&p, "0x%08x: 0x%08x\n",
299 				   coredump->adev->reset_info.reset_dump_reg_list[i],
300 				   coredump->adev->reset_info.reset_dump_reg_value[i]);
301 	}
302 
303 	return count - iter.remain;
304 }
305 
/* devcoredump free callback: releases the amdgpu_coredump_info allocated
 * in amdgpu_coredump() once the dump is read or times out.
 */
static void amdgpu_devcoredump_free(void *data)
{
	kfree(data);
}
310 
311 void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
312 		     struct amdgpu_reset_context *reset_context)
313 {
314 	struct amdgpu_coredump_info *coredump;
315 	struct drm_device *dev = adev_to_drm(adev);
316 	struct amdgpu_job *job = reset_context->job;
317 	struct drm_sched_job *s_job;
318 
319 	coredump = kzalloc(sizeof(*coredump), GFP_NOWAIT);
320 
321 	if (!coredump) {
322 		DRM_ERROR("%s: failed to allocate memory for coredump\n", __func__);
323 		return;
324 	}
325 
326 	coredump->reset_vram_lost = vram_lost;
327 
328 	if (reset_context->job && reset_context->job->vm) {
329 		struct amdgpu_task_info *ti;
330 		struct amdgpu_vm *vm = reset_context->job->vm;
331 
332 		ti = amdgpu_vm_get_task_info_vm(vm);
333 		if (ti) {
334 			coredump->reset_task_info = *ti;
335 			amdgpu_vm_put_task_info(ti);
336 		}
337 	}
338 
339 	if (job) {
340 		s_job = &job->base;
341 		coredump->ring = to_amdgpu_ring(s_job->sched);
342 	}
343 
344 	coredump->adev = adev;
345 
346 	ktime_get_ts64(&coredump->reset_time);
347 
348 	dev_coredumpm(dev->dev, THIS_MODULE, coredump, 0, GFP_NOWAIT,
349 		      amdgpu_devcoredump_read, amdgpu_devcoredump_free);
350 }
351 #endif
352