xref: /linux/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c (revision eed4edda910fe34dfae8c6bfbcf57f4593a54295)
1 /*
2  * Copyright 2021 Advanced Micro Devices, Inc.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be included in
12  * all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
18  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20  * OTHER DEALINGS IN THE SOFTWARE.
21  *
22  */
23 
24 #include <linux/devcoredump.h>
25 #include <generated/utsrelease.h>
26 
27 #include "amdgpu_reset.h"
28 #include "aldebaran.h"
29 #include "sienna_cichlid.h"
30 #include "smu_v13_0_10.h"
31 
32 int amdgpu_reset_init(struct amdgpu_device *adev)
33 {
34 	int ret = 0;
35 
36 	switch (amdgpu_ip_version(adev, MP1_HWIP, 0)) {
37 	case IP_VERSION(13, 0, 2):
38 	case IP_VERSION(13, 0, 6):
39 		ret = aldebaran_reset_init(adev);
40 		break;
41 	case IP_VERSION(11, 0, 7):
42 		ret = sienna_cichlid_reset_init(adev);
43 		break;
44 	case IP_VERSION(13, 0, 10):
45 		ret = smu_v13_0_10_reset_init(adev);
46 		break;
47 	default:
48 		break;
49 	}
50 
51 	return ret;
52 }
53 
54 int amdgpu_reset_fini(struct amdgpu_device *adev)
55 {
56 	int ret = 0;
57 
58 	switch (amdgpu_ip_version(adev, MP1_HWIP, 0)) {
59 	case IP_VERSION(13, 0, 2):
60 	case IP_VERSION(13, 0, 6):
61 		ret = aldebaran_reset_fini(adev);
62 		break;
63 	case IP_VERSION(11, 0, 7):
64 		ret = sienna_cichlid_reset_fini(adev);
65 		break;
66 	case IP_VERSION(13, 0, 10):
67 		ret = smu_v13_0_10_reset_fini(adev);
68 		break;
69 	default:
70 		break;
71 	}
72 
73 	return ret;
74 }
75 
76 int amdgpu_reset_prepare_hwcontext(struct amdgpu_device *adev,
77 				   struct amdgpu_reset_context *reset_context)
78 {
79 	struct amdgpu_reset_handler *reset_handler = NULL;
80 
81 	if (adev->reset_cntl && adev->reset_cntl->get_reset_handler)
82 		reset_handler = adev->reset_cntl->get_reset_handler(
83 			adev->reset_cntl, reset_context);
84 	if (!reset_handler)
85 		return -EOPNOTSUPP;
86 
87 	return reset_handler->prepare_hwcontext(adev->reset_cntl,
88 						reset_context);
89 }
90 
91 int amdgpu_reset_perform_reset(struct amdgpu_device *adev,
92 			       struct amdgpu_reset_context *reset_context)
93 {
94 	int ret;
95 	struct amdgpu_reset_handler *reset_handler = NULL;
96 
97 	if (adev->reset_cntl)
98 		reset_handler = adev->reset_cntl->get_reset_handler(
99 			adev->reset_cntl, reset_context);
100 	if (!reset_handler)
101 		return -EOPNOTSUPP;
102 
103 	ret = reset_handler->perform_reset(adev->reset_cntl, reset_context);
104 	if (ret)
105 		return ret;
106 
107 	return reset_handler->restore_hwcontext(adev->reset_cntl,
108 						reset_context);
109 }
110 
111 
112 void amdgpu_reset_destroy_reset_domain(struct kref *ref)
113 {
114 	struct amdgpu_reset_domain *reset_domain = container_of(ref,
115 								struct amdgpu_reset_domain,
116 								refcount);
117 	if (reset_domain->wq)
118 		destroy_workqueue(reset_domain->wq);
119 
120 	kvfree(reset_domain);
121 }
122 
123 struct amdgpu_reset_domain *amdgpu_reset_create_reset_domain(enum amdgpu_reset_domain_type type,
124 							     char *wq_name)
125 {
126 	struct amdgpu_reset_domain *reset_domain;
127 
128 	reset_domain = kvzalloc(sizeof(struct amdgpu_reset_domain), GFP_KERNEL);
129 	if (!reset_domain) {
130 		DRM_ERROR("Failed to allocate amdgpu_reset_domain!");
131 		return NULL;
132 	}
133 
134 	reset_domain->type = type;
135 	kref_init(&reset_domain->refcount);
136 
137 	reset_domain->wq = create_singlethread_workqueue(wq_name);
138 	if (!reset_domain->wq) {
139 		DRM_ERROR("Failed to allocate wq for amdgpu_reset_domain!");
140 		amdgpu_reset_put_reset_domain(reset_domain);
141 		return NULL;
142 
143 	}
144 
145 	atomic_set(&reset_domain->in_gpu_reset, 0);
146 	atomic_set(&reset_domain->reset_res, 0);
147 	init_rwsem(&reset_domain->sem);
148 
149 	return reset_domain;
150 }
151 
/**
 * amdgpu_device_lock_reset_domain - enter the reset-in-progress state
 * @reset_domain: domain being reset
 *
 * Flags the domain as in reset, then takes the domain rwsem for write so
 * read-side holders of the sem are excluded while the reset runs.
 * Paired with amdgpu_device_unlock_reset_domain().
 */
void amdgpu_device_lock_reset_domain(struct amdgpu_reset_domain *reset_domain)
{
	atomic_set(&reset_domain->in_gpu_reset, 1);
	down_write(&reset_domain->sem);
}
157 
158 
/**
 * amdgpu_device_unlock_reset_domain - leave the reset-in-progress state
 * @reset_domain: domain whose reset has finished
 *
 * Clears the in-reset flag and releases the write side of the domain
 * rwsem taken by amdgpu_device_lock_reset_domain().
 */
void amdgpu_device_unlock_reset_domain(struct amdgpu_reset_domain *reset_domain)
{
	atomic_set(&reset_domain->in_gpu_reset, 0);
	up_write(&reset_domain->sem);
}
164 
165 #ifndef CONFIG_DEV_COREDUMP
/* Stub for kernels built without CONFIG_DEV_COREDUMP: reset-time core
 * dump capture is compiled out, so this intentionally does nothing.
 */
void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
		     struct amdgpu_reset_context *reset_context)
{
}
170 #else
171 static ssize_t
172 amdgpu_devcoredump_read(char *buffer, loff_t offset, size_t count,
173 			void *data, size_t datalen)
174 {
175 	struct drm_printer p;
176 	struct amdgpu_coredump_info *coredump = data;
177 	struct drm_print_iterator iter;
178 	int i;
179 
180 	iter.data = buffer;
181 	iter.offset = 0;
182 	iter.start = offset;
183 	iter.remain = count;
184 
185 	p = drm_coredump_printer(&iter);
186 
187 	drm_printf(&p, "**** AMDGPU Device Coredump ****\n");
188 	drm_printf(&p, "version: " AMDGPU_COREDUMP_VERSION "\n");
189 	drm_printf(&p, "kernel: " UTS_RELEASE "\n");
190 	drm_printf(&p, "module: " KBUILD_MODNAME "\n");
191 	drm_printf(&p, "time: %lld.%09ld\n", coredump->reset_time.tv_sec,
192 			coredump->reset_time.tv_nsec);
193 
194 	if (coredump->reset_task_info.pid)
195 		drm_printf(&p, "process_name: %s PID: %d\n",
196 			   coredump->reset_task_info.process_name,
197 			   coredump->reset_task_info.pid);
198 
199 	if (coredump->ring) {
200 		drm_printf(&p, "\nRing timed out details\n");
201 		drm_printf(&p, "IP Type: %d Ring Name: %s\n",
202 			   coredump->ring->funcs->type,
203 			   coredump->ring->name);
204 	}
205 
206 	if (coredump->adev) {
207 		struct amdgpu_vm_fault_info *fault_info =
208 			&coredump->adev->vm_manager.fault_info;
209 
210 		drm_printf(&p, "\n[%s] Page fault observed\n",
211 			   fault_info->vmhub ? "mmhub" : "gfxhub");
212 		drm_printf(&p, "Faulty page starting at address: 0x%016llx\n",
213 			   fault_info->addr);
214 		drm_printf(&p, "Protection fault status register: 0x%x\n\n",
215 			   fault_info->status);
216 	}
217 
218 	drm_printf(&p, "Ring buffer information\n");
219 	for (int i = 0; i < coredump->adev->num_rings; i++) {
220 		int j = 0;
221 		struct amdgpu_ring *ring = coredump->adev->rings[i];
222 
223 		drm_printf(&p, "ring name: %s\n", ring->name);
224 		drm_printf(&p, "Rptr: 0x%llx Wptr: 0x%llx RB mask: %x\n",
225 			   amdgpu_ring_get_rptr(ring),
226 			   amdgpu_ring_get_wptr(ring),
227 			   ring->buf_mask);
228 		drm_printf(&p, "Ring size in dwords: %d\n",
229 			   ring->ring_size / 4);
230 		drm_printf(&p, "Ring contents\n");
231 		drm_printf(&p, "Offset \t Value\n");
232 
233 		while (j < ring->ring_size) {
234 			drm_printf(&p, "0x%x \t 0x%x\n", j, ring->ring[j/4]);
235 			j += 4;
236 		}
237 	}
238 
239 	if (coredump->reset_vram_lost)
240 		drm_printf(&p, "VRAM is lost due to GPU reset!\n");
241 	if (coredump->adev->reset_info.num_regs) {
242 		drm_printf(&p, "AMDGPU register dumps:\nOffset:     Value:\n");
243 
244 		for (i = 0; i < coredump->adev->reset_info.num_regs; i++)
245 			drm_printf(&p, "0x%08x: 0x%08x\n",
246 				   coredump->adev->reset_info.reset_dump_reg_list[i],
247 				   coredump->adev->reset_info.reset_dump_reg_value[i]);
248 	}
249 
250 	return count - iter.remain;
251 }
252 
/* devcoredump free callback: @data is the amdgpu_coredump_info kzalloc'd
 * in amdgpu_coredump(), so a plain kfree() releases it.
 */
static void amdgpu_devcoredump_free(void *data)
{
	kfree(data);
}
257 
258 void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
259 		     struct amdgpu_reset_context *reset_context)
260 {
261 	struct amdgpu_coredump_info *coredump;
262 	struct drm_device *dev = adev_to_drm(adev);
263 	struct amdgpu_job *job = reset_context->job;
264 	struct drm_sched_job *s_job;
265 
266 	coredump = kzalloc(sizeof(*coredump), GFP_NOWAIT);
267 
268 	if (!coredump) {
269 		DRM_ERROR("%s: failed to allocate memory for coredump\n", __func__);
270 		return;
271 	}
272 
273 	coredump->reset_vram_lost = vram_lost;
274 
275 	if (reset_context->job && reset_context->job->vm) {
276 		struct amdgpu_task_info *ti;
277 		struct amdgpu_vm *vm = reset_context->job->vm;
278 
279 		ti = amdgpu_vm_get_task_info_vm(vm);
280 		if (ti) {
281 			coredump->reset_task_info = *ti;
282 			amdgpu_vm_put_task_info(ti);
283 		}
284 	}
285 
286 	if (job) {
287 		s_job = &job->base;
288 		coredump->ring = to_amdgpu_ring(s_job->sched);
289 	}
290 
291 	coredump->adev = adev;
292 
293 	ktime_get_ts64(&coredump->reset_time);
294 
295 	dev_coredumpm(dev->dev, THIS_MODULE, coredump, 0, GFP_NOWAIT,
296 		      amdgpu_devcoredump_read, amdgpu_devcoredump_free);
297 }
298 #endif
299