xref: /linux/drivers/gpu/drm/amd/amdgpu/aldebaran.c (revision 5c00ff742bf5caf85f60e1c73999f99376fb865d)
1 /*
2  * Copyright 2021 Advanced Micro Devices, Inc.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be included in
12  * all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
18  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20  * OTHER DEALINGS IN THE SOFTWARE.
21  *
22  */
23 
24 #include "aldebaran.h"
25 #include "amdgpu_reset.h"
26 #include "amdgpu_amdkfd.h"
27 #include "amdgpu_dpm.h"
28 #include "amdgpu_job.h"
29 #include "amdgpu_ring.h"
30 #include "amdgpu_ras.h"
31 #include "amdgpu_psp.h"
32 #include "amdgpu_xgmi.h"
33 
34 static bool aldebaran_is_mode2_default(struct amdgpu_reset_control *reset_ctl)
35 {
36 	struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl->handle;
37 
38 	if ((amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 2) &&
39 	     adev->gmc.xgmi.connected_to_cpu))
40 		return true;
41 
42 	return false;
43 }
44 
45 static struct amdgpu_reset_handler *
46 aldebaran_get_reset_handler(struct amdgpu_reset_control *reset_ctl,
47 			    struct amdgpu_reset_context *reset_context)
48 {
49 	struct amdgpu_reset_handler *handler;
50 	struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl->handle;
51 	int i;
52 
53 	if (reset_context->method == AMD_RESET_METHOD_NONE) {
54 		if (aldebaran_is_mode2_default(reset_ctl))
55 			reset_context->method = AMD_RESET_METHOD_MODE2;
56 		else
57 			reset_context->method = amdgpu_asic_reset_method(adev);
58 	}
59 
60 	if (reset_context->method != AMD_RESET_METHOD_NONE) {
61 		dev_dbg(adev->dev, "Getting reset handler for method %d\n",
62 			reset_context->method);
63 		for_each_handler(i, handler, reset_ctl) {
64 			if (handler->reset_method == reset_context->method)
65 				return handler;
66 		}
67 	}
68 
69 	dev_dbg(adev->dev, "Reset handler not found!\n");
70 
71 	return NULL;
72 }
73 
74 static int aldebaran_mode2_suspend_ip(struct amdgpu_device *adev)
75 {
76 	int r, i;
77 
78 	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
79 	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
80 
81 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
82 		if (!(adev->ip_blocks[i].version->type ==
83 			      AMD_IP_BLOCK_TYPE_GFX ||
84 		      adev->ip_blocks[i].version->type ==
85 			      AMD_IP_BLOCK_TYPE_SDMA))
86 			continue;
87 
88 		r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]);
89 		if (r)
90 			return r;
91 	}
92 
93 	return 0;
94 }
95 
96 static int
97 aldebaran_mode2_prepare_hwcontext(struct amdgpu_reset_control *reset_ctl,
98 				  struct amdgpu_reset_context *reset_context)
99 {
100 	int r = 0;
101 	struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl->handle;
102 
103 	dev_dbg(adev->dev, "Aldebaran prepare hw context\n");
104 	/* Don't suspend on bare metal if we are not going to HW reset the ASIC */
105 	if (!amdgpu_sriov_vf(adev))
106 		r = aldebaran_mode2_suspend_ip(adev);
107 
108 	return r;
109 }
110 
111 static void aldebaran_async_reset(struct work_struct *work)
112 {
113 	struct amdgpu_reset_handler *handler;
114 	struct amdgpu_reset_control *reset_ctl =
115 		container_of(work, struct amdgpu_reset_control, reset_work);
116 	struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl->handle;
117 	int i;
118 
119 	for_each_handler(i, handler, reset_ctl)	{
120 		if (handler->reset_method == reset_ctl->active_reset) {
121 			dev_dbg(adev->dev, "Resetting device\n");
122 			handler->do_reset(adev);
123 			break;
124 		}
125 	}
126 }
127 
128 static int aldebaran_mode2_reset(struct amdgpu_device *adev)
129 {
130 	/* disable BM */
131 	pci_clear_master(adev->pdev);
132 	adev->asic_reset_res = amdgpu_dpm_mode2_reset(adev);
133 	return adev->asic_reset_res;
134 }
135 
136 static int
137 aldebaran_mode2_perform_reset(struct amdgpu_reset_control *reset_ctl,
138 			      struct amdgpu_reset_context *reset_context)
139 {
140 	struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl->handle;
141 	struct list_head *reset_device_list = reset_context->reset_device_list;
142 	struct amdgpu_device *tmp_adev = NULL;
143 	int r = 0;
144 
145 	dev_dbg(adev->dev, "aldebaran perform hw reset\n");
146 
147 	if (reset_device_list == NULL)
148 		return -EINVAL;
149 
150 	if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 2) &&
151 	    reset_context->hive == NULL) {
152 		/* Wrong context, return error */
153 		return -EINVAL;
154 	}
155 
156 	list_for_each_entry(tmp_adev, reset_device_list, reset_list) {
157 		mutex_lock(&tmp_adev->reset_cntl->reset_lock);
158 		tmp_adev->reset_cntl->active_reset = AMD_RESET_METHOD_MODE2;
159 	}
160 	/*
161 	 * Mode2 reset doesn't need any sync between nodes in XGMI hive, instead launch
162 	 * them together so that they can be completed asynchronously on multiple nodes
163 	 */
164 	list_for_each_entry(tmp_adev, reset_device_list, reset_list) {
165 		/* For XGMI run all resets in parallel to speed up the process */
166 		if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
167 			if (!queue_work(system_unbound_wq,
168 					&tmp_adev->reset_cntl->reset_work))
169 				r = -EALREADY;
170 		} else
171 			r = aldebaran_mode2_reset(tmp_adev);
172 		if (r) {
173 			dev_err(tmp_adev->dev,
174 				"ASIC reset failed with error, %d for drm dev, %s",
175 				r, adev_to_drm(tmp_adev)->unique);
176 			break;
177 		}
178 	}
179 
180 	/* For XGMI wait for all resets to complete before proceed */
181 	if (!r) {
182 		list_for_each_entry(tmp_adev, reset_device_list, reset_list) {
183 			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
184 				flush_work(&tmp_adev->reset_cntl->reset_work);
185 				r = tmp_adev->asic_reset_res;
186 				if (r)
187 					break;
188 			}
189 		}
190 	}
191 
192 	list_for_each_entry(tmp_adev, reset_device_list, reset_list) {
193 		mutex_unlock(&tmp_adev->reset_cntl->reset_lock);
194 		tmp_adev->reset_cntl->active_reset = AMD_RESET_METHOD_NONE;
195 	}
196 
197 	return r;
198 }
199 
200 static int aldebaran_mode2_restore_ip(struct amdgpu_device *adev)
201 {
202 	struct amdgpu_firmware_info *ucode_list[AMDGPU_UCODE_ID_MAXIMUM];
203 	struct amdgpu_firmware_info *ucode;
204 	struct amdgpu_ip_block *cmn_block;
205 	int ucode_count = 0;
206 	int i, r;
207 
208 	dev_dbg(adev->dev, "Reloading ucodes after reset\n");
209 	for (i = 0; i < adev->firmware.max_ucodes; i++) {
210 		ucode = &adev->firmware.ucode[i];
211 		if (!ucode->fw)
212 			continue;
213 		switch (ucode->ucode_id) {
214 		case AMDGPU_UCODE_ID_SDMA0:
215 		case AMDGPU_UCODE_ID_SDMA1:
216 		case AMDGPU_UCODE_ID_SDMA2:
217 		case AMDGPU_UCODE_ID_SDMA3:
218 		case AMDGPU_UCODE_ID_SDMA4:
219 		case AMDGPU_UCODE_ID_SDMA5:
220 		case AMDGPU_UCODE_ID_SDMA6:
221 		case AMDGPU_UCODE_ID_SDMA7:
222 		case AMDGPU_UCODE_ID_CP_MEC1:
223 		case AMDGPU_UCODE_ID_CP_MEC1_JT:
224 		case AMDGPU_UCODE_ID_RLC_RESTORE_LIST_CNTL:
225 		case AMDGPU_UCODE_ID_RLC_RESTORE_LIST_GPM_MEM:
226 		case AMDGPU_UCODE_ID_RLC_RESTORE_LIST_SRM_MEM:
227 		case AMDGPU_UCODE_ID_RLC_G:
228 			ucode_list[ucode_count++] = ucode;
229 			break;
230 		default:
231 			break;
232 		}
233 	}
234 
235 	/* Reinit NBIF block */
236 	cmn_block =
237 		amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_COMMON);
238 	if (unlikely(!cmn_block)) {
239 		dev_err(adev->dev, "Failed to get BIF handle\n");
240 		return -EINVAL;
241 	}
242 	r = amdgpu_ip_block_resume(cmn_block);
243 	if (r)
244 		return r;
245 
246 	/* Reinit GFXHUB */
247 	adev->gfxhub.funcs->init(adev);
248 	r = adev->gfxhub.funcs->gart_enable(adev);
249 	if (r) {
250 		dev_err(adev->dev, "GFXHUB gart reenable failed after reset\n");
251 		return r;
252 	}
253 
254 	/* Reload GFX firmware */
255 	r = psp_load_fw_list(&adev->psp, ucode_list, ucode_count);
256 	if (r) {
257 		dev_err(adev->dev, "GFX ucode load failed after reset\n");
258 		return r;
259 	}
260 
261 	/* Resume RLC, FW needs RLC alive to complete reset process */
262 	adev->gfx.rlc.funcs->resume(adev);
263 
264 	/* Wait for FW reset event complete */
265 	r = amdgpu_dpm_wait_for_event(adev, SMU_EVENT_RESET_COMPLETE, 0);
266 	if (r) {
267 		dev_err(adev->dev,
268 			"Failed to get response from firmware after reset\n");
269 		return r;
270 	}
271 
272 	for (i = 0; i < adev->num_ip_blocks; i++) {
273 		if (!(adev->ip_blocks[i].version->type ==
274 			      AMD_IP_BLOCK_TYPE_GFX ||
275 		      adev->ip_blocks[i].version->type ==
276 			      AMD_IP_BLOCK_TYPE_SDMA))
277 			continue;
278 
279 		r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
280 		if (r)
281 			return r;
282 	}
283 
284 	for (i = 0; i < adev->num_ip_blocks; i++) {
285 		if (!(adev->ip_blocks[i].version->type ==
286 			      AMD_IP_BLOCK_TYPE_GFX ||
287 		      adev->ip_blocks[i].version->type ==
288 			      AMD_IP_BLOCK_TYPE_SDMA ||
289 		      adev->ip_blocks[i].version->type ==
290 			      AMD_IP_BLOCK_TYPE_COMMON))
291 			continue;
292 
293 		if (adev->ip_blocks[i].version->funcs->late_init) {
294 			r = adev->ip_blocks[i].version->funcs->late_init(
295 				&adev->ip_blocks[i]);
296 			if (r) {
297 				dev_err(adev->dev,
298 					"late_init of IP block <%s> failed %d after reset\n",
299 					adev->ip_blocks[i].version->funcs->name,
300 					r);
301 				return r;
302 			}
303 		}
304 		adev->ip_blocks[i].status.late_initialized = true;
305 	}
306 
307 	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
308 	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
309 
310 	return r;
311 }
312 
313 static int
314 aldebaran_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl,
315 				  struct amdgpu_reset_context *reset_context)
316 {
317 	struct list_head *reset_device_list = reset_context->reset_device_list;
318 	struct amdgpu_device *tmp_adev = NULL;
319 	struct amdgpu_ras *con;
320 	int r;
321 
322 	if (reset_device_list == NULL)
323 		return -EINVAL;
324 
325 	if (amdgpu_ip_version(reset_context->reset_req_dev, MP1_HWIP, 0) ==
326 		    IP_VERSION(13, 0, 2) &&
327 	    reset_context->hive == NULL) {
328 		/* Wrong context, return error */
329 		return -EINVAL;
330 	}
331 
332 	list_for_each_entry(tmp_adev, reset_device_list, reset_list) {
333 		dev_info(tmp_adev->dev,
334 			 "GPU reset succeeded, trying to resume\n");
335 		r = aldebaran_mode2_restore_ip(tmp_adev);
336 		if (r)
337 			goto end;
338 
339 		/*
340 		 * Add this ASIC as tracked as reset was already
341 		 * complete successfully.
342 		 */
343 		amdgpu_register_gpu_instance(tmp_adev);
344 
345 		/* Resume RAS, ecc_irq */
346 		con = amdgpu_ras_get_context(tmp_adev);
347 		if (!amdgpu_sriov_vf(tmp_adev) && con) {
348 			if (tmp_adev->sdma.ras &&
349 				tmp_adev->sdma.ras->ras_block.ras_late_init) {
350 				r = tmp_adev->sdma.ras->ras_block.ras_late_init(tmp_adev,
351 						&tmp_adev->sdma.ras->ras_block.ras_comm);
352 				if (r) {
353 					dev_err(tmp_adev->dev, "SDMA failed to execute ras_late_init! ret:%d\n", r);
354 					goto end;
355 				}
356 			}
357 
358 			if (tmp_adev->gfx.ras &&
359 				tmp_adev->gfx.ras->ras_block.ras_late_init) {
360 				r = tmp_adev->gfx.ras->ras_block.ras_late_init(tmp_adev,
361 						&tmp_adev->gfx.ras->ras_block.ras_comm);
362 				if (r) {
363 					dev_err(tmp_adev->dev, "GFX failed to execute ras_late_init! ret:%d\n", r);
364 					goto end;
365 				}
366 			}
367 		}
368 
369 		amdgpu_ras_resume(tmp_adev);
370 
371 		/* Update PSP FW topology after reset */
372 		if (reset_context->hive &&
373 		    tmp_adev->gmc.xgmi.num_physical_nodes > 1)
374 			r = amdgpu_xgmi_update_topology(reset_context->hive,
375 							tmp_adev);
376 
377 		if (!r) {
378 			amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
379 
380 			r = amdgpu_ib_ring_tests(tmp_adev);
381 			if (r) {
382 				dev_err(tmp_adev->dev,
383 					"ib ring test failed (%d).\n", r);
384 				r = -EAGAIN;
385 				tmp_adev->asic_reset_res = r;
386 				goto end;
387 			}
388 		}
389 	}
390 
391 end:
392 	return r;
393 }
394 
395 static struct amdgpu_reset_handler aldebaran_mode2_handler = {
396 	.reset_method		= AMD_RESET_METHOD_MODE2,
397 	.prepare_env		= NULL,
398 	.prepare_hwcontext	= aldebaran_mode2_prepare_hwcontext,
399 	.perform_reset		= aldebaran_mode2_perform_reset,
400 	.restore_hwcontext	= aldebaran_mode2_restore_hwcontext,
401 	.restore_env		= NULL,
402 	.do_reset		= aldebaran_mode2_reset,
403 };
404 
405 static struct amdgpu_reset_handler
406 	*aldebaran_rst_handlers[AMDGPU_RESET_MAX_HANDLERS] = {
407 		&aldebaran_mode2_handler,
408 		&xgmi_reset_on_init_handler,
409 	};
410 
411 int aldebaran_reset_init(struct amdgpu_device *adev)
412 {
413 	struct amdgpu_reset_control *reset_ctl;
414 
415 	reset_ctl = kzalloc(sizeof(*reset_ctl), GFP_KERNEL);
416 	if (!reset_ctl)
417 		return -ENOMEM;
418 
419 	reset_ctl->handle = adev;
420 	reset_ctl->async_reset = aldebaran_async_reset;
421 	reset_ctl->active_reset = AMD_RESET_METHOD_NONE;
422 	reset_ctl->get_reset_handler = aldebaran_get_reset_handler;
423 
424 	INIT_WORK(&reset_ctl->reset_work, reset_ctl->async_reset);
425 	/* Only mode2 is handled through reset control now */
426 	reset_ctl->reset_handlers = &aldebaran_rst_handlers;
427 
428 	adev->reset_cntl = reset_ctl;
429 
430 	return 0;
431 }
432 
433 int aldebaran_reset_fini(struct amdgpu_device *adev)
434 {
435 	kfree(adev->reset_cntl);
436 	adev->reset_cntl = NULL;
437 	return 0;
438 }
439