// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2020-2024 Intel Corporation
 */

#include <linux/highmem.h>
#include <linux/moduleparam.h>
#include <linux/pci.h>
#include <linux/pm_runtime.h>
#include <linux/reboot.h>

#include "ivpu_coredump.h"
#include "ivpu_drv.h"
#include "ivpu_fw.h"
#include "ivpu_fw_log.h"
#include "ivpu_hw.h"
#include "ivpu_ipc.h"
#include "ivpu_job.h"
#include "ivpu_jsm_msg.h"
#include "ivpu_mmu.h"
#include "ivpu_ms.h"
#include "ivpu_pm.h"
#include "ivpu_trace.h"
#include "vpu_boot_api.h"

static bool ivpu_disable_recovery;
module_param_named_unsafe(disable_recovery, ivpu_disable_recovery, bool, 0644);
MODULE_PARM_DESC(disable_recovery, "Disables recovery when NPU hang is detected");

static unsigned long ivpu_tdr_timeout_ms;
module_param_named(tdr_timeout_ms, ivpu_tdr_timeout_ms, ulong, 0644);
MODULE_PARM_DESC(tdr_timeout_ms, "Timeout for device hang detection, in milliseconds, 0 - default");

#define PM_RESCHEDULE_LIMIT     5

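/*
 * Reset IPC and command queue state and reload the firmware image so that
 * the next boot starts from the cold boot entry point.
 */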
static void ivpu_pm_prepare_cold_boot(struct ivpu_device *vdev)
{
	struct ivpu_fw_info *fw = vdev->fw;

	ivpu_cmdq_reset_all_contexts(vdev);
	ivpu_ipc_reset(vdev);
	ivpu_fw_log_reset(vdev);
	ivpu_fw_load(vdev);
	fw->entry_point = fw->cold_boot_entry_point;
}

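/*
 * Resume the firmware from the save/restore return address it published in
 * the boot parameters, falling back to a cold boot if none was saved.
 */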
static void ivpu_pm_prepare_warm_boot(struct ivpu_device *vdev)
{
	struct ivpu_fw_info *fw = vdev->fw;
	struct vpu_boot_params *bp = ivpu_bo_vaddr(fw->mem);

	if (!bp->save_restore_ret_address) {
		ivpu_pm_prepare_cold_boot(vdev);
		return;
	}

	ivpu_dbg(vdev, FW_BOOT, "Save/restore entry point %llx\n", bp->save_restore_ret_address);
	fw->entry_point = bp->save_restore_ret_address;
}

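/* Quiesce the device and power it down. */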
static int ivpu_suspend(struct ivpu_device *vdev)
{
	int ret;

	ivpu_prepare_for_reset(vdev);

	ret = ivpu_shutdown(vdev);
	if (ret)
		ivpu_err(vdev, "Failed to shutdown NPU: %d\n", ret);

	return ret;
}

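/*
 * Power up the device and boot the firmware. If a warm boot fails, fall
 * back to a cold boot and retry once before giving up.
 */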
static int ivpu_resume(struct ivpu_device *vdev)
{
	int ret;

retry:
	pci_restore_state(to_pci_dev(vdev->drm.dev));
	pci_set_power_state(to_pci_dev(vdev->drm.dev), PCI_D0);

	ret = ivpu_hw_power_up(vdev);
	if (ret) {
		ivpu_err(vdev, "Failed to power up HW: %d\n", ret);
		goto err_power_down;
	}

	ret = ivpu_mmu_enable(vdev);
	if (ret) {
		ivpu_err(vdev, "Failed to resume MMU: %d\n", ret);
		goto err_power_down;
	}

	ret = ivpu_boot(vdev);
	if (ret)
		goto err_mmu_disable;

	return 0;

err_mmu_disable:
	ivpu_mmu_disable(vdev);
err_power_down:
	ivpu_hw_power_down(vdev);
	pci_set_power_state(to_pci_dev(vdev->drm.dev), PCI_D3hot);

	if (!ivpu_fw_is_cold_boot(vdev)) {
		ivpu_pm_prepare_cold_boot(vdev);
		goto retry;
	} else {
		ivpu_err(vdev, "Failed to resume the FW: %d\n", ret);
	}

	return ret;
}

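/*
 * Recovery worker: dump diagnostic state, reset the device, abort all
 * in-flight jobs and cold boot the firmware.
 */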
static void ivpu_pm_recovery_work(struct work_struct *work)
{
	struct ivpu_pm_info *pm = container_of(work, struct ivpu_pm_info, recovery_work);
	struct ivpu_device *vdev = pm->vdev;
	char *evt[2] = {"IVPU_PM_EVENT=IVPU_RECOVER", NULL};
	int ret;

	ivpu_err(vdev, "Recovering the NPU (reset #%d)\n", atomic_read(&vdev->pm->reset_counter));

	ret = pm_runtime_resume_and_get(vdev->drm.dev);
	if (ret)
		ivpu_err(vdev, "Failed to resume NPU: %d\n", ret);

	ivpu_jsm_state_dump(vdev);
	ivpu_dev_coredump(vdev);

	atomic_inc(&vdev->pm->reset_counter);
	atomic_set(&vdev->pm->reset_pending, 1);
	down_write(&vdev->pm->reset_lock);

	ivpu_suspend(vdev);
	ivpu_pm_prepare_cold_boot(vdev);
	ivpu_jobs_abort_all(vdev);
	ivpu_ms_cleanup_all(vdev);

	ret = ivpu_resume(vdev);
	if (ret)
		ivpu_err(vdev, "Failed to resume NPU: %d\n", ret);

	up_write(&vdev->pm->reset_lock);
	atomic_set(&vdev->pm->reset_pending, 0);

	kobject_uevent_env(&vdev->drm.dev->kobj, KOBJ_CHANGE, evt);
	pm_runtime_mark_last_busy(vdev->drm.dev);
	pm_runtime_put_autosuspend(vdev->drm.dev);
}

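/*
 * Schedule the recovery worker unless recovery is disabled via the module
 * parameter, unsupported on the platform (FPGA), or already in progress.
 */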
void ivpu_pm_trigger_recovery(struct ivpu_device *vdev, const char *reason)
{
	ivpu_err(vdev, "Recovery triggered by %s\n", reason);

	if (ivpu_disable_recovery) {
		ivpu_err(vdev, "Recovery not available when disable_recovery param is set\n");
		return;
	}

	if (ivpu_is_fpga(vdev)) {
		ivpu_err(vdev, "Recovery not available on FPGA\n");
		return;
	}

	/* Trigger recovery if it's not in progress */
	if (atomic_cmpxchg(&vdev->pm->reset_pending, 0, 1) == 0) {
		ivpu_hw_diagnose_failure(vdev);
		ivpu_hw_irq_disable(vdev); /* Disable IRQ early to protect from IRQ storm */
		queue_work(system_long_wq, &vdev->pm->recovery_work);
	}
}

static void ivpu_job_timeout_work(struct work_struct *work)
{
	struct ivpu_pm_info *pm = container_of(work, struct ivpu_pm_info, job_timeout_work.work);
	struct ivpu_device *vdev = pm->vdev;

	ivpu_pm_trigger_recovery(vdev, "TDR");
}

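/*
 * Arm the TDR (timeout detection and recovery) timer. The tdr_timeout_ms
 * module parameter, when non-zero, overrides the per-platform default.
 */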
void ivpu_start_job_timeout_detection(struct ivpu_device *vdev)
{
	unsigned long timeout_ms = ivpu_tdr_timeout_ms ? ivpu_tdr_timeout_ms : vdev->timeout.tdr;

	/* No-op if already queued */
	queue_delayed_work(system_wq, &vdev->pm->job_timeout_work, msecs_to_jiffies(timeout_ms));
}

void ivpu_stop_job_timeout_detection(struct ivpu_device *vdev)
{
	cancel_delayed_work_sync(&vdev->pm->job_timeout_work);
}

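/*
 * System suspend callback: wait up to the TDR timeout for the NPU to go
 * idle, then enter D0i3 and prepare a warm boot for resume.
 */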
int ivpu_pm_suspend_cb(struct device *dev)
{
	struct drm_device *drm = dev_get_drvdata(dev);
	struct ivpu_device *vdev = to_ivpu_device(drm);
	unsigned long timeout;

	trace_pm("suspend");
	ivpu_dbg(vdev, PM, "Suspend..\n");

	timeout = jiffies + msecs_to_jiffies(vdev->timeout.tdr);
	while (!ivpu_hw_is_idle(vdev)) {
		cond_resched();
		if (time_after_eq(jiffies, timeout)) {
			ivpu_err(vdev, "Failed to enter idle on system suspend\n");
			return -EBUSY;
		}
	}

	ivpu_jsm_pwr_d0i3_enter(vdev);

	ivpu_suspend(vdev);
	ivpu_pm_prepare_warm_boot(vdev);

	ivpu_dbg(vdev, PM, "Suspend done.\n");
	trace_pm("suspend done");

	return 0;
}

int ivpu_pm_resume_cb(struct device *dev)
{
	struct drm_device *drm = dev_get_drvdata(dev);
	struct ivpu_device *vdev = to_ivpu_device(drm);
	int ret;

	trace_pm("resume");
	ivpu_dbg(vdev, PM, "Resume..\n");

	ret = ivpu_resume(vdev);
	if (ret)
		ivpu_err(vdev, "Failed to resume: %d\n", ret);

	ivpu_dbg(vdev, PM, "Resume done.\n");
	trace_pm("resume done");

	return ret;
}

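/*
 * Runtime suspend callback: always powers the device down and returns 0,
 * but forces a coredump and a cold boot on the next resume if the NPU was
 * not idle or the D0i3 entry request failed.
 */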
int ivpu_pm_runtime_suspend_cb(struct device *dev)
{
	struct drm_device *drm = dev_get_drvdata(dev);
	struct ivpu_device *vdev = to_ivpu_device(drm);
	int ret, ret_d0i3;
	bool is_idle;

	drm_WARN_ON(&vdev->drm, !xa_empty(&vdev->submitted_jobs_xa));
	drm_WARN_ON(&vdev->drm, work_pending(&vdev->pm->recovery_work));

	trace_pm("runtime suspend");
	ivpu_dbg(vdev, PM, "Runtime suspend..\n");

	ivpu_mmu_disable(vdev);

	is_idle = ivpu_hw_is_idle(vdev) || vdev->pm->dct_active_percent;
	if (!is_idle)
		ivpu_err(vdev, "NPU is not idle before autosuspend\n");

	ret_d0i3 = ivpu_jsm_pwr_d0i3_enter(vdev);
	if (ret_d0i3)
		ivpu_err(vdev, "Failed to prepare for d0i3: %d\n", ret_d0i3);

	ret = ivpu_suspend(vdev);
	if (ret)
		ivpu_err(vdev, "Failed to suspend NPU: %d\n", ret);

	if (!is_idle || ret_d0i3) {
		ivpu_err(vdev, "Forcing cold boot due to previous errors\n");
		atomic_inc(&vdev->pm->reset_counter);
		ivpu_dev_coredump(vdev);
		ivpu_pm_prepare_cold_boot(vdev);
	} else {
		ivpu_pm_prepare_warm_boot(vdev);
	}

	ivpu_dbg(vdev, PM, "Runtime suspend done.\n");
	trace_pm("runtime suspend done");

	return 0;
}

int ivpu_pm_runtime_resume_cb(struct device *dev)
{
	struct drm_device *drm = dev_get_drvdata(dev);
	struct ivpu_device *vdev = to_ivpu_device(drm);
	int ret;

	trace_pm("runtime resume");
	ivpu_dbg(vdev, PM, "Runtime resume..\n");

	ret = ivpu_resume(vdev);
	if (ret)
		ivpu_err(vdev, "Failed to set RESUME state: %d\n", ret);

	ivpu_dbg(vdev, PM, "Runtime resume done.\n");
	trace_pm("runtime resume done");

	return ret;
}

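/* Take a runtime PM reference, resuming the device if it is suspended. */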
int ivpu_rpm_get(struct ivpu_device *vdev)
{
	int ret;

	ret = pm_runtime_resume_and_get(vdev->drm.dev);
	drm_WARN_ON(&vdev->drm, ret < 0);

	return ret;
}

void ivpu_rpm_put(struct ivpu_device *vdev)
{
	pm_runtime_mark_last_busy(vdev->drm.dev);
	pm_runtime_put_autosuspend(vdev->drm.dev);
}

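/*
 * PCI reset_prepare callback: keep the device awake, block submissions via
 * the reset lock, abort outstanding jobs and set up for a cold boot.
 */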
void ivpu_pm_reset_prepare_cb(struct pci_dev *pdev)
{
	struct ivpu_device *vdev = pci_get_drvdata(pdev);

	ivpu_dbg(vdev, PM, "Pre-reset..\n");
	atomic_inc(&vdev->pm->reset_counter);
	atomic_set(&vdev->pm->reset_pending, 1);

	pm_runtime_get_sync(vdev->drm.dev);
	down_write(&vdev->pm->reset_lock);
	ivpu_prepare_for_reset(vdev);
	ivpu_hw_reset(vdev);
	ivpu_pm_prepare_cold_boot(vdev);
	ivpu_jobs_abort_all(vdev);
	ivpu_ms_cleanup_all(vdev);

	ivpu_dbg(vdev, PM, "Pre-reset done.\n");
}

void ivpu_pm_reset_done_cb(struct pci_dev *pdev)
{
	struct ivpu_device *vdev = pci_get_drvdata(pdev);
	int ret;

	ivpu_dbg(vdev, PM, "Post-reset..\n");
	ret = ivpu_resume(vdev);
	if (ret)
		ivpu_err(vdev, "Failed to set RESUME state: %d\n", ret);
	up_write(&vdev->pm->reset_lock);
	atomic_set(&vdev->pm->reset_pending, 0);
	ivpu_dbg(vdev, PM, "Post-reset done.\n");

	pm_runtime_mark_last_busy(vdev->drm.dev);
	pm_runtime_put_autosuspend(vdev->drm.dev);
}

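/*
 * Initialize PM state and configure runtime PM autosuspend. A negative
 * delay (used when recovery is disabled) prevents automatic suspend.
 */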
void ivpu_pm_init(struct ivpu_device *vdev)
{
	struct device *dev = vdev->drm.dev;
	struct ivpu_pm_info *pm = vdev->pm;
	int delay;

	pm->vdev = vdev;

	init_rwsem(&pm->reset_lock);
	atomic_set(&pm->reset_pending, 0);
	atomic_set(&pm->reset_counter, 0);

	INIT_WORK(&pm->recovery_work, ivpu_pm_recovery_work);
	INIT_DELAYED_WORK(&pm->job_timeout_work, ivpu_job_timeout_work);

	if (ivpu_disable_recovery)
		delay = -1;
	else
		delay = vdev->timeout.autosuspend;

	pm_runtime_use_autosuspend(dev);
	pm_runtime_set_autosuspend_delay(dev, delay);

	ivpu_dbg(vdev, PM, "Autosuspend delay = %d\n", delay);
}

void ivpu_pm_cancel_recovery(struct ivpu_device *vdev)
{
	drm_WARN_ON(&vdev->drm, delayed_work_pending(&vdev->pm->job_timeout_work));
	cancel_work_sync(&vdev->pm->recovery_work);
}

void ivpu_pm_enable(struct ivpu_device *vdev)
{
	struct device *dev = vdev->drm.dev;

	pm_runtime_set_active(dev);
	pm_runtime_allow(dev);
	pm_runtime_mark_last_busy(dev);
	pm_runtime_put_autosuspend(dev);
}

void ivpu_pm_disable(struct ivpu_device *vdev)
{
	pm_runtime_get_noresume(vdev->drm.dev);
	pm_runtime_forbid(vdev->drm.dev);
}

int ivpu_pm_dct_init(struct ivpu_device *vdev)
{
	if (vdev->pm->dct_active_percent)
		return ivpu_pm_dct_enable(vdev, vdev->pm->dct_active_percent);

	return 0;
}

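/*
 * Enable duty cycle throttling (DCT): split each DCT_PERIOD_US window into
 * active (D0) and inactive (D0i2) time according to active_percent.
 */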
int ivpu_pm_dct_enable(struct ivpu_device *vdev, u8 active_percent)
{
	u32 active_us, inactive_us;
	int ret;

	if (active_percent == 0 || active_percent > 100)
		return -EINVAL;

	active_us = (DCT_PERIOD_US * active_percent) / 100;
	inactive_us = DCT_PERIOD_US - active_us;

	ret = ivpu_jsm_dct_enable(vdev, active_us, inactive_us);
	if (ret) {
		ivpu_err_ratelimited(vdev, "Failed to enable DCT: %d\n", ret);
		return ret;
	}

	vdev->pm->dct_active_percent = active_percent;

	ivpu_dbg(vdev, PM, "DCT set to %u%% (D0: %uus, D0i2: %uus)\n",
		 active_percent, active_us, inactive_us);
	return 0;
}

int ivpu_pm_dct_disable(struct ivpu_device *vdev)
{
	int ret;

	ret = ivpu_jsm_dct_disable(vdev);
	if (ret) {
		ivpu_err_ratelimited(vdev, "Failed to disable DCT: %d\n", ret);
		return ret;
	}

	vdev->pm->dct_active_percent = 0;

	ivpu_dbg(vdev, PM, "DCT disabled\n");
	return 0;
}

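/*
 * Handle a DCT request from the buttress (BTRS) interface: apply the
 * default duty cycle when throttling is requested, disable it otherwise,
 * and acknowledge the resulting state back to the HW.
 */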
void ivpu_pm_dct_irq_thread_handler(struct ivpu_device *vdev)
{
	bool enable;
	int ret;

	if (ivpu_hw_btrs_dct_get_request(vdev, &enable))
		return;

	/* Act on the requested state, not on the currently programmed one */
	if (enable)
		ret = ivpu_pm_dct_enable(vdev, DCT_DEFAULT_ACTIVE_PERCENT);
	else
		ret = ivpu_pm_dct_disable(vdev);

	if (!ret)
		ivpu_hw_btrs_dct_set_status(vdev, enable, vdev->pm->dct_active_percent);
}
469