// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2020-2026 Intel Corporation
 */

#include <linux/highmem.h>
#include <linux/moduleparam.h>
#include <linux/pci.h>
#include <linux/pm_runtime.h>
#include <linux/reboot.h>

#include "ivpu_coredump.h"
#include "ivpu_drv.h"
#include "ivpu_fw.h"
#include "ivpu_fw_log.h"
#include "ivpu_hw.h"
#include "ivpu_ipc.h"
#include "ivpu_job.h"
#include "ivpu_jsm_msg.h"
#include "ivpu_mmu.h"
#include "ivpu_ms.h"
#include "ivpu_pm.h"
#include "ivpu_trace.h"
#include "vpu_boot_api.h"

static bool ivpu_disable_recovery;
#if IS_ENABLED(CONFIG_DRM_ACCEL_IVPU_DEBUG)
module_param_named_unsafe(disable_recovery, ivpu_disable_recovery, bool, 0644);
MODULE_PARM_DESC(disable_recovery, "Disables recovery when NPU hang is detected");
#endif

static unsigned long ivpu_tdr_timeout_ms;
module_param_named(tdr_timeout_ms, ivpu_tdr_timeout_ms, ulong, 0644);
MODULE_PARM_DESC(tdr_timeout_ms, "Timeout for device hang detection, in milliseconds, 0 - default");

static unsigned long ivpu_inference_timeout_ms;
module_param_named(inference_timeout_ms, ivpu_inference_timeout_ms, ulong, 0644);
MODULE_PARM_DESC(inference_timeout_ms, "Inference maximum duration, in milliseconds, 0 - default");

#define PM_RESCHEDULE_LIMIT     5

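/*
 * Reset all contexts, IPC state and the FW log, then reload the firmware
 * image so that the next boot starts from the cold boot entry point.
 */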
static void ivpu_pm_prepare_cold_boot(struct ivpu_device *vdev)
{
	struct ivpu_fw_info *fw = vdev->fw;

	ivpu_cmdq_reset_all_contexts(vdev);
	ivpu_ipc_reset(vdev);
	ivpu_fw_log_reset(vdev);
	ivpu_fw_load(vdev);
	fw->last_heartbeat = 0;

	ivpu_dbg(vdev, FW_BOOT, "Cold boot entry point 0x%llx", vdev->fw->cold_boot_entry_point);
	fw->next_boot_mode = VPU_BOOT_TYPE_COLDBOOT;
}

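/*
 * Arm the next boot to resume from the entry point the FW saved in the
 * boot parameters; fall back to a cold boot if none was saved.
 */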
static void ivpu_pm_prepare_warm_boot(struct ivpu_device *vdev)
{
	struct ivpu_fw_info *fw = vdev->fw;
	struct vpu_boot_params *bp = ivpu_bo_vaddr(fw->mem_bp);

	fw->warm_boot_entry_point = bp->save_restore_ret_address;
	if (!fw->warm_boot_entry_point) {
		ivpu_pm_prepare_cold_boot(vdev);
		return;
	}

	ivpu_dbg(vdev, FW_BOOT, "Warm boot entry point 0x%llx", fw->warm_boot_entry_point);
	fw->next_boot_mode = VPU_BOOT_TYPE_WARMBOOT;
}

static int ivpu_suspend(struct ivpu_device *vdev)
{
	int ret;

	ivpu_prepare_for_reset(vdev);

	ret = ivpu_shutdown(vdev);
	if (ret)
		ivpu_err(vdev, "Failed to shutdown NPU: %d\n", ret);

	return ret;
}

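/*
 * Restore PCI state, power up the HW and boot the FW. If a warm boot
 * attempt fails, fall back to a cold boot and retry once.
 */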
static int ivpu_resume(struct ivpu_device *vdev)
{
	int ret;

retry:
	pci_set_power_state(to_pci_dev(vdev->drm.dev), PCI_D0);
	pci_restore_state(to_pci_dev(vdev->drm.dev));

	ret = ivpu_hw_power_up(vdev);
	if (ret) {
		ivpu_err(vdev, "Failed to power up HW: %d\n", ret);
		goto err_power_down;
	}

	ret = ivpu_mmu_enable(vdev);
	if (ret) {
		ivpu_err(vdev, "Failed to resume MMU: %d\n", ret);
		goto err_power_down;
	}

	ret = ivpu_boot(vdev);
	if (ret)
		goto err_mmu_disable;

	return 0;

err_mmu_disable:
	ivpu_mmu_disable(vdev);
err_power_down:
	ivpu_hw_power_down(vdev);
	pci_set_power_state(to_pci_dev(vdev->drm.dev), PCI_D3hot);

	if (ivpu_fw_is_warm_boot(vdev)) {
		ivpu_pm_prepare_cold_boot(vdev);
		goto retry;
	} else {
		ivpu_err(vdev, "Failed to resume the FW: %d\n", ret);
	}

	return ret;
}

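/*
 * Disable runtime PM, mark a reset as pending and take the reset lock for
 * writing. Paired with ivpu_pm_reset_complete().
 */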
static void ivpu_pm_reset_begin(struct ivpu_device *vdev)
{
	pm_runtime_disable(vdev->drm.dev);

	atomic_inc(&vdev->pm->reset_counter);
	atomic_set(&vdev->pm->reset_pending, 1);
	down_write(&vdev->pm->reset_lock);
}

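/*
 * Bring the NPU back after a reset: abort all pending jobs, clean up
 * metric streamer state, cold boot the FW and re-enable runtime PM.
 */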
static void ivpu_pm_reset_complete(struct ivpu_device *vdev)
{
	int ret;

	ivpu_pm_prepare_cold_boot(vdev);
	ivpu_jobs_abort_all(vdev);
	ivpu_ms_cleanup_all(vdev);

	ret = ivpu_resume(vdev);
	if (ret) {
		ivpu_err(vdev, "Failed to resume NPU: %d\n", ret);
		pm_runtime_set_suspended(vdev->drm.dev);
	} else {
		pm_runtime_set_active(vdev->drm.dev);
	}

	up_write(&vdev->pm->reset_lock);
	atomic_set(&vdev->pm->reset_pending, 0);

	pm_runtime_mark_last_busy(vdev->drm.dev);
	pm_runtime_enable(vdev->drm.dev);
}

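/*
 * Recovery worker: capture FW state and a coredump while the NPU is still
 * powered, reset it and notify user space with a uevent.
 */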
static void ivpu_pm_recovery_work(struct work_struct *work)
{
	struct ivpu_pm_info *pm = container_of(work, struct ivpu_pm_info, recovery_work);
	struct ivpu_device *vdev = pm->vdev;
	char *evt[2] = {"IVPU_PM_EVENT=IVPU_RECOVER", NULL};

	ivpu_err(vdev, "Recovering the NPU (reset #%d)\n", atomic_read(&vdev->pm->reset_counter));

	ivpu_pm_reset_begin(vdev);

	if (!pm_runtime_status_suspended(vdev->drm.dev)) {
		ivpu_jsm_state_dump_no_reply(vdev);
		ivpu_dev_coredump(vdev);
		ivpu_suspend(vdev);
	}

	ivpu_pm_reset_complete(vdev);

	kobject_uevent_env(&vdev->drm.dev->kobj, KOBJ_CHANGE, evt);
}

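/*
 * The reset_pending flag is claimed with an atomic cmpxchg, so concurrent
 * callers cannot queue the recovery work more than once per reset.
 */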
void ivpu_pm_trigger_recovery(struct ivpu_device *vdev, const char *reason)
{
	ivpu_err(vdev, "Recovery triggered by %s\n", reason);

	if (ivpu_disable_recovery) {
		ivpu_err(vdev, "Recovery not available when disable_recovery param is set\n");
		return;
	}

	/* Trigger recovery if it's not in progress */
	if (atomic_cmpxchg(&vdev->pm->reset_pending, 0, 1) == 0) {
		ivpu_hw_diagnose_failure(vdev);
		ivpu_hw_irq_disable(vdev); /* Disable IRQ early to protect from IRQ storm */
		queue_work(system_dfl_wq, &vdev->pm->recovery_work);
	}
}

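/*
 * TDR worker: a timed-out job is tolerated as long as the FW heartbeat
 * keeps progressing and the inference timeout is not exceeded; otherwise
 * recover the NPU (OS scheduling) or abort the context (HW scheduling).
 */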
static void ivpu_job_timeout_work(struct work_struct *work)
{
	struct ivpu_pm_info *pm = container_of(work, struct ivpu_pm_info, job_timeout_work.work);
	struct ivpu_device *vdev = pm->vdev;
	unsigned long timeout_ms = ivpu_tdr_timeout_ms ? ivpu_tdr_timeout_ms : vdev->timeout.tdr;
	unsigned long inference_timeout_ms = ivpu_inference_timeout_ms ? ivpu_inference_timeout_ms :
					     vdev->timeout.inference;
	u64 inference_max_retries;
	u64 heartbeat;

	if (ivpu_jsm_get_heartbeat(vdev, 0, &heartbeat) || heartbeat <= vdev->fw->last_heartbeat) {
		ivpu_err(vdev, "Job timeout detected, heartbeat not progressed\n");
		goto abort;
	}

	inference_max_retries = DIV_ROUND_UP(inference_timeout_ms, timeout_ms);
	if (atomic_fetch_inc(&vdev->job_timeout_counter) >= inference_max_retries) {
		ivpu_err(vdev, "Job timeout detected, heartbeat limit (%lld) exceeded\n",
			 inference_max_retries);
		goto abort;
	}

	vdev->fw->last_heartbeat = heartbeat;
	ivpu_start_job_timeout_detection(vdev);
	return;

abort:
	atomic_set(&vdev->job_timeout_counter, 0);

	if (vdev->fw->sched_mode == VPU_SCHEDULING_MODE_OS) {
		ivpu_pm_trigger_recovery(vdev, "Job timeout");
		return;
	}

	ivpu_jsm_state_dump(vdev);
	ivpu_dev_coredump(vdev);
	queue_work(system_percpu_wq, &vdev->context_abort_work);
}

void ivpu_start_job_timeout_detection(struct ivpu_device *vdev)
{
	unsigned long timeout_ms = ivpu_tdr_timeout_ms ? ivpu_tdr_timeout_ms : vdev->timeout.tdr;

	/* No-op if already queued */
	queue_delayed_work(system_percpu_wq, &vdev->pm->job_timeout_work,
			   msecs_to_jiffies(timeout_ms));
}

void ivpu_stop_job_timeout_detection(struct ivpu_device *vdev)
{
	cancel_delayed_work_sync(&vdev->pm->job_timeout_work);
	atomic_set(&vdev->job_timeout_counter, 0);
}

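/*
 * System suspend callback: wait up to the TDR timeout for the NPU to go
 * idle, enter D0i3 and prepare a warm boot for the next resume.
 */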
int ivpu_pm_suspend_cb(struct device *dev)
{
	struct drm_device *drm = dev_get_drvdata(dev);
	struct ivpu_device *vdev = to_ivpu_device(drm);
	unsigned long timeout;

	trace_pm("suspend");
	ivpu_dbg(vdev, PM, "Suspend..\n");

	timeout = jiffies + msecs_to_jiffies(vdev->timeout.tdr);
	while (!ivpu_hw_is_idle(vdev)) {
		cond_resched();
		if (time_after_eq(jiffies, timeout)) {
			ivpu_err(vdev, "Failed to enter idle on system suspend\n");
			return -EBUSY;
		}
	}

	ivpu_jsm_pwr_d0i3_enter(vdev);

	ivpu_suspend(vdev);
	ivpu_pm_prepare_warm_boot(vdev);

	ivpu_dbg(vdev, PM, "Suspend done.\n");
	trace_pm("suspend done");

	return 0;
}

int ivpu_pm_resume_cb(struct device *dev)
{
	struct drm_device *drm = dev_get_drvdata(dev);
	struct ivpu_device *vdev = to_ivpu_device(drm);
	int ret;

	trace_pm("resume");
	ivpu_dbg(vdev, PM, "Resume..\n");

	ret = ivpu_resume(vdev);
	if (ret)
		ivpu_err(vdev, "Failed to resume: %d\n", ret);

	ivpu_dbg(vdev, PM, "Resume done.\n");
	trace_pm("resume done");

	return ret;
}

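/*
 * Runtime suspend callback: if the NPU failed to idle or to enter D0i3,
 * capture a coredump and force a cold boot on the next resume; otherwise
 * the next resume uses the faster warm boot path.
 */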
int ivpu_pm_runtime_suspend_cb(struct device *dev)
{
	struct drm_device *drm = dev_get_drvdata(dev);
	struct ivpu_device *vdev = to_ivpu_device(drm);
	int ret, ret_d0i3;
	bool is_idle;

	drm_WARN_ON(&vdev->drm, !xa_empty(&vdev->submitted_jobs_xa));
	drm_WARN_ON(&vdev->drm, work_pending(&vdev->pm->recovery_work));

	trace_pm("runtime suspend");
	ivpu_dbg(vdev, PM, "Runtime suspend..\n");

	ivpu_mmu_disable(vdev);

	is_idle = ivpu_hw_is_idle(vdev) || vdev->pm->dct_active_percent;
	if (!is_idle)
		ivpu_err(vdev, "NPU is not idle before autosuspend\n");

	ret_d0i3 = ivpu_jsm_pwr_d0i3_enter(vdev);
	if (ret_d0i3)
		ivpu_err(vdev, "Failed to prepare for d0i3: %d\n", ret_d0i3);

	ret = ivpu_suspend(vdev);
	if (ret)
		ivpu_err(vdev, "Failed to suspend NPU: %d\n", ret);

	if (!is_idle || ret_d0i3) {
		ivpu_err(vdev, "Forcing cold boot due to previous errors\n");
		atomic_inc(&vdev->pm->reset_counter);
		ivpu_dev_coredump(vdev);
		ivpu_pm_prepare_cold_boot(vdev);
	} else {
		ivpu_pm_prepare_warm_boot(vdev);
	}

	ivpu_dbg(vdev, PM, "Runtime suspend done.\n");
	trace_pm("runtime suspend done");

	return 0;
}

int ivpu_pm_runtime_resume_cb(struct device *dev)
{
	struct drm_device *drm = dev_get_drvdata(dev);
	struct ivpu_device *vdev = to_ivpu_device(drm);
	int ret;

	trace_pm("runtime resume");
	ivpu_dbg(vdev, PM, "Runtime resume..\n");

	ret = ivpu_resume(vdev);
	if (ret)
		ivpu_err(vdev, "Failed to set RESUME state: %d\n", ret);

	ivpu_dbg(vdev, PM, "Runtime resume done.\n");
	trace_pm("runtime resume done");

	return ret;
}

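/* Take a runtime PM reference, resuming the NPU if it is suspended. */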
int ivpu_rpm_get(struct ivpu_device *vdev)
{
	int ret;

	ret = pm_runtime_resume_and_get(vdev->drm.dev);
	if (ret < 0) {
		ivpu_err(vdev, "Failed to resume NPU: %d\n", ret);
		pm_runtime_set_suspended(vdev->drm.dev);
	}

	return ret;
}

void ivpu_rpm_put(struct ivpu_device *vdev)
{
	pm_runtime_put_autosuspend(vdev->drm.dev);
}

void ivpu_pm_reset_prepare_cb(struct pci_dev *pdev)
{
	struct ivpu_device *vdev = pci_get_drvdata(pdev);

	ivpu_dbg(vdev, PM, "Pre-reset..\n");

	ivpu_pm_reset_begin(vdev);

	if (!pm_runtime_status_suspended(vdev->drm.dev)) {
		ivpu_prepare_for_reset(vdev);
		ivpu_hw_reset(vdev);
	}

	ivpu_dbg(vdev, PM, "Pre-reset done.\n");
}

void ivpu_pm_reset_done_cb(struct pci_dev *pdev)
{
	struct ivpu_device *vdev = pci_get_drvdata(pdev);

	ivpu_dbg(vdev, PM, "Post-reset..\n");

	ivpu_pm_reset_complete(vdev);

	ivpu_dbg(vdev, PM, "Post-reset done.\n");
}

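/*
 * Initialize PM state and configure runtime PM autosuspend. A negative
 * delay (used when recovery is disabled) disables autosuspend entirely.
 */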
void ivpu_pm_init(struct ivpu_device *vdev)
{
	struct device *dev = vdev->drm.dev;
	struct ivpu_pm_info *pm = vdev->pm;
	int delay;

	pm->vdev = vdev;

	init_rwsem(&pm->reset_lock);
	atomic_set(&pm->reset_pending, 0);
	atomic_set(&pm->reset_counter, 0);
	atomic_set(&pm->engine_reset_counter, 0);

	INIT_WORK(&pm->recovery_work, ivpu_pm_recovery_work);
	INIT_DELAYED_WORK(&pm->job_timeout_work, ivpu_job_timeout_work);

	if (ivpu_disable_recovery)
		delay = -1;
	else
		delay = vdev->timeout.autosuspend;

	pm_runtime_use_autosuspend(dev);
	pm_runtime_set_autosuspend_delay(dev, delay);
	pm_runtime_set_active(dev);

	ivpu_dbg(vdev, PM, "Autosuspend delay = %d\n", delay);
}

void ivpu_pm_disable_recovery(struct ivpu_device *vdev)
{
	drm_WARN_ON(&vdev->drm, delayed_work_pending(&vdev->pm->job_timeout_work));
	disable_work_sync(&vdev->pm->recovery_work);
}

void ivpu_pm_enable(struct ivpu_device *vdev)
{
	struct device *dev = vdev->drm.dev;

	pm_runtime_allow(dev);
	pm_runtime_put_autosuspend(dev);
}

void ivpu_pm_disable(struct ivpu_device *vdev)
{
	pm_runtime_get_noresume(vdev->drm.dev);
	pm_runtime_forbid(vdev->drm.dev);
}

int ivpu_pm_dct_init(struct ivpu_device *vdev)
{
	if (vdev->pm->dct_active_percent)
		return ivpu_pm_dct_enable(vdev, vdev->pm->dct_active_percent);

	return 0;
}

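/*
 * Duty cycle throttling: within each DCT_PERIOD_US window the NPU stays in
 * D0 for active_percent of the time and in D0i2 for the remainder.
 */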
int ivpu_pm_dct_enable(struct ivpu_device *vdev, u8 active_percent)
{
	u32 active_us, inactive_us;
	int ret;

	if (active_percent == 0 || active_percent > 100)
		return -EINVAL;

	active_us = (DCT_PERIOD_US * active_percent) / 100;
	inactive_us = DCT_PERIOD_US - active_us;

	vdev->pm->dct_active_percent = active_percent;

	ivpu_dbg(vdev, PM, "DCT requested %u%% (D0: %uus, D0i2: %uus)\n",
		 active_percent, active_us, inactive_us);

	ret = ivpu_jsm_dct_enable(vdev, active_us, inactive_us);
	if (ret) {
		ivpu_err_ratelimited(vdev, "Failed to enable DCT: %d\n", ret);
		return ret;
	}

	return 0;
}

int ivpu_pm_dct_disable(struct ivpu_device *vdev)
{
	int ret;

	vdev->pm->dct_active_percent = 0;

	ivpu_dbg(vdev, PM, "DCT requested to be disabled\n");

	ret = ivpu_jsm_dct_disable(vdev);
	if (ret) {
		ivpu_err_ratelimited(vdev, "Failed to disable DCT: %d\n", ret);
		return ret;
	}

	return 0;
}

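/*
 * Apply a DCT request signaled by HW and report the resulting active
 * percentage back, converted to U1.7 fixed point format.
 */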
void ivpu_pm_irq_dct_work_fn(struct work_struct *work)
{
	struct ivpu_device *vdev = container_of(work, struct ivpu_device, irq_dct_work);
	bool enable;
	int ret;

	if (ivpu_hw_btrs_dct_get_request(vdev, &enable))
		return;

	if (enable)
		ret = ivpu_pm_dct_enable(vdev, DCT_DEFAULT_ACTIVE_PERCENT);
	else
		ret = ivpu_pm_dct_disable(vdev);

	if (!ret) {
		/* Convert percent to U1.7 format */
		u8 val = DIV_ROUND_CLOSEST(vdev->pm->dct_active_percent * 128, 100);

		ivpu_hw_btrs_dct_set_status(vdev, enable, val);
	}
}
524