xref: /linux/drivers/accel/ivpu/ivpu_pm.c (revision c17ee635fd3a482b2ad2bf5e269755c2eae5f25e)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (C) 2020-2024 Intel Corporation
4  */
5 
6 #include <linux/highmem.h>
7 #include <linux/moduleparam.h>
8 #include <linux/pci.h>
9 #include <linux/pm_runtime.h>
10 #include <linux/reboot.h>
11 
12 #include "ivpu_coredump.h"
13 #include "ivpu_drv.h"
14 #include "ivpu_fw.h"
15 #include "ivpu_fw_log.h"
16 #include "ivpu_hw.h"
17 #include "ivpu_ipc.h"
18 #include "ivpu_job.h"
19 #include "ivpu_jsm_msg.h"
20 #include "ivpu_mmu.h"
21 #include "ivpu_ms.h"
22 #include "ivpu_pm.h"
23 #include "ivpu_trace.h"
24 #include "vpu_boot_api.h"
25 
26 static bool ivpu_disable_recovery;
27 #if IS_ENABLED(CONFIG_DRM_ACCEL_IVPU_DEBUG)
28 module_param_named_unsafe(disable_recovery, ivpu_disable_recovery, bool, 0644);
29 MODULE_PARM_DESC(disable_recovery, "Disables recovery when NPU hang is detected");
30 #endif
31 
32 static unsigned long ivpu_tdr_timeout_ms;
33 module_param_named(tdr_timeout_ms, ivpu_tdr_timeout_ms, ulong, 0644);
34 MODULE_PARM_DESC(tdr_timeout_ms, "Timeout for device hang detection, in milliseconds, 0 - default");
35 
36 static unsigned long ivpu_inference_timeout_ms;
37 module_param_named(inference_timeout_ms, ivpu_inference_timeout_ms, ulong, 0644);
38 MODULE_PARM_DESC(inference_timeout_ms, "Inference maximum duration, in milliseconds, 0 - default");
39 
40 #define PM_RESCHEDULE_LIMIT     5
41 
static void ivpu_pm_prepare_cold_boot(struct ivpu_device *vdev)
{
	struct ivpu_fw_info *fw = vdev->fw;

	/*
	 * Drop all state tied to the previous FW instance and reload the FW
	 * image so the next power-up performs a full cold boot.
	 */
	ivpu_cmdq_reset_all_contexts(vdev);
	ivpu_ipc_reset(vdev);
	ivpu_fw_log_reset(vdev);
	ivpu_fw_load(vdev);
	fw->last_heartbeat = 0; /* restart heartbeat progress tracking from scratch */

	ivpu_dbg(vdev, FW_BOOT, "Cold boot entry point 0x%llx", vdev->fw->cold_boot_entry_point);
	fw->next_boot_mode = VPU_BOOT_TYPE_COLDBOOT;
}
55 
56 static void ivpu_pm_prepare_warm_boot(struct ivpu_device *vdev)
57 {
58 	struct ivpu_fw_info *fw = vdev->fw;
59 	struct vpu_boot_params *bp = ivpu_bo_vaddr(fw->mem_bp);
60 
61 	fw->warm_boot_entry_point = bp->save_restore_ret_address;
62 	if (!fw->warm_boot_entry_point) {
63 		ivpu_pm_prepare_cold_boot(vdev);
64 		return;
65 	}
66 
67 	ivpu_dbg(vdev, FW_BOOT, "Warm boot entry point 0x%llx", fw->warm_boot_entry_point);
68 	fw->next_boot_mode = VPU_BOOT_TYPE_WARMBOOT;
69 }
70 
static int ivpu_suspend(struct ivpu_device *vdev)
{
	int err;

	/* Quiesce ongoing activity before taking the device down */
	ivpu_prepare_for_reset(vdev);

	err = ivpu_shutdown(vdev);
	if (err)
		ivpu_err(vdev, "Failed to shutdown NPU: %d\n", err);

	return err;
}
83 
static int ivpu_resume(struct ivpu_device *vdev)
{
	int ret;

retry:
	/* Bring the PCI function back to D0 before touching device registers */
	pci_set_power_state(to_pci_dev(vdev->drm.dev), PCI_D0);
	pci_restore_state(to_pci_dev(vdev->drm.dev));

	ret = ivpu_hw_power_up(vdev);
	if (ret) {
		ivpu_err(vdev, "Failed to power up HW: %d\n", ret);
		goto err_power_down;
	}

	ret = ivpu_mmu_enable(vdev);
	if (ret) {
		ivpu_err(vdev, "Failed to resume MMU: %d\n", ret);
		goto err_power_down;
	}

	ret = ivpu_boot(vdev);
	if (ret)
		goto err_mmu_disable;

	return 0;

err_mmu_disable:
	ivpu_mmu_disable(vdev);
err_power_down:
	ivpu_hw_power_down(vdev);
	pci_set_power_state(to_pci_dev(vdev->drm.dev), PCI_D3hot);

	/*
	 * A failed warm boot may be caused by stale saved FW state - retry
	 * once as a cold boot (ivpu_pm_prepare_cold_boot() switches the boot
	 * mode, so a second failure cannot loop here again).
	 */
	if (ivpu_fw_is_warm_boot(vdev)) {
		ivpu_pm_prepare_cold_boot(vdev);
		goto retry;
	} else {
		ivpu_err(vdev, "Failed to resume the FW: %d\n", ret);
	}

	return ret;
}
125 
static void ivpu_pm_reset_begin(struct ivpu_device *vdev)
{
	/* Block runtime PM transitions for the duration of the reset */
	pm_runtime_disable(vdev->drm.dev);

	atomic_inc(&vdev->pm->reset_counter);
	atomic_set(&vdev->pm->reset_pending, 1);
	/* Held across the reset; released in ivpu_pm_reset_complete() */
	down_write(&vdev->pm->reset_lock);
}
134 
static void ivpu_pm_reset_complete(struct ivpu_device *vdev)
{
	int ret;

	/* Discard all in-flight state and bring the device back up via cold boot */
	ivpu_pm_prepare_cold_boot(vdev);
	ivpu_jobs_abort_all(vdev);
	ivpu_ms_cleanup_all(vdev);

	ret = ivpu_resume(vdev);
	if (ret) {
		ivpu_err(vdev, "Failed to resume NPU: %d\n", ret);
		/* Keep the runtime PM status in sync with the real device state */
		pm_runtime_set_suspended(vdev->drm.dev);
	} else {
		pm_runtime_set_active(vdev->drm.dev);
	}

	/* Counterpart of ivpu_pm_reset_begin() */
	up_write(&vdev->pm->reset_lock);
	atomic_set(&vdev->pm->reset_pending, 0);

	pm_runtime_mark_last_busy(vdev->drm.dev);
	pm_runtime_enable(vdev->drm.dev);
}
157 
static void ivpu_pm_recovery_work(struct work_struct *work)
{
	struct ivpu_pm_info *pm = container_of(work, struct ivpu_pm_info, recovery_work);
	struct ivpu_device *vdev = pm->vdev;
	char *evt[2] = {"IVPU_PM_EVENT=IVPU_RECOVER", NULL};

	ivpu_err(vdev, "Recovering the NPU (reset #%d)\n", atomic_read(&vdev->pm->reset_counter));

	ivpu_pm_reset_begin(vdev);

	/* Only collect debug state and power down if the device is actually up */
	if (!pm_runtime_status_suspended(vdev->drm.dev)) {
		ivpu_jsm_state_dump(vdev);
		ivpu_dev_coredump(vdev);
		ivpu_suspend(vdev);
	}

	ivpu_pm_reset_complete(vdev);

	/* Notify userspace that the device went through a recovery cycle */
	kobject_uevent_env(&vdev->drm.dev->kobj, KOBJ_CHANGE, evt);
}
178 
179 void ivpu_pm_trigger_recovery(struct ivpu_device *vdev, const char *reason)
180 {
181 	ivpu_err(vdev, "Recovery triggered by %s\n", reason);
182 
183 	if (ivpu_disable_recovery) {
184 		ivpu_err(vdev, "Recovery not available when disable_recovery param is set\n");
185 		return;
186 	}
187 
188 	/* Trigger recovery if it's not in progress */
189 	if (atomic_cmpxchg(&vdev->pm->reset_pending, 0, 1) == 0) {
190 		ivpu_hw_diagnose_failure(vdev);
191 		ivpu_hw_irq_disable(vdev); /* Disable IRQ early to protect from IRQ storm */
192 		queue_work(system_dfl_wq, &vdev->pm->recovery_work);
193 	}
194 }
195 
196 static void ivpu_job_timeout_work(struct work_struct *work)
197 {
198 	struct ivpu_pm_info *pm = container_of(work, struct ivpu_pm_info, job_timeout_work.work);
199 	struct ivpu_device *vdev = pm->vdev;
200 	unsigned long timeout_ms = ivpu_tdr_timeout_ms ? ivpu_tdr_timeout_ms : vdev->timeout.tdr;
201 	unsigned long inference_timeout_ms = ivpu_inference_timeout_ms ? ivpu_inference_timeout_ms :
202 					     vdev->timeout.inference;
203 	u64 inference_max_retries;
204 	u64 heartbeat;
205 
206 	if (ivpu_jsm_get_heartbeat(vdev, 0, &heartbeat) || heartbeat <= vdev->fw->last_heartbeat) {
207 		ivpu_err(vdev, "Job timeout detected, heartbeat not progressed\n");
208 		goto recovery;
209 	}
210 
211 	inference_max_retries = DIV_ROUND_UP(inference_timeout_ms, timeout_ms);
212 	if (atomic_fetch_inc(&vdev->job_timeout_counter) >= inference_max_retries) {
213 		ivpu_err(vdev, "Job timeout detected, heartbeat limit (%lld) exceeded\n",
214 			 inference_max_retries);
215 		goto recovery;
216 	}
217 
218 	vdev->fw->last_heartbeat = heartbeat;
219 	ivpu_start_job_timeout_detection(vdev);
220 	return;
221 
222 recovery:
223 	atomic_set(&vdev->job_timeout_counter, 0);
224 	ivpu_pm_trigger_recovery(vdev, "TDR");
225 }
226 
227 void ivpu_start_job_timeout_detection(struct ivpu_device *vdev)
228 {
229 	unsigned long timeout_ms = ivpu_tdr_timeout_ms ? ivpu_tdr_timeout_ms : vdev->timeout.tdr;
230 
231 	/* No-op if already queued */
232 	queue_delayed_work(system_percpu_wq, &vdev->pm->job_timeout_work,
233 			   msecs_to_jiffies(timeout_ms));
234 }
235 
void ivpu_stop_job_timeout_detection(struct ivpu_device *vdev)
{
	/* Wait for a possibly running TDR handler, then reset the retry count */
	cancel_delayed_work_sync(&vdev->pm->job_timeout_work);
	atomic_set(&vdev->job_timeout_counter, 0);
}
241 
242 int ivpu_pm_suspend_cb(struct device *dev)
243 {
244 	struct drm_device *drm = dev_get_drvdata(dev);
245 	struct ivpu_device *vdev = to_ivpu_device(drm);
246 	unsigned long timeout;
247 
248 	trace_pm("suspend");
249 	ivpu_dbg(vdev, PM, "Suspend..\n");
250 
251 	timeout = jiffies + msecs_to_jiffies(vdev->timeout.tdr);
252 	while (!ivpu_hw_is_idle(vdev)) {
253 		cond_resched();
254 		if (time_after_eq(jiffies, timeout)) {
255 			ivpu_err(vdev, "Failed to enter idle on system suspend\n");
256 			return -EBUSY;
257 		}
258 	}
259 
260 	ivpu_jsm_pwr_d0i3_enter(vdev);
261 
262 	ivpu_suspend(vdev);
263 	ivpu_pm_prepare_warm_boot(vdev);
264 
265 	ivpu_dbg(vdev, PM, "Suspend done.\n");
266 	trace_pm("suspend done");
267 
268 	return 0;
269 }
270 
271 int ivpu_pm_resume_cb(struct device *dev)
272 {
273 	struct drm_device *drm = dev_get_drvdata(dev);
274 	struct ivpu_device *vdev = to_ivpu_device(drm);
275 	int ret;
276 
277 	trace_pm("resume");
278 	ivpu_dbg(vdev, PM, "Resume..\n");
279 
280 	ret = ivpu_resume(vdev);
281 	if (ret)
282 		ivpu_err(vdev, "Failed to resume: %d\n", ret);
283 
284 	ivpu_dbg(vdev, PM, "Resume done.\n");
285 	trace_pm("resume done");
286 
287 	return ret;
288 }
289 
int ivpu_pm_runtime_suspend_cb(struct device *dev)
{
	struct drm_device *drm = dev_get_drvdata(dev);
	struct ivpu_device *vdev = to_ivpu_device(drm);
	int ret, ret_d0i3;
	bool is_idle;

	/* Autosuspend must never fire with jobs in flight or recovery pending */
	drm_WARN_ON(&vdev->drm, !xa_empty(&vdev->submitted_jobs_xa));
	drm_WARN_ON(&vdev->drm, work_pending(&vdev->pm->recovery_work));

	trace_pm("runtime suspend");
	ivpu_dbg(vdev, PM, "Runtime suspend..\n");

	ivpu_mmu_disable(vdev);

	/* NOTE(review): an active DCT cycle is accepted here - presumably DCT
	 * duty cycling can legitimately keep the HW busy; confirm.
	 */
	is_idle = ivpu_hw_is_idle(vdev) || vdev->pm->dct_active_percent;
	if (!is_idle)
		ivpu_err(vdev, "NPU is not idle before autosuspend\n");

	ret_d0i3 = ivpu_jsm_pwr_d0i3_enter(vdev);
	if (ret_d0i3)
		ivpu_err(vdev, "Failed to prepare for d0i3: %d\n", ret_d0i3);

	ret = ivpu_suspend(vdev);
	if (ret)
		ivpu_err(vdev, "Failed to suspend NPU: %d\n", ret);

	/*
	 * If anything went wrong the saved FW state cannot be trusted:
	 * capture a coredump and force a full cold boot on next resume.
	 */
	if (!is_idle || ret_d0i3) {
		ivpu_err(vdev, "Forcing cold boot due to previous errors\n");
		atomic_inc(&vdev->pm->reset_counter);
		ivpu_dev_coredump(vdev);
		ivpu_pm_prepare_cold_boot(vdev);
	} else {
		ivpu_pm_prepare_warm_boot(vdev);
	}

	ivpu_dbg(vdev, PM, "Runtime suspend done.\n");
	trace_pm("runtime suspend done");

	/* Always report success so the PM core marks the device suspended */
	return 0;
}
331 
332 int ivpu_pm_runtime_resume_cb(struct device *dev)
333 {
334 	struct drm_device *drm = dev_get_drvdata(dev);
335 	struct ivpu_device *vdev = to_ivpu_device(drm);
336 	int ret;
337 
338 	trace_pm("runtime resume");
339 	ivpu_dbg(vdev, PM, "Runtime resume..\n");
340 
341 	ret = ivpu_resume(vdev);
342 	if (ret)
343 		ivpu_err(vdev, "Failed to set RESUME state: %d\n", ret);
344 
345 	ivpu_dbg(vdev, PM, "Runtime resume done.\n");
346 	trace_pm("runtime resume done");
347 
348 	return ret;
349 }
350 
351 int ivpu_rpm_get(struct ivpu_device *vdev)
352 {
353 	int ret;
354 
355 	ret = pm_runtime_resume_and_get(vdev->drm.dev);
356 	if (ret < 0) {
357 		ivpu_err(vdev, "Failed to resume NPU: %d\n", ret);
358 		pm_runtime_set_suspended(vdev->drm.dev);
359 	}
360 
361 	return ret;
362 }
363 
364 void ivpu_rpm_put(struct ivpu_device *vdev)
365 {
366 	pm_runtime_put_autosuspend(vdev->drm.dev);
367 }
368 
void ivpu_pm_reset_prepare_cb(struct pci_dev *pdev)
{
	struct ivpu_device *vdev = pci_get_drvdata(pdev);

	ivpu_dbg(vdev, PM, "Pre-reset..\n");

	ivpu_pm_reset_begin(vdev);

	/* Only quiesce the HW when it is powered; skip if runtime suspended */
	if (!pm_runtime_status_suspended(vdev->drm.dev)) {
		ivpu_prepare_for_reset(vdev);
		ivpu_hw_reset(vdev);
	}

	ivpu_dbg(vdev, PM, "Pre-reset done.\n");
}
384 
void ivpu_pm_reset_done_cb(struct pci_dev *pdev)
{
	struct ivpu_device *vdev = pci_get_drvdata(pdev);

	ivpu_dbg(vdev, PM, "Post-reset..\n");

	/* Reboot the device; pairs with ivpu_pm_reset_prepare_cb() */
	ivpu_pm_reset_complete(vdev);

	ivpu_dbg(vdev, PM, "Post-reset done.\n");
}
395 
396 void ivpu_pm_init(struct ivpu_device *vdev)
397 {
398 	struct device *dev = vdev->drm.dev;
399 	struct ivpu_pm_info *pm = vdev->pm;
400 	int delay;
401 
402 	pm->vdev = vdev;
403 
404 	init_rwsem(&pm->reset_lock);
405 	atomic_set(&pm->reset_pending, 0);
406 	atomic_set(&pm->reset_counter, 0);
407 
408 	INIT_WORK(&pm->recovery_work, ivpu_pm_recovery_work);
409 	INIT_DELAYED_WORK(&pm->job_timeout_work, ivpu_job_timeout_work);
410 
411 	if (ivpu_disable_recovery)
412 		delay = -1;
413 	else
414 		delay = vdev->timeout.autosuspend;
415 
416 	pm_runtime_use_autosuspend(dev);
417 	pm_runtime_set_autosuspend_delay(dev, delay);
418 	pm_runtime_set_active(dev);
419 
420 	ivpu_dbg(vdev, PM, "Autosuspend delay = %d\n", delay);
421 }
422 
void ivpu_pm_disable_recovery(struct ivpu_device *vdev)
{
	/* TDR must already be stopped; block any further recovery work */
	drm_WARN_ON(&vdev->drm, delayed_work_pending(&vdev->pm->job_timeout_work));
	disable_work_sync(&vdev->pm->recovery_work);
}
428 
429 void ivpu_pm_enable(struct ivpu_device *vdev)
430 {
431 	struct device *dev = vdev->drm.dev;
432 
433 	pm_runtime_allow(dev);
434 	pm_runtime_put_autosuspend(dev);
435 }
436 
437 void ivpu_pm_disable(struct ivpu_device *vdev)
438 {
439 	pm_runtime_get_noresume(vdev->drm.dev);
440 	pm_runtime_forbid(vdev->drm.dev);
441 }
442 
443 int ivpu_pm_dct_init(struct ivpu_device *vdev)
444 {
445 	if (vdev->pm->dct_active_percent)
446 		return ivpu_pm_dct_enable(vdev, vdev->pm->dct_active_percent);
447 
448 	return 0;
449 }
450 
451 int ivpu_pm_dct_enable(struct ivpu_device *vdev, u8 active_percent)
452 {
453 	u32 active_us, inactive_us;
454 	int ret;
455 
456 	if (active_percent == 0 || active_percent > 100)
457 		return -EINVAL;
458 
459 	active_us = (DCT_PERIOD_US * active_percent) / 100;
460 	inactive_us = DCT_PERIOD_US - active_us;
461 
462 	vdev->pm->dct_active_percent = active_percent;
463 
464 	ivpu_dbg(vdev, PM, "DCT requested %u%% (D0: %uus, D0i2: %uus)\n",
465 		 active_percent, active_us, inactive_us);
466 
467 	ret = ivpu_jsm_dct_enable(vdev, active_us, inactive_us);
468 	if (ret) {
469 		ivpu_err_ratelimited(vdev, "Failed to enable DCT: %d\n", ret);
470 		return ret;
471 	}
472 
473 	return 0;
474 }
475 
476 int ivpu_pm_dct_disable(struct ivpu_device *vdev)
477 {
478 	int ret;
479 
480 	vdev->pm->dct_active_percent = 0;
481 
482 	ivpu_dbg(vdev, PM, "DCT requested to be disabled\n");
483 
484 	ret = ivpu_jsm_dct_disable(vdev);
485 	if (ret) {
486 		ivpu_err_ratelimited(vdev, "Failed to disable DCT: %d\n", ret);
487 		return ret;
488 	}
489 
490 	return 0;
491 }
492 
493 void ivpu_pm_irq_dct_work_fn(struct work_struct *work)
494 {
495 	struct ivpu_device *vdev = container_of(work, struct ivpu_device, irq_dct_work);
496 	bool enable;
497 	int ret;
498 
499 	if (ivpu_hw_btrs_dct_get_request(vdev, &enable))
500 		return;
501 
502 	if (enable)
503 		ret = ivpu_pm_dct_enable(vdev, DCT_DEFAULT_ACTIVE_PERCENT);
504 	else
505 		ret = ivpu_pm_dct_disable(vdev);
506 
507 	if (!ret) {
508 		/* Convert percent to U1.7 format */
509 		u8 val = DIV_ROUND_CLOSEST(vdev->pm->dct_active_percent * 128, 100);
510 
511 		ivpu_hw_btrs_dct_set_status(vdev, enable, val);
512 	}
513 
514 }
515