// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2020-2024 Intel Corporation
 */

#include <linux/highmem.h>
#include <linux/moduleparam.h>
#include <linux/pci.h>
#include <linux/pm_runtime.h>
#include <linux/reboot.h>

#include "ivpu_coredump.h"
#include "ivpu_drv.h"
#include "ivpu_fw.h"
#include "ivpu_fw_log.h"
#include "ivpu_hw.h"
#include "ivpu_ipc.h"
#include "ivpu_job.h"
#include "ivpu_jsm_msg.h"
#include "ivpu_mmu.h"
#include "ivpu_ms.h"
#include "ivpu_pm.h"
#include "ivpu_trace.h"
#include "vpu_boot_api.h"

static bool ivpu_disable_recovery;
#if IS_ENABLED(CONFIG_DRM_ACCEL_IVPU_DEBUG)
module_param_named_unsafe(disable_recovery, ivpu_disable_recovery, bool, 0644);
MODULE_PARM_DESC(disable_recovery, "Disables recovery when NPU hang is detected");
#endif

static unsigned long ivpu_tdr_timeout_ms;
module_param_named(tdr_timeout_ms, ivpu_tdr_timeout_ms, ulong, 0644);
MODULE_PARM_DESC(tdr_timeout_ms, "Timeout for device hang detection, in milliseconds, 0 - default");

static unsigned long ivpu_inference_timeout_ms;
module_param_named(inference_timeout_ms, ivpu_inference_timeout_ms, ulong, 0644);
MODULE_PARM_DESC(inference_timeout_ms, "Inference maximum duration, in milliseconds, 0 - default");

#define PM_RESCHEDULE_LIMIT 5

static void ivpu_pm_prepare_cold_boot(struct ivpu_device *vdev)
{
	struct ivpu_fw_info *fw = vdev->fw;

	ivpu_cmdq_reset_all_contexts(vdev);
	ivpu_ipc_reset(vdev);
	ivpu_fw_log_reset(vdev);
	ivpu_fw_load(vdev);
	fw->entry_point = fw->cold_boot_entry_point;
	fw->last_heartbeat = 0;
}

static void ivpu_pm_prepare_warm_boot(struct ivpu_device *vdev)
{
	struct ivpu_fw_info *fw = vdev->fw;
	struct vpu_boot_params *bp = ivpu_bo_vaddr(fw->mem);

	if (!bp->save_restore_ret_address) {
		ivpu_pm_prepare_cold_boot(vdev);
		return;
	}

	ivpu_dbg(vdev, FW_BOOT, "Save/restore entry point %llx", bp->save_restore_ret_address);
	fw->entry_point = bp->save_restore_ret_address;
}

static int ivpu_suspend(struct ivpu_device *vdev)
{
	int ret;

	ivpu_prepare_for_reset(vdev);

	ret = ivpu_shutdown(vdev);
	if (ret)
		ivpu_err(vdev, "Failed to shutdown NPU: %d\n", ret);

	return ret;
}

static int ivpu_resume(struct ivpu_device *vdev)
{
	int ret;

retry:
	pci_set_power_state(to_pci_dev(vdev->drm.dev), PCI_D0);
	pci_restore_state(to_pci_dev(vdev->drm.dev));

	ret = ivpu_hw_power_up(vdev);
	if (ret) {
		ivpu_err(vdev, "Failed to power up HW: %d\n", ret);
		goto err_power_down;
	}

	ret = ivpu_mmu_enable(vdev);
	if (ret) {
		ivpu_err(vdev, "Failed to resume MMU: %d\n", ret);
		goto err_power_down;
	}

	ret = ivpu_boot(vdev);
	if (ret)
		goto err_mmu_disable;

	return 0;

err_mmu_disable:
	ivpu_mmu_disable(vdev);
err_power_down:
	ivpu_hw_power_down(vdev);
	pci_set_power_state(to_pci_dev(vdev->drm.dev), PCI_D3hot);

	if (!ivpu_fw_is_cold_boot(vdev)) {
		ivpu_pm_prepare_cold_boot(vdev);
		goto retry;
	} else {
		ivpu_err(vdev, "Failed to resume the FW: %d\n", ret);
	}

	return ret;
}

static void ivpu_pm_reset_begin(struct ivpu_device *vdev)
{
	pm_runtime_disable(vdev->drm.dev);

	atomic_inc(&vdev->pm->reset_counter);
	atomic_set(&vdev->pm->reset_pending, 1);
	down_write(&vdev->pm->reset_lock);
}

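/*
 * Counterpart of ivpu_pm_reset_begin(): prepare a cold boot, abort stale jobs,
 * bring the NPU back up, then release the reset lock and re-enable runtime PM.
 */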
static void ivpu_pm_reset_complete(struct ivpu_device *vdev)
{
	int ret;

	ivpu_pm_prepare_cold_boot(vdev);
	ivpu_jobs_abort_all(vdev);
	ivpu_ms_cleanup_all(vdev);

	ret = ivpu_resume(vdev);
	if (ret) {
		ivpu_err(vdev, "Failed to resume NPU: %d\n", ret);
		pm_runtime_set_suspended(vdev->drm.dev);
	} else {
		pm_runtime_set_active(vdev->drm.dev);
	}

	up_write(&vdev->pm->reset_lock);
	atomic_set(&vdev->pm->reset_pending, 0);

	pm_runtime_mark_last_busy(vdev->drm.dev);
	pm_runtime_enable(vdev->drm.dev);
}

static void ivpu_pm_recovery_work(struct work_struct *work)
{
	struct ivpu_pm_info *pm = container_of(work, struct ivpu_pm_info, recovery_work);
	struct ivpu_device *vdev = pm->vdev;
	char *evt[2] = {"IVPU_PM_EVENT=IVPU_RECOVER", NULL};

	ivpu_err(vdev, "Recovering the NPU (reset #%d)\n", atomic_read(&vdev->pm->reset_counter));

	ivpu_pm_reset_begin(vdev);

	if (!pm_runtime_status_suspended(vdev->drm.dev)) {
		ivpu_jsm_state_dump(vdev);
		ivpu_dev_coredump(vdev);
		ivpu_suspend(vdev);
	}

	ivpu_pm_reset_complete(vdev);

	kobject_uevent_env(&vdev->drm.dev->kobj, KOBJ_CHANGE, evt);
}

void ivpu_pm_trigger_recovery(struct ivpu_device *vdev, const char *reason)
{
	ivpu_err(vdev, "Recovery triggered by %s\n", reason);

	if (ivpu_disable_recovery) {
		ivpu_err(vdev, "Recovery not available when disable_recovery param is set\n");
		return;
	}

	/* Trigger recovery if it's not in progress */
	if (atomic_cmpxchg(&vdev->pm->reset_pending, 0, 1) == 0) {
		ivpu_hw_diagnose_failure(vdev);
		ivpu_hw_irq_disable(vdev); /* Disable IRQ early to protect from IRQ storm */
		queue_work(system_unbound_wq, &vdev->pm->recovery_work);
	}
}

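/*
 * TDR handler: runs when a job has not completed within the TDR timeout.
 * Recovery is triggered when the FW heartbeat stops progressing or the
 * accumulated wait exceeds the inference timeout; otherwise detection is
 * simply re-armed.
 */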
static void ivpu_job_timeout_work(struct work_struct *work)
{
	struct ivpu_pm_info *pm = container_of(work, struct ivpu_pm_info, job_timeout_work.work);
	struct ivpu_device *vdev = pm->vdev;
	unsigned long timeout_ms = ivpu_tdr_timeout_ms ? ivpu_tdr_timeout_ms : vdev->timeout.tdr;
	unsigned long inference_timeout_ms = ivpu_inference_timeout_ms ? ivpu_inference_timeout_ms :
					     vdev->timeout.inference;
	u64 inference_max_retries;
	u64 heartbeat;

	if (ivpu_jsm_get_heartbeat(vdev, 0, &heartbeat) || heartbeat <= vdev->fw->last_heartbeat) {
		ivpu_err(vdev, "Job timeout detected, heartbeat not progressed\n");
		goto recovery;
	}

	inference_max_retries = DIV_ROUND_UP(inference_timeout_ms, timeout_ms);
	if (atomic_fetch_inc(&vdev->job_timeout_counter) >= inference_max_retries) {
		ivpu_err(vdev, "Job timeout detected, heartbeat limit (%lld) exceeded\n",
			 inference_max_retries);
		goto recovery;
	}

	vdev->fw->last_heartbeat = heartbeat;
	ivpu_start_job_timeout_detection(vdev);
	return;

recovery:
	atomic_set(&vdev->job_timeout_counter, 0);
	ivpu_pm_trigger_recovery(vdev, "TDR");
}

void ivpu_start_job_timeout_detection(struct ivpu_device *vdev)
{
	unsigned long timeout_ms = ivpu_tdr_timeout_ms ? ivpu_tdr_timeout_ms : vdev->timeout.tdr;

	/* No-op if already queued */
	queue_delayed_work(system_wq, &vdev->pm->job_timeout_work, msecs_to_jiffies(timeout_ms));
}

void ivpu_stop_job_timeout_detection(struct ivpu_device *vdev)
{
	cancel_delayed_work_sync(&vdev->pm->job_timeout_work);
	atomic_set(&vdev->job_timeout_counter, 0);
}

int ivpu_pm_suspend_cb(struct device *dev)
{
	struct drm_device *drm = dev_get_drvdata(dev);
	struct ivpu_device *vdev = to_ivpu_device(drm);
	unsigned long timeout;

	trace_pm("suspend");
	ivpu_dbg(vdev, PM, "Suspend..\n");

	timeout = jiffies + msecs_to_jiffies(vdev->timeout.tdr);
	while (!ivpu_hw_is_idle(vdev)) {
		cond_resched();
		if (time_after_eq(jiffies, timeout)) {
			ivpu_err(vdev, "Failed to enter idle on system suspend\n");
			return -EBUSY;
		}
	}

	ivpu_jsm_pwr_d0i3_enter(vdev);

	ivpu_suspend(vdev);
	ivpu_pm_prepare_warm_boot(vdev);

	ivpu_dbg(vdev, PM, "Suspend done.\n");
	trace_pm("suspend done");

	return 0;
}

int ivpu_pm_resume_cb(struct device *dev)
{
	struct drm_device *drm = dev_get_drvdata(dev);
	struct ivpu_device *vdev = to_ivpu_device(drm);
	int ret;

	trace_pm("resume");
	ivpu_dbg(vdev, PM, "Resume..\n");

	ret = ivpu_resume(vdev);
	if (ret)
		ivpu_err(vdev, "Failed to resume: %d\n", ret);

	ivpu_dbg(vdev, PM, "Resume done.\n");
	trace_pm("resume done");

	return ret;
}

int ivpu_pm_runtime_suspend_cb(struct device *dev)
{
	struct drm_device *drm = dev_get_drvdata(dev);
	struct ivpu_device *vdev = to_ivpu_device(drm);
	int ret, ret_d0i3;
	bool is_idle;

	drm_WARN_ON(&vdev->drm, !xa_empty(&vdev->submitted_jobs_xa));
	drm_WARN_ON(&vdev->drm, work_pending(&vdev->pm->recovery_work));

	trace_pm("runtime suspend");
	ivpu_dbg(vdev, PM, "Runtime suspend..\n");

	ivpu_mmu_disable(vdev);

	is_idle = ivpu_hw_is_idle(vdev) || vdev->pm->dct_active_percent;
	if (!is_idle)
		ivpu_err(vdev, "NPU is not idle before autosuspend\n");

	ret_d0i3 = ivpu_jsm_pwr_d0i3_enter(vdev);
	if (ret_d0i3)
		ivpu_err(vdev, "Failed to prepare for d0i3: %d\n", ret_d0i3);

	ret = ivpu_suspend(vdev);
	if (ret)
		ivpu_err(vdev, "Failed to suspend NPU: %d\n", ret);

	if (!is_idle || ret_d0i3) {
		ivpu_err(vdev, "Forcing cold boot due to previous errors\n");
		atomic_inc(&vdev->pm->reset_counter);
		ivpu_dev_coredump(vdev);
		ivpu_pm_prepare_cold_boot(vdev);
	} else {
		ivpu_pm_prepare_warm_boot(vdev);
	}

	ivpu_dbg(vdev, PM, "Runtime suspend done.\n");
	trace_pm("runtime suspend done");

	return 0;
}

int ivpu_pm_runtime_resume_cb(struct device *dev)
{
	struct drm_device *drm = dev_get_drvdata(dev);
	struct ivpu_device *vdev = to_ivpu_device(drm);
	int ret;

	trace_pm("runtime resume");
	ivpu_dbg(vdev, PM, "Runtime resume..\n");

	ret = ivpu_resume(vdev);
	if (ret)
		ivpu_err(vdev, "Failed to set RESUME state: %d\n", ret);

	ivpu_dbg(vdev, PM, "Runtime resume done.\n");
	trace_pm("runtime resume done");

	return ret;
}

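/* Take a runtime PM reference, resuming the NPU if needed; paired with ivpu_rpm_put() */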
int ivpu_rpm_get(struct ivpu_device *vdev)
{
	int ret;

	ret = pm_runtime_resume_and_get(vdev->drm.dev);
	if (ret < 0) {
		ivpu_err(vdev, "Failed to resume NPU: %d\n", ret);
		pm_runtime_set_suspended(vdev->drm.dev);
	}

	return ret;
}

void ivpu_rpm_put(struct ivpu_device *vdev)
{
	pm_runtime_mark_last_busy(vdev->drm.dev);
	pm_runtime_put_autosuspend(vdev->drm.dev);
}

void ivpu_pm_reset_prepare_cb(struct pci_dev *pdev)
{
	struct ivpu_device *vdev = pci_get_drvdata(pdev);

	ivpu_dbg(vdev, PM, "Pre-reset..\n");

	ivpu_pm_reset_begin(vdev);

	if (!pm_runtime_status_suspended(vdev->drm.dev)) {
		ivpu_prepare_for_reset(vdev);
		ivpu_hw_reset(vdev);
	}

	ivpu_dbg(vdev, PM, "Pre-reset done.\n");
}

void ivpu_pm_reset_done_cb(struct pci_dev *pdev)
{
	struct ivpu_device *vdev = pci_get_drvdata(pdev);

	ivpu_dbg(vdev, PM, "Post-reset..\n");

	ivpu_pm_reset_complete(vdev);

	ivpu_dbg(vdev, PM, "Post-reset done.\n");
}

void ivpu_pm_init(struct ivpu_device *vdev)
{
	struct device *dev = vdev->drm.dev;
	struct ivpu_pm_info *pm = vdev->pm;
	int delay;

	pm->vdev = vdev;

	init_rwsem(&pm->reset_lock);
	atomic_set(&pm->reset_pending, 0);
	atomic_set(&pm->reset_counter, 0);

	INIT_WORK(&pm->recovery_work, ivpu_pm_recovery_work);
	INIT_DELAYED_WORK(&pm->job_timeout_work, ivpu_job_timeout_work);

	if (ivpu_disable_recovery)
		delay = -1;
	else
		delay = vdev->timeout.autosuspend;

	pm_runtime_use_autosuspend(dev);
	pm_runtime_set_autosuspend_delay(dev, delay);
	pm_runtime_set_active(dev);

	ivpu_dbg(vdev, PM, "Autosuspend delay = %d\n", delay);
}

void ivpu_pm_cancel_recovery(struct ivpu_device *vdev)
{
	drm_WARN_ON(&vdev->drm, delayed_work_pending(&vdev->pm->job_timeout_work));
	cancel_work_sync(&vdev->pm->recovery_work);
}

void ivpu_pm_enable(struct ivpu_device *vdev)
{
	struct device *dev = vdev->drm.dev;

	pm_runtime_allow(dev);
	pm_runtime_mark_last_busy(dev);
	pm_runtime_put_autosuspend(dev);
}

void ivpu_pm_disable(struct ivpu_device *vdev)
{
	pm_runtime_get_noresume(vdev->drm.dev);
	pm_runtime_forbid(vdev->drm.dev);
}

int ivpu_pm_dct_init(struct ivpu_device *vdev)
{
	if (vdev->pm->dct_active_percent)
		return ivpu_pm_dct_enable(vdev, vdev->pm->dct_active_percent);

	return 0;
}

int ivpu_pm_dct_enable(struct ivpu_device *vdev, u8 active_percent)
{
	u32 active_us, inactive_us;
	int ret;

	if (active_percent == 0 || active_percent > 100)
		return -EINVAL;

	active_us = (DCT_PERIOD_US * active_percent) / 100;
	inactive_us = DCT_PERIOD_US - active_us;

	vdev->pm->dct_active_percent = active_percent;

	ivpu_dbg(vdev, PM, "DCT requested %u%% (D0: %uus, D0i2: %uus)\n",
		 active_percent, active_us, inactive_us);

	ret = ivpu_jsm_dct_enable(vdev, active_us, inactive_us);
	if (ret) {
		ivpu_err_ratelimited(vdev, "Failed to enable DCT: %d\n", ret);
		return ret;
	}

	return 0;
}

int ivpu_pm_dct_disable(struct ivpu_device *vdev)
{
	int ret;

	vdev->pm->dct_active_percent = 0;

	ivpu_dbg(vdev, PM, "DCT requested to be disabled\n");

	ret = ivpu_jsm_dct_disable(vdev);
	if (ret) {
		ivpu_err_ratelimited(vdev, "Failed to disable DCT: %d\n", ret);
		return ret;
	}

	return 0;
}

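/*
 * Handle a duty cycle throttling (DCT) request signalled by the HW: enable DCT
 * with the default active percentage or disable it, then report the resulting
 * state back to the HW.
 */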
void ivpu_pm_irq_dct_work_fn(struct work_struct *work)
{
	struct ivpu_device *vdev = container_of(work, struct ivpu_device, irq_dct_work);
	bool enable;
	int ret;

	if (ivpu_hw_btrs_dct_get_request(vdev, &enable))
		return;

	if (enable)
		ret = ivpu_pm_dct_enable(vdev, DCT_DEFAULT_ACTIVE_PERCENT);
	else
		ret = ivpu_pm_dct_disable(vdev);

	if (!ret)
		ivpu_hw_btrs_dct_set_status(vdev, enable, vdev->pm->dct_active_percent);
}