// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2020-2024 Intel Corporation
 */

#include <linux/highmem.h>
#include <linux/moduleparam.h>
#include <linux/pci.h>
#include <linux/pm_runtime.h>
#include <linux/reboot.h>

#include "ivpu_coredump.h"
#include "ivpu_drv.h"
#include "ivpu_fw.h"
#include "ivpu_fw_log.h"
#include "ivpu_hw.h"
#include "ivpu_ipc.h"
#include "ivpu_job.h"
#include "ivpu_jsm_msg.h"
#include "ivpu_mmu.h"
#include "ivpu_ms.h"
#include "ivpu_pm.h"
#include "ivpu_trace.h"
#include "vpu_boot_api.h"

static bool ivpu_disable_recovery;
#if IS_ENABLED(CONFIG_DRM_ACCEL_IVPU_DEBUG)
module_param_named_unsafe(disable_recovery, ivpu_disable_recovery, bool, 0644);
MODULE_PARM_DESC(disable_recovery, "Disables recovery when NPU hang is detected");
#endif

static unsigned long ivpu_tdr_timeout_ms;
module_param_named(tdr_timeout_ms, ivpu_tdr_timeout_ms, ulong, 0644);
MODULE_PARM_DESC(tdr_timeout_ms, "Timeout for device hang detection, in milliseconds, 0 - default");

#define PM_RESCHEDULE_LIMIT	5
#define PM_TDR_HEARTBEAT_LIMIT	30

static void ivpu_pm_prepare_cold_boot(struct ivpu_device *vdev)
{
	struct ivpu_fw_info *fw = vdev->fw;

	ivpu_cmdq_reset_all_contexts(vdev);
	ivpu_ipc_reset(vdev);
	ivpu_fw_log_reset(vdev);
	ivpu_fw_load(vdev);
	fw->entry_point = fw->cold_boot_entry_point;
	fw->last_heartbeat = 0;
}

static void ivpu_pm_prepare_warm_boot(struct ivpu_device *vdev)
{
	struct ivpu_fw_info *fw = vdev->fw;
	struct vpu_boot_params *bp = ivpu_bo_vaddr(fw->mem);

	if (!bp->save_restore_ret_address) {
		ivpu_pm_prepare_cold_boot(vdev);
		return;
	}

	ivpu_dbg(vdev, FW_BOOT, "Save/restore entry point %llx", bp->save_restore_ret_address);
	fw->entry_point = bp->save_restore_ret_address;
}

static int ivpu_suspend(struct ivpu_device *vdev)
{
	int ret;

	ivpu_prepare_for_reset(vdev);

	ret = ivpu_shutdown(vdev);
	if (ret)
		ivpu_err(vdev, "Failed to shutdown NPU: %d\n", ret);

	return ret;
}

static int ivpu_resume(struct ivpu_device *vdev)
{
	int ret;

retry:
	pci_set_power_state(to_pci_dev(vdev->drm.dev), PCI_D0);
	pci_restore_state(to_pci_dev(vdev->drm.dev));

	ret = ivpu_hw_power_up(vdev);
	if (ret) {
		ivpu_err(vdev, "Failed to power up HW: %d\n", ret);
		goto err_power_down;
	}

	ret = ivpu_mmu_enable(vdev);
	if (ret) {
		ivpu_err(vdev, "Failed to resume MMU: %d\n", ret);
		goto err_power_down;
	}

	ret = ivpu_boot(vdev);
	if (ret)
		goto err_mmu_disable;

	return 0;

err_mmu_disable:
	ivpu_mmu_disable(vdev);
err_power_down:
	ivpu_hw_power_down(vdev);
	pci_set_power_state(to_pci_dev(vdev->drm.dev), PCI_D3hot);

	if (!ivpu_fw_is_cold_boot(vdev)) {
		ivpu_pm_prepare_cold_boot(vdev);
		goto retry;
	} else {
		ivpu_err(vdev, "Failed to resume the FW: %d\n", ret);
	}

	return ret;
}

static void ivpu_pm_reset_begin(struct ivpu_device *vdev)
{
	pm_runtime_disable(vdev->drm.dev);

	atomic_inc(&vdev->pm->reset_counter);
	atomic_set(&vdev->pm->reset_pending, 1);
	down_write(&vdev->pm->reset_lock);
}

static void ivpu_pm_reset_complete(struct ivpu_device *vdev)
{
	int ret;

	ivpu_pm_prepare_cold_boot(vdev);
	ivpu_jobs_abort_all(vdev);
	ivpu_ms_cleanup_all(vdev);

	ret = ivpu_resume(vdev);
	if (ret) {
		ivpu_err(vdev, "Failed to resume NPU: %d\n", ret);
		pm_runtime_set_suspended(vdev->drm.dev);
	} else {
		pm_runtime_set_active(vdev->drm.dev);
	}

	up_write(&vdev->pm->reset_lock);
	atomic_set(&vdev->pm->reset_pending, 0);

	pm_runtime_mark_last_busy(vdev->drm.dev);
	pm_runtime_enable(vdev->drm.dev);
}
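/*
 * Recovery worker, scheduled from ivpu_pm_trigger_recovery(): blocks runtime
 * PM and serializes against other resets via ivpu_pm_reset_begin(), captures
 * a JSM state dump and a device coredump while the NPU is still powered, then
 * suspends it. ivpu_pm_reset_complete() cold boots the firmware, aborts all
 * pending jobs and notifies userspace through a uevent.
 */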
%d\n", ret); 140 pm_runtime_set_suspended(vdev->drm.dev); 141 } else { 142 pm_runtime_set_active(vdev->drm.dev); 143 } 144 145 up_write(&vdev->pm->reset_lock); 146 atomic_set(&vdev->pm->reset_pending, 0); 147 148 pm_runtime_mark_last_busy(vdev->drm.dev); 149 pm_runtime_enable(vdev->drm.dev); 150 } 151 152 static void ivpu_pm_recovery_work(struct work_struct *work) 153 { 154 struct ivpu_pm_info *pm = container_of(work, struct ivpu_pm_info, recovery_work); 155 struct ivpu_device *vdev = pm->vdev; 156 char *evt[2] = {"IVPU_PM_EVENT=IVPU_RECOVER", NULL}; 157 158 ivpu_err(vdev, "Recovering the NPU (reset #%d)\n", atomic_read(&vdev->pm->reset_counter)); 159 160 ivpu_pm_reset_begin(vdev); 161 162 if (!pm_runtime_status_suspended(vdev->drm.dev)) { 163 ivpu_jsm_state_dump(vdev); 164 ivpu_dev_coredump(vdev); 165 ivpu_suspend(vdev); 166 } 167 168 ivpu_pm_reset_complete(vdev); 169 170 kobject_uevent_env(&vdev->drm.dev->kobj, KOBJ_CHANGE, evt); 171 } 172 173 void ivpu_pm_trigger_recovery(struct ivpu_device *vdev, const char *reason) 174 { 175 ivpu_err(vdev, "Recovery triggered by %s\n", reason); 176 177 if (ivpu_disable_recovery) { 178 ivpu_err(vdev, "Recovery not available when disable_recovery param is set\n"); 179 return; 180 } 181 182 /* Trigger recovery if it's not in progress */ 183 if (atomic_cmpxchg(&vdev->pm->reset_pending, 0, 1) == 0) { 184 ivpu_hw_diagnose_failure(vdev); 185 ivpu_hw_irq_disable(vdev); /* Disable IRQ early to protect from IRQ storm */ 186 queue_work(system_unbound_wq, &vdev->pm->recovery_work); 187 } 188 } 189 190 static void ivpu_job_timeout_work(struct work_struct *work) 191 { 192 struct ivpu_pm_info *pm = container_of(work, struct ivpu_pm_info, job_timeout_work.work); 193 struct ivpu_device *vdev = pm->vdev; 194 u64 heartbeat; 195 196 if (ivpu_jsm_get_heartbeat(vdev, 0, &heartbeat) || heartbeat <= vdev->fw->last_heartbeat) { 197 ivpu_err(vdev, "Job timeout detected, heartbeat not progressed\n"); 198 goto recovery; 199 } 200 201 if (atomic_fetch_inc(&vdev->job_timeout_counter) > PM_TDR_HEARTBEAT_LIMIT) { 202 ivpu_err(vdev, "Job timeout detected, heartbeat limit exceeded\n"); 203 goto recovery; 204 } 205 206 vdev->fw->last_heartbeat = heartbeat; 207 ivpu_start_job_timeout_detection(vdev); 208 return; 209 210 recovery: 211 atomic_set(&vdev->job_timeout_counter, 0); 212 ivpu_pm_trigger_recovery(vdev, "TDR"); 213 } 214 215 void ivpu_start_job_timeout_detection(struct ivpu_device *vdev) 216 { 217 unsigned long timeout_ms = ivpu_tdr_timeout_ms ? 
int ivpu_pm_suspend_cb(struct device *dev)
{
	struct drm_device *drm = dev_get_drvdata(dev);
	struct ivpu_device *vdev = to_ivpu_device(drm);
	unsigned long timeout;

	trace_pm("suspend");
	ivpu_dbg(vdev, PM, "Suspend..\n");

	timeout = jiffies + msecs_to_jiffies(vdev->timeout.tdr);
	while (!ivpu_hw_is_idle(vdev)) {
		cond_resched();
		if (time_after_eq(jiffies, timeout)) {
			ivpu_err(vdev, "Failed to enter idle on system suspend\n");
			return -EBUSY;
		}
	}

	ivpu_jsm_pwr_d0i3_enter(vdev);

	ivpu_suspend(vdev);
	ivpu_pm_prepare_warm_boot(vdev);

	ivpu_dbg(vdev, PM, "Suspend done.\n");
	trace_pm("suspend done");

	return 0;
}

int ivpu_pm_resume_cb(struct device *dev)
{
	struct drm_device *drm = dev_get_drvdata(dev);
	struct ivpu_device *vdev = to_ivpu_device(drm);
	int ret;

	trace_pm("resume");
	ivpu_dbg(vdev, PM, "Resume..\n");

	ret = ivpu_resume(vdev);
	if (ret)
		ivpu_err(vdev, "Failed to resume: %d\n", ret);

	ivpu_dbg(vdev, PM, "Resume done.\n");
	trace_pm("resume done");

	return ret;
}

int ivpu_pm_runtime_suspend_cb(struct device *dev)
{
	struct drm_device *drm = dev_get_drvdata(dev);
	struct ivpu_device *vdev = to_ivpu_device(drm);
	int ret, ret_d0i3;
	bool is_idle;

	drm_WARN_ON(&vdev->drm, !xa_empty(&vdev->submitted_jobs_xa));
	drm_WARN_ON(&vdev->drm, work_pending(&vdev->pm->recovery_work));

	trace_pm("runtime suspend");
	ivpu_dbg(vdev, PM, "Runtime suspend..\n");

	ivpu_mmu_disable(vdev);

	is_idle = ivpu_hw_is_idle(vdev) || vdev->pm->dct_active_percent;
	if (!is_idle)
		ivpu_err(vdev, "NPU is not idle before autosuspend\n");

	ret_d0i3 = ivpu_jsm_pwr_d0i3_enter(vdev);
	if (ret_d0i3)
		ivpu_err(vdev, "Failed to prepare for d0i3: %d\n", ret_d0i3);

	ret = ivpu_suspend(vdev);
	if (ret)
		ivpu_err(vdev, "Failed to suspend NPU: %d\n", ret);

	if (!is_idle || ret_d0i3) {
		ivpu_err(vdev, "Forcing cold boot due to previous errors\n");
		atomic_inc(&vdev->pm->reset_counter);
		ivpu_dev_coredump(vdev);
		ivpu_pm_prepare_cold_boot(vdev);
	} else {
		ivpu_pm_prepare_warm_boot(vdev);
	}

	ivpu_dbg(vdev, PM, "Runtime suspend done.\n");
	trace_pm("runtime suspend done");

	return 0;
}

int ivpu_pm_runtime_resume_cb(struct device *dev)
{
	struct drm_device *drm = dev_get_drvdata(dev);
	struct ivpu_device *vdev = to_ivpu_device(drm);
	int ret;

	trace_pm("runtime resume");
	ivpu_dbg(vdev, PM, "Runtime resume..\n");

	ret = ivpu_resume(vdev);
	if (ret)
		ivpu_err(vdev, "Failed to set RESUME state: %d\n", ret);

	ivpu_dbg(vdev, PM, "Runtime resume done.\n");
	trace_pm("runtime resume done");

	return ret;
}

int ivpu_rpm_get(struct ivpu_device *vdev)
{
	int ret;

	ret = pm_runtime_resume_and_get(vdev->drm.dev);
	if (ret < 0) {
		ivpu_err(vdev, "Failed to resume NPU: %d\n", ret);
		pm_runtime_set_suspended(vdev->drm.dev);
	}

	return ret;
}

void ivpu_rpm_put(struct ivpu_device *vdev)
{
	pm_runtime_mark_last_busy(vdev->drm.dev);
	pm_runtime_put_autosuspend(vdev->drm.dev);
}
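/*
 * PCI reset callbacks: ivpu_pm_reset_prepare_cb() quiesces and resets the
 * device before the bus-level reset (unless it is already runtime suspended),
 * ivpu_pm_reset_done_cb() cold boots and resumes it afterwards.
 */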
void ivpu_pm_reset_prepare_cb(struct pci_dev *pdev)
{
	struct ivpu_device *vdev = pci_get_drvdata(pdev);

	ivpu_dbg(vdev, PM, "Pre-reset..\n");

	ivpu_pm_reset_begin(vdev);

	if (!pm_runtime_status_suspended(vdev->drm.dev)) {
		ivpu_prepare_for_reset(vdev);
		ivpu_hw_reset(vdev);
	}

	ivpu_dbg(vdev, PM, "Pre-reset done.\n");
}

void ivpu_pm_reset_done_cb(struct pci_dev *pdev)
{
	struct ivpu_device *vdev = pci_get_drvdata(pdev);

	ivpu_dbg(vdev, PM, "Post-reset..\n");

	ivpu_pm_reset_complete(vdev);

	ivpu_dbg(vdev, PM, "Post-reset done.\n");
}

void ivpu_pm_init(struct ivpu_device *vdev)
{
	struct device *dev = vdev->drm.dev;
	struct ivpu_pm_info *pm = vdev->pm;
	int delay;

	pm->vdev = vdev;

	init_rwsem(&pm->reset_lock);
	atomic_set(&pm->reset_pending, 0);
	atomic_set(&pm->reset_counter, 0);

	INIT_WORK(&pm->recovery_work, ivpu_pm_recovery_work);
	INIT_DELAYED_WORK(&pm->job_timeout_work, ivpu_job_timeout_work);

	if (ivpu_disable_recovery)
		delay = -1;
	else
		delay = vdev->timeout.autosuspend;

	pm_runtime_use_autosuspend(dev);
	pm_runtime_set_autosuspend_delay(dev, delay);
	pm_runtime_set_active(dev);

	ivpu_dbg(vdev, PM, "Autosuspend delay = %d\n", delay);
}

void ivpu_pm_cancel_recovery(struct ivpu_device *vdev)
{
	drm_WARN_ON(&vdev->drm, delayed_work_pending(&vdev->pm->job_timeout_work));
	cancel_work_sync(&vdev->pm->recovery_work);
}

void ivpu_pm_enable(struct ivpu_device *vdev)
{
	struct device *dev = vdev->drm.dev;

	pm_runtime_allow(dev);
	pm_runtime_mark_last_busy(dev);
	pm_runtime_put_autosuspend(dev);
}

void ivpu_pm_disable(struct ivpu_device *vdev)
{
	pm_runtime_get_noresume(vdev->drm.dev);
	pm_runtime_forbid(vdev->drm.dev);
}

int ivpu_pm_dct_init(struct ivpu_device *vdev)
{
	if (vdev->pm->dct_active_percent)
		return ivpu_pm_dct_enable(vdev, vdev->pm->dct_active_percent);

	return 0;
}

int ivpu_pm_dct_enable(struct ivpu_device *vdev, u8 active_percent)
{
	u32 active_us, inactive_us;
	int ret;

	if (active_percent == 0 || active_percent > 100)
		return -EINVAL;

	active_us = (DCT_PERIOD_US * active_percent) / 100;
	inactive_us = DCT_PERIOD_US - active_us;

	vdev->pm->dct_active_percent = active_percent;

	ivpu_dbg(vdev, PM, "DCT requested %u%% (D0: %uus, D0i2: %uus)\n",
		 active_percent, active_us, inactive_us);

	ret = ivpu_jsm_dct_enable(vdev, active_us, inactive_us);
	if (ret) {
		ivpu_err_ratelimited(vdev, "Failed to enable DCT: %d\n", ret);
		return ret;
	}

	return 0;
}

int ivpu_pm_dct_disable(struct ivpu_device *vdev)
{
	int ret;

	vdev->pm->dct_active_percent = 0;

	ivpu_dbg(vdev, PM, "DCT requested to be disabled\n");

	ret = ivpu_jsm_dct_disable(vdev);
	if (ret) {
		ivpu_err_ratelimited(vdev, "Failed to disable DCT: %d\n", ret);
		return ret;
	}

	return 0;
}
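/*
 * Handles a duty cycle throttling (DCT) change requested by the HW: reads the
 * request via ivpu_hw_btrs_dct_get_request(), enables DCT at the default
 * active percentage or disables it, and on success reports the resulting
 * state back with ivpu_hw_btrs_dct_set_status().
 */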
void ivpu_pm_irq_dct_work_fn(struct work_struct *work)
{
	struct ivpu_device *vdev = container_of(work, struct ivpu_device, irq_dct_work);
	bool enable;
	int ret;

	if (ivpu_hw_btrs_dct_get_request(vdev, &enable))
		return;

	if (enable)
		ret = ivpu_pm_dct_enable(vdev, DCT_DEFAULT_ACTIVE_PERCENT);
	else
		ret = ivpu_pm_dct_disable(vdev);

	if (!ret)
		ivpu_hw_btrs_dct_set_status(vdev, enable, vdev->pm->dct_active_percent);
}