// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2020-2024 Intel Corporation
 */

#include <linux/highmem.h>
#include <linux/moduleparam.h>
#include <linux/pci.h>
#include <linux/pm_runtime.h>
#include <linux/reboot.h>

#include "ivpu_coredump.h"
#include "ivpu_drv.h"
#include "ivpu_fw.h"
#include "ivpu_fw_log.h"
#include "ivpu_hw.h"
#include "ivpu_ipc.h"
#include "ivpu_job.h"
#include "ivpu_jsm_msg.h"
#include "ivpu_mmu.h"
#include "ivpu_ms.h"
#include "ivpu_pm.h"
#include "ivpu_trace.h"
#include "vpu_boot_api.h"

static bool ivpu_disable_recovery;
#if IS_ENABLED(CONFIG_DRM_ACCEL_IVPU_DEBUG)
module_param_named_unsafe(disable_recovery, ivpu_disable_recovery, bool, 0644);
MODULE_PARM_DESC(disable_recovery, "Disables recovery when NPU hang is detected");
#endif

static unsigned long ivpu_tdr_timeout_ms;
module_param_named(tdr_timeout_ms, ivpu_tdr_timeout_ms, ulong, 0644);
MODULE_PARM_DESC(tdr_timeout_ms, "Timeout for device hang detection, in milliseconds, 0 - default");

static unsigned long ivpu_inference_timeout_ms;
module_param_named(inference_timeout_ms, ivpu_inference_timeout_ms, ulong, 0644);
MODULE_PARM_DESC(inference_timeout_ms, "Inference maximum duration, in milliseconds, 0 - default");

#define PM_RESCHEDULE_LIMIT 5

static void ivpu_pm_prepare_cold_boot(struct ivpu_device *vdev)
{
        struct ivpu_fw_info *fw = vdev->fw;

        ivpu_cmdq_reset_all_contexts(vdev);
        ivpu_ipc_reset(vdev);
        ivpu_fw_log_reset(vdev);
        ivpu_fw_load(vdev);
        fw->last_heartbeat = 0;

        ivpu_dbg(vdev, FW_BOOT, "Cold boot entry point 0x%llx", vdev->fw->cold_boot_entry_point);
        fw->next_boot_mode = VPU_BOOT_TYPE_COLDBOOT;
}

static void ivpu_pm_prepare_warm_boot(struct ivpu_device *vdev)
{
        struct ivpu_fw_info *fw = vdev->fw;
        struct vpu_boot_params *bp = ivpu_bo_vaddr(fw->mem_bp);

        fw->warm_boot_entry_point = bp->save_restore_ret_address;
        if (!fw->warm_boot_entry_point) {
                ivpu_pm_prepare_cold_boot(vdev);
                return;
        }

        ivpu_dbg(vdev, FW_BOOT, "Warm boot entry point 0x%llx", fw->warm_boot_entry_point);
        fw->next_boot_mode = VPU_BOOT_TYPE_WARMBOOT;
}

static int ivpu_suspend(struct ivpu_device *vdev)
{
        int ret;

        ivpu_prepare_for_reset(vdev);

        ret = ivpu_shutdown(vdev);
        if (ret)
                ivpu_err(vdev, "Failed to shutdown NPU: %d\n", ret);

        return ret;
}

static int ivpu_resume(struct ivpu_device *vdev)
{
        int ret;

retry:
        pci_set_power_state(to_pci_dev(vdev->drm.dev), PCI_D0);
        pci_restore_state(to_pci_dev(vdev->drm.dev));

        ret = ivpu_hw_power_up(vdev);
        if (ret) {
                ivpu_err(vdev, "Failed to power up HW: %d\n", ret);
                goto err_power_down;
        }

        ret = ivpu_mmu_enable(vdev);
        if (ret) {
                ivpu_err(vdev, "Failed to resume MMU: %d\n", ret);
                goto err_power_down;
        }

        ret = ivpu_boot(vdev);
        if (ret)
                goto err_mmu_disable;

        return 0;

err_mmu_disable:
        ivpu_mmu_disable(vdev);
err_power_down:
        ivpu_hw_power_down(vdev);
        pci_set_power_state(to_pci_dev(vdev->drm.dev), PCI_D3hot);

        if (ivpu_fw_is_warm_boot(vdev)) {
                ivpu_pm_prepare_cold_boot(vdev);
                goto retry;
        } else {
                ivpu_err(vdev, "Failed to resume the FW: %d\n", ret);
        }

        return ret;
}

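/*
 * Every reset is bracketed by ivpu_pm_reset_begin() and
 * ivpu_pm_reset_complete(): runtime PM is disabled and reset_lock is held
 * for writing across the whole sequence; completion reloads the FW for a
 * cold boot, aborts outstanding jobs and resumes the device before the
 * lock is dropped and runtime PM is re-enabled.
 */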
static void ivpu_pm_reset_begin(struct ivpu_device *vdev)
{
        pm_runtime_disable(vdev->drm.dev);

        atomic_inc(&vdev->pm->reset_counter);
        atomic_set(&vdev->pm->reset_pending, 1);
        down_write(&vdev->pm->reset_lock);
}

static void ivpu_pm_reset_complete(struct ivpu_device *vdev)
{
        int ret;

        ivpu_pm_prepare_cold_boot(vdev);
        ivpu_jobs_abort_all(vdev);
        ivpu_ms_cleanup_all(vdev);

        ret = ivpu_resume(vdev);
        if (ret) {
                ivpu_err(vdev, "Failed to resume NPU: %d\n", ret);
                pm_runtime_set_suspended(vdev->drm.dev);
        } else {
                pm_runtime_set_active(vdev->drm.dev);
        }

        up_write(&vdev->pm->reset_lock);
        atomic_set(&vdev->pm->reset_pending, 0);

        pm_runtime_mark_last_busy(vdev->drm.dev);
        pm_runtime_enable(vdev->drm.dev);
}

static void ivpu_pm_recovery_work(struct work_struct *work)
{
        struct ivpu_pm_info *pm = container_of(work, struct ivpu_pm_info, recovery_work);
        struct ivpu_device *vdev = pm->vdev;
        char *evt[2] = {"IVPU_PM_EVENT=IVPU_RECOVER", NULL};

        ivpu_err(vdev, "Recovering the NPU (reset #%d)\n", atomic_read(&vdev->pm->reset_counter));

        ivpu_pm_reset_begin(vdev);

        if (!pm_runtime_status_suspended(vdev->drm.dev)) {
                ivpu_jsm_state_dump(vdev);
                ivpu_dev_coredump(vdev);
                ivpu_suspend(vdev);
        }

        ivpu_pm_reset_complete(vdev);

        kobject_uevent_env(&vdev->drm.dev->kobj, KOBJ_CHANGE, evt);
}

void ivpu_pm_trigger_recovery(struct ivpu_device *vdev, const char *reason)
{
        ivpu_err(vdev, "Recovery triggered by %s\n", reason);

        if (ivpu_disable_recovery) {
                ivpu_err(vdev, "Recovery not available when disable_recovery param is set\n");
                return;
        }

        /* Trigger recovery if it's not in progress */
        if (atomic_cmpxchg(&vdev->pm->reset_pending, 0, 1) == 0) {
                ivpu_hw_diagnose_failure(vdev);
                ivpu_hw_irq_disable(vdev); /* Disable IRQ early to protect from IRQ storm */
                queue_work(system_dfl_wq, &vdev->pm->recovery_work);
        }
}

static void ivpu_job_timeout_work(struct work_struct *work)
{
        struct ivpu_pm_info *pm = container_of(work, struct ivpu_pm_info, job_timeout_work.work);
        struct ivpu_device *vdev = pm->vdev;
        unsigned long timeout_ms = ivpu_tdr_timeout_ms ? ivpu_tdr_timeout_ms : vdev->timeout.tdr;
        unsigned long inference_timeout_ms = ivpu_inference_timeout_ms ? ivpu_inference_timeout_ms :
                                             vdev->timeout.inference;
        u64 inference_max_retries;
        u64 heartbeat;

        if (ivpu_jsm_get_heartbeat(vdev, 0, &heartbeat) || heartbeat <= vdev->fw->last_heartbeat) {
                ivpu_err(vdev, "Job timeout detected, heartbeat not progressed\n");
                goto recovery;
        }

        inference_max_retries = DIV_ROUND_UP(inference_timeout_ms, timeout_ms);
        if (atomic_fetch_inc(&vdev->job_timeout_counter) >= inference_max_retries) {
                ivpu_err(vdev, "Job timeout detected, heartbeat limit (%lld) exceeded\n",
                         inference_max_retries);
                goto recovery;
        }

        vdev->fw->last_heartbeat = heartbeat;
        ivpu_start_job_timeout_detection(vdev);
        return;

recovery:
        atomic_set(&vdev->job_timeout_counter, 0);
        ivpu_pm_trigger_recovery(vdev, "TDR");
}

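/*
 * TDR (timeout detection and recovery): ivpu_job_timeout_work() samples the
 * FW heartbeat once per TDR period. Recovery is triggered when the heartbeat
 * stops progressing, or when the work has been re-armed more than
 * inference_timeout_ms / tdr_timeout_ms times, bounding how long a single
 * inference may run; otherwise detection is simply re-armed below.
 */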
void ivpu_start_job_timeout_detection(struct ivpu_device *vdev)
{
        unsigned long timeout_ms = ivpu_tdr_timeout_ms ? ivpu_tdr_timeout_ms : vdev->timeout.tdr;

        /* No-op if already queued */
        queue_delayed_work(system_percpu_wq, &vdev->pm->job_timeout_work,
                           msecs_to_jiffies(timeout_ms));
}

void ivpu_stop_job_timeout_detection(struct ivpu_device *vdev)
{
        cancel_delayed_work_sync(&vdev->pm->job_timeout_work);
        atomic_set(&vdev->job_timeout_counter, 0);
}

int ivpu_pm_suspend_cb(struct device *dev)
{
        struct drm_device *drm = dev_get_drvdata(dev);
        struct ivpu_device *vdev = to_ivpu_device(drm);
        unsigned long timeout;

        trace_pm("suspend");
        ivpu_dbg(vdev, PM, "Suspend..\n");

        timeout = jiffies + msecs_to_jiffies(vdev->timeout.tdr);
        while (!ivpu_hw_is_idle(vdev)) {
                cond_resched();
                if (time_after_eq(jiffies, timeout)) {
                        ivpu_err(vdev, "Failed to enter idle on system suspend\n");
                        return -EBUSY;
                }
        }

        ivpu_jsm_pwr_d0i3_enter(vdev);

        ivpu_suspend(vdev);
        ivpu_pm_prepare_warm_boot(vdev);

        ivpu_dbg(vdev, PM, "Suspend done.\n");
        trace_pm("suspend done");

        return 0;
}

int ivpu_pm_resume_cb(struct device *dev)
{
        struct drm_device *drm = dev_get_drvdata(dev);
        struct ivpu_device *vdev = to_ivpu_device(drm);
        int ret;

        trace_pm("resume");
        ivpu_dbg(vdev, PM, "Resume..\n");

        ret = ivpu_resume(vdev);
        if (ret)
                ivpu_err(vdev, "Failed to resume: %d\n", ret);

        ivpu_dbg(vdev, PM, "Resume done.\n");
        trace_pm("resume done");

        return ret;
}

int ivpu_pm_runtime_suspend_cb(struct device *dev)
{
        struct drm_device *drm = dev_get_drvdata(dev);
        struct ivpu_device *vdev = to_ivpu_device(drm);
        int ret, ret_d0i3;
        bool is_idle;

        drm_WARN_ON(&vdev->drm, !xa_empty(&vdev->submitted_jobs_xa));
        drm_WARN_ON(&vdev->drm, work_pending(&vdev->pm->recovery_work));

        trace_pm("runtime suspend");
        ivpu_dbg(vdev, PM, "Runtime suspend..\n");

        ivpu_mmu_disable(vdev);

        is_idle = ivpu_hw_is_idle(vdev) || vdev->pm->dct_active_percent;
        if (!is_idle)
                ivpu_err(vdev, "NPU is not idle before autosuspend\n");

        ret_d0i3 = ivpu_jsm_pwr_d0i3_enter(vdev);
        if (ret_d0i3)
                ivpu_err(vdev, "Failed to prepare for d0i3: %d\n", ret_d0i3);

        ret = ivpu_suspend(vdev);
        if (ret)
                ivpu_err(vdev, "Failed to suspend NPU: %d\n", ret);

        if (!is_idle || ret_d0i3) {
                ivpu_err(vdev, "Forcing cold boot due to previous errors\n");
                atomic_inc(&vdev->pm->reset_counter);
                ivpu_dev_coredump(vdev);
                ivpu_pm_prepare_cold_boot(vdev);
        } else {
                ivpu_pm_prepare_warm_boot(vdev);
        }

        ivpu_dbg(vdev, PM, "Runtime suspend done.\n");
        trace_pm("runtime suspend done");

        return 0;
}

int ivpu_pm_runtime_resume_cb(struct device *dev)
{
        struct drm_device *drm = dev_get_drvdata(dev);
        struct ivpu_device *vdev = to_ivpu_device(drm);
        int ret;

        trace_pm("runtime resume");
        ivpu_dbg(vdev, PM, "Runtime resume..\n");

        ret = ivpu_resume(vdev);
        if (ret)
                ivpu_err(vdev, "Failed to set RESUME state: %d\n", ret);

        ivpu_dbg(vdev, PM, "Runtime resume done.\n");
        trace_pm("runtime resume done");

        return ret;
}

int ivpu_rpm_get(struct ivpu_device *vdev)
{
        int ret;

        ret = pm_runtime_resume_and_get(vdev->drm.dev);
        if (ret < 0) {
                ivpu_err(vdev, "Failed to resume NPU: %d\n", ret);
                pm_runtime_set_suspended(vdev->drm.dev);
        }

        return ret;
}

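/*
 * Each successful ivpu_rpm_get() must be balanced by an ivpu_rpm_put(),
 * which drops the RPM usage count and lets the device autosuspend after
 * the delay configured in ivpu_pm_init().
 */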
void ivpu_rpm_put(struct ivpu_device *vdev)
{
        pm_runtime_put_autosuspend(vdev->drm.dev);
}

void ivpu_pm_reset_prepare_cb(struct pci_dev *pdev)
{
        struct ivpu_device *vdev = pci_get_drvdata(pdev);

        ivpu_dbg(vdev, PM, "Pre-reset..\n");

        ivpu_pm_reset_begin(vdev);

        if (!pm_runtime_status_suspended(vdev->drm.dev)) {
                ivpu_prepare_for_reset(vdev);
                ivpu_hw_reset(vdev);
        }

        ivpu_dbg(vdev, PM, "Pre-reset done.\n");
}

void ivpu_pm_reset_done_cb(struct pci_dev *pdev)
{
        struct ivpu_device *vdev = pci_get_drvdata(pdev);

        ivpu_dbg(vdev, PM, "Post-reset..\n");

        ivpu_pm_reset_complete(vdev);

        ivpu_dbg(vdev, PM, "Post-reset done.\n");
}

void ivpu_pm_init(struct ivpu_device *vdev)
{
        struct device *dev = vdev->drm.dev;
        struct ivpu_pm_info *pm = vdev->pm;
        int delay;

        pm->vdev = vdev;

        init_rwsem(&pm->reset_lock);
        atomic_set(&pm->reset_pending, 0);
        atomic_set(&pm->reset_counter, 0);

        INIT_WORK(&pm->recovery_work, ivpu_pm_recovery_work);
        INIT_DELAYED_WORK(&pm->job_timeout_work, ivpu_job_timeout_work);

        if (ivpu_disable_recovery)
                delay = -1;
        else
                delay = vdev->timeout.autosuspend;

        pm_runtime_use_autosuspend(dev);
        pm_runtime_set_autosuspend_delay(dev, delay);
        pm_runtime_set_active(dev);

        ivpu_dbg(vdev, PM, "Autosuspend delay = %d\n", delay);
}

void ivpu_pm_disable_recovery(struct ivpu_device *vdev)
{
        drm_WARN_ON(&vdev->drm, delayed_work_pending(&vdev->pm->job_timeout_work));
        disable_work_sync(&vdev->pm->recovery_work);
}

void ivpu_pm_enable(struct ivpu_device *vdev)
{
        struct device *dev = vdev->drm.dev;

        pm_runtime_allow(dev);
        pm_runtime_put_autosuspend(dev);
}

void ivpu_pm_disable(struct ivpu_device *vdev)
{
        pm_runtime_get_noresume(vdev->drm.dev);
        pm_runtime_forbid(vdev->drm.dev);
}

int ivpu_pm_dct_init(struct ivpu_device *vdev)
{
        if (vdev->pm->dct_active_percent)
                return ivpu_pm_dct_enable(vdev, vdev->pm->dct_active_percent);

        return 0;
}

int ivpu_pm_dct_enable(struct ivpu_device *vdev, u8 active_percent)
{
        u32 active_us, inactive_us;
        int ret;

        if (active_percent == 0 || active_percent > 100)
                return -EINVAL;

        active_us = (DCT_PERIOD_US * active_percent) / 100;
        inactive_us = DCT_PERIOD_US - active_us;

        vdev->pm->dct_active_percent = active_percent;

        ivpu_dbg(vdev, PM, "DCT requested %u%% (D0: %uus, D0i2: %uus)\n",
                 active_percent, active_us, inactive_us);

        ret = ivpu_jsm_dct_enable(vdev, active_us, inactive_us);
        if (ret) {
                ivpu_err_ratelimited(vdev, "Failed to enable DCT: %d\n", ret);
                return ret;
        }

        return 0;
}

int ivpu_pm_dct_disable(struct ivpu_device *vdev)
{
        int ret;

        vdev->pm->dct_active_percent = 0;

        ivpu_dbg(vdev, PM, "DCT requested to be disabled\n");

        ret = ivpu_jsm_dct_disable(vdev);
        if (ret) {
                ivpu_err_ratelimited(vdev, "Failed to disable DCT: %d\n", ret);
                return ret;
        }

        return 0;
}

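/*
 * Deferred handler for the buttress DCT request interrupt: reads the
 * requested throttling state, enables DCT at the default active percentage
 * or disables it, then acknowledges the resulting state back to HW with
 * the active percentage in U1.7 fixed point (e.g. 30% -> 38/128).
 */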
void ivpu_pm_irq_dct_work_fn(struct work_struct *work)
{
        struct ivpu_device *vdev = container_of(work, struct ivpu_device, irq_dct_work);
        bool enable;
        int ret;

        if (ivpu_hw_btrs_dct_get_request(vdev, &enable))
                return;

        if (enable)
                ret = ivpu_pm_dct_enable(vdev, DCT_DEFAULT_ACTIVE_PERCENT);
        else
                ret = ivpu_pm_dct_disable(vdev);

        if (!ret) {
                /* Convert percent to U1.7 format */
                u8 val = DIV_ROUND_CLOSEST(vdev->pm->dct_active_percent * 128, 100);

                ivpu_hw_btrs_dct_set_status(vdev, enable, val);
        }
}