// SPDX-License-Identifier: MIT
/*
 * Copyright © 2022 Intel Corporation
 */

#include "xe_pm.h"

#include <linux/pm_runtime.h>

#include <drm/drm_managed.h>
#include <drm/ttm/ttm_placement.h>

#include "display/xe_display.h"
#include "xe_bo.h"
#include "xe_bo_evict.h"
#include "xe_device.h"
#include "xe_device_sysfs.h"
#include "xe_ggtt.h"
#include "xe_gt.h"
#include "xe_guc.h"
#include "xe_irq.h"
#include "xe_pcode.h"
#include "xe_wa.h"

/**
 * DOC: Xe Power Management
 *
 * Xe PM implements the main routines for both system level suspend states and
 * for the opportunistic runtime suspend states.
 *
 * System Level Suspend (S-States) - In general this is OS initiated suspend
 * driven by ACPI for achieving S0ix (a.k.a. S2idle, freeze), S3 (suspend to ram),
 * S4 (disk). The main functions here are `xe_pm_suspend` and `xe_pm_resume`.
 * They are the main entry points for suspend to and resume from these states.
 *
 * PCI Device Suspend (D-States) - This is the opportunistic PCIe device low power
 * state D3, controlled by the PCI subsystem and ACPI with the help from the
 * runtime_pm infrastructure.
 * PCI D3 is special and can mean D3hot, where Vcc power is on for keeping memory
 * alive and quicker low latency resume, or D3Cold where Vcc power is off for
 * better power savings.
 * The Vcc control of the PCI hierarchy can only be managed at the PCI root port
 * level, while the device driver can be behind multiple bridges/switches and
 * paired with other devices. For this reason, the PCI subsystem cannot perform
 * the transition towards D3Cold. The lowest runtime PM possible from the PCI
 * subsystem is D3hot. Then, if all the paired devices in the same root port
 * are in D3hot, ACPI will assist here and run its own methods (_PR3 and _OFF)
 * to perform the transition from D3hot to D3cold. Xe may disallow this
 * transition by calling pci_d3cold_disable(root_pdev) before going to runtime
 * suspend. This decision is based on runtime conditions such as VRAM usage,
 * for instance when a quick and low latency resume is required.
 *
 * Runtime PM - This infrastructure provided by the Linux kernel allows the
 * device drivers to indicate when they can be runtime suspended, so the device
 * can be put in D3 (if supported), or allow deeper package sleep states
 * (PC-states), and/or other low level power states. The Xe PM component provides
 * `xe_pm_runtime_suspend` and `xe_pm_runtime_resume` functions that the PCI
 * subsystem will call before transitioning to/from runtime suspend.
 *
 * Also, Xe PM provides get and put functions that the Xe driver will use to
 * indicate activity. In order to avoid locking complications with the memory
 * management, whenever possible, these get and put functions need to be called
 * from the higher/outer levels.
 * The main cases that need to be protected from the outer levels are: IOCTL,
 * sysfs, debugfs, dma-buf sharing, GPU execution.
 *
 * This component is not responsible for GT idleness (RC6) nor GT frequency
 * management (RPS).
 */
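
/*
 * Illustrative sketch only (not called anywhere in the driver, added purely
 * for documentation): an outer-level entry point such as an IOCTL, sysfs or
 * debugfs handler is expected to bracket its work with a get/put pair as
 * described above, so that all inner layers can rely on the device being
 * awake. The helper name below is hypothetical.
 */
static inline void xe_pm_example_outer_bracket(struct xe_device *xe)
{
	/* Wake the device, resuming synchronously if it was runtime suspended */
	xe_pm_runtime_get(xe);

	/* ... the actual IOCTL/sysfs/debugfs work would go here ... */

	/* Drop the reference; autosuspend kicks in once the device goes idle */
	xe_pm_runtime_put(xe);
}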

#ifdef CONFIG_LOCKDEP
struct lockdep_map xe_pm_runtime_lockdep_map = {
	.name = "xe_pm_runtime_lockdep_map"
};
#endif

/**
 * xe_pm_suspend - Helper for System suspend, i.e. S0->S3 / S0->S2idle
 * @xe: xe device instance
 *
 * Return: 0 on success
 */
int xe_pm_suspend(struct xe_device *xe)
{
	struct xe_gt *gt;
	u8 id;
	int err;

	drm_dbg(&xe->drm, "Suspending device\n");

	for_each_gt(gt, xe, id)
		xe_gt_suspend_prepare(gt);

	/* FIXME: Super racy... */
	err = xe_bo_evict_all(xe);
	if (err)
		goto err;

	xe_display_pm_suspend(xe);

	for_each_gt(gt, xe, id) {
		err = xe_gt_suspend(gt);
		if (err) {
			xe_display_pm_resume(xe);
			goto err;
		}
	}

	xe_irq_suspend(xe);

	xe_display_pm_suspend_late(xe);

	drm_dbg(&xe->drm, "Device suspended\n");
	return 0;
err:
	drm_dbg(&xe->drm, "Device suspend failed %d\n", err);
	return err;
}

/**
 * xe_pm_resume - Helper for System resume S3->S0 / S2idle->S0
 * @xe: xe device instance
 *
 * Return: 0 on success
 */
int xe_pm_resume(struct xe_device *xe)
{
	struct xe_tile *tile;
	struct xe_gt *gt;
	u8 id;
	int err;

	drm_dbg(&xe->drm, "Resuming device\n");

	for_each_tile(tile, xe, id)
		xe_wa_apply_tile_workarounds(tile);

	err = xe_pcode_ready(xe, true);
	if (err)
		return err;

	xe_display_pm_resume_early(xe);

	/*
	 * This only restores pinned memory, which is the memory required for
	 * the GT(s) to resume.
	 */
	err = xe_bo_restore_kernel(xe);
	if (err)
		goto err;

	xe_irq_resume(xe);

	xe_display_pm_resume(xe);

	for_each_gt(gt, xe, id)
		xe_gt_resume(gt);

	err = xe_bo_restore_user(xe);
	if (err)
		goto err;

	drm_dbg(&xe->drm, "Device resumed\n");
	return 0;
err:
	drm_dbg(&xe->drm, "Device resume failed %d\n", err);
	return err;
}

static bool xe_pm_pci_d3cold_capable(struct xe_device *xe)
{
	struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
	struct pci_dev *root_pdev;

	root_pdev = pcie_find_root_port(pdev);
	if (!root_pdev)
		return false;

	/* D3Cold requires PME capability */
	if (!pci_pme_capable(root_pdev, PCI_D3cold)) {
		drm_dbg(&xe->drm, "d3cold: PME# not supported\n");
		return false;
	}

	/* D3Cold requires _PR3 power resource */
	if (!pci_pr3_present(root_pdev)) {
		drm_dbg(&xe->drm, "d3cold: ACPI _PR3 not present\n");
		return false;
	}

	return true;
}

static void xe_pm_runtime_init(struct xe_device *xe)
{
	struct device *dev = xe->drm.dev;

	/*
	 * Disable the system suspend direct complete optimization.
	 * We need to ensure that the regular device suspend/resume functions
	 * are called since our runtime_pm cannot guarantee local memory
	 * eviction for d3cold.
	 * TODO: Check HDA audio dependencies claimed by i915, and then extend
	 * this option to integrated graphics as well.
	 */
	if (IS_DGFX(xe))
		dev_pm_set_driver_flags(dev, DPM_FLAG_NO_DIRECT_COMPLETE);

	pm_runtime_use_autosuspend(dev);
	pm_runtime_set_autosuspend_delay(dev, 1000);
	pm_runtime_set_active(dev);
	pm_runtime_allow(dev);
	pm_runtime_mark_last_busy(dev);
	pm_runtime_put(dev);
}

int xe_pm_init_early(struct xe_device *xe)
{
	int err;

	INIT_LIST_HEAD(&xe->mem_access.vram_userfault.list);

	err = drmm_mutex_init(&xe->drm, &xe->mem_access.vram_userfault.lock);
	if (err)
		return err;

	err = drmm_mutex_init(&xe->drm, &xe->d3cold.lock);
	if (err)
		return err;

	return 0;
}

/**
 * xe_pm_init - Initialize Xe Power Management
 * @xe: xe device instance
 *
 * This component is responsible for System and Device sleep states.
 *
 * Returns 0 for success, negative error code otherwise.
 */
int xe_pm_init(struct xe_device *xe)
{
	int err;

	/* For now suspend/resume is only allowed with GuC */
	if (!xe_device_uc_enabled(xe))
		return 0;

	xe->d3cold.capable = xe_pm_pci_d3cold_capable(xe);

	if (xe->d3cold.capable) {
		err = xe_device_sysfs_init(xe);
		if (err)
			return err;

		err = xe_pm_set_vram_threshold(xe, DEFAULT_VRAM_THRESHOLD);
		if (err)
			return err;
	}

	xe_pm_runtime_init(xe);

	return 0;
}

/**
 * xe_pm_runtime_fini - Finalize Runtime PM
 * @xe: xe device instance
 */
void xe_pm_runtime_fini(struct xe_device *xe)
{
	struct device *dev = xe->drm.dev;

	pm_runtime_get_sync(dev);
	pm_runtime_forbid(dev);
}

static void xe_pm_write_callback_task(struct xe_device *xe,
				      struct task_struct *task)
{
	WRITE_ONCE(xe->pm_callback_task, task);

	/*
	 * Just in case it's somehow possible for our writes to be reordered to
	 * the extent that something else re-uses the task written in
	 * pm_callback_task. For example after returning from the callback, but
	 * before the reordered write that resets pm_callback_task back to NULL.
	 */
	smp_mb(); /* pairs with xe_pm_read_callback_task */
}

struct task_struct *xe_pm_read_callback_task(struct xe_device *xe)
{
	smp_mb(); /* pairs with xe_pm_write_callback_task */

	return READ_ONCE(xe->pm_callback_task);
}

/**
 * xe_pm_runtime_suspended - Check if runtime_pm state is suspended
 * @xe: xe device instance
 *
 * This does not provide any guarantee that the device is going to remain
 * suspended as it might be racing with the runtime state transitions.
 * It can only be used as a non-reliable assertion, for instance to ensure
 * that we are not in a sleep state while trying to access some memory.
 *
 * Returns true if PCI device is suspended, false otherwise.
 */
bool xe_pm_runtime_suspended(struct xe_device *xe)
{
	return pm_runtime_suspended(xe->drm.dev);
}
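
/*
 * Illustrative sketch only (not called anywhere in the driver): since
 * xe_pm_runtime_suspended() is racy against runtime state transitions, it is
 * only suitable for assertions, e.g. catching accesses that require the
 * device to be awake. The helper name below is hypothetical.
 */
static inline void xe_pm_example_assert_not_suspended(struct xe_device *xe)
{
	drm_WARN_ON(&xe->drm, xe_pm_runtime_suspended(xe));
}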

/**
 * xe_pm_runtime_suspend - Prepare our device for D3hot/D3Cold
 * @xe: xe device instance
 *
 * Returns 0 for success, negative error code otherwise.
 */
int xe_pm_runtime_suspend(struct xe_device *xe)
{
	struct xe_bo *bo, *on;
	struct xe_gt *gt;
	u8 id;
	int err = 0;

	/* Disable access_ongoing asserts and prevent recursive pm calls */
	xe_pm_write_callback_task(xe, current);

	/*
	 * The actual xe_pm_runtime_put() is always async underneath, so
	 * exactly where that is called should make no difference to us. However
	 * we still need to be very careful with the locks that this callback
	 * acquires and the locks that are acquired and held by any callers of
	 * xe_pm_runtime_get(). We already have the matching annotation
	 * on that side, but we also need it here. For example lockdep should be
	 * able to tell us if the following scenario is in theory possible:
	 *
	 * CPU0                          | CPU1 (kworker)
	 * lock(A)                       |
	 *                               | xe_pm_runtime_suspend()
	 *                               | lock(A)
	 * xe_pm_runtime_get()           |
	 *
	 * This will clearly deadlock since the rpm core needs to wait for
	 * xe_pm_runtime_suspend() to complete, but here we are holding lock(A)
	 * on CPU0, which prevents CPU1 from making forward progress. With the
	 * annotation here and in xe_pm_runtime_get() lockdep will see
	 * the potential lock inversion and give us a nice splat.
	 */
	lock_map_acquire(&xe_pm_runtime_lockdep_map);

	/*
	 * Hold the lock for the entire list operation since xe_ttm_bo_destroy
	 * and xe_bo_move_notify also check and delete bo entries from the user
	 * fault list.
	 */
	mutex_lock(&xe->mem_access.vram_userfault.lock);
	list_for_each_entry_safe(bo, on,
				 &xe->mem_access.vram_userfault.list, vram_userfault_link)
		xe_bo_runtime_pm_release_mmap_offset(bo);
	mutex_unlock(&xe->mem_access.vram_userfault.lock);

	if (xe->d3cold.allowed) {
		err = xe_bo_evict_all(xe);
		if (err)
			goto out;
	}

	for_each_gt(gt, xe, id) {
		err = xe_gt_suspend(gt);
		if (err)
			goto out;
	}

	xe_irq_suspend(xe);
out:
	lock_map_release(&xe_pm_runtime_lockdep_map);
	xe_pm_write_callback_task(xe, NULL);
	return err;
}

/**
 * xe_pm_runtime_resume - Waking up from D3hot/D3Cold
 * @xe: xe device instance
 *
 * Returns 0 for success, negative error code otherwise.
 */
int xe_pm_runtime_resume(struct xe_device *xe)
{
	struct xe_gt *gt;
	u8 id;
	int err = 0;

	/* Disable access_ongoing asserts and prevent recursive pm calls */
	xe_pm_write_callback_task(xe, current);

	lock_map_acquire(&xe_pm_runtime_lockdep_map);

	/*
	 * It is possible that xe has allowed d3cold but other PCIe devices in
	 * the gfx card SoC have blocked d3cold, so the card has not really
	 * lost power. Checking the primary GT power state is sufficient.
	 */
	gt = xe_device_get_gt(xe, 0);
	xe->d3cold.power_lost = xe_guc_in_reset(&gt->uc.guc);

	if (xe->d3cold.allowed && xe->d3cold.power_lost) {
		err = xe_pcode_ready(xe, true);
		if (err)
			goto out;

		/*
		 * This only restores pinned memory, which is the memory
		 * required for the GT(s) to resume.
		 */
		err = xe_bo_restore_kernel(xe);
		if (err)
			goto out;
	}

	xe_irq_resume(xe);

	for_each_gt(gt, xe, id)
		xe_gt_resume(gt);

	if (xe->d3cold.allowed && xe->d3cold.power_lost) {
		err = xe_bo_restore_user(xe);
		if (err)
			goto out;
	}
out:
	lock_map_release(&xe_pm_runtime_lockdep_map);
	xe_pm_write_callback_task(xe, NULL);
	return err;
}

/*
 * For places where resume is synchronous it can be quite easy to deadlock
 * if we are not careful. Also in practice it might be quite timing
 * sensitive to ever see the 0 -> 1 transition with the caller's locks
 * held, so deadlocks might exist but are hard for lockdep to ever see.
 * With this in mind, help lockdep learn about the potentially scary
 * stuff that can happen inside the runtime_resume callback by acquiring
 * a dummy lock (it doesn't protect anything and gets compiled out on
 * non-debug builds). Lockdep then only needs to see the
 * xe_pm_runtime_lockdep_map -> runtime_resume callback once, and then can
 * hopefully validate all the (callers_locks) -> xe_pm_runtime_lockdep_map.
 * For example if the (callers_locks) are ever grabbed in the
 * runtime_resume callback, lockdep should give us a nice splat.
 */
static void pm_runtime_lockdep_prime(void)
{
	lock_map_acquire(&xe_pm_runtime_lockdep_map);
	lock_map_release(&xe_pm_runtime_lockdep_map);
}

/**
 * xe_pm_runtime_get - Get a runtime_pm reference and resume synchronously
 * @xe: xe device instance
 */
void xe_pm_runtime_get(struct xe_device *xe)
{
	pm_runtime_get_noresume(xe->drm.dev);

	if (xe_pm_read_callback_task(xe) == current)
		return;

	pm_runtime_lockdep_prime();
	pm_runtime_resume(xe->drm.dev);
}

/**
 * xe_pm_runtime_put - Put the runtime_pm reference back and mark as idle
 * @xe: xe device instance
 */
void xe_pm_runtime_put(struct xe_device *xe)
{
	if (xe_pm_read_callback_task(xe) == current) {
		pm_runtime_put_noidle(xe->drm.dev);
	} else {
		pm_runtime_mark_last_busy(xe->drm.dev);
		pm_runtime_put(xe->drm.dev);
	}
}

/**
 * xe_pm_runtime_get_ioctl - Get a runtime_pm reference before ioctl
 * @xe: xe device instance
 *
 * Returns: Any number greater than or equal to 0 for success, negative error
 * code otherwise.
 */
int xe_pm_runtime_get_ioctl(struct xe_device *xe)
{
	if (WARN_ON(xe_pm_read_callback_task(xe) == current))
		return -ELOOP;

	pm_runtime_lockdep_prime();
	return pm_runtime_get_sync(xe->drm.dev);
}

/**
 * xe_pm_runtime_get_if_active - Get a runtime_pm reference if device active
 * @xe: xe device instance
 *
 * Returns: Any number greater than or equal to 0 for success, negative error
 * code otherwise.
 */
int xe_pm_runtime_get_if_active(struct xe_device *xe)
{
	return pm_runtime_get_if_active(xe->drm.dev);
}

/**
 * xe_pm_runtime_get_if_in_use - Get a runtime_pm reference if the device is in use
 * @xe: xe device instance
 *
 * Returns: True if device is awake and the reference was taken, false otherwise.
 */
bool xe_pm_runtime_get_if_in_use(struct xe_device *xe)
{
	if (xe_pm_read_callback_task(xe) == current) {
		/* The device is awake, grab the ref and move on */
		pm_runtime_get_noresume(xe->drm.dev);
		return true;
	}

	return pm_runtime_get_if_in_use(xe->drm.dev) > 0;
}

/**
 * xe_pm_runtime_get_noresume - Bump runtime PM usage counter without resuming
 * @xe: xe device instance
 *
 * This function should be used in inner places where it is surely already
 * protected by outer-bound callers of `xe_pm_runtime_get`.
 * It will warn if not protected.
 * The reference should always be put back after this function, since it will
 * always bump the usage counter.
 */
void xe_pm_runtime_get_noresume(struct xe_device *xe)
{
	bool ref;

	ref = xe_pm_runtime_get_if_in_use(xe);

	if (drm_WARN(&xe->drm, !ref, "Missing outer runtime PM protection\n"))
		pm_runtime_get_noresume(xe->drm.dev);
}

/**
 * xe_pm_runtime_resume_and_get - Resume, then get a runtime_pm ref if awake.
 * @xe: xe device instance
 *
 * Returns: True if device is awake and the reference was taken, false otherwise.
 */
bool xe_pm_runtime_resume_and_get(struct xe_device *xe)
{
	if (xe_pm_read_callback_task(xe) == current) {
		/* The device is awake, grab the ref and move on */
		pm_runtime_get_noresume(xe->drm.dev);
		return true;
	}

	pm_runtime_lockdep_prime();
	return pm_runtime_resume_and_get(xe->drm.dev) >= 0;
}

/**
 * xe_pm_assert_unbounded_bridge - Disable PM on unbounded pcie parent bridge
 * @xe: xe device instance
 */
void xe_pm_assert_unbounded_bridge(struct xe_device *xe)
{
	struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
	struct pci_dev *bridge = pci_upstream_bridge(pdev);

	if (!bridge)
		return;

	if (!bridge->driver) {
		drm_warn(&xe->drm, "unbounded parent pci bridge, device won't support any PM\n");
		device_set_pm_not_required(&pdev->dev);
	}
}

/**
 * xe_pm_set_vram_threshold - Set a vram threshold for allowing/blocking D3Cold
 * @xe: xe device instance
 * @threshold: VRAM size in MiB for the D3cold threshold
 *
 * Returns 0 for success, negative error code otherwise.
 */
int xe_pm_set_vram_threshold(struct xe_device *xe, u32 threshold)
{
	struct ttm_resource_manager *man;
	u32 vram_total_mb = 0;
	int i;

	for (i = XE_PL_VRAM0; i <= XE_PL_VRAM1; ++i) {
		man = ttm_manager_type(&xe->ttm, i);
		if (man)
			vram_total_mb += DIV_ROUND_UP_ULL(man->size, 1024 * 1024);
	}

	drm_dbg(&xe->drm, "Total vram %u mb\n", vram_total_mb);

	if (threshold > vram_total_mb)
		return -EINVAL;

	mutex_lock(&xe->d3cold.lock);
	xe->d3cold.vram_threshold = threshold;
	mutex_unlock(&xe->d3cold.lock);

	return 0;
}

/**
 * xe_pm_d3cold_allowed_toggle - Check conditions to toggle d3cold.allowed
 * @xe: xe device instance
 *
 * To be called during the runtime_pm idle callback.
 * Check for all the D3Cold conditions ahead of runtime suspend.
 */
void xe_pm_d3cold_allowed_toggle(struct xe_device *xe)
{
	struct ttm_resource_manager *man;
	u32 total_vram_used_mb = 0;
	u64 vram_used;
	int i;

	if (!xe->d3cold.capable) {
		xe->d3cold.allowed = false;
		return;
	}

	for (i = XE_PL_VRAM0; i <= XE_PL_VRAM1; ++i) {
		man = ttm_manager_type(&xe->ttm, i);
		if (man) {
			vram_used = ttm_resource_manager_usage(man);
			total_vram_used_mb += DIV_ROUND_UP_ULL(vram_used, 1024 * 1024);
		}
	}

	mutex_lock(&xe->d3cold.lock);

	if (total_vram_used_mb < xe->d3cold.vram_threshold)
		xe->d3cold.allowed = true;
	else
		xe->d3cold.allowed = false;

	mutex_unlock(&xe->d3cold.lock);

	drm_dbg(&xe->drm,
		"d3cold: allowed=%s\n", str_yes_no(xe->d3cold.allowed));
}