// SPDX-License-Identifier: MIT
/*
 * Copyright © 2022 Intel Corporation
 */

#include "xe_pm.h"

#include <linux/pm_runtime.h>

#include <drm/drm_managed.h>
#include <drm/ttm/ttm_placement.h>

#include "display/xe_display.h"
#include "xe_bo.h"
#include "xe_bo_evict.h"
#include "xe_device.h"
#include "xe_device_sysfs.h"
#include "xe_ggtt.h"
#include "xe_gt.h"
#include "xe_guc.h"
#include "xe_irq.h"
#include "xe_pcode.h"
#include "xe_wa.h"

/**
 * DOC: Xe Power Management
 *
 * Xe PM implements the main routines for both system level suspend states and
 * for the opportunistic runtime suspend states.
 *
 * System Level Suspend (S-States) - In general this is OS initiated suspend
 * driven by ACPI for achieving S0ix (a.k.a. S2idle, freeze), S3 (suspend to
 * RAM) and S4 (suspend to disk). The main functions here are `xe_pm_suspend`
 * and `xe_pm_resume`. They are the entry points for suspending to and
 * resuming from these states.
 *
 * PCI Device Suspend (D-States) - This is the opportunistic PCIe device low
 * power state D3, controlled by the PCI subsystem and ACPI with the help of
 * the runtime_pm infrastructure.
 * PCI D3 is special and can mean D3hot, where Vcc power is on for keeping
 * memory alive and allowing a quicker, low latency resume, or D3cold, where
 * Vcc power is off for better power savings.
 * The Vcc power of the PCI hierarchy can only be controlled at the PCI root
 * port level, while the device driver can be behind multiple bridges/switches
 * and paired with other devices. For this reason, the PCI subsystem cannot
 * perform the transition towards D3Cold. The lowest runtime PM possible from
 * the PCI subsystem is D3hot. Then, if all the paired devices in the same
 * root port are in D3hot, ACPI will assist here and run its own methods
 * (_PR3 and _OFF) to perform the transition from D3hot to D3cold. Xe may
 * disallow this transition by calling pci_d3cold_disable(root_pdev) before
 * going to runtime suspend, based on runtime conditions such as VRAM usage,
 * for instance to preserve a quick and low latency resume.
 *
 * Runtime PM - This infrastructure provided by the Linux kernel allows the
 * device drivers to indicate when they can be runtime suspended, so the
 * device could be put in D3 (if supported), or allow deeper package sleep
 * states (PC-states), and/or other low level power states. The Xe PM
 * component provides `xe_pm_runtime_suspend` and `xe_pm_runtime_resume`
 * functions that the PCI subsystem will call before the transition to/from
 * runtime suspend.
 *
 * Also, Xe PM provides get and put functions that the Xe driver will use to
 * indicate activity. In order to avoid locking complications with the memory
 * management, whenever possible, these get and put functions need to be
 * called from the higher/outer levels, as illustrated in the sketch following
 * this comment.
 * The main cases that need to be protected from the outer levels are: IOCTL,
 * sysfs, debugfs, dma-buf sharing, GPU execution.
 *
 * This component is not responsible for GT idleness (RC6) nor GT frequency
 * management (RPS).
 */
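
/*
 * Illustrative sketch only (no such wrapper exists in this file): an
 * outer-level entry point such as an IOCTL handler is expected to bracket
 * its work with the get/put helpers defined below. Note that the put is
 * unconditional, since xe_pm_runtime_get_ioctl() bumps the usage counter
 * even when the resume fails:
 *
 *	ret = xe_pm_runtime_get_ioctl(xe);
 *	if (ret >= 0)
 *		ret = ...; // do the actual work
 *	xe_pm_runtime_put(xe);
 */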

#ifdef CONFIG_LOCKDEP
static struct lockdep_map xe_pm_runtime_lockdep_map = {
	.name = "xe_pm_runtime_lockdep_map"
};
#endif

/**
 * xe_pm_suspend - Helper for System suspend, i.e. S0->S3 / S0->S2idle
 * @xe: xe device instance
 *
 * Return: 0 on success
 */
int xe_pm_suspend(struct xe_device *xe)
{
	struct xe_gt *gt;
	u8 id;
	int err;

	drm_dbg(&xe->drm, "Suspending device\n");

	for_each_gt(gt, xe, id)
		xe_gt_suspend_prepare(gt);

	/* FIXME: Super racey... */
	err = xe_bo_evict_all(xe);
	if (err)
		goto err;

	xe_display_pm_suspend(xe, false);

	for_each_gt(gt, xe, id) {
		err = xe_gt_suspend(gt);
		if (err) {
			xe_display_pm_resume(xe, false);
			goto err;
		}
	}

	xe_irq_suspend(xe);

	xe_display_pm_suspend_late(xe);

	drm_dbg(&xe->drm, "Device suspended\n");
	return 0;
err:
	drm_dbg(&xe->drm, "Device suspend failed %d\n", err);
	return err;
}

/**
 * xe_pm_resume - Helper for System resume S3->S0 / S2idle->S0
 * @xe: xe device instance
 *
 * Return: 0 on success
 */
int xe_pm_resume(struct xe_device *xe)
{
	struct xe_tile *tile;
	struct xe_gt *gt;
	u8 id;
	int err;

	drm_dbg(&xe->drm, "Resuming device\n");

	for_each_tile(tile, xe, id)
		xe_wa_apply_tile_workarounds(tile);

	err = xe_pcode_ready(xe, true);
	if (err)
		return err;

	xe_display_pm_resume_early(xe);

	/*
	 * This only restores pinned memory which is the memory required for the
	 * GT(s) to resume.
	 */
	err = xe_bo_restore_kernel(xe);
	if (err)
		goto err;

	xe_irq_resume(xe);

	xe_display_pm_resume(xe, false);

	for_each_gt(gt, xe, id)
		xe_gt_resume(gt);

	err = xe_bo_restore_user(xe);
	if (err)
		goto err;

	drm_dbg(&xe->drm, "Device resumed\n");
	return 0;
err:
	drm_dbg(&xe->drm, "Device resume failed %d\n", err);
	return err;
}

static bool xe_pm_pci_d3cold_capable(struct xe_device *xe)
{
	struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
	struct pci_dev *root_pdev;

	root_pdev = pcie_find_root_port(pdev);
	if (!root_pdev)
		return false;

	/* D3Cold requires PME capability */
	if (!pci_pme_capable(root_pdev, PCI_D3cold)) {
		drm_dbg(&xe->drm, "d3cold: PME# not supported\n");
		return false;
	}

	/* D3Cold requires _PR3 power resource */
	if (!pci_pr3_present(root_pdev)) {
		drm_dbg(&xe->drm, "d3cold: ACPI _PR3 not present\n");
		return false;
	}

	return true;
}

static void xe_pm_runtime_init(struct xe_device *xe)
{
	struct device *dev = xe->drm.dev;

	/*
	 * Disable the system suspend direct complete optimization.
	 * We need to ensure that the regular device suspend/resume functions
	 * are called since our runtime_pm cannot guarantee local memory
	 * eviction for d3cold.
	 * TODO: Check HDA audio dependencies claimed by i915, and then enforce
	 * this option for integrated graphics as well.
	 */
	if (IS_DGFX(xe))
		dev_pm_set_driver_flags(dev, DPM_FLAG_NO_DIRECT_COMPLETE);

	pm_runtime_use_autosuspend(dev);
	pm_runtime_set_autosuspend_delay(dev, 1000);
	pm_runtime_set_active(dev);
	pm_runtime_allow(dev);
	pm_runtime_mark_last_busy(dev);
	pm_runtime_put(dev);
}

int xe_pm_init_early(struct xe_device *xe)
{
	int err;

	INIT_LIST_HEAD(&xe->mem_access.vram_userfault.list);

	err = drmm_mutex_init(&xe->drm, &xe->mem_access.vram_userfault.lock);
	if (err)
		return err;

	err = drmm_mutex_init(&xe->drm, &xe->d3cold.lock);
	if (err)
		return err;

	return 0;
}

/**
 * xe_pm_init - Initialize Xe Power Management
 * @xe: xe device instance
 *
 * This component is responsible for System and Device sleep states.
 *
 * Returns 0 for success, negative error code otherwise.
 */
int xe_pm_init(struct xe_device *xe)
{
	int err;

	/* For now suspend/resume is only allowed with GuC */
	if (!xe_device_uc_enabled(xe))
		return 0;

	xe->d3cold.capable = xe_pm_pci_d3cold_capable(xe);

	if (xe->d3cold.capable) {
		err = xe_device_sysfs_init(xe);
		if (err)
			return err;

		err = xe_pm_set_vram_threshold(xe, DEFAULT_VRAM_THRESHOLD);
		if (err)
			return err;
	}

	xe_pm_runtime_init(xe);

	return 0;
}

/**
 * xe_pm_runtime_fini - Finalize Runtime PM
 * @xe: xe device instance
 */
void xe_pm_runtime_fini(struct xe_device *xe)
{
	struct device *dev = xe->drm.dev;

	pm_runtime_get_sync(dev);
	pm_runtime_forbid(dev);
}

static void xe_pm_write_callback_task(struct xe_device *xe,
				      struct task_struct *task)
{
	WRITE_ONCE(xe->pm_callback_task, task);

	/*
	 * Just in case it's somehow possible for our writes to be reordered to
	 * the extent that something else re-uses the task written in
	 * pm_callback_task. For example after returning from the callback, but
	 * before the reordered write that resets pm_callback_task back to NULL.
	 */
	smp_mb(); /* pairs with xe_pm_read_callback_task */
}

struct task_struct *xe_pm_read_callback_task(struct xe_device *xe)
{
	smp_mb(); /* pairs with xe_pm_write_callback_task */

	return READ_ONCE(xe->pm_callback_task);
}

/**
 * xe_pm_runtime_suspended - Check if runtime_pm state is suspended
 * @xe: xe device instance
 *
 * This does not provide any guarantee that the device is going to remain
 * suspended as it might be racing with the runtime state transitions.
 * It can be used only as a non-reliable assertion, to ensure that we are not in
 * the sleep state while trying to access some memory for instance.
 *
 * Returns true if PCI device is suspended, false otherwise.
 */
bool xe_pm_runtime_suspended(struct xe_device *xe)
{
	return pm_runtime_suspended(xe->drm.dev);
}
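
/*
 * Illustrative sketch only (hypothetical caller, not part of this file): per
 * the note above, this check is only good enough for a best-effort assertion
 * in inner paths, e.g.:
 *
 *	drm_WARN_ON(&xe->drm, xe_pm_runtime_suspended(xe));
 *	... access VRAM or registers ...
 */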

/**
 * xe_pm_runtime_suspend - Prepare our device for D3hot/D3Cold
 * @xe: xe device instance
 *
 * Returns 0 for success, negative error code otherwise.
 */
int xe_pm_runtime_suspend(struct xe_device *xe)
{
	struct xe_bo *bo, *on;
	struct xe_gt *gt;
	u8 id;
	int err = 0;

	/* Disable access_ongoing asserts and prevent recursive pm calls */
	xe_pm_write_callback_task(xe, current);

	/*
	 * The actual xe_pm_runtime_put() is always async underneath, so
	 * exactly where that is called should make no difference to us. However
	 * we still need to be very careful with the locks that this callback
	 * acquires and the locks that are acquired and held by any callers of
	 * xe_pm_runtime_get(). We already have the matching annotation
	 * on that side, but we also need it here. For example lockdep should be
	 * able to tell us if the following scenario is in theory possible:
	 *
	 * CPU0                          | CPU1 (kworker)
	 * lock(A)                       |
	 *                               | xe_pm_runtime_suspend()
	 *                               | lock(A)
	 * xe_pm_runtime_get()           |
	 *
	 * This will clearly deadlock since the rpm core needs to wait for
	 * xe_pm_runtime_suspend() to complete, but here we are holding lock(A)
	 * on CPU0, which prevents CPU1 from making forward progress. With the
	 * annotation here and in xe_pm_runtime_get() lockdep will see
	 * the potential lock inversion and give us a nice splat.
	 */
	lock_map_acquire(&xe_pm_runtime_lockdep_map);

	/*
	 * Hold the lock for the entire list operation, since xe_ttm_bo_destroy
	 * and xe_bo_move_notify also check and delete bo entries from the user
	 * fault list.
	 */
	mutex_lock(&xe->mem_access.vram_userfault.lock);
	list_for_each_entry_safe(bo, on,
				 &xe->mem_access.vram_userfault.list, vram_userfault_link)
		xe_bo_runtime_pm_release_mmap_offset(bo);
	mutex_unlock(&xe->mem_access.vram_userfault.lock);

	if (xe->d3cold.allowed) {
		err = xe_bo_evict_all(xe);
		if (err)
			goto out;
		xe_display_pm_suspend(xe, true);
	}

	for_each_gt(gt, xe, id) {
		err = xe_gt_suspend(gt);
		if (err)
			goto out;
	}

	xe_irq_suspend(xe);

	if (xe->d3cold.allowed)
		xe_display_pm_suspend_late(xe);
out:
	if (err)
		xe_display_pm_resume(xe, true);
	lock_map_release(&xe_pm_runtime_lockdep_map);
	xe_pm_write_callback_task(xe, NULL);
	return err;
}

/**
 * xe_pm_runtime_resume - Waking up from D3hot/D3Cold
 * @xe: xe device instance
 *
 * Returns 0 for success, negative error code otherwise.
 */
int xe_pm_runtime_resume(struct xe_device *xe)
{
	struct xe_gt *gt;
	u8 id;
	int err = 0;

	/* Disable access_ongoing asserts and prevent recursive pm calls */
	xe_pm_write_callback_task(xe, current);

	lock_map_acquire(&xe_pm_runtime_lockdep_map);

	if (xe->d3cold.allowed) {
		err = xe_pcode_ready(xe, true);
		if (err)
			goto out;

		xe_display_pm_resume_early(xe);

		/*
		 * This only restores pinned memory which is the memory
		 * required for the GT(s) to resume.
		 */
		err = xe_bo_restore_kernel(xe);
		if (err)
			goto out;
	}

	xe_irq_resume(xe);

	for_each_gt(gt, xe, id)
		xe_gt_resume(gt);

	if (xe->d3cold.allowed) {
		xe_display_pm_resume(xe, true);
		err = xe_bo_restore_user(xe);
		if (err)
			goto out;
	}
out:
	lock_map_release(&xe_pm_runtime_lockdep_map);
	xe_pm_write_callback_task(xe, NULL);
	return err;
}
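
/*
 * Illustrative sketch only (hypothetical glue, not defined in this file): the
 * PM core reaches the two callbacks above through the driver's dev_pm_ops,
 * roughly along these lines, assuming the xe_device is stored as PCI drvdata
 * and with xe_pci_runtime_resume()/xe_pci_runtime_idle() built analogously:
 *
 *	static int xe_pci_runtime_suspend(struct device *dev)
 *	{
 *		struct xe_device *xe = pci_get_drvdata(to_pci_dev(dev));
 *
 *		return xe_pm_runtime_suspend(xe);
 *	}
 *
 *	static const struct dev_pm_ops xe_pm_ops = {
 *		SET_RUNTIME_PM_OPS(xe_pci_runtime_suspend, xe_pci_runtime_resume,
 *				   xe_pci_runtime_idle)
 *	};
 */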

/*
 * For places where resume is synchronous it can be quite easy to deadlock
 * if we are not careful. Also in practice it might be quite timing
 * sensitive to ever see the 0 -> 1 transition with the caller's locks
 * held, so deadlocks might exist but are hard for lockdep to ever see.
 * With this in mind, help lockdep learn about the potentially scary
 * stuff that can happen inside the runtime_resume callback by acquiring
 * a dummy lock (it doesn't protect anything and gets compiled out on
 * non-debug builds). Lockdep then only needs to see the
 * xe_pm_runtime_lockdep_map -> runtime_resume callback once, and then can
 * hopefully validate all the (callers_locks) -> xe_pm_runtime_lockdep_map
 * orderings. For example if the (callers_locks) are ever grabbed in the
 * runtime_resume callback, lockdep should give us a nice splat.
 */
static void pm_runtime_lockdep_prime(void)
{
	lock_map_acquire(&xe_pm_runtime_lockdep_map);
	lock_map_release(&xe_pm_runtime_lockdep_map);
}

/**
 * xe_pm_runtime_get - Get a runtime_pm reference and resume synchronously
 * @xe: xe device instance
 */
void xe_pm_runtime_get(struct xe_device *xe)
{
	pm_runtime_get_noresume(xe->drm.dev);

	if (xe_pm_read_callback_task(xe) == current)
		return;

	pm_runtime_lockdep_prime();
	pm_runtime_resume(xe->drm.dev);
}

/**
 * xe_pm_runtime_put - Put the runtime_pm reference back and mark as idle
 * @xe: xe device instance
 */
void xe_pm_runtime_put(struct xe_device *xe)
{
	if (xe_pm_read_callback_task(xe) == current) {
		pm_runtime_put_noidle(xe->drm.dev);
	} else {
		pm_runtime_mark_last_busy(xe->drm.dev);
		pm_runtime_put(xe->drm.dev);
	}
}

/**
 * xe_pm_runtime_get_ioctl - Get a runtime_pm reference before ioctl
 * @xe: xe device instance
 *
 * Returns: Any number greater than or equal to 0 for success, negative error
 * code otherwise.
 */
int xe_pm_runtime_get_ioctl(struct xe_device *xe)
{
	if (WARN_ON(xe_pm_read_callback_task(xe) == current))
		return -ELOOP;

	pm_runtime_lockdep_prime();
	return pm_runtime_get_sync(xe->drm.dev);
}

/**
 * xe_pm_runtime_get_if_active - Get a runtime_pm reference if device active
 * @xe: xe device instance
 *
 * Return: True if device is awake (regardless of the previous number of
 * references) and a new reference was taken, false otherwise.
 */
bool xe_pm_runtime_get_if_active(struct xe_device *xe)
{
	return pm_runtime_get_if_active(xe->drm.dev) > 0;
}

/**
 * xe_pm_runtime_get_if_in_use - Get a new reference if device is active with previous ref taken
 * @xe: xe device instance
 *
 * Return: True if device is awake, a previous reference had already been
 * taken, and a new reference was now taken, false otherwise.
 */
bool xe_pm_runtime_get_if_in_use(struct xe_device *xe)
{
	if (xe_pm_read_callback_task(xe) == current) {
		/* The device is awake, grab the ref and move on */
		pm_runtime_get_noresume(xe->drm.dev);
		return true;
	}

	return pm_runtime_get_if_in_use(xe->drm.dev) > 0;
}

/**
 * xe_pm_runtime_get_noresume - Bump runtime PM usage counter without resuming
 * @xe: xe device instance
 *
 * This function should be used in inner places where it is surely already
 * protected by outer-bound callers of `xe_pm_runtime_get`.
 * It will warn if not protected.
 * The reference should be put back after this function regardless, since it
 * will always bump the usage counter.
 */
void xe_pm_runtime_get_noresume(struct xe_device *xe)
{
	bool ref;

	ref = xe_pm_runtime_get_if_in_use(xe);

	if (drm_WARN(&xe->drm, !ref, "Missing outer runtime PM protection\n"))
		pm_runtime_get_noresume(xe->drm.dev);
}

/**
 * xe_pm_runtime_resume_and_get - Resume, then get a runtime_pm ref if awake.
 * @xe: xe device instance
 *
 * Returns: True if device is awake and the reference was taken, false otherwise.
 */
bool xe_pm_runtime_resume_and_get(struct xe_device *xe)
{
	if (xe_pm_read_callback_task(xe) == current) {
		/* The device is awake, grab the ref and move on */
		pm_runtime_get_noresume(xe->drm.dev);
		return true;
	}

	pm_runtime_lockdep_prime();
	return pm_runtime_resume_and_get(xe->drm.dev) >= 0;
}

/**
 * xe_pm_assert_unbounded_bridge - Disable PM on unbounded pcie parent bridge
 * @xe: xe device instance
 */
void xe_pm_assert_unbounded_bridge(struct xe_device *xe)
{
	struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
	struct pci_dev *bridge = pci_upstream_bridge(pdev);

	if (!bridge)
		return;

	if (!bridge->driver) {
		drm_warn(&xe->drm, "unbounded parent pci bridge, device won't support any PM.\n");
		device_set_pm_not_required(&pdev->dev);
	}
}

/**
 * xe_pm_set_vram_threshold - Set a VRAM threshold for allowing/blocking D3Cold
 * @xe: xe device instance
 * @threshold: VRAM size in MB for the D3cold threshold
 *
 * Returns 0 for success, negative error code otherwise.
 */
int xe_pm_set_vram_threshold(struct xe_device *xe, u32 threshold)
{
	struct ttm_resource_manager *man;
	u32 vram_total_mb = 0;
	int i;

	for (i = XE_PL_VRAM0; i <= XE_PL_VRAM1; ++i) {
		man = ttm_manager_type(&xe->ttm, i);
		if (man)
			vram_total_mb += DIV_ROUND_UP_ULL(man->size, 1024 * 1024);
	}

	drm_dbg(&xe->drm, "Total vram %u mb\n", vram_total_mb);

	if (threshold > vram_total_mb)
		return -EINVAL;

	mutex_lock(&xe->d3cold.lock);
	xe->d3cold.vram_threshold = threshold;
	mutex_unlock(&xe->d3cold.lock);

	return 0;
}

/**
 * xe_pm_d3cold_allowed_toggle - Check conditions to toggle d3cold.allowed
 * @xe: xe device instance
 *
 * To be called during runtime_pm idle callback.
 * Check for all the D3Cold conditions ahead of runtime suspend.
 */
void xe_pm_d3cold_allowed_toggle(struct xe_device *xe)
{
	struct ttm_resource_manager *man;
	u32 total_vram_used_mb = 0;
	u64 vram_used;
	int i;

	if (!xe->d3cold.capable) {
		xe->d3cold.allowed = false;
		return;
	}

	for (i = XE_PL_VRAM0; i <= XE_PL_VRAM1; ++i) {
		man = ttm_manager_type(&xe->ttm, i);
		if (man) {
			vram_used = ttm_resource_manager_usage(man);
			total_vram_used_mb += DIV_ROUND_UP_ULL(vram_used, 1024 * 1024);
		}
	}

	mutex_lock(&xe->d3cold.lock);

	if (total_vram_used_mb < xe->d3cold.vram_threshold)
		xe->d3cold.allowed = true;
	else
		xe->d3cold.allowed = false;

	mutex_unlock(&xe->d3cold.lock);

	drm_dbg(&xe->drm,
		"d3cold: allowed=%s\n", str_yes_no(xe->d3cold.allowed));
}
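
/*
 * Illustrative sketch only (hypothetical caller, not part of this file): the
 * toggle above is meant to run from the runtime_pm idle callback, refreshing
 * the decision right before the PM core may runtime suspend the device, e.g.:
 *
 *	xe_pm_d3cold_allowed_toggle(xe);
 *	if (xe->d3cold.allowed)
 *		pci_d3cold_enable(root_pdev);
 *	else
 *		pci_d3cold_disable(root_pdev);
 *
 * with root_pdev obtained via pcie_find_root_port(), as in
 * xe_pm_pci_d3cold_capable() above.
 */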