// SPDX-License-Identifier: MIT
/*
 * Copyright © 2022 Intel Corporation
 */

#include "xe_pm.h"

#include <linux/pm_runtime.h>

#include <drm/drm_managed.h>
#include <drm/ttm/ttm_placement.h>

#include "display/xe_display.h"
#include "xe_bo.h"
#include "xe_bo_evict.h"
#include "xe_device.h"
#include "xe_device_sysfs.h"
#include "xe_ggtt.h"
#include "xe_gt.h"
#include "xe_guc.h"
#include "xe_irq.h"
#include "xe_pcode.h"
#include "xe_wa.h"

/**
 * DOC: Xe Power Management
 *
 * Xe PM implements the main routines for both system level suspend states and
 * for the opportunistic runtime suspend states.
 *
 * System Level Suspend (S-States) - In general this is OS initiated suspend
 * driven by ACPI for achieving S0ix (a.k.a. S2idle, freeze), S3 (suspend to
 * ram), or S4 (suspend to disk). The main functions here are `xe_pm_suspend`
 * and `xe_pm_resume`. They are the entry points for suspending to and
 * resuming from these states.
 *
 * PCI Device Suspend (D-States) - This is the opportunistic PCIe device low
 * power state D3, controlled by the PCI subsystem and ACPI with the help of
 * the runtime_pm infrastructure.
 * PCI D3 is special and can mean D3hot, where Vcc power is kept on so memory
 * stays alive and resume is quicker and lower latency, or D3cold, where Vcc
 * power is off for better power savings.
 * Vcc for the PCI hierarchy can only be controlled at the PCI root port
 * level, while the device driver can be behind multiple bridges/switches and
 * paired with other devices. For this reason, the PCI subsystem cannot
 * perform the transition towards D3cold itself. The lowest runtime PM state
 * possible from the PCI subsystem is D3hot. Then, if all the paired devices
 * in the same root port are in D3hot, ACPI assists and runs its own methods
 * (_PR3 and _OFF) to perform the transition from D3hot to D3cold. Xe may
 * disallow this transition by calling pci_d3cold_disable(root_pdev) before
 * going to runtime suspend, based on runtime conditions such as VRAM usage,
 * for instance to guarantee a quick, low latency resume.
 *
 * Runtime PM - This infrastructure, provided by the Linux kernel, allows
 * device drivers to indicate when they can be runtime suspended, so that the
 * device can be put into D3 (if supported), or allow deeper package sleep
 * states (PC-states) and/or other low level power states. The Xe PM component
 * provides `xe_pm_runtime_suspend` and `xe_pm_runtime_resume` functions that
 * the PCI subsystem calls before the transition to/from runtime suspend.
 *
 * Also, Xe PM provides get and put functions that the Xe driver uses to
 * indicate activity. In order to avoid locking complications with the memory
 * management, whenever possible, these get and put functions need to be
 * called from the higher/outer levels.
 * The main cases that need to be protected from the outer levels are: IOCTL,
 * sysfs, debugfs, dma-buf sharing, GPU execution.
 *
 * This component is not responsible for GT idleness (RC6) nor GT frequency
 * management (RPS).
 */
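/*
 * Illustrative sketch only (not part of the driver): one way the outer-level
 * get/put pattern described above could look at an IOCTL boundary. The names
 * xe_foo_ioctl() and xe_foo_do_work() are hypothetical; the runtime PM
 * reference is taken before any memory management locks are acquired and is
 * released on every exit path. Note that xe_pm_runtime_put() is called even
 * when the get failed, since pm_runtime_get_sync() bumps the usage count
 * regardless of the return value:
 *
 *	int xe_foo_ioctl(struct drm_device *dev, void *data,
 *			 struct drm_file *file)
 *	{
 *		struct xe_device *xe = to_xe_device(dev);
 *		int ret;
 *
 *		ret = xe_pm_runtime_get_ioctl(xe);
 *		if (ret >= 0)
 *			ret = xe_foo_do_work(xe, data);
 *		xe_pm_runtime_put(xe);
 *
 *		return ret;
 *	}
 */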
/**
 * xe_pm_suspend - Helper for System suspend, i.e. S0->S3 / S0->S2idle
 * @xe: xe device instance
 *
 * Return: 0 on success
 */
int xe_pm_suspend(struct xe_device *xe)
{
	struct xe_gt *gt;
	u8 id;
	int err;

	for_each_gt(gt, xe, id)
		xe_gt_suspend_prepare(gt);

	/* FIXME: Super racy... */
	err = xe_bo_evict_all(xe);
	if (err)
		return err;

	xe_display_pm_suspend(xe);

	for_each_gt(gt, xe, id) {
		err = xe_gt_suspend(gt);
		if (err) {
			xe_display_pm_resume(xe);
			return err;
		}
	}

	xe_irq_suspend(xe);

	xe_display_pm_suspend_late(xe);

	return 0;
}

/**
 * xe_pm_resume - Helper for System resume S3->S0 / S2idle->S0
 * @xe: xe device instance
 *
 * Return: 0 on success
 */
int xe_pm_resume(struct xe_device *xe)
{
	struct xe_tile *tile;
	struct xe_gt *gt;
	u8 id;
	int err;

	for_each_tile(tile, xe, id)
		xe_wa_apply_tile_workarounds(tile);

	for_each_gt(gt, xe, id) {
		err = xe_pcode_init(gt);
		if (err)
			return err;
	}

	xe_display_pm_resume_early(xe);

	/*
	 * This only restores pinned memory, which is the memory required for
	 * the GT(s) to resume.
	 */
	err = xe_bo_restore_kernel(xe);
	if (err)
		return err;

	xe_irq_resume(xe);

	xe_display_pm_resume(xe);

	for_each_gt(gt, xe, id)
		xe_gt_resume(gt);

	err = xe_bo_restore_user(xe);
	if (err)
		return err;

	return 0;
}

static bool xe_pm_pci_d3cold_capable(struct xe_device *xe)
{
	struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
	struct pci_dev *root_pdev;

	root_pdev = pcie_find_root_port(pdev);
	if (!root_pdev)
		return false;

	/* D3cold requires PME capability */
	if (!pci_pme_capable(root_pdev, PCI_D3cold)) {
		drm_dbg(&xe->drm, "d3cold: PME# not supported\n");
		return false;
	}

	/* D3cold requires the _PR3 power resource */
	if (!pci_pr3_present(root_pdev)) {
		drm_dbg(&xe->drm, "d3cold: ACPI _PR3 not present\n");
		return false;
	}

	return true;
}

static void xe_pm_runtime_init(struct xe_device *xe)
{
	struct device *dev = xe->drm.dev;

	/*
	 * Disable the system suspend direct complete optimization.
	 * We need to ensure that the regular device suspend/resume functions
	 * are called since our runtime_pm cannot guarantee local memory
	 * eviction for d3cold.
	 * TODO: Check the HDA audio dependencies claimed by i915, and then
	 * enforce this option on integrated graphics as well.
	 */
	if (IS_DGFX(xe))
		dev_pm_set_driver_flags(dev, DPM_FLAG_NO_DIRECT_COMPLETE);

	pm_runtime_use_autosuspend(dev);
	pm_runtime_set_autosuspend_delay(dev, 1000);
	pm_runtime_set_active(dev);
	pm_runtime_allow(dev);
	pm_runtime_mark_last_busy(dev);
	pm_runtime_put(dev);
}

void xe_pm_init_early(struct xe_device *xe)
{
	INIT_LIST_HEAD(&xe->mem_access.vram_userfault.list);
	drmm_mutex_init(&xe->drm, &xe->mem_access.vram_userfault.lock);
}

/**
 * xe_pm_init - Initialize Xe Power Management
 * @xe: xe device instance
 *
 * This component is responsible for System and Device sleep states.
 */
void xe_pm_init(struct xe_device *xe)
{
	/* For now suspend/resume is only allowed with GuC */
	if (!xe_device_uc_enabled(xe))
		return;

	drmm_mutex_init(&xe->drm, &xe->d3cold.lock);

	xe->d3cold.capable = xe_pm_pci_d3cold_capable(xe);

	if (xe->d3cold.capable) {
		xe_device_sysfs_init(xe);
		xe_pm_set_vram_threshold(xe, DEFAULT_VRAM_THRESHOLD);
	}

	xe_pm_runtime_init(xe);
}
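/*
 * Because xe_pm_runtime_init() ends with a pm_runtime_put(), dropping the
 * runtime PM reference that is normally held across probe, the device may
 * runtime suspend shortly after xe_pm_init() returns. Any code that touches
 * the hardware after this point must hold its own reference through one of
 * the xe_pm_runtime_get*() helpers below.
 */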
/**
 * xe_pm_runtime_fini - Finalize Runtime PM
 * @xe: xe device instance
 */
void xe_pm_runtime_fini(struct xe_device *xe)
{
	struct device *dev = xe->drm.dev;

	pm_runtime_get_sync(dev);
	pm_runtime_forbid(dev);
}

static void xe_pm_write_callback_task(struct xe_device *xe,
				      struct task_struct *task)
{
	WRITE_ONCE(xe->pm_callback_task, task);

	/*
	 * Just in case it's somehow possible for our writes to be reordered to
	 * the extent that something else re-uses the task written in
	 * pm_callback_task. For example after returning from the callback, but
	 * before the reordered write that resets pm_callback_task back to NULL.
	 */
	smp_mb(); /* pairs with xe_pm_read_callback_task */
}

struct task_struct *xe_pm_read_callback_task(struct xe_device *xe)
{
	smp_mb(); /* pairs with xe_pm_write_callback_task */

	return READ_ONCE(xe->pm_callback_task);
}

/**
 * xe_pm_runtime_suspended - Check if runtime_pm state is suspended
 * @xe: xe device instance
 *
 * This does not provide any guarantee that the device is going to remain
 * suspended as it might be racing with the runtime state transitions.
 * It can be used only as a non-reliable assertion, to ensure that we are not
 * in the sleep state while trying to access some memory for instance.
 *
 * Returns true if PCI device is suspended, false otherwise.
 */
bool xe_pm_runtime_suspended(struct xe_device *xe)
{
	return pm_runtime_suspended(xe->drm.dev);
}

/**
 * xe_pm_runtime_suspend - Prepare our device for D3hot/D3cold
 * @xe: xe device instance
 *
 * Returns 0 for success, negative error code otherwise.
 */
int xe_pm_runtime_suspend(struct xe_device *xe)
{
	struct xe_bo *bo, *on;
	struct xe_gt *gt;
	u8 id;
	int err = 0;

	if (xe->d3cold.allowed && xe_device_mem_access_ongoing(xe))
		return -EBUSY;

	/* Disable access_ongoing asserts and prevent recursive pm calls */
	xe_pm_write_callback_task(xe, current);

	/*
	 * The actual xe_device_mem_access_put() is always async underneath, so
	 * exactly where that is called should make no difference to us. However
	 * we still need to be very careful with the locks that this callback
	 * acquires and the locks that are acquired and held by any callers of
	 * xe_device_mem_access_get(). We already have the matching annotation
	 * on that side, but we also need it here. For example lockdep should be
	 * able to tell us if the following scenario is in theory possible:
	 *
	 * CPU0                        | CPU1 (kworker)
	 * lock(A)                     |
	 *                             | xe_pm_runtime_suspend()
	 *                             |      lock(A)
	 * xe_device_mem_access_get()  |
	 *
	 * This will clearly deadlock since rpm core needs to wait for
	 * xe_pm_runtime_suspend() to complete, but here we are holding lock(A)
	 * on CPU0 which prevents CPU1 from making forward progress. With the
	 * annotation here and in xe_device_mem_access_get() lockdep will see
	 * the potential lock inversion and give us a nice splat.
	 */
	lock_map_acquire(&xe_device_mem_access_lockdep_map);

	/*
	 * Hold the lock for the entire list operation, as xe_ttm_bo_destroy()
	 * and xe_bo_move_notify() also check and delete bo entries from the
	 * user fault list.
	 */
	mutex_lock(&xe->mem_access.vram_userfault.lock);
	list_for_each_entry_safe(bo, on,
				 &xe->mem_access.vram_userfault.list, vram_userfault_link)
		xe_bo_runtime_pm_release_mmap_offset(bo);
	mutex_unlock(&xe->mem_access.vram_userfault.lock);

	if (xe->d3cold.allowed) {
		err = xe_bo_evict_all(xe);
		if (err)
			goto out;
	}

	for_each_gt(gt, xe, id) {
		err = xe_gt_suspend(gt);
		if (err)
			goto out;
	}

	xe_irq_suspend(xe);
out:
	lock_map_release(&xe_device_mem_access_lockdep_map);
	xe_pm_write_callback_task(xe, NULL);
	return err;
}
/**
 * xe_pm_runtime_resume - Waking up from D3hot/D3cold
 * @xe: xe device instance
 *
 * Returns 0 for success, negative error code otherwise.
 */
int xe_pm_runtime_resume(struct xe_device *xe)
{
	struct xe_gt *gt;
	u8 id;
	int err = 0;

	/* Disable access_ongoing asserts and prevent recursive pm calls */
	xe_pm_write_callback_task(xe, current);

	lock_map_acquire(&xe_device_mem_access_lockdep_map);

	/*
	 * It is possible that xe has allowed d3cold but other PCIe devices in
	 * the graphics card SoC blocked it, so the card has not really lost
	 * power. Detecting whether the primary GT lost power is sufficient.
	 */
	gt = xe_device_get_gt(xe, 0);
	xe->d3cold.power_lost = xe_guc_in_reset(&gt->uc.guc);

	if (xe->d3cold.allowed && xe->d3cold.power_lost) {
		for_each_gt(gt, xe, id) {
			err = xe_pcode_init(gt);
			if (err)
				goto out;
		}

		/*
		 * This only restores pinned memory, which is the memory
		 * required for the GT(s) to resume.
		 */
		err = xe_bo_restore_kernel(xe);
		if (err)
			goto out;
	}

	xe_irq_resume(xe);

	for_each_gt(gt, xe, id)
		xe_gt_resume(gt);

	if (xe->d3cold.allowed && xe->d3cold.power_lost) {
		err = xe_bo_restore_user(xe);
		if (err)
			goto out;
	}
out:
	lock_map_release(&xe_device_mem_access_lockdep_map);
	xe_pm_write_callback_task(xe, NULL);
	return err;
}

/**
 * xe_pm_runtime_get - Get a runtime_pm reference and resume synchronously
 * @xe: xe device instance
 *
 * Returns: Any number greater than or equal to 0 for success, negative error
 * code otherwise.
 */
int xe_pm_runtime_get(struct xe_device *xe)
{
	return pm_runtime_get_sync(xe->drm.dev);
}

/**
 * xe_pm_runtime_put - Put the runtime_pm reference back and mark as idle
 * @xe: xe device instance
 *
 * Returns: Any number greater than or equal to 0 for success, negative error
 * code otherwise.
 */
int xe_pm_runtime_put(struct xe_device *xe)
{
	pm_runtime_mark_last_busy(xe->drm.dev);
	return pm_runtime_put(xe->drm.dev);
}

/**
 * xe_pm_runtime_get_ioctl - Get a runtime_pm reference before ioctl
 * @xe: xe device instance
 *
 * Returns: Any number greater than or equal to 0 for success, negative error
 * code otherwise.
 */
int xe_pm_runtime_get_ioctl(struct xe_device *xe)
{
	if (WARN_ON(xe_pm_read_callback_task(xe) == current))
		return -ELOOP;

	return pm_runtime_get_sync(xe->drm.dev);
}

/**
 * xe_pm_runtime_get_if_active - Get a runtime_pm reference if device active
 * @xe: xe device instance
 *
 * Returns: Any number greater than or equal to 0 for success, negative error
 * code otherwise.
 */
int xe_pm_runtime_get_if_active(struct xe_device *xe)
{
	return pm_runtime_get_if_active(xe->drm.dev, true);
}

/**
 * xe_pm_runtime_get_if_in_use - Get a runtime_pm reference if device is in use
 * @xe: xe device instance
 *
 * This does not resume a suspended device.
 *
 * Returns: True if the device is active and the reference was taken, false
 * otherwise.
 */
bool xe_pm_runtime_get_if_in_use(struct xe_device *xe)
{
	if (xe_pm_read_callback_task(xe) == current) {
		/* The device is awake, grab the ref and move on */
		pm_runtime_get_noresume(xe->drm.dev);
		return true;
	}

	return pm_runtime_get_if_in_use(xe->drm.dev) > 0;
}

/**
 * xe_pm_runtime_resume_and_get - Resume the device and get a runtime_pm reference
 * @xe: xe device instance
 *
 * Returns: True if the device is now awake and the reference was taken, false
 * otherwise.
 */
bool xe_pm_runtime_resume_and_get(struct xe_device *xe)
{
	if (xe_pm_read_callback_task(xe) == current) {
		/* The device is awake, grab the ref and move on */
		pm_runtime_get_noresume(xe->drm.dev);
		return true;
	}

	return pm_runtime_resume_and_get(xe->drm.dev) >= 0;
}
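/*
 * Illustrative sketch only (not part of the driver): choosing between the
 * helpers above. A path that must not wake a suspended device (for example an
 * opportunistic worker) can use xe_pm_runtime_get_if_in_use() and simply skip
 * the work when it returns false, while a path that must run takes a full
 * reference with xe_pm_runtime_get() or xe_pm_runtime_resume_and_get().
 * xe_foo_worker() and xe_foo_flush_hw() are made-up names used only to show
 * the reference balancing:
 *
 *	static void xe_foo_worker(struct xe_device *xe)
 *	{
 *		if (!xe_pm_runtime_get_if_in_use(xe))
 *			return;
 *
 *		xe_foo_flush_hw(xe);
 *
 *		xe_pm_runtime_put(xe);
 *	}
 */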
/**
 * xe_pm_assert_unbounded_bridge - Disable PM on unbounded pcie parent bridge
 * @xe: xe device instance
 */
void xe_pm_assert_unbounded_bridge(struct xe_device *xe)
{
	struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
	struct pci_dev *bridge = pci_upstream_bridge(pdev);

	if (!bridge)
		return;

	if (!bridge->driver) {
		drm_warn(&xe->drm, "unbounded parent pci bridge, device won't support any PM\n");
		device_set_pm_not_required(&pdev->dev);
	}
}

/**
 * xe_pm_set_vram_threshold - Set a VRAM threshold for allowing/blocking D3cold
 * @xe: xe device instance
 * @threshold: VRAM size in MiB for the D3cold threshold
 *
 * Returns 0 for success, negative error code otherwise.
 */
int xe_pm_set_vram_threshold(struct xe_device *xe, u32 threshold)
{
	struct ttm_resource_manager *man;
	u32 vram_total_mb = 0;
	int i;

	for (i = XE_PL_VRAM0; i <= XE_PL_VRAM1; ++i) {
		man = ttm_manager_type(&xe->ttm, i);
		if (man)
			vram_total_mb += DIV_ROUND_UP_ULL(man->size, 1024 * 1024);
	}

	drm_dbg(&xe->drm, "Total vram %u mb\n", vram_total_mb);

	if (threshold > vram_total_mb)
		return -EINVAL;

	mutex_lock(&xe->d3cold.lock);
	xe->d3cold.vram_threshold = threshold;
	mutex_unlock(&xe->d3cold.lock);

	return 0;
}
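/*
 * Note on the threshold semantics, as implemented in
 * xe_pm_d3cold_allowed_toggle() below: D3cold is allowed only while the total
 * VRAM usage, in MiB, stays below the configured threshold, so a threshold of
 * 0 effectively disables D3cold. The default set from xe_pm_init()
 * (DEFAULT_VRAM_THRESHOLD) can be overridden at runtime through the sysfs
 * entry registered by xe_device_sysfs_init(); the exact attribute name lives
 * in xe_device_sysfs.c.
 */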
/**
 * xe_pm_d3cold_allowed_toggle - Check conditions to toggle d3cold.allowed
 * @xe: xe device instance
 *
 * To be called during runtime_pm idle callback.
 * Check for all the D3cold conditions ahead of runtime suspend.
 */
void xe_pm_d3cold_allowed_toggle(struct xe_device *xe)
{
	struct ttm_resource_manager *man;
	u32 total_vram_used_mb = 0;
	u64 vram_used;
	int i;

	if (!xe->d3cold.capable) {
		xe->d3cold.allowed = false;
		return;
	}

	for (i = XE_PL_VRAM0; i <= XE_PL_VRAM1; ++i) {
		man = ttm_manager_type(&xe->ttm, i);
		if (man) {
			vram_used = ttm_resource_manager_usage(man);
			total_vram_used_mb += DIV_ROUND_UP_ULL(vram_used, 1024 * 1024);
		}
	}

	mutex_lock(&xe->d3cold.lock);

	if (total_vram_used_mb < xe->d3cold.vram_threshold)
		xe->d3cold.allowed = true;
	else
		xe->d3cold.allowed = false;

	mutex_unlock(&xe->d3cold.lock);

	drm_dbg(&xe->drm,
		"d3cold: allowed=%s\n", str_yes_no(xe->d3cold.allowed));
}
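/*
 * Illustrative sketch only (not part of this file): how the helpers above
 * could fit together in a runtime-idle callback, which is expected to run
 * xe_pm_d3cold_allowed_toggle() and then gate the D3hot->D3cold transition
 * on the result, as described in the DOC section at the top. The actual
 * callback and its wiring into dev_pm_ops live in the PCI glue (xe_pci.c in
 * this driver); the body below is a sketch of that idea under assumed
 * drvdata handling, not a copy of the real code:
 *
 *	static int xe_pci_runtime_idle(struct device *dev)
 *	{
 *		struct pci_dev *pdev = to_pci_dev(dev);
 *		struct pci_dev *root_pdev = pcie_find_root_port(pdev);
 *		struct xe_device *xe = pci_get_drvdata(pdev);
 *
 *		xe_pm_d3cold_allowed_toggle(xe);
 *
 *		if (root_pdev) {
 *			if (xe->d3cold.allowed)
 *				pci_d3cold_enable(root_pdev);
 *			else
 *				pci_d3cold_disable(root_pdev);
 *		}
 *
 *		return 0;
 *	}
 */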