xref: /linux/drivers/gpu/drm/xe/xe_pm.c (revision 79790b6818e96c58fe2bffee1b418c16e64e7b80)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2022 Intel Corporation
4  */
5 
6 #include "xe_pm.h"
7 
8 #include <linux/pm_runtime.h>
9 
10 #include <drm/drm_managed.h>
11 #include <drm/ttm/ttm_placement.h>
12 
13 #include "display/xe_display.h"
14 #include "xe_bo.h"
15 #include "xe_bo_evict.h"
16 #include "xe_device.h"
17 #include "xe_device_sysfs.h"
18 #include "xe_ggtt.h"
19 #include "xe_gt.h"
20 #include "xe_guc.h"
21 #include "xe_irq.h"
22 #include "xe_pcode.h"
23 #include "xe_wa.h"
24 
25 /**
26  * DOC: Xe Power Management
27  *
28  * Xe PM implements the main routines for both system level suspend states and
29  * for the opportunistic runtime suspend states.
30  *
31  * System Level Suspend (S-States) - In general this is OS initiated suspend
32  * driven by ACPI for achieving S0ix (a.k.a. S2idle, freeze), S3 (suspend to RAM),
33  * S4 (suspend to disk). The main functions here are `xe_pm_suspend` and `xe_pm_resume`.
34  * They are the entry points for suspending to and resuming from these states.
35  *
36  * PCI Device Suspend (D-States) - This is the opportunistic PCIe device low power
37  * state D3, controlled by the PCI subsystem and ACPI with the help from the
38  * runtime_pm infrastructure.
39  * PCI D3 is special and can mean either D3hot, where Vcc power stays on to keep
40  * memory alive and allow a quicker, low latency resume, or D3Cold, where Vcc power
41  * is off for better power savings.
42  * Vcc for the PCI hierarchy can only be controlled at the PCI root port
43  * level, while the device driver can be behind multiple bridges/switches and
44  * paired with other devices. For this reason, the PCI subsystem cannot perform
45  * the transition towards D3Cold. The lowest runtime PM possible from the PCI
46  * subsystem is D3hot. Then, if all these paired devices in the same root port
47  * are in D3hot, ACPI will assist here and run its own methods (_PR3 and _OFF)
48  * to perform the transition from D3hot to D3cold. Xe may disallow this
49  * transition by calling pci_d3cold_disable(root_pdev) before going to runtime
50  * suspend, based on runtime conditions such as VRAM usage, for instance to
51  * keep resume quick and low latency.
52  *
53  * Runtime PM - This infrastructure provided by the Linux kernel allows the
54  * device drivers to indicate when they can be runtime suspended, so the device
55  * could be put at D3 (if supported), or allow deeper package sleep states
56  * (PC-states), and/or other low level power states. Xe PM component provides
57  * `xe_pm_runtime_suspend` and `xe_pm_runtime_resume` functions that PCI
58  * subsystem will call before transition to/from runtime suspend.
59  *
60  * Also, Xe PM provides get and put functions that the Xe driver uses to
61  * indicate activity. In order to avoid locking complications with the memory
62  * management, whenever possible, these get and put functions need to be called
63  * from the higher/outer levels.
64  * The main cases that need to be protected from the outer levels are: IOCTL,
65  * sysfs, debugfs, dma-buf sharing, GPU execution (see the usage sketch below).
66  *
67  * This component is not responsible for GT idleness (RC6) nor GT frequency
68  * management (RPS).
69  */
70 
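/*
 * Usage sketch (illustrative only, not part of the driver): how an outer
 * level such as an IOCTL entry point brackets its work with the runtime PM
 * helpers below. The handler and do_work() are hypothetical; only
 * xe_pm_runtime_get_ioctl() and xe_pm_runtime_put() are real. The reference
 * is put unconditionally, since the usage counter is raised even when the
 * get fails.
 *
 *	static long xe_example_ioctl(struct xe_device *xe)
 *	{
 *		long ret;
 *
 *		ret = xe_pm_runtime_get_ioctl(xe);
 *		if (ret >= 0)
 *			ret = do_work(xe);	// hypothetical HW access while awake
 *		xe_pm_runtime_put(xe);
 *
 *		return ret;
 *	}
 */
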
71 /**
72  * xe_pm_suspend - Helper for System suspend, i.e. S0->S3 / S0->S2idle
73  * @xe: xe device instance
74  *
75  * Return: 0 on success
76  */
77 int xe_pm_suspend(struct xe_device *xe)
78 {
79 	struct xe_gt *gt;
80 	u8 id;
81 	int err;
82 
83 	drm_dbg(&xe->drm, "Suspending device\n");
84 
85 	for_each_gt(gt, xe, id)
86 		xe_gt_suspend_prepare(gt);
87 
88 	/* FIXME: Super racey... */
89 	err = xe_bo_evict_all(xe);
90 	if (err)
91 		goto err;
92 
93 	xe_display_pm_suspend(xe);
94 
95 	for_each_gt(gt, xe, id) {
96 		err = xe_gt_suspend(gt);
97 		if (err) {
98 			xe_display_pm_resume(xe);
99 			goto err;
100 		}
101 	}
102 
103 	xe_irq_suspend(xe);
104 
105 	xe_display_pm_suspend_late(xe);
106 
107 	drm_dbg(&xe->drm, "Device suspended\n");
108 	return 0;
109 err:
110 	drm_dbg(&xe->drm, "Device suspend failed %d\n", err);
111 	return err;
112 }
113 
114 /**
115  * xe_pm_resume - Helper for System resume S3->S0 / S2idle->S0
116  * @xe: xe device instance
117  *
118  * Return: 0 on success
119  */
120 int xe_pm_resume(struct xe_device *xe)
121 {
122 	struct xe_tile *tile;
123 	struct xe_gt *gt;
124 	u8 id;
125 	int err;
126 
127 	drm_dbg(&xe->drm, "Resuming device\n");
128 
129 	for_each_tile(tile, xe, id)
130 		xe_wa_apply_tile_workarounds(tile);
131 
132 	err = xe_pcode_ready(xe, true);
133 	if (err)
134 		return err;
135 
136 	xe_display_pm_resume_early(xe);
137 
138 	/*
139 	 * This only restores pinned memory which is the memory required for the
140 	 * GT(s) to resume.
141 	 */
142 	err = xe_bo_restore_kernel(xe);
143 	if (err)
144 		goto err;
145 
146 	xe_irq_resume(xe);
147 
148 	xe_display_pm_resume(xe);
149 
150 	for_each_gt(gt, xe, id)
151 		xe_gt_resume(gt);
152 
153 	err = xe_bo_restore_user(xe);
154 	if (err)
155 		goto err;
156 
157 	drm_dbg(&xe->drm, "Device resumed\n");
158 	return 0;
159 err:
160 	drm_dbg(&xe->drm, "Device resume failed %d\n", err);
161 	return err;
162 }
163 
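/*
 * Wiring sketch (illustrative only): xe_pm_suspend() and xe_pm_resume() are
 * meant to be driven from the PCI driver's system sleep callbacks; the real
 * wiring lives in xe_pci.c. Assuming a hypothetical suspend callback, the
 * shape is roughly:
 *
 *	static int example_pci_suspend(struct device *dev)
 *	{
 *		struct pci_dev *pdev = to_pci_dev(dev);
 *		int err;
 *
 *		err = xe_pm_suspend(pdev_to_xe_device(pdev));
 *		if (err)
 *			return err;
 *
 *		pci_save_state(pdev);
 *		pci_disable_device(pdev);
 *		return 0;
 *	}
 */
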
164 static bool xe_pm_pci_d3cold_capable(struct xe_device *xe)
165 {
166 	struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
167 	struct pci_dev *root_pdev;
168 
169 	root_pdev = pcie_find_root_port(pdev);
170 	if (!root_pdev)
171 		return false;
172 
173 	/* D3Cold requires PME capability */
174 	if (!pci_pme_capable(root_pdev, PCI_D3cold)) {
175 		drm_dbg(&xe->drm, "d3cold: PME# not supported\n");
176 		return false;
177 	}
178 
179 	/* D3Cold requires _PR3 power resource */
180 	if (!pci_pr3_present(root_pdev)) {
181 		drm_dbg(&xe->drm, "d3cold: ACPI _PR3 not present\n");
182 		return false;
183 	}
184 
185 	return true;
186 }
187 
188 static void xe_pm_runtime_init(struct xe_device *xe)
189 {
190 	struct device *dev = xe->drm.dev;
191 
192 	/*
193 	 * Disable the system suspend direct complete optimization.
194 	 * We need to ensure that the regular device suspend/resume functions
195 	 * are called since our runtime_pm cannot guarantee local memory
196 	 * eviction for d3cold.
197 	 * TODO: Check HDA audio dependencies claimed by i915, and then enforce
198 	 *       this option to integrated graphics as well.
199 	 */
200 	if (IS_DGFX(xe))
201 		dev_pm_set_driver_flags(dev, DPM_FLAG_NO_DIRECT_COMPLETE);
202 
203 	pm_runtime_use_autosuspend(dev);
204 	pm_runtime_set_autosuspend_delay(dev, 1000);
205 	pm_runtime_set_active(dev);
206 	pm_runtime_allow(dev);
207 	pm_runtime_mark_last_busy(dev);
208 	pm_runtime_put(dev);
209 }
210 
211 void xe_pm_init_early(struct xe_device *xe)
212 {
213 	INIT_LIST_HEAD(&xe->mem_access.vram_userfault.list);
214 	drmm_mutex_init(&xe->drm, &xe->mem_access.vram_userfault.lock);
215 }
216 
217 /**
218  * xe_pm_init - Initialize Xe Power Management
219  * @xe: xe device instance
220  *
221  * This component is responsible for System and Device sleep states.
222  */
223 void xe_pm_init(struct xe_device *xe)
224 {
225 	/* For now suspend/resume is only allowed with GuC */
226 	if (!xe_device_uc_enabled(xe))
227 		return;
228 
229 	drmm_mutex_init(&xe->drm, &xe->d3cold.lock);
230 
231 	xe->d3cold.capable = xe_pm_pci_d3cold_capable(xe);
232 
233 	if (xe->d3cold.capable) {
234 		xe_device_sysfs_init(xe);
235 		xe_pm_set_vram_threshold(xe, DEFAULT_VRAM_THRESHOLD);
236 	}
237 
238 	xe_pm_runtime_init(xe);
239 }
240 
241 /**
242  * xe_pm_runtime_fini - Finalize Runtime PM
243  * @xe: xe device instance
244  */
245 void xe_pm_runtime_fini(struct xe_device *xe)
246 {
247 	struct device *dev = xe->drm.dev;
248 
249 	pm_runtime_get_sync(dev);
250 	pm_runtime_forbid(dev);
251 }
252 
253 static void xe_pm_write_callback_task(struct xe_device *xe,
254 				      struct task_struct *task)
255 {
256 	WRITE_ONCE(xe->pm_callback_task, task);
257 
258 	/*
259 	 * Just in case it's somehow possible for our writes to be reordered to
260 	 * the extent that something else re-uses the task written in
261 	 * pm_callback_task. For example after returning from the callback, but
262 	 * before the reordered write that resets pm_callback_task back to NULL.
263 	 */
264 	smp_mb(); /* pairs with xe_pm_read_callback_task */
265 }
266 
267 struct task_struct *xe_pm_read_callback_task(struct xe_device *xe)
268 {
269 	smp_mb(); /* pairs with xe_pm_write_callback_task */
270 
271 	return READ_ONCE(xe->pm_callback_task);
272 }
273 
274 /**
275  * xe_pm_runtime_suspended - Check if runtime_pm state is suspended
276  * @xe: xe device instance
277  *
278  * This does not provide any guarantee that the device is going to remain
279  * suspended, as it might be racing with the runtime state transitions.
280  * It can be used only as a best-effort assertion, for instance to ensure that
281  * we are not in the sleep state while trying to access some memory.
282  *
283  * Returns true if PCI device is suspended, false otherwise.
284  */
285 bool xe_pm_runtime_suspended(struct xe_device *xe)
286 {
287 	return pm_runtime_suspended(xe->drm.dev);
288 }
289 
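/*
 * Usage sketch (illustrative only): the typical consumer is a best-effort
 * assertion guarding memory access, e.g. (assuming xe_assert() as the
 * assertion helper):
 *
 *	xe_assert(xe, !xe_pm_runtime_suspended(xe));
 */
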
290 /**
291  * xe_pm_runtime_suspend - Prepare our device for D3hot/D3Cold
292  * @xe: xe device instance
293  *
294  * Returns 0 for success, negative error code otherwise.
295  */
296 int xe_pm_runtime_suspend(struct xe_device *xe)
297 {
298 	struct xe_bo *bo, *on;
299 	struct xe_gt *gt;
300 	u8 id;
301 	int err = 0;
302 
303 	if (xe->d3cold.allowed && xe_device_mem_access_ongoing(xe))
304 		return -EBUSY;
305 
306 	/* Disable access_ongoing asserts and prevent recursive pm calls */
307 	xe_pm_write_callback_task(xe, current);
308 
309 	/*
310 	 * The actual xe_device_mem_access_put() is always async underneath, so
311 	 * exactly where that is called should make no difference to us. However
312 	 * we still need to be very careful with the locks that this callback
313 	 * acquires and the locks that are acquired and held by any callers of
314 	 * xe_device_mem_access_get(). We already have the matching annotation
315 	 * on that side, but we also need it here. For example lockdep should be
316 	 * able to tell us if the following scenario is in theory possible:
317 	 *
318 	 * CPU0                          | CPU1 (kworker)
319 	 * lock(A)                       |
320 	 *                               | xe_pm_runtime_suspend()
321 	 *                               |      lock(A)
322 	 * xe_device_mem_access_get()    |
323 	 *
324 	 * This will clearly deadlock since rpm core needs to wait for
325 	 * xe_pm_runtime_suspend() to complete, but here we are holding lock(A)
326 	 * on CPU0 which prevents CPU1 making forward progress.  With the
327 	 * annotation here and in xe_device_mem_access_get() lockdep will see
328 	 * the potential lock inversion and give us a nice splat.
329 	 */
330 	lock_map_acquire(&xe_device_mem_access_lockdep_map);
331 
332 	/*
333 	 * Take the lock for the entire list operation, as xe_ttm_bo_destroy and
334 	 * xe_bo_move_notify also check and delete bo entries from the user fault list.
335 	 */
336 	mutex_lock(&xe->mem_access.vram_userfault.lock);
337 	list_for_each_entry_safe(bo, on,
338 				 &xe->mem_access.vram_userfault.list, vram_userfault_link)
339 		xe_bo_runtime_pm_release_mmap_offset(bo);
340 	mutex_unlock(&xe->mem_access.vram_userfault.lock);
341 
342 	if (xe->d3cold.allowed) {
343 		err = xe_bo_evict_all(xe);
344 		if (err)
345 			goto out;
346 	}
347 
348 	for_each_gt(gt, xe, id) {
349 		err = xe_gt_suspend(gt);
350 		if (err)
351 			goto out;
352 	}
353 
354 	xe_irq_suspend(xe);
355 out:
356 	lock_map_release(&xe_device_mem_access_lockdep_map);
357 	xe_pm_write_callback_task(xe, NULL);
358 	return err;
359 }
360 
361 /**
362  * xe_pm_runtime_resume - Waking up from D3hot/D3Cold
363  * @xe: xe device instance
364  *
365  * Returns 0 for success, negative error code otherwise.
366  */
367 int xe_pm_runtime_resume(struct xe_device *xe)
368 {
369 	struct xe_gt *gt;
370 	u8 id;
371 	int err = 0;
372 
373 	/* Disable access_ongoing asserts and prevent recursive pm calls */
374 	xe_pm_write_callback_task(xe, current);
375 
376 	lock_map_acquire(&xe_device_mem_access_lockdep_map);
377 
378 	/*
379 	 * It is possible that xe has allowed d3cold but other PCIe devices in
380 	 * the gfx card SoC blocked d3cold, so the card has not really lost
381 	 * power. Checking whether the primary GT lost power is sufficient.
382 	 */
383 	gt = xe_device_get_gt(xe, 0);
384 	xe->d3cold.power_lost = xe_guc_in_reset(&gt->uc.guc);
385 
386 	if (xe->d3cold.allowed && xe->d3cold.power_lost) {
387 		err = xe_pcode_ready(xe, true);
388 		if (err)
389 			goto out;
390 
391 		/*
392 		 * This only restores pinned memory which is the memory
393 		 * required for the GT(s) to resume.
394 		 */
395 		err = xe_bo_restore_kernel(xe);
396 		if (err)
397 			goto out;
398 	}
399 
400 	xe_irq_resume(xe);
401 
402 	for_each_gt(gt, xe, id)
403 		xe_gt_resume(gt);
404 
405 	if (xe->d3cold.allowed && xe->d3cold.power_lost) {
406 		err = xe_bo_restore_user(xe);
407 		if (err)
408 			goto out;
409 	}
410 out:
411 	lock_map_release(&xe_device_mem_access_lockdep_map);
412 	xe_pm_write_callback_task(xe, NULL);
413 	return err;
414 }
415 
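/*
 * Wiring sketch (illustrative only): the runtime callbacks above are invoked
 * by the PM core through the driver's dev_pm_ops; the real table lives in
 * xe_pci.c. With hypothetical wrapper names, the shape is roughly:
 *
 *	static const struct dev_pm_ops example_pm_ops = {
 *		SET_SYSTEM_SLEEP_PM_OPS(example_pci_suspend, example_pci_resume)
 *		SET_RUNTIME_PM_OPS(example_runtime_suspend, example_runtime_resume,
 *				   example_runtime_idle)
 *	};
 */
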
416 /**
417  * xe_pm_runtime_get - Get a runtime_pm reference and resume synchronously
418  * @xe: xe device instance
419  */
420 void xe_pm_runtime_get(struct xe_device *xe)
421 {
422 	pm_runtime_get_noresume(xe->drm.dev);
423 
424 	if (xe_pm_read_callback_task(xe) == current)
425 		return;
426 
427 	pm_runtime_resume(xe->drm.dev);
428 }
429 
430 /**
431  * xe_pm_runtime_put - Put the runtime_pm reference back and mark as idle
432  * @xe: xe device instance
433  */
434 void xe_pm_runtime_put(struct xe_device *xe)
435 {
436 	if (xe_pm_read_callback_task(xe) == current) {
437 		pm_runtime_put_noidle(xe->drm.dev);
438 	} else {
439 		pm_runtime_mark_last_busy(xe->drm.dev);
440 		pm_runtime_put(xe->drm.dev);
441 	}
442 }
443 
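/*
 * Usage sketch (illustrative only): non-ioctl outer levels such as sysfs or
 * debugfs handlers bracket their hardware access the same way. Note that
 * xe_pm_runtime_get() resumes synchronously and may sleep, so it must only
 * be used from process context. read_some_counter() is hypothetical.
 *
 *	xe_pm_runtime_get(xe);
 *	val = read_some_counter(xe);	// hypothetical MMIO access while awake
 *	xe_pm_runtime_put(xe);
 */
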
444 /**
445  * xe_pm_runtime_get_ioctl - Get a runtime_pm reference before ioctl
446  * @xe: xe device instance
447  *
448  * Returns: Any number greater than or equal to 0 for success, negative error
449  * code otherwise.
450  */
451 int xe_pm_runtime_get_ioctl(struct xe_device *xe)
452 {
453 	if (WARN_ON(xe_pm_read_callback_task(xe) == current))
454 		return -ELOOP;
455 
456 	return pm_runtime_get_sync(xe->drm.dev);
457 }
458 
459 /**
460  * xe_pm_runtime_get_if_active - Get a runtime_pm reference if device active
461  * @xe: xe device instance
462  *
463  * Returns: 1 if the reference was taken and the device is active, 0 if the
464  * device is not active, negative error code otherwise.
465  */
466 int xe_pm_runtime_get_if_active(struct xe_device *xe)
467 {
468 	return pm_runtime_get_if_active(xe->drm.dev);
469 }
470 
471 /**
472  * xe_pm_runtime_get_if_in_use - Get a runtime_pm reference if device is in use
473  * @xe: xe device instance
474  *
475  * Returns: True if device is awake and the reference was taken, false otherwise.
476  */
477 bool xe_pm_runtime_get_if_in_use(struct xe_device *xe)
478 {
479 	if (xe_pm_read_callback_task(xe) == current) {
480 		/* The device is awake, grab the ref and move on */
481 		pm_runtime_get_noresume(xe->drm.dev);
482 		return true;
483 	}
484 
485 	return pm_runtime_get_if_in_use(xe->drm.dev) > 0;
486 }
487 
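/*
 * Usage sketch (illustrative only): paths that must not trigger a resume can
 * use this conditional helper and simply skip the hardware access when the
 * device is already asleep. sample_hw_state() is hypothetical.
 *
 *	if (!xe_pm_runtime_get_if_in_use(xe))
 *		return;			// suspended, nothing to sample
 *
 *	sample_hw_state(xe);		// hypothetical access while the ref is held
 *	xe_pm_runtime_put(xe);
 */
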
488 /**
489  * xe_pm_runtime_resume_and_get - Resume, then get a runtime_pm ref if awake.
490  * @xe: xe device instance
491  *
492  * Returns: True if device is awake and the reference was taken, false otherwise.
493  */
494 bool xe_pm_runtime_resume_and_get(struct xe_device *xe)
495 {
496 	if (xe_pm_read_callback_task(xe) == current) {
497 		/* The device is awake, grab the ref and move on */
498 		pm_runtime_get_noresume(xe->drm.dev);
499 		return true;
500 	}
501 
502 	return pm_runtime_resume_and_get(xe->drm.dev) >= 0;
503 }
504 
505 /**
506  * xe_pm_assert_unbounded_bridge - Disable PM on unbounded pcie parent bridge
507  * @xe: xe device instance
508  */
509 void xe_pm_assert_unbounded_bridge(struct xe_device *xe)
510 {
511 	struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
512 	struct pci_dev *bridge = pci_upstream_bridge(pdev);
513 
514 	if (!bridge)
515 		return;
516 
517 	if (!bridge->driver) {
518 		drm_warn(&xe->drm, "unbounded parent pci bridge, device won't support any PM support.\n");
519 		device_set_pm_not_required(&pdev->dev);
520 	}
521 }
522 
523 /**
524  * xe_pm_set_vram_threshold - Set a vram threshold for allowing/blocking D3Cold
525  * @xe: xe device instance
526  * @threshold: VRAM size in MiB for the D3cold threshold
527  *
528  * Returns 0 for success, negative error code otherwise.
529  */
530 int xe_pm_set_vram_threshold(struct xe_device *xe, u32 threshold)
531 {
532 	struct ttm_resource_manager *man;
533 	u32 vram_total_mb = 0;
534 	int i;
535 
536 	for (i = XE_PL_VRAM0; i <= XE_PL_VRAM1; ++i) {
537 		man = ttm_manager_type(&xe->ttm, i);
538 		if (man)
539 			vram_total_mb += DIV_ROUND_UP_ULL(man->size, 1024 * 1024);
540 	}
541 
542 	drm_dbg(&xe->drm, "Total vram %u mb\n", vram_total_mb);
543 
544 	if (threshold > vram_total_mb)
545 		return -EINVAL;
546 
547 	mutex_lock(&xe->d3cold.lock);
548 	xe->d3cold.vram_threshold = threshold;
549 	mutex_unlock(&xe->d3cold.lock);
550 
551 	return 0;
552 }
553 
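/*
 * Usage sketch (illustrative only): the threshold is exposed to userspace
 * through sysfs (see xe_device_sysfs.c). A store callback would parse a value
 * in MiB and hand it to xe_pm_set_vram_threshold(), which rejects values
 * larger than the total VRAM size. Names below are hypothetical.
 *
 *	static ssize_t vram_d3cold_threshold_store(struct device *dev,
 *						   struct device_attribute *attr,
 *						   const char *buf, size_t count)
 *	{
 *		struct xe_device *xe = pdev_to_xe_device(to_pci_dev(dev));
 *		u32 threshold;
 *		int ret;
 *
 *		ret = kstrtou32(buf, 0, &threshold);
 *		if (ret)
 *			return ret;
 *
 *		ret = xe_pm_set_vram_threshold(xe, threshold);
 *		return ret ?: count;
 *	}
 */
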
554 /**
555  * xe_pm_d3cold_allowed_toggle - Check conditions to toggle d3cold.allowed
556  * @xe: xe device instance
557  *
558  * To be called during runtime_pm idle callback.
559  * Check for all the D3Cold conditions ahead of runtime suspend.
560  */
561 void xe_pm_d3cold_allowed_toggle(struct xe_device *xe)
562 {
563 	struct ttm_resource_manager *man;
564 	u32 total_vram_used_mb = 0;
565 	u64 vram_used;
566 	int i;
567 
568 	if (!xe->d3cold.capable) {
569 		xe->d3cold.allowed = false;
570 		return;
571 	}
572 
573 	for (i = XE_PL_VRAM0; i <= XE_PL_VRAM1; ++i) {
574 		man = ttm_manager_type(&xe->ttm, i);
575 		if (man) {
576 			vram_used = ttm_resource_manager_usage(man);
577 			total_vram_used_mb += DIV_ROUND_UP_ULL(vram_used, 1024 * 1024);
578 		}
579 	}
580 
581 	mutex_lock(&xe->d3cold.lock);
582 
583 	if (total_vram_used_mb < xe->d3cold.vram_threshold)
584 		xe->d3cold.allowed = true;
585 	else
586 		xe->d3cold.allowed = false;
587 
588 	mutex_unlock(&xe->d3cold.lock);
589 
590 	drm_dbg(&xe->drm,
591 		"d3cold: allowed=%s\n", str_yes_no(xe->d3cold.allowed));
592 }
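
/*
 * Usage sketch (illustrative only): this is meant to run from the driver's
 * runtime_idle callback, right before the PM core decides whether to runtime
 * suspend; the real caller lives in xe_pci.c. With a hypothetical wrapper:
 *
 *	static int example_runtime_idle(struct device *dev)
 *	{
 *		struct xe_device *xe = pdev_to_xe_device(to_pci_dev(dev));
 *
 *		xe_pm_d3cold_allowed_toggle(xe);
 *
 *		return 0;
 *	}
 */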
593