xref: /linux/drivers/gpu/drm/xe/xe_pm.c (revision d6b4137822a1f8d1a6676c18dff551b394557b65)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2022 Intel Corporation
4  */
5 
6 #include "xe_pm.h"
7 
8 #include <linux/pm_runtime.h>
9 
10 #include <drm/drm_managed.h>
11 #include <drm/ttm/ttm_placement.h>
12 
13 #include "display/xe_display.h"
14 #include "xe_bo.h"
15 #include "xe_bo_evict.h"
16 #include "xe_device.h"
17 #include "xe_device_sysfs.h"
18 #include "xe_ggtt.h"
19 #include "xe_gt.h"
20 #include "xe_guc.h"
21 #include "xe_irq.h"
22 #include "xe_pcode.h"
23 #include "xe_wa.h"
24 
25 /**
26  * DOC: Xe Power Management
27  *
28  * Xe PM implements the main routines for both system level suspend states and
29  * for the opportunistic runtime suspend states.
30  *
31  * System Level Suspend (S-States) - In general this is OS initiated suspend
32  * driven by ACPI for achieving S0ix (a.k.a. S2idle, freeze), S3 (suspend to RAM),
33  * S4 (suspend to disk). `xe_pm_suspend` and `xe_pm_resume` are the entry
34  * points for suspending to and resuming from these states.
35  *
36  * PCI Device Suspend (D-States) - This is the opportunistic PCIe device low power
37  * state D3, controlled by the PCI subsystem and ACPI, with help from the
38  * runtime_pm infrastructure.
39  * PCI D3 is special and can mean either D3hot, where Vcc power stays on to keep
40  * memory alive and allow a quicker, low-latency resume, or D3Cold, where Vcc
41  * power is off for better power savings.
42  * Vcc for the PCI hierarchy can only be controlled at the PCI root port level,
43  * while the device driver can be behind multiple bridges/switches and paired
44  * with other devices. For this reason, the PCI subsystem cannot perform the
45  * transition towards D3Cold on its own; the lowest runtime PM state possible
46  * from the PCI subsystem is D3hot. Then, once all the paired devices under the
47  * same root port are in D3hot, ACPI assists and runs its own methods (_PR3 and
48  * _OFF) to perform the transition from D3hot to D3cold. Xe may disallow this
49  * transition by calling pci_d3cold_disable(root_pdev) before going to runtime
50  * suspend, based on runtime conditions such as VRAM usage, e.g. to guarantee a
51  * quick, low-latency resume.
52  *
53  * Runtime PM - This infrastructure provided by the Linux kernel allows the
54  * device drivers to indicate when they can be runtime suspended, so the device
55  * can be put into D3 (if supported), or allow deeper package sleep states
56  * (PC-states), and/or other low level power states. The Xe PM component provides
57  * the `xe_pm_runtime_suspend` and `xe_pm_runtime_resume` functions that the PCI
58  * subsystem will call before the transition to/from runtime suspend.
59  *
60  * Also, Xe PM provides get and put functions that the Xe driver will use to
61  * indicate activity. In order to avoid locking complications with the memory
62  * management, whenever possible, these get and put functions need to be called
63  * from the higher/outer levels.
64  * The main cases that need to be protected from the outer levels are: IOCTL,
65  * sysfs, debugfs, dma-buf sharing, GPU execution.
66  *
67  * This component is not responsible for GT idleness (RC6) nor GT frequency
68  * management (RPS).
69  */
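
/*
 * Example (illustrative only, not part of this file's build): the outer-level
 * get/put pattern described above, roughly as it would look around an IOCTL
 * handler. The handler and do_foo() are made up for illustration;
 * xe_pm_runtime_get_ioctl() and xe_pm_runtime_put() are defined below, and
 * to_xe_device() is assumed to come from xe_device.h.
 *
 *	static int xe_foo_ioctl(struct drm_device *dev, void *data,
 *				struct drm_file *file)
 *	{
 *		struct xe_device *xe = to_xe_device(dev);
 *		int ret;
 *
 *		ret = xe_pm_runtime_get_ioctl(xe);
 *		if (ret >= 0)
 *			ret = do_foo(xe, data);	// hypothetical work under the PM ref
 *
 *		// put unconditionally: the reference is taken even if the
 *		// synchronous resume inside the get failed
 *		xe_pm_runtime_put(xe);
 *		return ret;
 *	}
 */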
70 
71 /**
72  * xe_pm_suspend - Helper for System suspend, i.e. S0->S3 / S0->S2idle
73  * @xe: xe device instance
74  *
75  * Return: 0 on success
76  */
77 int xe_pm_suspend(struct xe_device *xe)
78 {
79 	struct xe_gt *gt;
80 	u8 id;
81 	int err;
82 
83 	for_each_gt(gt, xe, id)
84 		xe_gt_suspend_prepare(gt);
85 
86 	/* FIXME: Super racy... */
87 	err = xe_bo_evict_all(xe);
88 	if (err)
89 		return err;
90 
91 	xe_display_pm_suspend(xe);
92 
93 	for_each_gt(gt, xe, id) {
94 		err = xe_gt_suspend(gt);
95 		if (err) {
96 			xe_display_pm_resume(xe);
97 			return err;
98 		}
99 	}
100 
101 	xe_irq_suspend(xe);
102 
103 	xe_display_pm_suspend_late(xe);
104 
105 	return 0;
106 }
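
/*
 * Illustrative sketch only: how a PCI system-suspend callback might wrap the
 * helper above. The real wiring lives in xe_pci.c; the callback name and the
 * exact PCI sequence below are assumptions, not a copy of that code, and
 * pdev_to_xe_device() is assumed to come from xe_device.h.
 *
 *	static int xe_pci_suspend_sketch(struct device *dev)
 *	{
 *		struct pci_dev *pdev = to_pci_dev(dev);
 *		int err;
 *
 *		err = xe_pm_suspend(pdev_to_xe_device(pdev));
 *		if (err)
 *			return err;
 *
 *		pci_save_state(pdev);
 *		pci_disable_device(pdev);
 *		return 0;
 *	}
 */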
107 
108 /**
109  * xe_pm_resume - Helper for System resume S3->S0 / S2idle->S0
110  * @xe: xe device instance
111  *
112  * Return: 0 on success
113  */
114 int xe_pm_resume(struct xe_device *xe)
115 {
116 	struct xe_tile *tile;
117 	struct xe_gt *gt;
118 	u8 id;
119 	int err;
120 
121 	for_each_tile(tile, xe, id)
122 		xe_wa_apply_tile_workarounds(tile);
123 
124 	for_each_gt(gt, xe, id) {
125 		err = xe_pcode_init(gt);
126 		if (err)
127 			return err;
128 	}
129 
130 	xe_display_pm_resume_early(xe);
131 
132 	/*
133 	 * This only restores pinned memory which is the memory required for the
134 	 * GT(s) to resume.
135 	 */
136 	err = xe_bo_restore_kernel(xe);
137 	if (err)
138 		return err;
139 
140 	xe_irq_resume(xe);
141 
142 	xe_display_pm_resume(xe);
143 
144 	for_each_gt(gt, xe, id)
145 		xe_gt_resume(gt);
146 
147 	err = xe_bo_restore_user(xe);
148 	if (err)
149 		return err;
150 
151 	return 0;
152 }
153 
154 static bool xe_pm_pci_d3cold_capable(struct xe_device *xe)
155 {
156 	struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
157 	struct pci_dev *root_pdev;
158 
159 	root_pdev = pcie_find_root_port(pdev);
160 	if (!root_pdev)
161 		return false;
162 
163 	/* D3Cold requires PME capability */
164 	if (!pci_pme_capable(root_pdev, PCI_D3cold)) {
165 		drm_dbg(&xe->drm, "d3cold: PME# not supported\n");
166 		return false;
167 	}
168 
169 	/* D3Cold requires _PR3 power resource */
170 	if (!pci_pr3_present(root_pdev)) {
171 		drm_dbg(&xe->drm, "d3cold: ACPI _PR3 not present\n");
172 		return false;
173 	}
174 
175 	return true;
176 }
177 
178 static void xe_pm_runtime_init(struct xe_device *xe)
179 {
180 	struct device *dev = xe->drm.dev;
181 
182 	/*
183 	 * Disable the system suspend direct complete optimization.
184 	 * We need to ensure that the regular device suspend/resume functions
185 	 * are called since our runtime_pm cannot guarantee local memory
186 	 * eviction for d3cold.
187 	 * TODO: Check the HDA audio dependencies claimed by i915, and then
188 	 *       extend this option to integrated graphics as well.
189 	 */
190 	if (IS_DGFX(xe))
191 		dev_pm_set_driver_flags(dev, DPM_FLAG_NO_DIRECT_COMPLETE);
192 
193 	pm_runtime_use_autosuspend(dev);
194 	pm_runtime_set_autosuspend_delay(dev, 1000);
195 	pm_runtime_set_active(dev);
196 	pm_runtime_allow(dev);
197 	pm_runtime_mark_last_busy(dev);
198 	pm_runtime_put(dev);
199 }
200 
201 void xe_pm_init_early(struct xe_device *xe)
202 {
203 	INIT_LIST_HEAD(&xe->mem_access.vram_userfault.list);
204 	drmm_mutex_init(&xe->drm, &xe->mem_access.vram_userfault.lock);
205 }
206 
207 /**
208  * xe_pm_init - Initialize Xe Power Management
209  * @xe: xe device instance
210  *
211  * This component is responsible for System and Device sleep states.
212  */
213 void xe_pm_init(struct xe_device *xe)
214 {
215 	/* For now suspend/resume is only allowed with GuC */
216 	if (!xe_device_uc_enabled(xe))
217 		return;
218 
219 	drmm_mutex_init(&xe->drm, &xe->d3cold.lock);
220 
221 	xe->d3cold.capable = xe_pm_pci_d3cold_capable(xe);
222 
223 	if (xe->d3cold.capable) {
224 		xe_device_sysfs_init(xe);
225 		xe_pm_set_vram_threshold(xe, DEFAULT_VRAM_THRESHOLD);
226 	}
227 
228 	xe_pm_runtime_init(xe);
229 }
230 
231 /**
232  * xe_pm_runtime_fini - Finalize Runtime PM
233  * @xe: xe device instance
234  */
235 void xe_pm_runtime_fini(struct xe_device *xe)
236 {
237 	struct device *dev = xe->drm.dev;
238 
239 	pm_runtime_get_sync(dev);
240 	pm_runtime_forbid(dev);
241 }
242 
243 static void xe_pm_write_callback_task(struct xe_device *xe,
244 				      struct task_struct *task)
245 {
246 	WRITE_ONCE(xe->pm_callback_task, task);
247 
248 	/*
249 	 * Just in case it's somehow possible for our writes to be reordered to
250 	 * the extent that something else re-uses the task written in
251 	 * pm_callback_task. For example after returning from the callback, but
252 	 * before the reordered write that resets pm_callback_task back to NULL.
253 	 */
254 	smp_mb(); /* pairs with xe_pm_read_callback_task */
255 }
256 
257 struct task_struct *xe_pm_read_callback_task(struct xe_device *xe)
258 {
259 	smp_mb(); /* pairs with xe_pm_write_callback_task */
260 
261 	return READ_ONCE(xe->pm_callback_task);
262 }
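
/*
 * Typical use of the callback-task marker above (see xe_pm_runtime_get_if_in_use()
 * and xe_pm_runtime_resume_and_get() further down): code that might run from
 * within our own suspend/resume callback first checks
 *
 *	if (xe_pm_read_callback_task(xe) == current)
 *		pm_runtime_get_noresume(xe->drm.dev);
 *
 * so that it takes a no-resume reference instead of recursing back into
 * runtime PM from the callback itself.
 */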
263 
264 /**
265  * xe_pm_runtime_suspended - Check if runtime_pm state is suspended
266  * @xe: xe device instance
267  *
268  * This does not provide any guarantee that the device is going to remain
269  * suspended as it might be racing with the runtime state transitions.
270  * It can only be used as an unreliable assertion, e.g. to ensure that we are
271  * not in the sleep state while trying to access some memory.
272  *
273  * Returns true if PCI device is suspended, false otherwise.
274  */
275 bool xe_pm_runtime_suspended(struct xe_device *xe)
276 {
277 	return pm_runtime_suspended(xe->drm.dev);
278 }
279 
280 /**
281  * xe_pm_runtime_suspend - Prepare our device for D3hot/D3Cold
282  * @xe: xe device instance
283  *
284  * Returns 0 for success, negative error code otherwise.
285  */
286 int xe_pm_runtime_suspend(struct xe_device *xe)
287 {
288 	struct xe_bo *bo, *on;
289 	struct xe_gt *gt;
290 	u8 id;
291 	int err = 0;
292 
293 	if (xe->d3cold.allowed && xe_device_mem_access_ongoing(xe))
294 		return -EBUSY;
295 
296 	/* Disable access_ongoing asserts and prevent recursive pm calls */
297 	xe_pm_write_callback_task(xe, current);
298 
299 	/*
300 	 * The actual xe_device_mem_access_put() is always async underneath, so
301 	 * exactly where that is called should make no difference to us. However
302 	 * we still need to be very careful with the locks that this callback
303 	 * acquires and the locks that are acquired and held by any callers of
304 	 * xe_device_mem_access_get(). We already have the matching annotation
305 	 * on that side, but we also need it here. For example lockdep should be
306 	 * able to tell us if the following scenario is in theory possible:
307 	 *
308 	 * CPU0                          | CPU1 (kworker)
309 	 * lock(A)                       |
310 	 *                               | xe_pm_runtime_suspend()
311 	 *                               |      lock(A)
312 	 * xe_device_mem_access_get()    |
313 	 *
314 	 * This will clearly deadlock since rpm core needs to wait for
315 	 * xe_pm_runtime_suspend() to complete, but here we are holding lock(A)
316 	 * on CPU0 which prevents CPU1 making forward progress.  With the
317 	 * annotation here and in xe_device_mem_access_get() lockdep will see
318 	 * the potential lock inversion and give us a nice splat.
319 	 */
320 	lock_map_acquire(&xe_device_mem_access_lockdep_map);
321 
322 	/*
323 	 * Hold the lock over the entire list operation, as xe_ttm_bo_destroy() and
324 	 * xe_bo_move_notify() also check and delete the bo entry from the user fault list.
325 	 */
326 	mutex_lock(&xe->mem_access.vram_userfault.lock);
327 	list_for_each_entry_safe(bo, on,
328 				 &xe->mem_access.vram_userfault.list, vram_userfault_link)
329 		xe_bo_runtime_pm_release_mmap_offset(bo);
330 	mutex_unlock(&xe->mem_access.vram_userfault.lock);
331 
332 	if (xe->d3cold.allowed) {
333 		err = xe_bo_evict_all(xe);
334 		if (err)
335 			goto out;
336 	}
337 
338 	for_each_gt(gt, xe, id) {
339 		err = xe_gt_suspend(gt);
340 		if (err)
341 			goto out;
342 	}
343 
344 	xe_irq_suspend(xe);
345 out:
346 	lock_map_release(&xe_device_mem_access_lockdep_map);
347 	xe_pm_write_callback_task(xe, NULL);
348 	return err;
349 }
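
/*
 * Hedged sketch of the "matching annotation" on the waker side mentioned in the
 * comment above; the real code lives in xe_device.c and may differ. A path that
 * can synchronously wait for this suspend callback (e.g. by triggering a
 * runtime resume) tells lockdep about the dependency by touching the same map
 * before waiting:
 *
 *	lock_map_acquire(&xe_device_mem_access_lockdep_map);
 *	lock_map_release(&xe_device_mem_access_lockdep_map);
 *	// ... then take the runtime PM reference / wait for the resume ...
 */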
350 
351 /**
352  * xe_pm_runtime_resume - Waking up from D3hot/D3Cold
353  * @xe: xe device instance
354  *
355  * Returns 0 for success, negative error code otherwise.
356  */
357 int xe_pm_runtime_resume(struct xe_device *xe)
358 {
359 	struct xe_gt *gt;
360 	u8 id;
361 	int err = 0;
362 
363 	/* Disable access_ongoing asserts and prevent recursive pm calls */
364 	xe_pm_write_callback_task(xe, current);
365 
366 	lock_map_acquire(&xe_device_mem_access_lockdep_map);
367 
368 	/*
369 	 * It is possible that xe has allowed d3cold but other PCIe devices in
370 	 * the gfx card SoC have blocked d3cold, so the card has not really lost
371 	 * power. Detecting whether the primary GT lost power is sufficient.
372 	 */
373 	gt = xe_device_get_gt(xe, 0);
374 	xe->d3cold.power_lost = xe_guc_in_reset(&gt->uc.guc);
375 
376 	if (xe->d3cold.allowed && xe->d3cold.power_lost) {
377 		for_each_gt(gt, xe, id) {
378 			err = xe_pcode_init(gt);
379 			if (err)
380 				goto out;
381 		}
382 
383 		/*
384 		 * This only restores pinned memory which is the memory
385 		 * required for the GT(s) to resume.
386 		 */
387 		err = xe_bo_restore_kernel(xe);
388 		if (err)
389 			goto out;
390 	}
391 
392 	xe_irq_resume(xe);
393 
394 	for_each_gt(gt, xe, id)
395 		xe_gt_resume(gt);
396 
397 	if (xe->d3cold.allowed && xe->d3cold.power_lost) {
398 		err = xe_bo_restore_user(xe);
399 		if (err)
400 			goto out;
401 	}
402 out:
403 	lock_map_release(&xe_device_mem_access_lockdep_map);
404 	xe_pm_write_callback_task(xe, NULL);
405 	return err;
406 }
407 
408 /**
409  * xe_pm_runtime_get - Get a runtime_pm reference and resume synchronously
410  * @xe: xe device instance
411  *
412  * Returns: Any number greater than or equal to 0 for success, negative error
413  * code otherwise.
414  */
415 int xe_pm_runtime_get(struct xe_device *xe)
416 {
417 	return pm_runtime_get_sync(xe->drm.dev);
418 }
419 
420 /**
421  * xe_pm_runtime_put - Put the runtime_pm reference back and mark as idle
422  * @xe: xe device instance
423  *
424  * Returns: Any number greater than or equal to 0 for success, negative error
425  * code otherwise.
426  */
427 int xe_pm_runtime_put(struct xe_device *xe)
428 {
429 	pm_runtime_mark_last_busy(xe->drm.dev);
430 	return pm_runtime_put(xe->drm.dev);
431 }
432 
433 /**
434  * xe_pm_runtime_get_ioctl - Get a runtime_pm reference before ioctl
435  * @xe: xe device instance
436  *
437  * Returns: Any number greater than or equal to 0 for success, negative error
438  * code otherwise.
439  */
440 int xe_pm_runtime_get_ioctl(struct xe_device *xe)
441 {
442 	if (WARN_ON(xe_pm_read_callback_task(xe) == current))
443 		return -ELOOP;
444 
445 	return pm_runtime_get_sync(xe->drm.dev);
446 }
447 
448 /**
449  * xe_pm_runtime_get_if_active - Get a runtime_pm reference if device active
450  * @xe: xe device instance
451  *
452  * Returns: Any number greater than or equal to 0 for success, negative error
453  * code otherwise.
454  */
455 int xe_pm_runtime_get_if_active(struct xe_device *xe)
456 {
457 	return pm_runtime_get_if_active(xe->drm.dev, true);
458 }
459 
460 /**
461  * xe_pm_runtime_get_if_in_use - Get a runtime_pm reference if the device is in use
462  * @xe: xe device instance
463  *
464  * Returns: True if device is awake and the reference was taken, false otherwise.
465  */
466 bool xe_pm_runtime_get_if_in_use(struct xe_device *xe)
467 {
468 	if (xe_pm_read_callback_task(xe) == current) {
469 		/* The device is awake, grab the ref and move on */
470 		pm_runtime_get_noresume(xe->drm.dev);
471 		return true;
472 	}
473 
474 	return pm_runtime_get_if_in_use(xe->drm.dev) > 0;
475 }
476 
477 /**
478  * xe_pm_runtime_resume_and_get - Resume the device and take a runtime_pm reference
479  * @xe: xe device instance
480  *
481  * Returns: True if device is awake and the reference was taken, false otherwise.
482  */
483 bool xe_pm_runtime_resume_and_get(struct xe_device *xe)
484 {
485 	if (xe_pm_read_callback_task(xe) == current) {
486 		/* The device is awake, grab the ref and move on */
487 		pm_runtime_get_noresume(xe->drm.dev);
488 		return true;
489 	}
490 
491 	return pm_runtime_resume_and_get(xe->drm.dev) >= 0;
492 }
493 
494 /**
495  * xe_pm_assert_unbounded_bridge - Disable PM on a driverless (unbound) parent pcie bridge
496  * @xe: xe device instance
497  */
498 void xe_pm_assert_unbounded_bridge(struct xe_device *xe)
499 {
500 	struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
501 	struct pci_dev *bridge = pci_upstream_bridge(pdev);
502 
503 	if (!bridge)
504 		return;
505 
506 	if (!bridge->driver) {
507 		drm_warn(&xe->drm, "unbound parent pci bridge, device won't support any PM.\n");
508 		device_set_pm_not_required(&pdev->dev);
509 	}
510 }
511 
512 /**
513  * xe_pm_set_vram_threshold - Set a vram threshold for allowing/blocking D3Cold
514  * @xe: xe device instance
515  * @threshold: VRAM size in MiB for the D3Cold threshold
516  *
517  * Returns 0 for success, negative error code otherwise.
518  */
519 int xe_pm_set_vram_threshold(struct xe_device *xe, u32 threshold)
520 {
521 	struct ttm_resource_manager *man;
522 	u32 vram_total_mb = 0;
523 	int i;
524 
525 	for (i = XE_PL_VRAM0; i <= XE_PL_VRAM1; ++i) {
526 		man = ttm_manager_type(&xe->ttm, i);
527 		if (man)
528 			vram_total_mb += DIV_ROUND_UP_ULL(man->size, 1024 * 1024);
529 	}
530 
531 	drm_dbg(&xe->drm, "Total vram %u mb\n", vram_total_mb);
532 
533 	if (threshold > vram_total_mb)
534 		return -EINVAL;
535 
536 	mutex_lock(&xe->d3cold.lock);
537 	xe->d3cold.vram_threshold = threshold;
538 	mutex_unlock(&xe->d3cold.lock);
539 
540 	return 0;
541 }
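
/*
 * Worked example (illustrative numbers only): on a device with 2048 MiB of
 * VRAM, xe_pm_set_vram_threshold(xe, 300) succeeds and makes
 * xe_pm_d3cold_allowed_toggle() below allow D3Cold only while the total VRAM
 * usage stays under 300 MiB, while xe_pm_set_vram_threshold(xe, 4096) is
 * rejected with -EINVAL because it exceeds the total VRAM size.
 */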
542 
543 /**
544  * xe_pm_d3cold_allowed_toggle - Check conditions to toggle d3cold.allowed
545  * @xe: xe device instance
546  *
547  * To be called during runtime_pm idle callback.
548  * Check for all the D3Cold conditions ahead of runtime suspend.
549  */
550 void xe_pm_d3cold_allowed_toggle(struct xe_device *xe)
551 {
552 	struct ttm_resource_manager *man;
553 	u32 total_vram_used_mb = 0;
554 	u64 vram_used;
555 	int i;
556 
557 	if (!xe->d3cold.capable) {
558 		xe->d3cold.allowed = false;
559 		return;
560 	}
561 
562 	for (i = XE_PL_VRAM0; i <= XE_PL_VRAM1; ++i) {
563 		man = ttm_manager_type(&xe->ttm, i);
564 		if (man) {
565 			vram_used = ttm_resource_manager_usage(man);
566 			total_vram_used_mb += DIV_ROUND_UP_ULL(vram_used, 1024 * 1024);
567 		}
568 	}
569 
570 	mutex_lock(&xe->d3cold.lock);
571 
572 	if (total_vram_used_mb < xe->d3cold.vram_threshold)
573 		xe->d3cold.allowed = true;
574 	else
575 		xe->d3cold.allowed = false;
576 
577 	mutex_unlock(&xe->d3cold.lock);
578 
579 	drm_dbg(&xe->drm,
580 		"d3cold: allowed=%s\n", str_yes_no(xe->d3cold.allowed));
581 }
582
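
/*
 * Illustrative sketch only: the expected caller of xe_pm_d3cold_allowed_toggle()
 * is the runtime_idle path in the PCI glue (xe_pci.c). The callback name and
 * sequence below are assumptions, not a copy of that code; pdev_to_xe_device()
 * is assumed to come from xe_device.h.
 *
 *	static int xe_pci_runtime_idle_sketch(struct device *dev)
 *	{
 *		struct xe_device *xe = pdev_to_xe_device(to_pci_dev(dev));
 *
 *		xe_pm_d3cold_allowed_toggle(xe);
 *
 *		// The caller is then expected to enable or disable D3cold on
 *		// the root port (pci_d3cold_enable()/pci_d3cold_disable())
 *		// based on xe->d3cold.allowed, as described in the DOC
 *		// section at the top of this file.
 *
 *		return 0;
 *	}
 */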