xref: /linux/drivers/gpu/drm/amd/amdgpu/amdgpu_lockdep.c (revision 8c13415c8a4383447c21ec832b20b3b283f0e01a)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright 2024 Advanced Micro Devices, Inc.
4  *
5  * Lockdep annotation for AMDGPU lock ordering
6  *
7  * This module teaches lockdep the correct lock ordering to catch
8  * potential deadlocks at development time rather than runtime.
9  *
10  * Based on dma-resv lockdep approach from:
11  * drivers/dma-buf/dma-resv.c:dma_resv_lockdep()
12  */
13 
14 #include "amdgpu.h"
15 #include "amdgpu_reset.h"
16 
17 #ifdef CONFIG_LOCKDEP
18 
19 /* Lock class keys for associating with real driver locks */
20 static struct lock_class_key amdgpu_userq_sch_mutex_key;
21 static struct lock_class_key amdgpu_userq_mutex_key;
22 static struct lock_class_key amdgpu_notifier_lock_key;
23 static struct lock_class_key amdgpu_vram_lock_key;
24 static struct lock_class_key amdgpu_reset_sem_key;
25 static struct lock_class_key amdgpu_reset_lock_key;
26 static struct lock_class_key amdgpu_srbm_lock_key;
27 static struct lock_class_key amdgpu_grbm_lock_key;
28 static struct lock_class_key amdgpu_mmio_lock_key;
29 
30 /**
31  * amdgpu_lockdep_set_class - Associate lock class keys with real locks
32  * @adev: AMDGPU device
33  *
34  * Call during device init to associate lock classes with actual locks
35  * so lockdep can track them properly.
36  */
37 void amdgpu_lockdep_set_class(struct amdgpu_device *adev)
38 {
39 	lockdep_set_class(&adev->gfx.userq_sch_mutex,
40 			  &amdgpu_userq_sch_mutex_key);
41 	lockdep_set_class(&adev->notifier_lock, &amdgpu_notifier_lock_key);
42 	lockdep_set_class(&adev->srbm_mutex, &amdgpu_srbm_lock_key);
43 	lockdep_set_class(&adev->grbm_idx_mutex, &amdgpu_grbm_lock_key);
44 	lockdep_set_class(&adev->mmio_idx_lock, &amdgpu_mmio_lock_key);
45 
46 	if (adev->reset_domain)
47 		lockdep_set_class(&adev->reset_domain->sem,
48 				  &amdgpu_reset_sem_key);
49 }
50 
51 /**
52  * amdgpu_lockdep_init - Teach lockdep the correct lock ordering
53  *
54  * Instantiates dummy objects and takes locks in the correct order to
55  * train lockdep. This helps catch lock ordering violations during
56  * development.
57  *
58  * Lock ordering hierarchy (outermost to innermost):
59  *
60  * 1. userq_sch_mutex     - Global userq scheduler (enforce_isolation)
61  * 2. userq_mutex         - Per-context userq (held across queue create/destroy)
62  * 3. notifier_lock       - MMU notifier lock
63  * 4. vram_lock           - VRAM allocator lock
64  * 5. reset_domain->sem   - GPU reset synchronization
65  * 6. reset_lock          - Reset control lock
66  * 7. srbm_mutex          - SRBM register access
67  * 8. grbm_idx_mutex      - GRBM index access
68  * 9. mmio_idx_lock       - MMIO index access (spinlock)
69  *
70  * Evidence:
71  * - userq_sch_mutex -> userq_mutex: amdgpu_gfx_kfd_sch_ctrl() calls
72  *   amdgpu_userq_stop_sched_for_enforce_isolation() which takes userq_mutex
73  * - userq_mutex -> notifier_lock: userq paths may trigger MMU notifier
74  *   invalidation which acquires notifier_lock
75  * - notifier_lock -> reset_domain->sem: HMM invalidation callback holds
76  *   notifier_lock and can wait for GPU reset completion, so notifier_lock
77  *   must be outer to reset_domain->sem
78  * - vram_lock -> reset_domain->sem: VRAM management paths may need to
79  *   wait for ongoing reset to complete
80  *
81  * Note: mmap_lock ordering relative to GPU locks is already taught
82  * by dma-resv (drivers/dma-buf/dma-resv.c).
83  */
84 int amdgpu_lockdep_init(void)
85 {
86 	struct amdgpu_reset_domain *reset_domain = NULL;
87 	struct amdgpu_reset_control reset_ctl;
88 	struct mutex userq_sch_mutex;
89 	struct mutex userq_mutex;
90 	struct mutex notifier_lock;
91 	struct mutex vram_lock;
92 	struct mutex srbm_mutex;
93 	struct mutex grbm_idx_mutex;
94 	spinlock_t mmio_idx_lock;
95 	unsigned long flags;
96 
97 	/*
98 	 * Initialize dummy reset domain
99 	 */
100 	reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE,
101 							"lockdep_test");
102 	if (!reset_domain)
103 		return -ENOMEM;
104 
105 	/* Initialize dummy locks */
106 	mutex_init(&userq_sch_mutex);
107 	mutex_init(&userq_mutex);
108 	mutex_init(&notifier_lock);
109 	mutex_init(&vram_lock);
110 	mutex_init(&reset_ctl.reset_lock);
111 	mutex_init(&srbm_mutex);
112 	mutex_init(&grbm_idx_mutex);
113 	spin_lock_init(&mmio_idx_lock);
114 
115 	/*
116 	 * Associate dummy locks with the same class keys used for real
117 	 * driver locks. This ensures lockdep connects the ordering learned
118 	 * here with the actual locks used at runtime.
119 	 */
120 	lockdep_set_class(&userq_sch_mutex, &amdgpu_userq_sch_mutex_key);
121 	lockdep_set_class(&userq_mutex, &amdgpu_userq_mutex_key);
122 	lockdep_set_class(&notifier_lock, &amdgpu_notifier_lock_key);
123 	lockdep_set_class(&vram_lock, &amdgpu_vram_lock_key);
124 	lockdep_set_class(&reset_domain->sem, &amdgpu_reset_sem_key);
125 	lockdep_set_class(&reset_ctl.reset_lock, &amdgpu_reset_lock_key);
126 	lockdep_set_class(&srbm_mutex, &amdgpu_srbm_lock_key);
127 	lockdep_set_class(&grbm_idx_mutex, &amdgpu_grbm_lock_key);
128 	lockdep_set_class(&mmio_idx_lock, &amdgpu_mmio_lock_key);
129 
130 	/*
131 	 * Take locks in the correct order to train lockdep.
132 	 * This establishes the dependency chain.
133 	 */
134 
135 	/* Level 1: Global userq scheduler mutex (outermost) */
136 	mutex_lock(&userq_sch_mutex);
137 
138 	/* Level 2: Per-context userq mutex */
139 	mutex_lock(&userq_mutex);
140 
141 	/* Level 3: MMU notifier lock */
142 	mutex_lock(&notifier_lock);
143 
144 	/* Level 4: VRAM allocator lock */
145 	mutex_lock(&vram_lock);
146 
147 	/* Level 5: Reset domain semaphore */
148 	down_read(&reset_domain->sem);
149 
150 	/* Level 6: Reset control lock */
151 	mutex_lock(&reset_ctl.reset_lock);
152 
153 	/*
154 	 * Mark potential memory reclaim boundary.
155 	 * GPU operations might trigger memory allocation/reclaim.
156 	 */
157 	fs_reclaim_acquire(GFP_KERNEL);
158 
159 	/* Level 7: SRBM register access */
160 	mutex_lock(&srbm_mutex);
161 
162 	/* Level 8: GRBM index access */
163 	mutex_lock(&grbm_idx_mutex);
164 
165 	/* Level 9: MMIO index access (innermost lock, spinlock) */
166 	spin_lock_irqsave(&mmio_idx_lock, flags);
167 
168 	/*
169 	 * All locks acquired in order.
170 	 * Lockdep has now learned the valid dependency chain.
171 	 */
172 
173 	/* Release in reverse order */
174 	spin_unlock_irqrestore(&mmio_idx_lock, flags);
175 	mutex_unlock(&grbm_idx_mutex);
176 	mutex_unlock(&srbm_mutex);
177 
178 	fs_reclaim_release(GFP_KERNEL);
179 
180 	mutex_unlock(&reset_ctl.reset_lock);
181 	up_read(&reset_domain->sem);
182 	mutex_unlock(&vram_lock);
183 	mutex_unlock(&notifier_lock);
184 	mutex_unlock(&userq_mutex);
185 	mutex_unlock(&userq_sch_mutex);
186 
187 	/* Cleanup */
188 	amdgpu_reset_put_reset_domain(reset_domain);
189 
190 	pr_info("AMDGPU: Lockdep annotations initialized (9 lock levels)\n");
191 
192 	return 0;
193 }
194 
195 #endif /* CONFIG_LOCKDEP */
196