xref: /linux/drivers/gpu/drm/amd/amdgpu/amdgpu_lockdep.c (revision fa6fe449343c3d97ed93fd01b020860c663f8807)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright 2024 Advanced Micro Devices, Inc.
4  *
5  * Lockdep annotation for AMDGPU lock ordering
6  *
7  * This module teaches lockdep the correct lock ordering to catch
8  * potential deadlocks at development time rather than runtime.
9  *
10  * Based on dma-resv lockdep approach from:
11  * drivers/dma-buf/dma-resv.c:dma_resv_lockdep()
12  */
13 
14 #include "amdgpu.h"
15 #include "amdgpu_reset.h"
16 
17 #ifdef CONFIG_LOCKDEP
18 
19 struct amdgpu_lockdep_dummy_locks {
20 	struct mutex reset_lock;
21 	struct mutex userq_sch_mutex;
22 	struct mutex userq_mutex;
23 	struct mutex notifier_lock;
24 	struct mutex vram_lock;
25 	struct mutex srbm_mutex;
26 	struct mutex grbm_idx_mutex;
27 	spinlock_t mmio_idx_lock;
28 };
29 
30 /* Lock class keys for associating with real driver locks */
31 static struct lock_class_key amdgpu_userq_sch_mutex_key;
32 static struct lock_class_key amdgpu_userq_mutex_key;
33 static struct lock_class_key amdgpu_notifier_lock_key;
34 static struct lock_class_key amdgpu_vram_lock_key;
35 static struct lock_class_key amdgpu_reset_sem_key;
36 static struct lock_class_key amdgpu_reset_lock_key;
37 static struct lock_class_key amdgpu_srbm_lock_key;
38 static struct lock_class_key amdgpu_grbm_lock_key;
39 static struct lock_class_key amdgpu_mmio_lock_key;
40 
41 /**
42  * amdgpu_lockdep_set_class - Associate lock class keys with real locks
43  * @adev: AMDGPU device
44  *
45  * Call during device init to associate lock classes with actual locks
46  * so lockdep can track them properly.
47  */
48 void amdgpu_lockdep_set_class(struct amdgpu_device *adev)
49 {
50 	lockdep_set_class(&adev->gfx.userq_sch_mutex,
51 			  &amdgpu_userq_sch_mutex_key);
52 	lockdep_set_class(&adev->notifier_lock, &amdgpu_notifier_lock_key);
53 	lockdep_set_class(&adev->srbm_mutex, &amdgpu_srbm_lock_key);
54 	lockdep_set_class(&adev->grbm_idx_mutex, &amdgpu_grbm_lock_key);
55 	lockdep_set_class(&adev->mmio_idx_lock, &amdgpu_mmio_lock_key);
56 
57 	if (adev->reset_domain)
58 		lockdep_set_class(&adev->reset_domain->sem,
59 				  &amdgpu_reset_sem_key);
60 }
61 
62 /**
63  * amdgpu_lockdep_init - Teach lockdep the correct lock ordering
64  *
65  * Instantiates dummy objects and takes locks in the correct order to
66  * train lockdep. This helps catch lock ordering violations during
67  * development.
68  *
69  * Lock ordering hierarchy (outermost to innermost):
70  *
71  * 1. userq_sch_mutex     - Global userq scheduler (enforce_isolation)
72  * 2. userq_mutex         - Per-context userq (held across queue create/destroy)
73  * 3. notifier_lock       - MMU notifier lock
74  * 4. vram_lock           - VRAM allocator lock
75  * 5. reset_domain->sem   - GPU reset synchronization
76  * 6. reset_lock          - Reset control lock
77  * 7. srbm_mutex          - SRBM register access
78  * 8. grbm_idx_mutex      - GRBM index access
79  * 9. mmio_idx_lock       - MMIO index access (spinlock)
80  *
81  * Evidence:
82  * - userq_sch_mutex -> userq_mutex: amdgpu_gfx_kfd_sch_ctrl() calls
83  *   amdgpu_userq_stop_sched_for_enforce_isolation() which takes userq_mutex
84  * - userq_mutex -> notifier_lock: userq paths may trigger MMU notifier
85  *   invalidation which acquires notifier_lock
86  * - notifier_lock -> reset_domain->sem: HMM invalidation callback holds
87  *   notifier_lock and can wait for GPU reset completion, so notifier_lock
88  *   must be outer to reset_domain->sem
89  * - vram_lock -> reset_domain->sem: VRAM management paths may need to
90  *   wait for ongoing reset to complete
91  *
92  * Note: mmap_lock ordering relative to GPU locks is already taught
93  * by dma-resv (drivers/dma-buf/dma-resv.c).
94  */
95 int amdgpu_lockdep_init(void)
96 {
97 	struct amdgpu_reset_domain *reset_domain = NULL;
98 	struct amdgpu_lockdep_dummy_locks *locks;
99 	unsigned long flags;
100 
101 	locks = kzalloc(sizeof(*locks), GFP_KERNEL);
102 	if (!locks)
103 		return -ENOMEM;
104 
105 	/*
106 	 * Initialize dummy reset domain
107 	 */
108 	reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE,
109 							"lockdep_test");
110 	if (!reset_domain) {
111 		kfree(locks);
112 		return -ENOMEM;
113 	}
114 	/* Initialize dummy locks */
115 	mutex_init(&locks->userq_sch_mutex);
116 	mutex_init(&locks->userq_mutex);
117 	mutex_init(&locks->notifier_lock);
118 	mutex_init(&locks->vram_lock);
119 	mutex_init(&locks->reset_lock);
120 	mutex_init(&locks->srbm_mutex);
121 	mutex_init(&locks->grbm_idx_mutex);
122 	spin_lock_init(&locks->mmio_idx_lock);
123 
124 	/*
125 	 * Associate dummy locks with the same class keys used for real
126 	 * driver locks. This ensures lockdep connects the ordering learned
127 	 * here with the actual locks used at runtime.
128 	 */
129 	lockdep_set_class(&locks->userq_sch_mutex, &amdgpu_userq_sch_mutex_key);
130 	lockdep_set_class(&locks->userq_mutex, &amdgpu_userq_mutex_key);
131 	lockdep_set_class(&locks->notifier_lock, &amdgpu_notifier_lock_key);
132 	lockdep_set_class(&locks->vram_lock, &amdgpu_vram_lock_key);
133 	lockdep_set_class(&reset_domain->sem, &amdgpu_reset_sem_key);
134 	lockdep_set_class(&locks->reset_lock, &amdgpu_reset_lock_key);
135 	lockdep_set_class(&locks->srbm_mutex, &amdgpu_srbm_lock_key);
136 	lockdep_set_class(&locks->grbm_idx_mutex, &amdgpu_grbm_lock_key);
137 	lockdep_set_class(&locks->mmio_idx_lock, &amdgpu_mmio_lock_key);
138 	/*
139 	 * Take locks in the correct order to train lockdep.
140 	 * This establishes the dependency chain.
141 	 */
142 
143 	/* Level 1: Global userq scheduler mutex (outermost) */
144 	mutex_lock(&locks->userq_sch_mutex);
145 
146 	/* Level 2: Per-context userq mutex */
147 	mutex_lock(&locks->userq_mutex);
148 	/* Level 3: MMU notifier lock */
149 	mutex_lock(&locks->notifier_lock);
150 	/* Level 4: VRAM allocator lock */
151 	mutex_lock(&locks->vram_lock);
152 	/* Level 5: Reset domain semaphore */
153 	down_read(&reset_domain->sem);
154 
155 	/* Level 6: Reset control lock */
156 	mutex_lock(&locks->reset_lock);
157 	/*
158 	 * Mark potential memory reclaim boundary.
159 	 * GPU operations might trigger memory allocation/reclaim.
160 	 */
161 	fs_reclaim_acquire(GFP_KERNEL);
162 
163 	/* Level 7: SRBM register access */
164 	mutex_lock(&locks->srbm_mutex);
165 	/* Level 8: GRBM index access */
166 	mutex_lock(&locks->grbm_idx_mutex);
167 
168 	/* Level 9: MMIO index access (innermost lock, spinlock) */
169 	spin_lock_irqsave(&locks->mmio_idx_lock, flags);
170 	/*
171 	 * All locks acquired in order.
172 	 * Lockdep has now learned the valid dependency chain.
173 	 */
174 
175 	/* Release in reverse order */
176 	spin_unlock_irqrestore(&locks->mmio_idx_lock, flags);
177 	mutex_unlock(&locks->grbm_idx_mutex);
178 	mutex_unlock(&locks->srbm_mutex);
179 	fs_reclaim_release(GFP_KERNEL);
180 
181 	mutex_unlock(&locks->reset_lock);
182 	up_read(&reset_domain->sem);
183 
184 	mutex_unlock(&locks->vram_lock);
185 	mutex_unlock(&locks->notifier_lock);
186 	mutex_unlock(&locks->userq_mutex);
187 	mutex_unlock(&locks->userq_sch_mutex);
188 
189 	/* Cleanup */
190 	amdgpu_reset_put_reset_domain(reset_domain);
191 
192 	kfree(locks);
193 	pr_info("AMDGPU: Lockdep annotations initialized (9 lock levels)\n");
194 
195 	return 0;
196 }
197 
198 #endif /* CONFIG_LOCKDEP */
199