xref: /linux/include/asm-generic/rqspinlock.h (revision c2aa3089ad7e7fec3ec4a58d8d0904b5e9b392a1)
1 /* SPDX-License-Identifier: GPL-2.0-or-later */
2 /*
3  * Resilient Queued Spin Lock
4  *
5  * (C) Copyright 2024-2025 Meta Platforms, Inc. and affiliates.
6  *
7  * Authors: Kumar Kartikeya Dwivedi <memxor@gmail.com>
8  */
9 #ifndef __ASM_GENERIC_RQSPINLOCK_H
10 #define __ASM_GENERIC_RQSPINLOCK_H
11 
12 #include <linux/types.h>
13 #include <vdso/time64.h>
14 #include <linux/percpu.h>
15 #ifdef CONFIG_QUEUED_SPINLOCKS
16 #include <asm/qspinlock.h>
17 #endif
18 
/*
 * Lock word backing rqspinlock_t when CONFIG_QUEUED_SPINLOCKS is disabled.
 *
 * The anonymous union offers two views of the same 4-byte word: 'val' for
 * atomic operations, and 'locked' for plain u32 access (res_spin_unlock()
 * releases the lock with a release store of 0 to 'locked').
 */
struct rqspinlock {
	union {
		atomic_t val;
		u32 locked;
	};
};
25 
/*
 * Even though this mirrors struct rqspinlock (a single 32-bit lock word), we
 * need to emit a distinct type in BTF for BPF programs.
 */
struct bpf_res_spin_lock {
	u32 val;
};
32 
/*
 * rqspinlock_t is the lock type accepted by the res_spin_lock() /
 * res_spin_unlock() API below. It aliases struct qspinlock when queued
 * spinlocks are configured, and struct rqspinlock otherwise.
 */
struct qspinlock;
#ifdef CONFIG_QUEUED_SPINLOCKS
typedef struct qspinlock rqspinlock_t;
#else
typedef struct rqspinlock rqspinlock_t;
#endif
39 
/*
 * Out-of-line acquisition routines; only declared here.
 *
 * resilient_tas_spin_lock() backs res_spin_lock() when
 * CONFIG_QUEUED_SPINLOCKS is disabled.
 *
 * resilient_queued_spin_lock_slowpath() is called by res_spin_lock() when
 * its fast-path cmpxchg fails; @val is the lock word value observed by that
 * failed cmpxchg.
 *
 * In both cases the int result is handed straight back to the caller of
 * res_spin_lock().
 */
extern int resilient_tas_spin_lock(rqspinlock_t *lock);
#ifdef CONFIG_QUEUED_SPINLOCKS
extern int resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val);
#endif
44 
#ifndef resilient_virt_spin_lock_enabled
/*
 * Generic fallback, compiled only when no macro named
 * resilient_virt_spin_lock_enabled is already defined at this point
 * (presumably by an architecture header -- confirm against the arch code).
 *
 * Always reports that the virt spin lock path is not enabled.
 */
static __always_inline bool resilient_virt_spin_lock_enabled(void)
{
	return false;
}
#endif
51 
#ifndef resilient_virt_spin_lock
/*
 * Generic fallback, compiled only when no macro named
 * resilient_virt_spin_lock is already defined at this point.
 *
 * Does not touch @lock and unconditionally returns 0.
 */
static __always_inline int resilient_virt_spin_lock(rqspinlock_t *lock)
{
	return 0;
}
#endif
58 
/*
 * Default timeout for waiting loops: 0.25 seconds, expressed in nanoseconds
 * (one quarter of NSEC_PER_SEC).
 */
#define RES_DEF_TIMEOUT (NSEC_PER_SEC / 4)
63 
/*
 * Number of slots in the per-CPU held-locks table (rqspinlock_held.locks[]).
 *
 * Choose 31 as it makes struct rqspinlock_held cacheline-aligned.
 * NOTE(review): presumably 'cnt' plus 31 pointer slots pads out to 32
 * pointer-sized words on 64-bit -- confirm for other configurations.
 */
#define RES_NR_HELD 31
68 
/*
 * Table of locks being acquired or held on one CPU.
 *
 * @cnt:   Number of entries pushed so far. It may exceed RES_NR_HELD; the
 *         excess acquisitions are counted (so that they can be decremented
 *         later) but are not recorded in @locks.
 * @locks: Lock pointers in acquisition order. Slot cnt - 1 is the most
 *         recent entry and is reset to NULL when that entry is released.
 */
struct rqspinlock_held {
	int cnt;
	void *locks[RES_NR_HELD];
};
73 
/* One held-locks table per CPU; this is only the declaration. */
DECLARE_PER_CPU_ALIGNED(struct rqspinlock_held, rqspinlock_held_locks);
75 
/*
 * grab_held_lock_entry - record @lock in this CPU's held-locks table
 * @lock: Lock being acquired, stored as an opaque pointer.
 *
 * Increments the per-CPU entry count and stores @lock in the slot that the
 * increment just claimed. When the table is already full, the count is still
 * incremented but nothing is stored.
 */
static __always_inline void grab_held_lock_entry(void *lock)
{
	int cnt = this_cpu_inc_return(rqspinlock_held_locks.cnt);

	if (unlikely(cnt > RES_NR_HELD)) {
		/* Still keep the inc so we decrement later. */
		return;
	}

	/*
	 * Implied compiler barrier in per-CPU operations; otherwise we can have
	 * the compiler reorder inc with write to table, allowing interrupts to
	 * overwrite and erase our write to the table (as on interrupt exit it
	 * will be reset to NULL).
	 *
	 * It is fine for cnt inc to be reordered wrt remote readers though,
	 * they won't observe our entry until the cnt update is visible, that's
	 * all.
	 */
	this_cpu_write(rqspinlock_held_locks.locks[cnt - 1], lock);
}
97 
/*
 * release_held_lock_entry - pop the most recent entry of this CPU's
 * held-locks table
 *
 * Clears the top slot (unless the count has overflowed the table, in which
 * case there is no slot to clear), issues a write barrier and then decrements
 * the per-CPU count.
 *
 * We simply don't support out-of-order unlocks, and keep the logic simple here.
 * The verifier prevents BPF programs from unlocking out-of-order, and the same
 * holds for in-kernel users.
 *
 * It is possible to run into misdetection scenarios of AA deadlocks on the same
 * CPU, and missed ABBA deadlocks on remote CPUs if this function pops entries
 * out of order (due to a lock A, lock B, unlock A, unlock B pattern). The
 * correct logic to preserve the right entries in the table would be to walk
 * the array of held locks and swap and clear out-of-order entries, but that's
 * too complicated and we don't have a compelling use case for out-of-order
 * unlocking.
 */
static __always_inline void release_held_lock_entry(void)
{
	struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks);

	/* An overflowed count means this entry was never written; only dec. */
	if (unlikely(rqh->cnt > RES_NR_HELD))
		goto dec;
	WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL);
dec:
	/*
	 * Reordering of clearing above with inc and its write in
	 * grab_held_lock_entry that came before us (in same acquisition
	 * attempt) is ok, we either see a valid entry or NULL when it's
	 * visible.
	 *
	 * But this helper is invoked when we unwind upon failing to acquire the
	 * lock. Unlike the unlock path which constitutes a release store after
	 * we clear the entry, we need to emit a write barrier here. Otherwise,
	 * we may have a situation as follows:
	 *
	 * <error> for lock B
	 * release_held_lock_entry
	 *
	 * try_cmpxchg_acquire for lock A
	 * grab_held_lock_entry
	 *
	 * Lack of any ordering means reordering may occur such that dec, inc
	 * are done before entry is overwritten. This permits a remote lock
	 * holder of lock B (which this CPU failed to acquire) to now observe it
	 * as being attempted on this CPU, and may lead to misdetection (if this
	 * CPU holds a lock it is attempting to acquire, leading to false ABBA
	 * diagnosis).
	 *
	 * In case of unlock, we will always do a release on the lock word after
	 * releasing the entry, ensuring that other CPUs cannot hold the lock
	 * (and make conclusions about deadlocks) until the entry has been
	 * cleared on the local CPU, preventing any anomalies. Reordering is
	 * still possible there, but a remote CPU cannot observe a lock in our
	 * table which it is already holding, since visibility entails our
	 * release store for the said lock has not retired.
	 *
	 * In theory we don't have a problem if the dec and WRITE_ONCE above get
	 * reordered with each other, we either notice an empty NULL entry on
	 * top (if dec follows WRITE_ONCE), or a potentially stale entry which
	 * cannot be observed (if dec precedes WRITE_ONCE).
	 *
	 * Emit the write barrier _before_ the dec, this permits dec-inc
	 * reordering but that is harmless as we'd have new entry set to NULL
	 * already, i.e. they cannot precede the NULL store above.
	 */
	smp_wmb();
	this_cpu_dec(rqspinlock_held_locks.cnt);
}
162 
163 #ifdef CONFIG_QUEUED_SPINLOCKS
164 
/**
 * res_spin_lock - acquire a resilient queued spinlock
 * @lock: Pointer to the resilient queued spinlock structure
 *
 * Fast path: a single acquire cmpxchg of the lock word from 0 to
 * _Q_LOCKED_VAL. On success the lock is recorded in this CPU's held-locks
 * table. Otherwise the slow path is entered with the lock word value that
 * the failed cmpxchg observed, and its result is returned unchanged.
 *
 * Return:
 * * 0		- Lock was acquired successfully.
 * * -EDEADLK	- Lock acquisition failed because of AA/ABBA deadlock.
 * * -ETIMEDOUT - Lock acquisition failed because of timeout.
 */
static __always_inline int res_spin_lock(rqspinlock_t *lock)
{
	int val = 0;

	/*
	 * NOTE(review): the held-locks entry is written only after the cmpxchg
	 * has succeeded, so anything interrupting this CPU between the two
	 * steps will not find @lock in the table -- confirm that reentrant
	 * acquisition attempts in that window are handled acceptably.
	 */
	if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL))) {
		grab_held_lock_entry(lock);
		return 0;
	}
	/* On failure the cmpxchg has updated 'val' to the observed lock word. */
	return resilient_queued_spin_lock_slowpath(lock, val);
}
184 
185 #else
186 
/*
 * Without queued spinlocks there is no inline fast path: every acquisition
 * goes through the test-and-set implementation.
 */
#define res_spin_lock(lock) resilient_tas_spin_lock(lock)
188 
189 #endif /* CONFIG_QUEUED_SPINLOCKS */
190 
/**
 * res_spin_unlock - release a resilient spinlock
 * @lock: Pointer to the lock to release
 *
 * Clears the top entry of this CPU's held-locks table (skipped when the
 * count has overflowed the table), releases the lock word with a release
 * store of 0, and finally decrements the per-CPU count. Entries are popped
 * strictly in LIFO order; see release_held_lock_entry().
 */
static __always_inline void res_spin_unlock(rqspinlock_t *lock)
{
	struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks);

	/*
	 * NOTE(review): the table entry is cleared before the lock word is
	 * released, so code interrupting this CPU between the two steps sees
	 * the lock as held but absent from the table -- confirm this window
	 * is acceptable.
	 */
	if (unlikely(rqh->cnt > RES_NR_HELD))
		goto unlock;
	WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL);
unlock:
	/*
	 * Release barrier, ensures correct ordering. See release_held_lock_entry
	 * for details.  Perform release store instead of queued_spin_unlock,
	 * since we use this function for test-and-set fallback as well. When we
	 * have CONFIG_QUEUED_SPINLOCKS=n, we clear the full 4-byte lockword.
	 *
	 * Like release_held_lock_entry, we can do the release before the dec.
	 * We simply care about not seeing the 'lock' in our table from a remote
	 * CPU once the lock has been released, which doesn't rely on the dec.
	 *
	 * Unlike smp_wmb(), release is not a two way fence, hence it is
	 * possible for an inc to move up and reorder with our clearing of the
	 * entry. This isn't a problem however, as for a misdiagnosis of ABBA,
	 * the remote CPU needs to hold this lock, which won't be released until
	 * the store below is done, which would ensure the entry is overwritten
	 * to NULL, etc.
	 */
	smp_store_release(&lock->locked, 0);
	this_cpu_dec(rqspinlock_held_locks.cnt);
}
219 
/*
 * raw_res_spin_lock_init - put @lock into its initial state
 *
 * Assigns __ARCH_SPIN_LOCK_UNLOCKED when rqspinlock_t is a queued spinlock,
 * and an all-zero struct rqspinlock otherwise.
 */
#ifdef CONFIG_QUEUED_SPINLOCKS
#define raw_res_spin_lock_init(lock) ({ *(lock) = (rqspinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; })
#else
#define raw_res_spin_lock_init(lock) ({ *(lock) = (rqspinlock_t){0}; })
#endif
225 
/*
 * raw_res_spin_lock - acquire @lock with preemption disabled
 *
 * Evaluates to the result of res_spin_lock(). Preemption is disabled before
 * the attempt and stays disabled only if the result is 0; on any non-zero
 * result it is re-enabled before the value is handed back.
 */
#define raw_res_spin_lock(lock)				\
	({						\
		int __lock_err;				\
							\
		preempt_disable();			\
		__lock_err = res_spin_lock(lock);	\
		if (__lock_err != 0)			\
			preempt_enable();		\
		__lock_err;				\
	})
235 
/*
 * raw_res_spin_unlock - release @lock, then re-enable preemption
 *
 * Counterpart of a successful raw_res_spin_lock().
 */
#define raw_res_spin_unlock(lock)			\
	({						\
		res_spin_unlock(lock);			\
		preempt_enable();			\
	})
237 
/*
 * raw_res_spin_lock_irqsave - acquire @lock with local interrupts saved
 *
 * Saves the interrupt state into @flags, then attempts raw_res_spin_lock().
 * Evaluates to that attempt's result; on any non-zero result the saved
 * interrupt state is restored before the value is handed back.
 */
#define raw_res_spin_lock_irqsave(lock, flags)		\
	({						\
		int __irq_err;				\
							\
		local_irq_save(flags);			\
		__irq_err = raw_res_spin_lock(lock);	\
		if (__irq_err != 0)			\
			local_irq_restore(flags);	\
		__irq_err;				\
	})
247 
/*
 * raw_res_spin_unlock_irqrestore - release @lock, then restore the interrupt
 * state saved in @flags
 *
 * Counterpart of a successful raw_res_spin_lock_irqsave().
 */
#define raw_res_spin_unlock_irqrestore(lock, flags)	\
	({						\
		raw_res_spin_unlock(lock);		\
		local_irq_restore(flags);		\
	})
249 
250 #endif /* __ASM_GENERIC_RQSPINLOCK_H */
251