xref: /linux/tools/testing/selftests/bpf/progs/bpf_arena_spin_lock.h (revision fb7399cf2d0b33825b8039f95c45395c7deba25c)
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
#ifndef BPF_ARENA_SPIN_LOCK_H
#define BPF_ARENA_SPIN_LOCK_H

#include <vmlinux.h>
#include <bpf/bpf_helpers.h>
#include "bpf_atomic.h"

#define arch_mcs_spin_lock_contended_label(l, label) smp_cond_load_acquire_label(l, VAL, label)
#define arch_mcs_spin_unlock_contended(l) smp_store_release((l), 1)

#if defined(ENABLE_ATOMICS_TESTS) && defined(__BPF_FEATURE_ADDR_SPACE_CAST)

#define EBUSY 16
#define EOPNOTSUPP 95
#define ETIMEDOUT 110

#ifndef __arena
#define __arena __attribute__((address_space(1)))
#endif

extern unsigned long CONFIG_NR_CPUS __kconfig;

/*
 * Typically, we'd just rely on the definition in vmlinux.h for qspinlock, but
 * PowerPC overrides the definition to define lock->val as u32 instead of
 * atomic_t, leading to compilation errors.  Import a local definition below so
 * that we don't depend on the vmlinux.h version.
 */

struct __qspinlock {
	union {
		atomic_t val;
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
		struct {
			u8 locked;
			u8 pending;
		};
		struct {
			u16 locked_pending;
			u16 tail;
		};
#else
		struct {
			u16 tail;
			u16 locked_pending;
		};
		struct {
			u8 reserved[2];
			u8 pending;
			u8 locked;
		};
#endif
	};
};

#define arena_spinlock_t struct __qspinlock
/* FIXME: Using typedef causes CO-RE relocation error */
/* typedef struct qspinlock arena_spinlock_t; */

struct arena_mcs_spinlock {
	struct arena_mcs_spinlock __arena *next;
	int locked;
	int count;
};

struct arena_qnode {
	struct arena_mcs_spinlock mcs;
};

#define _Q_MAX_NODES		4
#define _Q_PENDING_LOOPS	1

/*
 * Bitfields in the atomic value:
 *
 *  0- 7: locked byte
 *     8: pending
 *  9-15: not used
 * 16-17: tail index
 * 18-31: tail cpu (+1)
 */
#define _Q_MAX_CPUS		1024

#define	_Q_SET_MASK(type)	(((1U << _Q_ ## type ## _BITS) - 1)\
				      << _Q_ ## type ## _OFFSET)
#define _Q_LOCKED_OFFSET	0
#define _Q_LOCKED_BITS		8
#define _Q_LOCKED_MASK		_Q_SET_MASK(LOCKED)

#define _Q_PENDING_OFFSET	(_Q_LOCKED_OFFSET + _Q_LOCKED_BITS)
#define _Q_PENDING_BITS		8
#define _Q_PENDING_MASK		_Q_SET_MASK(PENDING)

#define _Q_TAIL_IDX_OFFSET	(_Q_PENDING_OFFSET + _Q_PENDING_BITS)
#define _Q_TAIL_IDX_BITS	2
#define _Q_TAIL_IDX_MASK	_Q_SET_MASK(TAIL_IDX)

#define _Q_TAIL_CPU_OFFSET	(_Q_TAIL_IDX_OFFSET + _Q_TAIL_IDX_BITS)
#define _Q_TAIL_CPU_BITS	(32 - _Q_TAIL_CPU_OFFSET)
#define _Q_TAIL_CPU_MASK	_Q_SET_MASK(TAIL_CPU)

#define _Q_TAIL_OFFSET		_Q_TAIL_IDX_OFFSET
#define _Q_TAIL_MASK		(_Q_TAIL_IDX_MASK | _Q_TAIL_CPU_MASK)

#define _Q_LOCKED_VAL		(1U << _Q_LOCKED_OFFSET)
#define _Q_PENDING_VAL		(1U << _Q_PENDING_OFFSET)

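/*
 * MCS queue nodes, one set of _Q_MAX_NODES per CPU (one per nesting level),
 * backed by arena memory. The array is statically sized to _Q_MAX_CPUS,
 * which is why arena_spin_lock() below bails out on systems with more CPUs.
 */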
struct arena_qnode __arena qnodes[_Q_MAX_CPUS][_Q_MAX_NODES];

static inline u32 encode_tail(int cpu, int idx)
{
	u32 tail;

	tail  = (cpu + 1) << _Q_TAIL_CPU_OFFSET;
	tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */

	return tail;
}

static inline struct arena_mcs_spinlock __arena *decode_tail(u32 tail)
{
	u32 cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1;
	u32 idx = (tail &  _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET;

	return &qnodes[cpu][idx].mcs;
}
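
/*
 * Worked example of the tail encoding above (illustrative only): with
 * _Q_TAIL_IDX_OFFSET == 16 and _Q_TAIL_CPU_OFFSET == 18, encode_tail(2, 1)
 * yields ((2 + 1) << 18) | (1 << 16) == 0xd0000, and decode_tail(0xd0000)
 * recovers &qnodes[2][1].mcs.
 */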

static inline
struct arena_mcs_spinlock __arena *grab_mcs_node(struct arena_mcs_spinlock __arena *base, int idx)
{
	return &((struct arena_qnode __arena *)base + idx)->mcs;
}

#define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK)

/**
 * xchg_tail - Put in the new queue tail code word & retrieve previous one
 * @lock : Pointer to queued spinlock structure
 * @tail : The new queue tail code word
 * Return: The previous queue tail code word
 *
 * xchg(lock, tail)
 *
 * p,*,* -> n,*,* ; prev = xchg(lock, node)
 */
static __always_inline u32 xchg_tail(arena_spinlock_t __arena *lock, u32 tail)
{
	u32 old, new;

	old = atomic_read(&lock->val);
	do {
		new = (old & _Q_LOCKED_PENDING_MASK) | tail;
		/*
		 * We can use relaxed semantics since the caller ensures that
		 * the MCS node is properly initialized before updating the
		 * tail.
		 */
		/* These loops are not expected to stall, but we still need to
		 * prove to the verifier they will terminate eventually.
		 */
		cond_break_label(out);
	} while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new));

	return old;
out:
	bpf_printk("RUNTIME ERROR: %s unexpected cond_break exit!!!", __func__);
	return old;
}

/**
 * clear_pending - clear the pending bit.
 * @lock: Pointer to queued spinlock structure
 *
 * *,1,* -> *,0,*
 */
static __always_inline void clear_pending(arena_spinlock_t __arena *lock)
{
	WRITE_ONCE(lock->pending, 0);
}

/**
 * clear_pending_set_locked - take ownership and clear the pending bit.
 * @lock: Pointer to queued spinlock structure
 *
 * *,1,0 -> *,0,1
 *
 * Lock stealing is not allowed if this function is used.
 */
static __always_inline void clear_pending_set_locked(arena_spinlock_t __arena *lock)
{
	WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL);
}

/**
 * set_locked - Set the lock bit and own the lock
 * @lock: Pointer to queued spinlock structure
 *
 * *,*,0 -> *,0,1
 */
static __always_inline void set_locked(arena_spinlock_t __arena *lock)
{
	WRITE_ONCE(lock->locked, _Q_LOCKED_VAL);
}

static __always_inline
u32 arena_fetch_set_pending_acquire(arena_spinlock_t __arena *lock)
{
	u32 old, new;

	old = atomic_read(&lock->val);
	do {
		new = old | _Q_PENDING_VAL;
		/*
		 * These loops are not expected to stall, but we still need to
		 * prove to the verifier they will terminate eventually.
		 */
		cond_break_label(out);
	} while (!atomic_try_cmpxchg_acquire(&lock->val, &old, new));

	return old;
out:
	bpf_printk("RUNTIME ERROR: %s unexpected cond_break exit!!!", __func__);
	return old;
}

/**
 * arena_spin_trylock - try to acquire the queued spinlock
 * @lock : Pointer to queued spinlock structure
 * Return: 1 if lock acquired, 0 if failed
 */
static __always_inline int arena_spin_trylock(arena_spinlock_t __arena *lock)
{
	int val = atomic_read(&lock->val);

	if (unlikely(val))
		return 0;

	return likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL));
}

__noinline
int arena_spin_lock_slowpath(arena_spinlock_t __arena __arg_arena *lock, u32 val)
{
	struct arena_mcs_spinlock __arena *prev, *next, *node0, *node;
	int ret = -ETIMEDOUT;
	u32 old, tail;
	int idx;

	/*
	 * Wait for in-progress pending->locked hand-overs with a bounded
	 * number of spins so that we guarantee forward progress.
	 *
	 * 0,1,0 -> 0,0,1
	 */
	if (val == _Q_PENDING_VAL) {
		int cnt = _Q_PENDING_LOOPS;
		val = atomic_cond_read_relaxed_label(&lock->val,
						     (VAL != _Q_PENDING_VAL) || !cnt--,
						     release_err);
	}

	/*
	 * If we observe any contention, queue.
	 */
	if (val & ~_Q_LOCKED_MASK)
		goto queue;

	/*
	 * trylock || pending
	 *
	 * 0,0,* -> 0,1,* -> 0,0,1 pending, trylock
	 */
	val = arena_fetch_set_pending_acquire(lock);

	/*
	 * If we observe contention, there is a concurrent locker.
	 *
	 * Undo and queue; our setting of PENDING might have made the
	 * n,0,0 -> 0,0,0 transition fail and it will now be waiting
	 * on @next to become !NULL.
	 */
	if (unlikely(val & ~_Q_LOCKED_MASK)) {

		/* Undo PENDING if we set it. */
		if (!(val & _Q_PENDING_MASK))
			clear_pending(lock);

		goto queue;
	}

	/*
	 * We're pending, wait for the owner to go away.
	 *
	 * 0,1,1 -> *,1,0
	 *
	 * this wait loop must be a load-acquire such that we match the
	 * store-release that clears the locked bit and create lock
	 * sequentiality; this is because not all
	 * clear_pending_set_locked() implementations imply full
	 * barriers.
	 */
	if (val & _Q_LOCKED_MASK)
		smp_cond_load_acquire_label(&lock->locked, !VAL, release_err);

	/*
	 * take ownership and clear the pending bit.
	 *
	 * 0,1,0 -> 0,0,1
	 */
	clear_pending_set_locked(lock);
	return 0;

	/*
	 * End of pending bit optimistic spinning and beginning of MCS
	 * queuing.
	 */
queue:
	node0 = &(qnodes[bpf_get_smp_processor_id()])[0].mcs;
	idx = node0->count++;
	tail = encode_tail(bpf_get_smp_processor_id(), idx);

	/*
	 * 4 nodes are allocated based on the assumption that there will not be
	 * nested NMIs taking spinlocks. That may not be true in some
	 * architectures even though the chance of needing more than 4 nodes
	 * will still be extremely unlikely. When that happens, we simply return
	 * an error. Original qspinlock has a trylock fallback in this case.
	 */
	if (unlikely(idx >= _Q_MAX_NODES)) {
		ret = -EBUSY;
		goto release_node_err;
	}

	node = grab_mcs_node(node0, idx);

	/*
	 * Ensure that we increment the head node->count before initialising
	 * the actual node. If the compiler is kind enough to reorder these
	 * stores, then an IRQ could overwrite our assignments.
	 */
	barrier();

	node->locked = 0;
	node->next = NULL;

	/*
	 * We touched a (possibly) cold cacheline in the per-cpu queue node;
	 * attempt the trylock once more in the hope someone let go while we
	 * weren't watching.
	 */
	if (arena_spin_trylock(lock))
		goto release;

	/*
	 * Ensure that the initialisation of @node is complete before we
	 * publish the updated tail via xchg_tail() and potentially link
	 * @node into the waitqueue via WRITE_ONCE(prev->next, node) below.
	 */
	smp_wmb();

	/*
	 * Publish the updated tail.
	 * We have already touched the queueing cacheline; don't bother with
	 * pending stuff.
	 *
	 * p,*,* -> n,*,*
	 */
	old = xchg_tail(lock, tail);
	next = NULL;

	/*
	 * If there was a previous node, link it and wait until we reach the
	 * head of the waitqueue.
	 */
	if (old & _Q_TAIL_MASK) {
		prev = decode_tail(old);

		/* Link @node into the waitqueue. */
		WRITE_ONCE(prev->next, node);

		arch_mcs_spin_lock_contended_label(&node->locked, release_node_err);

		/*
		 * While waiting for the MCS lock, the next pointer may have
		 * been set by another lock waiter. We cannot prefetch here
		 * due to lack of equivalent instruction in BPF ISA.
		 */
		next = READ_ONCE(node->next);
	}

	/*
	 * we're at the head of the waitqueue, wait for the owner & pending to
	 * go away.
	 *
	 * *,x,y -> *,0,0
	 *
	 * this wait loop must use a load-acquire such that we match the
	 * store-release that clears the locked bit and create lock
	 * sequentiality; this is because the set_locked() function below
	 * does not imply a full barrier.
	 */
	val = atomic_cond_read_acquire_label(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK),
					     release_node_err);

	/*
	 * claim the lock:
	 *
	 * n,0,0 -> 0,0,1 : lock, uncontended
	 * *,*,0 -> *,*,1 : lock, contended
	 *
	 * If the queue head is the only one in the queue (lock value == tail)
	 * and nobody is pending, clear the tail code and grab the lock.
	 * Otherwise, we only need to grab the lock.
	 */

	/*
	 * In the PV case we might already have _Q_LOCKED_VAL set, because
	 * of lock stealing; therefore we must also allow:
	 *
	 * n,0,1 -> 0,0,1
	 *
	 * Note: at this point: (val & _Q_PENDING_MASK) == 0, because of the
	 *       above wait condition, therefore any concurrent setting of
	 *       PENDING will make the uncontended transition fail.
	 */
	if ((val & _Q_TAIL_MASK) == tail) {
		if (atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL))
			goto release; /* No contention */
	}

	/*
	 * Either somebody is queued behind us or _Q_PENDING_VAL got set
	 * which will then detect the remaining tail and queue behind us
	 * ensuring we'll see a @next.
	 */
	set_locked(lock);

	/*
	 * contended path; wait for next if not observed yet, release.
	 */
	if (!next)
		next = smp_cond_load_relaxed_label(&node->next, (VAL), release_node_err);

	arch_mcs_spin_unlock_contended(&next->locked);

release:;
	/*
	 * release the node
	 *
	 * Doing a normal decrement instead of this_cpu_dec is fine. An outer
	 * context always decrements the count it incremented before
	 * returning, so we're fine. Contexts interrupting us either observe
	 * our decrement or they don't. Just ensure the compiler doesn't
	 * reorder this statement, as this_cpu_dec would have implied that.
	 */
	barrier();
	node0->count--;
	return 0;
release_node_err:
	barrier();
	node0->count--;
	goto release_err;
release_err:
	return ret;
}

/**
 * arena_spin_lock - acquire a queued spinlock
 * @lock: Pointer to queued spinlock structure
 *
 * On error, the returned value is negative.
 * On success, zero is returned.
 *
 * The return value _must_ be tested against zero for success
 * rather than compared against a negative value, so that the
 * program passes the BPF verifier.
 *
 * The user should do:
 *	if (arena_spin_lock(...) != 0) // failure
 *		or
 *	if (arena_spin_lock(...) == 0) // success
 *		or
 *	if (arena_spin_lock(...)) // failure
 *		or
 *	if (!arena_spin_lock(...)) // success
 * instead of:
 *	if (arena_spin_lock(...) < 0) // failure
 *
 * The return value can still be inspected later.
 */
static __always_inline int arena_spin_lock(arena_spinlock_t __arena *lock)
{
	int val = 0;

	if (CONFIG_NR_CPUS > 1024)
		return -EOPNOTSUPP;

	bpf_preempt_disable();
	if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL)))
		return 0;

	val = arena_spin_lock_slowpath(lock, val);
	/* FIXME: bpf_assert_range(-MAX_ERRNO, 0) once we have it working for all cases. */
	if (val)
		bpf_preempt_enable();
	return val;
}

/**
 * arena_spin_unlock - release a queued spinlock
 * @lock : Pointer to queued spinlock structure
 */
static __always_inline void arena_spin_unlock(arena_spinlock_t __arena *lock)
{
	/*
	 * unlock() needs release semantics:
	 */
	smp_store_release(&lock->locked, 0);
	bpf_preempt_enable();
}

#define arena_spin_lock_irqsave(lock, flags)             \
	({                                               \
		int __ret;                               \
		bpf_local_irq_save(&(flags));            \
		__ret = arena_spin_lock((lock));         \
		if (__ret)                               \
			bpf_local_irq_restore(&(flags)); \
		(__ret);                                 \
	})

#define arena_spin_unlock_irqrestore(lock, flags) \
	({                                        \
		arena_spin_unlock((lock));        \
		bpf_local_irq_restore(&(flags));  \
	})
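
/*
 * A minimal usage sketch (illustrative only; the arena map definition, the
 * program section, and the globals below are assumptions and not part of
 * this header):
 *
 *	struct {
 *		__uint(type, BPF_MAP_TYPE_ARENA);
 *		__uint(map_flags, BPF_F_MMAPABLE);
 *		__uint(max_entries, 100); // arena size in pages
 *	} arena SEC(".maps");
 *
 *	arena_spinlock_t __arena lock;
 *	int __arena counter;
 *
 *	SEC("tc")
 *	int increment(void *ctx)
 *	{
 *		unsigned long flags;
 *
 *		if (arena_spin_lock_irqsave(&lock, flags))
 *			return 0;
 *		counter++;
 *		arena_spin_unlock_irqrestore(&lock, flags);
 *		return 0;
 *	}
 */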

#endif

#endif /* BPF_ARENA_SPIN_LOCK_H */