xref: /linux/tools/testing/selftests/bpf/libarena/include/bpf_arena_spin_lock.h (revision b9b23fe1761117f4a0109a25d16d337c900437ad)
1 // SPDX-License-Identifier: GPL-2.0
2 /* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
3 #ifndef BPF_ARENA_SPIN_LOCK_H
4 #define BPF_ARENA_SPIN_LOCK_H
5 
6 #include <vmlinux.h>
7 #include <bpf/bpf_helpers.h>
8 #include <bpf_atomic.h>
9 
10 #define arch_mcs_spin_lock_contended_label(l, label) smp_cond_load_acquire_label(l, VAL, label)
11 #define arch_mcs_spin_unlock_contended(l) smp_store_release((l), 1)
12 
13 #if defined(ENABLE_ATOMICS_TESTS) && defined(__BPF_FEATURE_ADDR_SPACE_CAST)
14 
15 #define EBUSY 16
16 #define EOPNOTSUPP 95
17 #define ETIMEDOUT 110
18 
19 extern unsigned long CONFIG_NR_CPUS __kconfig;
20 
21 /*
22  * Typically, we'd just rely on the definition in vmlinux.h for qspinlock, but
23  * PowerPC overrides the definition to define lock->val as u32 instead of
24  * atomic_t, leading to compilation errors.  Import a local definition below so
25  * that we don't depend on the vmlinux.h version.
26  */
27 
28 struct __qspinlock {
29 	union {
30 		atomic_t val;
31 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
32 		struct {
33 			u8 locked;
34 			u8 pending;
35 		};
36 		struct {
37 			u16 locked_pending;
38 			u16 tail;
39 		};
40 #else
41 		struct {
42 			u16 tail;
43 			u16 locked_pending;
44 		};
45 		struct {
46 			u8 reserved[2];
47 			u8 pending;
48 			u8 locked;
49 		};
50 #endif
51 	};
52 };
53 
54 #define arena_spinlock_t struct __qspinlock
55 /* FIXME: Using typedef causes CO-RE relocation error */
56 /* typedef struct qspinlock arena_spinlock_t; */
57 
58 struct arena_mcs_spinlock {
59 	struct arena_mcs_spinlock __arena *next;
60 	int locked;
61 	int count;
62 };
63 
64 struct arena_qnode {
65 	struct arena_mcs_spinlock mcs;
66 };
67 
68 #define _Q_MAX_NODES		4
69 #define _Q_PENDING_LOOPS	1
70 
71 /*
72  * Bitfields in the atomic value:
73  *
74  *  0- 7: locked byte
75  *     8: pending
76  *  9-15: not used
77  * 16-17: tail index
78  * 18-31: tail cpu (+1)
79  */
80 #define _Q_MAX_CPUS		1024
81 
82 #define	_Q_SET_MASK(type)	(((1U << _Q_ ## type ## _BITS) - 1)\
83 				      << _Q_ ## type ## _OFFSET)
84 #define _Q_LOCKED_OFFSET	0
85 #define _Q_LOCKED_BITS		8
86 #define _Q_LOCKED_MASK		_Q_SET_MASK(LOCKED)
87 
88 #define _Q_PENDING_OFFSET	(_Q_LOCKED_OFFSET + _Q_LOCKED_BITS)
89 #define _Q_PENDING_BITS		8
90 #define _Q_PENDING_MASK		_Q_SET_MASK(PENDING)
91 
92 #define _Q_TAIL_IDX_OFFSET	(_Q_PENDING_OFFSET + _Q_PENDING_BITS)
93 #define _Q_TAIL_IDX_BITS	2
94 #define _Q_TAIL_IDX_MASK	_Q_SET_MASK(TAIL_IDX)
95 
96 #define _Q_TAIL_CPU_OFFSET	(_Q_TAIL_IDX_OFFSET + _Q_TAIL_IDX_BITS)
97 #define _Q_TAIL_CPU_BITS	(32 - _Q_TAIL_CPU_OFFSET)
98 #define _Q_TAIL_CPU_MASK	_Q_SET_MASK(TAIL_CPU)
99 
100 #define _Q_TAIL_OFFSET		_Q_TAIL_IDX_OFFSET
101 #define _Q_TAIL_MASK		(_Q_TAIL_IDX_MASK | _Q_TAIL_CPU_MASK)
102 
103 #define _Q_LOCKED_VAL		(1U << _Q_LOCKED_OFFSET)
104 #define _Q_PENDING_VAL		(1U << _Q_PENDING_OFFSET)
105 
106 /*
107  * The qnodes are marked __weak so we can define them in the header
108  * while still ensuring all compilation units use the same struct
109  * instance.
110  */
111 struct arena_qnode __weak __arena __hidden qnodes[_Q_MAX_CPUS][_Q_MAX_NODES];
112 
113 static inline u32 encode_tail(int cpu, int idx)
114 {
115 	u32 tail;
116 
117 	tail  = (cpu + 1) << _Q_TAIL_CPU_OFFSET;
118 	tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */
119 
120 	return tail;
121 }
122 
123 static inline struct arena_mcs_spinlock __arena *decode_tail(u32 tail)
124 {
125 	u32 cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1;
126 	u32 idx = (tail &  _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET;
127 
128 	return &qnodes[cpu][idx].mcs;
129 }
130 
131 static inline
132 struct arena_mcs_spinlock __arena *grab_mcs_node(struct arena_mcs_spinlock __arena *base, int idx)
133 {
134 	return &((struct arena_qnode __arena *)base + idx)->mcs;
135 }
136 
137 #define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK)
138 
139 /**
140  * xchg_tail - Put in the new queue tail code word & retrieve previous one
141  * @lock : Pointer to queued spinlock structure
142  * @tail : The new queue tail code word
143  * Return: The previous queue tail code word
144  *
145  * xchg(lock, tail)
146  *
147  * p,*,* -> n,*,* ; prev = xchg(lock, node)
148  */
149 static __always_inline u32 xchg_tail(arena_spinlock_t __arena *lock, u32 tail)
150 {
151 	u32 old, new;
152 
153 	old = atomic_read(&lock->val);
154 	do {
155 		new = (old & _Q_LOCKED_PENDING_MASK) | tail;
156 		/*
157 		 * We can use relaxed semantics since the caller ensures that
158 		 * the MCS node is properly initialized before updating the
159 		 * tail.
160 		 */
161 		/* These loops are not expected to stall, but we still need to
162 		 * prove to the verifier they will terminate eventually.
163 		 */
164 		cond_break_label(out);
165 	} while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new));
166 
167 	return old;
168 out:
169 	bpf_printk("RUNTIME ERROR: %s unexpected cond_break exit!!!", __func__);
170 	return old;
171 }
172 
173 /**
174  * clear_pending - clear the pending bit.
175  * @lock: Pointer to queued spinlock structure
176  *
177  * *,1,* -> *,0,*
178  */
179 static __always_inline void clear_pending(arena_spinlock_t __arena *lock)
180 {
181 	WRITE_ONCE(lock->pending, 0);
182 }
183 
184 /**
185  * clear_pending_set_locked - take ownership and clear the pending bit.
186  * @lock: Pointer to queued spinlock structure
187  *
188  * *,1,0 -> *,0,1
189  *
190  * Lock stealing is not allowed if this function is used.
191  */
192 static __always_inline void clear_pending_set_locked(arena_spinlock_t __arena *lock)
193 {
194 	WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL);
195 }
196 
197 /**
198  * set_locked - Set the lock bit and own the lock
199  * @lock: Pointer to queued spinlock structure
200  *
201  * *,*,0 -> *,0,1
202  */
203 static __always_inline void set_locked(arena_spinlock_t __arena *lock)
204 {
205 	WRITE_ONCE(lock->locked, _Q_LOCKED_VAL);
206 }
207 
208 static __always_inline
209 u32 arena_fetch_set_pending_acquire(arena_spinlock_t __arena *lock)
210 {
211 	u32 old, new;
212 
213 	old = atomic_read(&lock->val);
214 	do {
215 		new = old | _Q_PENDING_VAL;
216 		/*
217 		 * These loops are not expected to stall, but we still need to
218 		 * prove to the verifier they will terminate eventually.
219 		 */
220 		cond_break_label(out);
221 	} while (!atomic_try_cmpxchg_acquire(&lock->val, &old, new));
222 
223 	return old;
224 out:
225 	bpf_printk("RUNTIME ERROR: %s unexpected cond_break exit!!!", __func__);
226 	return old;
227 }
228 
229 /**
230  * arena_spin_trylock - try to acquire the queued spinlock
231  * @lock : Pointer to queued spinlock structure
232  * Return: 1 if lock acquired, 0 if failed
233  */
234 static __always_inline int arena_spin_trylock(arena_spinlock_t __arena *lock)
235 {
236 	int val = atomic_read(&lock->val);
237 
238 	if (unlikely(val))
239 		return 0;
240 
241 	return likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL));
242 }
243 
244 __noinline __weak
245 int arena_spin_lock_slowpath(arena_spinlock_t __arena *lock, u32 val)
246 {
247 	struct arena_mcs_spinlock __arena *prev, *next, *node0, *node;
248 	int ret = -ETIMEDOUT;
249 	u32 old, tail;
250 	int idx;
251 
252 	/*
253 	 * Wait for in-progress pending->locked hand-overs with a bounded
254 	 * number of spins so that we guarantee forward progress.
255 	 *
256 	 * 0,1,0 -> 0,0,1
257 	 */
258 	if (val == _Q_PENDING_VAL) {
259 		int cnt = _Q_PENDING_LOOPS;
260 		val = atomic_cond_read_relaxed_label(&lock->val,
261 						     (VAL != _Q_PENDING_VAL) || !cnt--,
262 						     release_err);
263 	}
264 
265 	/*
266 	 * If we observe any contention; queue.
267 	 */
268 	if (val & ~_Q_LOCKED_MASK)
269 		goto queue;
270 
271 	/*
272 	 * trylock || pending
273 	 *
274 	 * 0,0,* -> 0,1,* -> 0,0,1 pending, trylock
275 	 */
276 	val = arena_fetch_set_pending_acquire(lock);
277 
278 	/*
279 	 * If we observe contention, there is a concurrent locker.
280 	 *
281 	 * Undo and queue; our setting of PENDING might have made the
282 	 * n,0,0 -> 0,0,0 transition fail and it will now be waiting
283 	 * on @next to become !NULL.
284 	 */
285 	if (unlikely(val & ~_Q_LOCKED_MASK)) {
286 
287 		/* Undo PENDING if we set it. */
288 		if (!(val & _Q_PENDING_MASK))
289 			clear_pending(lock);
290 
291 		goto queue;
292 	}
293 
294 	/*
295 	 * We're pending, wait for the owner to go away.
296 	 *
297 	 * 0,1,1 -> *,1,0
298 	 *
299 	 * this wait loop must be a load-acquire such that we match the
300 	 * store-release that clears the locked bit and create lock
301 	 * sequentiality; this is because not all
302 	 * clear_pending_set_locked() implementations imply full
303 	 * barriers.
304 	 */
305 	if (val & _Q_LOCKED_MASK)
306 		(void)smp_cond_load_acquire_label(&lock->locked, !VAL, release_err);
307 
308 	/*
309 	 * take ownership and clear the pending bit.
310 	 *
311 	 * 0,1,0 -> 0,0,1
312 	 */
313 	clear_pending_set_locked(lock);
314 	return 0;
315 
316 	/*
317 	 * End of pending bit optimistic spinning and beginning of MCS
318 	 * queuing.
319 	 */
320 queue:
321 	node0 = &(qnodes[bpf_get_smp_processor_id()])[0].mcs;
322 	idx = node0->count++;
323 	tail = encode_tail(bpf_get_smp_processor_id(), idx);
324 
325 	/*
326 	 * 4 nodes are allocated based on the assumption that there will not be
327 	 * nested NMIs taking spinlocks. That may not be true in some
328 	 * architectures even though the chance of needing more than 4 nodes
329 	 * will still be extremely unlikely. When that happens, we simply return
330 	 * an error. Original qspinlock has a trylock fallback in this case.
331 	 */
332 	if (unlikely(idx >= _Q_MAX_NODES)) {
333 		ret = -EBUSY;
334 		goto release_node_err;
335 	}
336 
337 	node = grab_mcs_node(node0, idx);
338 
339 	/*
340 	 * Ensure that we increment the head node->count before initialising
341 	 * the actual node. If the compiler is kind enough to reorder these
342 	 * stores, then an IRQ could overwrite our assignments.
343 	 */
344 	barrier();
345 
346 	node->locked = 0;
347 	node->next = NULL;
348 
349 	/*
350 	 * We touched a (possibly) cold cacheline in the per-cpu queue node;
351 	 * attempt the trylock once more in the hope someone let go while we
352 	 * weren't watching.
353 	 */
354 	if (arena_spin_trylock(lock))
355 		goto release;
356 
357 	/*
358 	 * Ensure that the initialisation of @node is complete before we
359 	 * publish the updated tail via xchg_tail() and potentially link
360 	 * @node into the waitqueue via WRITE_ONCE(prev->next, node) below.
361 	 */
362 	smp_wmb();
363 
364 	/*
365 	 * Publish the updated tail.
366 	 * We have already touched the queueing cacheline; don't bother with
367 	 * pending stuff.
368 	 *
369 	 * p,*,* -> n,*,*
370 	 */
371 	old = xchg_tail(lock, tail);
372 	next = NULL;
373 
374 	/*
375 	 * if there was a previous node; link it and wait until reaching the
376 	 * head of the waitqueue.
377 	 */
378 	if (old & _Q_TAIL_MASK) {
379 		prev = decode_tail(old);
380 
381 		/* Link @node into the waitqueue. */
382 		WRITE_ONCE(prev->next, node);
383 
384 		(void)arch_mcs_spin_lock_contended_label(&node->locked, release_node_err);
385 
386 		/*
387 		 * While waiting for the MCS lock, the next pointer may have
388 		 * been set by another lock waiter. We cannot prefetch here
389 		 * due to lack of equivalent instruction in BPF ISA.
390 		 */
391 		next = READ_ONCE(node->next);
392 	}
393 
394 	/*
395 	 * we're at the head of the waitqueue, wait for the owner & pending to
396 	 * go away.
397 	 *
398 	 * *,x,y -> *,0,0
399 	 *
400 	 * this wait loop must use a load-acquire such that we match the
401 	 * store-release that clears the locked bit and create lock
402 	 * sequentiality; this is because the set_locked() function below
403 	 * does not imply a full barrier.
404 	 */
405 	val = atomic_cond_read_acquire_label(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK),
406 					     release_node_err);
407 
408 	/*
409 	 * claim the lock:
410 	 *
411 	 * n,0,0 -> 0,0,1 : lock, uncontended
412 	 * *,*,0 -> *,*,1 : lock, contended
413 	 *
414 	 * If the queue head is the only one in the queue (lock value == tail)
415 	 * and nobody is pending, clear the tail code and grab the lock.
416 	 * Otherwise, we only need to grab the lock.
417 	 */
418 
419 	/*
420 	 * In the PV case we might already have _Q_LOCKED_VAL set, because
421 	 * of lock stealing; therefore we must also allow:
422 	 *
423 	 * n,0,1 -> 0,0,1
424 	 *
425 	 * Note: at this point: (val & _Q_PENDING_MASK) == 0, because of the
426 	 *       above wait condition, therefore any concurrent setting of
427 	 *       PENDING will make the uncontended transition fail.
428 	 */
429 	if ((val & _Q_TAIL_MASK) == tail) {
430 		if (atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL))
431 			goto release; /* No contention */
432 	}
433 
434 	/*
435 	 * Either somebody is queued behind us or _Q_PENDING_VAL got set
436 	 * which will then detect the remaining tail and queue behind us
437 	 * ensuring we'll see a @next.
438 	 */
439 	set_locked(lock);
440 
441 	/*
442 	 * contended path; wait for next if not observed yet, release.
443 	 */
444 	if (!next)
445 		next = smp_cond_load_relaxed_label(&node->next, (VAL), release_node_err);
446 
447 	arch_mcs_spin_unlock_contended(&next->locked);
448 
449 release:;
450 	/*
451 	 * release the node
452 	 *
453 	 * Doing a normal dec vs this_cpu_dec is fine. An upper context always
454 	 * decrements count it incremented before returning, thus we're fine.
455 	 * For contexts interrupting us, they either observe our dec or not.
456 	 * Just ensure the compiler doesn't reorder this statement, as a
457 	 * this_cpu_dec implicitly implied that.
458 	 */
459 	barrier();
460 	node0->count--;
461 	return 0;
462 release_node_err:
463 	barrier();
464 	node0->count--;
465 	goto release_err;
466 release_err:
467 	return ret;
468 }
469 
470 /**
471  * arena_spin_lock - acquire a queued spinlock
472  * @lock: Pointer to queued spinlock structure
473  *
474  * On error, returned value will be negative.
475  * On success, zero is returned.
476  *
477  * The return value _must_ be tested against zero for success,
478  * instead of checking it against negative, for passing the
479  * BPF verifier.
480  *
481  * The user should do:
482  *	if (arena_spin_lock(...) != 0) // failure
483  *		or
484  *	if (arena_spin_lock(...) == 0) // success
485  *		or
486  *	if (arena_spin_lock(...)) // failure
487  *		or
488  *	if (!arena_spin_lock(...)) // success
489  * instead of:
490  *	if (arena_spin_lock(...) < 0) // failure
491  *
492  * The return value can still be inspected later.
493  */
494 static __always_inline int arena_spin_lock(arena_spinlock_t __arena *lock)
495 {
496 	int val = 0;
497 
498 	if (CONFIG_NR_CPUS > 1024)
499 		return -EOPNOTSUPP;
500 
501 	bpf_preempt_disable();
502 	if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL)))
503 		return 0;
504 
505 	val = arena_spin_lock_slowpath(lock, val);
506 	/* FIXME: bpf_assert_range(-MAX_ERRNO, 0) once we have it working for all cases. */
507 	if (val)
508 		bpf_preempt_enable();
509 	return val;
510 }
511 
512 /**
513  * arena_spin_unlock - release a queued spinlock
514  * @lock : Pointer to queued spinlock structure
515  */
516 static __always_inline void arena_spin_unlock(arena_spinlock_t __arena *lock)
517 {
518 	/*
519 	 * unlock() needs release semantics:
520 	 */
521 	smp_store_release(&lock->locked, 0);
522 	bpf_preempt_enable();
523 }
524 
525 #define arena_spin_lock_irqsave(lock, flags)             \
526 	({                                               \
527 		int __ret;                               \
528 		bpf_local_irq_save(&(flags));            \
529 		__ret = arena_spin_lock((lock));         \
530 		if (__ret)                               \
531 			bpf_local_irq_restore(&(flags)); \
532 		(__ret);                                 \
533 	})
534 
535 #define arena_spin_unlock_irqrestore(lock, flags) \
536 	({                                        \
537 		arena_spin_unlock((lock));        \
538 		bpf_local_irq_restore(&(flags));  \
539 	})
540 
541 #endif
542 
543 #endif /* BPF_ARENA_SPIN_LOCK_H */
544