// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Resilient Queued Spin Lock
 *
 * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
 * (C) Copyright 2013-2014,2018 Red Hat, Inc.
 * (C) Copyright 2015 Intel Corp.
 * (C) Copyright 2015 Hewlett-Packard Enterprise Development LP
 * (C) Copyright 2024-2025 Meta Platforms, Inc. and affiliates.
 *
 * Authors: Waiman Long <longman@redhat.com>
 *          Peter Zijlstra <peterz@infradead.org>
 *          Kumar Kartikeya Dwivedi <memxor@gmail.com>
 */

#include <linux/smp.h>
#include <linux/bug.h>
#include <linux/cpumask.h>
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <linux/mutex.h>
#include <linux/prefetch.h>
#include <asm/byteorder.h>
#include <asm/qspinlock.h>
#include <trace/events/lock.h>
#include <asm/rqspinlock.h>
#include <linux/timekeeping.h>

/*
 * Include queued spinlock definitions and statistics code
 */
#include "../locking/qspinlock.h"
#include "../locking/lock_events.h"

/*
 * The basic principle of a queue-based spinlock can best be understood
 * by studying a classic queue-based spinlock implementation called the
 * MCS lock. A copy of the original MCS lock paper ("Algorithms for Scalable
 * Synchronization on Shared-Memory Multiprocessors" by Mellor-Crummey and
 * Scott) is available at
 *
 * https://bugzilla.kernel.org/show_bug.cgi?id=206115
 *
 * This queued spinlock implementation is based on the MCS lock; however, to
 * make it fit the 4 bytes we assume spinlock_t to be, and to preserve its
 * existing API, we must modify it somehow.
 *
 * In particular, where the traditional MCS lock consists of a tail pointer
 * (8 bytes) and needs the next pointer (another 8 bytes) of its own node to
 * unlock the next pending waiter (next->locked), we compress both of these,
 * {tail, next->locked}, into a single u32 value.
 *
 * Since a spinlock disables recursion of its own context, there is a limit
 * to the contexts that can nest: task, softirq, hardirq, nmi. As there are
 * at most 4 nesting levels, the level can be encoded in a 2-bit number. We
 * can then encode the tail by combining the 2-bit nesting level with the cpu
 * number. With one byte for the lock value and 3 bytes for the tail, only a
 * 32-bit word is needed. Even though we only need 1 bit for the lock, we
 * extend it to a full byte to achieve better performance for architectures
 * that support atomic byte writes.
 *
 * We also change the first spinner to spin on the lock bit instead of its
 * node, thereby avoiding the need to carry a node from lock to unlock, and
 * preserving the existing lock API. This also makes the unlock code simpler
 * and faster.
 *
 * N.B. The current implementation only supports architectures that allow
 *      atomic operations on smaller 8-bit and 16-bit data types.
 */

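/*
 * Illustrative sketch only; the authoritative field definitions live in
 * qspinlock_types.h and the helpers in ../locking/qspinlock.h. With
 * NR_CPUS < 16K the 32-bit lock word is laid out roughly as
 *
 *      { tail cpu : 14 | tail idx : 2 | pending : 8 | locked : 8 }
 *
 * and encode_tail()/decode_tail() map a (cpu, nesting idx) pair to and from
 * the tail bits along the lines of:
 *
 *      tail  = (cpu + 1) << _Q_TAIL_CPU_OFFSET;
 *      tail |= idx << _Q_TAIL_IDX_OFFSET;
 *
 * The cpu number is stored off by one so that a tail of 0 means "no queue".
 */
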
#include "../locking/mcs_spinlock.h"

struct rqspinlock_timeout {
        u64 timeout_end;
        u64 duration;
        u16 spin;
};

#define RES_TIMEOUT_VAL	2

static noinline int check_timeout(struct rqspinlock_timeout *ts)
{
        u64 time = ktime_get_mono_fast_ns();

        if (!ts->timeout_end) {
                ts->timeout_end = time + ts->duration;
                return 0;
        }

        if (time > ts->timeout_end)
                return -ETIMEDOUT;

        return 0;
}

/*
 * Do not amortize with spins when res_smp_cond_load_acquire is defined,
 * as the macro does internal amortization for us.
 */
#ifndef res_smp_cond_load_acquire
#define RES_CHECK_TIMEOUT(ts, ret)                    \
        ({                                            \
                if (!(ts).spin++)                     \
                        (ret) = check_timeout(&(ts)); \
                (ret);                                \
        })
#else
#define RES_CHECK_TIMEOUT(ts, ret)                    \
        ({ (ret) = check_timeout(&(ts)); })
#endif

/*
 * Initialize the 'spin' member.
 */
#define RES_INIT_TIMEOUT(ts) ({ (ts).spin = 1; })

/*
 * We only need to reset 'timeout_end'; 'spin' will just wrap around as necessary.
 * Duration is defined for each spin attempt, so set it here.
 */
#define RES_RESET_TIMEOUT(ts, _duration) ({ (ts).timeout_end = 0; (ts).duration = _duration; })

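/*
 * Minimal usage sketch, for illustration only; the real call sites are in the
 * slowpath below, and RES_DEF_TIMEOUT comes from the rqspinlock headers:
 *
 *      struct rqspinlock_timeout ts;
 *      int ret = 0;
 *
 *      RES_INIT_TIMEOUT(ts);
 *      RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT);
 *      smp_cond_load_acquire(&lock->locked, !VAL || RES_CHECK_TIMEOUT(ts, ret));
 *
 * Because 'spin' is a u16, check_timeout() only runs when the counter wraps
 * to zero, i.e. at most once every 2^16 iterations, which amortizes the cost
 * of reading the clock; the first such call merely arms 'timeout_end'.
 */
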
/*
 * Per-CPU queue node structures; we can never have more than 4 nested
 * contexts: task, softirq, hardirq, nmi.
 *
 * Exactly fits one 64-byte cacheline on a 64-bit architecture.
 */
static DEFINE_PER_CPU_ALIGNED(struct qnode, rqnodes[_Q_MAX_NODES]);

#ifndef res_smp_cond_load_acquire
#define res_smp_cond_load_acquire(v, c) smp_cond_load_acquire(v, c)
#endif

#define res_atomic_cond_read_acquire(v, c) res_smp_cond_load_acquire(&(v)->counter, (c))

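/*
 * For context, a hypothetical wrapper illustrating the intended calling
 * convention (a sketch only; the actual fast-path helpers are provided by
 * the rqspinlock headers, not by this file):
 *
 *      int example_res_spin_lock(rqspinlock_t *lock)
 *      {
 *              int val = 0;
 *
 *              if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL)))
 *                      return 0;
 *              return resilient_queued_spin_lock_slowpath(lock, val);
 *      }
 *
 * Unlike a regular spinlock, the return value must be checked: on timeout
 * the lock has *not* been acquired and the caller must not enter the
 * critical section.
 */
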
/**
 * resilient_queued_spin_lock_slowpath - acquire the queued spinlock
 * @lock: Pointer to queued spinlock structure
 * @val: Current value of the queued spinlock 32-bit word
 *
 * Return:
 * * 0		- Lock was acquired successfully.
 * * -ETIMEDOUT - Lock acquisition failed because of timeout.
 * * -EDEADLK	- Lock acquisition failed because a waiter ahead of us timed
 *		  out and tore down the wait queue.
 *
 * (queue tail, pending bit, lock value)
 *
 *              fast     :    slow                                  :    unlock
 *                       :                                          :
 * uncontended  (0,0,0) -:--> (0,0,1) ------------------------------:--> (*,*,0)
 *                       :       | ^--------.------.             /  :
 *                       :       v           \      \            |  :
 * pending               :    (0,1,1) +--> (0,1,0)   \           |  :
 *                       :       | ^--'              |           |  :
 *                       :       v                   |           |  :
 * uncontended           :    (n,x,y) +--> (n,0,0) --'           |  :
 *   queue               :       | ^--'                          |  :
 *                       :       v                               |  :
 * contended             :    (*,x,y) +--> (*,0,0) ---> (*,0,1) -'  :
 *   queue               :         ^--'                             :
 */
int __lockfunc resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val)
{
        struct mcs_spinlock *prev, *next, *node;
        struct rqspinlock_timeout ts;
        int idx, ret = 0;
        u32 old, tail;

        BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));

        RES_INIT_TIMEOUT(ts);

        /*
         * Wait for in-progress pending->locked hand-overs with a bounded
         * number of spins so that we guarantee forward progress.
         *
         * 0,1,0 -> 0,0,1
         */
        if (val == _Q_PENDING_VAL) {
                int cnt = _Q_PENDING_LOOPS;
                val = atomic_cond_read_relaxed(&lock->val,
                                               (VAL != _Q_PENDING_VAL) || !cnt--);
        }

        /*
         * If we observe any contention; queue.
         */
        if (val & ~_Q_LOCKED_MASK)
                goto queue;

        /*
         * trylock || pending
         *
         * 0,0,* -> 0,1,* -> 0,0,1 pending, trylock
         */
        val = queued_fetch_set_pending_acquire(lock);

        /*
         * If we observe contention, there is a concurrent locker.
         *
         * Undo and queue; our setting of PENDING might have made the
         * n,0,0 -> 0,0,0 transition fail and it will now be waiting
         * on @next to become !NULL.
         */
        if (unlikely(val & ~_Q_LOCKED_MASK)) {

                /* Undo PENDING if we set it. */
                if (!(val & _Q_PENDING_MASK))
                        clear_pending(lock);

                goto queue;
        }

        /*
         * We're pending, wait for the owner to go away.
         *
         * 0,1,1 -> *,1,0
         *
         * this wait loop must be a load-acquire such that we match the
         * store-release that clears the locked bit and create lock
         * sequentiality; this is because not all
         * clear_pending_set_locked() implementations imply full
         * barriers.
         */
        if (val & _Q_LOCKED_MASK) {
                RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT);
                res_smp_cond_load_acquire(&lock->locked, !VAL || RES_CHECK_TIMEOUT(ts, ret));
        }

        if (ret) {
                /*
                 * We waited for the locked bit to go back to 0, as the pending
                 * waiter, but timed out. We need to clear the pending bit since
                 * we own it. Once a stuck owner has been recovered, the lock
                 * must be restored to a valid state, hence removing the pending
                 * bit is necessary.
                 *
                 * *,1,* -> *,0,*
                 */
                clear_pending(lock);
                lockevent_inc(rqspinlock_lock_timeout);
                return ret;
        }

        /*
         * take ownership and clear the pending bit.
         *
         * 0,1,0 -> 0,0,1
         */
        clear_pending_set_locked(lock);
        lockevent_inc(lock_pending);
        return 0;

        /*
         * End of pending bit optimistic spinning and beginning of MCS
         * queuing.
         */
queue:
        lockevent_inc(lock_slowpath);
        node = this_cpu_ptr(&rqnodes[0].mcs);
        idx = node->count++;
        tail = encode_tail(smp_processor_id(), idx);

        trace_contention_begin(lock, LCB_F_SPIN);

        /*
         * 4 nodes are allocated based on the assumption that there will
         * not be nested NMIs taking spinlocks. That may not be true in
         * some architectures even though the chance of needing more than
         * 4 nodes will still be extremely unlikely. When that happens,
         * we fall back to spinning on the lock directly without using
         * any MCS node. This is not the most elegant solution, but is
         * simple enough.
         */
        if (unlikely(idx >= _Q_MAX_NODES)) {
                lockevent_inc(lock_no_node);
                while (!queued_spin_trylock(lock))
                        cpu_relax();
                goto release;
        }

        node = grab_mcs_node(node, idx);

        /*
         * Keep counts of non-zero index values:
         */
        lockevent_cond_inc(lock_use_node2 + idx - 1, idx);

        /*
         * Ensure that we increment the head node->count before initialising
         * the actual node. If the compiler is kind enough to reorder these
         * stores, then an IRQ could overwrite our assignments.
         */
        barrier();

        node->locked = 0;
        node->next = NULL;

        /*
         * We touched a (possibly) cold cacheline in the per-cpu queue node;
         * attempt the trylock once more in the hope someone let go while we
         * weren't watching.
         */
        if (queued_spin_trylock(lock))
                goto release;

        /*
         * Ensure that the initialisation of @node is complete before we
         * publish the updated tail via xchg_tail() and potentially link
         * @node into the waitqueue via WRITE_ONCE(prev->next, node) below.
         */
        smp_wmb();

        /*
         * Publish the updated tail.
         * We have already touched the queueing cacheline; don't bother with
         * pending stuff.
         *
         * p,*,* -> n,*,*
         */
        old = xchg_tail(lock, tail);
        next = NULL;

        /*
         * if there was a previous node; link it and wait until reaching the
         * head of the waitqueue.
         */
        if (old & _Q_TAIL_MASK) {
                int val;

                prev = decode_tail(old, rqnodes);

                /* Link @node into the waitqueue. */
                WRITE_ONCE(prev->next, node);

                val = arch_mcs_spin_lock_contended(&node->locked);
                if (val == RES_TIMEOUT_VAL) {
                        ret = -EDEADLK;
                        goto waitq_timeout;
                }

                /*
                 * While waiting for the MCS lock, the next pointer may have
                 * been set by another lock waiter. We optimistically load
                 * the next pointer & prefetch the cacheline for writing
                 * to reduce latency in the upcoming MCS unlock operation.
                 */
                next = READ_ONCE(node->next);
                if (next)
                        prefetchw(next);
        }

        /*
         * we're at the head of the waitqueue, wait for the owner & pending to
         * go away.
         *
         * *,x,y -> *,0,0
         *
         * this wait loop must use a load-acquire such that we match the
         * store-release that clears the locked bit and create lock
         * sequentiality; this is because the set_locked() function below
         * does not imply a full barrier.
         *
         * We use RES_DEF_TIMEOUT * 2 as the duration, as RES_DEF_TIMEOUT is
         * meant to span the maximum allowed time per critical section, and we
         * may have both the owner of the lock and the pending bit waiter
         * ahead of us.
         */
        RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT * 2);
        val = res_atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK) ||
                                           RES_CHECK_TIMEOUT(ts, ret));

waitq_timeout:
        if (ret) {
                /*
                 * If the tail is still pointing to us, then we are the final waiter,
                 * and are responsible for resetting the tail back to 0. Otherwise, if
                 * the cmpxchg operation fails, we signal the next waiter to exit and
                 * try the same. For a waiter with tail node 'n':
                 *
                 * n,*,* -> 0,*,*
                 *
                 * When performing cmpxchg for the whole word (NR_CPUS > 16k), it is
                 * possible that the locked/pending bits keep changing and we see
                 * failures even when we remain the head of the wait queue. However,
                 * eventually, the pending bit owner will unset the pending bit, and
                 * new waiters will queue behind us. This will leave the lock owner in
                 * charge, and it will eventually either set the locked bit to 0, or
                 * leave it as 1, allowing us to make progress.
                 *
                 * We terminate the whole wait queue for two reasons. Firstly,
                 * we eschew per-waiter timeouts with one applied at the head of
                 * the wait queue. This allows everyone to break out faster
                 * once we've seen the owner / pending waiter not responding for
                 * the timeout duration from the head. Secondly, it avoids
                 * complicated synchronization, because when not leaving in FIFO
                 * order, prev's next pointer needs to be fixed up etc.
                 */
                if (!try_cmpxchg_tail(lock, tail, 0)) {
                        next = smp_cond_load_relaxed(&node->next, VAL);
                        WRITE_ONCE(next->locked, RES_TIMEOUT_VAL);
                }
                lockevent_inc(rqspinlock_lock_timeout);
                goto release;
        }

        /*
         * claim the lock:
         *
         * n,0,0 -> 0,0,1 : lock, uncontended
         * *,*,0 -> *,*,1 : lock, contended
         *
         * If the queue head is the only one in the queue (lock value == tail)
         * and nobody is pending, clear the tail code and grab the lock.
         * Otherwise, we only need to grab the lock.
         */

        /*
         * Note: at this point: (val & _Q_PENDING_MASK) == 0, because of the
         *       above wait condition, therefore any concurrent setting of
         *       PENDING will make the uncontended transition fail.
         */
        if ((val & _Q_TAIL_MASK) == tail) {
                if (atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL))
                        goto release; /* No contention */
        }

        /*
         * Either somebody is queued behind us or _Q_PENDING_VAL got set
         * which will then detect the remaining tail and queue behind us
         * ensuring we'll see a @next.
         */
        set_locked(lock);

        /*
         * contended path; wait for next if not observed yet, release.
         */
        if (!next)
                next = smp_cond_load_relaxed(&node->next, (VAL));

        arch_mcs_spin_unlock_contended(&next->locked);

release:
        trace_contention_end(lock, 0);

        /*
         * release the node
         */
        __this_cpu_dec(rqnodes[0].mcs.count);
        return ret;
}
EXPORT_SYMBOL_GPL(resilient_queued_spin_lock_slowpath);