xref: /linux/kernel/bpf/rqspinlock.c (revision 337ffea51aeec0b2212d862383a04088e5c063f7)
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Resilient Queued Spin Lock
 *
 * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
 * (C) Copyright 2013-2014,2018 Red Hat, Inc.
 * (C) Copyright 2015 Intel Corp.
 * (C) Copyright 2015 Hewlett-Packard Enterprise Development LP
 * (C) Copyright 2024-2025 Meta Platforms, Inc. and affiliates.
 *
 * Authors: Waiman Long <longman@redhat.com>
 *          Peter Zijlstra <peterz@infradead.org>
 *          Kumar Kartikeya Dwivedi <memxor@gmail.com>
 */

#include <linux/smp.h>
#include <linux/bug.h>
#include <linux/cpumask.h>
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <linux/mutex.h>
#include <linux/prefetch.h>
#include <asm/byteorder.h>
#include <asm/qspinlock.h>
#include <trace/events/lock.h>
#include <asm/rqspinlock.h>
#include <linux/timekeeping.h>

/*
 * Include queued spinlock definitions and statistics code
 */
#include "../locking/qspinlock.h"
#include "../locking/lock_events.h"

/*
 * The basic principle of a queue-based spinlock can best be understood
 * by studying a classic queue-based spinlock implementation called the
 * MCS lock. A copy of the original MCS lock paper ("Algorithms for Scalable
 * Synchronization on Shared-Memory Multiprocessors" by Mellor-Crummey and
 * Scott) is available at
 *
 * https://bugzilla.kernel.org/show_bug.cgi?id=206115
 *
 * This queued spinlock implementation is based on the MCS lock; however, to
 * make it fit the 4 bytes we assume spinlock_t to be, and to preserve its
 * existing API, we must modify it somehow.
 *
 * In particular, where the traditional MCS lock consists of a tail pointer
 * (8 bytes) and needs the next pointer (another 8 bytes) of its own node to
 * unlock the next pending waiter (next->locked), we compress both of these
 * into a single u32 value: {tail, next->locked}.
 *
 * A spinlock disables recursion of its own context, and there is a limit to
 * the contexts that can nest: task, softirq, hardirq, nmi. As there are at
 * most 4 nesting levels, the nesting level can be encoded in a 2-bit number,
 * and the tail is formed by combining this 2-bit nesting level with the cpu
 * number. With one byte for the lock value and 3 bytes for the tail, only a
 * 32-bit word is now needed. Even though we only need 1 bit for the lock,
 * we extend it to a full byte to achieve better performance for
 * architectures that support atomic byte writes.
 *
 * We also change the first spinner to spin on the lock bit instead of its
 * node, thereby avoiding the need to carry a node from lock to unlock, and
 * preserving the existing lock API. This also makes the unlock code simpler
 * and faster.
 *
 * N.B. The current implementation only supports architectures that allow
 *      atomic operations on smaller 8-bit and 16-bit data types.
 */

#include "../locking/mcs_spinlock.h"

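/*
 * Illustrative sketch (not authoritative; the real layout and helpers live
 * in include/asm-generic/qspinlock_types.h and ../locking/qspinlock.h): for
 * the common NR_CPUS < 16K configuration the 32-bit lock word is split as
 *
 *	 0- 7: locked byte
 *	    8: pending
 *	 9-15: not used
 *	16-17: tail index (per-CPU node nesting level)
 *	18-31: tail cpu (+1)
 *
 * so, for example, CPU 3 queueing at node index 1 would publish
 * encode_tail(3, 1) == ((3 + 1) << 18) | (1 << 16) == 0x00110000 as the
 * tail, leaving the pending and locked bytes untouched.
 */
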
struct rqspinlock_timeout {
	u64 timeout_end;	/* absolute deadline in ns, 0 until armed */
	u64 duration;		/* length of the current wait window in ns */
	u16 spin;		/* wrapping counter used to amortize timeout checks */
};

static noinline int check_timeout(struct rqspinlock_timeout *ts)
{
	u64 time = ktime_get_mono_fast_ns();

	if (!ts->timeout_end) {
		ts->timeout_end = time + ts->duration;
		return 0;
	}

	if (time > ts->timeout_end)
		return -ETIMEDOUT;

	return 0;
}

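/*
 * Behaviour sketch (illustrative): check_timeout() arms itself lazily. The
 * first call after RES_RESET_TIMEOUT() below sees timeout_end == 0, sets the
 * deadline to "now + duration" and returns 0; subsequent calls keep
 * returning 0 until ktime_get_mono_fast_ns() moves past that deadline, at
 * which point -ETIMEDOUT is reported. The timeout window is therefore
 * measured from the first actual check, not from lock entry.
 */
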
/*
 * Do not amortize with spins when res_smp_cond_load_acquire is defined,
 * as the macro does internal amortization for us.
 */
#ifndef res_smp_cond_load_acquire
#define RES_CHECK_TIMEOUT(ts, ret)                    \
	({                                            \
		if (!(ts).spin++)                     \
			(ret) = check_timeout(&(ts)); \
		(ret);                                \
	})
#else
#define RES_CHECK_TIMEOUT(ts, ret)                    \
	({ (ret) = check_timeout(&(ts)); })
#endif

/*
 * Initialize the 'spin' member.
 */
#define RES_INIT_TIMEOUT(ts) ({ (ts).spin = 1; })

/*
 * We only need to reset 'timeout_end'; 'spin' will just wrap around as necessary.
 * Duration is defined for each spin attempt, so set it here.
 */
#define RES_RESET_TIMEOUT(ts, _duration) ({ (ts).timeout_end = 0; (ts).duration = _duration; })

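/*
 * Putting the helpers together, a waiter uses them roughly as follows
 * (illustrative sketch mirroring the pending-bit wait in the slowpath below):
 *
 *	struct rqspinlock_timeout ts;
 *	int ret = 0;
 *
 *	RES_INIT_TIMEOUT(ts);
 *	RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT);
 *	res_smp_cond_load_acquire(&lock->locked, !VAL || RES_CHECK_TIMEOUT(ts, ret));
 *	if (ret)
 *		return ret;	/* timed out, caller must repair lock state */
 *
 * In the amortized variant, 'spin' is a u16 that wraps, so check_timeout()
 * (and with it the clock read) runs roughly once every 64K iterations of the
 * wait loop rather than on every spin.
 */
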
/*
 * Per-CPU queue node structures; we can never have more than 4 nested
 * contexts: task, softirq, hardirq, nmi.
 *
 * Exactly fits one 64-byte cacheline on a 64-bit architecture.
 */
static DEFINE_PER_CPU_ALIGNED(struct qnode, rqnodes[_Q_MAX_NODES]);

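/*
 * Size sketch for the claim above (assuming the plain qnode layout without
 * paravirt fields): struct mcs_spinlock is 16 bytes on 64-bit (an 8-byte
 * next pointer plus two 4-byte ints), so _Q_MAX_NODES (4) nodes come to
 * 4 * 16 == 64 bytes, i.e. exactly one cacheline.
 */
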
#ifndef res_smp_cond_load_acquire
#define res_smp_cond_load_acquire(v, c) smp_cond_load_acquire(v, c)
#endif

#define res_atomic_cond_read_acquire(v, c) res_smp_cond_load_acquire(&(v)->counter, (c))

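/*
 * An architecture may supply its own res_smp_cond_load_acquire() via
 * asm/rqspinlock.h, for instance one built on a wait primitive that already
 * bounds the time spent spinning. A purely hypothetical sketch:
 *
 *	#define res_smp_cond_load_acquire(v, c) arch_cond_load_acquire_timeout(v, c)
 *
 * In that case the non-amortized RES_CHECK_TIMEOUT() variant above is used,
 * since the primitive is expected to amortize the timeout checks itself.
 */
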
/**
 * resilient_queued_spin_lock_slowpath - acquire the queued spinlock
 * @lock: Pointer to queued spinlock structure
 * @val: Current value of the queued spinlock 32-bit word
 *
 * Return:
 * * 0		- Lock was acquired successfully.
 * * -ETIMEDOUT - Lock acquisition failed because of timeout.
 *
 * (queue tail, pending bit, lock value)
 *
 *              fast     :    slow                                  :    unlock
 *                       :                                          :
 * uncontended  (0,0,0) -:--> (0,0,1) ------------------------------:--> (*,*,0)
 *                       :       | ^--------.------.             /  :
 *                       :       v           \      \            |  :
 * pending               :    (0,1,1) +--> (0,1,0)   \           |  :
 *                       :       | ^--'              |           |  :
 *                       :       v                   |           |  :
 * uncontended           :    (n,x,y) +--> (n,0,0) --'           |  :
 *   queue               :       | ^--'                          |  :
 *                       :       v                               |  :
 * contended             :    (*,x,y) +--> (*,0,0) ---> (*,0,1) -'  :
 *   queue               :         ^--'                             :
 */
int __lockfunc resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val)
{
	struct mcs_spinlock *prev, *next, *node;
	struct rqspinlock_timeout ts;
	int idx, ret = 0;
	u32 old, tail;

	BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));

	RES_INIT_TIMEOUT(ts);

	/*
	 * Wait for in-progress pending->locked hand-overs with a bounded
	 * number of spins so that we guarantee forward progress.
	 *
	 * 0,1,0 -> 0,0,1
	 */
	if (val == _Q_PENDING_VAL) {
		int cnt = _Q_PENDING_LOOPS;
		val = atomic_cond_read_relaxed(&lock->val,
					       (VAL != _Q_PENDING_VAL) || !cnt--);
	}

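	/*
	 * Note: _Q_PENDING_LOOPS is 1 by default (architectures may raise it,
	 * e.g. x86 spins a few hundred times), so the wait above stays
	 * strictly bounded even if the pending->locked hand-over never
	 * completes.
	 */
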
	/*
	 * If we observe any contention, queue.
	 */
	if (val & ~_Q_LOCKED_MASK)
		goto queue;

	/*
	 * trylock || pending
	 *
	 * 0,0,* -> 0,1,* -> 0,0,1 pending, trylock
	 */
	val = queued_fetch_set_pending_acquire(lock);

	/*
	 * If we observe contention, there is a concurrent locker.
	 *
	 * Undo and queue; our setting of PENDING might have made the
	 * n,0,0 -> 0,0,0 transition fail and it will now be waiting
	 * on @next to become !NULL.
	 */
	if (unlikely(val & ~_Q_LOCKED_MASK)) {

		/* Undo PENDING if we set it. */
		if (!(val & _Q_PENDING_MASK))
			clear_pending(lock);

		goto queue;
	}

	/*
	 * We're pending, wait for the owner to go away.
	 *
	 * 0,1,1 -> *,1,0
	 *
	 * this wait loop must be a load-acquire such that we match the
	 * store-release that clears the locked bit and create lock
	 * sequentiality; this is because not all
	 * clear_pending_set_locked() implementations imply full
	 * barriers.
	 */
	if (val & _Q_LOCKED_MASK) {
		RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT);
		res_smp_cond_load_acquire(&lock->locked, !VAL || RES_CHECK_TIMEOUT(ts, ret));
	}

	if (ret) {
		/*
		 * We waited for the locked bit to go back to 0, as the pending
		 * waiter, but timed out. We need to clear the pending bit since
		 * we own it. Once a stuck owner has been recovered, the lock
		 * must be restored to a valid state, hence removing the pending
		 * bit is necessary.
		 *
		 * *,1,* -> *,0,*
		 */
		clear_pending(lock);
		lockevent_inc(rqspinlock_lock_timeout);
		return ret;
	}

	/*
	 * take ownership and clear the pending bit.
	 *
	 * 0,1,0 -> 0,0,1
	 */
	clear_pending_set_locked(lock);
	lockevent_inc(lock_pending);
	return 0;

	/*
	 * End of pending bit optimistic spinning and beginning of MCS
	 * queuing.
	 */
queue:
	lockevent_inc(lock_slowpath);
	node = this_cpu_ptr(&rqnodes[0].mcs);
	idx = node->count++;
	tail = encode_tail(smp_processor_id(), idx);

	trace_contention_begin(lock, LCB_F_SPIN);

	/*
	 * 4 nodes are allocated based on the assumption that there will
	 * not be nested NMIs taking spinlocks. That may not be true on
	 * some architectures, even though the chance of needing more than
	 * 4 nodes is still extremely unlikely. When that happens, we fall
	 * back to spinning on the lock directly without using any MCS
	 * node. This is not the most elegant solution, but is simple
	 * enough.
	 */
	if (unlikely(idx >= _Q_MAX_NODES)) {
		lockevent_inc(lock_no_node);
		while (!queued_spin_trylock(lock))
			cpu_relax();
		goto release;
	}

	node = grab_mcs_node(node, idx);

	/*
	 * Keep counts of non-zero index values:
	 */
	lockevent_cond_inc(lock_use_node2 + idx - 1, idx);

	/*
	 * Ensure that we increment the head node->count before initialising
	 * the actual node. If the compiler is kind enough to reorder these
	 * stores, then an IRQ could overwrite our assignments.
	 */
	barrier();

	node->locked = 0;
	node->next = NULL;

	/*
	 * We touched a (possibly) cold cacheline in the per-cpu queue node;
	 * attempt the trylock once more in the hope someone let go while we
	 * weren't watching.
	 */
	if (queued_spin_trylock(lock))
		goto release;

	/*
	 * Ensure that the initialisation of @node is complete before we
	 * publish the updated tail via xchg_tail() and potentially link
	 * @node into the waitqueue via WRITE_ONCE(prev->next, node) below.
	 */
	smp_wmb();

	/*
	 * Publish the updated tail.
	 * We have already touched the queueing cacheline; don't bother with
	 * pending stuff.
	 *
	 * p,*,* -> n,*,*
	 */
	old = xchg_tail(lock, tail);
	next = NULL;

	/*
	 * If there was a previous node, link it and wait until reaching the
	 * head of the waitqueue.
	 */
	if (old & _Q_TAIL_MASK) {
		prev = decode_tail(old, rqnodes);

		/* Link @node into the waitqueue. */
		WRITE_ONCE(prev->next, node);

		arch_mcs_spin_lock_contended(&node->locked);

		/*
		 * While waiting for the MCS lock, the next pointer may have
		 * been set by another lock waiter. We optimistically load
		 * the next pointer & prefetch the cacheline for writing
		 * to reduce latency in the upcoming MCS unlock operation.
		 */
		next = READ_ONCE(node->next);
		if (next)
			prefetchw(next);
	}

	/*
	 * we're at the head of the waitqueue, wait for the owner & pending to
	 * go away.
	 *
	 * *,x,y -> *,0,0
	 *
	 * this wait loop must use a load-acquire such that we match the
	 * store-release that clears the locked bit and create lock
	 * sequentiality; this is because the set_locked() function below
	 * does not imply a full barrier.
	 */
	val = atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK));

	/*
	 * claim the lock:
	 *
	 * n,0,0 -> 0,0,1 : lock, uncontended
	 * *,*,0 -> *,*,1 : lock, contended
	 *
	 * If the queue head is the only one in the queue (lock value == tail)
	 * and nobody is pending, clear the tail code and grab the lock.
	 * Otherwise, we only need to grab the lock.
	 */

	/*
	 * Note: at this point: (val & _Q_PENDING_MASK) == 0, because of the
	 *       above wait condition, therefore any concurrent setting of
	 *       PENDING will make the uncontended transition fail.
	 */
	if ((val & _Q_TAIL_MASK) == tail) {
		if (atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL))
			goto release; /* No contention */
	}

	/*
	 * Either somebody is queued behind us or _Q_PENDING_VAL got set
	 * which will then detect the remaining tail and queue behind us
	 * ensuring we'll see a @next.
	 */
	set_locked(lock);

	/*
	 * contended path; wait for next if not observed yet, release.
	 */
	if (!next)
		next = smp_cond_load_relaxed(&node->next, (VAL));

	arch_mcs_spin_unlock_contended(&next->locked);

release:
	trace_contention_end(lock, 0);

	/*
	 * release the node
	 */
	__this_cpu_dec(rqnodes[0].mcs.count);
	return 0;
}
EXPORT_SYMBOL_GPL(resilient_queued_spin_lock_slowpath);
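
/*
 * A minimal sketch of how a caller might pair a fast path with this
 * slowpath (illustrative only; the actual entry points are provided by
 * asm-generic/rqspinlock.h and may differ):
 *
 *	static __always_inline int res_spin_lock_sketch(rqspinlock_t *lock)
 *	{
 *		int val = 0;
 *
 *		if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL)))
 *			return 0;
 *		return resilient_queued_spin_lock_slowpath(lock, val);
 *	}
 *
 * A zero return means the lock is held; -ETIMEDOUT means the caller must
 * back off and recover rather than assume ownership.
 */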