// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Resilient Queued Spin Lock
 *
 * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
 * (C) Copyright 2013-2014,2018 Red Hat, Inc.
 * (C) Copyright 2015 Intel Corp.
 * (C) Copyright 2015 Hewlett-Packard Enterprise Development LP
 * (C) Copyright 2024-2025 Meta Platforms, Inc. and affiliates.
 *
 * Authors: Waiman Long <longman@redhat.com>
 *          Peter Zijlstra <peterz@infradead.org>
 *          Kumar Kartikeya Dwivedi <memxor@gmail.com>
 */

#include <linux/smp.h>
#include <linux/bug.h>
#include <linux/cpumask.h>
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <linux/mutex.h>
#include <linux/prefetch.h>
#include <asm/byteorder.h>
#include <asm/qspinlock.h>
#include <trace/events/lock.h>
#include <asm/rqspinlock.h>
#include <linux/timekeeping.h>

/*
 * Include queued spinlock definitions and statistics code
 */
#include "../locking/qspinlock.h"
#include "../locking/lock_events.h"

/*
 * The basic principle of a queue-based spinlock can best be understood
 * by studying a classic queue-based spinlock implementation called the
 * MCS lock. A copy of the original MCS lock paper ("Algorithms for Scalable
 * Synchronization on Shared-Memory Multiprocessors" by Mellor-Crummey and
 * Scott) is available at
 *
 * https://bugzilla.kernel.org/show_bug.cgi?id=206115
 *
 * This queued spinlock implementation is based on the MCS lock; however, to
 * make it fit the 4 bytes we assume spinlock_t to be, and to preserve its
 * existing API, we must modify it somehow.
 *
 * In particular, where the traditional MCS lock consists of a tail pointer
 * (8 bytes) and needs the next pointer (another 8 bytes) of its own node to
 * unlock the next pending waiter (next->locked), we compress both of these,
 * {tail, next->locked}, into a single u32 value.
 *
 * A spinlock disables recursion of its own context, and there is a limit to
 * the contexts that can nest: task, softirq, hardirq and nmi. As there are
 * at most 4 nesting levels, the nesting level can be encoded in a 2-bit
 * number. Now we can encode the tail by combining the 2-bit nesting level
 * with the cpu number. With one byte for the lock value and 3 bytes for the
 * tail, only a 32-bit word is now needed. Even though we only need 1 bit for
 * the lock, we extend it to a full byte to achieve better performance for
 * architectures that support atomic byte writes.
 *
 * We also change the first spinner to spin on the lock bit instead of its
 * node, thereby avoiding the need to carry a node from lock to unlock, and
 * preserving the existing lock API. This also makes the unlock code simpler
 * and faster.
 *
 * N.B. The current implementation only supports architectures that allow
 *      atomic operations on smaller 8-bit and 16-bit data types.
 */
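
/*
 * For illustration only (a sketch, not definitions used by this file; see
 * include/asm-generic/qspinlock_types.h for the authoritative bitfields),
 * the 32-bit lock word described above is laid out as follows when
 * NR_CPUS < 16K:
 *
 *	 0- 7: locked byte
 *	    8: pending
 *	 9-15: not used
 *	16-17: tail index (2-bit nesting level)
 *	18-31: tail cpu (+1)
 *
 * and the tail is packed roughly as encode_tail() in ../locking/qspinlock.h
 * does:
 *
 *	tail  = (cpu + 1) << _Q_TAIL_CPU_OFFSET;
 *	tail |= idx << _Q_TAIL_IDX_OFFSET;	(with idx < _Q_MAX_NODES)
 */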

#include "../locking/mcs_spinlock.h"

struct rqspinlock_timeout {
	u64 timeout_end;
	u64 duration;
	u16 spin;
};

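/*
 * The deadline is armed lazily: the first call to check_timeout() after
 * RES_RESET_TIMEOUT() (i.e. while timeout_end is still 0) records
 * time + duration as the deadline and returns 0; only later calls can
 * return -ETIMEDOUT, once the clock has moved past that deadline.
 */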
static noinline int check_timeout(struct rqspinlock_timeout *ts)
{
	u64 time = ktime_get_mono_fast_ns();

	if (!ts->timeout_end) {
		ts->timeout_end = time + ts->duration;
		return 0;
	}

	if (time > ts->timeout_end)
		return -ETIMEDOUT;

	return 0;
}

#define RES_CHECK_TIMEOUT(ts, ret)                    \
	({                                            \
		if (!(ts).spin++)                     \
			(ret) = check_timeout(&(ts)); \
		(ret);                                \
	})

/*
 * Initialize the 'spin' member.
 */
#define RES_INIT_TIMEOUT(ts) ({ (ts).spin = 1; })

/*
 * We only need to reset 'timeout_end'; 'spin' will just wrap around as necessary.
 * Duration is defined for each spin attempt, so set it here.
 */
#define RES_RESET_TIMEOUT(ts, _duration) ({ (ts).timeout_end = 0; (ts).duration = _duration; })
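
/*
 * Because 'spin' is a u16, RES_CHECK_TIMEOUT() amortizes the cost of reading
 * the clock: check_timeout() only runs on every 65536th invocation, when the
 * counter wraps back to zero.
 *
 * An illustrative sketch of how a waiter might combine these helpers (the
 * slowpath below only wires up RES_INIT_TIMEOUT() so far, and the duration
 * used here is an arbitrary example, not a constant defined in this file):
 *
 *	struct rqspinlock_timeout ts;
 *	int ret = 0;
 *
 *	RES_INIT_TIMEOUT(ts);
 *	RES_RESET_TIMEOUT(ts, NSEC_PER_SEC / 4);
 *	smp_cond_load_acquire(&lock->locked,
 *			      !VAL || RES_CHECK_TIMEOUT(ts, ret));
 *	if (ret == -ETIMEDOUT)
 *		... give up waiting instead of spinning forever ...
 */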

/*
 * Per-CPU queue node structures; we can never have more than 4 nested
 * contexts: task, softirq, hardirq, nmi.
 *
 * Exactly fits one 64-byte cacheline on a 64-bit architecture.
 */
static DEFINE_PER_CPU_ALIGNED(struct qnode, rqnodes[_Q_MAX_NODES]);
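
/*
 * The cacheline claim above follows from the node layout: struct qnode wraps
 * struct mcs_spinlock (../locking/mcs_spinlock.h), which on a 64-bit machine
 * is
 *
 *	struct mcs_spinlock {
 *		struct mcs_spinlock *next;	(8 bytes)
 *		int locked;			(4 bytes)
 *		int count;			(4 bytes)
 *	};
 *
 * so the _Q_MAX_NODES (4) nodes of 16 bytes each add up to 64 bytes, assuming
 * struct qnode adds no extra padding.
 */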

/**
 * resilient_queued_spin_lock_slowpath - acquire the queued spinlock
 * @lock: Pointer to queued spinlock structure
 * @val: Current value of the queued spinlock 32-bit word
 *
 * (queue tail, pending bit, lock value)
 *
 *              fast     :    slow                                  :    unlock
 *                       :                                          :
 * uncontended  (0,0,0) -:--> (0,0,1) ------------------------------:--> (*,*,0)
 *                       :       | ^--------.------.             /  :
 *                       :       v           \      \            |  :
 * pending               :    (0,1,1) +--> (0,1,0)   \           |  :
 *                       :       | ^--'              |           |  :
 *                       :       v                   |           |  :
 * uncontended           :    (n,x,y) +--> (n,0,0) --'           |  :
 *   queue               :       | ^--'                          |  :
 *                       :       v                               |  :
 * contended             :    (*,x,y) +--> (*,0,0) ---> (*,0,1) -'  :
 *   queue               :         ^--'                             :
 */
void __lockfunc resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val)
{
	struct mcs_spinlock *prev, *next, *node;
	struct rqspinlock_timeout ts;
	u32 old, tail;
	int idx;

	BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));

	RES_INIT_TIMEOUT(ts);

	/*
	 * Wait for in-progress pending->locked hand-overs with a bounded
	 * number of spins so that we guarantee forward progress.
	 *
	 * 0,1,0 -> 0,0,1
	 */
	if (val == _Q_PENDING_VAL) {
		int cnt = _Q_PENDING_LOOPS;
		val = atomic_cond_read_relaxed(&lock->val,
					       (VAL != _Q_PENDING_VAL) || !cnt--);
	}

	/*
	 * If we observe any contention; queue.
	 */
	if (val & ~_Q_LOCKED_MASK)
		goto queue;

	/*
	 * trylock || pending
	 *
	 * 0,0,* -> 0,1,* -> 0,0,1 pending, trylock
	 */
	val = queued_fetch_set_pending_acquire(lock);

	/*
	 * If we observe contention, there is a concurrent locker.
	 *
	 * Undo and queue; our setting of PENDING might have made the
	 * n,0,0 -> 0,0,0 transition fail and it will now be waiting
	 * on @next to become !NULL.
	 */
	if (unlikely(val & ~_Q_LOCKED_MASK)) {

		/* Undo PENDING if we set it. */
		if (!(val & _Q_PENDING_MASK))
			clear_pending(lock);

		goto queue;
	}

	/*
	 * We're pending, wait for the owner to go away.
	 *
	 * 0,1,1 -> *,1,0
	 *
	 * this wait loop must be a load-acquire such that we match the
	 * store-release that clears the locked bit and create lock
	 * sequentiality; this is because not all
	 * clear_pending_set_locked() implementations imply full
	 * barriers.
	 */
	if (val & _Q_LOCKED_MASK)
		smp_cond_load_acquire(&lock->locked, !VAL);

	/*
	 * take ownership and clear the pending bit.
	 *
	 * 0,1,0 -> 0,0,1
	 */
	clear_pending_set_locked(lock);
	lockevent_inc(lock_pending);
	return;

	/*
	 * End of pending bit optimistic spinning and beginning of MCS
	 * queuing.
	 */
queue:
	lockevent_inc(lock_slowpath);
	node = this_cpu_ptr(&rqnodes[0].mcs);
	idx = node->count++;
	tail = encode_tail(smp_processor_id(), idx);

	trace_contention_begin(lock, LCB_F_SPIN);

	/*
	 * 4 nodes are allocated based on the assumption that there will
	 * not be nested NMIs taking spinlocks. That may not be true in
	 * some architectures even though the chance of needing more than
	 * 4 nodes will still be extremely unlikely. When that happens,
	 * we fall back to spinning on the lock directly without using
	 * any MCS node. This is not the most elegant solution, but is
	 * simple enough.
	 */
	if (unlikely(idx >= _Q_MAX_NODES)) {
		lockevent_inc(lock_no_node);
		while (!queued_spin_trylock(lock))
			cpu_relax();
		goto release;
	}

	node = grab_mcs_node(node, idx);

	/*
	 * Keep counts of non-zero index values:
	 */
	lockevent_cond_inc(lock_use_node2 + idx - 1, idx);

	/*
	 * Ensure that we increment the head node->count before initialising
	 * the actual node. If the compiler is kind enough to reorder these
	 * stores, then an IRQ could overwrite our assignments.
	 */
	barrier();

	node->locked = 0;
	node->next = NULL;

	/*
	 * We touched a (possibly) cold cacheline in the per-cpu queue node;
	 * attempt the trylock once more in the hope someone let go while we
	 * weren't watching.
	 */
	if (queued_spin_trylock(lock))
		goto release;

	/*
	 * Ensure that the initialisation of @node is complete before we
	 * publish the updated tail via xchg_tail() and potentially link
	 * @node into the waitqueue via WRITE_ONCE(prev->next, node) below.
	 */
	smp_wmb();

	/*
	 * Publish the updated tail.
	 * We have already touched the queueing cacheline; don't bother with
	 * pending stuff.
	 *
	 * p,*,* -> n,*,*
	 */
	old = xchg_tail(lock, tail);
	next = NULL;

	/*
	 * if there was a previous node; link it and wait until reaching the
	 * head of the waitqueue.
	 */
	if (old & _Q_TAIL_MASK) {
		prev = decode_tail(old, rqnodes);

		/* Link @node into the waitqueue. */
		WRITE_ONCE(prev->next, node);

		arch_mcs_spin_lock_contended(&node->locked);

		/*
		 * While waiting for the MCS lock, the next pointer may have
		 * been set by another lock waiter. We optimistically load
		 * the next pointer & prefetch the cacheline for writing
		 * to reduce latency in the upcoming MCS unlock operation.
		 */
		next = READ_ONCE(node->next);
		if (next)
			prefetchw(next);
	}


	/*
	 * we're at the head of the waitqueue, wait for the owner & pending to
	 * go away.
	 *
	 * *,x,y -> *,0,0
	 *
	 * this wait loop must use a load-acquire such that we match the
	 * store-release that clears the locked bit and create lock
	 * sequentiality; this is because the set_locked() function below
	 * does not imply a full barrier.
	 */
	val = atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK));

	/*
	 * claim the lock:
	 *
	 * n,0,0 -> 0,0,1 : lock, uncontended
	 * *,*,0 -> *,*,1 : lock, contended
	 *
	 * If the queue head is the only one in the queue (lock value == tail)
	 * and nobody is pending, clear the tail code and grab the lock.
	 * Otherwise, we only need to grab the lock.
	 */

	/*
	 * Note: at this point: (val & _Q_PENDING_MASK) == 0, because of the
	 *       above wait condition, therefore any concurrent setting of
	 *       PENDING will make the uncontended transition fail.
	 */
	if ((val & _Q_TAIL_MASK) == tail) {
		if (atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL))
			goto release; /* No contention */
	}

	/*
	 * Either somebody is queued behind us or _Q_PENDING_VAL got set
	 * which will then detect the remaining tail and queue behind us
	 * ensuring we'll see a @next.
	 */
	set_locked(lock);

	/*
	 * contended path; wait for next if not observed yet, release.
	 */
	if (!next)
		next = smp_cond_load_relaxed(&node->next, (VAL));

	arch_mcs_spin_unlock_contended(&next->locked);

release:
	trace_contention_end(lock, 0);

	/*
	 * release the node
	 */
	__this_cpu_dec(rqnodes[0].mcs.count);
}
EXPORT_SYMBOL_GPL(resilient_queued_spin_lock_slowpath);