// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Resilient Queued Spin Lock
 *
 * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
 * (C) Copyright 2013-2014,2018 Red Hat, Inc.
 * (C) Copyright 2015 Intel Corp.
 * (C) Copyright 2015 Hewlett-Packard Enterprise Development LP
 * (C) Copyright 2024-2025 Meta Platforms, Inc. and affiliates.
 *
 * Authors: Waiman Long <longman@redhat.com>
 *          Peter Zijlstra <peterz@infradead.org>
 *          Kumar Kartikeya Dwivedi <memxor@gmail.com>
 */

#include <linux/smp.h>
#include <linux/bug.h>
#include <linux/cpumask.h>
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <linux/mutex.h>
#include <linux/prefetch.h>
#include <asm/byteorder.h>
#ifdef CONFIG_QUEUED_SPINLOCKS
#include <asm/qspinlock.h>
#endif
#include <trace/events/lock.h>
#include <asm/rqspinlock.h>
#include <linux/timekeeping.h>

/*
 * Include queued spinlock definitions and statistics code
 */
#ifdef CONFIG_QUEUED_SPINLOCKS
#include "../locking/qspinlock.h"
#include "../locking/lock_events.h"
#include "rqspinlock.h"
#include "../locking/mcs_spinlock.h"
#endif

/*
 * The basic principle of a queue-based spinlock can best be understood
 * by studying a classic queue-based spinlock implementation called the
 * MCS lock. A copy of the original MCS lock paper ("Algorithms for Scalable
 * Synchronization on Shared-Memory Multiprocessors by Mellor-Crummey and
 * Scott") is available at
 *
 * https://bugzilla.kernel.org/show_bug.cgi?id=206115
 *
 * This queued spinlock implementation is based on the MCS lock, however to
 * make it fit the 4 bytes we assume spinlock_t to be, and preserve its
 * existing API, we must modify it somehow.
 *
 * In particular, where the traditional MCS lock consists of a tail pointer
 * (8 bytes) and needs the next pointer (another 8 bytes) of its own node to
 * unlock the next pending waiter (next->locked), we compress both of these:
 * {tail, next->locked} into a single u32 value.
 *
 * Since a spinlock disables recursion of its own context and there is a limit
 * to the contexts that can nest (namely: task, softirq, hardirq, nmi), there
 * are at most 4 nesting levels, which can be encoded in a 2-bit number. We can
 * then encode the tail by combining the 2-bit nesting level with the cpu
 * number. With one byte for the lock value and 3 bytes for the tail, only a
 * 32-bit word is now needed. Even though we only need 1 bit for the lock,
 * we extend it to a full byte to achieve better performance for architectures
 * that support atomic byte writes.
 *
 * We also change the first spinner to spin on the lock bit instead of its
 * node, thereby avoiding the need to carry a node from lock to unlock, and
 * preserving the existing lock API. This also makes the unlock code simpler
 * and faster.
 *
 * N.B. The current implementation only supports architectures that allow
 * atomic operations on smaller 8-bit and 16-bit data types.
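 *
 * As a rough sketch of the resulting 32-bit word (the _Q_*_OFFSET and
 * _Q_*_BITS definitions are authoritative; the tail split also differs
 * when NR_CPUS >= 16k):
 *
 *  0- 7: locked byte
 *     8: pending bit
 *  9-15: unused
 * 16-17: tail index (nesting level)
 * 18-31: tail cpu (+1)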
 *
 */

struct rqspinlock_timeout {
	u64 timeout_end;
	u64 duration;
	u64 cur;
	u16 spin;
};

#define RES_TIMEOUT_VAL	2

DEFINE_PER_CPU_ALIGNED(struct rqspinlock_held, rqspinlock_held_locks);
EXPORT_SYMBOL_GPL(rqspinlock_held_locks);

static bool is_lock_released(rqspinlock_t *lock, u32 mask, struct rqspinlock_timeout *ts)
{
	if (!(atomic_read_acquire(&lock->val) & (mask)))
		return true;
	return false;
}

static noinline int check_deadlock_AA(rqspinlock_t *lock, u32 mask,
				      struct rqspinlock_timeout *ts)
{
	struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks);
	int cnt = min(RES_NR_HELD, rqh->cnt);

	/*
	 * Return an error if we hold the lock we are attempting to acquire.
	 * We'll iterate over max 32 locks; no need to do is_lock_released.
	 */
	for (int i = 0; i < cnt - 1; i++) {
		if (rqh->locks[i] == lock)
			return -EDEADLK;
	}
	return 0;
}

/*
 * This focuses on the most common case of ABBA deadlocks (or ABBA involving
 * more locks, which reduce to ABBA). This is not exhaustive, and we rely on
 * timeouts as the final line of defense.
 */
static noinline int check_deadlock_ABBA(rqspinlock_t *lock, u32 mask,
					struct rqspinlock_timeout *ts)
{
	struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks);
	int rqh_cnt = min(RES_NR_HELD, rqh->cnt);
	void *remote_lock;
	int cpu;

	/*
	 * Find the CPU holding the lock that we want to acquire. If there is a
	 * deadlock scenario, we will read a stable set on the remote CPU and
	 * find the target. This would be a constant time operation instead of
	 * O(NR_CPUS) if we could determine the owning CPU from a lock value, but
	 * that requires increasing the size of the lock word.
	 */
	for_each_possible_cpu(cpu) {
		struct rqspinlock_held *rqh_cpu = per_cpu_ptr(&rqspinlock_held_locks, cpu);
		int real_cnt = READ_ONCE(rqh_cpu->cnt);
		int cnt = min(RES_NR_HELD, real_cnt);

		/*
		 * Make sure we break out of this loop if the lock becomes
		 * available for us to potentially acquire.
		 */
		if (is_lock_released(lock, mask, ts))
			return 0;

		/*
		 * Skip ourselves, and CPUs whose count is less than 2, as they need at
		 * least one held lock and one acquisition attempt (reflected as the
		 * topmost entry) to participate in an ABBA deadlock.
		 *
		 * If real_cnt is more than RES_NR_HELD, it means the current lock being
		 * acquired won't appear in the table, and other locks in the table are
		 * already held, so we can't determine ABBA.
		 */
		if (cpu == smp_processor_id() || real_cnt < 2 || real_cnt > RES_NR_HELD)
			continue;

		/*
		 * Obtain the entry at the top; this corresponds to the lock the
		 * remote CPU is attempting to acquire in a deadlock situation,
		 * and would be one of the locks we hold on the current CPU.
		 */
		remote_lock = READ_ONCE(rqh_cpu->locks[cnt - 1]);
		/*
		 * If it is NULL, we've raced and cannot determine a deadlock
		 * conclusively; skip this CPU.
		 */
		if (!remote_lock)
			continue;
		/*
		 * Find out if the lock we're attempting to acquire is held by this CPU.
		 * Don't consider the topmost entry, as that must be the latest lock
		 * being held or acquired. For a deadlock, the target CPU must also
		 * attempt to acquire a lock we hold, so for this search only 'cnt - 1'
		 * entries are important.
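		 *
		 * For example, with two hypothetical locks A and B: if we hold
		 * B and are attempting A while the remote CPU holds A and its
		 * topmost entry shows an attempt on B, then A appears in its
		 * held set and B appears as remote_lock in ours, forming the
		 * classic ABBA cycle that the search below detects.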
		 */
		for (int i = 0; i < cnt - 1; i++) {
			if (READ_ONCE(rqh_cpu->locks[i]) != lock)
				continue;
			/*
			 * We found our lock as held on the remote CPU. Is the
			 * acquisition attempt on the remote CPU for a lock held
			 * by us? If so, we have a deadlock situation, and need
			 * to recover.
			 */
			for (int i = 0; i < rqh_cnt - 1; i++) {
				if (rqh->locks[i] == remote_lock)
					return -EDEADLK;
			}
			/*
			 * Inconclusive; retry later.
			 */
			return 0;
		}
	}
	return 0;
}

static noinline int check_deadlock(rqspinlock_t *lock, u32 mask,
				   struct rqspinlock_timeout *ts)
{
	int ret;

	ret = check_deadlock_AA(lock, mask, ts);
	if (ret)
		return ret;
	ret = check_deadlock_ABBA(lock, mask, ts);
	if (ret)
		return ret;

	return 0;
}

static noinline int check_timeout(rqspinlock_t *lock, u32 mask,
				  struct rqspinlock_timeout *ts)
{
	u64 time = ktime_get_mono_fast_ns();
	u64 prev = ts->cur;

	if (!ts->timeout_end) {
		ts->cur = time;
		ts->timeout_end = time + ts->duration;
		return 0;
	}

	if (time > ts->timeout_end)
		return -ETIMEDOUT;

	/*
	 * Has a millisecond interval passed since the last check? If so,
	 * trigger the deadlock checks.
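	 *
	 * This amortizes the cost: the AA/ABBA checks run roughly once per
	 * millisecond of spinning rather than on every iteration of the
	 * wait loop.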
	 */
	if (prev + NSEC_PER_MSEC < time) {
		ts->cur = time;
		return check_deadlock(lock, mask, ts);
	}

	return 0;
}

/*
 * Do not amortize with spins when res_smp_cond_load_acquire is defined,
 * as the macro does internal amortization for us.
 */
#ifndef res_smp_cond_load_acquire
#define RES_CHECK_TIMEOUT(ts, ret, mask)                              \
	({                                                            \
		if (!(ts).spin++)                                     \
			(ret) = check_timeout((lock), (mask), &(ts)); \
		(ret);                                                \
	})
#else
#define RES_CHECK_TIMEOUT(ts, ret, mask)                              \
	({ (ret) = check_timeout((lock), (mask), &(ts)); })
#endif

/*
 * Initialize the 'spin' member to 0 to trigger AA/ABBA checks immediately.
 */
#define RES_INIT_TIMEOUT(ts) ({ (ts).spin = 0; })

/*
 * We only need to reset 'timeout_end'; 'spin' will just wrap around as necessary.
 * Duration is defined for each spin attempt, so set it here.
 */
#define RES_RESET_TIMEOUT(ts, _duration) ({ (ts).timeout_end = 0; (ts).duration = _duration; })

/*
 * Provide a test-and-set fallback for cases when queued spin lock support is
 * absent from the architecture.
 */
int __lockfunc resilient_tas_spin_lock(rqspinlock_t *lock)
{
	struct rqspinlock_timeout ts;
	int val, ret = 0;

	RES_INIT_TIMEOUT(ts);
	grab_held_lock_entry(lock);

	/*
	 * Since the time spent in this waiting loop depends on the amount of
	 * contention, a short timeout like the one used by the rqspinlock
	 * waiting loops isn't enough. Choose a second as the timeout value.
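	 *
	 * Unlike the queued slowpath, where the waiter at the head only has to
	 * outlast the owner (and possibly a pending waiter) ahead of it,
	 * progress here depends on winning the cmpxchg race against every
	 * other contender, hence the more generous budget.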
	 */
	RES_RESET_TIMEOUT(ts, NSEC_PER_SEC);
retry:
	val = atomic_read(&lock->val);

	if (val || !atomic_try_cmpxchg(&lock->val, &val, 1)) {
		if (RES_CHECK_TIMEOUT(ts, ret, ~0u))
			goto out;
		cpu_relax();
		goto retry;
	}

	return 0;
out:
	release_held_lock_entry();
	return ret;
}
EXPORT_SYMBOL_GPL(resilient_tas_spin_lock);

#ifdef CONFIG_QUEUED_SPINLOCKS

/*
 * Per-CPU queue node structures; we can never have more than 4 nested
 * contexts: task, softirq, hardirq, nmi.
 *
 * Exactly fits one 64-byte cacheline on a 64-bit architecture.
 */
static DEFINE_PER_CPU_ALIGNED(struct qnode, rqnodes[_Q_MAX_NODES]);

#ifndef res_smp_cond_load_acquire
#define res_smp_cond_load_acquire(v, c) smp_cond_load_acquire(v, c)
#endif

#define res_atomic_cond_read_acquire(v, c) res_smp_cond_load_acquire(&(v)->counter, (c))

/**
 * resilient_queued_spin_lock_slowpath - acquire the queued spinlock
 * @lock: Pointer to queued spinlock structure
 * @val: Current value of the queued spinlock 32-bit word
 *
 * Return:
 * * 0		- Lock was acquired successfully.
 * * -EDEADLK	- Lock acquisition failed because of AA/ABBA deadlock.
 * * -ETIMEDOUT - Lock acquisition failed because of timeout.
 *
 * (queue tail, pending bit, lock value)
 *
 *              fast     :    slow                                  :    unlock
 *                       :                                          :
 * uncontended  (0,0,0) -:--> (0,0,1) ------------------------------:--> (*,*,0)
 *                       :       | ^--------.------.             /  :
 *                       :       v           \      \            |  :
 * pending               :    (0,1,1) +--> (0,1,0)   \           |  :
 *                       :       | ^--'              |           |  :
 *                       :       v                   |           |  :
 * uncontended           :    (n,x,y) +--> (n,0,0)   --'          |  :
 *   queue               :       | ^--'                           |  :
 *                       :       v                                |  :
 * contended             :    (*,x,y) +--> (*,0,0) ---> (*,0,1) -'  :
 *   queue               :         ^--'                             :
 */
int __lockfunc resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val)
{
	struct mcs_spinlock *prev, *next, *node;
	struct rqspinlock_timeout ts;
	int idx, ret = 0;
	u32 old, tail;

	BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));

	if (resilient_virt_spin_lock_enabled())
		return resilient_virt_spin_lock(lock);

	RES_INIT_TIMEOUT(ts);

	/*
	 * Wait for in-progress pending->locked hand-overs with a bounded
	 * number of spins so that we guarantee forward progress.
	 *
	 * 0,1,0 -> 0,0,1
	 */
	if (val == _Q_PENDING_VAL) {
		int cnt = _Q_PENDING_LOOPS;
		val = atomic_cond_read_relaxed(&lock->val,
					       (VAL != _Q_PENDING_VAL) || !cnt--);
	}

	/*
	 * If we observe any contention; queue.
	 */
	if (val & ~_Q_LOCKED_MASK)
		goto queue;

	/*
	 * trylock || pending
	 *
	 * 0,0,* -> 0,1,* -> 0,0,1 pending, trylock
	 */
	val = queued_fetch_set_pending_acquire(lock);

	/*
	 * If we observe contention, there is a concurrent locker.
	 *
	 * Undo and queue; our setting of PENDING might have made the
	 * n,0,0 -> 0,0,0 transition fail and it will now be waiting
	 * on @next to become !NULL.
	 */
	if (unlikely(val & ~_Q_LOCKED_MASK)) {

		/* Undo PENDING if we set it. */
		if (!(val & _Q_PENDING_MASK))
			clear_pending(lock);

		goto queue;
	}

	/*
	 * Grab an entry in the held locks array, to enable deadlock detection.
	 */
	grab_held_lock_entry(lock);

	/*
	 * We're pending, wait for the owner to go away.
	 *
	 * 0,1,1 -> *,1,0
	 *
	 * this wait loop must be a load-acquire such that we match the
	 * store-release that clears the locked bit and create lock
	 * sequentiality; this is because not all
	 * clear_pending_set_locked() implementations imply full
	 * barriers.
	 */
	if (val & _Q_LOCKED_MASK) {
		RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT);
		res_smp_cond_load_acquire(&lock->locked, !VAL || RES_CHECK_TIMEOUT(ts, ret, _Q_LOCKED_MASK));
	}

	if (ret) {
		/*
		 * We waited for the locked bit to go back to 0, as the pending
		 * waiter, but timed out. We need to clear the pending bit since
		 * we own it. Once a stuck owner has been recovered, the lock
		 * must be restored to a valid state, hence removing the pending
		 * bit is necessary.
		 *
		 * *,1,* -> *,0,*
		 */
		clear_pending(lock);
		lockevent_inc(rqspinlock_lock_timeout);
		goto err_release_entry;
	}

	/*
	 * take ownership and clear the pending bit.
	 *
	 * 0,1,0 -> 0,0,1
	 */
	clear_pending_set_locked(lock);
	lockevent_inc(lock_pending);
	return 0;

	/*
	 * End of pending bit optimistic spinning and beginning of MCS
	 * queuing.
	 */
queue:
	lockevent_inc(lock_slowpath);
	/*
	 * Grab deadlock detection entry for the queue path.
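	 *
	 * The pending-waiter path above grabbed its entry before spinning on
	 * the owner; paths that jump to this label have not published an
	 * acquisition attempt yet, so do it now. It is dropped again on the
	 * error paths below.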
	 */
	grab_held_lock_entry(lock);

	node = this_cpu_ptr(&rqnodes[0].mcs);
	idx = node->count++;
	tail = encode_tail(smp_processor_id(), idx);

	trace_contention_begin(lock, LCB_F_SPIN);

	/*
	 * 4 nodes are allocated based on the assumption that there will
	 * not be nested NMIs taking spinlocks. That may not be true in
	 * some architectures even though the chance of needing more than
	 * 4 nodes will still be extremely unlikely. When that happens,
	 * we fall back to spinning on the lock directly without using
	 * any MCS node. This is not the most elegant solution, but is
	 * simple enough.
	 */
	if (unlikely(idx >= _Q_MAX_NODES)) {
		lockevent_inc(lock_no_node);
		RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT);
		while (!queued_spin_trylock(lock)) {
			if (RES_CHECK_TIMEOUT(ts, ret, ~0u)) {
				lockevent_inc(rqspinlock_lock_timeout);
				goto err_release_node;
			}
			cpu_relax();
		}
		goto release;
	}

	node = grab_mcs_node(node, idx);

	/*
	 * Keep counts of non-zero index values:
	 */
	lockevent_cond_inc(lock_use_node2 + idx - 1, idx);

	/*
	 * Ensure that we increment the head node->count before initialising
	 * the actual node. If the compiler is kind enough to reorder these
	 * stores, then an IRQ could overwrite our assignments.
	 */
	barrier();

	node->locked = 0;
	node->next = NULL;

	/*
	 * We touched a (possibly) cold cacheline in the per-cpu queue node;
	 * attempt the trylock once more in the hope someone let go while we
	 * weren't watching.
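	 *
	 * A successful trylock here skips the MCS handoff entirely; the
	 * release label below still drops the node count taken above.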
	 */
	if (queued_spin_trylock(lock))
		goto release;

	/*
	 * Ensure that the initialisation of @node is complete before we
	 * publish the updated tail via xchg_tail() and potentially link
	 * @node into the waitqueue via WRITE_ONCE(prev->next, node) below.
	 */
	smp_wmb();

	/*
	 * Publish the updated tail.
	 * We have already touched the queueing cacheline; don't bother with
	 * pending stuff.
	 *
	 * p,*,* -> n,*,*
	 */
	old = xchg_tail(lock, tail);
	next = NULL;

	/*
	 * if there was a previous node; link it and wait until reaching the
	 * head of the waitqueue.
	 */
	if (old & _Q_TAIL_MASK) {
		int val;

		prev = decode_tail(old, rqnodes);

		/* Link @node into the waitqueue. */
		WRITE_ONCE(prev->next, node);

		val = arch_mcs_spin_lock_contended(&node->locked);
		if (val == RES_TIMEOUT_VAL) {
			ret = -EDEADLK;
			goto waitq_timeout;
		}

		/*
		 * While waiting for the MCS lock, the next pointer may have
		 * been set by another lock waiter. We optimistically load
		 * the next pointer & prefetch the cacheline for writing
		 * to reduce latency in the upcoming MCS unlock operation.
		 */
		next = READ_ONCE(node->next);
		if (next)
			prefetchw(next);
	}

	/*
	 * we're at the head of the waitqueue, wait for the owner & pending to
	 * go away.
	 *
	 * *,x,y -> *,0,0
	 *
	 * this wait loop must use a load-acquire such that we match the
	 * store-release that clears the locked bit and create lock
	 * sequentiality; this is because the set_locked() function below
	 * does not imply a full barrier.
	 *
	 * We use RES_DEF_TIMEOUT * 2 as the duration, as RES_DEF_TIMEOUT is
	 * meant to span the maximum allowed time per critical section, and we
	 * may have both the owner of the lock and the pending bit waiter ahead
	 * of us.
	 */
	RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT * 2);
	val = res_atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK) ||
					   RES_CHECK_TIMEOUT(ts, ret, _Q_LOCKED_PENDING_MASK));

waitq_timeout:
	if (ret) {
		/*
		 * If the tail is still pointing to us, then we are the final waiter,
		 * and are responsible for resetting the tail back to 0. Otherwise, if
		 * the cmpxchg operation fails, we signal the next waiter to exit and
		 * do the same. For a waiter with tail node 'n':
		 *
		 * n,*,* -> 0,*,*
		 *
		 * When performing cmpxchg for the whole word (NR_CPUS > 16k), it is
		 * possible the locked/pending bits keep changing and we see failures
		 * even when we remain the head of the wait queue. However, eventually,
		 * the pending bit owner will unset the pending bit, and new waiters
		 * will queue behind us. This will leave the lock owner in
		 * charge, and it will eventually either set the locked bit to 0, or
		 * leave it as 1, allowing us to make progress.
		 *
		 * We terminate the whole wait queue for two reasons. Firstly,
		 * we eschew per-waiter timeouts with one applied at the head of
		 * the wait queue. This allows everyone to break out faster
		 * once we've seen the owner / pending waiter not responding for
		 * the timeout duration from the head. Secondly, it avoids
		 * complicated synchronization, because when not leaving in FIFO
		 * order, prev's next pointer needs to be fixed up etc.
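		 *
		 * The queue is then emptied by propagation: if we are not the
		 * last waiter, we write RES_TIMEOUT_VAL into our successor's
		 * node, its arch_mcs_spin_lock_contended() returns that value
		 * (see the check after linking into the queue above), and it
		 * repeats the same teardown for its own successor.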
		 */
		if (!try_cmpxchg_tail(lock, tail, 0)) {
			next = smp_cond_load_relaxed(&node->next, VAL);
			WRITE_ONCE(next->locked, RES_TIMEOUT_VAL);
		}
		lockevent_inc(rqspinlock_lock_timeout);
		goto err_release_node;
	}

	/*
	 * claim the lock:
	 *
	 * n,0,0 -> 0,0,1 : lock, uncontended
	 * *,*,0 -> *,*,1 : lock, contended
	 *
	 * If the queue head is the only one in the queue (lock value == tail)
	 * and nobody is pending, clear the tail code and grab the lock.
	 * Otherwise, we only need to grab the lock.
	 */

	/*
	 * Note: at this point: (val & _Q_PENDING_MASK) == 0, because of the
	 * above wait condition, therefore any concurrent setting of
	 * PENDING will make the uncontended transition fail.
	 */
	if ((val & _Q_TAIL_MASK) == tail) {
		if (atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL))
			goto release; /* No contention */
	}

	/*
	 * Either somebody is queued behind us or _Q_PENDING_VAL got set
	 * which will then detect the remaining tail and queue behind us
	 * ensuring we'll see a @next.
	 */
	set_locked(lock);

	/*
	 * contended path; wait for next if not observed yet, release.
	 */
	if (!next)
		next = smp_cond_load_relaxed(&node->next, (VAL));

	arch_mcs_spin_unlock_contended(&next->locked);

release:
	trace_contention_end(lock, 0);

	/*
	 * release the node
	 */
	__this_cpu_dec(rqnodes[0].mcs.count);
	return ret;
err_release_node:
	trace_contention_end(lock, ret);
	__this_cpu_dec(rqnodes[0].mcs.count);
err_release_entry:
	release_held_lock_entry();
	return ret;
}
EXPORT_SYMBOL_GPL(resilient_queued_spin_lock_slowpath);

#endif /* CONFIG_QUEUED_SPINLOCKS */