// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Resilient Queued Spin Lock
 *
 * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
 * (C) Copyright 2013-2014,2018 Red Hat, Inc.
 * (C) Copyright 2015 Intel Corp.
 * (C) Copyright 2015 Hewlett-Packard Enterprise Development LP
 * (C) Copyright 2024-2025 Meta Platforms, Inc. and affiliates.
 *
 * Authors: Waiman Long <longman@redhat.com>
 *          Peter Zijlstra <peterz@infradead.org>
 *          Kumar Kartikeya Dwivedi <memxor@gmail.com>
 */

#include <linux/smp.h>
#include <linux/bug.h>
#include <linux/bpf.h>
#include <linux/err.h>
#include <linux/cpumask.h>
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <linux/mutex.h>
#include <linux/prefetch.h>
#include <asm/byteorder.h>
#ifdef CONFIG_QUEUED_SPINLOCKS
#include <asm/qspinlock.h>
#endif
#include <trace/events/lock.h>
#include <asm/rqspinlock.h>
#include <linux/timekeeping.h>

/*
 * Include queued spinlock definitions and statistics code
 */
#ifdef CONFIG_QUEUED_SPINLOCKS
#include "../locking/qspinlock.h"
#include "../locking/lock_events.h"
#include "rqspinlock.h"
#include "../locking/mcs_spinlock.h"
#endif

/*
 * The basic principle of a queue-based spinlock can best be understood
 * by studying a classic queue-based spinlock implementation called the
 * MCS lock. A copy of the original MCS lock paper ("Algorithms for Scalable
 * Synchronization on Shared-Memory Multiprocessors" by Mellor-Crummey and
 * Scott) is available at
 *
 * https://bugzilla.kernel.org/show_bug.cgi?id=206115
 *
 * This queued spinlock implementation is based on the MCS lock; however, to
 * make it fit the 4 bytes we assume spinlock_t to be, and to preserve its
 * existing API, we must modify it somehow.
 *
 * In particular, where the traditional MCS lock consists of a tail pointer
 * (8 bytes) and needs the next pointer (another 8 bytes) of its own node to
 * unlock the next pending (next->locked), we compress both these: {tail,
 * next->locked} into a single u32 value.
 *
 * A spinlock disables recursion of its own context, and there is a limit
 * to the contexts that can nest; namely: task, softirq, hardirq, and nmi.
 * As there are at most 4 nesting levels, the nesting level can be encoded
 * by a 2-bit number. Now we can encode the tail by combining the 2-bit
 * nesting level with the cpu number. With one byte for the lock value and
 * 3 bytes for the tail, only a 32-bit word is now needed. Even though we
 * only need 1 bit for the lock, we extend it to a full byte to achieve
 * better performance for architectures that support atomic byte write.
 *
 * We also change the first spinner to spin on the lock bit instead of its
 * node; thereby avoiding the need to carry a node from lock to unlock, and
 * preserving the existing lock API. This also makes the unlock code simpler
 * and faster.
 *
 * N.B. The current implementation only supports architectures that allow
 * atomic operations on smaller 8-bit and 16-bit data types.
 *
 */
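
/*
 * For reference, the resulting 32-bit lock word is laid out as described in
 * include/asm-generic/qspinlock_types.h:
 *
 *  0- 7: locked byte
 *     8: pending
 *  9-15: not used (when NR_CPUS < 16K)
 * 16-17: tail index
 * 18-31: tail cpu (+1)
 *
 * When NR_CPUS >= 16K the tail needs more room: bit 8 is still pending,
 * bits 9-10 hold the tail index, and bits 11-31 hold the tail cpu (+1),
 * so the pending field shrinks from a byte to a single bit.
 */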

struct rqspinlock_timeout {
        u64 timeout_end;
        u64 duration;
        u64 cur;
        u16 spin;
};

#define RES_TIMEOUT_VAL 2

DEFINE_PER_CPU_ALIGNED(struct rqspinlock_held, rqspinlock_held_locks);
EXPORT_SYMBOL_GPL(rqspinlock_held_locks);

static bool is_lock_released(rqspinlock_t *lock, u32 mask, struct rqspinlock_timeout *ts)
{
        if (!(atomic_read_acquire(&lock->val) & (mask)))
                return true;
        return false;
}

static noinline int check_deadlock_AA(rqspinlock_t *lock, u32 mask,
                                      struct rqspinlock_timeout *ts)
{
        struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks);
        int cnt = min(RES_NR_HELD, rqh->cnt);

        /*
         * Return an error if we hold the lock we are attempting to acquire.
         * We'll iterate over max 32 locks; no need to do is_lock_released.
         */
        for (int i = 0; i < cnt - 1; i++) {
                if (rqh->locks[i] == lock)
                        return -EDEADLK;
        }
        return 0;
}
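
/*
 * For example, a simple AA deadlock: if a program on this CPU already holds
 * lock L and then attempts to acquire L again (say, because a tracing program
 * interrupted the critical section and tries to take the same lock), the scan
 * above finds L in the per-CPU held-locks table and the acquisition fails
 * with -EDEADLK instead of spinning forever.
 */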

/*
 * This focuses on the most common case of ABBA deadlocks (or ABBA involving
 * more locks, which reduce to ABBA). This is not exhaustive, and we rely on
 * timeouts as the final line of defense.
 */
static noinline int check_deadlock_ABBA(rqspinlock_t *lock, u32 mask,
                                        struct rqspinlock_timeout *ts)
{
        struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks);
        int rqh_cnt = min(RES_NR_HELD, rqh->cnt);
        void *remote_lock;
        int cpu;

        /*
         * Find the CPU holding the lock that we want to acquire. If there is a
         * deadlock scenario, we will read a stable set on the remote CPU and
         * find the target. This would be a constant time operation instead of
         * O(NR_CPUS) if we could determine the owning CPU from a lock value, but
         * that requires increasing the size of the lock word.
         */
        for_each_possible_cpu(cpu) {
                struct rqspinlock_held *rqh_cpu = per_cpu_ptr(&rqspinlock_held_locks, cpu);
                int real_cnt = READ_ONCE(rqh_cpu->cnt);
                int cnt = min(RES_NR_HELD, real_cnt);

                /*
                 * Break out of this loop if the lock becomes available for us
                 * to potentially acquire.
                 */
                if (is_lock_released(lock, mask, ts))
                        return 0;

                /*
                 * Skip ourselves, and CPUs whose count is less than 2, as they need at
                 * least one held lock and one acquisition attempt (reflected as the
                 * topmost entry) to participate in an ABBA deadlock.
                 *
                 * If cnt is more than RES_NR_HELD, it means the current lock being
                 * acquired won't appear in the table, and other locks in the table are
                 * already held, so we can't determine ABBA.
                 */
                if (cpu == smp_processor_id() || real_cnt < 2 || real_cnt > RES_NR_HELD)
                        continue;

                /*
                 * Obtain the entry at the top; this corresponds to the lock the
                 * remote CPU is attempting to acquire in a deadlock situation,
                 * and would be one of the locks we hold on the current CPU.
                 */
                remote_lock = READ_ONCE(rqh_cpu->locks[cnt - 1]);
                /*
                 * If it is NULL, we've raced and cannot determine a deadlock
                 * conclusively, skip this CPU.
                 */
                if (!remote_lock)
                        continue;
                /*
                 * Find out if the lock we're attempting to acquire is held by this CPU.
                 * Don't consider the topmost entry, as that must be the latest lock
                 * being held or acquired. For a deadlock, the target CPU must also
                 * attempt to acquire a lock we hold, so for this search only 'cnt - 1'
                 * entries are important.
                 */
                for (int i = 0; i < cnt - 1; i++) {
                        if (READ_ONCE(rqh_cpu->locks[i]) != lock)
                                continue;
                        /*
                         * We found our lock as held on the remote CPU. Is the
                         * acquisition attempt on the remote CPU for a lock held
                         * by us? If so, we have a deadlock situation, and need
                         * to recover.
                         */
                        for (int i = 0; i < rqh_cnt - 1; i++) {
                                if (rqh->locks[i] == remote_lock)
                                        return -EDEADLK;
                        }
                        /*
                         * Inconclusive; retry again later.
                         */
                        return 0;
                }
        }
        return 0;
}
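
/*
 * Illustration of the ABBA case handled above: CPU 0 holds lock A and is
 * attempting lock B, while CPU 1 holds lock B and is attempting lock A.
 * CPU 0's held-locks table then reads [A, B] (B being the topmost,
 * in-progress acquisition) and CPU 1's reads [B, A]. When CPU 0 runs the
 * scan above, it reads A as CPU 1's topmost entry, finds its own target
 * lock B among CPU 1's held entries, and then finds A among its own held
 * locks, so it returns -EDEADLK and backs off instead of both CPUs
 * spinning on each other forever.
 */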

static noinline int check_deadlock(rqspinlock_t *lock, u32 mask,
                                   struct rqspinlock_timeout *ts)
{
        int ret;

        ret = check_deadlock_AA(lock, mask, ts);
        if (ret)
                return ret;
        ret = check_deadlock_ABBA(lock, mask, ts);
        if (ret)
                return ret;

        return 0;
}

static noinline int check_timeout(rqspinlock_t *lock, u32 mask,
                                  struct rqspinlock_timeout *ts)
{
        u64 time = ktime_get_mono_fast_ns();
        u64 prev = ts->cur;

        if (!ts->timeout_end) {
                ts->cur = time;
                ts->timeout_end = time + ts->duration;
                return 0;
        }

        if (time > ts->timeout_end)
                return -ETIMEDOUT;

        /*
         * A millisecond interval passed from last time? Trigger deadlock
         * checks.
         */
        if (prev + NSEC_PER_MSEC < time) {
                ts->cur = time;
                return check_deadlock(lock, mask, ts);
        }

        return 0;
}

/*
 * Do not amortize with spins when res_smp_cond_load_acquire is defined,
 * as the macro does internal amortization for us.
 */
#ifndef res_smp_cond_load_acquire
#define RES_CHECK_TIMEOUT(ts, ret, mask)                              \
        ({                                                            \
                if (!(ts).spin++)                                     \
                        (ret) = check_timeout((lock), (mask), &(ts)); \
                (ret);                                                \
        })
#else
#define RES_CHECK_TIMEOUT(ts, ret, mask)                              \
        ({ (ret) = check_timeout((lock), (mask), &(ts)); })
#endif

/*
 * Initialize the 'spin' member.
 * Set spin member to 0 to trigger AA/ABBA checks immediately.
 */
#define RES_INIT_TIMEOUT(ts) ({ (ts).spin = 0; })

/*
 * We only need to reset 'timeout_end'; 'spin' will just wrap around as necessary.
 * Duration is defined for each spin attempt, so set it here.
 */
#define RES_RESET_TIMEOUT(ts, _duration) ({ (ts).timeout_end = 0; (ts).duration = _duration; })
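
/*
 * Note that 'spin' is a u16, so in the generic branch above the
 * !(ts).spin++ check makes check_timeout() run on the very first
 * invocation (RES_INIT_TIMEOUT() starts 'spin' at 0) and then once every
 * 2^16 spins, keeping the ktime read and the deadlock checks off the hot
 * spin path.
 */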

/*
 * Provide a test-and-set fallback for cases when queued spin lock support is
 * absent from the architecture.
 */
int __lockfunc resilient_tas_spin_lock(rqspinlock_t *lock)
{
        struct rqspinlock_timeout ts;
        int val, ret = 0;

        RES_INIT_TIMEOUT(ts);
        grab_held_lock_entry(lock);

        /*
         * Since the time spent in this waiting loop depends on the amount of
         * contention, a short timeout (unlike the rqspinlock waiting loops)
         * isn't enough. Choose a second as the timeout value.
         */
        RES_RESET_TIMEOUT(ts, NSEC_PER_SEC);
retry:
        val = atomic_read(&lock->val);

        if (val || !atomic_try_cmpxchg(&lock->val, &val, 1)) {
                if (RES_CHECK_TIMEOUT(ts, ret, ~0u))
                        goto out;
                cpu_relax();
                goto retry;
        }

        return 0;
out:
        release_held_lock_entry();
        return ret;
}
EXPORT_SYMBOL_GPL(resilient_tas_spin_lock);

#ifdef CONFIG_QUEUED_SPINLOCKS

/*
 * Per-CPU queue node structures; we can never have more than 4 nested
 * contexts: task, softirq, hardirq, nmi.
 *
 * Exactly fits one 64-byte cacheline on a 64-bit architecture.
 */
static DEFINE_PER_CPU_ALIGNED(struct qnode, rqnodes[_Q_MAX_NODES]);

#ifndef res_smp_cond_load_acquire
#define res_smp_cond_load_acquire(v, c) smp_cond_load_acquire(v, c)
#endif

#define res_atomic_cond_read_acquire(v, c) res_smp_cond_load_acquire(&(v)->counter, (c))

/**
 * resilient_queued_spin_lock_slowpath - acquire the queued spinlock
 * @lock: Pointer to queued spinlock structure
 * @val: Current value of the queued spinlock 32-bit word
 *
 * Return:
 * * 0          - Lock was acquired successfully.
 * * -EDEADLK   - Lock acquisition failed because of AA/ABBA deadlock.
 * * -ETIMEDOUT - Lock acquisition failed because of timeout.
 *
 * (queue tail, pending bit, lock value)
 *
 *              fast     :    slow                                  :    unlock
 *                       :                                          :
 * uncontended  (0,0,0) -:--> (0,0,1) ------------------------------:--> (*,*,0)
 *                       :       | ^--------.------.             /  :
 *                       :       v           \      \            |  :
 * pending               :    (0,1,1) +--> (0,1,0)   \            |  :
 *                       :       | ^--'              |            |  :
 *                       :       v                   |            |  :
 * uncontended           :    (n,x,y) +--> (n,0,0) --'             |  :
 *   queue               :       | ^--'                            |  :
 *                       :       v                                 |  :
 * contended             :    (*,x,y) +--> (*,0,0) ---> (*,0,1) -'  :
 *   queue               :         ^--'                             :
 */
int __lockfunc resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val)
{
        struct mcs_spinlock *prev, *next, *node;
        struct rqspinlock_timeout ts;
        int idx, ret = 0;
        u32 old, tail;

        BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));

        if (resilient_virt_spin_lock_enabled())
                return resilient_virt_spin_lock(lock);

        RES_INIT_TIMEOUT(ts);

        /*
         * Wait for in-progress pending->locked hand-overs with a bounded
         * number of spins so that we guarantee forward progress.
         *
         * 0,1,0 -> 0,0,1
         */
        if (val == _Q_PENDING_VAL) {
                int cnt = _Q_PENDING_LOOPS;
                val = atomic_cond_read_relaxed(&lock->val,
                                               (VAL != _Q_PENDING_VAL) || !cnt--);
        }

        /*
         * If we observe any contention; queue.
         */
        if (val & ~_Q_LOCKED_MASK)
                goto queue;

        /*
         * trylock || pending
         *
         * 0,0,* -> 0,1,* -> 0,0,1 pending, trylock
         */
        val = queued_fetch_set_pending_acquire(lock);

        /*
         * If we observe contention, there is a concurrent locker.
         *
         * Undo and queue; our setting of PENDING might have made the
         * n,0,0 -> 0,0,0 transition fail and it will now be waiting
         * on @next to become !NULL.
         */
        if (unlikely(val & ~_Q_LOCKED_MASK)) {

                /* Undo PENDING if we set it. */
                if (!(val & _Q_PENDING_MASK))
                        clear_pending(lock);

                goto queue;
        }

        /*
         * Grab an entry in the held locks array, to enable deadlock detection.
         */
        grab_held_lock_entry(lock);

        /*
         * We're pending, wait for the owner to go away.
         *
         * 0,1,1 -> *,1,0
         *
         * this wait loop must be a load-acquire such that we match the
         * store-release that clears the locked bit and create lock
         * sequentiality; this is because not all
         * clear_pending_set_locked() implementations imply full
         * barriers.
         */
        if (val & _Q_LOCKED_MASK) {
                RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT);
                res_smp_cond_load_acquire(&lock->locked, !VAL || RES_CHECK_TIMEOUT(ts, ret, _Q_LOCKED_MASK));
        }

        if (ret) {
                /*
                 * We waited for the locked bit to go back to 0, as the pending
                 * waiter, but timed out. We need to clear the pending bit since
                 * we own it. Once a stuck owner has been recovered, the lock
                 * must be restored to a valid state, hence removing the pending
                 * bit is necessary.
                 *
                 * *,1,* -> *,0,*
                 */
                clear_pending(lock);
                lockevent_inc(rqspinlock_lock_timeout);
                goto err_release_entry;
        }

        /*
         * take ownership and clear the pending bit.
         *
         * 0,1,0 -> 0,0,1
         */
        clear_pending_set_locked(lock);
        lockevent_inc(lock_pending);
        return 0;
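
        /*
         * Note that, unlike the kernel's queued_spin_lock_slowpath(), the
         * pending waiter above bounds its spin with RES_DEF_TIMEOUT and
         * periodically runs the AA/ABBA deadlock checks via
         * RES_CHECK_TIMEOUT(), so a stuck or deadlocked lock owner results
         * in an error return instead of an indefinite spin.
         */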

        /*
         * End of pending bit optimistic spinning and beginning of MCS
         * queuing.
         */
queue:
        lockevent_inc(lock_slowpath);
        /*
         * Grab deadlock detection entry for the queue path.
         */
        grab_held_lock_entry(lock);

        node = this_cpu_ptr(&rqnodes[0].mcs);
        idx = node->count++;
        tail = encode_tail(smp_processor_id(), idx);

        trace_contention_begin(lock, LCB_F_SPIN);

        /*
         * 4 nodes are allocated based on the assumption that there will
         * not be nested NMIs taking spinlocks. That may not be true in
         * some architectures even though the chance of needing more than
         * 4 nodes will still be extremely unlikely. When that happens,
         * we fall back to spinning on the lock directly without using
         * any MCS node. This is not the most elegant solution, but is
         * simple enough.
         */
        if (unlikely(idx >= _Q_MAX_NODES)) {
                lockevent_inc(lock_no_node);
                RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT);
                while (!queued_spin_trylock(lock)) {
                        if (RES_CHECK_TIMEOUT(ts, ret, ~0u)) {
                                lockevent_inc(rqspinlock_lock_timeout);
                                goto err_release_node;
                        }
                        cpu_relax();
                }
                goto release;
        }

        node = grab_mcs_node(node, idx);

        /*
         * Keep counts of non-zero index values:
         */
        lockevent_cond_inc(lock_use_node2 + idx - 1, idx);

        /*
         * Ensure that we increment the head node->count before initialising
         * the actual node. If the compiler is kind enough to reorder these
         * stores, then an IRQ could overwrite our assignments.
         */
        barrier();

        node->locked = 0;
        node->next = NULL;

        /*
         * We touched a (possibly) cold cacheline in the per-cpu queue node;
         * attempt the trylock once more in the hope someone let go while we
         * weren't watching.
         */
        if (queued_spin_trylock(lock))
                goto release;

        /*
         * Ensure that the initialisation of @node is complete before we
         * publish the updated tail via xchg_tail() and potentially link
         * @node into the waitqueue via WRITE_ONCE(prev->next, node) below.
         */
        smp_wmb();

        /*
         * Publish the updated tail.
         * We have already touched the queueing cacheline; don't bother with
         * pending stuff.
         *
         * p,*,* -> n,*,*
         */
        old = xchg_tail(lock, tail);
        next = NULL;

        /*
         * if there was a previous node; link it and wait until reaching the
         * head of the waitqueue.
         */
        if (old & _Q_TAIL_MASK) {
                int val;

                prev = decode_tail(old, rqnodes);

                /* Link @node into the waitqueue. */
                WRITE_ONCE(prev->next, node);

                val = arch_mcs_spin_lock_contended(&node->locked);
                if (val == RES_TIMEOUT_VAL) {
                        ret = -EDEADLK;
                        goto waitq_timeout;
                }

                /*
                 * While waiting for the MCS lock, the next pointer may have
                 * been set by another lock waiter. We optimistically load
                 * the next pointer & prefetch the cacheline for writing
                 * to reduce latency in the upcoming MCS unlock operation.
                 */
                next = READ_ONCE(node->next);
                if (next)
                        prefetchw(next);
        }

        /*
         * we're at the head of the waitqueue, wait for the owner & pending to
         * go away.
         *
         * *,x,y -> *,0,0
         *
         * this wait loop must use a load-acquire such that we match the
         * store-release that clears the locked bit and create lock
         * sequentiality; this is because the set_locked() function below
         * does not imply a full barrier.
         *
         * We use RES_DEF_TIMEOUT * 2 as the duration, as RES_DEF_TIMEOUT is
         * meant to span maximum allowed time per critical section, and we may
         * have both the owner of the lock and the pending bit waiter ahead of
         * us.
         */
        RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT * 2);
        val = res_atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK) ||
                                           RES_CHECK_TIMEOUT(ts, ret, _Q_LOCKED_PENDING_MASK));

waitq_timeout:
        if (ret) {
                /*
                 * If the tail is still pointing to us, then we are the final waiter,
                 * and are responsible for resetting the tail back to 0. Otherwise, if
                 * the cmpxchg operation fails, we signal the next waiter to exit and
                 * try the same. For a waiter with tail node 'n':
                 *
                 * n,*,* -> 0,*,*
                 *
                 * When performing cmpxchg for the whole word (NR_CPUS > 16k), it is
                 * possible that the locked/pending bits keep changing and we see
                 * failures even when we remain the head of the wait queue. However,
                 * eventually, the pending bit owner will unset the pending bit, and
                 * new waiters will queue behind us. This will leave the lock owner in
                 * charge, and it will eventually either set the locked bit to 0, or
                 * leave it as 1, allowing us to make progress.
                 *
                 * We terminate the whole wait queue for two reasons. Firstly,
                 * we eschew per-waiter timeouts in favor of one applied at the head
                 * of the wait queue. This allows everyone to break out faster
                 * once we've seen the owner / pending waiter not responding for
                 * the timeout duration from the head. Secondly, it avoids
                 * complicated synchronization, because when not leaving in FIFO
                 * order, prev's next pointer needs to be fixed up etc.
                 */
                if (!try_cmpxchg_tail(lock, tail, 0)) {
                        next = smp_cond_load_relaxed(&node->next, VAL);
                        WRITE_ONCE(next->locked, RES_TIMEOUT_VAL);
                }
                lockevent_inc(rqspinlock_lock_timeout);
                goto err_release_node;
        }

        /*
         * claim the lock:
         *
         * n,0,0 -> 0,0,1 : lock, uncontended
         * *,*,0 -> *,*,1 : lock, contended
         *
         * If the queue head is the only one in the queue (lock value == tail)
         * and nobody is pending, clear the tail code and grab the lock.
         * Otherwise, we only need to grab the lock.
         */

        /*
         * Note: at this point: (val & _Q_PENDING_MASK) == 0, because of the
         * above wait condition, therefore any concurrent setting of
         * PENDING will make the uncontended transition fail.
         */
        if ((val & _Q_TAIL_MASK) == tail) {
                if (atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL))
                        goto release; /* No contention */
        }

        /*
         * Either somebody is queued behind us or _Q_PENDING_VAL got set
         * which will then detect the remaining tail and queue behind us
         * ensuring we'll see a @next.
         */
        set_locked(lock);

        /*
         * contended path; wait for next if not observed yet, release.
         */
        if (!next)
                next = smp_cond_load_relaxed(&node->next, (VAL));

        arch_mcs_spin_unlock_contended(&next->locked);

release:
        trace_contention_end(lock, 0);

        /*
         * release the node
         */
        __this_cpu_dec(rqnodes[0].mcs.count);
        return ret;
err_release_node:
        trace_contention_end(lock, ret);
        __this_cpu_dec(rqnodes[0].mcs.count);
err_release_entry:
        release_held_lock_entry();
        return ret;
}
EXPORT_SYMBOL_GPL(resilient_queued_spin_lock_slowpath);

#endif /* CONFIG_QUEUED_SPINLOCKS */

__bpf_kfunc_start_defs();

__bpf_kfunc int bpf_res_spin_lock(struct bpf_res_spin_lock *lock)
{
        int ret;

        BUILD_BUG_ON(sizeof(rqspinlock_t) != sizeof(struct bpf_res_spin_lock));
        BUILD_BUG_ON(__alignof__(rqspinlock_t) != __alignof__(struct bpf_res_spin_lock));

        preempt_disable();
        ret = res_spin_lock((rqspinlock_t *)lock);
        if (unlikely(ret)) {
                preempt_enable();
                return ret;
        }
        return 0;
}

__bpf_kfunc void bpf_res_spin_unlock(struct bpf_res_spin_lock *lock)
{
        res_spin_unlock((rqspinlock_t *)lock);
        preempt_enable();
}

__bpf_kfunc int bpf_res_spin_lock_irqsave(struct bpf_res_spin_lock *lock, unsigned long *flags__irq_flag)
{
        u64 *ptr = (u64 *)flags__irq_flag;
        unsigned long flags;
        int ret;

        preempt_disable();
        local_irq_save(flags);
        ret = res_spin_lock((rqspinlock_t *)lock);
        if (unlikely(ret)) {
                local_irq_restore(flags);
                preempt_enable();
                return ret;
        }
        *ptr = flags;
        return 0;
}

__bpf_kfunc void bpf_res_spin_unlock_irqrestore(struct bpf_res_spin_lock *lock, unsigned long *flags__irq_flag)
{
        u64 *ptr = (u64 *)flags__irq_flag;
        unsigned long flags = *ptr;

        res_spin_unlock((rqspinlock_t *)lock);
        local_irq_restore(flags);
        preempt_enable();
}

__bpf_kfunc_end_defs();

BTF_KFUNCS_START(rqspinlock_kfunc_ids)
BTF_ID_FLAGS(func, bpf_res_spin_lock, KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_res_spin_unlock)
BTF_ID_FLAGS(func, bpf_res_spin_lock_irqsave, KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_res_spin_unlock_irqrestore)
BTF_KFUNCS_END(rqspinlock_kfunc_ids)

static const struct btf_kfunc_id_set rqspinlock_kfunc_set = {
        .owner = THIS_MODULE,
        .set = &rqspinlock_kfunc_ids,
};

static __init int rqspinlock_register_kfuncs(void)
{
        return register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &rqspinlock_kfunc_set);
}
late_initcall(rqspinlock_register_kfuncs);
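
/*
 * Illustrative usage sketch (not part of this file): from a BPF program, the
 * kfuncs above are used roughly as below, with the lock embedded in a map
 * value. The struct, map, and program names here are hypothetical. Unlike
 * bpf_spin_lock(), acquisition can fail, so the return value must be checked.
 *
 *      struct elem {
 *              struct bpf_res_spin_lock lock;
 *              __u64 counter;
 *      };
 *
 *      struct {
 *              __uint(type, BPF_MAP_TYPE_ARRAY);
 *              __uint(max_entries, 1);
 *              __type(key, int);
 *              __type(value, struct elem);
 *      } elem_map SEC(".maps");
 *
 *      SEC("tc")
 *      int update_counter(struct __sk_buff *ctx)
 *      {
 *              int key = 0;
 *              struct elem *e = bpf_map_lookup_elem(&elem_map, &key);
 *
 *              if (!e)
 *                      return 0;
 *              if (bpf_res_spin_lock(&e->lock))
 *                      return 0;
 *              e->counter++;
 *              bpf_res_spin_unlock(&e->lock);
 *              return 0;
 *      }
 *
 * In-kernel callers use res_spin_lock()/res_spin_unlock() (see
 * asm/rqspinlock.h) directly, as the kfuncs above do, and must likewise
 * handle a nonzero return from the lock operation.
 */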