/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Resilient Queued Spin Lock
 *
 * (C) Copyright 2024-2025 Meta Platforms, Inc. and affiliates.
 *
 * Authors: Kumar Kartikeya Dwivedi <memxor@gmail.com>
 */
#ifndef __ASM_GENERIC_RQSPINLOCK_H
#define __ASM_GENERIC_RQSPINLOCK_H

#include <linux/types.h>
#include <vdso/time64.h>
#include <linux/percpu.h>
#ifdef CONFIG_QUEUED_SPINLOCKS
#include <asm/qspinlock.h>
#endif

struct rqspinlock {
	union {
		atomic_t val;
		u32 locked;
	};
};

/* Even though this is the same as struct rqspinlock, we need to emit a
 * distinct type in BTF for BPF programs.
 */
struct bpf_res_spin_lock {
	u32 val;
};

struct qspinlock;
#ifdef CONFIG_QUEUED_SPINLOCKS
typedef struct qspinlock rqspinlock_t;
#else
typedef struct rqspinlock rqspinlock_t;
#endif

extern int resilient_tas_spin_lock(rqspinlock_t *lock);
#ifdef CONFIG_QUEUED_SPINLOCKS
extern int resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val);
#endif

#ifndef resilient_virt_spin_lock_enabled
static __always_inline bool resilient_virt_spin_lock_enabled(void)
{
	return false;
}
#endif

#ifndef resilient_virt_spin_lock
static __always_inline int resilient_virt_spin_lock(rqspinlock_t *lock)
{
	return 0;
}
#endif

/*
 * Default timeout for waiting loops is 0.25 seconds.
 */
#define RES_DEF_TIMEOUT (NSEC_PER_SEC / 4)

/*
 * Choose 31 as it makes rqspinlock_held cacheline-aligned.
 */
#define RES_NR_HELD 31

struct rqspinlock_held {
	int cnt;
	void *locks[RES_NR_HELD];
};

DECLARE_PER_CPU_ALIGNED(struct rqspinlock_held, rqspinlock_held_locks);

static __always_inline void grab_held_lock_entry(void *lock)
{
	int cnt = this_cpu_inc_return(rqspinlock_held_locks.cnt);

	if (unlikely(cnt > RES_NR_HELD)) {
		/* Still keep the inc so we decrement later. */
		return;
	}

	/*
	 * Implied compiler barrier in per-CPU operations; otherwise we can have
	 * the compiler reorder the inc with the write to the table, allowing
	 * interrupts to overwrite and erase our write to the table (as on
	 * interrupt exit it will be reset to NULL).
	 *
	 * It is fine for the cnt inc to be reordered wrt remote readers though;
	 * they won't observe our entry until the cnt update is visible, that's
	 * all.
	 */
	this_cpu_write(rqspinlock_held_locks.locks[cnt - 1], lock);
}
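
/*
 * Illustrative sketch only (not defined by this header): the per-CPU table
 * above is what lets the slow path diagnose an AA deadlock, i.e. this CPU
 * re-attempting a lock it already holds. Conceptually, a check of the local
 * table would look like the hypothetical helper below; the real detection
 * logic lives in the resilient slow path, not here.
 *
 *	static bool lock_held_on_this_cpu(void *lock)
 *	{
 *		struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks);
 *		int i, cnt = READ_ONCE(rqh->cnt);
 *
 *		if (cnt > RES_NR_HELD)
 *			cnt = RES_NR_HELD;
 *		for (i = 0; i < cnt; i++) {
 *			if (READ_ONCE(rqh->locks[i]) == lock)
 *				return true;
 *		}
 *		return false;
 *	}
 *
 * ABBA detection additionally consults the tables of remote CPUs for the lock
 * being waited on, which is why the memory-ordering comments below matter.
 */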

/*
 * We simply don't support out-of-order unlocks, and keep the logic simple here.
 * The verifier prevents BPF programs from unlocking out-of-order, and the same
 * holds for in-kernel users.
 *
 * It is possible to run into misdetection scenarios of AA deadlocks on the same
 * CPU, and missed ABBA deadlocks on remote CPUs, if this function pops entries
 * out of order (due to a lock A, lock B, unlock A, unlock B pattern). The
 * correct logic to preserve the right entries in the table would be to walk the
 * array of held locks and swap and clear out-of-order entries, but that's too
 * complicated and we don't have a compelling use case for out-of-order
 * unlocking.
 */
static __always_inline void release_held_lock_entry(void)
{
	struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks);

	if (unlikely(rqh->cnt > RES_NR_HELD))
		goto dec;
	WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL);
dec:
	/*
	 * Reordering of the clearing above with the inc and its write in
	 * grab_held_lock_entry that came before us (in the same acquisition
	 * attempt) is ok; we either see a valid entry or NULL when it's
	 * visible.
	 *
	 * But this helper is invoked when we unwind upon failing to acquire the
	 * lock. Unlike the unlock path, which constitutes a release store after
	 * we clear the entry, we need to emit a write barrier here. Otherwise,
	 * we may have a situation as follows:
	 *
	 * <error> for lock B
	 * release_held_lock_entry
	 *
	 * try_cmpxchg_acquire for lock A
	 * grab_held_lock_entry
	 *
	 * Lack of any ordering means reordering may occur such that the dec and
	 * inc are done before the entry is overwritten. This permits a remote
	 * holder of lock B (which this CPU failed to acquire) to still observe
	 * it as being attempted on this CPU, and may lead to misdetection (if
	 * this CPU holds a lock the remote CPU is attempting to acquire,
	 * leading to a false ABBA diagnosis).
	 *
	 * In case of unlock, we will always do a release on the lock word after
	 * clearing the entry, ensuring that other CPUs cannot hold the lock
	 * (and make conclusions about deadlocks) until the entry has been
	 * cleared on the local CPU, preventing any anomalies. Reordering is
	 * still possible there, but a remote CPU cannot observe a lock in our
	 * table which it is already holding, since such visibility entails that
	 * our release store for the said lock has not yet retired.
	 *
	 * In theory we don't have a problem if the dec and WRITE_ONCE above get
	 * reordered with each other; we either notice an empty NULL entry on
	 * top (if the dec follows the WRITE_ONCE), or a potentially stale entry
	 * which cannot be observed (if the dec precedes the WRITE_ONCE).
	 *
	 * Emit the write barrier _before_ the dec: this permits dec-inc
	 * reordering, but that is harmless as the entry has already been set to
	 * NULL, i.e. neither can precede the NULL store above.
	 */
	smp_wmb();
	this_cpu_dec(rqspinlock_held_locks.cnt);
}

#ifdef CONFIG_QUEUED_SPINLOCKS

/**
 * res_spin_lock - acquire a queued spinlock
 * @lock: Pointer to queued spinlock structure
 *
 * Return:
 * * 0 - Lock was acquired successfully.
 * * -EDEADLK - Lock acquisition failed because of AA/ABBA deadlock.
 * * -ETIMEDOUT - Lock acquisition failed because of timeout.
 */
static __always_inline int res_spin_lock(rqspinlock_t *lock)
{
	int val = 0;

	if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL))) {
		grab_held_lock_entry(lock);
		return 0;
	}
	return resilient_queued_spin_lock_slowpath(lock, val);
}

#else

#define res_spin_lock(lock) resilient_tas_spin_lock(lock)

#endif /* CONFIG_QUEUED_SPINLOCKS */

static __always_inline void res_spin_unlock(rqspinlock_t *lock)
{
	struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks);

	if (unlikely(rqh->cnt > RES_NR_HELD))
		goto unlock;
	WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL);
unlock:
	/*
	 * Release barrier, ensures correct ordering. See release_held_lock_entry
	 * for details. Perform a release store instead of queued_spin_unlock,
	 * since we use this function for the test-and-set fallback as well. When
	 * we have CONFIG_QUEUED_SPINLOCKS=n, we clear the full 4-byte lockword.
	 *
	 * Like release_held_lock_entry, we can do the release before the dec.
	 * We simply care about not seeing the 'lock' in our table from a remote
	 * CPU once the lock has been released, which doesn't rely on the dec.
	 *
	 * Unlike smp_wmb(), a release is not a two-way fence, hence it is
	 * possible for an inc to move up and reorder with our clearing of the
	 * entry. This isn't a problem, however, as for a misdiagnosis of ABBA
	 * the remote CPU needs to hold this lock, which won't be released until
	 * the store below is done, which would ensure the entry is overwritten
	 * to NULL, etc.
	 */
	smp_store_release(&lock->locked, 0);
	this_cpu_dec(rqspinlock_held_locks.cnt);
}

#ifdef CONFIG_QUEUED_SPINLOCKS
#define raw_res_spin_lock_init(lock) ({ *(lock) = (rqspinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; })
#else
#define raw_res_spin_lock_init(lock) ({ *(lock) = (rqspinlock_t){0}; })
#endif

#define raw_res_spin_lock(lock)                    \
	({                                         \
		int __ret;                         \
		preempt_disable();                 \
		__ret = res_spin_lock(lock);       \
		if (__ret)                         \
			preempt_enable();          \
		__ret;                             \
	})

#define raw_res_spin_unlock(lock) ({ res_spin_unlock(lock); preempt_enable(); })

#define raw_res_spin_lock_irqsave(lock, flags)     \
	({                                         \
		int __ret;                         \
		local_irq_save(flags);             \
		__ret = raw_res_spin_lock(lock);   \
		if (__ret)                         \
			local_irq_restore(flags);  \
		__ret;                             \
	})

#define raw_res_spin_unlock_irqrestore(lock, flags) ({ raw_res_spin_unlock(lock); local_irq_restore(flags); })

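/*
 * Example usage (illustrative sketch, not part of this header's API; the
 * caller-side names below are hypothetical): unlike regular spinlocks,
 * acquisition can fail, so callers must check the return value and only
 * unlock on success. On failure (-EDEADLK or -ETIMEDOUT) the lock is not
 * held, and the macro has already restored IRQs and re-enabled preemption,
 * so the caller simply propagates the error. The lock itself is expected to
 * be initialized with raw_res_spin_lock_init().
 *
 *	struct my_bucket {
 *		rqspinlock_t lock;
 *		struct list_head head;
 *	};
 *
 *	static int my_bucket_add(struct my_bucket *b, struct list_head *elem)
 *	{
 *		unsigned long flags;
 *		int ret;
 *
 *		ret = raw_res_spin_lock_irqsave(&b->lock, flags);
 *		if (ret)
 *			return ret;
 *		list_add(elem, &b->head);
 *		raw_res_spin_unlock_irqrestore(&b->lock, flags);
 *		return 0;
 *	}
 */
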
#endif /* __ASM_GENERIC_RQSPINLOCK_H */