// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
#ifndef BPF_ARENA_SPIN_LOCK_H
#define BPF_ARENA_SPIN_LOCK_H

#include <vmlinux.h>
#include <bpf/bpf_helpers.h>
#include "bpf_atomic.h"

#define arch_mcs_spin_lock_contended_label(l, label)	smp_cond_load_acquire_label(l, VAL, label)
#define arch_mcs_spin_unlock_contended(l)		smp_store_release((l), 1)

#if defined(ENABLE_ATOMICS_TESTS) && defined(__BPF_FEATURE_ADDR_SPACE_CAST)

#define EBUSY 16
#define EOPNOTSUPP 95
#define ETIMEDOUT 110

#ifndef __arena
#define __arena __attribute__((address_space(1)))
#endif

extern unsigned long CONFIG_NR_CPUS __kconfig;

/*
 * Typically, we'd just rely on the definition in vmlinux.h for qspinlock, but
 * PowerPC overrides the definition to define lock->val as u32 instead of
 * atomic_t, leading to compilation errors. Import a local definition below so
 * that we don't depend on the vmlinux.h version.
 */

struct __qspinlock {
	union {
		atomic_t val;
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
		struct {
			u8 locked;
			u8 pending;
		};
		struct {
			u16 locked_pending;
			u16 tail;
		};
#else
		struct {
			u16 tail;
			u16 locked_pending;
		};
		struct {
			u8 reserved[2];
			u8 pending;
			u8 locked;
		};
#endif
	};
};

#define arena_spinlock_t struct __qspinlock
/* FIXME: Using typedef causes CO-RE relocation error */
/* typedef struct qspinlock arena_spinlock_t; */

struct arena_mcs_spinlock {
	struct arena_mcs_spinlock __arena *next;
	int locked;
	int count;
};

struct arena_qnode {
	struct arena_mcs_spinlock mcs;
};

#define _Q_MAX_NODES		4
#define _Q_PENDING_LOOPS	1

/*
 * Bitfields in the atomic value:
 *
 *  0- 7: locked byte
 *     8: pending
 *  9-15: not used
 * 16-17: tail index
 * 18-31: tail cpu (+1)
 */
#define _Q_MAX_CPUS		1024

#define _Q_SET_MASK(type)	(((1U << _Q_ ## type ## _BITS) - 1)\
				 << _Q_ ## type ## _OFFSET)
#define _Q_LOCKED_OFFSET	0
#define _Q_LOCKED_BITS		8
#define _Q_LOCKED_MASK		_Q_SET_MASK(LOCKED)

#define _Q_PENDING_OFFSET	(_Q_LOCKED_OFFSET + _Q_LOCKED_BITS)
#define _Q_PENDING_BITS		8
#define _Q_PENDING_MASK		_Q_SET_MASK(PENDING)

#define _Q_TAIL_IDX_OFFSET	(_Q_PENDING_OFFSET + _Q_PENDING_BITS)
#define _Q_TAIL_IDX_BITS	2
#define _Q_TAIL_IDX_MASK	_Q_SET_MASK(TAIL_IDX)

#define _Q_TAIL_CPU_OFFSET	(_Q_TAIL_IDX_OFFSET + _Q_TAIL_IDX_BITS)
#define _Q_TAIL_CPU_BITS	(32 - _Q_TAIL_CPU_OFFSET)
#define _Q_TAIL_CPU_MASK	_Q_SET_MASK(TAIL_CPU)

#define _Q_TAIL_OFFSET		_Q_TAIL_IDX_OFFSET
#define _Q_TAIL_MASK		(_Q_TAIL_IDX_MASK | _Q_TAIL_CPU_MASK)

#define _Q_LOCKED_VAL		(1U << _Q_LOCKED_OFFSET)
#define _Q_PENDING_VAL		(1U << _Q_PENDING_OFFSET)

struct arena_qnode __arena qnodes[_Q_MAX_CPUS][_Q_MAX_NODES];

static inline u32 encode_tail(int cpu, int idx)
{
	u32 tail;

	tail = (cpu + 1) << _Q_TAIL_CPU_OFFSET;
	tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */

	return tail;
}

static inline struct arena_mcs_spinlock __arena *decode_tail(u32 tail)
{
	u32 cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1;
	u32 idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET;

	return &qnodes[cpu][idx].mcs;
}

static inline
struct arena_mcs_spinlock __arena *grab_mcs_node(struct arena_mcs_spinlock __arena *base, int idx)
{
	return &((struct arena_qnode __arena *)base + idx)->mcs;
}
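/*
 * Worked example of the tail encoding above (illustrative only; these numbers
 * are not used anywhere in this file): with _Q_TAIL_IDX_OFFSET == 16 and
 * _Q_TAIL_CPU_OFFSET == 18, encode_tail(2, 1) produces
 * ((2 + 1) << 18) | (1 << 16) == 0xd0000, and decode_tail(0xd0000) recovers
 * cpu == (0xd0000 >> 18) - 1 == 2 and idx == (0xd0000 & 0x30000) >> 16 == 1,
 * i.e. &qnodes[2][1].mcs.
 */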

#define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK)

/**
 * xchg_tail - Put in the new queue tail code word & retrieve previous one
 * @lock : Pointer to queued spinlock structure
 * @tail : The new queue tail code word
 * Return: The previous queue tail code word
 *
 * xchg(lock, tail)
 *
 * p,*,* -> n,*,* ; prev = xchg(lock, node)
 */
static __always_inline u32 xchg_tail(arena_spinlock_t __arena *lock, u32 tail)
{
	u32 old, new;

	old = atomic_read(&lock->val);
	do {
		new = (old & _Q_LOCKED_PENDING_MASK) | tail;
		/*
		 * We can use relaxed semantics since the caller ensures that
		 * the MCS node is properly initialized before updating the
		 * tail.
		 */
		/* These loops are not expected to stall, but we still need to
		 * prove to the verifier they will terminate eventually.
		 */
		cond_break_label(out);
	} while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new));

	return old;
out:
	bpf_printk("RUNTIME ERROR: %s unexpected cond_break exit!!!", __func__);
	return old;
}

/**
 * clear_pending - clear the pending bit.
 * @lock: Pointer to queued spinlock structure
 *
 * *,1,* -> *,0,*
 */
static __always_inline void clear_pending(arena_spinlock_t __arena *lock)
{
	WRITE_ONCE(lock->pending, 0);
}

/**
 * clear_pending_set_locked - take ownership and clear the pending bit.
 * @lock: Pointer to queued spinlock structure
 *
 * *,1,0 -> *,0,1
 *
 * Lock stealing is not allowed if this function is used.
 */
static __always_inline void clear_pending_set_locked(arena_spinlock_t __arena *lock)
{
	WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL);
}

/**
 * set_locked - Set the lock bit and own the lock
 * @lock: Pointer to queued spinlock structure
 *
 * *,*,0 -> *,0,1
 */
static __always_inline void set_locked(arena_spinlock_t __arena *lock)
{
	WRITE_ONCE(lock->locked, _Q_LOCKED_VAL);
}

static __always_inline
u32 arena_fetch_set_pending_acquire(arena_spinlock_t __arena *lock)
{
	u32 old, new;

	old = atomic_read(&lock->val);
	do {
		new = old | _Q_PENDING_VAL;
		/*
		 * These loops are not expected to stall, but we still need to
		 * prove to the verifier they will terminate eventually.
		 */
		cond_break_label(out);
	} while (!atomic_try_cmpxchg_acquire(&lock->val, &old, new));

	return old;
out:
	bpf_printk("RUNTIME ERROR: %s unexpected cond_break exit!!!", __func__);
	return old;
}

/**
 * arena_spin_trylock - try to acquire the queued spinlock
 * @lock : Pointer to queued spinlock structure
 * Return: 1 if lock acquired, 0 if failed
 */
static __always_inline int arena_spin_trylock(arena_spinlock_t __arena *lock)
{
	int val = atomic_read(&lock->val);

	if (unlikely(val))
		return 0;

	return likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL));
}
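/*
 * Note that, unlike arena_spin_lock() below, which returns 0 on success and
 * a negative error on failure, arena_spin_trylock() follows the kernel
 * spin_trylock() convention and returns 1 on success and 0 on failure. It is
 * used by the slowpath as an opportunistic retry and does not touch the
 * preemption count; preemption is disabled by arena_spin_lock() and
 * re-enabled by arena_spin_unlock() (or by arena_spin_lock() itself on
 * failure).
 */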
__noinline
int arena_spin_lock_slowpath(arena_spinlock_t __arena __arg_arena *lock, u32 val)
{
	struct arena_mcs_spinlock __arena *prev, *next, *node0, *node;
	int ret = -ETIMEDOUT;
	u32 old, tail;
	int idx;

	/*
	 * Wait for in-progress pending->locked hand-overs with a bounded
	 * number of spins so that we guarantee forward progress.
	 *
	 * 0,1,0 -> 0,0,1
	 */
	if (val == _Q_PENDING_VAL) {
		int cnt = _Q_PENDING_LOOPS;
		val = atomic_cond_read_relaxed_label(&lock->val,
						     (VAL != _Q_PENDING_VAL) || !cnt--,
						     release_err);
	}

	/*
	 * If we observe any contention; queue.
	 */
	if (val & ~_Q_LOCKED_MASK)
		goto queue;

	/*
	 * trylock || pending
	 *
	 * 0,0,* -> 0,1,* -> 0,0,1 pending, trylock
	 */
	val = arena_fetch_set_pending_acquire(lock);

	/*
	 * If we observe contention, there is a concurrent locker.
	 *
	 * Undo and queue; our setting of PENDING might have made the
	 * n,0,0 -> 0,0,0 transition fail and it will now be waiting
	 * on @next to become !NULL.
	 */
	if (unlikely(val & ~_Q_LOCKED_MASK)) {

		/* Undo PENDING if we set it. */
		if (!(val & _Q_PENDING_MASK))
			clear_pending(lock);

		goto queue;
	}

	/*
	 * We're pending, wait for the owner to go away.
	 *
	 * 0,1,1 -> *,1,0
	 *
	 * this wait loop must be a load-acquire such that we match the
	 * store-release that clears the locked bit and create lock
	 * sequentiality; this is because not all
	 * clear_pending_set_locked() implementations imply full
	 * barriers.
	 */
	if (val & _Q_LOCKED_MASK)
		smp_cond_load_acquire_label(&lock->locked, !VAL, release_err);

	/*
	 * take ownership and clear the pending bit.
	 *
	 * 0,1,0 -> 0,0,1
	 */
	clear_pending_set_locked(lock);
	return 0;

	/*
	 * End of pending bit optimistic spinning and beginning of MCS
	 * queuing.
	 */
queue:
	node0 = &(qnodes[bpf_get_smp_processor_id()])[0].mcs;
	idx = node0->count++;
	tail = encode_tail(bpf_get_smp_processor_id(), idx);

	/*
	 * 4 nodes are allocated based on the assumption that there will not be
	 * nested NMIs taking spinlocks. That may not be true on some
	 * architectures, even though the chance of needing more than 4 nodes
	 * is still extremely unlikely. When that happens, we simply return an
	 * error. The original qspinlock falls back to a trylock in this case.
	 */
	if (unlikely(idx >= _Q_MAX_NODES)) {
		ret = -EBUSY;
		goto release_node_err;
	}

	node = grab_mcs_node(node0, idx);

	/*
	 * Ensure that we increment the head node->count before initialising
	 * the actual node. If the compiler is kind enough to reorder these
	 * stores, then an IRQ could overwrite our assignments.
	 */
	barrier();

	node->locked = 0;
	node->next = NULL;

	/*
	 * We touched a (possibly) cold cacheline in the per-cpu queue node;
	 * attempt the trylock once more in the hope someone let go while we
	 * weren't watching.
	 */
	if (arena_spin_trylock(lock))
		goto release;

	/*
	 * Ensure that the initialisation of @node is complete before we
	 * publish the updated tail via xchg_tail() and potentially link
	 * @node into the waitqueue via WRITE_ONCE(prev->next, node) below.
	 */
	smp_wmb();

	/*
	 * Publish the updated tail.
	 * We have already touched the queueing cacheline; don't bother with
	 * pending stuff.
	 *
	 * p,*,* -> n,*,*
	 */
	old = xchg_tail(lock, tail);
	next = NULL;

	/*
	 * if there was a previous node; link it and wait until reaching the
	 * head of the waitqueue.
	 */
	if (old & _Q_TAIL_MASK) {
		prev = decode_tail(old);

		/* Link @node into the waitqueue. */
		WRITE_ONCE(prev->next, node);

		arch_mcs_spin_lock_contended_label(&node->locked, release_node_err);

		/*
		 * While waiting for the MCS lock, the next pointer may have
		 * been set by another lock waiter. We cannot prefetch here
		 * due to lack of equivalent instruction in BPF ISA.
		 */
		next = READ_ONCE(node->next);
	}

	/*
	 * we're at the head of the waitqueue, wait for the owner & pending to
	 * go away.
	 *
	 * *,x,y -> *,0,0
	 *
	 * this wait loop must use a load-acquire such that we match the
	 * store-release that clears the locked bit and create lock
	 * sequentiality; this is because the set_locked() function below
	 * does not imply a full barrier.
	 */
	val = atomic_cond_read_acquire_label(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK),
					     release_node_err);

	/*
	 * claim the lock:
	 *
	 * n,0,0 -> 0,0,1 : lock, uncontended
	 * *,*,0 -> *,*,1 : lock, contended
	 *
	 * If the queue head is the only one in the queue (lock value == tail)
	 * and nobody is pending, clear the tail code and grab the lock.
	 * Otherwise, we only need to grab the lock.
	 */

	/*
	 * In the PV case we might already have _Q_LOCKED_VAL set, because
	 * of lock stealing; therefore we must also allow:
	 *
	 * n,0,1 -> 0,0,1
	 *
	 * Note: at this point: (val & _Q_PENDING_MASK) == 0, because of the
	 *       above wait condition, therefore any concurrent setting of
	 *       PENDING will make the uncontended transition fail.
	 */
	if ((val & _Q_TAIL_MASK) == tail) {
		if (atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL))
			goto release; /* No contention */
	}

	/*
	 * Either somebody is queued behind us or _Q_PENDING_VAL got set
	 * which will then detect the remaining tail and queue behind us
	 * ensuring we'll see a @next.
	 */
	set_locked(lock);

	/*
	 * contended path; wait for next if not observed yet, release.
	 */
	if (!next)
		next = smp_cond_load_relaxed_label(&node->next, (VAL), release_node_err);

	arch_mcs_spin_unlock_contended(&next->locked);

release:;
	/*
	 * release the node
	 *
	 * Doing a normal decrement instead of this_cpu_dec is fine. An upper
	 * context always decrements the count it incremented before returning,
	 * thus we're fine. Contexts interrupting us either observe our
	 * decrement or they don't. Just ensure the compiler doesn't reorder
	 * this statement, as this_cpu_dec would have implied that ordering.
	 */
	barrier();
	node0->count--;
	return 0;
release_node_err:
	barrier();
	node0->count--;
	goto release_err;
release_err:
	return ret;
}

/**
 * arena_spin_lock - acquire a queued spinlock
 * @lock: Pointer to queued spinlock structure
 *
 * On error, the returned value will be negative.
 * On success, zero is returned.
 *
 * The return value _must_ be tested against zero for success,
 * rather than against a negative value, in order to pass the
 * BPF verifier.
 *
 * The user should do:
 *	if (arena_spin_lock(...) != 0) // failure
 *	    or
 *	if (arena_spin_lock(...) == 0) // success
 *	    or
 *	if (arena_spin_lock(...)) // failure
 *	    or
 *	if (!arena_spin_lock(...)) // success
 * instead of:
 *	if (arena_spin_lock(...) < 0) // failure
 *
 * The return value can still be inspected later.
 */
static __always_inline int arena_spin_lock(arena_spinlock_t __arena *lock)
{
	int val = 0;

	if (CONFIG_NR_CPUS > 1024)
		return -EOPNOTSUPP;

	bpf_preempt_disable();
	if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL)))
		return 0;

	val = arena_spin_lock_slowpath(lock, val);
	/* FIXME: bpf_assert_range(-MAX_ERRNO, 0) once we have it working for all cases. */
	if (val)
		bpf_preempt_enable();
	return val;
}

/**
 * arena_spin_unlock - release a queued spinlock
 * @lock : Pointer to queued spinlock structure
 */
static __always_inline void arena_spin_unlock(arena_spinlock_t __arena *lock)
{
	/*
	 * unlock() needs release semantics:
	 */
	smp_store_release(&lock->locked, 0);
	bpf_preempt_enable();
}
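/*
 * Illustrative usage sketch. The section name, program name and variables
 * below are only an example, and the arena backing the __arena globals must
 * be set up by the loader (e.g. a BPF_MAP_TYPE_ARENA map) before the program
 * runs:
 *
 *	arena_spinlock_t __arena lock;
 *	int __arena counter;
 *
 *	SEC("tc")
 *	int prog(void *ctx)
 *	{
 *		if (arena_spin_lock(&lock))
 *			return 1;		// lock not taken, bail out
 *		counter++;			// critical section
 *		arena_spin_unlock(&lock);
 *		return 0;
 *	}
 */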
#define arena_spin_lock_irqsave(lock, flags)             \
	({                                               \
		int __ret;                               \
		bpf_local_irq_save(&(flags));            \
		__ret = arena_spin_lock((lock));         \
		if (__ret)                               \
			bpf_local_irq_restore(&(flags)); \
		(__ret);                                 \
	})

#define arena_spin_unlock_irqrestore(lock, flags) \
	({                                        \
		arena_spin_unlock((lock));        \
		bpf_local_irq_restore(&(flags));  \
	})
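/*
 * Illustrative usage sketch for the irqsave variants: the flags word is an
 * unsigned long passed by name, and on failure arena_spin_lock_irqsave()
 * has already restored IRQs, so the caller must not call
 * arena_spin_unlock_irqrestore() in that case:
 *
 *	unsigned long flags;
 *
 *	if (!arena_spin_lock_irqsave(&lock, flags)) {
 *		...			// critical section, IRQs disabled
 *		arena_spin_unlock_irqrestore(&lock, flags);
 *	}
 */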
#endif /* ENABLE_ATOMICS_TESTS && __BPF_FEATURE_ADDR_SPACE_CAST */

#endif /* BPF_ARENA_SPIN_LOCK_H */