1 // SPDX-License-Identifier: GPL-2.0 2 /* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ 3 #ifndef BPF_ARENA_SPIN_LOCK_H 4 #define BPF_ARENA_SPIN_LOCK_H 5 6 #include <vmlinux.h> 7 #include <bpf/bpf_helpers.h> 8 #include <bpf_atomic.h> 9 10 #define arch_mcs_spin_lock_contended_label(l, label) smp_cond_load_acquire_label(l, VAL, label) 11 #define arch_mcs_spin_unlock_contended(l) smp_store_release((l), 1) 12 13 #if defined(ENABLE_ATOMICS_TESTS) && defined(__BPF_FEATURE_ADDR_SPACE_CAST) 14 15 #define EBUSY 16 16 #define EOPNOTSUPP 95 17 #define ETIMEDOUT 110 18 19 extern unsigned long CONFIG_NR_CPUS __kconfig; 20 21 /* 22 * Typically, we'd just rely on the definition in vmlinux.h for qspinlock, but 23 * PowerPC overrides the definition to define lock->val as u32 instead of 24 * atomic_t, leading to compilation errors. Import a local definition below so 25 * that we don't depend on the vmlinux.h version. 26 */ 27 28 struct __qspinlock { 29 union { 30 atomic_t val; 31 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ 32 struct { 33 u8 locked; 34 u8 pending; 35 }; 36 struct { 37 u16 locked_pending; 38 u16 tail; 39 }; 40 #else 41 struct { 42 u16 tail; 43 u16 locked_pending; 44 }; 45 struct { 46 u8 reserved[2]; 47 u8 pending; 48 u8 locked; 49 }; 50 #endif 51 }; 52 }; 53 54 #define arena_spinlock_t struct __qspinlock 55 /* FIXME: Using typedef causes CO-RE relocation error */ 56 /* typedef struct qspinlock arena_spinlock_t; */ 57 58 struct arena_mcs_spinlock { 59 struct arena_mcs_spinlock __arena *next; 60 int locked; 61 int count; 62 }; 63 64 struct arena_qnode { 65 struct arena_mcs_spinlock mcs; 66 }; 67 68 #define _Q_MAX_NODES 4 69 #define _Q_PENDING_LOOPS 1 70 71 /* 72 * Bitfields in the atomic value: 73 * 74 * 0- 7: locked byte 75 * 8: pending 76 * 9-15: not used 77 * 16-17: tail index 78 * 18-31: tail cpu (+1) 79 */ 80 #define _Q_MAX_CPUS 1024 81 82 #define _Q_SET_MASK(type) (((1U << _Q_ ## type ## _BITS) - 1)\ 83 << _Q_ ## type ## _OFFSET) 84 #define _Q_LOCKED_OFFSET 0 85 #define _Q_LOCKED_BITS 8 86 #define _Q_LOCKED_MASK _Q_SET_MASK(LOCKED) 87 88 #define _Q_PENDING_OFFSET (_Q_LOCKED_OFFSET + _Q_LOCKED_BITS) 89 #define _Q_PENDING_BITS 8 90 #define _Q_PENDING_MASK _Q_SET_MASK(PENDING) 91 92 #define _Q_TAIL_IDX_OFFSET (_Q_PENDING_OFFSET + _Q_PENDING_BITS) 93 #define _Q_TAIL_IDX_BITS 2 94 #define _Q_TAIL_IDX_MASK _Q_SET_MASK(TAIL_IDX) 95 96 #define _Q_TAIL_CPU_OFFSET (_Q_TAIL_IDX_OFFSET + _Q_TAIL_IDX_BITS) 97 #define _Q_TAIL_CPU_BITS (32 - _Q_TAIL_CPU_OFFSET) 98 #define _Q_TAIL_CPU_MASK _Q_SET_MASK(TAIL_CPU) 99 100 #define _Q_TAIL_OFFSET _Q_TAIL_IDX_OFFSET 101 #define _Q_TAIL_MASK (_Q_TAIL_IDX_MASK | _Q_TAIL_CPU_MASK) 102 103 #define _Q_LOCKED_VAL (1U << _Q_LOCKED_OFFSET) 104 #define _Q_PENDING_VAL (1U << _Q_PENDING_OFFSET) 105 106 /* 107 * The qnodes are marked __weak so we can define them in the header 108 * while still ensuring all compilation units use the same struct 109 * instance. 110 */ 111 struct arena_qnode __weak __arena __hidden qnodes[_Q_MAX_CPUS][_Q_MAX_NODES]; 112 113 static inline u32 encode_tail(int cpu, int idx) 114 { 115 u32 tail; 116 117 tail = (cpu + 1) << _Q_TAIL_CPU_OFFSET; 118 tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */ 119 120 return tail; 121 } 122 123 static inline struct arena_mcs_spinlock __arena *decode_tail(u32 tail) 124 { 125 u32 cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1; 126 u32 idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET; 127 128 return &qnodes[cpu][idx].mcs; 129 } 130 131 static inline 132 struct arena_mcs_spinlock __arena *grab_mcs_node(struct arena_mcs_spinlock __arena *base, int idx) 133 { 134 return &((struct arena_qnode __arena *)base + idx)->mcs; 135 } 136 137 #define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK) 138 139 /** 140 * xchg_tail - Put in the new queue tail code word & retrieve previous one 141 * @lock : Pointer to queued spinlock structure 142 * @tail : The new queue tail code word 143 * Return: The previous queue tail code word 144 * 145 * xchg(lock, tail) 146 * 147 * p,*,* -> n,*,* ; prev = xchg(lock, node) 148 */ 149 static __always_inline u32 xchg_tail(arena_spinlock_t __arena *lock, u32 tail) 150 { 151 u32 old, new; 152 153 old = atomic_read(&lock->val); 154 do { 155 new = (old & _Q_LOCKED_PENDING_MASK) | tail; 156 /* 157 * We can use relaxed semantics since the caller ensures that 158 * the MCS node is properly initialized before updating the 159 * tail. 160 */ 161 /* These loops are not expected to stall, but we still need to 162 * prove to the verifier they will terminate eventually. 163 */ 164 cond_break_label(out); 165 } while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new)); 166 167 return old; 168 out: 169 bpf_printk("RUNTIME ERROR: %s unexpected cond_break exit!!!", __func__); 170 return old; 171 } 172 173 /** 174 * clear_pending - clear the pending bit. 175 * @lock: Pointer to queued spinlock structure 176 * 177 * *,1,* -> *,0,* 178 */ 179 static __always_inline void clear_pending(arena_spinlock_t __arena *lock) 180 { 181 WRITE_ONCE(lock->pending, 0); 182 } 183 184 /** 185 * clear_pending_set_locked - take ownership and clear the pending bit. 186 * @lock: Pointer to queued spinlock structure 187 * 188 * *,1,0 -> *,0,1 189 * 190 * Lock stealing is not allowed if this function is used. 191 */ 192 static __always_inline void clear_pending_set_locked(arena_spinlock_t __arena *lock) 193 { 194 WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL); 195 } 196 197 /** 198 * set_locked - Set the lock bit and own the lock 199 * @lock: Pointer to queued spinlock structure 200 * 201 * *,*,0 -> *,0,1 202 */ 203 static __always_inline void set_locked(arena_spinlock_t __arena *lock) 204 { 205 WRITE_ONCE(lock->locked, _Q_LOCKED_VAL); 206 } 207 208 static __always_inline 209 u32 arena_fetch_set_pending_acquire(arena_spinlock_t __arena *lock) 210 { 211 u32 old, new; 212 213 old = atomic_read(&lock->val); 214 do { 215 new = old | _Q_PENDING_VAL; 216 /* 217 * These loops are not expected to stall, but we still need to 218 * prove to the verifier they will terminate eventually. 219 */ 220 cond_break_label(out); 221 } while (!atomic_try_cmpxchg_acquire(&lock->val, &old, new)); 222 223 return old; 224 out: 225 bpf_printk("RUNTIME ERROR: %s unexpected cond_break exit!!!", __func__); 226 return old; 227 } 228 229 /** 230 * arena_spin_trylock - try to acquire the queued spinlock 231 * @lock : Pointer to queued spinlock structure 232 * Return: 1 if lock acquired, 0 if failed 233 */ 234 static __always_inline int arena_spin_trylock(arena_spinlock_t __arena *lock) 235 { 236 int val = atomic_read(&lock->val); 237 238 if (unlikely(val)) 239 return 0; 240 241 return likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL)); 242 } 243 244 __noinline __weak 245 int arena_spin_lock_slowpath(arena_spinlock_t __arena *lock, u32 val) 246 { 247 struct arena_mcs_spinlock __arena *prev, *next, *node0, *node; 248 int ret = -ETIMEDOUT; 249 u32 old, tail; 250 int idx; 251 252 /* 253 * Wait for in-progress pending->locked hand-overs with a bounded 254 * number of spins so that we guarantee forward progress. 255 * 256 * 0,1,0 -> 0,0,1 257 */ 258 if (val == _Q_PENDING_VAL) { 259 int cnt = _Q_PENDING_LOOPS; 260 val = atomic_cond_read_relaxed_label(&lock->val, 261 (VAL != _Q_PENDING_VAL) || !cnt--, 262 release_err); 263 } 264 265 /* 266 * If we observe any contention; queue. 267 */ 268 if (val & ~_Q_LOCKED_MASK) 269 goto queue; 270 271 /* 272 * trylock || pending 273 * 274 * 0,0,* -> 0,1,* -> 0,0,1 pending, trylock 275 */ 276 val = arena_fetch_set_pending_acquire(lock); 277 278 /* 279 * If we observe contention, there is a concurrent locker. 280 * 281 * Undo and queue; our setting of PENDING might have made the 282 * n,0,0 -> 0,0,0 transition fail and it will now be waiting 283 * on @next to become !NULL. 284 */ 285 if (unlikely(val & ~_Q_LOCKED_MASK)) { 286 287 /* Undo PENDING if we set it. */ 288 if (!(val & _Q_PENDING_MASK)) 289 clear_pending(lock); 290 291 goto queue; 292 } 293 294 /* 295 * We're pending, wait for the owner to go away. 296 * 297 * 0,1,1 -> *,1,0 298 * 299 * this wait loop must be a load-acquire such that we match the 300 * store-release that clears the locked bit and create lock 301 * sequentiality; this is because not all 302 * clear_pending_set_locked() implementations imply full 303 * barriers. 304 */ 305 if (val & _Q_LOCKED_MASK) 306 (void)smp_cond_load_acquire_label(&lock->locked, !VAL, release_err); 307 308 /* 309 * take ownership and clear the pending bit. 310 * 311 * 0,1,0 -> 0,0,1 312 */ 313 clear_pending_set_locked(lock); 314 return 0; 315 316 /* 317 * End of pending bit optimistic spinning and beginning of MCS 318 * queuing. 319 */ 320 queue: 321 node0 = &(qnodes[bpf_get_smp_processor_id()])[0].mcs; 322 idx = node0->count++; 323 tail = encode_tail(bpf_get_smp_processor_id(), idx); 324 325 /* 326 * 4 nodes are allocated based on the assumption that there will not be 327 * nested NMIs taking spinlocks. That may not be true in some 328 * architectures even though the chance of needing more than 4 nodes 329 * will still be extremely unlikely. When that happens, we simply return 330 * an error. Original qspinlock has a trylock fallback in this case. 331 */ 332 if (unlikely(idx >= _Q_MAX_NODES)) { 333 ret = -EBUSY; 334 goto release_node_err; 335 } 336 337 node = grab_mcs_node(node0, idx); 338 339 /* 340 * Ensure that we increment the head node->count before initialising 341 * the actual node. If the compiler is kind enough to reorder these 342 * stores, then an IRQ could overwrite our assignments. 343 */ 344 barrier(); 345 346 node->locked = 0; 347 node->next = NULL; 348 349 /* 350 * We touched a (possibly) cold cacheline in the per-cpu queue node; 351 * attempt the trylock once more in the hope someone let go while we 352 * weren't watching. 353 */ 354 if (arena_spin_trylock(lock)) 355 goto release; 356 357 /* 358 * Ensure that the initialisation of @node is complete before we 359 * publish the updated tail via xchg_tail() and potentially link 360 * @node into the waitqueue via WRITE_ONCE(prev->next, node) below. 361 */ 362 smp_wmb(); 363 364 /* 365 * Publish the updated tail. 366 * We have already touched the queueing cacheline; don't bother with 367 * pending stuff. 368 * 369 * p,*,* -> n,*,* 370 */ 371 old = xchg_tail(lock, tail); 372 next = NULL; 373 374 /* 375 * if there was a previous node; link it and wait until reaching the 376 * head of the waitqueue. 377 */ 378 if (old & _Q_TAIL_MASK) { 379 prev = decode_tail(old); 380 381 /* Link @node into the waitqueue. */ 382 WRITE_ONCE(prev->next, node); 383 384 (void)arch_mcs_spin_lock_contended_label(&node->locked, release_node_err); 385 386 /* 387 * While waiting for the MCS lock, the next pointer may have 388 * been set by another lock waiter. We cannot prefetch here 389 * due to lack of equivalent instruction in BPF ISA. 390 */ 391 next = READ_ONCE(node->next); 392 } 393 394 /* 395 * we're at the head of the waitqueue, wait for the owner & pending to 396 * go away. 397 * 398 * *,x,y -> *,0,0 399 * 400 * this wait loop must use a load-acquire such that we match the 401 * store-release that clears the locked bit and create lock 402 * sequentiality; this is because the set_locked() function below 403 * does not imply a full barrier. 404 */ 405 val = atomic_cond_read_acquire_label(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK), 406 release_node_err); 407 408 /* 409 * claim the lock: 410 * 411 * n,0,0 -> 0,0,1 : lock, uncontended 412 * *,*,0 -> *,*,1 : lock, contended 413 * 414 * If the queue head is the only one in the queue (lock value == tail) 415 * and nobody is pending, clear the tail code and grab the lock. 416 * Otherwise, we only need to grab the lock. 417 */ 418 419 /* 420 * In the PV case we might already have _Q_LOCKED_VAL set, because 421 * of lock stealing; therefore we must also allow: 422 * 423 * n,0,1 -> 0,0,1 424 * 425 * Note: at this point: (val & _Q_PENDING_MASK) == 0, because of the 426 * above wait condition, therefore any concurrent setting of 427 * PENDING will make the uncontended transition fail. 428 */ 429 if ((val & _Q_TAIL_MASK) == tail) { 430 if (atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL)) 431 goto release; /* No contention */ 432 } 433 434 /* 435 * Either somebody is queued behind us or _Q_PENDING_VAL got set 436 * which will then detect the remaining tail and queue behind us 437 * ensuring we'll see a @next. 438 */ 439 set_locked(lock); 440 441 /* 442 * contended path; wait for next if not observed yet, release. 443 */ 444 if (!next) 445 next = smp_cond_load_relaxed_label(&node->next, (VAL), release_node_err); 446 447 arch_mcs_spin_unlock_contended(&next->locked); 448 449 release:; 450 /* 451 * release the node 452 * 453 * Doing a normal dec vs this_cpu_dec is fine. An upper context always 454 * decrements count it incremented before returning, thus we're fine. 455 * For contexts interrupting us, they either observe our dec or not. 456 * Just ensure the compiler doesn't reorder this statement, as a 457 * this_cpu_dec implicitly implied that. 458 */ 459 barrier(); 460 node0->count--; 461 return 0; 462 release_node_err: 463 barrier(); 464 node0->count--; 465 goto release_err; 466 release_err: 467 return ret; 468 } 469 470 /** 471 * arena_spin_lock - acquire a queued spinlock 472 * @lock: Pointer to queued spinlock structure 473 * 474 * On error, returned value will be negative. 475 * On success, zero is returned. 476 * 477 * The return value _must_ be tested against zero for success, 478 * instead of checking it against negative, for passing the 479 * BPF verifier. 480 * 481 * The user should do: 482 * if (arena_spin_lock(...) != 0) // failure 483 * or 484 * if (arena_spin_lock(...) == 0) // success 485 * or 486 * if (arena_spin_lock(...)) // failure 487 * or 488 * if (!arena_spin_lock(...)) // success 489 * instead of: 490 * if (arena_spin_lock(...) < 0) // failure 491 * 492 * The return value can still be inspected later. 493 */ 494 static __always_inline int arena_spin_lock(arena_spinlock_t __arena *lock) 495 { 496 int val = 0; 497 498 if (CONFIG_NR_CPUS > 1024) 499 return -EOPNOTSUPP; 500 501 bpf_preempt_disable(); 502 if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL))) 503 return 0; 504 505 val = arena_spin_lock_slowpath(lock, val); 506 /* FIXME: bpf_assert_range(-MAX_ERRNO, 0) once we have it working for all cases. */ 507 if (val) 508 bpf_preempt_enable(); 509 return val; 510 } 511 512 /** 513 * arena_spin_unlock - release a queued spinlock 514 * @lock : Pointer to queued spinlock structure 515 */ 516 static __always_inline void arena_spin_unlock(arena_spinlock_t __arena *lock) 517 { 518 /* 519 * unlock() needs release semantics: 520 */ 521 smp_store_release(&lock->locked, 0); 522 bpf_preempt_enable(); 523 } 524 525 #define arena_spin_lock_irqsave(lock, flags) \ 526 ({ \ 527 int __ret; \ 528 bpf_local_irq_save(&(flags)); \ 529 __ret = arena_spin_lock((lock)); \ 530 if (__ret) \ 531 bpf_local_irq_restore(&(flags)); \ 532 (__ret); \ 533 }) 534 535 #define arena_spin_unlock_irqrestore(lock, flags) \ 536 ({ \ 537 arena_spin_unlock((lock)); \ 538 bpf_local_irq_restore(&(flags)); \ 539 }) 540 541 #endif 542 543 #endif /* BPF_ARENA_SPIN_LOCK_H */ 544