// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Resilient Queued Spin Lock
 *
 * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
 * (C) Copyright 2013-2014,2018 Red Hat, Inc.
 * (C) Copyright 2015 Intel Corp.
 * (C) Copyright 2015 Hewlett-Packard Enterprise Development LP
 * (C) Copyright 2024-2025 Meta Platforms, Inc. and affiliates.
 *
 * Authors: Waiman Long <longman@redhat.com>
 *          Peter Zijlstra <peterz@infradead.org>
 *          Kumar Kartikeya Dwivedi <memxor@gmail.com>
 */

#include <linux/smp.h>
#include <linux/bug.h>
#include <linux/bpf.h>
#include <linux/err.h>
#include <linux/cpumask.h>
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <linux/mutex.h>
#include <linux/prefetch.h>
#include <asm/byteorder.h>
#ifdef CONFIG_QUEUED_SPINLOCKS
#include <asm/qspinlock.h>
#endif
#include <trace/events/lock.h>
#include <asm/rqspinlock.h>
#include <linux/timekeeping.h>

/*
 * Include queued spinlock definitions and statistics code
 */
#ifdef CONFIG_QUEUED_SPINLOCKS
#include "../locking/qspinlock.h"
#include "../locking/lock_events.h"
#include "rqspinlock.h"
#include "../locking/mcs_spinlock.h"
#endif

/*
 * The basic principle of a queue-based spinlock can best be understood
 * by studying a classic queue-based spinlock implementation called the
 * MCS lock. A copy of the original MCS lock paper ("Algorithms for Scalable
 * Synchronization on Shared-Memory Multiprocessors" by Mellor-Crummey and
 * Scott) is available at
 *
 * https://bugzilla.kernel.org/show_bug.cgi?id=206115
 *
 * This queued spinlock implementation is based on the MCS lock; however, to
 * make it fit the 4 bytes that we assume spinlock_t to be, and to preserve its
 * existing API, we must modify it somewhat.
 *
 * In particular, where the traditional MCS lock consists of a tail pointer
 * (8 bytes) and needs the next pointer (another 8 bytes) of its own node to
 * unlock the next pending waiter (next->locked), we compress both of these:
 * {tail, next->locked} into a single u32 value.
 *
 * A spinlock disables recursion of its own context, and there is a limit to
 * the contexts that can nest; namely: task, softirq, hardirq, nmi. As there
 * are at most 4 nesting levels, the level can be encoded by a 2-bit number.
 * We can then encode the tail by combining the 2-bit nesting level with the
 * cpu number. With one byte for the lock value and 3 bytes for the tail, only
 * a 32-bit word is now needed. Even though we only need 1 bit for the lock,
 * we extend it to a full byte to achieve better performance for architectures
 * that support atomic byte writes.
 *
 * We also change the first spinner to spin on the lock bit instead of its
 * node, thereby avoiding the need to carry a node from lock to unlock, and
 * preserving the existing lock API. This also makes the unlock code simpler
 * and faster.
 *
 * N.B. The current implementation only supports architectures that allow
 *      atomic operations on smaller 8-bit and 16-bit data types.
 *
 */
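/*
 * Editor's illustrative sketch of the tail encoding described above (an
 * assumption-laden example, not part of this file's algorithm; the real
 * helpers live in the shared qspinlock code). The CPU number is stored
 * off-by-one together with the 2-bit nesting index in the upper bytes of the
 * 32-bit word, so that a tail value of 0 means "no waiters":
 *
 *	u32 example_encode_tail(int cpu, int idx)
 *	{
 *		return ((u32)(cpu + 1) << _Q_TAIL_CPU_OFFSET) |
 *		       ((u32)idx << _Q_TAIL_IDX_OFFSET);
 *	}
 *
 *	void example_decode_tail(u32 tail, int *cpu, int *idx)
 *	{
 *		*cpu = ((tail & _Q_TAIL_CPU_MASK) >> _Q_TAIL_CPU_OFFSET) - 1;
 *		*idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET;
 *	}
 */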
struct rqspinlock_timeout {
	u64 timeout_end;
	u64 duration;
	u64 cur;
	u16 spin;
};

#define RES_TIMEOUT_VAL	2

DEFINE_PER_CPU_ALIGNED(struct rqspinlock_held, rqspinlock_held_locks);
EXPORT_SYMBOL_GPL(rqspinlock_held_locks);

static bool is_lock_released(rqspinlock_t *lock, u32 mask)
{
	if (!(atomic_read_acquire(&lock->val) & (mask)))
		return true;
	return false;
}

static noinline int check_deadlock_AA(rqspinlock_t *lock)
{
	struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks);
	int cnt = min(RES_NR_HELD, rqh->cnt);

	/*
	 * Return an error if we hold the lock we are attempting to acquire.
	 * We'll iterate over max 32 locks; no need to do is_lock_released.
	 */
	for (int i = 0; i < cnt - 1; i++) {
		if (rqh->locks[i] == lock)
			return -EDEADLK;
	}
	return 0;
}

/*
 * This focuses on the most common case of ABBA deadlocks (or ABBA involving
 * more locks, which reduce to ABBA). This is not exhaustive, and we rely on
 * timeouts as the final line of defense.
 */
static noinline int check_deadlock_ABBA(rqspinlock_t *lock, u32 mask)
{
	struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks);
	int rqh_cnt = min(RES_NR_HELD, rqh->cnt);
	void *remote_lock;
	int cpu;

	/*
	 * Find the CPU holding the lock that we want to acquire. If there is a
	 * deadlock scenario, we will read a stable set on the remote CPU and
	 * find the target. This would be a constant time operation instead of
	 * O(NR_CPUS) if we could determine the owning CPU from a lock value,
	 * but that requires increasing the size of the lock word.
	 */
	for_each_possible_cpu(cpu) {
		struct rqspinlock_held *rqh_cpu = per_cpu_ptr(&rqspinlock_held_locks, cpu);
		int real_cnt = READ_ONCE(rqh_cpu->cnt);
		int cnt = min(RES_NR_HELD, real_cnt);

		/*
		 * Make sure we break out of this loop if the lock becomes
		 * available for us to potentially acquire.
		 */
		if (is_lock_released(lock, mask))
			return 0;

		/*
		 * Skip ourselves, and CPUs whose count is less than 2, as they
		 * need at least one held lock and one acquisition attempt
		 * (reflected as the topmost entry) to participate in an ABBA
		 * deadlock.
		 *
		 * If cnt is more than RES_NR_HELD, it means the current lock
		 * being acquired won't appear in the table, and other locks in
		 * the table are already held, so we can't determine ABBA.
		 */
		if (cpu == smp_processor_id() || real_cnt < 2 || real_cnt > RES_NR_HELD)
			continue;

		/*
		 * Obtain the entry at the top; this corresponds to the lock the
		 * remote CPU is attempting to acquire in a deadlock situation,
		 * and would be one of the locks we hold on the current CPU.
		 */
		remote_lock = READ_ONCE(rqh_cpu->locks[cnt - 1]);
		/*
		 * If it is NULL, we've raced and cannot determine a deadlock
		 * conclusively, skip this CPU.
		 */
		if (!remote_lock)
			continue;
		/*
		 * Find if the lock we're attempting to acquire is held by this
		 * CPU. Don't consider the topmost entry, as that must be the
		 * latest lock being held or acquired. For a deadlock, the
		 * target CPU must also attempt to acquire a lock we hold, so
		 * for this search only 'cnt - 1' entries are important.
		 */
		for (int i = 0; i < cnt - 1; i++) {
			if (READ_ONCE(rqh_cpu->locks[i]) != lock)
				continue;
			/*
			 * We found our lock as held on the remote CPU. Is the
			 * acquisition attempt on the remote CPU for a lock held
			 * by us? If so, we have a deadlock situation, and need
			 * to recover.
			 */
			for (int i = 0; i < rqh_cnt - 1; i++) {
				if (rqh->locks[i] == remote_lock)
					return -EDEADLK;
			}
			/*
			 * Inconclusive; retry later.
			 */
			return 0;
		}
	}
	return 0;
}
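/*
 * Worked example for the ABBA check above (editor's illustration with
 * hypothetical locks A and B): CPU 0 holds A and is attempting B, while CPU 1
 * holds B and is attempting A. The per-CPU held-locks tables then look like:
 *
 *	CPU 0: locks[] = { A, B }	cnt = 2	(B is the in-flight, topmost entry)
 *	CPU 1: locks[] = { B, A }	cnt = 2	(A is the in-flight, topmost entry)
 *
 * When CPU 0 runs check_deadlock_ABBA() for B, it finds B among CPU 1's held
 * entries (excluding the topmost), reads CPU 1's topmost entry (A), finds A in
 * its own held set, and returns -EDEADLK.
 */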
static noinline int check_timeout(rqspinlock_t *lock, u32 mask,
				  struct rqspinlock_timeout *ts)
{
	u64 prev = ts->cur;
	u64 time;

	if (!ts->timeout_end) {
		if (check_deadlock_AA(lock))
			return -EDEADLK;
		ts->cur = ktime_get_mono_fast_ns();
		ts->timeout_end = ts->cur + ts->duration;
		return 0;
	}

	time = ktime_get_mono_fast_ns();
	if (time > ts->timeout_end)
		return -ETIMEDOUT;

	/*
	 * Has a millisecond interval passed since the last check? If so,
	 * trigger the deadlock checks.
	 */
	if (prev + NSEC_PER_MSEC < time) {
		ts->cur = time;
		return check_deadlock_ABBA(lock, mask);
	}

	return 0;
}

/*
 * Do not amortize with spins when res_smp_cond_load_acquire is defined,
 * as the macro does internal amortization for us.
 */
#ifndef res_smp_cond_load_acquire
#define RES_CHECK_TIMEOUT(ts, ret, mask)			      \
	({							      \
		if (!(ts).spin++)				      \
			(ret) = check_timeout((lock), (mask), &(ts)); \
		(ret);						      \
	})
#else
#define RES_CHECK_TIMEOUT(ts, ret, mask)			      \
	({ (ret) = check_timeout((lock), (mask), &(ts)); })
#endif

/*
 * Initialize the 'spin' member.
 * Set it to 0 to trigger the AA/ABBA checks immediately.
 */
#define RES_INIT_TIMEOUT(ts) ({ (ts).spin = 0; })

/*
 * We only need to reset 'timeout_end'; 'spin' will just wrap around as
 * necessary. The duration is defined for each spin attempt, so set it here.
 */
#define RES_RESET_TIMEOUT(ts, _duration) ({ (ts).timeout_end = 0; (ts).duration = _duration; })

/*
 * Provide a test-and-set fallback for cases when queued spin lock support is
 * absent from the architecture.
 */
int __lockfunc resilient_tas_spin_lock(rqspinlock_t *lock)
{
	struct rqspinlock_timeout ts;
	int val, ret = 0;

	RES_INIT_TIMEOUT(ts);
	/*
	 * The fast path is not invoked for the TAS fallback, so we must grab
	 * the deadlock detection entry here.
	 */
	grab_held_lock_entry(lock);

	/*
	 * Since the waiting loop's duration depends on the amount of
	 * contention, a short timeout like the one used by the rqspinlock
	 * waiting loops isn't enough. Choose one second as the timeout value.
	 */
	RES_RESET_TIMEOUT(ts, NSEC_PER_SEC);
retry:
	val = atomic_read(&lock->val);

	if (val || !atomic_try_cmpxchg(&lock->val, &val, 1)) {
		if (RES_CHECK_TIMEOUT(ts, ret, ~0u))
			goto out;
		cpu_relax();
		goto retry;
	}

	return 0;
out:
	release_held_lock_entry();
	return ret;
}
EXPORT_SYMBOL_GPL(resilient_tas_spin_lock);
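/*
 * Usage sketch (editor's illustration with a hypothetical caller): both the
 * TAS fallback above and the queued slow path below can fail, so callers must
 * check the return value and only unlock on success, mirroring the
 * bpf_res_spin_lock() wrapper at the bottom of this file:
 *
 *	int example_update(rqspinlock_t *lock, u64 *counter)
 *	{
 *		int ret;
 *
 *		preempt_disable();
 *		ret = res_spin_lock(lock);
 *		if (ret) {
 *			preempt_enable();
 *			return ret;	// -EDEADLK or -ETIMEDOUT
 *		}
 *		(*counter)++;
 *		res_spin_unlock(lock);
 *		preempt_enable();
 *		return 0;
 *	}
 */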
#ifdef CONFIG_QUEUED_SPINLOCKS

/*
 * Per-CPU queue node structures; we can never have more than 4 nested
 * contexts: task, softirq, hardirq, nmi.
 *
 * Exactly fits one 64-byte cacheline on a 64-bit architecture.
 */
static DEFINE_PER_CPU_ALIGNED(struct qnode, rqnodes[_Q_MAX_NODES]);

#ifndef res_smp_cond_load_acquire
#define res_smp_cond_load_acquire(v, c) smp_cond_load_acquire(v, c)
#endif

#define res_atomic_cond_read_acquire(v, c) res_smp_cond_load_acquire(&(v)->counter, (c))

/**
 * resilient_queued_spin_lock_slowpath - acquire the queued spinlock
 * @lock: Pointer to queued spinlock structure
 * @val: Current value of the queued spinlock 32-bit word
 *
 * Return:
 * * 0		- Lock was acquired successfully.
 * * -EDEADLK	- Lock acquisition failed because of AA/ABBA deadlock.
 * * -ETIMEDOUT - Lock acquisition failed because of timeout.
 *
 * (queue tail, pending bit, lock value)
 *
 *              fast     :    slow                                  :    unlock
 *                       :                                          :
 * uncontended  (0,0,0) -:--> (0,0,1) ------------------------------:--> (*,*,0)
 *                       :       | ^--------.------.             /  :
 *                       :       v           \      \            |  :
 * pending               :    (0,1,1) +--> (0,1,0)   \            |  :
 *                       :       | ^--'              |            |  :
 *                       :       v                   |            |  :
 * uncontended           :    (n,x,y) +--> (n,0,0) --'            |  :
 *   queue               :       | ^--'                           |  :
 *                       :       v                                |  :
 * contended             :    (*,x,y) +--> (*,0,0) ---> (*,0,1) -'  :
 *   queue               :         ^--'                             :
 */
int __lockfunc resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val)
{
	struct mcs_spinlock *prev, *next, *node;
	struct rqspinlock_timeout ts;
	int idx, ret = 0;
	u32 old, tail;

	BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));

	if (resilient_virt_spin_lock_enabled())
		return resilient_virt_spin_lock(lock);

	RES_INIT_TIMEOUT(ts);

	/*
	 * Wait for in-progress pending->locked hand-overs with a bounded
	 * number of spins so that we guarantee forward progress.
	 *
	 * 0,1,0 -> 0,0,1
	 */
	if (val == _Q_PENDING_VAL) {
		int cnt = _Q_PENDING_LOOPS;
		val = atomic_cond_read_relaxed(&lock->val,
					       (VAL != _Q_PENDING_VAL) || !cnt--);
	}

	/*
	 * If we observe any contention; queue.
	 */
	if (val & ~_Q_LOCKED_MASK)
		goto queue;

	/*
	 * trylock || pending
	 *
	 * 0,0,* -> 0,1,* -> 0,0,1 pending, trylock
	 */
	val = queued_fetch_set_pending_acquire(lock);

	/*
	 * If we observe contention, there is a concurrent locker.
	 *
	 * Undo and queue; our setting of PENDING might have made the
	 * n,0,0 -> 0,0,0 transition fail and it will now be waiting
	 * on @next to become !NULL.
	 */
	if (unlikely(val & ~_Q_LOCKED_MASK)) {

		/* Undo PENDING if we set it. */
		if (!(val & _Q_PENDING_MASK))
			clear_pending(lock);

		goto queue;
	}

	/* Deadlock detection entry already held after failing fast path. */

	/*
	 * We're pending, wait for the owner to go away.
	 *
	 * 0,1,1 -> *,1,0
	 *
	 * this wait loop must be a load-acquire such that we match the
	 * store-release that clears the locked bit and create lock
	 * sequentiality; this is because not all
	 * clear_pending_set_locked() implementations imply full
	 * barriers.
	 */
	if (val & _Q_LOCKED_MASK) {
		RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT);
		res_smp_cond_load_acquire(&lock->locked, !VAL || RES_CHECK_TIMEOUT(ts, ret, _Q_LOCKED_MASK));
	}

	if (ret) {
		/*
		 * We waited for the locked bit to go back to 0, as the pending
		 * waiter, but timed out. We need to clear the pending bit since
		 * we own it. Once a stuck owner has been recovered, the lock
		 * must be restored to a valid state, hence removing the pending
		 * bit is necessary.
		 *
		 * *,1,* -> *,0,*
		 */
		clear_pending(lock);
		lockevent_inc(rqspinlock_lock_timeout);
		goto err_release_entry;
	}

	/*
	 * take ownership and clear the pending bit.
	 *
	 * 0,1,0 -> 0,0,1
	 */
	clear_pending_set_locked(lock);
	lockevent_inc(lock_pending);
	return 0;

	/*
	 * End of pending bit optimistic spinning and beginning of MCS
	 * queuing.
	 */
queue:
	/*
	 * Do not queue if we're a waiter and someone is attempting this lock
	 * on the same CPU. In the case of NMIs, this prevents long timeouts
	 * where we interrupt the pending waiter and the owner that will
	 * eventually signal the head of our queue; both are logically but not
	 * physically part of the queue, hence outside the scope of the
	 * idx > 0 check below for the trylock fallback.
	 */
	if (check_deadlock_AA(lock)) {
		ret = -EDEADLK;
		goto err_release_entry;
	}

	lockevent_inc(lock_slowpath);
	/* Deadlock detection entry already held after failing fast path. */
	node = this_cpu_ptr(&rqnodes[0].mcs);
	idx = node->count++;
	tail = encode_tail(smp_processor_id(), idx);

	trace_contention_begin(lock, LCB_F_SPIN);

	/*
	 * 4 nodes are allocated based on the assumption that there will
	 * not be nested NMIs taking spinlocks. That may not be true in
	 * some architectures even though the chance of needing more than
	 * 4 nodes will still be extremely unlikely. When that happens,
	 * we fall back to attempting a trylock operation without using
	 * any MCS node. Unlike qspinlock which cannot fail, we have the
	 * option of failing the slow path, and under contention, such
	 * trylock spinning will likely be treated unfairly due to lack of
	 * queueing, hence do not spin.
	 */
	if (unlikely(idx >= _Q_MAX_NODES || (in_nmi() && idx > 0))) {
		lockevent_inc(lock_no_node);
		if (!queued_spin_trylock(lock)) {
			ret = -EDEADLK;
			goto err_release_node;
		}
		goto release;
	}

	node = grab_mcs_node(node, idx);

	/*
	 * Keep counts of non-zero index values:
	 */
	lockevent_cond_inc(lock_use_node2 + idx - 1, idx);

	/*
	 * Ensure that we increment the head node->count before initialising
	 * the actual node. If the compiler is kind enough to reorder these
	 * stores, then an IRQ could overwrite our assignments.
	 */
	barrier();

	node->locked = 0;
	node->next = NULL;

	/*
	 * We touched a (possibly) cold cacheline in the per-cpu queue node;
	 * attempt the trylock once more in the hope someone let go while we
	 * weren't watching.
	 */
	if (queued_spin_trylock(lock))
		goto release;

	/*
	 * Ensure that the initialisation of @node is complete before we
	 * publish the updated tail via xchg_tail() and potentially link
	 * @node into the waitqueue via WRITE_ONCE(prev->next, node) below.
	 */
	smp_wmb();

	/*
	 * Publish the updated tail.
	 * We have already touched the queueing cacheline; don't bother with
	 * pending stuff.
	 *
	 * p,*,* -> n,*,*
	 */
	old = xchg_tail(lock, tail);
	next = NULL;

	/*
	 * if there was a previous node; link it and wait until reaching the
	 * head of the waitqueue.
	 */
	if (old & _Q_TAIL_MASK) {
		int val;

		prev = decode_tail(old, rqnodes);

		/* Link @node into the waitqueue. */
		WRITE_ONCE(prev->next, node);

		val = arch_mcs_spin_lock_contended(&node->locked);
		if (val == RES_TIMEOUT_VAL) {
			ret = -ETIMEDOUT;
			goto waitq_timeout;
		}

		/*
		 * While waiting for the MCS lock, the next pointer may have
		 * been set by another lock waiter. We optimistically load
		 * the next pointer & prefetch the cacheline for writing
		 * to reduce latency in the upcoming MCS unlock operation.
		 */
		next = READ_ONCE(node->next);
		if (next)
			prefetchw(next);
	}

	/*
	 * we're at the head of the waitqueue, wait for the owner & pending to
	 * go away.
	 *
	 * *,x,y -> *,0,0
	 *
	 * this wait loop must use a load-acquire such that we match the
	 * store-release that clears the locked bit and create lock
	 * sequentiality; this is because the set_locked() function below
	 * does not imply a full barrier.
	 *
	 * We use RES_DEF_TIMEOUT * 2 as the duration, as RES_DEF_TIMEOUT is
	 * meant to span the maximum allowed time per critical section, and we
	 * may have both the owner of the lock and the pending bit waiter
	 * ahead of us.
	 */
	RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT * 2);
	val = res_atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK) ||
					   RES_CHECK_TIMEOUT(ts, ret, _Q_LOCKED_PENDING_MASK));

	/* Disable queue destruction when we detect deadlocks. */
	if (ret == -EDEADLK) {
		if (!next)
			next = smp_cond_load_relaxed(&node->next, (VAL));
		arch_mcs_spin_unlock_contended(&next->locked);
		goto err_release_node;
	}

waitq_timeout:
	if (ret) {
		/*
		 * If the tail is still pointing to us, then we are the final
		 * waiter, and are responsible for resetting the tail back to 0.
		 * Otherwise, if the cmpxchg operation fails, we signal the next
		 * waiter to take the exit and try the same. For a waiter with
		 * tail node 'n':
		 *
		 * n,*,* -> 0,*,*
		 *
		 * When performing cmpxchg for the whole word (NR_CPUS > 16k),
		 * it is possible that the locked/pending bits keep changing
		 * and we see failures even when we remain the head of the wait
		 * queue. However, eventually, the pending bit owner will unset
		 * the pending bit, and new waiters will queue behind us. This
		 * will leave the lock owner in charge, and it will eventually
		 * either set the locked bit to 0, or leave it as 1, allowing
		 * us to make progress.
		 *
		 * We terminate the whole wait queue for two reasons. Firstly,
		 * we eschew per-waiter timeouts with one applied at the head
		 * of the wait queue. This allows everyone to break out faster
		 * once we've seen the owner / pending waiter not responding
		 * for the timeout duration from the head. Secondly, it avoids
		 * complicated synchronization, because when not leaving in
		 * FIFO order, prev's next pointer needs to be fixed up etc.
		 */
		if (!try_cmpxchg_tail(lock, tail, 0)) {
			next = smp_cond_load_relaxed(&node->next, VAL);
			WRITE_ONCE(next->locked, RES_TIMEOUT_VAL);
		}
		lockevent_inc(rqspinlock_lock_timeout);
		goto err_release_node;
	}

	/*
	 * claim the lock:
	 *
	 * n,0,0 -> 0,0,1 : lock, uncontended
	 * *,*,0 -> *,*,1 : lock, contended
	 *
	 * If the queue head is the only one in the queue (lock value == tail)
	 * and nobody is pending, clear the tail code and grab the lock.
	 * Otherwise, we only need to grab the lock.
	 */

	/*
	 * Note: at this point: (val & _Q_PENDING_MASK) == 0, because of the
	 * above wait condition, therefore any concurrent setting of
	 * PENDING will make the uncontended transition fail.
	 */
	if ((val & _Q_TAIL_MASK) == tail) {
		if (atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL))
			goto release; /* No contention */
	}

	/*
	 * Either somebody is queued behind us or _Q_PENDING_VAL got set
	 * which will then detect the remaining tail and queue behind us
	 * ensuring we'll see a @next.
	 */
	set_locked(lock);

	/*
	 * contended path; wait for next if not observed yet, release.
	 */
	if (!next)
		next = smp_cond_load_relaxed(&node->next, (VAL));

	arch_mcs_spin_unlock_contended(&next->locked);

release:
	trace_contention_end(lock, 0);

	/*
	 * release the node
	 */
	__this_cpu_dec(rqnodes[0].mcs.count);
	return ret;
err_release_node:
	trace_contention_end(lock, ret);
	__this_cpu_dec(rqnodes[0].mcs.count);
err_release_entry:
	release_held_lock_entry();
	return ret;
}
EXPORT_SYMBOL_GPL(resilient_queued_spin_lock_slowpath);

#endif /* CONFIG_QUEUED_SPINLOCKS */
"Timeout detected" : "AA or ABBA deadlock detected"; }) 691 692 __bpf_kfunc int bpf_res_spin_lock(struct bpf_res_spin_lock *lock) 693 { 694 int ret; 695 696 BUILD_BUG_ON(sizeof(rqspinlock_t) != sizeof(struct bpf_res_spin_lock)); 697 BUILD_BUG_ON(__alignof__(rqspinlock_t) != __alignof__(struct bpf_res_spin_lock)); 698 699 preempt_disable(); 700 ret = res_spin_lock((rqspinlock_t *)lock); 701 if (unlikely(ret)) { 702 bpf_prog_report_rqspinlock_violation(REPORT_STR(ret), lock, false); 703 preempt_enable(); 704 return ret; 705 } 706 return 0; 707 } 708 709 __bpf_kfunc void bpf_res_spin_unlock(struct bpf_res_spin_lock *lock) 710 { 711 res_spin_unlock((rqspinlock_t *)lock); 712 preempt_enable(); 713 } 714 715 __bpf_kfunc int bpf_res_spin_lock_irqsave(struct bpf_res_spin_lock *lock, unsigned long *flags__irq_flag) 716 { 717 u64 *ptr = (u64 *)flags__irq_flag; 718 unsigned long flags; 719 int ret; 720 721 preempt_disable(); 722 local_irq_save(flags); 723 ret = res_spin_lock((rqspinlock_t *)lock); 724 if (unlikely(ret)) { 725 bpf_prog_report_rqspinlock_violation(REPORT_STR(ret), lock, true); 726 local_irq_restore(flags); 727 preempt_enable(); 728 return ret; 729 } 730 *ptr = flags; 731 return 0; 732 } 733 734 __bpf_kfunc void bpf_res_spin_unlock_irqrestore(struct bpf_res_spin_lock *lock, unsigned long *flags__irq_flag) 735 { 736 u64 *ptr = (u64 *)flags__irq_flag; 737 unsigned long flags = *ptr; 738 739 res_spin_unlock((rqspinlock_t *)lock); 740 local_irq_restore(flags); 741 preempt_enable(); 742 } 743 744 __bpf_kfunc_end_defs(); 745 746 BTF_KFUNCS_START(rqspinlock_kfunc_ids) 747 BTF_ID_FLAGS(func, bpf_res_spin_lock, KF_RET_NULL) 748 BTF_ID_FLAGS(func, bpf_res_spin_unlock) 749 BTF_ID_FLAGS(func, bpf_res_spin_lock_irqsave, KF_RET_NULL) 750 BTF_ID_FLAGS(func, bpf_res_spin_unlock_irqrestore) 751 BTF_KFUNCS_END(rqspinlock_kfunc_ids) 752 753 static const struct btf_kfunc_id_set rqspinlock_kfunc_set = { 754 .owner = THIS_MODULE, 755 .set = &rqspinlock_kfunc_ids, 756 }; 757 758 static __init int rqspinlock_register_kfuncs(void) 759 { 760 return register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &rqspinlock_kfunc_set); 761 } 762 late_initcall(rqspinlock_register_kfuncs); 763