/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _GEN_PV_LOCK_SLOWPATH
#error "do not include this file"
#endif

#include <linux/hash.h>
#include <linux/bootmem.h>
#include <linux/debug_locks.h>

/*
 * Implement paravirt qspinlocks; the general idea is to halt the vcpus instead
 * of spinning them.
 *
 * This relies on the architecture to provide two paravirt hypercalls:
 *
 *   pv_wait(u8 *ptr, u8 val) -- suspends the vcpu if *ptr == val
 *   pv_kick(cpu)             -- wakes a suspended vcpu
 *
 * Using these we implement __pv_queued_spin_lock_slowpath() and
 * __pv_queued_spin_unlock() to replace native_queued_spin_lock_slowpath() and
 * native_queued_spin_unlock().
 */

#define _Q_SLOW_VAL	(3U << _Q_LOCKED_OFFSET)

/*
 * Queue Node Adaptive Spinning
 *
 * A queue node vCPU will stop spinning if the vCPU in the previous node is
 * not running. The one lock stealing attempt allowed at slowpath entry
 * mitigates the slight slowdown for non-overcommitted guests with this
 * aggressive wait-early mechanism.
 *
 * The status of the previous node will be checked at fixed interval
 * controlled by PV_PREV_CHECK_MASK. This is to ensure that we won't
 * pound on the cacheline of the previous node too heavily.
 */
#define PV_PREV_CHECK_MASK	0xff

/*
 * Queue node uses: vcpu_running & vcpu_halted.
 * Queue head uses: vcpu_running & vcpu_hashed.
 */
enum vcpu_state {
	vcpu_running = 0,
	vcpu_halted,		/* Used only in pv_wait_node */
	vcpu_hashed,		/* = pv_hash'ed + vcpu_halted */
};

struct pv_node {
	struct mcs_spinlock	mcs;
	struct mcs_spinlock	__res[3];

	int			cpu;
	u8			state;
};

/*
 * Hybrid PV queued/unfair lock
 *
 * By replacing the regular queued_spin_trylock() with the function below,
 * it will be called once when a lock waiter enters the PV slowpath before
 * being queued.
 *
 * The pending bit is set by the queue head vCPU of the MCS wait queue in
 * pv_wait_head_or_lock() to signal that it is ready to spin on the lock.
 * When that bit becomes visible to the incoming waiters, no lock stealing
 * is allowed. The function will return immediately to make the waiters
 * enter the MCS wait queue. So lock starvation shouldn't happen as long
 * as the queued mode vCPUs are actively running to set the pending bit
 * and hence disabling lock stealing.
 *
 * When the pending bit isn't set, the lock waiters will stay in the unfair
 * mode spinning on the lock unless the MCS wait queue is empty. In this
 * case, the lock waiters will enter the queued mode slowpath trying to
 * become the queue head and set the pending bit.
 *
 * This hybrid PV queued/unfair lock combines the best attributes of a
 * queued lock (no lock starvation) and an unfair lock (good performance
 * on not heavily contended locks).
 */
#define queued_spin_trylock(l)	pv_hybrid_queued_unfair_trylock(l)
static inline bool pv_hybrid_queued_unfair_trylock(struct qspinlock *lock)
{
	/*
	 * Stay in unfair lock mode as long as queued mode waiters are
	 * present in the MCS wait queue but the pending bit isn't set.
	 */
	for (;;) {
		int val = atomic_read(&lock->val);

		if (!(val & _Q_LOCKED_PENDING_MASK) &&
		   (cmpxchg_acquire(&lock->locked, 0, _Q_LOCKED_VAL) == 0)) {
			qstat_inc(qstat_pv_lock_stealing, true);
			return true;
		}
		if (!(val & _Q_TAIL_MASK) || (val & _Q_PENDING_MASK))
			break;

		cpu_relax();
	}

	return false;
}

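/*
 * A note on the sub-word accesses used in this file: with _Q_PENDING_BITS == 8
 * the lock word is split into byte-sized fields, roughly as below (see the
 * union in include/asm-generic/qspinlock_types.h; little-endian layout shown):
 *
 *	struct qspinlock {
 *		union {
 *			atomic_t val;
 *			struct { u8  locked, pending; };
 *			struct { u16 locked_pending, tail; };
 *		};
 *	};
 *
 * This is what allows the cmpxchg_acquire() on lock->locked above, and
 * set_pending()/trylock_clear_pending() below, to operate on the locked and
 * pending bytes without disturbing the tail.
 */
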
/*
 * The pending bit is used by the queue head vCPU to indicate that it
 * is actively spinning on the lock and no lock stealing is allowed.
 */
#if _Q_PENDING_BITS == 8
static __always_inline void set_pending(struct qspinlock *lock)
{
	WRITE_ONCE(lock->pending, 1);
}

/*
 * The pending bit check in pv_hybrid_queued_unfair_trylock() isn't a memory
 * barrier. Therefore, an atomic cmpxchg_acquire() is used to acquire the
 * lock just to be sure that it will get it.
 */
static __always_inline int trylock_clear_pending(struct qspinlock *lock)
{
	return !READ_ONCE(lock->locked) &&
	       (cmpxchg_acquire(&lock->locked_pending, _Q_PENDING_VAL,
				_Q_LOCKED_VAL) == _Q_PENDING_VAL);
}
#else /* _Q_PENDING_BITS == 8 */
static __always_inline void set_pending(struct qspinlock *lock)
{
	atomic_or(_Q_PENDING_VAL, &lock->val);
}

static __always_inline int trylock_clear_pending(struct qspinlock *lock)
{
	int val = atomic_read(&lock->val);

	for (;;) {
		int old, new;

		if (val & _Q_LOCKED_MASK)
			break;

		/*
		 * Try to clear pending bit & set locked bit
		 */
		old = val;
		new = (val & ~_Q_PENDING_MASK) | _Q_LOCKED_VAL;
		val = atomic_cmpxchg_acquire(&lock->val, old, new);

		if (val == old)
			return 1;
	}
	return 0;
}
#endif /* _Q_PENDING_BITS == 8 */

/*
 * Lock and MCS node addresses hash table for fast lookup
 *
 * Hashing is done on a per-cacheline basis to minimize the need to access
 * more than one cacheline.
 *
 * Dynamically allocate a hash table big enough to hold at least 4X the
 * number of possible cpus in the system. Allocation is done on page
 * granularity. So the minimum number of hash buckets should be at least
 * 256 (64-bit) or 512 (32-bit) to fully utilize a 4k page.
 *
 * Since we should not be holding locks from NMI context (very rare indeed) the
 * max load factor is 0.75, which is around the point where open addressing
 * breaks down.
 */
struct pv_hash_entry {
	struct qspinlock *lock;
	struct pv_node   *node;
};

#define PV_HE_PER_LINE	(SMP_CACHE_BYTES / sizeof(struct pv_hash_entry))
#define PV_HE_MIN	(PAGE_SIZE / sizeof(struct pv_hash_entry))

static struct pv_hash_entry *pv_lock_hash;
static unsigned int pv_lock_hash_bits __read_mostly;

/*
 * Allocate memory for the PV qspinlock hash buckets
 *
 * This function should be called from the paravirt spinlock initialization
 * routine.
 */
void __init __pv_init_lock_hash(void)
{
	int pv_hash_size = ALIGN(4 * num_possible_cpus(), PV_HE_PER_LINE);

	if (pv_hash_size < PV_HE_MIN)
		pv_hash_size = PV_HE_MIN;

	/*
	 * Allocate space from bootmem which should be page-size aligned
	 * and hence cacheline aligned.
	 */
	pv_lock_hash = alloc_large_system_hash("PV qspinlock",
					       sizeof(struct pv_hash_entry),
					       pv_hash_size, 0,
					       HASH_EARLY | HASH_ZERO,
					       &pv_lock_hash_bits, NULL,
					       pv_hash_size, pv_hash_size);
}

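/*
 * To make the sizing above concrete: assuming a 64-bit system with 64-byte
 * cache lines and 4K pages, a pv_hash_entry is two pointers = 16 bytes, so
 * PV_HE_PER_LINE = 4 and PV_HE_MIN = 256. With, say, 64 possible CPUs,
 * 4 * 64 = 256 entries are requested, which is also the minimum, so the
 * table occupies exactly one 4K page and pv_lock_hash_bits ends up as 8.
 */
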
#define for_each_hash_entry(he, offset, hash)						\
	for (hash &= ~(PV_HE_PER_LINE - 1), he = &pv_lock_hash[hash], offset = 0;	\
	     offset < (1 << pv_lock_hash_bits);						\
	     offset++, he = &pv_lock_hash[(hash + offset) & ((1 << pv_lock_hash_bits) - 1)])

static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node)
{
	unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
	struct pv_hash_entry *he;
	int hopcnt = 0;

	for_each_hash_entry(he, offset, hash) {
		hopcnt++;
		if (!cmpxchg(&he->lock, NULL, lock)) {
			WRITE_ONCE(he->node, node);
			qstat_hop(hopcnt);
			return &he->lock;
		}
	}
	/*
	 * Hard assume there is a free entry for us.
	 *
	 * This is guaranteed by ensuring every blocked lock only ever consumes
	 * a single entry, and since we only have 4 nesting levels per CPU
	 * and allocated 4*num_possible_cpus(), this must be so.
	 *
	 * The single entry is guaranteed by having the lock owner unhash
	 * before it releases.
	 */
	BUG();
}

static struct pv_node *pv_unhash(struct qspinlock *lock)
{
	unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
	struct pv_hash_entry *he;
	struct pv_node *node;

	for_each_hash_entry(he, offset, hash) {
		if (READ_ONCE(he->lock) == lock) {
			node = READ_ONCE(he->node);
			WRITE_ONCE(he->lock, NULL);
			return node;
		}
	}
	/*
	 * Hard assume we'll find an entry.
	 *
	 * This guarantees a limited lookup time and is itself guaranteed by
	 * having the lock owner do the unhash -- IFF the unlock sees the
	 * SLOW flag, there MUST be a hash entry.
	 */
	BUG();
}

/*
 * Return true when it is time to check the previous node and it is not in a
 * running state.
 */
static inline bool
pv_wait_early(struct pv_node *prev, int loop)
{
	if ((loop & PV_PREV_CHECK_MASK) != 0)
		return false;

	return READ_ONCE(prev->state) != vcpu_running || vcpu_is_preempted(prev->cpu);
}

/*
 * Initialize the PV part of the mcs_spinlock node.
 */
static void pv_init_node(struct mcs_spinlock *node)
{
	struct pv_node *pn = (struct pv_node *)node;

	BUILD_BUG_ON(sizeof(struct pv_node) > 5*sizeof(struct mcs_spinlock));

	pn->cpu = smp_processor_id();
	pn->state = vcpu_running;
}

/*
 * Wait for node->locked to become true, halt the vcpu after a short spin.
 * pv_kick_node() is used to set _Q_SLOW_VAL and fill in hash table on its
 * behalf.
 */
static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
{
	struct pv_node *pn = (struct pv_node *)node;
	struct pv_node *pp = (struct pv_node *)prev;
	int loop;
	bool wait_early;

	for (;;) {
		for (wait_early = false, loop = SPIN_THRESHOLD; loop; loop--) {
			if (READ_ONCE(node->locked))
				return;
			if (pv_wait_early(pp, loop)) {
				wait_early = true;
				break;
			}
			cpu_relax();
		}

		/*
		 * Order pn->state vs pn->locked thusly:
		 *
		 * [S] pn->state = vcpu_halted	  [S] next->locked = 1
		 *     MB			      MB
		 * [L] pn->locked		  [RmW] pn->state = vcpu_hashed
		 *
		 * Matches the cmpxchg() from pv_kick_node().
		 */
		smp_store_mb(pn->state, vcpu_halted);

		if (!READ_ONCE(node->locked)) {
			qstat_inc(qstat_pv_wait_node, true);
			qstat_inc(qstat_pv_wait_early, wait_early);
			pv_wait(&pn->state, vcpu_halted);
		}

		/*
		 * If pv_kick_node() changed us to vcpu_hashed, retain that
		 * value so that pv_wait_head_or_lock() knows to not also try
		 * to hash this lock.
		 */
		cmpxchg(&pn->state, vcpu_halted, vcpu_running);

		/*
		 * If the locked flag is still not set after wakeup, it is a
		 * spurious wakeup and the vCPU should wait again. However,
		 * there is a pretty high overhead for CPU halting and kicking.
		 * So it is better to spin for a while in the hope that the
		 * MCS lock will be released soon.
		 */
		qstat_inc(qstat_pv_spurious_wakeup, !READ_ONCE(node->locked));
	}

	/*
	 * By now our node->locked should be 1 and our caller will not actually
	 * spin-wait for it. We do however rely on our caller to do a
	 * load-acquire for us.
	 */
}

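/*
 * For reference, the pv_node->state transitions implemented by the code in
 * this file (a descriptive summary, not new behaviour):
 *
 *   queue node, pv_wait_node():
 *	vcpu_running -> vcpu_halted	before calling pv_wait()
 *	vcpu_halted  -> vcpu_running	on wakeup (cmpxchg, unless hashed)
 *
 *   lock owner, pv_kick_node():
 *	vcpu_halted  -> vcpu_hashed	lock gets hashed on the waiter's behalf
 *
 *   queue head, pv_wait_head_or_lock():
 *	vcpu_hashed on entry means the lock is already hashed; otherwise the
 *	head hashes it itself before sleeping on _Q_SLOW_VAL, toggling its
 *	state between vcpu_running (spinning) and vcpu_hashed (sleeping).
 */
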
/*
 * Called after setting next->locked = 1 when we're the lock owner.
 *
 * Instead of waking the waiters stuck in pv_wait_node(), advance their state
 * such that they're waiting in pv_wait_head_or_lock(); this avoids a
 * wake/sleep cycle.
 */
static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
{
	struct pv_node *pn = (struct pv_node *)node;

	/*
	 * If the vCPU is indeed halted, advance its state to match that of
	 * pv_wait_node(). If OTOH this fails, the vCPU was running and will
	 * observe its next->locked value and advance itself.
	 *
	 * Matches with smp_store_mb() and cmpxchg() in pv_wait_node()
	 *
	 * The write to next->locked in arch_mcs_spin_unlock_contended()
	 * must be ordered before the read of pn->state in the cmpxchg()
	 * below for the code to work correctly. To guarantee full ordering
	 * irrespective of the success or failure of the cmpxchg(),
	 * a relaxed version with explicit barrier is used. The control
	 * dependency will order the reading of pn->state before any
	 * subsequent writes.
	 */
	smp_mb__before_atomic();
	if (cmpxchg_relaxed(&pn->state, vcpu_halted, vcpu_hashed)
	    != vcpu_halted)
		return;

	/*
	 * Put the lock into the hash table and set the _Q_SLOW_VAL.
	 *
	 * As this is the same vCPU that will check the _Q_SLOW_VAL value and
	 * the hash table later on at unlock time, no atomic instruction is
	 * needed.
	 */
	WRITE_ONCE(lock->locked, _Q_SLOW_VAL);
	(void)pv_hash(lock, pn);
}

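/*
 * Hash entry lifetime, as implemented above and below: an entry for a lock is
 * created either here in pv_kick_node() (on behalf of a halted waiter) or by
 * the queue head in pv_wait_head_or_lock(). It is removed either by the head
 * itself when it finds the lock free and reverts _Q_SLOW_VAL, or by the
 * unlocker via pv_unhash() in __pv_queued_spin_unlock_slowpath(). At any
 * point in time a blocked lock therefore has at most one entry, which is the
 * invariant the BUG() in pv_hash() relies on.
 */
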
/*
 * Wait for l->locked to become clear and acquire the lock;
 * halt the vcpu after a short spin.
 * __pv_queued_spin_unlock() will wake us.
 *
 * The current value of the lock will be returned for additional processing.
 */
static u32
pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
{
	struct pv_node *pn = (struct pv_node *)node;
	struct qspinlock **lp = NULL;
	int waitcnt = 0;
	int loop;

	/*
	 * If pv_kick_node() already advanced our state, we don't need to
	 * insert ourselves into the hash table anymore.
	 */
	if (READ_ONCE(pn->state) == vcpu_hashed)
		lp = (struct qspinlock **)1;

	/*
	 * Tracking # of slowpath locking operations
	 */
	qstat_inc(qstat_lock_slowpath, true);

	for (;; waitcnt++) {
		/*
		 * Set correct vCPU state to be used by queue node wait-early
		 * mechanism.
		 */
		WRITE_ONCE(pn->state, vcpu_running);

		/*
		 * Set the pending bit in the active lock spinning loop to
		 * disable lock stealing before attempting to acquire the lock.
		 */
		set_pending(lock);
		for (loop = SPIN_THRESHOLD; loop; loop--) {
			if (trylock_clear_pending(lock))
				goto gotlock;
			cpu_relax();
		}
		clear_pending(lock);

		if (!lp) { /* ONCE */
			lp = pv_hash(lock, pn);

			/*
			 * We must hash before setting _Q_SLOW_VAL, such that
			 * when we observe _Q_SLOW_VAL in __pv_queued_spin_unlock()
			 * we'll be sure to be able to observe our hash entry.
			 *
			 * [S] <hash>			  [Rmw] l->locked == _Q_SLOW_VAL
			 *     MB			        RMB
			 * [RmW] l->locked = _Q_SLOW_VAL  [L] <unhash>
			 *
			 * Matches the smp_rmb() in __pv_queued_spin_unlock().
			 */
			if (xchg(&lock->locked, _Q_SLOW_VAL) == 0) {
				/*
				 * The lock was free and now we own the lock.
				 * Change the lock value back to _Q_LOCKED_VAL
				 * and unhash the table.
				 */
				WRITE_ONCE(lock->locked, _Q_LOCKED_VAL);
				WRITE_ONCE(*lp, NULL);
				goto gotlock;
			}
		}
		WRITE_ONCE(pn->state, vcpu_hashed);
		qstat_inc(qstat_pv_wait_head, true);
		qstat_inc(qstat_pv_wait_again, waitcnt);
		pv_wait(&lock->locked, _Q_SLOW_VAL);

		/*
		 * Because of lock stealing, the queue head vCPU may not be
		 * able to acquire the lock before it has to wait again.
		 */
	}

	/*
	 * The cmpxchg() or xchg() call before coming here provides the
	 * acquire semantics for locking. The dummy ORing of _Q_LOCKED_VAL
	 * here is to indicate to the compiler that the value will always
	 * be nonzero to enable better code optimization.
	 */
gotlock:
	return (u32)(atomic_read(&lock->val) | _Q_LOCKED_VAL);
}

/*
 * PV versions of the unlock fastpath and slowpath functions to be used
 * instead of queued_spin_unlock().
 */
__visible void
__pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked)
{
	struct pv_node *node;

	if (unlikely(locked != _Q_SLOW_VAL)) {
		WARN(!debug_locks_silent,
		     "pvqspinlock: lock 0x%lx has corrupted value 0x%x!\n",
		     (unsigned long)lock, atomic_read(&lock->val));
		return;
	}

	/*
	 * A failed cmpxchg doesn't provide any memory-ordering guarantees,
	 * so we need a barrier to order the read of the node data in
	 * pv_unhash *after* we've read the lock being _Q_SLOW_VAL.
	 *
	 * Matches the xchg() in pv_wait_head_or_lock() setting _Q_SLOW_VAL.
	 */
	smp_rmb();

	/*
	 * Since the above failed to release, this must be the SLOW path.
	 * Therefore start by looking up the blocked node and unhashing it.
	 */
	node = pv_unhash(lock);

	/*
	 * Now that we have a reference to the (likely) blocked pv_node,
	 * release the lock.
	 */
	smp_store_release(&lock->locked, 0);

	/*
	 * At this point the memory pointed at by lock can be freed/reused,
	 * however we can still use the pv_node to kick the CPU.
	 * The other vCPU may not really be halted, but kicking an active
	 * vCPU is harmless other than the additional latency in completing
	 * the unlock.
	 */
	qstat_inc(qstat_pv_kick_unlock, true);
	pv_kick(node->cpu);
}

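/*
 * A worked example of the slow unlock handshake (descriptive only; both
 * functions above and below already implement it):
 *
 *   queue head (pv_wait_head_or_lock)	   lock holder (__pv_queued_spin_unlock)
 *   ---------------------------------	   -------------------------------------
 *   lp = pv_hash(lock, pn);
 *   xchg(&lock->locked, _Q_SLOW_VAL);
 *					   cmpxchg_release(&lock->locked,
 *						_Q_LOCKED_VAL, 0) fails,
 *						returns _Q_SLOW_VAL
 *   pv_wait(&lock->locked, _Q_SLOW_VAL);  smp_rmb();
 *					   node = pv_unhash(lock);
 *					   smp_store_release(&lock->locked, 0);
 *					   pv_kick(node->cpu);
 *   ...woken, loops and retries the
 *   pending/trylock sequence.
 */
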
/*
 * Include the architecture specific callee-save thunk of the
 * __pv_queued_spin_unlock(). This thunk is put together with
 * __pv_queued_spin_unlock() to make the callee-save thunk and the real unlock
 * function close to each other sharing consecutive instruction cachelines.
 * Alternatively, an architecture specific version of __pv_queued_spin_unlock()
 * can be defined.
 */
#include <asm/qspinlock_paravirt.h>

#ifndef __pv_queued_spin_unlock
__visible void __pv_queued_spin_unlock(struct qspinlock *lock)
{
	u8 locked;

	/*
	 * We must not unlock if SLOW, because in that case we must first
	 * unhash. Otherwise it would be possible to have multiple @lock
	 * entries, which would be BAD.
	 */
	locked = cmpxchg_release(&lock->locked, _Q_LOCKED_VAL, 0);
	if (likely(locked == _Q_LOCKED_VAL))
		return;

	__pv_queued_spin_unlock_slowpath(lock, locked);
}
#endif /* __pv_queued_spin_unlock */
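
/*
 * For context (not part of this file): a hypervisor backend supplies the
 * pv_wait()/pv_kick() hypercalls and installs these slowpaths at boot. As a
 * rough sketch of how an x86 KVM guest of this kernel era wires it up (the
 * helper names kvm_wait/kvm_kick_cpu live in arch/x86/kernel/kvm.c, and the
 * ops structure name differs between kernel versions):
 *
 *	__pv_init_lock_hash();
 *	pv_lock_ops.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath;
 *	pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock);
 *	pv_lock_ops.wait = kvm_wait;
 *	pv_lock_ops.kick = kvm_kick_cpu;
 */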