1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright (C) 2016 Facebook 4 * Copyright (C) 2013-2014 Jens Axboe 5 */ 6 7 #include <linux/sched.h> 8 #include <linux/random.h> 9 #include <linux/sbitmap.h> 10 #include <linux/seq_file.h> 11 12 static int init_alloc_hint(struct sbitmap *sb, gfp_t flags) 13 { 14 unsigned depth = sb->depth; 15 16 sb->alloc_hint = alloc_percpu_gfp(unsigned int, flags); 17 if (!sb->alloc_hint) 18 return -ENOMEM; 19 20 if (depth && !sb->round_robin) { 21 int i; 22 23 for_each_possible_cpu(i) 24 *per_cpu_ptr(sb->alloc_hint, i) = prandom_u32() % depth; 25 } 26 return 0; 27 } 28 29 static inline unsigned update_alloc_hint_before_get(struct sbitmap *sb, 30 unsigned int depth) 31 { 32 unsigned hint; 33 34 hint = this_cpu_read(*sb->alloc_hint); 35 if (unlikely(hint >= depth)) { 36 hint = depth ? prandom_u32() % depth : 0; 37 this_cpu_write(*sb->alloc_hint, hint); 38 } 39 40 return hint; 41 } 42 43 static inline void update_alloc_hint_after_get(struct sbitmap *sb, 44 unsigned int depth, 45 unsigned int hint, 46 unsigned int nr) 47 { 48 if (nr == -1) { 49 /* If the map is full, a hint won't do us much good. */ 50 this_cpu_write(*sb->alloc_hint, 0); 51 } else if (nr == hint || unlikely(sb->round_robin)) { 52 /* Only update the hint if we used it. */ 53 hint = nr + 1; 54 if (hint >= depth - 1) 55 hint = 0; 56 this_cpu_write(*sb->alloc_hint, hint); 57 } 58 } 59 60 /* 61 * See if we have deferred clears that we can batch move 62 */ 63 static inline bool sbitmap_deferred_clear(struct sbitmap_word *map) 64 { 65 unsigned long mask; 66 67 if (!READ_ONCE(map->cleared)) 68 return false; 69 70 /* 71 * First get a stable cleared mask, setting the old mask to 0. 72 */ 73 mask = xchg(&map->cleared, 0); 74 75 /* 76 * Now clear the masked bits in our free word 77 */ 78 atomic_long_andnot(mask, (atomic_long_t *)&map->word); 79 BUILD_BUG_ON(sizeof(atomic_long_t) != sizeof(map->word)); 80 return true; 81 } 82 83 int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift, 84 gfp_t flags, int node, bool round_robin, 85 bool alloc_hint) 86 { 87 unsigned int bits_per_word; 88 unsigned int i; 89 90 if (shift < 0) 91 shift = sbitmap_calculate_shift(depth); 92 93 bits_per_word = 1U << shift; 94 if (bits_per_word > BITS_PER_LONG) 95 return -EINVAL; 96 97 sb->shift = shift; 98 sb->depth = depth; 99 sb->map_nr = DIV_ROUND_UP(sb->depth, bits_per_word); 100 sb->round_robin = round_robin; 101 102 if (depth == 0) { 103 sb->map = NULL; 104 return 0; 105 } 106 107 if (alloc_hint) { 108 if (init_alloc_hint(sb, flags)) 109 return -ENOMEM; 110 } else { 111 sb->alloc_hint = NULL; 112 } 113 114 sb->map = kcalloc_node(sb->map_nr, sizeof(*sb->map), flags, node); 115 if (!sb->map) { 116 free_percpu(sb->alloc_hint); 117 return -ENOMEM; 118 } 119 120 for (i = 0; i < sb->map_nr; i++) { 121 sb->map[i].depth = min(depth, bits_per_word); 122 depth -= sb->map[i].depth; 123 } 124 return 0; 125 } 126 EXPORT_SYMBOL_GPL(sbitmap_init_node); 127 128 void sbitmap_resize(struct sbitmap *sb, unsigned int depth) 129 { 130 unsigned int bits_per_word = 1U << sb->shift; 131 unsigned int i; 132 133 for (i = 0; i < sb->map_nr; i++) 134 sbitmap_deferred_clear(&sb->map[i]); 135 136 sb->depth = depth; 137 sb->map_nr = DIV_ROUND_UP(sb->depth, bits_per_word); 138 139 for (i = 0; i < sb->map_nr; i++) { 140 sb->map[i].depth = min(depth, bits_per_word); 141 depth -= sb->map[i].depth; 142 } 143 } 144 EXPORT_SYMBOL_GPL(sbitmap_resize); 145 146 static int __sbitmap_get_word(unsigned long *word, unsigned long depth, 147 unsigned int hint, bool wrap) 148 { 149 int nr; 150 151 /* don't wrap if starting from 0 */ 152 wrap = wrap && hint; 153 154 while (1) { 155 nr = find_next_zero_bit(word, depth, hint); 156 if (unlikely(nr >= depth)) { 157 /* 158 * We started with an offset, and we didn't reset the 159 * offset to 0 in a failure case, so start from 0 to 160 * exhaust the map. 161 */ 162 if (hint && wrap) { 163 hint = 0; 164 continue; 165 } 166 return -1; 167 } 168 169 if (!test_and_set_bit_lock(nr, word)) 170 break; 171 172 hint = nr + 1; 173 if (hint >= depth - 1) 174 hint = 0; 175 } 176 177 return nr; 178 } 179 180 static int sbitmap_find_bit_in_index(struct sbitmap *sb, int index, 181 unsigned int alloc_hint) 182 { 183 struct sbitmap_word *map = &sb->map[index]; 184 int nr; 185 186 do { 187 nr = __sbitmap_get_word(&map->word, map->depth, alloc_hint, 188 !sb->round_robin); 189 if (nr != -1) 190 break; 191 if (!sbitmap_deferred_clear(map)) 192 break; 193 } while (1); 194 195 return nr; 196 } 197 198 static int __sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint) 199 { 200 unsigned int i, index; 201 int nr = -1; 202 203 index = SB_NR_TO_INDEX(sb, alloc_hint); 204 205 /* 206 * Unless we're doing round robin tag allocation, just use the 207 * alloc_hint to find the right word index. No point in looping 208 * twice in find_next_zero_bit() for that case. 209 */ 210 if (sb->round_robin) 211 alloc_hint = SB_NR_TO_BIT(sb, alloc_hint); 212 else 213 alloc_hint = 0; 214 215 for (i = 0; i < sb->map_nr; i++) { 216 nr = sbitmap_find_bit_in_index(sb, index, alloc_hint); 217 if (nr != -1) { 218 nr += index << sb->shift; 219 break; 220 } 221 222 /* Jump to next index. */ 223 alloc_hint = 0; 224 if (++index >= sb->map_nr) 225 index = 0; 226 } 227 228 return nr; 229 } 230 231 int sbitmap_get(struct sbitmap *sb) 232 { 233 int nr; 234 unsigned int hint, depth; 235 236 if (WARN_ON_ONCE(unlikely(!sb->alloc_hint))) 237 return -1; 238 239 depth = READ_ONCE(sb->depth); 240 hint = update_alloc_hint_before_get(sb, depth); 241 nr = __sbitmap_get(sb, hint); 242 update_alloc_hint_after_get(sb, depth, hint, nr); 243 244 return nr; 245 } 246 EXPORT_SYMBOL_GPL(sbitmap_get); 247 248 static int __sbitmap_get_shallow(struct sbitmap *sb, 249 unsigned int alloc_hint, 250 unsigned long shallow_depth) 251 { 252 unsigned int i, index; 253 int nr = -1; 254 255 index = SB_NR_TO_INDEX(sb, alloc_hint); 256 257 for (i = 0; i < sb->map_nr; i++) { 258 again: 259 nr = __sbitmap_get_word(&sb->map[index].word, 260 min(sb->map[index].depth, shallow_depth), 261 SB_NR_TO_BIT(sb, alloc_hint), true); 262 if (nr != -1) { 263 nr += index << sb->shift; 264 break; 265 } 266 267 if (sbitmap_deferred_clear(&sb->map[index])) 268 goto again; 269 270 /* Jump to next index. */ 271 index++; 272 alloc_hint = index << sb->shift; 273 274 if (index >= sb->map_nr) { 275 index = 0; 276 alloc_hint = 0; 277 } 278 } 279 280 return nr; 281 } 282 283 int sbitmap_get_shallow(struct sbitmap *sb, unsigned long shallow_depth) 284 { 285 int nr; 286 unsigned int hint, depth; 287 288 if (WARN_ON_ONCE(unlikely(!sb->alloc_hint))) 289 return -1; 290 291 depth = READ_ONCE(sb->depth); 292 hint = update_alloc_hint_before_get(sb, depth); 293 nr = __sbitmap_get_shallow(sb, hint, shallow_depth); 294 update_alloc_hint_after_get(sb, depth, hint, nr); 295 296 return nr; 297 } 298 EXPORT_SYMBOL_GPL(sbitmap_get_shallow); 299 300 bool sbitmap_any_bit_set(const struct sbitmap *sb) 301 { 302 unsigned int i; 303 304 for (i = 0; i < sb->map_nr; i++) { 305 if (sb->map[i].word & ~sb->map[i].cleared) 306 return true; 307 } 308 return false; 309 } 310 EXPORT_SYMBOL_GPL(sbitmap_any_bit_set); 311 312 static unsigned int __sbitmap_weight(const struct sbitmap *sb, bool set) 313 { 314 unsigned int i, weight = 0; 315 316 for (i = 0; i < sb->map_nr; i++) { 317 const struct sbitmap_word *word = &sb->map[i]; 318 319 if (set) 320 weight += bitmap_weight(&word->word, word->depth); 321 else 322 weight += bitmap_weight(&word->cleared, word->depth); 323 } 324 return weight; 325 } 326 327 static unsigned int sbitmap_cleared(const struct sbitmap *sb) 328 { 329 return __sbitmap_weight(sb, false); 330 } 331 332 unsigned int sbitmap_weight(const struct sbitmap *sb) 333 { 334 return __sbitmap_weight(sb, true) - sbitmap_cleared(sb); 335 } 336 EXPORT_SYMBOL_GPL(sbitmap_weight); 337 338 void sbitmap_show(struct sbitmap *sb, struct seq_file *m) 339 { 340 seq_printf(m, "depth=%u\n", sb->depth); 341 seq_printf(m, "busy=%u\n", sbitmap_weight(sb)); 342 seq_printf(m, "cleared=%u\n", sbitmap_cleared(sb)); 343 seq_printf(m, "bits_per_word=%u\n", 1U << sb->shift); 344 seq_printf(m, "map_nr=%u\n", sb->map_nr); 345 } 346 EXPORT_SYMBOL_GPL(sbitmap_show); 347 348 static inline void emit_byte(struct seq_file *m, unsigned int offset, u8 byte) 349 { 350 if ((offset & 0xf) == 0) { 351 if (offset != 0) 352 seq_putc(m, '\n'); 353 seq_printf(m, "%08x:", offset); 354 } 355 if ((offset & 0x1) == 0) 356 seq_putc(m, ' '); 357 seq_printf(m, "%02x", byte); 358 } 359 360 void sbitmap_bitmap_show(struct sbitmap *sb, struct seq_file *m) 361 { 362 u8 byte = 0; 363 unsigned int byte_bits = 0; 364 unsigned int offset = 0; 365 int i; 366 367 for (i = 0; i < sb->map_nr; i++) { 368 unsigned long word = READ_ONCE(sb->map[i].word); 369 unsigned long cleared = READ_ONCE(sb->map[i].cleared); 370 unsigned int word_bits = READ_ONCE(sb->map[i].depth); 371 372 word &= ~cleared; 373 374 while (word_bits > 0) { 375 unsigned int bits = min(8 - byte_bits, word_bits); 376 377 byte |= (word & (BIT(bits) - 1)) << byte_bits; 378 byte_bits += bits; 379 if (byte_bits == 8) { 380 emit_byte(m, offset, byte); 381 byte = 0; 382 byte_bits = 0; 383 offset++; 384 } 385 word >>= bits; 386 word_bits -= bits; 387 } 388 } 389 if (byte_bits) { 390 emit_byte(m, offset, byte); 391 offset++; 392 } 393 if (offset) 394 seq_putc(m, '\n'); 395 } 396 EXPORT_SYMBOL_GPL(sbitmap_bitmap_show); 397 398 static unsigned int sbq_calc_wake_batch(struct sbitmap_queue *sbq, 399 unsigned int depth) 400 { 401 unsigned int wake_batch; 402 unsigned int shallow_depth; 403 404 /* 405 * For each batch, we wake up one queue. We need to make sure that our 406 * batch size is small enough that the full depth of the bitmap, 407 * potentially limited by a shallow depth, is enough to wake up all of 408 * the queues. 409 * 410 * Each full word of the bitmap has bits_per_word bits, and there might 411 * be a partial word. There are depth / bits_per_word full words and 412 * depth % bits_per_word bits left over. In bitwise arithmetic: 413 * 414 * bits_per_word = 1 << shift 415 * depth / bits_per_word = depth >> shift 416 * depth % bits_per_word = depth & ((1 << shift) - 1) 417 * 418 * Each word can be limited to sbq->min_shallow_depth bits. 419 */ 420 shallow_depth = min(1U << sbq->sb.shift, sbq->min_shallow_depth); 421 depth = ((depth >> sbq->sb.shift) * shallow_depth + 422 min(depth & ((1U << sbq->sb.shift) - 1), shallow_depth)); 423 wake_batch = clamp_t(unsigned int, depth / SBQ_WAIT_QUEUES, 1, 424 SBQ_WAKE_BATCH); 425 426 return wake_batch; 427 } 428 429 int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth, 430 int shift, bool round_robin, gfp_t flags, int node) 431 { 432 int ret; 433 int i; 434 435 ret = sbitmap_init_node(&sbq->sb, depth, shift, flags, node, 436 round_robin, true); 437 if (ret) 438 return ret; 439 440 sbq->min_shallow_depth = UINT_MAX; 441 sbq->wake_batch = sbq_calc_wake_batch(sbq, depth); 442 atomic_set(&sbq->wake_index, 0); 443 atomic_set(&sbq->ws_active, 0); 444 445 sbq->ws = kzalloc_node(SBQ_WAIT_QUEUES * sizeof(*sbq->ws), flags, node); 446 if (!sbq->ws) { 447 sbitmap_free(&sbq->sb); 448 return -ENOMEM; 449 } 450 451 for (i = 0; i < SBQ_WAIT_QUEUES; i++) { 452 init_waitqueue_head(&sbq->ws[i].wait); 453 atomic_set(&sbq->ws[i].wait_cnt, sbq->wake_batch); 454 } 455 456 return 0; 457 } 458 EXPORT_SYMBOL_GPL(sbitmap_queue_init_node); 459 460 static void sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq, 461 unsigned int depth) 462 { 463 unsigned int wake_batch = sbq_calc_wake_batch(sbq, depth); 464 int i; 465 466 if (sbq->wake_batch != wake_batch) { 467 WRITE_ONCE(sbq->wake_batch, wake_batch); 468 /* 469 * Pairs with the memory barrier in sbitmap_queue_wake_up() 470 * to ensure that the batch size is updated before the wait 471 * counts. 472 */ 473 smp_mb(); 474 for (i = 0; i < SBQ_WAIT_QUEUES; i++) 475 atomic_set(&sbq->ws[i].wait_cnt, 1); 476 } 477 } 478 479 void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth) 480 { 481 sbitmap_queue_update_wake_batch(sbq, depth); 482 sbitmap_resize(&sbq->sb, depth); 483 } 484 EXPORT_SYMBOL_GPL(sbitmap_queue_resize); 485 486 int __sbitmap_queue_get(struct sbitmap_queue *sbq) 487 { 488 return sbitmap_get(&sbq->sb); 489 } 490 EXPORT_SYMBOL_GPL(__sbitmap_queue_get); 491 492 int __sbitmap_queue_get_shallow(struct sbitmap_queue *sbq, 493 unsigned int shallow_depth) 494 { 495 WARN_ON_ONCE(shallow_depth < sbq->min_shallow_depth); 496 497 return sbitmap_get_shallow(&sbq->sb, shallow_depth); 498 } 499 EXPORT_SYMBOL_GPL(__sbitmap_queue_get_shallow); 500 501 void sbitmap_queue_min_shallow_depth(struct sbitmap_queue *sbq, 502 unsigned int min_shallow_depth) 503 { 504 sbq->min_shallow_depth = min_shallow_depth; 505 sbitmap_queue_update_wake_batch(sbq, sbq->sb.depth); 506 } 507 EXPORT_SYMBOL_GPL(sbitmap_queue_min_shallow_depth); 508 509 static struct sbq_wait_state *sbq_wake_ptr(struct sbitmap_queue *sbq) 510 { 511 int i, wake_index; 512 513 if (!atomic_read(&sbq->ws_active)) 514 return NULL; 515 516 wake_index = atomic_read(&sbq->wake_index); 517 for (i = 0; i < SBQ_WAIT_QUEUES; i++) { 518 struct sbq_wait_state *ws = &sbq->ws[wake_index]; 519 520 if (waitqueue_active(&ws->wait)) { 521 if (wake_index != atomic_read(&sbq->wake_index)) 522 atomic_set(&sbq->wake_index, wake_index); 523 return ws; 524 } 525 526 wake_index = sbq_index_inc(wake_index); 527 } 528 529 return NULL; 530 } 531 532 static bool __sbq_wake_up(struct sbitmap_queue *sbq) 533 { 534 struct sbq_wait_state *ws; 535 unsigned int wake_batch; 536 int wait_cnt; 537 538 ws = sbq_wake_ptr(sbq); 539 if (!ws) 540 return false; 541 542 wait_cnt = atomic_dec_return(&ws->wait_cnt); 543 if (wait_cnt <= 0) { 544 int ret; 545 546 wake_batch = READ_ONCE(sbq->wake_batch); 547 548 /* 549 * Pairs with the memory barrier in sbitmap_queue_resize() to 550 * ensure that we see the batch size update before the wait 551 * count is reset. 552 */ 553 smp_mb__before_atomic(); 554 555 /* 556 * For concurrent callers of this, the one that failed the 557 * atomic_cmpxhcg() race should call this function again 558 * to wakeup a new batch on a different 'ws'. 559 */ 560 ret = atomic_cmpxchg(&ws->wait_cnt, wait_cnt, wake_batch); 561 if (ret == wait_cnt) { 562 sbq_index_atomic_inc(&sbq->wake_index); 563 wake_up_nr(&ws->wait, wake_batch); 564 return false; 565 } 566 567 return true; 568 } 569 570 return false; 571 } 572 573 void sbitmap_queue_wake_up(struct sbitmap_queue *sbq) 574 { 575 while (__sbq_wake_up(sbq)) 576 ; 577 } 578 EXPORT_SYMBOL_GPL(sbitmap_queue_wake_up); 579 580 void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr, 581 unsigned int cpu) 582 { 583 /* 584 * Once the clear bit is set, the bit may be allocated out. 585 * 586 * Orders READ/WRITE on the asssociated instance(such as request 587 * of blk_mq) by this bit for avoiding race with re-allocation, 588 * and its pair is the memory barrier implied in __sbitmap_get_word. 589 * 590 * One invariant is that the clear bit has to be zero when the bit 591 * is in use. 592 */ 593 smp_mb__before_atomic(); 594 sbitmap_deferred_clear_bit(&sbq->sb, nr); 595 596 /* 597 * Pairs with the memory barrier in set_current_state() to ensure the 598 * proper ordering of clear_bit_unlock()/waitqueue_active() in the waker 599 * and test_and_set_bit_lock()/prepare_to_wait()/finish_wait() in the 600 * waiter. See the comment on waitqueue_active(). 601 */ 602 smp_mb__after_atomic(); 603 sbitmap_queue_wake_up(sbq); 604 605 if (likely(!sbq->sb.round_robin && nr < sbq->sb.depth)) 606 *per_cpu_ptr(sbq->sb.alloc_hint, cpu) = nr; 607 } 608 EXPORT_SYMBOL_GPL(sbitmap_queue_clear); 609 610 void sbitmap_queue_wake_all(struct sbitmap_queue *sbq) 611 { 612 int i, wake_index; 613 614 /* 615 * Pairs with the memory barrier in set_current_state() like in 616 * sbitmap_queue_wake_up(). 617 */ 618 smp_mb(); 619 wake_index = atomic_read(&sbq->wake_index); 620 for (i = 0; i < SBQ_WAIT_QUEUES; i++) { 621 struct sbq_wait_state *ws = &sbq->ws[wake_index]; 622 623 if (waitqueue_active(&ws->wait)) 624 wake_up(&ws->wait); 625 626 wake_index = sbq_index_inc(wake_index); 627 } 628 } 629 EXPORT_SYMBOL_GPL(sbitmap_queue_wake_all); 630 631 void sbitmap_queue_show(struct sbitmap_queue *sbq, struct seq_file *m) 632 { 633 bool first; 634 int i; 635 636 sbitmap_show(&sbq->sb, m); 637 638 seq_puts(m, "alloc_hint={"); 639 first = true; 640 for_each_possible_cpu(i) { 641 if (!first) 642 seq_puts(m, ", "); 643 first = false; 644 seq_printf(m, "%u", *per_cpu_ptr(sbq->sb.alloc_hint, i)); 645 } 646 seq_puts(m, "}\n"); 647 648 seq_printf(m, "wake_batch=%u\n", sbq->wake_batch); 649 seq_printf(m, "wake_index=%d\n", atomic_read(&sbq->wake_index)); 650 seq_printf(m, "ws_active=%d\n", atomic_read(&sbq->ws_active)); 651 652 seq_puts(m, "ws={\n"); 653 for (i = 0; i < SBQ_WAIT_QUEUES; i++) { 654 struct sbq_wait_state *ws = &sbq->ws[i]; 655 656 seq_printf(m, "\t{.wait_cnt=%d, .wait=%s},\n", 657 atomic_read(&ws->wait_cnt), 658 waitqueue_active(&ws->wait) ? "active" : "inactive"); 659 } 660 seq_puts(m, "}\n"); 661 662 seq_printf(m, "round_robin=%d\n", sbq->sb.round_robin); 663 seq_printf(m, "min_shallow_depth=%u\n", sbq->min_shallow_depth); 664 } 665 EXPORT_SYMBOL_GPL(sbitmap_queue_show); 666 667 void sbitmap_add_wait_queue(struct sbitmap_queue *sbq, 668 struct sbq_wait_state *ws, 669 struct sbq_wait *sbq_wait) 670 { 671 if (!sbq_wait->sbq) { 672 sbq_wait->sbq = sbq; 673 atomic_inc(&sbq->ws_active); 674 add_wait_queue(&ws->wait, &sbq_wait->wait); 675 } 676 } 677 EXPORT_SYMBOL_GPL(sbitmap_add_wait_queue); 678 679 void sbitmap_del_wait_queue(struct sbq_wait *sbq_wait) 680 { 681 list_del_init(&sbq_wait->wait.entry); 682 if (sbq_wait->sbq) { 683 atomic_dec(&sbq_wait->sbq->ws_active); 684 sbq_wait->sbq = NULL; 685 } 686 } 687 EXPORT_SYMBOL_GPL(sbitmap_del_wait_queue); 688 689 void sbitmap_prepare_to_wait(struct sbitmap_queue *sbq, 690 struct sbq_wait_state *ws, 691 struct sbq_wait *sbq_wait, int state) 692 { 693 if (!sbq_wait->sbq) { 694 atomic_inc(&sbq->ws_active); 695 sbq_wait->sbq = sbq; 696 } 697 prepare_to_wait_exclusive(&ws->wait, &sbq_wait->wait, state); 698 } 699 EXPORT_SYMBOL_GPL(sbitmap_prepare_to_wait); 700 701 void sbitmap_finish_wait(struct sbitmap_queue *sbq, struct sbq_wait_state *ws, 702 struct sbq_wait *sbq_wait) 703 { 704 finish_wait(&ws->wait, &sbq_wait->wait); 705 if (sbq_wait->sbq) { 706 atomic_dec(&sbq->ws_active); 707 sbq_wait->sbq = NULL; 708 } 709 } 710 EXPORT_SYMBOL_GPL(sbitmap_finish_wait); 711