// SPDX-License-Identifier: GPL-2.0
/*
 * Code for working with individual keys, and sorted sets of keys within a
 * btree node
 *
 * Copyright 2012 Google, Inc.
 */

#include "bcachefs.h"
#include "btree_cache.h"
#include "bset.h"
#include "eytzinger.h"
#include "trace.h"
#include "util.h"

#include <asm/unaligned.h>
#include <linux/console.h>
#include <linux/random.h>
#include <linux/prefetch.h>

static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *,
						  struct btree *);

static inline unsigned __btree_node_iter_used(struct btree_node_iter *iter)
{
	unsigned n = ARRAY_SIZE(iter->data);

	while (n && __btree_node_iter_set_end(iter, n - 1))
		--n;

	return n;
}

struct bset_tree *bch2_bkey_to_bset(struct btree *b, struct bkey_packed *k)
{
	return bch2_bkey_to_bset_inlined(b, k);
}

/*
 * There are never duplicate live keys in the btree - but including keys that
 * have been flagged as deleted (and will be cleaned up later) we _will_ see
 * duplicates.
 *
 * Thus the sort order is: usual key comparison first, but for keys that compare
 * equal the deleted key(s) come first, and the (at most one) live version comes
 * last.
 *
 * The main reason for this is insertion: to handle overwrites, we first iterate
 * over keys that compare equal to our insert key, and then insert immediately
 * prior to the first key greater than the key we're inserting - our insert
 * position will be after all keys that compare equal to our insert key, which
 * by the time we actually do the insert will all be deleted.
 */
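
/*
 * For illustration only - a minimal sketch of the ordering described above for
 * unpacked keys (example_key_order() is a hypothetical helper, not part of this
 * file; the iterator code uses bkey_iter_cmp() on packed keys): compare
 * positions first, and for equal positions order deleted keys before the live
 * key:
 */
#if 0
static inline int example_key_order(const struct bkey *l, const struct bkey *r)
{
	/* bpos_cmp() orders by position; for equal positions, deleted sorts first */
	return bpos_cmp(l->p, r->p) ?:
		(int) bkey_deleted(r) - (int) bkey_deleted(l);
}
#endif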

void bch2_dump_bset(struct bch_fs *c, struct btree *b,
		    struct bset *i, unsigned set)
{
	struct bkey_packed *_k, *_n;
	struct bkey uk, n;
	struct bkey_s_c k;
	struct printbuf buf = PRINTBUF;

	if (!i->u64s)
		return;

	for (_k = i->start;
	     _k < vstruct_last(i);
	     _k = _n) {
		_n = bkey_p_next(_k);

		k = bkey_disassemble(b, _k, &uk);

		printbuf_reset(&buf);
		if (c)
			bch2_bkey_val_to_text(&buf, c, k);
		else
			bch2_bkey_to_text(&buf, k.k);
		printk(KERN_ERR "block %u key %5zu: %s\n", set,
		       _k->_data - i->_data, buf.buf);

		if (_n == vstruct_last(i))
			continue;

		n = bkey_unpack_key(b, _n);

		if (bpos_lt(n.p, k.k->p)) {
			printk(KERN_ERR "Key skipped backwards\n");
			continue;
		}

		if (!bkey_deleted(k.k) && bpos_eq(n.p, k.k->p))
			printk(KERN_ERR "Duplicate keys\n");
	}

	printbuf_exit(&buf);
}

void bch2_dump_btree_node(struct bch_fs *c, struct btree *b)
{
	struct bset_tree *t;

	console_lock();
	for_each_bset(b, t)
		bch2_dump_bset(c, b, bset(b, t), t - b->set);
	console_unlock();
}

void bch2_dump_btree_node_iter(struct btree *b,
			       struct btree_node_iter *iter)
{
	struct btree_node_iter_set *set;
	struct printbuf buf = PRINTBUF;

	printk(KERN_ERR "btree node iter with %u/%u sets:\n",
	       __btree_node_iter_used(iter), b->nsets);

	btree_node_iter_for_each(iter, set) {
		struct bkey_packed *k = __btree_node_offset_to_key(b, set->k);
		struct bset_tree *t = bch2_bkey_to_bset(b, k);
		struct bkey uk = bkey_unpack_key(b, k);

		printbuf_reset(&buf);
		bch2_bkey_to_text(&buf, &uk);
		printk(KERN_ERR "set %zu key %u: %s\n",
		       t - b->set, set->k, buf.buf);
	}

	printbuf_exit(&buf);
}

#ifdef CONFIG_BCACHEFS_DEBUG

void __bch2_verify_btree_nr_keys(struct btree *b)
{
	struct bset_tree *t;
	struct bkey_packed *k;
	struct btree_nr_keys nr = { 0 };

	for_each_bset(b, t)
		bset_tree_for_each_key(b, t, k)
			if (!bkey_deleted(k))
				btree_keys_account_key_add(&nr, t - b->set, k);

	BUG_ON(memcmp(&nr, &b->nr, sizeof(nr)));
}

static void bch2_btree_node_iter_next_check(struct btree_node_iter *_iter,
					    struct btree *b)
{
	struct btree_node_iter iter = *_iter;
	const struct bkey_packed *k, *n;

	k = bch2_btree_node_iter_peek_all(&iter, b);
	__bch2_btree_node_iter_advance(&iter, b);
	n = bch2_btree_node_iter_peek_all(&iter, b);

	bkey_unpack_key(b, k);

	if (n &&
	    bkey_iter_cmp(b, k, n) > 0) {
		struct btree_node_iter_set *set;
		struct bkey ku = bkey_unpack_key(b, k);
		struct bkey nu = bkey_unpack_key(b, n);
		struct printbuf buf1 = PRINTBUF;
		struct printbuf buf2 = PRINTBUF;

		bch2_dump_btree_node(NULL, b);
		bch2_bkey_to_text(&buf1, &ku);
		bch2_bkey_to_text(&buf2, &nu);
		printk(KERN_ERR "out of order/overlapping:\n%s\n%s\n",
		       buf1.buf, buf2.buf);
		printk(KERN_ERR "iter was:");

		btree_node_iter_for_each(_iter, set) {
			struct bkey_packed *k2 = __btree_node_offset_to_key(b, set->k);
			struct bset_tree *t = bch2_bkey_to_bset(b, k2);

			printk(" [%zi %zi]", t - b->set,
			       k2->_data - bset(b, t)->_data);
		}
		panic("\n");
	}
}

void bch2_btree_node_iter_verify(struct btree_node_iter *iter,
				 struct btree *b)
{
	struct btree_node_iter_set *set, *s2;
	struct bkey_packed *k, *p;
	struct bset_tree *t;

	if (bch2_btree_node_iter_end(iter))
		return;

	/* Verify no duplicates: */
	btree_node_iter_for_each(iter, set) {
		BUG_ON(set->k > set->end);
		btree_node_iter_for_each(iter, s2)
			BUG_ON(set != s2 && set->end == s2->end);
	}

	/* Verify that set->end is correct: */
	btree_node_iter_for_each(iter, set) {
		for_each_bset(b, t)
			if (set->end == t->end_offset)
				goto found;
		BUG();
found:
		BUG_ON(set->k < btree_bkey_first_offset(t) ||
		       set->k >= t->end_offset);
	}

	/* Verify iterator is sorted: */
	btree_node_iter_for_each(iter, set)
		BUG_ON(set != iter->data &&
		       btree_node_iter_cmp(b, set[-1], set[0]) > 0);

	k = bch2_btree_node_iter_peek_all(iter, b);

	for_each_bset(b, t) {
		if (iter->data[0].end == t->end_offset)
			continue;

		p = bch2_bkey_prev_all(b, t,
			bch2_btree_node_iter_bset_pos(iter, b, t));

		BUG_ON(p && bkey_iter_cmp(b, k, p) < 0);
	}
}

void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where,
			    struct bkey_packed *insert, unsigned clobber_u64s)
{
	struct bset_tree *t = bch2_bkey_to_bset(b, where);
	struct bkey_packed *prev = bch2_bkey_prev_all(b, t, where);
	struct bkey_packed *next = (void *) ((u64 *) where->_data + clobber_u64s);
	struct printbuf buf1 = PRINTBUF;
	struct printbuf buf2 = PRINTBUF;
#if 0
	BUG_ON(prev &&
	       bkey_iter_cmp(b, prev, insert) > 0);
#else
	if (prev &&
	    bkey_iter_cmp(b, prev, insert) > 0) {
		struct bkey k1 = bkey_unpack_key(b, prev);
		struct bkey k2 = bkey_unpack_key(b, insert);

		bch2_dump_btree_node(NULL, b);
		bch2_bkey_to_text(&buf1, &k1);
		bch2_bkey_to_text(&buf2, &k2);

		panic("prev > insert:\n"
		      "prev key %s\n"
		      "insert key %s\n",
		      buf1.buf, buf2.buf);
	}
#endif
#if 0
	BUG_ON(next != btree_bkey_last(b, t) &&
	       bkey_iter_cmp(b, insert, next) > 0);
#else
	if (next != btree_bkey_last(b, t) &&
	    bkey_iter_cmp(b, insert, next) > 0) {
		struct bkey k1 = bkey_unpack_key(b, insert);
		struct bkey k2 = bkey_unpack_key(b, next);

		bch2_dump_btree_node(NULL, b);
		bch2_bkey_to_text(&buf1, &k1);
		bch2_bkey_to_text(&buf2, &k2);

		panic("insert > next:\n"
		      "insert key %s\n"
		      "next key %s\n",
		      buf1.buf, buf2.buf);
	}
#endif
}

#else

static inline void bch2_btree_node_iter_next_check(struct btree_node_iter *iter,
						   struct btree *b) {}

#endif

/* Auxiliary search trees */

#define BFLOAT_FAILED_UNPACKED	U8_MAX
#define BFLOAT_FAILED		U8_MAX

struct bkey_float {
	u8		exponent;
	u8		key_offset;
	u16		mantissa;
};
#define BKEY_MANTISSA_BITS	16
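
/*
 * Each bkey_float caches a 16 bit window (mantissa) of the key it corresponds
 * to, taken starting at bit f->exponent from the start of the packed key (on
 * little endian; big endian measures from the other end), plus that key's
 * offset within its cacheline in units of u64s (key_offset). Comparing
 * mantissas is usually enough to decide which way to recurse; when it isn't,
 * or when the bfloat couldn't be constructed at all (exponent == BFLOAT_FAILED),
 * the lookup falls back to comparing against the full key.
 */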

static unsigned bkey_float_byte_offset(unsigned idx)
{
	return idx * sizeof(struct bkey_float);
}

struct ro_aux_tree {
	u8			nothing[0];
	struct bkey_float	f[];
};

struct rw_aux_tree {
	u16		offset;
	struct bpos	k;
};

static unsigned bset_aux_tree_buf_end(const struct bset_tree *t)
{
	BUG_ON(t->aux_data_offset == U16_MAX);

	switch (bset_aux_tree_type(t)) {
	case BSET_NO_AUX_TREE:
		return t->aux_data_offset;
	case BSET_RO_AUX_TREE:
		return t->aux_data_offset +
			DIV_ROUND_UP(t->size * sizeof(struct bkey_float) +
				     t->size * sizeof(u8), 8);
	case BSET_RW_AUX_TREE:
		return t->aux_data_offset +
			DIV_ROUND_UP(sizeof(struct rw_aux_tree) * t->size, 8);
	default:
		BUG();
	}
}

static unsigned bset_aux_tree_buf_start(const struct btree *b,
					const struct bset_tree *t)
{
	return t == b->set
		? DIV_ROUND_UP(b->unpack_fn_len, 8)
		: bset_aux_tree_buf_end(t - 1);
}

static void *__aux_tree_base(const struct btree *b,
			     const struct bset_tree *t)
{
	return b->aux_data + t->aux_data_offset * 8;
}

static struct ro_aux_tree *ro_aux_tree_base(const struct btree *b,
					    const struct bset_tree *t)
{
	EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE);

	return __aux_tree_base(b, t);
}

static u8 *ro_aux_tree_prev(const struct btree *b,
			    const struct bset_tree *t)
{
	EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE);

	return __aux_tree_base(b, t) + bkey_float_byte_offset(t->size);
}

static struct bkey_float *bkey_float(const struct btree *b,
				     const struct bset_tree *t,
				     unsigned idx)
{
	return ro_aux_tree_base(b, t)->f + idx;
}

static void bset_aux_tree_verify(const struct btree *b)
{
#ifdef CONFIG_BCACHEFS_DEBUG
	const struct bset_tree *t;

	for_each_bset(b, t) {
		if (t->aux_data_offset == U16_MAX)
			continue;

		BUG_ON(t != b->set &&
		       t[-1].aux_data_offset == U16_MAX);

		BUG_ON(t->aux_data_offset < bset_aux_tree_buf_start(b, t));
		BUG_ON(t->aux_data_offset > btree_aux_data_u64s(b));
		BUG_ON(bset_aux_tree_buf_end(t) > btree_aux_data_u64s(b));
	}
#endif
}

void bch2_btree_keys_init(struct btree *b)
{
	unsigned i;

	b->nsets = 0;
	memset(&b->nr, 0, sizeof(b->nr));

	for (i = 0; i < MAX_BSETS; i++)
		b->set[i].data_offset = U16_MAX;

	bch2_bset_set_no_aux_tree(b, b->set);
}

/* Binary tree stuff for auxiliary search trees */

/*
 * Cacheline/offset <-> bkey pointer arithmetic:
 *
 * The auxiliary search tree is a binary search tree laid out in an array; each
 * node corresponds to a key in one cacheline of the bset (BSET_CACHELINE
 * bytes).
 *
 * This means we don't have to store the full index of the key that a node in
 * the binary tree points to; eytzinger1_to_inorder() gives us the cacheline,
 * and then bkey_float->key_offset gives us the offset within that cacheline,
 * in units of 8 bytes.
 *
 * cacheline_to_bkey() and friends abstract out all the pointer arithmetic to
 * make this work.
 *
 * To construct the bfloat for an arbitrary key we need to know what the key
 * immediately preceding it is: we have to check if the two keys differ in the
 * bits we're going to store in bkey_float->mantissa. The prev array
 * (ro_aux_tree_prev()) stores the size of the previous key so we can walk
 * backwards to it from tree_to_bkey(b, t, j).
 */
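
/*
 * For illustration only - a sketch (example_cacheline_round_trip() is
 * hypothetical, and the helpers it calls are defined just below): a key
 * pointer decomposes into a (cacheline, offset) pair and back again, which is
 * exactly the representation the ro aux tree stores:
 */
#if 0
static void example_cacheline_round_trip(const struct btree *b,
					 const struct bset_tree *t,
					 const struct bkey_packed *k)
{
	unsigned cacheline = bkey_to_cacheline(b, t, k);
	ssize_t offset = __bkey_to_cacheline_offset(b, t, cacheline, k);

	/* reconstructing the pointer from (cacheline, offset) gets us back to k */
	BUG_ON(cacheline_to_bkey(b, t, cacheline, offset) != k);
}
#endif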

static inline void *bset_cacheline(const struct btree *b,
				   const struct bset_tree *t,
				   unsigned cacheline)
{
	return (void *) round_down((unsigned long) btree_bkey_first(b, t),
				   L1_CACHE_BYTES) +
		cacheline * BSET_CACHELINE;
}

static struct bkey_packed *cacheline_to_bkey(const struct btree *b,
					     const struct bset_tree *t,
					     unsigned cacheline,
					     unsigned offset)
{
	return bset_cacheline(b, t, cacheline) + offset * 8;
}

static unsigned bkey_to_cacheline(const struct btree *b,
				  const struct bset_tree *t,
				  const struct bkey_packed *k)
{
	return ((void *) k - bset_cacheline(b, t, 0)) / BSET_CACHELINE;
}

static ssize_t __bkey_to_cacheline_offset(const struct btree *b,
					  const struct bset_tree *t,
					  unsigned cacheline,
					  const struct bkey_packed *k)
{
	return (u64 *) k - (u64 *) bset_cacheline(b, t, cacheline);
}

static unsigned bkey_to_cacheline_offset(const struct btree *b,
					 const struct bset_tree *t,
					 unsigned cacheline,
					 const struct bkey_packed *k)
{
	size_t m = __bkey_to_cacheline_offset(b, t, cacheline, k);

	EBUG_ON(m > U8_MAX);
	return m;
}

static inline struct bkey_packed *tree_to_bkey(const struct btree *b,
					       const struct bset_tree *t,
					       unsigned j)
{
	return cacheline_to_bkey(b, t,
			__eytzinger1_to_inorder(j, t->size - 1, t->extra),
			bkey_float(b, t, j)->key_offset);
}

static struct bkey_packed *tree_to_prev_bkey(const struct btree *b,
					     const struct bset_tree *t,
					     unsigned j)
{
	unsigned prev_u64s = ro_aux_tree_prev(b, t)[j];

	return (void *) ((u64 *) tree_to_bkey(b, t, j)->_data - prev_u64s);
}

static struct rw_aux_tree *rw_aux_tree(const struct btree *b,
				       const struct bset_tree *t)
{
	EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE);

	return __aux_tree_base(b, t);
}

/*
 * For the write set - the one we're currently inserting keys into - we don't
 * maintain a full search tree, we just keep a simple lookup table in
 * rw_aux_tree().
 */
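
/*
 * The rw aux tree is kept deliberately simple: entry 0 always points at the
 * first key in the bset, entries are sorted by offset with roughly one entry
 * per cacheline's worth of keys, and each entry caches the key's unpacked
 * position so lookups can binary search on struct bpos without unpacking
 * (see bch2_bset_verify_rw_aux_tree() and __build_rw_aux_tree() below).
 */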
static struct bkey_packed *rw_aux_to_bkey(const struct btree *b,
					  struct bset_tree *t,
					  unsigned j)
{
	return __btree_node_offset_to_key(b, rw_aux_tree(b, t)[j].offset);
}

static void rw_aux_tree_set(const struct btree *b, struct bset_tree *t,
			    unsigned j, struct bkey_packed *k)
{
	EBUG_ON(k >= btree_bkey_last(b, t));

	rw_aux_tree(b, t)[j] = (struct rw_aux_tree) {
		.offset	= __btree_node_key_to_offset(b, k),
		.k	= bkey_unpack_pos(b, k),
	};
}

static void bch2_bset_verify_rw_aux_tree(struct btree *b,
					 struct bset_tree *t)
{
	struct bkey_packed *k = btree_bkey_first(b, t);
	unsigned j = 0;

	if (!bch2_expensive_debug_checks)
		return;

	BUG_ON(bset_has_ro_aux_tree(t));

	if (!bset_has_rw_aux_tree(t))
		return;

	BUG_ON(t->size < 1);
	BUG_ON(rw_aux_to_bkey(b, t, j) != k);

	goto start;
	while (1) {
		if (rw_aux_to_bkey(b, t, j) == k) {
			BUG_ON(!bpos_eq(rw_aux_tree(b, t)[j].k,
					bkey_unpack_pos(b, k)));
start:
			if (++j == t->size)
				break;

			BUG_ON(rw_aux_tree(b, t)[j].offset <=
			       rw_aux_tree(b, t)[j - 1].offset);
		}

		k = bkey_p_next(k);
		BUG_ON(k >= btree_bkey_last(b, t));
	}
}

/* returns idx of first entry >= offset: */
static unsigned rw_aux_tree_bsearch(struct btree *b,
				    struct bset_tree *t,
				    unsigned offset)
{
	unsigned bset_offs = offset - btree_bkey_first_offset(t);
	unsigned bset_u64s = t->end_offset - btree_bkey_first_offset(t);
	unsigned idx = bset_u64s ? bset_offs * t->size / bset_u64s : 0;

	EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE);
	EBUG_ON(!t->size);
	EBUG_ON(idx > t->size);

	while (idx < t->size &&
	       rw_aux_tree(b, t)[idx].offset < offset)
		idx++;

	while (idx &&
	       rw_aux_tree(b, t)[idx - 1].offset >= offset)
		idx--;

	EBUG_ON(idx < t->size &&
		rw_aux_tree(b, t)[idx].offset < offset);
	EBUG_ON(idx && rw_aux_tree(b, t)[idx - 1].offset >= offset);
	EBUG_ON(idx + 1 < t->size &&
		rw_aux_tree(b, t)[idx].offset ==
		rw_aux_tree(b, t)[idx + 1].offset);

	return idx;
}

static inline unsigned bkey_mantissa(const struct bkey_packed *k,
				     const struct bkey_float *f,
				     unsigned idx)
{
	u64 v;

	EBUG_ON(!bkey_packed(k));

	v = get_unaligned((u64 *) (((u8 *) k->_data) + (f->exponent >> 3)));

	/*
	 * In little endian, we're shifting off low bits (and then the bits we
	 * want are at the low end), in big endian we're shifting off high bits
	 * (and then the bits we want are at the high end, so we shift them
	 * back down):
	 */
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
	v >>= f->exponent & 7;
#else
	v >>= 64 - (f->exponent & 7) - BKEY_MANTISSA_BITS;
#endif
	return (u16) v;
}

static __always_inline void make_bfloat(struct btree *b, struct bset_tree *t,
					unsigned j,
					struct bkey_packed *min_key,
					struct bkey_packed *max_key)
{
	struct bkey_float *f = bkey_float(b, t, j);
	struct bkey_packed *m = tree_to_bkey(b, t, j);
	struct bkey_packed *l = is_power_of_2(j)
		? min_key
		: tree_to_prev_bkey(b, t, j >> ffs(j));
	struct bkey_packed *r = is_power_of_2(j + 1)
		? max_key
		: tree_to_bkey(b, t, j >> (ffz(j) + 1));
	unsigned mantissa;
	int shift, exponent, high_bit;

	/*
	 * for failed bfloats, the lookup code falls back to comparing against
	 * the original key.
	 */

	if (!bkey_packed(l) || !bkey_packed(r) || !bkey_packed(m) ||
	    !b->nr_key_bits) {
		f->exponent = BFLOAT_FAILED_UNPACKED;
		return;
	}

	/*
	 * The greatest differing bit of l and r is the first bit we must
	 * include in the bfloat mantissa we're creating in order to do
	 * comparisons - that bit always becomes the high bit of
	 * bfloat->mantissa, and thus the exponent we're calculating here is
	 * the position of what will become the low bit in bfloat->mantissa:
	 *
	 * Note that this may be negative - we may be running off the low end
	 * of the key: we handle this later:
	 */
	high_bit = max(bch2_bkey_greatest_differing_bit(b, l, r),
		       min_t(unsigned, BKEY_MANTISSA_BITS, b->nr_key_bits) - 1);
	exponent = high_bit - (BKEY_MANTISSA_BITS - 1);

	/*
	 * Then we calculate the actual shift value, from the start of the key
	 * (k->_data), to get the key bits starting at exponent:
	 */
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
	shift = (int) (b->format.key_u64s * 64 - b->nr_key_bits) + exponent;

	EBUG_ON(shift + BKEY_MANTISSA_BITS > b->format.key_u64s * 64);
#else
	shift = high_bit_offset +
		b->nr_key_bits -
		exponent -
		BKEY_MANTISSA_BITS;

	EBUG_ON(shift < KEY_PACKED_BITS_START);
#endif
	EBUG_ON(shift < 0 || shift >= BFLOAT_FAILED);

	f->exponent = shift;
	mantissa = bkey_mantissa(m, f, j);

	/*
	 * If we've got garbage bits, set them to all 1s - it's legal for the
	 * bfloat to compare larger than the original key, but not smaller:
	 */
	if (exponent < 0)
		mantissa |= ~(~0U << -exponent);

	f->mantissa = mantissa;
}
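
/*
 * Note on the "compare larger, never smaller" rule above: bset_search_tree()
 * only has to land at or before the key it's looking for, because
 * bch2_bset_search_linear() then walks forward from the returned position. A
 * bfloat that erred in the other direction could make the search overshoot,
 * which the forward scan couldn't recover from.
 */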

/* bytes remaining - only valid for last bset: */
static unsigned __bset_tree_capacity(const struct btree *b, const struct bset_tree *t)
{
	bset_aux_tree_verify(b);

	return btree_aux_data_bytes(b) - t->aux_data_offset * sizeof(u64);
}

static unsigned bset_ro_tree_capacity(const struct btree *b, const struct bset_tree *t)
{
	return __bset_tree_capacity(b, t) /
		(sizeof(struct bkey_float) + sizeof(u8));
}

static unsigned bset_rw_tree_capacity(const struct btree *b, const struct bset_tree *t)
{
	return __bset_tree_capacity(b, t) / sizeof(struct rw_aux_tree);
}

static noinline void __build_rw_aux_tree(struct btree *b, struct bset_tree *t)
{
	struct bkey_packed *k;

	t->size = 1;
	t->extra = BSET_RW_AUX_TREE_VAL;
	rw_aux_tree(b, t)[0].offset =
		__btree_node_key_to_offset(b, btree_bkey_first(b, t));

	bset_tree_for_each_key(b, t, k) {
		if (t->size == bset_rw_tree_capacity(b, t))
			break;

		if ((void *) k - (void *) rw_aux_to_bkey(b, t, t->size - 1) >
		    L1_CACHE_BYTES)
			rw_aux_tree_set(b, t, t->size++, k);
	}
}

static noinline void __build_ro_aux_tree(struct btree *b, struct bset_tree *t)
{
	struct bkey_packed *prev = NULL, *k = btree_bkey_first(b, t);
	struct bkey_i min_key, max_key;
	unsigned j, cacheline = 1;

	t->size = min(bkey_to_cacheline(b, t, btree_bkey_last(b, t)),
		      bset_ro_tree_capacity(b, t));
retry:
	if (t->size < 2) {
		t->size = 0;
		t->extra = BSET_NO_AUX_TREE_VAL;
		return;
	}

	t->extra = (t->size - rounddown_pow_of_two(t->size - 1)) << 1;

	/* First we figure out where the first key in each cacheline is */
	eytzinger1_for_each(j, t->size - 1) {
		while (bkey_to_cacheline(b, t, k) < cacheline)
			prev = k, k = bkey_p_next(k);

		if (k >= btree_bkey_last(b, t)) {
			/* XXX: this path sucks */
			t->size--;
			goto retry;
		}

		ro_aux_tree_prev(b, t)[j] = prev->u64s;
		bkey_float(b, t, j)->key_offset =
			bkey_to_cacheline_offset(b, t, cacheline++, k);

		EBUG_ON(tree_to_prev_bkey(b, t, j) != prev);
		EBUG_ON(tree_to_bkey(b, t, j) != k);
	}

	while (k != btree_bkey_last(b, t))
		prev = k, k = bkey_p_next(k);

	if (!bkey_pack_pos(bkey_to_packed(&min_key), b->data->min_key, b)) {
		bkey_init(&min_key.k);
		min_key.k.p = b->data->min_key;
	}

	if (!bkey_pack_pos(bkey_to_packed(&max_key), b->data->max_key, b)) {
		bkey_init(&max_key.k);
		max_key.k.p = b->data->max_key;
	}

	/* Then we build the tree */
	eytzinger1_for_each(j, t->size - 1)
		make_bfloat(b, t, j,
			    bkey_to_packed(&min_key),
			    bkey_to_packed(&max_key));
}

static void bset_alloc_tree(struct btree *b, struct bset_tree *t)
{
	struct bset_tree *i;

	for (i = b->set; i != t; i++)
		BUG_ON(bset_has_rw_aux_tree(i));

	bch2_bset_set_no_aux_tree(b, t);

	/* round up to next cacheline: */
	t->aux_data_offset = round_up(bset_aux_tree_buf_start(b, t),
				      SMP_CACHE_BYTES / sizeof(u64));

	bset_aux_tree_verify(b);
}

void bch2_bset_build_aux_tree(struct btree *b, struct bset_tree *t,
			      bool writeable)
{
	if (writeable
	    ? bset_has_rw_aux_tree(t)
	    : bset_has_ro_aux_tree(t))
		return;

	bset_alloc_tree(b, t);

	if (!__bset_tree_capacity(b, t))
		return;

	if (writeable)
		__build_rw_aux_tree(b, t);
	else
		__build_ro_aux_tree(b, t);

	bset_aux_tree_verify(b);
}

void bch2_bset_init_first(struct btree *b, struct bset *i)
{
	struct bset_tree *t;

	BUG_ON(b->nsets);

	memset(i, 0, sizeof(*i));
	get_random_bytes(&i->seq, sizeof(i->seq));
	SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);

	t = &b->set[b->nsets++];
	set_btree_bset(b, t, i);
}

void bch2_bset_init_next(struct bch_fs *c, struct btree *b,
			 struct btree_node_entry *bne)
{
	struct bset *i = &bne->keys;
	struct bset_tree *t;

	BUG_ON(bset_byte_offset(b, bne) >= btree_bytes(c));
	BUG_ON((void *) bne < (void *) btree_bkey_last(b, bset_tree_last(b)));
	BUG_ON(b->nsets >= MAX_BSETS);

	memset(i, 0, sizeof(*i));
	i->seq = btree_bset_first(b)->seq;
	SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);

	t = &b->set[b->nsets++];
	set_btree_bset(b, t, i);
}

/*
 * find _some_ key in the same bset as @k that precedes @k - not necessarily the
 * immediate predecessor:
 */
static struct bkey_packed *__bkey_prev(struct btree *b, struct bset_tree *t,
				       struct bkey_packed *k)
{
	struct bkey_packed *p;
	unsigned offset;
	int j;

	EBUG_ON(k < btree_bkey_first(b, t) ||
		k > btree_bkey_last(b, t));

	if (k == btree_bkey_first(b, t))
		return NULL;

	switch (bset_aux_tree_type(t)) {
	case BSET_NO_AUX_TREE:
		p = btree_bkey_first(b, t);
		break;
	case BSET_RO_AUX_TREE:
		j = min_t(unsigned, t->size - 1, bkey_to_cacheline(b, t, k));

		do {
			p = j ? tree_to_bkey(b, t,
					__inorder_to_eytzinger1(j--,
							t->size - 1, t->extra))
			      : btree_bkey_first(b, t);
		} while (p >= k);
		break;
	case BSET_RW_AUX_TREE:
		offset = __btree_node_key_to_offset(b, k);
		j = rw_aux_tree_bsearch(b, t, offset);
		p = j ? rw_aux_to_bkey(b, t, j - 1)
		      : btree_bkey_first(b, t);
		break;
	}

	return p;
}
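
/*
 * bch2_bkey_prev_filter() below builds on __bkey_prev(): jump to some key
 * before @k using whichever aux tree is available, then walk forward,
 * remembering the last key of the wanted type seen before reaching @k; repeat
 * from an earlier starting point if nothing qualified.
 */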
struct bkey_packed *bch2_bkey_prev_filter(struct btree *b,
					  struct bset_tree *t,
					  struct bkey_packed *k,
					  unsigned min_key_type)
{
	struct bkey_packed *p, *i, *ret = NULL, *orig_k = k;

	while ((p = __bkey_prev(b, t, k)) && !ret) {
		for (i = p; i != k; i = bkey_p_next(i))
			if (i->type >= min_key_type)
				ret = i;

		k = p;
	}

	if (bch2_expensive_debug_checks) {
		BUG_ON(ret >= orig_k);

		for (i = ret
			? bkey_p_next(ret)
			: btree_bkey_first(b, t);
		     i != orig_k;
		     i = bkey_p_next(i))
			BUG_ON(i->type >= min_key_type);
	}

	return ret;
}

/* Insert */
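
/*
 * Fix up the rw aux tree after an insert or delete at @_where replaced
 * @clobber_u64s worth of key data with @new_u64s: entries that pointed into
 * the clobbered range are dropped, later entries have their offsets shifted,
 * and if that opens up a gap of more than a cacheline between adjacent
 * entries, a new entry is added to fill it.
 */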
static void bch2_bset_fix_lookup_table(struct btree *b,
				       struct bset_tree *t,
				       struct bkey_packed *_where,
				       unsigned clobber_u64s,
				       unsigned new_u64s)
{
	int shift = new_u64s - clobber_u64s;
	unsigned l, j, where = __btree_node_key_to_offset(b, _where);

	EBUG_ON(bset_has_ro_aux_tree(t));

	if (!bset_has_rw_aux_tree(t))
		return;

	/* returns first entry >= where */
	l = rw_aux_tree_bsearch(b, t, where);

	if (!l) /* never delete first entry */
		l++;
	else if (l < t->size &&
		 where < t->end_offset &&
		 rw_aux_tree(b, t)[l].offset == where)
		rw_aux_tree_set(b, t, l++, _where);

	/* l now > where */

	for (j = l;
	     j < t->size &&
	     rw_aux_tree(b, t)[j].offset < where + clobber_u64s;
	     j++)
		;

	if (j < t->size &&
	    rw_aux_tree(b, t)[j].offset + shift ==
	    rw_aux_tree(b, t)[l - 1].offset)
		j++;

	memmove(&rw_aux_tree(b, t)[l],
		&rw_aux_tree(b, t)[j],
		(void *) &rw_aux_tree(b, t)[t->size] -
		(void *) &rw_aux_tree(b, t)[j]);
	t->size -= j - l;

	for (j = l; j < t->size; j++)
		rw_aux_tree(b, t)[j].offset += shift;

	EBUG_ON(l < t->size &&
		rw_aux_tree(b, t)[l].offset ==
		rw_aux_tree(b, t)[l - 1].offset);

	if (t->size < bset_rw_tree_capacity(b, t) &&
	    (l < t->size
	     ? rw_aux_tree(b, t)[l].offset
	     : t->end_offset) -
	    rw_aux_tree(b, t)[l - 1].offset >
	    L1_CACHE_BYTES / sizeof(u64)) {
		struct bkey_packed *start = rw_aux_to_bkey(b, t, l - 1);
		struct bkey_packed *end = l < t->size
			? rw_aux_to_bkey(b, t, l)
			: btree_bkey_last(b, t);
		struct bkey_packed *k = start;

		while (1) {
			k = bkey_p_next(k);
			if (k == end)
				break;

			if ((void *) k - (void *) start >= L1_CACHE_BYTES) {
				memmove(&rw_aux_tree(b, t)[l + 1],
					&rw_aux_tree(b, t)[l],
					(void *) &rw_aux_tree(b, t)[t->size] -
					(void *) &rw_aux_tree(b, t)[l]);
				t->size++;
				rw_aux_tree_set(b, t, l, k);
				break;
			}
		}
	}

	bch2_bset_verify_rw_aux_tree(b, t);
	bset_aux_tree_verify(b);
}
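
/*
 * Insert @insert into the last bset at @where, overwriting @clobber_u64s worth
 * of existing key data at that position (0 if nothing is being overwritten):
 * the key is packed if the btree node's format allows it, following keys are
 * shifted as needed, and the rw aux tree is fixed up afterwards.
 */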
void bch2_bset_insert(struct btree *b,
		      struct btree_node_iter *iter,
		      struct bkey_packed *where,
		      struct bkey_i *insert,
		      unsigned clobber_u64s)
{
	struct bkey_format *f = &b->format;
	struct bset_tree *t = bset_tree_last(b);
	struct bkey_packed packed, *src = bkey_to_packed(insert);

	bch2_bset_verify_rw_aux_tree(b, t);
	bch2_verify_insert_pos(b, where, bkey_to_packed(insert), clobber_u64s);

	if (bch2_bkey_pack_key(&packed, &insert->k, f))
		src = &packed;

	if (!bkey_deleted(&insert->k))
		btree_keys_account_key_add(&b->nr, t - b->set, src);

	if (src->u64s != clobber_u64s) {
		u64 *src_p = (u64 *) where->_data + clobber_u64s;
		u64 *dst_p = (u64 *) where->_data + src->u64s;

		EBUG_ON((int) le16_to_cpu(bset(b, t)->u64s) <
			(int) clobber_u64s - src->u64s);

		memmove_u64s(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p);
		le16_add_cpu(&bset(b, t)->u64s, src->u64s - clobber_u64s);
		set_btree_bset_end(b, t);
	}

	memcpy_u64s_small(where, src,
			  bkeyp_key_u64s(f, src));
	memcpy_u64s(bkeyp_val(f, where), &insert->v,
		    bkeyp_val_u64s(f, src));

	if (src->u64s != clobber_u64s)
		bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, src->u64s);

	bch2_verify_btree_nr_keys(b);
}

void bch2_bset_delete(struct btree *b,
		      struct bkey_packed *where,
		      unsigned clobber_u64s)
{
	struct bset_tree *t = bset_tree_last(b);
	u64 *src_p = (u64 *) where->_data + clobber_u64s;
	u64 *dst_p = where->_data;

	bch2_bset_verify_rw_aux_tree(b, t);

	EBUG_ON(le16_to_cpu(bset(b, t)->u64s) < clobber_u64s);

	memmove_u64s_down(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p);
	le16_add_cpu(&bset(b, t)->u64s, -clobber_u64s);
	set_btree_bset_end(b, t);

	bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, 0);
}

/* Lookup */

__flatten
static struct bkey_packed *bset_search_write_set(const struct btree *b,
				struct bset_tree *t,
				struct bpos *search)
{
	unsigned l = 0, r = t->size;

	while (l + 1 != r) {
		unsigned m = (l + r) >> 1;

		if (bpos_lt(rw_aux_tree(b, t)[m].k, *search))
			l = m;
		else
			r = m;
	}

	return rw_aux_to_bkey(b, t, l);
}

static inline void prefetch_four_cachelines(void *p)
{
#ifdef CONFIG_X86_64
	asm("prefetcht0 (-127 + 64 * 0)(%0);"
	    "prefetcht0 (-127 + 64 * 1)(%0);"
	    "prefetcht0 (-127 + 64 * 2)(%0);"
	    "prefetcht0 (-127 + 64 * 3)(%0);"
	    :
	    : "r" (p + 127));
#else
	prefetch(p + L1_CACHE_BYTES * 0);
	prefetch(p + L1_CACHE_BYTES * 1);
	prefetch(p + L1_CACHE_BYTES * 2);
	prefetch(p + L1_CACHE_BYTES * 3);
#endif
}

static inline bool bkey_mantissa_bits_dropped(const struct btree *b,
					      const struct bkey_float *f,
					      unsigned idx)
{
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
	unsigned key_bits_start = b->format.key_u64s * 64 - b->nr_key_bits;

	return f->exponent > key_bits_start;
#else
	unsigned key_bits_end = high_bit_offset + b->nr_key_bits;

	return f->exponent + BKEY_MANTISSA_BITS < key_bits_end;
#endif
}

__flatten
static struct bkey_packed *bset_search_tree(const struct btree *b,
				const struct bset_tree *t,
				const struct bpos *search,
				const struct bkey_packed *packed_search)
{
	struct ro_aux_tree *base = ro_aux_tree_base(b, t);
	struct bkey_float *f;
	struct bkey_packed *k;
	unsigned inorder, n = 1, l, r;
	int cmp;

	do {
		if (likely(n << 4 < t->size))
			prefetch(&base->f[n << 4]);

		f = &base->f[n];
		if (unlikely(f->exponent >= BFLOAT_FAILED))
			goto slowpath;

		l = f->mantissa;
		r = bkey_mantissa(packed_search, f, n);

		if (unlikely(l == r) && bkey_mantissa_bits_dropped(b, f, n))
			goto slowpath;

		n = n * 2 + (l < r);
		continue;
slowpath:
		k = tree_to_bkey(b, t, n);
		cmp = bkey_cmp_p_or_unp(b, k, packed_search, search);
		if (!cmp)
			return k;

		n = n * 2 + (cmp < 0);
	} while (n < t->size);

	inorder = __eytzinger1_to_inorder(n >> 1, t->size - 1, t->extra);

	/*
	 * n would have been the node we recursed to - the low bit tells us if
	 * we recursed left or recursed right.
	 */
	if (likely(!(n & 1))) {
		--inorder;
		if (unlikely(!inorder))
			return btree_bkey_first(b, t);

		f = &base->f[eytzinger1_prev(n >> 1, t->size - 1)];
	}

	return cacheline_to_bkey(b, t, inorder, f->key_offset);
}

static __always_inline __flatten
struct bkey_packed *__bch2_bset_search(struct btree *b,
				       struct bset_tree *t,
				       struct bpos *search,
				       const struct bkey_packed *lossy_packed_search)
{
	/*
	 * First, we search for a cacheline, then we do a linear search within
	 * that cacheline.
	 *
	 * To search for the cacheline, there are three different possibilities:
	 * * The set is too small to have a search tree, so we just do a linear
	 *   search over the whole set.
	 * * The set is the one we're currently inserting into; keeping a full
	 *   auxiliary search tree up to date would be too expensive, so we
	 *   use a much simpler lookup table to do a binary search -
	 *   bset_search_write_set().
	 * * Or we use the auxiliary search tree we constructed earlier -
	 *   bset_search_tree()
	 */

	switch (bset_aux_tree_type(t)) {
	case BSET_NO_AUX_TREE:
		return btree_bkey_first(b, t);
	case BSET_RW_AUX_TREE:
		return bset_search_write_set(b, t, search);
	case BSET_RO_AUX_TREE:
		return bset_search_tree(b, t, search, lossy_packed_search);
	default:
		BUG();
	}
}

static __always_inline __flatten
struct bkey_packed *bch2_bset_search_linear(struct btree *b,
				struct bset_tree *t,
				struct bpos *search,
				struct bkey_packed *packed_search,
				const struct bkey_packed *lossy_packed_search,
				struct bkey_packed *m)
{
	if (lossy_packed_search)
		while (m != btree_bkey_last(b, t) &&
		       bkey_iter_cmp_p_or_unp(b, m,
					lossy_packed_search, search) < 0)
			m = bkey_p_next(m);

	if (!packed_search)
		while (m != btree_bkey_last(b, t) &&
		       bkey_iter_pos_cmp(b, m, search) < 0)
			m = bkey_p_next(m);

	if (bch2_expensive_debug_checks) {
		struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m);

		BUG_ON(prev &&
		       bkey_iter_cmp_p_or_unp(b, prev,
					packed_search, search) >= 0);
	}

	return m;
}

/* Btree node iterator */

static inline void __bch2_btree_node_iter_push(struct btree_node_iter *iter,
					       struct btree *b,
					       const struct bkey_packed *k,
					       const struct bkey_packed *end)
{
	if (k != end) {
		struct btree_node_iter_set *pos;

		btree_node_iter_for_each(iter, pos)
			;

		BUG_ON(pos >= iter->data + ARRAY_SIZE(iter->data));
		*pos = (struct btree_node_iter_set) {
			__btree_node_key_to_offset(b, k),
			__btree_node_key_to_offset(b, end)
		};
	}
}

void bch2_btree_node_iter_push(struct btree_node_iter *iter,
			       struct btree *b,
			       const struct bkey_packed *k,
			       const struct bkey_packed *end)
{
	__bch2_btree_node_iter_push(iter, b, k, end);
	bch2_btree_node_iter_sort(iter, b);
}

noinline __flatten __cold
static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter,
					     struct btree *b, struct bpos *search)
{
	struct bkey_packed *k;

	trace_bkey_pack_pos_fail(search);

	bch2_btree_node_iter_init_from_start(iter, b);

	while ((k = bch2_btree_node_iter_peek(iter, b)) &&
	       bkey_iter_pos_cmp(b, k, search) < 0)
		bch2_btree_node_iter_advance(iter, b);
}

/**
 * bch2_btree_node_iter_init - initialize a btree node iterator, starting from a
 * given position
 *
 * @iter:	iterator to initialize
 * @b:		btree node to search
 * @search:	search key
 *
 * Main entry point to the lookup code for individual btree nodes:
 *
 * NOTE:
 *
 * When you don't filter out deleted keys, btree nodes _do_ contain duplicate
 * keys. This doesn't matter for most code, but it does matter for lookups.
 *
 * Consider some adjacent keys containing a run of equal keys:
 *	i j k k k k l m
 *
 * If you search for k, the lookup code isn't guaranteed to return you any
 * specific k. The lookup code is conceptually doing a binary search and
 * iterating backwards is very expensive so if the pivot happens to land at the
 * last k that's what you'll get.
 *
 * This works out ok, but it's something to be aware of:
 *
 *  - For non extents, we guarantee that the live key comes last - see
 *    btree_node_iter_cmp(), keys_out_of_order(). So the duplicates you don't
 *    see will only be deleted keys you don't care about.
 *
 *  - For extents, deleted keys sort last (see the comment at the top of this
 *    file). But when you're searching for extents, you actually want the first
 *    key strictly greater than your search key - an extent that compares equal
 *    to the search key is going to have 0 sectors after the search key.
 *
 *    But this does mean that we can't just search for
 *    bpos_successor(start_of_range) to get the first extent that overlaps with
 *    the range we want - if we're unlucky and there's an extent that ends
 *    exactly where we searched, then there could be a deleted key at the same
 *    position and we'd get that when we search instead of the preceding extent
 *    we needed.
 *
 *    So we've got to search for start_of_range, then after the lookup iterate
 *    past any extents that compare equal to the position we searched for.
 */
__flatten
void bch2_btree_node_iter_init(struct btree_node_iter *iter,
			       struct btree *b, struct bpos *search)
{
	struct bkey_packed p, *packed_search = NULL;
	struct btree_node_iter_set *pos = iter->data;
	struct bkey_packed *k[MAX_BSETS];
	unsigned i;

	EBUG_ON(bpos_lt(*search, b->data->min_key));
	EBUG_ON(bpos_gt(*search, b->data->max_key));
	bset_aux_tree_verify(b);

	memset(iter, 0, sizeof(*iter));

	switch (bch2_bkey_pack_pos_lossy(&p, *search, b)) {
	case BKEY_PACK_POS_EXACT:
		packed_search = &p;
		break;
	case BKEY_PACK_POS_SMALLER:
		packed_search = NULL;
		break;
	case BKEY_PACK_POS_FAIL:
		btree_node_iter_init_pack_failed(iter, b, search);
		return;
	}

	for (i = 0; i < b->nsets; i++) {
		k[i] = __bch2_bset_search(b, b->set + i, search, &p);
		prefetch_four_cachelines(k[i]);
	}

	for (i = 0; i < b->nsets; i++) {
		struct bset_tree *t = b->set + i;
		struct bkey_packed *end = btree_bkey_last(b, t);

		k[i] = bch2_bset_search_linear(b, t, search,
					       packed_search, &p, k[i]);
		if (k[i] != end)
			*pos++ = (struct btree_node_iter_set) {
				__btree_node_key_to_offset(b, k[i]),
				__btree_node_key_to_offset(b, end)
			};
	}

	bch2_btree_node_iter_sort(iter, b);
}
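
/*
 * A sketch, for illustration only, of the pattern the comment above describes
 * for extents (example_first_extent_after() is hypothetical and not part of
 * this file; start_of_range is the name used in the comment): look up
 * start_of_range, then step past any keys whose position compares equal to it,
 * leaving the iterator at the first extent that actually overlaps the range:
 */
#if 0
static struct bkey_packed *
example_first_extent_after(struct btree *b, struct bpos start_of_range,
			   struct btree_node_iter *iter)
{
	struct bkey_packed *k;

	bch2_btree_node_iter_init(iter, b, &start_of_range);

	/* skip keys at exactly start_of_range - they have 0 sectors past it */
	while ((k = bch2_btree_node_iter_peek(iter, b)) &&
	       bkey_iter_pos_cmp(b, k, &start_of_range) <= 0)
		bch2_btree_node_iter_advance(iter, b);

	return k;
}
#endif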

void bch2_btree_node_iter_init_from_start(struct btree_node_iter *iter,
					  struct btree *b)
{
	struct bset_tree *t;

	memset(iter, 0, sizeof(*iter));

	for_each_bset(b, t)
		__bch2_btree_node_iter_push(iter, b,
					    btree_bkey_first(b, t),
					    btree_bkey_last(b, t));
	bch2_btree_node_iter_sort(iter, b);
}

struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *iter,
						  struct btree *b,
						  struct bset_tree *t)
{
	struct btree_node_iter_set *set;

	btree_node_iter_for_each(iter, set)
		if (set->end == t->end_offset)
			return __btree_node_offset_to_key(b, set->k);

	return btree_bkey_last(b, t);
}

static inline bool btree_node_iter_sort_two(struct btree_node_iter *iter,
					    struct btree *b,
					    unsigned first)
{
	bool ret;

	if ((ret = (btree_node_iter_cmp(b,
					iter->data[first],
					iter->data[first + 1]) > 0)))
		swap(iter->data[first], iter->data[first + 1]);
	return ret;
}
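
/*
 * The iterator tracks one position per bset and a btree node has at most
 * MAX_BSETS bsets, so iter->data has at most three entries - which is why the
 * "sort" below can be a fully unrolled bubble sort.
 */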
void bch2_btree_node_iter_sort(struct btree_node_iter *iter,
			       struct btree *b)
{
	/* unrolled bubble sort: */

	if (!__btree_node_iter_set_end(iter, 2)) {
		btree_node_iter_sort_two(iter, b, 0);
		btree_node_iter_sort_two(iter, b, 1);
	}

	if (!__btree_node_iter_set_end(iter, 1))
		btree_node_iter_sort_two(iter, b, 0);
}

void bch2_btree_node_iter_set_drop(struct btree_node_iter *iter,
				   struct btree_node_iter_set *set)
{
	struct btree_node_iter_set *last =
		iter->data + ARRAY_SIZE(iter->data) - 1;

	memmove(&set[0], &set[1], (void *) last - (void *) set);
	*last = (struct btree_node_iter_set) { 0, 0 };
}

static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter,
						  struct btree *b)
{
	iter->data->k += __bch2_btree_node_iter_peek_all(iter, b)->u64s;

	EBUG_ON(iter->data->k > iter->data->end);

	if (unlikely(__btree_node_iter_set_end(iter, 0))) {
		/* avoid an expensive memmove call: */
		iter->data[0] = iter->data[1];
		iter->data[1] = iter->data[2];
		iter->data[2] = (struct btree_node_iter_set) { 0, 0 };
		return;
	}

	if (__btree_node_iter_set_end(iter, 1))
		return;

	if (!btree_node_iter_sort_two(iter, b, 0))
		return;

	if (__btree_node_iter_set_end(iter, 2))
		return;

	btree_node_iter_sort_two(iter, b, 1);
}

void bch2_btree_node_iter_advance(struct btree_node_iter *iter,
				  struct btree *b)
{
	if (bch2_expensive_debug_checks) {
		bch2_btree_node_iter_verify(iter, b);
		bch2_btree_node_iter_next_check(iter, b);
	}

	__bch2_btree_node_iter_advance(iter, b);
}

/*
 * Expensive:
 */
struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter,
						  struct btree *b)
{
	struct bkey_packed *k, *prev = NULL;
	struct btree_node_iter_set *set;
	struct bset_tree *t;
	unsigned end = 0;

	if (bch2_expensive_debug_checks)
		bch2_btree_node_iter_verify(iter, b);

	for_each_bset(b, t) {
		k = bch2_bkey_prev_all(b, t,
			bch2_btree_node_iter_bset_pos(iter, b, t));
		if (k &&
		    (!prev || bkey_iter_cmp(b, k, prev) > 0)) {
			prev = k;
			end = t->end_offset;
		}
	}

	if (!prev)
		return NULL;

	/*
	 * We're manually memmoving instead of just calling sort() to ensure the
	 * prev we picked ends up in slot 0 - sort won't necessarily put it
	 * there because of duplicate deleted keys:
	 */
	btree_node_iter_for_each(iter, set)
		if (set->end == end)
			goto found;

	BUG_ON(set != &iter->data[__btree_node_iter_used(iter)]);
found:
	BUG_ON(set >= iter->data + ARRAY_SIZE(iter->data));

	memmove(&iter->data[1],
		&iter->data[0],
		(void *) set - (void *) &iter->data[0]);

	iter->data[0].k = __btree_node_key_to_offset(b, prev);
	iter->data[0].end = end;

	if (bch2_expensive_debug_checks)
		bch2_btree_node_iter_verify(iter, b);
	return prev;
}

struct bkey_packed *bch2_btree_node_iter_prev(struct btree_node_iter *iter,
					      struct btree *b)
{
	struct bkey_packed *prev;

	do {
		prev = bch2_btree_node_iter_prev_all(iter, b);
	} while (prev && bkey_deleted(prev));

	return prev;
}

struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *iter,
						 struct btree *b,
						 struct bkey *u)
{
	struct bkey_packed *k = bch2_btree_node_iter_peek(iter, b);

	return k ? bkey_disassemble(b, k, u) : bkey_s_c_null;
}

/* Mergesort */

void bch2_btree_keys_stats(const struct btree *b, struct bset_stats *stats)
{
	const struct bset_tree *t;

	for_each_bset(b, t) {
		enum bset_aux_tree_type type = bset_aux_tree_type(t);
		size_t j;

		stats->sets[type].nr++;
		stats->sets[type].bytes += le16_to_cpu(bset(b, t)->u64s) *
			sizeof(u64);

		if (bset_has_ro_aux_tree(t)) {
			stats->floats += t->size - 1;

			for (j = 1; j < t->size; j++)
				stats->failed +=
					bkey_float(b, t, j)->exponent ==
					BFLOAT_FAILED;
		}
	}
}

void bch2_bfloat_to_text(struct printbuf *out, struct btree *b,
			 struct bkey_packed *k)
{
	struct bset_tree *t = bch2_bkey_to_bset(b, k);
	struct bkey uk;
	unsigned j, inorder;

	if (!bset_has_ro_aux_tree(t))
		return;

	inorder = bkey_to_cacheline(b, t, k);
	if (!inorder || inorder >= t->size)
		return;

	j = __inorder_to_eytzinger1(inorder, t->size - 1, t->extra);
	if (k != tree_to_bkey(b, t, j))
		return;

	switch (bkey_float(b, t, j)->exponent) {
	case BFLOAT_FAILED:
		uk = bkey_unpack_key(b, k);
		prt_printf(out,
			   " failed unpacked at depth %u\n"
			   "\t",
			   ilog2(j));
		bch2_bpos_to_text(out, uk.p);
		prt_printf(out, "\n");
		break;
	}
}