1 /* 2 * CDDL HEADER START 3 * 4 * This file and its contents are supplied under the terms of the 5 * Common Development and Distribution License ("CDDL"), version 1.0. 6 * You may only use this file in accordance with the terms of version 7 * 1.0 of the CDDL. 8 * 9 * A full copy of the text of the CDDL should have accompanied this 10 * source. A copy of the CDDL is also available via the Internet at 11 * http://www.illumos.org/license/CDDL. 12 * 13 * CDDL HEADER END 14 */ 15 /* 16 * Copyright (c) 2019 by Delphix. All rights reserved. 17 */ 18 19 #include <sys/btree.h> 20 #include <sys/bitops.h> 21 #include <sys/zfs_context.h> 22 23 kmem_cache_t *zfs_btree_leaf_cache; 24 25 /* 26 * Control the extent of the verification that occurs when zfs_btree_verify is 27 * called. Primarily used for debugging when extending the btree logic and 28 * functionality. As the intensity is increased, new verification steps are 29 * added. These steps are cumulative; intensity = 3 includes the intensity = 1 30 * and intensity = 2 steps as well. 31 * 32 * Intensity 1: Verify that the tree's height is consistent throughout. 33 * Intensity 2: Verify that a core node's children's parent pointers point 34 * to the core node. 35 * Intensity 3: Verify that the total number of elements in the tree matches the 36 * sum of the number of elements in each node. Also verifies that each node's 37 * count obeys the invariants (less than or equal to maximum value, greater than 38 * or equal to half the maximum minus one). 39 * Intensity 4: Verify that each element compares less than the element 40 * immediately after it and greater than the one immediately before it using the 41 * comparator function. For core nodes, also checks that each element is greater 42 * than the last element in the first of the two nodes it separates, and less 43 * than the first element in the second of the two nodes. 44 * Intensity 5: Verifies, if ZFS_DEBUG is defined, that all unused memory inside 45 * of each node is poisoned appropriately. Note that poisoning always occurs if 46 * ZFS_DEBUG is set, so it is safe to set the intensity to 5 during normal 47 * operation. 48 * 49 * Intensity 4 and 5 are particularly expensive to perform; the previous levels 50 * are a few memory operations per node, while these levels require multiple 51 * operations per element. In addition, when creating large btrees, these 52 * operations are called at every step, resulting in extremely slow operation 53 * (while the asymptotic complexity of the other steps is the same, the 54 * importance of the constant factors cannot be denied). 55 */ 56 int zfs_btree_verify_intensity = 0; 57 58 /* 59 * Convenience functions to silence warnings from memcpy/memmove's 60 * return values and change argument order to src, dest. 61 */ 62 static void 63 bcpy(const void *src, void *dest, size_t size) 64 { 65 (void) memcpy(dest, src, size); 66 } 67 68 static void 69 bmov(const void *src, void *dest, size_t size) 70 { 71 (void) memmove(dest, src, size); 72 } 73 74 static boolean_t 75 zfs_btree_is_core(struct zfs_btree_hdr *hdr) 76 { 77 return (hdr->bth_first == -1); 78 } 79 80 #ifdef _ILP32 81 #define BTREE_POISON 0xabadb10c 82 #else 83 #define BTREE_POISON 0xabadb10cdeadbeef 84 #endif 85 86 static void 87 zfs_btree_poison_node(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) 88 { 89 #ifdef ZFS_DEBUG 90 size_t size = tree->bt_elem_size; 91 if (zfs_btree_is_core(hdr)) { 92 zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; 93 for (uint32_t i = hdr->bth_count + 1; i <= BTREE_CORE_ELEMS; 94 i++) { 95 node->btc_children[i] = 96 (zfs_btree_hdr_t *)BTREE_POISON; 97 } 98 (void) memset(node->btc_elems + hdr->bth_count * size, 0x0f, 99 (BTREE_CORE_ELEMS - hdr->bth_count) * size); 100 } else { 101 zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; 102 (void) memset(leaf->btl_elems, 0x0f, hdr->bth_first * size); 103 (void) memset(leaf->btl_elems + 104 (hdr->bth_first + hdr->bth_count) * size, 0x0f, 105 BTREE_LEAF_ESIZE - 106 (hdr->bth_first + hdr->bth_count) * size); 107 } 108 #endif 109 } 110 111 static inline void 112 zfs_btree_poison_node_at(zfs_btree_t *tree, zfs_btree_hdr_t *hdr, 113 uint32_t idx, uint32_t count) 114 { 115 #ifdef ZFS_DEBUG 116 size_t size = tree->bt_elem_size; 117 if (zfs_btree_is_core(hdr)) { 118 ASSERT3U(idx, >=, hdr->bth_count); 119 ASSERT3U(idx, <=, BTREE_CORE_ELEMS); 120 ASSERT3U(idx + count, <=, BTREE_CORE_ELEMS); 121 zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; 122 for (uint32_t i = 1; i <= count; i++) { 123 node->btc_children[idx + i] = 124 (zfs_btree_hdr_t *)BTREE_POISON; 125 } 126 (void) memset(node->btc_elems + idx * size, 0x0f, count * size); 127 } else { 128 ASSERT3U(idx, <=, tree->bt_leaf_cap); 129 ASSERT3U(idx + count, <=, tree->bt_leaf_cap); 130 zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; 131 (void) memset(leaf->btl_elems + 132 (hdr->bth_first + idx) * size, 0x0f, count * size); 133 } 134 #endif 135 } 136 137 static inline void 138 zfs_btree_verify_poison_at(zfs_btree_t *tree, zfs_btree_hdr_t *hdr, 139 uint32_t idx) 140 { 141 #ifdef ZFS_DEBUG 142 size_t size = tree->bt_elem_size; 143 if (zfs_btree_is_core(hdr)) { 144 ASSERT3U(idx, <, BTREE_CORE_ELEMS); 145 zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; 146 zfs_btree_hdr_t *cval = (zfs_btree_hdr_t *)BTREE_POISON; 147 VERIFY3P(node->btc_children[idx + 1], ==, cval); 148 for (size_t i = 0; i < size; i++) 149 VERIFY3U(node->btc_elems[idx * size + i], ==, 0x0f); 150 } else { 151 ASSERT3U(idx, <, tree->bt_leaf_cap); 152 zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; 153 if (idx >= tree->bt_leaf_cap - hdr->bth_first) 154 return; 155 for (size_t i = 0; i < size; i++) { 156 VERIFY3U(leaf->btl_elems[(hdr->bth_first + idx) 157 * size + i], ==, 0x0f); 158 } 159 } 160 #endif 161 } 162 163 void 164 zfs_btree_init(void) 165 { 166 zfs_btree_leaf_cache = kmem_cache_create("zfs_btree_leaf_cache", 167 BTREE_LEAF_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0); 168 } 169 170 void 171 zfs_btree_fini(void) 172 { 173 kmem_cache_destroy(zfs_btree_leaf_cache); 174 } 175 176 void 177 zfs_btree_create(zfs_btree_t *tree, int (*compar) (const void *, const void *), 178 size_t size) 179 { 180 ASSERT3U(size, <=, BTREE_LEAF_ESIZE / 2); 181 182 memset(tree, 0, sizeof (*tree)); 183 tree->bt_compar = compar; 184 tree->bt_elem_size = size; 185 tree->bt_leaf_cap = P2ALIGN(BTREE_LEAF_ESIZE / size, 2); 186 tree->bt_height = -1; 187 tree->bt_bulk = NULL; 188 } 189 190 /* 191 * Find value in the array of elements provided. Uses a simple binary search. 192 */ 193 static void * 194 zfs_btree_find_in_buf(zfs_btree_t *tree, uint8_t *buf, uint32_t nelems, 195 const void *value, zfs_btree_index_t *where) 196 { 197 uint32_t max = nelems; 198 uint32_t min = 0; 199 while (max > min) { 200 uint32_t idx = (min + max) / 2; 201 uint8_t *cur = buf + idx * tree->bt_elem_size; 202 int comp = tree->bt_compar(cur, value); 203 if (comp < 0) { 204 min = idx + 1; 205 } else if (comp > 0) { 206 max = idx; 207 } else { 208 where->bti_offset = idx; 209 where->bti_before = B_FALSE; 210 return (cur); 211 } 212 } 213 214 where->bti_offset = max; 215 where->bti_before = B_TRUE; 216 return (NULL); 217 } 218 219 /* 220 * Find the given value in the tree. where may be passed as null to use as a 221 * membership test or if the btree is being used as a map. 222 */ 223 void * 224 zfs_btree_find(zfs_btree_t *tree, const void *value, zfs_btree_index_t *where) 225 { 226 if (tree->bt_height == -1) { 227 if (where != NULL) { 228 where->bti_node = NULL; 229 where->bti_offset = 0; 230 } 231 ASSERT0(tree->bt_num_elems); 232 return (NULL); 233 } 234 235 /* 236 * If we're in bulk-insert mode, we check the last spot in the tree 237 * and the last leaf in the tree before doing the normal search, 238 * because for most workloads the vast majority of finds in 239 * bulk-insert mode are to insert new elements. 240 */ 241 zfs_btree_index_t idx; 242 size_t size = tree->bt_elem_size; 243 if (tree->bt_bulk != NULL) { 244 zfs_btree_leaf_t *last_leaf = tree->bt_bulk; 245 int comp = tree->bt_compar(last_leaf->btl_elems + 246 (last_leaf->btl_hdr.bth_first + 247 last_leaf->btl_hdr.bth_count - 1) * size, value); 248 if (comp < 0) { 249 /* 250 * If what they're looking for is after the last 251 * element, it's not in the tree. 252 */ 253 if (where != NULL) { 254 where->bti_node = (zfs_btree_hdr_t *)last_leaf; 255 where->bti_offset = 256 last_leaf->btl_hdr.bth_count; 257 where->bti_before = B_TRUE; 258 } 259 return (NULL); 260 } else if (comp == 0) { 261 if (where != NULL) { 262 where->bti_node = (zfs_btree_hdr_t *)last_leaf; 263 where->bti_offset = 264 last_leaf->btl_hdr.bth_count - 1; 265 where->bti_before = B_FALSE; 266 } 267 return (last_leaf->btl_elems + 268 (last_leaf->btl_hdr.bth_first + 269 last_leaf->btl_hdr.bth_count - 1) * size); 270 } 271 if (tree->bt_compar(last_leaf->btl_elems + 272 last_leaf->btl_hdr.bth_first * size, value) <= 0) { 273 /* 274 * If what they're looking for is after the first 275 * element in the last leaf, it's in the last leaf or 276 * it's not in the tree. 277 */ 278 void *d = zfs_btree_find_in_buf(tree, 279 last_leaf->btl_elems + 280 last_leaf->btl_hdr.bth_first * size, 281 last_leaf->btl_hdr.bth_count, value, &idx); 282 283 if (where != NULL) { 284 idx.bti_node = (zfs_btree_hdr_t *)last_leaf; 285 *where = idx; 286 } 287 return (d); 288 } 289 } 290 291 zfs_btree_core_t *node = NULL; 292 uint32_t child = 0; 293 uint64_t depth = 0; 294 295 /* 296 * Iterate down the tree, finding which child the value should be in 297 * by comparing with the separators. 298 */ 299 for (node = (zfs_btree_core_t *)tree->bt_root; depth < tree->bt_height; 300 node = (zfs_btree_core_t *)node->btc_children[child], depth++) { 301 ASSERT3P(node, !=, NULL); 302 void *d = zfs_btree_find_in_buf(tree, node->btc_elems, 303 node->btc_hdr.bth_count, value, &idx); 304 EQUIV(d != NULL, !idx.bti_before); 305 if (d != NULL) { 306 if (where != NULL) { 307 idx.bti_node = (zfs_btree_hdr_t *)node; 308 *where = idx; 309 } 310 return (d); 311 } 312 ASSERT(idx.bti_before); 313 child = idx.bti_offset; 314 } 315 316 /* 317 * The value is in this leaf, or it would be if it were in the 318 * tree. Find its proper location and return it. 319 */ 320 zfs_btree_leaf_t *leaf = (depth == 0 ? 321 (zfs_btree_leaf_t *)tree->bt_root : (zfs_btree_leaf_t *)node); 322 void *d = zfs_btree_find_in_buf(tree, leaf->btl_elems + 323 leaf->btl_hdr.bth_first * size, 324 leaf->btl_hdr.bth_count, value, &idx); 325 326 if (where != NULL) { 327 idx.bti_node = (zfs_btree_hdr_t *)leaf; 328 *where = idx; 329 } 330 331 return (d); 332 } 333 334 /* 335 * To explain the following functions, it is useful to understand the four 336 * kinds of shifts used in btree operation. First, a shift is a movement of 337 * elements within a node. It is used to create gaps for inserting new 338 * elements and children, or cover gaps created when things are removed. A 339 * shift has two fundamental properties, each of which can be one of two 340 * values, making four types of shifts. There is the direction of the shift 341 * (left or right) and the shape of the shift (parallelogram or isoceles 342 * trapezoid (shortened to trapezoid hereafter)). The shape distinction only 343 * applies to shifts of core nodes. 344 * 345 * The names derive from the following imagining of the layout of a node: 346 * 347 * Elements: * * * * * * * ... * * * 348 * Children: * * * * * * * * ... * * * 349 * 350 * This layout follows from the fact that the elements act as separators 351 * between pairs of children, and that children root subtrees "below" the 352 * current node. A left and right shift are fairly self-explanatory; a left 353 * shift moves things to the left, while a right shift moves things to the 354 * right. A parallelogram shift is a shift with the same number of elements 355 * and children being moved, while a trapezoid shift is a shift that moves one 356 * more children than elements. An example follows: 357 * 358 * A parallelogram shift could contain the following: 359 * _______________ 360 * \* * * * \ * * * ... * * * 361 * * \ * * * *\ * * * ... * * * 362 * --------------- 363 * A trapezoid shift could contain the following: 364 * ___________ 365 * * / * * * \ * * * ... * * * 366 * * / * * * *\ * * * ... * * * 367 * --------------- 368 * 369 * Note that a parallelogram shift is always shaped like a "left-leaning" 370 * parallelogram, where the starting index of the children being moved is 371 * always one higher than the starting index of the elements being moved. No 372 * "right-leaning" parallelogram shifts are needed (shifts where the starting 373 * element index and starting child index being moved are the same) to achieve 374 * any btree operations, so we ignore them. 375 */ 376 377 enum bt_shift_shape { 378 BSS_TRAPEZOID, 379 BSS_PARALLELOGRAM 380 }; 381 382 enum bt_shift_direction { 383 BSD_LEFT, 384 BSD_RIGHT 385 }; 386 387 /* 388 * Shift elements and children in the provided core node by off spots. The 389 * first element moved is idx, and count elements are moved. The shape of the 390 * shift is determined by shape. The direction is determined by dir. 391 */ 392 static inline void 393 bt_shift_core(zfs_btree_t *tree, zfs_btree_core_t *node, uint32_t idx, 394 uint32_t count, uint32_t off, enum bt_shift_shape shape, 395 enum bt_shift_direction dir) 396 { 397 size_t size = tree->bt_elem_size; 398 ASSERT(zfs_btree_is_core(&node->btc_hdr)); 399 400 uint8_t *e_start = node->btc_elems + idx * size; 401 uint8_t *e_out = (dir == BSD_LEFT ? e_start - off * size : 402 e_start + off * size); 403 bmov(e_start, e_out, count * size); 404 405 zfs_btree_hdr_t **c_start = node->btc_children + idx + 406 (shape == BSS_TRAPEZOID ? 0 : 1); 407 zfs_btree_hdr_t **c_out = (dir == BSD_LEFT ? c_start - off : 408 c_start + off); 409 uint32_t c_count = count + (shape == BSS_TRAPEZOID ? 1 : 0); 410 bmov(c_start, c_out, c_count * sizeof (*c_start)); 411 } 412 413 /* 414 * Shift elements and children in the provided core node left by one spot. 415 * The first element moved is idx, and count elements are moved. The 416 * shape of the shift is determined by trap; true if the shift is a trapezoid, 417 * false if it is a parallelogram. 418 */ 419 static inline void 420 bt_shift_core_left(zfs_btree_t *tree, zfs_btree_core_t *node, uint32_t idx, 421 uint32_t count, enum bt_shift_shape shape) 422 { 423 bt_shift_core(tree, node, idx, count, 1, shape, BSD_LEFT); 424 } 425 426 /* 427 * Shift elements and children in the provided core node right by one spot. 428 * Starts with elements[idx] and children[idx] and one more child than element. 429 */ 430 static inline void 431 bt_shift_core_right(zfs_btree_t *tree, zfs_btree_core_t *node, uint32_t idx, 432 uint32_t count, enum bt_shift_shape shape) 433 { 434 bt_shift_core(tree, node, idx, count, 1, shape, BSD_RIGHT); 435 } 436 437 /* 438 * Shift elements and children in the provided leaf node by off spots. 439 * The first element moved is idx, and count elements are moved. The direction 440 * is determined by left. 441 */ 442 static inline void 443 bt_shift_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *node, uint32_t idx, 444 uint32_t count, uint32_t off, enum bt_shift_direction dir) 445 { 446 size_t size = tree->bt_elem_size; 447 zfs_btree_hdr_t *hdr = &node->btl_hdr; 448 ASSERT(!zfs_btree_is_core(hdr)); 449 450 if (count == 0) 451 return; 452 uint8_t *start = node->btl_elems + (hdr->bth_first + idx) * size; 453 uint8_t *out = (dir == BSD_LEFT ? start - off * size : 454 start + off * size); 455 bmov(start, out, count * size); 456 } 457 458 /* 459 * Grow leaf for n new elements before idx. 460 */ 461 static void 462 bt_grow_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, uint32_t idx, 463 uint32_t n) 464 { 465 zfs_btree_hdr_t *hdr = &leaf->btl_hdr; 466 ASSERT(!zfs_btree_is_core(hdr)); 467 ASSERT3U(idx, <=, hdr->bth_count); 468 uint32_t capacity = tree->bt_leaf_cap; 469 ASSERT3U(hdr->bth_count + n, <=, capacity); 470 boolean_t cl = (hdr->bth_first >= n); 471 boolean_t cr = (hdr->bth_first + hdr->bth_count + n <= capacity); 472 473 if (cl && (!cr || idx <= hdr->bth_count / 2)) { 474 /* Grow left. */ 475 hdr->bth_first -= n; 476 bt_shift_leaf(tree, leaf, n, idx, n, BSD_LEFT); 477 } else if (cr) { 478 /* Grow right. */ 479 bt_shift_leaf(tree, leaf, idx, hdr->bth_count - idx, n, 480 BSD_RIGHT); 481 } else { 482 /* Grow both ways. */ 483 uint32_t fn = hdr->bth_first - 484 (capacity - (hdr->bth_count + n)) / 2; 485 hdr->bth_first -= fn; 486 bt_shift_leaf(tree, leaf, fn, idx, fn, BSD_LEFT); 487 bt_shift_leaf(tree, leaf, fn + idx, hdr->bth_count - idx, 488 n - fn, BSD_RIGHT); 489 } 490 hdr->bth_count += n; 491 } 492 493 /* 494 * Shrink leaf for count elements starting from idx. 495 */ 496 static void 497 bt_shrink_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, uint32_t idx, 498 uint32_t n) 499 { 500 zfs_btree_hdr_t *hdr = &leaf->btl_hdr; 501 ASSERT(!zfs_btree_is_core(hdr)); 502 ASSERT3U(idx, <=, hdr->bth_count); 503 ASSERT3U(idx + n, <=, hdr->bth_count); 504 505 if (idx <= (hdr->bth_count - n) / 2) { 506 bt_shift_leaf(tree, leaf, 0, idx, n, BSD_RIGHT); 507 zfs_btree_poison_node_at(tree, hdr, 0, n); 508 hdr->bth_first += n; 509 } else { 510 bt_shift_leaf(tree, leaf, idx + n, hdr->bth_count - idx - n, n, 511 BSD_LEFT); 512 zfs_btree_poison_node_at(tree, hdr, hdr->bth_count - n, n); 513 } 514 hdr->bth_count -= n; 515 } 516 517 /* 518 * Move children and elements from one core node to another. The shape 519 * parameter behaves the same as it does in the shift logic. 520 */ 521 static inline void 522 bt_transfer_core(zfs_btree_t *tree, zfs_btree_core_t *source, uint32_t sidx, 523 uint32_t count, zfs_btree_core_t *dest, uint32_t didx, 524 enum bt_shift_shape shape) 525 { 526 size_t size = tree->bt_elem_size; 527 ASSERT(zfs_btree_is_core(&source->btc_hdr)); 528 ASSERT(zfs_btree_is_core(&dest->btc_hdr)); 529 530 bcpy(source->btc_elems + sidx * size, dest->btc_elems + didx * size, 531 count * size); 532 533 uint32_t c_count = count + (shape == BSS_TRAPEZOID ? 1 : 0); 534 bcpy(source->btc_children + sidx + (shape == BSS_TRAPEZOID ? 0 : 1), 535 dest->btc_children + didx + (shape == BSS_TRAPEZOID ? 0 : 1), 536 c_count * sizeof (*source->btc_children)); 537 } 538 539 static inline void 540 bt_transfer_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *source, uint32_t sidx, 541 uint32_t count, zfs_btree_leaf_t *dest, uint32_t didx) 542 { 543 size_t size = tree->bt_elem_size; 544 ASSERT(!zfs_btree_is_core(&source->btl_hdr)); 545 ASSERT(!zfs_btree_is_core(&dest->btl_hdr)); 546 547 bcpy(source->btl_elems + (source->btl_hdr.bth_first + sidx) * size, 548 dest->btl_elems + (dest->btl_hdr.bth_first + didx) * size, 549 count * size); 550 } 551 552 /* 553 * Find the first element in the subtree rooted at hdr, return its value and 554 * put its location in where if non-null. 555 */ 556 static void * 557 zfs_btree_first_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr, 558 zfs_btree_index_t *where) 559 { 560 zfs_btree_hdr_t *node; 561 562 for (node = hdr; zfs_btree_is_core(node); 563 node = ((zfs_btree_core_t *)node)->btc_children[0]) 564 ; 565 566 ASSERT(!zfs_btree_is_core(node)); 567 zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)node; 568 if (where != NULL) { 569 where->bti_node = node; 570 where->bti_offset = 0; 571 where->bti_before = B_FALSE; 572 } 573 return (&leaf->btl_elems[node->bth_first * tree->bt_elem_size]); 574 } 575 576 /* Insert an element and a child into a core node at the given offset. */ 577 static void 578 zfs_btree_insert_core_impl(zfs_btree_t *tree, zfs_btree_core_t *parent, 579 uint32_t offset, zfs_btree_hdr_t *new_node, void *buf) 580 { 581 size_t size = tree->bt_elem_size; 582 zfs_btree_hdr_t *par_hdr = &parent->btc_hdr; 583 ASSERT3P(par_hdr, ==, new_node->bth_parent); 584 ASSERT3U(par_hdr->bth_count, <, BTREE_CORE_ELEMS); 585 586 if (zfs_btree_verify_intensity >= 5) { 587 zfs_btree_verify_poison_at(tree, par_hdr, 588 par_hdr->bth_count); 589 } 590 /* Shift existing elements and children */ 591 uint32_t count = par_hdr->bth_count - offset; 592 bt_shift_core_right(tree, parent, offset, count, 593 BSS_PARALLELOGRAM); 594 595 /* Insert new values */ 596 parent->btc_children[offset + 1] = new_node; 597 bcpy(buf, parent->btc_elems + offset * size, size); 598 par_hdr->bth_count++; 599 } 600 601 /* 602 * Insert new_node into the parent of old_node directly after old_node, with 603 * buf as the dividing element between the two. 604 */ 605 static void 606 zfs_btree_insert_into_parent(zfs_btree_t *tree, zfs_btree_hdr_t *old_node, 607 zfs_btree_hdr_t *new_node, void *buf) 608 { 609 ASSERT3P(old_node->bth_parent, ==, new_node->bth_parent); 610 size_t size = tree->bt_elem_size; 611 zfs_btree_core_t *parent = old_node->bth_parent; 612 613 /* 614 * If this is the root node we were splitting, we create a new root 615 * and increase the height of the tree. 616 */ 617 if (parent == NULL) { 618 ASSERT3P(old_node, ==, tree->bt_root); 619 tree->bt_num_nodes++; 620 zfs_btree_core_t *new_root = 621 kmem_alloc(sizeof (zfs_btree_core_t) + BTREE_CORE_ELEMS * 622 size, KM_SLEEP); 623 zfs_btree_hdr_t *new_root_hdr = &new_root->btc_hdr; 624 new_root_hdr->bth_parent = NULL; 625 new_root_hdr->bth_first = -1; 626 new_root_hdr->bth_count = 1; 627 628 old_node->bth_parent = new_node->bth_parent = new_root; 629 new_root->btc_children[0] = old_node; 630 new_root->btc_children[1] = new_node; 631 bcpy(buf, new_root->btc_elems, size); 632 633 tree->bt_height++; 634 tree->bt_root = new_root_hdr; 635 zfs_btree_poison_node(tree, new_root_hdr); 636 return; 637 } 638 639 /* 640 * Since we have the new separator, binary search for where to put 641 * new_node. 642 */ 643 zfs_btree_hdr_t *par_hdr = &parent->btc_hdr; 644 zfs_btree_index_t idx; 645 ASSERT(zfs_btree_is_core(par_hdr)); 646 VERIFY3P(zfs_btree_find_in_buf(tree, parent->btc_elems, 647 par_hdr->bth_count, buf, &idx), ==, NULL); 648 ASSERT(idx.bti_before); 649 uint32_t offset = idx.bti_offset; 650 ASSERT3U(offset, <=, par_hdr->bth_count); 651 ASSERT3P(parent->btc_children[offset], ==, old_node); 652 653 /* 654 * If the parent isn't full, shift things to accommodate our insertions 655 * and return. 656 */ 657 if (par_hdr->bth_count != BTREE_CORE_ELEMS) { 658 zfs_btree_insert_core_impl(tree, parent, offset, new_node, buf); 659 return; 660 } 661 662 /* 663 * We need to split this core node into two. Currently there are 664 * BTREE_CORE_ELEMS + 1 child nodes, and we are adding one for 665 * BTREE_CORE_ELEMS + 2. Some of the children will be part of the 666 * current node, and the others will be moved to the new core node. 667 * There are BTREE_CORE_ELEMS + 1 elements including the new one. One 668 * will be used as the new separator in our parent, and the others 669 * will be split among the two core nodes. 670 * 671 * Usually we will split the node in half evenly, with 672 * BTREE_CORE_ELEMS/2 elements in each node. If we're bulk loading, we 673 * instead move only about a quarter of the elements (and children) to 674 * the new node. Since the average state after a long time is a 3/4 675 * full node, shortcutting directly to that state improves efficiency. 676 * 677 * We do this in two stages: first we split into two nodes, and then we 678 * reuse our existing logic to insert the new element and child. 679 */ 680 uint32_t move_count = MAX((BTREE_CORE_ELEMS / (tree->bt_bulk == NULL ? 681 2 : 4)) - 1, 2); 682 uint32_t keep_count = BTREE_CORE_ELEMS - move_count - 1; 683 ASSERT3U(BTREE_CORE_ELEMS - move_count, >=, 2); 684 tree->bt_num_nodes++; 685 zfs_btree_core_t *new_parent = kmem_alloc(sizeof (zfs_btree_core_t) + 686 BTREE_CORE_ELEMS * size, KM_SLEEP); 687 zfs_btree_hdr_t *new_par_hdr = &new_parent->btc_hdr; 688 new_par_hdr->bth_parent = par_hdr->bth_parent; 689 new_par_hdr->bth_first = -1; 690 new_par_hdr->bth_count = move_count; 691 zfs_btree_poison_node(tree, new_par_hdr); 692 693 par_hdr->bth_count = keep_count; 694 695 bt_transfer_core(tree, parent, keep_count + 1, move_count, new_parent, 696 0, BSS_TRAPEZOID); 697 698 /* Store the new separator in a buffer. */ 699 uint8_t *tmp_buf = kmem_alloc(size, KM_SLEEP); 700 bcpy(parent->btc_elems + keep_count * size, tmp_buf, 701 size); 702 zfs_btree_poison_node(tree, par_hdr); 703 704 if (offset < keep_count) { 705 /* Insert the new node into the left half */ 706 zfs_btree_insert_core_impl(tree, parent, offset, new_node, 707 buf); 708 709 /* 710 * Move the new separator to the existing buffer. 711 */ 712 bcpy(tmp_buf, buf, size); 713 } else if (offset > keep_count) { 714 /* Insert the new node into the right half */ 715 new_node->bth_parent = new_parent; 716 zfs_btree_insert_core_impl(tree, new_parent, 717 offset - keep_count - 1, new_node, buf); 718 719 /* 720 * Move the new separator to the existing buffer. 721 */ 722 bcpy(tmp_buf, buf, size); 723 } else { 724 /* 725 * Move the new separator into the right half, and replace it 726 * with buf. We also need to shift back the elements in the 727 * right half to accommodate new_node. 728 */ 729 bt_shift_core_right(tree, new_parent, 0, move_count, 730 BSS_TRAPEZOID); 731 new_parent->btc_children[0] = new_node; 732 bcpy(tmp_buf, new_parent->btc_elems, size); 733 new_par_hdr->bth_count++; 734 } 735 kmem_free(tmp_buf, size); 736 zfs_btree_poison_node(tree, par_hdr); 737 738 for (uint32_t i = 0; i <= new_parent->btc_hdr.bth_count; i++) 739 new_parent->btc_children[i]->bth_parent = new_parent; 740 741 for (uint32_t i = 0; i <= parent->btc_hdr.bth_count; i++) 742 ASSERT3P(parent->btc_children[i]->bth_parent, ==, parent); 743 744 /* 745 * Now that the node is split, we need to insert the new node into its 746 * parent. This may cause further splitting. 747 */ 748 zfs_btree_insert_into_parent(tree, &parent->btc_hdr, 749 &new_parent->btc_hdr, buf); 750 } 751 752 /* Insert an element into a leaf node at the given offset. */ 753 static void 754 zfs_btree_insert_leaf_impl(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, 755 uint32_t idx, const void *value) 756 { 757 size_t size = tree->bt_elem_size; 758 zfs_btree_hdr_t *hdr = &leaf->btl_hdr; 759 ASSERT3U(leaf->btl_hdr.bth_count, <, tree->bt_leaf_cap); 760 761 if (zfs_btree_verify_intensity >= 5) { 762 zfs_btree_verify_poison_at(tree, &leaf->btl_hdr, 763 leaf->btl_hdr.bth_count); 764 } 765 766 bt_grow_leaf(tree, leaf, idx, 1); 767 uint8_t *start = leaf->btl_elems + (hdr->bth_first + idx) * size; 768 bcpy(value, start, size); 769 } 770 771 static void 772 zfs_btree_verify_order_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr); 773 774 /* Helper function for inserting a new value into leaf at the given index. */ 775 static void 776 zfs_btree_insert_into_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, 777 const void *value, uint32_t idx) 778 { 779 size_t size = tree->bt_elem_size; 780 uint32_t capacity = tree->bt_leaf_cap; 781 782 /* 783 * If the leaf isn't full, shift the elements after idx and insert 784 * value. 785 */ 786 if (leaf->btl_hdr.bth_count != capacity) { 787 zfs_btree_insert_leaf_impl(tree, leaf, idx, value); 788 return; 789 } 790 791 /* 792 * Otherwise, we split the leaf node into two nodes. If we're not bulk 793 * inserting, each is of size (capacity / 2). If we are bulk 794 * inserting, we move a quarter of the elements to the new node so 795 * inserts into the old node don't cause immediate splitting but the 796 * tree stays relatively dense. Since the average state after a long 797 * time is a 3/4 full node, shortcutting directly to that state 798 * improves efficiency. At the end of the bulk insertion process 799 * we'll need to go through and fix up any nodes (the last leaf and 800 * its ancestors, potentially) that are below the minimum. 801 * 802 * In either case, we're left with one extra element. The leftover 803 * element will become the new dividing element between the two nodes. 804 */ 805 uint32_t move_count = MAX(capacity / (tree->bt_bulk ? 4 : 2), 1) - 1; 806 uint32_t keep_count = capacity - move_count - 1; 807 ASSERT3U(keep_count, >=, 1); 808 /* If we insert on left. move one more to keep leaves balanced. */ 809 if (idx < keep_count) { 810 keep_count--; 811 move_count++; 812 } 813 tree->bt_num_nodes++; 814 zfs_btree_leaf_t *new_leaf = kmem_cache_alloc(zfs_btree_leaf_cache, 815 KM_SLEEP); 816 zfs_btree_hdr_t *new_hdr = &new_leaf->btl_hdr; 817 new_hdr->bth_parent = leaf->btl_hdr.bth_parent; 818 new_hdr->bth_first = (tree->bt_bulk ? 0 : capacity / 4) + 819 (idx >= keep_count && idx <= keep_count + move_count / 2); 820 new_hdr->bth_count = move_count; 821 zfs_btree_poison_node(tree, new_hdr); 822 823 if (tree->bt_bulk != NULL && leaf == tree->bt_bulk) 824 tree->bt_bulk = new_leaf; 825 826 /* Copy the back part to the new leaf. */ 827 bt_transfer_leaf(tree, leaf, keep_count + 1, move_count, new_leaf, 0); 828 829 /* We store the new separator in a buffer we control for simplicity. */ 830 uint8_t *buf = kmem_alloc(size, KM_SLEEP); 831 bcpy(leaf->btl_elems + (leaf->btl_hdr.bth_first + keep_count) * size, 832 buf, size); 833 834 bt_shrink_leaf(tree, leaf, keep_count, 1 + move_count); 835 836 if (idx < keep_count) { 837 /* Insert into the existing leaf. */ 838 zfs_btree_insert_leaf_impl(tree, leaf, idx, value); 839 } else if (idx > keep_count) { 840 /* Insert into the new leaf. */ 841 zfs_btree_insert_leaf_impl(tree, new_leaf, idx - keep_count - 842 1, value); 843 } else { 844 /* 845 * Insert planned separator into the new leaf, and use 846 * the new value as the new separator. 847 */ 848 zfs_btree_insert_leaf_impl(tree, new_leaf, 0, buf); 849 bcpy(value, buf, size); 850 } 851 852 /* 853 * Now that the node is split, we need to insert the new node into its 854 * parent. This may cause further splitting, bur only of core nodes. 855 */ 856 zfs_btree_insert_into_parent(tree, &leaf->btl_hdr, &new_leaf->btl_hdr, 857 buf); 858 kmem_free(buf, size); 859 } 860 861 static uint32_t 862 zfs_btree_find_parent_idx(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) 863 { 864 void *buf; 865 if (zfs_btree_is_core(hdr)) { 866 buf = ((zfs_btree_core_t *)hdr)->btc_elems; 867 } else { 868 buf = ((zfs_btree_leaf_t *)hdr)->btl_elems + 869 hdr->bth_first * tree->bt_elem_size; 870 } 871 zfs_btree_index_t idx; 872 zfs_btree_core_t *parent = hdr->bth_parent; 873 VERIFY3P(zfs_btree_find_in_buf(tree, parent->btc_elems, 874 parent->btc_hdr.bth_count, buf, &idx), ==, NULL); 875 ASSERT(idx.bti_before); 876 ASSERT3U(idx.bti_offset, <=, parent->btc_hdr.bth_count); 877 ASSERT3P(parent->btc_children[idx.bti_offset], ==, hdr); 878 return (idx.bti_offset); 879 } 880 881 /* 882 * Take the b-tree out of bulk insert mode. During bulk-insert mode, some 883 * nodes may violate the invariant that non-root nodes must be at least half 884 * full. All nodes violating this invariant should be the last node in their 885 * particular level. To correct the invariant, we take values from their left 886 * neighbor until they are half full. They must have a left neighbor at their 887 * level because the last node at a level is not the first node unless it's 888 * the root. 889 */ 890 static void 891 zfs_btree_bulk_finish(zfs_btree_t *tree) 892 { 893 ASSERT3P(tree->bt_bulk, !=, NULL); 894 ASSERT3P(tree->bt_root, !=, NULL); 895 zfs_btree_leaf_t *leaf = tree->bt_bulk; 896 zfs_btree_hdr_t *hdr = &leaf->btl_hdr; 897 zfs_btree_core_t *parent = hdr->bth_parent; 898 size_t size = tree->bt_elem_size; 899 uint32_t capacity = tree->bt_leaf_cap; 900 901 /* 902 * The invariant doesn't apply to the root node, if that's the only 903 * node in the tree we're done. 904 */ 905 if (parent == NULL) { 906 tree->bt_bulk = NULL; 907 return; 908 } 909 910 /* First, take elements to rebalance the leaf node. */ 911 if (hdr->bth_count < capacity / 2) { 912 /* 913 * First, find the left neighbor. The simplest way to do this 914 * is to call zfs_btree_prev twice; the first time finds some 915 * ancestor of this node, and the second time finds the left 916 * neighbor. The ancestor found is the lowest common ancestor 917 * of leaf and the neighbor. 918 */ 919 zfs_btree_index_t idx = { 920 .bti_node = hdr, 921 .bti_offset = 0 922 }; 923 VERIFY3P(zfs_btree_prev(tree, &idx, &idx), !=, NULL); 924 ASSERT(zfs_btree_is_core(idx.bti_node)); 925 zfs_btree_core_t *common = (zfs_btree_core_t *)idx.bti_node; 926 uint32_t common_idx = idx.bti_offset; 927 928 VERIFY3P(zfs_btree_prev(tree, &idx, &idx), !=, NULL); 929 ASSERT(!zfs_btree_is_core(idx.bti_node)); 930 zfs_btree_leaf_t *l_neighbor = (zfs_btree_leaf_t *)idx.bti_node; 931 zfs_btree_hdr_t *l_hdr = idx.bti_node; 932 uint32_t move_count = (capacity / 2) - hdr->bth_count; 933 ASSERT3U(l_neighbor->btl_hdr.bth_count - move_count, >=, 934 capacity / 2); 935 936 if (zfs_btree_verify_intensity >= 5) { 937 for (uint32_t i = 0; i < move_count; i++) { 938 zfs_btree_verify_poison_at(tree, hdr, 939 leaf->btl_hdr.bth_count + i); 940 } 941 } 942 943 /* First, shift elements in leaf back. */ 944 bt_grow_leaf(tree, leaf, 0, move_count); 945 946 /* Next, move the separator from the common ancestor to leaf. */ 947 uint8_t *separator = common->btc_elems + common_idx * size; 948 uint8_t *out = leaf->btl_elems + 949 (hdr->bth_first + move_count - 1) * size; 950 bcpy(separator, out, size); 951 952 /* 953 * Now we move elements from the tail of the left neighbor to 954 * fill the remaining spots in leaf. 955 */ 956 bt_transfer_leaf(tree, l_neighbor, l_hdr->bth_count - 957 (move_count - 1), move_count - 1, leaf, 0); 958 959 /* 960 * Finally, move the new last element in the left neighbor to 961 * the separator. 962 */ 963 bcpy(l_neighbor->btl_elems + (l_hdr->bth_first + 964 l_hdr->bth_count - move_count) * size, separator, size); 965 966 /* Adjust the node's counts, and we're done. */ 967 bt_shrink_leaf(tree, l_neighbor, l_hdr->bth_count - move_count, 968 move_count); 969 970 ASSERT3U(l_hdr->bth_count, >=, capacity / 2); 971 ASSERT3U(hdr->bth_count, >=, capacity / 2); 972 } 973 974 /* 975 * Now we have to rebalance any ancestors of leaf that may also 976 * violate the invariant. 977 */ 978 capacity = BTREE_CORE_ELEMS; 979 while (parent->btc_hdr.bth_parent != NULL) { 980 zfs_btree_core_t *cur = parent; 981 zfs_btree_hdr_t *hdr = &cur->btc_hdr; 982 parent = hdr->bth_parent; 983 /* 984 * If the invariant isn't violated, move on to the next 985 * ancestor. 986 */ 987 if (hdr->bth_count >= capacity / 2) 988 continue; 989 990 /* 991 * Because the smallest number of nodes we can move when 992 * splitting is 2, we never need to worry about not having a 993 * left sibling (a sibling is a neighbor with the same parent). 994 */ 995 uint32_t parent_idx = zfs_btree_find_parent_idx(tree, hdr); 996 ASSERT3U(parent_idx, >, 0); 997 zfs_btree_core_t *l_neighbor = 998 (zfs_btree_core_t *)parent->btc_children[parent_idx - 1]; 999 uint32_t move_count = (capacity / 2) - hdr->bth_count; 1000 ASSERT3U(l_neighbor->btc_hdr.bth_count - move_count, >=, 1001 capacity / 2); 1002 1003 if (zfs_btree_verify_intensity >= 5) { 1004 for (uint32_t i = 0; i < move_count; i++) { 1005 zfs_btree_verify_poison_at(tree, hdr, 1006 hdr->bth_count + i); 1007 } 1008 } 1009 /* First, shift things in the right node back. */ 1010 bt_shift_core(tree, cur, 0, hdr->bth_count, move_count, 1011 BSS_TRAPEZOID, BSD_RIGHT); 1012 1013 /* Next, move the separator to the right node. */ 1014 uint8_t *separator = parent->btc_elems + ((parent_idx - 1) * 1015 size); 1016 uint8_t *e_out = cur->btc_elems + ((move_count - 1) * size); 1017 bcpy(separator, e_out, size); 1018 1019 /* 1020 * Now, move elements and children from the left node to the 1021 * right. We move one more child than elements. 1022 */ 1023 move_count--; 1024 uint32_t move_idx = l_neighbor->btc_hdr.bth_count - move_count; 1025 bt_transfer_core(tree, l_neighbor, move_idx, move_count, cur, 0, 1026 BSS_TRAPEZOID); 1027 1028 /* 1029 * Finally, move the last element in the left node to the 1030 * separator's position. 1031 */ 1032 move_idx--; 1033 bcpy(l_neighbor->btc_elems + move_idx * size, separator, size); 1034 1035 l_neighbor->btc_hdr.bth_count -= move_count + 1; 1036 hdr->bth_count += move_count + 1; 1037 1038 ASSERT3U(l_neighbor->btc_hdr.bth_count, >=, capacity / 2); 1039 ASSERT3U(hdr->bth_count, >=, capacity / 2); 1040 1041 zfs_btree_poison_node(tree, &l_neighbor->btc_hdr); 1042 1043 for (uint32_t i = 0; i <= hdr->bth_count; i++) 1044 cur->btc_children[i]->bth_parent = cur; 1045 } 1046 1047 tree->bt_bulk = NULL; 1048 zfs_btree_verify(tree); 1049 } 1050 1051 /* 1052 * Insert value into tree at the location specified by where. 1053 */ 1054 void 1055 zfs_btree_add_idx(zfs_btree_t *tree, const void *value, 1056 const zfs_btree_index_t *where) 1057 { 1058 zfs_btree_index_t idx = {0}; 1059 1060 /* If we're not inserting in the last leaf, end bulk insert mode. */ 1061 if (tree->bt_bulk != NULL) { 1062 if (where->bti_node != &tree->bt_bulk->btl_hdr) { 1063 zfs_btree_bulk_finish(tree); 1064 VERIFY3P(zfs_btree_find(tree, value, &idx), ==, NULL); 1065 where = &idx; 1066 } 1067 } 1068 1069 tree->bt_num_elems++; 1070 /* 1071 * If this is the first element in the tree, create a leaf root node 1072 * and add the value to it. 1073 */ 1074 if (where->bti_node == NULL) { 1075 ASSERT3U(tree->bt_num_elems, ==, 1); 1076 ASSERT3S(tree->bt_height, ==, -1); 1077 ASSERT3P(tree->bt_root, ==, NULL); 1078 ASSERT0(where->bti_offset); 1079 1080 tree->bt_num_nodes++; 1081 zfs_btree_leaf_t *leaf = kmem_cache_alloc(zfs_btree_leaf_cache, 1082 KM_SLEEP); 1083 tree->bt_root = &leaf->btl_hdr; 1084 tree->bt_height++; 1085 1086 zfs_btree_hdr_t *hdr = &leaf->btl_hdr; 1087 hdr->bth_parent = NULL; 1088 hdr->bth_first = 0; 1089 hdr->bth_count = 0; 1090 zfs_btree_poison_node(tree, hdr); 1091 1092 zfs_btree_insert_into_leaf(tree, leaf, value, 0); 1093 tree->bt_bulk = leaf; 1094 } else if (!zfs_btree_is_core(where->bti_node)) { 1095 /* 1096 * If we're inserting into a leaf, go directly to the helper 1097 * function. 1098 */ 1099 zfs_btree_insert_into_leaf(tree, 1100 (zfs_btree_leaf_t *)where->bti_node, value, 1101 where->bti_offset); 1102 } else { 1103 /* 1104 * If we're inserting into a core node, we can't just shift 1105 * the existing element in that slot in the same node without 1106 * breaking our ordering invariants. Instead we place the new 1107 * value in the node at that spot and then insert the old 1108 * separator into the first slot in the subtree to the right. 1109 */ 1110 zfs_btree_core_t *node = (zfs_btree_core_t *)where->bti_node; 1111 1112 /* 1113 * We can ignore bti_before, because either way the value 1114 * should end up in bti_offset. 1115 */ 1116 uint32_t off = where->bti_offset; 1117 zfs_btree_hdr_t *subtree = node->btc_children[off + 1]; 1118 size_t size = tree->bt_elem_size; 1119 uint8_t *buf = kmem_alloc(size, KM_SLEEP); 1120 bcpy(node->btc_elems + off * size, buf, size); 1121 bcpy(value, node->btc_elems + off * size, size); 1122 1123 /* 1124 * Find the first slot in the subtree to the right, insert 1125 * there. 1126 */ 1127 zfs_btree_index_t new_idx; 1128 VERIFY3P(zfs_btree_first_helper(tree, subtree, &new_idx), !=, 1129 NULL); 1130 ASSERT0(new_idx.bti_offset); 1131 ASSERT(!zfs_btree_is_core(new_idx.bti_node)); 1132 zfs_btree_insert_into_leaf(tree, 1133 (zfs_btree_leaf_t *)new_idx.bti_node, buf, 0); 1134 kmem_free(buf, size); 1135 } 1136 zfs_btree_verify(tree); 1137 } 1138 1139 /* 1140 * Return the first element in the tree, and put its location in where if 1141 * non-null. 1142 */ 1143 void * 1144 zfs_btree_first(zfs_btree_t *tree, zfs_btree_index_t *where) 1145 { 1146 if (tree->bt_height == -1) { 1147 ASSERT0(tree->bt_num_elems); 1148 return (NULL); 1149 } 1150 return (zfs_btree_first_helper(tree, tree->bt_root, where)); 1151 } 1152 1153 /* 1154 * Find the last element in the subtree rooted at hdr, return its value and 1155 * put its location in where if non-null. 1156 */ 1157 static void * 1158 zfs_btree_last_helper(zfs_btree_t *btree, zfs_btree_hdr_t *hdr, 1159 zfs_btree_index_t *where) 1160 { 1161 zfs_btree_hdr_t *node; 1162 1163 for (node = hdr; zfs_btree_is_core(node); node = 1164 ((zfs_btree_core_t *)node)->btc_children[node->bth_count]) 1165 ; 1166 1167 zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)node; 1168 if (where != NULL) { 1169 where->bti_node = node; 1170 where->bti_offset = node->bth_count - 1; 1171 where->bti_before = B_FALSE; 1172 } 1173 return (leaf->btl_elems + (node->bth_first + node->bth_count - 1) * 1174 btree->bt_elem_size); 1175 } 1176 1177 /* 1178 * Return the last element in the tree, and put its location in where if 1179 * non-null. 1180 */ 1181 void * 1182 zfs_btree_last(zfs_btree_t *tree, zfs_btree_index_t *where) 1183 { 1184 if (tree->bt_height == -1) { 1185 ASSERT0(tree->bt_num_elems); 1186 return (NULL); 1187 } 1188 return (zfs_btree_last_helper(tree, tree->bt_root, where)); 1189 } 1190 1191 /* 1192 * This function contains the logic to find the next node in the tree. A 1193 * helper function is used because there are multiple internal consumemrs of 1194 * this logic. The done_func is used by zfs_btree_destroy_nodes to clean up each 1195 * node after we've finished with it. 1196 */ 1197 static void * 1198 zfs_btree_next_helper(zfs_btree_t *tree, const zfs_btree_index_t *idx, 1199 zfs_btree_index_t *out_idx, 1200 void (*done_func)(zfs_btree_t *, zfs_btree_hdr_t *)) 1201 { 1202 if (idx->bti_node == NULL) { 1203 ASSERT3S(tree->bt_height, ==, -1); 1204 return (NULL); 1205 } 1206 1207 uint32_t offset = idx->bti_offset; 1208 if (!zfs_btree_is_core(idx->bti_node)) { 1209 /* 1210 * When finding the next element of an element in a leaf, 1211 * there are two cases. If the element isn't the last one in 1212 * the leaf, in which case we just return the next element in 1213 * the leaf. Otherwise, we need to traverse up our parents 1214 * until we find one where our ancestor isn't the last child 1215 * of its parent. Once we do, the next element is the 1216 * separator after our ancestor in its parent. 1217 */ 1218 zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node; 1219 uint32_t new_off = offset + (idx->bti_before ? 0 : 1); 1220 if (leaf->btl_hdr.bth_count > new_off) { 1221 out_idx->bti_node = &leaf->btl_hdr; 1222 out_idx->bti_offset = new_off; 1223 out_idx->bti_before = B_FALSE; 1224 return (leaf->btl_elems + (leaf->btl_hdr.bth_first + 1225 new_off) * tree->bt_elem_size); 1226 } 1227 1228 zfs_btree_hdr_t *prev = &leaf->btl_hdr; 1229 for (zfs_btree_core_t *node = leaf->btl_hdr.bth_parent; 1230 node != NULL; node = node->btc_hdr.bth_parent) { 1231 zfs_btree_hdr_t *hdr = &node->btc_hdr; 1232 ASSERT(zfs_btree_is_core(hdr)); 1233 uint32_t i = zfs_btree_find_parent_idx(tree, prev); 1234 if (done_func != NULL) 1235 done_func(tree, prev); 1236 if (i == hdr->bth_count) { 1237 prev = hdr; 1238 continue; 1239 } 1240 out_idx->bti_node = hdr; 1241 out_idx->bti_offset = i; 1242 out_idx->bti_before = B_FALSE; 1243 return (node->btc_elems + i * tree->bt_elem_size); 1244 } 1245 if (done_func != NULL) 1246 done_func(tree, prev); 1247 /* 1248 * We've traversed all the way up and been at the end of the 1249 * node every time, so this was the last element in the tree. 1250 */ 1251 return (NULL); 1252 } 1253 1254 /* If we were before an element in a core node, return that element. */ 1255 ASSERT(zfs_btree_is_core(idx->bti_node)); 1256 zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node; 1257 if (idx->bti_before) { 1258 out_idx->bti_before = B_FALSE; 1259 return (node->btc_elems + offset * tree->bt_elem_size); 1260 } 1261 1262 /* 1263 * The next element from one in a core node is the first element in 1264 * the subtree just to the right of the separator. 1265 */ 1266 zfs_btree_hdr_t *child = node->btc_children[offset + 1]; 1267 return (zfs_btree_first_helper(tree, child, out_idx)); 1268 } 1269 1270 /* 1271 * Return the next valued node in the tree. The same address can be safely 1272 * passed for idx and out_idx. 1273 */ 1274 void * 1275 zfs_btree_next(zfs_btree_t *tree, const zfs_btree_index_t *idx, 1276 zfs_btree_index_t *out_idx) 1277 { 1278 return (zfs_btree_next_helper(tree, idx, out_idx, NULL)); 1279 } 1280 1281 /* 1282 * Return the previous valued node in the tree. The same value can be safely 1283 * passed for idx and out_idx. 1284 */ 1285 void * 1286 zfs_btree_prev(zfs_btree_t *tree, const zfs_btree_index_t *idx, 1287 zfs_btree_index_t *out_idx) 1288 { 1289 if (idx->bti_node == NULL) { 1290 ASSERT3S(tree->bt_height, ==, -1); 1291 return (NULL); 1292 } 1293 1294 uint32_t offset = idx->bti_offset; 1295 if (!zfs_btree_is_core(idx->bti_node)) { 1296 /* 1297 * When finding the previous element of an element in a leaf, 1298 * there are two cases. If the element isn't the first one in 1299 * the leaf, in which case we just return the previous element 1300 * in the leaf. Otherwise, we need to traverse up our parents 1301 * until we find one where our previous ancestor isn't the 1302 * first child. Once we do, the previous element is the 1303 * separator after our previous ancestor. 1304 */ 1305 zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node; 1306 if (offset != 0) { 1307 out_idx->bti_node = &leaf->btl_hdr; 1308 out_idx->bti_offset = offset - 1; 1309 out_idx->bti_before = B_FALSE; 1310 return (leaf->btl_elems + (leaf->btl_hdr.bth_first + 1311 offset - 1) * tree->bt_elem_size); 1312 } 1313 zfs_btree_hdr_t *prev = &leaf->btl_hdr; 1314 for (zfs_btree_core_t *node = leaf->btl_hdr.bth_parent; 1315 node != NULL; node = node->btc_hdr.bth_parent) { 1316 zfs_btree_hdr_t *hdr = &node->btc_hdr; 1317 ASSERT(zfs_btree_is_core(hdr)); 1318 uint32_t i = zfs_btree_find_parent_idx(tree, prev); 1319 if (i == 0) { 1320 prev = hdr; 1321 continue; 1322 } 1323 out_idx->bti_node = hdr; 1324 out_idx->bti_offset = i - 1; 1325 out_idx->bti_before = B_FALSE; 1326 return (node->btc_elems + (i - 1) * tree->bt_elem_size); 1327 } 1328 /* 1329 * We've traversed all the way up and been at the start of the 1330 * node every time, so this was the first node in the tree. 1331 */ 1332 return (NULL); 1333 } 1334 1335 /* 1336 * The previous element from one in a core node is the last element in 1337 * the subtree just to the left of the separator. 1338 */ 1339 ASSERT(zfs_btree_is_core(idx->bti_node)); 1340 zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node; 1341 zfs_btree_hdr_t *child = node->btc_children[offset]; 1342 return (zfs_btree_last_helper(tree, child, out_idx)); 1343 } 1344 1345 /* 1346 * Get the value at the provided index in the tree. 1347 * 1348 * Note that the value returned from this function can be mutated, but only 1349 * if it will not change the ordering of the element with respect to any other 1350 * elements that could be in the tree. 1351 */ 1352 void * 1353 zfs_btree_get(zfs_btree_t *tree, zfs_btree_index_t *idx) 1354 { 1355 ASSERT(!idx->bti_before); 1356 size_t size = tree->bt_elem_size; 1357 if (!zfs_btree_is_core(idx->bti_node)) { 1358 zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node; 1359 return (leaf->btl_elems + (leaf->btl_hdr.bth_first + 1360 idx->bti_offset) * size); 1361 } 1362 zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node; 1363 return (node->btc_elems + idx->bti_offset * size); 1364 } 1365 1366 /* Add the given value to the tree. Must not already be in the tree. */ 1367 void 1368 zfs_btree_add(zfs_btree_t *tree, const void *node) 1369 { 1370 zfs_btree_index_t where = {0}; 1371 VERIFY3P(zfs_btree_find(tree, node, &where), ==, NULL); 1372 zfs_btree_add_idx(tree, node, &where); 1373 } 1374 1375 /* Helper function to free a tree node. */ 1376 static void 1377 zfs_btree_node_destroy(zfs_btree_t *tree, zfs_btree_hdr_t *node) 1378 { 1379 tree->bt_num_nodes--; 1380 if (!zfs_btree_is_core(node)) { 1381 kmem_cache_free(zfs_btree_leaf_cache, node); 1382 } else { 1383 kmem_free(node, sizeof (zfs_btree_core_t) + 1384 BTREE_CORE_ELEMS * tree->bt_elem_size); 1385 } 1386 } 1387 1388 /* 1389 * Remove the rm_hdr and the separator to its left from the parent node. The 1390 * buffer that rm_hdr was stored in may already be freed, so its contents 1391 * cannot be accessed. 1392 */ 1393 static void 1394 zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node, 1395 zfs_btree_hdr_t *rm_hdr) 1396 { 1397 size_t size = tree->bt_elem_size; 1398 uint32_t min_count = (BTREE_CORE_ELEMS / 2) - 1; 1399 zfs_btree_hdr_t *hdr = &node->btc_hdr; 1400 /* 1401 * If the node is the root node and rm_hdr is one of two children, 1402 * promote the other child to the root. 1403 */ 1404 if (hdr->bth_parent == NULL && hdr->bth_count <= 1) { 1405 ASSERT3U(hdr->bth_count, ==, 1); 1406 ASSERT3P(tree->bt_root, ==, node); 1407 ASSERT3P(node->btc_children[1], ==, rm_hdr); 1408 tree->bt_root = node->btc_children[0]; 1409 node->btc_children[0]->bth_parent = NULL; 1410 zfs_btree_node_destroy(tree, hdr); 1411 tree->bt_height--; 1412 return; 1413 } 1414 1415 uint32_t idx; 1416 for (idx = 0; idx <= hdr->bth_count; idx++) { 1417 if (node->btc_children[idx] == rm_hdr) 1418 break; 1419 } 1420 ASSERT3U(idx, <=, hdr->bth_count); 1421 1422 /* 1423 * If the node is the root or it has more than the minimum number of 1424 * children, just remove the child and separator, and return. 1425 */ 1426 if (hdr->bth_parent == NULL || 1427 hdr->bth_count > min_count) { 1428 /* 1429 * Shift the element and children to the right of rm_hdr to 1430 * the left by one spot. 1431 */ 1432 bt_shift_core_left(tree, node, idx, hdr->bth_count - idx, 1433 BSS_PARALLELOGRAM); 1434 hdr->bth_count--; 1435 zfs_btree_poison_node_at(tree, hdr, hdr->bth_count, 1); 1436 return; 1437 } 1438 1439 ASSERT3U(hdr->bth_count, ==, min_count); 1440 1441 /* 1442 * Now we try to take a node from a neighbor. We check left, then 1443 * right. If the neighbor exists and has more than the minimum number 1444 * of elements, we move the separator between us and them to our 1445 * node, move their closest element (last for left, first for right) 1446 * to the separator, and move their closest child to our node. Along 1447 * the way we need to collapse the gap made by idx, and (for our right 1448 * neighbor) the gap made by removing their first element and child. 1449 * 1450 * Note: this logic currently doesn't support taking from a neighbor 1451 * that isn't a sibling (i.e. a neighbor with a different 1452 * parent). This isn't critical functionality, but may be worth 1453 * implementing in the future for completeness' sake. 1454 */ 1455 zfs_btree_core_t *parent = hdr->bth_parent; 1456 uint32_t parent_idx = zfs_btree_find_parent_idx(tree, hdr); 1457 1458 zfs_btree_hdr_t *l_hdr = (parent_idx == 0 ? NULL : 1459 parent->btc_children[parent_idx - 1]); 1460 if (l_hdr != NULL && l_hdr->bth_count > min_count) { 1461 /* We can take a node from the left neighbor. */ 1462 ASSERT(zfs_btree_is_core(l_hdr)); 1463 zfs_btree_core_t *neighbor = (zfs_btree_core_t *)l_hdr; 1464 1465 /* 1466 * Start by shifting the elements and children in the current 1467 * node to the right by one spot. 1468 */ 1469 bt_shift_core_right(tree, node, 0, idx - 1, BSS_TRAPEZOID); 1470 1471 /* 1472 * Move the separator between node and neighbor to the first 1473 * element slot in the current node. 1474 */ 1475 uint8_t *separator = parent->btc_elems + (parent_idx - 1) * 1476 size; 1477 bcpy(separator, node->btc_elems, size); 1478 1479 /* Move the last child of neighbor to our first child slot. */ 1480 node->btc_children[0] = 1481 neighbor->btc_children[l_hdr->bth_count]; 1482 node->btc_children[0]->bth_parent = node; 1483 1484 /* Move the last element of neighbor to the separator spot. */ 1485 uint8_t *take_elem = neighbor->btc_elems + 1486 (l_hdr->bth_count - 1) * size; 1487 bcpy(take_elem, separator, size); 1488 l_hdr->bth_count--; 1489 zfs_btree_poison_node_at(tree, l_hdr, l_hdr->bth_count, 1); 1490 return; 1491 } 1492 1493 zfs_btree_hdr_t *r_hdr = (parent_idx == parent->btc_hdr.bth_count ? 1494 NULL : parent->btc_children[parent_idx + 1]); 1495 if (r_hdr != NULL && r_hdr->bth_count > min_count) { 1496 /* We can take a node from the right neighbor. */ 1497 ASSERT(zfs_btree_is_core(r_hdr)); 1498 zfs_btree_core_t *neighbor = (zfs_btree_core_t *)r_hdr; 1499 1500 /* 1501 * Shift elements in node left by one spot to overwrite rm_hdr 1502 * and the separator before it. 1503 */ 1504 bt_shift_core_left(tree, node, idx, hdr->bth_count - idx, 1505 BSS_PARALLELOGRAM); 1506 1507 /* 1508 * Move the separator between node and neighbor to the last 1509 * element spot in node. 1510 */ 1511 uint8_t *separator = parent->btc_elems + parent_idx * size; 1512 bcpy(separator, node->btc_elems + (hdr->bth_count - 1) * size, 1513 size); 1514 1515 /* 1516 * Move the first child of neighbor to the last child spot in 1517 * node. 1518 */ 1519 node->btc_children[hdr->bth_count] = neighbor->btc_children[0]; 1520 node->btc_children[hdr->bth_count]->bth_parent = node; 1521 1522 /* Move the first element of neighbor to the separator spot. */ 1523 uint8_t *take_elem = neighbor->btc_elems; 1524 bcpy(take_elem, separator, size); 1525 r_hdr->bth_count--; 1526 1527 /* 1528 * Shift the elements and children of neighbor to cover the 1529 * stolen elements. 1530 */ 1531 bt_shift_core_left(tree, neighbor, 1, r_hdr->bth_count, 1532 BSS_TRAPEZOID); 1533 zfs_btree_poison_node_at(tree, r_hdr, r_hdr->bth_count, 1); 1534 return; 1535 } 1536 1537 /* 1538 * In this case, neither of our neighbors can spare an element, so we 1539 * need to merge with one of them. We prefer the left one, 1540 * arbitrarily. Move the separator into the leftmost merging node 1541 * (which may be us or the left neighbor), and then move the right 1542 * merging node's elements. Once that's done, we go back and delete 1543 * the element we're removing. Finally, go into the parent and delete 1544 * the right merging node and the separator. This may cause further 1545 * merging. 1546 */ 1547 zfs_btree_hdr_t *new_rm_hdr, *keep_hdr; 1548 uint32_t new_idx = idx; 1549 if (l_hdr != NULL) { 1550 keep_hdr = l_hdr; 1551 new_rm_hdr = hdr; 1552 new_idx += keep_hdr->bth_count + 1; 1553 } else { 1554 ASSERT3P(r_hdr, !=, NULL); 1555 keep_hdr = hdr; 1556 new_rm_hdr = r_hdr; 1557 parent_idx++; 1558 } 1559 1560 ASSERT(zfs_btree_is_core(keep_hdr)); 1561 ASSERT(zfs_btree_is_core(new_rm_hdr)); 1562 1563 zfs_btree_core_t *keep = (zfs_btree_core_t *)keep_hdr; 1564 zfs_btree_core_t *rm = (zfs_btree_core_t *)new_rm_hdr; 1565 1566 if (zfs_btree_verify_intensity >= 5) { 1567 for (uint32_t i = 0; i < new_rm_hdr->bth_count + 1; i++) { 1568 zfs_btree_verify_poison_at(tree, keep_hdr, 1569 keep_hdr->bth_count + i); 1570 } 1571 } 1572 1573 /* Move the separator into the left node. */ 1574 uint8_t *e_out = keep->btc_elems + keep_hdr->bth_count * size; 1575 uint8_t *separator = parent->btc_elems + (parent_idx - 1) * 1576 size; 1577 bcpy(separator, e_out, size); 1578 keep_hdr->bth_count++; 1579 1580 /* Move all our elements and children into the left node. */ 1581 bt_transfer_core(tree, rm, 0, new_rm_hdr->bth_count, keep, 1582 keep_hdr->bth_count, BSS_TRAPEZOID); 1583 1584 uint32_t old_count = keep_hdr->bth_count; 1585 1586 /* Update bookkeeping */ 1587 keep_hdr->bth_count += new_rm_hdr->bth_count; 1588 ASSERT3U(keep_hdr->bth_count, ==, (min_count * 2) + 1); 1589 1590 /* 1591 * Shift the element and children to the right of rm_hdr to 1592 * the left by one spot. 1593 */ 1594 ASSERT3P(keep->btc_children[new_idx], ==, rm_hdr); 1595 bt_shift_core_left(tree, keep, new_idx, keep_hdr->bth_count - new_idx, 1596 BSS_PARALLELOGRAM); 1597 keep_hdr->bth_count--; 1598 1599 /* Reparent all our children to point to the left node. */ 1600 zfs_btree_hdr_t **new_start = keep->btc_children + 1601 old_count - 1; 1602 for (uint32_t i = 0; i < new_rm_hdr->bth_count + 1; i++) 1603 new_start[i]->bth_parent = keep; 1604 for (uint32_t i = 0; i <= keep_hdr->bth_count; i++) { 1605 ASSERT3P(keep->btc_children[i]->bth_parent, ==, keep); 1606 ASSERT3P(keep->btc_children[i], !=, rm_hdr); 1607 } 1608 zfs_btree_poison_node_at(tree, keep_hdr, keep_hdr->bth_count, 1); 1609 1610 new_rm_hdr->bth_count = 0; 1611 zfs_btree_node_destroy(tree, new_rm_hdr); 1612 zfs_btree_remove_from_node(tree, parent, new_rm_hdr); 1613 } 1614 1615 /* Remove the element at the specific location. */ 1616 void 1617 zfs_btree_remove_idx(zfs_btree_t *tree, zfs_btree_index_t *where) 1618 { 1619 size_t size = tree->bt_elem_size; 1620 zfs_btree_hdr_t *hdr = where->bti_node; 1621 uint32_t idx = where->bti_offset; 1622 1623 ASSERT(!where->bti_before); 1624 if (tree->bt_bulk != NULL) { 1625 /* 1626 * Leave bulk insert mode. Note that our index would be 1627 * invalid after we correct the tree, so we copy the value 1628 * we're planning to remove and find it again after 1629 * bulk_finish. 1630 */ 1631 uint8_t *value = zfs_btree_get(tree, where); 1632 uint8_t *tmp = kmem_alloc(size, KM_SLEEP); 1633 bcpy(value, tmp, size); 1634 zfs_btree_bulk_finish(tree); 1635 VERIFY3P(zfs_btree_find(tree, tmp, where), !=, NULL); 1636 kmem_free(tmp, size); 1637 hdr = where->bti_node; 1638 idx = where->bti_offset; 1639 } 1640 1641 tree->bt_num_elems--; 1642 /* 1643 * If the element happens to be in a core node, we move a leaf node's 1644 * element into its place and then remove the leaf node element. This 1645 * makes the rebalance logic not need to be recursive both upwards and 1646 * downwards. 1647 */ 1648 if (zfs_btree_is_core(hdr)) { 1649 zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; 1650 zfs_btree_hdr_t *left_subtree = node->btc_children[idx]; 1651 void *new_value = zfs_btree_last_helper(tree, left_subtree, 1652 where); 1653 ASSERT3P(new_value, !=, NULL); 1654 1655 bcpy(new_value, node->btc_elems + idx * size, size); 1656 1657 hdr = where->bti_node; 1658 idx = where->bti_offset; 1659 ASSERT(!where->bti_before); 1660 } 1661 1662 /* 1663 * First, we'll update the leaf's metadata. Then, we shift any 1664 * elements after the idx to the left. After that, we rebalance if 1665 * needed. 1666 */ 1667 ASSERT(!zfs_btree_is_core(hdr)); 1668 zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; 1669 ASSERT3U(hdr->bth_count, >, 0); 1670 1671 uint32_t min_count = (tree->bt_leaf_cap / 2) - 1; 1672 1673 /* 1674 * If we're over the minimum size or this is the root, just overwrite 1675 * the value and return. 1676 */ 1677 if (hdr->bth_count > min_count || hdr->bth_parent == NULL) { 1678 bt_shrink_leaf(tree, leaf, idx, 1); 1679 if (hdr->bth_parent == NULL) { 1680 ASSERT0(tree->bt_height); 1681 if (hdr->bth_count == 0) { 1682 tree->bt_root = NULL; 1683 tree->bt_height--; 1684 zfs_btree_node_destroy(tree, &leaf->btl_hdr); 1685 } 1686 } 1687 zfs_btree_verify(tree); 1688 return; 1689 } 1690 ASSERT3U(hdr->bth_count, ==, min_count); 1691 1692 /* 1693 * Now we try to take a node from a sibling. We check left, then 1694 * right. If they exist and have more than the minimum number of 1695 * elements, we move the separator between us and them to our node 1696 * and move their closest element (last for left, first for right) to 1697 * the separator. Along the way we need to collapse the gap made by 1698 * idx, and (for our right neighbor) the gap made by removing their 1699 * first element. 1700 * 1701 * Note: this logic currently doesn't support taking from a neighbor 1702 * that isn't a sibling. This isn't critical functionality, but may be 1703 * worth implementing in the future for completeness' sake. 1704 */ 1705 zfs_btree_core_t *parent = hdr->bth_parent; 1706 uint32_t parent_idx = zfs_btree_find_parent_idx(tree, hdr); 1707 1708 zfs_btree_hdr_t *l_hdr = (parent_idx == 0 ? NULL : 1709 parent->btc_children[parent_idx - 1]); 1710 if (l_hdr != NULL && l_hdr->bth_count > min_count) { 1711 /* We can take a node from the left neighbor. */ 1712 ASSERT(!zfs_btree_is_core(l_hdr)); 1713 zfs_btree_leaf_t *neighbor = (zfs_btree_leaf_t *)l_hdr; 1714 1715 /* 1716 * Move our elements back by one spot to make room for the 1717 * stolen element and overwrite the element being removed. 1718 */ 1719 bt_shift_leaf(tree, leaf, 0, idx, 1, BSD_RIGHT); 1720 1721 /* Move the separator to our first spot. */ 1722 uint8_t *separator = parent->btc_elems + (parent_idx - 1) * 1723 size; 1724 bcpy(separator, leaf->btl_elems + hdr->bth_first * size, size); 1725 1726 /* Move our neighbor's last element to the separator. */ 1727 uint8_t *take_elem = neighbor->btl_elems + 1728 (l_hdr->bth_first + l_hdr->bth_count - 1) * size; 1729 bcpy(take_elem, separator, size); 1730 1731 /* Delete our neighbor's last element. */ 1732 bt_shrink_leaf(tree, neighbor, l_hdr->bth_count - 1, 1); 1733 zfs_btree_verify(tree); 1734 return; 1735 } 1736 1737 zfs_btree_hdr_t *r_hdr = (parent_idx == parent->btc_hdr.bth_count ? 1738 NULL : parent->btc_children[parent_idx + 1]); 1739 if (r_hdr != NULL && r_hdr->bth_count > min_count) { 1740 /* We can take a node from the right neighbor. */ 1741 ASSERT(!zfs_btree_is_core(r_hdr)); 1742 zfs_btree_leaf_t *neighbor = (zfs_btree_leaf_t *)r_hdr; 1743 1744 /* 1745 * Move our elements after the element being removed forwards 1746 * by one spot to make room for the stolen element and 1747 * overwrite the element being removed. 1748 */ 1749 bt_shift_leaf(tree, leaf, idx + 1, hdr->bth_count - idx - 1, 1750 1, BSD_LEFT); 1751 1752 /* Move the separator between us to our last spot. */ 1753 uint8_t *separator = parent->btc_elems + parent_idx * size; 1754 bcpy(separator, leaf->btl_elems + (hdr->bth_first + 1755 hdr->bth_count - 1) * size, size); 1756 1757 /* Move our neighbor's first element to the separator. */ 1758 uint8_t *take_elem = neighbor->btl_elems + 1759 r_hdr->bth_first * size; 1760 bcpy(take_elem, separator, size); 1761 1762 /* Delete our neighbor's first element. */ 1763 bt_shrink_leaf(tree, neighbor, 0, 1); 1764 zfs_btree_verify(tree); 1765 return; 1766 } 1767 1768 /* 1769 * In this case, neither of our neighbors can spare an element, so we 1770 * need to merge with one of them. We prefer the left one, arbitrarily. 1771 * After remove we move the separator into the leftmost merging node 1772 * (which may be us or the left neighbor), and then move the right 1773 * merging node's elements. Once that's done, we go back and delete 1774 * the element we're removing. Finally, go into the parent and delete 1775 * the right merging node and the separator. This may cause further 1776 * merging. 1777 */ 1778 zfs_btree_hdr_t *rm_hdr, *k_hdr; 1779 if (l_hdr != NULL) { 1780 k_hdr = l_hdr; 1781 rm_hdr = hdr; 1782 } else { 1783 ASSERT3P(r_hdr, !=, NULL); 1784 k_hdr = hdr; 1785 rm_hdr = r_hdr; 1786 parent_idx++; 1787 } 1788 ASSERT(!zfs_btree_is_core(k_hdr)); 1789 ASSERT(!zfs_btree_is_core(rm_hdr)); 1790 ASSERT3U(k_hdr->bth_count, ==, min_count); 1791 ASSERT3U(rm_hdr->bth_count, ==, min_count); 1792 zfs_btree_leaf_t *keep = (zfs_btree_leaf_t *)k_hdr; 1793 zfs_btree_leaf_t *rm = (zfs_btree_leaf_t *)rm_hdr; 1794 1795 if (zfs_btree_verify_intensity >= 5) { 1796 for (uint32_t i = 0; i < rm_hdr->bth_count + 1; i++) { 1797 zfs_btree_verify_poison_at(tree, k_hdr, 1798 k_hdr->bth_count + i); 1799 } 1800 } 1801 1802 /* 1803 * Remove the value from the node. It will go below the minimum, 1804 * but we'll fix it in no time. 1805 */ 1806 bt_shrink_leaf(tree, leaf, idx, 1); 1807 1808 /* Prepare space for elements to be moved from the right. */ 1809 uint32_t k_count = k_hdr->bth_count; 1810 bt_grow_leaf(tree, keep, k_count, 1 + rm_hdr->bth_count); 1811 ASSERT3U(k_hdr->bth_count, ==, min_count * 2); 1812 1813 /* Move the separator into the first open spot. */ 1814 uint8_t *out = keep->btl_elems + (k_hdr->bth_first + k_count) * size; 1815 uint8_t *separator = parent->btc_elems + (parent_idx - 1) * size; 1816 bcpy(separator, out, size); 1817 1818 /* Move our elements to the left neighbor. */ 1819 bt_transfer_leaf(tree, rm, 0, rm_hdr->bth_count, keep, k_count + 1); 1820 zfs_btree_node_destroy(tree, rm_hdr); 1821 1822 /* Remove the emptied node from the parent. */ 1823 zfs_btree_remove_from_node(tree, parent, rm_hdr); 1824 zfs_btree_verify(tree); 1825 } 1826 1827 /* Remove the given value from the tree. */ 1828 void 1829 zfs_btree_remove(zfs_btree_t *tree, const void *value) 1830 { 1831 zfs_btree_index_t where = {0}; 1832 VERIFY3P(zfs_btree_find(tree, value, &where), !=, NULL); 1833 zfs_btree_remove_idx(tree, &where); 1834 } 1835 1836 /* Return the number of elements in the tree. */ 1837 ulong_t 1838 zfs_btree_numnodes(zfs_btree_t *tree) 1839 { 1840 return (tree->bt_num_elems); 1841 } 1842 1843 /* 1844 * This function is used to visit all the elements in the tree before 1845 * destroying the tree. This allows the calling code to perform any cleanup it 1846 * needs to do. This is more efficient than just removing the first element 1847 * over and over, because it removes all rebalancing. Once the destroy_nodes() 1848 * function has been called, no other btree operations are valid until it 1849 * returns NULL, which point the only valid operation is zfs_btree_destroy(). 1850 * 1851 * example: 1852 * 1853 * zfs_btree_index_t *cookie = NULL; 1854 * my_data_t *node; 1855 * 1856 * while ((node = zfs_btree_destroy_nodes(tree, &cookie)) != NULL) 1857 * free(node->ptr); 1858 * zfs_btree_destroy(tree); 1859 * 1860 */ 1861 void * 1862 zfs_btree_destroy_nodes(zfs_btree_t *tree, zfs_btree_index_t **cookie) 1863 { 1864 if (*cookie == NULL) { 1865 if (tree->bt_height == -1) 1866 return (NULL); 1867 *cookie = kmem_alloc(sizeof (**cookie), KM_SLEEP); 1868 return (zfs_btree_first(tree, *cookie)); 1869 } 1870 1871 void *rval = zfs_btree_next_helper(tree, *cookie, *cookie, 1872 zfs_btree_node_destroy); 1873 if (rval == NULL) { 1874 tree->bt_root = NULL; 1875 tree->bt_height = -1; 1876 tree->bt_num_elems = 0; 1877 kmem_free(*cookie, sizeof (**cookie)); 1878 tree->bt_bulk = NULL; 1879 } 1880 return (rval); 1881 } 1882 1883 static void 1884 zfs_btree_clear_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) 1885 { 1886 if (zfs_btree_is_core(hdr)) { 1887 zfs_btree_core_t *btc = (zfs_btree_core_t *)hdr; 1888 for (uint32_t i = 0; i <= hdr->bth_count; i++) 1889 zfs_btree_clear_helper(tree, btc->btc_children[i]); 1890 } 1891 1892 zfs_btree_node_destroy(tree, hdr); 1893 } 1894 1895 void 1896 zfs_btree_clear(zfs_btree_t *tree) 1897 { 1898 if (tree->bt_root == NULL) { 1899 ASSERT0(tree->bt_num_elems); 1900 return; 1901 } 1902 1903 zfs_btree_clear_helper(tree, tree->bt_root); 1904 tree->bt_num_elems = 0; 1905 tree->bt_root = NULL; 1906 tree->bt_num_nodes = 0; 1907 tree->bt_height = -1; 1908 tree->bt_bulk = NULL; 1909 } 1910 1911 void 1912 zfs_btree_destroy(zfs_btree_t *tree) 1913 { 1914 ASSERT0(tree->bt_num_elems); 1915 ASSERT3P(tree->bt_root, ==, NULL); 1916 } 1917 1918 /* Verify that every child of this node has the correct parent pointer. */ 1919 static void 1920 zfs_btree_verify_pointers_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) 1921 { 1922 if (!zfs_btree_is_core(hdr)) 1923 return; 1924 1925 zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; 1926 for (uint32_t i = 0; i <= hdr->bth_count; i++) { 1927 VERIFY3P(node->btc_children[i]->bth_parent, ==, hdr); 1928 zfs_btree_verify_pointers_helper(tree, node->btc_children[i]); 1929 } 1930 } 1931 1932 /* Verify that every node has the correct parent pointer. */ 1933 static void 1934 zfs_btree_verify_pointers(zfs_btree_t *tree) 1935 { 1936 if (tree->bt_height == -1) { 1937 VERIFY3P(tree->bt_root, ==, NULL); 1938 return; 1939 } 1940 VERIFY3P(tree->bt_root->bth_parent, ==, NULL); 1941 zfs_btree_verify_pointers_helper(tree, tree->bt_root); 1942 } 1943 1944 /* 1945 * Verify that all the current node and its children satisfy the count 1946 * invariants, and return the total count in the subtree rooted in this node. 1947 */ 1948 static uint64_t 1949 zfs_btree_verify_counts_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) 1950 { 1951 if (!zfs_btree_is_core(hdr)) { 1952 if (tree->bt_root != hdr && tree->bt_bulk && 1953 hdr != &tree->bt_bulk->btl_hdr) { 1954 VERIFY3U(hdr->bth_count, >=, tree->bt_leaf_cap / 2 - 1); 1955 } 1956 1957 return (hdr->bth_count); 1958 } else { 1959 1960 zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; 1961 uint64_t ret = hdr->bth_count; 1962 if (tree->bt_root != hdr && tree->bt_bulk == NULL) 1963 VERIFY3P(hdr->bth_count, >=, BTREE_CORE_ELEMS / 2 - 1); 1964 for (uint32_t i = 0; i <= hdr->bth_count; i++) { 1965 ret += zfs_btree_verify_counts_helper(tree, 1966 node->btc_children[i]); 1967 } 1968 1969 return (ret); 1970 } 1971 } 1972 1973 /* 1974 * Verify that all nodes satisfy the invariants and that the total number of 1975 * elements is correct. 1976 */ 1977 static void 1978 zfs_btree_verify_counts(zfs_btree_t *tree) 1979 { 1980 EQUIV(tree->bt_num_elems == 0, tree->bt_height == -1); 1981 if (tree->bt_height == -1) { 1982 return; 1983 } 1984 VERIFY3P(zfs_btree_verify_counts_helper(tree, tree->bt_root), ==, 1985 tree->bt_num_elems); 1986 } 1987 1988 /* 1989 * Check that the subtree rooted at this node has a uniform height. Returns 1990 * the number of nodes under this node, to help verify bt_num_nodes. 1991 */ 1992 static uint64_t 1993 zfs_btree_verify_height_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr, 1994 int64_t height) 1995 { 1996 if (!zfs_btree_is_core(hdr)) { 1997 VERIFY0(height); 1998 return (1); 1999 } 2000 2001 zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; 2002 uint64_t ret = 1; 2003 for (uint32_t i = 0; i <= hdr->bth_count; i++) { 2004 ret += zfs_btree_verify_height_helper(tree, 2005 node->btc_children[i], height - 1); 2006 } 2007 return (ret); 2008 } 2009 2010 /* 2011 * Check that the tree rooted at this node has a uniform height, and that the 2012 * bt_height in the tree is correct. 2013 */ 2014 static void 2015 zfs_btree_verify_height(zfs_btree_t *tree) 2016 { 2017 EQUIV(tree->bt_height == -1, tree->bt_root == NULL); 2018 if (tree->bt_height == -1) { 2019 return; 2020 } 2021 2022 VERIFY3U(zfs_btree_verify_height_helper(tree, tree->bt_root, 2023 tree->bt_height), ==, tree->bt_num_nodes); 2024 } 2025 2026 /* 2027 * Check that the elements in this node are sorted, and that if this is a core 2028 * node, the separators are properly between the subtrees they separaate and 2029 * that the children also satisfy this requirement. 2030 */ 2031 static void 2032 zfs_btree_verify_order_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) 2033 { 2034 size_t size = tree->bt_elem_size; 2035 if (!zfs_btree_is_core(hdr)) { 2036 zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; 2037 for (uint32_t i = 1; i < hdr->bth_count; i++) { 2038 VERIFY3S(tree->bt_compar(leaf->btl_elems + 2039 (hdr->bth_first + i - 1) * size, 2040 leaf->btl_elems + 2041 (hdr->bth_first + i) * size), ==, -1); 2042 } 2043 return; 2044 } 2045 2046 zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; 2047 for (uint32_t i = 1; i < hdr->bth_count; i++) { 2048 VERIFY3S(tree->bt_compar(node->btc_elems + (i - 1) * size, 2049 node->btc_elems + i * size), ==, -1); 2050 } 2051 for (uint32_t i = 0; i < hdr->bth_count; i++) { 2052 uint8_t *left_child_last = NULL; 2053 zfs_btree_hdr_t *left_child_hdr = node->btc_children[i]; 2054 if (zfs_btree_is_core(left_child_hdr)) { 2055 zfs_btree_core_t *left_child = 2056 (zfs_btree_core_t *)left_child_hdr; 2057 left_child_last = left_child->btc_elems + 2058 (left_child_hdr->bth_count - 1) * size; 2059 } else { 2060 zfs_btree_leaf_t *left_child = 2061 (zfs_btree_leaf_t *)left_child_hdr; 2062 left_child_last = left_child->btl_elems + 2063 (left_child_hdr->bth_first + 2064 left_child_hdr->bth_count - 1) * size; 2065 } 2066 int comp = tree->bt_compar(node->btc_elems + i * size, 2067 left_child_last); 2068 if (comp <= 0) { 2069 panic("btree: compar returned %d (expected 1) at " 2070 "%px %d: compar(%px, %px)", comp, node, i, 2071 node->btc_elems + i * size, left_child_last); 2072 } 2073 2074 uint8_t *right_child_first = NULL; 2075 zfs_btree_hdr_t *right_child_hdr = node->btc_children[i + 1]; 2076 if (zfs_btree_is_core(right_child_hdr)) { 2077 zfs_btree_core_t *right_child = 2078 (zfs_btree_core_t *)right_child_hdr; 2079 right_child_first = right_child->btc_elems; 2080 } else { 2081 zfs_btree_leaf_t *right_child = 2082 (zfs_btree_leaf_t *)right_child_hdr; 2083 right_child_first = right_child->btl_elems + 2084 right_child_hdr->bth_first * size; 2085 } 2086 comp = tree->bt_compar(node->btc_elems + i * size, 2087 right_child_first); 2088 if (comp >= 0) { 2089 panic("btree: compar returned %d (expected -1) at " 2090 "%px %d: compar(%px, %px)", comp, node, i, 2091 node->btc_elems + i * size, right_child_first); 2092 } 2093 } 2094 for (uint32_t i = 0; i <= hdr->bth_count; i++) 2095 zfs_btree_verify_order_helper(tree, node->btc_children[i]); 2096 } 2097 2098 /* Check that all elements in the tree are in sorted order. */ 2099 static void 2100 zfs_btree_verify_order(zfs_btree_t *tree) 2101 { 2102 EQUIV(tree->bt_height == -1, tree->bt_root == NULL); 2103 if (tree->bt_height == -1) { 2104 return; 2105 } 2106 2107 zfs_btree_verify_order_helper(tree, tree->bt_root); 2108 } 2109 2110 #ifdef ZFS_DEBUG 2111 /* Check that all unused memory is poisoned correctly. */ 2112 static void 2113 zfs_btree_verify_poison_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) 2114 { 2115 size_t size = tree->bt_elem_size; 2116 if (!zfs_btree_is_core(hdr)) { 2117 zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; 2118 for (size_t i = 0; i < hdr->bth_first * size; i++) 2119 VERIFY3U(leaf->btl_elems[i], ==, 0x0f); 2120 for (size_t i = (hdr->bth_first + hdr->bth_count) * size; 2121 i < BTREE_LEAF_ESIZE; i++) 2122 VERIFY3U(leaf->btl_elems[i], ==, 0x0f); 2123 } else { 2124 zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; 2125 for (size_t i = hdr->bth_count * size; 2126 i < BTREE_CORE_ELEMS * size; i++) 2127 VERIFY3U(node->btc_elems[i], ==, 0x0f); 2128 2129 for (uint32_t i = hdr->bth_count + 1; i <= BTREE_CORE_ELEMS; 2130 i++) { 2131 VERIFY3P(node->btc_children[i], ==, 2132 (zfs_btree_hdr_t *)BTREE_POISON); 2133 } 2134 2135 for (uint32_t i = 0; i <= hdr->bth_count; i++) { 2136 zfs_btree_verify_poison_helper(tree, 2137 node->btc_children[i]); 2138 } 2139 } 2140 } 2141 #endif 2142 2143 /* Check that unused memory in the tree is still poisoned. */ 2144 static void 2145 zfs_btree_verify_poison(zfs_btree_t *tree) 2146 { 2147 #ifdef ZFS_DEBUG 2148 if (tree->bt_height == -1) 2149 return; 2150 zfs_btree_verify_poison_helper(tree, tree->bt_root); 2151 #endif 2152 } 2153 2154 void 2155 zfs_btree_verify(zfs_btree_t *tree) 2156 { 2157 if (zfs_btree_verify_intensity == 0) 2158 return; 2159 zfs_btree_verify_height(tree); 2160 if (zfs_btree_verify_intensity == 1) 2161 return; 2162 zfs_btree_verify_pointers(tree); 2163 if (zfs_btree_verify_intensity == 2) 2164 return; 2165 zfs_btree_verify_counts(tree); 2166 if (zfs_btree_verify_intensity == 3) 2167 return; 2168 zfs_btree_verify_order(tree); 2169 2170 if (zfs_btree_verify_intensity == 4) 2171 return; 2172 zfs_btree_verify_poison(tree); 2173 } 2174