1 /* 2 * CDDL HEADER START 3 * 4 * This file and its contents are supplied under the terms of the 5 * Common Development and Distribution License ("CDDL"), version 1.0. 6 * You may only use this file in accordance with the terms of version 7 * 1.0 of the CDDL. 8 * 9 * A full copy of the text of the CDDL should have accompanied this 10 * source. A copy of the CDDL is also available via the Internet at 11 * http://www.illumos.org/license/CDDL. 12 * 13 * CDDL HEADER END 14 */ 15 /* 16 * Copyright (c) 2019 by Delphix. All rights reserved. 17 */ 18 19 #include <sys/btree.h> 20 #include <sys/bitops.h> 21 #include <sys/zfs_context.h> 22 23 kmem_cache_t *zfs_btree_leaf_cache; 24 25 /* 26 * Control the extent of the verification that occurs when zfs_btree_verify is 27 * called. Primarily used for debugging when extending the btree logic and 28 * functionality. As the intensity is increased, new verification steps are 29 * added. These steps are cumulative; intensity = 3 includes the intensity = 1 30 * and intensity = 2 steps as well. 31 * 32 * Intensity 1: Verify that the tree's height is consistent throughout. 33 * Intensity 2: Verify that a core node's children's parent pointers point 34 * to the core node. 35 * Intensity 3: Verify that the total number of elements in the tree matches the 36 * sum of the number of elements in each node. Also verifies that each node's 37 * count obeys the invariants (less than or equal to maximum value, greater than 38 * or equal to half the maximum minus one). 39 * Intensity 4: Verify that each element compares less than the element 40 * immediately after it and greater than the one immediately before it using the 41 * comparator function. For core nodes, also checks that each element is greater 42 * than the last element in the first of the two nodes it separates, and less 43 * than the first element in the second of the two nodes. 44 * Intensity 5: Verifies, if ZFS_DEBUG is defined, that all unused memory inside 45 * of each node is poisoned appropriately. Note that poisoning always occurs if 46 * ZFS_DEBUG is set, so it is safe to set the intensity to 5 during normal 47 * operation. 48 * 49 * Intensity 4 and 5 are particularly expensive to perform; the previous levels 50 * are a few memory operations per node, while these levels require multiple 51 * operations per element. In addition, when creating large btrees, these 52 * operations are called at every step, resulting in extremely slow operation 53 * (while the asymptotic complexity of the other steps is the same, the 54 * importance of the constant factors cannot be denied). 55 */ 56 int zfs_btree_verify_intensity = 0; 57 58 /* 59 * A convenience function to silence warnings from memmove's return value and 60 * change argument order to src, dest. 61 */ 62 void 63 bmov(const void *src, void *dest, size_t size) 64 { 65 (void) memmove(dest, src, size); 66 } 67 68 #ifdef _ILP32 69 #define BTREE_POISON 0xabadb10c 70 #else 71 #define BTREE_POISON 0xabadb10cdeadbeef 72 #endif 73 74 static void 75 zfs_btree_poison_node(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) 76 { 77 #ifdef ZFS_DEBUG 78 size_t size = tree->bt_elem_size; 79 if (!hdr->bth_core) { 80 zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; 81 (void) memset(leaf->btl_elems + hdr->bth_count * size, 0x0f, 82 BTREE_LEAF_SIZE - sizeof (zfs_btree_hdr_t) - 83 hdr->bth_count * size); 84 } else { 85 zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; 86 for (int i = hdr->bth_count + 1; i <= BTREE_CORE_ELEMS; i++) { 87 node->btc_children[i] = 88 (zfs_btree_hdr_t *)BTREE_POISON; 89 } 90 (void) memset(node->btc_elems + hdr->bth_count * size, 0x0f, 91 (BTREE_CORE_ELEMS - hdr->bth_count) * size); 92 } 93 #endif 94 } 95 96 static inline void 97 zfs_btree_poison_node_at(zfs_btree_t *tree, zfs_btree_hdr_t *hdr, 98 uint64_t offset) 99 { 100 #ifdef ZFS_DEBUG 101 size_t size = tree->bt_elem_size; 102 ASSERT3U(offset, >=, hdr->bth_count); 103 if (!hdr->bth_core) { 104 zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; 105 (void) memset(leaf->btl_elems + offset * size, 0x0f, size); 106 } else { 107 zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; 108 node->btc_children[offset + 1] = 109 (zfs_btree_hdr_t *)BTREE_POISON; 110 (void) memset(node->btc_elems + offset * size, 0x0f, size); 111 } 112 #endif 113 } 114 115 static inline void 116 zfs_btree_verify_poison_at(zfs_btree_t *tree, zfs_btree_hdr_t *hdr, 117 uint64_t offset) 118 { 119 #ifdef ZFS_DEBUG 120 size_t size = tree->bt_elem_size; 121 uint8_t eval = 0x0f; 122 if (hdr->bth_core) { 123 zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; 124 zfs_btree_hdr_t *cval = (zfs_btree_hdr_t *)BTREE_POISON; 125 VERIFY3P(node->btc_children[offset + 1], ==, cval); 126 for (int i = 0; i < size; i++) 127 VERIFY3U(node->btc_elems[offset * size + i], ==, eval); 128 } else { 129 zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; 130 for (int i = 0; i < size; i++) 131 VERIFY3U(leaf->btl_elems[offset * size + i], ==, eval); 132 } 133 #endif 134 } 135 136 void 137 zfs_btree_init(void) 138 { 139 zfs_btree_leaf_cache = kmem_cache_create("zfs_btree_leaf_cache", 140 BTREE_LEAF_SIZE, 0, NULL, NULL, NULL, NULL, 141 NULL, 0); 142 } 143 144 void 145 zfs_btree_fini(void) 146 { 147 kmem_cache_destroy(zfs_btree_leaf_cache); 148 } 149 150 void 151 zfs_btree_create(zfs_btree_t *tree, int (*compar) (const void *, const void *), 152 size_t size) 153 { 154 /* 155 * We need a minimmum of 4 elements so that when we split a node we 156 * always have at least two elements in each node. This simplifies the 157 * logic in zfs_btree_bulk_finish, since it means the last leaf will 158 * always have a left sibling to share with (unless it's the root). 159 */ 160 ASSERT3U(size, <=, (BTREE_LEAF_SIZE - sizeof (zfs_btree_hdr_t)) / 4); 161 162 bzero(tree, sizeof (*tree)); 163 tree->bt_compar = compar; 164 tree->bt_elem_size = size; 165 tree->bt_height = -1; 166 tree->bt_bulk = NULL; 167 } 168 169 /* 170 * Find value in the array of elements provided. Uses a simple binary search. 171 */ 172 static void * 173 zfs_btree_find_in_buf(zfs_btree_t *tree, uint8_t *buf, uint64_t nelems, 174 const void *value, zfs_btree_index_t *where) 175 { 176 uint64_t max = nelems; 177 uint64_t min = 0; 178 while (max > min) { 179 uint64_t idx = (min + max) / 2; 180 uint8_t *cur = buf + idx * tree->bt_elem_size; 181 int comp = tree->bt_compar(cur, value); 182 if (comp == -1) { 183 min = idx + 1; 184 } else if (comp == 1) { 185 max = idx; 186 } else { 187 ASSERT0(comp); 188 where->bti_offset = idx; 189 where->bti_before = B_FALSE; 190 return (cur); 191 } 192 } 193 194 where->bti_offset = max; 195 where->bti_before = B_TRUE; 196 return (NULL); 197 } 198 199 /* 200 * Find the given value in the tree. where may be passed as null to use as a 201 * membership test or if the btree is being used as a map. 202 */ 203 void * 204 zfs_btree_find(zfs_btree_t *tree, const void *value, zfs_btree_index_t *where) 205 { 206 if (tree->bt_height == -1) { 207 if (where != NULL) { 208 where->bti_node = NULL; 209 where->bti_offset = 0; 210 } 211 ASSERT0(tree->bt_num_elems); 212 return (NULL); 213 } 214 215 /* 216 * If we're in bulk-insert mode, we check the last spot in the tree 217 * and the last leaf in the tree before doing the normal search, 218 * because for most workloads the vast majority of finds in 219 * bulk-insert mode are to insert new elements. 220 */ 221 zfs_btree_index_t idx; 222 if (tree->bt_bulk != NULL) { 223 zfs_btree_leaf_t *last_leaf = tree->bt_bulk; 224 int compar = tree->bt_compar(last_leaf->btl_elems + 225 ((last_leaf->btl_hdr.bth_count - 1) * tree->bt_elem_size), 226 value); 227 if (compar < 0) { 228 /* 229 * If what they're looking for is after the last 230 * element, it's not in the tree. 231 */ 232 if (where != NULL) { 233 where->bti_node = (zfs_btree_hdr_t *)last_leaf; 234 where->bti_offset = 235 last_leaf->btl_hdr.bth_count; 236 where->bti_before = B_TRUE; 237 } 238 return (NULL); 239 } else if (compar == 0) { 240 if (where != NULL) { 241 where->bti_node = (zfs_btree_hdr_t *)last_leaf; 242 where->bti_offset = 243 last_leaf->btl_hdr.bth_count - 1; 244 where->bti_before = B_FALSE; 245 } 246 return (last_leaf->btl_elems + 247 ((last_leaf->btl_hdr.bth_count - 1) * 248 tree->bt_elem_size)); 249 } 250 if (tree->bt_compar(last_leaf->btl_elems, value) <= 0) { 251 /* 252 * If what they're looking for is after the first 253 * element in the last leaf, it's in the last leaf or 254 * it's not in the tree. 255 */ 256 void *d = zfs_btree_find_in_buf(tree, 257 last_leaf->btl_elems, last_leaf->btl_hdr.bth_count, 258 value, &idx); 259 260 if (where != NULL) { 261 idx.bti_node = (zfs_btree_hdr_t *)last_leaf; 262 *where = idx; 263 } 264 return (d); 265 } 266 } 267 268 zfs_btree_core_t *node = NULL; 269 uint64_t child = 0; 270 uint64_t depth = 0; 271 272 /* 273 * Iterate down the tree, finding which child the value should be in 274 * by comparing with the separators. 275 */ 276 for (node = (zfs_btree_core_t *)tree->bt_root; depth < tree->bt_height; 277 node = (zfs_btree_core_t *)node->btc_children[child], depth++) { 278 ASSERT3P(node, !=, NULL); 279 void *d = zfs_btree_find_in_buf(tree, node->btc_elems, 280 node->btc_hdr.bth_count, value, &idx); 281 EQUIV(d != NULL, !idx.bti_before); 282 if (d != NULL) { 283 if (where != NULL) { 284 idx.bti_node = (zfs_btree_hdr_t *)node; 285 *where = idx; 286 } 287 return (d); 288 } 289 ASSERT(idx.bti_before); 290 child = idx.bti_offset; 291 } 292 293 /* 294 * The value is in this leaf, or it would be if it were in the 295 * tree. Find its proper location and return it. 296 */ 297 zfs_btree_leaf_t *leaf = (depth == 0 ? 298 (zfs_btree_leaf_t *)tree->bt_root : (zfs_btree_leaf_t *)node); 299 void *d = zfs_btree_find_in_buf(tree, leaf->btl_elems, 300 leaf->btl_hdr.bth_count, value, &idx); 301 302 if (where != NULL) { 303 idx.bti_node = (zfs_btree_hdr_t *)leaf; 304 *where = idx; 305 } 306 307 return (d); 308 } 309 310 /* 311 * To explain the following functions, it is useful to understand the four 312 * kinds of shifts used in btree operation. First, a shift is a movement of 313 * elements within a node. It is used to create gaps for inserting new 314 * elements and children, or cover gaps created when things are removed. A 315 * shift has two fundamental properties, each of which can be one of two 316 * values, making four types of shifts. There is the direction of the shift 317 * (left or right) and the shape of the shift (parallelogram or isoceles 318 * trapezoid (shortened to trapezoid hereafter)). The shape distinction only 319 * applies to shifts of core nodes. 320 * 321 * The names derive from the following imagining of the layout of a node: 322 * 323 * Elements: * * * * * * * ... * * * 324 * Children: * * * * * * * * ... * * * 325 * 326 * This layout follows from the fact that the elements act as separators 327 * between pairs of children, and that children root subtrees "below" the 328 * current node. A left and right shift are fairly self-explanatory; a left 329 * shift moves things to the left, while a right shift moves things to the 330 * right. A parallelogram shift is a shift with the same number of elements 331 * and children being moved, while a trapezoid shift is a shift that moves one 332 * more children than elements. An example follows: 333 * 334 * A parallelogram shift could contain the following: 335 * _______________ 336 * \* * * * \ * * * ... * * * 337 * * \ * * * *\ * * * ... * * * 338 * --------------- 339 * A trapezoid shift could contain the following: 340 * ___________ 341 * * / * * * \ * * * ... * * * 342 * * / * * * *\ * * * ... * * * 343 * --------------- 344 * 345 * Note that a parellelogram shift is always shaped like a "left-leaning" 346 * parallelogram, where the starting index of the children being moved is 347 * always one higher than the starting index of the elements being moved. No 348 * "right-leaning" parallelogram shifts are needed (shifts where the starting 349 * element index and starting child index being moved are the same) to achieve 350 * any btree operations, so we ignore them. 351 */ 352 353 enum bt_shift_shape { 354 BSS_TRAPEZOID, 355 BSS_PARALLELOGRAM 356 }; 357 358 enum bt_shift_direction { 359 BSD_LEFT, 360 BSD_RIGHT 361 }; 362 363 /* 364 * Shift elements and children in the provided core node by off spots. The 365 * first element moved is idx, and count elements are moved. The shape of the 366 * shift is determined by shape. The direction is determined by dir. 367 */ 368 static inline void 369 bt_shift_core(zfs_btree_t *tree, zfs_btree_core_t *node, uint64_t idx, 370 uint64_t count, uint64_t off, enum bt_shift_shape shape, 371 enum bt_shift_direction dir) 372 { 373 size_t size = tree->bt_elem_size; 374 ASSERT(node->btc_hdr.bth_core); 375 376 uint8_t *e_start = node->btc_elems + idx * size; 377 int sign = (dir == BSD_LEFT ? -1 : +1); 378 uint8_t *e_out = e_start + sign * off * size; 379 uint64_t e_count = count; 380 bmov(e_start, e_out, e_count * size); 381 382 zfs_btree_hdr_t **c_start = node->btc_children + idx + 383 (shape == BSS_TRAPEZOID ? 0 : 1); 384 zfs_btree_hdr_t **c_out = (dir == BSD_LEFT ? c_start - off : 385 c_start + off); 386 uint64_t c_count = count + (shape == BSS_TRAPEZOID ? 1 : 0); 387 bmov(c_start, c_out, c_count * sizeof (*c_start)); 388 } 389 390 /* 391 * Shift elements and children in the provided core node left by one spot. 392 * The first element moved is idx, and count elements are moved. The 393 * shape of the shift is determined by trap; true if the shift is a trapezoid, 394 * false if it is a parallelogram. 395 */ 396 static inline void 397 bt_shift_core_left(zfs_btree_t *tree, zfs_btree_core_t *node, uint64_t idx, 398 uint64_t count, enum bt_shift_shape shape) 399 { 400 bt_shift_core(tree, node, idx, count, 1, shape, BSD_LEFT); 401 } 402 403 /* 404 * Shift elements and children in the provided core node right by one spot. 405 * Starts with elements[idx] and children[idx] and one more child than element. 406 */ 407 static inline void 408 bt_shift_core_right(zfs_btree_t *tree, zfs_btree_core_t *node, uint64_t idx, 409 uint64_t count, enum bt_shift_shape shape) 410 { 411 bt_shift_core(tree, node, idx, count, 1, shape, BSD_RIGHT); 412 } 413 414 /* 415 * Shift elements and children in the provided leaf node by off spots. 416 * The first element moved is idx, and count elements are moved. The direction 417 * is determined by left. 418 */ 419 static inline void 420 bt_shift_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *node, uint64_t idx, 421 uint64_t count, uint64_t off, enum bt_shift_direction dir) 422 { 423 size_t size = tree->bt_elem_size; 424 ASSERT(!node->btl_hdr.bth_core); 425 426 uint8_t *start = node->btl_elems + idx * size; 427 int sign = (dir == BSD_LEFT ? -1 : +1); 428 uint8_t *out = start + sign * off * size; 429 bmov(start, out, count * size); 430 } 431 432 static inline void 433 bt_shift_leaf_right(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, uint64_t idx, 434 uint64_t count) 435 { 436 bt_shift_leaf(tree, leaf, idx, count, 1, BSD_RIGHT); 437 } 438 439 static inline void 440 bt_shift_leaf_left(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, uint64_t idx, 441 uint64_t count) 442 { 443 bt_shift_leaf(tree, leaf, idx, count, 1, BSD_LEFT); 444 } 445 446 /* 447 * Move children and elements from one core node to another. The shape 448 * parameter behaves the same as it does in the shift logic. 449 */ 450 static inline void 451 bt_transfer_core(zfs_btree_t *tree, zfs_btree_core_t *source, uint64_t sidx, 452 uint64_t count, zfs_btree_core_t *dest, uint64_t didx, 453 enum bt_shift_shape shape) 454 { 455 size_t size = tree->bt_elem_size; 456 ASSERT(source->btc_hdr.bth_core); 457 ASSERT(dest->btc_hdr.bth_core); 458 459 bmov(source->btc_elems + sidx * size, dest->btc_elems + didx * size, 460 count * size); 461 462 uint64_t c_count = count + (shape == BSS_TRAPEZOID ? 1 : 0); 463 bmov(source->btc_children + sidx + (shape == BSS_TRAPEZOID ? 0 : 1), 464 dest->btc_children + didx + (shape == BSS_TRAPEZOID ? 0 : 1), 465 c_count * sizeof (*source->btc_children)); 466 } 467 468 static inline void 469 bt_transfer_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *source, uint64_t sidx, 470 uint64_t count, zfs_btree_leaf_t *dest, uint64_t didx) 471 { 472 size_t size = tree->bt_elem_size; 473 ASSERT(!source->btl_hdr.bth_core); 474 ASSERT(!dest->btl_hdr.bth_core); 475 476 bmov(source->btl_elems + sidx * size, dest->btl_elems + didx * size, 477 count * size); 478 } 479 480 /* 481 * Find the first element in the subtree rooted at hdr, return its value and 482 * put its location in where if non-null. 483 */ 484 static void * 485 zfs_btree_first_helper(zfs_btree_hdr_t *hdr, zfs_btree_index_t *where) 486 { 487 zfs_btree_hdr_t *node; 488 489 for (node = hdr; node->bth_core; node = 490 ((zfs_btree_core_t *)node)->btc_children[0]) 491 ; 492 493 ASSERT(!node->bth_core); 494 zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)node; 495 if (where != NULL) { 496 where->bti_node = node; 497 where->bti_offset = 0; 498 where->bti_before = B_FALSE; 499 } 500 return (&leaf->btl_elems[0]); 501 } 502 503 /* Insert an element and a child into a core node at the given offset. */ 504 static void 505 zfs_btree_insert_core_impl(zfs_btree_t *tree, zfs_btree_core_t *parent, 506 uint64_t offset, zfs_btree_hdr_t *new_node, void *buf) 507 { 508 uint64_t size = tree->bt_elem_size; 509 zfs_btree_hdr_t *par_hdr = &parent->btc_hdr; 510 ASSERT3P(par_hdr, ==, new_node->bth_parent); 511 ASSERT3U(par_hdr->bth_count, <, BTREE_CORE_ELEMS); 512 513 if (zfs_btree_verify_intensity >= 5) { 514 zfs_btree_verify_poison_at(tree, par_hdr, 515 par_hdr->bth_count); 516 } 517 /* Shift existing elements and children */ 518 uint64_t count = par_hdr->bth_count - offset; 519 bt_shift_core_right(tree, parent, offset, count, 520 BSS_PARALLELOGRAM); 521 522 /* Insert new values */ 523 parent->btc_children[offset + 1] = new_node; 524 bmov(buf, parent->btc_elems + offset * size, size); 525 par_hdr->bth_count++; 526 } 527 528 /* 529 * Insert new_node into the parent of old_node directly after old_node, with 530 * buf as the dividing element between the two. 531 */ 532 static void 533 zfs_btree_insert_into_parent(zfs_btree_t *tree, zfs_btree_hdr_t *old_node, 534 zfs_btree_hdr_t *new_node, void *buf) 535 { 536 ASSERT3P(old_node->bth_parent, ==, new_node->bth_parent); 537 uint64_t size = tree->bt_elem_size; 538 zfs_btree_core_t *parent = old_node->bth_parent; 539 zfs_btree_hdr_t *par_hdr = &parent->btc_hdr; 540 541 /* 542 * If this is the root node we were splitting, we create a new root 543 * and increase the height of the tree. 544 */ 545 if (parent == NULL) { 546 ASSERT3P(old_node, ==, tree->bt_root); 547 tree->bt_num_nodes++; 548 zfs_btree_core_t *new_root = 549 kmem_alloc(sizeof (zfs_btree_core_t) + BTREE_CORE_ELEMS * 550 size, KM_SLEEP); 551 zfs_btree_hdr_t *new_root_hdr = &new_root->btc_hdr; 552 new_root_hdr->bth_parent = NULL; 553 new_root_hdr->bth_core = B_TRUE; 554 new_root_hdr->bth_count = 1; 555 556 old_node->bth_parent = new_node->bth_parent = new_root; 557 new_root->btc_children[0] = old_node; 558 new_root->btc_children[1] = new_node; 559 bmov(buf, new_root->btc_elems, size); 560 561 tree->bt_height++; 562 tree->bt_root = new_root_hdr; 563 zfs_btree_poison_node(tree, new_root_hdr); 564 return; 565 } 566 567 /* 568 * Since we have the new separator, binary search for where to put 569 * new_node. 570 */ 571 zfs_btree_index_t idx; 572 ASSERT(par_hdr->bth_core); 573 VERIFY3P(zfs_btree_find_in_buf(tree, parent->btc_elems, 574 par_hdr->bth_count, buf, &idx), ==, NULL); 575 ASSERT(idx.bti_before); 576 uint64_t offset = idx.bti_offset; 577 ASSERT3U(offset, <=, par_hdr->bth_count); 578 ASSERT3P(parent->btc_children[offset], ==, old_node); 579 580 /* 581 * If the parent isn't full, shift things to accomodate our insertions 582 * and return. 583 */ 584 if (par_hdr->bth_count != BTREE_CORE_ELEMS) { 585 zfs_btree_insert_core_impl(tree, parent, offset, new_node, buf); 586 return; 587 } 588 589 /* 590 * We need to split this core node into two. Currently there are 591 * BTREE_CORE_ELEMS + 1 child nodes, and we are adding one for 592 * BTREE_CORE_ELEMS + 2. Some of the children will be part of the 593 * current node, and the others will be moved to the new core node. 594 * There are BTREE_CORE_ELEMS + 1 elements including the new one. One 595 * will be used as the new separator in our parent, and the others 596 * will be split among the two core nodes. 597 * 598 * Usually we will split the node in half evenly, with 599 * BTREE_CORE_ELEMS/2 elements in each node. If we're bulk loading, we 600 * instead move only about a quarter of the elements (and children) to 601 * the new node. Since the average state after a long time is a 3/4 602 * full node, shortcutting directly to that state improves efficiency. 603 * 604 * We do this in two stages: first we split into two nodes, and then we 605 * reuse our existing logic to insert the new element and child. 606 */ 607 uint64_t move_count = MAX((BTREE_CORE_ELEMS / (tree->bt_bulk == NULL ? 608 2 : 4)) - 1, 2); 609 uint64_t keep_count = BTREE_CORE_ELEMS - move_count - 1; 610 ASSERT3U(BTREE_CORE_ELEMS - move_count, >=, 2); 611 tree->bt_num_nodes++; 612 zfs_btree_core_t *new_parent = kmem_alloc(sizeof (zfs_btree_core_t) + 613 BTREE_CORE_ELEMS * size, KM_SLEEP); 614 zfs_btree_hdr_t *new_par_hdr = &new_parent->btc_hdr; 615 new_par_hdr->bth_parent = par_hdr->bth_parent; 616 new_par_hdr->bth_core = B_TRUE; 617 new_par_hdr->bth_count = move_count; 618 zfs_btree_poison_node(tree, new_par_hdr); 619 620 par_hdr->bth_count = keep_count; 621 622 bt_transfer_core(tree, parent, keep_count + 1, move_count, new_parent, 623 0, BSS_TRAPEZOID); 624 625 /* Store the new separator in a buffer. */ 626 uint8_t *tmp_buf = kmem_alloc(size, KM_SLEEP); 627 bmov(parent->btc_elems + keep_count * size, tmp_buf, 628 size); 629 zfs_btree_poison_node(tree, par_hdr); 630 631 if (offset < keep_count) { 632 /* Insert the new node into the left half */ 633 zfs_btree_insert_core_impl(tree, parent, offset, new_node, 634 buf); 635 636 /* 637 * Move the new separator to the existing buffer. 638 */ 639 bmov(tmp_buf, buf, size); 640 } else if (offset > keep_count) { 641 /* Insert the new node into the right half */ 642 new_node->bth_parent = new_parent; 643 zfs_btree_insert_core_impl(tree, new_parent, 644 offset - keep_count - 1, new_node, buf); 645 646 /* 647 * Move the new separator to the existing buffer. 648 */ 649 bmov(tmp_buf, buf, size); 650 } else { 651 /* 652 * Move the new separator into the right half, and replace it 653 * with buf. We also need to shift back the elements in the 654 * right half to accomodate new_node. 655 */ 656 bt_shift_core_right(tree, new_parent, 0, move_count, 657 BSS_TRAPEZOID); 658 new_parent->btc_children[0] = new_node; 659 bmov(tmp_buf, new_parent->btc_elems, size); 660 new_par_hdr->bth_count++; 661 } 662 kmem_free(tmp_buf, size); 663 zfs_btree_poison_node(tree, par_hdr); 664 665 for (int i = 0; i <= new_parent->btc_hdr.bth_count; i++) 666 new_parent->btc_children[i]->bth_parent = new_parent; 667 668 for (int i = 0; i <= parent->btc_hdr.bth_count; i++) 669 ASSERT3P(parent->btc_children[i]->bth_parent, ==, parent); 670 671 /* 672 * Now that the node is split, we need to insert the new node into its 673 * parent. This may cause further splitting. 674 */ 675 zfs_btree_insert_into_parent(tree, &parent->btc_hdr, 676 &new_parent->btc_hdr, buf); 677 } 678 679 /* Insert an element into a leaf node at the given offset. */ 680 static void 681 zfs_btree_insert_leaf_impl(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, 682 uint64_t idx, const void *value) 683 { 684 uint64_t size = tree->bt_elem_size; 685 uint8_t *start = leaf->btl_elems + (idx * size); 686 zfs_btree_hdr_t *hdr = &leaf->btl_hdr; 687 uint64_t capacity = P2ALIGN((BTREE_LEAF_SIZE - 688 sizeof (zfs_btree_hdr_t)) / size, 2); 689 uint64_t count = leaf->btl_hdr.bth_count - idx; 690 ASSERT3U(leaf->btl_hdr.bth_count, <, capacity); 691 692 if (zfs_btree_verify_intensity >= 5) { 693 zfs_btree_verify_poison_at(tree, &leaf->btl_hdr, 694 leaf->btl_hdr.bth_count); 695 } 696 697 bt_shift_leaf_right(tree, leaf, idx, count); 698 bmov(value, start, size); 699 hdr->bth_count++; 700 } 701 702 /* Helper function for inserting a new value into leaf at the given index. */ 703 static void 704 zfs_btree_insert_into_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, 705 const void *value, uint64_t idx) 706 { 707 uint64_t size = tree->bt_elem_size; 708 uint64_t capacity = P2ALIGN((BTREE_LEAF_SIZE - 709 sizeof (zfs_btree_hdr_t)) / size, 2); 710 711 /* 712 * If the leaf isn't full, shift the elements after idx and insert 713 * value. 714 */ 715 if (leaf->btl_hdr.bth_count != capacity) { 716 zfs_btree_insert_leaf_impl(tree, leaf, idx, value); 717 return; 718 } 719 720 /* 721 * Otherwise, we split the leaf node into two nodes. If we're not bulk 722 * inserting, each is of size (capacity / 2). If we are bulk 723 * inserting, we move a quarter of the elements to the new node so 724 * inserts into the old node don't cause immediate splitting but the 725 * tree stays relatively dense. Since the average state after a long 726 * time is a 3/4 full node, shortcutting directly to that state 727 * improves efficiency. At the end of the bulk insertion process 728 * we'll need to go through and fix up any nodes (the last leaf and 729 * its ancestors, potentially) that are below the minimum. 730 * 731 * In either case, we're left with one extra element. The leftover 732 * element will become the new dividing element between the two nodes. 733 */ 734 uint64_t move_count = MAX(capacity / (tree->bt_bulk == NULL ? 2 : 4) - 735 1, 2); 736 uint64_t keep_count = capacity - move_count - 1; 737 ASSERT3U(capacity - move_count, >=, 2); 738 tree->bt_num_nodes++; 739 zfs_btree_leaf_t *new_leaf = kmem_cache_alloc(zfs_btree_leaf_cache, 740 KM_SLEEP); 741 zfs_btree_hdr_t *new_hdr = &new_leaf->btl_hdr; 742 new_hdr->bth_parent = leaf->btl_hdr.bth_parent; 743 new_hdr->bth_core = B_FALSE; 744 new_hdr->bth_count = move_count; 745 zfs_btree_poison_node(tree, new_hdr); 746 747 leaf->btl_hdr.bth_count = keep_count; 748 749 if (tree->bt_bulk != NULL && leaf == tree->bt_bulk) 750 tree->bt_bulk = new_leaf; 751 752 /* Copy the back part to the new leaf. */ 753 bt_transfer_leaf(tree, leaf, keep_count + 1, move_count, new_leaf, 754 0); 755 756 /* We store the new separator in a buffer we control for simplicity. */ 757 uint8_t *buf = kmem_alloc(size, KM_SLEEP); 758 bmov(leaf->btl_elems + (keep_count * size), buf, size); 759 zfs_btree_poison_node(tree, &leaf->btl_hdr); 760 761 if (idx < keep_count) { 762 /* Insert into the existing leaf. */ 763 zfs_btree_insert_leaf_impl(tree, leaf, idx, value); 764 } else if (idx > keep_count) { 765 /* Insert into the new leaf. */ 766 zfs_btree_insert_leaf_impl(tree, new_leaf, idx - keep_count - 767 1, value); 768 } else { 769 /* 770 * Shift the elements in the new leaf to make room for the 771 * separator, and use the new value as the new separator. 772 */ 773 bt_shift_leaf_right(tree, new_leaf, 0, move_count); 774 bmov(buf, new_leaf->btl_elems, size); 775 bmov(value, buf, size); 776 new_hdr->bth_count++; 777 } 778 779 /* 780 * Now that the node is split, we need to insert the new node into its 781 * parent. This may cause further splitting, bur only of core nodes. 782 */ 783 zfs_btree_insert_into_parent(tree, &leaf->btl_hdr, &new_leaf->btl_hdr, 784 buf); 785 kmem_free(buf, size); 786 } 787 788 static uint64_t 789 zfs_btree_find_parent_idx(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) 790 { 791 void *buf; 792 if (hdr->bth_core) { 793 buf = ((zfs_btree_core_t *)hdr)->btc_elems; 794 } else { 795 buf = ((zfs_btree_leaf_t *)hdr)->btl_elems; 796 } 797 zfs_btree_index_t idx; 798 zfs_btree_core_t *parent = hdr->bth_parent; 799 VERIFY3P(zfs_btree_find_in_buf(tree, parent->btc_elems, 800 parent->btc_hdr.bth_count, buf, &idx), ==, NULL); 801 ASSERT(idx.bti_before); 802 ASSERT3U(idx.bti_offset, <=, parent->btc_hdr.bth_count); 803 ASSERT3P(parent->btc_children[idx.bti_offset], ==, hdr); 804 return (idx.bti_offset); 805 } 806 807 /* 808 * Take the b-tree out of bulk insert mode. During bulk-insert mode, some 809 * nodes may violate the invariant that non-root nodes must be at least half 810 * full. All nodes violating this invariant should be the last node in their 811 * particular level. To correct the invariant, we take values from their left 812 * neighbor until they are half full. They must have a left neighbor at their 813 * level because the last node at a level is not the first node unless it's 814 * the root. 815 */ 816 static void 817 zfs_btree_bulk_finish(zfs_btree_t *tree) 818 { 819 ASSERT3P(tree->bt_bulk, !=, NULL); 820 ASSERT3P(tree->bt_root, !=, NULL); 821 zfs_btree_leaf_t *leaf = tree->bt_bulk; 822 zfs_btree_hdr_t *hdr = &leaf->btl_hdr; 823 zfs_btree_core_t *parent = hdr->bth_parent; 824 uint64_t size = tree->bt_elem_size; 825 uint64_t capacity = P2ALIGN((BTREE_LEAF_SIZE - 826 sizeof (zfs_btree_hdr_t)) / size, 2); 827 828 /* 829 * The invariant doesn't apply to the root node, if that's the only 830 * node in the tree we're done. 831 */ 832 if (parent == NULL) { 833 tree->bt_bulk = NULL; 834 return; 835 } 836 837 /* First, take elements to rebalance the leaf node. */ 838 if (hdr->bth_count < capacity / 2) { 839 /* 840 * First, find the left neighbor. The simplest way to do this 841 * is to call zfs_btree_prev twice; the first time finds some 842 * ancestor of this node, and the second time finds the left 843 * neighbor. The ancestor found is the lowest common ancestor 844 * of leaf and the neighbor. 845 */ 846 zfs_btree_index_t idx = { 847 .bti_node = hdr, 848 .bti_offset = 0 849 }; 850 VERIFY3P(zfs_btree_prev(tree, &idx, &idx), !=, NULL); 851 ASSERT(idx.bti_node->bth_core); 852 zfs_btree_core_t *common = (zfs_btree_core_t *)idx.bti_node; 853 uint64_t common_idx = idx.bti_offset; 854 855 VERIFY3P(zfs_btree_prev(tree, &idx, &idx), !=, NULL); 856 ASSERT(!idx.bti_node->bth_core); 857 zfs_btree_leaf_t *l_neighbor = (zfs_btree_leaf_t *)idx.bti_node; 858 zfs_btree_hdr_t *l_hdr = idx.bti_node; 859 uint64_t move_count = (capacity / 2) - hdr->bth_count; 860 ASSERT3U(l_neighbor->btl_hdr.bth_count - move_count, >=, 861 capacity / 2); 862 863 if (zfs_btree_verify_intensity >= 5) { 864 for (int i = 0; i < move_count; i++) { 865 zfs_btree_verify_poison_at(tree, hdr, 866 leaf->btl_hdr.bth_count + i); 867 } 868 } 869 870 /* First, shift elements in leaf back. */ 871 bt_shift_leaf(tree, leaf, 0, hdr->bth_count, move_count, 872 BSD_RIGHT); 873 874 /* Next, move the separator from the common ancestor to leaf. */ 875 uint8_t *separator = common->btc_elems + (common_idx * size); 876 uint8_t *out = leaf->btl_elems + ((move_count - 1) * size); 877 bmov(separator, out, size); 878 move_count--; 879 880 /* 881 * Now we move elements from the tail of the left neighbor to 882 * fill the remaining spots in leaf. 883 */ 884 bt_transfer_leaf(tree, l_neighbor, l_hdr->bth_count - 885 move_count, move_count, leaf, 0); 886 887 /* 888 * Finally, move the new last element in the left neighbor to 889 * the separator. 890 */ 891 bmov(l_neighbor->btl_elems + (l_hdr->bth_count - 892 move_count - 1) * size, separator, size); 893 894 /* Adjust the node's counts, and we're done. */ 895 l_hdr->bth_count -= move_count + 1; 896 hdr->bth_count += move_count + 1; 897 898 ASSERT3U(l_hdr->bth_count, >=, capacity / 2); 899 ASSERT3U(hdr->bth_count, >=, capacity / 2); 900 zfs_btree_poison_node(tree, l_hdr); 901 } 902 903 /* 904 * Now we have to rebalance any ancestors of leaf that may also 905 * violate the invariant. 906 */ 907 capacity = BTREE_CORE_ELEMS; 908 while (parent->btc_hdr.bth_parent != NULL) { 909 zfs_btree_core_t *cur = parent; 910 zfs_btree_hdr_t *hdr = &cur->btc_hdr; 911 parent = hdr->bth_parent; 912 /* 913 * If the invariant isn't violated, move on to the next 914 * ancestor. 915 */ 916 if (hdr->bth_count >= capacity / 2) 917 continue; 918 919 /* 920 * Because the smallest number of nodes we can move when 921 * splitting is 2, we never need to worry about not having a 922 * left sibling (a sibling is a neighbor with the same parent). 923 */ 924 uint64_t parent_idx = zfs_btree_find_parent_idx(tree, hdr); 925 ASSERT3U(parent_idx, >, 0); 926 zfs_btree_core_t *l_neighbor = 927 (zfs_btree_core_t *)parent->btc_children[parent_idx - 1]; 928 uint64_t move_count = (capacity / 2) - hdr->bth_count; 929 ASSERT3U(l_neighbor->btc_hdr.bth_count - move_count, >=, 930 capacity / 2); 931 932 if (zfs_btree_verify_intensity >= 5) { 933 for (int i = 0; i < move_count; i++) { 934 zfs_btree_verify_poison_at(tree, hdr, 935 hdr->bth_count + i); 936 } 937 } 938 /* First, shift things in the right node back. */ 939 bt_shift_core(tree, cur, 0, hdr->bth_count, move_count, 940 BSS_TRAPEZOID, BSD_RIGHT); 941 942 /* Next, move the separator to the right node. */ 943 uint8_t *separator = parent->btc_elems + ((parent_idx - 1) * 944 size); 945 uint8_t *e_out = cur->btc_elems + ((move_count - 1) * size); 946 bmov(separator, e_out, size); 947 948 /* 949 * Now, move elements and children from the left node to the 950 * right. We move one more child than elements. 951 */ 952 move_count--; 953 uint64_t move_idx = l_neighbor->btc_hdr.bth_count - move_count; 954 bt_transfer_core(tree, l_neighbor, move_idx, move_count, cur, 0, 955 BSS_TRAPEZOID); 956 957 /* 958 * Finally, move the last element in the left node to the 959 * separator's position. 960 */ 961 move_idx--; 962 bmov(l_neighbor->btc_elems + move_idx * size, separator, size); 963 964 l_neighbor->btc_hdr.bth_count -= move_count + 1; 965 hdr->bth_count += move_count + 1; 966 967 ASSERT3U(l_neighbor->btc_hdr.bth_count, >=, capacity / 2); 968 ASSERT3U(hdr->bth_count, >=, capacity / 2); 969 970 zfs_btree_poison_node(tree, &l_neighbor->btc_hdr); 971 972 for (int i = 0; i <= hdr->bth_count; i++) 973 cur->btc_children[i]->bth_parent = cur; 974 } 975 976 tree->bt_bulk = NULL; 977 } 978 979 /* 980 * Insert value into tree at the location specified by where. 981 */ 982 void 983 zfs_btree_add_idx(zfs_btree_t *tree, const void *value, 984 const zfs_btree_index_t *where) 985 { 986 zfs_btree_index_t idx = {0}; 987 988 /* If we're not inserting in the last leaf, end bulk insert mode. */ 989 if (tree->bt_bulk != NULL) { 990 if (where->bti_node != &tree->bt_bulk->btl_hdr) { 991 zfs_btree_bulk_finish(tree); 992 VERIFY3P(zfs_btree_find(tree, value, &idx), ==, NULL); 993 where = &idx; 994 } 995 } 996 997 tree->bt_num_elems++; 998 /* 999 * If this is the first element in the tree, create a leaf root node 1000 * and add the value to it. 1001 */ 1002 if (where->bti_node == NULL) { 1003 ASSERT3U(tree->bt_num_elems, ==, 1); 1004 ASSERT3S(tree->bt_height, ==, -1); 1005 ASSERT3P(tree->bt_root, ==, NULL); 1006 ASSERT0(where->bti_offset); 1007 1008 tree->bt_num_nodes++; 1009 zfs_btree_leaf_t *leaf = kmem_cache_alloc(zfs_btree_leaf_cache, 1010 KM_SLEEP); 1011 tree->bt_root = &leaf->btl_hdr; 1012 tree->bt_height++; 1013 1014 zfs_btree_hdr_t *hdr = &leaf->btl_hdr; 1015 hdr->bth_parent = NULL; 1016 hdr->bth_core = B_FALSE; 1017 hdr->bth_count = 0; 1018 zfs_btree_poison_node(tree, hdr); 1019 1020 zfs_btree_insert_into_leaf(tree, leaf, value, 0); 1021 tree->bt_bulk = leaf; 1022 } else if (!where->bti_node->bth_core) { 1023 /* 1024 * If we're inserting into a leaf, go directly to the helper 1025 * function. 1026 */ 1027 zfs_btree_insert_into_leaf(tree, 1028 (zfs_btree_leaf_t *)where->bti_node, value, 1029 where->bti_offset); 1030 } else { 1031 /* 1032 * If we're inserting into a core node, we can't just shift 1033 * the existing element in that slot in the same node without 1034 * breaking our ordering invariants. Instead we place the new 1035 * value in the node at that spot and then insert the old 1036 * separator into the first slot in the subtree to the right. 1037 */ 1038 ASSERT(where->bti_node->bth_core); 1039 zfs_btree_core_t *node = (zfs_btree_core_t *)where->bti_node; 1040 1041 /* 1042 * We can ignore bti_before, because either way the value 1043 * should end up in bti_offset. 1044 */ 1045 uint64_t off = where->bti_offset; 1046 zfs_btree_hdr_t *subtree = node->btc_children[off + 1]; 1047 size_t size = tree->bt_elem_size; 1048 uint8_t *buf = kmem_alloc(size, KM_SLEEP); 1049 bmov(node->btc_elems + off * size, buf, size); 1050 bmov(value, node->btc_elems + off * size, size); 1051 1052 /* 1053 * Find the first slot in the subtree to the right, insert 1054 * there. 1055 */ 1056 zfs_btree_index_t new_idx; 1057 VERIFY3P(zfs_btree_first_helper(subtree, &new_idx), !=, NULL); 1058 ASSERT0(new_idx.bti_offset); 1059 ASSERT(!new_idx.bti_node->bth_core); 1060 zfs_btree_insert_into_leaf(tree, 1061 (zfs_btree_leaf_t *)new_idx.bti_node, buf, 0); 1062 kmem_free(buf, size); 1063 } 1064 zfs_btree_verify(tree); 1065 } 1066 1067 /* 1068 * Return the first element in the tree, and put its location in where if 1069 * non-null. 1070 */ 1071 void * 1072 zfs_btree_first(zfs_btree_t *tree, zfs_btree_index_t *where) 1073 { 1074 if (tree->bt_height == -1) { 1075 ASSERT0(tree->bt_num_elems); 1076 return (NULL); 1077 } 1078 return (zfs_btree_first_helper(tree->bt_root, where)); 1079 } 1080 1081 /* 1082 * Find the last element in the subtree rooted at hdr, return its value and 1083 * put its location in where if non-null. 1084 */ 1085 static void * 1086 zfs_btree_last_helper(zfs_btree_t *btree, zfs_btree_hdr_t *hdr, 1087 zfs_btree_index_t *where) 1088 { 1089 zfs_btree_hdr_t *node; 1090 1091 for (node = hdr; node->bth_core; node = 1092 ((zfs_btree_core_t *)node)->btc_children[node->bth_count]) 1093 ; 1094 1095 zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)node; 1096 if (where != NULL) { 1097 where->bti_node = node; 1098 where->bti_offset = node->bth_count - 1; 1099 where->bti_before = B_FALSE; 1100 } 1101 return (leaf->btl_elems + (node->bth_count - 1) * btree->bt_elem_size); 1102 } 1103 1104 /* 1105 * Return the last element in the tree, and put its location in where if 1106 * non-null. 1107 */ 1108 void * 1109 zfs_btree_last(zfs_btree_t *tree, zfs_btree_index_t *where) 1110 { 1111 if (tree->bt_height == -1) { 1112 ASSERT0(tree->bt_num_elems); 1113 return (NULL); 1114 } 1115 return (zfs_btree_last_helper(tree, tree->bt_root, where)); 1116 } 1117 1118 /* 1119 * This function contains the logic to find the next node in the tree. A 1120 * helper function is used because there are multiple internal consumemrs of 1121 * this logic. The done_func is used by zfs_btree_destroy_nodes to clean up each 1122 * node after we've finished with it. 1123 */ 1124 static void * 1125 zfs_btree_next_helper(zfs_btree_t *tree, const zfs_btree_index_t *idx, 1126 zfs_btree_index_t *out_idx, 1127 void (*done_func)(zfs_btree_t *, zfs_btree_hdr_t *)) 1128 { 1129 if (idx->bti_node == NULL) { 1130 ASSERT3S(tree->bt_height, ==, -1); 1131 return (NULL); 1132 } 1133 1134 uint64_t offset = idx->bti_offset; 1135 if (!idx->bti_node->bth_core) { 1136 /* 1137 * When finding the next element of an element in a leaf, 1138 * there are two cases. If the element isn't the last one in 1139 * the leaf, in which case we just return the next element in 1140 * the leaf. Otherwise, we need to traverse up our parents 1141 * until we find one where our ancestor isn't the last child 1142 * of its parent. Once we do, the next element is the 1143 * separator after our ancestor in its parent. 1144 */ 1145 zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node; 1146 uint64_t new_off = offset + (idx->bti_before ? 0 : 1); 1147 if (leaf->btl_hdr.bth_count > new_off) { 1148 out_idx->bti_node = &leaf->btl_hdr; 1149 out_idx->bti_offset = new_off; 1150 out_idx->bti_before = B_FALSE; 1151 return (leaf->btl_elems + new_off * tree->bt_elem_size); 1152 } 1153 1154 zfs_btree_hdr_t *prev = &leaf->btl_hdr; 1155 for (zfs_btree_core_t *node = leaf->btl_hdr.bth_parent; 1156 node != NULL; node = node->btc_hdr.bth_parent) { 1157 zfs_btree_hdr_t *hdr = &node->btc_hdr; 1158 ASSERT(hdr->bth_core); 1159 uint64_t i = zfs_btree_find_parent_idx(tree, prev); 1160 if (done_func != NULL) 1161 done_func(tree, prev); 1162 if (i == hdr->bth_count) { 1163 prev = hdr; 1164 continue; 1165 } 1166 out_idx->bti_node = hdr; 1167 out_idx->bti_offset = i; 1168 out_idx->bti_before = B_FALSE; 1169 return (node->btc_elems + i * tree->bt_elem_size); 1170 } 1171 if (done_func != NULL) 1172 done_func(tree, prev); 1173 /* 1174 * We've traversed all the way up and been at the end of the 1175 * node every time, so this was the last element in the tree. 1176 */ 1177 return (NULL); 1178 } 1179 1180 /* If we were before an element in a core node, return that element. */ 1181 ASSERT(idx->bti_node->bth_core); 1182 zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node; 1183 if (idx->bti_before) { 1184 out_idx->bti_before = B_FALSE; 1185 return (node->btc_elems + offset * tree->bt_elem_size); 1186 } 1187 1188 /* 1189 * The next element from one in a core node is the first element in 1190 * the subtree just to the right of the separator. 1191 */ 1192 zfs_btree_hdr_t *child = node->btc_children[offset + 1]; 1193 return (zfs_btree_first_helper(child, out_idx)); 1194 } 1195 1196 /* 1197 * Return the next valued node in the tree. The same address can be safely 1198 * passed for idx and out_idx. 1199 */ 1200 void * 1201 zfs_btree_next(zfs_btree_t *tree, const zfs_btree_index_t *idx, 1202 zfs_btree_index_t *out_idx) 1203 { 1204 return (zfs_btree_next_helper(tree, idx, out_idx, NULL)); 1205 } 1206 1207 /* 1208 * Return the previous valued node in the tree. The same value can be safely 1209 * passed for idx and out_idx. 1210 */ 1211 void * 1212 zfs_btree_prev(zfs_btree_t *tree, const zfs_btree_index_t *idx, 1213 zfs_btree_index_t *out_idx) 1214 { 1215 if (idx->bti_node == NULL) { 1216 ASSERT3S(tree->bt_height, ==, -1); 1217 return (NULL); 1218 } 1219 1220 uint64_t offset = idx->bti_offset; 1221 if (!idx->bti_node->bth_core) { 1222 /* 1223 * When finding the previous element of an element in a leaf, 1224 * there are two cases. If the element isn't the first one in 1225 * the leaf, in which case we just return the previous element 1226 * in the leaf. Otherwise, we need to traverse up our parents 1227 * until we find one where our previous ancestor isn't the 1228 * first child. Once we do, the previous element is the 1229 * separator after our previous ancestor. 1230 */ 1231 zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node; 1232 if (offset != 0) { 1233 out_idx->bti_node = &leaf->btl_hdr; 1234 out_idx->bti_offset = offset - 1; 1235 out_idx->bti_before = B_FALSE; 1236 return (leaf->btl_elems + (offset - 1) * 1237 tree->bt_elem_size); 1238 } 1239 zfs_btree_hdr_t *prev = &leaf->btl_hdr; 1240 for (zfs_btree_core_t *node = leaf->btl_hdr.bth_parent; 1241 node != NULL; node = node->btc_hdr.bth_parent) { 1242 zfs_btree_hdr_t *hdr = &node->btc_hdr; 1243 ASSERT(hdr->bth_core); 1244 uint64_t i = zfs_btree_find_parent_idx(tree, prev); 1245 if (i == 0) { 1246 prev = hdr; 1247 continue; 1248 } 1249 out_idx->bti_node = hdr; 1250 out_idx->bti_offset = i - 1; 1251 out_idx->bti_before = B_FALSE; 1252 return (node->btc_elems + (i - 1) * tree->bt_elem_size); 1253 } 1254 /* 1255 * We've traversed all the way up and been at the start of the 1256 * node every time, so this was the first node in the tree. 1257 */ 1258 return (NULL); 1259 } 1260 1261 /* 1262 * The previous element from one in a core node is the last element in 1263 * the subtree just to the left of the separator. 1264 */ 1265 ASSERT(idx->bti_node->bth_core); 1266 zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node; 1267 zfs_btree_hdr_t *child = node->btc_children[offset]; 1268 return (zfs_btree_last_helper(tree, child, out_idx)); 1269 } 1270 1271 /* 1272 * Get the value at the provided index in the tree. 1273 * 1274 * Note that the value returned from this function can be mutated, but only 1275 * if it will not change the ordering of the element with respect to any other 1276 * elements that could be in the tree. 1277 */ 1278 void * 1279 zfs_btree_get(zfs_btree_t *tree, zfs_btree_index_t *idx) 1280 { 1281 ASSERT(!idx->bti_before); 1282 if (!idx->bti_node->bth_core) { 1283 zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node; 1284 return (leaf->btl_elems + idx->bti_offset * tree->bt_elem_size); 1285 } 1286 ASSERT(idx->bti_node->bth_core); 1287 zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node; 1288 return (node->btc_elems + idx->bti_offset * tree->bt_elem_size); 1289 } 1290 1291 /* Add the given value to the tree. Must not already be in the tree. */ 1292 void 1293 zfs_btree_add(zfs_btree_t *tree, const void *node) 1294 { 1295 zfs_btree_index_t where = {0}; 1296 VERIFY3P(zfs_btree_find(tree, node, &where), ==, NULL); 1297 zfs_btree_add_idx(tree, node, &where); 1298 } 1299 1300 /* Helper function to free a tree node. */ 1301 static void 1302 zfs_btree_node_destroy(zfs_btree_t *tree, zfs_btree_hdr_t *node) 1303 { 1304 tree->bt_num_nodes--; 1305 if (!node->bth_core) { 1306 kmem_cache_free(zfs_btree_leaf_cache, node); 1307 } else { 1308 kmem_free(node, sizeof (zfs_btree_core_t) + 1309 BTREE_CORE_ELEMS * tree->bt_elem_size); 1310 } 1311 } 1312 1313 /* 1314 * Remove the rm_hdr and the separator to its left from the parent node. The 1315 * buffer that rm_hdr was stored in may already be freed, so its contents 1316 * cannot be accessed. 1317 */ 1318 static void 1319 zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node, 1320 zfs_btree_hdr_t *rm_hdr) 1321 { 1322 size_t size = tree->bt_elem_size; 1323 uint64_t min_count = (BTREE_CORE_ELEMS / 2) - 1; 1324 zfs_btree_hdr_t *hdr = &node->btc_hdr; 1325 /* 1326 * If the node is the root node and rm_hdr is one of two children, 1327 * promote the other child to the root. 1328 */ 1329 if (hdr->bth_parent == NULL && hdr->bth_count <= 1) { 1330 ASSERT3U(hdr->bth_count, ==, 1); 1331 ASSERT3P(tree->bt_root, ==, node); 1332 ASSERT3P(node->btc_children[1], ==, rm_hdr); 1333 tree->bt_root = node->btc_children[0]; 1334 node->btc_children[0]->bth_parent = NULL; 1335 zfs_btree_node_destroy(tree, hdr); 1336 tree->bt_height--; 1337 return; 1338 } 1339 1340 uint64_t idx; 1341 for (idx = 0; idx <= hdr->bth_count; idx++) { 1342 if (node->btc_children[idx] == rm_hdr) 1343 break; 1344 } 1345 ASSERT3U(idx, <=, hdr->bth_count); 1346 1347 /* 1348 * If the node is the root or it has more than the minimum number of 1349 * children, just remove the child and separator, and return. 1350 */ 1351 if (hdr->bth_parent == NULL || 1352 hdr->bth_count > min_count) { 1353 /* 1354 * Shift the element and children to the right of rm_hdr to 1355 * the left by one spot. 1356 */ 1357 bt_shift_core_left(tree, node, idx, hdr->bth_count - idx, 1358 BSS_PARALLELOGRAM); 1359 hdr->bth_count--; 1360 zfs_btree_poison_node_at(tree, hdr, hdr->bth_count); 1361 return; 1362 } 1363 1364 ASSERT3U(hdr->bth_count, ==, min_count); 1365 1366 /* 1367 * Now we try to take a node from a neighbor. We check left, then 1368 * right. If the neighbor exists and has more than the minimum number 1369 * of elements, we move the separator betweeen us and them to our 1370 * node, move their closest element (last for left, first for right) 1371 * to the separator, and move their closest child to our node. Along 1372 * the way we need to collapse the gap made by idx, and (for our right 1373 * neighbor) the gap made by removing their first element and child. 1374 * 1375 * Note: this logic currently doesn't support taking from a neighbor 1376 * that isn't a sibling (i.e. a neighbor with a different 1377 * parent). This isn't critical functionality, but may be worth 1378 * implementing in the future for completeness' sake. 1379 */ 1380 zfs_btree_core_t *parent = hdr->bth_parent; 1381 uint64_t parent_idx = zfs_btree_find_parent_idx(tree, hdr); 1382 1383 zfs_btree_hdr_t *l_hdr = (parent_idx == 0 ? NULL : 1384 parent->btc_children[parent_idx - 1]); 1385 if (l_hdr != NULL && l_hdr->bth_count > min_count) { 1386 /* We can take a node from the left neighbor. */ 1387 ASSERT(l_hdr->bth_core); 1388 zfs_btree_core_t *neighbor = (zfs_btree_core_t *)l_hdr; 1389 1390 /* 1391 * Start by shifting the elements and children in the current 1392 * node to the right by one spot. 1393 */ 1394 bt_shift_core_right(tree, node, 0, idx - 1, BSS_TRAPEZOID); 1395 1396 /* 1397 * Move the separator between node and neighbor to the first 1398 * element slot in the current node. 1399 */ 1400 uint8_t *separator = parent->btc_elems + (parent_idx - 1) * 1401 size; 1402 bmov(separator, node->btc_elems, size); 1403 1404 /* Move the last child of neighbor to our first child slot. */ 1405 zfs_btree_hdr_t **take_child = neighbor->btc_children + 1406 l_hdr->bth_count; 1407 bmov(take_child, node->btc_children, sizeof (*take_child)); 1408 node->btc_children[0]->bth_parent = node; 1409 1410 /* Move the last element of neighbor to the separator spot. */ 1411 uint8_t *take_elem = neighbor->btc_elems + 1412 (l_hdr->bth_count - 1) * size; 1413 bmov(take_elem, separator, size); 1414 l_hdr->bth_count--; 1415 zfs_btree_poison_node_at(tree, l_hdr, l_hdr->bth_count); 1416 return; 1417 } 1418 1419 zfs_btree_hdr_t *r_hdr = (parent_idx == parent->btc_hdr.bth_count ? 1420 NULL : parent->btc_children[parent_idx + 1]); 1421 if (r_hdr != NULL && r_hdr->bth_count > min_count) { 1422 /* We can take a node from the right neighbor. */ 1423 ASSERT(r_hdr->bth_core); 1424 zfs_btree_core_t *neighbor = (zfs_btree_core_t *)r_hdr; 1425 1426 /* 1427 * Shift elements in node left by one spot to overwrite rm_hdr 1428 * and the separator before it. 1429 */ 1430 bt_shift_core_left(tree, node, idx, hdr->bth_count - idx, 1431 BSS_PARALLELOGRAM); 1432 1433 /* 1434 * Move the separator between node and neighbor to the last 1435 * element spot in node. 1436 */ 1437 uint8_t *separator = parent->btc_elems + parent_idx * size; 1438 bmov(separator, node->btc_elems + (hdr->bth_count - 1) * size, 1439 size); 1440 1441 /* 1442 * Move the first child of neighbor to the last child spot in 1443 * node. 1444 */ 1445 zfs_btree_hdr_t **take_child = neighbor->btc_children; 1446 bmov(take_child, node->btc_children + hdr->bth_count, 1447 sizeof (*take_child)); 1448 node->btc_children[hdr->bth_count]->bth_parent = node; 1449 1450 /* Move the first element of neighbor to the separator spot. */ 1451 uint8_t *take_elem = neighbor->btc_elems; 1452 bmov(take_elem, separator, size); 1453 r_hdr->bth_count--; 1454 1455 /* 1456 * Shift the elements and children of neighbor to cover the 1457 * stolen elements. 1458 */ 1459 bt_shift_core_left(tree, neighbor, 1, r_hdr->bth_count, 1460 BSS_TRAPEZOID); 1461 zfs_btree_poison_node_at(tree, r_hdr, r_hdr->bth_count); 1462 return; 1463 } 1464 1465 /* 1466 * In this case, neither of our neighbors can spare an element, so we 1467 * need to merge with one of them. We prefer the left one, 1468 * arabitrarily. Move the separator into the leftmost merging node 1469 * (which may be us or the left neighbor), and then move the right 1470 * merging node's elements. Once that's done, we go back and delete 1471 * the element we're removing. Finally, go into the parent and delete 1472 * the right merging node and the separator. This may cause further 1473 * merging. 1474 */ 1475 zfs_btree_hdr_t *new_rm_hdr, *keep_hdr; 1476 uint64_t new_idx = idx; 1477 if (l_hdr != NULL) { 1478 keep_hdr = l_hdr; 1479 new_rm_hdr = hdr; 1480 new_idx += keep_hdr->bth_count + 1; 1481 } else { 1482 ASSERT3P(r_hdr, !=, NULL); 1483 keep_hdr = hdr; 1484 new_rm_hdr = r_hdr; 1485 parent_idx++; 1486 } 1487 1488 ASSERT(keep_hdr->bth_core); 1489 ASSERT(new_rm_hdr->bth_core); 1490 1491 zfs_btree_core_t *keep = (zfs_btree_core_t *)keep_hdr; 1492 zfs_btree_core_t *rm = (zfs_btree_core_t *)new_rm_hdr; 1493 1494 if (zfs_btree_verify_intensity >= 5) { 1495 for (int i = 0; i < new_rm_hdr->bth_count + 1; i++) { 1496 zfs_btree_verify_poison_at(tree, keep_hdr, 1497 keep_hdr->bth_count + i); 1498 } 1499 } 1500 1501 /* Move the separator into the left node. */ 1502 uint8_t *e_out = keep->btc_elems + keep_hdr->bth_count * size; 1503 uint8_t *separator = parent->btc_elems + (parent_idx - 1) * 1504 size; 1505 bmov(separator, e_out, size); 1506 keep_hdr->bth_count++; 1507 1508 /* Move all our elements and children into the left node. */ 1509 bt_transfer_core(tree, rm, 0, new_rm_hdr->bth_count, keep, 1510 keep_hdr->bth_count, BSS_TRAPEZOID); 1511 1512 uint64_t old_count = keep_hdr->bth_count; 1513 1514 /* Update bookkeeping */ 1515 keep_hdr->bth_count += new_rm_hdr->bth_count; 1516 ASSERT3U(keep_hdr->bth_count, ==, (min_count * 2) + 1); 1517 1518 /* 1519 * Shift the element and children to the right of rm_hdr to 1520 * the left by one spot. 1521 */ 1522 ASSERT3P(keep->btc_children[new_idx], ==, rm_hdr); 1523 bt_shift_core_left(tree, keep, new_idx, keep_hdr->bth_count - new_idx, 1524 BSS_PARALLELOGRAM); 1525 keep_hdr->bth_count--; 1526 1527 /* Reparent all our children to point to the left node. */ 1528 zfs_btree_hdr_t **new_start = keep->btc_children + 1529 old_count - 1; 1530 for (int i = 0; i < new_rm_hdr->bth_count + 1; i++) 1531 new_start[i]->bth_parent = keep; 1532 for (int i = 0; i <= keep_hdr->bth_count; i++) { 1533 ASSERT3P(keep->btc_children[i]->bth_parent, ==, keep); 1534 ASSERT3P(keep->btc_children[i], !=, rm_hdr); 1535 } 1536 zfs_btree_poison_node_at(tree, keep_hdr, keep_hdr->bth_count); 1537 1538 new_rm_hdr->bth_count = 0; 1539 zfs_btree_node_destroy(tree, new_rm_hdr); 1540 zfs_btree_remove_from_node(tree, parent, new_rm_hdr); 1541 } 1542 1543 /* Remove the element at the specific location. */ 1544 void 1545 zfs_btree_remove_idx(zfs_btree_t *tree, zfs_btree_index_t *where) 1546 { 1547 size_t size = tree->bt_elem_size; 1548 zfs_btree_hdr_t *hdr = where->bti_node; 1549 uint64_t idx = where->bti_offset; 1550 uint64_t capacity = P2ALIGN((BTREE_LEAF_SIZE - 1551 sizeof (zfs_btree_hdr_t)) / size, 2); 1552 1553 ASSERT(!where->bti_before); 1554 if (tree->bt_bulk != NULL) { 1555 /* 1556 * Leave bulk insert mode. Note that our index would be 1557 * invalid after we correct the tree, so we copy the value 1558 * we're planning to remove and find it again after 1559 * bulk_finish. 1560 */ 1561 uint8_t *value = zfs_btree_get(tree, where); 1562 uint8_t *tmp = kmem_alloc(size, KM_SLEEP); 1563 bmov(value, tmp, size); 1564 zfs_btree_bulk_finish(tree); 1565 VERIFY3P(zfs_btree_find(tree, tmp, where), !=, NULL); 1566 kmem_free(tmp, size); 1567 hdr = where->bti_node; 1568 idx = where->bti_offset; 1569 } 1570 1571 tree->bt_num_elems--; 1572 /* 1573 * If the element happens to be in a core node, we move a leaf node's 1574 * element into its place and then remove the leaf node element. This 1575 * makes the rebalance logic not need to be recursive both upwards and 1576 * downwards. 1577 */ 1578 if (hdr->bth_core) { 1579 zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; 1580 zfs_btree_hdr_t *left_subtree = node->btc_children[idx]; 1581 void *new_value = zfs_btree_last_helper(tree, left_subtree, 1582 where); 1583 ASSERT3P(new_value, !=, NULL); 1584 1585 bmov(new_value, node->btc_elems + idx * size, size); 1586 1587 hdr = where->bti_node; 1588 idx = where->bti_offset; 1589 ASSERT(!where->bti_before); 1590 } 1591 1592 /* 1593 * First, we'll update the leaf's metadata. Then, we shift any 1594 * elements after the idx to the left. After that, we rebalance if 1595 * needed. 1596 */ 1597 ASSERT(!hdr->bth_core); 1598 zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; 1599 ASSERT3U(hdr->bth_count, >, 0); 1600 1601 uint64_t min_count = (capacity / 2) - 1; 1602 1603 /* 1604 * If we're over the minimum size or this is the root, just overwrite 1605 * the value and return. 1606 */ 1607 if (hdr->bth_count > min_count || hdr->bth_parent == NULL) { 1608 hdr->bth_count--; 1609 bt_shift_leaf_left(tree, leaf, idx + 1, hdr->bth_count - idx); 1610 if (hdr->bth_parent == NULL) { 1611 ASSERT0(tree->bt_height); 1612 if (hdr->bth_count == 0) { 1613 tree->bt_root = NULL; 1614 tree->bt_height--; 1615 zfs_btree_node_destroy(tree, &leaf->btl_hdr); 1616 } 1617 } 1618 if (tree->bt_root != NULL) 1619 zfs_btree_poison_node_at(tree, hdr, hdr->bth_count); 1620 zfs_btree_verify(tree); 1621 return; 1622 } 1623 ASSERT3U(hdr->bth_count, ==, min_count); 1624 1625 /* 1626 * Now we try to take a node from a sibling. We check left, then 1627 * right. If they exist and have more than the minimum number of 1628 * elements, we move the separator betweeen us and them to our node 1629 * and move their closest element (last for left, first for right) to 1630 * the separator. Along the way we need to collapse the gap made by 1631 * idx, and (for our right neighbor) the gap made by removing their 1632 * first element. 1633 * 1634 * Note: this logic currently doesn't support taking from a neighbor 1635 * that isn't a sibling. This isn't critical functionality, but may be 1636 * worth implementing in the future for completeness' sake. 1637 */ 1638 zfs_btree_core_t *parent = hdr->bth_parent; 1639 uint64_t parent_idx = zfs_btree_find_parent_idx(tree, hdr); 1640 1641 zfs_btree_hdr_t *l_hdr = (parent_idx == 0 ? NULL : 1642 parent->btc_children[parent_idx - 1]); 1643 if (l_hdr != NULL && l_hdr->bth_count > min_count) { 1644 /* We can take a node from the left neighbor. */ 1645 ASSERT(!l_hdr->bth_core); 1646 1647 /* 1648 * Move our elements back by one spot to make room for the 1649 * stolen element and overwrite the element being removed. 1650 */ 1651 bt_shift_leaf_right(tree, leaf, 0, idx); 1652 uint8_t *separator = parent->btc_elems + (parent_idx - 1) * 1653 size; 1654 uint8_t *take_elem = ((zfs_btree_leaf_t *)l_hdr)->btl_elems + 1655 (l_hdr->bth_count - 1) * size; 1656 /* Move the separator to our first spot. */ 1657 bmov(separator, leaf->btl_elems, size); 1658 1659 /* Move our neighbor's last element to the separator. */ 1660 bmov(take_elem, separator, size); 1661 1662 /* Update the bookkeeping. */ 1663 l_hdr->bth_count--; 1664 zfs_btree_poison_node_at(tree, l_hdr, l_hdr->bth_count); 1665 1666 zfs_btree_verify(tree); 1667 return; 1668 } 1669 1670 zfs_btree_hdr_t *r_hdr = (parent_idx == parent->btc_hdr.bth_count ? 1671 NULL : parent->btc_children[parent_idx + 1]); 1672 if (r_hdr != NULL && r_hdr->bth_count > min_count) { 1673 /* We can take a node from the right neighbor. */ 1674 ASSERT(!r_hdr->bth_core); 1675 zfs_btree_leaf_t *neighbor = (zfs_btree_leaf_t *)r_hdr; 1676 1677 /* 1678 * Move our elements after the element being removed forwards 1679 * by one spot to make room for the stolen element and 1680 * overwrite the element being removed. 1681 */ 1682 bt_shift_leaf_left(tree, leaf, idx + 1, hdr->bth_count - idx - 1683 1); 1684 1685 uint8_t *separator = parent->btc_elems + parent_idx * size; 1686 uint8_t *take_elem = ((zfs_btree_leaf_t *)r_hdr)->btl_elems; 1687 /* Move the separator between us to our last spot. */ 1688 bmov(separator, leaf->btl_elems + (hdr->bth_count - 1) * size, 1689 size); 1690 1691 /* Move our neighbor's first element to the separator. */ 1692 bmov(take_elem, separator, size); 1693 1694 /* Update the bookkeeping. */ 1695 r_hdr->bth_count--; 1696 1697 /* 1698 * Move our neighbors elements forwards to overwrite the 1699 * stolen element. 1700 */ 1701 bt_shift_leaf_left(tree, neighbor, 1, r_hdr->bth_count); 1702 zfs_btree_poison_node_at(tree, r_hdr, r_hdr->bth_count); 1703 zfs_btree_verify(tree); 1704 return; 1705 } 1706 1707 /* 1708 * In this case, neither of our neighbors can spare an element, so we 1709 * need to merge with one of them. We prefer the left one, 1710 * arabitrarily. Move the separator into the leftmost merging node 1711 * (which may be us or the left neighbor), and then move the right 1712 * merging node's elements. Once that's done, we go back and delete 1713 * the element we're removing. Finally, go into the parent and delete 1714 * the right merging node and the separator. This may cause further 1715 * merging. 1716 */ 1717 zfs_btree_hdr_t *rm_hdr, *keep_hdr; 1718 uint64_t new_idx = idx; 1719 if (l_hdr != NULL) { 1720 keep_hdr = l_hdr; 1721 rm_hdr = hdr; 1722 new_idx += keep_hdr->bth_count + 1; // 449 1723 } else { 1724 ASSERT3P(r_hdr, !=, NULL); 1725 keep_hdr = hdr; 1726 rm_hdr = r_hdr; 1727 parent_idx++; 1728 } 1729 1730 ASSERT(!keep_hdr->bth_core); 1731 ASSERT(!rm_hdr->bth_core); 1732 ASSERT3U(keep_hdr->bth_count, ==, min_count); 1733 ASSERT3U(rm_hdr->bth_count, ==, min_count); 1734 1735 zfs_btree_leaf_t *keep = (zfs_btree_leaf_t *)keep_hdr; 1736 zfs_btree_leaf_t *rm = (zfs_btree_leaf_t *)rm_hdr; 1737 1738 if (zfs_btree_verify_intensity >= 5) { 1739 for (int i = 0; i < rm_hdr->bth_count + 1; i++) { 1740 zfs_btree_verify_poison_at(tree, keep_hdr, 1741 keep_hdr->bth_count + i); 1742 } 1743 } 1744 /* 1745 * Move the separator into the first open spot in the left 1746 * neighbor. 1747 */ 1748 uint8_t *out = keep->btl_elems + keep_hdr->bth_count * size; 1749 uint8_t *separator = parent->btc_elems + (parent_idx - 1) * 1750 size; 1751 bmov(separator, out, size); 1752 keep_hdr->bth_count++; 1753 1754 /* Move our elements to the left neighbor. */ 1755 bt_transfer_leaf(tree, rm, 0, rm_hdr->bth_count, keep, 1756 keep_hdr->bth_count); 1757 1758 /* Update the bookkeeping. */ 1759 keep_hdr->bth_count += rm_hdr->bth_count; 1760 ASSERT3U(keep_hdr->bth_count, ==, min_count * 2 + 1); 1761 1762 /* Remove the value from the node */ 1763 keep_hdr->bth_count--; 1764 bt_shift_leaf_left(tree, keep, new_idx + 1, keep_hdr->bth_count - 1765 new_idx); 1766 zfs_btree_poison_node_at(tree, keep_hdr, keep_hdr->bth_count); 1767 1768 rm_hdr->bth_count = 0; 1769 zfs_btree_node_destroy(tree, rm_hdr); 1770 /* Remove the emptied node from the parent. */ 1771 zfs_btree_remove_from_node(tree, parent, rm_hdr); 1772 zfs_btree_verify(tree); 1773 } 1774 1775 /* Remove the given value from the tree. */ 1776 void 1777 zfs_btree_remove(zfs_btree_t *tree, const void *value) 1778 { 1779 zfs_btree_index_t where = {0}; 1780 VERIFY3P(zfs_btree_find(tree, value, &where), !=, NULL); 1781 zfs_btree_remove_idx(tree, &where); 1782 } 1783 1784 /* Return the number of elements in the tree. */ 1785 ulong_t 1786 zfs_btree_numnodes(zfs_btree_t *tree) 1787 { 1788 return (tree->bt_num_elems); 1789 } 1790 1791 /* 1792 * This function is used to visit all the elements in the tree before 1793 * destroying the tree. This allows the calling code to perform any cleanup it 1794 * needs to do. This is more efficient than just removing the first element 1795 * over and over, because it removes all rebalancing. Once the destroy_nodes() 1796 * function has been called, no other btree operations are valid until it 1797 * returns NULL, which point the only valid operation is zfs_btree_destroy(). 1798 * 1799 * example: 1800 * 1801 * zfs_btree_index_t *cookie = NULL; 1802 * my_data_t *node; 1803 * 1804 * while ((node = zfs_btree_destroy_nodes(tree, &cookie)) != NULL) 1805 * free(node->ptr); 1806 * zfs_btree_destroy(tree); 1807 * 1808 */ 1809 void * 1810 zfs_btree_destroy_nodes(zfs_btree_t *tree, zfs_btree_index_t **cookie) 1811 { 1812 if (*cookie == NULL) { 1813 if (tree->bt_height == -1) 1814 return (NULL); 1815 *cookie = kmem_alloc(sizeof (**cookie), KM_SLEEP); 1816 return (zfs_btree_first(tree, *cookie)); 1817 } 1818 1819 void *rval = zfs_btree_next_helper(tree, *cookie, *cookie, 1820 zfs_btree_node_destroy); 1821 if (rval == NULL) { 1822 tree->bt_root = NULL; 1823 tree->bt_height = -1; 1824 tree->bt_num_elems = 0; 1825 kmem_free(*cookie, sizeof (**cookie)); 1826 tree->bt_bulk = NULL; 1827 } 1828 return (rval); 1829 } 1830 1831 static void 1832 zfs_btree_clear_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) 1833 { 1834 if (hdr->bth_core) { 1835 zfs_btree_core_t *btc = (zfs_btree_core_t *)hdr; 1836 for (int i = 0; i <= hdr->bth_count; i++) { 1837 zfs_btree_clear_helper(tree, btc->btc_children[i]); 1838 } 1839 } 1840 1841 zfs_btree_node_destroy(tree, hdr); 1842 } 1843 1844 void 1845 zfs_btree_clear(zfs_btree_t *tree) 1846 { 1847 if (tree->bt_root == NULL) { 1848 ASSERT0(tree->bt_num_elems); 1849 return; 1850 } 1851 1852 zfs_btree_clear_helper(tree, tree->bt_root); 1853 tree->bt_num_elems = 0; 1854 tree->bt_root = NULL; 1855 tree->bt_num_nodes = 0; 1856 tree->bt_height = -1; 1857 tree->bt_bulk = NULL; 1858 } 1859 1860 void 1861 zfs_btree_destroy(zfs_btree_t *tree) 1862 { 1863 ASSERT0(tree->bt_num_elems); 1864 ASSERT3P(tree->bt_root, ==, NULL); 1865 } 1866 1867 /* Verify that every child of this node has the correct parent pointer. */ 1868 static void 1869 zfs_btree_verify_pointers_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) 1870 { 1871 if (!hdr->bth_core) 1872 return; 1873 1874 zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; 1875 for (int i = 0; i <= hdr->bth_count; i++) { 1876 VERIFY3P(node->btc_children[i]->bth_parent, ==, hdr); 1877 zfs_btree_verify_pointers_helper(tree, node->btc_children[i]); 1878 } 1879 } 1880 1881 /* Verify that every node has the correct parent pointer. */ 1882 static void 1883 zfs_btree_verify_pointers(zfs_btree_t *tree) 1884 { 1885 if (tree->bt_height == -1) { 1886 VERIFY3P(tree->bt_root, ==, NULL); 1887 return; 1888 } 1889 VERIFY3P(tree->bt_root->bth_parent, ==, NULL); 1890 zfs_btree_verify_pointers_helper(tree, tree->bt_root); 1891 } 1892 1893 /* 1894 * Verify that all the current node and its children satisfy the count 1895 * invariants, and return the total count in the subtree rooted in this node. 1896 */ 1897 static uint64_t 1898 zfs_btree_verify_counts_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) 1899 { 1900 if (!hdr->bth_core) { 1901 if (tree->bt_root != hdr && hdr != &tree->bt_bulk->btl_hdr) { 1902 uint64_t capacity = P2ALIGN((BTREE_LEAF_SIZE - 1903 sizeof (zfs_btree_hdr_t)) / tree->bt_elem_size, 2); 1904 VERIFY3U(hdr->bth_count, >=, (capacity / 2) - 1); 1905 } 1906 1907 return (hdr->bth_count); 1908 } else { 1909 1910 zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; 1911 uint64_t ret = hdr->bth_count; 1912 if (tree->bt_root != hdr && tree->bt_bulk == NULL) 1913 VERIFY3P(hdr->bth_count, >=, BTREE_CORE_ELEMS / 2 - 1); 1914 for (int i = 0; i <= hdr->bth_count; i++) { 1915 ret += zfs_btree_verify_counts_helper(tree, 1916 node->btc_children[i]); 1917 } 1918 1919 return (ret); 1920 } 1921 } 1922 1923 /* 1924 * Verify that all nodes satisfy the invariants and that the total number of 1925 * elements is correct. 1926 */ 1927 static void 1928 zfs_btree_verify_counts(zfs_btree_t *tree) 1929 { 1930 EQUIV(tree->bt_num_elems == 0, tree->bt_height == -1); 1931 if (tree->bt_height == -1) { 1932 return; 1933 } 1934 VERIFY3P(zfs_btree_verify_counts_helper(tree, tree->bt_root), ==, 1935 tree->bt_num_elems); 1936 } 1937 1938 /* 1939 * Check that the subtree rooted at this node has a uniform height. Returns 1940 * the number of nodes under this node, to help verify bt_num_nodes. 1941 */ 1942 static uint64_t 1943 zfs_btree_verify_height_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr, 1944 int64_t height) 1945 { 1946 if (!hdr->bth_core) { 1947 VERIFY0(height); 1948 return (1); 1949 } 1950 1951 VERIFY(hdr->bth_core); 1952 zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; 1953 uint64_t ret = 1; 1954 for (int i = 0; i <= hdr->bth_count; i++) { 1955 ret += zfs_btree_verify_height_helper(tree, 1956 node->btc_children[i], height - 1); 1957 } 1958 return (ret); 1959 } 1960 1961 /* 1962 * Check that the tree rooted at this node has a uniform height, and that the 1963 * bt_height in the tree is correct. 1964 */ 1965 static void 1966 zfs_btree_verify_height(zfs_btree_t *tree) 1967 { 1968 EQUIV(tree->bt_height == -1, tree->bt_root == NULL); 1969 if (tree->bt_height == -1) { 1970 return; 1971 } 1972 1973 VERIFY3U(zfs_btree_verify_height_helper(tree, tree->bt_root, 1974 tree->bt_height), ==, tree->bt_num_nodes); 1975 } 1976 1977 /* 1978 * Check that the elements in this node are sorted, and that if this is a core 1979 * node, the separators are properly between the subtrees they separaate and 1980 * that the children also satisfy this requirement. 1981 */ 1982 static void 1983 zfs_btree_verify_order_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) 1984 { 1985 size_t size = tree->bt_elem_size; 1986 if (!hdr->bth_core) { 1987 zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; 1988 for (int i = 1; i < hdr->bth_count; i++) { 1989 VERIFY3S(tree->bt_compar(leaf->btl_elems + (i - 1) * 1990 size, leaf->btl_elems + i * size), ==, -1); 1991 } 1992 return; 1993 } 1994 1995 zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; 1996 for (int i = 1; i < hdr->bth_count; i++) { 1997 VERIFY3S(tree->bt_compar(node->btc_elems + (i - 1) * size, 1998 node->btc_elems + i * size), ==, -1); 1999 } 2000 for (int i = 0; i < hdr->bth_count; i++) { 2001 uint8_t *left_child_last = NULL; 2002 zfs_btree_hdr_t *left_child_hdr = node->btc_children[i]; 2003 if (left_child_hdr->bth_core) { 2004 zfs_btree_core_t *left_child = 2005 (zfs_btree_core_t *)left_child_hdr; 2006 left_child_last = left_child->btc_elems + 2007 (left_child_hdr->bth_count - 1) * size; 2008 } else { 2009 zfs_btree_leaf_t *left_child = 2010 (zfs_btree_leaf_t *)left_child_hdr; 2011 left_child_last = left_child->btl_elems + 2012 (left_child_hdr->bth_count - 1) * size; 2013 } 2014 if (tree->bt_compar(node->btc_elems + i * size, 2015 left_child_last) != 1) { 2016 panic("btree: compar returned %d (expected 1) at " 2017 "%px %d: compar(%px, %px)", tree->bt_compar( 2018 node->btc_elems + i * size, left_child_last), 2019 (void *)node, i, (void *)(node->btc_elems + i * 2020 size), (void *)left_child_last); 2021 } 2022 2023 uint8_t *right_child_first = NULL; 2024 zfs_btree_hdr_t *right_child_hdr = node->btc_children[i + 1]; 2025 if (right_child_hdr->bth_core) { 2026 zfs_btree_core_t *right_child = 2027 (zfs_btree_core_t *)right_child_hdr; 2028 right_child_first = right_child->btc_elems; 2029 } else { 2030 zfs_btree_leaf_t *right_child = 2031 (zfs_btree_leaf_t *)right_child_hdr; 2032 right_child_first = right_child->btl_elems; 2033 } 2034 if (tree->bt_compar(node->btc_elems + i * size, 2035 right_child_first) != -1) { 2036 panic("btree: compar returned %d (expected -1) at " 2037 "%px %d: compar(%px, %px)", tree->bt_compar( 2038 node->btc_elems + i * size, right_child_first), 2039 (void *)node, i, (void *)(node->btc_elems + i * 2040 size), (void *)right_child_first); 2041 } 2042 } 2043 for (int i = 0; i <= hdr->bth_count; i++) { 2044 zfs_btree_verify_order_helper(tree, node->btc_children[i]); 2045 } 2046 } 2047 2048 /* Check that all elements in the tree are in sorted order. */ 2049 static void 2050 zfs_btree_verify_order(zfs_btree_t *tree) 2051 { 2052 EQUIV(tree->bt_height == -1, tree->bt_root == NULL); 2053 if (tree->bt_height == -1) { 2054 return; 2055 } 2056 2057 zfs_btree_verify_order_helper(tree, tree->bt_root); 2058 } 2059 2060 #ifdef ZFS_DEBUG 2061 /* Check that all unused memory is poisoned correctly. */ 2062 static void 2063 zfs_btree_verify_poison_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) 2064 { 2065 size_t size = tree->bt_elem_size; 2066 if (!hdr->bth_core) { 2067 zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; 2068 uint8_t val = 0x0f; 2069 for (int i = hdr->bth_count * size; i < BTREE_LEAF_SIZE - 2070 sizeof (zfs_btree_hdr_t); i++) { 2071 VERIFY3U(leaf->btl_elems[i], ==, val); 2072 } 2073 } else { 2074 zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; 2075 uint8_t val = 0x0f; 2076 for (int i = hdr->bth_count * size; i < BTREE_CORE_ELEMS * size; 2077 i++) { 2078 VERIFY3U(node->btc_elems[i], ==, val); 2079 } 2080 2081 for (int i = hdr->bth_count + 1; i <= BTREE_CORE_ELEMS; i++) { 2082 VERIFY3P(node->btc_children[i], ==, 2083 (zfs_btree_hdr_t *)BTREE_POISON); 2084 } 2085 2086 for (int i = 0; i <= hdr->bth_count; i++) { 2087 zfs_btree_verify_poison_helper(tree, 2088 node->btc_children[i]); 2089 } 2090 } 2091 } 2092 #endif 2093 2094 /* Check that unused memory in the tree is still poisoned. */ 2095 static void 2096 zfs_btree_verify_poison(zfs_btree_t *tree) 2097 { 2098 #ifdef ZFS_DEBUG 2099 if (tree->bt_height == -1) 2100 return; 2101 zfs_btree_verify_poison_helper(tree, tree->bt_root); 2102 #endif 2103 } 2104 2105 void 2106 zfs_btree_verify(zfs_btree_t *tree) 2107 { 2108 if (zfs_btree_verify_intensity == 0) 2109 return; 2110 zfs_btree_verify_height(tree); 2111 if (zfs_btree_verify_intensity == 1) 2112 return; 2113 zfs_btree_verify_pointers(tree); 2114 if (zfs_btree_verify_intensity == 2) 2115 return; 2116 zfs_btree_verify_counts(tree); 2117 if (zfs_btree_verify_intensity == 3) 2118 return; 2119 zfs_btree_verify_order(tree); 2120 2121 if (zfs_btree_verify_intensity == 4) 2122 return; 2123 zfs_btree_verify_poison(tree); 2124 } 2125