/*
 * CDDL HEADER START
 *
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2019 by Delphix. All rights reserved.
 */

#include <sys/btree.h>
#include <sys/bitops.h>
#include <sys/zfs_context.h>

kmem_cache_t *zfs_btree_leaf_cache;

/*
 * Control the extent of the verification that occurs when zfs_btree_verify is
 * called. Primarily used for debugging when extending the btree logic and
 * functionality. As the intensity is increased, new verification steps are
 * added. These steps are cumulative; intensity = 3 includes the intensity = 1
 * and intensity = 2 steps as well.
 *
 * Intensity 1: Verify that the tree's height is consistent throughout.
 * Intensity 2: Verify that a core node's children's parent pointers point
 * to the core node.
 * Intensity 3: Verify that the total number of elements in the tree matches
 * the sum of the number of elements in each node. Also verifies that each
 * node's count obeys the invariants (less than or equal to maximum value,
 * greater than or equal to half the maximum minus one).
 * Intensity 4: Verify that each element compares less than the element
 * immediately after it and greater than the one immediately before it using
 * the comparator function. For core nodes, also checks that each element is
 * greater than the last element in the first of the two nodes it separates,
 * and less than the first element in the second of the two nodes.
 * Intensity 5: Verifies, if ZFS_DEBUG is defined, that all unused memory
 * inside of each node is poisoned appropriately. Note that poisoning always
 * occurs if ZFS_DEBUG is set, so it is safe to set the intensity to 5 during
 * normal operation.
 *
 * Intensity 4 and 5 are particularly expensive to perform; the previous
 * levels are a few memory operations per node, while these levels require
 * multiple operations per element. In addition, when creating large btrees,
 * these operations are called at every step, resulting in extremely slow
 * operation (while the asymptotic complexity of the other steps is the same,
 * the importance of the constant factors cannot be denied).
 */
uint_t zfs_btree_verify_intensity = 0;

/*
 * Convenience functions to silence warnings from memcpy/memmove's
 * return values and change argument order to src, dest.
 */
static void
bcpy(const void *src, void *dest, size_t size)
{
	(void) memcpy(dest, src, size);
}

static void
bmov(const void *src, void *dest, size_t size)
{
	(void) memmove(dest, src, size);
}

static boolean_t
zfs_btree_is_core(struct zfs_btree_hdr *hdr)
{
	return (hdr->bth_first == -1);
}

#ifdef _ILP32
#define	BTREE_POISON 0xabadb10c
#else
#define	BTREE_POISON 0xabadb10cdeadbeef
#endif

static void
zfs_btree_poison_node(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
{
#ifdef ZFS_DEBUG
	size_t size = tree->bt_elem_size;
	if (zfs_btree_is_core(hdr)) {
		zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
		for (uint32_t i = hdr->bth_count + 1; i <= BTREE_CORE_ELEMS;
		    i++) {
			node->btc_children[i] =
			    (zfs_btree_hdr_t *)BTREE_POISON;
		}
		(void) memset(node->btc_elems + hdr->bth_count * size, 0x0f,
		    (BTREE_CORE_ELEMS - hdr->bth_count) * size);
	} else {
		zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
		(void) memset(leaf->btl_elems, 0x0f, hdr->bth_first * size);
		(void) memset(leaf->btl_elems +
		    (hdr->bth_first + hdr->bth_count) * size, 0x0f,
		    tree->bt_leaf_size - offsetof(zfs_btree_leaf_t, btl_elems) -
		    (hdr->bth_first + hdr->bth_count) * size);
	}
#endif
}

static inline void
zfs_btree_poison_node_at(zfs_btree_t *tree, zfs_btree_hdr_t *hdr,
    uint32_t idx, uint32_t count)
{
#ifdef ZFS_DEBUG
	size_t size = tree->bt_elem_size;
	if (zfs_btree_is_core(hdr)) {
		ASSERT3U(idx, >=, hdr->bth_count);
		ASSERT3U(idx, <=, BTREE_CORE_ELEMS);
		ASSERT3U(idx + count, <=, BTREE_CORE_ELEMS);
		zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
		for (uint32_t i = 1; i <= count; i++) {
			node->btc_children[idx + i] =
			    (zfs_btree_hdr_t *)BTREE_POISON;
		}
		(void) memset(node->btc_elems + idx * size, 0x0f, count * size);
	} else {
		ASSERT3U(idx, <=, tree->bt_leaf_cap);
		ASSERT3U(idx + count, <=, tree->bt_leaf_cap);
		zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
		(void) memset(leaf->btl_elems +
		    (hdr->bth_first + idx) * size, 0x0f, count * size);
	}
#endif
}

static inline void
zfs_btree_verify_poison_at(zfs_btree_t *tree, zfs_btree_hdr_t *hdr,
    uint32_t idx)
{
#ifdef ZFS_DEBUG
	size_t size = tree->bt_elem_size;
	if (zfs_btree_is_core(hdr)) {
		ASSERT3U(idx, <, BTREE_CORE_ELEMS);
		zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
		zfs_btree_hdr_t *cval = (zfs_btree_hdr_t *)BTREE_POISON;
		VERIFY3P(node->btc_children[idx + 1], ==, cval);
		for (size_t i = 0; i < size; i++)
			VERIFY3U(node->btc_elems[idx * size + i], ==, 0x0f);
	} else {
		ASSERT3U(idx, <, tree->bt_leaf_cap);
		zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
		if (idx >= tree->bt_leaf_cap - hdr->bth_first)
			return;
		for (size_t i = 0; i < size; i++) {
			VERIFY3U(leaf->btl_elems[(hdr->bth_first + idx)
			    * size + i], ==, 0x0f);
		}
	}
#endif
}

void
zfs_btree_init(void)
{
	zfs_btree_leaf_cache = kmem_cache_create("zfs_btree_leaf_cache",
	    BTREE_LEAF_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);
}

void
zfs_btree_fini(void)
{
	kmem_cache_destroy(zfs_btree_leaf_cache);
}

static void *
zfs_btree_leaf_alloc(zfs_btree_t *tree)
{
	if (tree->bt_leaf_size == BTREE_LEAF_SIZE)
		return (kmem_cache_alloc(zfs_btree_leaf_cache, KM_SLEEP));
	else
		return (kmem_alloc(tree->bt_leaf_size, KM_SLEEP));
}

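/*
 * For illustration only: a minimal, hypothetical consumer of this interface
 * might look like the following sketch. The element type and the comparator
 * are invented for the example; they are not part of this file.
 *
 *	static int
 *	example_compar(const void *va, const void *vb)
 *	{
 *		const uint64_t *a = va;
 *		const uint64_t *b = vb;
 *		return (*a < *b ? -1 : *a > *b);
 *	}
 *
 *	zfs_btree_t tree;
 *	uint64_t val = 5;
 *	zfs_btree_create(&tree, example_compar, sizeof (uint64_t));
 *	zfs_btree_add(&tree, &val);
 *	VERIFY3P(zfs_btree_find(&tree, &val, NULL), !=, NULL);
 *	zfs_btree_remove(&tree, &val);
 */

/* Return a leaf to the kmem cache it came from, or free a custom-sized one. */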
static void
zfs_btree_leaf_free(zfs_btree_t *tree, void *ptr)
{
	if (tree->bt_leaf_size == BTREE_LEAF_SIZE)
		return (kmem_cache_free(zfs_btree_leaf_cache, ptr));
	else
		return (kmem_free(ptr, tree->bt_leaf_size));
}

void
zfs_btree_create(zfs_btree_t *tree, int (*compar) (const void *, const void *),
    size_t size)
{
	zfs_btree_create_custom(tree, compar, size, BTREE_LEAF_SIZE);
}

void
zfs_btree_create_custom(zfs_btree_t *tree,
    int (*compar) (const void *, const void *),
    size_t size, size_t lsize)
{
	size_t esize = lsize - offsetof(zfs_btree_leaf_t, btl_elems);

	ASSERT3U(size, <=, esize / 2);
	memset(tree, 0, sizeof (*tree));
	tree->bt_compar = compar;
	tree->bt_elem_size = size;
	tree->bt_leaf_size = lsize;
	tree->bt_leaf_cap = P2ALIGN(esize / size, 2);
	tree->bt_height = -1;
	tree->bt_bulk = NULL;
}

/*
 * Find value in the array of elements provided. Uses a simple binary search.
 */
static void *
zfs_btree_find_in_buf(zfs_btree_t *tree, uint8_t *buf, uint32_t nelems,
    const void *value, zfs_btree_index_t *where)
{
	uint32_t max = nelems;
	uint32_t min = 0;
	while (max > min) {
		uint32_t idx = (min + max) / 2;
		uint8_t *cur = buf + idx * tree->bt_elem_size;
		int comp = tree->bt_compar(cur, value);
		if (comp < 0) {
			min = idx + 1;
		} else if (comp > 0) {
			max = idx;
		} else {
			where->bti_offset = idx;
			where->bti_before = B_FALSE;
			return (cur);
		}
	}

	where->bti_offset = max;
	where->bti_before = B_TRUE;
	return (NULL);
}

/*
 * Find the given value in the tree. where may be passed as null to use as a
 * membership test or if the btree is being used as a map.
 */
void *
zfs_btree_find(zfs_btree_t *tree, const void *value, zfs_btree_index_t *where)
{
	if (tree->bt_height == -1) {
		if (where != NULL) {
			where->bti_node = NULL;
			where->bti_offset = 0;
		}
		ASSERT0(tree->bt_num_elems);
		return (NULL);
	}

	/*
	 * If we're in bulk-insert mode, we check the last spot in the tree
	 * and the last leaf in the tree before doing the normal search,
	 * because for most workloads the vast majority of finds in
	 * bulk-insert mode are to insert new elements.
	 */
	zfs_btree_index_t idx;
	size_t size = tree->bt_elem_size;
	if (tree->bt_bulk != NULL) {
		zfs_btree_leaf_t *last_leaf = tree->bt_bulk;
		int comp = tree->bt_compar(last_leaf->btl_elems +
		    (last_leaf->btl_hdr.bth_first +
		    last_leaf->btl_hdr.bth_count - 1) * size, value);
		if (comp < 0) {
			/*
			 * If what they're looking for is after the last
			 * element, it's not in the tree.
			 */
			if (where != NULL) {
				where->bti_node = (zfs_btree_hdr_t *)last_leaf;
				where->bti_offset =
				    last_leaf->btl_hdr.bth_count;
				where->bti_before = B_TRUE;
			}
			return (NULL);
		} else if (comp == 0) {
			if (where != NULL) {
				where->bti_node = (zfs_btree_hdr_t *)last_leaf;
				where->bti_offset =
				    last_leaf->btl_hdr.bth_count - 1;
				where->bti_before = B_FALSE;
			}
			return (last_leaf->btl_elems +
			    (last_leaf->btl_hdr.bth_first +
			    last_leaf->btl_hdr.bth_count - 1) * size);
		}
		if (tree->bt_compar(last_leaf->btl_elems +
		    last_leaf->btl_hdr.bth_first * size, value) <= 0) {
			/*
			 * If what they're looking for is after the first
			 * element in the last leaf, it's in the last leaf or
			 * it's not in the tree.
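			 * Either way, a binary search of just this leaf
			 * settles the question.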
			 */
			void *d = zfs_btree_find_in_buf(tree,
			    last_leaf->btl_elems +
			    last_leaf->btl_hdr.bth_first * size,
			    last_leaf->btl_hdr.bth_count, value, &idx);

			if (where != NULL) {
				idx.bti_node = (zfs_btree_hdr_t *)last_leaf;
				*where = idx;
			}
			return (d);
		}
	}

	zfs_btree_core_t *node = NULL;
	uint32_t child = 0;
	uint32_t depth = 0;

	/*
	 * Iterate down the tree, finding which child the value should be in
	 * by comparing with the separators.
	 */
	for (node = (zfs_btree_core_t *)tree->bt_root; depth < tree->bt_height;
	    node = (zfs_btree_core_t *)node->btc_children[child], depth++) {
		ASSERT3P(node, !=, NULL);
		void *d = zfs_btree_find_in_buf(tree, node->btc_elems,
		    node->btc_hdr.bth_count, value, &idx);
		EQUIV(d != NULL, !idx.bti_before);
		if (d != NULL) {
			if (where != NULL) {
				idx.bti_node = (zfs_btree_hdr_t *)node;
				*where = idx;
			}
			return (d);
		}
		ASSERT(idx.bti_before);
		child = idx.bti_offset;
	}

	/*
	 * The value is in this leaf, or it would be if it were in the
	 * tree. Find its proper location and return it.
	 */
	zfs_btree_leaf_t *leaf = (depth == 0 ?
	    (zfs_btree_leaf_t *)tree->bt_root : (zfs_btree_leaf_t *)node);
	void *d = zfs_btree_find_in_buf(tree, leaf->btl_elems +
	    leaf->btl_hdr.bth_first * size,
	    leaf->btl_hdr.bth_count, value, &idx);

	if (where != NULL) {
		idx.bti_node = (zfs_btree_hdr_t *)leaf;
		*where = idx;
	}

	return (d);
}

/*
 * To explain the following functions, it is useful to understand the four
 * kinds of shifts used in btree operation. First, a shift is a movement of
 * elements within a node. It is used to create gaps for inserting new
 * elements and children, or cover gaps created when things are removed. A
 * shift has two fundamental properties, each of which can be one of two
 * values, making four types of shifts. There is the direction of the shift
 * (left or right) and the shape of the shift (parallelogram or isosceles
 * trapezoid (shortened to trapezoid hereafter)). The shape distinction only
 * applies to shifts of core nodes.
 *
 * The names derive from the following imagining of the layout of a node:
 *
 *  Elements:    *   *   *   *   *   *   *   ...   *   *   *
 *  Children:  *   *   *   *   *   *   *   *   ...   *   *   *
 *
 * This layout follows from the fact that the elements act as separators
 * between pairs of children, and that children root subtrees "below" the
 * current node. A left and right shift are fairly self-explanatory; a left
 * shift moves things to the left, while a right shift moves things to the
 * right. A parallelogram shift is a shift with the same number of elements
 * and children being moved, while a trapezoid shift is a shift that moves
 * one more child than elements. An example follows:
 *
 * A parallelogram shift could contain the following:
 *      _______________
 *      \*   *   *   * \  *   *   * ...  *   *   *
 *     * \  *   *   *   *\  *   *   * ...  *   *   *
 *      ---------------
 * A trapezoid shift could contain the following:
 *        ___________
 *      * / *   *   * \  *   *   * ...  *   *   *
 *     * /  *   *   *   *\  *   *   * ...  *   *   *
 *      ---------------
 *
 * Note that a parallelogram shift is always shaped like a "left-leaning"
 * parallelogram, where the starting index of the children being moved is
 * always one higher than the starting index of the elements being moved.
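 *
 * As a concrete (illustrative) example: a left parallelogram shift of three
 * elements starting at element index 4, by one spot, moves elements [4, 6]
 * to slots [3, 5] and children [5, 7] to slots [4, 6]; the same shift with
 * trapezoid shape instead moves children [4, 7] to slots [3, 6].
 *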
 * No "right-leaning" parallelogram shifts are needed (shifts where the
 * starting element index and starting child index being moved are the same)
 * to achieve any btree operations, so we ignore them.
 */

enum bt_shift_shape {
	BSS_TRAPEZOID,
	BSS_PARALLELOGRAM
};

enum bt_shift_direction {
	BSD_LEFT,
	BSD_RIGHT
};

/*
 * Shift elements and children in the provided core node by off spots. The
 * first element moved is idx, and count elements are moved. The shape of the
 * shift is determined by shape. The direction is determined by dir.
 */
static inline void
bt_shift_core(zfs_btree_t *tree, zfs_btree_core_t *node, uint32_t idx,
    uint32_t count, uint32_t off, enum bt_shift_shape shape,
    enum bt_shift_direction dir)
{
	size_t size = tree->bt_elem_size;
	ASSERT(zfs_btree_is_core(&node->btc_hdr));

	uint8_t *e_start = node->btc_elems + idx * size;
	uint8_t *e_out = (dir == BSD_LEFT ? e_start - off * size :
	    e_start + off * size);
	bmov(e_start, e_out, count * size);

	zfs_btree_hdr_t **c_start = node->btc_children + idx +
	    (shape == BSS_TRAPEZOID ? 0 : 1);
	zfs_btree_hdr_t **c_out = (dir == BSD_LEFT ? c_start - off :
	    c_start + off);
	uint32_t c_count = count + (shape == BSS_TRAPEZOID ? 1 : 0);
	bmov(c_start, c_out, c_count * sizeof (*c_start));
}

/*
 * Shift elements and children in the provided core node left by one spot.
 * The first element moved is idx, and count elements are moved. The shape
 * of the shift is determined by shape.
 */
static inline void
bt_shift_core_left(zfs_btree_t *tree, zfs_btree_core_t *node, uint32_t idx,
    uint32_t count, enum bt_shift_shape shape)
{
	bt_shift_core(tree, node, idx, count, 1, shape, BSD_LEFT);
}

/*
 * Shift elements and children in the provided core node right by one spot.
 * The first element moved is idx, and count elements are moved. The shape
 * of the shift is determined by shape.
 */
static inline void
bt_shift_core_right(zfs_btree_t *tree, zfs_btree_core_t *node, uint32_t idx,
    uint32_t count, enum bt_shift_shape shape)
{
	bt_shift_core(tree, node, idx, count, 1, shape, BSD_RIGHT);
}

/*
 * Shift elements in the provided leaf node by off spots. The first element
 * moved is idx, and count elements are moved. The direction is determined
 * by dir.
 */
static inline void
bt_shift_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *node, uint32_t idx,
    uint32_t count, uint32_t off, enum bt_shift_direction dir)
{
	size_t size = tree->bt_elem_size;
	zfs_btree_hdr_t *hdr = &node->btl_hdr;
	ASSERT(!zfs_btree_is_core(hdr));

	if (count == 0)
		return;
	uint8_t *start = node->btl_elems + (hdr->bth_first + idx) * size;
	uint8_t *out = (dir == BSD_LEFT ? start - off * size :
	    start + off * size);
	bmov(start, out, count * size);
}

/*
 * Grow leaf for n new elements before idx.
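 * The gap can be opened by shifting existing elements left, right, or both
 * ways, depending on where the leaf has free space and which side needs
 * fewer elements moved.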
 */
static void
bt_grow_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, uint32_t idx,
    uint32_t n)
{
	zfs_btree_hdr_t *hdr = &leaf->btl_hdr;
	ASSERT(!zfs_btree_is_core(hdr));
	ASSERT3U(idx, <=, hdr->bth_count);
	uint32_t capacity = tree->bt_leaf_cap;
	ASSERT3U(hdr->bth_count + n, <=, capacity);
	boolean_t cl = (hdr->bth_first >= n);
	boolean_t cr = (hdr->bth_first + hdr->bth_count + n <= capacity);

	if (cl && (!cr || idx <= hdr->bth_count / 2)) {
		/* Grow left. */
		hdr->bth_first -= n;
		bt_shift_leaf(tree, leaf, n, idx, n, BSD_LEFT);
	} else if (cr) {
		/* Grow right. */
		bt_shift_leaf(tree, leaf, idx, hdr->bth_count - idx, n,
		    BSD_RIGHT);
	} else {
		/* Grow both ways. */
		uint32_t fn = hdr->bth_first -
		    (capacity - (hdr->bth_count + n)) / 2;
		hdr->bth_first -= fn;
		bt_shift_leaf(tree, leaf, fn, idx, fn, BSD_LEFT);
		bt_shift_leaf(tree, leaf, fn + idx, hdr->bth_count - idx,
		    n - fn, BSD_RIGHT);
	}
	hdr->bth_count += n;
}

/*
 * Shrink leaf by n elements starting at idx.
 */
static void
bt_shrink_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, uint32_t idx,
    uint32_t n)
{
	zfs_btree_hdr_t *hdr = &leaf->btl_hdr;
	ASSERT(!zfs_btree_is_core(hdr));
	ASSERT3U(idx, <=, hdr->bth_count);
	ASSERT3U(idx + n, <=, hdr->bth_count);

	if (idx <= (hdr->bth_count - n) / 2) {
		bt_shift_leaf(tree, leaf, 0, idx, n, BSD_RIGHT);
		zfs_btree_poison_node_at(tree, hdr, 0, n);
		hdr->bth_first += n;
	} else {
		bt_shift_leaf(tree, leaf, idx + n, hdr->bth_count - idx - n, n,
		    BSD_LEFT);
		zfs_btree_poison_node_at(tree, hdr, hdr->bth_count - n, n);
	}
	hdr->bth_count -= n;
}

/*
 * Move children and elements from one core node to another. The shape
 * parameter behaves the same as it does in the shift logic.
 */
static inline void
bt_transfer_core(zfs_btree_t *tree, zfs_btree_core_t *source, uint32_t sidx,
    uint32_t count, zfs_btree_core_t *dest, uint32_t didx,
    enum bt_shift_shape shape)
{
	size_t size = tree->bt_elem_size;
	ASSERT(zfs_btree_is_core(&source->btc_hdr));
	ASSERT(zfs_btree_is_core(&dest->btc_hdr));

	bcpy(source->btc_elems + sidx * size, dest->btc_elems + didx * size,
	    count * size);

	uint32_t c_count = count + (shape == BSS_TRAPEZOID ? 1 : 0);
	bcpy(source->btc_children + sidx + (shape == BSS_TRAPEZOID ? 0 : 1),
	    dest->btc_children + didx + (shape == BSS_TRAPEZOID ? 0 : 1),
	    c_count * sizeof (*source->btc_children));
}

static inline void
bt_transfer_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *source, uint32_t sidx,
    uint32_t count, zfs_btree_leaf_t *dest, uint32_t didx)
{
	size_t size = tree->bt_elem_size;
	ASSERT(!zfs_btree_is_core(&source->btl_hdr));
	ASSERT(!zfs_btree_is_core(&dest->btl_hdr));

	bcpy(source->btl_elems + (source->btl_hdr.bth_first + sidx) * size,
	    dest->btl_elems + (dest->btl_hdr.bth_first + didx) * size,
	    count * size);
}

/*
 * Find the first element in the subtree rooted at hdr, return its value and
 * put its location in where if non-null.
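 * This simply walks the leftmost chain of child pointers down to a leaf.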
583 */ 584 static void * 585 zfs_btree_first_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr, 586 zfs_btree_index_t *where) 587 { 588 zfs_btree_hdr_t *node; 589 590 for (node = hdr; zfs_btree_is_core(node); 591 node = ((zfs_btree_core_t *)node)->btc_children[0]) 592 ; 593 594 ASSERT(!zfs_btree_is_core(node)); 595 zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)node; 596 if (where != NULL) { 597 where->bti_node = node; 598 where->bti_offset = 0; 599 where->bti_before = B_FALSE; 600 } 601 return (&leaf->btl_elems[node->bth_first * tree->bt_elem_size]); 602 } 603 604 /* Insert an element and a child into a core node at the given offset. */ 605 static void 606 zfs_btree_insert_core_impl(zfs_btree_t *tree, zfs_btree_core_t *parent, 607 uint32_t offset, zfs_btree_hdr_t *new_node, void *buf) 608 { 609 size_t size = tree->bt_elem_size; 610 zfs_btree_hdr_t *par_hdr = &parent->btc_hdr; 611 ASSERT3P(par_hdr, ==, new_node->bth_parent); 612 ASSERT3U(par_hdr->bth_count, <, BTREE_CORE_ELEMS); 613 614 if (zfs_btree_verify_intensity >= 5) { 615 zfs_btree_verify_poison_at(tree, par_hdr, 616 par_hdr->bth_count); 617 } 618 /* Shift existing elements and children */ 619 uint32_t count = par_hdr->bth_count - offset; 620 bt_shift_core_right(tree, parent, offset, count, 621 BSS_PARALLELOGRAM); 622 623 /* Insert new values */ 624 parent->btc_children[offset + 1] = new_node; 625 bcpy(buf, parent->btc_elems + offset * size, size); 626 par_hdr->bth_count++; 627 } 628 629 /* 630 * Insert new_node into the parent of old_node directly after old_node, with 631 * buf as the dividing element between the two. 632 */ 633 static void 634 zfs_btree_insert_into_parent(zfs_btree_t *tree, zfs_btree_hdr_t *old_node, 635 zfs_btree_hdr_t *new_node, void *buf) 636 { 637 ASSERT3P(old_node->bth_parent, ==, new_node->bth_parent); 638 size_t size = tree->bt_elem_size; 639 zfs_btree_core_t *parent = old_node->bth_parent; 640 641 /* 642 * If this is the root node we were splitting, we create a new root 643 * and increase the height of the tree. 644 */ 645 if (parent == NULL) { 646 ASSERT3P(old_node, ==, tree->bt_root); 647 tree->bt_num_nodes++; 648 zfs_btree_core_t *new_root = 649 kmem_alloc(sizeof (zfs_btree_core_t) + BTREE_CORE_ELEMS * 650 size, KM_SLEEP); 651 zfs_btree_hdr_t *new_root_hdr = &new_root->btc_hdr; 652 new_root_hdr->bth_parent = NULL; 653 new_root_hdr->bth_first = -1; 654 new_root_hdr->bth_count = 1; 655 656 old_node->bth_parent = new_node->bth_parent = new_root; 657 new_root->btc_children[0] = old_node; 658 new_root->btc_children[1] = new_node; 659 bcpy(buf, new_root->btc_elems, size); 660 661 tree->bt_height++; 662 tree->bt_root = new_root_hdr; 663 zfs_btree_poison_node(tree, new_root_hdr); 664 return; 665 } 666 667 /* 668 * Since we have the new separator, binary search for where to put 669 * new_node. 670 */ 671 zfs_btree_hdr_t *par_hdr = &parent->btc_hdr; 672 zfs_btree_index_t idx; 673 ASSERT(zfs_btree_is_core(par_hdr)); 674 VERIFY3P(zfs_btree_find_in_buf(tree, parent->btc_elems, 675 par_hdr->bth_count, buf, &idx), ==, NULL); 676 ASSERT(idx.bti_before); 677 uint32_t offset = idx.bti_offset; 678 ASSERT3U(offset, <=, par_hdr->bth_count); 679 ASSERT3P(parent->btc_children[offset], ==, old_node); 680 681 /* 682 * If the parent isn't full, shift things to accommodate our insertions 683 * and return. 684 */ 685 if (par_hdr->bth_count != BTREE_CORE_ELEMS) { 686 zfs_btree_insert_core_impl(tree, parent, offset, new_node, buf); 687 return; 688 } 689 690 /* 691 * We need to split this core node into two. 
	 * Currently there are BTREE_CORE_ELEMS + 1 child nodes, and we are
	 * adding one for BTREE_CORE_ELEMS + 2. Some of the children will be
	 * part of the current node, and the others will be moved to the new
	 * core node. There are BTREE_CORE_ELEMS + 1 elements including the
	 * new one. One will be used as the new separator in our parent, and
	 * the others will be split among the two core nodes.
	 *
	 * Usually we will split the node in half evenly, with
	 * BTREE_CORE_ELEMS/2 elements in each node. If we're bulk loading, we
	 * instead move only about a quarter of the elements (and children) to
	 * the new node. Since the average state after a long time is a 3/4
	 * full node, shortcutting directly to that state improves efficiency.
	 *
	 * We do this in two stages: first we split into two nodes, and then we
	 * reuse our existing logic to insert the new element and child.
	 */
	uint32_t move_count = MAX((BTREE_CORE_ELEMS / (tree->bt_bulk == NULL ?
	    2 : 4)) - 1, 2);
	uint32_t keep_count = BTREE_CORE_ELEMS - move_count - 1;
	ASSERT3U(BTREE_CORE_ELEMS - move_count, >=, 2);
	tree->bt_num_nodes++;
	zfs_btree_core_t *new_parent = kmem_alloc(sizeof (zfs_btree_core_t) +
	    BTREE_CORE_ELEMS * size, KM_SLEEP);
	zfs_btree_hdr_t *new_par_hdr = &new_parent->btc_hdr;
	new_par_hdr->bth_parent = par_hdr->bth_parent;
	new_par_hdr->bth_first = -1;
	new_par_hdr->bth_count = move_count;
	zfs_btree_poison_node(tree, new_par_hdr);

	par_hdr->bth_count = keep_count;

	bt_transfer_core(tree, parent, keep_count + 1, move_count, new_parent,
	    0, BSS_TRAPEZOID);

	/* Store the new separator in a buffer. */
	uint8_t *tmp_buf = kmem_alloc(size, KM_SLEEP);
	bcpy(parent->btc_elems + keep_count * size, tmp_buf,
	    size);
	zfs_btree_poison_node(tree, par_hdr);

	if (offset < keep_count) {
		/* Insert the new node into the left half */
		zfs_btree_insert_core_impl(tree, parent, offset, new_node,
		    buf);

		/*
		 * Move the new separator to the existing buffer.
		 */
		bcpy(tmp_buf, buf, size);
	} else if (offset > keep_count) {
		/* Insert the new node into the right half */
		new_node->bth_parent = new_parent;
		zfs_btree_insert_core_impl(tree, new_parent,
		    offset - keep_count - 1, new_node, buf);

		/*
		 * Move the new separator to the existing buffer.
		 */
		bcpy(tmp_buf, buf, size);
	} else {
		/*
		 * Move the new separator into the right half, and replace it
		 * with buf. We also need to shift back the elements in the
		 * right half to accommodate new_node.
		 */
		bt_shift_core_right(tree, new_parent, 0, move_count,
		    BSS_TRAPEZOID);
		new_parent->btc_children[0] = new_node;
		bcpy(tmp_buf, new_parent->btc_elems, size);
		new_par_hdr->bth_count++;
	}
	kmem_free(tmp_buf, size);
	zfs_btree_poison_node(tree, par_hdr);

	for (uint32_t i = 0; i <= new_parent->btc_hdr.bth_count; i++)
		new_parent->btc_children[i]->bth_parent = new_parent;

	for (uint32_t i = 0; i <= parent->btc_hdr.bth_count; i++)
		ASSERT3P(parent->btc_children[i]->bth_parent, ==, parent);

	/*
	 * Now that the node is split, we need to insert the new node into its
	 * parent. This may cause further splitting.
	 */
	zfs_btree_insert_into_parent(tree, &parent->btc_hdr,
	    &new_parent->btc_hdr, buf);
}

/*
 * Insert an element into a leaf node at the given offset.
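 * The caller must have ensured there is room; this is asserted below.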
 */
static void
zfs_btree_insert_leaf_impl(zfs_btree_t *tree, zfs_btree_leaf_t *leaf,
    uint32_t idx, const void *value)
{
	size_t size = tree->bt_elem_size;
	zfs_btree_hdr_t *hdr = &leaf->btl_hdr;
	ASSERT3U(leaf->btl_hdr.bth_count, <, tree->bt_leaf_cap);

	if (zfs_btree_verify_intensity >= 5) {
		zfs_btree_verify_poison_at(tree, &leaf->btl_hdr,
		    leaf->btl_hdr.bth_count);
	}

	bt_grow_leaf(tree, leaf, idx, 1);
	uint8_t *start = leaf->btl_elems + (hdr->bth_first + idx) * size;
	bcpy(value, start, size);
}

static void
zfs_btree_verify_order_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr);

/* Helper function for inserting a new value into leaf at the given index. */
static void
zfs_btree_insert_into_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf,
    const void *value, uint32_t idx)
{
	size_t size = tree->bt_elem_size;
	uint32_t capacity = tree->bt_leaf_cap;

	/*
	 * If the leaf isn't full, shift the elements after idx and insert
	 * value.
	 */
	if (leaf->btl_hdr.bth_count != capacity) {
		zfs_btree_insert_leaf_impl(tree, leaf, idx, value);
		return;
	}

	/*
	 * Otherwise, we split the leaf node into two nodes. If we're not bulk
	 * inserting, each is of size (capacity / 2). If we are bulk
	 * inserting, we move a quarter of the elements to the new node so
	 * inserts into the old node don't cause immediate splitting but the
	 * tree stays relatively dense. Since the average state after a long
	 * time is a 3/4 full node, shortcutting directly to that state
	 * improves efficiency. At the end of the bulk insertion process
	 * we'll need to go through and fix up any nodes (the last leaf and
	 * its ancestors, potentially) that are below the minimum.
	 *
	 * In either case, we're left with one extra element. The leftover
	 * element will become the new dividing element between the two nodes.
	 */
	uint32_t move_count = MAX(capacity / (tree->bt_bulk ? 4 : 2), 1) - 1;
	uint32_t keep_count = capacity - move_count - 1;
	ASSERT3U(keep_count, >=, 1);
	/* If inserting on the left, move one more to keep leaves balanced. */
	if (idx < keep_count) {
		keep_count--;
		move_count++;
	}
	tree->bt_num_nodes++;
	zfs_btree_leaf_t *new_leaf = zfs_btree_leaf_alloc(tree);
	zfs_btree_hdr_t *new_hdr = &new_leaf->btl_hdr;
	new_hdr->bth_parent = leaf->btl_hdr.bth_parent;
	new_hdr->bth_first = (tree->bt_bulk ? 0 : capacity / 4) +
	    (idx >= keep_count && idx <= keep_count + move_count / 2);
	new_hdr->bth_count = move_count;
	zfs_btree_poison_node(tree, new_hdr);

	if (tree->bt_bulk != NULL && leaf == tree->bt_bulk)
		tree->bt_bulk = new_leaf;

	/* Copy the back part to the new leaf. */
	bt_transfer_leaf(tree, leaf, keep_count + 1, move_count, new_leaf, 0);

	/* We store the new separator in a buffer we control for simplicity. */
	uint8_t *buf = kmem_alloc(size, KM_SLEEP);
	bcpy(leaf->btl_elems + (leaf->btl_hdr.bth_first + keep_count) * size,
	    buf, size);

	bt_shrink_leaf(tree, leaf, keep_count, 1 + move_count);

	if (idx < keep_count) {
		/* Insert into the existing leaf. */
		zfs_btree_insert_leaf_impl(tree, leaf, idx, value);
	} else if (idx > keep_count) {
		/*
		 * Insert into the new leaf.
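		 * The target index drops by keep_count + 1 to account for
		 * the elements that stayed behind and for the separator.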
		 */
		zfs_btree_insert_leaf_impl(tree, new_leaf, idx - keep_count -
		    1, value);
	} else {
		/*
		 * Insert the planned separator into the new leaf, and use
		 * the new value as the new separator.
		 */
		zfs_btree_insert_leaf_impl(tree, new_leaf, 0, buf);
		bcpy(value, buf, size);
	}

	/*
	 * Now that the node is split, we need to insert the new node into its
	 * parent. This may cause further splitting, but only of core nodes.
	 */
	zfs_btree_insert_into_parent(tree, &leaf->btl_hdr, &new_leaf->btl_hdr,
	    buf);
	kmem_free(buf, size);
}

static uint32_t
zfs_btree_find_parent_idx(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
{
	void *buf;
	if (zfs_btree_is_core(hdr)) {
		buf = ((zfs_btree_core_t *)hdr)->btc_elems;
	} else {
		buf = ((zfs_btree_leaf_t *)hdr)->btl_elems +
		    hdr->bth_first * tree->bt_elem_size;
	}
	zfs_btree_index_t idx;
	zfs_btree_core_t *parent = hdr->bth_parent;
	VERIFY3P(zfs_btree_find_in_buf(tree, parent->btc_elems,
	    parent->btc_hdr.bth_count, buf, &idx), ==, NULL);
	ASSERT(idx.bti_before);
	ASSERT3U(idx.bti_offset, <=, parent->btc_hdr.bth_count);
	ASSERT3P(parent->btc_children[idx.bti_offset], ==, hdr);
	return (idx.bti_offset);
}

/*
 * Take the b-tree out of bulk insert mode. During bulk-insert mode, some
 * nodes may violate the invariant that non-root nodes must be at least half
 * full. All nodes violating this invariant should be the last node in their
 * particular level. To correct the invariant, we take values from their left
 * neighbor until they are half full. They must have a left neighbor at their
 * level because the last node at a level is not the first node unless it's
 * the root.
 */
static void
zfs_btree_bulk_finish(zfs_btree_t *tree)
{
	ASSERT3P(tree->bt_bulk, !=, NULL);
	ASSERT3P(tree->bt_root, !=, NULL);
	zfs_btree_leaf_t *leaf = tree->bt_bulk;
	zfs_btree_hdr_t *hdr = &leaf->btl_hdr;
	zfs_btree_core_t *parent = hdr->bth_parent;
	size_t size = tree->bt_elem_size;
	uint32_t capacity = tree->bt_leaf_cap;

	/*
	 * The invariant doesn't apply to the root node; if that's the only
	 * node in the tree, we're done.
	 */
	if (parent == NULL) {
		tree->bt_bulk = NULL;
		return;
	}

	/* First, take elements to rebalance the leaf node. */
	if (hdr->bth_count < capacity / 2) {
		/*
		 * First, find the left neighbor. The simplest way to do this
		 * is to call zfs_btree_prev twice; the first time finds some
		 * ancestor of this node, and the second time finds the left
		 * neighbor. The ancestor found is the lowest common ancestor
		 * of leaf and the neighbor.
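		 * The separator between the neighbor and leaf is the element
		 * at that ancestor's returned offset.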
945 */ 946 zfs_btree_index_t idx = { 947 .bti_node = hdr, 948 .bti_offset = 0 949 }; 950 VERIFY3P(zfs_btree_prev(tree, &idx, &idx), !=, NULL); 951 ASSERT(zfs_btree_is_core(idx.bti_node)); 952 zfs_btree_core_t *common = (zfs_btree_core_t *)idx.bti_node; 953 uint32_t common_idx = idx.bti_offset; 954 955 VERIFY3P(zfs_btree_prev(tree, &idx, &idx), !=, NULL); 956 ASSERT(!zfs_btree_is_core(idx.bti_node)); 957 zfs_btree_leaf_t *l_neighbor = (zfs_btree_leaf_t *)idx.bti_node; 958 zfs_btree_hdr_t *l_hdr = idx.bti_node; 959 uint32_t move_count = (capacity / 2) - hdr->bth_count; 960 ASSERT3U(l_neighbor->btl_hdr.bth_count - move_count, >=, 961 capacity / 2); 962 963 if (zfs_btree_verify_intensity >= 5) { 964 for (uint32_t i = 0; i < move_count; i++) { 965 zfs_btree_verify_poison_at(tree, hdr, 966 leaf->btl_hdr.bth_count + i); 967 } 968 } 969 970 /* First, shift elements in leaf back. */ 971 bt_grow_leaf(tree, leaf, 0, move_count); 972 973 /* Next, move the separator from the common ancestor to leaf. */ 974 uint8_t *separator = common->btc_elems + common_idx * size; 975 uint8_t *out = leaf->btl_elems + 976 (hdr->bth_first + move_count - 1) * size; 977 bcpy(separator, out, size); 978 979 /* 980 * Now we move elements from the tail of the left neighbor to 981 * fill the remaining spots in leaf. 982 */ 983 bt_transfer_leaf(tree, l_neighbor, l_hdr->bth_count - 984 (move_count - 1), move_count - 1, leaf, 0); 985 986 /* 987 * Finally, move the new last element in the left neighbor to 988 * the separator. 989 */ 990 bcpy(l_neighbor->btl_elems + (l_hdr->bth_first + 991 l_hdr->bth_count - move_count) * size, separator, size); 992 993 /* Adjust the node's counts, and we're done. */ 994 bt_shrink_leaf(tree, l_neighbor, l_hdr->bth_count - move_count, 995 move_count); 996 997 ASSERT3U(l_hdr->bth_count, >=, capacity / 2); 998 ASSERT3U(hdr->bth_count, >=, capacity / 2); 999 } 1000 1001 /* 1002 * Now we have to rebalance any ancestors of leaf that may also 1003 * violate the invariant. 1004 */ 1005 capacity = BTREE_CORE_ELEMS; 1006 while (parent->btc_hdr.bth_parent != NULL) { 1007 zfs_btree_core_t *cur = parent; 1008 zfs_btree_hdr_t *hdr = &cur->btc_hdr; 1009 parent = hdr->bth_parent; 1010 /* 1011 * If the invariant isn't violated, move on to the next 1012 * ancestor. 1013 */ 1014 if (hdr->bth_count >= capacity / 2) 1015 continue; 1016 1017 /* 1018 * Because the smallest number of nodes we can move when 1019 * splitting is 2, we never need to worry about not having a 1020 * left sibling (a sibling is a neighbor with the same parent). 1021 */ 1022 uint32_t parent_idx = zfs_btree_find_parent_idx(tree, hdr); 1023 ASSERT3U(parent_idx, >, 0); 1024 zfs_btree_core_t *l_neighbor = 1025 (zfs_btree_core_t *)parent->btc_children[parent_idx - 1]; 1026 uint32_t move_count = (capacity / 2) - hdr->bth_count; 1027 ASSERT3U(l_neighbor->btc_hdr.bth_count - move_count, >=, 1028 capacity / 2); 1029 1030 if (zfs_btree_verify_intensity >= 5) { 1031 for (uint32_t i = 0; i < move_count; i++) { 1032 zfs_btree_verify_poison_at(tree, hdr, 1033 hdr->bth_count + i); 1034 } 1035 } 1036 /* First, shift things in the right node back. */ 1037 bt_shift_core(tree, cur, 0, hdr->bth_count, move_count, 1038 BSS_TRAPEZOID, BSD_RIGHT); 1039 1040 /* Next, move the separator to the right node. 
		uint8_t *separator = parent->btc_elems + ((parent_idx - 1) *
		    size);
		uint8_t *e_out = cur->btc_elems + ((move_count - 1) * size);
		bcpy(separator, e_out, size);

		/*
		 * Now, move elements and children from the left node to the
		 * right. We move one more child than elements.
		 */
		move_count--;
		uint32_t move_idx = l_neighbor->btc_hdr.bth_count - move_count;
		bt_transfer_core(tree, l_neighbor, move_idx, move_count, cur, 0,
		    BSS_TRAPEZOID);

		/*
		 * Finally, move the last element in the left node to the
		 * separator's position.
		 */
		move_idx--;
		bcpy(l_neighbor->btc_elems + move_idx * size, separator, size);

		l_neighbor->btc_hdr.bth_count -= move_count + 1;
		hdr->bth_count += move_count + 1;

		ASSERT3U(l_neighbor->btc_hdr.bth_count, >=, capacity / 2);
		ASSERT3U(hdr->bth_count, >=, capacity / 2);

		zfs_btree_poison_node(tree, &l_neighbor->btc_hdr);

		for (uint32_t i = 0; i <= hdr->bth_count; i++)
			cur->btc_children[i]->bth_parent = cur;
	}

	tree->bt_bulk = NULL;
	zfs_btree_verify(tree);
}

/*
 * Insert value into tree at the location specified by where.
 */
void
zfs_btree_add_idx(zfs_btree_t *tree, const void *value,
    const zfs_btree_index_t *where)
{
	zfs_btree_index_t idx = {0};

	/* If we're not inserting in the last leaf, end bulk insert mode. */
	if (tree->bt_bulk != NULL) {
		if (where->bti_node != &tree->bt_bulk->btl_hdr) {
			zfs_btree_bulk_finish(tree);
			VERIFY3P(zfs_btree_find(tree, value, &idx), ==, NULL);
			where = &idx;
		}
	}

	tree->bt_num_elems++;
	/*
	 * If this is the first element in the tree, create a leaf root node
	 * and add the value to it.
	 */
	if (where->bti_node == NULL) {
		ASSERT3U(tree->bt_num_elems, ==, 1);
		ASSERT3S(tree->bt_height, ==, -1);
		ASSERT3P(tree->bt_root, ==, NULL);
		ASSERT0(where->bti_offset);

		tree->bt_num_nodes++;
		zfs_btree_leaf_t *leaf = zfs_btree_leaf_alloc(tree);
		tree->bt_root = &leaf->btl_hdr;
		tree->bt_height++;

		zfs_btree_hdr_t *hdr = &leaf->btl_hdr;
		hdr->bth_parent = NULL;
		hdr->bth_first = 0;
		hdr->bth_count = 0;
		zfs_btree_poison_node(tree, hdr);

		zfs_btree_insert_into_leaf(tree, leaf, value, 0);
		tree->bt_bulk = leaf;
	} else if (!zfs_btree_is_core(where->bti_node)) {
		/*
		 * If we're inserting into a leaf, go directly to the helper
		 * function.
		 */
		zfs_btree_insert_into_leaf(tree,
		    (zfs_btree_leaf_t *)where->bti_node, value,
		    where->bti_offset);
	} else {
		/*
		 * If we're inserting into a core node, we can't just shift
		 * the existing element in that slot in the same node without
		 * breaking our ordering invariants. Instead we place the new
		 * value in the node at that spot and then insert the old
		 * separator into the first slot in the subtree to the right.
		 */
		zfs_btree_core_t *node = (zfs_btree_core_t *)where->bti_node;

		/*
		 * We can ignore bti_before, because either way the value
		 * should end up in bti_offset.
		 */
1141 */ 1142 uint32_t off = where->bti_offset; 1143 zfs_btree_hdr_t *subtree = node->btc_children[off + 1]; 1144 size_t size = tree->bt_elem_size; 1145 uint8_t *buf = kmem_alloc(size, KM_SLEEP); 1146 bcpy(node->btc_elems + off * size, buf, size); 1147 bcpy(value, node->btc_elems + off * size, size); 1148 1149 /* 1150 * Find the first slot in the subtree to the right, insert 1151 * there. 1152 */ 1153 zfs_btree_index_t new_idx; 1154 VERIFY3P(zfs_btree_first_helper(tree, subtree, &new_idx), !=, 1155 NULL); 1156 ASSERT0(new_idx.bti_offset); 1157 ASSERT(!zfs_btree_is_core(new_idx.bti_node)); 1158 zfs_btree_insert_into_leaf(tree, 1159 (zfs_btree_leaf_t *)new_idx.bti_node, buf, 0); 1160 kmem_free(buf, size); 1161 } 1162 zfs_btree_verify(tree); 1163 } 1164 1165 /* 1166 * Return the first element in the tree, and put its location in where if 1167 * non-null. 1168 */ 1169 void * 1170 zfs_btree_first(zfs_btree_t *tree, zfs_btree_index_t *where) 1171 { 1172 if (tree->bt_height == -1) { 1173 ASSERT0(tree->bt_num_elems); 1174 return (NULL); 1175 } 1176 return (zfs_btree_first_helper(tree, tree->bt_root, where)); 1177 } 1178 1179 /* 1180 * Find the last element in the subtree rooted at hdr, return its value and 1181 * put its location in where if non-null. 1182 */ 1183 static void * 1184 zfs_btree_last_helper(zfs_btree_t *btree, zfs_btree_hdr_t *hdr, 1185 zfs_btree_index_t *where) 1186 { 1187 zfs_btree_hdr_t *node; 1188 1189 for (node = hdr; zfs_btree_is_core(node); node = 1190 ((zfs_btree_core_t *)node)->btc_children[node->bth_count]) 1191 ; 1192 1193 zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)node; 1194 if (where != NULL) { 1195 where->bti_node = node; 1196 where->bti_offset = node->bth_count - 1; 1197 where->bti_before = B_FALSE; 1198 } 1199 return (leaf->btl_elems + (node->bth_first + node->bth_count - 1) * 1200 btree->bt_elem_size); 1201 } 1202 1203 /* 1204 * Return the last element in the tree, and put its location in where if 1205 * non-null. 1206 */ 1207 void * 1208 zfs_btree_last(zfs_btree_t *tree, zfs_btree_index_t *where) 1209 { 1210 if (tree->bt_height == -1) { 1211 ASSERT0(tree->bt_num_elems); 1212 return (NULL); 1213 } 1214 return (zfs_btree_last_helper(tree, tree->bt_root, where)); 1215 } 1216 1217 /* 1218 * This function contains the logic to find the next node in the tree. A 1219 * helper function is used because there are multiple internal consumemrs of 1220 * this logic. The done_func is used by zfs_btree_destroy_nodes to clean up each 1221 * node after we've finished with it. 1222 */ 1223 static void * 1224 zfs_btree_next_helper(zfs_btree_t *tree, const zfs_btree_index_t *idx, 1225 zfs_btree_index_t *out_idx, 1226 void (*done_func)(zfs_btree_t *, zfs_btree_hdr_t *)) 1227 { 1228 if (idx->bti_node == NULL) { 1229 ASSERT3S(tree->bt_height, ==, -1); 1230 return (NULL); 1231 } 1232 1233 uint32_t offset = idx->bti_offset; 1234 if (!zfs_btree_is_core(idx->bti_node)) { 1235 /* 1236 * When finding the next element of an element in a leaf, 1237 * there are two cases. If the element isn't the last one in 1238 * the leaf, in which case we just return the next element in 1239 * the leaf. Otherwise, we need to traverse up our parents 1240 * until we find one where our ancestor isn't the last child 1241 * of its parent. Once we do, the next element is the 1242 * separator after our ancestor in its parent. 1243 */ 1244 zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node; 1245 uint32_t new_off = offset + (idx->bti_before ? 
		if (leaf->btl_hdr.bth_count > new_off) {
			out_idx->bti_node = &leaf->btl_hdr;
			out_idx->bti_offset = new_off;
			out_idx->bti_before = B_FALSE;
			return (leaf->btl_elems + (leaf->btl_hdr.bth_first +
			    new_off) * tree->bt_elem_size);
		}

		zfs_btree_hdr_t *prev = &leaf->btl_hdr;
		for (zfs_btree_core_t *node = leaf->btl_hdr.bth_parent;
		    node != NULL; node = node->btc_hdr.bth_parent) {
			zfs_btree_hdr_t *hdr = &node->btc_hdr;
			ASSERT(zfs_btree_is_core(hdr));
			uint32_t i = zfs_btree_find_parent_idx(tree, prev);
			if (done_func != NULL)
				done_func(tree, prev);
			if (i == hdr->bth_count) {
				prev = hdr;
				continue;
			}
			out_idx->bti_node = hdr;
			out_idx->bti_offset = i;
			out_idx->bti_before = B_FALSE;
			return (node->btc_elems + i * tree->bt_elem_size);
		}
		if (done_func != NULL)
			done_func(tree, prev);
		/*
		 * We've traversed all the way up and been at the end of the
		 * node every time, so this was the last element in the tree.
		 */
		return (NULL);
	}

	/* If we were before an element in a core node, return that element. */
	ASSERT(zfs_btree_is_core(idx->bti_node));
	zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node;
	if (idx->bti_before) {
		out_idx->bti_before = B_FALSE;
		return (node->btc_elems + offset * tree->bt_elem_size);
	}

	/*
	 * The next element from one in a core node is the first element in
	 * the subtree just to the right of the separator.
	 */
	zfs_btree_hdr_t *child = node->btc_children[offset + 1];
	return (zfs_btree_first_helper(tree, child, out_idx));
}

/*
 * Return the next element in the tree. The same address can be safely
 * passed for idx and out_idx.
 */
void *
zfs_btree_next(zfs_btree_t *tree, const zfs_btree_index_t *idx,
    zfs_btree_index_t *out_idx)
{
	return (zfs_btree_next_helper(tree, idx, out_idx, NULL));
}

/*
 * Return the previous element in the tree. The same address can be safely
 * passed for idx and out_idx.
 */
void *
zfs_btree_prev(zfs_btree_t *tree, const zfs_btree_index_t *idx,
    zfs_btree_index_t *out_idx)
{
	if (idx->bti_node == NULL) {
		ASSERT3S(tree->bt_height, ==, -1);
		return (NULL);
	}

	uint32_t offset = idx->bti_offset;
	if (!zfs_btree_is_core(idx->bti_node)) {
		/*
		 * When finding the previous element of an element in a leaf,
		 * there are two cases. If the element isn't the first one in
		 * the leaf, we just return the previous element in the leaf.
		 * Otherwise, we need to traverse up our parents until we
		 * find one where our previous ancestor isn't the first
		 * child. Once we do, the previous element is the separator
		 * after our previous ancestor.
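		 * For example, if the leaf is its parent's third child, the
		 * walk stops at that parent and the predecessor is the
		 * separator at offset 1 there.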
1330 */ 1331 zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node; 1332 if (offset != 0) { 1333 out_idx->bti_node = &leaf->btl_hdr; 1334 out_idx->bti_offset = offset - 1; 1335 out_idx->bti_before = B_FALSE; 1336 return (leaf->btl_elems + (leaf->btl_hdr.bth_first + 1337 offset - 1) * tree->bt_elem_size); 1338 } 1339 zfs_btree_hdr_t *prev = &leaf->btl_hdr; 1340 for (zfs_btree_core_t *node = leaf->btl_hdr.bth_parent; 1341 node != NULL; node = node->btc_hdr.bth_parent) { 1342 zfs_btree_hdr_t *hdr = &node->btc_hdr; 1343 ASSERT(zfs_btree_is_core(hdr)); 1344 uint32_t i = zfs_btree_find_parent_idx(tree, prev); 1345 if (i == 0) { 1346 prev = hdr; 1347 continue; 1348 } 1349 out_idx->bti_node = hdr; 1350 out_idx->bti_offset = i - 1; 1351 out_idx->bti_before = B_FALSE; 1352 return (node->btc_elems + (i - 1) * tree->bt_elem_size); 1353 } 1354 /* 1355 * We've traversed all the way up and been at the start of the 1356 * node every time, so this was the first node in the tree. 1357 */ 1358 return (NULL); 1359 } 1360 1361 /* 1362 * The previous element from one in a core node is the last element in 1363 * the subtree just to the left of the separator. 1364 */ 1365 ASSERT(zfs_btree_is_core(idx->bti_node)); 1366 zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node; 1367 zfs_btree_hdr_t *child = node->btc_children[offset]; 1368 return (zfs_btree_last_helper(tree, child, out_idx)); 1369 } 1370 1371 /* 1372 * Get the value at the provided index in the tree. 1373 * 1374 * Note that the value returned from this function can be mutated, but only 1375 * if it will not change the ordering of the element with respect to any other 1376 * elements that could be in the tree. 1377 */ 1378 void * 1379 zfs_btree_get(zfs_btree_t *tree, zfs_btree_index_t *idx) 1380 { 1381 ASSERT(!idx->bti_before); 1382 size_t size = tree->bt_elem_size; 1383 if (!zfs_btree_is_core(idx->bti_node)) { 1384 zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node; 1385 return (leaf->btl_elems + (leaf->btl_hdr.bth_first + 1386 idx->bti_offset) * size); 1387 } 1388 zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node; 1389 return (node->btc_elems + idx->bti_offset * size); 1390 } 1391 1392 /* Add the given value to the tree. Must not already be in the tree. */ 1393 void 1394 zfs_btree_add(zfs_btree_t *tree, const void *node) 1395 { 1396 zfs_btree_index_t where = {0}; 1397 VERIFY3P(zfs_btree_find(tree, node, &where), ==, NULL); 1398 zfs_btree_add_idx(tree, node, &where); 1399 } 1400 1401 /* Helper function to free a tree node. */ 1402 static void 1403 zfs_btree_node_destroy(zfs_btree_t *tree, zfs_btree_hdr_t *node) 1404 { 1405 tree->bt_num_nodes--; 1406 if (!zfs_btree_is_core(node)) { 1407 zfs_btree_leaf_free(tree, node); 1408 } else { 1409 kmem_free(node, sizeof (zfs_btree_core_t) + 1410 BTREE_CORE_ELEMS * tree->bt_elem_size); 1411 } 1412 } 1413 1414 /* 1415 * Remove the rm_hdr and the separator to its left from the parent node. The 1416 * buffer that rm_hdr was stored in may already be freed, so its contents 1417 * cannot be accessed. 1418 */ 1419 static void 1420 zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node, 1421 zfs_btree_hdr_t *rm_hdr) 1422 { 1423 size_t size = tree->bt_elem_size; 1424 uint32_t min_count = (BTREE_CORE_ELEMS / 2) - 1; 1425 zfs_btree_hdr_t *hdr = &node->btc_hdr; 1426 /* 1427 * If the node is the root node and rm_hdr is one of two children, 1428 * promote the other child to the root. 
1429 */ 1430 if (hdr->bth_parent == NULL && hdr->bth_count <= 1) { 1431 ASSERT3U(hdr->bth_count, ==, 1); 1432 ASSERT3P(tree->bt_root, ==, node); 1433 ASSERT3P(node->btc_children[1], ==, rm_hdr); 1434 tree->bt_root = node->btc_children[0]; 1435 node->btc_children[0]->bth_parent = NULL; 1436 zfs_btree_node_destroy(tree, hdr); 1437 tree->bt_height--; 1438 return; 1439 } 1440 1441 uint32_t idx; 1442 for (idx = 0; idx <= hdr->bth_count; idx++) { 1443 if (node->btc_children[idx] == rm_hdr) 1444 break; 1445 } 1446 ASSERT3U(idx, <=, hdr->bth_count); 1447 1448 /* 1449 * If the node is the root or it has more than the minimum number of 1450 * children, just remove the child and separator, and return. 1451 */ 1452 if (hdr->bth_parent == NULL || 1453 hdr->bth_count > min_count) { 1454 /* 1455 * Shift the element and children to the right of rm_hdr to 1456 * the left by one spot. 1457 */ 1458 bt_shift_core_left(tree, node, idx, hdr->bth_count - idx, 1459 BSS_PARALLELOGRAM); 1460 hdr->bth_count--; 1461 zfs_btree_poison_node_at(tree, hdr, hdr->bth_count, 1); 1462 return; 1463 } 1464 1465 ASSERT3U(hdr->bth_count, ==, min_count); 1466 1467 /* 1468 * Now we try to take a node from a neighbor. We check left, then 1469 * right. If the neighbor exists and has more than the minimum number 1470 * of elements, we move the separator between us and them to our 1471 * node, move their closest element (last for left, first for right) 1472 * to the separator, and move their closest child to our node. Along 1473 * the way we need to collapse the gap made by idx, and (for our right 1474 * neighbor) the gap made by removing their first element and child. 1475 * 1476 * Note: this logic currently doesn't support taking from a neighbor 1477 * that isn't a sibling (i.e. a neighbor with a different 1478 * parent). This isn't critical functionality, but may be worth 1479 * implementing in the future for completeness' sake. 1480 */ 1481 zfs_btree_core_t *parent = hdr->bth_parent; 1482 uint32_t parent_idx = zfs_btree_find_parent_idx(tree, hdr); 1483 1484 zfs_btree_hdr_t *l_hdr = (parent_idx == 0 ? NULL : 1485 parent->btc_children[parent_idx - 1]); 1486 if (l_hdr != NULL && l_hdr->bth_count > min_count) { 1487 /* We can take a node from the left neighbor. */ 1488 ASSERT(zfs_btree_is_core(l_hdr)); 1489 zfs_btree_core_t *neighbor = (zfs_btree_core_t *)l_hdr; 1490 1491 /* 1492 * Start by shifting the elements and children in the current 1493 * node to the right by one spot. 1494 */ 1495 bt_shift_core_right(tree, node, 0, idx - 1, BSS_TRAPEZOID); 1496 1497 /* 1498 * Move the separator between node and neighbor to the first 1499 * element slot in the current node. 1500 */ 1501 uint8_t *separator = parent->btc_elems + (parent_idx - 1) * 1502 size; 1503 bcpy(separator, node->btc_elems, size); 1504 1505 /* Move the last child of neighbor to our first child slot. */ 1506 node->btc_children[0] = 1507 neighbor->btc_children[l_hdr->bth_count]; 1508 node->btc_children[0]->bth_parent = node; 1509 1510 /* Move the last element of neighbor to the separator spot. */ 1511 uint8_t *take_elem = neighbor->btc_elems + 1512 (l_hdr->bth_count - 1) * size; 1513 bcpy(take_elem, separator, size); 1514 l_hdr->bth_count--; 1515 zfs_btree_poison_node_at(tree, l_hdr, l_hdr->bth_count, 1); 1516 return; 1517 } 1518 1519 zfs_btree_hdr_t *r_hdr = (parent_idx == parent->btc_hdr.bth_count ? 1520 NULL : parent->btc_children[parent_idx + 1]); 1521 if (r_hdr != NULL && r_hdr->bth_count > min_count) { 1522 /* We can take a node from the right neighbor. 
		 */
		ASSERT(zfs_btree_is_core(r_hdr));
		zfs_btree_core_t *neighbor = (zfs_btree_core_t *)r_hdr;

		/*
		 * Shift elements in node left by one spot to overwrite rm_hdr
		 * and the separator before it.
		 */
		bt_shift_core_left(tree, node, idx, hdr->bth_count - idx,
		    BSS_PARALLELOGRAM);

		/*
		 * Move the separator between node and neighbor to the last
		 * element spot in node.
		 */
		uint8_t *separator = parent->btc_elems + parent_idx * size;
		bcpy(separator, node->btc_elems + (hdr->bth_count - 1) * size,
		    size);

		/*
		 * Move the first child of neighbor to the last child spot in
		 * node.
		 */
		node->btc_children[hdr->bth_count] = neighbor->btc_children[0];
		node->btc_children[hdr->bth_count]->bth_parent = node;

		/* Move the first element of neighbor to the separator spot. */
		uint8_t *take_elem = neighbor->btc_elems;
		bcpy(take_elem, separator, size);
		r_hdr->bth_count--;

		/*
		 * Shift the elements and children of neighbor to cover the
		 * stolen elements.
		 */
		bt_shift_core_left(tree, neighbor, 1, r_hdr->bth_count,
		    BSS_TRAPEZOID);
		zfs_btree_poison_node_at(tree, r_hdr, r_hdr->bth_count, 1);
		return;
	}

	/*
	 * In this case, neither of our neighbors can spare an element, so we
	 * need to merge with one of them. We prefer the left one,
	 * arbitrarily. Move the separator into the leftmost merging node
	 * (which may be us or the left neighbor), and then move the right
	 * merging node's elements. Once that's done, we go back and delete
	 * the element we're removing. Finally, go into the parent and delete
	 * the right merging node and the separator. This may cause further
	 * merging.
	 */
	zfs_btree_hdr_t *new_rm_hdr, *keep_hdr;
	uint32_t new_idx = idx;
	if (l_hdr != NULL) {
		keep_hdr = l_hdr;
		new_rm_hdr = hdr;
		new_idx += keep_hdr->bth_count + 1;
	} else {
		ASSERT3P(r_hdr, !=, NULL);
		keep_hdr = hdr;
		new_rm_hdr = r_hdr;
		parent_idx++;
	}

	ASSERT(zfs_btree_is_core(keep_hdr));
	ASSERT(zfs_btree_is_core(new_rm_hdr));

	zfs_btree_core_t *keep = (zfs_btree_core_t *)keep_hdr;
	zfs_btree_core_t *rm = (zfs_btree_core_t *)new_rm_hdr;

	if (zfs_btree_verify_intensity >= 5) {
		for (uint32_t i = 0; i < new_rm_hdr->bth_count + 1; i++) {
			zfs_btree_verify_poison_at(tree, keep_hdr,
			    keep_hdr->bth_count + i);
		}
	}

	/* Move the separator into the left node. */
	uint8_t *e_out = keep->btc_elems + keep_hdr->bth_count * size;
	uint8_t *separator = parent->btc_elems + (parent_idx - 1) *
	    size;
	bcpy(separator, e_out, size);
	keep_hdr->bth_count++;

	/* Move all our elements and children into the left node. */
	bt_transfer_core(tree, rm, 0, new_rm_hdr->bth_count, keep,
	    keep_hdr->bth_count, BSS_TRAPEZOID);

	uint32_t old_count = keep_hdr->bth_count;

	/* Update bookkeeping */
	keep_hdr->bth_count += new_rm_hdr->bth_count;
	ASSERT3U(keep_hdr->bth_count, ==, (min_count * 2) + 1);

	/*
	 * Shift the elements and children to the right of rm_hdr to
	 * the left by one spot.
	 */
	ASSERT3P(keep->btc_children[new_idx], ==, rm_hdr);
	bt_shift_core_left(tree, keep, new_idx, keep_hdr->bth_count - new_idx,
	    BSS_PARALLELOGRAM);
	keep_hdr->bth_count--;

	/* Reparent all our children to point to the left node. */
	zfs_btree_hdr_t **new_start = keep->btc_children +
	    old_count - 1;
	for (uint32_t i = 0; i < new_rm_hdr->bth_count + 1; i++)
		new_start[i]->bth_parent = keep;
	for (uint32_t i = 0; i <= keep_hdr->bth_count; i++) {
		ASSERT3P(keep->btc_children[i]->bth_parent, ==, keep);
		ASSERT3P(keep->btc_children[i], !=, rm_hdr);
	}
	zfs_btree_poison_node_at(tree, keep_hdr, keep_hdr->bth_count, 1);

	new_rm_hdr->bth_count = 0;
	zfs_btree_remove_from_node(tree, parent, new_rm_hdr);
	zfs_btree_node_destroy(tree, new_rm_hdr);
}

/* Remove the element at the specific location. */
void
zfs_btree_remove_idx(zfs_btree_t *tree, zfs_btree_index_t *where)
{
	size_t size = tree->bt_elem_size;
	zfs_btree_hdr_t *hdr = where->bti_node;
	uint32_t idx = where->bti_offset;

	ASSERT(!where->bti_before);
	if (tree->bt_bulk != NULL) {
		/*
		 * Leave bulk insert mode. Note that our index would be
		 * invalid after we correct the tree, so we copy the value
		 * we're planning to remove and find it again after
		 * bulk_finish.
		 */
		uint8_t *value = zfs_btree_get(tree, where);
		uint8_t *tmp = kmem_alloc(size, KM_SLEEP);
		bcpy(value, tmp, size);
		zfs_btree_bulk_finish(tree);
		VERIFY3P(zfs_btree_find(tree, tmp, where), !=, NULL);
		kmem_free(tmp, size);
		hdr = where->bti_node;
		idx = where->bti_offset;
	}

	tree->bt_num_elems--;
	/*
	 * If the element happens to be in a core node, we move a leaf node's
	 * element into its place and then remove the leaf node element. This
	 * means the rebalance logic doesn't need to be recursive both upwards
	 * and downwards.
	 */
	if (zfs_btree_is_core(hdr)) {
		zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
		zfs_btree_hdr_t *left_subtree = node->btc_children[idx];
		void *new_value = zfs_btree_last_helper(tree, left_subtree,
		    where);
		ASSERT3P(new_value, !=, NULL);

		bcpy(new_value, node->btc_elems + idx * size, size);

		hdr = where->bti_node;
		idx = where->bti_offset;
		ASSERT(!where->bti_before);
	}

	/*
	 * First, we'll update the leaf's metadata. Then, we shift any
	 * elements after the idx to the left. After that, we rebalance if
	 * needed.
	 */
	ASSERT(!zfs_btree_is_core(hdr));
	zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
	ASSERT3U(hdr->bth_count, >, 0);

	uint32_t min_count = (tree->bt_leaf_cap / 2) - 1;

	/*
	 * If we're over the minimum size or this is the root, just overwrite
	 * the value and return.
	 */
	if (hdr->bth_count > min_count || hdr->bth_parent == NULL) {
		bt_shrink_leaf(tree, leaf, idx, 1);
		if (hdr->bth_parent == NULL) {
			ASSERT0(tree->bt_height);
			if (hdr->bth_count == 0) {
				tree->bt_root = NULL;
				tree->bt_height--;
				zfs_btree_node_destroy(tree, &leaf->btl_hdr);
			}
		}
		zfs_btree_verify(tree);
		return;
	}
	ASSERT3U(hdr->bth_count, ==, min_count);

	/*
	 * Now we try to take a node from a sibling. We check left, then
	 * right. If they exist and have more than the minimum number of
	 * elements, we move the separator between us and them to our node
	 * and move their closest element (last for left, first for right) to
	 * the separator. Along the way we need to collapse the gap made by
	 * idx, and (for our right neighbor) the gap made by removing their
	 * first element.
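	 *
	 * In both cases the separator rotates down into our leaf and the
	 * neighbor's edge element rotates up into the separator slot.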
	 *
	 * Note: this logic currently doesn't support taking from a neighbor
	 * that isn't a sibling. This isn't critical functionality, but may be
	 * worth implementing in the future for completeness' sake.
	 */
	zfs_btree_core_t *parent = hdr->bth_parent;
	uint32_t parent_idx = zfs_btree_find_parent_idx(tree, hdr);

	zfs_btree_hdr_t *l_hdr = (parent_idx == 0 ? NULL :
	    parent->btc_children[parent_idx - 1]);
	if (l_hdr != NULL && l_hdr->bth_count > min_count) {
		/* We can take an element from the left neighbor. */
		ASSERT(!zfs_btree_is_core(l_hdr));
		zfs_btree_leaf_t *neighbor = (zfs_btree_leaf_t *)l_hdr;

		/*
		 * Shift our elements right by one spot to make room for the
		 * stolen element and to overwrite the element being removed.
		 */
		bt_shift_leaf(tree, leaf, 0, idx, 1, BSD_RIGHT);

		/* Move the separator to our first spot. */
		uint8_t *separator = parent->btc_elems + (parent_idx - 1) *
		    size;
		bcpy(separator, leaf->btl_elems + hdr->bth_first * size, size);

		/* Move our neighbor's last element to the separator. */
		uint8_t *take_elem = neighbor->btl_elems +
		    (l_hdr->bth_first + l_hdr->bth_count - 1) * size;
		bcpy(take_elem, separator, size);

		/* Delete our neighbor's last element. */
		bt_shrink_leaf(tree, neighbor, l_hdr->bth_count - 1, 1);
		zfs_btree_verify(tree);
		return;
	}

	zfs_btree_hdr_t *r_hdr = (parent_idx == parent->btc_hdr.bth_count ?
	    NULL : parent->btc_children[parent_idx + 1]);
	if (r_hdr != NULL && r_hdr->bth_count > min_count) {
		/* We can take an element from the right neighbor. */
		ASSERT(!zfs_btree_is_core(r_hdr));
		zfs_btree_leaf_t *neighbor = (zfs_btree_leaf_t *)r_hdr;

		/*
		 * Shift our elements after the one being removed left by one
		 * spot to cover it, freeing up our last spot for the stolen
		 * element.
		 */
		bt_shift_leaf(tree, leaf, idx + 1, hdr->bth_count - idx - 1,
		    1, BSD_LEFT);

		/*
		 * Move the separator between us and the right neighbor to
		 * our last spot.
		 */
		uint8_t *separator = parent->btc_elems + parent_idx * size;
		bcpy(separator, leaf->btl_elems + (hdr->bth_first +
		    hdr->bth_count - 1) * size, size);

		/* Move our neighbor's first element to the separator. */
		uint8_t *take_elem = neighbor->btl_elems +
		    r_hdr->bth_first * size;
		bcpy(take_elem, separator, size);

		/* Delete our neighbor's first element. */
		bt_shrink_leaf(tree, neighbor, 0, 1);
		zfs_btree_verify(tree);
		return;
	}

	/*
	 * In this case, neither of our neighbors can spare an element, so we
	 * need to merge with one of them. We prefer the left one, arbitrarily.
	 * We first remove the element we're deleting, then move the separator
	 * into the leftmost merging node (which may be us or the left
	 * neighbor), and then move the right merging node's elements into it.
	 * Finally, we go into the parent and delete the right merging node
	 * and the separator. This may cause further merging.
	 */
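	/*
	 * For illustration only (schematic keys): merging with the left
	 * neighbor when removing R, with separator S in the parent:
	 *
	 *	before:  left = [a b]   sep = S   us = [c R]
	 *	after:   left = [a b S c]; S and the emptied right node are
	 *	         then deleted from the parent
	 */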
1803 */ 1804 zfs_btree_hdr_t *rm_hdr, *k_hdr; 1805 if (l_hdr != NULL) { 1806 k_hdr = l_hdr; 1807 rm_hdr = hdr; 1808 } else { 1809 ASSERT3P(r_hdr, !=, NULL); 1810 k_hdr = hdr; 1811 rm_hdr = r_hdr; 1812 parent_idx++; 1813 } 1814 ASSERT(!zfs_btree_is_core(k_hdr)); 1815 ASSERT(!zfs_btree_is_core(rm_hdr)); 1816 ASSERT3U(k_hdr->bth_count, ==, min_count); 1817 ASSERT3U(rm_hdr->bth_count, ==, min_count); 1818 zfs_btree_leaf_t *keep = (zfs_btree_leaf_t *)k_hdr; 1819 zfs_btree_leaf_t *rm = (zfs_btree_leaf_t *)rm_hdr; 1820 1821 if (zfs_btree_verify_intensity >= 5) { 1822 for (uint32_t i = 0; i < rm_hdr->bth_count + 1; i++) { 1823 zfs_btree_verify_poison_at(tree, k_hdr, 1824 k_hdr->bth_count + i); 1825 } 1826 } 1827 1828 /* 1829 * Remove the value from the node. It will go below the minimum, 1830 * but we'll fix it in no time. 1831 */ 1832 bt_shrink_leaf(tree, leaf, idx, 1); 1833 1834 /* Prepare space for elements to be moved from the right. */ 1835 uint32_t k_count = k_hdr->bth_count; 1836 bt_grow_leaf(tree, keep, k_count, 1 + rm_hdr->bth_count); 1837 ASSERT3U(k_hdr->bth_count, ==, min_count * 2); 1838 1839 /* Move the separator into the first open spot. */ 1840 uint8_t *out = keep->btl_elems + (k_hdr->bth_first + k_count) * size; 1841 uint8_t *separator = parent->btc_elems + (parent_idx - 1) * size; 1842 bcpy(separator, out, size); 1843 1844 /* Move our elements to the left neighbor. */ 1845 bt_transfer_leaf(tree, rm, 0, rm_hdr->bth_count, keep, k_count + 1); 1846 1847 /* Remove the emptied node from the parent. */ 1848 zfs_btree_remove_from_node(tree, parent, rm_hdr); 1849 zfs_btree_node_destroy(tree, rm_hdr); 1850 zfs_btree_verify(tree); 1851 } 1852 1853 /* Remove the given value from the tree. */ 1854 void 1855 zfs_btree_remove(zfs_btree_t *tree, const void *value) 1856 { 1857 zfs_btree_index_t where = {0}; 1858 VERIFY3P(zfs_btree_find(tree, value, &where), !=, NULL); 1859 zfs_btree_remove_idx(tree, &where); 1860 } 1861 1862 /* Return the number of elements in the tree. */ 1863 ulong_t 1864 zfs_btree_numnodes(zfs_btree_t *tree) 1865 { 1866 return (tree->bt_num_elems); 1867 } 1868 1869 /* 1870 * This function is used to visit all the elements in the tree before 1871 * destroying the tree. This allows the calling code to perform any cleanup it 1872 * needs to do. This is more efficient than just removing the first element 1873 * over and over, because it removes all rebalancing. Once the destroy_nodes() 1874 * function has been called, no other btree operations are valid until it 1875 * returns NULL, which point the only valid operation is zfs_btree_destroy(). 
1876 * 1877 * example: 1878 * 1879 * zfs_btree_index_t *cookie = NULL; 1880 * my_data_t *node; 1881 * 1882 * while ((node = zfs_btree_destroy_nodes(tree, &cookie)) != NULL) 1883 * free(node->ptr); 1884 * zfs_btree_destroy(tree); 1885 * 1886 */ 1887 void * 1888 zfs_btree_destroy_nodes(zfs_btree_t *tree, zfs_btree_index_t **cookie) 1889 { 1890 if (*cookie == NULL) { 1891 if (tree->bt_height == -1) 1892 return (NULL); 1893 *cookie = kmem_alloc(sizeof (**cookie), KM_SLEEP); 1894 return (zfs_btree_first(tree, *cookie)); 1895 } 1896 1897 void *rval = zfs_btree_next_helper(tree, *cookie, *cookie, 1898 zfs_btree_node_destroy); 1899 if (rval == NULL) { 1900 tree->bt_root = NULL; 1901 tree->bt_height = -1; 1902 tree->bt_num_elems = 0; 1903 kmem_free(*cookie, sizeof (**cookie)); 1904 tree->bt_bulk = NULL; 1905 } 1906 return (rval); 1907 } 1908 1909 static void 1910 zfs_btree_clear_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) 1911 { 1912 if (zfs_btree_is_core(hdr)) { 1913 zfs_btree_core_t *btc = (zfs_btree_core_t *)hdr; 1914 for (uint32_t i = 0; i <= hdr->bth_count; i++) 1915 zfs_btree_clear_helper(tree, btc->btc_children[i]); 1916 } 1917 1918 zfs_btree_node_destroy(tree, hdr); 1919 } 1920 1921 void 1922 zfs_btree_clear(zfs_btree_t *tree) 1923 { 1924 if (tree->bt_root == NULL) { 1925 ASSERT0(tree->bt_num_elems); 1926 return; 1927 } 1928 1929 zfs_btree_clear_helper(tree, tree->bt_root); 1930 tree->bt_num_elems = 0; 1931 tree->bt_root = NULL; 1932 tree->bt_num_nodes = 0; 1933 tree->bt_height = -1; 1934 tree->bt_bulk = NULL; 1935 } 1936 1937 void 1938 zfs_btree_destroy(zfs_btree_t *tree) 1939 { 1940 ASSERT0(tree->bt_num_elems); 1941 ASSERT3P(tree->bt_root, ==, NULL); 1942 } 1943 1944 /* Verify that every child of this node has the correct parent pointer. */ 1945 static void 1946 zfs_btree_verify_pointers_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) 1947 { 1948 if (!zfs_btree_is_core(hdr)) 1949 return; 1950 1951 zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; 1952 for (uint32_t i = 0; i <= hdr->bth_count; i++) { 1953 VERIFY3P(node->btc_children[i]->bth_parent, ==, hdr); 1954 zfs_btree_verify_pointers_helper(tree, node->btc_children[i]); 1955 } 1956 } 1957 1958 /* Verify that every node has the correct parent pointer. */ 1959 static void 1960 zfs_btree_verify_pointers(zfs_btree_t *tree) 1961 { 1962 if (tree->bt_height == -1) { 1963 VERIFY3P(tree->bt_root, ==, NULL); 1964 return; 1965 } 1966 VERIFY3P(tree->bt_root->bth_parent, ==, NULL); 1967 zfs_btree_verify_pointers_helper(tree, tree->bt_root); 1968 } 1969 1970 /* 1971 * Verify that all the current node and its children satisfy the count 1972 * invariants, and return the total count in the subtree rooted in this node. 
1973 */ 1974 static uint64_t 1975 zfs_btree_verify_counts_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) 1976 { 1977 if (!zfs_btree_is_core(hdr)) { 1978 if (tree->bt_root != hdr && tree->bt_bulk && 1979 hdr != &tree->bt_bulk->btl_hdr) { 1980 VERIFY3U(hdr->bth_count, >=, tree->bt_leaf_cap / 2 - 1); 1981 } 1982 1983 return (hdr->bth_count); 1984 } else { 1985 1986 zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; 1987 uint64_t ret = hdr->bth_count; 1988 if (tree->bt_root != hdr && tree->bt_bulk == NULL) 1989 VERIFY3P(hdr->bth_count, >=, BTREE_CORE_ELEMS / 2 - 1); 1990 for (uint32_t i = 0; i <= hdr->bth_count; i++) { 1991 ret += zfs_btree_verify_counts_helper(tree, 1992 node->btc_children[i]); 1993 } 1994 1995 return (ret); 1996 } 1997 } 1998 1999 /* 2000 * Verify that all nodes satisfy the invariants and that the total number of 2001 * elements is correct. 2002 */ 2003 static void 2004 zfs_btree_verify_counts(zfs_btree_t *tree) 2005 { 2006 EQUIV(tree->bt_num_elems == 0, tree->bt_height == -1); 2007 if (tree->bt_height == -1) { 2008 return; 2009 } 2010 VERIFY3P(zfs_btree_verify_counts_helper(tree, tree->bt_root), ==, 2011 tree->bt_num_elems); 2012 } 2013 2014 /* 2015 * Check that the subtree rooted at this node has a uniform height. Returns 2016 * the number of nodes under this node, to help verify bt_num_nodes. 2017 */ 2018 static uint64_t 2019 zfs_btree_verify_height_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr, 2020 int32_t height) 2021 { 2022 if (!zfs_btree_is_core(hdr)) { 2023 VERIFY0(height); 2024 return (1); 2025 } 2026 2027 zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; 2028 uint64_t ret = 1; 2029 for (uint32_t i = 0; i <= hdr->bth_count; i++) { 2030 ret += zfs_btree_verify_height_helper(tree, 2031 node->btc_children[i], height - 1); 2032 } 2033 return (ret); 2034 } 2035 2036 /* 2037 * Check that the tree rooted at this node has a uniform height, and that the 2038 * bt_height in the tree is correct. 2039 */ 2040 static void 2041 zfs_btree_verify_height(zfs_btree_t *tree) 2042 { 2043 EQUIV(tree->bt_height == -1, tree->bt_root == NULL); 2044 if (tree->bt_height == -1) { 2045 return; 2046 } 2047 2048 VERIFY3U(zfs_btree_verify_height_helper(tree, tree->bt_root, 2049 tree->bt_height), ==, tree->bt_num_nodes); 2050 } 2051 2052 /* 2053 * Check that the elements in this node are sorted, and that if this is a core 2054 * node, the separators are properly between the subtrees they separaate and 2055 * that the children also satisfy this requirement. 
2056 */ 2057 static void 2058 zfs_btree_verify_order_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) 2059 { 2060 size_t size = tree->bt_elem_size; 2061 if (!zfs_btree_is_core(hdr)) { 2062 zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; 2063 for (uint32_t i = 1; i < hdr->bth_count; i++) { 2064 VERIFY3S(tree->bt_compar(leaf->btl_elems + 2065 (hdr->bth_first + i - 1) * size, 2066 leaf->btl_elems + 2067 (hdr->bth_first + i) * size), ==, -1); 2068 } 2069 return; 2070 } 2071 2072 zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; 2073 for (uint32_t i = 1; i < hdr->bth_count; i++) { 2074 VERIFY3S(tree->bt_compar(node->btc_elems + (i - 1) * size, 2075 node->btc_elems + i * size), ==, -1); 2076 } 2077 for (uint32_t i = 0; i < hdr->bth_count; i++) { 2078 uint8_t *left_child_last = NULL; 2079 zfs_btree_hdr_t *left_child_hdr = node->btc_children[i]; 2080 if (zfs_btree_is_core(left_child_hdr)) { 2081 zfs_btree_core_t *left_child = 2082 (zfs_btree_core_t *)left_child_hdr; 2083 left_child_last = left_child->btc_elems + 2084 (left_child_hdr->bth_count - 1) * size; 2085 } else { 2086 zfs_btree_leaf_t *left_child = 2087 (zfs_btree_leaf_t *)left_child_hdr; 2088 left_child_last = left_child->btl_elems + 2089 (left_child_hdr->bth_first + 2090 left_child_hdr->bth_count - 1) * size; 2091 } 2092 int comp = tree->bt_compar(node->btc_elems + i * size, 2093 left_child_last); 2094 if (comp <= 0) { 2095 panic("btree: compar returned %d (expected 1) at " 2096 "%px %d: compar(%px, %px)", comp, node, i, 2097 node->btc_elems + i * size, left_child_last); 2098 } 2099 2100 uint8_t *right_child_first = NULL; 2101 zfs_btree_hdr_t *right_child_hdr = node->btc_children[i + 1]; 2102 if (zfs_btree_is_core(right_child_hdr)) { 2103 zfs_btree_core_t *right_child = 2104 (zfs_btree_core_t *)right_child_hdr; 2105 right_child_first = right_child->btc_elems; 2106 } else { 2107 zfs_btree_leaf_t *right_child = 2108 (zfs_btree_leaf_t *)right_child_hdr; 2109 right_child_first = right_child->btl_elems + 2110 right_child_hdr->bth_first * size; 2111 } 2112 comp = tree->bt_compar(node->btc_elems + i * size, 2113 right_child_first); 2114 if (comp >= 0) { 2115 panic("btree: compar returned %d (expected -1) at " 2116 "%px %d: compar(%px, %px)", comp, node, i, 2117 node->btc_elems + i * size, right_child_first); 2118 } 2119 } 2120 for (uint32_t i = 0; i <= hdr->bth_count; i++) 2121 zfs_btree_verify_order_helper(tree, node->btc_children[i]); 2122 } 2123 2124 /* Check that all elements in the tree are in sorted order. */ 2125 static void 2126 zfs_btree_verify_order(zfs_btree_t *tree) 2127 { 2128 EQUIV(tree->bt_height == -1, tree->bt_root == NULL); 2129 if (tree->bt_height == -1) { 2130 return; 2131 } 2132 2133 zfs_btree_verify_order_helper(tree, tree->bt_root); 2134 } 2135 2136 #ifdef ZFS_DEBUG 2137 /* Check that all unused memory is poisoned correctly. 
 */
static void
zfs_btree_verify_poison_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
{
	size_t size = tree->bt_elem_size;
	if (!zfs_btree_is_core(hdr)) {
		zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
		for (size_t i = 0; i < hdr->bth_first * size; i++)
			VERIFY3U(leaf->btl_elems[i], ==, 0x0f);
		size_t esize = tree->bt_leaf_size -
		    offsetof(zfs_btree_leaf_t, btl_elems);
		for (size_t i = (hdr->bth_first + hdr->bth_count) * size;
		    i < esize; i++)
			VERIFY3U(leaf->btl_elems[i], ==, 0x0f);
	} else {
		zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
		for (size_t i = hdr->bth_count * size;
		    i < BTREE_CORE_ELEMS * size; i++)
			VERIFY3U(node->btc_elems[i], ==, 0x0f);

		for (uint32_t i = hdr->bth_count + 1; i <= BTREE_CORE_ELEMS;
		    i++) {
			VERIFY3P(node->btc_children[i], ==,
			    (zfs_btree_hdr_t *)BTREE_POISON);
		}

		for (uint32_t i = 0; i <= hdr->bth_count; i++) {
			zfs_btree_verify_poison_helper(tree,
			    node->btc_children[i]);
		}
	}
}
#endif

/* Check that unused memory in the tree is still poisoned. */
static void
zfs_btree_verify_poison(zfs_btree_t *tree)
{
#ifdef ZFS_DEBUG
	if (tree->bt_height == -1)
		return;
	zfs_btree_verify_poison_helper(tree, tree->bt_root);
#endif
}

void
zfs_btree_verify(zfs_btree_t *tree)
{
	if (zfs_btree_verify_intensity == 0)
		return;
	zfs_btree_verify_height(tree);
	if (zfs_btree_verify_intensity == 1)
		return;
	zfs_btree_verify_pointers(tree);
	if (zfs_btree_verify_intensity == 2)
		return;
	zfs_btree_verify_counts(tree);
	if (zfs_btree_verify_intensity == 3)
		return;
	zfs_btree_verify_order(tree);
	if (zfs_btree_verify_intensity == 4)
		return;
	zfs_btree_verify_poison(tree);
}

/* BEGIN CSTYLED */
ZFS_MODULE_PARAM(zfs, zfs_, btree_verify_intensity, UINT, ZMOD_RW,
	"Enable btree verification. Levels above 4 require ZFS to be built "
	"with debugging");
/* END CSTYLED */
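/*
 * For illustration only: on Linux the module parameter above is exposed via
 * sysfs, so verification can be enabled on a live system, e.g.:
 *
 *	echo 3 > /sys/module/zfs/parameters/zfs_btree_verify_intensity
 *
 * (Path shown for the Linux build of OpenZFS; other platforms expose module
 * parameters through their own mechanisms.)
 */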