1 // SPDX-License-Identifier: CDDL-1.0 2 /* 3 * CDDL HEADER START 4 * 5 * This file and its contents are supplied under the terms of the 6 * Common Development and Distribution License ("CDDL"), version 1.0. 7 * You may only use this file in accordance with the terms of version 8 * 1.0 of the CDDL. 9 * 10 * A full copy of the text of the CDDL should have accompanied this 11 * source. A copy of the CDDL is also available via the Internet at 12 * http://www.illumos.org/license/CDDL. 13 * 14 * CDDL HEADER END 15 */ 16 /* 17 * Copyright (c) 2019 by Delphix. All rights reserved. 18 */ 19 20 #include <sys/btree.h> 21 #include <sys/bitops.h> 22 #include <sys/zfs_context.h> 23 24 kmem_cache_t *zfs_btree_leaf_cache; 25 26 /* 27 * Control the extent of the verification that occurs when zfs_btree_verify is 28 * called. Primarily used for debugging when extending the btree logic and 29 * functionality. As the intensity is increased, new verification steps are 30 * added. These steps are cumulative; intensity = 3 includes the intensity = 1 31 * and intensity = 2 steps as well. 32 * 33 * Intensity 1: Verify that the tree's height is consistent throughout. 34 * Intensity 2: Verify that a core node's children's parent pointers point 35 * to the core node. 36 * Intensity 3: Verify that the total number of elements in the tree matches the 37 * sum of the number of elements in each node. Also verifies that each node's 38 * count obeys the invariants (less than or equal to maximum value, greater than 39 * or equal to half the maximum minus one). 40 * Intensity 4: Verify that each element compares less than the element 41 * immediately after it and greater than the one immediately before it using the 42 * comparator function. For core nodes, also checks that each element is greater 43 * than the last element in the first of the two nodes it separates, and less 44 * than the first element in the second of the two nodes. 
 * Intensity 5: Verifies, if ZFS_DEBUG is defined, that all unused memory inside
 * of each node is poisoned appropriately. Note that poisoning always occurs if
 * ZFS_DEBUG is set, so it is safe to set the intensity to 5 during normal
 * operation.
 *
 * Intensity 4 and 5 are particularly expensive to perform; the previous levels
 * are a few memory operations per node, while these levels require multiple
 * operations per element. In addition, when creating large btrees, these
 * operations are called at every step, resulting in extremely slow operation
 * (while the asymptotic complexity of the other steps is the same, the
 * importance of the constant factors cannot be denied).
 */
uint_t zfs_btree_verify_intensity = 0;

/*
 * Convenience functions to silence warnings from memcpy/memmove's
 * return values and change argument order to src, dest.
 */
static void
bcpy(const void *src, void *dest, size_t size)
{
	(void) memcpy(dest, src, size);
}

static void
bmov(const void *src, void *dest, size_t size)
{
	(void) memmove(dest, src, size);
}

/*
 * Core (interior) nodes are distinguished from leaves by storing the
 * sentinel -1 in bth_first; leaves use bth_first as the index of their
 * first valid element.
 */
static boolean_t
zfs_btree_is_core(struct zfs_btree_hdr *hdr)
{
	return (hdr->bth_first == -1);
}

/*
 * Pattern written over unused child pointers in debug builds; sized to
 * the platform's pointer width.
 */
#ifdef _ILP32
#define	BTREE_POISON 0xabadb10c
#else
#define	BTREE_POISON 0xabadb10cdeadbeef
#endif

/*
 * Poison all currently-unused space in a node (ZFS_DEBUG builds only):
 * unused child pointers in a core node are set to BTREE_POISON, and
 * unused element bytes in either node type are filled with 0x0f.
 */
static void
zfs_btree_poison_node(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
{
#ifdef ZFS_DEBUG
	size_t size = tree->bt_elem_size;
	if (zfs_btree_is_core(hdr)) {
		zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
		/*
		 * A core node with bth_count elements has bth_count + 1
		 * valid children; everything past those is unused.
		 */
		for (uint32_t i = hdr->bth_count + 1; i <= BTREE_CORE_ELEMS;
		    i++) {
			node->btc_children[i] =
			    (zfs_btree_hdr_t *)BTREE_POISON;
		}
		(void) memset(node->btc_elems + hdr->bth_count * size, 0x0f,
		    (BTREE_CORE_ELEMS - hdr->bth_count) * size);
	} else {
		zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
		/*
		 * A leaf has unused space both before bth_first and after
		 * its last valid element; poison both regions.
		 */
		(void) memset(leaf->btl_elems, 0x0f, hdr->bth_first * size);
		(void) memset(leaf->btl_elems +
		    (hdr->bth_first + hdr->bth_count) * size, 0x0f,
		    tree->bt_leaf_size - offsetof(zfs_btree_leaf_t, btl_elems) -
		    (hdr->bth_first + hdr->bth_count) * size);
	}
#endif
}

/*
 * Poison count element slots (and, for core nodes, the corresponding
 * child pointers) starting at logical index idx. Used after elements
 * are removed or shifted away so stale data is never mistaken for live.
 */
static inline void
zfs_btree_poison_node_at(zfs_btree_t *tree, zfs_btree_hdr_t *hdr,
    uint32_t idx, uint32_t count)
{
#ifdef ZFS_DEBUG
	size_t size = tree->bt_elem_size;
	if (zfs_btree_is_core(hdr)) {
		ASSERT3U(idx, >=, hdr->bth_count);
		ASSERT3U(idx, <=, BTREE_CORE_ELEMS);
		ASSERT3U(idx + count, <=, BTREE_CORE_ELEMS);
		zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
		for (uint32_t i = 1; i <= count; i++) {
			node->btc_children[idx + i] =
			    (zfs_btree_hdr_t *)BTREE_POISON;
		}
		(void) memset(node->btc_elems + idx * size, 0x0f, count * size);
	} else {
		ASSERT3U(idx, <=, tree->bt_leaf_cap);
		ASSERT3U(idx + count, <=, tree->bt_leaf_cap);
		zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
		/* Leaf indices are relative to bth_first. */
		(void) memset(leaf->btl_elems +
		    (hdr->bth_first + idx) * size, 0x0f, count * size);
	}
#endif
}

/*
 * Verify (via VERIFY, i.e. even in non-debug kernels when compiled with
 * ZFS_DEBUG) that the slot at logical index idx still carries poison,
 * i.e. that it has not been written to while believed unused.
 */
static inline void
zfs_btree_verify_poison_at(zfs_btree_t *tree, zfs_btree_hdr_t *hdr,
    uint32_t idx)
{
#ifdef ZFS_DEBUG
	size_t size = tree->bt_elem_size;
	if (zfs_btree_is_core(hdr)) {
		ASSERT3U(idx, <, BTREE_CORE_ELEMS);
		zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
		zfs_btree_hdr_t *cval = (zfs_btree_hdr_t *)BTREE_POISON;
		VERIFY3P(node->btc_children[idx + 1], ==, cval);
		for (size_t i = 0; i < size; i++)
			VERIFY3U(node->btc_elems[idx * size + i], ==, 0x0f);
	} else {
		ASSERT3U(idx, <, tree->bt_leaf_cap);
		zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
		/*
		 * Once bth_first is accounted for, this slot would lie past
		 * the end of the leaf's element buffer, so it was never
		 * poisoned and there is nothing to check.
		 */
		if (idx >= tree->bt_leaf_cap - hdr->bth_first)
			return;
		for (size_t i = 0; i < size; i++) {
			VERIFY3U(leaf->btl_elems[(hdr->bth_first + idx)
			    * size + i], ==, 0x0f);
		}
	}
#endif
}

/* Module init: create the kmem cache shared by all default-size leaves. */
void
zfs_btree_init(void)
{
	zfs_btree_leaf_cache = kmem_cache_create("zfs_btree_leaf_cache",
	    BTREE_LEAF_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);
}

/* Module teardown counterpart of zfs_btree_init(). */
void
zfs_btree_fini(void)
{
	kmem_cache_destroy(zfs_btree_leaf_cache);
}

/*
 * Default-size leaves come from the shared kmem cache; trees created
 * with a custom leaf size via zfs_btree_create_custom() fall back to
 * plain kmem_alloc().
 */
static void *
zfs_btree_leaf_alloc(zfs_btree_t *tree)
{
	if (tree->bt_leaf_size == BTREE_LEAF_SIZE)
		return (kmem_cache_alloc(zfs_btree_leaf_cache, KM_SLEEP));
	else
		return (kmem_alloc(tree->bt_leaf_size, KM_SLEEP));
}

/* Free a leaf back to whichever allocator zfs_btree_leaf_alloc() used. */
static void
zfs_btree_leaf_free(zfs_btree_t *tree, void *ptr)
{
	if (tree->bt_leaf_size == BTREE_LEAF_SIZE)
		return (kmem_cache_free(zfs_btree_leaf_cache, ptr));
	else
		return (kmem_free(ptr, tree->bt_leaf_size));
}

/* Initialize a btree with the default leaf size (BTREE_LEAF_SIZE). */
void
zfs_btree_create(zfs_btree_t *tree, int (*compar) (const void *, const void *),
    bt_find_in_buf_f bt_find_in_buf, size_t size)
{
	zfs_btree_create_custom(tree, compar, bt_find_in_buf, size,
	    BTREE_LEAF_SIZE);
}

static void *
zfs_btree_find_in_buf(zfs_btree_t *tree, uint8_t *buf, uint32_t nelems,
    const void *value, zfs_btree_index_t *where);

/*
 * Initialize a btree with a caller-chosen leaf size.
 *
 *   compar          - element comparator (memcmp-style return value)
 *   bt_find_in_buf  - optional specialized in-buffer search; NULL selects
 *                     the generic binary search below
 *   size            - size of one element in bytes
 *   lsize           - total leaf node size in bytes
 */
void
zfs_btree_create_custom(zfs_btree_t *tree,
    int (*compar) (const void *, const void *),
    bt_find_in_buf_f bt_find_in_buf,
    size_t size, size_t lsize)
{
	size_t esize = lsize - offsetof(zfs_btree_leaf_t, btl_elems);

	/* A leaf must hold at least two elements for splits to work. */
	ASSERT3U(size, <=, esize / 2);
	memset(tree, 0, sizeof (*tree));
	tree->bt_compar = compar;
	tree->bt_find_in_buf = (bt_find_in_buf == NULL) ?
	    zfs_btree_find_in_buf : bt_find_in_buf;
	tree->bt_elem_size = size;
	tree->bt_leaf_size = lsize;
	/* Round capacity down to an even element count. */
	tree->bt_leaf_cap = P2ALIGN_TYPED(esize / size, 2, size_t);
	/* Height -1 denotes an empty tree. */
	tree->bt_height = -1;
	tree->bt_bulk = NULL;
}

/*
 * Find value in the array of elements provided. Uses a simple binary search.
 * On a hit, returns a pointer to the matching element and records its index
 * in where (bti_before = B_FALSE). On a miss, returns NULL and records the
 * insertion point (bti_before = B_TRUE).
 */
static void *
zfs_btree_find_in_buf(zfs_btree_t *tree, uint8_t *buf, uint32_t nelems,
    const void *value, zfs_btree_index_t *where)
{
	uint32_t max = nelems;
	uint32_t min = 0;
	while (max > min) {
		uint32_t idx = (min + max) / 2;
		uint8_t *cur = buf + idx * tree->bt_elem_size;
		int comp = tree->bt_compar(cur, value);
		if (comp < 0) {
			min = idx + 1;
		} else if (comp > 0) {
			max = idx;
		} else {
			where->bti_offset = idx;
			where->bti_before = B_FALSE;
			return (cur);
		}
	}

	/* Not found; max is where the value would be inserted. */
	where->bti_offset = max;
	where->bti_before = B_TRUE;
	return (NULL);
}

/*
 * Find the given value in the tree. where may be passed as null to use as a
 * membership test or if the btree is being used as a map.
 */
void *
zfs_btree_find(zfs_btree_t *tree, const void *value, zfs_btree_index_t *where)
{
	if (tree->bt_height == -1) {
		/* Empty tree: report an insertion point of "front". */
		if (where != NULL) {
			where->bti_node = NULL;
			where->bti_offset = 0;
		}
		ASSERT0(tree->bt_num_elems);
		return (NULL);
	}

	/*
	 * If we're in bulk-insert mode, we check the last spot in the tree
	 * and the last leaf in the tree before doing the normal search,
	 * because for most workloads the vast majority of finds in
	 * bulk-insert mode are to insert new elements.
	 */
	zfs_btree_index_t idx;
	size_t size = tree->bt_elem_size;
	if (tree->bt_bulk != NULL) {
		zfs_btree_leaf_t *last_leaf = tree->bt_bulk;
		/* Compare against the very last element in the tree. */
		int comp = tree->bt_compar(last_leaf->btl_elems +
		    (last_leaf->btl_hdr.bth_first +
		    last_leaf->btl_hdr.bth_count - 1) * size, value);
		if (comp < 0) {
			/*
			 * If what they're looking for is after the last
			 * element, it's not in the tree.
			 */
			if (where != NULL) {
				where->bti_node = (zfs_btree_hdr_t *)last_leaf;
				where->bti_offset =
				    last_leaf->btl_hdr.bth_count;
				where->bti_before = B_TRUE;
			}
			return (NULL);
		} else if (comp == 0) {
			if (where != NULL) {
				where->bti_node = (zfs_btree_hdr_t *)last_leaf;
				where->bti_offset =
				    last_leaf->btl_hdr.bth_count - 1;
				where->bti_before = B_FALSE;
			}
			return (last_leaf->btl_elems +
			    (last_leaf->btl_hdr.bth_first +
			    last_leaf->btl_hdr.bth_count - 1) * size);
		}
		if (tree->bt_compar(last_leaf->btl_elems +
		    last_leaf->btl_hdr.bth_first * size, value) <= 0) {
			/*
			 * If what they're looking for is after the first
			 * element in the last leaf, it's in the last leaf or
			 * it's not in the tree.
			 */
			void *d = tree->bt_find_in_buf(tree,
			    last_leaf->btl_elems +
			    last_leaf->btl_hdr.bth_first * size,
			    last_leaf->btl_hdr.bth_count, value, &idx);

			if (where != NULL) {
				idx.bti_node = (zfs_btree_hdr_t *)last_leaf;
				*where = idx;
			}
			return (d);
		}
	}

	zfs_btree_core_t *node = NULL;
	uint32_t child = 0;
	uint32_t depth = 0;

	/*
	 * Iterate down the tree, finding which child the value should be in
	 * by comparing with the separators.
	 */
	for (node = (zfs_btree_core_t *)tree->bt_root; depth < tree->bt_height;
	    node = (zfs_btree_core_t *)node->btc_children[child], depth++) {
		ASSERT3P(node, !=, NULL);
		void *d = tree->bt_find_in_buf(tree, node->btc_elems,
		    node->btc_hdr.bth_count, value, &idx);
		EQUIV(d != NULL, !idx.bti_before);
		if (d != NULL) {
			/* The value is a separator in this core node. */
			if (where != NULL) {
				idx.bti_node = (zfs_btree_hdr_t *)node;
				*where = idx;
			}
			return (d);
		}
		ASSERT(idx.bti_before);
		child = idx.bti_offset;
	}

	/*
	 * The value is in this leaf, or it would be if it were in the
	 * tree. Find its proper location and return it.
	 */
	zfs_btree_leaf_t *leaf = (depth == 0 ?
	    (zfs_btree_leaf_t *)tree->bt_root : (zfs_btree_leaf_t *)node);
	void *d = tree->bt_find_in_buf(tree, leaf->btl_elems +
	    leaf->btl_hdr.bth_first * size,
	    leaf->btl_hdr.bth_count, value, &idx);

	if (where != NULL) {
		idx.bti_node = (zfs_btree_hdr_t *)leaf;
		*where = idx;
	}

	return (d);
}

/*
 * To explain the following functions, it is useful to understand the four
 * kinds of shifts used in btree operation. First, a shift is a movement of
 * elements within a node. It is used to create gaps for inserting new
 * elements and children, or cover gaps created when things are removed. A
 * shift has two fundamental properties, each of which can be one of two
 * values, making four types of shifts. There is the direction of the shift
 * (left or right) and the shape of the shift (parallelogram or isosceles
 * trapezoid (shortened to trapezoid hereafter)). The shape distinction only
 * applies to shifts of core nodes.
 *
 * The names derive from the following imagining of the layout of a node:
 *
 *  Elements:     *   *   *   *   *   *   *   ...   *   *   *
 *  Children:   *   *   *   *   *   *   *   *   ...   *   *   *
 *
 * This layout follows from the fact that the elements act as separators
 * between pairs of children, and that children root subtrees "below" the
 * current node. A left and right shift are fairly self-explanatory; a left
 * shift moves things to the left, while a right shift moves things to the
 * right. A parallelogram shift is a shift with the same number of elements
 * and children being moved, while a trapezoid shift is a shift that moves one
 * more children than elements. An example follows:
 *
 * A parallelogram shift could contain the following:
 *      _______________
 *      \*   *   *   * \ *   *   *   ...   *   *   *
 *    * \ *   *   *   *\  *   *   *   ...   *   *   *
 *      ---------------
 * A trapezoid shift could contain the following:
 *        ___________
 *      * / *   *   * \ *   *   *   ...
 *   *   *
 *    * / *   *   *  *\  *   *   *   ...   *   *   *
 *      ---------------
 *
 * Note that a parallelogram shift is always shaped like a "left-leaning"
 * parallelogram, where the starting index of the children being moved is
 * always one higher than the starting index of the elements being moved. No
 * "right-leaning" parallelogram shifts are needed (shifts where the starting
 * element index and starting child index being moved are the same) to achieve
 * any btree operations, so we ignore them.
 */

enum bt_shift_shape {
	BSS_TRAPEZOID,
	BSS_PARALLELOGRAM
};

enum bt_shift_direction {
	BSD_LEFT,
	BSD_RIGHT
};

/*
 * Shift elements and children in the provided core node by off spots. The
 * first element moved is idx, and count elements are moved. The shape of the
 * shift is determined by shape. The direction is determined by dir.
 */
static inline void
bt_shift_core(zfs_btree_t *tree, zfs_btree_core_t *node, uint32_t idx,
    uint32_t count, uint32_t off, enum bt_shift_shape shape,
    enum bt_shift_direction dir)
{
	size_t size = tree->bt_elem_size;
	ASSERT(zfs_btree_is_core(&node->btc_hdr));

	/* Move the elements. */
	uint8_t *e_start = node->btc_elems + idx * size;
	uint8_t *e_out = (dir == BSD_LEFT ? e_start - off * size :
	    e_start + off * size);
	bmov(e_start, e_out, count * size);

	/*
	 * Move the children. A parallelogram starts one child later than
	 * its elements and moves the same number; a trapezoid starts at the
	 * same index and moves one child more (see the diagram above).
	 */
	zfs_btree_hdr_t **c_start = node->btc_children + idx +
	    (shape == BSS_TRAPEZOID ? 0 : 1);
	zfs_btree_hdr_t **c_out = (dir == BSD_LEFT ? c_start - off :
	    c_start + off);
	uint32_t c_count = count + (shape == BSS_TRAPEZOID ? 1 : 0);
	bmov(c_start, c_out, c_count * sizeof (*c_start));
}

/*
 * Shift elements and children in the provided core node left by one spot.
 * The first element moved is idx, and count elements are moved. The
 * shape of the shift is determined by the shape argument (trapezoid or
 * parallelogram).
 */
static inline void
bt_shift_core_left(zfs_btree_t *tree, zfs_btree_core_t *node, uint32_t idx,
    uint32_t count, enum bt_shift_shape shape)
{
	bt_shift_core(tree, node, idx, count, 1, shape, BSD_LEFT);
}

/*
 * Shift elements and children in the provided core node right by one spot.
 * Starts with elements[idx] and children[idx] and one more child than element.
 */
static inline void
bt_shift_core_right(zfs_btree_t *tree, zfs_btree_core_t *node, uint32_t idx,
    uint32_t count, enum bt_shift_shape shape)
{
	bt_shift_core(tree, node, idx, count, 1, shape, BSD_RIGHT);
}

/*
 * Shift elements in the provided leaf node by off spots.
 * The first element moved is idx, and count elements are moved. The direction
 * is determined by dir. Leaf indices are relative to bth_first; leaves have
 * no children, so only the elements move.
 */
static inline void
bt_shift_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *node, uint32_t idx,
    uint32_t count, uint32_t off, enum bt_shift_direction dir)
{
	size_t size = tree->bt_elem_size;
	zfs_btree_hdr_t *hdr = &node->btl_hdr;
	ASSERT(!zfs_btree_is_core(hdr));

	if (count == 0)
		return;
	uint8_t *start = node->btl_elems + (hdr->bth_first + idx) * size;
	uint8_t *out = (dir == BSD_LEFT ? start - off * size :
	    start + off * size);
	bmov(start, out, count * size);
}

/*
 * Grow leaf for n new elements before idx. The gap is opened on whichever
 * side (or both) has room, preferring the side that moves fewer elements.
 */
static void
bt_grow_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, uint32_t idx,
    uint32_t n)
{
	zfs_btree_hdr_t *hdr = &leaf->btl_hdr;
	ASSERT(!zfs_btree_is_core(hdr));
	ASSERT3U(idx, <=, hdr->bth_count);
	uint32_t capacity = tree->bt_leaf_cap;
	ASSERT3U(hdr->bth_count + n, <=, capacity);
	/* cl/cr: is there room to grow left / right of the used region? */
	boolean_t cl = (hdr->bth_first >= n);
	boolean_t cr = (hdr->bth_first + hdr->bth_count + n <= capacity);

	if (cl && (!cr || idx <= hdr->bth_count / 2)) {
		/* Grow left. */
		hdr->bth_first -= n;
		bt_shift_leaf(tree, leaf, n, idx, n, BSD_LEFT);
	} else if (cr) {
		/* Grow right. */
		bt_shift_leaf(tree, leaf, idx, hdr->bth_count - idx, n,
		    BSD_RIGHT);
	} else {
		/* Grow both ways: recenter the used region, fn spots left. */
		uint32_t fn = hdr->bth_first -
		    (capacity - (hdr->bth_count + n)) / 2;
		hdr->bth_first -= fn;
		bt_shift_leaf(tree, leaf, fn, idx, fn, BSD_LEFT);
		bt_shift_leaf(tree, leaf, fn + idx, hdr->bth_count - idx,
		    n - fn, BSD_RIGHT);
	}
	hdr->bth_count += n;
}

/*
 * Shrink leaf for n elements starting from idx. Closes the gap from
 * whichever side moves fewer elements, and poisons the freed slots.
 */
static void
bt_shrink_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, uint32_t idx,
    uint32_t n)
{
	zfs_btree_hdr_t *hdr = &leaf->btl_hdr;
	ASSERT(!zfs_btree_is_core(hdr));
	ASSERT3U(idx, <=, hdr->bth_count);
	ASSERT3U(idx + n, <=, hdr->bth_count);

	if (idx <= (hdr->bth_count - n) / 2) {
		/* Fewer elements before the gap: shift the prefix right. */
		bt_shift_leaf(tree, leaf, 0, idx, n, BSD_RIGHT);
		zfs_btree_poison_node_at(tree, hdr, 0, n);
		hdr->bth_first += n;
	} else {
		/* Fewer elements after the gap: shift the suffix left. */
		bt_shift_leaf(tree, leaf, idx + n, hdr->bth_count - idx - n, n,
		    BSD_LEFT);
		zfs_btree_poison_node_at(tree, hdr, hdr->bth_count - n, n);
	}
	hdr->bth_count -= n;
}

/*
 * Move children and elements from one core node to another. The shape
 * parameter behaves the same as it does in the shift logic.
 */
static inline void
bt_transfer_core(zfs_btree_t *tree, zfs_btree_core_t *source, uint32_t sidx,
    uint32_t count, zfs_btree_core_t *dest, uint32_t didx,
    enum bt_shift_shape shape)
{
	size_t size = tree->bt_elem_size;
	ASSERT(zfs_btree_is_core(&source->btc_hdr));
	ASSERT(zfs_btree_is_core(&dest->btc_hdr));

	bcpy(source->btc_elems + sidx * size, dest->btc_elems + didx * size,
	    count * size);

	uint32_t c_count = count + (shape == BSS_TRAPEZOID ? 1 : 0);
	bcpy(source->btc_children + sidx + (shape == BSS_TRAPEZOID ? 0 : 1),
	    dest->btc_children + didx + (shape == BSS_TRAPEZOID ? 0 : 1),
	    c_count * sizeof (*source->btc_children));
}

/*
 * Copy count elements from one leaf to another; indices in both leaves are
 * relative to their respective bth_first.
 */
static inline void
bt_transfer_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *source, uint32_t sidx,
    uint32_t count, zfs_btree_leaf_t *dest, uint32_t didx)
{
	size_t size = tree->bt_elem_size;
	ASSERT(!zfs_btree_is_core(&source->btl_hdr));
	ASSERT(!zfs_btree_is_core(&dest->btl_hdr));

	bcpy(source->btl_elems + (source->btl_hdr.bth_first + sidx) * size,
	    dest->btl_elems + (dest->btl_hdr.bth_first + didx) * size,
	    count * size);
}

/*
 * Find the first element in the subtree rooted at hdr, return its value and
 * put its location in where if non-null.
 */
static void *
zfs_btree_first_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr,
    zfs_btree_index_t *where)
{
	zfs_btree_hdr_t *node;

	/* Descend through leftmost children until we reach a leaf. */
	for (node = hdr; zfs_btree_is_core(node);
	    node = ((zfs_btree_core_t *)node)->btc_children[0])
		;

	ASSERT(!zfs_btree_is_core(node));
	zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)node;
	if (where != NULL) {
		where->bti_node = node;
		where->bti_offset = 0;
		where->bti_before = B_FALSE;
	}
	return (&leaf->btl_elems[node->bth_first * tree->bt_elem_size]);
}

/*
 * Insert an element and a child into a core node at the given offset.
 * The caller must have ensured there is room (bth_count < BTREE_CORE_ELEMS);
 * the new child is placed immediately to the right of the new element.
 */
static void
zfs_btree_insert_core_impl(zfs_btree_t *tree, zfs_btree_core_t *parent,
    uint32_t offset, zfs_btree_hdr_t *new_node, void *buf)
{
	size_t size = tree->bt_elem_size;
	zfs_btree_hdr_t *par_hdr = &parent->btc_hdr;
	ASSERT3P(par_hdr, ==, new_node->bth_parent);
	ASSERT3U(par_hdr->bth_count, <, BTREE_CORE_ELEMS);

	if (zfs_btree_verify_intensity >= 5) {
		zfs_btree_verify_poison_at(tree, par_hdr,
		    par_hdr->bth_count);
	}
	/* Shift existing elements and children */
	uint32_t count = par_hdr->bth_count - offset;
	bt_shift_core_right(tree, parent, offset, count,
	    BSS_PARALLELOGRAM);

	/* Insert new values */
	parent->btc_children[offset + 1] = new_node;
	bcpy(buf, parent->btc_elems + offset * size, size);
	par_hdr->bth_count++;
}

/*
 * Insert new_node into the parent of old_node directly after old_node, with
 * buf as the dividing element between the two.
 */
static void
zfs_btree_insert_into_parent(zfs_btree_t *tree, zfs_btree_hdr_t *old_node,
    zfs_btree_hdr_t *new_node, void *buf)
{
	ASSERT3P(old_node->bth_parent, ==, new_node->bth_parent);
	size_t size = tree->bt_elem_size;
	zfs_btree_core_t *parent = old_node->bth_parent;

	/*
	 * If this is the root node we were splitting, we create a new root
	 * and increase the height of the tree.
	 */
	if (parent == NULL) {
		ASSERT3P(old_node, ==, tree->bt_root);
		tree->bt_num_nodes++;
		zfs_btree_core_t *new_root =
		    kmem_alloc(sizeof (zfs_btree_core_t) + BTREE_CORE_ELEMS *
		    size, KM_SLEEP);
		zfs_btree_hdr_t *new_root_hdr = &new_root->btc_hdr;
		new_root_hdr->bth_parent = NULL;
		/* -1 marks the new root as a core node. */
		new_root_hdr->bth_first = -1;
		new_root_hdr->bth_count = 1;

		old_node->bth_parent = new_node->bth_parent = new_root;
		new_root->btc_children[0] = old_node;
		new_root->btc_children[1] = new_node;
		bcpy(buf, new_root->btc_elems, size);

		tree->bt_height++;
		tree->bt_root = new_root_hdr;
		zfs_btree_poison_node(tree, new_root_hdr);
		return;
	}

	/*
	 * Since we have the new separator, binary search for where to put
	 * new_node.
	 */
	zfs_btree_hdr_t *par_hdr = &parent->btc_hdr;
	zfs_btree_index_t idx;
	ASSERT(zfs_btree_is_core(par_hdr));
	VERIFY3P(tree->bt_find_in_buf(tree, parent->btc_elems,
	    par_hdr->bth_count, buf, &idx), ==, NULL);
	ASSERT(idx.bti_before);
	uint32_t offset = idx.bti_offset;
	ASSERT3U(offset, <=, par_hdr->bth_count);
	ASSERT3P(parent->btc_children[offset], ==, old_node);

	/*
	 * If the parent isn't full, shift things to accommodate our insertions
	 * and return.
	 */
	if (par_hdr->bth_count != BTREE_CORE_ELEMS) {
		zfs_btree_insert_core_impl(tree, parent, offset, new_node, buf);
		return;
	}

	/*
	 * We need to split this core node into two. Currently there are
	 * BTREE_CORE_ELEMS + 1 child nodes, and we are adding one for
	 * BTREE_CORE_ELEMS + 2. Some of the children will be part of the
	 * current node, and the others will be moved to the new core node.
	 * There are BTREE_CORE_ELEMS + 1 elements including the new one. One
	 * will be used as the new separator in our parent, and the others
	 * will be split among the two core nodes.
	 *
	 * Usually we will split the node in half evenly, with
	 * BTREE_CORE_ELEMS/2 elements in each node. If we're bulk loading, we
	 * instead move only about a quarter of the elements (and children) to
	 * the new node. Since the average state after a long time is a 3/4
	 * full node, shortcutting directly to that state improves efficiency.
	 *
	 * We do this in two stages: first we split into two nodes, and then we
	 * reuse our existing logic to insert the new element and child.
	 */
	uint32_t move_count = MAX((BTREE_CORE_ELEMS / (tree->bt_bulk == NULL ?
	    2 : 4)) - 1, 2);
	uint32_t keep_count = BTREE_CORE_ELEMS - move_count - 1;
	ASSERT3U(BTREE_CORE_ELEMS - move_count, >=, 2);
	tree->bt_num_nodes++;
	zfs_btree_core_t *new_parent = kmem_alloc(sizeof (zfs_btree_core_t) +
	    BTREE_CORE_ELEMS * size, KM_SLEEP);
	zfs_btree_hdr_t *new_par_hdr = &new_parent->btc_hdr;
	new_par_hdr->bth_parent = par_hdr->bth_parent;
	new_par_hdr->bth_first = -1;
	new_par_hdr->bth_count = move_count;
	zfs_btree_poison_node(tree, new_par_hdr);

	par_hdr->bth_count = keep_count;

	/* Trapezoid: move_count elements plus move_count + 1 children. */
	bt_transfer_core(tree, parent, keep_count + 1, move_count, new_parent,
	    0, BSS_TRAPEZOID);

	/* Store the new separator in a buffer. */
	uint8_t *tmp_buf = kmem_alloc(size, KM_SLEEP);
	bcpy(parent->btc_elems + keep_count * size, tmp_buf,
	    size);
	zfs_btree_poison_node(tree, par_hdr);

	if (offset < keep_count) {
		/* Insert the new node into the left half */
		zfs_btree_insert_core_impl(tree, parent, offset, new_node,
		    buf);

		/*
		 * Move the new separator to the existing buffer.
		 */
		bcpy(tmp_buf, buf, size);
	} else if (offset > keep_count) {
		/* Insert the new node into the right half */
		new_node->bth_parent = new_parent;
		zfs_btree_insert_core_impl(tree, new_parent,
		    offset - keep_count - 1, new_node, buf);

		/*
		 * Move the new separator to the existing buffer.
		 */
		bcpy(tmp_buf, buf, size);
	} else {
		/*
		 * Move the new separator into the right half, and replace it
		 * with buf. We also need to shift back the elements in the
		 * right half to accommodate new_node.
		 */
		bt_shift_core_right(tree, new_parent, 0, move_count,
		    BSS_TRAPEZOID);
		new_parent->btc_children[0] = new_node;
		bcpy(tmp_buf, new_parent->btc_elems, size);
		new_par_hdr->bth_count++;
	}
	kmem_free(tmp_buf, size);
	zfs_btree_poison_node(tree, par_hdr);

	/* Re-parent every child that moved to the new core node. */
	for (uint32_t i = 0; i <= new_parent->btc_hdr.bth_count; i++)
		new_parent->btc_children[i]->bth_parent = new_parent;

	for (uint32_t i = 0; i <= parent->btc_hdr.bth_count; i++)
		ASSERT3P(parent->btc_children[i]->bth_parent, ==, parent);

	/*
	 * Now that the node is split, we need to insert the new node into its
	 * parent. This may cause further splitting.
	 */
	zfs_btree_insert_into_parent(tree, &parent->btc_hdr,
	    &new_parent->btc_hdr, buf);
}

/* Insert an element into a leaf node at the given offset. */
static void
zfs_btree_insert_leaf_impl(zfs_btree_t *tree, zfs_btree_leaf_t *leaf,
    uint32_t idx, const void *value)
{
	size_t size = tree->bt_elem_size;
	zfs_btree_hdr_t *hdr = &leaf->btl_hdr;
	ASSERT3U(leaf->btl_hdr.bth_count, <, tree->bt_leaf_cap);

	if (zfs_btree_verify_intensity >= 5) {
		zfs_btree_verify_poison_at(tree, &leaf->btl_hdr,
		    leaf->btl_hdr.bth_count);
	}

	bt_grow_leaf(tree, leaf, idx, 1);
	uint8_t *start = leaf->btl_elems + (hdr->bth_first + idx) * size;
	bcpy(value, start, size);
}

static void
zfs_btree_verify_order_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr);

/*
 * Helper function for inserting a new value into leaf at the given index.
 * Splits the leaf if it is full; any resulting split propagates upward via
 * zfs_btree_insert_into_parent().
 */
static void
zfs_btree_insert_into_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf,
    const void *value, uint32_t idx)
{
	size_t size = tree->bt_elem_size;
	uint32_t capacity = tree->bt_leaf_cap;

	/*
	 * If the leaf isn't full, shift the elements after idx and insert
	 * value.
	 */
	if (leaf->btl_hdr.bth_count != capacity) {
		zfs_btree_insert_leaf_impl(tree, leaf, idx, value);
		return;
	}

	/*
	 * Otherwise, we split the leaf node into two nodes. If we're not bulk
	 * inserting, each is of size (capacity / 2). If we are bulk
	 * inserting, we move a quarter of the elements to the new node so
	 * inserts into the old node don't cause immediate splitting but the
	 * tree stays relatively dense. Since the average state after a long
	 * time is a 3/4 full node, shortcutting directly to that state
	 * improves efficiency. At the end of the bulk insertion process
	 * we'll need to go through and fix up any nodes (the last leaf and
	 * its ancestors, potentially) that are below the minimum.
	 *
	 * In either case, we're left with one extra element. The leftover
	 * element will become the new dividing element between the two nodes.
	 */
	uint32_t move_count = MAX(capacity / (tree->bt_bulk ? 4 : 2), 1) - 1;
	uint32_t keep_count = capacity - move_count - 1;
	ASSERT3U(keep_count, >=, 1);
	/* If we insert on left, move one more to keep leaves balanced. */
	if (idx < keep_count) {
		keep_count--;
		move_count++;
	}
	tree->bt_num_nodes++;
	zfs_btree_leaf_t *new_leaf = zfs_btree_leaf_alloc(tree);
	zfs_btree_hdr_t *new_hdr = &new_leaf->btl_hdr;
	new_hdr->bth_parent = leaf->btl_hdr.bth_parent;
	/*
	 * Position the new leaf's used region so future growth has room on
	 * both sides (centered unless bulk loading); leave one extra slot
	 * in front when the pending insert will land near its beginning.
	 */
	new_hdr->bth_first = (tree->bt_bulk ? 0 : capacity / 4) +
	    (idx >= keep_count && idx <= keep_count + move_count / 2);
	new_hdr->bth_count = move_count;
	zfs_btree_poison_node(tree, new_hdr);

	/* If we split the rightmost bulk leaf, the new leaf takes its role. */
	if (tree->bt_bulk != NULL && leaf == tree->bt_bulk)
		tree->bt_bulk = new_leaf;

	/* Copy the back part to the new leaf. */
	bt_transfer_leaf(tree, leaf, keep_count + 1, move_count, new_leaf, 0);

	/* We store the new separator in a buffer we control for simplicity. */
	uint8_t *buf = kmem_alloc(size, KM_SLEEP);
	bcpy(leaf->btl_elems + (leaf->btl_hdr.bth_first + keep_count) * size,
	    buf, size);

	bt_shrink_leaf(tree, leaf, keep_count, 1 + move_count);

	if (idx < keep_count) {
		/* Insert into the existing leaf. */
		zfs_btree_insert_leaf_impl(tree, leaf, idx, value);
	} else if (idx > keep_count) {
		/* Insert into the new leaf. */
		zfs_btree_insert_leaf_impl(tree, new_leaf, idx - keep_count -
		    1, value);
	} else {
		/*
		 * Insert planned separator into the new leaf, and use
		 * the new value as the new separator.
		 */
		zfs_btree_insert_leaf_impl(tree, new_leaf, 0, buf);
		bcpy(value, buf, size);
	}

	/*
	 * Now that the node is split, we need to insert the new node into its
	 * parent. This may cause further splitting, but only of core nodes.
	 */
	zfs_btree_insert_into_parent(tree, &leaf->btl_hdr, &new_leaf->btl_hdr,
	    buf);
	kmem_free(buf, size);
}

/*
 * Return the index of hdr within its parent's btc_children array, found by
 * searching the parent for hdr's first element (which must not itself be
 * present in the parent).
 */
static uint32_t
zfs_btree_find_parent_idx(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
{
	void *buf;
	if (zfs_btree_is_core(hdr)) {
		buf = ((zfs_btree_core_t *)hdr)->btc_elems;
	} else {
		buf = ((zfs_btree_leaf_t *)hdr)->btl_elems +
		    hdr->bth_first * tree->bt_elem_size;
	}
	zfs_btree_index_t idx;
	zfs_btree_core_t *parent = hdr->bth_parent;
	VERIFY3P(tree->bt_find_in_buf(tree, parent->btc_elems,
	    parent->btc_hdr.bth_count, buf, &idx), ==, NULL);
	ASSERT(idx.bti_before);
	ASSERT3U(idx.bti_offset, <=, parent->btc_hdr.bth_count);
	ASSERT3P(parent->btc_children[idx.bti_offset], ==, hdr);
	return (idx.bti_offset);
}

/*
 * Take the b-tree out of bulk insert mode. During bulk-insert mode, some
 * nodes may violate the invariant that non-root nodes must be at least half
 * full. All nodes violating this invariant should be the last node in their
 * particular level. To correct the invariant, we take values from their left
 * neighbor until they are half full. They must have a left neighbor at their
 * level because the last node at a level is not the first node unless it's
 * the root.
 */
static void
zfs_btree_bulk_finish(zfs_btree_t *tree)
{
	ASSERT3P(tree->bt_bulk, !=, NULL);
	ASSERT3P(tree->bt_root, !=, NULL);
	zfs_btree_leaf_t *leaf = tree->bt_bulk;
	zfs_btree_hdr_t *hdr = &leaf->btl_hdr;
	zfs_btree_core_t *parent = hdr->bth_parent;
	size_t size = tree->bt_elem_size;
	uint32_t capacity = tree->bt_leaf_cap;

	/*
	 * The invariant doesn't apply to the root node, if that's the only
	 * node in the tree we're done.
	 */
	if (parent == NULL) {
		tree->bt_bulk = NULL;
		return;
	}

	/*
	 * First, take elements to rebalance the leaf node.
*/ 947 if (hdr->bth_count < capacity / 2) { 948 /* 949 * First, find the left neighbor. The simplest way to do this 950 * is to call zfs_btree_prev twice; the first time finds some 951 * ancestor of this node, and the second time finds the left 952 * neighbor. The ancestor found is the lowest common ancestor 953 * of leaf and the neighbor. 954 */ 955 zfs_btree_index_t idx = { 956 .bti_node = hdr, 957 .bti_offset = 0 958 }; 959 VERIFY3P(zfs_btree_prev(tree, &idx, &idx), !=, NULL); 960 ASSERT(zfs_btree_is_core(idx.bti_node)); 961 zfs_btree_core_t *common = (zfs_btree_core_t *)idx.bti_node; 962 uint32_t common_idx = idx.bti_offset; 963 964 VERIFY3P(zfs_btree_prev(tree, &idx, &idx), !=, NULL); 965 ASSERT(!zfs_btree_is_core(idx.bti_node)); 966 zfs_btree_leaf_t *l_neighbor = (zfs_btree_leaf_t *)idx.bti_node; 967 zfs_btree_hdr_t *l_hdr = idx.bti_node; 968 uint32_t move_count = (capacity / 2) - hdr->bth_count; 969 ASSERT3U(l_neighbor->btl_hdr.bth_count - move_count, >=, 970 capacity / 2); 971 972 if (zfs_btree_verify_intensity >= 5) { 973 for (uint32_t i = 0; i < move_count; i++) { 974 zfs_btree_verify_poison_at(tree, hdr, 975 leaf->btl_hdr.bth_count + i); 976 } 977 } 978 979 /* First, shift elements in leaf back. */ 980 bt_grow_leaf(tree, leaf, 0, move_count); 981 982 /* Next, move the separator from the common ancestor to leaf. */ 983 uint8_t *separator = common->btc_elems + common_idx * size; 984 uint8_t *out = leaf->btl_elems + 985 (hdr->bth_first + move_count - 1) * size; 986 bcpy(separator, out, size); 987 988 /* 989 * Now we move elements from the tail of the left neighbor to 990 * fill the remaining spots in leaf. 991 */ 992 bt_transfer_leaf(tree, l_neighbor, l_hdr->bth_count - 993 (move_count - 1), move_count - 1, leaf, 0); 994 995 /* 996 * Finally, move the new last element in the left neighbor to 997 * the separator. 
998 */ 999 bcpy(l_neighbor->btl_elems + (l_hdr->bth_first + 1000 l_hdr->bth_count - move_count) * size, separator, size); 1001 1002 /* Adjust the node's counts, and we're done. */ 1003 bt_shrink_leaf(tree, l_neighbor, l_hdr->bth_count - move_count, 1004 move_count); 1005 1006 ASSERT3U(l_hdr->bth_count, >=, capacity / 2); 1007 ASSERT3U(hdr->bth_count, >=, capacity / 2); 1008 } 1009 1010 /* 1011 * Now we have to rebalance any ancestors of leaf that may also 1012 * violate the invariant. 1013 */ 1014 capacity = BTREE_CORE_ELEMS; 1015 while (parent->btc_hdr.bth_parent != NULL) { 1016 zfs_btree_core_t *cur = parent; 1017 zfs_btree_hdr_t *hdr = &cur->btc_hdr; 1018 parent = hdr->bth_parent; 1019 /* 1020 * If the invariant isn't violated, move on to the next 1021 * ancestor. 1022 */ 1023 if (hdr->bth_count >= capacity / 2) 1024 continue; 1025 1026 /* 1027 * Because the smallest number of nodes we can move when 1028 * splitting is 2, we never need to worry about not having a 1029 * left sibling (a sibling is a neighbor with the same parent). 1030 */ 1031 uint32_t parent_idx = zfs_btree_find_parent_idx(tree, hdr); 1032 ASSERT3U(parent_idx, >, 0); 1033 zfs_btree_core_t *l_neighbor = 1034 (zfs_btree_core_t *)parent->btc_children[parent_idx - 1]; 1035 uint32_t move_count = (capacity / 2) - hdr->bth_count; 1036 ASSERT3U(l_neighbor->btc_hdr.bth_count - move_count, >=, 1037 capacity / 2); 1038 1039 if (zfs_btree_verify_intensity >= 5) { 1040 for (uint32_t i = 0; i < move_count; i++) { 1041 zfs_btree_verify_poison_at(tree, hdr, 1042 hdr->bth_count + i); 1043 } 1044 } 1045 /* First, shift things in the right node back. */ 1046 bt_shift_core(tree, cur, 0, hdr->bth_count, move_count, 1047 BSS_TRAPEZOID, BSD_RIGHT); 1048 1049 /* Next, move the separator to the right node. 
*/ 1050 uint8_t *separator = parent->btc_elems + ((parent_idx - 1) * 1051 size); 1052 uint8_t *e_out = cur->btc_elems + ((move_count - 1) * size); 1053 bcpy(separator, e_out, size); 1054 1055 /* 1056 * Now, move elements and children from the left node to the 1057 * right. We move one more child than elements. 1058 */ 1059 move_count--; 1060 uint32_t move_idx = l_neighbor->btc_hdr.bth_count - move_count; 1061 bt_transfer_core(tree, l_neighbor, move_idx, move_count, cur, 0, 1062 BSS_TRAPEZOID); 1063 1064 /* 1065 * Finally, move the last element in the left node to the 1066 * separator's position. 1067 */ 1068 move_idx--; 1069 bcpy(l_neighbor->btc_elems + move_idx * size, separator, size); 1070 1071 l_neighbor->btc_hdr.bth_count -= move_count + 1; 1072 hdr->bth_count += move_count + 1; 1073 1074 ASSERT3U(l_neighbor->btc_hdr.bth_count, >=, capacity / 2); 1075 ASSERT3U(hdr->bth_count, >=, capacity / 2); 1076 1077 zfs_btree_poison_node(tree, &l_neighbor->btc_hdr); 1078 1079 for (uint32_t i = 0; i <= hdr->bth_count; i++) 1080 cur->btc_children[i]->bth_parent = cur; 1081 } 1082 1083 tree->bt_bulk = NULL; 1084 zfs_btree_verify(tree); 1085 } 1086 1087 /* 1088 * Insert value into tree at the location specified by where. 1089 */ 1090 void 1091 zfs_btree_add_idx(zfs_btree_t *tree, const void *value, 1092 const zfs_btree_index_t *where) 1093 { 1094 zfs_btree_index_t idx = {0}; 1095 1096 /* If we're not inserting in the last leaf, end bulk insert mode. */ 1097 if (tree->bt_bulk != NULL) { 1098 if (where->bti_node != &tree->bt_bulk->btl_hdr) { 1099 zfs_btree_bulk_finish(tree); 1100 VERIFY3P(zfs_btree_find(tree, value, &idx), ==, NULL); 1101 where = &idx; 1102 } 1103 } 1104 1105 tree->bt_num_elems++; 1106 /* 1107 * If this is the first element in the tree, create a leaf root node 1108 * and add the value to it. 
1109 */ 1110 if (where->bti_node == NULL) { 1111 ASSERT3U(tree->bt_num_elems, ==, 1); 1112 ASSERT3S(tree->bt_height, ==, -1); 1113 ASSERT3P(tree->bt_root, ==, NULL); 1114 ASSERT0(where->bti_offset); 1115 1116 tree->bt_num_nodes++; 1117 zfs_btree_leaf_t *leaf = zfs_btree_leaf_alloc(tree); 1118 tree->bt_root = &leaf->btl_hdr; 1119 tree->bt_height++; 1120 1121 zfs_btree_hdr_t *hdr = &leaf->btl_hdr; 1122 hdr->bth_parent = NULL; 1123 hdr->bth_first = 0; 1124 hdr->bth_count = 0; 1125 zfs_btree_poison_node(tree, hdr); 1126 1127 zfs_btree_insert_into_leaf(tree, leaf, value, 0); 1128 tree->bt_bulk = leaf; 1129 } else if (!zfs_btree_is_core(where->bti_node)) { 1130 /* 1131 * If we're inserting into a leaf, go directly to the helper 1132 * function. 1133 */ 1134 zfs_btree_insert_into_leaf(tree, 1135 (zfs_btree_leaf_t *)where->bti_node, value, 1136 where->bti_offset); 1137 } else { 1138 /* 1139 * If we're inserting into a core node, we can't just shift 1140 * the existing element in that slot in the same node without 1141 * breaking our ordering invariants. Instead we place the new 1142 * value in the node at that spot and then insert the old 1143 * separator into the first slot in the subtree to the right. 1144 */ 1145 zfs_btree_core_t *node = (zfs_btree_core_t *)where->bti_node; 1146 1147 /* 1148 * We can ignore bti_before, because either way the value 1149 * should end up in bti_offset. 1150 */ 1151 uint32_t off = where->bti_offset; 1152 zfs_btree_hdr_t *subtree = node->btc_children[off + 1]; 1153 size_t size = tree->bt_elem_size; 1154 uint8_t *buf = kmem_alloc(size, KM_SLEEP); 1155 bcpy(node->btc_elems + off * size, buf, size); 1156 bcpy(value, node->btc_elems + off * size, size); 1157 1158 /* 1159 * Find the first slot in the subtree to the right, insert 1160 * there. 
1161 */ 1162 zfs_btree_index_t new_idx; 1163 VERIFY3P(zfs_btree_first_helper(tree, subtree, &new_idx), !=, 1164 NULL); 1165 ASSERT0(new_idx.bti_offset); 1166 ASSERT(!zfs_btree_is_core(new_idx.bti_node)); 1167 zfs_btree_insert_into_leaf(tree, 1168 (zfs_btree_leaf_t *)new_idx.bti_node, buf, 0); 1169 kmem_free(buf, size); 1170 } 1171 zfs_btree_verify(tree); 1172 } 1173 1174 /* 1175 * Return the first element in the tree, and put its location in where if 1176 * non-null. 1177 */ 1178 void * 1179 zfs_btree_first(zfs_btree_t *tree, zfs_btree_index_t *where) 1180 { 1181 if (tree->bt_height == -1) { 1182 ASSERT0(tree->bt_num_elems); 1183 return (NULL); 1184 } 1185 return (zfs_btree_first_helper(tree, tree->bt_root, where)); 1186 } 1187 1188 /* 1189 * Find the last element in the subtree rooted at hdr, return its value and 1190 * put its location in where if non-null. 1191 */ 1192 static void * 1193 zfs_btree_last_helper(zfs_btree_t *btree, zfs_btree_hdr_t *hdr, 1194 zfs_btree_index_t *where) 1195 { 1196 zfs_btree_hdr_t *node; 1197 1198 for (node = hdr; zfs_btree_is_core(node); node = 1199 ((zfs_btree_core_t *)node)->btc_children[node->bth_count]) 1200 ; 1201 1202 zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)node; 1203 if (where != NULL) { 1204 where->bti_node = node; 1205 where->bti_offset = node->bth_count - 1; 1206 where->bti_before = B_FALSE; 1207 } 1208 return (leaf->btl_elems + (node->bth_first + node->bth_count - 1) * 1209 btree->bt_elem_size); 1210 } 1211 1212 /* 1213 * Return the last element in the tree, and put its location in where if 1214 * non-null. 1215 */ 1216 void * 1217 zfs_btree_last(zfs_btree_t *tree, zfs_btree_index_t *where) 1218 { 1219 if (tree->bt_height == -1) { 1220 ASSERT0(tree->bt_num_elems); 1221 return (NULL); 1222 } 1223 return (zfs_btree_last_helper(tree, tree->bt_root, where)); 1224 } 1225 1226 /* 1227 * This function contains the logic to find the next node in the tree. 
 * A helper function is used because there are multiple internal consumers of
 * this logic. The done_func is used by zfs_btree_destroy_nodes to clean up each
 * node after we've finished with it.
 */
static void *
zfs_btree_next_helper(zfs_btree_t *tree, const zfs_btree_index_t *idx,
    zfs_btree_index_t *out_idx,
    void (*done_func)(zfs_btree_t *, zfs_btree_hdr_t *))
{
	if (idx->bti_node == NULL) {
		/* Empty tree: there is nothing after the null index. */
		ASSERT3S(tree->bt_height, ==, -1);
		return (NULL);
	}

	uint32_t offset = idx->bti_offset;
	if (!zfs_btree_is_core(idx->bti_node)) {
		/*
		 * When finding the next element of an element in a leaf,
		 * there are two cases. If the element isn't the last one in
		 * the leaf, in which case we just return the next element in
		 * the leaf. Otherwise, we need to traverse up our parents
		 * until we find one where our ancestor isn't the last child
		 * of its parent. Once we do, the next element is the
		 * separator after our ancestor in its parent.
		 */
		zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node;
		/* A "before" index already points at its successor slot. */
		uint32_t new_off = offset + (idx->bti_before ? 0 : 1);
		if (leaf->btl_hdr.bth_count > new_off) {
			out_idx->bti_node = &leaf->btl_hdr;
			out_idx->bti_offset = new_off;
			out_idx->bti_before = B_FALSE;
			return (leaf->btl_elems + (leaf->btl_hdr.bth_first +
			    new_off) * tree->bt_elem_size);
		}

		zfs_btree_hdr_t *prev = &leaf->btl_hdr;
		for (zfs_btree_core_t *node = leaf->btl_hdr.bth_parent;
		    node != NULL; node = node->btc_hdr.bth_parent) {
			zfs_btree_hdr_t *hdr = &node->btc_hdr;
			ASSERT(zfs_btree_is_core(hdr));
			uint32_t i = zfs_btree_find_parent_idx(tree, prev);
			/* prev is fully visited; let the caller reclaim it. */
			if (done_func != NULL)
				done_func(tree, prev);
			if (i == hdr->bth_count) {
				/* Last child of this parent; keep climbing. */
				prev = hdr;
				continue;
			}
			out_idx->bti_node = hdr;
			out_idx->bti_offset = i;
			out_idx->bti_before = B_FALSE;
			return (node->btc_elems + i * tree->bt_elem_size);
		}
		if (done_func != NULL)
			done_func(tree, prev);
		/*
		 * We've traversed all the way up and been at the end of the
		 * node every time, so this was the last element in the tree.
		 */
		return (NULL);
	}

	/* If we were before an element in a core node, return that element. */
	ASSERT(zfs_btree_is_core(idx->bti_node));
	zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node;
	if (idx->bti_before) {
		out_idx->bti_before = B_FALSE;
		return (node->btc_elems + offset * tree->bt_elem_size);
	}

	/*
	 * The next element from one in a core node is the first element in
	 * the subtree just to the right of the separator.
	 */
	zfs_btree_hdr_t *child = node->btc_children[offset + 1];
	return (zfs_btree_first_helper(tree, child, out_idx));
}

/*
 * Return the next valued node in the tree. The same address can be safely
 * passed for idx and out_idx.
 */
void *
zfs_btree_next(zfs_btree_t *tree, const zfs_btree_index_t *idx,
    zfs_btree_index_t *out_idx)
{
	return (zfs_btree_next_helper(tree, idx, out_idx, NULL));
}

/*
 * Return the previous valued node in the tree. The same value can be safely
 * passed for idx and out_idx.
 */
void *
zfs_btree_prev(zfs_btree_t *tree, const zfs_btree_index_t *idx,
    zfs_btree_index_t *out_idx)
{
	if (idx->bti_node == NULL) {
		/* Empty tree: there is nothing before the null index. */
		ASSERT3S(tree->bt_height, ==, -1);
		return (NULL);
	}

	uint32_t offset = idx->bti_offset;
	if (!zfs_btree_is_core(idx->bti_node)) {
		/*
		 * When finding the previous element of an element in a leaf,
		 * there are two cases. If the element isn't the first one in
		 * the leaf, in which case we just return the previous element
		 * in the leaf. Otherwise, we need to traverse up our parents
		 * until we find one where our previous ancestor isn't the
		 * first child. Once we do, the previous element is the
		 * separator after our previous ancestor.
		 */
		zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node;
		if (offset != 0) {
			out_idx->bti_node = &leaf->btl_hdr;
			out_idx->bti_offset = offset - 1;
			out_idx->bti_before = B_FALSE;
			return (leaf->btl_elems + (leaf->btl_hdr.bth_first +
			    offset - 1) * tree->bt_elem_size);
		}
		zfs_btree_hdr_t *prev = &leaf->btl_hdr;
		for (zfs_btree_core_t *node = leaf->btl_hdr.bth_parent;
		    node != NULL; node = node->btc_hdr.bth_parent) {
			zfs_btree_hdr_t *hdr = &node->btc_hdr;
			ASSERT(zfs_btree_is_core(hdr));
			uint32_t i = zfs_btree_find_parent_idx(tree, prev);
			if (i == 0) {
				/* First child of this parent; keep climbing. */
				prev = hdr;
				continue;
			}
			out_idx->bti_node = hdr;
			out_idx->bti_offset = i - 1;
			out_idx->bti_before = B_FALSE;
			return (node->btc_elems + (i - 1) * tree->bt_elem_size);
		}
		/*
		 * We've traversed all the way up and been at the start of the
		 * node every time, so this was the first node in the tree.
		 */
		return (NULL);
	}

	/*
	 * The previous element from one in a core node is the last element in
	 * the subtree just to the left of the separator.
	 */
	ASSERT(zfs_btree_is_core(idx->bti_node));
	zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node;
	zfs_btree_hdr_t *child = node->btc_children[offset];
	return (zfs_btree_last_helper(tree, child, out_idx));
}

/*
 * Get the value at the provided index in the tree.
 *
 * Note that the value returned from this function can be mutated, but only
 * if it will not change the ordering of the element with respect to any other
 * elements that could be in the tree.
 */
void *
zfs_btree_get(zfs_btree_t *tree, zfs_btree_index_t *idx)
{
	ASSERT(!idx->bti_before);
	size_t size = tree->bt_elem_size;
	if (!zfs_btree_is_core(idx->bti_node)) {
		zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node;
		return (leaf->btl_elems + (leaf->btl_hdr.bth_first +
		    idx->bti_offset) * size);
	}
	zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node;
	return (node->btc_elems + idx->bti_offset * size);
}

/* Add the given value to the tree. Must not already be in the tree. */
void
zfs_btree_add(zfs_btree_t *tree, const void *node)
{
	zfs_btree_index_t where = {0};
	VERIFY3P(zfs_btree_find(tree, node, &where), ==, NULL);
	zfs_btree_add_idx(tree, node, &where);
}

/* Helper function to free a tree node. */
static void
zfs_btree_node_destroy(zfs_btree_t *tree, zfs_btree_hdr_t *node)
{
	tree->bt_num_nodes--;
	if (!zfs_btree_is_core(node)) {
		zfs_btree_leaf_free(tree, node);
	} else {
		kmem_free(node, sizeof (zfs_btree_core_t) +
		    BTREE_CORE_ELEMS * tree->bt_elem_size);
	}
}

/*
 * Remove the rm_hdr and the separator to its left from the parent node. The
 * buffer that rm_hdr was stored in may already be freed, so its contents
 * cannot be accessed.
 */
static void
zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node,
    zfs_btree_hdr_t *rm_hdr)
{
	size_t size = tree->bt_elem_size;
	uint32_t min_count = (BTREE_CORE_ELEMS / 2) - 1;
	zfs_btree_hdr_t *hdr = &node->btc_hdr;
	/*
	 * If the node is the root node and rm_hdr is one of two children,
	 * promote the other child to the root.
	 */
	if (hdr->bth_parent == NULL && hdr->bth_count <= 1) {
		ASSERT3U(hdr->bth_count, ==, 1);
		ASSERT3P(tree->bt_root, ==, node);
		ASSERT3P(node->btc_children[1], ==, rm_hdr);
		tree->bt_root = node->btc_children[0];
		node->btc_children[0]->bth_parent = NULL;
		zfs_btree_node_destroy(tree, hdr);
		tree->bt_height--;
		return;
	}

	/* Locate rm_hdr's slot in this node's child array. */
	uint32_t idx;
	for (idx = 0; idx <= hdr->bth_count; idx++) {
		if (node->btc_children[idx] == rm_hdr)
			break;
	}
	ASSERT3U(idx, <=, hdr->bth_count);

	/*
	 * If the node is the root or it has more than the minimum number of
	 * children, just remove the child and separator, and return.
	 */
	if (hdr->bth_parent == NULL ||
	    hdr->bth_count > min_count) {
		/*
		 * Shift the element and children to the right of rm_hdr to
		 * the left by one spot.
		 */
		bt_shift_core_left(tree, node, idx, hdr->bth_count - idx,
		    BSS_PARALLELOGRAM);
		hdr->bth_count--;
		zfs_btree_poison_node_at(tree, hdr, hdr->bth_count, 1);
		return;
	}

	ASSERT3U(hdr->bth_count, ==, min_count);

	/*
	 * Now we try to take a node from a neighbor. We check left, then
	 * right. If the neighbor exists and has more than the minimum number
	 * of elements, we move the separator between us and them to our
	 * node, move their closest element (last for left, first for right)
	 * to the separator, and move their closest child to our node. Along
	 * the way we need to collapse the gap made by idx, and (for our right
	 * neighbor) the gap made by removing their first element and child.
	 *
	 * Note: this logic currently doesn't support taking from a neighbor
	 * that isn't a sibling (i.e. a neighbor with a different
	 * parent). This isn't critical functionality, but may be worth
	 * implementing in the future for completeness' sake.
	 */
	zfs_btree_core_t *parent = hdr->bth_parent;
	uint32_t parent_idx = zfs_btree_find_parent_idx(tree, hdr);

	zfs_btree_hdr_t *l_hdr = (parent_idx == 0 ? NULL :
	    parent->btc_children[parent_idx - 1]);
	if (l_hdr != NULL && l_hdr->bth_count > min_count) {
		/* We can take a node from the left neighbor. */
		ASSERT(zfs_btree_is_core(l_hdr));
		zfs_btree_core_t *neighbor = (zfs_btree_core_t *)l_hdr;

		/*
		 * Start by shifting the elements and children in the current
		 * node to the right by one spot.
		 */
		bt_shift_core_right(tree, node, 0, idx - 1, BSS_TRAPEZOID);

		/*
		 * Move the separator between node and neighbor to the first
		 * element slot in the current node.
		 */
		uint8_t *separator = parent->btc_elems + (parent_idx - 1) *
		    size;
		bcpy(separator, node->btc_elems, size);

		/* Move the last child of neighbor to our first child slot. */
		node->btc_children[0] =
		    neighbor->btc_children[l_hdr->bth_count];
		node->btc_children[0]->bth_parent = node;

		/* Move the last element of neighbor to the separator spot. */
		uint8_t *take_elem = neighbor->btc_elems +
		    (l_hdr->bth_count - 1) * size;
		bcpy(take_elem, separator, size);
		l_hdr->bth_count--;
		zfs_btree_poison_node_at(tree, l_hdr, l_hdr->bth_count, 1);
		return;
	}

	zfs_btree_hdr_t *r_hdr = (parent_idx == parent->btc_hdr.bth_count ?
	    NULL : parent->btc_children[parent_idx + 1]);
	if (r_hdr != NULL && r_hdr->bth_count > min_count) {
		/* We can take a node from the right neighbor. */
		ASSERT(zfs_btree_is_core(r_hdr));
		zfs_btree_core_t *neighbor = (zfs_btree_core_t *)r_hdr;

		/*
		 * Shift elements in node left by one spot to overwrite rm_hdr
		 * and the separator before it.
		 */
		bt_shift_core_left(tree, node, idx, hdr->bth_count - idx,
		    BSS_PARALLELOGRAM);

		/*
		 * Move the separator between node and neighbor to the last
		 * element spot in node.
		 */
		uint8_t *separator = parent->btc_elems + parent_idx * size;
		bcpy(separator, node->btc_elems + (hdr->bth_count - 1) * size,
		    size);

		/*
		 * Move the first child of neighbor to the last child spot in
		 * node.
		 */
		node->btc_children[hdr->bth_count] = neighbor->btc_children[0];
		node->btc_children[hdr->bth_count]->bth_parent = node;

		/* Move the first element of neighbor to the separator spot. */
		uint8_t *take_elem = neighbor->btc_elems;
		bcpy(take_elem, separator, size);
		r_hdr->bth_count--;

		/*
		 * Shift the elements and children of neighbor to cover the
		 * stolen elements.
		 */
		bt_shift_core_left(tree, neighbor, 1, r_hdr->bth_count,
		    BSS_TRAPEZOID);
		zfs_btree_poison_node_at(tree, r_hdr, r_hdr->bth_count, 1);
		return;
	}

	/*
	 * In this case, neither of our neighbors can spare an element, so we
	 * need to merge with one of them. We prefer the left one,
	 * arbitrarily. Move the separator into the leftmost merging node
	 * (which may be us or the left neighbor), and then move the right
	 * merging node's elements. Once that's done, we go back and delete
	 * the element we're removing. Finally, go into the parent and delete
	 * the right merging node and the separator. This may cause further
	 * merging.
	 */
	zfs_btree_hdr_t *new_rm_hdr, *keep_hdr;
	uint32_t new_idx = idx;
	if (l_hdr != NULL) {
		keep_hdr = l_hdr;
		new_rm_hdr = hdr;
		/* rm_hdr's slot shifts past everything we keep from the left. */
		new_idx += keep_hdr->bth_count + 1;
	} else {
		ASSERT3P(r_hdr, !=, NULL);
		keep_hdr = hdr;
		new_rm_hdr = r_hdr;
		parent_idx++;
	}

	ASSERT(zfs_btree_is_core(keep_hdr));
	ASSERT(zfs_btree_is_core(new_rm_hdr));

	zfs_btree_core_t *keep = (zfs_btree_core_t *)keep_hdr;
	zfs_btree_core_t *rm = (zfs_btree_core_t *)new_rm_hdr;

	if (zfs_btree_verify_intensity >= 5) {
		for (uint32_t i = 0; i < new_rm_hdr->bth_count + 1; i++) {
			zfs_btree_verify_poison_at(tree, keep_hdr,
			    keep_hdr->bth_count + i);
		}
	}

	/* Move the separator into the left node. */
	uint8_t *e_out = keep->btc_elems + keep_hdr->bth_count * size;
	uint8_t *separator = parent->btc_elems + (parent_idx - 1) *
	    size;
	bcpy(separator, e_out, size);
	keep_hdr->bth_count++;

	/* Move all our elements and children into the left node. */
	bt_transfer_core(tree, rm, 0, new_rm_hdr->bth_count, keep,
	    keep_hdr->bth_count, BSS_TRAPEZOID);

	uint32_t old_count = keep_hdr->bth_count;

	/* Update bookkeeping */
	keep_hdr->bth_count += new_rm_hdr->bth_count;
	ASSERT3U(keep_hdr->bth_count, ==, (min_count * 2) + 1);

	/*
	 * Shift the element and children to the right of rm_hdr to
	 * the left by one spot.
	 */
	ASSERT3P(keep->btc_children[new_idx], ==, rm_hdr);
	bt_shift_core_left(tree, keep, new_idx, keep_hdr->bth_count - new_idx,
	    BSS_PARALLELOGRAM);
	keep_hdr->bth_count--;

	/* Reparent all our children to point to the left node. */
	zfs_btree_hdr_t **new_start = keep->btc_children +
	    old_count - 1;
	for (uint32_t i = 0; i < new_rm_hdr->bth_count + 1; i++)
		new_start[i]->bth_parent = keep;
	for (uint32_t i = 0; i <= keep_hdr->bth_count; i++) {
		ASSERT3P(keep->btc_children[i]->bth_parent, ==, keep);
		ASSERT3P(keep->btc_children[i], !=, rm_hdr);
	}
	zfs_btree_poison_node_at(tree, keep_hdr, keep_hdr->bth_count, 1);

	/* Recursively remove the now-empty right node from the parent. */
	new_rm_hdr->bth_count = 0;
	zfs_btree_remove_from_node(tree, parent, new_rm_hdr);
	zfs_btree_node_destroy(tree, new_rm_hdr);
}

/* Remove the element at the specific location. */
void
zfs_btree_remove_idx(zfs_btree_t *tree, zfs_btree_index_t *where)
{
	size_t size = tree->bt_elem_size;
	zfs_btree_hdr_t *hdr = where->bti_node;
	uint32_t idx = where->bti_offset;

	ASSERT(!where->bti_before);
	if (tree->bt_bulk != NULL) {
		/*
		 * Leave bulk insert mode. Note that our index would be
		 * invalid after we correct the tree, so we copy the value
		 * we're planning to remove and find it again after
		 * bulk_finish.
		 */
		uint8_t *value = zfs_btree_get(tree, where);
		uint8_t *tmp = kmem_alloc(size, KM_SLEEP);
		bcpy(value, tmp, size);
		zfs_btree_bulk_finish(tree);
		VERIFY3P(zfs_btree_find(tree, tmp, where), !=, NULL);
		kmem_free(tmp, size);
		hdr = where->bti_node;
		idx = where->bti_offset;
	}

	tree->bt_num_elems--;
	/*
	 * If the element happens to be in a core node, we move a leaf node's
	 * element into its place and then remove the leaf node element. This
	 * makes the rebalance logic not need to be recursive both upwards and
	 * downwards.
	 */
	if (zfs_btree_is_core(hdr)) {
		zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
		zfs_btree_hdr_t *left_subtree = node->btc_children[idx];
		void *new_value = zfs_btree_last_helper(tree, left_subtree,
		    where);
		ASSERT3P(new_value, !=, NULL);

		/* The in-order predecessor replaces the removed separator. */
		bcpy(new_value, node->btc_elems + idx * size, size);

		hdr = where->bti_node;
		idx = where->bti_offset;
		ASSERT(!where->bti_before);
	}

	/*
	 * First, we'll update the leaf's metadata. Then, we shift any
	 * elements after the idx to the left. After that, we rebalance if
	 * needed.
	 */
	ASSERT(!zfs_btree_is_core(hdr));
	zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
	ASSERT3U(hdr->bth_count, >, 0);

	uint32_t min_count = (tree->bt_leaf_cap / 2) - 1;

	/*
	 * If we're over the minimum size or this is the root, just overwrite
	 * the value and return.
	 */
	if (hdr->bth_count > min_count || hdr->bth_parent == NULL) {
		bt_shrink_leaf(tree, leaf, idx, 1);
		if (hdr->bth_parent == NULL) {
			ASSERT0(tree->bt_height);
			if (hdr->bth_count == 0) {
				/* The tree is now empty; drop the root. */
				tree->bt_root = NULL;
				tree->bt_height--;
				zfs_btree_node_destroy(tree, &leaf->btl_hdr);
			}
		}
		zfs_btree_verify(tree);
		return;
	}
	ASSERT3U(hdr->bth_count, ==, min_count);

	/*
	 * Now we try to take a node from a sibling. We check left, then
	 * right. If they exist and have more than the minimum number of
	 * elements, we move the separator between us and them to our node
	 * and move their closest element (last for left, first for right) to
	 * the separator. Along the way we need to collapse the gap made by
	 * idx, and (for our right neighbor) the gap made by removing their
	 * first element.
	 *
	 * Note: this logic currently doesn't support taking from a neighbor
	 * that isn't a sibling. This isn't critical functionality, but may be
	 * worth implementing in the future for completeness' sake.
	 */
	zfs_btree_core_t *parent = hdr->bth_parent;
	uint32_t parent_idx = zfs_btree_find_parent_idx(tree, hdr);

	zfs_btree_hdr_t *l_hdr = (parent_idx == 0 ? NULL :
	    parent->btc_children[parent_idx - 1]);
	if (l_hdr != NULL && l_hdr->bth_count > min_count) {
		/* We can take a node from the left neighbor. */
		ASSERT(!zfs_btree_is_core(l_hdr));
		zfs_btree_leaf_t *neighbor = (zfs_btree_leaf_t *)l_hdr;

		/*
		 * Move our elements back by one spot to make room for the
		 * stolen element and overwrite the element being removed.
		 */
		bt_shift_leaf(tree, leaf, 0, idx, 1, BSD_RIGHT);

		/* Move the separator to our first spot. */
		uint8_t *separator = parent->btc_elems + (parent_idx - 1) *
		    size;
		bcpy(separator, leaf->btl_elems + hdr->bth_first * size, size);

		/* Move our neighbor's last element to the separator. */
		uint8_t *take_elem = neighbor->btl_elems +
		    (l_hdr->bth_first + l_hdr->bth_count - 1) * size;
		bcpy(take_elem, separator, size);

		/* Delete our neighbor's last element. */
		bt_shrink_leaf(tree, neighbor, l_hdr->bth_count - 1, 1);
		zfs_btree_verify(tree);
		return;
	}

	zfs_btree_hdr_t *r_hdr = (parent_idx == parent->btc_hdr.bth_count ?
	    NULL : parent->btc_children[parent_idx + 1]);
	if (r_hdr != NULL && r_hdr->bth_count > min_count) {
		/* We can take a node from the right neighbor. */
		ASSERT(!zfs_btree_is_core(r_hdr));
		zfs_btree_leaf_t *neighbor = (zfs_btree_leaf_t *)r_hdr;

		/*
		 * Move our elements after the element being removed forwards
		 * by one spot to make room for the stolen element and
		 * overwrite the element being removed.
		 */
		bt_shift_leaf(tree, leaf, idx + 1, hdr->bth_count - idx - 1,
		    1, BSD_LEFT);

		/* Move the separator between us to our last spot. */
		uint8_t *separator = parent->btc_elems + parent_idx * size;
		bcpy(separator, leaf->btl_elems + (hdr->bth_first +
		    hdr->bth_count - 1) * size, size);

		/* Move our neighbor's first element to the separator. */
		uint8_t *take_elem = neighbor->btl_elems +
		    r_hdr->bth_first * size;
		bcpy(take_elem, separator, size);

		/* Delete our neighbor's first element. */
		bt_shrink_leaf(tree, neighbor, 0, 1);
		zfs_btree_verify(tree);
		return;
	}

	/*
	 * In this case, neither of our neighbors can spare an element, so we
	 * need to merge with one of them. We prefer the left one, arbitrarily.
	 * After remove we move the separator into the leftmost merging node
	 * (which may be us or the left neighbor), and then move the right
	 * merging node's elements. Once that's done, we go back and delete
	 * the element we're removing. Finally, go into the parent and delete
	 * the right merging node and the separator. This may cause further
	 * merging.
	 */
	zfs_btree_hdr_t *rm_hdr, *k_hdr;
	if (l_hdr != NULL) {
		k_hdr = l_hdr;
		rm_hdr = hdr;
	} else {
		ASSERT3P(r_hdr, !=, NULL);
		k_hdr = hdr;
		rm_hdr = r_hdr;
		parent_idx++;
	}
	ASSERT(!zfs_btree_is_core(k_hdr));
	ASSERT(!zfs_btree_is_core(rm_hdr));
	ASSERT3U(k_hdr->bth_count, ==, min_count);
	ASSERT3U(rm_hdr->bth_count, ==, min_count);
	zfs_btree_leaf_t *keep = (zfs_btree_leaf_t *)k_hdr;
	zfs_btree_leaf_t *rm = (zfs_btree_leaf_t *)rm_hdr;

	if (zfs_btree_verify_intensity >= 5) {
		for (uint32_t i = 0; i < rm_hdr->bth_count + 1; i++) {
			zfs_btree_verify_poison_at(tree, k_hdr,
			    k_hdr->bth_count + i);
		}
	}

	/*
	 * Remove the value from the node. It will go below the minimum,
	 * but we'll fix it in no time.
	 */
	bt_shrink_leaf(tree, leaf, idx, 1);

	/* Prepare space for elements to be moved from the right. */
	uint32_t k_count = k_hdr->bth_count;
	bt_grow_leaf(tree, keep, k_count, 1 + rm_hdr->bth_count);
	ASSERT3U(k_hdr->bth_count, ==, min_count * 2);

	/* Move the separator into the first open spot. */
	uint8_t *out = keep->btl_elems + (k_hdr->bth_first + k_count) * size;
	uint8_t *separator = parent->btc_elems + (parent_idx - 1) * size;
	bcpy(separator, out, size);

	/* Move our elements to the left neighbor. */
	bt_transfer_leaf(tree, rm, 0, rm_hdr->bth_count, keep, k_count + 1);

	/* Remove the emptied node from the parent. */
	zfs_btree_remove_from_node(tree, parent, rm_hdr);
	zfs_btree_node_destroy(tree, rm_hdr);
	zfs_btree_verify(tree);
}

/* Remove the given value from the tree. */
void
zfs_btree_remove(zfs_btree_t *tree, const void *value)
{
	zfs_btree_index_t where = {0};
	VERIFY3P(zfs_btree_find(tree, value, &where), !=, NULL);
	zfs_btree_remove_idx(tree, &where);
}

/*
 * Return the number of elements in the tree. (The "numnodes" name is
 * historical, mirroring avl_numnodes(); it returns bt_num_elems, not
 * bt_num_nodes.)
 */
ulong_t
zfs_btree_numnodes(zfs_btree_t *tree)
{
	return (tree->bt_num_elems);
}

/*
 * This function is used to visit all the elements in the tree before
 * destroying the tree. This allows the calling code to perform any cleanup it
 * needs to do. This is more efficient than just removing the first element
 * over and over, because it removes all rebalancing. Once the destroy_nodes()
 * function has been called, no other btree operations are valid until it
 * returns NULL, at which point the only valid operation is zfs_btree_destroy().
 *
 * example:
 *
 *      zfs_btree_index_t *cookie = NULL;
 *      my_data_t *node;
 *
 *      while ((node = zfs_btree_destroy_nodes(tree, &cookie)) != NULL)
 *              free(node->ptr);
 *      zfs_btree_destroy(tree);
 *
 */
void *
zfs_btree_destroy_nodes(zfs_btree_t *tree, zfs_btree_index_t **cookie)
{
	if (*cookie == NULL) {
		/* First call: an empty tree yields NULL immediately. */
		if (tree->bt_height == -1)
			return (NULL);
		*cookie = kmem_alloc(sizeof (**cookie), KM_SLEEP);
		return (zfs_btree_first(tree, *cookie));
	}

	/*
	 * Advance the iterator, destroying each node as we leave it for
	 * the last time.
	 */
	void *rval = zfs_btree_next_helper(tree, *cookie, *cookie,
	    zfs_btree_node_destroy);
	if (rval == NULL) {
		/* Traversal finished: reset the tree to the empty state. */
		tree->bt_root = NULL;
		tree->bt_height = -1;
		tree->bt_num_elems = 0;
		kmem_free(*cookie, sizeof (**cookie));
		tree->bt_bulk = NULL;
	}
	return (rval);
}

/*
 * Recursively destroy the subtree rooted at hdr, children first so no
 * node is freed before its descendants.
 */
static void
zfs_btree_clear_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
{
	if (zfs_btree_is_core(hdr)) {
		zfs_btree_core_t *btc = (zfs_btree_core_t *)hdr;
		for (uint32_t i = 0; i <= hdr->bth_count; i++)
			zfs_btree_clear_helper(tree, btc->btc_children[i]);
	}

	zfs_btree_node_destroy(tree, hdr);
}

/* Free all nodes and reset the tree to the empty state. */
void
zfs_btree_clear(zfs_btree_t *tree)
{
	if (tree->bt_root == NULL) {
		ASSERT0(tree->bt_num_elems);
		return;
	}

	zfs_btree_clear_helper(tree, tree->bt_root);
	tree->bt_num_elems = 0;
	tree->bt_root = NULL;
	tree->bt_num_nodes = 0;
	tree->bt_height = -1;
	tree->bt_bulk = NULL;
}

/*
 * Destroy the tree. The tree must already be empty (emptied via
 * zfs_btree_clear() or zfs_btree_destroy_nodes()).
 */
void
zfs_btree_destroy(zfs_btree_t *tree)
{
	ASSERT0(tree->bt_num_elems);
	ASSERT3P(tree->bt_root, ==, NULL);
}

/* Verify that every child of this node has the correct parent pointer.
 */
static void
zfs_btree_verify_pointers_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
{
	/* Leaves have no children, so there is nothing to check. */
	if (!zfs_btree_is_core(hdr))
		return;

	zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
	for (uint32_t i = 0; i <= hdr->bth_count; i++) {
		VERIFY3P(node->btc_children[i]->bth_parent, ==, hdr);
		zfs_btree_verify_pointers_helper(tree, node->btc_children[i]);
	}
}

/* Verify that every node has the correct parent pointer. */
static void
zfs_btree_verify_pointers(zfs_btree_t *tree)
{
	if (tree->bt_height == -1) {
		VERIFY3P(tree->bt_root, ==, NULL);
		return;
	}
	VERIFY3P(tree->bt_root->bth_parent, ==, NULL);
	zfs_btree_verify_pointers_helper(tree, tree->bt_root);
}

/*
 * Verify that all the current node and its children satisfy the count
 * invariants, and return the total count in the subtree rooted in this node.
 */
static uint64_t
zfs_btree_verify_counts_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
{
	if (!zfs_btree_is_core(hdr)) {
		/*
		 * Minimum-fill check for leaves; the root and the bulk-load
		 * leaf are exempt. NOTE(review): as written, this check is
		 * skipped entirely when bt_bulk is NULL — confirm intent.
		 */
		if (tree->bt_root != hdr && tree->bt_bulk &&
		    hdr != &tree->bt_bulk->btl_hdr) {
			VERIFY3U(hdr->bth_count, >=, tree->bt_leaf_cap / 2 - 1);
		}

		return (hdr->bth_count);
	} else {

		zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
		uint64_t ret = hdr->bth_count;
		/* Non-root core nodes must be at least half full. */
		if (tree->bt_root != hdr && tree->bt_bulk == NULL)
			VERIFY3P(hdr->bth_count, >=, BTREE_CORE_ELEMS / 2 - 1);
		for (uint32_t i = 0; i <= hdr->bth_count; i++) {
			ret += zfs_btree_verify_counts_helper(tree,
			    node->btc_children[i]);
		}

		return (ret);
	}
}

/*
 * Verify that all nodes satisfy the invariants and that the total number of
 * elements is correct.
 */
static void
zfs_btree_verify_counts(zfs_btree_t *tree)
{
	EQUIV(tree->bt_num_elems == 0, tree->bt_height == -1);
	if (tree->bt_height == -1) {
		return;
	}
	VERIFY3P(zfs_btree_verify_counts_helper(tree, tree->bt_root), ==,
	    tree->bt_num_elems);
}

/*
 * Check that the subtree rooted at this node has a uniform height. Returns
 * the number of nodes under this node, to help verify bt_num_nodes.
 */
static uint64_t
zfs_btree_verify_height_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr,
    int32_t height)
{
	if (!zfs_btree_is_core(hdr)) {
		/* Every leaf must sit at height 0. */
		VERIFY0(height);
		return (1);
	}

	zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
	uint64_t ret = 1;
	for (uint32_t i = 0; i <= hdr->bth_count; i++) {
		ret += zfs_btree_verify_height_helper(tree,
		    node->btc_children[i], height - 1);
	}
	return (ret);
}

/*
 * Check that the tree rooted at this node has a uniform height, and that the
 * bt_height in the tree is correct.
 */
static void
zfs_btree_verify_height(zfs_btree_t *tree)
{
	EQUIV(tree->bt_height == -1, tree->bt_root == NULL);
	if (tree->bt_height == -1) {
		return;
	}

	/* The helper also tallies nodes, which must match bt_num_nodes. */
	VERIFY3U(zfs_btree_verify_height_helper(tree, tree->bt_root,
	    tree->bt_height), ==, tree->bt_num_nodes);
}

/*
 * Check that the elements in this node are sorted, and that if this is a core
 * node, the separators are properly between the subtrees they separate and
 * that the children also satisfy this requirement.
 */
static void
zfs_btree_verify_order_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
{
	size_t size = tree->bt_elem_size;
	if (!zfs_btree_is_core(hdr)) {
		/* Leaf: adjacent elements must be strictly increasing. */
		zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
		for (uint32_t i = 1; i < hdr->bth_count; i++) {
			VERIFY3S(tree->bt_compar(leaf->btl_elems +
			    (hdr->bth_first + i - 1) * size,
			    leaf->btl_elems +
			    (hdr->bth_first + i) * size), ==, -1);
		}
		return;
	}

	zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
	/* Core: separators themselves must be strictly increasing. */
	for (uint32_t i = 1; i < hdr->bth_count; i++) {
		VERIFY3S(tree->bt_compar(node->btc_elems + (i - 1) * size,
		    node->btc_elems + i * size), ==, -1);
	}
	for (uint32_t i = 0; i < hdr->bth_count; i++) {
		uint8_t *left_child_last = NULL;
		zfs_btree_hdr_t *left_child_hdr = node->btc_children[i];
		if (zfs_btree_is_core(left_child_hdr)) {
			zfs_btree_core_t *left_child =
			    (zfs_btree_core_t *)left_child_hdr;
			left_child_last = left_child->btc_elems +
			    (left_child_hdr->bth_count - 1) * size;
		} else {
			zfs_btree_leaf_t *left_child =
			    (zfs_btree_leaf_t *)left_child_hdr;
			left_child_last = left_child->btl_elems +
			    (left_child_hdr->bth_first +
			    left_child_hdr->bth_count - 1) * size;
		}
		/* Separator must compare greater than the left child's max. */
		int comp = tree->bt_compar(node->btc_elems + i * size,
		    left_child_last);
		if (comp <= 0) {
			panic("btree: compar returned %d (expected 1) at "
			    "%px %d: compar(%px, %px)", comp, node, i,
			    node->btc_elems + i * size, left_child_last);
		}

		uint8_t *right_child_first = NULL;
		zfs_btree_hdr_t *right_child_hdr = node->btc_children[i + 1];
		if (zfs_btree_is_core(right_child_hdr)) {
			zfs_btree_core_t *right_child =
			    (zfs_btree_core_t *)right_child_hdr;
			right_child_first = right_child->btc_elems;
		} else {
			zfs_btree_leaf_t *right_child =
			    (zfs_btree_leaf_t *)right_child_hdr;
			right_child_first = right_child->btl_elems +
			    right_child_hdr->bth_first * size;
		}
		/* Separator must compare less than the right child's min. */
		comp = tree->bt_compar(node->btc_elems + i * size,
		    right_child_first);
		if (comp >= 0) {
			panic("btree: compar returned %d (expected -1) at "
			    "%px %d: compar(%px, %px)", comp, node, i,
			    node->btc_elems + i * size, right_child_first);
		}
	}
	/* Recurse into every child subtree. */
	for (uint32_t i = 0; i <= hdr->bth_count; i++)
		zfs_btree_verify_order_helper(tree, node->btc_children[i]);
}

/* Check that all elements in the tree are in sorted order. */
static void
zfs_btree_verify_order(zfs_btree_t *tree)
{
	EQUIV(tree->bt_height == -1, tree->bt_root == NULL);
	if (tree->bt_height == -1) {
		return;
	}

	zfs_btree_verify_order_helper(tree, tree->bt_root);
}

#ifdef ZFS_DEBUG
/* Check that all unused memory is poisoned correctly. */
static void
zfs_btree_verify_poison_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
{
	size_t size = tree->bt_elem_size;
	if (!zfs_btree_is_core(hdr)) {
		zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
		/* Bytes before bth_first must carry the 0x0f pattern... */
		for (size_t i = 0; i < hdr->bth_first * size; i++)
			VERIFY3U(leaf->btl_elems[i], ==, 0x0f);
		size_t esize = tree->bt_leaf_size -
		    offsetof(zfs_btree_leaf_t, btl_elems);
		/* ... as must the bytes past the last used element. */
		for (size_t i = (hdr->bth_first + hdr->bth_count) * size;
		    i < esize; i++)
			VERIFY3U(leaf->btl_elems[i], ==, 0x0f);
	} else {
		zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
		/* Unused element bytes must carry the 0x0f pattern. */
		for (size_t i = hdr->bth_count * size;
		    i < BTREE_CORE_ELEMS * size; i++)
			VERIFY3U(node->btc_elems[i], ==, 0x0f);

		/* Unused child slots must hold the poison pointer. */
		for (uint32_t i = hdr->bth_count + 1; i <= BTREE_CORE_ELEMS;
		    i++) {
			VERIFY3P(node->btc_children[i], ==,
			    (zfs_btree_hdr_t *)BTREE_POISON);
		}

		for (uint32_t i = 0; i <= hdr->bth_count; i++) {
			zfs_btree_verify_poison_helper(tree,
			    node->btc_children[i]);
		}
	}
}
#endif

/* Check that unused memory in the tree is still
poisoned. */
static void
zfs_btree_verify_poison(zfs_btree_t *tree)
{
#ifdef ZFS_DEBUG
	if (tree->bt_height == -1)
		return;
	zfs_btree_verify_poison_helper(tree, tree->bt_root);
#endif
}

/*
 * Run the verification passes selected by zfs_btree_verify_intensity.
 * Each level cumulatively adds the next, more expensive, check; see the
 * comment at the top of the file for what each level verifies.
 */
void
zfs_btree_verify(zfs_btree_t *tree)
{
	if (zfs_btree_verify_intensity == 0)
		return;
	zfs_btree_verify_height(tree);
	if (zfs_btree_verify_intensity == 1)
		return;
	zfs_btree_verify_pointers(tree);
	if (zfs_btree_verify_intensity == 2)
		return;
	zfs_btree_verify_counts(tree);
	if (zfs_btree_verify_intensity == 3)
		return;
	zfs_btree_verify_order(tree);

	if (zfs_btree_verify_intensity == 4)
		return;
	zfs_btree_verify_poison(tree);
}

ZFS_MODULE_PARAM(zfs, zfs_, btree_verify_intensity, UINT, ZMOD_RW,
	"Enable btree verification. Levels above 4 require ZFS be built "
	"with debugging");