1 /* 2 * CDDL HEADER START 3 * 4 * This file and its contents are supplied under the terms of the 5 * Common Development and Distribution License ("CDDL"), version 1.0. 6 * You may only use this file in accordance with the terms of version 7 * 1.0 of the CDDL. 8 * 9 * A full copy of the text of the CDDL should have accompanied this 10 * source. A copy of the CDDL is also available via the Internet at 11 * http://www.illumos.org/license/CDDL. 12 * 13 * CDDL HEADER END 14 */ 15 /* 16 * Copyright (c) 2019 by Delphix. All rights reserved. 17 */ 18 19 #include <sys/btree.h> 20 #include <sys/bitops.h> 21 #include <sys/zfs_context.h> 22 23 kmem_cache_t *zfs_btree_leaf_cache; 24 25 /* 26 * Control the extent of the verification that occurs when zfs_btree_verify is 27 * called. Primarily used for debugging when extending the btree logic and 28 * functionality. As the intensity is increased, new verification steps are 29 * added. These steps are cumulative; intensity = 3 includes the intensity = 1 30 * and intensity = 2 steps as well. 31 * 32 * Intensity 1: Verify that the tree's height is consistent throughout. 33 * Intensity 2: Verify that a core node's children's parent pointers point 34 * to the core node. 35 * Intensity 3: Verify that the total number of elements in the tree matches the 36 * sum of the number of elements in each node. Also verifies that each node's 37 * count obeys the invariants (less than or equal to maximum value, greater than 38 * or equal to half the maximum minus one). 39 * Intensity 4: Verify that each element compares less than the element 40 * immediately after it and greater than the one immediately before it using the 41 * comparator function. For core nodes, also checks that each element is greater 42 * than the last element in the first of the two nodes it separates, and less 43 * than the first element in the second of the two nodes. 44 * Intensity 5: Verifies, if ZFS_DEBUG is defined, that all unused memory inside 45 * of each node is poisoned appropriately. Note that poisoning always occurs if 46 * ZFS_DEBUG is set, so it is safe to set the intensity to 5 during normal 47 * operation. 48 * 49 * Intensity 4 and 5 are particularly expensive to perform; the previous levels 50 * are a few memory operations per node, while these levels require multiple 51 * operations per element. In addition, when creating large btrees, these 52 * operations are called at every step, resulting in extremely slow operation 53 * (while the asymptotic complexity of the other steps is the same, the 54 * importance of the constant factors cannot be denied). 55 */ 56 uint_t zfs_btree_verify_intensity = 0; 57 58 /* 59 * Convenience functions to silence warnings from memcpy/memmove's 60 * return values and change argument order to src, dest. 61 */ 62 static void 63 bcpy(const void *src, void *dest, size_t size) 64 { 65 (void) memcpy(dest, src, size); 66 } 67 68 static void 69 bmov(const void *src, void *dest, size_t size) 70 { 71 (void) memmove(dest, src, size); 72 } 73 74 static boolean_t 75 zfs_btree_is_core(struct zfs_btree_hdr *hdr) 76 { 77 return (hdr->bth_first == -1); 78 } 79 80 #ifdef _ILP32 81 #define BTREE_POISON 0xabadb10c 82 #else 83 #define BTREE_POISON 0xabadb10cdeadbeef 84 #endif 85 86 static void 87 zfs_btree_poison_node(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) 88 { 89 #ifdef ZFS_DEBUG 90 size_t size = tree->bt_elem_size; 91 if (zfs_btree_is_core(hdr)) { 92 zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; 93 for (uint32_t i = hdr->bth_count + 1; i <= BTREE_CORE_ELEMS; 94 i++) { 95 node->btc_children[i] = 96 (zfs_btree_hdr_t *)BTREE_POISON; 97 } 98 (void) memset(node->btc_elems + hdr->bth_count * size, 0x0f, 99 (BTREE_CORE_ELEMS - hdr->bth_count) * size); 100 } else { 101 zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; 102 (void) memset(leaf->btl_elems, 0x0f, hdr->bth_first * size); 103 (void) memset(leaf->btl_elems + 104 (hdr->bth_first + hdr->bth_count) * size, 0x0f, 105 tree->bt_leaf_size - offsetof(zfs_btree_leaf_t, btl_elems) - 106 (hdr->bth_first + hdr->bth_count) * size); 107 } 108 #endif 109 } 110 111 static inline void 112 zfs_btree_poison_node_at(zfs_btree_t *tree, zfs_btree_hdr_t *hdr, 113 uint32_t idx, uint32_t count) 114 { 115 #ifdef ZFS_DEBUG 116 size_t size = tree->bt_elem_size; 117 if (zfs_btree_is_core(hdr)) { 118 ASSERT3U(idx, >=, hdr->bth_count); 119 ASSERT3U(idx, <=, BTREE_CORE_ELEMS); 120 ASSERT3U(idx + count, <=, BTREE_CORE_ELEMS); 121 zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; 122 for (uint32_t i = 1; i <= count; i++) { 123 node->btc_children[idx + i] = 124 (zfs_btree_hdr_t *)BTREE_POISON; 125 } 126 (void) memset(node->btc_elems + idx * size, 0x0f, count * size); 127 } else { 128 ASSERT3U(idx, <=, tree->bt_leaf_cap); 129 ASSERT3U(idx + count, <=, tree->bt_leaf_cap); 130 zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; 131 (void) memset(leaf->btl_elems + 132 (hdr->bth_first + idx) * size, 0x0f, count * size); 133 } 134 #endif 135 } 136 137 static inline void 138 zfs_btree_verify_poison_at(zfs_btree_t *tree, zfs_btree_hdr_t *hdr, 139 uint32_t idx) 140 { 141 #ifdef ZFS_DEBUG 142 size_t size = tree->bt_elem_size; 143 if (zfs_btree_is_core(hdr)) { 144 ASSERT3U(idx, <, BTREE_CORE_ELEMS); 145 zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; 146 zfs_btree_hdr_t *cval = (zfs_btree_hdr_t *)BTREE_POISON; 147 VERIFY3P(node->btc_children[idx + 1], ==, cval); 148 for (size_t i = 0; i < size; i++) 149 VERIFY3U(node->btc_elems[idx * size + i], ==, 0x0f); 150 } else { 151 ASSERT3U(idx, <, tree->bt_leaf_cap); 152 zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; 153 if (idx >= tree->bt_leaf_cap - hdr->bth_first) 154 return; 155 for (size_t i = 0; i < size; i++) { 156 VERIFY3U(leaf->btl_elems[(hdr->bth_first + idx) 157 * size + i], ==, 0x0f); 158 } 159 } 160 #endif 161 } 162 163 void 164 zfs_btree_init(void) 165 { 166 zfs_btree_leaf_cache = kmem_cache_create("zfs_btree_leaf_cache", 167 BTREE_LEAF_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0); 168 } 169 170 void 171 zfs_btree_fini(void) 172 { 173 kmem_cache_destroy(zfs_btree_leaf_cache); 174 } 175 176 static void * 177 zfs_btree_leaf_alloc(zfs_btree_t *tree) 178 { 179 if (tree->bt_leaf_size == BTREE_LEAF_SIZE) 180 return (kmem_cache_alloc(zfs_btree_leaf_cache, KM_SLEEP)); 181 else 182 return (kmem_alloc(tree->bt_leaf_size, KM_SLEEP)); 183 } 184 185 static void 186 zfs_btree_leaf_free(zfs_btree_t *tree, void *ptr) 187 { 188 if (tree->bt_leaf_size == BTREE_LEAF_SIZE) 189 return (kmem_cache_free(zfs_btree_leaf_cache, ptr)); 190 else 191 return (kmem_free(ptr, tree->bt_leaf_size)); 192 } 193 194 void 195 zfs_btree_create(zfs_btree_t *tree, int (*compar) (const void *, const void *), 196 bt_find_in_buf_f bt_find_in_buf, size_t size) 197 { 198 zfs_btree_create_custom(tree, compar, bt_find_in_buf, size, 199 BTREE_LEAF_SIZE); 200 } 201 202 static void * 203 zfs_btree_find_in_buf(zfs_btree_t *tree, uint8_t *buf, uint32_t nelems, 204 const void *value, zfs_btree_index_t *where); 205 206 void 207 zfs_btree_create_custom(zfs_btree_t *tree, 208 int (*compar) (const void *, const void *), 209 bt_find_in_buf_f bt_find_in_buf, 210 size_t size, size_t lsize) 211 { 212 size_t esize = lsize - offsetof(zfs_btree_leaf_t, btl_elems); 213 214 ASSERT3U(size, <=, esize / 2); 215 memset(tree, 0, sizeof (*tree)); 216 tree->bt_compar = compar; 217 tree->bt_find_in_buf = (bt_find_in_buf == NULL) ? 218 zfs_btree_find_in_buf : bt_find_in_buf; 219 tree->bt_elem_size = size; 220 tree->bt_leaf_size = lsize; 221 tree->bt_leaf_cap = P2ALIGN(esize / size, 2); 222 tree->bt_height = -1; 223 tree->bt_bulk = NULL; 224 } 225 226 /* 227 * Find value in the array of elements provided. Uses a simple binary search. 228 */ 229 static void * 230 zfs_btree_find_in_buf(zfs_btree_t *tree, uint8_t *buf, uint32_t nelems, 231 const void *value, zfs_btree_index_t *where) 232 { 233 uint32_t max = nelems; 234 uint32_t min = 0; 235 while (max > min) { 236 uint32_t idx = (min + max) / 2; 237 uint8_t *cur = buf + idx * tree->bt_elem_size; 238 int comp = tree->bt_compar(cur, value); 239 if (comp < 0) { 240 min = idx + 1; 241 } else if (comp > 0) { 242 max = idx; 243 } else { 244 where->bti_offset = idx; 245 where->bti_before = B_FALSE; 246 return (cur); 247 } 248 } 249 250 where->bti_offset = max; 251 where->bti_before = B_TRUE; 252 return (NULL); 253 } 254 255 /* 256 * Find the given value in the tree. where may be passed as null to use as a 257 * membership test or if the btree is being used as a map. 258 */ 259 void * 260 zfs_btree_find(zfs_btree_t *tree, const void *value, zfs_btree_index_t *where) 261 { 262 if (tree->bt_height == -1) { 263 if (where != NULL) { 264 where->bti_node = NULL; 265 where->bti_offset = 0; 266 } 267 ASSERT0(tree->bt_num_elems); 268 return (NULL); 269 } 270 271 /* 272 * If we're in bulk-insert mode, we check the last spot in the tree 273 * and the last leaf in the tree before doing the normal search, 274 * because for most workloads the vast majority of finds in 275 * bulk-insert mode are to insert new elements. 276 */ 277 zfs_btree_index_t idx; 278 size_t size = tree->bt_elem_size; 279 if (tree->bt_bulk != NULL) { 280 zfs_btree_leaf_t *last_leaf = tree->bt_bulk; 281 int comp = tree->bt_compar(last_leaf->btl_elems + 282 (last_leaf->btl_hdr.bth_first + 283 last_leaf->btl_hdr.bth_count - 1) * size, value); 284 if (comp < 0) { 285 /* 286 * If what they're looking for is after the last 287 * element, it's not in the tree. 288 */ 289 if (where != NULL) { 290 where->bti_node = (zfs_btree_hdr_t *)last_leaf; 291 where->bti_offset = 292 last_leaf->btl_hdr.bth_count; 293 where->bti_before = B_TRUE; 294 } 295 return (NULL); 296 } else if (comp == 0) { 297 if (where != NULL) { 298 where->bti_node = (zfs_btree_hdr_t *)last_leaf; 299 where->bti_offset = 300 last_leaf->btl_hdr.bth_count - 1; 301 where->bti_before = B_FALSE; 302 } 303 return (last_leaf->btl_elems + 304 (last_leaf->btl_hdr.bth_first + 305 last_leaf->btl_hdr.bth_count - 1) * size); 306 } 307 if (tree->bt_compar(last_leaf->btl_elems + 308 last_leaf->btl_hdr.bth_first * size, value) <= 0) { 309 /* 310 * If what they're looking for is after the first 311 * element in the last leaf, it's in the last leaf or 312 * it's not in the tree. 313 */ 314 void *d = tree->bt_find_in_buf(tree, 315 last_leaf->btl_elems + 316 last_leaf->btl_hdr.bth_first * size, 317 last_leaf->btl_hdr.bth_count, value, &idx); 318 319 if (where != NULL) { 320 idx.bti_node = (zfs_btree_hdr_t *)last_leaf; 321 *where = idx; 322 } 323 return (d); 324 } 325 } 326 327 zfs_btree_core_t *node = NULL; 328 uint32_t child = 0; 329 uint32_t depth = 0; 330 331 /* 332 * Iterate down the tree, finding which child the value should be in 333 * by comparing with the separators. 334 */ 335 for (node = (zfs_btree_core_t *)tree->bt_root; depth < tree->bt_height; 336 node = (zfs_btree_core_t *)node->btc_children[child], depth++) { 337 ASSERT3P(node, !=, NULL); 338 void *d = tree->bt_find_in_buf(tree, node->btc_elems, 339 node->btc_hdr.bth_count, value, &idx); 340 EQUIV(d != NULL, !idx.bti_before); 341 if (d != NULL) { 342 if (where != NULL) { 343 idx.bti_node = (zfs_btree_hdr_t *)node; 344 *where = idx; 345 } 346 return (d); 347 } 348 ASSERT(idx.bti_before); 349 child = idx.bti_offset; 350 } 351 352 /* 353 * The value is in this leaf, or it would be if it were in the 354 * tree. Find its proper location and return it. 355 */ 356 zfs_btree_leaf_t *leaf = (depth == 0 ? 357 (zfs_btree_leaf_t *)tree->bt_root : (zfs_btree_leaf_t *)node); 358 void *d = tree->bt_find_in_buf(tree, leaf->btl_elems + 359 leaf->btl_hdr.bth_first * size, 360 leaf->btl_hdr.bth_count, value, &idx); 361 362 if (where != NULL) { 363 idx.bti_node = (zfs_btree_hdr_t *)leaf; 364 *where = idx; 365 } 366 367 return (d); 368 } 369 370 /* 371 * To explain the following functions, it is useful to understand the four 372 * kinds of shifts used in btree operation. First, a shift is a movement of 373 * elements within a node. It is used to create gaps for inserting new 374 * elements and children, or cover gaps created when things are removed. A 375 * shift has two fundamental properties, each of which can be one of two 376 * values, making four types of shifts. There is the direction of the shift 377 * (left or right) and the shape of the shift (parallelogram or isoceles 378 * trapezoid (shortened to trapezoid hereafter)). The shape distinction only 379 * applies to shifts of core nodes. 380 * 381 * The names derive from the following imagining of the layout of a node: 382 * 383 * Elements: * * * * * * * ... * * * 384 * Children: * * * * * * * * ... * * * 385 * 386 * This layout follows from the fact that the elements act as separators 387 * between pairs of children, and that children root subtrees "below" the 388 * current node. A left and right shift are fairly self-explanatory; a left 389 * shift moves things to the left, while a right shift moves things to the 390 * right. A parallelogram shift is a shift with the same number of elements 391 * and children being moved, while a trapezoid shift is a shift that moves one 392 * more children than elements. An example follows: 393 * 394 * A parallelogram shift could contain the following: 395 * _______________ 396 * \* * * * \ * * * ... * * * 397 * * \ * * * *\ * * * ... * * * 398 * --------------- 399 * A trapezoid shift could contain the following: 400 * ___________ 401 * * / * * * \ * * * ... * * * 402 * * / * * * *\ * * * ... * * * 403 * --------------- 404 * 405 * Note that a parallelogram shift is always shaped like a "left-leaning" 406 * parallelogram, where the starting index of the children being moved is 407 * always one higher than the starting index of the elements being moved. No 408 * "right-leaning" parallelogram shifts are needed (shifts where the starting 409 * element index and starting child index being moved are the same) to achieve 410 * any btree operations, so we ignore them. 411 */ 412 413 enum bt_shift_shape { 414 BSS_TRAPEZOID, 415 BSS_PARALLELOGRAM 416 }; 417 418 enum bt_shift_direction { 419 BSD_LEFT, 420 BSD_RIGHT 421 }; 422 423 /* 424 * Shift elements and children in the provided core node by off spots. The 425 * first element moved is idx, and count elements are moved. The shape of the 426 * shift is determined by shape. The direction is determined by dir. 427 */ 428 static inline void 429 bt_shift_core(zfs_btree_t *tree, zfs_btree_core_t *node, uint32_t idx, 430 uint32_t count, uint32_t off, enum bt_shift_shape shape, 431 enum bt_shift_direction dir) 432 { 433 size_t size = tree->bt_elem_size; 434 ASSERT(zfs_btree_is_core(&node->btc_hdr)); 435 436 uint8_t *e_start = node->btc_elems + idx * size; 437 uint8_t *e_out = (dir == BSD_LEFT ? e_start - off * size : 438 e_start + off * size); 439 bmov(e_start, e_out, count * size); 440 441 zfs_btree_hdr_t **c_start = node->btc_children + idx + 442 (shape == BSS_TRAPEZOID ? 0 : 1); 443 zfs_btree_hdr_t **c_out = (dir == BSD_LEFT ? c_start - off : 444 c_start + off); 445 uint32_t c_count = count + (shape == BSS_TRAPEZOID ? 1 : 0); 446 bmov(c_start, c_out, c_count * sizeof (*c_start)); 447 } 448 449 /* 450 * Shift elements and children in the provided core node left by one spot. 451 * The first element moved is idx, and count elements are moved. The 452 * shape of the shift is determined by trap; true if the shift is a trapezoid, 453 * false if it is a parallelogram. 454 */ 455 static inline void 456 bt_shift_core_left(zfs_btree_t *tree, zfs_btree_core_t *node, uint32_t idx, 457 uint32_t count, enum bt_shift_shape shape) 458 { 459 bt_shift_core(tree, node, idx, count, 1, shape, BSD_LEFT); 460 } 461 462 /* 463 * Shift elements and children in the provided core node right by one spot. 464 * Starts with elements[idx] and children[idx] and one more child than element. 465 */ 466 static inline void 467 bt_shift_core_right(zfs_btree_t *tree, zfs_btree_core_t *node, uint32_t idx, 468 uint32_t count, enum bt_shift_shape shape) 469 { 470 bt_shift_core(tree, node, idx, count, 1, shape, BSD_RIGHT); 471 } 472 473 /* 474 * Shift elements and children in the provided leaf node by off spots. 475 * The first element moved is idx, and count elements are moved. The direction 476 * is determined by left. 477 */ 478 static inline void 479 bt_shift_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *node, uint32_t idx, 480 uint32_t count, uint32_t off, enum bt_shift_direction dir) 481 { 482 size_t size = tree->bt_elem_size; 483 zfs_btree_hdr_t *hdr = &node->btl_hdr; 484 ASSERT(!zfs_btree_is_core(hdr)); 485 486 if (count == 0) 487 return; 488 uint8_t *start = node->btl_elems + (hdr->bth_first + idx) * size; 489 uint8_t *out = (dir == BSD_LEFT ? start - off * size : 490 start + off * size); 491 bmov(start, out, count * size); 492 } 493 494 /* 495 * Grow leaf for n new elements before idx. 496 */ 497 static void 498 bt_grow_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, uint32_t idx, 499 uint32_t n) 500 { 501 zfs_btree_hdr_t *hdr = &leaf->btl_hdr; 502 ASSERT(!zfs_btree_is_core(hdr)); 503 ASSERT3U(idx, <=, hdr->bth_count); 504 uint32_t capacity = tree->bt_leaf_cap; 505 ASSERT3U(hdr->bth_count + n, <=, capacity); 506 boolean_t cl = (hdr->bth_first >= n); 507 boolean_t cr = (hdr->bth_first + hdr->bth_count + n <= capacity); 508 509 if (cl && (!cr || idx <= hdr->bth_count / 2)) { 510 /* Grow left. */ 511 hdr->bth_first -= n; 512 bt_shift_leaf(tree, leaf, n, idx, n, BSD_LEFT); 513 } else if (cr) { 514 /* Grow right. */ 515 bt_shift_leaf(tree, leaf, idx, hdr->bth_count - idx, n, 516 BSD_RIGHT); 517 } else { 518 /* Grow both ways. */ 519 uint32_t fn = hdr->bth_first - 520 (capacity - (hdr->bth_count + n)) / 2; 521 hdr->bth_first -= fn; 522 bt_shift_leaf(tree, leaf, fn, idx, fn, BSD_LEFT); 523 bt_shift_leaf(tree, leaf, fn + idx, hdr->bth_count - idx, 524 n - fn, BSD_RIGHT); 525 } 526 hdr->bth_count += n; 527 } 528 529 /* 530 * Shrink leaf for count elements starting from idx. 531 */ 532 static void 533 bt_shrink_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, uint32_t idx, 534 uint32_t n) 535 { 536 zfs_btree_hdr_t *hdr = &leaf->btl_hdr; 537 ASSERT(!zfs_btree_is_core(hdr)); 538 ASSERT3U(idx, <=, hdr->bth_count); 539 ASSERT3U(idx + n, <=, hdr->bth_count); 540 541 if (idx <= (hdr->bth_count - n) / 2) { 542 bt_shift_leaf(tree, leaf, 0, idx, n, BSD_RIGHT); 543 zfs_btree_poison_node_at(tree, hdr, 0, n); 544 hdr->bth_first += n; 545 } else { 546 bt_shift_leaf(tree, leaf, idx + n, hdr->bth_count - idx - n, n, 547 BSD_LEFT); 548 zfs_btree_poison_node_at(tree, hdr, hdr->bth_count - n, n); 549 } 550 hdr->bth_count -= n; 551 } 552 553 /* 554 * Move children and elements from one core node to another. The shape 555 * parameter behaves the same as it does in the shift logic. 556 */ 557 static inline void 558 bt_transfer_core(zfs_btree_t *tree, zfs_btree_core_t *source, uint32_t sidx, 559 uint32_t count, zfs_btree_core_t *dest, uint32_t didx, 560 enum bt_shift_shape shape) 561 { 562 size_t size = tree->bt_elem_size; 563 ASSERT(zfs_btree_is_core(&source->btc_hdr)); 564 ASSERT(zfs_btree_is_core(&dest->btc_hdr)); 565 566 bcpy(source->btc_elems + sidx * size, dest->btc_elems + didx * size, 567 count * size); 568 569 uint32_t c_count = count + (shape == BSS_TRAPEZOID ? 1 : 0); 570 bcpy(source->btc_children + sidx + (shape == BSS_TRAPEZOID ? 0 : 1), 571 dest->btc_children + didx + (shape == BSS_TRAPEZOID ? 0 : 1), 572 c_count * sizeof (*source->btc_children)); 573 } 574 575 static inline void 576 bt_transfer_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *source, uint32_t sidx, 577 uint32_t count, zfs_btree_leaf_t *dest, uint32_t didx) 578 { 579 size_t size = tree->bt_elem_size; 580 ASSERT(!zfs_btree_is_core(&source->btl_hdr)); 581 ASSERT(!zfs_btree_is_core(&dest->btl_hdr)); 582 583 bcpy(source->btl_elems + (source->btl_hdr.bth_first + sidx) * size, 584 dest->btl_elems + (dest->btl_hdr.bth_first + didx) * size, 585 count * size); 586 } 587 588 /* 589 * Find the first element in the subtree rooted at hdr, return its value and 590 * put its location in where if non-null. 591 */ 592 static void * 593 zfs_btree_first_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr, 594 zfs_btree_index_t *where) 595 { 596 zfs_btree_hdr_t *node; 597 598 for (node = hdr; zfs_btree_is_core(node); 599 node = ((zfs_btree_core_t *)node)->btc_children[0]) 600 ; 601 602 ASSERT(!zfs_btree_is_core(node)); 603 zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)node; 604 if (where != NULL) { 605 where->bti_node = node; 606 where->bti_offset = 0; 607 where->bti_before = B_FALSE; 608 } 609 return (&leaf->btl_elems[node->bth_first * tree->bt_elem_size]); 610 } 611 612 /* Insert an element and a child into a core node at the given offset. */ 613 static void 614 zfs_btree_insert_core_impl(zfs_btree_t *tree, zfs_btree_core_t *parent, 615 uint32_t offset, zfs_btree_hdr_t *new_node, void *buf) 616 { 617 size_t size = tree->bt_elem_size; 618 zfs_btree_hdr_t *par_hdr = &parent->btc_hdr; 619 ASSERT3P(par_hdr, ==, new_node->bth_parent); 620 ASSERT3U(par_hdr->bth_count, <, BTREE_CORE_ELEMS); 621 622 if (zfs_btree_verify_intensity >= 5) { 623 zfs_btree_verify_poison_at(tree, par_hdr, 624 par_hdr->bth_count); 625 } 626 /* Shift existing elements and children */ 627 uint32_t count = par_hdr->bth_count - offset; 628 bt_shift_core_right(tree, parent, offset, count, 629 BSS_PARALLELOGRAM); 630 631 /* Insert new values */ 632 parent->btc_children[offset + 1] = new_node; 633 bcpy(buf, parent->btc_elems + offset * size, size); 634 par_hdr->bth_count++; 635 } 636 637 /* 638 * Insert new_node into the parent of old_node directly after old_node, with 639 * buf as the dividing element between the two. 640 */ 641 static void 642 zfs_btree_insert_into_parent(zfs_btree_t *tree, zfs_btree_hdr_t *old_node, 643 zfs_btree_hdr_t *new_node, void *buf) 644 { 645 ASSERT3P(old_node->bth_parent, ==, new_node->bth_parent); 646 size_t size = tree->bt_elem_size; 647 zfs_btree_core_t *parent = old_node->bth_parent; 648 649 /* 650 * If this is the root node we were splitting, we create a new root 651 * and increase the height of the tree. 652 */ 653 if (parent == NULL) { 654 ASSERT3P(old_node, ==, tree->bt_root); 655 tree->bt_num_nodes++; 656 zfs_btree_core_t *new_root = 657 kmem_alloc(sizeof (zfs_btree_core_t) + BTREE_CORE_ELEMS * 658 size, KM_SLEEP); 659 zfs_btree_hdr_t *new_root_hdr = &new_root->btc_hdr; 660 new_root_hdr->bth_parent = NULL; 661 new_root_hdr->bth_first = -1; 662 new_root_hdr->bth_count = 1; 663 664 old_node->bth_parent = new_node->bth_parent = new_root; 665 new_root->btc_children[0] = old_node; 666 new_root->btc_children[1] = new_node; 667 bcpy(buf, new_root->btc_elems, size); 668 669 tree->bt_height++; 670 tree->bt_root = new_root_hdr; 671 zfs_btree_poison_node(tree, new_root_hdr); 672 return; 673 } 674 675 /* 676 * Since we have the new separator, binary search for where to put 677 * new_node. 678 */ 679 zfs_btree_hdr_t *par_hdr = &parent->btc_hdr; 680 zfs_btree_index_t idx; 681 ASSERT(zfs_btree_is_core(par_hdr)); 682 VERIFY3P(tree->bt_find_in_buf(tree, parent->btc_elems, 683 par_hdr->bth_count, buf, &idx), ==, NULL); 684 ASSERT(idx.bti_before); 685 uint32_t offset = idx.bti_offset; 686 ASSERT3U(offset, <=, par_hdr->bth_count); 687 ASSERT3P(parent->btc_children[offset], ==, old_node); 688 689 /* 690 * If the parent isn't full, shift things to accommodate our insertions 691 * and return. 692 */ 693 if (par_hdr->bth_count != BTREE_CORE_ELEMS) { 694 zfs_btree_insert_core_impl(tree, parent, offset, new_node, buf); 695 return; 696 } 697 698 /* 699 * We need to split this core node into two. Currently there are 700 * BTREE_CORE_ELEMS + 1 child nodes, and we are adding one for 701 * BTREE_CORE_ELEMS + 2. Some of the children will be part of the 702 * current node, and the others will be moved to the new core node. 703 * There are BTREE_CORE_ELEMS + 1 elements including the new one. One 704 * will be used as the new separator in our parent, and the others 705 * will be split among the two core nodes. 706 * 707 * Usually we will split the node in half evenly, with 708 * BTREE_CORE_ELEMS/2 elements in each node. If we're bulk loading, we 709 * instead move only about a quarter of the elements (and children) to 710 * the new node. Since the average state after a long time is a 3/4 711 * full node, shortcutting directly to that state improves efficiency. 712 * 713 * We do this in two stages: first we split into two nodes, and then we 714 * reuse our existing logic to insert the new element and child. 715 */ 716 uint32_t move_count = MAX((BTREE_CORE_ELEMS / (tree->bt_bulk == NULL ? 717 2 : 4)) - 1, 2); 718 uint32_t keep_count = BTREE_CORE_ELEMS - move_count - 1; 719 ASSERT3U(BTREE_CORE_ELEMS - move_count, >=, 2); 720 tree->bt_num_nodes++; 721 zfs_btree_core_t *new_parent = kmem_alloc(sizeof (zfs_btree_core_t) + 722 BTREE_CORE_ELEMS * size, KM_SLEEP); 723 zfs_btree_hdr_t *new_par_hdr = &new_parent->btc_hdr; 724 new_par_hdr->bth_parent = par_hdr->bth_parent; 725 new_par_hdr->bth_first = -1; 726 new_par_hdr->bth_count = move_count; 727 zfs_btree_poison_node(tree, new_par_hdr); 728 729 par_hdr->bth_count = keep_count; 730 731 bt_transfer_core(tree, parent, keep_count + 1, move_count, new_parent, 732 0, BSS_TRAPEZOID); 733 734 /* Store the new separator in a buffer. */ 735 uint8_t *tmp_buf = kmem_alloc(size, KM_SLEEP); 736 bcpy(parent->btc_elems + keep_count * size, tmp_buf, 737 size); 738 zfs_btree_poison_node(tree, par_hdr); 739 740 if (offset < keep_count) { 741 /* Insert the new node into the left half */ 742 zfs_btree_insert_core_impl(tree, parent, offset, new_node, 743 buf); 744 745 /* 746 * Move the new separator to the existing buffer. 747 */ 748 bcpy(tmp_buf, buf, size); 749 } else if (offset > keep_count) { 750 /* Insert the new node into the right half */ 751 new_node->bth_parent = new_parent; 752 zfs_btree_insert_core_impl(tree, new_parent, 753 offset - keep_count - 1, new_node, buf); 754 755 /* 756 * Move the new separator to the existing buffer. 757 */ 758 bcpy(tmp_buf, buf, size); 759 } else { 760 /* 761 * Move the new separator into the right half, and replace it 762 * with buf. We also need to shift back the elements in the 763 * right half to accommodate new_node. 764 */ 765 bt_shift_core_right(tree, new_parent, 0, move_count, 766 BSS_TRAPEZOID); 767 new_parent->btc_children[0] = new_node; 768 bcpy(tmp_buf, new_parent->btc_elems, size); 769 new_par_hdr->bth_count++; 770 } 771 kmem_free(tmp_buf, size); 772 zfs_btree_poison_node(tree, par_hdr); 773 774 for (uint32_t i = 0; i <= new_parent->btc_hdr.bth_count; i++) 775 new_parent->btc_children[i]->bth_parent = new_parent; 776 777 for (uint32_t i = 0; i <= parent->btc_hdr.bth_count; i++) 778 ASSERT3P(parent->btc_children[i]->bth_parent, ==, parent); 779 780 /* 781 * Now that the node is split, we need to insert the new node into its 782 * parent. This may cause further splitting. 783 */ 784 zfs_btree_insert_into_parent(tree, &parent->btc_hdr, 785 &new_parent->btc_hdr, buf); 786 } 787 788 /* Insert an element into a leaf node at the given offset. */ 789 static void 790 zfs_btree_insert_leaf_impl(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, 791 uint32_t idx, const void *value) 792 { 793 size_t size = tree->bt_elem_size; 794 zfs_btree_hdr_t *hdr = &leaf->btl_hdr; 795 ASSERT3U(leaf->btl_hdr.bth_count, <, tree->bt_leaf_cap); 796 797 if (zfs_btree_verify_intensity >= 5) { 798 zfs_btree_verify_poison_at(tree, &leaf->btl_hdr, 799 leaf->btl_hdr.bth_count); 800 } 801 802 bt_grow_leaf(tree, leaf, idx, 1); 803 uint8_t *start = leaf->btl_elems + (hdr->bth_first + idx) * size; 804 bcpy(value, start, size); 805 } 806 807 static void 808 zfs_btree_verify_order_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr); 809 810 /* Helper function for inserting a new value into leaf at the given index. */ 811 static void 812 zfs_btree_insert_into_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, 813 const void *value, uint32_t idx) 814 { 815 size_t size = tree->bt_elem_size; 816 uint32_t capacity = tree->bt_leaf_cap; 817 818 /* 819 * If the leaf isn't full, shift the elements after idx and insert 820 * value. 821 */ 822 if (leaf->btl_hdr.bth_count != capacity) { 823 zfs_btree_insert_leaf_impl(tree, leaf, idx, value); 824 return; 825 } 826 827 /* 828 * Otherwise, we split the leaf node into two nodes. If we're not bulk 829 * inserting, each is of size (capacity / 2). If we are bulk 830 * inserting, we move a quarter of the elements to the new node so 831 * inserts into the old node don't cause immediate splitting but the 832 * tree stays relatively dense. Since the average state after a long 833 * time is a 3/4 full node, shortcutting directly to that state 834 * improves efficiency. At the end of the bulk insertion process 835 * we'll need to go through and fix up any nodes (the last leaf and 836 * its ancestors, potentially) that are below the minimum. 837 * 838 * In either case, we're left with one extra element. The leftover 839 * element will become the new dividing element between the two nodes. 840 */ 841 uint32_t move_count = MAX(capacity / (tree->bt_bulk ? 4 : 2), 1) - 1; 842 uint32_t keep_count = capacity - move_count - 1; 843 ASSERT3U(keep_count, >=, 1); 844 /* If we insert on left. move one more to keep leaves balanced. */ 845 if (idx < keep_count) { 846 keep_count--; 847 move_count++; 848 } 849 tree->bt_num_nodes++; 850 zfs_btree_leaf_t *new_leaf = zfs_btree_leaf_alloc(tree); 851 zfs_btree_hdr_t *new_hdr = &new_leaf->btl_hdr; 852 new_hdr->bth_parent = leaf->btl_hdr.bth_parent; 853 new_hdr->bth_first = (tree->bt_bulk ? 0 : capacity / 4) + 854 (idx >= keep_count && idx <= keep_count + move_count / 2); 855 new_hdr->bth_count = move_count; 856 zfs_btree_poison_node(tree, new_hdr); 857 858 if (tree->bt_bulk != NULL && leaf == tree->bt_bulk) 859 tree->bt_bulk = new_leaf; 860 861 /* Copy the back part to the new leaf. */ 862 bt_transfer_leaf(tree, leaf, keep_count + 1, move_count, new_leaf, 0); 863 864 /* We store the new separator in a buffer we control for simplicity. */ 865 uint8_t *buf = kmem_alloc(size, KM_SLEEP); 866 bcpy(leaf->btl_elems + (leaf->btl_hdr.bth_first + keep_count) * size, 867 buf, size); 868 869 bt_shrink_leaf(tree, leaf, keep_count, 1 + move_count); 870 871 if (idx < keep_count) { 872 /* Insert into the existing leaf. */ 873 zfs_btree_insert_leaf_impl(tree, leaf, idx, value); 874 } else if (idx > keep_count) { 875 /* Insert into the new leaf. */ 876 zfs_btree_insert_leaf_impl(tree, new_leaf, idx - keep_count - 877 1, value); 878 } else { 879 /* 880 * Insert planned separator into the new leaf, and use 881 * the new value as the new separator. 882 */ 883 zfs_btree_insert_leaf_impl(tree, new_leaf, 0, buf); 884 bcpy(value, buf, size); 885 } 886 887 /* 888 * Now that the node is split, we need to insert the new node into its 889 * parent. This may cause further splitting, bur only of core nodes. 890 */ 891 zfs_btree_insert_into_parent(tree, &leaf->btl_hdr, &new_leaf->btl_hdr, 892 buf); 893 kmem_free(buf, size); 894 } 895 896 static uint32_t 897 zfs_btree_find_parent_idx(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) 898 { 899 void *buf; 900 if (zfs_btree_is_core(hdr)) { 901 buf = ((zfs_btree_core_t *)hdr)->btc_elems; 902 } else { 903 buf = ((zfs_btree_leaf_t *)hdr)->btl_elems + 904 hdr->bth_first * tree->bt_elem_size; 905 } 906 zfs_btree_index_t idx; 907 zfs_btree_core_t *parent = hdr->bth_parent; 908 VERIFY3P(tree->bt_find_in_buf(tree, parent->btc_elems, 909 parent->btc_hdr.bth_count, buf, &idx), ==, NULL); 910 ASSERT(idx.bti_before); 911 ASSERT3U(idx.bti_offset, <=, parent->btc_hdr.bth_count); 912 ASSERT3P(parent->btc_children[idx.bti_offset], ==, hdr); 913 return (idx.bti_offset); 914 } 915 916 /* 917 * Take the b-tree out of bulk insert mode. During bulk-insert mode, some 918 * nodes may violate the invariant that non-root nodes must be at least half 919 * full. All nodes violating this invariant should be the last node in their 920 * particular level. To correct the invariant, we take values from their left 921 * neighbor until they are half full. They must have a left neighbor at their 922 * level because the last node at a level is not the first node unless it's 923 * the root. 924 */ 925 static void 926 zfs_btree_bulk_finish(zfs_btree_t *tree) 927 { 928 ASSERT3P(tree->bt_bulk, !=, NULL); 929 ASSERT3P(tree->bt_root, !=, NULL); 930 zfs_btree_leaf_t *leaf = tree->bt_bulk; 931 zfs_btree_hdr_t *hdr = &leaf->btl_hdr; 932 zfs_btree_core_t *parent = hdr->bth_parent; 933 size_t size = tree->bt_elem_size; 934 uint32_t capacity = tree->bt_leaf_cap; 935 936 /* 937 * The invariant doesn't apply to the root node, if that's the only 938 * node in the tree we're done. 939 */ 940 if (parent == NULL) { 941 tree->bt_bulk = NULL; 942 return; 943 } 944 945 /* First, take elements to rebalance the leaf node. */ 946 if (hdr->bth_count < capacity / 2) { 947 /* 948 * First, find the left neighbor. The simplest way to do this 949 * is to call zfs_btree_prev twice; the first time finds some 950 * ancestor of this node, and the second time finds the left 951 * neighbor. The ancestor found is the lowest common ancestor 952 * of leaf and the neighbor. 953 */ 954 zfs_btree_index_t idx = { 955 .bti_node = hdr, 956 .bti_offset = 0 957 }; 958 VERIFY3P(zfs_btree_prev(tree, &idx, &idx), !=, NULL); 959 ASSERT(zfs_btree_is_core(idx.bti_node)); 960 zfs_btree_core_t *common = (zfs_btree_core_t *)idx.bti_node; 961 uint32_t common_idx = idx.bti_offset; 962 963 VERIFY3P(zfs_btree_prev(tree, &idx, &idx), !=, NULL); 964 ASSERT(!zfs_btree_is_core(idx.bti_node)); 965 zfs_btree_leaf_t *l_neighbor = (zfs_btree_leaf_t *)idx.bti_node; 966 zfs_btree_hdr_t *l_hdr = idx.bti_node; 967 uint32_t move_count = (capacity / 2) - hdr->bth_count; 968 ASSERT3U(l_neighbor->btl_hdr.bth_count - move_count, >=, 969 capacity / 2); 970 971 if (zfs_btree_verify_intensity >= 5) { 972 for (uint32_t i = 0; i < move_count; i++) { 973 zfs_btree_verify_poison_at(tree, hdr, 974 leaf->btl_hdr.bth_count + i); 975 } 976 } 977 978 /* First, shift elements in leaf back. */ 979 bt_grow_leaf(tree, leaf, 0, move_count); 980 981 /* Next, move the separator from the common ancestor to leaf. */ 982 uint8_t *separator = common->btc_elems + common_idx * size; 983 uint8_t *out = leaf->btl_elems + 984 (hdr->bth_first + move_count - 1) * size; 985 bcpy(separator, out, size); 986 987 /* 988 * Now we move elements from the tail of the left neighbor to 989 * fill the remaining spots in leaf. 990 */ 991 bt_transfer_leaf(tree, l_neighbor, l_hdr->bth_count - 992 (move_count - 1), move_count - 1, leaf, 0); 993 994 /* 995 * Finally, move the new last element in the left neighbor to 996 * the separator. 997 */ 998 bcpy(l_neighbor->btl_elems + (l_hdr->bth_first + 999 l_hdr->bth_count - move_count) * size, separator, size); 1000 1001 /* Adjust the node's counts, and we're done. */ 1002 bt_shrink_leaf(tree, l_neighbor, l_hdr->bth_count - move_count, 1003 move_count); 1004 1005 ASSERT3U(l_hdr->bth_count, >=, capacity / 2); 1006 ASSERT3U(hdr->bth_count, >=, capacity / 2); 1007 } 1008 1009 /* 1010 * Now we have to rebalance any ancestors of leaf that may also 1011 * violate the invariant. 1012 */ 1013 capacity = BTREE_CORE_ELEMS; 1014 while (parent->btc_hdr.bth_parent != NULL) { 1015 zfs_btree_core_t *cur = parent; 1016 zfs_btree_hdr_t *hdr = &cur->btc_hdr; 1017 parent = hdr->bth_parent; 1018 /* 1019 * If the invariant isn't violated, move on to the next 1020 * ancestor. 1021 */ 1022 if (hdr->bth_count >= capacity / 2) 1023 continue; 1024 1025 /* 1026 * Because the smallest number of nodes we can move when 1027 * splitting is 2, we never need to worry about not having a 1028 * left sibling (a sibling is a neighbor with the same parent). 1029 */ 1030 uint32_t parent_idx = zfs_btree_find_parent_idx(tree, hdr); 1031 ASSERT3U(parent_idx, >, 0); 1032 zfs_btree_core_t *l_neighbor = 1033 (zfs_btree_core_t *)parent->btc_children[parent_idx - 1]; 1034 uint32_t move_count = (capacity / 2) - hdr->bth_count; 1035 ASSERT3U(l_neighbor->btc_hdr.bth_count - move_count, >=, 1036 capacity / 2); 1037 1038 if (zfs_btree_verify_intensity >= 5) { 1039 for (uint32_t i = 0; i < move_count; i++) { 1040 zfs_btree_verify_poison_at(tree, hdr, 1041 hdr->bth_count + i); 1042 } 1043 } 1044 /* First, shift things in the right node back. */ 1045 bt_shift_core(tree, cur, 0, hdr->bth_count, move_count, 1046 BSS_TRAPEZOID, BSD_RIGHT); 1047 1048 /* Next, move the separator to the right node. */ 1049 uint8_t *separator = parent->btc_elems + ((parent_idx - 1) * 1050 size); 1051 uint8_t *e_out = cur->btc_elems + ((move_count - 1) * size); 1052 bcpy(separator, e_out, size); 1053 1054 /* 1055 * Now, move elements and children from the left node to the 1056 * right. We move one more child than elements. 1057 */ 1058 move_count--; 1059 uint32_t move_idx = l_neighbor->btc_hdr.bth_count - move_count; 1060 bt_transfer_core(tree, l_neighbor, move_idx, move_count, cur, 0, 1061 BSS_TRAPEZOID); 1062 1063 /* 1064 * Finally, move the last element in the left node to the 1065 * separator's position. 1066 */ 1067 move_idx--; 1068 bcpy(l_neighbor->btc_elems + move_idx * size, separator, size); 1069 1070 l_neighbor->btc_hdr.bth_count -= move_count + 1; 1071 hdr->bth_count += move_count + 1; 1072 1073 ASSERT3U(l_neighbor->btc_hdr.bth_count, >=, capacity / 2); 1074 ASSERT3U(hdr->bth_count, >=, capacity / 2); 1075 1076 zfs_btree_poison_node(tree, &l_neighbor->btc_hdr); 1077 1078 for (uint32_t i = 0; i <= hdr->bth_count; i++) 1079 cur->btc_children[i]->bth_parent = cur; 1080 } 1081 1082 tree->bt_bulk = NULL; 1083 zfs_btree_verify(tree); 1084 } 1085 1086 /* 1087 * Insert value into tree at the location specified by where. 1088 */ 1089 void 1090 zfs_btree_add_idx(zfs_btree_t *tree, const void *value, 1091 const zfs_btree_index_t *where) 1092 { 1093 zfs_btree_index_t idx = {0}; 1094 1095 /* If we're not inserting in the last leaf, end bulk insert mode. */ 1096 if (tree->bt_bulk != NULL) { 1097 if (where->bti_node != &tree->bt_bulk->btl_hdr) { 1098 zfs_btree_bulk_finish(tree); 1099 VERIFY3P(zfs_btree_find(tree, value, &idx), ==, NULL); 1100 where = &idx; 1101 } 1102 } 1103 1104 tree->bt_num_elems++; 1105 /* 1106 * If this is the first element in the tree, create a leaf root node 1107 * and add the value to it. 1108 */ 1109 if (where->bti_node == NULL) { 1110 ASSERT3U(tree->bt_num_elems, ==, 1); 1111 ASSERT3S(tree->bt_height, ==, -1); 1112 ASSERT3P(tree->bt_root, ==, NULL); 1113 ASSERT0(where->bti_offset); 1114 1115 tree->bt_num_nodes++; 1116 zfs_btree_leaf_t *leaf = zfs_btree_leaf_alloc(tree); 1117 tree->bt_root = &leaf->btl_hdr; 1118 tree->bt_height++; 1119 1120 zfs_btree_hdr_t *hdr = &leaf->btl_hdr; 1121 hdr->bth_parent = NULL; 1122 hdr->bth_first = 0; 1123 hdr->bth_count = 0; 1124 zfs_btree_poison_node(tree, hdr); 1125 1126 zfs_btree_insert_into_leaf(tree, leaf, value, 0); 1127 tree->bt_bulk = leaf; 1128 } else if (!zfs_btree_is_core(where->bti_node)) { 1129 /* 1130 * If we're inserting into a leaf, go directly to the helper 1131 * function. 1132 */ 1133 zfs_btree_insert_into_leaf(tree, 1134 (zfs_btree_leaf_t *)where->bti_node, value, 1135 where->bti_offset); 1136 } else { 1137 /* 1138 * If we're inserting into a core node, we can't just shift 1139 * the existing element in that slot in the same node without 1140 * breaking our ordering invariants. Instead we place the new 1141 * value in the node at that spot and then insert the old 1142 * separator into the first slot in the subtree to the right. 1143 */ 1144 zfs_btree_core_t *node = (zfs_btree_core_t *)where->bti_node; 1145 1146 /* 1147 * We can ignore bti_before, because either way the value 1148 * should end up in bti_offset. 1149 */ 1150 uint32_t off = where->bti_offset; 1151 zfs_btree_hdr_t *subtree = node->btc_children[off + 1]; 1152 size_t size = tree->bt_elem_size; 1153 uint8_t *buf = kmem_alloc(size, KM_SLEEP); 1154 bcpy(node->btc_elems + off * size, buf, size); 1155 bcpy(value, node->btc_elems + off * size, size); 1156 1157 /* 1158 * Find the first slot in the subtree to the right, insert 1159 * there. 1160 */ 1161 zfs_btree_index_t new_idx; 1162 VERIFY3P(zfs_btree_first_helper(tree, subtree, &new_idx), !=, 1163 NULL); 1164 ASSERT0(new_idx.bti_offset); 1165 ASSERT(!zfs_btree_is_core(new_idx.bti_node)); 1166 zfs_btree_insert_into_leaf(tree, 1167 (zfs_btree_leaf_t *)new_idx.bti_node, buf, 0); 1168 kmem_free(buf, size); 1169 } 1170 zfs_btree_verify(tree); 1171 } 1172 1173 /* 1174 * Return the first element in the tree, and put its location in where if 1175 * non-null. 1176 */ 1177 void * 1178 zfs_btree_first(zfs_btree_t *tree, zfs_btree_index_t *where) 1179 { 1180 if (tree->bt_height == -1) { 1181 ASSERT0(tree->bt_num_elems); 1182 return (NULL); 1183 } 1184 return (zfs_btree_first_helper(tree, tree->bt_root, where)); 1185 } 1186 1187 /* 1188 * Find the last element in the subtree rooted at hdr, return its value and 1189 * put its location in where if non-null. 1190 */ 1191 static void * 1192 zfs_btree_last_helper(zfs_btree_t *btree, zfs_btree_hdr_t *hdr, 1193 zfs_btree_index_t *where) 1194 { 1195 zfs_btree_hdr_t *node; 1196 1197 for (node = hdr; zfs_btree_is_core(node); node = 1198 ((zfs_btree_core_t *)node)->btc_children[node->bth_count]) 1199 ; 1200 1201 zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)node; 1202 if (where != NULL) { 1203 where->bti_node = node; 1204 where->bti_offset = node->bth_count - 1; 1205 where->bti_before = B_FALSE; 1206 } 1207 return (leaf->btl_elems + (node->bth_first + node->bth_count - 1) * 1208 btree->bt_elem_size); 1209 } 1210 1211 /* 1212 * Return the last element in the tree, and put its location in where if 1213 * non-null. 1214 */ 1215 void * 1216 zfs_btree_last(zfs_btree_t *tree, zfs_btree_index_t *where) 1217 { 1218 if (tree->bt_height == -1) { 1219 ASSERT0(tree->bt_num_elems); 1220 return (NULL); 1221 } 1222 return (zfs_btree_last_helper(tree, tree->bt_root, where)); 1223 } 1224 1225 /* 1226 * This function contains the logic to find the next node in the tree. A 1227 * helper function is used because there are multiple internal consumemrs of 1228 * this logic. The done_func is used by zfs_btree_destroy_nodes to clean up each 1229 * node after we've finished with it. 1230 */ 1231 static void * 1232 zfs_btree_next_helper(zfs_btree_t *tree, const zfs_btree_index_t *idx, 1233 zfs_btree_index_t *out_idx, 1234 void (*done_func)(zfs_btree_t *, zfs_btree_hdr_t *)) 1235 { 1236 if (idx->bti_node == NULL) { 1237 ASSERT3S(tree->bt_height, ==, -1); 1238 return (NULL); 1239 } 1240 1241 uint32_t offset = idx->bti_offset; 1242 if (!zfs_btree_is_core(idx->bti_node)) { 1243 /* 1244 * When finding the next element of an element in a leaf, 1245 * there are two cases. If the element isn't the last one in 1246 * the leaf, in which case we just return the next element in 1247 * the leaf. Otherwise, we need to traverse up our parents 1248 * until we find one where our ancestor isn't the last child 1249 * of its parent. Once we do, the next element is the 1250 * separator after our ancestor in its parent. 1251 */ 1252 zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node; 1253 uint32_t new_off = offset + (idx->bti_before ? 0 : 1); 1254 if (leaf->btl_hdr.bth_count > new_off) { 1255 out_idx->bti_node = &leaf->btl_hdr; 1256 out_idx->bti_offset = new_off; 1257 out_idx->bti_before = B_FALSE; 1258 return (leaf->btl_elems + (leaf->btl_hdr.bth_first + 1259 new_off) * tree->bt_elem_size); 1260 } 1261 1262 zfs_btree_hdr_t *prev = &leaf->btl_hdr; 1263 for (zfs_btree_core_t *node = leaf->btl_hdr.bth_parent; 1264 node != NULL; node = node->btc_hdr.bth_parent) { 1265 zfs_btree_hdr_t *hdr = &node->btc_hdr; 1266 ASSERT(zfs_btree_is_core(hdr)); 1267 uint32_t i = zfs_btree_find_parent_idx(tree, prev); 1268 if (done_func != NULL) 1269 done_func(tree, prev); 1270 if (i == hdr->bth_count) { 1271 prev = hdr; 1272 continue; 1273 } 1274 out_idx->bti_node = hdr; 1275 out_idx->bti_offset = i; 1276 out_idx->bti_before = B_FALSE; 1277 return (node->btc_elems + i * tree->bt_elem_size); 1278 } 1279 if (done_func != NULL) 1280 done_func(tree, prev); 1281 /* 1282 * We've traversed all the way up and been at the end of the 1283 * node every time, so this was the last element in the tree. 1284 */ 1285 return (NULL); 1286 } 1287 1288 /* If we were before an element in a core node, return that element. */ 1289 ASSERT(zfs_btree_is_core(idx->bti_node)); 1290 zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node; 1291 if (idx->bti_before) { 1292 out_idx->bti_before = B_FALSE; 1293 return (node->btc_elems + offset * tree->bt_elem_size); 1294 } 1295 1296 /* 1297 * The next element from one in a core node is the first element in 1298 * the subtree just to the right of the separator. 1299 */ 1300 zfs_btree_hdr_t *child = node->btc_children[offset + 1]; 1301 return (zfs_btree_first_helper(tree, child, out_idx)); 1302 } 1303 1304 /* 1305 * Return the next valued node in the tree. The same address can be safely 1306 * passed for idx and out_idx. 1307 */ 1308 void * 1309 zfs_btree_next(zfs_btree_t *tree, const zfs_btree_index_t *idx, 1310 zfs_btree_index_t *out_idx) 1311 { 1312 return (zfs_btree_next_helper(tree, idx, out_idx, NULL)); 1313 } 1314 1315 /* 1316 * Return the previous valued node in the tree. The same value can be safely 1317 * passed for idx and out_idx. 1318 */ 1319 void * 1320 zfs_btree_prev(zfs_btree_t *tree, const zfs_btree_index_t *idx, 1321 zfs_btree_index_t *out_idx) 1322 { 1323 if (idx->bti_node == NULL) { 1324 ASSERT3S(tree->bt_height, ==, -1); 1325 return (NULL); 1326 } 1327 1328 uint32_t offset = idx->bti_offset; 1329 if (!zfs_btree_is_core(idx->bti_node)) { 1330 /* 1331 * When finding the previous element of an element in a leaf, 1332 * there are two cases. If the element isn't the first one in 1333 * the leaf, in which case we just return the previous element 1334 * in the leaf. Otherwise, we need to traverse up our parents 1335 * until we find one where our previous ancestor isn't the 1336 * first child. Once we do, the previous element is the 1337 * separator after our previous ancestor. 1338 */ 1339 zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node; 1340 if (offset != 0) { 1341 out_idx->bti_node = &leaf->btl_hdr; 1342 out_idx->bti_offset = offset - 1; 1343 out_idx->bti_before = B_FALSE; 1344 return (leaf->btl_elems + (leaf->btl_hdr.bth_first + 1345 offset - 1) * tree->bt_elem_size); 1346 } 1347 zfs_btree_hdr_t *prev = &leaf->btl_hdr; 1348 for (zfs_btree_core_t *node = leaf->btl_hdr.bth_parent; 1349 node != NULL; node = node->btc_hdr.bth_parent) { 1350 zfs_btree_hdr_t *hdr = &node->btc_hdr; 1351 ASSERT(zfs_btree_is_core(hdr)); 1352 uint32_t i = zfs_btree_find_parent_idx(tree, prev); 1353 if (i == 0) { 1354 prev = hdr; 1355 continue; 1356 } 1357 out_idx->bti_node = hdr; 1358 out_idx->bti_offset = i - 1; 1359 out_idx->bti_before = B_FALSE; 1360 return (node->btc_elems + (i - 1) * tree->bt_elem_size); 1361 } 1362 /* 1363 * We've traversed all the way up and been at the start of the 1364 * node every time, so this was the first node in the tree. 1365 */ 1366 return (NULL); 1367 } 1368 1369 /* 1370 * The previous element from one in a core node is the last element in 1371 * the subtree just to the left of the separator. 1372 */ 1373 ASSERT(zfs_btree_is_core(idx->bti_node)); 1374 zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node; 1375 zfs_btree_hdr_t *child = node->btc_children[offset]; 1376 return (zfs_btree_last_helper(tree, child, out_idx)); 1377 } 1378 1379 /* 1380 * Get the value at the provided index in the tree. 1381 * 1382 * Note that the value returned from this function can be mutated, but only 1383 * if it will not change the ordering of the element with respect to any other 1384 * elements that could be in the tree. 1385 */ 1386 void * 1387 zfs_btree_get(zfs_btree_t *tree, zfs_btree_index_t *idx) 1388 { 1389 ASSERT(!idx->bti_before); 1390 size_t size = tree->bt_elem_size; 1391 if (!zfs_btree_is_core(idx->bti_node)) { 1392 zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node; 1393 return (leaf->btl_elems + (leaf->btl_hdr.bth_first + 1394 idx->bti_offset) * size); 1395 } 1396 zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node; 1397 return (node->btc_elems + idx->bti_offset * size); 1398 } 1399 1400 /* Add the given value to the tree. Must not already be in the tree. */ 1401 void 1402 zfs_btree_add(zfs_btree_t *tree, const void *node) 1403 { 1404 zfs_btree_index_t where = {0}; 1405 VERIFY3P(zfs_btree_find(tree, node, &where), ==, NULL); 1406 zfs_btree_add_idx(tree, node, &where); 1407 } 1408 1409 /* Helper function to free a tree node. */ 1410 static void 1411 zfs_btree_node_destroy(zfs_btree_t *tree, zfs_btree_hdr_t *node) 1412 { 1413 tree->bt_num_nodes--; 1414 if (!zfs_btree_is_core(node)) { 1415 zfs_btree_leaf_free(tree, node); 1416 } else { 1417 kmem_free(node, sizeof (zfs_btree_core_t) + 1418 BTREE_CORE_ELEMS * tree->bt_elem_size); 1419 } 1420 } 1421 1422 /* 1423 * Remove the rm_hdr and the separator to its left from the parent node. The 1424 * buffer that rm_hdr was stored in may already be freed, so its contents 1425 * cannot be accessed. 1426 */ 1427 static void 1428 zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node, 1429 zfs_btree_hdr_t *rm_hdr) 1430 { 1431 size_t size = tree->bt_elem_size; 1432 uint32_t min_count = (BTREE_CORE_ELEMS / 2) - 1; 1433 zfs_btree_hdr_t *hdr = &node->btc_hdr; 1434 /* 1435 * If the node is the root node and rm_hdr is one of two children, 1436 * promote the other child to the root. 1437 */ 1438 if (hdr->bth_parent == NULL && hdr->bth_count <= 1) { 1439 ASSERT3U(hdr->bth_count, ==, 1); 1440 ASSERT3P(tree->bt_root, ==, node); 1441 ASSERT3P(node->btc_children[1], ==, rm_hdr); 1442 tree->bt_root = node->btc_children[0]; 1443 node->btc_children[0]->bth_parent = NULL; 1444 zfs_btree_node_destroy(tree, hdr); 1445 tree->bt_height--; 1446 return; 1447 } 1448 1449 uint32_t idx; 1450 for (idx = 0; idx <= hdr->bth_count; idx++) { 1451 if (node->btc_children[idx] == rm_hdr) 1452 break; 1453 } 1454 ASSERT3U(idx, <=, hdr->bth_count); 1455 1456 /* 1457 * If the node is the root or it has more than the minimum number of 1458 * children, just remove the child and separator, and return. 1459 */ 1460 if (hdr->bth_parent == NULL || 1461 hdr->bth_count > min_count) { 1462 /* 1463 * Shift the element and children to the right of rm_hdr to 1464 * the left by one spot. 1465 */ 1466 bt_shift_core_left(tree, node, idx, hdr->bth_count - idx, 1467 BSS_PARALLELOGRAM); 1468 hdr->bth_count--; 1469 zfs_btree_poison_node_at(tree, hdr, hdr->bth_count, 1); 1470 return; 1471 } 1472 1473 ASSERT3U(hdr->bth_count, ==, min_count); 1474 1475 /* 1476 * Now we try to take a node from a neighbor. We check left, then 1477 * right. If the neighbor exists and has more than the minimum number 1478 * of elements, we move the separator between us and them to our 1479 * node, move their closest element (last for left, first for right) 1480 * to the separator, and move their closest child to our node. Along 1481 * the way we need to collapse the gap made by idx, and (for our right 1482 * neighbor) the gap made by removing their first element and child. 1483 * 1484 * Note: this logic currently doesn't support taking from a neighbor 1485 * that isn't a sibling (i.e. a neighbor with a different 1486 * parent). This isn't critical functionality, but may be worth 1487 * implementing in the future for completeness' sake. 1488 */ 1489 zfs_btree_core_t *parent = hdr->bth_parent; 1490 uint32_t parent_idx = zfs_btree_find_parent_idx(tree, hdr); 1491 1492 zfs_btree_hdr_t *l_hdr = (parent_idx == 0 ? NULL : 1493 parent->btc_children[parent_idx - 1]); 1494 if (l_hdr != NULL && l_hdr->bth_count > min_count) { 1495 /* We can take a node from the left neighbor. */ 1496 ASSERT(zfs_btree_is_core(l_hdr)); 1497 zfs_btree_core_t *neighbor = (zfs_btree_core_t *)l_hdr; 1498 1499 /* 1500 * Start by shifting the elements and children in the current 1501 * node to the right by one spot. 1502 */ 1503 bt_shift_core_right(tree, node, 0, idx - 1, BSS_TRAPEZOID); 1504 1505 /* 1506 * Move the separator between node and neighbor to the first 1507 * element slot in the current node. 1508 */ 1509 uint8_t *separator = parent->btc_elems + (parent_idx - 1) * 1510 size; 1511 bcpy(separator, node->btc_elems, size); 1512 1513 /* Move the last child of neighbor to our first child slot. */ 1514 node->btc_children[0] = 1515 neighbor->btc_children[l_hdr->bth_count]; 1516 node->btc_children[0]->bth_parent = node; 1517 1518 /* Move the last element of neighbor to the separator spot. */ 1519 uint8_t *take_elem = neighbor->btc_elems + 1520 (l_hdr->bth_count - 1) * size; 1521 bcpy(take_elem, separator, size); 1522 l_hdr->bth_count--; 1523 zfs_btree_poison_node_at(tree, l_hdr, l_hdr->bth_count, 1); 1524 return; 1525 } 1526 1527 zfs_btree_hdr_t *r_hdr = (parent_idx == parent->btc_hdr.bth_count ? 1528 NULL : parent->btc_children[parent_idx + 1]); 1529 if (r_hdr != NULL && r_hdr->bth_count > min_count) { 1530 /* We can take a node from the right neighbor. */ 1531 ASSERT(zfs_btree_is_core(r_hdr)); 1532 zfs_btree_core_t *neighbor = (zfs_btree_core_t *)r_hdr; 1533 1534 /* 1535 * Shift elements in node left by one spot to overwrite rm_hdr 1536 * and the separator before it. 1537 */ 1538 bt_shift_core_left(tree, node, idx, hdr->bth_count - idx, 1539 BSS_PARALLELOGRAM); 1540 1541 /* 1542 * Move the separator between node and neighbor to the last 1543 * element spot in node. 1544 */ 1545 uint8_t *separator = parent->btc_elems + parent_idx * size; 1546 bcpy(separator, node->btc_elems + (hdr->bth_count - 1) * size, 1547 size); 1548 1549 /* 1550 * Move the first child of neighbor to the last child spot in 1551 * node. 1552 */ 1553 node->btc_children[hdr->bth_count] = neighbor->btc_children[0]; 1554 node->btc_children[hdr->bth_count]->bth_parent = node; 1555 1556 /* Move the first element of neighbor to the separator spot. */ 1557 uint8_t *take_elem = neighbor->btc_elems; 1558 bcpy(take_elem, separator, size); 1559 r_hdr->bth_count--; 1560 1561 /* 1562 * Shift the elements and children of neighbor to cover the 1563 * stolen elements. 1564 */ 1565 bt_shift_core_left(tree, neighbor, 1, r_hdr->bth_count, 1566 BSS_TRAPEZOID); 1567 zfs_btree_poison_node_at(tree, r_hdr, r_hdr->bth_count, 1); 1568 return; 1569 } 1570 1571 /* 1572 * In this case, neither of our neighbors can spare an element, so we 1573 * need to merge with one of them. We prefer the left one, 1574 * arbitrarily. Move the separator into the leftmost merging node 1575 * (which may be us or the left neighbor), and then move the right 1576 * merging node's elements. Once that's done, we go back and delete 1577 * the element we're removing. Finally, go into the parent and delete 1578 * the right merging node and the separator. This may cause further 1579 * merging. 1580 */ 1581 zfs_btree_hdr_t *new_rm_hdr, *keep_hdr; 1582 uint32_t new_idx = idx; 1583 if (l_hdr != NULL) { 1584 keep_hdr = l_hdr; 1585 new_rm_hdr = hdr; 1586 new_idx += keep_hdr->bth_count + 1; 1587 } else { 1588 ASSERT3P(r_hdr, !=, NULL); 1589 keep_hdr = hdr; 1590 new_rm_hdr = r_hdr; 1591 parent_idx++; 1592 } 1593 1594 ASSERT(zfs_btree_is_core(keep_hdr)); 1595 ASSERT(zfs_btree_is_core(new_rm_hdr)); 1596 1597 zfs_btree_core_t *keep = (zfs_btree_core_t *)keep_hdr; 1598 zfs_btree_core_t *rm = (zfs_btree_core_t *)new_rm_hdr; 1599 1600 if (zfs_btree_verify_intensity >= 5) { 1601 for (uint32_t i = 0; i < new_rm_hdr->bth_count + 1; i++) { 1602 zfs_btree_verify_poison_at(tree, keep_hdr, 1603 keep_hdr->bth_count + i); 1604 } 1605 } 1606 1607 /* Move the separator into the left node. */ 1608 uint8_t *e_out = keep->btc_elems + keep_hdr->bth_count * size; 1609 uint8_t *separator = parent->btc_elems + (parent_idx - 1) * 1610 size; 1611 bcpy(separator, e_out, size); 1612 keep_hdr->bth_count++; 1613 1614 /* Move all our elements and children into the left node. */ 1615 bt_transfer_core(tree, rm, 0, new_rm_hdr->bth_count, keep, 1616 keep_hdr->bth_count, BSS_TRAPEZOID); 1617 1618 uint32_t old_count = keep_hdr->bth_count; 1619 1620 /* Update bookkeeping */ 1621 keep_hdr->bth_count += new_rm_hdr->bth_count; 1622 ASSERT3U(keep_hdr->bth_count, ==, (min_count * 2) + 1); 1623 1624 /* 1625 * Shift the element and children to the right of rm_hdr to 1626 * the left by one spot. 1627 */ 1628 ASSERT3P(keep->btc_children[new_idx], ==, rm_hdr); 1629 bt_shift_core_left(tree, keep, new_idx, keep_hdr->bth_count - new_idx, 1630 BSS_PARALLELOGRAM); 1631 keep_hdr->bth_count--; 1632 1633 /* Reparent all our children to point to the left node. */ 1634 zfs_btree_hdr_t **new_start = keep->btc_children + 1635 old_count - 1; 1636 for (uint32_t i = 0; i < new_rm_hdr->bth_count + 1; i++) 1637 new_start[i]->bth_parent = keep; 1638 for (uint32_t i = 0; i <= keep_hdr->bth_count; i++) { 1639 ASSERT3P(keep->btc_children[i]->bth_parent, ==, keep); 1640 ASSERT3P(keep->btc_children[i], !=, rm_hdr); 1641 } 1642 zfs_btree_poison_node_at(tree, keep_hdr, keep_hdr->bth_count, 1); 1643 1644 new_rm_hdr->bth_count = 0; 1645 zfs_btree_remove_from_node(tree, parent, new_rm_hdr); 1646 zfs_btree_node_destroy(tree, new_rm_hdr); 1647 } 1648 1649 /* Remove the element at the specific location. */ 1650 void 1651 zfs_btree_remove_idx(zfs_btree_t *tree, zfs_btree_index_t *where) 1652 { 1653 size_t size = tree->bt_elem_size; 1654 zfs_btree_hdr_t *hdr = where->bti_node; 1655 uint32_t idx = where->bti_offset; 1656 1657 ASSERT(!where->bti_before); 1658 if (tree->bt_bulk != NULL) { 1659 /* 1660 * Leave bulk insert mode. Note that our index would be 1661 * invalid after we correct the tree, so we copy the value 1662 * we're planning to remove and find it again after 1663 * bulk_finish. 1664 */ 1665 uint8_t *value = zfs_btree_get(tree, where); 1666 uint8_t *tmp = kmem_alloc(size, KM_SLEEP); 1667 bcpy(value, tmp, size); 1668 zfs_btree_bulk_finish(tree); 1669 VERIFY3P(zfs_btree_find(tree, tmp, where), !=, NULL); 1670 kmem_free(tmp, size); 1671 hdr = where->bti_node; 1672 idx = where->bti_offset; 1673 } 1674 1675 tree->bt_num_elems--; 1676 /* 1677 * If the element happens to be in a core node, we move a leaf node's 1678 * element into its place and then remove the leaf node element. This 1679 * makes the rebalance logic not need to be recursive both upwards and 1680 * downwards. 1681 */ 1682 if (zfs_btree_is_core(hdr)) { 1683 zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; 1684 zfs_btree_hdr_t *left_subtree = node->btc_children[idx]; 1685 void *new_value = zfs_btree_last_helper(tree, left_subtree, 1686 where); 1687 ASSERT3P(new_value, !=, NULL); 1688 1689 bcpy(new_value, node->btc_elems + idx * size, size); 1690 1691 hdr = where->bti_node; 1692 idx = where->bti_offset; 1693 ASSERT(!where->bti_before); 1694 } 1695 1696 /* 1697 * First, we'll update the leaf's metadata. Then, we shift any 1698 * elements after the idx to the left. After that, we rebalance if 1699 * needed. 1700 */ 1701 ASSERT(!zfs_btree_is_core(hdr)); 1702 zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; 1703 ASSERT3U(hdr->bth_count, >, 0); 1704 1705 uint32_t min_count = (tree->bt_leaf_cap / 2) - 1; 1706 1707 /* 1708 * If we're over the minimum size or this is the root, just overwrite 1709 * the value and return. 1710 */ 1711 if (hdr->bth_count > min_count || hdr->bth_parent == NULL) { 1712 bt_shrink_leaf(tree, leaf, idx, 1); 1713 if (hdr->bth_parent == NULL) { 1714 ASSERT0(tree->bt_height); 1715 if (hdr->bth_count == 0) { 1716 tree->bt_root = NULL; 1717 tree->bt_height--; 1718 zfs_btree_node_destroy(tree, &leaf->btl_hdr); 1719 } 1720 } 1721 zfs_btree_verify(tree); 1722 return; 1723 } 1724 ASSERT3U(hdr->bth_count, ==, min_count); 1725 1726 /* 1727 * Now we try to take a node from a sibling. We check left, then 1728 * right. If they exist and have more than the minimum number of 1729 * elements, we move the separator between us and them to our node 1730 * and move their closest element (last for left, first for right) to 1731 * the separator. Along the way we need to collapse the gap made by 1732 * idx, and (for our right neighbor) the gap made by removing their 1733 * first element. 1734 * 1735 * Note: this logic currently doesn't support taking from a neighbor 1736 * that isn't a sibling. This isn't critical functionality, but may be 1737 * worth implementing in the future for completeness' sake. 1738 */ 1739 zfs_btree_core_t *parent = hdr->bth_parent; 1740 uint32_t parent_idx = zfs_btree_find_parent_idx(tree, hdr); 1741 1742 zfs_btree_hdr_t *l_hdr = (parent_idx == 0 ? NULL : 1743 parent->btc_children[parent_idx - 1]); 1744 if (l_hdr != NULL && l_hdr->bth_count > min_count) { 1745 /* We can take a node from the left neighbor. */ 1746 ASSERT(!zfs_btree_is_core(l_hdr)); 1747 zfs_btree_leaf_t *neighbor = (zfs_btree_leaf_t *)l_hdr; 1748 1749 /* 1750 * Move our elements back by one spot to make room for the 1751 * stolen element and overwrite the element being removed. 1752 */ 1753 bt_shift_leaf(tree, leaf, 0, idx, 1, BSD_RIGHT); 1754 1755 /* Move the separator to our first spot. */ 1756 uint8_t *separator = parent->btc_elems + (parent_idx - 1) * 1757 size; 1758 bcpy(separator, leaf->btl_elems + hdr->bth_first * size, size); 1759 1760 /* Move our neighbor's last element to the separator. */ 1761 uint8_t *take_elem = neighbor->btl_elems + 1762 (l_hdr->bth_first + l_hdr->bth_count - 1) * size; 1763 bcpy(take_elem, separator, size); 1764 1765 /* Delete our neighbor's last element. */ 1766 bt_shrink_leaf(tree, neighbor, l_hdr->bth_count - 1, 1); 1767 zfs_btree_verify(tree); 1768 return; 1769 } 1770 1771 zfs_btree_hdr_t *r_hdr = (parent_idx == parent->btc_hdr.bth_count ? 1772 NULL : parent->btc_children[parent_idx + 1]); 1773 if (r_hdr != NULL && r_hdr->bth_count > min_count) { 1774 /* We can take a node from the right neighbor. */ 1775 ASSERT(!zfs_btree_is_core(r_hdr)); 1776 zfs_btree_leaf_t *neighbor = (zfs_btree_leaf_t *)r_hdr; 1777 1778 /* 1779 * Move our elements after the element being removed forwards 1780 * by one spot to make room for the stolen element and 1781 * overwrite the element being removed. 1782 */ 1783 bt_shift_leaf(tree, leaf, idx + 1, hdr->bth_count - idx - 1, 1784 1, BSD_LEFT); 1785 1786 /* Move the separator between us to our last spot. */ 1787 uint8_t *separator = parent->btc_elems + parent_idx * size; 1788 bcpy(separator, leaf->btl_elems + (hdr->bth_first + 1789 hdr->bth_count - 1) * size, size); 1790 1791 /* Move our neighbor's first element to the separator. */ 1792 uint8_t *take_elem = neighbor->btl_elems + 1793 r_hdr->bth_first * size; 1794 bcpy(take_elem, separator, size); 1795 1796 /* Delete our neighbor's first element. */ 1797 bt_shrink_leaf(tree, neighbor, 0, 1); 1798 zfs_btree_verify(tree); 1799 return; 1800 } 1801 1802 /* 1803 * In this case, neither of our neighbors can spare an element, so we 1804 * need to merge with one of them. We prefer the left one, arbitrarily. 1805 * After remove we move the separator into the leftmost merging node 1806 * (which may be us or the left neighbor), and then move the right 1807 * merging node's elements. Once that's done, we go back and delete 1808 * the element we're removing. Finally, go into the parent and delete 1809 * the right merging node and the separator. This may cause further 1810 * merging. 1811 */ 1812 zfs_btree_hdr_t *rm_hdr, *k_hdr; 1813 if (l_hdr != NULL) { 1814 k_hdr = l_hdr; 1815 rm_hdr = hdr; 1816 } else { 1817 ASSERT3P(r_hdr, !=, NULL); 1818 k_hdr = hdr; 1819 rm_hdr = r_hdr; 1820 parent_idx++; 1821 } 1822 ASSERT(!zfs_btree_is_core(k_hdr)); 1823 ASSERT(!zfs_btree_is_core(rm_hdr)); 1824 ASSERT3U(k_hdr->bth_count, ==, min_count); 1825 ASSERT3U(rm_hdr->bth_count, ==, min_count); 1826 zfs_btree_leaf_t *keep = (zfs_btree_leaf_t *)k_hdr; 1827 zfs_btree_leaf_t *rm = (zfs_btree_leaf_t *)rm_hdr; 1828 1829 if (zfs_btree_verify_intensity >= 5) { 1830 for (uint32_t i = 0; i < rm_hdr->bth_count + 1; i++) { 1831 zfs_btree_verify_poison_at(tree, k_hdr, 1832 k_hdr->bth_count + i); 1833 } 1834 } 1835 1836 /* 1837 * Remove the value from the node. It will go below the minimum, 1838 * but we'll fix it in no time. 1839 */ 1840 bt_shrink_leaf(tree, leaf, idx, 1); 1841 1842 /* Prepare space for elements to be moved from the right. */ 1843 uint32_t k_count = k_hdr->bth_count; 1844 bt_grow_leaf(tree, keep, k_count, 1 + rm_hdr->bth_count); 1845 ASSERT3U(k_hdr->bth_count, ==, min_count * 2); 1846 1847 /* Move the separator into the first open spot. */ 1848 uint8_t *out = keep->btl_elems + (k_hdr->bth_first + k_count) * size; 1849 uint8_t *separator = parent->btc_elems + (parent_idx - 1) * size; 1850 bcpy(separator, out, size); 1851 1852 /* Move our elements to the left neighbor. */ 1853 bt_transfer_leaf(tree, rm, 0, rm_hdr->bth_count, keep, k_count + 1); 1854 1855 /* Remove the emptied node from the parent. */ 1856 zfs_btree_remove_from_node(tree, parent, rm_hdr); 1857 zfs_btree_node_destroy(tree, rm_hdr); 1858 zfs_btree_verify(tree); 1859 } 1860 1861 /* Remove the given value from the tree. */ 1862 void 1863 zfs_btree_remove(zfs_btree_t *tree, const void *value) 1864 { 1865 zfs_btree_index_t where = {0}; 1866 VERIFY3P(zfs_btree_find(tree, value, &where), !=, NULL); 1867 zfs_btree_remove_idx(tree, &where); 1868 } 1869 1870 /* Return the number of elements in the tree. */ 1871 ulong_t 1872 zfs_btree_numnodes(zfs_btree_t *tree) 1873 { 1874 return (tree->bt_num_elems); 1875 } 1876 1877 /* 1878 * This function is used to visit all the elements in the tree before 1879 * destroying the tree. This allows the calling code to perform any cleanup it 1880 * needs to do. This is more efficient than just removing the first element 1881 * over and over, because it removes all rebalancing. Once the destroy_nodes() 1882 * function has been called, no other btree operations are valid until it 1883 * returns NULL, which point the only valid operation is zfs_btree_destroy(). 1884 * 1885 * example: 1886 * 1887 * zfs_btree_index_t *cookie = NULL; 1888 * my_data_t *node; 1889 * 1890 * while ((node = zfs_btree_destroy_nodes(tree, &cookie)) != NULL) 1891 * free(node->ptr); 1892 * zfs_btree_destroy(tree); 1893 * 1894 */ 1895 void * 1896 zfs_btree_destroy_nodes(zfs_btree_t *tree, zfs_btree_index_t **cookie) 1897 { 1898 if (*cookie == NULL) { 1899 if (tree->bt_height == -1) 1900 return (NULL); 1901 *cookie = kmem_alloc(sizeof (**cookie), KM_SLEEP); 1902 return (zfs_btree_first(tree, *cookie)); 1903 } 1904 1905 void *rval = zfs_btree_next_helper(tree, *cookie, *cookie, 1906 zfs_btree_node_destroy); 1907 if (rval == NULL) { 1908 tree->bt_root = NULL; 1909 tree->bt_height = -1; 1910 tree->bt_num_elems = 0; 1911 kmem_free(*cookie, sizeof (**cookie)); 1912 tree->bt_bulk = NULL; 1913 } 1914 return (rval); 1915 } 1916 1917 static void 1918 zfs_btree_clear_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) 1919 { 1920 if (zfs_btree_is_core(hdr)) { 1921 zfs_btree_core_t *btc = (zfs_btree_core_t *)hdr; 1922 for (uint32_t i = 0; i <= hdr->bth_count; i++) 1923 zfs_btree_clear_helper(tree, btc->btc_children[i]); 1924 } 1925 1926 zfs_btree_node_destroy(tree, hdr); 1927 } 1928 1929 void 1930 zfs_btree_clear(zfs_btree_t *tree) 1931 { 1932 if (tree->bt_root == NULL) { 1933 ASSERT0(tree->bt_num_elems); 1934 return; 1935 } 1936 1937 zfs_btree_clear_helper(tree, tree->bt_root); 1938 tree->bt_num_elems = 0; 1939 tree->bt_root = NULL; 1940 tree->bt_num_nodes = 0; 1941 tree->bt_height = -1; 1942 tree->bt_bulk = NULL; 1943 } 1944 1945 void 1946 zfs_btree_destroy(zfs_btree_t *tree) 1947 { 1948 ASSERT0(tree->bt_num_elems); 1949 ASSERT3P(tree->bt_root, ==, NULL); 1950 } 1951 1952 /* Verify that every child of this node has the correct parent pointer. */ 1953 static void 1954 zfs_btree_verify_pointers_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) 1955 { 1956 if (!zfs_btree_is_core(hdr)) 1957 return; 1958 1959 zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; 1960 for (uint32_t i = 0; i <= hdr->bth_count; i++) { 1961 VERIFY3P(node->btc_children[i]->bth_parent, ==, hdr); 1962 zfs_btree_verify_pointers_helper(tree, node->btc_children[i]); 1963 } 1964 } 1965 1966 /* Verify that every node has the correct parent pointer. */ 1967 static void 1968 zfs_btree_verify_pointers(zfs_btree_t *tree) 1969 { 1970 if (tree->bt_height == -1) { 1971 VERIFY3P(tree->bt_root, ==, NULL); 1972 return; 1973 } 1974 VERIFY3P(tree->bt_root->bth_parent, ==, NULL); 1975 zfs_btree_verify_pointers_helper(tree, tree->bt_root); 1976 } 1977 1978 /* 1979 * Verify that all the current node and its children satisfy the count 1980 * invariants, and return the total count in the subtree rooted in this node. 1981 */ 1982 static uint64_t 1983 zfs_btree_verify_counts_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) 1984 { 1985 if (!zfs_btree_is_core(hdr)) { 1986 if (tree->bt_root != hdr && tree->bt_bulk && 1987 hdr != &tree->bt_bulk->btl_hdr) { 1988 VERIFY3U(hdr->bth_count, >=, tree->bt_leaf_cap / 2 - 1); 1989 } 1990 1991 return (hdr->bth_count); 1992 } else { 1993 1994 zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; 1995 uint64_t ret = hdr->bth_count; 1996 if (tree->bt_root != hdr && tree->bt_bulk == NULL) 1997 VERIFY3P(hdr->bth_count, >=, BTREE_CORE_ELEMS / 2 - 1); 1998 for (uint32_t i = 0; i <= hdr->bth_count; i++) { 1999 ret += zfs_btree_verify_counts_helper(tree, 2000 node->btc_children[i]); 2001 } 2002 2003 return (ret); 2004 } 2005 } 2006 2007 /* 2008 * Verify that all nodes satisfy the invariants and that the total number of 2009 * elements is correct. 2010 */ 2011 static void 2012 zfs_btree_verify_counts(zfs_btree_t *tree) 2013 { 2014 EQUIV(tree->bt_num_elems == 0, tree->bt_height == -1); 2015 if (tree->bt_height == -1) { 2016 return; 2017 } 2018 VERIFY3P(zfs_btree_verify_counts_helper(tree, tree->bt_root), ==, 2019 tree->bt_num_elems); 2020 } 2021 2022 /* 2023 * Check that the subtree rooted at this node has a uniform height. Returns 2024 * the number of nodes under this node, to help verify bt_num_nodes. 2025 */ 2026 static uint64_t 2027 zfs_btree_verify_height_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr, 2028 int32_t height) 2029 { 2030 if (!zfs_btree_is_core(hdr)) { 2031 VERIFY0(height); 2032 return (1); 2033 } 2034 2035 zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; 2036 uint64_t ret = 1; 2037 for (uint32_t i = 0; i <= hdr->bth_count; i++) { 2038 ret += zfs_btree_verify_height_helper(tree, 2039 node->btc_children[i], height - 1); 2040 } 2041 return (ret); 2042 } 2043 2044 /* 2045 * Check that the tree rooted at this node has a uniform height, and that the 2046 * bt_height in the tree is correct. 2047 */ 2048 static void 2049 zfs_btree_verify_height(zfs_btree_t *tree) 2050 { 2051 EQUIV(tree->bt_height == -1, tree->bt_root == NULL); 2052 if (tree->bt_height == -1) { 2053 return; 2054 } 2055 2056 VERIFY3U(zfs_btree_verify_height_helper(tree, tree->bt_root, 2057 tree->bt_height), ==, tree->bt_num_nodes); 2058 } 2059 2060 /* 2061 * Check that the elements in this node are sorted, and that if this is a core 2062 * node, the separators are properly between the subtrees they separaate and 2063 * that the children also satisfy this requirement. 2064 */ 2065 static void 2066 zfs_btree_verify_order_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) 2067 { 2068 size_t size = tree->bt_elem_size; 2069 if (!zfs_btree_is_core(hdr)) { 2070 zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; 2071 for (uint32_t i = 1; i < hdr->bth_count; i++) { 2072 VERIFY3S(tree->bt_compar(leaf->btl_elems + 2073 (hdr->bth_first + i - 1) * size, 2074 leaf->btl_elems + 2075 (hdr->bth_first + i) * size), ==, -1); 2076 } 2077 return; 2078 } 2079 2080 zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; 2081 for (uint32_t i = 1; i < hdr->bth_count; i++) { 2082 VERIFY3S(tree->bt_compar(node->btc_elems + (i - 1) * size, 2083 node->btc_elems + i * size), ==, -1); 2084 } 2085 for (uint32_t i = 0; i < hdr->bth_count; i++) { 2086 uint8_t *left_child_last = NULL; 2087 zfs_btree_hdr_t *left_child_hdr = node->btc_children[i]; 2088 if (zfs_btree_is_core(left_child_hdr)) { 2089 zfs_btree_core_t *left_child = 2090 (zfs_btree_core_t *)left_child_hdr; 2091 left_child_last = left_child->btc_elems + 2092 (left_child_hdr->bth_count - 1) * size; 2093 } else { 2094 zfs_btree_leaf_t *left_child = 2095 (zfs_btree_leaf_t *)left_child_hdr; 2096 left_child_last = left_child->btl_elems + 2097 (left_child_hdr->bth_first + 2098 left_child_hdr->bth_count - 1) * size; 2099 } 2100 int comp = tree->bt_compar(node->btc_elems + i * size, 2101 left_child_last); 2102 if (comp <= 0) { 2103 panic("btree: compar returned %d (expected 1) at " 2104 "%px %d: compar(%px, %px)", comp, node, i, 2105 node->btc_elems + i * size, left_child_last); 2106 } 2107 2108 uint8_t *right_child_first = NULL; 2109 zfs_btree_hdr_t *right_child_hdr = node->btc_children[i + 1]; 2110 if (zfs_btree_is_core(right_child_hdr)) { 2111 zfs_btree_core_t *right_child = 2112 (zfs_btree_core_t *)right_child_hdr; 2113 right_child_first = right_child->btc_elems; 2114 } else { 2115 zfs_btree_leaf_t *right_child = 2116 (zfs_btree_leaf_t *)right_child_hdr; 2117 right_child_first = right_child->btl_elems + 2118 right_child_hdr->bth_first * size; 2119 } 2120 comp = tree->bt_compar(node->btc_elems + i * size, 2121 right_child_first); 2122 if (comp >= 0) { 2123 panic("btree: compar returned %d (expected -1) at " 2124 "%px %d: compar(%px, %px)", comp, node, i, 2125 node->btc_elems + i * size, right_child_first); 2126 } 2127 } 2128 for (uint32_t i = 0; i <= hdr->bth_count; i++) 2129 zfs_btree_verify_order_helper(tree, node->btc_children[i]); 2130 } 2131 2132 /* Check that all elements in the tree are in sorted order. */ 2133 static void 2134 zfs_btree_verify_order(zfs_btree_t *tree) 2135 { 2136 EQUIV(tree->bt_height == -1, tree->bt_root == NULL); 2137 if (tree->bt_height == -1) { 2138 return; 2139 } 2140 2141 zfs_btree_verify_order_helper(tree, tree->bt_root); 2142 } 2143 2144 #ifdef ZFS_DEBUG 2145 /* Check that all unused memory is poisoned correctly. */ 2146 static void 2147 zfs_btree_verify_poison_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) 2148 { 2149 size_t size = tree->bt_elem_size; 2150 if (!zfs_btree_is_core(hdr)) { 2151 zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; 2152 for (size_t i = 0; i < hdr->bth_first * size; i++) 2153 VERIFY3U(leaf->btl_elems[i], ==, 0x0f); 2154 size_t esize = tree->bt_leaf_size - 2155 offsetof(zfs_btree_leaf_t, btl_elems); 2156 for (size_t i = (hdr->bth_first + hdr->bth_count) * size; 2157 i < esize; i++) 2158 VERIFY3U(leaf->btl_elems[i], ==, 0x0f); 2159 } else { 2160 zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; 2161 for (size_t i = hdr->bth_count * size; 2162 i < BTREE_CORE_ELEMS * size; i++) 2163 VERIFY3U(node->btc_elems[i], ==, 0x0f); 2164 2165 for (uint32_t i = hdr->bth_count + 1; i <= BTREE_CORE_ELEMS; 2166 i++) { 2167 VERIFY3P(node->btc_children[i], ==, 2168 (zfs_btree_hdr_t *)BTREE_POISON); 2169 } 2170 2171 for (uint32_t i = 0; i <= hdr->bth_count; i++) { 2172 zfs_btree_verify_poison_helper(tree, 2173 node->btc_children[i]); 2174 } 2175 } 2176 } 2177 #endif 2178 2179 /* Check that unused memory in the tree is still poisoned. */ 2180 static void 2181 zfs_btree_verify_poison(zfs_btree_t *tree) 2182 { 2183 #ifdef ZFS_DEBUG 2184 if (tree->bt_height == -1) 2185 return; 2186 zfs_btree_verify_poison_helper(tree, tree->bt_root); 2187 #endif 2188 } 2189 2190 void 2191 zfs_btree_verify(zfs_btree_t *tree) 2192 { 2193 if (zfs_btree_verify_intensity == 0) 2194 return; 2195 zfs_btree_verify_height(tree); 2196 if (zfs_btree_verify_intensity == 1) 2197 return; 2198 zfs_btree_verify_pointers(tree); 2199 if (zfs_btree_verify_intensity == 2) 2200 return; 2201 zfs_btree_verify_counts(tree); 2202 if (zfs_btree_verify_intensity == 3) 2203 return; 2204 zfs_btree_verify_order(tree); 2205 2206 if (zfs_btree_verify_intensity == 4) 2207 return; 2208 zfs_btree_verify_poison(tree); 2209 } 2210 2211 /* BEGIN CSTYLED */ 2212 ZFS_MODULE_PARAM(zfs, zfs_, btree_verify_intensity, UINT, ZMOD_RW, 2213 "Enable btree verification. Levels above 4 require ZFS be built " 2214 "with debugging"); 2215 /* END CSTYLED */ 2216