/*
 * CDDL HEADER START
 *
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2019 by Delphix. All rights reserved.
 */

#include <sys/btree.h>
#include <sys/bitops.h>
#include <sys/zfs_context.h>

kmem_cache_t *zfs_btree_leaf_cache;

/*
 * Control the extent of the verification that occurs when zfs_btree_verify is
 * called. Primarily used for debugging when extending the btree logic and
 * functionality. As the intensity is increased, new verification steps are
 * added. These steps are cumulative; intensity = 3 includes the intensity = 1
 * and intensity = 2 steps as well.
 *
 * Intensity 1: Verify that the tree's height is consistent throughout.
 * Intensity 2: Verify that a core node's children's parent pointers point
 * to the core node.
 * Intensity 3: Verify that the total number of elements in the tree matches the
 * sum of the number of elements in each node. Also verifies that each node's
 * count obeys the invariants (less than or equal to maximum value, greater than
 * or equal to half the maximum minus one).
 * Intensity 4: Verify that each element compares less than the element
 * immediately after it and greater than the one immediately before it using the
 * comparator function. For core nodes, also checks that each element is greater
 * than the last element in the first of the two nodes it separates, and less
 * than the first element in the second of the two nodes.
 * Intensity 5: Verifies, if ZFS_DEBUG is defined, that all unused memory inside
 * of each node is poisoned appropriately. Note that poisoning always occurs if
 * ZFS_DEBUG is set, so it is safe to set the intensity to 5 during normal
 * operation.
 *
 * Intensity 4 and 5 are particularly expensive to perform; the previous levels
 * are a few memory operations per node, while these levels require multiple
 * operations per element. In addition, when creating large btrees, these
 * operations are called at every step, resulting in extremely slow operation
 * (while the asymptotic complexity of the other steps is the same, the
 * importance of the constant factors cannot be denied).
 */
int zfs_btree_verify_intensity = 0;

/*
 * Convenience functions to silence warnings from memcpy/memmove's
 * return values and change argument order to src, dest.
 */
static void
bcpy(const void *src, void *dest, size_t size)
{
	(void) memcpy(dest, src, size);
}

static void
bmov(const void *src, void *dest, size_t size)
{
	(void) memmove(dest, src, size);
}

static boolean_t
zfs_btree_is_core(struct zfs_btree_hdr *hdr)
{
	return (hdr->bth_first == -1);
}

#ifdef _ILP32
#define BTREE_POISON 0xabadb10c
#else
#define BTREE_POISON 0xabadb10cdeadbeef
#endif

static void
zfs_btree_poison_node(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
{
#ifdef ZFS_DEBUG
	size_t size = tree->bt_elem_size;
	if (zfs_btree_is_core(hdr)) {
		zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
		for (uint32_t i = hdr->bth_count + 1; i <= BTREE_CORE_ELEMS;
		    i++) {
			node->btc_children[i] =
			    (zfs_btree_hdr_t *)BTREE_POISON;
		}
		(void) memset(node->btc_elems + hdr->bth_count * size, 0x0f,
		    (BTREE_CORE_ELEMS - hdr->bth_count) * size);
	} else {
		zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
		(void) memset(leaf->btl_elems, 0x0f, hdr->bth_first * size);
		(void) memset(leaf->btl_elems +
		    (hdr->bth_first + hdr->bth_count) * size, 0x0f,
		    BTREE_LEAF_ESIZE -
		    (hdr->bth_first + hdr->bth_count) * size);
	}
#endif
}

static inline void
zfs_btree_poison_node_at(zfs_btree_t *tree, zfs_btree_hdr_t *hdr,
    uint32_t idx, uint32_t count)
{
#ifdef ZFS_DEBUG
	size_t size = tree->bt_elem_size;
	if (zfs_btree_is_core(hdr)) {
		ASSERT3U(idx, >=, hdr->bth_count);
		ASSERT3U(idx, <=, BTREE_CORE_ELEMS);
		ASSERT3U(idx + count, <=, BTREE_CORE_ELEMS);
		zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
		for (uint32_t i = 1; i <= count; i++) {
			node->btc_children[idx + i] =
			    (zfs_btree_hdr_t *)BTREE_POISON;
		}
		(void) memset(node->btc_elems + idx * size, 0x0f, count * size);
	} else {
		ASSERT3U(idx, <=, tree->bt_leaf_cap);
		ASSERT3U(idx + count, <=, tree->bt_leaf_cap);
		zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
		(void) memset(leaf->btl_elems +
		    (hdr->bth_first + idx) * size, 0x0f, count * size);
	}
#endif
}

static inline void
zfs_btree_verify_poison_at(zfs_btree_t *tree, zfs_btree_hdr_t *hdr,
    uint32_t idx)
{
#ifdef ZFS_DEBUG
	size_t size = tree->bt_elem_size;
	if (zfs_btree_is_core(hdr)) {
		ASSERT3U(idx, <, BTREE_CORE_ELEMS);
		zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
		zfs_btree_hdr_t *cval = (zfs_btree_hdr_t *)BTREE_POISON;
		VERIFY3P(node->btc_children[idx + 1], ==, cval);
		for (size_t i = 0; i < size; i++)
			VERIFY3U(node->btc_elems[idx * size + i], ==, 0x0f);
	} else {
		ASSERT3U(idx, <, tree->bt_leaf_cap);
		zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
		if (idx >= tree->bt_leaf_cap - hdr->bth_first)
			return;
		for (size_t i = 0; i < size; i++) {
			VERIFY3U(leaf->btl_elems[(hdr->bth_first + idx)
			    * size + i], ==, 0x0f);
		}
	}
#endif
}

void
zfs_btree_init(void)
{
	zfs_btree_leaf_cache = kmem_cache_create("zfs_btree_leaf_cache",
	    BTREE_LEAF_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);
}

void
zfs_btree_fini(void)
{
	kmem_cache_destroy(zfs_btree_leaf_cache);
}

void
zfs_btree_create(zfs_btree_t *tree, int (*compar) (const void *, const void *),
    size_t size)
{
	ASSERT3U(size, <=, BTREE_LEAF_ESIZE / 2);

	bzero(tree, sizeof (*tree));
	tree->bt_compar = compar;
	tree->bt_elem_size = size;
	tree->bt_leaf_cap = P2ALIGN(BTREE_LEAF_ESIZE / size, 2);
	tree->bt_height = -1;
	tree->bt_bulk = NULL;
}

/*
 * Find value in the array of elements provided. Uses a simple binary search.
 */
static void *
zfs_btree_find_in_buf(zfs_btree_t *tree, uint8_t *buf, uint32_t nelems,
    const void *value, zfs_btree_index_t *where)
{
	uint32_t max = nelems;
	uint32_t min = 0;
	while (max > min) {
		uint32_t idx = (min + max) / 2;
		uint8_t *cur = buf + idx * tree->bt_elem_size;
		int comp = tree->bt_compar(cur, value);
		if (comp < 0) {
			min = idx + 1;
		} else if (comp > 0) {
			max = idx;
		} else {
			where->bti_offset = idx;
			where->bti_before = B_FALSE;
			return (cur);
		}
	}

	where->bti_offset = max;
	where->bti_before = B_TRUE;
	return (NULL);
}

/*
 * Find the given value in the tree. The "where" parameter may be passed as
 * null when the caller only needs a membership test, or when the btree is
 * being used as a map; otherwise it is filled in with the value's location.
 */
void *
zfs_btree_find(zfs_btree_t *tree, const void *value, zfs_btree_index_t *where)
{
	if (tree->bt_height == -1) {
		if (where != NULL) {
			where->bti_node = NULL;
			where->bti_offset = 0;
		}
		ASSERT0(tree->bt_num_elems);
		return (NULL);
	}

	/*
	 * If we're in bulk-insert mode, we check the last spot in the tree
	 * and the last leaf in the tree before doing the normal search,
	 * because for most workloads the vast majority of finds in
	 * bulk-insert mode are to insert new elements.
	 */
	zfs_btree_index_t idx;
	size_t size = tree->bt_elem_size;
	if (tree->bt_bulk != NULL) {
		zfs_btree_leaf_t *last_leaf = tree->bt_bulk;
		int comp = tree->bt_compar(last_leaf->btl_elems +
		    (last_leaf->btl_hdr.bth_first +
		    last_leaf->btl_hdr.bth_count - 1) * size, value);
		if (comp < 0) {
			/*
			 * If what they're looking for is after the last
			 * element, it's not in the tree.
			 */
			if (where != NULL) {
				where->bti_node = (zfs_btree_hdr_t *)last_leaf;
				where->bti_offset =
				    last_leaf->btl_hdr.bth_count;
				where->bti_before = B_TRUE;
			}
			return (NULL);
		} else if (comp == 0) {
			if (where != NULL) {
				where->bti_node = (zfs_btree_hdr_t *)last_leaf;
				where->bti_offset =
				    last_leaf->btl_hdr.bth_count - 1;
				where->bti_before = B_FALSE;
			}
			return (last_leaf->btl_elems +
			    (last_leaf->btl_hdr.bth_first +
			    last_leaf->btl_hdr.bth_count - 1) * size);
		}
		if (tree->bt_compar(last_leaf->btl_elems +
		    last_leaf->btl_hdr.bth_first * size, value) <= 0) {
			/*
			 * If what they're looking for is after the first
			 * element in the last leaf, it's in the last leaf or
			 * it's not in the tree.
			 */
			void *d = zfs_btree_find_in_buf(tree,
			    last_leaf->btl_elems +
			    last_leaf->btl_hdr.bth_first * size,
			    last_leaf->btl_hdr.bth_count, value, &idx);

			if (where != NULL) {
				idx.bti_node = (zfs_btree_hdr_t *)last_leaf;
				*where = idx;
			}
			return (d);
		}
	}

	zfs_btree_core_t *node = NULL;
	uint32_t child = 0;
	uint64_t depth = 0;

	/*
	 * Iterate down the tree, finding which child the value should be in
	 * by comparing with the separators.
	 */
	for (node = (zfs_btree_core_t *)tree->bt_root; depth < tree->bt_height;
	    node = (zfs_btree_core_t *)node->btc_children[child], depth++) {
		ASSERT3P(node, !=, NULL);
		void *d = zfs_btree_find_in_buf(tree, node->btc_elems,
		    node->btc_hdr.bth_count, value, &idx);
		EQUIV(d != NULL, !idx.bti_before);
		if (d != NULL) {
			if (where != NULL) {
				idx.bti_node = (zfs_btree_hdr_t *)node;
				*where = idx;
			}
			return (d);
		}
		ASSERT(idx.bti_before);
		child = idx.bti_offset;
	}

	/*
	 * The value is in this leaf, or it would be if it were in the
	 * tree. Find its proper location and return it.
	 */
	zfs_btree_leaf_t *leaf = (depth == 0 ?
	    (zfs_btree_leaf_t *)tree->bt_root : (zfs_btree_leaf_t *)node);
	void *d = zfs_btree_find_in_buf(tree, leaf->btl_elems +
	    leaf->btl_hdr.bth_first * size,
	    leaf->btl_hdr.bth_count, value, &idx);

	if (where != NULL) {
		idx.bti_node = (zfs_btree_hdr_t *)leaf;
		*where = idx;
	}

	return (d);
}
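
/*
 * Illustrative usage sketch (not compiled; the comparator and the example
 * function below are hypothetical, shown only to demonstrate the
 * create/find/add_idx API and the null-"where" membership test):
 */
#if 0
static int
zfs_btree_compare_u64(const void *a, const void *b)
{
	const uint64_t *ka = a, *kb = b;

	return (*ka < *kb ? -1 : (*ka > *kb ? 1 : 0));
}

static void
zfs_btree_usage_example(void)
{
	zfs_btree_t tree;
	zfs_btree_index_t where;
	uint64_t key = 42;

	zfs_btree_create(&tree, zfs_btree_compare_u64, sizeof (uint64_t));

	/* Membership test: a null "where" just asks whether the key exists. */
	if (zfs_btree_find(&tree, &key, NULL) == NULL) {
		/* Re-run the find with an index to get the insertion spot. */
		VERIFY3P(zfs_btree_find(&tree, &key, &where), ==, NULL);
		zfs_btree_add_idx(&tree, &key, &where);
	}
}
#endif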

/*
 * To explain the following functions, it is useful to understand the four
 * kinds of shifts used in btree operation. First, a shift is a movement of
 * elements within a node. It is used to create gaps for inserting new
 * elements and children, or cover gaps created when things are removed. A
 * shift has two fundamental properties, each of which can be one of two
 * values, making four types of shifts. There is the direction of the shift
 * (left or right) and the shape of the shift (parallelogram or isosceles
 * trapezoid (shortened to trapezoid hereafter)). The shape distinction only
 * applies to shifts of core nodes.
 *
 * The names derive from the following imagining of the layout of a node:
 *
 *  Elements:     * * * * * * * ... * * *
 *  Children:    * * * * * * * * ... * * *
 *
 * This layout follows from the fact that the elements act as separators
 * between pairs of children, and that children root subtrees "below" the
 * current node. A left and right shift are fairly self-explanatory; a left
 * shift moves things to the left, while a right shift moves things to the
 * right. A parallelogram shift is a shift with the same number of elements
 * and children being moved, while a trapezoid shift is a shift that moves one
 * more child than elements. An example follows:
 *
 * A parallelogram shift could contain the following:
 *      _______________
 *      \*   *   *   * \ *   *   * ... * * *
 *       *  \ *   *   *\  *   *   * ... * * *
 *      ---------------
 * A trapezoid shift could contain the following:
 *          ___________
 *       * / *   *   * \ *   *   * ... * * *
 *       * / *   *   *  \ *   *   * ... * * *
 *      ---------------
 *
 * Note that a parallelogram shift is always shaped like a "left-leaning"
 * parallelogram, where the starting index of the children being moved is
 * always one higher than the starting index of the elements being moved. No
 * "right-leaning" parallelogram shifts are needed (shifts where the starting
 * element index and starting child index being moved are the same) to achieve
 * any btree operations, so we ignore them.
 */
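
/*
 * A concrete example (indices are illustrative): with idx = 2, count = 3,
 * off = 1 and direction BSD_LEFT in bt_shift_core() below, a parallelogram
 * shift moves elements [2, 4] to [1, 3] and children [3, 5] to [2, 4]
 * (equal counts), while a trapezoid shift moves the same elements but
 * children [2, 5] to [1, 4] (one more child than elements).
 */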

enum bt_shift_shape {
	BSS_TRAPEZOID,
	BSS_PARALLELOGRAM
};

enum bt_shift_direction {
	BSD_LEFT,
	BSD_RIGHT
};

/*
 * Shift elements and children in the provided core node by off spots. The
 * first element moved is idx, and count elements are moved. The shape of the
 * shift is determined by shape. The direction is determined by dir.
 */
static inline void
bt_shift_core(zfs_btree_t *tree, zfs_btree_core_t *node, uint32_t idx,
    uint32_t count, uint32_t off, enum bt_shift_shape shape,
    enum bt_shift_direction dir)
{
	size_t size = tree->bt_elem_size;
	ASSERT(zfs_btree_is_core(&node->btc_hdr));

	uint8_t *e_start = node->btc_elems + idx * size;
	uint8_t *e_out = (dir == BSD_LEFT ? e_start - off * size :
	    e_start + off * size);
	bmov(e_start, e_out, count * size);

	zfs_btree_hdr_t **c_start = node->btc_children + idx +
	    (shape == BSS_TRAPEZOID ? 0 : 1);
	zfs_btree_hdr_t **c_out = (dir == BSD_LEFT ? c_start - off :
	    c_start + off);
	uint32_t c_count = count + (shape == BSS_TRAPEZOID ? 1 : 0);
	bmov(c_start, c_out, c_count * sizeof (*c_start));
}

/*
 * Shift elements and children in the provided core node left by one spot.
 * The first element moved is idx, and count elements are moved. The shape
 * of the shift (trapezoid or parallelogram) is determined by shape.
 */
static inline void
bt_shift_core_left(zfs_btree_t *tree, zfs_btree_core_t *node, uint32_t idx,
    uint32_t count, enum bt_shift_shape shape)
{
	bt_shift_core(tree, node, idx, count, 1, shape, BSD_LEFT);
}

/*
 * Shift elements and children in the provided core node right by one spot.
 * The first element moved is idx, and count elements are moved; a trapezoid
 * shift starts with children[idx] and moves one more child than elements.
 */
static inline void
bt_shift_core_right(zfs_btree_t *tree, zfs_btree_core_t *node, uint32_t idx,
    uint32_t count, enum bt_shift_shape shape)
{
	bt_shift_core(tree, node, idx, count, 1, shape, BSD_RIGHT);
}

/*
 * Shift elements in the provided leaf node by off spots (leaf nodes have no
 * children). The first element moved is idx, and count elements are moved.
 * The direction is determined by dir.
 */
static inline void
bt_shift_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *node, uint32_t idx,
    uint32_t count, uint32_t off, enum bt_shift_direction dir)
{
	size_t size = tree->bt_elem_size;
	zfs_btree_hdr_t *hdr = &node->btl_hdr;
	ASSERT(!zfs_btree_is_core(hdr));

	if (count == 0)
		return;
	uint8_t *start = node->btl_elems + (hdr->bth_first + idx) * size;
	uint8_t *out = (dir == BSD_LEFT ? start - off * size :
	    start + off * size);
	bmov(start, out, count * size);
}

/*
 * Grow leaf for n new elements before idx.
 */
static void
bt_grow_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, uint32_t idx,
    uint32_t n)
{
	zfs_btree_hdr_t *hdr = &leaf->btl_hdr;
	ASSERT(!zfs_btree_is_core(hdr));
	ASSERT3U(idx, <=, hdr->bth_count);
	uint32_t capacity = tree->bt_leaf_cap;
	ASSERT3U(hdr->bth_count + n, <=, capacity);
	boolean_t cl = (hdr->bth_first >= n);
	boolean_t cr = (hdr->bth_first + hdr->bth_count + n <= capacity);

	if (cl && (!cr || idx <= hdr->bth_count / 2)) {
		/* Grow left. */
		hdr->bth_first -= n;
		bt_shift_leaf(tree, leaf, n, idx, n, BSD_LEFT);
	} else if (cr) {
		/* Grow right. */
		bt_shift_leaf(tree, leaf, idx, hdr->bth_count - idx, n,
		    BSD_RIGHT);
	} else {
		/* Grow both ways. */
		uint32_t fn = hdr->bth_first -
		    (capacity - (hdr->bth_count + n)) / 2;
		hdr->bth_first -= fn;
		bt_shift_leaf(tree, leaf, fn, idx, fn, BSD_LEFT);
		bt_shift_leaf(tree, leaf, fn + idx, hdr->bth_count - idx,
		    n - fn, BSD_RIGHT);
	}
	hdr->bth_count += n;
}
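
/*
 * Example of the three growth strategies above (illustrative numbers): with
 * capacity 8, bth_first = 2 and bth_count = 4 (slots 2-5 occupied), inserting
 * n = 1 before idx = 1 grows left, since there is room before bth_first and
 * idx falls in the first half; inserting before idx = 4 grows right instead.
 * Only when neither end alone has room do we recenter the elements with the
 * "grow both ways" case.
 */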

/*
 * Shrink leaf by n elements starting from idx.
 */
static void
bt_shrink_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, uint32_t idx,
    uint32_t n)
{
	zfs_btree_hdr_t *hdr = &leaf->btl_hdr;
	ASSERT(!zfs_btree_is_core(hdr));
	ASSERT3U(idx, <=, hdr->bth_count);
	ASSERT3U(idx + n, <=, hdr->bth_count);

	if (idx <= (hdr->bth_count - n) / 2) {
		bt_shift_leaf(tree, leaf, 0, idx, n, BSD_RIGHT);
		zfs_btree_poison_node_at(tree, hdr, 0, n);
		hdr->bth_first += n;
	} else {
		bt_shift_leaf(tree, leaf, idx + n, hdr->bth_count - idx - n, n,
		    BSD_LEFT);
		zfs_btree_poison_node_at(tree, hdr, hdr->bth_count - n, n);
	}
	hdr->bth_count -= n;
}

/*
 * Move children and elements from one core node to another. The shape
 * parameter behaves the same as it does in the shift logic.
 */
static inline void
bt_transfer_core(zfs_btree_t *tree, zfs_btree_core_t *source, uint32_t sidx,
    uint32_t count, zfs_btree_core_t *dest, uint32_t didx,
    enum bt_shift_shape shape)
{
	size_t size = tree->bt_elem_size;
	ASSERT(zfs_btree_is_core(&source->btc_hdr));
	ASSERT(zfs_btree_is_core(&dest->btc_hdr));

	bcpy(source->btc_elems + sidx * size, dest->btc_elems + didx * size,
	    count * size);

	uint32_t c_count = count + (shape == BSS_TRAPEZOID ? 1 : 0);
	bcpy(source->btc_children + sidx + (shape == BSS_TRAPEZOID ? 0 : 1),
	    dest->btc_children + didx + (shape == BSS_TRAPEZOID ? 0 : 1),
	    c_count * sizeof (*source->btc_children));
}

static inline void
bt_transfer_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *source, uint32_t sidx,
    uint32_t count, zfs_btree_leaf_t *dest, uint32_t didx)
{
	size_t size = tree->bt_elem_size;
	ASSERT(!zfs_btree_is_core(&source->btl_hdr));
	ASSERT(!zfs_btree_is_core(&dest->btl_hdr));

	bcpy(source->btl_elems + (source->btl_hdr.bth_first + sidx) * size,
	    dest->btl_elems + (dest->btl_hdr.bth_first + didx) * size,
	    count * size);
}

/*
 * Find the first element in the subtree rooted at hdr, return its value and
 * put its location in where if non-null.
 */
static void *
zfs_btree_first_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr,
    zfs_btree_index_t *where)
{
	zfs_btree_hdr_t *node;

	for (node = hdr; zfs_btree_is_core(node);
	    node = ((zfs_btree_core_t *)node)->btc_children[0])
		;

	ASSERT(!zfs_btree_is_core(node));
	zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)node;
	if (where != NULL) {
		where->bti_node = node;
		where->bti_offset = 0;
		where->bti_before = B_FALSE;
	}
	return (&leaf->btl_elems[node->bth_first * tree->bt_elem_size]);
}

/* Insert an element and a child into a core node at the given offset. */
static void
zfs_btree_insert_core_impl(zfs_btree_t *tree, zfs_btree_core_t *parent,
    uint32_t offset, zfs_btree_hdr_t *new_node, void *buf)
{
	size_t size = tree->bt_elem_size;
	zfs_btree_hdr_t *par_hdr = &parent->btc_hdr;
	ASSERT3P(par_hdr, ==, new_node->bth_parent);
	ASSERT3U(par_hdr->bth_count, <, BTREE_CORE_ELEMS);

	if (zfs_btree_verify_intensity >= 5) {
		zfs_btree_verify_poison_at(tree, par_hdr,
		    par_hdr->bth_count);
	}
	/* Shift existing elements and children */
	uint32_t count = par_hdr->bth_count - offset;
	bt_shift_core_right(tree, parent, offset, count,
	    BSS_PARALLELOGRAM);

	/* Insert new values */
	parent->btc_children[offset + 1] = new_node;
	bcpy(buf, parent->btc_elems + offset * size, size);
	par_hdr->bth_count++;
}

/*
 * Insert new_node into the parent of old_node directly after old_node, with
 * buf as the dividing element between the two.
 */
static void
zfs_btree_insert_into_parent(zfs_btree_t *tree, zfs_btree_hdr_t *old_node,
    zfs_btree_hdr_t *new_node, void *buf)
{
	ASSERT3P(old_node->bth_parent, ==, new_node->bth_parent);
	size_t size = tree->bt_elem_size;
	zfs_btree_core_t *parent = old_node->bth_parent;
	zfs_btree_hdr_t *par_hdr = &parent->btc_hdr;

	/*
	 * If this is the root node we were splitting, we create a new root
	 * and increase the height of the tree.
	 */
	if (parent == NULL) {
		ASSERT3P(old_node, ==, tree->bt_root);
		tree->bt_num_nodes++;
		zfs_btree_core_t *new_root =
		    kmem_alloc(sizeof (zfs_btree_core_t) + BTREE_CORE_ELEMS *
		    size, KM_SLEEP);
		zfs_btree_hdr_t *new_root_hdr = &new_root->btc_hdr;
		new_root_hdr->bth_parent = NULL;
		new_root_hdr->bth_first = -1;
		new_root_hdr->bth_count = 1;

		old_node->bth_parent = new_node->bth_parent = new_root;
		new_root->btc_children[0] = old_node;
		new_root->btc_children[1] = new_node;
		bcpy(buf, new_root->btc_elems, size);

		tree->bt_height++;
		tree->bt_root = new_root_hdr;
		zfs_btree_poison_node(tree, new_root_hdr);
		return;
	}

	/*
	 * Since we have the new separator, binary search for where to put
	 * new_node.
	 */
	zfs_btree_index_t idx;
	ASSERT(zfs_btree_is_core(par_hdr));
	VERIFY3P(zfs_btree_find_in_buf(tree, parent->btc_elems,
	    par_hdr->bth_count, buf, &idx), ==, NULL);
	ASSERT(idx.bti_before);
	uint32_t offset = idx.bti_offset;
	ASSERT3U(offset, <=, par_hdr->bth_count);
	ASSERT3P(parent->btc_children[offset], ==, old_node);

	/*
	 * If the parent isn't full, shift things to accommodate our
	 * insertions and return.
	 */
	if (par_hdr->bth_count != BTREE_CORE_ELEMS) {
		zfs_btree_insert_core_impl(tree, parent, offset, new_node, buf);
		return;
	}

	/*
	 * We need to split this core node into two. Currently there are
	 * BTREE_CORE_ELEMS + 1 child nodes, and we are adding one, for
	 * BTREE_CORE_ELEMS + 2 in total. Some of the children will be part
	 * of the current node, and the others will be moved to the new core
	 * node. There are BTREE_CORE_ELEMS + 1 elements including the new
	 * one. One will be used as the new separator in our parent, and the
	 * others will be split among the two core nodes.
	 *
	 * Usually we will split the node in half evenly, with
	 * BTREE_CORE_ELEMS/2 elements in each node. If we're bulk loading, we
	 * instead move only about a quarter of the elements (and children) to
	 * the new node. Since the average state after a long time is a 3/4
	 * full node, shortcutting directly to that state improves efficiency.
	 *
	 * We do this in two stages: first we split into two nodes, and then we
	 * reuse our existing logic to insert the new element and child.
	 */
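	/*
	 * For illustration (hypothetical numbers; suppose BTREE_CORE_ELEMS
	 * were 126): a normal split below computes move_count = 62 and
	 * keep_count = 63, while a bulk-load split computes move_count = 30
	 * and keep_count = 95, leaving the kept node about 3/4 full.
	 */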
	uint32_t move_count = MAX((BTREE_CORE_ELEMS / (tree->bt_bulk == NULL ?
	    2 : 4)) - 1, 2);
	uint32_t keep_count = BTREE_CORE_ELEMS - move_count - 1;
	ASSERT3U(BTREE_CORE_ELEMS - move_count, >=, 2);
	tree->bt_num_nodes++;
	zfs_btree_core_t *new_parent = kmem_alloc(sizeof (zfs_btree_core_t) +
	    BTREE_CORE_ELEMS * size, KM_SLEEP);
	zfs_btree_hdr_t *new_par_hdr = &new_parent->btc_hdr;
	new_par_hdr->bth_parent = par_hdr->bth_parent;
	new_par_hdr->bth_first = -1;
	new_par_hdr->bth_count = move_count;
	zfs_btree_poison_node(tree, new_par_hdr);

	par_hdr->bth_count = keep_count;

	bt_transfer_core(tree, parent, keep_count + 1, move_count, new_parent,
	    0, BSS_TRAPEZOID);

	/* Store the new separator in a buffer. */
	uint8_t *tmp_buf = kmem_alloc(size, KM_SLEEP);
	bcpy(parent->btc_elems + keep_count * size, tmp_buf,
	    size);
	zfs_btree_poison_node(tree, par_hdr);

	if (offset < keep_count) {
		/* Insert the new node into the left half */
		zfs_btree_insert_core_impl(tree, parent, offset, new_node,
		    buf);

		/*
		 * Move the new separator to the existing buffer.
		 */
		bcpy(tmp_buf, buf, size);
	} else if (offset > keep_count) {
		/* Insert the new node into the right half */
		new_node->bth_parent = new_parent;
		zfs_btree_insert_core_impl(tree, new_parent,
		    offset - keep_count - 1, new_node, buf);

		/*
		 * Move the new separator to the existing buffer.
		 */
		bcpy(tmp_buf, buf, size);
	} else {
		/*
		 * Move the new separator into the right half, and replace it
		 * with buf. We also need to shift back the elements in the
		 * right half to accommodate new_node.
		 */
		bt_shift_core_right(tree, new_parent, 0, move_count,
		    BSS_TRAPEZOID);
		new_parent->btc_children[0] = new_node;
		bcpy(tmp_buf, new_parent->btc_elems, size);
		new_par_hdr->bth_count++;
	}
	kmem_free(tmp_buf, size);
	zfs_btree_poison_node(tree, par_hdr);

	for (uint32_t i = 0; i <= new_parent->btc_hdr.bth_count; i++)
		new_parent->btc_children[i]->bth_parent = new_parent;

	for (uint32_t i = 0; i <= parent->btc_hdr.bth_count; i++)
		ASSERT3P(parent->btc_children[i]->bth_parent, ==, parent);

	/*
	 * Now that the node is split, we need to insert the new node into its
	 * parent. This may cause further splitting.
	 */
	zfs_btree_insert_into_parent(tree, &parent->btc_hdr,
	    &new_parent->btc_hdr, buf);
}

/* Insert an element into a leaf node at the given offset. */
static void
zfs_btree_insert_leaf_impl(zfs_btree_t *tree, zfs_btree_leaf_t *leaf,
    uint32_t idx, const void *value)
{
	size_t size = tree->bt_elem_size;
	zfs_btree_hdr_t *hdr = &leaf->btl_hdr;
	ASSERT3U(leaf->btl_hdr.bth_count, <, tree->bt_leaf_cap);

	if (zfs_btree_verify_intensity >= 5) {
		zfs_btree_verify_poison_at(tree, &leaf->btl_hdr,
		    leaf->btl_hdr.bth_count);
	}

	bt_grow_leaf(tree, leaf, idx, 1);
	uint8_t *start = leaf->btl_elems + (hdr->bth_first + idx) * size;
	bcpy(value, start, size);
}

static void
zfs_btree_verify_order_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr);

/* Helper function for inserting a new value into leaf at the given index. */
static void
zfs_btree_insert_into_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf,
    const void *value, uint32_t idx)
{
	size_t size = tree->bt_elem_size;
	uint32_t capacity = tree->bt_leaf_cap;

	/*
	 * If the leaf isn't full, shift the elements after idx and insert
	 * value.
	 */
	if (leaf->btl_hdr.bth_count != capacity) {
		zfs_btree_insert_leaf_impl(tree, leaf, idx, value);
		return;
	}

	/*
	 * Otherwise, we split the leaf node into two nodes. If we're not bulk
	 * inserting, each is of size (capacity / 2). If we are bulk
	 * inserting, we move a quarter of the elements to the new node so
	 * inserts into the old node don't cause immediate splitting but the
	 * tree stays relatively dense. Since the average state after a long
	 * time is a 3/4 full node, shortcutting directly to that state
	 * improves efficiency. At the end of the bulk insertion process
	 * we'll need to go through and fix up any nodes (the last leaf and
	 * its ancestors, potentially) that are below the minimum.
	 *
	 * In either case, we're left with one extra element. The leftover
	 * element will become the new dividing element between the two nodes.
	 */
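	/*
	 * For illustration (hypothetical numbers; with capacity = 512): a
	 * normal split below computes move_count = 255 and keep_count = 256,
	 * while a bulk split computes move_count = 127 and keep_count = 384,
	 * leaving the old leaf about 3/4 full.
	 */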
	uint32_t move_count = MAX(capacity / (tree->bt_bulk ? 4 : 2), 1) - 1;
	uint32_t keep_count = capacity - move_count - 1;
	ASSERT3U(keep_count, >=, 1);
	/* If we insert on the left, move one more to keep leaves balanced. */
	if (idx < keep_count) {
		keep_count--;
		move_count++;
	}
	tree->bt_num_nodes++;
	zfs_btree_leaf_t *new_leaf = kmem_cache_alloc(zfs_btree_leaf_cache,
	    KM_SLEEP);
	zfs_btree_hdr_t *new_hdr = &new_leaf->btl_hdr;
	new_hdr->bth_parent = leaf->btl_hdr.bth_parent;
	new_hdr->bth_first = (tree->bt_bulk ? 0 : capacity / 4) +
	    (idx >= keep_count && idx <= keep_count + move_count / 2);
	new_hdr->bth_count = move_count;
	zfs_btree_poison_node(tree, new_hdr);

	if (tree->bt_bulk != NULL && leaf == tree->bt_bulk)
		tree->bt_bulk = new_leaf;

	/* Copy the back part to the new leaf. */
	bt_transfer_leaf(tree, leaf, keep_count + 1, move_count, new_leaf, 0);

	/* We store the new separator in a buffer we control for simplicity. */
	uint8_t *buf = kmem_alloc(size, KM_SLEEP);
	bcpy(leaf->btl_elems + (leaf->btl_hdr.bth_first + keep_count) * size,
	    buf, size);

	bt_shrink_leaf(tree, leaf, keep_count, 1 + move_count);

	if (idx < keep_count) {
		/* Insert into the existing leaf. */
		zfs_btree_insert_leaf_impl(tree, leaf, idx, value);
	} else if (idx > keep_count) {
		/* Insert into the new leaf. */
		zfs_btree_insert_leaf_impl(tree, new_leaf, idx - keep_count -
		    1, value);
	} else {
		/*
		 * Insert planned separator into the new leaf, and use
		 * the new value as the new separator.
		 */
		zfs_btree_insert_leaf_impl(tree, new_leaf, 0, buf);
		bcpy(value, buf, size);
	}

	/*
	 * Now that the node is split, we need to insert the new node into its
	 * parent. This may cause further splitting, but only of core nodes.
	 */
	zfs_btree_insert_into_parent(tree, &leaf->btl_hdr, &new_leaf->btl_hdr,
	    buf);
	kmem_free(buf, size);
}

static uint32_t
zfs_btree_find_parent_idx(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
{
	void *buf;
	if (zfs_btree_is_core(hdr)) {
		buf = ((zfs_btree_core_t *)hdr)->btc_elems;
	} else {
		buf = ((zfs_btree_leaf_t *)hdr)->btl_elems +
		    hdr->bth_first * tree->bt_elem_size;
	}
	zfs_btree_index_t idx;
	zfs_btree_core_t *parent = hdr->bth_parent;
	VERIFY3P(zfs_btree_find_in_buf(tree, parent->btc_elems,
	    parent->btc_hdr.bth_count, buf, &idx), ==, NULL);
	ASSERT(idx.bti_before);
	ASSERT3U(idx.bti_offset, <=, parent->btc_hdr.bth_count);
	ASSERT3P(parent->btc_children[idx.bti_offset], ==, hdr);
	return (idx.bti_offset);
}

/*
 * Take the b-tree out of bulk insert mode. During bulk-insert mode, some
 * nodes may violate the invariant that non-root nodes must be at least half
 * full. All nodes violating this invariant should be the last node in their
 * particular level. To correct the invariant, we take values from their left
 * neighbor until they are half full. They must have a left neighbor at their
 * level because the only node that is both the first and the last at its
 * level is the root.
 */
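
/*
 * For example (illustrative numbers): with a leaf capacity of 100 and a
 * last leaf holding 20 elements, move_count below comes out to 30. The leaf
 * grows by 30 slots at the front, the separator from the common ancestor
 * fills the last of those slots, 29 elements come from the tail of the left
 * neighbor, and the neighbor's new last element then becomes the separator.
 */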
static void
zfs_btree_bulk_finish(zfs_btree_t *tree)
{
	ASSERT3P(tree->bt_bulk, !=, NULL);
	ASSERT3P(tree->bt_root, !=, NULL);
	zfs_btree_leaf_t *leaf = tree->bt_bulk;
	zfs_btree_hdr_t *hdr = &leaf->btl_hdr;
	zfs_btree_core_t *parent = hdr->bth_parent;
	size_t size = tree->bt_elem_size;
	uint32_t capacity = tree->bt_leaf_cap;

	/*
	 * The invariant doesn't apply to the root node, if that's the only
	 * node in the tree we're done.
	 */
	if (parent == NULL) {
		tree->bt_bulk = NULL;
		return;
	}

	/* First, take elements to rebalance the leaf node. */
	if (hdr->bth_count < capacity / 2) {
		/*
		 * First, find the left neighbor. The simplest way to do this
		 * is to call zfs_btree_prev twice; the first time finds some
		 * ancestor of this node, and the second time finds the left
		 * neighbor. The ancestor found is the lowest common ancestor
		 * of leaf and the neighbor.
		 */
		zfs_btree_index_t idx = {
			.bti_node = hdr,
			.bti_offset = 0
		};
		VERIFY3P(zfs_btree_prev(tree, &idx, &idx), !=, NULL);
		ASSERT(zfs_btree_is_core(idx.bti_node));
		zfs_btree_core_t *common = (zfs_btree_core_t *)idx.bti_node;
		uint32_t common_idx = idx.bti_offset;

		VERIFY3P(zfs_btree_prev(tree, &idx, &idx), !=, NULL);
		ASSERT(!zfs_btree_is_core(idx.bti_node));
		zfs_btree_leaf_t *l_neighbor = (zfs_btree_leaf_t *)idx.bti_node;
		zfs_btree_hdr_t *l_hdr = idx.bti_node;
		uint32_t move_count = (capacity / 2) - hdr->bth_count;
		ASSERT3U(l_neighbor->btl_hdr.bth_count - move_count, >=,
		    capacity / 2);

		if (zfs_btree_verify_intensity >= 5) {
			for (uint32_t i = 0; i < move_count; i++) {
				zfs_btree_verify_poison_at(tree, hdr,
				    leaf->btl_hdr.bth_count + i);
			}
		}

		/* First, shift elements in leaf back. */
		bt_grow_leaf(tree, leaf, 0, move_count);

		/* Next, move the separator from the common ancestor to leaf. */
		uint8_t *separator = common->btc_elems + common_idx * size;
		uint8_t *out = leaf->btl_elems +
		    (hdr->bth_first + move_count - 1) * size;
		bcpy(separator, out, size);

		/*
		 * Now we move elements from the tail of the left neighbor to
		 * fill the remaining spots in leaf.
		 */
		bt_transfer_leaf(tree, l_neighbor, l_hdr->bth_count -
		    (move_count - 1), move_count - 1, leaf, 0);

		/*
		 * Finally, move the new last element in the left neighbor to
		 * the separator.
		 */
		bcpy(l_neighbor->btl_elems + (l_hdr->bth_first +
		    l_hdr->bth_count - move_count) * size, separator, size);

		/* Adjust the node's counts, and we're done. */
		bt_shrink_leaf(tree, l_neighbor, l_hdr->bth_count - move_count,
		    move_count);

		ASSERT3U(l_hdr->bth_count, >=, capacity / 2);
		ASSERT3U(hdr->bth_count, >=, capacity / 2);
	}

	/*
	 * Now we have to rebalance any ancestors of leaf that may also
	 * violate the invariant.
	 */
	capacity = BTREE_CORE_ELEMS;
	while (parent->btc_hdr.bth_parent != NULL) {
		zfs_btree_core_t *cur = parent;
		zfs_btree_hdr_t *hdr = &cur->btc_hdr;
		parent = hdr->bth_parent;
		/*
		 * If the invariant isn't violated, move on to the next
		 * ancestor.
		 */
		if (hdr->bth_count >= capacity / 2)
			continue;

		/*
		 * Because the smallest number of nodes we can move when
		 * splitting is 2, we never need to worry about not having a
		 * left sibling (a sibling is a neighbor with the same parent).
		 */
		uint32_t parent_idx = zfs_btree_find_parent_idx(tree, hdr);
		ASSERT3U(parent_idx, >, 0);
		zfs_btree_core_t *l_neighbor =
		    (zfs_btree_core_t *)parent->btc_children[parent_idx - 1];
		uint32_t move_count = (capacity / 2) - hdr->bth_count;
		ASSERT3U(l_neighbor->btc_hdr.bth_count - move_count, >=,
		    capacity / 2);

		if (zfs_btree_verify_intensity >= 5) {
			for (uint32_t i = 0; i < move_count; i++) {
				zfs_btree_verify_poison_at(tree, hdr,
				    hdr->bth_count + i);
			}
		}
		/* First, shift things in the right node back. */
		bt_shift_core(tree, cur, 0, hdr->bth_count, move_count,
		    BSS_TRAPEZOID, BSD_RIGHT);

		/* Next, move the separator to the right node. */
		uint8_t *separator = parent->btc_elems + ((parent_idx - 1) *
		    size);
		uint8_t *e_out = cur->btc_elems + ((move_count - 1) * size);
		bcpy(separator, e_out, size);

		/*
		 * Now, move elements and children from the left node to the
		 * right. We move one more child than elements.
		 */
		move_count--;
		uint32_t move_idx = l_neighbor->btc_hdr.bth_count - move_count;
		bt_transfer_core(tree, l_neighbor, move_idx, move_count, cur, 0,
		    BSS_TRAPEZOID);

		/*
		 * Finally, move the last element in the left node to the
		 * separator's position.
		 */
		move_idx--;
		bcpy(l_neighbor->btc_elems + move_idx * size, separator, size);

		l_neighbor->btc_hdr.bth_count -= move_count + 1;
		hdr->bth_count += move_count + 1;

		ASSERT3U(l_neighbor->btc_hdr.bth_count, >=, capacity / 2);
		ASSERT3U(hdr->bth_count, >=, capacity / 2);

		zfs_btree_poison_node(tree, &l_neighbor->btc_hdr);

		for (uint32_t i = 0; i <= hdr->bth_count; i++)
			cur->btc_children[i]->bth_parent = cur;
	}

	tree->bt_bulk = NULL;
	zfs_btree_verify(tree);
}

/*
 * Insert value into tree at the location specified by where.
 */
void
zfs_btree_add_idx(zfs_btree_t *tree, const void *value,
    const zfs_btree_index_t *where)
{
	zfs_btree_index_t idx = {0};

	/* If we're not inserting in the last leaf, end bulk insert mode. */
	if (tree->bt_bulk != NULL) {
		if (where->bti_node != &tree->bt_bulk->btl_hdr) {
			zfs_btree_bulk_finish(tree);
			VERIFY3P(zfs_btree_find(tree, value, &idx), ==, NULL);
			where = &idx;
		}
	}

	tree->bt_num_elems++;
	/*
	 * If this is the first element in the tree, create a leaf root node
	 * and add the value to it.
	 */
	if (where->bti_node == NULL) {
		ASSERT3U(tree->bt_num_elems, ==, 1);
		ASSERT3S(tree->bt_height, ==, -1);
		ASSERT3P(tree->bt_root, ==, NULL);
		ASSERT0(where->bti_offset);

		tree->bt_num_nodes++;
		zfs_btree_leaf_t *leaf = kmem_cache_alloc(zfs_btree_leaf_cache,
		    KM_SLEEP);
		tree->bt_root = &leaf->btl_hdr;
		tree->bt_height++;

		zfs_btree_hdr_t *hdr = &leaf->btl_hdr;
		hdr->bth_parent = NULL;
		hdr->bth_first = 0;
		hdr->bth_count = 0;
		zfs_btree_poison_node(tree, hdr);

		zfs_btree_insert_into_leaf(tree, leaf, value, 0);
		tree->bt_bulk = leaf;
	} else if (!zfs_btree_is_core(where->bti_node)) {
		/*
		 * If we're inserting into a leaf, go directly to the helper
		 * function.
		 */
		zfs_btree_insert_into_leaf(tree,
		    (zfs_btree_leaf_t *)where->bti_node, value,
		    where->bti_offset);
	} else {
		/*
		 * If we're inserting into a core node, we can't just shift
		 * the existing element in that slot in the same node without
		 * breaking our ordering invariants. Instead we place the new
		 * value in the node at that spot and then insert the old
		 * separator into the first slot in the subtree to the right.
		 */
		zfs_btree_core_t *node = (zfs_btree_core_t *)where->bti_node;

		/*
		 * We can ignore bti_before, because either way the value
		 * should end up in bti_offset.
		 */
		uint32_t off = where->bti_offset;
		zfs_btree_hdr_t *subtree = node->btc_children[off + 1];
		size_t size = tree->bt_elem_size;
		uint8_t *buf = kmem_alloc(size, KM_SLEEP);
		bcpy(node->btc_elems + off * size, buf, size);
		bcpy(value, node->btc_elems + off * size, size);

		/*
		 * Find the first slot in the subtree to the right, insert
		 * there.
		 */
		zfs_btree_index_t new_idx;
		VERIFY3P(zfs_btree_first_helper(tree, subtree, &new_idx), !=,
		    NULL);
		ASSERT0(new_idx.bti_offset);
		ASSERT(!zfs_btree_is_core(new_idx.bti_node));
		zfs_btree_insert_into_leaf(tree,
		    (zfs_btree_leaf_t *)new_idx.bti_node, buf, 0);
		kmem_free(buf, size);
	}
	zfs_btree_verify(tree);
}

/*
 * Return the first element in the tree, and put its location in where if
 * non-null.
 */
void *
zfs_btree_first(zfs_btree_t *tree, zfs_btree_index_t *where)
{
	if (tree->bt_height == -1) {
		ASSERT0(tree->bt_num_elems);
		return (NULL);
	}
	return (zfs_btree_first_helper(tree, tree->bt_root, where));
}

/*
 * Find the last element in the subtree rooted at hdr, return its value and
 * put its location in where if non-null.
 */
static void *
zfs_btree_last_helper(zfs_btree_t *btree, zfs_btree_hdr_t *hdr,
    zfs_btree_index_t *where)
{
	zfs_btree_hdr_t *node;

	for (node = hdr; zfs_btree_is_core(node); node =
	    ((zfs_btree_core_t *)node)->btc_children[node->bth_count])
		;

	zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)node;
	if (where != NULL) {
		where->bti_node = node;
		where->bti_offset = node->bth_count - 1;
		where->bti_before = B_FALSE;
	}
	return (leaf->btl_elems + (node->bth_first + node->bth_count - 1) *
	    btree->bt_elem_size);
}

/*
 * Return the last element in the tree, and put its location in where if
 * non-null.
 */
void *
zfs_btree_last(zfs_btree_t *tree, zfs_btree_index_t *where)
{
	if (tree->bt_height == -1) {
		ASSERT0(tree->bt_num_elems);
		return (NULL);
	}
	return (zfs_btree_last_helper(tree, tree->bt_root, where));
}

/*
 * This function contains the logic to find the next element in the tree. A
 * helper function is used because there are multiple internal consumers of
 * this logic. The done_func is used by zfs_btree_destroy_nodes to clean up each
 * node after we've finished with it.
 */
static void *
zfs_btree_next_helper(zfs_btree_t *tree, const zfs_btree_index_t *idx,
    zfs_btree_index_t *out_idx,
    void (*done_func)(zfs_btree_t *, zfs_btree_hdr_t *))
{
	if (idx->bti_node == NULL) {
		ASSERT3S(tree->bt_height, ==, -1);
		return (NULL);
	}

	uint32_t offset = idx->bti_offset;
	if (!zfs_btree_is_core(idx->bti_node)) {
		/*
		 * When finding the next element of an element in a leaf,
		 * there are two cases. If the element isn't the last one in
		 * the leaf, we just return the next element in the leaf.
		 * Otherwise, we need to traverse up our parents until we
		 * find one where our ancestor isn't the last child of its
		 * parent. Once we do, the next element is the separator
		 * after our ancestor in its parent.
		 */
		zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node;
		uint32_t new_off = offset + (idx->bti_before ? 0 : 1);
		if (leaf->btl_hdr.bth_count > new_off) {
			out_idx->bti_node = &leaf->btl_hdr;
			out_idx->bti_offset = new_off;
			out_idx->bti_before = B_FALSE;
			return (leaf->btl_elems + (leaf->btl_hdr.bth_first +
			    new_off) * tree->bt_elem_size);
		}

		zfs_btree_hdr_t *prev = &leaf->btl_hdr;
		for (zfs_btree_core_t *node = leaf->btl_hdr.bth_parent;
		    node != NULL; node = node->btc_hdr.bth_parent) {
			zfs_btree_hdr_t *hdr = &node->btc_hdr;
			ASSERT(zfs_btree_is_core(hdr));
			uint32_t i = zfs_btree_find_parent_idx(tree, prev);
			if (done_func != NULL)
				done_func(tree, prev);
			if (i == hdr->bth_count) {
				prev = hdr;
				continue;
			}
			out_idx->bti_node = hdr;
			out_idx->bti_offset = i;
			out_idx->bti_before = B_FALSE;
			return (node->btc_elems + i * tree->bt_elem_size);
		}
		if (done_func != NULL)
			done_func(tree, prev);
		/*
		 * We've traversed all the way up and been at the end of the
		 * node every time, so this was the last element in the tree.
		 */
		return (NULL);
	}

	/* If we were before an element in a core node, return that element. */
	ASSERT(zfs_btree_is_core(idx->bti_node));
	zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node;
	if (idx->bti_before) {
		out_idx->bti_before = B_FALSE;
		return (node->btc_elems + offset * tree->bt_elem_size);
	}

	/*
	 * The next element from one in a core node is the first element in
	 * the subtree just to the right of the separator.
	 */
	zfs_btree_hdr_t *child = node->btc_children[offset + 1];
	return (zfs_btree_first_helper(tree, child, out_idx));
}

/*
 * Return the next element in the tree. The same address can be safely
 * passed for idx and out_idx.
 */
void *
zfs_btree_next(zfs_btree_t *tree, const zfs_btree_index_t *idx,
    zfs_btree_index_t *out_idx)
{
	return (zfs_btree_next_helper(tree, idx, out_idx, NULL));
}

/*
 * Return the previous element in the tree. The same address can be safely
 * passed for idx and out_idx.
 */
void *
zfs_btree_prev(zfs_btree_t *tree, const zfs_btree_index_t *idx,
    zfs_btree_index_t *out_idx)
{
	if (idx->bti_node == NULL) {
		ASSERT3S(tree->bt_height, ==, -1);
		return (NULL);
	}

	uint32_t offset = idx->bti_offset;
	if (!zfs_btree_is_core(idx->bti_node)) {
		/*
		 * When finding the previous element of an element in a leaf,
		 * there are two cases. If the element isn't the first one in
		 * the leaf, we just return the previous element in the leaf.
		 * Otherwise, we need to traverse up our parents until we
		 * find one where our previous ancestor isn't the first child
		 * of its parent. Once we do, the previous element is the
		 * separator before our previous ancestor.
		 */
		zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node;
		if (offset != 0) {
			out_idx->bti_node = &leaf->btl_hdr;
			out_idx->bti_offset = offset - 1;
			out_idx->bti_before = B_FALSE;
			return (leaf->btl_elems + (leaf->btl_hdr.bth_first +
			    offset - 1) * tree->bt_elem_size);
		}
		zfs_btree_hdr_t *prev = &leaf->btl_hdr;
		for (zfs_btree_core_t *node = leaf->btl_hdr.bth_parent;
		    node != NULL; node = node->btc_hdr.bth_parent) {
			zfs_btree_hdr_t *hdr = &node->btc_hdr;
			ASSERT(zfs_btree_is_core(hdr));
			uint32_t i = zfs_btree_find_parent_idx(tree, prev);
			if (i == 0) {
				prev = hdr;
				continue;
			}
			out_idx->bti_node = hdr;
			out_idx->bti_offset = i - 1;
			out_idx->bti_before = B_FALSE;
			return (node->btc_elems + (i - 1) * tree->bt_elem_size);
		}
		/*
		 * We've traversed all the way up and been at the start of the
		 * node every time, so this was the first element in the tree.
		 */
		return (NULL);
	}

	/*
	 * The previous element from one in a core node is the last element in
	 * the subtree just to the left of the separator.
	 */
	ASSERT(zfs_btree_is_core(idx->bti_node));
	zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node;
	zfs_btree_hdr_t *child = node->btc_children[offset];
	return (zfs_btree_last_helper(tree, child, out_idx));
}

/*
 * Get the value at the provided index in the tree.
 *
 * Note that the value returned from this function can be mutated, but only
 * if it will not change the ordering of the element with respect to any other
 * elements that could be in the tree.
 */
void *
zfs_btree_get(zfs_btree_t *tree, zfs_btree_index_t *idx)
{
	ASSERT(!idx->bti_before);
	size_t size = tree->bt_elem_size;
	if (!zfs_btree_is_core(idx->bti_node)) {
		zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node;
		return (leaf->btl_elems + (leaf->btl_hdr.bth_first +
		    idx->bti_offset) * size);
	}
	zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node;
	return (node->btc_elems + idx->bti_offset * size);
}

/* Add the given value to the tree. Must not already be in the tree. */
void
zfs_btree_add(zfs_btree_t *tree, const void *node)
{
	zfs_btree_index_t where = {0};
	VERIFY3P(zfs_btree_find(tree, node, &where), ==, NULL);
	zfs_btree_add_idx(tree, node, &where);
}

/* Helper function to free a tree node. */
static void
zfs_btree_node_destroy(zfs_btree_t *tree, zfs_btree_hdr_t *node)
{
	tree->bt_num_nodes--;
	if (!zfs_btree_is_core(node)) {
		kmem_cache_free(zfs_btree_leaf_cache, node);
	} else {
		kmem_free(node, sizeof (zfs_btree_core_t) +
		    BTREE_CORE_ELEMS * tree->bt_elem_size);
	}
}

/*
 * Remove the rm_hdr and the separator to its left from the parent node. The
 * buffer that rm_hdr was stored in may already be freed, so its contents
 * cannot be accessed.
 */
static void
zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node,
    zfs_btree_hdr_t *rm_hdr)
{
	size_t size = tree->bt_elem_size;
	uint32_t min_count = (BTREE_CORE_ELEMS / 2) - 1;
	zfs_btree_hdr_t *hdr = &node->btc_hdr;
	/*
	 * If the node is the root node and rm_hdr is one of two children,
	 * promote the other child to the root.
	 */
	if (hdr->bth_parent == NULL && hdr->bth_count <= 1) {
		ASSERT3U(hdr->bth_count, ==, 1);
		ASSERT3P(tree->bt_root, ==, node);
		ASSERT3P(node->btc_children[1], ==, rm_hdr);
		tree->bt_root = node->btc_children[0];
		node->btc_children[0]->bth_parent = NULL;
		zfs_btree_node_destroy(tree, hdr);
		tree->bt_height--;
		return;
	}

	uint32_t idx;
	for (idx = 0; idx <= hdr->bth_count; idx++) {
		if (node->btc_children[idx] == rm_hdr)
			break;
	}
	ASSERT3U(idx, <=, hdr->bth_count);

	/*
	 * If the node is the root or it has more than the minimum number of
	 * children, just remove the child and separator, and return.
	 */
	if (hdr->bth_parent == NULL ||
	    hdr->bth_count > min_count) {
		/*
		 * Shift the element and children to the right of rm_hdr to
		 * the left by one spot.
		 */
		bt_shift_core_left(tree, node, idx, hdr->bth_count - idx,
		    BSS_PARALLELOGRAM);
		hdr->bth_count--;
		zfs_btree_poison_node_at(tree, hdr, hdr->bth_count, 1);
		return;
	}

	ASSERT3U(hdr->bth_count, ==, min_count);

	/*
	 * Now we try to take a node from a neighbor. We check left, then
	 * right. If the neighbor exists and has more than the minimum number
	 * of elements, we move the separator between us and them to our
	 * node, move their closest element (last for left, first for right)
	 * to the separator, and move their closest child to our node. Along
	 * the way we need to collapse the gap made by idx, and (for our right
	 * neighbor) the gap made by removing their first element and child.
	 *
	 * Note: this logic currently doesn't support taking from a neighbor
	 * that isn't a sibling (i.e. a neighbor with a different
	 * parent). This isn't critical functionality, but may be worth
	 * implementing in the future for completeness' sake.
	 */
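	/*
	 * Illustrative example of the left rotation below (with, say,
	 * min_count = 3 and a left sibling holding 4 elements): the separator
	 * in the parent drops into our first element slot, the sibling's last
	 * child becomes our first child, and the sibling's last element
	 * replaces the separator, restoring the invariant in both nodes.
	 */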
	zfs_btree_core_t *parent = hdr->bth_parent;
	uint32_t parent_idx = zfs_btree_find_parent_idx(tree, hdr);

	zfs_btree_hdr_t *l_hdr = (parent_idx == 0 ? NULL :
	    parent->btc_children[parent_idx - 1]);
	if (l_hdr != NULL && l_hdr->bth_count > min_count) {
		/* We can take a node from the left neighbor. */
		ASSERT(zfs_btree_is_core(l_hdr));
		zfs_btree_core_t *neighbor = (zfs_btree_core_t *)l_hdr;

		/*
		 * Start by shifting the elements and children in the current
		 * node to the right by one spot.
		 */
		bt_shift_core_right(tree, node, 0, idx - 1, BSS_TRAPEZOID);

		/*
		 * Move the separator between node and neighbor to the first
		 * element slot in the current node.
		 */
		uint8_t *separator = parent->btc_elems + (parent_idx - 1) *
		    size;
		bcpy(separator, node->btc_elems, size);

		/* Move the last child of neighbor to our first child slot. */
		node->btc_children[0] =
		    neighbor->btc_children[l_hdr->bth_count];
		node->btc_children[0]->bth_parent = node;

		/* Move the last element of neighbor to the separator spot. */
		uint8_t *take_elem = neighbor->btc_elems +
		    (l_hdr->bth_count - 1) * size;
		bcpy(take_elem, separator, size);
		l_hdr->bth_count--;
		zfs_btree_poison_node_at(tree, l_hdr, l_hdr->bth_count, 1);
		return;
	}

	zfs_btree_hdr_t *r_hdr = (parent_idx == parent->btc_hdr.bth_count ?
	    NULL : parent->btc_children[parent_idx + 1]);
	if (r_hdr != NULL && r_hdr->bth_count > min_count) {
		/* We can take a node from the right neighbor. */
		ASSERT(zfs_btree_is_core(r_hdr));
		zfs_btree_core_t *neighbor = (zfs_btree_core_t *)r_hdr;

		/*
		 * Shift elements in node left by one spot to overwrite rm_hdr
		 * and the separator before it.
		 */
		bt_shift_core_left(tree, node, idx, hdr->bth_count - idx,
		    BSS_PARALLELOGRAM);

		/*
		 * Move the separator between node and neighbor to the last
		 * element spot in node.
		 */
		uint8_t *separator = parent->btc_elems + parent_idx * size;
		bcpy(separator, node->btc_elems + (hdr->bth_count - 1) * size,
		    size);

		/*
		 * Move the first child of neighbor to the last child spot in
		 * node.
		 */
		node->btc_children[hdr->bth_count] = neighbor->btc_children[0];
		node->btc_children[hdr->bth_count]->bth_parent = node;

		/* Move the first element of neighbor to the separator spot. */
		uint8_t *take_elem = neighbor->btc_elems;
		bcpy(take_elem, separator, size);
		r_hdr->bth_count--;

		/*
		 * Shift the elements and children of neighbor to cover the
		 * stolen elements.
		 */
		bt_shift_core_left(tree, neighbor, 1, r_hdr->bth_count,
		    BSS_TRAPEZOID);
		zfs_btree_poison_node_at(tree, r_hdr, r_hdr->bth_count, 1);
		return;
	}

	/*
	 * In this case, neither of our neighbors can spare an element, so we
	 * need to merge with one of them. We prefer the left one,
	 * arbitrarily. Move the separator into the leftmost merging node
	 * (which may be us or the left neighbor), and then move the right
	 * merging node's elements. Once that's done, we go back and delete
	 * the element we're removing. Finally, go into the parent and delete
	 * the right merging node and the separator. This may cause further
	 * merging.
	 */
	zfs_btree_hdr_t *new_rm_hdr, *keep_hdr;
	uint32_t new_idx = idx;
	if (l_hdr != NULL) {
		keep_hdr = l_hdr;
		new_rm_hdr = hdr;
		new_idx += keep_hdr->bth_count + 1;
	} else {
		ASSERT3P(r_hdr, !=, NULL);
		keep_hdr = hdr;
		new_rm_hdr = r_hdr;
		parent_idx++;
	}

	ASSERT(zfs_btree_is_core(keep_hdr));
	ASSERT(zfs_btree_is_core(new_rm_hdr));

	zfs_btree_core_t *keep = (zfs_btree_core_t *)keep_hdr;
	zfs_btree_core_t *rm = (zfs_btree_core_t *)new_rm_hdr;

	if (zfs_btree_verify_intensity >= 5) {
		for (uint32_t i = 0; i < new_rm_hdr->bth_count + 1; i++) {
			zfs_btree_verify_poison_at(tree, keep_hdr,
			    keep_hdr->bth_count + i);
		}
	}

	/* Move the separator into the left node. */
	uint8_t *e_out = keep->btc_elems + keep_hdr->bth_count * size;
	uint8_t *separator = parent->btc_elems + (parent_idx - 1) *
	    size;
	bcpy(separator, e_out, size);
	keep_hdr->bth_count++;

	/* Move all our elements and children into the left node. */
	bt_transfer_core(tree, rm, 0, new_rm_hdr->bth_count, keep,
	    keep_hdr->bth_count, BSS_TRAPEZOID);

	uint32_t old_count = keep_hdr->bth_count;

	/* Update bookkeeping */
	keep_hdr->bth_count += new_rm_hdr->bth_count;
	ASSERT3U(keep_hdr->bth_count, ==, (min_count * 2) + 1);

	/*
	 * Shift the element and children to the right of rm_hdr to
	 * the left by one spot.
	 */
	ASSERT3P(keep->btc_children[new_idx], ==, rm_hdr);
	bt_shift_core_left(tree, keep, new_idx, keep_hdr->bth_count - new_idx,
	    BSS_PARALLELOGRAM);
	keep_hdr->bth_count--;

	/* Reparent all our children to point to the left node. */
	zfs_btree_hdr_t **new_start = keep->btc_children +
	    old_count - 1;
	for (uint32_t i = 0; i < new_rm_hdr->bth_count + 1; i++)
		new_start[i]->bth_parent = keep;
	for (uint32_t i = 0; i <= keep_hdr->bth_count; i++) {
		ASSERT3P(keep->btc_children[i]->bth_parent, ==, keep);
		ASSERT3P(keep->btc_children[i], !=, rm_hdr);
	}
	zfs_btree_poison_node_at(tree, keep_hdr, keep_hdr->bth_count, 1);

	new_rm_hdr->bth_count = 0;
	zfs_btree_remove_from_node(tree, parent, new_rm_hdr);
	zfs_btree_node_destroy(tree, new_rm_hdr);
}
1614
1615 /* Remove the element at the specified location. */
1616 void
1617 zfs_btree_remove_idx(zfs_btree_t *tree, zfs_btree_index_t *where)
1618 {
1619 size_t size = tree->bt_elem_size;
1620 zfs_btree_hdr_t *hdr = where->bti_node;
1621 uint32_t idx = where->bti_offset;
1622
1623 ASSERT(!where->bti_before);
1624 if (tree->bt_bulk != NULL) {
1625 /*
1626 * Leave bulk insert mode. Note that our index would be
1627 * invalid after we correct the tree, so we copy the value
1628 * we're planning to remove and find it again after
1629 * bulk_finish.
1630 */
1631 uint8_t *value = zfs_btree_get(tree, where);
1632 uint8_t *tmp = kmem_alloc(size, KM_SLEEP);
1633 bcpy(value, tmp, size);
1634 zfs_btree_bulk_finish(tree);
1635 VERIFY3P(zfs_btree_find(tree, tmp, where), !=, NULL);
1636 kmem_free(tmp, size);
1637 hdr = where->bti_node;
1638 idx = where->bti_offset;
1639 }
1640
1641 tree->bt_num_elems--;
1642 	/*
1643 	 * If the element happens to be in a core node, we move a leaf node's
1644 	 * element into its place and then remove the leaf node element. This
1645 	 * way the rebalancing logic only ever needs to recurse upwards from
1646 	 * a leaf, never downwards.
1647 	 */
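	/*
	 * For example, to remove separator 50 from a core node we find the
	 * last element of its left subtree (which always lives in a leaf),
	 * say 47, copy 47 over 50, and then fall through below to delete 47
	 * from that leaf.
	 */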
1648 if (zfs_btree_is_core(hdr)) {
1649 zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
1650 zfs_btree_hdr_t *left_subtree = node->btc_children[idx];
1651 void *new_value = zfs_btree_last_helper(tree, left_subtree,
1652 where);
1653 ASSERT3P(new_value, !=, NULL);
1654
1655 bcpy(new_value, node->btc_elems + idx * size, size);
1656
1657 hdr = where->bti_node;
1658 idx = where->bti_offset;
1659 ASSERT(!where->bti_before);
1660 }
1661
1662 /*
1663 * First, we'll update the leaf's metadata. Then, we shift any
1664 * elements after the idx to the left. After that, we rebalance if
1665 * needed.
1666 */
1667 ASSERT(!zfs_btree_is_core(hdr));
1668 zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
1669 ASSERT3U(hdr->bth_count, >, 0);
1670
1671 uint32_t min_count = (tree->bt_leaf_cap / 2) - 1;
1672
1673 	/*
1674 	 * If we're over the minimum size or this is the root, just remove
1675 	 * the value and return.
1676 	 */
1677 if (hdr->bth_count > min_count || hdr->bth_parent == NULL) {
1678 bt_shrink_leaf(tree, leaf, idx, 1);
1679 if (hdr->bth_parent == NULL) {
1680 ASSERT0(tree->bt_height);
1681 if (hdr->bth_count == 0) {
1682 tree->bt_root = NULL;
1683 tree->bt_height--;
1684 zfs_btree_node_destroy(tree, &leaf->btl_hdr);
1685 }
1686 }
1687 zfs_btree_verify(tree);
1688 return;
1689 }
1690 ASSERT3U(hdr->bth_count, ==, min_count);
1691
1692 /*
1693 * Now we try to take a node from a sibling. We check left, then
1694 * right. If they exist and have more than the minimum number of
1695 	 * elements, we move the separator between us and them to our node
1696 * and move their closest element (last for left, first for right) to
1697 * the separator. Along the way we need to collapse the gap made by
1698 * idx, and (for our right neighbor) the gap made by removing their
1699 * first element.
1700 *
1701 * Note: this logic currently doesn't support taking from a neighbor
1702 * that isn't a sibling. This isn't critical functionality, but may be
1703 * worth implementing in the future for completeness' sake.
1704 */
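	/*
	 * A sketch of the left-neighbor case with made-up keys: given parent
	 * separator 20 and a left neighbor holding [10 12 17], the shift
	 * below covers the removed element, 20 drops into our first slot,
	 * and 17 replaces it in the parent; the right-neighbor case is the
	 * mirror image.
	 */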
1705 zfs_btree_core_t *parent = hdr->bth_parent;
1706 uint32_t parent_idx = zfs_btree_find_parent_idx(tree, hdr);
1707
1708 zfs_btree_hdr_t *l_hdr = (parent_idx == 0 ? NULL :
1709 parent->btc_children[parent_idx - 1]);
1710 if (l_hdr != NULL && l_hdr->bth_count > min_count) {
1711 /* We can take a node from the left neighbor. */
1712 ASSERT(!zfs_btree_is_core(l_hdr));
1713 zfs_btree_leaf_t *neighbor = (zfs_btree_leaf_t *)l_hdr;
1714
1715 		/*
1716 		 * Shift the elements before idx right by one spot to make
1717 		 * room for the stolen element and to cover the element being
1718 		 * removed.
		 */
1719 bt_shift_leaf(tree, leaf, 0, idx, 1, BSD_RIGHT);
1720
1721 /* Move the separator to our first spot. */
1722 uint8_t *separator = parent->btc_elems + (parent_idx - 1) *
1723 size;
1724 bcpy(separator, leaf->btl_elems + hdr->bth_first * size, size);
1725
1726 /* Move our neighbor's last element to the separator. */
1727 uint8_t *take_elem = neighbor->btl_elems +
1728 (l_hdr->bth_first + l_hdr->bth_count - 1) * size;
1729 bcpy(take_elem, separator, size);
1730
1731 /* Delete our neighbor's last element. */
1732 bt_shrink_leaf(tree, neighbor, l_hdr->bth_count - 1, 1);
1733 zfs_btree_verify(tree);
1734 return;
1735 }
1736
1737 zfs_btree_hdr_t *r_hdr = (parent_idx == parent->btc_hdr.bth_count ?
1738 NULL : parent->btc_children[parent_idx + 1]);
1739 if (r_hdr != NULL && r_hdr->bth_count > min_count) {
1740 /* We can take a node from the right neighbor. */
1741 ASSERT(!zfs_btree_is_core(r_hdr));
1742 zfs_btree_leaf_t *neighbor = (zfs_btree_leaf_t *)r_hdr;
1743
1744 		/*
1745 		 * Shift the elements after idx left by one spot to cover the
1746 		 * element being removed; this frees our last slot for the
1747 		 * separator pulled down from the parent.
1748 		 */
1749 bt_shift_leaf(tree, leaf, idx + 1, hdr->bth_count - idx - 1,
1750 1, BSD_LEFT);
1751
1752 		/* Move the separator between us and neighbor to our last spot. */
1753 uint8_t *separator = parent->btc_elems + parent_idx * size;
1754 bcpy(separator, leaf->btl_elems + (hdr->bth_first +
1755 hdr->bth_count - 1) * size, size);
1756
1757 /* Move our neighbor's first element to the separator. */
1758 uint8_t *take_elem = neighbor->btl_elems +
1759 r_hdr->bth_first * size;
1760 bcpy(take_elem, separator, size);
1761
1762 /* Delete our neighbor's first element. */
1763 bt_shrink_leaf(tree, neighbor, 0, 1);
1764 zfs_btree_verify(tree);
1765 return;
1766 }
1767
1768 	/*
1769 	 * In this case, neither of our neighbors can spare an element, so we
1770 	 * need to merge with one of them. We prefer the left one, arbitrarily.
1771 	 * First we delete the element being removed, leaving us temporarily
1772 	 * below the minimum. Then we move the separator into the leftmost
1773 	 * merging node (which may be us or the left neighbor), and move the
1774 	 * right merging node's elements in after it. Finally, we go into the
1775 	 * parent and delete the right merging node and the separator. This
1776 	 * may cause further merging.
1777 	 */
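	/*
	 * A count sketch (illustrative; both leaves hold min_count here):
	 * the shrink drops one leaf to min_count - 1, and growing the kept
	 * leaf to receive the separator plus the other leaf's elements
	 * brings it to exactly 2 * min_count, as asserted below.
	 */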
1778 zfs_btree_hdr_t *rm_hdr, *k_hdr;
1779 if (l_hdr != NULL) {
1780 k_hdr = l_hdr;
1781 rm_hdr = hdr;
1782 } else {
1783 ASSERT3P(r_hdr, !=, NULL);
1784 k_hdr = hdr;
1785 rm_hdr = r_hdr;
1786 parent_idx++;
1787 }
1788 ASSERT(!zfs_btree_is_core(k_hdr));
1789 ASSERT(!zfs_btree_is_core(rm_hdr));
1790 ASSERT3U(k_hdr->bth_count, ==, min_count);
1791 ASSERT3U(rm_hdr->bth_count, ==, min_count);
1792 zfs_btree_leaf_t *keep = (zfs_btree_leaf_t *)k_hdr;
1793 zfs_btree_leaf_t *rm = (zfs_btree_leaf_t *)rm_hdr;
1794
1795 if (zfs_btree_verify_intensity >= 5) {
1796 for (uint32_t i = 0; i < rm_hdr->bth_count + 1; i++) {
1797 zfs_btree_verify_poison_at(tree, k_hdr,
1798 k_hdr->bth_count + i);
1799 }
1800 }
1801
1802 /*
1803 * Remove the value from the node. It will go below the minimum,
1804 * but we'll fix it in no time.
1805 */
1806 bt_shrink_leaf(tree, leaf, idx, 1);
1807
1808 /* Prepare space for elements to be moved from the right. */
1809 uint32_t k_count = k_hdr->bth_count;
1810 bt_grow_leaf(tree, keep, k_count, 1 + rm_hdr->bth_count);
1811 ASSERT3U(k_hdr->bth_count, ==, min_count * 2);
1812
1813 /* Move the separator into the first open spot. */
1814 uint8_t *out = keep->btl_elems + (k_hdr->bth_first + k_count) * size;
1815 uint8_t *separator = parent->btc_elems + (parent_idx - 1) * size;
1816 bcpy(separator, out, size);
1817
1818 /* Move our elements to the left neighbor. */
1819 bt_transfer_leaf(tree, rm, 0, rm_hdr->bth_count, keep, k_count + 1);
1820
1821 /* Remove the emptied node from the parent. */
1822 zfs_btree_remove_from_node(tree, parent, rm_hdr);
1823 zfs_btree_node_destroy(tree, rm_hdr);
1824 zfs_btree_verify(tree);
1825 }
1826
1827 /* Remove the given value from the tree. */
1828 void
1829 zfs_btree_remove(zfs_btree_t *tree, const void *value)
1830 {
1831 zfs_btree_index_t where = {0};
1832 VERIFY3P(zfs_btree_find(tree, value, &where), !=, NULL);
1833 zfs_btree_remove_idx(tree, &where);
1834 }
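
/*
 * A usage sketch for zfs_btree_remove(); my_elem_t and its field are
 * hypothetical:
 *
 *	my_elem_t key = { .me_id = 42 };
 *	zfs_btree_remove(tree, &key);
 *
 * The value must be present in the tree; the lookup above is VERIFYed.
 */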
1835
1836 /* Return the number of elements in the tree. */
1837 ulong_t
1838 zfs_btree_numnodes(zfs_btree_t *tree)
1839 {
1840 return (tree->bt_num_elems);
1841 }
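
/*
 * Note that despite its name (which appears to mirror avl_numnodes()),
 * zfs_btree_numnodes() returns the number of elements, not nodes.
 */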
1842
1843 /*
1844 * This function is used to visit all the elements in the tree before
1845 * destroying the tree. This allows the calling code to perform any cleanup it
1846 * needs to do. This is more efficient than just removing the first element
1847  * over and over, because it avoids all the rebalancing. Once destroy_nodes()
1848  * has been called, no other btree operations are valid until it returns
1849  * NULL, at which point the only valid operation is zfs_btree_destroy().
1850 *
1851 * example:
1852 *
1853 * zfs_btree_index_t *cookie = NULL;
1854 * my_data_t *node;
1855 *
1856 * while ((node = zfs_btree_destroy_nodes(tree, &cookie)) != NULL)
1857 * free(node->ptr);
1858 * zfs_btree_destroy(tree);
1859 *
1860 */
1861 void *
1862 zfs_btree_destroy_nodes(zfs_btree_t *tree, zfs_btree_index_t **cookie)
1863 {
1864 if (*cookie == NULL) {
1865 if (tree->bt_height == -1)
1866 return (NULL);
1867 *cookie = kmem_alloc(sizeof (**cookie), KM_SLEEP);
1868 return (zfs_btree_first(tree, *cookie));
1869 }
1870
1871 void *rval = zfs_btree_next_helper(tree, *cookie, *cookie,
1872 zfs_btree_node_destroy);
1873 if (rval == NULL) {
1874 tree->bt_root = NULL;
1875 tree->bt_height = -1;
1876 tree->bt_num_elems = 0;
1877 kmem_free(*cookie, sizeof (**cookie));
1878 tree->bt_bulk = NULL;
1879 }
1880 return (rval);
1881 }
1882
1883 static void
1884 zfs_btree_clear_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
1885 {
1886 if (zfs_btree_is_core(hdr)) {
1887 zfs_btree_core_t *btc = (zfs_btree_core_t *)hdr;
1888 for (uint32_t i = 0; i <= hdr->bth_count; i++)
1889 zfs_btree_clear_helper(tree, btc->btc_children[i]);
1890 }
1891
1892 zfs_btree_node_destroy(tree, hdr);
1893 }
1894
1895 void
1896 zfs_btree_clear(zfs_btree_t *tree)
1897 {
1898 if (tree->bt_root == NULL) {
1899 ASSERT0(tree->bt_num_elems);
1900 return;
1901 }
1902
1903 zfs_btree_clear_helper(tree, tree->bt_root);
1904 tree->bt_num_elems = 0;
1905 tree->bt_root = NULL;
1906 tree->bt_num_nodes = 0;
1907 tree->bt_height = -1;
1908 tree->bt_bulk = NULL;
1909 }
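
/*
 * Unlike zfs_btree_destroy_nodes(), zfs_btree_clear() frees the nodes
 * without visiting the elements, so it is only suitable when the elements
 * themselves require no cleanup.
 */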
1910
1911 void
1912 zfs_btree_destroy(zfs_btree_t *tree)
1913 {
1914 ASSERT0(tree->bt_num_elems);
1915 ASSERT3P(tree->bt_root, ==, NULL);
1916 }
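
/*
 * A typical teardown sequence (sketch): the tree must already be empty,
 * since zfs_btree_destroy() asserts that no elements remain.
 *
 *	zfs_btree_clear(tree);
 *	zfs_btree_destroy(tree);
 */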
1917
1918 /* Verify that every child of this node has the correct parent pointer. */
1919 static void
1920 zfs_btree_verify_pointers_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
1921 {
1922 if (!zfs_btree_is_core(hdr))
1923 return;
1924
1925 zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
1926 for (uint32_t i = 0; i <= hdr->bth_count; i++) {
1927 VERIFY3P(node->btc_children[i]->bth_parent, ==, hdr);
1928 zfs_btree_verify_pointers_helper(tree, node->btc_children[i]);
1929 }
1930 }
1931
1932 /* Verify that every node has the correct parent pointer. */
1933 static void
1934 zfs_btree_verify_pointers(zfs_btree_t *tree)
1935 {
1936 if (tree->bt_height == -1) {
1937 VERIFY3P(tree->bt_root, ==, NULL);
1938 return;
1939 }
1940 VERIFY3P(tree->bt_root->bth_parent, ==, NULL);
1941 zfs_btree_verify_pointers_helper(tree, tree->bt_root);
1942 }
1943
1944 /*
1945 * Verify that all the current node and its children satisfy the count
1946 * invariants, and return the total count in the subtree rooted in this node.
1947 */
1948 static uint64_t
1949 zfs_btree_verify_counts_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
1950 {
1951 if (!zfs_btree_is_core(hdr)) {
1952 		if (tree->bt_root != hdr && (tree->bt_bulk == NULL ||
1953 		    hdr != &tree->bt_bulk->btl_hdr)) {
1954 			VERIFY3U(hdr->bth_count, >=, tree->bt_leaf_cap / 2 - 1);
1955 }
1956
1957 return (hdr->bth_count);
1958 	} else {
1960 		zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
1961 uint64_t ret = hdr->bth_count;
1962 if (tree->bt_root != hdr && tree->bt_bulk == NULL)
1963 			VERIFY3U(hdr->bth_count, >=, BTREE_CORE_ELEMS / 2 - 1);
1964 for (uint32_t i = 0; i <= hdr->bth_count; i++) {
1965 ret += zfs_btree_verify_counts_helper(tree,
1966 node->btc_children[i]);
1967 }
1968
1969 return (ret);
1970 }
1971 }
1972
1973 /*
1974 * Verify that all nodes satisfy the invariants and that the total number of
1975 * elements is correct.
1976 */
1977 static void
zfs_btree_verify_counts(zfs_btree_t * tree)1978 zfs_btree_verify_counts(zfs_btree_t *tree)
1979 {
1980 EQUIV(tree->bt_num_elems == 0, tree->bt_height == -1);
1981 if (tree->bt_height == -1) {
1982 return;
1983 }
1984 VERIFY3P(zfs_btree_verify_counts_helper(tree, tree->bt_root), ==,
1985 tree->bt_num_elems);
1986 }
1987
1988 /*
1989 * Check that the subtree rooted at this node has a uniform height. Returns
1990 * the number of nodes under this node, to help verify bt_num_nodes.
1991 */
1992 static uint64_t
1993 zfs_btree_verify_height_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr,
1994 int64_t height)
1995 {
1996 if (!zfs_btree_is_core(hdr)) {
1997 VERIFY0(height);
1998 return (1);
1999 }
2000
2001 zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
2002 uint64_t ret = 1;
2003 for (uint32_t i = 0; i <= hdr->bth_count; i++) {
2004 ret += zfs_btree_verify_height_helper(tree,
2005 node->btc_children[i], height - 1);
2006 }
2007 return (ret);
2008 }
2009
2010 /*
2011 * Check that the tree rooted at this node has a uniform height, and that the
2012 * bt_height in the tree is correct.
2013 */
2014 static void
2015 zfs_btree_verify_height(zfs_btree_t *tree)
2016 {
2017 EQUIV(tree->bt_height == -1, tree->bt_root == NULL);
2018 if (tree->bt_height == -1) {
2019 return;
2020 }
2021
2022 VERIFY3U(zfs_btree_verify_height_helper(tree, tree->bt_root,
2023 tree->bt_height), ==, tree->bt_num_nodes);
2024 }
2025
2026 /*
2027 * Check that the elements in this node are sorted, and that if this is a core
2028  * node, the separators are properly between the subtrees they separate and
2029 * that the children also satisfy this requirement.
2030 */
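/*
 * In symbols, the checks below enforce, for each separator E[i] of a core
 * node,
 *
 *	last(child[i]) < E[i] < first(child[i + 1])
 *
 * in addition to E[0] < E[1] < ... within every node.
 */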
2031 static void
2032 zfs_btree_verify_order_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
2033 {
2034 size_t size = tree->bt_elem_size;
2035 if (!zfs_btree_is_core(hdr)) {
2036 zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
2037 for (uint32_t i = 1; i < hdr->bth_count; i++) {
2038 VERIFY3S(tree->bt_compar(leaf->btl_elems +
2039 (hdr->bth_first + i - 1) * size,
2040 leaf->btl_elems +
2041 (hdr->bth_first + i) * size), ==, -1);
2042 }
2043 return;
2044 }
2045
2046 zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
2047 for (uint32_t i = 1; i < hdr->bth_count; i++) {
2048 VERIFY3S(tree->bt_compar(node->btc_elems + (i - 1) * size,
2049 node->btc_elems + i * size), ==, -1);
2050 }
2051 for (uint32_t i = 0; i < hdr->bth_count; i++) {
2052 uint8_t *left_child_last = NULL;
2053 zfs_btree_hdr_t *left_child_hdr = node->btc_children[i];
2054 if (zfs_btree_is_core(left_child_hdr)) {
2055 zfs_btree_core_t *left_child =
2056 (zfs_btree_core_t *)left_child_hdr;
2057 left_child_last = left_child->btc_elems +
2058 (left_child_hdr->bth_count - 1) * size;
2059 } else {
2060 zfs_btree_leaf_t *left_child =
2061 (zfs_btree_leaf_t *)left_child_hdr;
2062 left_child_last = left_child->btl_elems +
2063 (left_child_hdr->bth_first +
2064 left_child_hdr->bth_count - 1) * size;
2065 }
2066 int comp = tree->bt_compar(node->btc_elems + i * size,
2067 left_child_last);
2068 if (comp <= 0) {
2069 panic("btree: compar returned %d (expected 1) at "
2070 "%px %d: compar(%px, %px)", comp, node, i,
2071 node->btc_elems + i * size, left_child_last);
2072 }
2073
2074 uint8_t *right_child_first = NULL;
2075 zfs_btree_hdr_t *right_child_hdr = node->btc_children[i + 1];
2076 if (zfs_btree_is_core(right_child_hdr)) {
2077 zfs_btree_core_t *right_child =
2078 (zfs_btree_core_t *)right_child_hdr;
2079 right_child_first = right_child->btc_elems;
2080 } else {
2081 zfs_btree_leaf_t *right_child =
2082 (zfs_btree_leaf_t *)right_child_hdr;
2083 right_child_first = right_child->btl_elems +
2084 right_child_hdr->bth_first * size;
2085 }
2086 comp = tree->bt_compar(node->btc_elems + i * size,
2087 right_child_first);
2088 if (comp >= 0) {
2089 panic("btree: compar returned %d (expected -1) at "
2090 "%px %d: compar(%px, %px)", comp, node, i,
2091 node->btc_elems + i * size, right_child_first);
2092 }
2093 }
2094 for (uint32_t i = 0; i <= hdr->bth_count; i++)
2095 zfs_btree_verify_order_helper(tree, node->btc_children[i]);
2096 }
2097
2098 /* Check that all elements in the tree are in sorted order. */
2099 static void
2100 zfs_btree_verify_order(zfs_btree_t *tree)
2101 {
2102 EQUIV(tree->bt_height == -1, tree->bt_root == NULL);
2103 if (tree->bt_height == -1) {
2104 return;
2105 }
2106
2107 zfs_btree_verify_order_helper(tree, tree->bt_root);
2108 }
2109
2110 #ifdef ZFS_DEBUG
2111 /* Check that all unused memory is poisoned correctly. */
2112 static void
2113 zfs_btree_verify_poison_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
2114 {
2115 size_t size = tree->bt_elem_size;
2116 if (!zfs_btree_is_core(hdr)) {
2117 zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
2118 for (size_t i = 0; i < hdr->bth_first * size; i++)
2119 VERIFY3U(leaf->btl_elems[i], ==, 0x0f);
2120 for (size_t i = (hdr->bth_first + hdr->bth_count) * size;
2121 i < BTREE_LEAF_ESIZE; i++)
2122 VERIFY3U(leaf->btl_elems[i], ==, 0x0f);
2123 } else {
2124 zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
2125 for (size_t i = hdr->bth_count * size;
2126 i < BTREE_CORE_ELEMS * size; i++)
2127 VERIFY3U(node->btc_elems[i], ==, 0x0f);
2128
2129 for (uint32_t i = hdr->bth_count + 1; i <= BTREE_CORE_ELEMS;
2130 i++) {
2131 VERIFY3P(node->btc_children[i], ==,
2132 (zfs_btree_hdr_t *)BTREE_POISON);
2133 }
2134
2135 for (uint32_t i = 0; i <= hdr->bth_count; i++) {
2136 zfs_btree_verify_poison_helper(tree,
2137 node->btc_children[i]);
2138 }
2139 }
2140 }
2141 #endif
2142
2143 /* Check that unused memory in the tree is still poisoned. */
2144 static void
2145 zfs_btree_verify_poison(zfs_btree_t *tree)
2146 {
2147 #ifdef ZFS_DEBUG
2148 if (tree->bt_height == -1)
2149 return;
2150 zfs_btree_verify_poison_helper(tree, tree->bt_root);
2151 #endif
2152 }
2153
2154 void
2155 zfs_btree_verify(zfs_btree_t *tree)
2156 {
2157 if (zfs_btree_verify_intensity == 0)
2158 return;
2159 zfs_btree_verify_height(tree);
2160 if (zfs_btree_verify_intensity == 1)
2161 return;
2162 zfs_btree_verify_pointers(tree);
2163 if (zfs_btree_verify_intensity == 2)
2164 return;
2165 zfs_btree_verify_counts(tree);
2166 if (zfs_btree_verify_intensity == 3)
2167 return;
2168 zfs_btree_verify_order(tree);
2169
2170 if (zfs_btree_verify_intensity == 4)
2171 return;
2172 zfs_btree_verify_poison(tree);
2173 }
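
/*
 * zfs_btree_verify_intensity defaults to 0, so all of the above is skipped;
 * a debug build can raise it (e.g. to 5, from a debugger or by changing the
 * initializer) to enable every check, at a substantial performance cost.
 */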
2174