1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright (C) 2011 Red Hat, Inc. 4 * 5 * This file is released under the GPL. 6 */ 7 8 #include "dm-btree.h" 9 #include "dm-btree-internal.h" 10 #include "dm-transaction-manager.h" 11 12 #include <linux/export.h> 13 #include <linux/device-mapper.h> 14 15 #define DM_MSG_PREFIX "btree" 16 17 /* 18 * Removing an entry from a btree 19 * ============================== 20 * 21 * A very important constraint for our btree is that no node, except the 22 * root, may have fewer than a certain number of entries. 23 * (MIN_ENTRIES <= nr_entries <= MAX_ENTRIES). 24 * 25 * Ensuring this is complicated by the way we want to only ever hold the 26 * locks on 2 nodes concurrently, and only change nodes in a top to bottom 27 * fashion. 28 * 29 * Each node may have a left or right sibling. When decending the spine, 30 * if a node contains only MIN_ENTRIES then we try and increase this to at 31 * least MIN_ENTRIES + 1. We do this in the following ways: 32 * 33 * [A] No siblings => this can only happen if the node is the root, in which 34 * case we copy the childs contents over the root. 35 * 36 * [B] No left sibling 37 * ==> rebalance(node, right sibling) 38 * 39 * [C] No right sibling 40 * ==> rebalance(left sibling, node) 41 * 42 * [D] Both siblings, total_entries(left, node, right) <= DEL_THRESHOLD 43 * ==> delete node adding it's contents to left and right 44 * 45 * [E] Both siblings, total_entries(left, node, right) > DEL_THRESHOLD 46 * ==> rebalance(left, node, right) 47 * 48 * After these operations it's possible that the our original node no 49 * longer contains the desired sub tree. For this reason this rebalancing 50 * is performed on the children of the current node. This also avoids 51 * having a special case for the root. 52 * 53 * Once this rebalancing has occurred we can then step into the child node 54 * for internal nodes. Or delete the entry for leaf nodes. 55 */ 56 57 /* 58 * Some little utilities for moving node data around. 59 */ 60 static void node_shift(struct btree_node *n, int shift) 61 { 62 uint32_t nr_entries = le32_to_cpu(n->header.nr_entries); 63 uint32_t value_size = le32_to_cpu(n->header.value_size); 64 65 if (shift < 0) { 66 shift = -shift; 67 BUG_ON(shift > nr_entries); 68 BUG_ON((void *) key_ptr(n, shift) >= value_ptr(n, shift)); 69 memmove(key_ptr(n, 0), 70 key_ptr(n, shift), 71 (nr_entries - shift) * sizeof(__le64)); 72 memmove(value_ptr(n, 0), 73 value_ptr(n, shift), 74 (nr_entries - shift) * value_size); 75 } else { 76 BUG_ON(nr_entries + shift > le32_to_cpu(n->header.max_entries)); 77 memmove(key_ptr(n, shift), 78 key_ptr(n, 0), 79 nr_entries * sizeof(__le64)); 80 memmove(value_ptr(n, shift), 81 value_ptr(n, 0), 82 nr_entries * value_size); 83 } 84 } 85 86 static int node_copy(struct btree_node *left, struct btree_node *right, int shift) 87 { 88 uint32_t nr_left = le32_to_cpu(left->header.nr_entries); 89 uint32_t value_size = le32_to_cpu(left->header.value_size); 90 91 if (value_size != le32_to_cpu(right->header.value_size)) { 92 DMERR("mismatched value size"); 93 return -EILSEQ; 94 } 95 96 if (shift < 0) { 97 shift = -shift; 98 99 if (nr_left + shift > le32_to_cpu(left->header.max_entries)) { 100 DMERR("bad shift"); 101 return -EINVAL; 102 } 103 104 memcpy(key_ptr(left, nr_left), 105 key_ptr(right, 0), 106 shift * sizeof(__le64)); 107 memcpy(value_ptr(left, nr_left), 108 value_ptr(right, 0), 109 shift * value_size); 110 } else { 111 if (shift > le32_to_cpu(right->header.max_entries)) { 112 DMERR("bad shift"); 113 return -EINVAL; 114 } 115 116 memcpy(key_ptr(right, 0), 117 key_ptr(left, nr_left - shift), 118 shift * sizeof(__le64)); 119 memcpy(value_ptr(right, 0), 120 value_ptr(left, nr_left - shift), 121 shift * value_size); 122 } 123 return 0; 124 } 125 126 /* 127 * Delete a specific entry from a leaf node. 128 */ 129 static void delete_at(struct btree_node *n, unsigned int index) 130 { 131 unsigned int nr_entries = le32_to_cpu(n->header.nr_entries); 132 unsigned int nr_to_copy = nr_entries - (index + 1); 133 uint32_t value_size = le32_to_cpu(n->header.value_size); 134 135 BUG_ON(index >= nr_entries); 136 137 if (nr_to_copy) { 138 memmove(key_ptr(n, index), 139 key_ptr(n, index + 1), 140 nr_to_copy * sizeof(__le64)); 141 142 memmove(value_ptr(n, index), 143 value_ptr(n, index + 1), 144 nr_to_copy * value_size); 145 } 146 147 n->header.nr_entries = cpu_to_le32(nr_entries - 1); 148 } 149 150 static unsigned int merge_threshold(struct btree_node *n) 151 { 152 return le32_to_cpu(n->header.max_entries) / 3; 153 } 154 155 struct child { 156 unsigned int index; 157 struct dm_block *block; 158 struct btree_node *n; 159 }; 160 161 static int init_child(struct dm_btree_info *info, struct dm_btree_value_type *vt, 162 struct btree_node *parent, 163 unsigned int index, struct child *result) 164 { 165 int r, inc; 166 dm_block_t root; 167 168 result->index = index; 169 root = value64(parent, index); 170 171 r = dm_tm_shadow_block(info->tm, root, &btree_node_validator, 172 &result->block, &inc); 173 if (r) 174 return r; 175 176 result->n = dm_block_data(result->block); 177 178 if (inc) 179 inc_children(info->tm, result->n, vt); 180 181 *((__le64 *) value_ptr(parent, index)) = 182 cpu_to_le64(dm_block_location(result->block)); 183 184 return 0; 185 } 186 187 static void exit_child(struct dm_btree_info *info, struct child *c) 188 { 189 dm_tm_unlock(info->tm, c->block); 190 } 191 192 static int shift(struct btree_node *left, struct btree_node *right, int count) 193 { 194 int r; 195 uint32_t nr_left = le32_to_cpu(left->header.nr_entries); 196 uint32_t nr_right = le32_to_cpu(right->header.nr_entries); 197 uint32_t max_entries = le32_to_cpu(left->header.max_entries); 198 uint32_t r_max_entries = le32_to_cpu(right->header.max_entries); 199 200 if (max_entries != r_max_entries) { 201 DMERR("node max_entries mismatch"); 202 return -EILSEQ; 203 } 204 205 if (nr_left - count > max_entries) { 206 DMERR("node shift out of bounds"); 207 return -EINVAL; 208 } 209 210 if (nr_right + count > max_entries) { 211 DMERR("node shift out of bounds"); 212 return -EINVAL; 213 } 214 215 if (!count) 216 return 0; 217 218 if (count > 0) { 219 node_shift(right, count); 220 r = node_copy(left, right, count); 221 if (r) 222 return r; 223 } else { 224 r = node_copy(left, right, count); 225 if (r) 226 return r; 227 node_shift(right, count); 228 } 229 230 left->header.nr_entries = cpu_to_le32(nr_left - count); 231 right->header.nr_entries = cpu_to_le32(nr_right + count); 232 233 return 0; 234 } 235 236 static int __rebalance2(struct dm_btree_info *info, struct btree_node *parent, 237 struct child *l, struct child *r) 238 { 239 int ret; 240 struct btree_node *left = l->n; 241 struct btree_node *right = r->n; 242 uint32_t nr_left = le32_to_cpu(left->header.nr_entries); 243 uint32_t nr_right = le32_to_cpu(right->header.nr_entries); 244 /* 245 * Ensure the number of entries in each child will be greater 246 * than or equal to (max_entries / 3 + 1), so no matter which 247 * child is used for removal, the number will still be not 248 * less than (max_entries / 3). 249 */ 250 unsigned int threshold = 2 * (merge_threshold(left) + 1); 251 252 if (nr_left + nr_right < threshold) { 253 /* 254 * Merge 255 */ 256 node_copy(left, right, -nr_right); 257 left->header.nr_entries = cpu_to_le32(nr_left + nr_right); 258 delete_at(parent, r->index); 259 260 /* 261 * We need to decrement the right block, but not it's 262 * children, since they're still referenced by left. 263 */ 264 dm_tm_dec(info->tm, dm_block_location(r->block)); 265 } else { 266 /* 267 * Rebalance. 268 */ 269 unsigned int target_left = (nr_left + nr_right) / 2; 270 271 ret = shift(left, right, nr_left - target_left); 272 if (ret) 273 return ret; 274 *key_ptr(parent, r->index) = right->keys[0]; 275 } 276 return 0; 277 } 278 279 static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info, 280 struct dm_btree_value_type *vt, unsigned int left_index) 281 { 282 int r; 283 struct btree_node *parent; 284 struct child left, right; 285 286 parent = dm_block_data(shadow_current(s)); 287 288 r = init_child(info, vt, parent, left_index, &left); 289 if (r) 290 return r; 291 292 r = init_child(info, vt, parent, left_index + 1, &right); 293 if (r) { 294 exit_child(info, &left); 295 return r; 296 } 297 298 r = __rebalance2(info, parent, &left, &right); 299 300 exit_child(info, &left); 301 exit_child(info, &right); 302 303 return r; 304 } 305 306 /* 307 * We dump as many entries from center as possible into left, then the rest 308 * in right, then rebalance2. This wastes some cpu, but I want something 309 * simple atm. 310 */ 311 static int delete_center_node(struct dm_btree_info *info, struct btree_node *parent, 312 struct child *l, struct child *c, struct child *r, 313 struct btree_node *left, struct btree_node *center, struct btree_node *right, 314 uint32_t nr_left, uint32_t nr_center, uint32_t nr_right) 315 { 316 uint32_t max_entries = le32_to_cpu(left->header.max_entries); 317 unsigned int shift = min(max_entries - nr_left, nr_center); 318 319 if (nr_left + shift > max_entries) { 320 DMERR("node shift out of bounds"); 321 return -EINVAL; 322 } 323 324 node_copy(left, center, -shift); 325 left->header.nr_entries = cpu_to_le32(nr_left + shift); 326 327 if (shift != nr_center) { 328 shift = nr_center - shift; 329 330 if ((nr_right + shift) > max_entries) { 331 DMERR("node shift out of bounds"); 332 return -EINVAL; 333 } 334 335 node_shift(right, shift); 336 node_copy(center, right, shift); 337 right->header.nr_entries = cpu_to_le32(nr_right + shift); 338 } 339 *key_ptr(parent, r->index) = right->keys[0]; 340 341 delete_at(parent, c->index); 342 r->index--; 343 344 dm_tm_dec(info->tm, dm_block_location(c->block)); 345 return __rebalance2(info, parent, l, r); 346 } 347 348 /* 349 * Redistributes entries among 3 sibling nodes. 350 */ 351 static int redistribute3(struct dm_btree_info *info, struct btree_node *parent, 352 struct child *l, struct child *c, struct child *r, 353 struct btree_node *left, struct btree_node *center, struct btree_node *right, 354 uint32_t nr_left, uint32_t nr_center, uint32_t nr_right) 355 { 356 int s, ret; 357 uint32_t max_entries = le32_to_cpu(left->header.max_entries); 358 unsigned int total = nr_left + nr_center + nr_right; 359 unsigned int target_right = total / 3; 360 unsigned int remainder = (target_right * 3) != total; 361 unsigned int target_left = target_right + remainder; 362 363 BUG_ON(target_left > max_entries); 364 BUG_ON(target_right > max_entries); 365 366 if (nr_left < nr_right) { 367 s = nr_left - target_left; 368 369 if (s < 0 && nr_center < -s) { 370 /* not enough in central node */ 371 ret = shift(left, center, -nr_center); 372 if (ret) 373 return ret; 374 375 s += nr_center; 376 ret = shift(left, right, s); 377 if (ret) 378 return ret; 379 380 nr_right += s; 381 } else { 382 ret = shift(left, center, s); 383 if (ret) 384 return ret; 385 } 386 387 ret = shift(center, right, target_right - nr_right); 388 if (ret) 389 return ret; 390 } else { 391 s = target_right - nr_right; 392 if (s > 0 && nr_center < s) { 393 /* not enough in central node */ 394 ret = shift(center, right, nr_center); 395 if (ret) 396 return ret; 397 s -= nr_center; 398 ret = shift(left, right, s); 399 if (ret) 400 return ret; 401 nr_left -= s; 402 } else { 403 ret = shift(center, right, s); 404 if (ret) 405 return ret; 406 } 407 408 ret = shift(left, center, nr_left - target_left); 409 if (ret) 410 return ret; 411 } 412 413 *key_ptr(parent, c->index) = center->keys[0]; 414 *key_ptr(parent, r->index) = right->keys[0]; 415 return 0; 416 } 417 418 static int __rebalance3(struct dm_btree_info *info, struct btree_node *parent, 419 struct child *l, struct child *c, struct child *r) 420 { 421 struct btree_node *left = l->n; 422 struct btree_node *center = c->n; 423 struct btree_node *right = r->n; 424 425 uint32_t nr_left = le32_to_cpu(left->header.nr_entries); 426 uint32_t nr_center = le32_to_cpu(center->header.nr_entries); 427 uint32_t nr_right = le32_to_cpu(right->header.nr_entries); 428 429 unsigned int threshold = merge_threshold(left) * 4 + 1; 430 431 if ((left->header.max_entries != center->header.max_entries) || 432 (center->header.max_entries != right->header.max_entries)) { 433 DMERR("bad btree metadata, max_entries differ"); 434 return -EILSEQ; 435 } 436 437 if ((nr_left + nr_center + nr_right) < threshold) { 438 return delete_center_node(info, parent, l, c, r, left, center, right, 439 nr_left, nr_center, nr_right); 440 } 441 442 return redistribute3(info, parent, l, c, r, left, center, right, 443 nr_left, nr_center, nr_right); 444 } 445 446 static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info, 447 struct dm_btree_value_type *vt, unsigned int left_index) 448 { 449 int r; 450 struct btree_node *parent = dm_block_data(shadow_current(s)); 451 struct child left, center, right; 452 453 /* 454 * FIXME: fill out an array? 455 */ 456 r = init_child(info, vt, parent, left_index, &left); 457 if (r) 458 return r; 459 460 r = init_child(info, vt, parent, left_index + 1, ¢er); 461 if (r) { 462 exit_child(info, &left); 463 return r; 464 } 465 466 r = init_child(info, vt, parent, left_index + 2, &right); 467 if (r) { 468 exit_child(info, &left); 469 exit_child(info, ¢er); 470 return r; 471 } 472 473 r = __rebalance3(info, parent, &left, ¢er, &right); 474 475 exit_child(info, &left); 476 exit_child(info, ¢er); 477 exit_child(info, &right); 478 479 return r; 480 } 481 482 static int rebalance_children(struct shadow_spine *s, 483 struct dm_btree_info *info, 484 struct dm_btree_value_type *vt, uint64_t key) 485 { 486 int i, r, has_left_sibling, has_right_sibling; 487 struct btree_node *n; 488 489 n = dm_block_data(shadow_current(s)); 490 491 if (le32_to_cpu(n->header.nr_entries) == 1) { 492 struct dm_block *child; 493 int is_shared; 494 dm_block_t b = value64(n, 0); 495 496 r = dm_tm_block_is_shared(info->tm, b, &is_shared); 497 if (r) 498 return r; 499 500 r = dm_tm_read_lock(info->tm, b, &btree_node_validator, &child); 501 if (r) 502 return r; 503 504 if (is_shared) 505 inc_children(info->tm, dm_block_data(child), vt); 506 507 memcpy(n, dm_block_data(child), 508 dm_bm_block_size(dm_tm_get_bm(info->tm))); 509 510 dm_tm_dec(info->tm, dm_block_location(child)); 511 dm_tm_unlock(info->tm, child); 512 return 0; 513 } 514 515 i = lower_bound(n, key); 516 if (i < 0) 517 return -ENODATA; 518 519 has_left_sibling = i > 0; 520 has_right_sibling = i < (le32_to_cpu(n->header.nr_entries) - 1); 521 522 if (!has_left_sibling) 523 r = rebalance2(s, info, vt, i); 524 525 else if (!has_right_sibling) 526 r = rebalance2(s, info, vt, i - 1); 527 528 else 529 r = rebalance3(s, info, vt, i - 1); 530 531 return r; 532 } 533 534 static int do_leaf(struct btree_node *n, uint64_t key, unsigned int *index) 535 { 536 int i = lower_bound(n, key); 537 538 if ((i < 0) || 539 (i >= le32_to_cpu(n->header.nr_entries)) || 540 (le64_to_cpu(n->keys[i]) != key)) 541 return -ENODATA; 542 543 *index = i; 544 545 return 0; 546 } 547 548 /* 549 * Prepares for removal from one level of the hierarchy. The caller must 550 * call delete_at() to remove the entry at index. 551 */ 552 static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info, 553 struct dm_btree_value_type *vt, dm_block_t root, 554 uint64_t key, unsigned int *index) 555 { 556 int i = *index, r; 557 struct btree_node *n; 558 559 for (;;) { 560 r = shadow_step(s, root, vt); 561 if (r < 0) 562 break; 563 564 /* 565 * We have to patch up the parent node, ugly, but I don't 566 * see a way to do this automatically as part of the spine 567 * op. 568 */ 569 if (shadow_has_parent(s)) { 570 __le64 location = cpu_to_le64(dm_block_location(shadow_current(s))); 571 572 memcpy(value_ptr(dm_block_data(shadow_parent(s)), i), 573 &location, sizeof(__le64)); 574 } 575 576 n = dm_block_data(shadow_current(s)); 577 578 if (le32_to_cpu(n->header.flags) & LEAF_NODE) 579 return do_leaf(n, key, index); 580 581 r = rebalance_children(s, info, vt, key); 582 if (r) 583 break; 584 585 n = dm_block_data(shadow_current(s)); 586 if (le32_to_cpu(n->header.flags) & LEAF_NODE) 587 return do_leaf(n, key, index); 588 589 i = lower_bound(n, key); 590 591 /* 592 * We know the key is present, or else 593 * rebalance_children would have returned 594 * -ENODATA 595 */ 596 root = value64(n, i); 597 } 598 599 return r; 600 } 601 602 int dm_btree_remove(struct dm_btree_info *info, dm_block_t root, 603 uint64_t *keys, dm_block_t *new_root) 604 { 605 unsigned int level, last_level = info->levels - 1; 606 int index = 0, r = 0; 607 struct shadow_spine spine; 608 struct btree_node *n; 609 struct dm_btree_value_type le64_vt; 610 611 init_le64_type(info->tm, &le64_vt); 612 init_shadow_spine(&spine, info); 613 for (level = 0; level < info->levels; level++) { 614 r = remove_raw(&spine, info, 615 (level == last_level ? 616 &info->value_type : &le64_vt), 617 root, keys[level], (unsigned int *)&index); 618 if (r < 0) 619 break; 620 621 n = dm_block_data(shadow_current(&spine)); 622 if (level != last_level) { 623 root = value64(n, index); 624 continue; 625 } 626 627 BUG_ON(index < 0 || index >= le32_to_cpu(n->header.nr_entries)); 628 629 if (info->value_type.dec) 630 info->value_type.dec(info->value_type.context, 631 value_ptr(n, index), 1); 632 633 delete_at(n, index); 634 } 635 636 if (!r) 637 *new_root = shadow_root(&spine); 638 exit_shadow_spine(&spine); 639 640 return r; 641 } 642 EXPORT_SYMBOL_GPL(dm_btree_remove); 643 644 /*----------------------------------------------------------------*/ 645 646 static int remove_nearest(struct shadow_spine *s, struct dm_btree_info *info, 647 struct dm_btree_value_type *vt, dm_block_t root, 648 uint64_t key, int *index) 649 { 650 int i = *index, r; 651 struct btree_node *n; 652 653 for (;;) { 654 r = shadow_step(s, root, vt); 655 if (r < 0) 656 break; 657 658 /* 659 * We have to patch up the parent node, ugly, but I don't 660 * see a way to do this automatically as part of the spine 661 * op. 662 */ 663 if (shadow_has_parent(s)) { 664 __le64 location = cpu_to_le64(dm_block_location(shadow_current(s))); 665 666 memcpy(value_ptr(dm_block_data(shadow_parent(s)), i), 667 &location, sizeof(__le64)); 668 } 669 670 n = dm_block_data(shadow_current(s)); 671 672 if (le32_to_cpu(n->header.flags) & LEAF_NODE) { 673 *index = lower_bound(n, key); 674 return 0; 675 } 676 677 r = rebalance_children(s, info, vt, key); 678 if (r) 679 break; 680 681 n = dm_block_data(shadow_current(s)); 682 if (le32_to_cpu(n->header.flags) & LEAF_NODE) { 683 *index = lower_bound(n, key); 684 return 0; 685 } 686 687 i = lower_bound(n, key); 688 689 /* 690 * We know the key is present, or else 691 * rebalance_children would have returned 692 * -ENODATA 693 */ 694 root = value64(n, i); 695 } 696 697 return r; 698 } 699 700 static int remove_one(struct dm_btree_info *info, dm_block_t root, 701 uint64_t *keys, uint64_t end_key, 702 dm_block_t *new_root, unsigned int *nr_removed) 703 { 704 unsigned int level, last_level = info->levels - 1; 705 int index = 0, r = 0; 706 struct shadow_spine spine; 707 struct btree_node *n; 708 struct dm_btree_value_type le64_vt; 709 uint64_t k; 710 711 init_le64_type(info->tm, &le64_vt); 712 init_shadow_spine(&spine, info); 713 for (level = 0; level < last_level; level++) { 714 r = remove_raw(&spine, info, &le64_vt, 715 root, keys[level], (unsigned int *) &index); 716 if (r < 0) 717 goto out; 718 719 n = dm_block_data(shadow_current(&spine)); 720 root = value64(n, index); 721 } 722 723 r = remove_nearest(&spine, info, &info->value_type, 724 root, keys[last_level], &index); 725 if (r < 0) 726 goto out; 727 728 n = dm_block_data(shadow_current(&spine)); 729 730 if (index < 0) 731 index = 0; 732 733 if (index >= le32_to_cpu(n->header.nr_entries)) { 734 r = -ENODATA; 735 goto out; 736 } 737 738 k = le64_to_cpu(n->keys[index]); 739 if (k >= keys[last_level] && k < end_key) { 740 if (info->value_type.dec) 741 info->value_type.dec(info->value_type.context, 742 value_ptr(n, index), 1); 743 744 delete_at(n, index); 745 keys[last_level] = k + 1ull; 746 747 } else 748 r = -ENODATA; 749 750 out: 751 *new_root = shadow_root(&spine); 752 exit_shadow_spine(&spine); 753 754 return r; 755 } 756 757 int dm_btree_remove_leaves(struct dm_btree_info *info, dm_block_t root, 758 uint64_t *first_key, uint64_t end_key, 759 dm_block_t *new_root, unsigned int *nr_removed) 760 { 761 int r; 762 763 *nr_removed = 0; 764 do { 765 r = remove_one(info, root, first_key, end_key, &root, nr_removed); 766 if (!r) 767 (*nr_removed)++; 768 } while (!r); 769 770 *new_root = root; 771 return r == -ENODATA ? 0 : r; 772 } 773 EXPORT_SYMBOL_GPL(dm_btree_remove_leaves); 774