// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2011 Red Hat, Inc.
 *
 * This file is released under the GPL.
 */

#include "dm-btree-internal.h"
#include "dm-space-map.h"
#include "dm-transaction-manager.h"

#include <linux/export.h>
#include <linux/device-mapper.h>

#define DM_MSG_PREFIX "btree"

/*
 *--------------------------------------------------------------
 * Array manipulation
 *--------------------------------------------------------------
 */
static void memcpy_disk(void *dest, const void *src, size_t len)
	__dm_written_to_disk(src)
{
	memcpy(dest, src, len);
	__dm_unbless_for_disk(src);
}

static void array_insert(void *base, size_t elt_size, unsigned int nr_elts,
			 unsigned int index, void *elt)
	__dm_written_to_disk(elt)
{
	if (index < nr_elts)
		memmove(base + (elt_size * (index + 1)),
			base + (elt_size * index),
			(nr_elts - index) * elt_size);

	memcpy_disk(base + (elt_size * index), elt, elt_size);
}

/*----------------------------------------------------------------*/

/* makes the assumption that no two keys are the same. */
static int bsearch(struct btree_node *n, uint64_t key, int want_hi)
{
	int lo = -1, hi = le32_to_cpu(n->header.nr_entries);

	while (hi - lo > 1) {
		int mid = lo + ((hi - lo) / 2);
		uint64_t mid_key = le64_to_cpu(n->keys[mid]);

		if (mid_key == key)
			return mid;

		if (mid_key < key)
			lo = mid;
		else
			hi = mid;
	}

	return want_hi ? hi : lo;
}

int lower_bound(struct btree_node *n, uint64_t key)
{
	return bsearch(n, key, 0);
}

static int upper_bound(struct btree_node *n, uint64_t key)
{
	return bsearch(n, key, 1);
}

void inc_children(struct dm_transaction_manager *tm, struct btree_node *n,
		  struct dm_btree_value_type *vt)
{
	uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);

	if (le32_to_cpu(n->header.flags) & INTERNAL_NODE)
		dm_tm_with_runs(tm, value_ptr(n, 0), nr_entries, dm_tm_inc_range);

	else if (vt->inc)
		vt->inc(vt->context, value_ptr(n, 0), nr_entries);
}

static int insert_at(size_t value_size, struct btree_node *node, unsigned int index,
		     uint64_t key, void *value)
	__dm_written_to_disk(value)
{
	uint32_t nr_entries = le32_to_cpu(node->header.nr_entries);
	uint32_t max_entries = le32_to_cpu(node->header.max_entries);
	__le64 key_le = cpu_to_le64(key);

	if (index > nr_entries ||
	    index >= max_entries ||
	    nr_entries >= max_entries) {
		DMERR("too many entries in btree node for insert");
		__dm_unbless_for_disk(value);
		return -ENOMEM;
	}

	__dm_bless_for_disk(&key_le);

	array_insert(node->keys, sizeof(*node->keys), nr_entries, index, &key_le);
	array_insert(value_base(node), value_size, nr_entries, index, value);
	node->header.nr_entries = cpu_to_le32(nr_entries + 1);

	return 0;
}

/*----------------------------------------------------------------*/

/*
 * We want 3n entries (for some n).  This works more nicely for repeated
 * insert remove loops than (2n + 1).
 */
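/*
 * Illustrative arithmetic only, assuming a 4096 byte metadata block and a
 * 32 byte node_header: with 8 byte (__le64) values, elt_size = 16, so
 * total = (4096 - 32) / 16 = 254, n = 254 / 3 = 84 and max_entries = 252.
 */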
static uint32_t calc_max_entries(size_t value_size, size_t block_size)
{
	uint32_t total, n;
	size_t elt_size = sizeof(uint64_t) + value_size; /* key + value */

	block_size -= sizeof(struct node_header);
	total = block_size / elt_size;
	n = total / 3; /* rounds down */

	return 3 * n;
}

int dm_btree_empty(struct dm_btree_info *info, dm_block_t *root)
{
	int r;
	struct dm_block *b;
	struct btree_node *n;
	size_t block_size;
	uint32_t max_entries;

	r = new_block(info, &b);
	if (r < 0)
		return r;

	block_size = dm_bm_block_size(dm_tm_get_bm(info->tm));
	max_entries = calc_max_entries(info->value_type.size, block_size);

	n = dm_block_data(b);
	memset(n, 0, block_size);
	n->header.flags = cpu_to_le32(LEAF_NODE);
	n->header.nr_entries = cpu_to_le32(0);
	n->header.max_entries = cpu_to_le32(max_entries);
	n->header.value_size = cpu_to_le32(info->value_type.size);

	*root = dm_block_location(b);
	unlock_block(info, b);

	return 0;
}
EXPORT_SYMBOL_GPL(dm_btree_empty);

/*----------------------------------------------------------------*/

/*
 * Deletion uses a recursive algorithm.  Since we have limited stack space,
 * we explicitly manage our own stack on the heap.
 */
#define MAX_SPINE_DEPTH 64
struct frame {
	struct dm_block *b;
	struct btree_node *n;
	unsigned int level;
	unsigned int nr_children;
	unsigned int current_child;
};

struct del_stack {
	struct dm_btree_info *info;
	struct dm_transaction_manager *tm;
	int top;
	struct frame spine[MAX_SPINE_DEPTH];
};

static int top_frame(struct del_stack *s, struct frame **f)
{
	if (s->top < 0) {
		DMERR("btree deletion stack empty");
		return -EINVAL;
	}

	*f = s->spine + s->top;

	return 0;
}

static int unprocessed_frames(struct del_stack *s)
{
	return s->top >= 0;
}

static void prefetch_children(struct del_stack *s, struct frame *f)
{
	unsigned int i;
	struct dm_block_manager *bm = dm_tm_get_bm(s->tm);

	for (i = 0; i < f->nr_children; i++)
		dm_bm_prefetch(bm, value64(f->n, i));
}

static bool is_internal_level(struct dm_btree_info *info, struct frame *f)
{
	return f->level < (info->levels - 1);
}

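/*
 * Pushes a frame for block @b onto the deletion stack.  If the block is
 * shared (reference count > 1) we just drop our reference and don't visit
 * its children; otherwise the node is read locked and, unless it's a
 * bottom level leaf, its children are prefetched ready for the walk.
 */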
static int push_frame(struct del_stack *s, dm_block_t b, unsigned int level)
{
	int r;
	uint32_t ref_count;

	if (s->top >= MAX_SPINE_DEPTH - 1) {
		DMERR("btree deletion stack out of memory");
		return -ENOMEM;
	}

	r = dm_tm_ref(s->tm, b, &ref_count);
	if (r)
		return r;

	if (ref_count > 1)
		/*
		 * This is a shared node, so we can just decrement its
		 * reference counter and leave the children.
		 */
		dm_tm_dec(s->tm, b);

	else {
		uint32_t flags;
		struct frame *f = s->spine + ++s->top;

		r = dm_tm_read_lock(s->tm, b, &btree_node_validator, &f->b);
		if (r) {
			s->top--;
			return r;
		}

		f->n = dm_block_data(f->b);
		f->level = level;
		f->nr_children = le32_to_cpu(f->n->header.nr_entries);
		f->current_child = 0;

		flags = le32_to_cpu(f->n->header.flags);
		if (flags & INTERNAL_NODE || is_internal_level(s->info, f))
			prefetch_children(s, f);
	}

	return 0;
}

static void pop_frame(struct del_stack *s)
{
	struct frame *f = s->spine + s->top--;

	dm_tm_dec(s->tm, dm_block_location(f->b));
	dm_tm_unlock(s->tm, f->b);
}

static void unlock_all_frames(struct del_stack *s)
{
	struct frame *f;

	while (unprocessed_frames(s)) {
		f = s->spine + s->top--;
		dm_tm_unlock(s->tm, f->b);
	}
}

int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
{
	int r;
	struct del_stack *s;

	/*
	 * dm_btree_del() is called via an ioctl, and as such should be
	 * considered an FS op.  We can't recurse back into the FS, so we
	 * allocate GFP_NOFS.
	 */
	s = kmalloc(sizeof(*s), GFP_NOFS);
	if (!s)
		return -ENOMEM;
	s->info = info;
	s->tm = info->tm;
	s->top = -1;

	r = push_frame(s, root, 0);
	if (r)
		goto out;

	while (unprocessed_frames(s)) {
		uint32_t flags;
		struct frame *f;
		dm_block_t b;

		r = top_frame(s, &f);
		if (r)
			goto out;

		if (f->current_child >= f->nr_children) {
			pop_frame(s);
			continue;
		}

		flags = le32_to_cpu(f->n->header.flags);
		if (flags & INTERNAL_NODE) {
			b = value64(f->n, f->current_child);
			f->current_child++;
			r = push_frame(s, b, f->level);
			if (r)
				goto out;

		} else if (is_internal_level(info, f)) {
			b = value64(f->n, f->current_child);
			f->current_child++;
			r = push_frame(s, b, f->level + 1);
			if (r)
				goto out;

		} else {
			if (info->value_type.dec)
				info->value_type.dec(info->value_type.context,
						     value_ptr(f->n, 0), f->nr_children);
			pop_frame(s);
		}
	}
out:
	if (r) {
		/* cleanup all frames of del_stack */
		unlock_all_frames(s);
	}
	kfree(s);

	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_del);

/*----------------------------------------------------------------*/

static int btree_lookup_raw(struct ro_spine *s, dm_block_t block, uint64_t key,
			    int (*search_fn)(struct btree_node *, uint64_t),
			    uint64_t *result_key, void *v, size_t value_size)
{
	int i, r;
	uint32_t flags, nr_entries;

	do {
		r = ro_step(s, block);
		if (r < 0)
			return r;

		i = search_fn(ro_node(s), key);

		flags = le32_to_cpu(ro_node(s)->header.flags);
		nr_entries = le32_to_cpu(ro_node(s)->header.nr_entries);
		if (i < 0 || i >= nr_entries)
			return -ENODATA;

		if (flags & INTERNAL_NODE)
			block = value64(ro_node(s), i);

	} while (!(flags & LEAF_NODE));

	*result_key = le64_to_cpu(ro_node(s)->keys[i]);
	if (v)
		memcpy(v, value_ptr(ro_node(s), i), value_size);

	return 0;
}

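/*
 * Example of how a caller might use dm_btree_lookup().  This is a sketch
 * only, not code from this file, and it assumes a single level tree
 * storing __le64 values:
 *
 *	uint64_t key = 123;
 *	__le64 value_le;
 *	int r = dm_btree_lookup(info, root, &key, &value_le);
 *
 * r == -ENODATA means the key isn't present; r == 0 means value_le now
 * holds the on-disk (little endian) value.
 */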
int dm_btree_lookup(struct dm_btree_info *info, dm_block_t root,
		    uint64_t *keys, void *value_le)
{
	unsigned int level, last_level = info->levels - 1;
	int r = -ENODATA;
	uint64_t rkey;
	__le64 internal_value_le;
	struct ro_spine spine;

	init_ro_spine(&spine, info);
	for (level = 0; level < info->levels; level++) {
		size_t size;
		void *value_p;

		if (level == last_level) {
			value_p = value_le;
			size = info->value_type.size;

		} else {
			value_p = &internal_value_le;
			size = sizeof(uint64_t);
		}

		r = btree_lookup_raw(&spine, root, keys[level],
				     lower_bound, &rkey,
				     value_p, size);

		if (!r) {
			if (rkey != keys[level]) {
				exit_ro_spine(&spine);
				return -ENODATA;
			}
		} else {
			exit_ro_spine(&spine);
			return r;
		}

		root = le64_to_cpu(internal_value_le);
	}
	exit_ro_spine(&spine);

	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_lookup);

static int dm_btree_lookup_next_single(struct dm_btree_info *info, dm_block_t root,
				       uint64_t key, uint64_t *rkey, void *value_le)
{
	int r, i;
	uint32_t flags, nr_entries;
	struct dm_block *node;
	struct btree_node *n;

	r = bn_read_lock(info, root, &node);
	if (r)
		return r;

	n = dm_block_data(node);
	flags = le32_to_cpu(n->header.flags);
	nr_entries = le32_to_cpu(n->header.nr_entries);

	if (flags & INTERNAL_NODE) {
		i = lower_bound(n, key);
		if (i < 0) {
			/*
			 * avoid early -ENODATA return when all entries are
			 * higher than the search @key.
			 */
			i = 0;
		}
		if (i >= nr_entries) {
			r = -ENODATA;
			goto out;
		}

		r = dm_btree_lookup_next_single(info, value64(n, i), key, rkey, value_le);
		if (r == -ENODATA && i < (nr_entries - 1)) {
			i++;
			r = dm_btree_lookup_next_single(info, value64(n, i), key, rkey, value_le);
		}

	} else {
		i = upper_bound(n, key);
		if (i < 0 || i >= nr_entries) {
			r = -ENODATA;
			goto out;
		}

		*rkey = le64_to_cpu(n->keys[i]);
		memcpy(value_le, value_ptr(n, i), info->value_type.size);
	}
out:
	dm_tm_unlock(info->tm, node);
	return r;
}

int dm_btree_lookup_next(struct dm_btree_info *info, dm_block_t root,
			 uint64_t *keys, uint64_t *rkey, void *value_le)
{
	unsigned int level;
	int r = -ENODATA;
	__le64 internal_value_le;
	struct ro_spine spine;

	init_ro_spine(&spine, info);
	for (level = 0; level < info->levels - 1u; level++) {
		r = btree_lookup_raw(&spine, root, keys[level],
				     lower_bound, rkey,
				     &internal_value_le, sizeof(uint64_t));
		if (r)
			goto out;

		if (*rkey != keys[level]) {
			r = -ENODATA;
			goto out;
		}

		root = le64_to_cpu(internal_value_le);
	}

	r = dm_btree_lookup_next_single(info, root, keys[level], rkey, value_le);
out:
	exit_ro_spine(&spine);
	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_lookup_next);

/*----------------------------------------------------------------*/

/*
 * Copies entries from one region of a btree node to another.  The regions
 * must not overlap.
 */
static void copy_entries(struct btree_node *dest, unsigned int dest_offset,
			 struct btree_node *src, unsigned int src_offset,
			 unsigned int count)
{
	size_t value_size = le32_to_cpu(dest->header.value_size);

	memcpy(dest->keys + dest_offset, src->keys + src_offset, count * sizeof(uint64_t));
	memcpy(value_ptr(dest, dest_offset), value_ptr(src, src_offset), count * value_size);
}

/*
 * Moves entries from one region of a btree node to another.  The regions
 * may overlap.
 */
static void move_entries(struct btree_node *dest, unsigned int dest_offset,
			 struct btree_node *src, unsigned int src_offset,
			 unsigned int count)
{
	size_t value_size = le32_to_cpu(dest->header.value_size);

	memmove(dest->keys + dest_offset, src->keys + src_offset, count * sizeof(uint64_t));
	memmove(value_ptr(dest, dest_offset), value_ptr(src, src_offset), count * value_size);
}

/*
 * Erases the first 'count' entries of a btree node, shifting following
 * entries down into their place.
 */
static void shift_down(struct btree_node *n, unsigned int count)
{
	move_entries(n, 0, n, count, le32_to_cpu(n->header.nr_entries) - count);
}

/*
 * Moves entries in a btree node up 'count' places, making space for
 * new entries at the start of the node.
 */
static void shift_up(struct btree_node *n, unsigned int count)
{
	move_entries(n, count, n, 0, le32_to_cpu(n->header.nr_entries));
}

/*
 * Redistributes entries between two btree nodes to make them
 * have similar numbers of entries.
 */
static void redistribute2(struct btree_node *left, struct btree_node *right)
{
	unsigned int nr_left = le32_to_cpu(left->header.nr_entries);
	unsigned int nr_right = le32_to_cpu(right->header.nr_entries);
	unsigned int total = nr_left + nr_right;
	unsigned int target_left = total / 2;
	unsigned int target_right = total - target_left;

	if (nr_left < target_left) {
		unsigned int delta = target_left - nr_left;

		copy_entries(left, nr_left, right, 0, delta);
		shift_down(right, delta);
	} else if (nr_left > target_left) {
		unsigned int delta = nr_left - target_left;

		if (nr_right)
			shift_up(right, delta);
		copy_entries(right, 0, left, target_left, delta);
	}

	left->header.nr_entries = cpu_to_le32(target_left);
	right->header.nr_entries = cpu_to_le32(target_right);
}

/*
 * Redistribute entries between three nodes.  Assumes the central
 * node is empty.
 */
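/*
 * Worked example (illustrative figures only): nr_left = 20, nr_right = 10
 * gives total = 30, so target_left = 10, target_center = 10 and
 * target_right = 10.  nr_left >= target_left + target_center, so the final
 * branch runs: right is already at its target (right_short = 0) and
 * entries 10..19 of left are copied into the empty centre node.
 */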
static void redistribute3(struct btree_node *left, struct btree_node *center,
			  struct btree_node *right)
{
	unsigned int nr_left = le32_to_cpu(left->header.nr_entries);
	unsigned int nr_center = le32_to_cpu(center->header.nr_entries);
	unsigned int nr_right = le32_to_cpu(right->header.nr_entries);
	unsigned int total, target_left, target_center, target_right;

	BUG_ON(nr_center);

	total = nr_left + nr_right;
	target_left = total / 3;
	target_center = (total - target_left) / 2;
	target_right = (total - target_left - target_center);

	if (nr_left < target_left) {
		unsigned int left_short = target_left - nr_left;

		copy_entries(left, nr_left, right, 0, left_short);
		copy_entries(center, 0, right, left_short, target_center);
		shift_down(right, nr_right - target_right);

	} else if (nr_left < (target_left + target_center)) {
		unsigned int left_to_center = nr_left - target_left;

		copy_entries(center, 0, left, target_left, left_to_center);
		copy_entries(center, left_to_center, right, 0, target_center - left_to_center);
		shift_down(right, nr_right - target_right);

	} else {
		unsigned int right_short = target_right - nr_right;

		shift_up(right, right_short);
		copy_entries(right, 0, left, nr_left - right_short, right_short);
		copy_entries(center, 0, left, target_left, nr_left - target_left);
	}

	left->header.nr_entries = cpu_to_le32(target_left);
	center->header.nr_entries = cpu_to_le32(target_center);
	right->header.nr_entries = cpu_to_le32(target_right);
}

/*
 * Splits a node by creating a sibling node and shifting half the node's
 * contents across.  Assumes there is a parent node, and it has room for
 * another child.
 *
 * Before:
 *	  +--------+
 *	  | Parent |
 *	  +--------+
 *	      |
 *	      v
 *	+----------+
 *	| A ++++++ |
 *	+----------+
 *
 *
 * After:
 *		+--------+
 *		| Parent |
 *		+--------+
 *		  |    |
 *		  v    +------+
 *	   +---------+        |
 *	   | A* +++  |        v
 *	   +---------+    +-------+
 *			  | B +++ |
 *			  +-------+
 *
 * Where A* is a shadow of A.
 */
static int split_one_into_two(struct shadow_spine *s, unsigned int parent_index,
			      struct dm_btree_value_type *vt, uint64_t key)
{
	int r;
	struct dm_block *left, *right, *parent;
	struct btree_node *ln, *rn, *pn;
	__le64 location;

	left = shadow_current(s);

	r = new_block(s->info, &right);
	if (r < 0)
		return r;

	ln = dm_block_data(left);
	rn = dm_block_data(right);

	rn->header.flags = ln->header.flags;
	rn->header.nr_entries = cpu_to_le32(0);
	rn->header.max_entries = ln->header.max_entries;
	rn->header.value_size = ln->header.value_size;
	redistribute2(ln, rn);

	/* patch up the parent */
	parent = shadow_parent(s);
	pn = dm_block_data(parent);

	location = cpu_to_le64(dm_block_location(right));
	__dm_bless_for_disk(&location);
	r = insert_at(sizeof(__le64), pn, parent_index + 1,
		      le64_to_cpu(rn->keys[0]), &location);
	if (r) {
		unlock_block(s->info, right);
		return r;
	}

	/* patch up the spine */
	if (key < le64_to_cpu(rn->keys[0])) {
		unlock_block(s->info, right);
		s->nodes[1] = left;
	} else {
		unlock_block(s->info, left);
		s->nodes[1] = right;
	}

	return 0;
}

/*
 * We often need to modify a sibling node.  This function shadows a particular
 * child of the given parent node, making sure to update the parent to point
 * to the new shadow.
 */
static int shadow_child(struct dm_btree_info *info, struct dm_btree_value_type *vt,
			struct btree_node *parent, unsigned int index,
			struct dm_block **result)
{
	int r, inc;
	dm_block_t root;
	struct btree_node *node;

	root = value64(parent, index);

	r = dm_tm_shadow_block(info->tm, root, &btree_node_validator,
			       result, &inc);
	if (r)
		return r;

	node = dm_block_data(*result);

	if (inc)
		inc_children(info->tm, node, vt);

	*((__le64 *) value_ptr(parent, index)) =
		cpu_to_le64(dm_block_location(*result));

	return 0;
}

/*
 * Splits two nodes into three.  This is more work, but results in fuller
 * nodes, so saves metadata space.
 */
static int split_two_into_three(struct shadow_spine *s, unsigned int parent_index,
				struct dm_btree_value_type *vt, uint64_t key)
{
	int r;
	unsigned int middle_index;
	struct dm_block *left, *middle, *right, *parent;
	struct btree_node *ln, *rn, *mn, *pn;
	__le64 location;

	parent = shadow_parent(s);
	pn = dm_block_data(parent);

	if (parent_index == 0) {
		middle_index = 1;
		left = shadow_current(s);
		r = shadow_child(s->info, vt, pn, parent_index + 1, &right);
		if (r)
			return r;
	} else {
		middle_index = parent_index;
		right = shadow_current(s);
		r = shadow_child(s->info, vt, pn, parent_index - 1, &left);
		if (r)
			return r;
	}

	r = new_block(s->info, &middle);
	if (r < 0)
		return r;

	ln = dm_block_data(left);
	mn = dm_block_data(middle);
	rn = dm_block_data(right);

	mn->header.nr_entries = cpu_to_le32(0);
	mn->header.flags = ln->header.flags;
	mn->header.max_entries = ln->header.max_entries;
	mn->header.value_size = ln->header.value_size;

	redistribute3(ln, mn, rn);

	/* patch up the parent */
	pn->keys[middle_index] = rn->keys[0];
	location = cpu_to_le64(dm_block_location(middle));
	__dm_bless_for_disk(&location);
	r = insert_at(sizeof(__le64), pn, middle_index,
		      le64_to_cpu(mn->keys[0]), &location);
	if (r) {
		if (shadow_current(s) != left)
			unlock_block(s->info, left);

		unlock_block(s->info, middle);

		if (shadow_current(s) != right)
			unlock_block(s->info, right);

		return r;
	}


	/* patch up the spine */
	if (key < le64_to_cpu(mn->keys[0])) {
		unlock_block(s->info, middle);
		unlock_block(s->info, right);
		s->nodes[1] = left;
	} else if (key < le64_to_cpu(rn->keys[0])) {
		unlock_block(s->info, left);
		unlock_block(s->info, right);
		s->nodes[1] = middle;
	} else {
		unlock_block(s->info, left);
		unlock_block(s->info, middle);
		s->nodes[1] = right;
	}

	return 0;
}

/*----------------------------------------------------------------*/

/*
 * Splits a node by creating two new children beneath the given node.
 *
 * Before:
 *	  +----------+
 *	  | A ++++++ |
 *	  +----------+
 *
 *
 * After:
 *	+------------+
 *	| A (shadow) |
 *	+------------+
 *	    |    |
 *   +------+    +----+
 *   |                |
 *   v                v
 * +-------+      +-------+
 * | B +++ |      | C +++ |
 * +-------+      +-------+
 */
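/*
 * This is only used for the node at the top of the spine: the block stays
 * in place, becoming an internal node with two freshly allocated children,
 * so no parent pointer needs updating.
 */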
static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
{
	int r;
	size_t size;
	unsigned int nr_left, nr_right;
	struct dm_block *left, *right, *new_parent;
	struct btree_node *pn, *ln, *rn;
	__le64 val;

	new_parent = shadow_current(s);

	pn = dm_block_data(new_parent);
	size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ?
		sizeof(__le64) : s->info->value_type.size;

	/* create & init the left block */
	r = new_block(s->info, &left);
	if (r < 0)
		return r;

	ln = dm_block_data(left);
	nr_left = le32_to_cpu(pn->header.nr_entries) / 2;

	ln->header.flags = pn->header.flags;
	ln->header.nr_entries = cpu_to_le32(nr_left);
	ln->header.max_entries = pn->header.max_entries;
	ln->header.value_size = pn->header.value_size;
	memcpy(ln->keys, pn->keys, nr_left * sizeof(pn->keys[0]));
	memcpy(value_ptr(ln, 0), value_ptr(pn, 0), nr_left * size);

	/* create & init the right block */
	r = new_block(s->info, &right);
	if (r < 0) {
		unlock_block(s->info, left);
		return r;
	}

	rn = dm_block_data(right);
	nr_right = le32_to_cpu(pn->header.nr_entries) - nr_left;

	rn->header.flags = pn->header.flags;
	rn->header.nr_entries = cpu_to_le32(nr_right);
	rn->header.max_entries = pn->header.max_entries;
	rn->header.value_size = pn->header.value_size;
	memcpy(rn->keys, pn->keys + nr_left, nr_right * sizeof(pn->keys[0]));
	memcpy(value_ptr(rn, 0), value_ptr(pn, nr_left),
	       nr_right * size);

	/* new_parent should just point to l and r now */
	pn->header.flags = cpu_to_le32(INTERNAL_NODE);
	pn->header.nr_entries = cpu_to_le32(2);
	pn->header.max_entries = cpu_to_le32(
		calc_max_entries(sizeof(__le64),
				 dm_bm_block_size(
					 dm_tm_get_bm(s->info->tm))));
	pn->header.value_size = cpu_to_le32(sizeof(__le64));

	val = cpu_to_le64(dm_block_location(left));
	__dm_bless_for_disk(&val);
	pn->keys[0] = ln->keys[0];
	memcpy_disk(value_ptr(pn, 0), &val, sizeof(__le64));

	val = cpu_to_le64(dm_block_location(right));
	__dm_bless_for_disk(&val);
	pn->keys[1] = rn->keys[0];
	memcpy_disk(value_ptr(pn, 1), &val, sizeof(__le64));

	unlock_block(s->info, left);
	unlock_block(s->info, right);
	return 0;
}

/*----------------------------------------------------------------*/

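/*
 * Both rebalance_left() and rebalance_right() shadow the sibling via
 * shadow_child(), update the separating key in the parent, and leave
 * s->nodes[1] pointing at whichever of the two nodes now covers @key
 * (the other is unlocked).
 */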
/*
 * Redistributes a node's entries with its left sibling.
 */
static int rebalance_left(struct shadow_spine *s, struct dm_btree_value_type *vt,
			  unsigned int parent_index, uint64_t key)
{
	int r;
	struct dm_block *sib;
	struct btree_node *left, *right, *parent = dm_block_data(shadow_parent(s));

	r = shadow_child(s->info, vt, parent, parent_index - 1, &sib);
	if (r)
		return r;

	left = dm_block_data(sib);
	right = dm_block_data(shadow_current(s));
	redistribute2(left, right);
	*key_ptr(parent, parent_index) = right->keys[0];

	if (key < le64_to_cpu(right->keys[0])) {
		unlock_block(s->info, s->nodes[1]);
		s->nodes[1] = sib;
	} else {
		unlock_block(s->info, sib);
	}

	return 0;
}

/*
 * Redistributes a node's entries with its right sibling.
 */
static int rebalance_right(struct shadow_spine *s, struct dm_btree_value_type *vt,
			   unsigned int parent_index, uint64_t key)
{
	int r;
	struct dm_block *sib;
	struct btree_node *left, *right, *parent = dm_block_data(shadow_parent(s));

	r = shadow_child(s->info, vt, parent, parent_index + 1, &sib);
	if (r)
		return r;

	left = dm_block_data(shadow_current(s));
	right = dm_block_data(sib);
	redistribute2(left, right);
	*key_ptr(parent, parent_index + 1) = right->keys[0];

	if (key < le64_to_cpu(right->keys[0])) {
		unlock_block(s->info, sib);
	} else {
		unlock_block(s->info, s->nodes[1]);
		s->nodes[1] = sib;
	}

	return 0;
}

/*
 * Returns the number of spare entries in a node.
 */
static int get_node_free_space(struct dm_btree_info *info, dm_block_t b, unsigned int *space)
{
	int r;
	unsigned int nr_entries;
	struct dm_block *block;
	struct btree_node *node;

	r = bn_read_lock(info, b, &block);
	if (r)
		return r;

	node = dm_block_data(block);
	nr_entries = le32_to_cpu(node->header.nr_entries);
	*space = le32_to_cpu(node->header.max_entries) - nr_entries;

	unlock_block(info, block);
	return 0;
}

/*
 * Make space in a node, either by moving some entries to a sibling,
 * or creating a new sibling node.  SPACE_THRESHOLD defines the minimum
 * number of free entries that must be in the sibling to make the move
 * worthwhile.  If the siblings are shared (eg, part of a snapshot),
 * then they are not touched, since this would break sharing and so
 * consume more space than we save.
 */
#define SPACE_THRESHOLD 8
static int rebalance_or_split(struct shadow_spine *s, struct dm_btree_value_type *vt,
			      unsigned int parent_index, uint64_t key)
{
	int r;
	struct btree_node *parent = dm_block_data(shadow_parent(s));
	unsigned int nr_parent = le32_to_cpu(parent->header.nr_entries);
	unsigned int free_space;
	int left_shared = 0, right_shared = 0;

	/* Should we move entries to the left sibling? */
	if (parent_index > 0) {
		dm_block_t left_b = value64(parent, parent_index - 1);

		r = dm_tm_block_is_shared(s->info->tm, left_b, &left_shared);
		if (r)
			return r;

		if (!left_shared) {
			r = get_node_free_space(s->info, left_b, &free_space);
			if (r)
				return r;

			if (free_space >= SPACE_THRESHOLD)
				return rebalance_left(s, vt, parent_index, key);
		}
	}

	/* Should we move entries to the right sibling? */
	if (parent_index < (nr_parent - 1)) {
		dm_block_t right_b = value64(parent, parent_index + 1);

		r = dm_tm_block_is_shared(s->info->tm, right_b, &right_shared);
		if (r)
			return r;

		if (!right_shared) {
			r = get_node_free_space(s->info, right_b, &free_space);
			if (r)
				return r;

			if (free_space >= SPACE_THRESHOLD)
				return rebalance_right(s, vt, parent_index, key);
		}
	}

	/*
	 * We need to split the node; normally we split two nodes into
	 * three.  But when inserting a sequence that is either monotonically
	 * increasing or decreasing it's better to split a single node into two.
	 */
	if (left_shared || right_shared || (nr_parent <= 2) ||
	    (parent_index == 0) || (parent_index + 1 == nr_parent)) {
		return split_one_into_two(s, parent_index, vt, key);
	} else {
		return split_two_into_three(s, parent_index, vt, key);
	}
}

/*
 * Does the node contain a particular key?
 */
static bool contains_key(struct btree_node *node, uint64_t key)
{
	int i = lower_bound(node, key);

	if (i >= 0 && le64_to_cpu(node->keys[i]) == key)
		return true;

	return false;
}

/*
 * In general we preemptively make sure there's a free entry in every
 * node on the spine when doing an insert.  But we can avoid that with
 * leaf nodes if we know it's an overwrite.
 */
static bool has_space_for_insert(struct btree_node *node, uint64_t key)
{
	if (node->header.nr_entries == node->header.max_entries) {
		if (le32_to_cpu(node->header.flags) & LEAF_NODE) {
			/* we don't need space if it's an overwrite */
			return contains_key(node, key);
		}

		return false;
	}

	return true;
}

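/*
 * Walks from @root down to the leaf that should contain @key, shadowing
 * every node on the way and splitting or rebalancing any node that is
 * full, so there is always room for the insert.  On success *index is the
 * slot in the shadowed leaf where @key already lives, or where it should
 * be inserted.
 */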
static int btree_insert_raw(struct shadow_spine *s, dm_block_t root,
			    struct dm_btree_value_type *vt,
			    uint64_t key, unsigned int *index)
{
	int r, i = *index, top = 1;
	struct btree_node *node;

	for (;;) {
		r = shadow_step(s, root, vt);
		if (r < 0)
			return r;

		node = dm_block_data(shadow_current(s));

		/*
		 * We have to patch up the parent node, ugly, but I don't
		 * see a way to do this automatically as part of the spine
		 * op.
		 */
		if (shadow_has_parent(s) && i >= 0) { /* FIXME: second clause unnecessary. */
			__le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));

			__dm_bless_for_disk(&location);
			memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i),
				    &location, sizeof(__le64));
		}

		node = dm_block_data(shadow_current(s));

		if (!has_space_for_insert(node, key)) {
			if (top)
				r = btree_split_beneath(s, key);
			else
				r = rebalance_or_split(s, vt, i, key);

			if (r < 0)
				return r;

			/* making space can cause the current node to change */
			node = dm_block_data(shadow_current(s));
		}

		i = lower_bound(node, key);

		if (le32_to_cpu(node->header.flags) & LEAF_NODE)
			break;

		if (i < 0) {
			/* change the bounds on the lowest key */
			node->keys[0] = cpu_to_le64(key);
			i = 0;
		}

		root = value64(node, i);
		top = 0;
	}

	if (i < 0 || le64_to_cpu(node->keys[i]) != key)
		i++;

	*index = i;
	return 0;
}

static int __btree_get_overwrite_leaf(struct shadow_spine *s, dm_block_t root,
				      uint64_t key, int *index)
{
	int r, i = -1;
	struct btree_node *node;

	*index = 0;
	for (;;) {
		r = shadow_step(s, root, &s->info->value_type);
		if (r < 0)
			return r;

		node = dm_block_data(shadow_current(s));

		/*
		 * We have to patch up the parent node, ugly, but I don't
		 * see a way to do this automatically as part of the spine
		 * op.
		 */
		if (shadow_has_parent(s) && i >= 0) {
			__le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));

			__dm_bless_for_disk(&location);
			memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i),
				    &location, sizeof(__le64));
		}

		node = dm_block_data(shadow_current(s));
		i = lower_bound(node, key);

		BUG_ON(i < 0);
		BUG_ON(i >= le32_to_cpu(node->header.nr_entries));

		if (le32_to_cpu(node->header.flags) & LEAF_NODE) {
			if (key != le64_to_cpu(node->keys[i]))
				return -EINVAL;
			break;
		}

		root = value64(node, i);
	}

	*index = i;
	return 0;
}

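/*
 * Used for overwriting a value in place: shadows the path down to the
 * leaf containing @key and hands that leaf back still locked (the spine
 * count is decremented so exit_shadow_spine() leaves it alone).  The
 * caller is responsible for unlocking *leaf.
 */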
int btree_get_overwrite_leaf(struct dm_btree_info *info, dm_block_t root,
			     uint64_t key, int *index,
			     dm_block_t *new_root, struct dm_block **leaf)
{
	int r;
	struct shadow_spine spine;

	BUG_ON(info->levels > 1);
	init_shadow_spine(&spine, info);
	r = __btree_get_overwrite_leaf(&spine, root, key, index);
	if (!r) {
		*new_root = shadow_root(&spine);
		*leaf = shadow_current(&spine);

		/*
		 * Decrement the count so exit_shadow_spine() doesn't
		 * unlock the leaf.
		 */
		spine.count--;
	}
	exit_shadow_spine(&spine);

	return r;
}

static bool need_insert(struct btree_node *node, uint64_t *keys,
			unsigned int level, unsigned int index)
{
	return ((index >= le32_to_cpu(node->header.nr_entries)) ||
		(le64_to_cpu(node->keys[index]) != keys[level]));
}

static int insert(struct dm_btree_info *info, dm_block_t root,
		  uint64_t *keys, void *value, dm_block_t *new_root,
		  int *inserted)
	__dm_written_to_disk(value)
{
	int r;
	unsigned int level, index = -1, last_level = info->levels - 1;
	dm_block_t block = root;
	struct shadow_spine spine;
	struct btree_node *n;
	struct dm_btree_value_type le64_type;

	init_le64_type(info->tm, &le64_type);
	init_shadow_spine(&spine, info);

	for (level = 0; level < (info->levels - 1); level++) {
		r = btree_insert_raw(&spine, block, &le64_type, keys[level], &index);
		if (r < 0)
			goto bad;

		n = dm_block_data(shadow_current(&spine));

		if (need_insert(n, keys, level, index)) {
			dm_block_t new_tree;
			__le64 new_le;

			r = dm_btree_empty(info, &new_tree);
			if (r < 0)
				goto bad;

			new_le = cpu_to_le64(new_tree);
			__dm_bless_for_disk(&new_le);

			r = insert_at(sizeof(uint64_t), n, index,
				      keys[level], &new_le);
			if (r)
				goto bad;
		}

		if (level < last_level)
			block = value64(n, index);
	}

	r = btree_insert_raw(&spine, block, &info->value_type,
			     keys[level], &index);
	if (r < 0)
		goto bad;

	n = dm_block_data(shadow_current(&spine));

	if (need_insert(n, keys, level, index)) {
		if (inserted)
			*inserted = 1;

		r = insert_at(info->value_type.size, n, index,
			      keys[level], value);
		if (r)
			goto bad_unblessed;
	} else {
		if (inserted)
			*inserted = 0;

		if (info->value_type.dec &&
		    (!info->value_type.equal ||
		     !info->value_type.equal(
			     info->value_type.context,
			     value_ptr(n, index),
			     value))) {
			info->value_type.dec(info->value_type.context,
					     value_ptr(n, index), 1);
		}
		memcpy_disk(value_ptr(n, index),
			    value, info->value_type.size);
	}

	*new_root = shadow_root(&spine);
	exit_shadow_spine(&spine);

	return 0;

bad:
	__dm_unbless_for_disk(value);
bad_unblessed:
	exit_shadow_spine(&spine);
	return r;
}

int dm_btree_insert(struct dm_btree_info *info, dm_block_t root,
		    uint64_t *keys, void *value, dm_block_t *new_root)
	__dm_written_to_disk(value)
{
	return insert(info, root, keys, value, new_root, NULL);
}
EXPORT_SYMBOL_GPL(dm_btree_insert);

int dm_btree_insert_notify(struct dm_btree_info *info, dm_block_t root,
			   uint64_t *keys, void *value, dm_block_t *new_root,
			   int *inserted)
	__dm_written_to_disk(value)
{
	return insert(info, root, keys, value, new_root, inserted);
}
EXPORT_SYMBOL_GPL(dm_btree_insert_notify);

/*----------------------------------------------------------------*/

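/*
 * Walks down one level of a (possibly multi level) btree, recording the
 * highest (or lowest) key seen.  If @next_block is non NULL it is set to
 * the value of that entry, i.e. the root of the next level, which
 * dm_btree_find_key() then recurses into.
 */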
static int find_key(struct ro_spine *s, dm_block_t block, bool find_highest,
		    uint64_t *result_key, dm_block_t *next_block)
{
	int i, r;
	uint32_t flags;

	do {
		r = ro_step(s, block);
		if (r < 0)
			return r;

		flags = le32_to_cpu(ro_node(s)->header.flags);
		i = le32_to_cpu(ro_node(s)->header.nr_entries);
		if (!i)
			return -ENODATA;
		else
			i--;

		if (find_highest)
			*result_key = le64_to_cpu(ro_node(s)->keys[i]);
		else
			*result_key = le64_to_cpu(ro_node(s)->keys[0]);

		if (next_block || flags & INTERNAL_NODE) {
			if (find_highest)
				block = value64(ro_node(s), i);
			else
				block = value64(ro_node(s), 0);
		}

	} while (flags & INTERNAL_NODE);

	if (next_block)
		*next_block = block;
	return 0;
}

static int dm_btree_find_key(struct dm_btree_info *info, dm_block_t root,
			     bool find_highest, uint64_t *result_keys)
{
	int r = 0, count = 0, level;
	struct ro_spine spine;

	init_ro_spine(&spine, info);
	for (level = 0; level < info->levels; level++) {
		r = find_key(&spine, root, find_highest, result_keys + level,
			     level == info->levels - 1 ? NULL : &root);
		if (r == -ENODATA) {
			r = 0;
			break;

		} else if (r)
			break;

		count++;
	}
	exit_ro_spine(&spine);

	return r ? r : count;
}

int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root,
			      uint64_t *result_keys)
{
	return dm_btree_find_key(info, root, true, result_keys);
}
EXPORT_SYMBOL_GPL(dm_btree_find_highest_key);

int dm_btree_find_lowest_key(struct dm_btree_info *info, dm_block_t root,
			     uint64_t *result_keys)
{
	return dm_btree_find_key(info, root, false, result_keys);
}
EXPORT_SYMBOL_GPL(dm_btree_find_lowest_key);

/*----------------------------------------------------------------*/

/*
 * FIXME: We shouldn't use a recursive algorithm when we have limited stack
 * space.  Also this only works for single level trees.
 */
static int walk_node(struct dm_btree_info *info, dm_block_t block,
		     int (*fn)(void *context, uint64_t *keys, void *leaf),
		     void *context)
{
	int r;
	unsigned int i, nr;
	struct dm_block *node;
	struct btree_node *n;
	uint64_t keys;

	r = bn_read_lock(info, block, &node);
	if (r)
		return r;

	n = dm_block_data(node);

	nr = le32_to_cpu(n->header.nr_entries);
	for (i = 0; i < nr; i++) {
		if (le32_to_cpu(n->header.flags) & INTERNAL_NODE) {
			r = walk_node(info, value64(n, i), fn, context);
			if (r)
				goto out;
		} else {
			keys = le64_to_cpu(*key_ptr(n, i));
			r = fn(context, &keys, value_ptr(n, i));
			if (r)
				goto out;
		}
	}

out:
	dm_tm_unlock(info->tm, node);
	return r;
}

int dm_btree_walk(struct dm_btree_info *info, dm_block_t root,
		  int (*fn)(void *context, uint64_t *keys, void *leaf),
		  void *context)
{
	BUG_ON(info->levels > 1);
	return walk_node(info, root, fn, context);
}
EXPORT_SYMBOL_GPL(dm_btree_walk);

/*----------------------------------------------------------------*/

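/*
 * The cursor code below provides an in-order walk of the leaf entries of a
 * single level btree: dm_btree_cursor_begin() pushes the path to the first
 * leaf, dm_btree_cursor_next()/_skip() advance, and
 * dm_btree_cursor_get_value() reads the current key/value.  A possible
 * usage sketch (not code from this file, and it assumes 8 byte __le64
 * values):
 *
 *	struct dm_btree_cursor c;
 *	int r = dm_btree_cursor_begin(info, root, false, &c);
 *
 *	while (!r) {
 *		uint64_t k;
 *		__le64 v_le;
 *
 *		r = dm_btree_cursor_get_value(&c, &k, &v_le);
 *		if (r)
 *			break;
 *		(use k and le64_to_cpu(v_le) here)
 *		r = dm_btree_cursor_next(&c);
 *	}
 *	dm_btree_cursor_end(&c);
 *
 * r ends up as -ENODATA once the walk falls off the end of the tree.
 */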
static void prefetch_values(struct dm_btree_cursor *c)
{
	unsigned int i, nr;
	__le64 value_le;
	struct cursor_node *n = c->nodes + c->depth - 1;
	struct btree_node *bn = dm_block_data(n->b);
	struct dm_block_manager *bm = dm_tm_get_bm(c->info->tm);

	BUG_ON(c->info->value_type.size != sizeof(value_le));

	nr = le32_to_cpu(bn->header.nr_entries);
	for (i = 0; i < nr; i++) {
		memcpy(&value_le, value_ptr(bn, i), sizeof(value_le));
		dm_bm_prefetch(bm, le64_to_cpu(value_le));
	}
}

static bool leaf_node(struct dm_btree_cursor *c)
{
	struct cursor_node *n = c->nodes + c->depth - 1;
	struct btree_node *bn = dm_block_data(n->b);

	return le32_to_cpu(bn->header.flags) & LEAF_NODE;
}

static int push_node(struct dm_btree_cursor *c, dm_block_t b)
{
	int r;
	struct cursor_node *n = c->nodes + c->depth;

	if (c->depth >= DM_BTREE_CURSOR_MAX_DEPTH - 1) {
		DMERR("couldn't push cursor node, stack depth too high");
		return -EINVAL;
	}

	r = bn_read_lock(c->info, b, &n->b);
	if (r)
		return r;

	n->index = 0;
	c->depth++;

	if (c->prefetch_leaves || !leaf_node(c))
		prefetch_values(c);

	return 0;
}

static void pop_node(struct dm_btree_cursor *c)
{
	c->depth--;
	unlock_block(c->info, c->nodes[c->depth].b);
}

static int inc_or_backtrack(struct dm_btree_cursor *c)
{
	struct cursor_node *n;
	struct btree_node *bn;

	for (;;) {
		if (!c->depth)
			return -ENODATA;

		n = c->nodes + c->depth - 1;
		bn = dm_block_data(n->b);

		n->index++;
		if (n->index < le32_to_cpu(bn->header.nr_entries))
			break;

		pop_node(c);
	}

	return 0;
}

static int find_leaf(struct dm_btree_cursor *c)
{
	int r = 0;
	struct cursor_node *n;
	struct btree_node *bn;
	__le64 value_le;

	for (;;) {
		n = c->nodes + c->depth - 1;
		bn = dm_block_data(n->b);

		if (le32_to_cpu(bn->header.flags) & LEAF_NODE)
			break;

		memcpy(&value_le, value_ptr(bn, n->index), sizeof(value_le));
		r = push_node(c, le64_to_cpu(value_le));
		if (r) {
			DMERR("push_node failed");
			break;
		}
	}

	if (!r && (le32_to_cpu(bn->header.nr_entries) == 0))
		return -ENODATA;

	return r;
}

int dm_btree_cursor_begin(struct dm_btree_info *info, dm_block_t root,
			  bool prefetch_leaves, struct dm_btree_cursor *c)
{
	int r;

	c->info = info;
	c->root = root;
	c->depth = 0;
	c->prefetch_leaves = prefetch_leaves;

	r = push_node(c, root);
	if (r)
		return r;

	return find_leaf(c);
}
EXPORT_SYMBOL_GPL(dm_btree_cursor_begin);

void dm_btree_cursor_end(struct dm_btree_cursor *c)
{
	while (c->depth)
		pop_node(c);
}
EXPORT_SYMBOL_GPL(dm_btree_cursor_end);

int dm_btree_cursor_next(struct dm_btree_cursor *c)
{
	int r = inc_or_backtrack(c);

	if (!r) {
		r = find_leaf(c);
		if (r)
			DMERR("find_leaf failed");
	}

	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_cursor_next);

int dm_btree_cursor_skip(struct dm_btree_cursor *c, uint32_t count)
{
	int r = 0;

	while (count-- && !r)
		r = dm_btree_cursor_next(c);

	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_cursor_skip);

int dm_btree_cursor_get_value(struct dm_btree_cursor *c, uint64_t *key, void *value_le)
{
	if (c->depth) {
		struct cursor_node *n = c->nodes + c->depth - 1;
		struct btree_node *bn = dm_block_data(n->b);

		if (le32_to_cpu(bn->header.flags) & INTERNAL_NODE)
			return -EINVAL;

		*key = le64_to_cpu(*key_ptr(bn, n->index));
		memcpy(value_le, value_ptr(bn, n->index), c->info->value_type.size);
		return 0;

	} else
		return -ENODATA;
}
EXPORT_SYMBOL_GPL(dm_btree_cursor_get_value);