/*
 * Copyright (C) 2011 Red Hat, Inc.
 *
 * This file is released under the GPL.
 */

#include "dm-btree-internal.h"
#include "dm-space-map.h"
#include "dm-transaction-manager.h"

#include <linux/export.h>
#include <linux/device-mapper.h>

#define DM_MSG_PREFIX "btree"

/*----------------------------------------------------------------
 * Array manipulation
 *--------------------------------------------------------------*/
static void memcpy_disk(void *dest, const void *src, size_t len)
	__dm_written_to_disk(src)
{
	memcpy(dest, src, len);
	__dm_unbless_for_disk(src);
}

static void array_insert(void *base, size_t elt_size, unsigned nr_elts,
			 unsigned index, void *elt)
	__dm_written_to_disk(elt)
{
	if (index < nr_elts)
		memmove(base + (elt_size * (index + 1)),
			base + (elt_size * index),
			(nr_elts - index) * elt_size);

	memcpy_disk(base + (elt_size * index), elt, elt_size);
}

/*----------------------------------------------------------------*/

/* makes the assumption that no two keys are the same. */
static int bsearch(struct node *n, uint64_t key, int want_hi)
{
	int lo = -1, hi = le32_to_cpu(n->header.nr_entries);

	while (hi - lo > 1) {
		int mid = lo + ((hi - lo) / 2);
		uint64_t mid_key = le64_to_cpu(n->keys[mid]);

		if (mid_key == key)
			return mid;

		if (mid_key < key)
			lo = mid;
		else
			hi = mid;
	}

	return want_hi ? hi : lo;
}

int lower_bound(struct node *n, uint64_t key)
{
	return bsearch(n, key, 0);
}

void inc_children(struct dm_transaction_manager *tm, struct node *n,
		  struct dm_btree_value_type *vt)
{
	unsigned i;
	uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);

	if (le32_to_cpu(n->header.flags) & INTERNAL_NODE)
		for (i = 0; i < nr_entries; i++)
			dm_tm_inc(tm, value64(n, i));
	else if (vt->inc)
		for (i = 0; i < nr_entries; i++)
			vt->inc(vt->context, value_ptr(n, i));
}

static int insert_at(size_t value_size, struct node *node, unsigned index,
		     uint64_t key, void *value)
	__dm_written_to_disk(value)
{
	uint32_t nr_entries = le32_to_cpu(node->header.nr_entries);
	__le64 key_le = cpu_to_le64(key);

	if (index > nr_entries ||
	    index >= le32_to_cpu(node->header.max_entries)) {
		DMERR("too many entries in btree node for insert");
		__dm_unbless_for_disk(value);
		return -ENOMEM;
	}

	__dm_bless_for_disk(&key_le);

	array_insert(node->keys, sizeof(*node->keys), nr_entries, index, &key_le);
	array_insert(value_base(node), value_size, nr_entries, index, value);
	node->header.nr_entries = cpu_to_le32(nr_entries + 1);

	return 0;
}

/*----------------------------------------------------------------*/

/*
 * We want 3n entries (for some n).  This works more nicely for repeated
 * insert remove loops than (2n + 1).
 */
static uint32_t calc_max_entries(size_t value_size, size_t block_size)
{
	uint32_t total, n;
	size_t elt_size = sizeof(uint64_t) + value_size; /* key + value */

	block_size -= sizeof(struct node_header);
	total = block_size / elt_size;
	n = total / 3;		/* rounds down */

	return 3 * n;
}

int dm_btree_empty(struct dm_btree_info *info, dm_block_t *root)
{
	int r;
	struct dm_block *b;
	struct node *n;
	size_t block_size;
	uint32_t max_entries;

	r = new_block(info, &b);
	if (r < 0)
		return r;

	block_size = dm_bm_block_size(dm_tm_get_bm(info->tm));
	max_entries = calc_max_entries(info->value_type.size, block_size);

	n = dm_block_data(b);
	memset(n, 0, block_size);
	n->header.flags = cpu_to_le32(LEAF_NODE);
	n->header.nr_entries = cpu_to_le32(0);
	n->header.max_entries = cpu_to_le32(max_entries);
	n->header.value_size = cpu_to_le32(info->value_type.size);

	*root = dm_block_location(b);
	return unlock_block(info, b);
}
EXPORT_SYMBOL_GPL(dm_btree_empty);

/*----------------------------------------------------------------*/

/*
 * Deletion uses a recursive algorithm; since we have limited stack space
 * we explicitly manage our own stack on the heap.
 */
#define MAX_SPINE_DEPTH 64
struct frame {
	struct dm_block *b;
	struct node *n;
	unsigned level;
	unsigned nr_children;
	unsigned current_child;
};

struct del_stack {
	struct dm_transaction_manager *tm;
	int top;
	struct frame spine[MAX_SPINE_DEPTH];
};

static int top_frame(struct del_stack *s, struct frame **f)
{
	if (s->top < 0) {
		DMERR("btree deletion stack empty");
		return -EINVAL;
	}

	*f = s->spine + s->top;

	return 0;
}

static int unprocessed_frames(struct del_stack *s)
{
	return s->top >= 0;
}

static int push_frame(struct del_stack *s, dm_block_t b, unsigned level)
{
	int r;
	uint32_t ref_count;

	if (s->top >= MAX_SPINE_DEPTH - 1) {
		DMERR("btree deletion stack out of memory");
		return -ENOMEM;
	}

	r = dm_tm_ref(s->tm, b, &ref_count);
	if (r)
		return r;

	if (ref_count > 1)
		/*
		 * This is a shared node, so we can just decrement its
		 * reference counter and leave the children.
		 */
		dm_tm_dec(s->tm, b);

	else {
		struct frame *f = s->spine + ++s->top;

		r = dm_tm_read_lock(s->tm, b, &btree_node_validator, &f->b);
		if (r) {
			s->top--;
			return r;
		}

		f->n = dm_block_data(f->b);
		f->level = level;
		f->nr_children = le32_to_cpu(f->n->header.nr_entries);
		f->current_child = 0;
	}

	return 0;
}

static void pop_frame(struct del_stack *s)
{
	struct frame *f = s->spine + s->top--;

	dm_tm_dec(s->tm, dm_block_location(f->b));
	dm_tm_unlock(s->tm, f->b);
}

int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
{
	int r;
	struct del_stack *s;

	s = kmalloc(sizeof(*s), GFP_KERNEL);
	if (!s)
		return -ENOMEM;
	s->tm = info->tm;
	s->top = -1;

	r = push_frame(s, root, 1);
	if (r)
		goto out;

	while (unprocessed_frames(s)) {
		uint32_t flags;
		struct frame *f;
		dm_block_t b;

		r = top_frame(s, &f);
		if (r)
			goto out;

		if (f->current_child >= f->nr_children) {
			pop_frame(s);
			continue;
		}

		flags = le32_to_cpu(f->n->header.flags);
		if (flags & INTERNAL_NODE) {
			b = value64(f->n, f->current_child);
			f->current_child++;
			r = push_frame(s, b, f->level);
			if (r)
				goto out;

		} else if (f->level != (info->levels - 1)) {
			b = value64(f->n, f->current_child);
			f->current_child++;
			r = push_frame(s, b, f->level + 1);
			if (r)
				goto out;

		} else {
			if (info->value_type.dec) {
				unsigned i;

				for (i = 0; i < f->nr_children; i++)
					info->value_type.dec(info->value_type.context,
							     value_ptr(f->n, i));
			}
			f->current_child = f->nr_children;
		}
	}

out:
	kfree(s);
	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_del);

/*----------------------------------------------------------------*/

static int btree_lookup_raw(struct ro_spine *s, dm_block_t block, uint64_t key,
			    int (*search_fn)(struct node *, uint64_t),
			    uint64_t *result_key, void *v, size_t value_size)
{
	int i, r;
	uint32_t flags, nr_entries;

	do {
		r = ro_step(s, block);
		if (r < 0)
			return r;

		i = search_fn(ro_node(s), key);

		flags = le32_to_cpu(ro_node(s)->header.flags);
		nr_entries = le32_to_cpu(ro_node(s)->header.nr_entries);
		if (i < 0 || i >= nr_entries)
			return -ENODATA;

		if (flags & INTERNAL_NODE)
			block = value64(ro_node(s), i);

	} while (!(flags & LEAF_NODE));

	*result_key = le64_to_cpu(ro_node(s)->keys[i]);
	memcpy(v, value_ptr(ro_node(s), i), value_size);

	return 0;
}

int dm_btree_lookup(struct dm_btree_info *info, dm_block_t root,
		    uint64_t *keys, void *value_le)
{
	unsigned level, last_level = info->levels - 1;
	int r = -ENODATA;
	uint64_t rkey;
	__le64 internal_value_le;
	struct ro_spine spine;

	init_ro_spine(&spine, info);
	for (level = 0; level < info->levels; level++) {
		size_t size;
		void *value_p;

		if (level == last_level) {
			value_p = value_le;
			size = info->value_type.size;

		} else {
			value_p = &internal_value_le;
			size = sizeof(uint64_t);
		}

		r = btree_lookup_raw(&spine, root, keys[level],
				     lower_bound, &rkey,
				     value_p, size);

		if (!r) {
			if (rkey != keys[level]) {
				exit_ro_spine(&spine);
				return -ENODATA;
			}
		} else {
			exit_ro_spine(&spine);
			return r;
		}

		root = le64_to_cpu(internal_value_le);
	}
	exit_ro_spine(&spine);

	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_lookup);

/*
 * Splits a node by creating a sibling node and shifting half the node's
 * contents across.  Assumes there is a parent node, and it has room for
 * another child.
 *
 * Before:
 *        +--------+
 *        | Parent |
 *        +--------+
 *            |
 *            v
 *      +----------+
 *      | A ++++++ |
 *      +----------+
 *
 *
 * After:
 *        +--------+
 *        | Parent |
 *        +--------+
 *          |    |
 *          v    +------+
 *    +---------+       |
 *    | A* +++  |       v
 *    +---------+   +-------+
 *                  | B +++ |
 *                  +-------+
 *
 * Where A* is a shadow of A.
 */
static int btree_split_sibling(struct shadow_spine *s, dm_block_t root,
			       unsigned parent_index, uint64_t key)
{
	int r;
	size_t size;
	unsigned nr_left, nr_right;
	struct dm_block *left, *right, *parent;
	struct node *ln, *rn, *pn;
	__le64 location;

	left = shadow_current(s);

	r = new_block(s->info, &right);
	if (r < 0)
		return r;

	ln = dm_block_data(left);
	rn = dm_block_data(right);

	nr_left = le32_to_cpu(ln->header.nr_entries) / 2;
	nr_right = le32_to_cpu(ln->header.nr_entries) - nr_left;

	ln->header.nr_entries = cpu_to_le32(nr_left);

	rn->header.flags = ln->header.flags;
	rn->header.nr_entries = cpu_to_le32(nr_right);
	rn->header.max_entries = ln->header.max_entries;
	rn->header.value_size = ln->header.value_size;
	memcpy(rn->keys, ln->keys + nr_left, nr_right * sizeof(rn->keys[0]));

	size = le32_to_cpu(ln->header.flags) & INTERNAL_NODE ?
		sizeof(uint64_t) : s->info->value_type.size;
	memcpy(value_ptr(rn, 0), value_ptr(ln, nr_left),
	       size * nr_right);

	/*
	 * Patch up the parent
	 */
	parent = shadow_parent(s);

	pn = dm_block_data(parent);
	location = cpu_to_le64(dm_block_location(left));
	__dm_bless_for_disk(&location);
	memcpy_disk(value_ptr(pn, parent_index),
		    &location, sizeof(__le64));

	location = cpu_to_le64(dm_block_location(right));
	__dm_bless_for_disk(&location);

	r = insert_at(sizeof(__le64), pn, parent_index + 1,
		      le64_to_cpu(rn->keys[0]), &location);
	if (r)
		return r;

	if (key < le64_to_cpu(rn->keys[0])) {
		unlock_block(s->info, right);
		s->nodes[1] = left;
	} else {
		unlock_block(s->info, left);
		s->nodes[1] = right;
	}

	return 0;
}

/*
 * Splits a node by creating two new children beneath the given node.
 *
 * Before:
 *        +----------+
 *        | A ++++++ |
 *        +----------+
 *
 *
 * After:
 *      +------------+
 *      | A (shadow) |
 *      +------------+
 *          |      |
 *   +------+      +----+
 *   |                  |
 *   v                  v
 * +-------+        +-------+
 * | B +++ |        | C +++ |
 * +-------+        +-------+
 */
static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
{
	int r;
	size_t size;
	unsigned nr_left, nr_right;
	struct dm_block *left, *right, *new_parent;
	struct node *pn, *ln, *rn;
	__le64 val;

	new_parent = shadow_current(s);

	r = new_block(s->info, &left);
	if (r < 0)
		return r;

	r = new_block(s->info, &right);
	if (r < 0) {
		/* FIXME: put left */
		return r;
	}

	pn = dm_block_data(new_parent);
	ln = dm_block_data(left);
	rn = dm_block_data(right);

	nr_left = le32_to_cpu(pn->header.nr_entries) / 2;
	nr_right = le32_to_cpu(pn->header.nr_entries) - nr_left;

	ln->header.flags = pn->header.flags;
	ln->header.nr_entries = cpu_to_le32(nr_left);
	ln->header.max_entries = pn->header.max_entries;
	ln->header.value_size = pn->header.value_size;

	rn->header.flags = pn->header.flags;
	rn->header.nr_entries = cpu_to_le32(nr_right);
	rn->header.max_entries = pn->header.max_entries;
	rn->header.value_size = pn->header.value_size;

	memcpy(ln->keys, pn->keys, nr_left * sizeof(pn->keys[0]));
	memcpy(rn->keys, pn->keys + nr_left, nr_right * sizeof(pn->keys[0]));

	size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ?
		sizeof(__le64) : s->info->value_type.size;
	memcpy(value_ptr(ln, 0), value_ptr(pn, 0), nr_left * size);
	memcpy(value_ptr(rn, 0), value_ptr(pn, nr_left),
	       nr_right * size);

	/* new_parent should just point to l and r now */
	pn->header.flags = cpu_to_le32(INTERNAL_NODE);
	pn->header.nr_entries = cpu_to_le32(2);
	pn->header.max_entries = cpu_to_le32(
		calc_max_entries(sizeof(__le64),
				 dm_bm_block_size(
					 dm_tm_get_bm(s->info->tm))));
	pn->header.value_size = cpu_to_le32(sizeof(__le64));

	val = cpu_to_le64(dm_block_location(left));
	__dm_bless_for_disk(&val);
	pn->keys[0] = ln->keys[0];
	memcpy_disk(value_ptr(pn, 0), &val, sizeof(__le64));

	val = cpu_to_le64(dm_block_location(right));
	__dm_bless_for_disk(&val);
	pn->keys[1] = rn->keys[0];
	memcpy_disk(value_ptr(pn, 1), &val, sizeof(__le64));

	/*
	 * Rejig the spine.  This is ugly, since it knows too much about
	 * the spine.
	 */
	if (s->nodes[0] != new_parent) {
		unlock_block(s->info, s->nodes[0]);
		s->nodes[0] = new_parent;
	}
	if (key < le64_to_cpu(rn->keys[0])) {
		unlock_block(s->info, right);
		s->nodes[1] = left;
	} else {
		unlock_block(s->info, left);
		s->nodes[1] = right;
	}
	s->count = 2;

	return 0;
}

static int btree_insert_raw(struct shadow_spine *s, dm_block_t root,
			    struct dm_btree_value_type *vt,
			    uint64_t key, unsigned *index)
{
	int r, i = *index, top = 1;
	struct node *node;

	for (;;) {
		r = shadow_step(s, root, vt);
		if (r < 0)
			return r;

		node = dm_block_data(shadow_current(s));

		/*
		 * We have to patch up the parent node; ugly, but I don't
		 * see a way to do this automatically as part of the spine
		 * op.
		 */
		if (shadow_has_parent(s) && i >= 0) {	/* FIXME: second clause unnecessary */
			__le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));

			__dm_bless_for_disk(&location);
			memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i),
				    &location, sizeof(__le64));
		}

		node = dm_block_data(shadow_current(s));

		if (node->header.nr_entries == node->header.max_entries) {
			if (top)
				r = btree_split_beneath(s, key);
			else
				r = btree_split_sibling(s, root, i, key);

			if (r < 0)
				return r;
		}

		node = dm_block_data(shadow_current(s));

		i = lower_bound(node, key);

		if (le32_to_cpu(node->header.flags) & LEAF_NODE)
			break;

		if (i < 0) {
			/* change the bounds on the lowest key */
			node->keys[0] = cpu_to_le64(key);
			i = 0;
		}

		root = value64(node, i);
		top = 0;
	}

	if (i < 0 || le64_to_cpu(node->keys[i]) != key)
		i++;

	*index = i;
	return 0;
}

static int insert(struct dm_btree_info *info, dm_block_t root,
		  uint64_t *keys, void *value, dm_block_t *new_root,
		  int *inserted)
	__dm_written_to_disk(value)
{
	int r, need_insert;
	unsigned level, index = -1, last_level = info->levels - 1;
	dm_block_t block = root;
	struct shadow_spine spine;
	struct node *n;
	struct dm_btree_value_type le64_type;

	le64_type.context = NULL;
	le64_type.size = sizeof(__le64);
	le64_type.inc = NULL;
	le64_type.dec = NULL;
	le64_type.equal = NULL;

	init_shadow_spine(&spine, info);

	for (level = 0; level < (info->levels - 1); level++) {
		r = btree_insert_raw(&spine, block, &le64_type, keys[level], &index);
		if (r < 0)
			goto bad;

		n = dm_block_data(shadow_current(&spine));
		need_insert = ((index >= le32_to_cpu(n->header.nr_entries)) ||
			       (le64_to_cpu(n->keys[index]) != keys[level]));

		if (need_insert) {
			dm_block_t new_tree;
			__le64 new_le;

			r = dm_btree_empty(info, &new_tree);
			if (r < 0)
				goto bad;

			new_le = cpu_to_le64(new_tree);
			__dm_bless_for_disk(&new_le);

			r = insert_at(sizeof(uint64_t), n, index,
				      keys[level], &new_le);
			if (r)
				goto bad;
		}

		if (level < last_level)
			block = value64(n, index);
	}

	r = btree_insert_raw(&spine, block, &info->value_type,
			     keys[level], &index);
	if (r < 0)
		goto bad;

	n = dm_block_data(shadow_current(&spine));
	need_insert = ((index >= le32_to_cpu(n->header.nr_entries)) ||
		       (le64_to_cpu(n->keys[index]) != keys[level]));

	if (need_insert) {
		if (inserted)
			*inserted = 1;

		r = insert_at(info->value_type.size, n, index,
			      keys[level], value);
		if (r)
			goto bad_unblessed;
	} else {
		if (inserted)
			*inserted = 0;

		if (info->value_type.dec &&
		    (!info->value_type.equal ||
		     !info->value_type.equal(
			     info->value_type.context,
			     value_ptr(n, index),
			     value))) {
			info->value_type.dec(info->value_type.context,
					     value_ptr(n, index));
		}
		memcpy_disk(value_ptr(n, index),
			    value, info->value_type.size);
	}

	*new_root = shadow_root(&spine);
	exit_shadow_spine(&spine);

	return 0;

bad:
	__dm_unbless_for_disk(value);
bad_unblessed:
	exit_shadow_spine(&spine);
	return r;
}

int dm_btree_insert(struct dm_btree_info *info, dm_block_t root,
		    uint64_t *keys, void *value, dm_block_t *new_root)
	__dm_written_to_disk(value)
{
	return insert(info, root, keys, value, new_root, NULL);
}
EXPORT_SYMBOL_GPL(dm_btree_insert);

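/*
 * Like dm_btree_insert(), but also reports through *inserted whether a new
 * entry was created (1) or an existing value for the key was overwritten (0).
 */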
int dm_btree_insert_notify(struct dm_btree_info *info, dm_block_t root,
			   uint64_t *keys, void *value, dm_block_t *new_root,
			   int *inserted)
	__dm_written_to_disk(value)
{
	return insert(info, root, keys, value, new_root, inserted);
}
EXPORT_SYMBOL_GPL(dm_btree_insert_notify);

/*----------------------------------------------------------------*/

static int find_highest_key(struct ro_spine *s, dm_block_t block,
			    uint64_t *result_key, dm_block_t *next_block)
{
	int i, r;
	uint32_t flags;

	do {
		r = ro_step(s, block);
		if (r < 0)
			return r;

		flags = le32_to_cpu(ro_node(s)->header.flags);
		i = le32_to_cpu(ro_node(s)->header.nr_entries);
		if (!i)
			return -ENODATA;
		else
			i--;

		*result_key = le64_to_cpu(ro_node(s)->keys[i]);
		if (next_block || flags & INTERNAL_NODE)
			block = value64(ro_node(s), i);

	} while (flags & INTERNAL_NODE);

	if (next_block)
		*next_block = block;
	return 0;
}

int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root,
			      uint64_t *result_keys)
{
	int r = 0, count = 0, level;
	struct ro_spine spine;

	init_ro_spine(&spine, info);
	for (level = 0; level < info->levels; level++) {
		r = find_highest_key(&spine, root, result_keys + level,
				     level == info->levels - 1 ? NULL : &root);
		if (r == -ENODATA) {
			r = 0;
			break;

		} else if (r)
			break;

		count++;
	}
	exit_ro_spine(&spine);

	return r ? r : count;
}
EXPORT_SYMBOL_GPL(dm_btree_find_highest_key);
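
/*
 * Illustrative sketch (not part of this interface) of how a caller might use
 * these entry points to build a single-level btree mapping 64-bit keys to
 * __le64 values.  The transaction manager 'tm' is assumed to have been set up
 * elsewhere and error handling is elided:
 *
 *	struct dm_btree_info info = {
 *		.tm = tm,
 *		.levels = 1,
 *		.value_type = {
 *			.context = NULL,
 *			.size = sizeof(__le64),
 *			.inc = NULL,
 *			.dec = NULL,
 *			.equal = NULL,
 *		},
 *	};
 *	dm_block_t root;
 *	uint64_t key = 1;
 *	__le64 value = cpu_to_le64(100), result;
 *
 *	r = dm_btree_empty(&info, &root);
 *	__dm_bless_for_disk(&value);
 *	r = dm_btree_insert(&info, root, &key, &value, &root);
 *	r = dm_btree_lookup(&info, root, &key, &result);
 */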