/*
 * Copyright (C) 2011 Red Hat, Inc.
 *
 * This file is released under the GPL.
 */

#include "dm-btree-internal.h"
#include "dm-space-map.h"
#include "dm-transaction-manager.h"

#include <linux/export.h>
#include <linux/device-mapper.h>

#define DM_MSG_PREFIX "btree"

/*----------------------------------------------------------------
 * Array manipulation
 *--------------------------------------------------------------*/
static void memcpy_disk(void *dest, const void *src, size_t len)
	__dm_written_to_disk(src)
{
	memcpy(dest, src, len);
	__dm_unbless_for_disk(src);
}

static void array_insert(void *base, size_t elt_size, unsigned nr_elts,
			 unsigned index, void *elt)
	__dm_written_to_disk(elt)
{
	if (index < nr_elts)
		memmove(base + (elt_size * (index + 1)),
			base + (elt_size * index),
			(nr_elts - index) * elt_size);

	memcpy_disk(base + (elt_size * index), elt, elt_size);
}

/*----------------------------------------------------------------*/

/* makes the assumption that no two keys are the same. */
static int bsearch(struct node *n, uint64_t key, int want_hi)
{
	int lo = -1, hi = le32_to_cpu(n->header.nr_entries);

	while (hi - lo > 1) {
		int mid = lo + ((hi - lo) / 2);
		uint64_t mid_key = le64_to_cpu(n->keys[mid]);

		if (mid_key == key)
			return mid;

		if (mid_key < key)
			lo = mid;
		else
			hi = mid;
	}

	return want_hi ? hi : lo;
}

int lower_bound(struct node *n, uint64_t key)
{
	return bsearch(n, key, 0);
}

void inc_children(struct dm_transaction_manager *tm, struct node *n,
		  struct dm_btree_value_type *vt)
{
	unsigned i;
	uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);

	if (le32_to_cpu(n->header.flags) & INTERNAL_NODE)
		for (i = 0; i < nr_entries; i++)
			dm_tm_inc(tm, value64(n, i));
	else if (vt->inc)
		for (i = 0; i < nr_entries; i++)
			vt->inc(vt->context,
				value_ptr(n, i, vt->size));
}

static int insert_at(size_t value_size, struct node *node, unsigned index,
		     uint64_t key, void *value)
	__dm_written_to_disk(value)
{
	uint32_t nr_entries = le32_to_cpu(node->header.nr_entries);
	__le64 key_le = cpu_to_le64(key);

	if (index > nr_entries ||
	    index >= le32_to_cpu(node->header.max_entries)) {
		DMERR("too many entries in btree node for insert");
		__dm_unbless_for_disk(value);
		return -ENOMEM;
	}

	__dm_bless_for_disk(&key_le);

	array_insert(node->keys, sizeof(*node->keys), nr_entries, index, &key_le);
	array_insert(value_base(node), value_size, nr_entries, index, value);
	node->header.nr_entries = cpu_to_le32(nr_entries + 1);

	return 0;
}

/*----------------------------------------------------------------*/

/*
 * We want 3n entries (for some n).  This works more nicely for repeated
 * insert remove loops than (2n + 1).
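 *
 * For example (illustrative numbers only, the real values depend on the
 * block size and value size in use): with a 4096 byte block, a 32 byte
 * node header and 8 byte values, elt_size is 16, total is
 * (4096 - 32) / 16 = 254 and we end up with max_entries = 3 * (254 / 3) = 252.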
 */
static uint32_t calc_max_entries(size_t value_size, size_t block_size)
{
	uint32_t total, n;
	size_t elt_size = sizeof(uint64_t) + value_size; /* key + value */

	block_size -= sizeof(struct node_header);
	total = block_size / elt_size;
	n = total / 3;		/* rounds down */

	return 3 * n;
}

int dm_btree_empty(struct dm_btree_info *info, dm_block_t *root)
{
	int r;
	struct dm_block *b;
	struct node *n;
	size_t block_size;
	uint32_t max_entries;

	r = new_block(info, &b);
	if (r < 0)
		return r;

	block_size = dm_bm_block_size(dm_tm_get_bm(info->tm));
	max_entries = calc_max_entries(info->value_type.size, block_size);

	n = dm_block_data(b);
	memset(n, 0, block_size);
	n->header.flags = cpu_to_le32(LEAF_NODE);
	n->header.nr_entries = cpu_to_le32(0);
	n->header.max_entries = cpu_to_le32(max_entries);
	n->header.value_size = cpu_to_le32(info->value_type.size);

	*root = dm_block_location(b);
	return unlock_block(info, b);
}
EXPORT_SYMBOL_GPL(dm_btree_empty);

/*----------------------------------------------------------------*/

/*
 * Deletion uses a recursive algorithm; since we have limited stack space
 * we explicitly manage our own stack on the heap.
 */
#define MAX_SPINE_DEPTH 64
struct frame {
	struct dm_block *b;
	struct node *n;
	unsigned level;
	unsigned nr_children;
	unsigned current_child;
};

struct del_stack {
	struct dm_transaction_manager *tm;
	int top;
	struct frame spine[MAX_SPINE_DEPTH];
};

static int top_frame(struct del_stack *s, struct frame **f)
{
	if (s->top < 0) {
		DMERR("btree deletion stack empty");
		return -EINVAL;
	}

	*f = s->spine + s->top;

	return 0;
}

static int unprocessed_frames(struct del_stack *s)
{
	return s->top >= 0;
}

static int push_frame(struct del_stack *s, dm_block_t b, unsigned level)
{
	int r;
	uint32_t ref_count;

	if (s->top >= MAX_SPINE_DEPTH - 1) {
		DMERR("btree deletion stack out of memory");
		return -ENOMEM;
	}

	r = dm_tm_ref(s->tm, b, &ref_count);
	if (r)
		return r;

	if (ref_count > 1)
		/*
		 * This is a shared node, so we can just decrement its
		 * reference counter and leave the children.
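		 * The children are still referenced by whoever else holds
		 * this node, so their reference counts must be left alone.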
		 */
		dm_tm_dec(s->tm, b);

	else {
		struct frame *f = s->spine + ++s->top;

		r = dm_tm_read_lock(s->tm, b, &btree_node_validator, &f->b);
		if (r) {
			s->top--;
			return r;
		}

		f->n = dm_block_data(f->b);
		f->level = level;
		f->nr_children = le32_to_cpu(f->n->header.nr_entries);
		f->current_child = 0;
	}

	return 0;
}

static void pop_frame(struct del_stack *s)
{
	struct frame *f = s->spine + s->top--;

	dm_tm_dec(s->tm, dm_block_location(f->b));
	dm_tm_unlock(s->tm, f->b);
}

int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
{
	int r;
	struct del_stack *s;

	s = kmalloc(sizeof(*s), GFP_KERNEL);
	if (!s)
		return -ENOMEM;
	s->tm = info->tm;
	s->top = -1;

	r = push_frame(s, root, 1);
	if (r)
		goto out;

	while (unprocessed_frames(s)) {
		uint32_t flags;
		struct frame *f;
		dm_block_t b;

		r = top_frame(s, &f);
		if (r)
			goto out;

		if (f->current_child >= f->nr_children) {
			pop_frame(s);
			continue;
		}

		flags = le32_to_cpu(f->n->header.flags);
		if (flags & INTERNAL_NODE) {
			b = value64(f->n, f->current_child);
			f->current_child++;
			r = push_frame(s, b, f->level);
			if (r)
				goto out;

		} else if (f->level != (info->levels - 1)) {
			b = value64(f->n, f->current_child);
			f->current_child++;
			r = push_frame(s, b, f->level + 1);
			if (r)
				goto out;

		} else {
			if (info->value_type.dec) {
				unsigned i;

				for (i = 0; i < f->nr_children; i++)
					info->value_type.dec(info->value_type.context,
							     value_ptr(f->n, i, info->value_type.size));
			}
			f->current_child = f->nr_children;
		}
	}

out:
	kfree(s);
	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_del);

/*----------------------------------------------------------------*/

static int btree_lookup_raw(struct ro_spine *s, dm_block_t block, uint64_t key,
			    int (*search_fn)(struct node *, uint64_t),
			    uint64_t *result_key, void *v, size_t value_size)
{
	int i, r;
	uint32_t flags, nr_entries;

	do {
		r = ro_step(s, block);
		if (r < 0)
			return r;

		i = search_fn(ro_node(s), key);

		flags = le32_to_cpu(ro_node(s)->header.flags);
		nr_entries = le32_to_cpu(ro_node(s)->header.nr_entries);
		if (i < 0 || i >= nr_entries)
			return -ENODATA;

		if (flags & INTERNAL_NODE)
			block = value64(ro_node(s), i);

	} while (!(flags & LEAF_NODE));

	*result_key = le64_to_cpu(ro_node(s)->keys[i]);
	memcpy(v, value_ptr(ro_node(s), i, value_size), value_size);

	return 0;
}

int dm_btree_lookup(struct dm_btree_info *info, dm_block_t root,
		    uint64_t *keys, void *value_le)
{
	unsigned level, last_level = info->levels - 1;
	int r = -ENODATA;
	uint64_t rkey;
	__le64 internal_value_le;
	struct ro_spine spine;

	init_ro_spine(&spine, info);
	for (level = 0; level < info->levels; level++) {
		size_t size;
		void *value_p;

		if (level == last_level) {
			value_p = value_le;
			size = info->value_type.size;

		} else {
			value_p = &internal_value_le;
			size = sizeof(uint64_t);
		}

		r = btree_lookup_raw(&spine, root, keys[level],
				     lower_bound, &rkey,
				     value_p, size);

		if (!r) {
			if (rkey != keys[level]) {
				exit_ro_spine(&spine);
				return -ENODATA;
			}
		} else {
			exit_ro_spine(&spine);
			return r;
		}

		root = le64_to_cpu(internal_value_le);
	}
	exit_ro_spine(&spine);

	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_lookup);

/*
 * Splits a node by creating a sibling node and shifting half the node's
 * contents across.  Assumes there is a parent node, and it has room for
 * another child.
 *
 * Before:
 *        +--------+
 *        | Parent |
 *        +--------+
 *            |
 *            v
 *      +----------+
 *      | A ++++++ |
 *      +----------+
 *
 *
 * After:
 *            +--------+
 *            | Parent |
 *            +--------+
 *              |    |
 *              v    +------+
 *       +---------+        |
 *       | A* +++  |        v
 *       +---------+   +-------+
 *                     | B +++ |
 *                     +-------+
 *
 * Where A* is a shadow of A.
 */
static int btree_split_sibling(struct shadow_spine *s, dm_block_t root,
			       unsigned parent_index, uint64_t key)
{
	int r;
	size_t size;
	unsigned nr_left, nr_right;
	struct dm_block *left, *right, *parent;
	struct node *ln, *rn, *pn;
	__le64 location;

	left = shadow_current(s);

	r = new_block(s->info, &right);
	if (r < 0)
		return r;

	ln = dm_block_data(left);
	rn = dm_block_data(right);

	nr_left = le32_to_cpu(ln->header.nr_entries) / 2;
	nr_right = le32_to_cpu(ln->header.nr_entries) - nr_left;

	ln->header.nr_entries = cpu_to_le32(nr_left);

	rn->header.flags = ln->header.flags;
	rn->header.nr_entries = cpu_to_le32(nr_right);
	rn->header.max_entries = ln->header.max_entries;
	rn->header.value_size = ln->header.value_size;
	memcpy(rn->keys, ln->keys + nr_left, nr_right * sizeof(rn->keys[0]));

	size = le32_to_cpu(ln->header.flags) & INTERNAL_NODE ?
		sizeof(uint64_t) : s->info->value_type.size;
	memcpy(value_ptr(rn, 0, size), value_ptr(ln, nr_left, size),
	       size * nr_right);

	/*
	 * Patch up the parent
	 */
	parent = shadow_parent(s);

	pn = dm_block_data(parent);
	location = cpu_to_le64(dm_block_location(left));
	__dm_bless_for_disk(&location);
	memcpy_disk(value_ptr(pn, parent_index, sizeof(__le64)),
		    &location, sizeof(__le64));

	location = cpu_to_le64(dm_block_location(right));
	__dm_bless_for_disk(&location);

	r = insert_at(sizeof(__le64), pn, parent_index + 1,
		      le64_to_cpu(rn->keys[0]), &location);
	if (r)
		return r;

	if (key < le64_to_cpu(rn->keys[0])) {
		unlock_block(s->info, right);
		s->nodes[1] = left;
	} else {
		unlock_block(s->info, left);
		s->nodes[1] = right;
	}

	return 0;
}

/*
 * Splits a node by creating two new children beneath the given node.
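 * This is used when the top node of a btree level fills up: the original
 * block stays in place and becomes the new internal parent, so existing
 * references to it remain valid.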
 *
 * Before:
 *        +----------+
 *        | A ++++++ |
 *        +----------+
 *
 *
 * After:
 *       +------------+
 *       | A (shadow) |
 *       +------------+
 *          |       |
 *   +------+       +----+
 *   |                   |
 *   v                   v
 * +-------+         +-------+
 * | B +++ |         | C +++ |
 * +-------+         +-------+
 */
static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
{
	int r;
	size_t size;
	unsigned nr_left, nr_right;
	struct dm_block *left, *right, *new_parent;
	struct node *pn, *ln, *rn;
	__le64 val;

	new_parent = shadow_current(s);

	r = new_block(s->info, &left);
	if (r < 0)
		return r;

	r = new_block(s->info, &right);
	if (r < 0) {
		/* FIXME: put left */
		return r;
	}

	pn = dm_block_data(new_parent);
	ln = dm_block_data(left);
	rn = dm_block_data(right);

	nr_left = le32_to_cpu(pn->header.nr_entries) / 2;
	nr_right = le32_to_cpu(pn->header.nr_entries) - nr_left;

	ln->header.flags = pn->header.flags;
	ln->header.nr_entries = cpu_to_le32(nr_left);
	ln->header.max_entries = pn->header.max_entries;
	ln->header.value_size = pn->header.value_size;

	rn->header.flags = pn->header.flags;
	rn->header.nr_entries = cpu_to_le32(nr_right);
	rn->header.max_entries = pn->header.max_entries;
	rn->header.value_size = pn->header.value_size;

	memcpy(ln->keys, pn->keys, nr_left * sizeof(pn->keys[0]));
	memcpy(rn->keys, pn->keys + nr_left, nr_right * sizeof(pn->keys[0]));

	size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ?
		sizeof(__le64) : s->info->value_type.size;
	memcpy(value_ptr(ln, 0, size), value_ptr(pn, 0, size), nr_left * size);
	memcpy(value_ptr(rn, 0, size), value_ptr(pn, nr_left, size),
	       nr_right * size);

	/* new_parent should just point to the two new children now */
	pn->header.flags = cpu_to_le32(INTERNAL_NODE);
	pn->header.nr_entries = cpu_to_le32(2);
	pn->header.max_entries = cpu_to_le32(
		calc_max_entries(sizeof(__le64),
				 dm_bm_block_size(
					 dm_tm_get_bm(s->info->tm))));
	pn->header.value_size = cpu_to_le32(sizeof(__le64));

	val = cpu_to_le64(dm_block_location(left));
	__dm_bless_for_disk(&val);
	pn->keys[0] = ln->keys[0];
	memcpy_disk(value_ptr(pn, 0, sizeof(__le64)), &val, sizeof(__le64));

	val = cpu_to_le64(dm_block_location(right));
	__dm_bless_for_disk(&val);
	pn->keys[1] = rn->keys[0];
	memcpy_disk(value_ptr(pn, 1, sizeof(__le64)), &val, sizeof(__le64));

	/*
	 * Rejig the spine.  This is ugly, since it knows too
	 * much about the spine.
	 */
	if (s->nodes[0] != new_parent) {
		unlock_block(s->info, s->nodes[0]);
		s->nodes[0] = new_parent;
	}
	if (key < le64_to_cpu(rn->keys[0])) {
		unlock_block(s->info, right);
		s->nodes[1] = left;
	} else {
		unlock_block(s->info, left);
		s->nodes[1] = right;
	}
	s->count = 2;

	return 0;
}

static int btree_insert_raw(struct shadow_spine *s, dm_block_t root,
			    struct dm_btree_value_type *vt,
			    uint64_t key, unsigned *index)
{
	int r, i = *index, top = 1;
	struct node *node;

	for (;;) {
		r = shadow_step(s, root, vt);
		if (r < 0)
			return r;

		node = dm_block_data(shadow_current(s));

		/*
		 * We have to patch up the parent node, ugly, but I don't
		 * see a way to do this automatically as part of the spine
		 * op.
		 */
		if (shadow_has_parent(s) && i >= 0) { /* FIXME: second clause unnecessary. */
			__le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));

			__dm_bless_for_disk(&location);
			memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i, sizeof(uint64_t)),
				    &location, sizeof(__le64));
		}

		node = dm_block_data(shadow_current(s));

		if (node->header.nr_entries == node->header.max_entries) {
			if (top)
				r = btree_split_beneath(s, key);
			else
				r = btree_split_sibling(s, root, i, key);

			if (r < 0)
				return r;
		}

		node = dm_block_data(shadow_current(s));

		i = lower_bound(node, key);

		if (le32_to_cpu(node->header.flags) & LEAF_NODE)
			break;

		if (i < 0) {
			/* change the bounds on the lowest key */
			node->keys[0] = cpu_to_le64(key);
			i = 0;
		}

		root = value64(node, i);
		top = 0;
	}

	if (i < 0 || le64_to_cpu(node->keys[i]) != key)
		i++;

	*index = i;
	return 0;
}

static int insert(struct dm_btree_info *info, dm_block_t root,
		  uint64_t *keys, void *value, dm_block_t *new_root,
		  int *inserted)
		  __dm_written_to_disk(value)
{
	int r, need_insert;
	unsigned level, index = -1, last_level = info->levels - 1;
	dm_block_t block = root;
	struct shadow_spine spine;
	struct node *n;
	struct dm_btree_value_type le64_type;

	le64_type.context = NULL;
	le64_type.size = sizeof(__le64);
	le64_type.inc = NULL;
	le64_type.dec = NULL;
	le64_type.equal = NULL;

	init_shadow_spine(&spine, info);

	for (level = 0; level < (info->levels - 1); level++) {
		r = btree_insert_raw(&spine, block, &le64_type, keys[level], &index);
		if (r < 0)
			goto bad;

		n = dm_block_data(shadow_current(&spine));
		need_insert = ((index >= le32_to_cpu(n->header.nr_entries)) ||
			       (le64_to_cpu(n->keys[index]) != keys[level]));

		if (need_insert) {
			dm_block_t new_tree;
			__le64 new_le;

			r = dm_btree_empty(info, &new_tree);
			if (r < 0)
				goto bad;

			new_le = cpu_to_le64(new_tree);
			__dm_bless_for_disk(&new_le);

			r = insert_at(sizeof(uint64_t), n, index,
				      keys[level], &new_le);
			if (r)
				goto bad;
		}

		if (level < last_level)
			block = value64(n, index);
	}

	r = btree_insert_raw(&spine, block, &info->value_type,
			     keys[level], &index);
	if (r < 0)
		goto bad;

	n = dm_block_data(shadow_current(&spine));
	need_insert = ((index >= le32_to_cpu(n->header.nr_entries)) ||
		       (le64_to_cpu(n->keys[index]) != keys[level]));

	if (need_insert) {
		if (inserted)
			*inserted = 1;

		r = insert_at(info->value_type.size, n, index,
			      keys[level], value);
		if (r)
			goto bad_unblessed;
	} else {
		if (inserted)
			*inserted = 0;

		if (info->value_type.dec &&
		    (!info->value_type.equal ||
		     !info->value_type.equal(
			     info->value_type.context,
			     value_ptr(n, index, info->value_type.size),
			     value))) {
			info->value_type.dec(info->value_type.context,
					     value_ptr(n, index, info->value_type.size));
		}
		memcpy_disk(value_ptr(n, index, info->value_type.size),
			    value, info->value_type.size);
	}

	*new_root = shadow_root(&spine);
	exit_shadow_spine(&spine);

	return 0;

bad:
	__dm_unbless_for_disk(value);
bad_unblessed:
	exit_shadow_spine(&spine);
	return r;
}
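/*
 * A minimal usage sketch (illustrative only, assuming a single-level tree
 * whose value_type holds __le64 values): the btree is copy-on-write, so on
 * success callers switch over to the root returned in *new_root.
 *
 *	uint64_t key = 1;
 *	__le64 value_le = cpu_to_le64(100);
 *	dm_block_t new_root;
 *
 *	__dm_bless_for_disk(&value_le);
 *	r = dm_btree_insert(info, root, &key, &value_le, &new_root);
 *	if (!r)
 *		root = new_root;
 */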
int dm_btree_insert(struct dm_btree_info *info, dm_block_t root,
		    uint64_t *keys, void *value, dm_block_t *new_root)
		    __dm_written_to_disk(value)
{
	return insert(info, root, keys, value, new_root, NULL);
}
EXPORT_SYMBOL_GPL(dm_btree_insert);

int dm_btree_insert_notify(struct dm_btree_info *info, dm_block_t root,
			   uint64_t *keys, void *value, dm_block_t *new_root,
			   int *inserted)
			   __dm_written_to_disk(value)
{
	return insert(info, root, keys, value, new_root, inserted);
}
EXPORT_SYMBOL_GPL(dm_btree_insert_notify);

/*----------------------------------------------------------------*/

static int find_highest_key(struct ro_spine *s, dm_block_t block,
			    uint64_t *result_key, dm_block_t *next_block)
{
	int i, r;
	uint32_t flags;

	do {
		r = ro_step(s, block);
		if (r < 0)
			return r;

		flags = le32_to_cpu(ro_node(s)->header.flags);
		i = le32_to_cpu(ro_node(s)->header.nr_entries);
		if (!i)
			return -ENODATA;
		else
			i--;

		*result_key = le64_to_cpu(ro_node(s)->keys[i]);
		if (next_block || flags & INTERNAL_NODE)
			block = value64(ro_node(s), i);

	} while (flags & INTERNAL_NODE);

	if (next_block)
		*next_block = block;
	return 0;
}

int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root,
			      uint64_t *result_keys)
{
	int r = 0, count = 0, level;
	struct ro_spine spine;

	init_ro_spine(&spine, info);
	for (level = 0; level < info->levels; level++) {
		r = find_highest_key(&spine, root, result_keys + level,
				     level == info->levels - 1 ? NULL : &root);
		if (r == -ENODATA) {
			r = 0;
			break;

		} else if (r)
			break;

		count++;
	}
	exit_ro_spine(&spine);

	return r ? r : count;
}
EXPORT_SYMBOL_GPL(dm_btree_find_highest_key);