1 /* 2 * Copyright (C) 2011 Red Hat, Inc. 3 * 4 * This file is released under the GPL. 5 */ 6 7 #include "dm-btree.h" 8 #include "dm-btree-internal.h" 9 #include "dm-transaction-manager.h" 10 11 #include <linux/export.h> 12 13 /* 14 * Removing an entry from a btree 15 * ============================== 16 * 17 * A very important constraint for our btree is that no node, except the 18 * root, may have fewer than a certain number of entries. 19 * (MIN_ENTRIES <= nr_entries <= MAX_ENTRIES). 20 * 21 * Ensuring this is complicated by the way we want to only ever hold the 22 * locks on 2 nodes concurrently, and only change nodes in a top to bottom 23 * fashion. 24 * 25 * Each node may have a left or right sibling. When decending the spine, 26 * if a node contains only MIN_ENTRIES then we try and increase this to at 27 * least MIN_ENTRIES + 1. We do this in the following ways: 28 * 29 * [A] No siblings => this can only happen if the node is the root, in which 30 * case we copy the childs contents over the root. 31 * 32 * [B] No left sibling 33 * ==> rebalance(node, right sibling) 34 * 35 * [C] No right sibling 36 * ==> rebalance(left sibling, node) 37 * 38 * [D] Both siblings, total_entries(left, node, right) <= DEL_THRESHOLD 39 * ==> delete node adding it's contents to left and right 40 * 41 * [E] Both siblings, total_entries(left, node, right) > DEL_THRESHOLD 42 * ==> rebalance(left, node, right) 43 * 44 * After these operations it's possible that the our original node no 45 * longer contains the desired sub tree. For this reason this rebalancing 46 * is performed on the children of the current node. This also avoids 47 * having a special case for the root. 48 * 49 * Once this rebalancing has occurred we can then step into the child node 50 * for internal nodes. Or delete the entry for leaf nodes. 51 */ 52 53 /* 54 * Some little utilities for moving node data around. 55 */ 56 static void node_shift(struct node *n, int shift) 57 { 58 uint32_t nr_entries = le32_to_cpu(n->header.nr_entries); 59 uint32_t value_size = le32_to_cpu(n->header.value_size); 60 61 if (shift < 0) { 62 shift = -shift; 63 BUG_ON(shift > nr_entries); 64 BUG_ON((void *) key_ptr(n, shift) >= value_ptr(n, shift, value_size)); 65 memmove(key_ptr(n, 0), 66 key_ptr(n, shift), 67 (nr_entries - shift) * sizeof(__le64)); 68 memmove(value_ptr(n, 0, value_size), 69 value_ptr(n, shift, value_size), 70 (nr_entries - shift) * value_size); 71 } else { 72 BUG_ON(nr_entries + shift > le32_to_cpu(n->header.max_entries)); 73 memmove(key_ptr(n, shift), 74 key_ptr(n, 0), 75 nr_entries * sizeof(__le64)); 76 memmove(value_ptr(n, shift, value_size), 77 value_ptr(n, 0, value_size), 78 nr_entries * value_size); 79 } 80 } 81 82 static void node_copy(struct node *left, struct node *right, int shift) 83 { 84 uint32_t nr_left = le32_to_cpu(left->header.nr_entries); 85 uint32_t value_size = le32_to_cpu(left->header.value_size); 86 BUG_ON(value_size != le32_to_cpu(right->header.value_size)); 87 88 if (shift < 0) { 89 shift = -shift; 90 BUG_ON(nr_left + shift > le32_to_cpu(left->header.max_entries)); 91 memcpy(key_ptr(left, nr_left), 92 key_ptr(right, 0), 93 shift * sizeof(__le64)); 94 memcpy(value_ptr(left, nr_left, value_size), 95 value_ptr(right, 0, value_size), 96 shift * value_size); 97 } else { 98 BUG_ON(shift > le32_to_cpu(right->header.max_entries)); 99 memcpy(key_ptr(right, 0), 100 key_ptr(left, nr_left - shift), 101 shift * sizeof(__le64)); 102 memcpy(value_ptr(right, 0, value_size), 103 value_ptr(left, nr_left - shift, value_size), 104 shift * value_size); 105 } 106 } 107 108 /* 109 * Delete a specific entry from a leaf node. 110 */ 111 static void delete_at(struct node *n, unsigned index) 112 { 113 unsigned nr_entries = le32_to_cpu(n->header.nr_entries); 114 unsigned nr_to_copy = nr_entries - (index + 1); 115 uint32_t value_size = le32_to_cpu(n->header.value_size); 116 BUG_ON(index >= nr_entries); 117 118 if (nr_to_copy) { 119 memmove(key_ptr(n, index), 120 key_ptr(n, index + 1), 121 nr_to_copy * sizeof(__le64)); 122 123 memmove(value_ptr(n, index, value_size), 124 value_ptr(n, index + 1, value_size), 125 nr_to_copy * value_size); 126 } 127 128 n->header.nr_entries = cpu_to_le32(nr_entries - 1); 129 } 130 131 static unsigned del_threshold(struct node *n) 132 { 133 return le32_to_cpu(n->header.max_entries) / 3; 134 } 135 136 static unsigned merge_threshold(struct node *n) 137 { 138 /* 139 * The extra one is because we know we're potentially going to 140 * delete an entry. 141 */ 142 return 2 * (le32_to_cpu(n->header.max_entries) / 3) + 1; 143 } 144 145 struct child { 146 unsigned index; 147 struct dm_block *block; 148 struct node *n; 149 }; 150 151 static struct dm_btree_value_type le64_type = { 152 .context = NULL, 153 .size = sizeof(__le64), 154 .inc = NULL, 155 .dec = NULL, 156 .equal = NULL 157 }; 158 159 static int init_child(struct dm_btree_info *info, struct node *parent, 160 unsigned index, struct child *result) 161 { 162 int r, inc; 163 dm_block_t root; 164 165 result->index = index; 166 root = value64(parent, index); 167 168 r = dm_tm_shadow_block(info->tm, root, &btree_node_validator, 169 &result->block, &inc); 170 if (r) 171 return r; 172 173 result->n = dm_block_data(result->block); 174 175 if (inc) 176 inc_children(info->tm, result->n, &le64_type); 177 178 *((__le64 *) value_ptr(parent, index, sizeof(__le64))) = 179 cpu_to_le64(dm_block_location(result->block)); 180 181 return 0; 182 } 183 184 static int exit_child(struct dm_btree_info *info, struct child *c) 185 { 186 return dm_tm_unlock(info->tm, c->block); 187 } 188 189 static void shift(struct node *left, struct node *right, int count) 190 { 191 if (!count) 192 return; 193 194 if (count > 0) { 195 node_shift(right, count); 196 node_copy(left, right, count); 197 } else { 198 node_copy(left, right, count); 199 node_shift(right, count); 200 } 201 202 left->header.nr_entries = 203 cpu_to_le32(le32_to_cpu(left->header.nr_entries) - count); 204 BUG_ON(le32_to_cpu(left->header.nr_entries) > le32_to_cpu(left->header.max_entries)); 205 206 right->header.nr_entries = 207 cpu_to_le32(le32_to_cpu(right->header.nr_entries) + count); 208 BUG_ON(le32_to_cpu(right->header.nr_entries) > le32_to_cpu(right->header.max_entries)); 209 } 210 211 static void __rebalance2(struct dm_btree_info *info, struct node *parent, 212 struct child *l, struct child *r) 213 { 214 struct node *left = l->n; 215 struct node *right = r->n; 216 uint32_t nr_left = le32_to_cpu(left->header.nr_entries); 217 uint32_t nr_right = le32_to_cpu(right->header.nr_entries); 218 219 if (nr_left + nr_right <= merge_threshold(left)) { 220 /* 221 * Merge 222 */ 223 node_copy(left, right, -nr_right); 224 left->header.nr_entries = cpu_to_le32(nr_left + nr_right); 225 delete_at(parent, r->index); 226 227 /* 228 * We need to decrement the right block, but not it's 229 * children, since they're still referenced by left. 230 */ 231 dm_tm_dec(info->tm, dm_block_location(r->block)); 232 } else { 233 /* 234 * Rebalance. 235 */ 236 unsigned target_left = (nr_left + nr_right) / 2; 237 unsigned shift_ = nr_left - target_left; 238 BUG_ON(le32_to_cpu(left->header.max_entries) <= nr_left - shift_); 239 BUG_ON(le32_to_cpu(right->header.max_entries) <= nr_right + shift_); 240 shift(left, right, nr_left - target_left); 241 *key_ptr(parent, r->index) = right->keys[0]; 242 } 243 } 244 245 static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info, 246 unsigned left_index) 247 { 248 int r; 249 struct node *parent; 250 struct child left, right; 251 252 parent = dm_block_data(shadow_current(s)); 253 254 r = init_child(info, parent, left_index, &left); 255 if (r) 256 return r; 257 258 r = init_child(info, parent, left_index + 1, &right); 259 if (r) { 260 exit_child(info, &left); 261 return r; 262 } 263 264 __rebalance2(info, parent, &left, &right); 265 266 r = exit_child(info, &left); 267 if (r) { 268 exit_child(info, &right); 269 return r; 270 } 271 272 return exit_child(info, &right); 273 } 274 275 static void __rebalance3(struct dm_btree_info *info, struct node *parent, 276 struct child *l, struct child *c, struct child *r) 277 { 278 struct node *left = l->n; 279 struct node *center = c->n; 280 struct node *right = r->n; 281 282 uint32_t nr_left = le32_to_cpu(left->header.nr_entries); 283 uint32_t nr_center = le32_to_cpu(center->header.nr_entries); 284 uint32_t nr_right = le32_to_cpu(right->header.nr_entries); 285 uint32_t max_entries = le32_to_cpu(left->header.max_entries); 286 287 unsigned target; 288 289 BUG_ON(left->header.max_entries != center->header.max_entries); 290 BUG_ON(center->header.max_entries != right->header.max_entries); 291 292 if (((nr_left + nr_center + nr_right) / 2) < merge_threshold(center)) { 293 /* 294 * Delete center node: 295 * 296 * We dump as many entries from center as possible into 297 * left, then the rest in right, then rebalance2. This 298 * wastes some cpu, but I want something simple atm. 299 */ 300 unsigned shift = min(max_entries - nr_left, nr_center); 301 302 BUG_ON(nr_left + shift > max_entries); 303 node_copy(left, center, -shift); 304 left->header.nr_entries = cpu_to_le32(nr_left + shift); 305 306 if (shift != nr_center) { 307 shift = nr_center - shift; 308 BUG_ON((nr_right + shift) >= max_entries); 309 node_shift(right, shift); 310 node_copy(center, right, shift); 311 right->header.nr_entries = cpu_to_le32(nr_right + shift); 312 } 313 *key_ptr(parent, r->index) = right->keys[0]; 314 315 delete_at(parent, c->index); 316 r->index--; 317 318 dm_tm_dec(info->tm, dm_block_location(c->block)); 319 __rebalance2(info, parent, l, r); 320 321 return; 322 } 323 324 /* 325 * Rebalance 326 */ 327 target = (nr_left + nr_center + nr_right) / 3; 328 BUG_ON(target > max_entries); 329 330 /* 331 * Adjust the left node 332 */ 333 shift(left, center, nr_left - target); 334 335 /* 336 * Adjust the right node 337 */ 338 shift(center, right, target - nr_right); 339 *key_ptr(parent, c->index) = center->keys[0]; 340 *key_ptr(parent, r->index) = right->keys[0]; 341 } 342 343 static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info, 344 unsigned left_index) 345 { 346 int r; 347 struct node *parent = dm_block_data(shadow_current(s)); 348 struct child left, center, right; 349 350 /* 351 * FIXME: fill out an array? 352 */ 353 r = init_child(info, parent, left_index, &left); 354 if (r) 355 return r; 356 357 r = init_child(info, parent, left_index + 1, ¢er); 358 if (r) { 359 exit_child(info, &left); 360 return r; 361 } 362 363 r = init_child(info, parent, left_index + 2, &right); 364 if (r) { 365 exit_child(info, &left); 366 exit_child(info, ¢er); 367 return r; 368 } 369 370 __rebalance3(info, parent, &left, ¢er, &right); 371 372 r = exit_child(info, &left); 373 if (r) { 374 exit_child(info, ¢er); 375 exit_child(info, &right); 376 return r; 377 } 378 379 r = exit_child(info, ¢er); 380 if (r) { 381 exit_child(info, &right); 382 return r; 383 } 384 385 r = exit_child(info, &right); 386 if (r) 387 return r; 388 389 return 0; 390 } 391 392 static int get_nr_entries(struct dm_transaction_manager *tm, 393 dm_block_t b, uint32_t *result) 394 { 395 int r; 396 struct dm_block *block; 397 struct node *n; 398 399 r = dm_tm_read_lock(tm, b, &btree_node_validator, &block); 400 if (r) 401 return r; 402 403 n = dm_block_data(block); 404 *result = le32_to_cpu(n->header.nr_entries); 405 406 return dm_tm_unlock(tm, block); 407 } 408 409 static int rebalance_children(struct shadow_spine *s, 410 struct dm_btree_info *info, uint64_t key) 411 { 412 int i, r, has_left_sibling, has_right_sibling; 413 uint32_t child_entries; 414 struct node *n; 415 416 n = dm_block_data(shadow_current(s)); 417 418 if (le32_to_cpu(n->header.nr_entries) == 1) { 419 struct dm_block *child; 420 dm_block_t b = value64(n, 0); 421 422 r = dm_tm_read_lock(info->tm, b, &btree_node_validator, &child); 423 if (r) 424 return r; 425 426 memcpy(n, dm_block_data(child), 427 dm_bm_block_size(dm_tm_get_bm(info->tm))); 428 r = dm_tm_unlock(info->tm, child); 429 if (r) 430 return r; 431 432 dm_tm_dec(info->tm, dm_block_location(child)); 433 return 0; 434 } 435 436 i = lower_bound(n, key); 437 if (i < 0) 438 return -ENODATA; 439 440 r = get_nr_entries(info->tm, value64(n, i), &child_entries); 441 if (r) 442 return r; 443 444 if (child_entries > del_threshold(n)) 445 return 0; 446 447 has_left_sibling = i > 0; 448 has_right_sibling = i < (le32_to_cpu(n->header.nr_entries) - 1); 449 450 if (!has_left_sibling) 451 r = rebalance2(s, info, i); 452 453 else if (!has_right_sibling) 454 r = rebalance2(s, info, i - 1); 455 456 else 457 r = rebalance3(s, info, i - 1); 458 459 return r; 460 } 461 462 static int do_leaf(struct node *n, uint64_t key, unsigned *index) 463 { 464 int i = lower_bound(n, key); 465 466 if ((i < 0) || 467 (i >= le32_to_cpu(n->header.nr_entries)) || 468 (le64_to_cpu(n->keys[i]) != key)) 469 return -ENODATA; 470 471 *index = i; 472 473 return 0; 474 } 475 476 /* 477 * Prepares for removal from one level of the hierarchy. The caller must 478 * call delete_at() to remove the entry at index. 479 */ 480 static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info, 481 struct dm_btree_value_type *vt, dm_block_t root, 482 uint64_t key, unsigned *index) 483 { 484 int i = *index, r; 485 struct node *n; 486 487 for (;;) { 488 r = shadow_step(s, root, vt); 489 if (r < 0) 490 break; 491 492 /* 493 * We have to patch up the parent node, ugly, but I don't 494 * see a way to do this automatically as part of the spine 495 * op. 496 */ 497 if (shadow_has_parent(s)) { 498 __le64 location = cpu_to_le64(dm_block_location(shadow_current(s))); 499 memcpy(value_ptr(dm_block_data(shadow_parent(s)), i, sizeof(__le64)), 500 &location, sizeof(__le64)); 501 } 502 503 n = dm_block_data(shadow_current(s)); 504 505 if (le32_to_cpu(n->header.flags) & LEAF_NODE) 506 return do_leaf(n, key, index); 507 508 r = rebalance_children(s, info, key); 509 if (r) 510 break; 511 512 n = dm_block_data(shadow_current(s)); 513 if (le32_to_cpu(n->header.flags) & LEAF_NODE) 514 return do_leaf(n, key, index); 515 516 i = lower_bound(n, key); 517 518 /* 519 * We know the key is present, or else 520 * rebalance_children would have returned 521 * -ENODATA 522 */ 523 root = value64(n, i); 524 } 525 526 return r; 527 } 528 529 int dm_btree_remove(struct dm_btree_info *info, dm_block_t root, 530 uint64_t *keys, dm_block_t *new_root) 531 { 532 unsigned level, last_level = info->levels - 1; 533 int index = 0, r = 0; 534 struct shadow_spine spine; 535 struct node *n; 536 537 init_shadow_spine(&spine, info); 538 for (level = 0; level < info->levels; level++) { 539 r = remove_raw(&spine, info, 540 (level == last_level ? 541 &info->value_type : &le64_type), 542 root, keys[level], (unsigned *)&index); 543 if (r < 0) 544 break; 545 546 n = dm_block_data(shadow_current(&spine)); 547 if (level != last_level) { 548 root = value64(n, index); 549 continue; 550 } 551 552 BUG_ON(index < 0 || index >= le32_to_cpu(n->header.nr_entries)); 553 554 if (info->value_type.dec) 555 info->value_type.dec(info->value_type.context, 556 value_ptr(n, index, info->value_type.size)); 557 558 delete_at(n, index); 559 } 560 561 *new_root = shadow_root(&spine); 562 exit_shadow_spine(&spine); 563 564 return r; 565 } 566 EXPORT_SYMBOL_GPL(dm_btree_remove); 567