// SPDX-License-Identifier: GPL-2.0

#include "messages.h"
#include "tree-mod-log.h"
#include "disk-io.h"
#include "fs.h"
#include "accessors.h"
#include "tree-checker.h"

struct tree_mod_root {
	u64 logical;
	u8 level;
};

struct tree_mod_elem {
	struct rb_node node;
	u64 logical;
	u64 seq;
	enum btrfs_mod_log_op op;

	/*
	 * This is used for BTRFS_MOD_LOG_KEY_* and BTRFS_MOD_LOG_MOVE_KEYS
	 * operations.
	 */
	int slot;

	/* This is used for BTRFS_MOD_LOG_KEY_* and BTRFS_MOD_LOG_ROOT_REPLACE. */
	u64 generation;

	union {
		/*
		 * This is used for the following op types:
		 *
		 * BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING
		 * BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING
		 * BTRFS_MOD_LOG_KEY_REMOVE
		 * BTRFS_MOD_LOG_KEY_REPLACE
		 */
		struct {
			struct btrfs_disk_key key;
			u64 blockptr;
		} slot_change;

		/* This is used for op == BTRFS_MOD_LOG_MOVE_KEYS. */
		struct {
			int dst_slot;
			int nr_items;
		} move;

		/* This is used for op == BTRFS_MOD_LOG_ROOT_REPLACE. */
		struct tree_mod_root old_root;
	};
};

/*
 * Pull a new tree mod seq number for our operation.
 */
static u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
{
	return atomic64_inc_return(&fs_info->tree_mod_seq);
}

/*
 * This adds a new blocker to the tree mod log's blocker list if the @elem
 * passed does not already have a sequence number set. So when a caller expects
 * to record tree modifications, it should set elem->seq to zero before calling
 * btrfs_get_tree_mod_seq.
 * Returns a fresh, unused tree mod log sequence number, even if no new blocker
 * was added.
 */
u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
			   struct btrfs_seq_list *elem)
{
	write_lock(&fs_info->tree_mod_log_lock);
	if (!elem->seq) {
		elem->seq = btrfs_inc_tree_mod_seq(fs_info);
		list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
		set_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags);
	}
	write_unlock(&fs_info->tree_mod_log_lock);

	return elem->seq;
}

void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
			    struct btrfs_seq_list *elem)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct rb_node *next;
	struct tree_mod_elem *tm;
	u64 min_seq = BTRFS_SEQ_LAST;
	u64 seq_putting = elem->seq;

	if (!seq_putting)
		return;

	write_lock(&fs_info->tree_mod_log_lock);
	list_del(&elem->list);
	elem->seq = 0;

	if (list_empty(&fs_info->tree_mod_seq_list)) {
		clear_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags);
	} else {
		struct btrfs_seq_list *first;

		first = list_first_entry(&fs_info->tree_mod_seq_list,
					 struct btrfs_seq_list, list);
		if (seq_putting > first->seq) {
			/*
			 * Blocker with lower sequence number exists, we cannot
			 * remove anything from the log.
			 */
			write_unlock(&fs_info->tree_mod_log_lock);
			return;
		}
		min_seq = first->seq;
	}

	/*
	 * Anything that's lower than the lowest existing (read: blocked)
	 * sequence number can be removed from the tree.
	 */
	tm_root = &fs_info->tree_mod_log;
	for (node = rb_first(tm_root); node; node = next) {
		next = rb_next(node);
		tm = rb_entry(node, struct tree_mod_elem, node);
		if (tm->seq >= min_seq)
			continue;
		rb_erase(node, tm_root);
		kfree(tm);
	}
	write_unlock(&fs_info->tree_mod_log_lock);
}
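
/*
 * Typical usage of the pair above (an illustrative sketch, not code from this
 * file): a task that needs a stable view of the trees, e.g. for walking
 * backrefs, brackets its work with a get/put pair so that no log element it
 * may still need gets removed underneath it:
 *
 *	struct btrfs_seq_list elem = BTRFS_SEQ_LIST_INIT(elem);
 *
 *	btrfs_get_tree_mod_seq(fs_info, &elem);
 *	(... search old roots / rewind buffers using elem.seq ...)
 *	btrfs_put_tree_mod_seq(fs_info, &elem);
 */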

/*
 * Key order of the log:
 *       node/leaf start address -> sequence
 *
 * The 'start address' is the logical address of the *new* root node for root
 * replace operations, or the logical address of the affected block for all
 * other operations.
 */
static noinline int tree_mod_log_insert(struct btrfs_fs_info *fs_info,
					struct tree_mod_elem *tm)
{
	struct rb_root *tm_root;
	struct rb_node **new;
	struct rb_node *parent = NULL;
	struct tree_mod_elem *cur;

	lockdep_assert_held_write(&fs_info->tree_mod_log_lock);

	tm->seq = btrfs_inc_tree_mod_seq(fs_info);

	tm_root = &fs_info->tree_mod_log;
	new = &tm_root->rb_node;
	while (*new) {
		cur = rb_entry(*new, struct tree_mod_elem, node);
		parent = *new;
		if (cur->logical < tm->logical)
			new = &((*new)->rb_left);
		else if (cur->logical > tm->logical)
			new = &((*new)->rb_right);
		else if (cur->seq < tm->seq)
			new = &((*new)->rb_left);
		else if (cur->seq > tm->seq)
			new = &((*new)->rb_right);
		else
			return -EEXIST;
	}

	rb_link_node(&tm->node, parent, new);
	rb_insert_color(&tm->node, tm_root);
	return 0;
}

static inline bool skip_eb_logging(const struct extent_buffer *eb)
{
	const u64 owner = btrfs_header_owner(eb);

	if (btrfs_header_level(eb) == 0)
		return true;

	/*
	 * Tree mod logging exists so that there's a consistent view of the
	 * extents and backrefs of inodes even if, while a task is iterating
	 * over them, other tasks are modifying subvolume trees and the extent
	 * tree (including running delayed refs). So we only need to log extent
	 * buffers from the extent tree and subvolume trees.
	 */

	if (owner == BTRFS_EXTENT_TREE_OBJECTID)
		return false;

	if (btrfs_is_fstree(owner))
		return false;

	return true;
}

/*
 * Determines if logging can be omitted. Returns true if it can. Otherwise, it
 * returns false with the tree_mod_log_lock acquired. The caller must hold the
 * lock until all tree mod log insertions are recorded in the rb tree and then
 * write unlock fs_info::tree_mod_log_lock.
 */
static bool tree_mod_dont_log(struct btrfs_fs_info *fs_info, const struct extent_buffer *eb)
{
	if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
		return true;
	if (eb && skip_eb_logging(eb))
		return true;

	write_lock(&fs_info->tree_mod_log_lock);
	if (list_empty(&fs_info->tree_mod_seq_list)) {
		write_unlock(&fs_info->tree_mod_log_lock);
		return true;
	}

	return false;
}

/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
static bool tree_mod_need_log(const struct btrfs_fs_info *fs_info,
			      const struct extent_buffer *eb)
{
	if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
		return false;
	if (eb && skip_eb_logging(eb))
		return false;

	return true;
}
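
/*
 * Note on the structure of the insertion helpers below: they first check
 * tree_mod_need_log() without taking any lock and allocate all needed
 * tree_mod_elem structures up front, and only then call tree_mod_dont_log(),
 * which takes tree_mod_log_lock when logging is required. That way memory is
 * never allocated while holding the lock, and an allocation failure is only
 * fatal if logging turns out to be required after all.
 */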

static struct tree_mod_elem *alloc_tree_mod_elem(const struct extent_buffer *eb,
						 int slot,
						 enum btrfs_mod_log_op op)
{
	struct tree_mod_elem *tm;

	/* Can't be one of these types, due to union in struct tree_mod_elem. */
	ASSERT(op != BTRFS_MOD_LOG_MOVE_KEYS);
	ASSERT(op != BTRFS_MOD_LOG_ROOT_REPLACE);

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm)
		return NULL;

	tm->logical = eb->start;
	btrfs_node_key(eb, &tm->slot_change.key, slot);
	tm->slot_change.blockptr = btrfs_node_blockptr(eb, slot);
	tm->op = op;
	tm->slot = slot;
	tm->generation = btrfs_node_ptr_generation(eb, slot);
	RB_CLEAR_NODE(&tm->node);

	return tm;
}

int btrfs_tree_mod_log_insert_key(const struct extent_buffer *eb, int slot,
				  enum btrfs_mod_log_op op)
{
	struct tree_mod_elem *tm;
	int ret = 0;

	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	tm = alloc_tree_mod_elem(eb, slot, op);
	if (!tm)
		ret = -ENOMEM;

	if (tree_mod_dont_log(eb->fs_info, eb)) {
		kfree(tm);
		/*
		 * Don't error if we failed to allocate memory because we don't
		 * need to log.
		 */
		return 0;
	} else if (ret != 0) {
		/*
		 * We previously failed to allocate memory and we need to log,
		 * so we have to fail.
		 */
		goto out_unlock;
	}

	ret = tree_mod_log_insert(eb->fs_info, tm);
out_unlock:
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	if (ret)
		kfree(tm);

	return ret;
}

static struct tree_mod_elem *tree_mod_log_alloc_move(const struct extent_buffer *eb,
						     int dst_slot, int src_slot,
						     int nr_items)
{
	struct tree_mod_elem *tm;

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm)
		return ERR_PTR(-ENOMEM);

	tm->logical = eb->start;
	tm->slot = src_slot;
	tm->move.dst_slot = dst_slot;
	tm->move.nr_items = nr_items;
	tm->op = BTRFS_MOD_LOG_MOVE_KEYS;
	RB_CLEAR_NODE(&tm->node);

	return tm;
}

int btrfs_tree_mod_log_insert_move(const struct extent_buffer *eb,
				   int dst_slot, int src_slot,
				   int nr_items)
{
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int ret = 0;
	int i;
	bool locked = false;

	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list) {
		ret = -ENOMEM;
		goto lock;
	}

	tm = tree_mod_log_alloc_move(eb, dst_slot, src_slot, nr_items);
	if (IS_ERR(tm)) {
		ret = PTR_ERR(tm);
		tm = NULL;
		goto lock;
	}

	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
				BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto lock;
		}
	}

lock:
	if (tree_mod_dont_log(eb->fs_info, eb)) {
		/*
		 * Don't error if we failed to allocate memory because we don't
		 * need to log.
		 */
		ret = 0;
		goto free_tms;
	}
	locked = true;

	/*
	 * We previously failed to allocate memory and we need to log, so we
	 * have to fail.
	 */
	if (ret != 0)
		goto free_tms;

	/*
	 * When we overwrite something during the move, we log these removals.
	 * This can only happen when we move towards the beginning of the
	 * buffer, i.e. dst_slot < src_slot.
	 */
	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		ret = tree_mod_log_insert(eb->fs_info, tm_list[i]);
		if (ret)
			goto free_tms;
	}

	ret = tree_mod_log_insert(eb->fs_info, tm);
	if (ret)
		goto free_tms;
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return 0;

free_tms:
	if (tm_list) {
		for (i = 0; i < nr_items; i++) {
			if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
				rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log);
			kfree(tm_list[i]);
		}
	}
	if (locked)
		write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(tm_list);
	kfree(tm);

	return ret;
}

static int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
				struct tree_mod_elem **tm_list,
				int nritems)
{
	int i, j;
	int ret;

	for (i = nritems - 1; i >= 0; i--) {
		ret = tree_mod_log_insert(fs_info, tm_list[i]);
		if (ret) {
			for (j = nritems - 1; j > i; j--)
				rb_erase(&tm_list[j]->node,
					 &fs_info->tree_mod_log);
			return ret;
		}
	}

	return 0;
}
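
/*
 * Log the replacement of a tree's root node. This records a
 * BTRFS_MOD_LOG_ROOT_REPLACE element keyed on the logical address of
 * @new_root and, if @log_removal is set and the old root is not a leaf, a
 * BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING element for every slot of @old_root,
 * so that the old root's content can be rewound later.
 */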

int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root,
				   struct extent_buffer *new_root,
				   bool log_removal)
{
	struct btrfs_fs_info *fs_info = old_root->fs_info;
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int ret = 0;
	int i;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	if (log_removal && btrfs_header_level(old_root) > 0) {
		nritems = btrfs_header_nritems(old_root);
		tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
				  GFP_NOFS);
		if (!tm_list) {
			ret = -ENOMEM;
			goto lock;
		}
		for (i = 0; i < nritems; i++) {
			tm_list[i] = alloc_tree_mod_elem(old_root, i,
					BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING);
			if (!tm_list[i]) {
				ret = -ENOMEM;
				goto lock;
			}
		}
	}

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm) {
		ret = -ENOMEM;
		goto lock;
	}

	tm->logical = new_root->start;
	tm->old_root.logical = old_root->start;
	tm->old_root.level = btrfs_header_level(old_root);
	tm->generation = btrfs_header_generation(old_root);
	tm->op = BTRFS_MOD_LOG_ROOT_REPLACE;

lock:
	if (tree_mod_dont_log(fs_info, NULL)) {
		/*
		 * Don't error if we failed to allocate memory because we don't
		 * need to log.
		 */
		ret = 0;
		goto free_tms;
	} else if (ret != 0) {
		/*
		 * We previously failed to allocate memory and we need to log,
		 * so we have to fail.
		 */
		goto out_unlock;
	}

	if (tm_list)
		ret = tree_mod_log_free_eb(fs_info, tm_list, nritems);
	if (!ret)
		ret = tree_mod_log_insert(fs_info, tm);

out_unlock:
	write_unlock(&fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return ret;

free_tms:
	if (tm_list) {
		for (i = 0; i < nritems; i++)
			kfree(tm_list[i]);
		kfree(tm_list);
	}
	kfree(tm);

	return ret;
}

static struct tree_mod_elem *__tree_mod_log_search(struct btrfs_fs_info *fs_info,
						   u64 start, u64 min_seq,
						   bool smallest)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct tree_mod_elem *cur = NULL;
	struct tree_mod_elem *found = NULL;

	read_lock(&fs_info->tree_mod_log_lock);
	tm_root = &fs_info->tree_mod_log;
	node = tm_root->rb_node;
	while (node) {
		cur = rb_entry(node, struct tree_mod_elem, node);
		if (cur->logical < start) {
			node = node->rb_left;
		} else if (cur->logical > start) {
			node = node->rb_right;
		} else if (cur->seq < min_seq) {
			node = node->rb_left;
		} else if (!smallest) {
			/* We want the node with the highest seq */
			if (found)
				BUG_ON(found->seq > cur->seq);
			found = cur;
			node = node->rb_left;
		} else if (cur->seq > min_seq) {
			/* We want the node with the smallest seq */
			if (found)
				BUG_ON(found->seq < cur->seq);
			found = cur;
			node = node->rb_right;
		} else {
			found = cur;
			break;
		}
	}
	read_unlock(&fs_info->tree_mod_log_lock);

	return found;
}

/*
 * This returns the element from the log with the smallest time sequence
 * value that's in the log (the oldest log item). Any element with a time
 * sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info,
							u64 start, u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, true);
}

/*
 * This returns the element from the log with the largest time sequence
 * value that's in the log (the most recent log item). Any element with
 * a time sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *tree_mod_log_search(struct btrfs_fs_info *fs_info,
						 u64 start, u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, false);
}
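
/*
 * Log the effects of copying @nr_items key pointers from @src (starting at
 * @src_offset) into @dst (starting at @dst_offset): a KEY_REMOVE element for
 * every copied slot of @src, a KEY_ADD element for every filled slot of @dst,
 * plus MOVE_KEYS elements for the items shifted within each buffer to make
 * room for, or to close the gap left by, the copied items.
 */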

int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst,
			       const struct extent_buffer *src,
			       unsigned long dst_offset,
			       unsigned long src_offset,
			       int nr_items)
{
	struct btrfs_fs_info *fs_info = dst->fs_info;
	int ret = 0;
	struct tree_mod_elem **tm_list = NULL;
	struct tree_mod_elem **tm_list_add = NULL;
	struct tree_mod_elem **tm_list_rem = NULL;
	int i;
	bool locked = false;
	struct tree_mod_elem *dst_move_tm = NULL;
	struct tree_mod_elem *src_move_tm = NULL;
	u32 dst_move_nr_items = btrfs_header_nritems(dst) - dst_offset;
	u32 src_move_nr_items = btrfs_header_nritems(src) - (src_offset + nr_items);

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
		return 0;

	tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
			  GFP_NOFS);
	if (!tm_list) {
		ret = -ENOMEM;
		goto lock;
	}

	if (dst_move_nr_items) {
		dst_move_tm = tree_mod_log_alloc_move(dst, dst_offset + nr_items,
						      dst_offset, dst_move_nr_items);
		if (IS_ERR(dst_move_tm)) {
			ret = PTR_ERR(dst_move_tm);
			dst_move_tm = NULL;
			goto lock;
		}
	}
	if (src_move_nr_items) {
		src_move_tm = tree_mod_log_alloc_move(src, src_offset,
						      src_offset + nr_items,
						      src_move_nr_items);
		if (IS_ERR(src_move_tm)) {
			ret = PTR_ERR(src_move_tm);
			src_move_tm = NULL;
			goto lock;
		}
	}

	tm_list_add = tm_list;
	tm_list_rem = tm_list + nr_items;
	for (i = 0; i < nr_items; i++) {
		tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
						     BTRFS_MOD_LOG_KEY_REMOVE);
		if (!tm_list_rem[i]) {
			ret = -ENOMEM;
			goto lock;
		}

		tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
						     BTRFS_MOD_LOG_KEY_ADD);
		if (!tm_list_add[i]) {
			ret = -ENOMEM;
			goto lock;
		}
	}

lock:
	if (tree_mod_dont_log(fs_info, NULL)) {
		/*
		 * Don't error if we failed to allocate memory because we don't
		 * need to log.
		 */
		ret = 0;
		goto free_tms;
	}
	locked = true;

	/*
	 * We previously failed to allocate memory and we need to log, so we
	 * have to fail.
	 */
	if (ret != 0)
		goto free_tms;

	if (dst_move_tm) {
		ret = tree_mod_log_insert(fs_info, dst_move_tm);
		if (ret)
			goto free_tms;
	}
	for (i = 0; i < nr_items; i++) {
		ret = tree_mod_log_insert(fs_info, tm_list_rem[i]);
		if (ret)
			goto free_tms;
		ret = tree_mod_log_insert(fs_info, tm_list_add[i]);
		if (ret)
			goto free_tms;
	}
	if (src_move_tm) {
		ret = tree_mod_log_insert(fs_info, src_move_tm);
		if (ret)
			goto free_tms;
	}

	write_unlock(&fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return 0;

free_tms:
	if (dst_move_tm && !RB_EMPTY_NODE(&dst_move_tm->node))
		rb_erase(&dst_move_tm->node, &fs_info->tree_mod_log);
	kfree(dst_move_tm);
	if (src_move_tm && !RB_EMPTY_NODE(&src_move_tm->node))
		rb_erase(&src_move_tm->node, &fs_info->tree_mod_log);
	kfree(src_move_tm);
	if (tm_list) {
		for (i = 0; i < nr_items * 2; i++) {
			if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
				rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
			kfree(tm_list[i]);
		}
	}
	if (locked)
		write_unlock(&fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return ret;
}
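
/*
 * Log the removal of all key pointers of @eb (as
 * BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING operations), so that the node's
 * content can still be rewound after the buffer itself is freed or reused.
 */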

int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb)
{
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int i;
	int ret = 0;

	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	nritems = btrfs_header_nritems(eb);
	tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list) {
		ret = -ENOMEM;
		goto lock;
	}

	for (i = 0; i < nritems; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i,
				BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto lock;
		}
	}

lock:
	if (tree_mod_dont_log(eb->fs_info, eb)) {
		/*
		 * Don't error if we failed to allocate memory because we don't
		 * need to log.
		 */
		ret = 0;
		goto free_tms;
	} else if (ret != 0) {
		/*
		 * We previously failed to allocate memory and we need to log,
		 * so we have to fail.
		 */
		goto out_unlock;
	}

	ret = tree_mod_log_free_eb(eb->fs_info, tm_list, nritems);
out_unlock:
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return 0;

free_tms:
	if (tm_list) {
		for (i = 0; i < nritems; i++)
			kfree(tm_list[i]);
		kfree(tm_list);
	}

	return ret;
}

/*
 * Returns the logical address of the oldest predecessor of the given root.
 * Entries older than time_seq are ignored.
 */
static struct tree_mod_elem *tree_mod_log_oldest_root(struct extent_buffer *eb_root,
						      u64 time_seq)
{
	struct tree_mod_elem *tm;
	struct tree_mod_elem *found = NULL;
	u64 root_logical = eb_root->start;
	bool looped = false;

	if (!time_seq)
		return NULL;

	/*
	 * The very last operation that's logged for a root is the replacement
	 * operation (if it is replaced at all). This has the logical address
	 * of the *new* root, making it the very first operation that's logged
	 * for this root.
	 */
	while (1) {
		tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical,
						time_seq);
		if (!looped && !tm)
			return NULL;
		/*
		 * If there are no tree operations for the oldest root, we simply
		 * return it. This should only happen if that (old) root is at
		 * level 0.
		 */
		if (!tm)
			break;

		/*
		 * If there's an operation that's not a root replacement, we
		 * found the oldest version of our root. Normally, we'll find a
		 * BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
		 */
		if (tm->op != BTRFS_MOD_LOG_ROOT_REPLACE)
			break;

		found = tm;
		root_logical = tm->old_root.logical;
		looped = true;
	}

	/* If there's no old root to return, return what we found instead */
	if (!found)
		found = tm;

	return found;
}

/*
 * tm is a pointer to the first operation to rewind within eb. Then, all
 * previous operations will be rewound (until we reach something older than
 * time_seq).
 */
static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
				struct extent_buffer *eb,
				u64 time_seq,
				struct tree_mod_elem *first_tm)
{
	u32 n;
	struct rb_node *next;
	struct tree_mod_elem *tm = first_tm;
	unsigned long o_dst;
	unsigned long o_src;
	unsigned long p_size = sizeof(struct btrfs_key_ptr);
	/*
	 * max_slot tracks the maximum valid slot of the rewind eb at every
	 * step of the rewind. This is in contrast with 'n' which eventually
	 * matches the number of items, but can be wrong during moves or if
	 * removes overlap on already valid slots (which is probably separately
	 * a bug). We do this to validate the offsets of memmoves for rewinding
	 * moves and detect invalid memmoves.
	 *
	 * Since a rewind eb can start empty, max_slot is a signed integer with
	 * a special meaning for -1, which is that no slot is valid to move out
	 * of. Any other negative value is invalid.
	 */
	int max_slot;
	int move_src_end_slot;
	int move_dst_end_slot;

	n = btrfs_header_nritems(eb);
	max_slot = n - 1;
	read_lock(&fs_info->tree_mod_log_lock);
	while (tm && tm->seq >= time_seq) {
		ASSERT(max_slot >= -1);
		/*
		 * All the operations are recorded with the operator used for
		 * the modification. As we're going backwards, we do the
		 * opposite of each operation here.
		 */
		switch (tm->op) {
		case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING:
			BUG_ON(tm->slot < n);
			fallthrough;
		case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING:
		case BTRFS_MOD_LOG_KEY_REMOVE:
			btrfs_set_node_key(eb, &tm->slot_change.key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->slot_change.blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			n++;
			if (tm->slot > max_slot)
				max_slot = tm->slot;
			break;
		case BTRFS_MOD_LOG_KEY_REPLACE:
			BUG_ON(tm->slot >= n);
			btrfs_set_node_key(eb, &tm->slot_change.key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->slot_change.blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			break;
		case BTRFS_MOD_LOG_KEY_ADD:
			/*
			 * It is possible we could have already removed keys
			 * behind the known max slot, so this will be an
			 * overestimate. In practice, the copy operation
			 * inserts them in increasing order, and overestimating
			 * just means we miss some warnings, so it's OK. It
			 * isn't worth carefully tracking the full array of
			 * valid slots to check against when moving.
			 */
			if (tm->slot == max_slot)
				max_slot--;
			/* if a move operation is needed it's in the log */
			n--;
			break;
		case BTRFS_MOD_LOG_MOVE_KEYS:
			ASSERT(tm->move.nr_items > 0);
			move_src_end_slot = tm->move.dst_slot + tm->move.nr_items - 1;
			move_dst_end_slot = tm->slot + tm->move.nr_items - 1;
			o_dst = btrfs_node_key_ptr_offset(eb, tm->slot);
			o_src = btrfs_node_key_ptr_offset(eb, tm->move.dst_slot);
			if (WARN_ON(move_src_end_slot > max_slot ||
				    tm->move.nr_items <= 0)) {
				btrfs_warn(fs_info,
"move from invalid tree mod log slot eb %llu slot %d dst_slot %d nr_items %d seq %llu n %u max_slot %d",
					   eb->start, tm->slot,
					   tm->move.dst_slot, tm->move.nr_items,
					   tm->seq, n, max_slot);
			}
			memmove_extent_buffer(eb, o_dst, o_src,
					      tm->move.nr_items * p_size);
			max_slot = move_dst_end_slot;
			break;
		case BTRFS_MOD_LOG_ROOT_REPLACE:
			/*
			 * This operation is special. For roots, this must be
			 * handled explicitly before rewinding.
			 * For non-roots, this operation may exist if the node
			 * was a root: root A -> child B; then A becomes empty
			 * and B is promoted to the new root. In the mod log,
			 * we'll have a root-replace operation for B, a tree
			 * block that is not a root. We simply ignore that
			 * operation.
			 */
			break;
		}
		next = rb_next(&tm->node);
		if (!next)
			break;
		tm = rb_entry(next, struct tree_mod_elem, node);
		if (tm->logical != first_tm->logical)
			break;
	}
	read_unlock(&fs_info->tree_mod_log_lock);
	btrfs_set_header_nritems(eb, n);
}

/*
 * Called with eb read locked. If the buffer cannot be rewound, the same buffer
 * is returned. If rewind operations happen, a fresh buffer is returned. The
 * returned buffer is always read-locked. If the returned buffer is not the
 * input buffer, the lock on the input buffer is released and the input buffer
 * is freed (its refcount is decremented).
 */
struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
						struct extent_buffer *eb,
						u64 time_seq)
{
	struct extent_buffer *eb_rewin;
	struct tree_mod_elem *tm;

	if (!time_seq)
		return eb;

	if (btrfs_header_level(eb) == 0)
		return eb;

	tm = tree_mod_log_search(fs_info, eb->start, time_seq);
	if (!tm)
		return eb;

	if (tm->op == BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		BUG_ON(tm->slot != 0);
		eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
		if (!eb_rewin) {
			btrfs_tree_read_unlock(eb);
			free_extent_buffer(eb);
			return NULL;
		}
		btrfs_set_header_bytenr(eb_rewin, eb->start);
		btrfs_set_header_backref_rev(eb_rewin,
					     btrfs_header_backref_rev(eb));
		btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
		btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
	} else {
		eb_rewin = btrfs_clone_extent_buffer(eb);
		if (!eb_rewin) {
			btrfs_tree_read_unlock(eb);
			free_extent_buffer(eb);
			return NULL;
		}
	}

	btrfs_tree_read_unlock(eb);
	free_extent_buffer(eb);

	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb_rewin),
				       eb_rewin, btrfs_header_level(eb_rewin));
	btrfs_tree_read_lock(eb_rewin);
	tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
	WARN_ON(btrfs_header_nritems(eb_rewin) >
		BTRFS_NODEPTRS_PER_BLOCK(fs_info));

	return eb_rewin;
}
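
/*
 * The rewind helpers above and btrfs_get_old_root() below are what the logged
 * operations exist for: the time-travel search path (e.g.
 * btrfs_search_old_slot(), used during backref resolution) relies on them to
 * reconstruct the state a tree had at a given tree mod log sequence number.
 */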

/*
 * Rewind the state of @root's root node to the given @time_seq value.
 * If there are no changes, the current root->root_node is returned. If
 * anything changed in between, there's a fresh buffer allocated on which the
 * rewind operations are done. In any case, the returned buffer is read
 * locked. Returns NULL on error (with no locks held).
 */
struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct tree_mod_elem *tm;
	struct extent_buffer *eb = NULL;
	struct extent_buffer *eb_root;
	u64 eb_root_owner = 0;
	struct extent_buffer *old;
	struct tree_mod_root *old_root = NULL;
	u64 old_generation = 0;
	u64 logical;
	int level;

	eb_root = btrfs_read_lock_root_node(root);
	tm = tree_mod_log_oldest_root(eb_root, time_seq);
	if (!tm)
		return eb_root;

	if (tm->op == BTRFS_MOD_LOG_ROOT_REPLACE) {
		old_root = &tm->old_root;
		old_generation = tm->generation;
		logical = old_root->logical;
		level = old_root->level;
	} else {
		logical = eb_root->start;
		level = btrfs_header_level(eb_root);
	}

	tm = tree_mod_log_search(fs_info, logical, time_seq);
	if (old_root && tm && tm->op != BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		struct btrfs_tree_parent_check check = { 0 };

		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);

		check.level = level;
		check.owner_root = btrfs_root_id(root);

		old = read_tree_block(fs_info, logical, &check);
		if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
			if (!IS_ERR(old))
				free_extent_buffer(old);
			btrfs_warn(fs_info,
				   "failed to read tree block %llu from get_old_root",
				   logical);
		} else {
			struct tree_mod_elem *tm2;

			btrfs_tree_read_lock(old);
			eb = btrfs_clone_extent_buffer(old);
			/*
			 * After the lookup for the most recent tree mod operation
			 * above and before we locked and cloned the extent buffer
			 * 'old', a new tree mod log operation may have been added.
			 * So look up a more recent one to make sure the number
			 * of mod log operations we replay is consistent with the
			 * number of items we have in the cloned extent buffer,
			 * otherwise we can hit a BUG_ON when rewinding the extent
			 * buffer.
			 */
			tm2 = tree_mod_log_search(fs_info, logical, time_seq);
			btrfs_tree_read_unlock(old);
			free_extent_buffer(old);
			ASSERT(tm2);
			ASSERT(tm2 == tm || tm2->seq > tm->seq);
			if (!tm2 || tm2->seq < tm->seq) {
				free_extent_buffer(eb);
				return NULL;
			}
			tm = tm2;
		}
	} else if (old_root) {
		eb_root_owner = btrfs_header_owner(eb_root);
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
		eb = alloc_dummy_extent_buffer(fs_info, logical);
	} else {
		eb = btrfs_clone_extent_buffer(eb_root);
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
	}

	if (!eb)
		return NULL;
	if (old_root) {
		btrfs_set_header_bytenr(eb, eb->start);
		btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
		btrfs_set_header_owner(eb, eb_root_owner);
		btrfs_set_header_level(eb, old_root->level);
		btrfs_set_header_generation(eb, old_generation);
	}
	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb,
				       btrfs_header_level(eb));
	btrfs_tree_read_lock(eb);
	if (tm)
		tree_mod_log_rewind(fs_info, eb, time_seq, tm);
	else
		WARN_ON(btrfs_header_level(eb) != 0);
	WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info));

	return eb;
}

int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
{
	struct tree_mod_elem *tm;
	int level;
	struct extent_buffer *eb_root = btrfs_root_node(root);

	tm = tree_mod_log_oldest_root(eb_root, time_seq);
	if (tm && tm->op == BTRFS_MOD_LOG_ROOT_REPLACE)
		level = tm->old_root.level;
	else
		level = btrfs_header_level(eb_root);

	free_extent_buffer(eb_root);

	return level;
}

/*
 * Return the lowest sequence number in the tree modification log.
 *
 * Return the sequence number of the oldest tree modification log user, which
 * corresponds to the lowest sequence number of all existing users. If there
 * are no users it returns 0.
 */
u64 btrfs_tree_mod_log_lowest_seq(struct btrfs_fs_info *fs_info)
{
	u64 ret = 0;

	read_lock(&fs_info->tree_mod_log_lock);
	if (!list_empty(&fs_info->tree_mod_seq_list)) {
		struct btrfs_seq_list *elem;

		elem = list_first_entry(&fs_info->tree_mod_seq_list,
					struct btrfs_seq_list, list);
		ret = elem->seq;
	}
	read_unlock(&fs_info->tree_mod_log_lock);

	return ret;
}