// SPDX-License-Identifier: GPL-2.0

#include "messages.h"
#include "tree-mod-log.h"
#include "disk-io.h"
#include "fs.h"
#include "accessors.h"
#include "tree-checker.h"

struct tree_mod_root {
	u64 logical;
	u8 level;
};

struct tree_mod_elem {
	struct rb_node node;
	u64 logical;
	u64 seq;
	enum btrfs_mod_log_op op;

	/*
	 * This is used for BTRFS_MOD_LOG_KEY_* and BTRFS_MOD_LOG_MOVE_KEYS
	 * operations.
	 */
	int slot;

	/* This is used for BTRFS_MOD_LOG_KEY_* and BTRFS_MOD_LOG_ROOT_REPLACE. */
	u64 generation;

	/* Those are used for op == BTRFS_MOD_LOG_KEY_{REPLACE,REMOVE}. */
	struct btrfs_disk_key key;
	u64 blockptr;

	/* This is used for op == BTRFS_MOD_LOG_MOVE_KEYS. */
	struct {
		int dst_slot;
		int nr_items;
	} move;

	/* This is used for op == BTRFS_MOD_LOG_ROOT_REPLACE. */
	struct tree_mod_root old_root;
};

/*
 * Pull a new tree mod seq number for our operation.
 */
static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
{
	return atomic64_inc_return(&fs_info->tree_mod_seq);
}

/*
 * This adds a new blocker to the tree mod log's blocker list if the @elem
 * passed does not already have a sequence number set. So when a caller expects
 * to record tree modifications, it should ensure to set elem->seq to zero
 * before calling btrfs_get_tree_mod_seq.
 * Returns a fresh, unused tree log modification sequence number, even if no new
 * blocker was added.
 */
u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
			   struct btrfs_seq_list *elem)
{
	write_lock(&fs_info->tree_mod_log_lock);
	if (!elem->seq) {
		elem->seq = btrfs_inc_tree_mod_seq(fs_info);
		list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
		set_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags);
	}
	write_unlock(&fs_info->tree_mod_log_lock);

	return elem->seq;
}

void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
			    struct btrfs_seq_list *elem)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct rb_node *next;
	struct tree_mod_elem *tm;
	u64 min_seq = BTRFS_SEQ_LAST;
	u64 seq_putting = elem->seq;

	if (!seq_putting)
		return;

	write_lock(&fs_info->tree_mod_log_lock);
	list_del(&elem->list);
	elem->seq = 0;

	if (list_empty(&fs_info->tree_mod_seq_list)) {
		clear_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags);
	} else {
		struct btrfs_seq_list *first;

		first = list_first_entry(&fs_info->tree_mod_seq_list,
					 struct btrfs_seq_list, list);
		if (seq_putting > first->seq) {
			/*
			 * Blocker with lower sequence number exists, we cannot
			 * remove anything from the log.
			 */
			write_unlock(&fs_info->tree_mod_log_lock);
			return;
		}
		min_seq = first->seq;
	}

	/*
	 * Anything that's lower than the lowest existing (read: blocked)
	 * sequence number can be removed from the tree.
	 */
	tm_root = &fs_info->tree_mod_log;
	for (node = rb_first(tm_root); node; node = next) {
		next = rb_next(node);
		tm = rb_entry(node, struct tree_mod_elem, node);
		if (tm->seq >= min_seq)
			continue;
		rb_erase(node, tm_root);
		kfree(tm);
	}
	write_unlock(&fs_info->tree_mod_log_lock);
}
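
/*
 * Illustrative usage sketch for the pair of functions above (not lifted from
 * any specific caller): a reader that wants a stable view of past tree state
 * grabs a sequence number, does its time-travel reads and then drops the
 * blocker so older log entries can be reclaimed. Assumes the
 * BTRFS_SEQ_LIST_INIT() initializer declared alongside struct btrfs_seq_list.
 *
 *	struct btrfs_seq_list elem = BTRFS_SEQ_LIST_INIT(elem);
 *	u64 seq;
 *
 *	seq = btrfs_get_tree_mod_seq(fs_info, &elem);
 *	// ... replay history, e.g. via btrfs_get_old_root(root, seq) ...
 *	btrfs_put_tree_mod_seq(fs_info, &elem);
 */
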
/*
 * Key order of the log:
 *       node/leaf start address -> sequence
 *
 * The 'start address' is the logical address of the *new* root node for root
 * replace operations, or the logical address of the affected block for all
 * other operations.
 */
static noinline int tree_mod_log_insert(struct btrfs_fs_info *fs_info,
					struct tree_mod_elem *tm)
{
	struct rb_root *tm_root;
	struct rb_node **new;
	struct rb_node *parent = NULL;
	struct tree_mod_elem *cur;

	lockdep_assert_held_write(&fs_info->tree_mod_log_lock);

	tm->seq = btrfs_inc_tree_mod_seq(fs_info);

	tm_root = &fs_info->tree_mod_log;
	new = &tm_root->rb_node;
	while (*new) {
		cur = rb_entry(*new, struct tree_mod_elem, node);
		parent = *new;
		if (cur->logical < tm->logical)
			new = &((*new)->rb_left);
		else if (cur->logical > tm->logical)
			new = &((*new)->rb_right);
		else if (cur->seq < tm->seq)
			new = &((*new)->rb_left);
		else if (cur->seq > tm->seq)
			new = &((*new)->rb_right);
		else
			return -EEXIST;
	}

	rb_link_node(&tm->node, parent, new);
	rb_insert_color(&tm->node, tm_root);
	return 0;
}
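
/*
 * A small worked example of the ordering above, with hypothetical logical
 * addresses and sequence numbers. Entries that compare higher are linked to
 * the *left*, so an in-order walk yields descending (logical, seq) pairs and
 * rb_next() moves from newer to older records of the same block, which is the
 * direction tree_mod_log_rewind() below walks:
 *
 *	after inserting (logical 8192, seq 3), (8192, seq 1) and
 *	(4096, seq 2), rb_first() .. rb_last() visits:
 *
 *	(8192, 3) -> (8192, 1) -> (4096, 2)
 */
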
/*
 * Determines if logging can be omitted. Returns true if it can. Otherwise, it
 * returns false with the tree_mod_log_lock acquired. The caller must hold
 * this until all tree mod log insertions are recorded in the rb tree and then
 * write unlock fs_info::tree_mod_log_lock.
 */
static inline bool tree_mod_dont_log(struct btrfs_fs_info *fs_info,
				     struct extent_buffer *eb)
{
	if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
		return true;
	if (eb && btrfs_header_level(eb) == 0)
		return true;

	write_lock(&fs_info->tree_mod_log_lock);
	if (list_empty(&fs_info->tree_mod_seq_list)) {
		write_unlock(&fs_info->tree_mod_log_lock);
		return true;
	}

	return false;
}

/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
static inline bool tree_mod_need_log(const struct btrfs_fs_info *fs_info,
				     struct extent_buffer *eb)
{
	if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
		return false;
	if (eb && btrfs_header_level(eb) == 0)
		return false;

	return true;
}

static struct tree_mod_elem *alloc_tree_mod_elem(struct extent_buffer *eb,
						 int slot,
						 enum btrfs_mod_log_op op)
{
	struct tree_mod_elem *tm;

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm)
		return NULL;

	tm->logical = eb->start;
	if (op != BTRFS_MOD_LOG_KEY_ADD) {
		btrfs_node_key(eb, &tm->key, slot);
		tm->blockptr = btrfs_node_blockptr(eb, slot);
	}
	tm->op = op;
	tm->slot = slot;
	tm->generation = btrfs_node_ptr_generation(eb, slot);
	RB_CLEAR_NODE(&tm->node);

	return tm;
}

int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
				  enum btrfs_mod_log_op op)
{
	struct tree_mod_elem *tm;
	int ret = 0;

	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	tm = alloc_tree_mod_elem(eb, slot, op);
	if (!tm)
		ret = -ENOMEM;

	if (tree_mod_dont_log(eb->fs_info, eb)) {
		kfree(tm);
		/*
		 * Don't error if we failed to allocate memory because we don't
		 * need to log.
		 */
		return 0;
	} else if (ret != 0) {
		/*
		 * We previously failed to allocate memory and we need to log,
		 * so we have to fail.
		 */
		goto out_unlock;
	}

	ret = tree_mod_log_insert(eb->fs_info, tm);
out_unlock:
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	if (ret)
		kfree(tm);

	return ret;
}

static struct tree_mod_elem *tree_mod_log_alloc_move(struct extent_buffer *eb,
						     int dst_slot, int src_slot,
						     int nr_items)
{
	struct tree_mod_elem *tm;

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm)
		return ERR_PTR(-ENOMEM);

	tm->logical = eb->start;
	tm->slot = src_slot;
	tm->move.dst_slot = dst_slot;
	tm->move.nr_items = nr_items;
	tm->op = BTRFS_MOD_LOG_MOVE_KEYS;
	RB_CLEAR_NODE(&tm->node);

	return tm;
}

int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb,
				   int dst_slot, int src_slot,
				   int nr_items)
{
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int ret = 0;
	int i;
	bool locked = false;

	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list) {
		ret = -ENOMEM;
		goto lock;
	}

	tm = tree_mod_log_alloc_move(eb, dst_slot, src_slot, nr_items);
	if (IS_ERR(tm)) {
		ret = PTR_ERR(tm);
		tm = NULL;
		goto lock;
	}

	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
				BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto lock;
		}
	}

lock:
	if (tree_mod_dont_log(eb->fs_info, eb)) {
		/*
		 * Don't error if we failed to allocate memory because we don't
		 * need to log.
		 */
		ret = 0;
		goto free_tms;
	}
	locked = true;

	/*
	 * We previously failed to allocate memory and we need to log, so we
	 * have to fail.
	 */
	if (ret != 0)
		goto free_tms;

	/*
	 * When we overwrite something during the move, we log these removals.
	 * This can only happen when we move towards the beginning of the
	 * buffer, i.e. dst_slot < src_slot.
	 */
	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		ret = tree_mod_log_insert(eb->fs_info, tm_list[i]);
		if (ret)
			goto free_tms;
	}

	ret = tree_mod_log_insert(eb->fs_info, tm);
	if (ret)
		goto free_tms;
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return 0;

free_tms:
	if (tm_list) {
		for (i = 0; i < nr_items; i++) {
			if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
				rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log);
			kfree(tm_list[i]);
		}
	}
	if (locked)
		write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(tm_list);
	kfree(tm);

	return ret;
}
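
/*
 * Worked example for the removal logging above (slot numbers are made up for
 * illustration). Moving nr_items = 4 from src_slot = 5 to dst_slot = 2
 * overwrites the keys in slots 2, 3 and 4, so three
 * BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING records are logged (the loop runs
 * while i + dst_slot < src_slot, i.e. i = 0..2) before the single
 * BTRFS_MOD_LOG_MOVE_KEYS record for the move itself.
 */
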
static inline int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
				       struct tree_mod_elem **tm_list,
				       int nritems)
{
	int i, j;
	int ret;

	for (i = nritems - 1; i >= 0; i--) {
		ret = tree_mod_log_insert(fs_info, tm_list[i]);
		if (ret) {
			for (j = nritems - 1; j > i; j--)
				rb_erase(&tm_list[j]->node,
					 &fs_info->tree_mod_log);
			return ret;
		}
	}

	return 0;
}

int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root,
				   struct extent_buffer *new_root,
				   bool log_removal)
{
	struct btrfs_fs_info *fs_info = old_root->fs_info;
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int ret = 0;
	int i;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	if (log_removal && btrfs_header_level(old_root) > 0) {
		nritems = btrfs_header_nritems(old_root);
		tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
				  GFP_NOFS);
		if (!tm_list) {
			ret = -ENOMEM;
			goto lock;
		}
		for (i = 0; i < nritems; i++) {
			tm_list[i] = alloc_tree_mod_elem(old_root, i,
					BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING);
			if (!tm_list[i]) {
				ret = -ENOMEM;
				goto lock;
			}
		}
	}

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm) {
		ret = -ENOMEM;
		goto lock;
	}

	tm->logical = new_root->start;
	tm->old_root.logical = old_root->start;
	tm->old_root.level = btrfs_header_level(old_root);
	tm->generation = btrfs_header_generation(old_root);
	tm->op = BTRFS_MOD_LOG_ROOT_REPLACE;

lock:
	if (tree_mod_dont_log(fs_info, NULL)) {
		/*
		 * Don't error if we failed to allocate memory because we don't
		 * need to log.
		 */
		ret = 0;
		goto free_tms;
	} else if (ret != 0) {
		/*
		 * We previously failed to allocate memory and we need to log,
		 * so we have to fail.
		 */
		goto out_unlock;
	}

	if (tm_list)
		ret = tree_mod_log_free_eb(fs_info, tm_list, nritems);
	if (!ret)
		ret = tree_mod_log_insert(fs_info, tm);

out_unlock:
	write_unlock(&fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return ret;

free_tms:
	if (tm_list) {
		for (i = 0; i < nritems; i++)
			kfree(tm_list[i]);
		kfree(tm_list);
	}
	kfree(tm);

	return ret;
}
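
/*
 * Illustrative contents of a root replace record (hypothetical addresses): if
 * root block A at logical 4096, level 1, generation 10 is replaced by a new
 * root B at logical 8192, the record logged above is keyed by the *new* root:
 *
 *	tm->logical          = 8192   (B, the new root)
 *	tm->old_root.logical = 4096   (A)
 *	tm->old_root.level   = 1
 *	tm->generation       = 10
 *
 * tree_mod_log_oldest_root() below relies on this keying to hop backwards
 * from the current root to ever older roots.
 */
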
static struct tree_mod_elem *__tree_mod_log_search(struct btrfs_fs_info *fs_info,
						   u64 start, u64 min_seq,
						   bool smallest)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct tree_mod_elem *cur = NULL;
	struct tree_mod_elem *found = NULL;

	read_lock(&fs_info->tree_mod_log_lock);
	tm_root = &fs_info->tree_mod_log;
	node = tm_root->rb_node;
	while (node) {
		cur = rb_entry(node, struct tree_mod_elem, node);
		if (cur->logical < start) {
			node = node->rb_left;
		} else if (cur->logical > start) {
			node = node->rb_right;
		} else if (cur->seq < min_seq) {
			node = node->rb_left;
		} else if (!smallest) {
			/* We want the node with the highest seq */
			if (found)
				BUG_ON(found->seq > cur->seq);
			found = cur;
			node = node->rb_left;
		} else if (cur->seq > min_seq) {
			/* We want the node with the smallest seq */
			if (found)
				BUG_ON(found->seq < cur->seq);
			found = cur;
			node = node->rb_right;
		} else {
			found = cur;
			break;
		}
	}
	read_unlock(&fs_info->tree_mod_log_lock);

	return found;
}

/*
 * This returns the element from the log with the smallest time sequence
 * value that's in the log (the oldest log item). Any element with a time
 * sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info,
							u64 start, u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, true);
}

/*
 * This returns the element from the log with the largest time sequence
 * value that's in the log (the most recent log item). Any element with
 * a time sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *tree_mod_log_search(struct btrfs_fs_info *fs_info,
						 u64 start, u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, false);
}
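
/*
 * Worked example for the two search helpers above (hypothetical values): if
 * the log holds records with seqs 4, 7 and 9 for the block at 'start' and
 * min_seq is 5, then tree_mod_log_search_oldest() returns the seq 7 record
 * (the oldest one not below min_seq) and tree_mod_log_search() returns the
 * seq 9 record (the most recent one). The seq 4 record is ignored by both.
 */
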
int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst,
			       struct extent_buffer *src,
			       unsigned long dst_offset,
			       unsigned long src_offset,
			       int nr_items)
{
	struct btrfs_fs_info *fs_info = dst->fs_info;
	int ret = 0;
	struct tree_mod_elem **tm_list = NULL;
	struct tree_mod_elem **tm_list_add = NULL;
	struct tree_mod_elem **tm_list_rem = NULL;
	int i;
	bool locked = false;
	struct tree_mod_elem *dst_move_tm = NULL;
	struct tree_mod_elem *src_move_tm = NULL;
	u32 dst_move_nr_items = btrfs_header_nritems(dst) - dst_offset;
	u32 src_move_nr_items = btrfs_header_nritems(src) - (src_offset + nr_items);

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
		return 0;

	tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
			  GFP_NOFS);
	if (!tm_list) {
		ret = -ENOMEM;
		goto lock;
	}

	if (dst_move_nr_items) {
		dst_move_tm = tree_mod_log_alloc_move(dst, dst_offset + nr_items,
						      dst_offset, dst_move_nr_items);
		if (IS_ERR(dst_move_tm)) {
			ret = PTR_ERR(dst_move_tm);
			dst_move_tm = NULL;
			goto lock;
		}
	}
	if (src_move_nr_items) {
		src_move_tm = tree_mod_log_alloc_move(src, src_offset,
						      src_offset + nr_items,
						      src_move_nr_items);
		if (IS_ERR(src_move_tm)) {
			ret = PTR_ERR(src_move_tm);
			src_move_tm = NULL;
			goto lock;
		}
	}

	tm_list_add = tm_list;
	tm_list_rem = tm_list + nr_items;
	for (i = 0; i < nr_items; i++) {
		tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
						     BTRFS_MOD_LOG_KEY_REMOVE);
		if (!tm_list_rem[i]) {
			ret = -ENOMEM;
			goto lock;
		}

		tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
						     BTRFS_MOD_LOG_KEY_ADD);
		if (!tm_list_add[i]) {
			ret = -ENOMEM;
			goto lock;
		}
	}

lock:
	if (tree_mod_dont_log(fs_info, NULL)) {
		/*
		 * Don't error if we failed to allocate memory because we don't
		 * need to log.
		 */
		ret = 0;
		goto free_tms;
	}
	locked = true;

	/*
	 * We previously failed to allocate memory and we need to log, so we
	 * have to fail.
	 */
	if (ret != 0)
		goto free_tms;

	if (dst_move_tm) {
		ret = tree_mod_log_insert(fs_info, dst_move_tm);
		if (ret)
			goto free_tms;
	}
	for (i = 0; i < nr_items; i++) {
		ret = tree_mod_log_insert(fs_info, tm_list_rem[i]);
		if (ret)
			goto free_tms;
		ret = tree_mod_log_insert(fs_info, tm_list_add[i]);
		if (ret)
			goto free_tms;
	}
	if (src_move_tm) {
		ret = tree_mod_log_insert(fs_info, src_move_tm);
		if (ret)
			goto free_tms;
	}

	write_unlock(&fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return 0;

free_tms:
	if (dst_move_tm && !RB_EMPTY_NODE(&dst_move_tm->node))
		rb_erase(&dst_move_tm->node, &fs_info->tree_mod_log);
	kfree(dst_move_tm);
	if (src_move_tm && !RB_EMPTY_NODE(&src_move_tm->node))
		rb_erase(&src_move_tm->node, &fs_info->tree_mod_log);
	kfree(src_move_tm);
	if (tm_list) {
		for (i = 0; i < nr_items * 2; i++) {
			if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
				rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
			kfree(tm_list[i]);
		}
	}
	if (locked)
		write_unlock(&fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return ret;
}
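
/*
 * Sketch of what a node-to-node copy logs, in the order the records are
 * inserted above (slot numbers and layout are illustrative). Copying nr_items
 * keys from the middle of src into the middle of dst is recorded as:
 *
 *	1) a move in dst shifting the tail [dst_offset, nritems(dst)) right
 *	   by nr_items to make room (dst_move_tm),
 *	2) a KEY_REMOVE record per copied key in src and a KEY_ADD record per
 *	   key landing in dst,
 *	3) a move in src shifting the tail behind the copied range left over
 *	   the hole (src_move_tm).
 *
 * The records for dst are keyed by dst->start and those for src by
 * src->start, so a later rewind of either buffer replays only its own
 * records, newest first.
 */
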
int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb)
{
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int i;
	int ret = 0;

	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	nritems = btrfs_header_nritems(eb);
	tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list) {
		ret = -ENOMEM;
		goto lock;
	}

	for (i = 0; i < nritems; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i,
				BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto lock;
		}
	}

lock:
	if (tree_mod_dont_log(eb->fs_info, eb)) {
		/*
		 * Don't error if we failed to allocate memory because we don't
		 * need to log.
		 */
		ret = 0;
		goto free_tms;
	} else if (ret != 0) {
		/*
		 * We previously failed to allocate memory and we need to log,
		 * so we have to fail.
		 */
		goto out_unlock;
	}

	ret = tree_mod_log_free_eb(eb->fs_info, tm_list, nritems);
out_unlock:
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return 0;

free_tms:
	if (tm_list) {
		for (i = 0; i < nritems; i++)
			kfree(tm_list[i]);
		kfree(tm_list);
	}

	return ret;
}
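
/*
 * Note on the records logged above (illustrative): freeing a node with N
 * items produces N BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING records. When a
 * time-travel reader later hits such a record, the rewind code re-adds the
 * key, so the freed node can be reconstructed in a dummy extent buffer (see
 * the REMOVE_WHILE_FREEING handling in tree_mod_log_rewind() and
 * btrfs_tree_mod_log_rewind() below).
 */
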
/*
 * Returns the logical address of the oldest predecessor of the given root.
 * Entries older than time_seq are ignored.
 */
static struct tree_mod_elem *tree_mod_log_oldest_root(struct extent_buffer *eb_root,
						      u64 time_seq)
{
	struct tree_mod_elem *tm;
	struct tree_mod_elem *found = NULL;
	u64 root_logical = eb_root->start;
	bool looped = false;

	if (!time_seq)
		return NULL;

	/*
	 * The very last operation that's logged for a root is the replacement
	 * operation (if it is replaced at all). This has the logical address
	 * of the *new* root, making it the very first operation that's logged
	 * for this root.
	 */
	while (1) {
		tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical,
						time_seq);
		if (!looped && !tm)
			return NULL;
		/*
		 * If there are no tree operations for the oldest root, we
		 * simply return it. This should only happen if that (old)
		 * root is at level 0.
		 */
		if (!tm)
			break;

		/*
		 * If there's an operation that's not a root replacement, we
		 * found the oldest version of our root. Normally, we'll find a
		 * BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
		 */
		if (tm->op != BTRFS_MOD_LOG_ROOT_REPLACE)
			break;

		found = tm;
		root_logical = tm->old_root.logical;
		looped = true;
	}

	/* If there's no old root to return, return what we found instead. */
	if (!found)
		found = tm;

	return found;
}
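
/*
 * Worked example of the loop above (hypothetical roots): suppose root A was
 * replaced by B, and B by the current root C. The log then holds a
 * ROOT_REPLACE record keyed by C with old_root = B, and one keyed by B with
 * old_root = A. Starting from C, the loop hops C -> B -> A; the search at A
 * finds a non-replace record (or nothing, if A is a leaf), so the function
 * returns the B-keyed record, whose old_root points at A, the oldest
 * predecessor.
 */
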
/*
 * tm is a pointer to the first operation to rewind within eb. Then, all
 * previous operations will be rewound (until we reach something older than
 * time_seq).
 */
static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
				struct extent_buffer *eb,
				u64 time_seq,
				struct tree_mod_elem *first_tm)
{
	u32 n;
	struct rb_node *next;
	struct tree_mod_elem *tm = first_tm;
	unsigned long o_dst;
	unsigned long o_src;
	unsigned long p_size = sizeof(struct btrfs_key_ptr);
	/*
	 * max_slot tracks the maximum valid slot of the rewind eb at every
	 * step of the rewind. This is in contrast with 'n' which eventually
	 * matches the number of items, but can be wrong during moves or if
	 * removes overlap on already valid slots (which is probably a
	 * separate bug). We do this to validate the offsets of memmoves for
	 * rewinding moves and detect invalid memmoves.
	 *
	 * Since a rewind eb can start empty, max_slot is a signed integer with
	 * a special meaning for -1, which is that no slot is valid to move out
	 * of. Any other negative value is invalid.
	 */
	int max_slot;
	int move_src_end_slot;
	int move_dst_end_slot;

	n = btrfs_header_nritems(eb);
	max_slot = n - 1;
	read_lock(&fs_info->tree_mod_log_lock);
	while (tm && tm->seq >= time_seq) {
		ASSERT(max_slot >= -1);
		/*
		 * All the operations are recorded with the operator used for
		 * the modification. As we're going backwards, we do the
		 * opposite of each operation here.
		 */
		switch (tm->op) {
		case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING:
			BUG_ON(tm->slot < n);
			fallthrough;
		case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING:
		case BTRFS_MOD_LOG_KEY_REMOVE:
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			n++;
			if (tm->slot > max_slot)
				max_slot = tm->slot;
			break;
		case BTRFS_MOD_LOG_KEY_REPLACE:
			BUG_ON(tm->slot >= n);
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			break;
		case BTRFS_MOD_LOG_KEY_ADD:
			/*
			 * It is possible we could have already removed keys
			 * behind the known max slot, so this will be an
			 * overestimate. In practice, the copy operation
			 * inserts them in increasing order, and overestimating
			 * just means we miss some warnings, so it's OK. It
			 * isn't worth carefully tracking the full array of
			 * valid slots to check against when moving.
			 */
			if (tm->slot == max_slot)
				max_slot--;
			/* If a move operation is needed it's in the log. */
			n--;
			break;
		case BTRFS_MOD_LOG_MOVE_KEYS:
			ASSERT(tm->move.nr_items > 0);
			move_src_end_slot = tm->move.dst_slot + tm->move.nr_items - 1;
			move_dst_end_slot = tm->slot + tm->move.nr_items - 1;
			o_dst = btrfs_node_key_ptr_offset(eb, tm->slot);
			o_src = btrfs_node_key_ptr_offset(eb, tm->move.dst_slot);
			if (WARN_ON(move_src_end_slot > max_slot ||
				    tm->move.nr_items <= 0)) {
				btrfs_warn(fs_info,
"move from invalid tree mod log slot eb %llu slot %d dst_slot %d nr_items %d seq %llu n %u max_slot %d",
					   eb->start, tm->slot,
					   tm->move.dst_slot, tm->move.nr_items,
					   tm->seq, n, max_slot);
			}
			memmove_extent_buffer(eb, o_dst, o_src,
					      tm->move.nr_items * p_size);
			max_slot = move_dst_end_slot;
			break;
		case BTRFS_MOD_LOG_ROOT_REPLACE:
			/*
			 * This operation is special. For roots, this must be
			 * handled explicitly before rewinding.
			 * For non-roots, this operation may exist if the node
			 * was a root: root A -> child B; then A gets empty and
			 * B is promoted to the new root. In the mod log, we'll
			 * have a root-replace operation for B, a tree block
			 * that is not a root. We simply ignore that operation.
			 */
			break;
		}
		next = rb_next(&tm->node);
		if (!next)
			break;
		tm = rb_entry(next, struct tree_mod_elem, node);
		if (tm->logical != first_tm->logical)
			break;
	}
	read_unlock(&fs_info->tree_mod_log_lock);
	btrfs_set_header_nritems(eb, n);
}

/*
 * Called with eb read locked. If the buffer cannot be rewound, the same buffer
 * is returned. If rewind operations happen, a fresh buffer is returned. The
 * returned buffer is always read-locked. If the returned buffer is not the
 * input buffer, the lock on the input buffer is released and the input buffer
 * is freed (its refcount is decremented).
 */
struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
						struct btrfs_path *path,
						struct extent_buffer *eb,
						u64 time_seq)
{
	struct extent_buffer *eb_rewin;
	struct tree_mod_elem *tm;

	if (!time_seq)
		return eb;

	if (btrfs_header_level(eb) == 0)
		return eb;

	tm = tree_mod_log_search(fs_info, eb->start, time_seq);
	if (!tm)
		return eb;

	if (tm->op == BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		BUG_ON(tm->slot != 0);
		eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
		if (!eb_rewin) {
			btrfs_tree_read_unlock(eb);
			free_extent_buffer(eb);
			return NULL;
		}
		btrfs_set_header_bytenr(eb_rewin, eb->start);
		btrfs_set_header_backref_rev(eb_rewin,
					     btrfs_header_backref_rev(eb));
		btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
		btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
	} else {
		eb_rewin = btrfs_clone_extent_buffer(eb);
		if (!eb_rewin) {
			btrfs_tree_read_unlock(eb);
			free_extent_buffer(eb);
			return NULL;
		}
	}

	btrfs_tree_read_unlock(eb);
	free_extent_buffer(eb);

	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb_rewin),
				       eb_rewin, btrfs_header_level(eb_rewin));
	btrfs_tree_read_lock(eb_rewin);
	tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
	WARN_ON(btrfs_header_nritems(eb_rewin) >
		BTRFS_NODEPTRS_PER_BLOCK(fs_info));

	return eb_rewin;
}
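
/*
 * Illustrative caller pattern for the rewind above (a sketch, not lifted from
 * a specific caller; the time-travel search in ctree.c is the real user).
 * The ownership rules make error handling simple: on NULL the input buffer
 * and its lock are already gone.
 *
 *	eb = btrfs_tree_mod_log_rewind(fs_info, path, eb, time_seq);
 *	if (!eb)
 *		return -ENOMEM;	// input eb was unlocked and released for us
 *	// ... use the (possibly fresh) read-locked eb ...
 */
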
/*
 * Rewind the state of @root's root node to the given @time_seq value.
 * If there are no changes, the current root->root_node is returned. If anything
 * changed in between, there's a fresh buffer allocated on which the rewind
 * operations are done. In any case, the returned buffer is read locked.
 * Returns NULL on error (with no locks held).
 */
struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct tree_mod_elem *tm;
	struct extent_buffer *eb = NULL;
	struct extent_buffer *eb_root;
	u64 eb_root_owner = 0;
	struct extent_buffer *old;
	struct tree_mod_root *old_root = NULL;
	u64 old_generation = 0;
	u64 logical;
	int level;

	eb_root = btrfs_read_lock_root_node(root);
	tm = tree_mod_log_oldest_root(eb_root, time_seq);
	if (!tm)
		return eb_root;

	if (tm->op == BTRFS_MOD_LOG_ROOT_REPLACE) {
		old_root = &tm->old_root;
		old_generation = tm->generation;
		logical = old_root->logical;
		level = old_root->level;
	} else {
		logical = eb_root->start;
		level = btrfs_header_level(eb_root);
	}

	tm = tree_mod_log_search(fs_info, logical, time_seq);
	if (old_root && tm && tm->op != BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		struct btrfs_tree_parent_check check = { 0 };

		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);

		check.level = level;
		check.owner_root = root->root_key.objectid;

		old = read_tree_block(fs_info, logical, &check);
		if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
			if (!IS_ERR(old))
				free_extent_buffer(old);
			btrfs_warn(fs_info,
				   "failed to read tree block %llu from get_old_root",
				   logical);
		} else {
			struct tree_mod_elem *tm2;

			btrfs_tree_read_lock(old);
			eb = btrfs_clone_extent_buffer(old);
			/*
			 * After the lookup for the most recent tree mod operation
			 * above and before we locked and cloned the extent buffer
			 * 'old', a new tree mod log operation may have been added.
			 * So lookup for a more recent one to make sure the number
			 * of mod log operations we replay is consistent with the
			 * number of items we have in the cloned extent buffer,
			 * otherwise we can hit a BUG_ON when rewinding the extent
			 * buffer.
			 */
			tm2 = tree_mod_log_search(fs_info, logical, time_seq);
			btrfs_tree_read_unlock(old);
			free_extent_buffer(old);
			ASSERT(tm2);
			ASSERT(tm2 == tm || tm2->seq > tm->seq);
			if (!tm2 || tm2->seq < tm->seq) {
				free_extent_buffer(eb);
				return NULL;
			}
			tm = tm2;
		}
	} else if (old_root) {
		eb_root_owner = btrfs_header_owner(eb_root);
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
		eb = alloc_dummy_extent_buffer(fs_info, logical);
	} else {
		eb = btrfs_clone_extent_buffer(eb_root);
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
	}

	if (!eb)
		return NULL;
	if (old_root) {
		btrfs_set_header_bytenr(eb, eb->start);
		btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
		btrfs_set_header_owner(eb, eb_root_owner);
		btrfs_set_header_level(eb, old_root->level);
		btrfs_set_header_generation(eb, old_generation);
	}
	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb,
				       btrfs_header_level(eb));
	btrfs_tree_read_lock(eb);
	if (tm)
		tree_mod_log_rewind(fs_info, eb, time_seq, tm);
	else
		WARN_ON(btrfs_header_level(eb) != 0);
	WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info));

	return eb;
}
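
/*
 * Illustrative time-travel walk using the helpers in this file (a sketch, not
 * taken from a real caller): starting a historical traversal needs the old
 * root and its level; btrfs_old_root_level() below answers the latter without
 * keeping the buffer around.
 *
 *	struct extent_buffer *eb;
 *
 *	eb = btrfs_get_old_root(root, time_seq);
 *	if (!eb)
 *		return -ENOMEM;
 *	// ... descend from eb, rewinding lower nodes as needed ...
 *	btrfs_tree_read_unlock(eb);
 *	free_extent_buffer(eb);
 */
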
int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
{
	struct tree_mod_elem *tm;
	int level;
	struct extent_buffer *eb_root = btrfs_root_node(root);

	tm = tree_mod_log_oldest_root(eb_root, time_seq);
	if (tm && tm->op == BTRFS_MOD_LOG_ROOT_REPLACE)
		level = tm->old_root.level;
	else
		level = btrfs_header_level(eb_root);

	free_extent_buffer(eb_root);

	return level;
}

/*
 * Return the lowest sequence number in the tree modification log.
 *
 * Return the sequence number of the oldest tree modification log user, which
 * corresponds to the lowest sequence number of all existing users. If there
 * are no users it returns 0.
 */
u64 btrfs_tree_mod_log_lowest_seq(struct btrfs_fs_info *fs_info)
{
	u64 ret = 0;

	read_lock(&fs_info->tree_mod_log_lock);
	if (!list_empty(&fs_info->tree_mod_seq_list)) {
		struct btrfs_seq_list *elem;

		elem = list_first_entry(&fs_info->tree_mod_seq_list,
					struct btrfs_seq_list, list);
		ret = elem->seq;
	}
	read_unlock(&fs_info->tree_mod_log_lock);

	return ret;
}
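
/*
 * Illustrative consumer pattern for btrfs_tree_mod_log_lowest_seq() above (a
 * sketch, not a specific in-tree caller): code that tags records with a tree
 * mod log sequence number can discard a record once no log user is old enough
 * to still need it, remembering that 0 means "no users at all":
 *
 *	u64 lowest = btrfs_tree_mod_log_lowest_seq(fs_info);
 *
 *	if (lowest == 0 || record_seq < lowest)
 *		// no tree mod log user can depend on this record, drop it
 */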