1 // SPDX-License-Identifier: GPL-2.0 2 3 #include "messages.h" 4 #include "tree-mod-log.h" 5 #include "disk-io.h" 6 #include "fs.h" 7 #include "accessors.h" 8 #include "tree-checker.h" 9 10 struct tree_mod_root { 11 u64 logical; 12 u8 level; 13 }; 14 15 struct tree_mod_elem { 16 struct rb_node node; 17 u64 logical; 18 u64 seq; 19 enum btrfs_mod_log_op op; 20 21 /* 22 * This is used for BTRFS_MOD_LOG_KEY_* and BTRFS_MOD_LOG_MOVE_KEYS 23 * operations. 24 */ 25 int slot; 26 27 /* This is used for BTRFS_MOD_LOG_KEY* and BTRFS_MOD_LOG_ROOT_REPLACE. */ 28 u64 generation; 29 30 /* Those are used for op == BTRFS_MOD_LOG_KEY_{REPLACE,REMOVE}. */ 31 struct btrfs_disk_key key; 32 u64 blockptr; 33 34 /* This is used for op == BTRFS_MOD_LOG_MOVE_KEYS. */ 35 struct { 36 int dst_slot; 37 int nr_items; 38 } move; 39 40 /* This is used for op == BTRFS_MOD_LOG_ROOT_REPLACE. */ 41 struct tree_mod_root old_root; 42 }; 43 44 /* 45 * Pull a new tree mod seq number for our operation. 46 */ 47 static u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info) 48 { 49 return atomic64_inc_return(&fs_info->tree_mod_seq); 50 } 51 52 /* 53 * This adds a new blocker to the tree mod log's blocker list if the @elem 54 * passed does not already have a sequence number set. So when a caller expects 55 * to record tree modifications, it should ensure to set elem->seq to zero 56 * before calling btrfs_get_tree_mod_seq. 57 * Returns a fresh, unused tree log modification sequence number, even if no new 58 * blocker was added. 59 */ 60 u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, 61 struct btrfs_seq_list *elem) 62 { 63 write_lock(&fs_info->tree_mod_log_lock); 64 if (!elem->seq) { 65 elem->seq = btrfs_inc_tree_mod_seq(fs_info); 66 list_add_tail(&elem->list, &fs_info->tree_mod_seq_list); 67 set_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags); 68 } 69 write_unlock(&fs_info->tree_mod_log_lock); 70 71 return elem->seq; 72 } 73 74 void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, 75 struct btrfs_seq_list *elem) 76 { 77 struct rb_root *tm_root; 78 struct rb_node *node; 79 struct rb_node *next; 80 struct tree_mod_elem *tm; 81 u64 min_seq = BTRFS_SEQ_LAST; 82 u64 seq_putting = elem->seq; 83 84 if (!seq_putting) 85 return; 86 87 write_lock(&fs_info->tree_mod_log_lock); 88 list_del(&elem->list); 89 elem->seq = 0; 90 91 if (list_empty(&fs_info->tree_mod_seq_list)) { 92 clear_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags); 93 } else { 94 struct btrfs_seq_list *first; 95 96 first = list_first_entry(&fs_info->tree_mod_seq_list, 97 struct btrfs_seq_list, list); 98 if (seq_putting > first->seq) { 99 /* 100 * Blocker with lower sequence number exists, we cannot 101 * remove anything from the log. 102 */ 103 write_unlock(&fs_info->tree_mod_log_lock); 104 return; 105 } 106 min_seq = first->seq; 107 } 108 109 /* 110 * Anything that's lower than the lowest existing (read: blocked) 111 * sequence number can be removed from the tree. 112 */ 113 tm_root = &fs_info->tree_mod_log; 114 for (node = rb_first(tm_root); node; node = next) { 115 next = rb_next(node); 116 tm = rb_entry(node, struct tree_mod_elem, node); 117 if (tm->seq >= min_seq) 118 continue; 119 rb_erase(node, tm_root); 120 kfree(tm); 121 } 122 write_unlock(&fs_info->tree_mod_log_lock); 123 } 124 125 /* 126 * Key order of the log: 127 * node/leaf start address -> sequence 128 * 129 * The 'start address' is the logical address of the *new* root node for root 130 * replace operations, or the logical address of the affected block for all 131 * other operations. 132 */ 133 static noinline int tree_mod_log_insert(struct btrfs_fs_info *fs_info, 134 struct tree_mod_elem *tm) 135 { 136 struct rb_root *tm_root; 137 struct rb_node **new; 138 struct rb_node *parent = NULL; 139 struct tree_mod_elem *cur; 140 141 lockdep_assert_held_write(&fs_info->tree_mod_log_lock); 142 143 tm->seq = btrfs_inc_tree_mod_seq(fs_info); 144 145 tm_root = &fs_info->tree_mod_log; 146 new = &tm_root->rb_node; 147 while (*new) { 148 cur = rb_entry(*new, struct tree_mod_elem, node); 149 parent = *new; 150 if (cur->logical < tm->logical) 151 new = &((*new)->rb_left); 152 else if (cur->logical > tm->logical) 153 new = &((*new)->rb_right); 154 else if (cur->seq < tm->seq) 155 new = &((*new)->rb_left); 156 else if (cur->seq > tm->seq) 157 new = &((*new)->rb_right); 158 else 159 return -EEXIST; 160 } 161 162 rb_link_node(&tm->node, parent, new); 163 rb_insert_color(&tm->node, tm_root); 164 return 0; 165 } 166 167 /* 168 * Determines if logging can be omitted. Returns true if it can. Otherwise, it 169 * returns false with the tree_mod_log_lock acquired. The caller must hold 170 * this until all tree mod log insertions are recorded in the rb tree and then 171 * write unlock fs_info::tree_mod_log_lock. 172 */ 173 static bool tree_mod_dont_log(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) 174 { 175 if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)) 176 return true; 177 if (eb && btrfs_header_level(eb) == 0) 178 return true; 179 180 write_lock(&fs_info->tree_mod_log_lock); 181 if (list_empty(&(fs_info)->tree_mod_seq_list)) { 182 write_unlock(&fs_info->tree_mod_log_lock); 183 return true; 184 } 185 186 return false; 187 } 188 189 /* Similar to tree_mod_dont_log, but doesn't acquire any locks. */ 190 static bool tree_mod_need_log(const struct btrfs_fs_info *fs_info, 191 struct extent_buffer *eb) 192 { 193 if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)) 194 return false; 195 if (eb && btrfs_header_level(eb) == 0) 196 return false; 197 198 return true; 199 } 200 201 static struct tree_mod_elem *alloc_tree_mod_elem(struct extent_buffer *eb, 202 int slot, 203 enum btrfs_mod_log_op op) 204 { 205 struct tree_mod_elem *tm; 206 207 tm = kzalloc(sizeof(*tm), GFP_NOFS); 208 if (!tm) 209 return NULL; 210 211 tm->logical = eb->start; 212 if (op != BTRFS_MOD_LOG_KEY_ADD) { 213 btrfs_node_key(eb, &tm->key, slot); 214 tm->blockptr = btrfs_node_blockptr(eb, slot); 215 } 216 tm->op = op; 217 tm->slot = slot; 218 tm->generation = btrfs_node_ptr_generation(eb, slot); 219 RB_CLEAR_NODE(&tm->node); 220 221 return tm; 222 } 223 224 int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot, 225 enum btrfs_mod_log_op op) 226 { 227 struct tree_mod_elem *tm; 228 int ret = 0; 229 230 if (!tree_mod_need_log(eb->fs_info, eb)) 231 return 0; 232 233 tm = alloc_tree_mod_elem(eb, slot, op); 234 if (!tm) 235 ret = -ENOMEM; 236 237 if (tree_mod_dont_log(eb->fs_info, eb)) { 238 kfree(tm); 239 /* 240 * Don't error if we failed to allocate memory because we don't 241 * need to log. 242 */ 243 return 0; 244 } else if (ret != 0) { 245 /* 246 * We previously failed to allocate memory and we need to log, 247 * so we have to fail. 248 */ 249 goto out_unlock; 250 } 251 252 ret = tree_mod_log_insert(eb->fs_info, tm); 253 out_unlock: 254 write_unlock(&eb->fs_info->tree_mod_log_lock); 255 if (ret) 256 kfree(tm); 257 258 return ret; 259 } 260 261 static struct tree_mod_elem *tree_mod_log_alloc_move(struct extent_buffer *eb, 262 int dst_slot, int src_slot, 263 int nr_items) 264 { 265 struct tree_mod_elem *tm; 266 267 tm = kzalloc(sizeof(*tm), GFP_NOFS); 268 if (!tm) 269 return ERR_PTR(-ENOMEM); 270 271 tm->logical = eb->start; 272 tm->slot = src_slot; 273 tm->move.dst_slot = dst_slot; 274 tm->move.nr_items = nr_items; 275 tm->op = BTRFS_MOD_LOG_MOVE_KEYS; 276 RB_CLEAR_NODE(&tm->node); 277 278 return tm; 279 } 280 281 int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb, 282 int dst_slot, int src_slot, 283 int nr_items) 284 { 285 struct tree_mod_elem *tm = NULL; 286 struct tree_mod_elem **tm_list = NULL; 287 int ret = 0; 288 int i; 289 bool locked = false; 290 291 if (!tree_mod_need_log(eb->fs_info, eb)) 292 return 0; 293 294 tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS); 295 if (!tm_list) { 296 ret = -ENOMEM; 297 goto lock; 298 } 299 300 tm = tree_mod_log_alloc_move(eb, dst_slot, src_slot, nr_items); 301 if (IS_ERR(tm)) { 302 ret = PTR_ERR(tm); 303 tm = NULL; 304 goto lock; 305 } 306 307 for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { 308 tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot, 309 BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING); 310 if (!tm_list[i]) { 311 ret = -ENOMEM; 312 goto lock; 313 } 314 } 315 316 lock: 317 if (tree_mod_dont_log(eb->fs_info, eb)) { 318 /* 319 * Don't error if we failed to allocate memory because we don't 320 * need to log. 321 */ 322 ret = 0; 323 goto free_tms; 324 } 325 locked = true; 326 327 /* 328 * We previously failed to allocate memory and we need to log, so we 329 * have to fail. 330 */ 331 if (ret != 0) 332 goto free_tms; 333 334 /* 335 * When we override something during the move, we log these removals. 336 * This can only happen when we move towards the beginning of the 337 * buffer, i.e. dst_slot < src_slot. 338 */ 339 for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { 340 ret = tree_mod_log_insert(eb->fs_info, tm_list[i]); 341 if (ret) 342 goto free_tms; 343 } 344 345 ret = tree_mod_log_insert(eb->fs_info, tm); 346 if (ret) 347 goto free_tms; 348 write_unlock(&eb->fs_info->tree_mod_log_lock); 349 kfree(tm_list); 350 351 return 0; 352 353 free_tms: 354 if (tm_list) { 355 for (i = 0; i < nr_items; i++) { 356 if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node)) 357 rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log); 358 kfree(tm_list[i]); 359 } 360 } 361 if (locked) 362 write_unlock(&eb->fs_info->tree_mod_log_lock); 363 kfree(tm_list); 364 kfree(tm); 365 366 return ret; 367 } 368 369 static int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, 370 struct tree_mod_elem **tm_list, 371 int nritems) 372 { 373 int i, j; 374 int ret; 375 376 for (i = nritems - 1; i >= 0; i--) { 377 ret = tree_mod_log_insert(fs_info, tm_list[i]); 378 if (ret) { 379 for (j = nritems - 1; j > i; j--) 380 rb_erase(&tm_list[j]->node, 381 &fs_info->tree_mod_log); 382 return ret; 383 } 384 } 385 386 return 0; 387 } 388 389 int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root, 390 struct extent_buffer *new_root, 391 bool log_removal) 392 { 393 struct btrfs_fs_info *fs_info = old_root->fs_info; 394 struct tree_mod_elem *tm = NULL; 395 struct tree_mod_elem **tm_list = NULL; 396 int nritems = 0; 397 int ret = 0; 398 int i; 399 400 if (!tree_mod_need_log(fs_info, NULL)) 401 return 0; 402 403 if (log_removal && btrfs_header_level(old_root) > 0) { 404 nritems = btrfs_header_nritems(old_root); 405 tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), 406 GFP_NOFS); 407 if (!tm_list) { 408 ret = -ENOMEM; 409 goto lock; 410 } 411 for (i = 0; i < nritems; i++) { 412 tm_list[i] = alloc_tree_mod_elem(old_root, i, 413 BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING); 414 if (!tm_list[i]) { 415 ret = -ENOMEM; 416 goto lock; 417 } 418 } 419 } 420 421 tm = kzalloc(sizeof(*tm), GFP_NOFS); 422 if (!tm) { 423 ret = -ENOMEM; 424 goto lock; 425 } 426 427 tm->logical = new_root->start; 428 tm->old_root.logical = old_root->start; 429 tm->old_root.level = btrfs_header_level(old_root); 430 tm->generation = btrfs_header_generation(old_root); 431 tm->op = BTRFS_MOD_LOG_ROOT_REPLACE; 432 433 lock: 434 if (tree_mod_dont_log(fs_info, NULL)) { 435 /* 436 * Don't error if we failed to allocate memory because we don't 437 * need to log. 438 */ 439 ret = 0; 440 goto free_tms; 441 } else if (ret != 0) { 442 /* 443 * We previously failed to allocate memory and we need to log, 444 * so we have to fail. 445 */ 446 goto out_unlock; 447 } 448 449 if (tm_list) 450 ret = tree_mod_log_free_eb(fs_info, tm_list, nritems); 451 if (!ret) 452 ret = tree_mod_log_insert(fs_info, tm); 453 454 out_unlock: 455 write_unlock(&fs_info->tree_mod_log_lock); 456 if (ret) 457 goto free_tms; 458 kfree(tm_list); 459 460 return ret; 461 462 free_tms: 463 if (tm_list) { 464 for (i = 0; i < nritems; i++) 465 kfree(tm_list[i]); 466 kfree(tm_list); 467 } 468 kfree(tm); 469 470 return ret; 471 } 472 473 static struct tree_mod_elem *__tree_mod_log_search(struct btrfs_fs_info *fs_info, 474 u64 start, u64 min_seq, 475 bool smallest) 476 { 477 struct rb_root *tm_root; 478 struct rb_node *node; 479 struct tree_mod_elem *cur = NULL; 480 struct tree_mod_elem *found = NULL; 481 482 read_lock(&fs_info->tree_mod_log_lock); 483 tm_root = &fs_info->tree_mod_log; 484 node = tm_root->rb_node; 485 while (node) { 486 cur = rb_entry(node, struct tree_mod_elem, node); 487 if (cur->logical < start) { 488 node = node->rb_left; 489 } else if (cur->logical > start) { 490 node = node->rb_right; 491 } else if (cur->seq < min_seq) { 492 node = node->rb_left; 493 } else if (!smallest) { 494 /* We want the node with the highest seq */ 495 if (found) 496 BUG_ON(found->seq > cur->seq); 497 found = cur; 498 node = node->rb_left; 499 } else if (cur->seq > min_seq) { 500 /* We want the node with the smallest seq */ 501 if (found) 502 BUG_ON(found->seq < cur->seq); 503 found = cur; 504 node = node->rb_right; 505 } else { 506 found = cur; 507 break; 508 } 509 } 510 read_unlock(&fs_info->tree_mod_log_lock); 511 512 return found; 513 } 514 515 /* 516 * This returns the element from the log with the smallest time sequence 517 * value that's in the log (the oldest log item). Any element with a time 518 * sequence lower than min_seq will be ignored. 519 */ 520 static struct tree_mod_elem *tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, 521 u64 start, u64 min_seq) 522 { 523 return __tree_mod_log_search(fs_info, start, min_seq, true); 524 } 525 526 /* 527 * This returns the element from the log with the largest time sequence 528 * value that's in the log (the most recent log item). Any element with 529 * a time sequence lower than min_seq will be ignored. 530 */ 531 static struct tree_mod_elem *tree_mod_log_search(struct btrfs_fs_info *fs_info, 532 u64 start, u64 min_seq) 533 { 534 return __tree_mod_log_search(fs_info, start, min_seq, false); 535 } 536 537 int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst, 538 struct extent_buffer *src, 539 unsigned long dst_offset, 540 unsigned long src_offset, 541 int nr_items) 542 { 543 struct btrfs_fs_info *fs_info = dst->fs_info; 544 int ret = 0; 545 struct tree_mod_elem **tm_list = NULL; 546 struct tree_mod_elem **tm_list_add = NULL; 547 struct tree_mod_elem **tm_list_rem = NULL; 548 int i; 549 bool locked = false; 550 struct tree_mod_elem *dst_move_tm = NULL; 551 struct tree_mod_elem *src_move_tm = NULL; 552 u32 dst_move_nr_items = btrfs_header_nritems(dst) - dst_offset; 553 u32 src_move_nr_items = btrfs_header_nritems(src) - (src_offset + nr_items); 554 555 if (!tree_mod_need_log(fs_info, NULL)) 556 return 0; 557 558 if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0) 559 return 0; 560 561 tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *), 562 GFP_NOFS); 563 if (!tm_list) { 564 ret = -ENOMEM; 565 goto lock; 566 } 567 568 if (dst_move_nr_items) { 569 dst_move_tm = tree_mod_log_alloc_move(dst, dst_offset + nr_items, 570 dst_offset, dst_move_nr_items); 571 if (IS_ERR(dst_move_tm)) { 572 ret = PTR_ERR(dst_move_tm); 573 dst_move_tm = NULL; 574 goto lock; 575 } 576 } 577 if (src_move_nr_items) { 578 src_move_tm = tree_mod_log_alloc_move(src, src_offset, 579 src_offset + nr_items, 580 src_move_nr_items); 581 if (IS_ERR(src_move_tm)) { 582 ret = PTR_ERR(src_move_tm); 583 src_move_tm = NULL; 584 goto lock; 585 } 586 } 587 588 tm_list_add = tm_list; 589 tm_list_rem = tm_list + nr_items; 590 for (i = 0; i < nr_items; i++) { 591 tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset, 592 BTRFS_MOD_LOG_KEY_REMOVE); 593 if (!tm_list_rem[i]) { 594 ret = -ENOMEM; 595 goto lock; 596 } 597 598 tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset, 599 BTRFS_MOD_LOG_KEY_ADD); 600 if (!tm_list_add[i]) { 601 ret = -ENOMEM; 602 goto lock; 603 } 604 } 605 606 lock: 607 if (tree_mod_dont_log(fs_info, NULL)) { 608 /* 609 * Don't error if we failed to allocate memory because we don't 610 * need to log. 611 */ 612 ret = 0; 613 goto free_tms; 614 } 615 locked = true; 616 617 /* 618 * We previously failed to allocate memory and we need to log, so we 619 * have to fail. 620 */ 621 if (ret != 0) 622 goto free_tms; 623 624 if (dst_move_tm) { 625 ret = tree_mod_log_insert(fs_info, dst_move_tm); 626 if (ret) 627 goto free_tms; 628 } 629 for (i = 0; i < nr_items; i++) { 630 ret = tree_mod_log_insert(fs_info, tm_list_rem[i]); 631 if (ret) 632 goto free_tms; 633 ret = tree_mod_log_insert(fs_info, tm_list_add[i]); 634 if (ret) 635 goto free_tms; 636 } 637 if (src_move_tm) { 638 ret = tree_mod_log_insert(fs_info, src_move_tm); 639 if (ret) 640 goto free_tms; 641 } 642 643 write_unlock(&fs_info->tree_mod_log_lock); 644 kfree(tm_list); 645 646 return 0; 647 648 free_tms: 649 if (dst_move_tm && !RB_EMPTY_NODE(&dst_move_tm->node)) 650 rb_erase(&dst_move_tm->node, &fs_info->tree_mod_log); 651 kfree(dst_move_tm); 652 if (src_move_tm && !RB_EMPTY_NODE(&src_move_tm->node)) 653 rb_erase(&src_move_tm->node, &fs_info->tree_mod_log); 654 kfree(src_move_tm); 655 if (tm_list) { 656 for (i = 0; i < nr_items * 2; i++) { 657 if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node)) 658 rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log); 659 kfree(tm_list[i]); 660 } 661 } 662 if (locked) 663 write_unlock(&fs_info->tree_mod_log_lock); 664 kfree(tm_list); 665 666 return ret; 667 } 668 669 int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb) 670 { 671 struct tree_mod_elem **tm_list = NULL; 672 int nritems = 0; 673 int i; 674 int ret = 0; 675 676 if (!tree_mod_need_log(eb->fs_info, eb)) 677 return 0; 678 679 nritems = btrfs_header_nritems(eb); 680 tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS); 681 if (!tm_list) { 682 ret = -ENOMEM; 683 goto lock; 684 } 685 686 for (i = 0; i < nritems; i++) { 687 tm_list[i] = alloc_tree_mod_elem(eb, i, 688 BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING); 689 if (!tm_list[i]) { 690 ret = -ENOMEM; 691 goto lock; 692 } 693 } 694 695 lock: 696 if (tree_mod_dont_log(eb->fs_info, eb)) { 697 /* 698 * Don't error if we failed to allocate memory because we don't 699 * need to log. 700 */ 701 ret = 0; 702 goto free_tms; 703 } else if (ret != 0) { 704 /* 705 * We previously failed to allocate memory and we need to log, 706 * so we have to fail. 707 */ 708 goto out_unlock; 709 } 710 711 ret = tree_mod_log_free_eb(eb->fs_info, tm_list, nritems); 712 out_unlock: 713 write_unlock(&eb->fs_info->tree_mod_log_lock); 714 if (ret) 715 goto free_tms; 716 kfree(tm_list); 717 718 return 0; 719 720 free_tms: 721 if (tm_list) { 722 for (i = 0; i < nritems; i++) 723 kfree(tm_list[i]); 724 kfree(tm_list); 725 } 726 727 return ret; 728 } 729 730 /* 731 * Returns the logical address of the oldest predecessor of the given root. 732 * Entries older than time_seq are ignored. 733 */ 734 static struct tree_mod_elem *tree_mod_log_oldest_root(struct extent_buffer *eb_root, 735 u64 time_seq) 736 { 737 struct tree_mod_elem *tm; 738 struct tree_mod_elem *found = NULL; 739 u64 root_logical = eb_root->start; 740 bool looped = false; 741 742 if (!time_seq) 743 return NULL; 744 745 /* 746 * The very last operation that's logged for a root is the replacement 747 * operation (if it is replaced at all). This has the logical address 748 * of the *new* root, making it the very first operation that's logged 749 * for this root. 750 */ 751 while (1) { 752 tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical, 753 time_seq); 754 if (!looped && !tm) 755 return NULL; 756 /* 757 * If there are no tree operation for the oldest root, we simply 758 * return it. This should only happen if that (old) root is at 759 * level 0. 760 */ 761 if (!tm) 762 break; 763 764 /* 765 * If there's an operation that's not a root replacement, we 766 * found the oldest version of our root. Normally, we'll find a 767 * BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here. 768 */ 769 if (tm->op != BTRFS_MOD_LOG_ROOT_REPLACE) 770 break; 771 772 found = tm; 773 root_logical = tm->old_root.logical; 774 looped = true; 775 } 776 777 /* If there's no old root to return, return what we found instead */ 778 if (!found) 779 found = tm; 780 781 return found; 782 } 783 784 785 /* 786 * tm is a pointer to the first operation to rewind within eb. Then, all 787 * previous operations will be rewound (until we reach something older than 788 * time_seq). 789 */ 790 static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info, 791 struct extent_buffer *eb, 792 u64 time_seq, 793 struct tree_mod_elem *first_tm) 794 { 795 u32 n; 796 struct rb_node *next; 797 struct tree_mod_elem *tm = first_tm; 798 unsigned long o_dst; 799 unsigned long o_src; 800 unsigned long p_size = sizeof(struct btrfs_key_ptr); 801 /* 802 * max_slot tracks the maximum valid slot of the rewind eb at every 803 * step of the rewind. This is in contrast with 'n' which eventually 804 * matches the number of items, but can be wrong during moves or if 805 * removes overlap on already valid slots (which is probably separately 806 * a bug). We do this to validate the offsets of memmoves for rewinding 807 * moves and detect invalid memmoves. 808 * 809 * Since a rewind eb can start empty, max_slot is a signed integer with 810 * a special meaning for -1, which is that no slot is valid to move out 811 * of. Any other negative value is invalid. 812 */ 813 int max_slot; 814 int move_src_end_slot; 815 int move_dst_end_slot; 816 817 n = btrfs_header_nritems(eb); 818 max_slot = n - 1; 819 read_lock(&fs_info->tree_mod_log_lock); 820 while (tm && tm->seq >= time_seq) { 821 ASSERT(max_slot >= -1); 822 /* 823 * All the operations are recorded with the operator used for 824 * the modification. As we're going backwards, we do the 825 * opposite of each operation here. 826 */ 827 switch (tm->op) { 828 case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING: 829 BUG_ON(tm->slot < n); 830 fallthrough; 831 case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING: 832 case BTRFS_MOD_LOG_KEY_REMOVE: 833 btrfs_set_node_key(eb, &tm->key, tm->slot); 834 btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr); 835 btrfs_set_node_ptr_generation(eb, tm->slot, 836 tm->generation); 837 n++; 838 if (tm->slot > max_slot) 839 max_slot = tm->slot; 840 break; 841 case BTRFS_MOD_LOG_KEY_REPLACE: 842 BUG_ON(tm->slot >= n); 843 btrfs_set_node_key(eb, &tm->key, tm->slot); 844 btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr); 845 btrfs_set_node_ptr_generation(eb, tm->slot, 846 tm->generation); 847 break; 848 case BTRFS_MOD_LOG_KEY_ADD: 849 /* 850 * It is possible we could have already removed keys 851 * behind the known max slot, so this will be an 852 * overestimate. In practice, the copy operation 853 * inserts them in increasing order, and overestimating 854 * just means we miss some warnings, so it's OK. It 855 * isn't worth carefully tracking the full array of 856 * valid slots to check against when moving. 857 */ 858 if (tm->slot == max_slot) 859 max_slot--; 860 /* if a move operation is needed it's in the log */ 861 n--; 862 break; 863 case BTRFS_MOD_LOG_MOVE_KEYS: 864 ASSERT(tm->move.nr_items > 0); 865 move_src_end_slot = tm->move.dst_slot + tm->move.nr_items - 1; 866 move_dst_end_slot = tm->slot + tm->move.nr_items - 1; 867 o_dst = btrfs_node_key_ptr_offset(eb, tm->slot); 868 o_src = btrfs_node_key_ptr_offset(eb, tm->move.dst_slot); 869 if (WARN_ON(move_src_end_slot > max_slot || 870 tm->move.nr_items <= 0)) { 871 btrfs_warn(fs_info, 872 "move from invalid tree mod log slot eb %llu slot %d dst_slot %d nr_items %d seq %llu n %u max_slot %d", 873 eb->start, tm->slot, 874 tm->move.dst_slot, tm->move.nr_items, 875 tm->seq, n, max_slot); 876 } 877 memmove_extent_buffer(eb, o_dst, o_src, 878 tm->move.nr_items * p_size); 879 max_slot = move_dst_end_slot; 880 break; 881 case BTRFS_MOD_LOG_ROOT_REPLACE: 882 /* 883 * This operation is special. For roots, this must be 884 * handled explicitly before rewinding. 885 * For non-roots, this operation may exist if the node 886 * was a root: root A -> child B; then A gets empty and 887 * B is promoted to the new root. In the mod log, we'll 888 * have a root-replace operation for B, a tree block 889 * that is no root. We simply ignore that operation. 890 */ 891 break; 892 } 893 next = rb_next(&tm->node); 894 if (!next) 895 break; 896 tm = rb_entry(next, struct tree_mod_elem, node); 897 if (tm->logical != first_tm->logical) 898 break; 899 } 900 read_unlock(&fs_info->tree_mod_log_lock); 901 btrfs_set_header_nritems(eb, n); 902 } 903 904 /* 905 * Called with eb read locked. If the buffer cannot be rewound, the same buffer 906 * is returned. If rewind operations happen, a fresh buffer is returned. The 907 * returned buffer is always read-locked. If the returned buffer is not the 908 * input buffer, the lock on the input buffer is released and the input buffer 909 * is freed (its refcount is decremented). 910 */ 911 struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info, 912 struct btrfs_path *path, 913 struct extent_buffer *eb, 914 u64 time_seq) 915 { 916 struct extent_buffer *eb_rewin; 917 struct tree_mod_elem *tm; 918 919 if (!time_seq) 920 return eb; 921 922 if (btrfs_header_level(eb) == 0) 923 return eb; 924 925 tm = tree_mod_log_search(fs_info, eb->start, time_seq); 926 if (!tm) 927 return eb; 928 929 if (tm->op == BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) { 930 BUG_ON(tm->slot != 0); 931 eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start); 932 if (!eb_rewin) { 933 btrfs_tree_read_unlock(eb); 934 free_extent_buffer(eb); 935 return NULL; 936 } 937 btrfs_set_header_bytenr(eb_rewin, eb->start); 938 btrfs_set_header_backref_rev(eb_rewin, 939 btrfs_header_backref_rev(eb)); 940 btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb)); 941 btrfs_set_header_level(eb_rewin, btrfs_header_level(eb)); 942 } else { 943 eb_rewin = btrfs_clone_extent_buffer(eb); 944 if (!eb_rewin) { 945 btrfs_tree_read_unlock(eb); 946 free_extent_buffer(eb); 947 return NULL; 948 } 949 } 950 951 btrfs_tree_read_unlock(eb); 952 free_extent_buffer(eb); 953 954 btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb_rewin), 955 eb_rewin, btrfs_header_level(eb_rewin)); 956 btrfs_tree_read_lock(eb_rewin); 957 tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm); 958 WARN_ON(btrfs_header_nritems(eb_rewin) > 959 BTRFS_NODEPTRS_PER_BLOCK(fs_info)); 960 961 return eb_rewin; 962 } 963 964 /* 965 * Rewind the state of @root's root node to the given @time_seq value. 966 * If there are no changes, the current root->root_node is returned. If anything 967 * changed in between, there's a fresh buffer allocated on which the rewind 968 * operations are done. In any case, the returned buffer is read locked. 969 * Returns NULL on error (with no locks held). 970 */ 971 struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq) 972 { 973 struct btrfs_fs_info *fs_info = root->fs_info; 974 struct tree_mod_elem *tm; 975 struct extent_buffer *eb = NULL; 976 struct extent_buffer *eb_root; 977 u64 eb_root_owner = 0; 978 struct extent_buffer *old; 979 struct tree_mod_root *old_root = NULL; 980 u64 old_generation = 0; 981 u64 logical; 982 int level; 983 984 eb_root = btrfs_read_lock_root_node(root); 985 tm = tree_mod_log_oldest_root(eb_root, time_seq); 986 if (!tm) 987 return eb_root; 988 989 if (tm->op == BTRFS_MOD_LOG_ROOT_REPLACE) { 990 old_root = &tm->old_root; 991 old_generation = tm->generation; 992 logical = old_root->logical; 993 level = old_root->level; 994 } else { 995 logical = eb_root->start; 996 level = btrfs_header_level(eb_root); 997 } 998 999 tm = tree_mod_log_search(fs_info, logical, time_seq); 1000 if (old_root && tm && tm->op != BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) { 1001 struct btrfs_tree_parent_check check = { 0 }; 1002 1003 btrfs_tree_read_unlock(eb_root); 1004 free_extent_buffer(eb_root); 1005 1006 check.level = level; 1007 check.owner_root = root->root_key.objectid; 1008 1009 old = read_tree_block(fs_info, logical, &check); 1010 if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) { 1011 if (!IS_ERR(old)) 1012 free_extent_buffer(old); 1013 btrfs_warn(fs_info, 1014 "failed to read tree block %llu from get_old_root", 1015 logical); 1016 } else { 1017 struct tree_mod_elem *tm2; 1018 1019 btrfs_tree_read_lock(old); 1020 eb = btrfs_clone_extent_buffer(old); 1021 /* 1022 * After the lookup for the most recent tree mod operation 1023 * above and before we locked and cloned the extent buffer 1024 * 'old', a new tree mod log operation may have been added. 1025 * So lookup for a more recent one to make sure the number 1026 * of mod log operations we replay is consistent with the 1027 * number of items we have in the cloned extent buffer, 1028 * otherwise we can hit a BUG_ON when rewinding the extent 1029 * buffer. 1030 */ 1031 tm2 = tree_mod_log_search(fs_info, logical, time_seq); 1032 btrfs_tree_read_unlock(old); 1033 free_extent_buffer(old); 1034 ASSERT(tm2); 1035 ASSERT(tm2 == tm || tm2->seq > tm->seq); 1036 if (!tm2 || tm2->seq < tm->seq) { 1037 free_extent_buffer(eb); 1038 return NULL; 1039 } 1040 tm = tm2; 1041 } 1042 } else if (old_root) { 1043 eb_root_owner = btrfs_header_owner(eb_root); 1044 btrfs_tree_read_unlock(eb_root); 1045 free_extent_buffer(eb_root); 1046 eb = alloc_dummy_extent_buffer(fs_info, logical); 1047 } else { 1048 eb = btrfs_clone_extent_buffer(eb_root); 1049 btrfs_tree_read_unlock(eb_root); 1050 free_extent_buffer(eb_root); 1051 } 1052 1053 if (!eb) 1054 return NULL; 1055 if (old_root) { 1056 btrfs_set_header_bytenr(eb, eb->start); 1057 btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV); 1058 btrfs_set_header_owner(eb, eb_root_owner); 1059 btrfs_set_header_level(eb, old_root->level); 1060 btrfs_set_header_generation(eb, old_generation); 1061 } 1062 btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb, 1063 btrfs_header_level(eb)); 1064 btrfs_tree_read_lock(eb); 1065 if (tm) 1066 tree_mod_log_rewind(fs_info, eb, time_seq, tm); 1067 else 1068 WARN_ON(btrfs_header_level(eb) != 0); 1069 WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info)); 1070 1071 return eb; 1072 } 1073 1074 int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq) 1075 { 1076 struct tree_mod_elem *tm; 1077 int level; 1078 struct extent_buffer *eb_root = btrfs_root_node(root); 1079 1080 tm = tree_mod_log_oldest_root(eb_root, time_seq); 1081 if (tm && tm->op == BTRFS_MOD_LOG_ROOT_REPLACE) 1082 level = tm->old_root.level; 1083 else 1084 level = btrfs_header_level(eb_root); 1085 1086 free_extent_buffer(eb_root); 1087 1088 return level; 1089 } 1090 1091 /* 1092 * Return the lowest sequence number in the tree modification log. 1093 * 1094 * Return the sequence number of the oldest tree modification log user, which 1095 * corresponds to the lowest sequence number of all existing users. If there are 1096 * no users it returns 0. 1097 */ 1098 u64 btrfs_tree_mod_log_lowest_seq(struct btrfs_fs_info *fs_info) 1099 { 1100 u64 ret = 0; 1101 1102 read_lock(&fs_info->tree_mod_log_lock); 1103 if (!list_empty(&fs_info->tree_mod_seq_list)) { 1104 struct btrfs_seq_list *elem; 1105 1106 elem = list_first_entry(&fs_info->tree_mod_seq_list, 1107 struct btrfs_seq_list, list); 1108 ret = elem->seq; 1109 } 1110 read_unlock(&fs_info->tree_mod_log_lock); 1111 1112 return ret; 1113 } 1114