1 // SPDX-License-Identifier: GPL-2.0 2 3 #include "tree-mod-log.h" 4 #include "disk-io.h" 5 6 struct tree_mod_root { 7 u64 logical; 8 u8 level; 9 }; 10 11 struct tree_mod_elem { 12 struct rb_node node; 13 u64 logical; 14 u64 seq; 15 enum btrfs_mod_log_op op; 16 17 /* 18 * This is used for BTRFS_MOD_LOG_KEY_* and BTRFS_MOD_LOG_MOVE_KEYS 19 * operations. 20 */ 21 int slot; 22 23 /* This is used for BTRFS_MOD_LOG_KEY* and BTRFS_MOD_LOG_ROOT_REPLACE. */ 24 u64 generation; 25 26 /* Those are used for op == BTRFS_MOD_LOG_KEY_{REPLACE,REMOVE}. */ 27 struct btrfs_disk_key key; 28 u64 blockptr; 29 30 /* This is used for op == BTRFS_MOD_LOG_MOVE_KEYS. */ 31 struct { 32 int dst_slot; 33 int nr_items; 34 } move; 35 36 /* This is used for op == BTRFS_MOD_LOG_ROOT_REPLACE. */ 37 struct tree_mod_root old_root; 38 }; 39 40 /* 41 * Pull a new tree mod seq number for our operation. 42 */ 43 static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info) 44 { 45 return atomic64_inc_return(&fs_info->tree_mod_seq); 46 } 47 48 /* 49 * This adds a new blocker to the tree mod log's blocker list if the @elem 50 * passed does not already have a sequence number set. So when a caller expects 51 * to record tree modifications, it should ensure to set elem->seq to zero 52 * before calling btrfs_get_tree_mod_seq. 53 * Returns a fresh, unused tree log modification sequence number, even if no new 54 * blocker was added. 55 */ 56 u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, 57 struct btrfs_seq_list *elem) 58 { 59 write_lock(&fs_info->tree_mod_log_lock); 60 if (!elem->seq) { 61 elem->seq = btrfs_inc_tree_mod_seq(fs_info); 62 list_add_tail(&elem->list, &fs_info->tree_mod_seq_list); 63 set_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags); 64 } 65 write_unlock(&fs_info->tree_mod_log_lock); 66 67 return elem->seq; 68 } 69 70 void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, 71 struct btrfs_seq_list *elem) 72 { 73 struct rb_root *tm_root; 74 struct rb_node *node; 75 struct rb_node *next; 76 struct tree_mod_elem *tm; 77 u64 min_seq = BTRFS_SEQ_LAST; 78 u64 seq_putting = elem->seq; 79 80 if (!seq_putting) 81 return; 82 83 write_lock(&fs_info->tree_mod_log_lock); 84 list_del(&elem->list); 85 elem->seq = 0; 86 87 if (list_empty(&fs_info->tree_mod_seq_list)) { 88 clear_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags); 89 } else { 90 struct btrfs_seq_list *first; 91 92 first = list_first_entry(&fs_info->tree_mod_seq_list, 93 struct btrfs_seq_list, list); 94 if (seq_putting > first->seq) { 95 /* 96 * Blocker with lower sequence number exists, we cannot 97 * remove anything from the log. 98 */ 99 write_unlock(&fs_info->tree_mod_log_lock); 100 return; 101 } 102 min_seq = first->seq; 103 } 104 105 /* 106 * Anything that's lower than the lowest existing (read: blocked) 107 * sequence number can be removed from the tree. 108 */ 109 tm_root = &fs_info->tree_mod_log; 110 for (node = rb_first(tm_root); node; node = next) { 111 next = rb_next(node); 112 tm = rb_entry(node, struct tree_mod_elem, node); 113 if (tm->seq >= min_seq) 114 continue; 115 rb_erase(node, tm_root); 116 kfree(tm); 117 } 118 write_unlock(&fs_info->tree_mod_log_lock); 119 } 120 121 /* 122 * Key order of the log: 123 * node/leaf start address -> sequence 124 * 125 * The 'start address' is the logical address of the *new* root node for root 126 * replace operations, or the logical address of the affected block for all 127 * other operations. 128 */ 129 static noinline int tree_mod_log_insert(struct btrfs_fs_info *fs_info, 130 struct tree_mod_elem *tm) 131 { 132 struct rb_root *tm_root; 133 struct rb_node **new; 134 struct rb_node *parent = NULL; 135 struct tree_mod_elem *cur; 136 137 lockdep_assert_held_write(&fs_info->tree_mod_log_lock); 138 139 tm->seq = btrfs_inc_tree_mod_seq(fs_info); 140 141 tm_root = &fs_info->tree_mod_log; 142 new = &tm_root->rb_node; 143 while (*new) { 144 cur = rb_entry(*new, struct tree_mod_elem, node); 145 parent = *new; 146 if (cur->logical < tm->logical) 147 new = &((*new)->rb_left); 148 else if (cur->logical > tm->logical) 149 new = &((*new)->rb_right); 150 else if (cur->seq < tm->seq) 151 new = &((*new)->rb_left); 152 else if (cur->seq > tm->seq) 153 new = &((*new)->rb_right); 154 else 155 return -EEXIST; 156 } 157 158 rb_link_node(&tm->node, parent, new); 159 rb_insert_color(&tm->node, tm_root); 160 return 0; 161 } 162 163 /* 164 * Determines if logging can be omitted. Returns true if it can. Otherwise, it 165 * returns false with the tree_mod_log_lock acquired. The caller must hold 166 * this until all tree mod log insertions are recorded in the rb tree and then 167 * write unlock fs_info::tree_mod_log_lock. 168 */ 169 static inline bool tree_mod_dont_log(struct btrfs_fs_info *fs_info, 170 struct extent_buffer *eb) 171 { 172 if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)) 173 return true; 174 if (eb && btrfs_header_level(eb) == 0) 175 return true; 176 177 write_lock(&fs_info->tree_mod_log_lock); 178 if (list_empty(&(fs_info)->tree_mod_seq_list)) { 179 write_unlock(&fs_info->tree_mod_log_lock); 180 return true; 181 } 182 183 return false; 184 } 185 186 /* Similar to tree_mod_dont_log, but doesn't acquire any locks. */ 187 static inline bool tree_mod_need_log(const struct btrfs_fs_info *fs_info, 188 struct extent_buffer *eb) 189 { 190 if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)) 191 return false; 192 if (eb && btrfs_header_level(eb) == 0) 193 return false; 194 195 return true; 196 } 197 198 static struct tree_mod_elem *alloc_tree_mod_elem(struct extent_buffer *eb, 199 int slot, 200 enum btrfs_mod_log_op op, 201 gfp_t flags) 202 { 203 struct tree_mod_elem *tm; 204 205 tm = kzalloc(sizeof(*tm), flags); 206 if (!tm) 207 return NULL; 208 209 tm->logical = eb->start; 210 if (op != BTRFS_MOD_LOG_KEY_ADD) { 211 btrfs_node_key(eb, &tm->key, slot); 212 tm->blockptr = btrfs_node_blockptr(eb, slot); 213 } 214 tm->op = op; 215 tm->slot = slot; 216 tm->generation = btrfs_node_ptr_generation(eb, slot); 217 RB_CLEAR_NODE(&tm->node); 218 219 return tm; 220 } 221 222 int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot, 223 enum btrfs_mod_log_op op, gfp_t flags) 224 { 225 struct tree_mod_elem *tm; 226 int ret; 227 228 if (!tree_mod_need_log(eb->fs_info, eb)) 229 return 0; 230 231 tm = alloc_tree_mod_elem(eb, slot, op, flags); 232 if (!tm) 233 return -ENOMEM; 234 235 if (tree_mod_dont_log(eb->fs_info, eb)) { 236 kfree(tm); 237 return 0; 238 } 239 240 ret = tree_mod_log_insert(eb->fs_info, tm); 241 write_unlock(&eb->fs_info->tree_mod_log_lock); 242 if (ret) 243 kfree(tm); 244 245 return ret; 246 } 247 248 int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb, 249 int dst_slot, int src_slot, 250 int nr_items) 251 { 252 struct tree_mod_elem *tm = NULL; 253 struct tree_mod_elem **tm_list = NULL; 254 int ret = 0; 255 int i; 256 bool locked = false; 257 258 if (!tree_mod_need_log(eb->fs_info, eb)) 259 return 0; 260 261 tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS); 262 if (!tm_list) 263 return -ENOMEM; 264 265 tm = kzalloc(sizeof(*tm), GFP_NOFS); 266 if (!tm) { 267 ret = -ENOMEM; 268 goto free_tms; 269 } 270 271 tm->logical = eb->start; 272 tm->slot = src_slot; 273 tm->move.dst_slot = dst_slot; 274 tm->move.nr_items = nr_items; 275 tm->op = BTRFS_MOD_LOG_MOVE_KEYS; 276 277 for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { 278 tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot, 279 BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS); 280 if (!tm_list[i]) { 281 ret = -ENOMEM; 282 goto free_tms; 283 } 284 } 285 286 if (tree_mod_dont_log(eb->fs_info, eb)) 287 goto free_tms; 288 locked = true; 289 290 /* 291 * When we override something during the move, we log these removals. 292 * This can only happen when we move towards the beginning of the 293 * buffer, i.e. dst_slot < src_slot. 294 */ 295 for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { 296 ret = tree_mod_log_insert(eb->fs_info, tm_list[i]); 297 if (ret) 298 goto free_tms; 299 } 300 301 ret = tree_mod_log_insert(eb->fs_info, tm); 302 if (ret) 303 goto free_tms; 304 write_unlock(&eb->fs_info->tree_mod_log_lock); 305 kfree(tm_list); 306 307 return 0; 308 309 free_tms: 310 for (i = 0; i < nr_items; i++) { 311 if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node)) 312 rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log); 313 kfree(tm_list[i]); 314 } 315 if (locked) 316 write_unlock(&eb->fs_info->tree_mod_log_lock); 317 kfree(tm_list); 318 kfree(tm); 319 320 return ret; 321 } 322 323 static inline int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, 324 struct tree_mod_elem **tm_list, 325 int nritems) 326 { 327 int i, j; 328 int ret; 329 330 for (i = nritems - 1; i >= 0; i--) { 331 ret = tree_mod_log_insert(fs_info, tm_list[i]); 332 if (ret) { 333 for (j = nritems - 1; j > i; j--) 334 rb_erase(&tm_list[j]->node, 335 &fs_info->tree_mod_log); 336 return ret; 337 } 338 } 339 340 return 0; 341 } 342 343 int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root, 344 struct extent_buffer *new_root, 345 bool log_removal) 346 { 347 struct btrfs_fs_info *fs_info = old_root->fs_info; 348 struct tree_mod_elem *tm = NULL; 349 struct tree_mod_elem **tm_list = NULL; 350 int nritems = 0; 351 int ret = 0; 352 int i; 353 354 if (!tree_mod_need_log(fs_info, NULL)) 355 return 0; 356 357 if (log_removal && btrfs_header_level(old_root) > 0) { 358 nritems = btrfs_header_nritems(old_root); 359 tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), 360 GFP_NOFS); 361 if (!tm_list) { 362 ret = -ENOMEM; 363 goto free_tms; 364 } 365 for (i = 0; i < nritems; i++) { 366 tm_list[i] = alloc_tree_mod_elem(old_root, i, 367 BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS); 368 if (!tm_list[i]) { 369 ret = -ENOMEM; 370 goto free_tms; 371 } 372 } 373 } 374 375 tm = kzalloc(sizeof(*tm), GFP_NOFS); 376 if (!tm) { 377 ret = -ENOMEM; 378 goto free_tms; 379 } 380 381 tm->logical = new_root->start; 382 tm->old_root.logical = old_root->start; 383 tm->old_root.level = btrfs_header_level(old_root); 384 tm->generation = btrfs_header_generation(old_root); 385 tm->op = BTRFS_MOD_LOG_ROOT_REPLACE; 386 387 if (tree_mod_dont_log(fs_info, NULL)) 388 goto free_tms; 389 390 if (tm_list) 391 ret = tree_mod_log_free_eb(fs_info, tm_list, nritems); 392 if (!ret) 393 ret = tree_mod_log_insert(fs_info, tm); 394 395 write_unlock(&fs_info->tree_mod_log_lock); 396 if (ret) 397 goto free_tms; 398 kfree(tm_list); 399 400 return ret; 401 402 free_tms: 403 if (tm_list) { 404 for (i = 0; i < nritems; i++) 405 kfree(tm_list[i]); 406 kfree(tm_list); 407 } 408 kfree(tm); 409 410 return ret; 411 } 412 413 static struct tree_mod_elem *__tree_mod_log_search(struct btrfs_fs_info *fs_info, 414 u64 start, u64 min_seq, 415 bool smallest) 416 { 417 struct rb_root *tm_root; 418 struct rb_node *node; 419 struct tree_mod_elem *cur = NULL; 420 struct tree_mod_elem *found = NULL; 421 422 read_lock(&fs_info->tree_mod_log_lock); 423 tm_root = &fs_info->tree_mod_log; 424 node = tm_root->rb_node; 425 while (node) { 426 cur = rb_entry(node, struct tree_mod_elem, node); 427 if (cur->logical < start) { 428 node = node->rb_left; 429 } else if (cur->logical > start) { 430 node = node->rb_right; 431 } else if (cur->seq < min_seq) { 432 node = node->rb_left; 433 } else if (!smallest) { 434 /* We want the node with the highest seq */ 435 if (found) 436 BUG_ON(found->seq > cur->seq); 437 found = cur; 438 node = node->rb_left; 439 } else if (cur->seq > min_seq) { 440 /* We want the node with the smallest seq */ 441 if (found) 442 BUG_ON(found->seq < cur->seq); 443 found = cur; 444 node = node->rb_right; 445 } else { 446 found = cur; 447 break; 448 } 449 } 450 read_unlock(&fs_info->tree_mod_log_lock); 451 452 return found; 453 } 454 455 /* 456 * This returns the element from the log with the smallest time sequence 457 * value that's in the log (the oldest log item). Any element with a time 458 * sequence lower than min_seq will be ignored. 459 */ 460 static struct tree_mod_elem *tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, 461 u64 start, u64 min_seq) 462 { 463 return __tree_mod_log_search(fs_info, start, min_seq, true); 464 } 465 466 /* 467 * This returns the element from the log with the largest time sequence 468 * value that's in the log (the most recent log item). Any element with 469 * a time sequence lower than min_seq will be ignored. 470 */ 471 static struct tree_mod_elem *tree_mod_log_search(struct btrfs_fs_info *fs_info, 472 u64 start, u64 min_seq) 473 { 474 return __tree_mod_log_search(fs_info, start, min_seq, false); 475 } 476 477 int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst, 478 struct extent_buffer *src, 479 unsigned long dst_offset, 480 unsigned long src_offset, 481 int nr_items) 482 { 483 struct btrfs_fs_info *fs_info = dst->fs_info; 484 int ret = 0; 485 struct tree_mod_elem **tm_list = NULL; 486 struct tree_mod_elem **tm_list_add, **tm_list_rem; 487 int i; 488 bool locked = false; 489 490 if (!tree_mod_need_log(fs_info, NULL)) 491 return 0; 492 493 if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0) 494 return 0; 495 496 tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *), 497 GFP_NOFS); 498 if (!tm_list) 499 return -ENOMEM; 500 501 tm_list_add = tm_list; 502 tm_list_rem = tm_list + nr_items; 503 for (i = 0; i < nr_items; i++) { 504 tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset, 505 BTRFS_MOD_LOG_KEY_REMOVE, GFP_NOFS); 506 if (!tm_list_rem[i]) { 507 ret = -ENOMEM; 508 goto free_tms; 509 } 510 511 tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset, 512 BTRFS_MOD_LOG_KEY_ADD, GFP_NOFS); 513 if (!tm_list_add[i]) { 514 ret = -ENOMEM; 515 goto free_tms; 516 } 517 } 518 519 if (tree_mod_dont_log(fs_info, NULL)) 520 goto free_tms; 521 locked = true; 522 523 for (i = 0; i < nr_items; i++) { 524 ret = tree_mod_log_insert(fs_info, tm_list_rem[i]); 525 if (ret) 526 goto free_tms; 527 ret = tree_mod_log_insert(fs_info, tm_list_add[i]); 528 if (ret) 529 goto free_tms; 530 } 531 532 write_unlock(&fs_info->tree_mod_log_lock); 533 kfree(tm_list); 534 535 return 0; 536 537 free_tms: 538 for (i = 0; i < nr_items * 2; i++) { 539 if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node)) 540 rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log); 541 kfree(tm_list[i]); 542 } 543 if (locked) 544 write_unlock(&fs_info->tree_mod_log_lock); 545 kfree(tm_list); 546 547 return ret; 548 } 549 550 int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb) 551 { 552 struct tree_mod_elem **tm_list = NULL; 553 int nritems = 0; 554 int i; 555 int ret = 0; 556 557 if (!tree_mod_need_log(eb->fs_info, eb)) 558 return 0; 559 560 nritems = btrfs_header_nritems(eb); 561 tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS); 562 if (!tm_list) 563 return -ENOMEM; 564 565 for (i = 0; i < nritems; i++) { 566 tm_list[i] = alloc_tree_mod_elem(eb, i, 567 BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS); 568 if (!tm_list[i]) { 569 ret = -ENOMEM; 570 goto free_tms; 571 } 572 } 573 574 if (tree_mod_dont_log(eb->fs_info, eb)) 575 goto free_tms; 576 577 ret = tree_mod_log_free_eb(eb->fs_info, tm_list, nritems); 578 write_unlock(&eb->fs_info->tree_mod_log_lock); 579 if (ret) 580 goto free_tms; 581 kfree(tm_list); 582 583 return 0; 584 585 free_tms: 586 for (i = 0; i < nritems; i++) 587 kfree(tm_list[i]); 588 kfree(tm_list); 589 590 return ret; 591 } 592 593 /* 594 * Returns the logical address of the oldest predecessor of the given root. 595 * Entries older than time_seq are ignored. 596 */ 597 static struct tree_mod_elem *tree_mod_log_oldest_root(struct extent_buffer *eb_root, 598 u64 time_seq) 599 { 600 struct tree_mod_elem *tm; 601 struct tree_mod_elem *found = NULL; 602 u64 root_logical = eb_root->start; 603 bool looped = false; 604 605 if (!time_seq) 606 return NULL; 607 608 /* 609 * The very last operation that's logged for a root is the replacement 610 * operation (if it is replaced at all). This has the logical address 611 * of the *new* root, making it the very first operation that's logged 612 * for this root. 613 */ 614 while (1) { 615 tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical, 616 time_seq); 617 if (!looped && !tm) 618 return NULL; 619 /* 620 * If there are no tree operation for the oldest root, we simply 621 * return it. This should only happen if that (old) root is at 622 * level 0. 623 */ 624 if (!tm) 625 break; 626 627 /* 628 * If there's an operation that's not a root replacement, we 629 * found the oldest version of our root. Normally, we'll find a 630 * BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here. 631 */ 632 if (tm->op != BTRFS_MOD_LOG_ROOT_REPLACE) 633 break; 634 635 found = tm; 636 root_logical = tm->old_root.logical; 637 looped = true; 638 } 639 640 /* If there's no old root to return, return what we found instead */ 641 if (!found) 642 found = tm; 643 644 return found; 645 } 646 647 648 /* 649 * tm is a pointer to the first operation to rewind within eb. Then, all 650 * previous operations will be rewound (until we reach something older than 651 * time_seq). 652 */ 653 static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info, 654 struct extent_buffer *eb, 655 u64 time_seq, 656 struct tree_mod_elem *first_tm) 657 { 658 u32 n; 659 struct rb_node *next; 660 struct tree_mod_elem *tm = first_tm; 661 unsigned long o_dst; 662 unsigned long o_src; 663 unsigned long p_size = sizeof(struct btrfs_key_ptr); 664 665 n = btrfs_header_nritems(eb); 666 read_lock(&fs_info->tree_mod_log_lock); 667 while (tm && tm->seq >= time_seq) { 668 /* 669 * All the operations are recorded with the operator used for 670 * the modification. As we're going backwards, we do the 671 * opposite of each operation here. 672 */ 673 switch (tm->op) { 674 case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING: 675 BUG_ON(tm->slot < n); 676 fallthrough; 677 case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING: 678 case BTRFS_MOD_LOG_KEY_REMOVE: 679 btrfs_set_node_key(eb, &tm->key, tm->slot); 680 btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr); 681 btrfs_set_node_ptr_generation(eb, tm->slot, 682 tm->generation); 683 n++; 684 break; 685 case BTRFS_MOD_LOG_KEY_REPLACE: 686 BUG_ON(tm->slot >= n); 687 btrfs_set_node_key(eb, &tm->key, tm->slot); 688 btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr); 689 btrfs_set_node_ptr_generation(eb, tm->slot, 690 tm->generation); 691 break; 692 case BTRFS_MOD_LOG_KEY_ADD: 693 /* if a move operation is needed it's in the log */ 694 n--; 695 break; 696 case BTRFS_MOD_LOG_MOVE_KEYS: 697 o_dst = btrfs_node_key_ptr_offset(tm->slot); 698 o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot); 699 memmove_extent_buffer(eb, o_dst, o_src, 700 tm->move.nr_items * p_size); 701 break; 702 case BTRFS_MOD_LOG_ROOT_REPLACE: 703 /* 704 * This operation is special. For roots, this must be 705 * handled explicitly before rewinding. 706 * For non-roots, this operation may exist if the node 707 * was a root: root A -> child B; then A gets empty and 708 * B is promoted to the new root. In the mod log, we'll 709 * have a root-replace operation for B, a tree block 710 * that is no root. We simply ignore that operation. 711 */ 712 break; 713 } 714 next = rb_next(&tm->node); 715 if (!next) 716 break; 717 tm = rb_entry(next, struct tree_mod_elem, node); 718 if (tm->logical != first_tm->logical) 719 break; 720 } 721 read_unlock(&fs_info->tree_mod_log_lock); 722 btrfs_set_header_nritems(eb, n); 723 } 724 725 /* 726 * Called with eb read locked. If the buffer cannot be rewound, the same buffer 727 * is returned. If rewind operations happen, a fresh buffer is returned. The 728 * returned buffer is always read-locked. If the returned buffer is not the 729 * input buffer, the lock on the input buffer is released and the input buffer 730 * is freed (its refcount is decremented). 731 */ 732 struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info, 733 struct btrfs_path *path, 734 struct extent_buffer *eb, 735 u64 time_seq) 736 { 737 struct extent_buffer *eb_rewin; 738 struct tree_mod_elem *tm; 739 740 if (!time_seq) 741 return eb; 742 743 if (btrfs_header_level(eb) == 0) 744 return eb; 745 746 tm = tree_mod_log_search(fs_info, eb->start, time_seq); 747 if (!tm) 748 return eb; 749 750 if (tm->op == BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) { 751 BUG_ON(tm->slot != 0); 752 eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start); 753 if (!eb_rewin) { 754 btrfs_tree_read_unlock(eb); 755 free_extent_buffer(eb); 756 return NULL; 757 } 758 btrfs_set_header_bytenr(eb_rewin, eb->start); 759 btrfs_set_header_backref_rev(eb_rewin, 760 btrfs_header_backref_rev(eb)); 761 btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb)); 762 btrfs_set_header_level(eb_rewin, btrfs_header_level(eb)); 763 } else { 764 eb_rewin = btrfs_clone_extent_buffer(eb); 765 if (!eb_rewin) { 766 btrfs_tree_read_unlock(eb); 767 free_extent_buffer(eb); 768 return NULL; 769 } 770 } 771 772 btrfs_tree_read_unlock(eb); 773 free_extent_buffer(eb); 774 775 btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb_rewin), 776 eb_rewin, btrfs_header_level(eb_rewin)); 777 btrfs_tree_read_lock(eb_rewin); 778 tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm); 779 WARN_ON(btrfs_header_nritems(eb_rewin) > 780 BTRFS_NODEPTRS_PER_BLOCK(fs_info)); 781 782 return eb_rewin; 783 } 784 785 /* 786 * Rewind the state of @root's root node to the given @time_seq value. 787 * If there are no changes, the current root->root_node is returned. If anything 788 * changed in between, there's a fresh buffer allocated on which the rewind 789 * operations are done. In any case, the returned buffer is read locked. 790 * Returns NULL on error (with no locks held). 791 */ 792 struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq) 793 { 794 struct btrfs_fs_info *fs_info = root->fs_info; 795 struct tree_mod_elem *tm; 796 struct extent_buffer *eb = NULL; 797 struct extent_buffer *eb_root; 798 u64 eb_root_owner = 0; 799 struct extent_buffer *old; 800 struct tree_mod_root *old_root = NULL; 801 u64 old_generation = 0; 802 u64 logical; 803 int level; 804 805 eb_root = btrfs_read_lock_root_node(root); 806 tm = tree_mod_log_oldest_root(eb_root, time_seq); 807 if (!tm) 808 return eb_root; 809 810 if (tm->op == BTRFS_MOD_LOG_ROOT_REPLACE) { 811 old_root = &tm->old_root; 812 old_generation = tm->generation; 813 logical = old_root->logical; 814 level = old_root->level; 815 } else { 816 logical = eb_root->start; 817 level = btrfs_header_level(eb_root); 818 } 819 820 tm = tree_mod_log_search(fs_info, logical, time_seq); 821 if (old_root && tm && tm->op != BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) { 822 btrfs_tree_read_unlock(eb_root); 823 free_extent_buffer(eb_root); 824 old = read_tree_block(fs_info, logical, root->root_key.objectid, 825 0, level, NULL); 826 if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) { 827 if (!IS_ERR(old)) 828 free_extent_buffer(old); 829 btrfs_warn(fs_info, 830 "failed to read tree block %llu from get_old_root", 831 logical); 832 } else { 833 struct tree_mod_elem *tm2; 834 835 btrfs_tree_read_lock(old); 836 eb = btrfs_clone_extent_buffer(old); 837 /* 838 * After the lookup for the most recent tree mod operation 839 * above and before we locked and cloned the extent buffer 840 * 'old', a new tree mod log operation may have been added. 841 * So lookup for a more recent one to make sure the number 842 * of mod log operations we replay is consistent with the 843 * number of items we have in the cloned extent buffer, 844 * otherwise we can hit a BUG_ON when rewinding the extent 845 * buffer. 846 */ 847 tm2 = tree_mod_log_search(fs_info, logical, time_seq); 848 btrfs_tree_read_unlock(old); 849 free_extent_buffer(old); 850 ASSERT(tm2); 851 ASSERT(tm2 == tm || tm2->seq > tm->seq); 852 if (!tm2 || tm2->seq < tm->seq) { 853 free_extent_buffer(eb); 854 return NULL; 855 } 856 tm = tm2; 857 } 858 } else if (old_root) { 859 eb_root_owner = btrfs_header_owner(eb_root); 860 btrfs_tree_read_unlock(eb_root); 861 free_extent_buffer(eb_root); 862 eb = alloc_dummy_extent_buffer(fs_info, logical); 863 } else { 864 eb = btrfs_clone_extent_buffer(eb_root); 865 btrfs_tree_read_unlock(eb_root); 866 free_extent_buffer(eb_root); 867 } 868 869 if (!eb) 870 return NULL; 871 if (old_root) { 872 btrfs_set_header_bytenr(eb, eb->start); 873 btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV); 874 btrfs_set_header_owner(eb, eb_root_owner); 875 btrfs_set_header_level(eb, old_root->level); 876 btrfs_set_header_generation(eb, old_generation); 877 } 878 btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb, 879 btrfs_header_level(eb)); 880 btrfs_tree_read_lock(eb); 881 if (tm) 882 tree_mod_log_rewind(fs_info, eb, time_seq, tm); 883 else 884 WARN_ON(btrfs_header_level(eb) != 0); 885 WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info)); 886 887 return eb; 888 } 889 890 int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq) 891 { 892 struct tree_mod_elem *tm; 893 int level; 894 struct extent_buffer *eb_root = btrfs_root_node(root); 895 896 tm = tree_mod_log_oldest_root(eb_root, time_seq); 897 if (tm && tm->op == BTRFS_MOD_LOG_ROOT_REPLACE) 898 level = tm->old_root.level; 899 else 900 level = btrfs_header_level(eb_root); 901 902 free_extent_buffer(eb_root); 903 904 return level; 905 } 906 907 /* 908 * Return the lowest sequence number in the tree modification log. 909 * 910 * Return the sequence number of the oldest tree modification log user, which 911 * corresponds to the lowest sequence number of all existing users. If there are 912 * no users it returns 0. 913 */ 914 u64 btrfs_tree_mod_log_lowest_seq(struct btrfs_fs_info *fs_info) 915 { 916 u64 ret = 0; 917 918 read_lock(&fs_info->tree_mod_log_lock); 919 if (!list_empty(&fs_info->tree_mod_seq_list)) { 920 struct btrfs_seq_list *elem; 921 922 elem = list_first_entry(&fs_info->tree_mod_seq_list, 923 struct btrfs_seq_list, list); 924 ret = elem->seq; 925 } 926 read_unlock(&fs_info->tree_mod_log_lock); 927 928 return ret; 929 } 930