// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 */

#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/sort.h>
#include <linux/rcupdate.h>
#include <linux/kthread.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/percpu_counter.h>
#include <linux/lockdep.h>
#include <linux/crc32c.h>
#include "ctree.h"
#include "extent-tree.h"
#include "transaction.h"
#include "disk-io.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "locking.h"
#include "free-space-cache.h"
#include "free-space-tree.h"
#include "qgroup.h"
#include "ref-verify.h"
#include "space-info.h"
#include "block-rsv.h"
#include "discard.h"
#include "zoned.h"
#include "dev-replace.h"
#include "fs.h"
#include "accessors.h"
#include "root-tree.h"
#include "file-item.h"
#include "orphan.h"
#include "tree-checker.h"
#include "raid-stripe-tree.h"

#undef SCRAMBLE_DELAYED_REFS


static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
			       struct btrfs_delayed_ref_head *href,
			       struct btrfs_delayed_ref_node *node,
			       struct btrfs_delayed_extent_op *extra_op);
static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
				    struct extent_buffer *leaf,
				    struct btrfs_extent_item *ei);
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
				      u64 parent, u64 root_objectid,
				      u64 flags, u64 owner, u64 offset,
				      struct btrfs_key *ins, int ref_mod, u64 oref_root);
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
				     struct btrfs_delayed_ref_node *node,
				     struct btrfs_delayed_extent_op *extent_op);
static int find_next_key(struct btrfs_path *path, int level,
			 struct btrfs_key *key);

static int block_group_bits(struct btrfs_block_group *cache, u64 bits)
{
	return (cache->flags & bits) == bits;
}

/* simple helper to search for an existing data extent at a given offset */
int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
{
	struct btrfs_root *root = btrfs_extent_root(fs_info, start);
	int ret;
	struct btrfs_key key;
	struct btrfs_path *path;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = start;
	key.offset = len;
	key.type = BTRFS_EXTENT_ITEM_KEY;
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	btrfs_free_path(path);
	return ret;
}

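/*
 * Illustrative note (not part of the original source): a typical caller
 * treats the return value of btrfs_lookup_data_extent() like this:
 *
 *	ret = btrfs_lookup_data_extent(fs_info, bytenr, num_bytes);
 *	if (ret < 0)
 *		return ret;		(search error)
 *	found = (ret == 0);		(ret > 0 means no such extent item)
 *
 * The length must match the EXTENT_ITEM key offset exactly, since the whole
 * key (bytenr, BTRFS_EXTENT_ITEM_KEY, len) is compared.
 */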
/*
 * Helper function to look up the reference count and flags of a tree block.
 *
 * The head node for a delayed ref is used to store the sum of all the
 * reference count modifications queued up in the rbtree. The head node may
 * also store the extent flags to set. This way you can check to see what the
 * reference count and extent flags would be if all of the delayed refs are
 * not processed.
 */
int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
			     struct btrfs_fs_info *fs_info, u64 bytenr,
			     u64 offset, int metadata, u64 *refs, u64 *flags,
			     u64 *owning_root)
{
	struct btrfs_root *extent_root;
	struct btrfs_delayed_ref_head *head;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_path *path;
	struct btrfs_extent_item *ei;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	u32 item_size;
	u64 num_refs;
	u64 extent_flags;
	u64 owner = 0;
	int ret;

	/*
	 * If we don't have skinny metadata, don't bother doing anything
	 * different.
	 */
	if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) {
		offset = fs_info->nodesize;
		metadata = 0;
	}

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	if (!trans) {
		path->skip_locking = 1;
		path->search_commit_root = 1;
	}

search_again:
	key.objectid = bytenr;
	key.offset = offset;
	if (metadata)
		key.type = BTRFS_METADATA_ITEM_KEY;
	else
		key.type = BTRFS_EXTENT_ITEM_KEY;

	extent_root = btrfs_extent_root(fs_info, bytenr);
	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
	if (ret < 0)
		goto out_free;

	if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
		if (path->slots[0]) {
			path->slots[0]--;
			btrfs_item_key_to_cpu(path->nodes[0], &key,
					      path->slots[0]);
			if (key.objectid == bytenr &&
			    key.type == BTRFS_EXTENT_ITEM_KEY &&
			    key.offset == fs_info->nodesize)
				ret = 0;
		}
	}

	if (ret == 0) {
		leaf = path->nodes[0];
		item_size = btrfs_item_size(leaf, path->slots[0]);
		if (item_size >= sizeof(*ei)) {
			ei = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_extent_item);
			num_refs = btrfs_extent_refs(leaf, ei);
			extent_flags = btrfs_extent_flags(leaf, ei);
			owner = btrfs_get_extent_owner_root(fs_info, leaf,
							    path->slots[0]);
		} else {
			ret = -EUCLEAN;
			btrfs_err(fs_info,
			"unexpected extent item size, has %u expect >= %zu",
				  item_size, sizeof(*ei));
			if (trans)
				btrfs_abort_transaction(trans, ret);
			else
				btrfs_handle_fs_error(fs_info, ret, NULL);

			goto out_free;
		}

		BUG_ON(num_refs == 0);
	} else {
		num_refs = 0;
		extent_flags = 0;
		ret = 0;
	}

	if (!trans)
		goto out;

	delayed_refs = &trans->transaction->delayed_refs;
	spin_lock(&delayed_refs->lock);
	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
	if (head) {
		if (!mutex_trylock(&head->mutex)) {
			refcount_inc(&head->refs);
			spin_unlock(&delayed_refs->lock);

			btrfs_release_path(path);

			/*
			 * Mutex was contended, block until it's released and
			 * try again.
			 */
			mutex_lock(&head->mutex);
			mutex_unlock(&head->mutex);
			btrfs_put_delayed_ref_head(head);
			goto search_again;
		}
		spin_lock(&head->lock);
		if (head->extent_op && head->extent_op->update_flags)
			extent_flags |= head->extent_op->flags_to_set;
		else
			BUG_ON(num_refs == 0);

		num_refs += head->ref_mod;
		spin_unlock(&head->lock);
		mutex_unlock(&head->mutex);
	}
	spin_unlock(&delayed_refs->lock);
out:
	WARN_ON(num_refs == 0);
	if (refs)
		*refs = num_refs;
	if (flags)
		*flags = extent_flags;
	if (owning_root)
		*owning_root = owner;
out_free:
	btrfs_free_path(path);
	return ret;
}

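/*
 * Worked example for btrfs_lookup_extent_info() above (illustrative numbers):
 * if the extent item on disk records 3 references and the delayed ref head
 * for this bytenr has ref_mod == -1 (one queued drop), the reported refs
 * value is 2, i.e. what the extent tree will contain once the delayed refs
 * have been run.
 */
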
/*
 * Back reference rules.  Back refs have three main goals:
 *
 * 1) differentiate between all holders of references to an extent so that
 *    when a reference is dropped we can make sure it was a valid reference
 *    before freeing the extent.
 *
 * 2) Provide enough information to quickly find the holders of an extent
 *    if we notice a given block is corrupted or bad.
 *
 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
 *    maintenance.  This is actually the same as #2, but with a slightly
 *    different use case.
 *
 * There are two kinds of back refs. The implicit back refs are optimized
 * for pointers in non-shared tree blocks. For a given pointer in a block,
 * back refs of this kind provide information about the block's owner tree
 * and the pointer's key. This information allows us to find the block by
 * b-tree searching. The full back refs are for pointers in tree blocks not
 * referenced by their owner trees. The location of the tree block is
 * recorded in the back refs. Actually the full back refs are generic and
 * can be used in all cases where the implicit back refs are used. The major
 * shortcoming of the full back refs is their overhead. Every time a tree
 * block gets COWed, we have to update the back ref entries for all pointers
 * in it.
 *
 * For a newly allocated tree block, we use implicit back refs for
 * pointers in it. This means most tree related operations only involve
 * implicit back refs. For a tree block created in an old transaction, the
 * only way to drop a reference to it is to COW it. So we can detect the
 * event that a tree block loses its owner tree's reference and do the
 * back refs conversion.
 *
 * When a tree block is COWed through a tree, there are four cases:
 *
 * The reference count of the block is one and the tree is the block's
 * owner tree. Nothing to do in this case.
 *
 * The reference count of the block is one and the tree is not the
 * block's owner tree. In this case, full back refs are used for pointers
 * in the block. Remove these full back refs, add implicit back refs for
 * every pointer in the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * the block's owner tree. In this case, implicit back refs are used for
 * pointers in the block. Add full back refs for every pointer in the
 * block, increase lower level extents' reference counts. The original
 * implicit back refs are carried over to the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * not the block's owner tree. Add implicit back refs for every pointer in
 * the new block, increase lower level extents' reference count.
 *
 * Back reference key composition:
 *
 * The key objectid corresponds to the first byte in the extent,
 * the key type is used to differentiate between types of back refs.
 * There are different meanings of the key offset for different types
 * of back refs.
 *
 * File extents can be referenced by:
 *
 * - multiple snapshots, subvolumes, or different generations in one subvol
 * - different files inside a single subvolume
 * - different offsets inside a file (bookend extents in file.c)
 *
 * The extent ref structure for the implicit back refs has fields for:
 *
 * - Objectid of the subvolume root
 * - objectid of the file holding the reference
 * - original offset in the file
 * - how many bookend extents
 *
 * The key offset for the implicit back refs is the hash of the first
 * three fields.
 *
 * The extent ref structure for the full back refs has a field for:
 *
 * - number of pointers in the tree leaf
 *
 * The key offset for the full back refs is the first byte of
 * the tree leaf.
 *
 * When a file extent is allocated, the implicit back refs are used.
 * The fields are filled in:
 *
 *     (root_key.objectid, inode objectid, offset in file, 1)
 *
 * When a file extent is removed during file truncation, we find the
 * corresponding implicit back refs and check the following fields:
 *
 *     (btrfs_header_owner(leaf), inode objectid, offset in file)
 *
 * Btree extents can be referenced by:
 *
 * - Different subvolumes
 *
 * Both the implicit back refs and the full back refs for tree blocks
 * only consist of a key. The key offset for the implicit back refs is the
 * objectid of the block's owner tree. The key offset for the full back refs
 * is the first byte of the parent block.
 *
 * When implicit back refs are used, information about the lowest key and
 * level of the tree block is required. This information is stored in the
 * tree block info structure.
 */
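
/*
 * Illustrative key layouts for the backref kinds described above (the
 * values are made up, only the shape matters):
 *
 *   implicit (indirect) data backref:
 *     (extent bytenr, BTRFS_EXTENT_DATA_REF_KEY,
 *      hash(root objectid, inode objectid, file offset))
 *
 *   full (shared) data backref:
 *     (extent bytenr, BTRFS_SHARED_DATA_REF_KEY, parent leaf bytenr)
 *
 *   tree block backrefs:
 *     (block bytenr, BTRFS_TREE_BLOCK_REF_KEY, owner root objectid) or
 *     (block bytenr, BTRFS_SHARED_BLOCK_REF_KEY, parent node bytenr)
 */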

/*
 * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required,
 * is_data == BTRFS_REF_TYPE_DATA, data type is required,
 * is_data == BTRFS_REF_TYPE_ANY, either type is OK.
 */
int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
				     struct btrfs_extent_inline_ref *iref,
				     enum btrfs_inline_ref_type is_data)
{
	struct btrfs_fs_info *fs_info = eb->fs_info;
	int type = btrfs_extent_inline_ref_type(eb, iref);
	u64 offset = btrfs_extent_inline_ref_offset(eb, iref);

	if (type == BTRFS_EXTENT_OWNER_REF_KEY) {
		ASSERT(btrfs_fs_incompat(fs_info, SIMPLE_QUOTA));
		return type;
	}

	if (type == BTRFS_TREE_BLOCK_REF_KEY ||
	    type == BTRFS_SHARED_BLOCK_REF_KEY ||
	    type == BTRFS_SHARED_DATA_REF_KEY ||
	    type == BTRFS_EXTENT_DATA_REF_KEY) {
		if (is_data == BTRFS_REF_TYPE_BLOCK) {
			if (type == BTRFS_TREE_BLOCK_REF_KEY)
				return type;
			if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
				ASSERT(fs_info);
				/*
				 * Every shared one has parent tree block,
				 * which must be aligned to sector size.
				 */
				if (offset && IS_ALIGNED(offset, fs_info->sectorsize))
					return type;
			}
		} else if (is_data == BTRFS_REF_TYPE_DATA) {
			if (type == BTRFS_EXTENT_DATA_REF_KEY)
				return type;
			if (type == BTRFS_SHARED_DATA_REF_KEY) {
				ASSERT(fs_info);
				/*
				 * Every shared one has parent tree block,
				 * which must be aligned to sector size.
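				 * An offset of zero or an unaligned offset
				 * therefore indicates a corrupted ref, and we
				 * fall through to return BTRFS_REF_TYPE_INVALID.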
383 */ 384 if (offset && 385 IS_ALIGNED(offset, fs_info->sectorsize)) 386 return type; 387 } 388 } else { 389 ASSERT(is_data == BTRFS_REF_TYPE_ANY); 390 return type; 391 } 392 } 393 394 WARN_ON(1); 395 btrfs_print_leaf(eb); 396 btrfs_err(fs_info, 397 "eb %llu iref 0x%lx invalid extent inline ref type %d", 398 eb->start, (unsigned long)iref, type); 399 400 return BTRFS_REF_TYPE_INVALID; 401 } 402 403 u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset) 404 { 405 u32 high_crc = ~(u32)0; 406 u32 low_crc = ~(u32)0; 407 __le64 lenum; 408 409 lenum = cpu_to_le64(root_objectid); 410 high_crc = crc32c(high_crc, &lenum, sizeof(lenum)); 411 lenum = cpu_to_le64(owner); 412 low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); 413 lenum = cpu_to_le64(offset); 414 low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); 415 416 return ((u64)high_crc << 31) ^ (u64)low_crc; 417 } 418 419 static u64 hash_extent_data_ref_item(struct extent_buffer *leaf, 420 struct btrfs_extent_data_ref *ref) 421 { 422 return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref), 423 btrfs_extent_data_ref_objectid(leaf, ref), 424 btrfs_extent_data_ref_offset(leaf, ref)); 425 } 426 427 static int match_extent_data_ref(struct extent_buffer *leaf, 428 struct btrfs_extent_data_ref *ref, 429 u64 root_objectid, u64 owner, u64 offset) 430 { 431 if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid || 432 btrfs_extent_data_ref_objectid(leaf, ref) != owner || 433 btrfs_extent_data_ref_offset(leaf, ref) != offset) 434 return 0; 435 return 1; 436 } 437 438 static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans, 439 struct btrfs_path *path, 440 u64 bytenr, u64 parent, 441 u64 root_objectid, 442 u64 owner, u64 offset) 443 { 444 struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr); 445 struct btrfs_key key; 446 struct btrfs_extent_data_ref *ref; 447 struct extent_buffer *leaf; 448 u32 nritems; 449 int recow; 450 int ret; 451 452 key.objectid = bytenr; 453 if (parent) { 454 key.type = BTRFS_SHARED_DATA_REF_KEY; 455 key.offset = parent; 456 } else { 457 key.type = BTRFS_EXTENT_DATA_REF_KEY; 458 key.offset = hash_extent_data_ref(root_objectid, 459 owner, offset); 460 } 461 again: 462 recow = 0; 463 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 464 if (ret < 0) 465 return ret; 466 467 if (parent) { 468 if (ret) 469 return -ENOENT; 470 return 0; 471 } 472 473 ret = -ENOENT; 474 leaf = path->nodes[0]; 475 nritems = btrfs_header_nritems(leaf); 476 while (1) { 477 if (path->slots[0] >= nritems) { 478 ret = btrfs_next_leaf(root, path); 479 if (ret) { 480 if (ret > 0) 481 return -ENOENT; 482 return ret; 483 } 484 485 leaf = path->nodes[0]; 486 nritems = btrfs_header_nritems(leaf); 487 recow = 1; 488 } 489 490 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 491 if (key.objectid != bytenr || 492 key.type != BTRFS_EXTENT_DATA_REF_KEY) 493 goto fail; 494 495 ref = btrfs_item_ptr(leaf, path->slots[0], 496 struct btrfs_extent_data_ref); 497 498 if (match_extent_data_ref(leaf, ref, root_objectid, 499 owner, offset)) { 500 if (recow) { 501 btrfs_release_path(path); 502 goto again; 503 } 504 ret = 0; 505 break; 506 } 507 path->slots[0]++; 508 } 509 fail: 510 return ret; 511 } 512 513 static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, 514 struct btrfs_path *path, 515 struct btrfs_delayed_ref_node *node, 516 u64 bytenr) 517 { 518 struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr); 519 struct btrfs_key key; 520 struct extent_buffer *leaf; 521 u64 owner = 
btrfs_delayed_ref_owner(node); 522 u64 offset = btrfs_delayed_ref_offset(node); 523 u32 size; 524 u32 num_refs; 525 int ret; 526 527 key.objectid = bytenr; 528 if (node->parent) { 529 key.type = BTRFS_SHARED_DATA_REF_KEY; 530 key.offset = node->parent; 531 size = sizeof(struct btrfs_shared_data_ref); 532 } else { 533 key.type = BTRFS_EXTENT_DATA_REF_KEY; 534 key.offset = hash_extent_data_ref(node->ref_root, owner, offset); 535 size = sizeof(struct btrfs_extent_data_ref); 536 } 537 538 ret = btrfs_insert_empty_item(trans, root, path, &key, size); 539 if (ret && ret != -EEXIST) 540 goto fail; 541 542 leaf = path->nodes[0]; 543 if (node->parent) { 544 struct btrfs_shared_data_ref *ref; 545 ref = btrfs_item_ptr(leaf, path->slots[0], 546 struct btrfs_shared_data_ref); 547 if (ret == 0) { 548 btrfs_set_shared_data_ref_count(leaf, ref, node->ref_mod); 549 } else { 550 num_refs = btrfs_shared_data_ref_count(leaf, ref); 551 num_refs += node->ref_mod; 552 btrfs_set_shared_data_ref_count(leaf, ref, num_refs); 553 } 554 } else { 555 struct btrfs_extent_data_ref *ref; 556 while (ret == -EEXIST) { 557 ref = btrfs_item_ptr(leaf, path->slots[0], 558 struct btrfs_extent_data_ref); 559 if (match_extent_data_ref(leaf, ref, node->ref_root, 560 owner, offset)) 561 break; 562 btrfs_release_path(path); 563 key.offset++; 564 ret = btrfs_insert_empty_item(trans, root, path, &key, 565 size); 566 if (ret && ret != -EEXIST) 567 goto fail; 568 569 leaf = path->nodes[0]; 570 } 571 ref = btrfs_item_ptr(leaf, path->slots[0], 572 struct btrfs_extent_data_ref); 573 if (ret == 0) { 574 btrfs_set_extent_data_ref_root(leaf, ref, node->ref_root); 575 btrfs_set_extent_data_ref_objectid(leaf, ref, owner); 576 btrfs_set_extent_data_ref_offset(leaf, ref, offset); 577 btrfs_set_extent_data_ref_count(leaf, ref, node->ref_mod); 578 } else { 579 num_refs = btrfs_extent_data_ref_count(leaf, ref); 580 num_refs += node->ref_mod; 581 btrfs_set_extent_data_ref_count(leaf, ref, num_refs); 582 } 583 } 584 btrfs_mark_buffer_dirty(trans, leaf); 585 ret = 0; 586 fail: 587 btrfs_release_path(path); 588 return ret; 589 } 590 591 static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, 592 struct btrfs_root *root, 593 struct btrfs_path *path, 594 int refs_to_drop) 595 { 596 struct btrfs_key key; 597 struct btrfs_extent_data_ref *ref1 = NULL; 598 struct btrfs_shared_data_ref *ref2 = NULL; 599 struct extent_buffer *leaf; 600 u32 num_refs = 0; 601 int ret = 0; 602 603 leaf = path->nodes[0]; 604 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 605 606 if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { 607 ref1 = btrfs_item_ptr(leaf, path->slots[0], 608 struct btrfs_extent_data_ref); 609 num_refs = btrfs_extent_data_ref_count(leaf, ref1); 610 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { 611 ref2 = btrfs_item_ptr(leaf, path->slots[0], 612 struct btrfs_shared_data_ref); 613 num_refs = btrfs_shared_data_ref_count(leaf, ref2); 614 } else { 615 btrfs_err(trans->fs_info, 616 "unrecognized backref key (%llu %u %llu)", 617 key.objectid, key.type, key.offset); 618 btrfs_abort_transaction(trans, -EUCLEAN); 619 return -EUCLEAN; 620 } 621 622 BUG_ON(num_refs < refs_to_drop); 623 num_refs -= refs_to_drop; 624 625 if (num_refs == 0) { 626 ret = btrfs_del_item(trans, root, path); 627 } else { 628 if (key.type == BTRFS_EXTENT_DATA_REF_KEY) 629 btrfs_set_extent_data_ref_count(leaf, ref1, num_refs); 630 else if (key.type == BTRFS_SHARED_DATA_REF_KEY) 631 btrfs_set_shared_data_ref_count(leaf, ref2, num_refs); 632 btrfs_mark_buffer_dirty(trans, 
leaf); 633 } 634 return ret; 635 } 636 637 static noinline u32 extent_data_ref_count(struct btrfs_path *path, 638 struct btrfs_extent_inline_ref *iref) 639 { 640 struct btrfs_key key; 641 struct extent_buffer *leaf; 642 struct btrfs_extent_data_ref *ref1; 643 struct btrfs_shared_data_ref *ref2; 644 u32 num_refs = 0; 645 int type; 646 647 leaf = path->nodes[0]; 648 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 649 650 if (iref) { 651 /* 652 * If type is invalid, we should have bailed out earlier than 653 * this call. 654 */ 655 type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA); 656 ASSERT(type != BTRFS_REF_TYPE_INVALID); 657 if (type == BTRFS_EXTENT_DATA_REF_KEY) { 658 ref1 = (struct btrfs_extent_data_ref *)(&iref->offset); 659 num_refs = btrfs_extent_data_ref_count(leaf, ref1); 660 } else { 661 ref2 = (struct btrfs_shared_data_ref *)(iref + 1); 662 num_refs = btrfs_shared_data_ref_count(leaf, ref2); 663 } 664 } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { 665 ref1 = btrfs_item_ptr(leaf, path->slots[0], 666 struct btrfs_extent_data_ref); 667 num_refs = btrfs_extent_data_ref_count(leaf, ref1); 668 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { 669 ref2 = btrfs_item_ptr(leaf, path->slots[0], 670 struct btrfs_shared_data_ref); 671 num_refs = btrfs_shared_data_ref_count(leaf, ref2); 672 } else { 673 WARN_ON(1); 674 } 675 return num_refs; 676 } 677 678 static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans, 679 struct btrfs_path *path, 680 u64 bytenr, u64 parent, 681 u64 root_objectid) 682 { 683 struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr); 684 struct btrfs_key key; 685 int ret; 686 687 key.objectid = bytenr; 688 if (parent) { 689 key.type = BTRFS_SHARED_BLOCK_REF_KEY; 690 key.offset = parent; 691 } else { 692 key.type = BTRFS_TREE_BLOCK_REF_KEY; 693 key.offset = root_objectid; 694 } 695 696 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 697 if (ret > 0) 698 ret = -ENOENT; 699 return ret; 700 } 701 702 static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans, 703 struct btrfs_path *path, 704 struct btrfs_delayed_ref_node *node, 705 u64 bytenr) 706 { 707 struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr); 708 struct btrfs_key key; 709 int ret; 710 711 key.objectid = bytenr; 712 if (node->parent) { 713 key.type = BTRFS_SHARED_BLOCK_REF_KEY; 714 key.offset = node->parent; 715 } else { 716 key.type = BTRFS_TREE_BLOCK_REF_KEY; 717 key.offset = node->ref_root; 718 } 719 720 ret = btrfs_insert_empty_item(trans, root, path, &key, 0); 721 btrfs_release_path(path); 722 return ret; 723 } 724 725 static inline int extent_ref_type(u64 parent, u64 owner) 726 { 727 int type; 728 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 729 if (parent > 0) 730 type = BTRFS_SHARED_BLOCK_REF_KEY; 731 else 732 type = BTRFS_TREE_BLOCK_REF_KEY; 733 } else { 734 if (parent > 0) 735 type = BTRFS_SHARED_DATA_REF_KEY; 736 else 737 type = BTRFS_EXTENT_DATA_REF_KEY; 738 } 739 return type; 740 } 741 742 static int find_next_key(struct btrfs_path *path, int level, 743 struct btrfs_key *key) 744 745 { 746 for (; level < BTRFS_MAX_LEVEL; level++) { 747 if (!path->nodes[level]) 748 break; 749 if (path->slots[level] + 1 >= 750 btrfs_header_nritems(path->nodes[level])) 751 continue; 752 if (level == 0) 753 btrfs_item_key_to_cpu(path->nodes[level], key, 754 path->slots[level] + 1); 755 else 756 btrfs_node_key_to_cpu(path->nodes[level], key, 757 path->slots[level] + 1); 758 return 0; 759 } 760 return 1; 761 } 762 763 /* 764 
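 * Illustrative layout of an extent item with inline references, as walked
 * by the loop below (the number and size of the inline refs varies):
 *
 *   [ btrfs_extent_item | (btrfs_tree_block_info) | inline ref 0 | inline ref 1 | ... ]
 *
 * The btrfs_tree_block_info part is only present for non-skinny metadata
 * extents.
 *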
 * Look for an inline back ref. If the back ref is found, *ref_ret is set
 * to the address of the inline back ref, and 0 is returned.
 *
 * If the back ref isn't found, *ref_ret is set to the address where it
 * should be inserted, and -ENOENT is returned.
 *
 * If insert is true and there are too many inline back refs, the path
 * points to the extent item, and -EAGAIN is returned.
 *
 * NOTE: inline back refs are ordered in the same way that back ref
 *	 items in the tree are ordered.
 */
static noinline_for_stack
int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref **ref_ret,
				 u64 bytenr, u64 num_bytes,
				 u64 parent, u64 root_objectid,
				 u64 owner, u64 offset, int insert)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *root = btrfs_extent_root(fs_info, bytenr);
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_extent_item *ei;
	struct btrfs_extent_inline_ref *iref;
	u64 flags;
	u64 item_size;
	unsigned long ptr;
	unsigned long end;
	int extra_size;
	int type;
	int want;
	int ret;
	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
	int needed;

	key.objectid = bytenr;
	key.type = BTRFS_EXTENT_ITEM_KEY;
	key.offset = num_bytes;

	want = extent_ref_type(parent, owner);
	if (insert) {
		extra_size = btrfs_extent_inline_ref_size(want);
		path->search_for_extension = 1;
		path->keep_locks = 1;
	} else
		extra_size = -1;

	/*
	 * Owner is our level, so we can just add one to get the level for the
	 * block we are interested in.
	 */
	if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
		key.type = BTRFS_METADATA_ITEM_KEY;
		key.offset = owner;
	}

again:
	ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
	if (ret < 0)
		goto out;

	/*
	 * We may be a newly converted file system which still has the old fat
	 * extent entries for metadata, so try and see if we have one of those.
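	 * Such old-style entries use an EXTENT_ITEM key with offset num_bytes
	 * instead of the skinny METADATA_ITEM key with offset == level that
	 * was searched for above.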
830 */ 831 if (ret > 0 && skinny_metadata) { 832 skinny_metadata = false; 833 if (path->slots[0]) { 834 path->slots[0]--; 835 btrfs_item_key_to_cpu(path->nodes[0], &key, 836 path->slots[0]); 837 if (key.objectid == bytenr && 838 key.type == BTRFS_EXTENT_ITEM_KEY && 839 key.offset == num_bytes) 840 ret = 0; 841 } 842 if (ret) { 843 key.objectid = bytenr; 844 key.type = BTRFS_EXTENT_ITEM_KEY; 845 key.offset = num_bytes; 846 btrfs_release_path(path); 847 goto again; 848 } 849 } 850 851 if (ret && !insert) { 852 ret = -ENOENT; 853 goto out; 854 } else if (WARN_ON(ret)) { 855 btrfs_print_leaf(path->nodes[0]); 856 btrfs_err(fs_info, 857 "extent item not found for insert, bytenr %llu num_bytes %llu parent %llu root_objectid %llu owner %llu offset %llu", 858 bytenr, num_bytes, parent, root_objectid, owner, 859 offset); 860 ret = -EUCLEAN; 861 goto out; 862 } 863 864 leaf = path->nodes[0]; 865 item_size = btrfs_item_size(leaf, path->slots[0]); 866 if (unlikely(item_size < sizeof(*ei))) { 867 ret = -EUCLEAN; 868 btrfs_err(fs_info, 869 "unexpected extent item size, has %llu expect >= %zu", 870 item_size, sizeof(*ei)); 871 btrfs_abort_transaction(trans, ret); 872 goto out; 873 } 874 875 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 876 flags = btrfs_extent_flags(leaf, ei); 877 878 ptr = (unsigned long)(ei + 1); 879 end = (unsigned long)ei + item_size; 880 881 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) { 882 ptr += sizeof(struct btrfs_tree_block_info); 883 BUG_ON(ptr > end); 884 } 885 886 if (owner >= BTRFS_FIRST_FREE_OBJECTID) 887 needed = BTRFS_REF_TYPE_DATA; 888 else 889 needed = BTRFS_REF_TYPE_BLOCK; 890 891 ret = -ENOENT; 892 while (ptr < end) { 893 iref = (struct btrfs_extent_inline_ref *)ptr; 894 type = btrfs_get_extent_inline_ref_type(leaf, iref, needed); 895 if (type == BTRFS_EXTENT_OWNER_REF_KEY) { 896 ASSERT(btrfs_fs_incompat(fs_info, SIMPLE_QUOTA)); 897 ptr += btrfs_extent_inline_ref_size(type); 898 continue; 899 } 900 if (type == BTRFS_REF_TYPE_INVALID) { 901 ret = -EUCLEAN; 902 goto out; 903 } 904 905 if (want < type) 906 break; 907 if (want > type) { 908 ptr += btrfs_extent_inline_ref_size(type); 909 continue; 910 } 911 912 if (type == BTRFS_EXTENT_DATA_REF_KEY) { 913 struct btrfs_extent_data_ref *dref; 914 dref = (struct btrfs_extent_data_ref *)(&iref->offset); 915 if (match_extent_data_ref(leaf, dref, root_objectid, 916 owner, offset)) { 917 ret = 0; 918 break; 919 } 920 if (hash_extent_data_ref_item(leaf, dref) < 921 hash_extent_data_ref(root_objectid, owner, offset)) 922 break; 923 } else { 924 u64 ref_offset; 925 ref_offset = btrfs_extent_inline_ref_offset(leaf, iref); 926 if (parent > 0) { 927 if (parent == ref_offset) { 928 ret = 0; 929 break; 930 } 931 if (ref_offset < parent) 932 break; 933 } else { 934 if (root_objectid == ref_offset) { 935 ret = 0; 936 break; 937 } 938 if (ref_offset < root_objectid) 939 break; 940 } 941 } 942 ptr += btrfs_extent_inline_ref_size(type); 943 } 944 945 if (unlikely(ptr > end)) { 946 ret = -EUCLEAN; 947 btrfs_print_leaf(path->nodes[0]); 948 btrfs_crit(fs_info, 949 "overrun extent record at slot %d while looking for inline extent for root %llu owner %llu offset %llu parent %llu", 950 path->slots[0], root_objectid, owner, offset, parent); 951 goto out; 952 } 953 954 if (ret == -ENOENT && insert) { 955 if (item_size + extra_size >= 956 BTRFS_MAX_EXTENT_ITEM_SIZE(root)) { 957 ret = -EAGAIN; 958 goto out; 959 } 960 /* 961 * To add new inline back ref, we have to make sure 962 * there is no corresponding back 
ref item. 963 * For simplicity, we just do not add new inline back 964 * ref if there is any kind of item for this block 965 */ 966 if (find_next_key(path, 0, &key) == 0 && 967 key.objectid == bytenr && 968 key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) { 969 ret = -EAGAIN; 970 goto out; 971 } 972 } 973 *ref_ret = (struct btrfs_extent_inline_ref *)ptr; 974 out: 975 if (insert) { 976 path->keep_locks = 0; 977 path->search_for_extension = 0; 978 btrfs_unlock_up_safe(path, 1); 979 } 980 return ret; 981 } 982 983 /* 984 * helper to add new inline back ref 985 */ 986 static noinline_for_stack 987 void setup_inline_extent_backref(struct btrfs_trans_handle *trans, 988 struct btrfs_path *path, 989 struct btrfs_extent_inline_ref *iref, 990 u64 parent, u64 root_objectid, 991 u64 owner, u64 offset, int refs_to_add, 992 struct btrfs_delayed_extent_op *extent_op) 993 { 994 struct extent_buffer *leaf; 995 struct btrfs_extent_item *ei; 996 unsigned long ptr; 997 unsigned long end; 998 unsigned long item_offset; 999 u64 refs; 1000 int size; 1001 int type; 1002 1003 leaf = path->nodes[0]; 1004 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1005 item_offset = (unsigned long)iref - (unsigned long)ei; 1006 1007 type = extent_ref_type(parent, owner); 1008 size = btrfs_extent_inline_ref_size(type); 1009 1010 btrfs_extend_item(trans, path, size); 1011 1012 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1013 refs = btrfs_extent_refs(leaf, ei); 1014 refs += refs_to_add; 1015 btrfs_set_extent_refs(leaf, ei, refs); 1016 if (extent_op) 1017 __run_delayed_extent_op(extent_op, leaf, ei); 1018 1019 ptr = (unsigned long)ei + item_offset; 1020 end = (unsigned long)ei + btrfs_item_size(leaf, path->slots[0]); 1021 if (ptr < end - size) 1022 memmove_extent_buffer(leaf, ptr + size, ptr, 1023 end - size - ptr); 1024 1025 iref = (struct btrfs_extent_inline_ref *)ptr; 1026 btrfs_set_extent_inline_ref_type(leaf, iref, type); 1027 if (type == BTRFS_EXTENT_DATA_REF_KEY) { 1028 struct btrfs_extent_data_ref *dref; 1029 dref = (struct btrfs_extent_data_ref *)(&iref->offset); 1030 btrfs_set_extent_data_ref_root(leaf, dref, root_objectid); 1031 btrfs_set_extent_data_ref_objectid(leaf, dref, owner); 1032 btrfs_set_extent_data_ref_offset(leaf, dref, offset); 1033 btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add); 1034 } else if (type == BTRFS_SHARED_DATA_REF_KEY) { 1035 struct btrfs_shared_data_ref *sref; 1036 sref = (struct btrfs_shared_data_ref *)(iref + 1); 1037 btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add); 1038 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 1039 } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) { 1040 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 1041 } else { 1042 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); 1043 } 1044 btrfs_mark_buffer_dirty(trans, leaf); 1045 } 1046 1047 static int lookup_extent_backref(struct btrfs_trans_handle *trans, 1048 struct btrfs_path *path, 1049 struct btrfs_extent_inline_ref **ref_ret, 1050 u64 bytenr, u64 num_bytes, u64 parent, 1051 u64 root_objectid, u64 owner, u64 offset) 1052 { 1053 int ret; 1054 1055 ret = lookup_inline_extent_backref(trans, path, ref_ret, bytenr, 1056 num_bytes, parent, root_objectid, 1057 owner, offset, 0); 1058 if (ret != -ENOENT) 1059 return ret; 1060 1061 btrfs_release_path(path); 1062 *ref_ret = NULL; 1063 1064 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1065 ret = lookup_tree_block_ref(trans, path, bytenr, parent, 1066 root_objectid); 1067 } else { 1068 ret = 
lookup_extent_data_ref(trans, path, bytenr, parent, 1069 root_objectid, owner, offset); 1070 } 1071 return ret; 1072 } 1073 1074 /* 1075 * helper to update/remove inline back ref 1076 */ 1077 static noinline_for_stack int update_inline_extent_backref( 1078 struct btrfs_trans_handle *trans, 1079 struct btrfs_path *path, 1080 struct btrfs_extent_inline_ref *iref, 1081 int refs_to_mod, 1082 struct btrfs_delayed_extent_op *extent_op) 1083 { 1084 struct extent_buffer *leaf = path->nodes[0]; 1085 struct btrfs_fs_info *fs_info = leaf->fs_info; 1086 struct btrfs_extent_item *ei; 1087 struct btrfs_extent_data_ref *dref = NULL; 1088 struct btrfs_shared_data_ref *sref = NULL; 1089 unsigned long ptr; 1090 unsigned long end; 1091 u32 item_size; 1092 int size; 1093 int type; 1094 u64 refs; 1095 1096 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1097 refs = btrfs_extent_refs(leaf, ei); 1098 if (unlikely(refs_to_mod < 0 && refs + refs_to_mod <= 0)) { 1099 struct btrfs_key key; 1100 u32 extent_size; 1101 1102 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1103 if (key.type == BTRFS_METADATA_ITEM_KEY) 1104 extent_size = fs_info->nodesize; 1105 else 1106 extent_size = key.offset; 1107 btrfs_print_leaf(leaf); 1108 btrfs_err(fs_info, 1109 "invalid refs_to_mod for extent %llu num_bytes %u, has %d expect >= -%llu", 1110 key.objectid, extent_size, refs_to_mod, refs); 1111 return -EUCLEAN; 1112 } 1113 refs += refs_to_mod; 1114 btrfs_set_extent_refs(leaf, ei, refs); 1115 if (extent_op) 1116 __run_delayed_extent_op(extent_op, leaf, ei); 1117 1118 type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY); 1119 /* 1120 * Function btrfs_get_extent_inline_ref_type() has already printed 1121 * error messages. 1122 */ 1123 if (unlikely(type == BTRFS_REF_TYPE_INVALID)) 1124 return -EUCLEAN; 1125 1126 if (type == BTRFS_EXTENT_DATA_REF_KEY) { 1127 dref = (struct btrfs_extent_data_ref *)(&iref->offset); 1128 refs = btrfs_extent_data_ref_count(leaf, dref); 1129 } else if (type == BTRFS_SHARED_DATA_REF_KEY) { 1130 sref = (struct btrfs_shared_data_ref *)(iref + 1); 1131 refs = btrfs_shared_data_ref_count(leaf, sref); 1132 } else { 1133 refs = 1; 1134 /* 1135 * For tree blocks we can only drop one ref for it, and tree 1136 * blocks should not have refs > 1. 1137 * 1138 * Furthermore if we're inserting a new inline backref, we 1139 * won't reach this path either. That would be 1140 * setup_inline_extent_backref(). 
1141 */ 1142 if (unlikely(refs_to_mod != -1)) { 1143 struct btrfs_key key; 1144 1145 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1146 1147 btrfs_print_leaf(leaf); 1148 btrfs_err(fs_info, 1149 "invalid refs_to_mod for tree block %llu, has %d expect -1", 1150 key.objectid, refs_to_mod); 1151 return -EUCLEAN; 1152 } 1153 } 1154 1155 if (unlikely(refs_to_mod < 0 && refs < -refs_to_mod)) { 1156 struct btrfs_key key; 1157 u32 extent_size; 1158 1159 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1160 if (key.type == BTRFS_METADATA_ITEM_KEY) 1161 extent_size = fs_info->nodesize; 1162 else 1163 extent_size = key.offset; 1164 btrfs_print_leaf(leaf); 1165 btrfs_err(fs_info, 1166 "invalid refs_to_mod for backref entry, iref %lu extent %llu num_bytes %u, has %d expect >= -%llu", 1167 (unsigned long)iref, key.objectid, extent_size, 1168 refs_to_mod, refs); 1169 return -EUCLEAN; 1170 } 1171 refs += refs_to_mod; 1172 1173 if (refs > 0) { 1174 if (type == BTRFS_EXTENT_DATA_REF_KEY) 1175 btrfs_set_extent_data_ref_count(leaf, dref, refs); 1176 else 1177 btrfs_set_shared_data_ref_count(leaf, sref, refs); 1178 } else { 1179 size = btrfs_extent_inline_ref_size(type); 1180 item_size = btrfs_item_size(leaf, path->slots[0]); 1181 ptr = (unsigned long)iref; 1182 end = (unsigned long)ei + item_size; 1183 if (ptr + size < end) 1184 memmove_extent_buffer(leaf, ptr, ptr + size, 1185 end - ptr - size); 1186 item_size -= size; 1187 btrfs_truncate_item(trans, path, item_size, 1); 1188 } 1189 btrfs_mark_buffer_dirty(trans, leaf); 1190 return 0; 1191 } 1192 1193 static noinline_for_stack 1194 int insert_inline_extent_backref(struct btrfs_trans_handle *trans, 1195 struct btrfs_path *path, 1196 u64 bytenr, u64 num_bytes, u64 parent, 1197 u64 root_objectid, u64 owner, 1198 u64 offset, int refs_to_add, 1199 struct btrfs_delayed_extent_op *extent_op) 1200 { 1201 struct btrfs_extent_inline_ref *iref; 1202 int ret; 1203 1204 ret = lookup_inline_extent_backref(trans, path, &iref, bytenr, 1205 num_bytes, parent, root_objectid, 1206 owner, offset, 1); 1207 if (ret == 0) { 1208 /* 1209 * We're adding refs to a tree block we already own, this 1210 * should not happen at all. 
1211 */ 1212 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1213 btrfs_print_leaf(path->nodes[0]); 1214 btrfs_crit(trans->fs_info, 1215 "adding refs to an existing tree ref, bytenr %llu num_bytes %llu root_objectid %llu slot %u", 1216 bytenr, num_bytes, root_objectid, path->slots[0]); 1217 return -EUCLEAN; 1218 } 1219 ret = update_inline_extent_backref(trans, path, iref, 1220 refs_to_add, extent_op); 1221 } else if (ret == -ENOENT) { 1222 setup_inline_extent_backref(trans, path, iref, parent, 1223 root_objectid, owner, offset, 1224 refs_to_add, extent_op); 1225 ret = 0; 1226 } 1227 return ret; 1228 } 1229 1230 static int remove_extent_backref(struct btrfs_trans_handle *trans, 1231 struct btrfs_root *root, 1232 struct btrfs_path *path, 1233 struct btrfs_extent_inline_ref *iref, 1234 int refs_to_drop, int is_data) 1235 { 1236 int ret = 0; 1237 1238 BUG_ON(!is_data && refs_to_drop != 1); 1239 if (iref) 1240 ret = update_inline_extent_backref(trans, path, iref, 1241 -refs_to_drop, NULL); 1242 else if (is_data) 1243 ret = remove_extent_data_ref(trans, root, path, refs_to_drop); 1244 else 1245 ret = btrfs_del_item(trans, root, path); 1246 return ret; 1247 } 1248 1249 static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, 1250 u64 *discarded_bytes) 1251 { 1252 int j, ret = 0; 1253 u64 bytes_left, end; 1254 u64 aligned_start = ALIGN(start, 1 << SECTOR_SHIFT); 1255 1256 /* Adjust the range to be aligned to 512B sectors if necessary. */ 1257 if (start != aligned_start) { 1258 len -= aligned_start - start; 1259 len = round_down(len, 1 << SECTOR_SHIFT); 1260 start = aligned_start; 1261 } 1262 1263 *discarded_bytes = 0; 1264 1265 if (!len) 1266 return 0; 1267 1268 end = start + len; 1269 bytes_left = len; 1270 1271 /* Skip any superblocks on this device. */ 1272 for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) { 1273 u64 sb_start = btrfs_sb_offset(j); 1274 u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE; 1275 u64 size = sb_start - start; 1276 1277 if (!in_range(sb_start, start, bytes_left) && 1278 !in_range(sb_end, start, bytes_left) && 1279 !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE)) 1280 continue; 1281 1282 /* 1283 * Superblock spans beginning of range. Adjust start and 1284 * try again. 
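		 * For example, a discard range that begins inside the primary
		 * super block at 64KiB has its start bumped to the end of that
		 * super block before any discard is issued.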
1285 */ 1286 if (sb_start <= start) { 1287 start += sb_end - start; 1288 if (start > end) { 1289 bytes_left = 0; 1290 break; 1291 } 1292 bytes_left = end - start; 1293 continue; 1294 } 1295 1296 if (size) { 1297 ret = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT, 1298 size >> SECTOR_SHIFT, 1299 GFP_NOFS); 1300 if (!ret) 1301 *discarded_bytes += size; 1302 else if (ret != -EOPNOTSUPP) 1303 return ret; 1304 } 1305 1306 start = sb_end; 1307 if (start > end) { 1308 bytes_left = 0; 1309 break; 1310 } 1311 bytes_left = end - start; 1312 } 1313 1314 if (bytes_left) { 1315 ret = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT, 1316 bytes_left >> SECTOR_SHIFT, 1317 GFP_NOFS); 1318 if (!ret) 1319 *discarded_bytes += bytes_left; 1320 } 1321 return ret; 1322 } 1323 1324 static int do_discard_extent(struct btrfs_discard_stripe *stripe, u64 *bytes) 1325 { 1326 struct btrfs_device *dev = stripe->dev; 1327 struct btrfs_fs_info *fs_info = dev->fs_info; 1328 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 1329 u64 phys = stripe->physical; 1330 u64 len = stripe->length; 1331 u64 discarded = 0; 1332 int ret = 0; 1333 1334 /* Zone reset on a zoned filesystem */ 1335 if (btrfs_can_zone_reset(dev, phys, len)) { 1336 u64 src_disc; 1337 1338 ret = btrfs_reset_device_zone(dev, phys, len, &discarded); 1339 if (ret) 1340 goto out; 1341 1342 if (!btrfs_dev_replace_is_ongoing(dev_replace) || 1343 dev != dev_replace->srcdev) 1344 goto out; 1345 1346 src_disc = discarded; 1347 1348 /* Send to replace target as well */ 1349 ret = btrfs_reset_device_zone(dev_replace->tgtdev, phys, len, 1350 &discarded); 1351 discarded += src_disc; 1352 } else if (bdev_max_discard_sectors(stripe->dev->bdev)) { 1353 ret = btrfs_issue_discard(dev->bdev, phys, len, &discarded); 1354 } else { 1355 ret = 0; 1356 *bytes = 0; 1357 } 1358 1359 out: 1360 *bytes = discarded; 1361 return ret; 1362 } 1363 1364 int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, 1365 u64 num_bytes, u64 *actual_bytes) 1366 { 1367 int ret = 0; 1368 u64 discarded_bytes = 0; 1369 u64 end = bytenr + num_bytes; 1370 u64 cur = bytenr; 1371 1372 /* 1373 * Avoid races with device replace and make sure the devices in the 1374 * stripes don't go away while we are discarding. 1375 */ 1376 btrfs_bio_counter_inc_blocked(fs_info); 1377 while (cur < end) { 1378 struct btrfs_discard_stripe *stripes; 1379 unsigned int num_stripes; 1380 int i; 1381 1382 num_bytes = end - cur; 1383 stripes = btrfs_map_discard(fs_info, cur, &num_bytes, &num_stripes); 1384 if (IS_ERR(stripes)) { 1385 ret = PTR_ERR(stripes); 1386 if (ret == -EOPNOTSUPP) 1387 ret = 0; 1388 break; 1389 } 1390 1391 for (i = 0; i < num_stripes; i++) { 1392 struct btrfs_discard_stripe *stripe = stripes + i; 1393 u64 bytes; 1394 1395 if (!stripe->dev->bdev) { 1396 ASSERT(btrfs_test_opt(fs_info, DEGRADED)); 1397 continue; 1398 } 1399 1400 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, 1401 &stripe->dev->dev_state)) 1402 continue; 1403 1404 ret = do_discard_extent(stripe, &bytes); 1405 if (ret) { 1406 /* 1407 * Keep going if discard is not supported by the 1408 * device. 
				 */
				if (ret != -EOPNOTSUPP)
					break;
				ret = 0;
			} else {
				discarded_bytes += bytes;
			}
		}
		kfree(stripes);
		if (ret)
			break;
		cur += num_bytes;
	}
	btrfs_bio_counter_dec(fs_info);
	if (actual_bytes)
		*actual_bytes = discarded_bytes;
	return ret;
}

/* Can return -ENOMEM */
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
			 struct btrfs_ref *generic_ref)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int ret;

	ASSERT(generic_ref->type != BTRFS_REF_NOT_SET &&
	       generic_ref->action);
	BUG_ON(generic_ref->type == BTRFS_REF_METADATA &&
	       generic_ref->ref_root == BTRFS_TREE_LOG_OBJECTID);

	if (generic_ref->type == BTRFS_REF_METADATA)
		ret = btrfs_add_delayed_tree_ref(trans, generic_ref, NULL);
	else
		ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0);

	btrfs_ref_tree_mod(fs_info, generic_ref);

	return ret;
}

/*
 * Insert backreference for a given extent.
 *
 * The counterpart is in __btrfs_free_extent(), with examples and more details
 * on how it works.
 *
 * @trans:	Handle of transaction
 *
 * @node:	The delayed ref node used to get the bytenr/length for the
 *		extent whose references are incremented.
 *
 * @extent_op:	Pointer to a structure holding information necessary when
 *		updating a tree block's flags
 *
 */
static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_delayed_ref_node *node,
				  struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_extent_item *item;
	struct btrfs_key key;
	u64 bytenr = node->bytenr;
	u64 num_bytes = node->num_bytes;
	u64 owner = btrfs_delayed_ref_owner(node);
	u64 offset = btrfs_delayed_ref_offset(node);
	u64 refs;
	int refs_to_add = node->ref_mod;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* this will set up the path even if it fails to insert the back ref */
	ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes,
					   node->parent, node->ref_root, owner,
					   offset, refs_to_add, extent_op);
	if ((ret < 0 && ret != -EAGAIN) || !ret)
		goto out;

	/*
	 * Ok we had -EAGAIN which means we didn't have space to insert an
	 * inline extent ref, so just update the reference count and add a
	 * normal backref.
1496 */ 1497 leaf = path->nodes[0]; 1498 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1499 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1500 refs = btrfs_extent_refs(leaf, item); 1501 btrfs_set_extent_refs(leaf, item, refs + refs_to_add); 1502 if (extent_op) 1503 __run_delayed_extent_op(extent_op, leaf, item); 1504 1505 btrfs_mark_buffer_dirty(trans, leaf); 1506 btrfs_release_path(path); 1507 1508 /* now insert the actual backref */ 1509 if (owner < BTRFS_FIRST_FREE_OBJECTID) 1510 ret = insert_tree_block_ref(trans, path, node, bytenr); 1511 else 1512 ret = insert_extent_data_ref(trans, path, node, bytenr); 1513 1514 if (ret) 1515 btrfs_abort_transaction(trans, ret); 1516 out: 1517 btrfs_free_path(path); 1518 return ret; 1519 } 1520 1521 static void free_head_ref_squota_rsv(struct btrfs_fs_info *fs_info, 1522 struct btrfs_delayed_ref_head *href) 1523 { 1524 u64 root = href->owning_root; 1525 1526 /* 1527 * Don't check must_insert_reserved, as this is called from contexts 1528 * where it has already been unset. 1529 */ 1530 if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE || 1531 !href->is_data || !is_fstree(root)) 1532 return; 1533 1534 btrfs_qgroup_free_refroot(fs_info, root, href->reserved_bytes, 1535 BTRFS_QGROUP_RSV_DATA); 1536 } 1537 1538 static int run_delayed_data_ref(struct btrfs_trans_handle *trans, 1539 struct btrfs_delayed_ref_head *href, 1540 struct btrfs_delayed_ref_node *node, 1541 struct btrfs_delayed_extent_op *extent_op, 1542 bool insert_reserved) 1543 { 1544 int ret = 0; 1545 u64 parent = 0; 1546 u64 flags = 0; 1547 1548 trace_run_delayed_data_ref(trans->fs_info, node); 1549 1550 if (node->type == BTRFS_SHARED_DATA_REF_KEY) 1551 parent = node->parent; 1552 1553 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { 1554 struct btrfs_key key; 1555 struct btrfs_squota_delta delta = { 1556 .root = href->owning_root, 1557 .num_bytes = node->num_bytes, 1558 .is_data = true, 1559 .is_inc = true, 1560 .generation = trans->transid, 1561 }; 1562 u64 owner = btrfs_delayed_ref_owner(node); 1563 u64 offset = btrfs_delayed_ref_offset(node); 1564 1565 if (extent_op) 1566 flags |= extent_op->flags_to_set; 1567 1568 key.objectid = node->bytenr; 1569 key.type = BTRFS_EXTENT_ITEM_KEY; 1570 key.offset = node->num_bytes; 1571 1572 ret = alloc_reserved_file_extent(trans, parent, node->ref_root, 1573 flags, owner, offset, &key, 1574 node->ref_mod, 1575 href->owning_root); 1576 free_head_ref_squota_rsv(trans->fs_info, href); 1577 if (!ret) 1578 ret = btrfs_record_squota_delta(trans->fs_info, &delta); 1579 } else if (node->action == BTRFS_ADD_DELAYED_REF) { 1580 ret = __btrfs_inc_extent_ref(trans, node, extent_op); 1581 } else if (node->action == BTRFS_DROP_DELAYED_REF) { 1582 ret = __btrfs_free_extent(trans, href, node, extent_op); 1583 } else { 1584 BUG(); 1585 } 1586 return ret; 1587 } 1588 1589 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, 1590 struct extent_buffer *leaf, 1591 struct btrfs_extent_item *ei) 1592 { 1593 u64 flags = btrfs_extent_flags(leaf, ei); 1594 if (extent_op->update_flags) { 1595 flags |= extent_op->flags_to_set; 1596 btrfs_set_extent_flags(leaf, ei, flags); 1597 } 1598 1599 if (extent_op->update_key) { 1600 struct btrfs_tree_block_info *bi; 1601 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)); 1602 bi = (struct btrfs_tree_block_info *)(ei + 1); 1603 btrfs_set_tree_block_key(leaf, bi, &extent_op->key); 1604 } 1605 } 1606 1607 static int run_delayed_extent_op(struct btrfs_trans_handle 
*trans, 1608 struct btrfs_delayed_ref_head *head, 1609 struct btrfs_delayed_extent_op *extent_op) 1610 { 1611 struct btrfs_fs_info *fs_info = trans->fs_info; 1612 struct btrfs_root *root; 1613 struct btrfs_key key; 1614 struct btrfs_path *path; 1615 struct btrfs_extent_item *ei; 1616 struct extent_buffer *leaf; 1617 u32 item_size; 1618 int ret; 1619 int metadata = 1; 1620 1621 if (TRANS_ABORTED(trans)) 1622 return 0; 1623 1624 if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA)) 1625 metadata = 0; 1626 1627 path = btrfs_alloc_path(); 1628 if (!path) 1629 return -ENOMEM; 1630 1631 key.objectid = head->bytenr; 1632 1633 if (metadata) { 1634 key.type = BTRFS_METADATA_ITEM_KEY; 1635 key.offset = extent_op->level; 1636 } else { 1637 key.type = BTRFS_EXTENT_ITEM_KEY; 1638 key.offset = head->num_bytes; 1639 } 1640 1641 root = btrfs_extent_root(fs_info, key.objectid); 1642 again: 1643 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 1644 if (ret < 0) { 1645 goto out; 1646 } else if (ret > 0) { 1647 if (metadata) { 1648 if (path->slots[0] > 0) { 1649 path->slots[0]--; 1650 btrfs_item_key_to_cpu(path->nodes[0], &key, 1651 path->slots[0]); 1652 if (key.objectid == head->bytenr && 1653 key.type == BTRFS_EXTENT_ITEM_KEY && 1654 key.offset == head->num_bytes) 1655 ret = 0; 1656 } 1657 if (ret > 0) { 1658 btrfs_release_path(path); 1659 metadata = 0; 1660 1661 key.objectid = head->bytenr; 1662 key.offset = head->num_bytes; 1663 key.type = BTRFS_EXTENT_ITEM_KEY; 1664 goto again; 1665 } 1666 } else { 1667 ret = -EUCLEAN; 1668 btrfs_err(fs_info, 1669 "missing extent item for extent %llu num_bytes %llu level %d", 1670 head->bytenr, head->num_bytes, extent_op->level); 1671 goto out; 1672 } 1673 } 1674 1675 leaf = path->nodes[0]; 1676 item_size = btrfs_item_size(leaf, path->slots[0]); 1677 1678 if (unlikely(item_size < sizeof(*ei))) { 1679 ret = -EUCLEAN; 1680 btrfs_err(fs_info, 1681 "unexpected extent item size, has %u expect >= %zu", 1682 item_size, sizeof(*ei)); 1683 btrfs_abort_transaction(trans, ret); 1684 goto out; 1685 } 1686 1687 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1688 __run_delayed_extent_op(extent_op, leaf, ei); 1689 1690 btrfs_mark_buffer_dirty(trans, leaf); 1691 out: 1692 btrfs_free_path(path); 1693 return ret; 1694 } 1695 1696 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, 1697 struct btrfs_delayed_ref_head *href, 1698 struct btrfs_delayed_ref_node *node, 1699 struct btrfs_delayed_extent_op *extent_op, 1700 bool insert_reserved) 1701 { 1702 int ret = 0; 1703 struct btrfs_fs_info *fs_info = trans->fs_info; 1704 u64 parent = 0; 1705 u64 ref_root = 0; 1706 1707 trace_run_delayed_tree_ref(trans->fs_info, node); 1708 1709 if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) 1710 parent = node->parent; 1711 ref_root = node->ref_root; 1712 1713 if (unlikely(node->ref_mod != 1)) { 1714 btrfs_err(trans->fs_info, 1715 "btree block %llu has %d references rather than 1: action %d ref_root %llu parent %llu", 1716 node->bytenr, node->ref_mod, node->action, ref_root, 1717 parent); 1718 return -EUCLEAN; 1719 } 1720 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { 1721 struct btrfs_squota_delta delta = { 1722 .root = href->owning_root, 1723 .num_bytes = fs_info->nodesize, 1724 .is_data = false, 1725 .is_inc = true, 1726 .generation = trans->transid, 1727 }; 1728 1729 BUG_ON(!extent_op || !extent_op->update_flags); 1730 ret = alloc_reserved_tree_block(trans, node, extent_op); 1731 if (!ret) 1732 btrfs_record_squota_delta(fs_info, &delta); 1733 } else 
if (node->action == BTRFS_ADD_DELAYED_REF) { 1734 ret = __btrfs_inc_extent_ref(trans, node, extent_op); 1735 } else if (node->action == BTRFS_DROP_DELAYED_REF) { 1736 ret = __btrfs_free_extent(trans, href, node, extent_op); 1737 } else { 1738 BUG(); 1739 } 1740 return ret; 1741 } 1742 1743 /* helper function to actually process a single delayed ref entry */ 1744 static int run_one_delayed_ref(struct btrfs_trans_handle *trans, 1745 struct btrfs_delayed_ref_head *href, 1746 struct btrfs_delayed_ref_node *node, 1747 struct btrfs_delayed_extent_op *extent_op, 1748 bool insert_reserved) 1749 { 1750 int ret = 0; 1751 1752 if (TRANS_ABORTED(trans)) { 1753 if (insert_reserved) { 1754 btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1); 1755 free_head_ref_squota_rsv(trans->fs_info, href); 1756 } 1757 return 0; 1758 } 1759 1760 if (node->type == BTRFS_TREE_BLOCK_REF_KEY || 1761 node->type == BTRFS_SHARED_BLOCK_REF_KEY) 1762 ret = run_delayed_tree_ref(trans, href, node, extent_op, 1763 insert_reserved); 1764 else if (node->type == BTRFS_EXTENT_DATA_REF_KEY || 1765 node->type == BTRFS_SHARED_DATA_REF_KEY) 1766 ret = run_delayed_data_ref(trans, href, node, extent_op, 1767 insert_reserved); 1768 else if (node->type == BTRFS_EXTENT_OWNER_REF_KEY) 1769 ret = 0; 1770 else 1771 BUG(); 1772 if (ret && insert_reserved) 1773 btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1); 1774 if (ret < 0) 1775 btrfs_err(trans->fs_info, 1776 "failed to run delayed ref for logical %llu num_bytes %llu type %u action %u ref_mod %d: %d", 1777 node->bytenr, node->num_bytes, node->type, 1778 node->action, node->ref_mod, ret); 1779 return ret; 1780 } 1781 1782 static inline struct btrfs_delayed_ref_node * 1783 select_delayed_ref(struct btrfs_delayed_ref_head *head) 1784 { 1785 struct btrfs_delayed_ref_node *ref; 1786 1787 if (RB_EMPTY_ROOT(&head->ref_tree.rb_root)) 1788 return NULL; 1789 1790 /* 1791 * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first. 1792 * This is to prevent a ref count from going down to zero, which deletes 1793 * the extent item from the extent tree, when there still are references 1794 * to add, which would fail because they would not find the extent item. 
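	 *
	 * Illustrative example: with the extent item at one reference on disk
	 * and queued refs [drop -1, add +1], running the drop first would
	 * delete the extent item and the following add would then fail to
	 * find it; picking the add first keeps the item alive.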
1795 */ 1796 if (!list_empty(&head->ref_add_list)) 1797 return list_first_entry(&head->ref_add_list, 1798 struct btrfs_delayed_ref_node, add_list); 1799 1800 ref = rb_entry(rb_first_cached(&head->ref_tree), 1801 struct btrfs_delayed_ref_node, ref_node); 1802 ASSERT(list_empty(&ref->add_list)); 1803 return ref; 1804 } 1805 1806 static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, 1807 struct btrfs_delayed_ref_head *head) 1808 { 1809 spin_lock(&delayed_refs->lock); 1810 head->processing = false; 1811 delayed_refs->num_heads_ready++; 1812 spin_unlock(&delayed_refs->lock); 1813 btrfs_delayed_ref_unlock(head); 1814 } 1815 1816 static struct btrfs_delayed_extent_op *cleanup_extent_op( 1817 struct btrfs_delayed_ref_head *head) 1818 { 1819 struct btrfs_delayed_extent_op *extent_op = head->extent_op; 1820 1821 if (!extent_op) 1822 return NULL; 1823 1824 if (head->must_insert_reserved) { 1825 head->extent_op = NULL; 1826 btrfs_free_delayed_extent_op(extent_op); 1827 return NULL; 1828 } 1829 return extent_op; 1830 } 1831 1832 static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans, 1833 struct btrfs_delayed_ref_head *head) 1834 { 1835 struct btrfs_delayed_extent_op *extent_op; 1836 int ret; 1837 1838 extent_op = cleanup_extent_op(head); 1839 if (!extent_op) 1840 return 0; 1841 head->extent_op = NULL; 1842 spin_unlock(&head->lock); 1843 ret = run_delayed_extent_op(trans, head, extent_op); 1844 btrfs_free_delayed_extent_op(extent_op); 1845 return ret ? ret : 1; 1846 } 1847 1848 u64 btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, 1849 struct btrfs_delayed_ref_root *delayed_refs, 1850 struct btrfs_delayed_ref_head *head) 1851 { 1852 u64 ret = 0; 1853 1854 /* 1855 * We had csum deletions accounted for in our delayed refs rsv, we need 1856 * to drop the csum leaves for this update from our delayed_refs_rsv. 1857 */ 1858 if (head->total_ref_mod < 0 && head->is_data) { 1859 int nr_csums; 1860 1861 spin_lock(&delayed_refs->lock); 1862 delayed_refs->pending_csums -= head->num_bytes; 1863 spin_unlock(&delayed_refs->lock); 1864 nr_csums = btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes); 1865 1866 btrfs_delayed_refs_rsv_release(fs_info, 0, nr_csums); 1867 1868 ret = btrfs_calc_delayed_ref_csum_bytes(fs_info, nr_csums); 1869 } 1870 /* must_insert_reserved can be set only if we didn't run the head ref. */ 1871 if (head->must_insert_reserved) 1872 free_head_ref_squota_rsv(fs_info, head); 1873 1874 return ret; 1875 } 1876 1877 static int cleanup_ref_head(struct btrfs_trans_handle *trans, 1878 struct btrfs_delayed_ref_head *head, 1879 u64 *bytes_released) 1880 { 1881 1882 struct btrfs_fs_info *fs_info = trans->fs_info; 1883 struct btrfs_delayed_ref_root *delayed_refs; 1884 int ret; 1885 1886 delayed_refs = &trans->transaction->delayed_refs; 1887 1888 ret = run_and_cleanup_extent_op(trans, head); 1889 if (ret < 0) { 1890 unselect_delayed_ref_head(delayed_refs, head); 1891 btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret); 1892 return ret; 1893 } else if (ret) { 1894 return ret; 1895 } 1896 1897 /* 1898 * Need to drop our head ref lock and re-acquire the delayed ref lock 1899 * and then re-check to make sure nobody got added. 
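	 * (The lock order is delayed_refs->lock first, then head->lock, which
	 * is why the head lock is dropped and re-taken below.)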
1900 */ 1901 spin_unlock(&head->lock); 1902 spin_lock(&delayed_refs->lock); 1903 spin_lock(&head->lock); 1904 if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root) || head->extent_op) { 1905 spin_unlock(&head->lock); 1906 spin_unlock(&delayed_refs->lock); 1907 return 1; 1908 } 1909 btrfs_delete_ref_head(delayed_refs, head); 1910 spin_unlock(&head->lock); 1911 spin_unlock(&delayed_refs->lock); 1912 1913 if (head->must_insert_reserved) { 1914 btrfs_pin_extent(trans, head->bytenr, head->num_bytes, 1); 1915 if (head->is_data) { 1916 struct btrfs_root *csum_root; 1917 1918 csum_root = btrfs_csum_root(fs_info, head->bytenr); 1919 ret = btrfs_del_csums(trans, csum_root, head->bytenr, 1920 head->num_bytes); 1921 } 1922 } 1923 1924 *bytes_released += btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); 1925 1926 trace_run_delayed_ref_head(fs_info, head, 0); 1927 btrfs_delayed_ref_unlock(head); 1928 btrfs_put_delayed_ref_head(head); 1929 return ret; 1930 } 1931 1932 static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head( 1933 struct btrfs_trans_handle *trans) 1934 { 1935 struct btrfs_delayed_ref_root *delayed_refs = 1936 &trans->transaction->delayed_refs; 1937 struct btrfs_delayed_ref_head *head = NULL; 1938 int ret; 1939 1940 spin_lock(&delayed_refs->lock); 1941 head = btrfs_select_ref_head(delayed_refs); 1942 if (!head) { 1943 spin_unlock(&delayed_refs->lock); 1944 return head; 1945 } 1946 1947 /* 1948 * Grab the lock that says we are going to process all the refs for 1949 * this head 1950 */ 1951 ret = btrfs_delayed_ref_lock(delayed_refs, head); 1952 spin_unlock(&delayed_refs->lock); 1953 1954 /* 1955 * We may have dropped the spin lock to get the head mutex lock, and 1956 * that might have given someone else time to free the head. If that's 1957 * true, it has been removed from our list and we can move on. 1958 */ 1959 if (ret == -EAGAIN) 1960 head = ERR_PTR(-EAGAIN); 1961 1962 return head; 1963 } 1964 1965 static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, 1966 struct btrfs_delayed_ref_head *locked_ref, 1967 u64 *bytes_released) 1968 { 1969 struct btrfs_fs_info *fs_info = trans->fs_info; 1970 struct btrfs_delayed_ref_root *delayed_refs; 1971 struct btrfs_delayed_extent_op *extent_op; 1972 struct btrfs_delayed_ref_node *ref; 1973 bool must_insert_reserved; 1974 int ret; 1975 1976 delayed_refs = &trans->transaction->delayed_refs; 1977 1978 lockdep_assert_held(&locked_ref->mutex); 1979 lockdep_assert_held(&locked_ref->lock); 1980 1981 while ((ref = select_delayed_ref(locked_ref))) { 1982 if (ref->seq && 1983 btrfs_check_delayed_seq(fs_info, ref->seq)) { 1984 spin_unlock(&locked_ref->lock); 1985 unselect_delayed_ref_head(delayed_refs, locked_ref); 1986 return -EAGAIN; 1987 } 1988 1989 rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree); 1990 RB_CLEAR_NODE(&ref->ref_node); 1991 if (!list_empty(&ref->add_list)) 1992 list_del(&ref->add_list); 1993 /* 1994 * When we play the delayed ref, also correct the ref_mod on 1995 * head 1996 */ 1997 switch (ref->action) { 1998 case BTRFS_ADD_DELAYED_REF: 1999 case BTRFS_ADD_DELAYED_EXTENT: 2000 locked_ref->ref_mod -= ref->ref_mod; 2001 break; 2002 case BTRFS_DROP_DELAYED_REF: 2003 locked_ref->ref_mod += ref->ref_mod; 2004 break; 2005 default: 2006 WARN_ON(1); 2007 } 2008 atomic_dec(&delayed_refs->num_entries); 2009 2010 /* 2011 * Record the must_insert_reserved flag before we drop the 2012 * spin lock. 
2013 */ 2014 must_insert_reserved = locked_ref->must_insert_reserved; 2015 /* 2016 * Unsetting this on the head ref relinquishes ownership of 2017 * the rsv_bytes, so it is critical that every possible code 2018 * path from here forward frees all reserves including qgroup 2019 * reserve. 2020 */ 2021 locked_ref->must_insert_reserved = false; 2022 2023 extent_op = locked_ref->extent_op; 2024 locked_ref->extent_op = NULL; 2025 spin_unlock(&locked_ref->lock); 2026 2027 ret = run_one_delayed_ref(trans, locked_ref, ref, extent_op, 2028 must_insert_reserved); 2029 btrfs_delayed_refs_rsv_release(fs_info, 1, 0); 2030 *bytes_released += btrfs_calc_delayed_ref_bytes(fs_info, 1); 2031 2032 btrfs_free_delayed_extent_op(extent_op); 2033 if (ret) { 2034 unselect_delayed_ref_head(delayed_refs, locked_ref); 2035 btrfs_put_delayed_ref(ref); 2036 return ret; 2037 } 2038 2039 btrfs_put_delayed_ref(ref); 2040 cond_resched(); 2041 2042 spin_lock(&locked_ref->lock); 2043 btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref); 2044 } 2045 2046 return 0; 2047 } 2048 2049 /* 2050 * Returns 0 on success or if called with an already aborted transaction. 2051 * Returns -ENOMEM or -EIO on failure and will abort the transaction. 2052 */ 2053 static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 2054 u64 min_bytes) 2055 { 2056 struct btrfs_fs_info *fs_info = trans->fs_info; 2057 struct btrfs_delayed_ref_root *delayed_refs; 2058 struct btrfs_delayed_ref_head *locked_ref = NULL; 2059 int ret; 2060 unsigned long count = 0; 2061 unsigned long max_count = 0; 2062 u64 bytes_processed = 0; 2063 2064 delayed_refs = &trans->transaction->delayed_refs; 2065 if (min_bytes == 0) { 2066 max_count = delayed_refs->num_heads_ready; 2067 min_bytes = U64_MAX; 2068 } 2069 2070 do { 2071 if (!locked_ref) { 2072 locked_ref = btrfs_obtain_ref_head(trans); 2073 if (IS_ERR_OR_NULL(locked_ref)) { 2074 if (PTR_ERR(locked_ref) == -EAGAIN) { 2075 continue; 2076 } else { 2077 break; 2078 } 2079 } 2080 count++; 2081 } 2082 /* 2083 * We need to try and merge add/drops of the same ref since we 2084 * can run into issues with relocate dropping the implicit ref 2085 * and then it being added back again before the drop can 2086 * finish. If we merged anything we need to re-loop so we can 2087 * get a good ref. 2088 * Or we can get node references of the same type that weren't 2089 * merged when created due to bumps in the tree mod seq, and 2090 * we need to merge them to prevent adding an inline extent 2091 * backref before dropping it (triggering a BUG_ON at 2092 * insert_inline_extent_backref()). 2093 */ 2094 spin_lock(&locked_ref->lock); 2095 btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref); 2096 2097 ret = btrfs_run_delayed_refs_for_head(trans, locked_ref, &bytes_processed); 2098 if (ret < 0 && ret != -EAGAIN) { 2099 /* 2100 * Error, btrfs_run_delayed_refs_for_head already 2101 * unlocked everything so just bail out 2102 */ 2103 return ret; 2104 } else if (!ret) { 2105 /* 2106 * Success, perform the usual cleanup of a processed 2107 * head 2108 */ 2109 ret = cleanup_ref_head(trans, locked_ref, &bytes_processed); 2110 if (ret > 0 ) { 2111 /* We dropped our lock, we need to loop. 
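				 * cleanup_ref_head() returns > 0 after running
				 * a pending extent op or when new refs showed
				 * up while it re-took the locks; keep this head
				 * and go around again.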
*/ 2112 ret = 0; 2113 continue; 2114 } else if (ret) { 2115 return ret; 2116 } 2117 } 2118 2119 /* 2120 * Either success case or btrfs_run_delayed_refs_for_head 2121 * returned -EAGAIN, meaning we need to select another head 2122 */ 2123 2124 locked_ref = NULL; 2125 cond_resched(); 2126 } while ((min_bytes != U64_MAX && bytes_processed < min_bytes) || 2127 (max_count > 0 && count < max_count) || 2128 locked_ref); 2129 2130 return 0; 2131 } 2132 2133 #ifdef SCRAMBLE_DELAYED_REFS 2134 /* 2135 * Normally delayed refs get processed in ascending bytenr order. This 2136 * correlates in most cases to the order added. To expose dependencies on this 2137 * order, we start to process the tree in the middle instead of the beginning 2138 */ 2139 static u64 find_middle(struct rb_root *root) 2140 { 2141 struct rb_node *n = root->rb_node; 2142 struct btrfs_delayed_ref_node *entry; 2143 int alt = 1; 2144 u64 middle; 2145 u64 first = 0, last = 0; 2146 2147 n = rb_first(root); 2148 if (n) { 2149 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 2150 first = entry->bytenr; 2151 } 2152 n = rb_last(root); 2153 if (n) { 2154 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 2155 last = entry->bytenr; 2156 } 2157 n = root->rb_node; 2158 2159 while (n) { 2160 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 2161 WARN_ON(!entry->in_tree); 2162 2163 middle = entry->bytenr; 2164 2165 if (alt) 2166 n = n->rb_left; 2167 else 2168 n = n->rb_right; 2169 2170 alt = 1 - alt; 2171 } 2172 return middle; 2173 } 2174 #endif 2175 2176 /* 2177 * Start processing the delayed reference count updates and extent insertions 2178 * we have queued up so far. 2179 * 2180 * @trans: Transaction handle. 2181 * @min_bytes: How many bytes of delayed references to process. After this 2182 * many bytes we stop processing delayed references if there are 2183 * any more. If 0 it means to run all existing delayed references, 2184 * but not new ones added after running all existing ones. 2185 * Use (u64)-1 (U64_MAX) to run all existing delayed references 2186 * plus any new ones that are added. 
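 *              For example (illustrative only), a caller that wants to drain
 *              everything, including refs queued while we run, would pass:
 *
 *                      ret = btrfs_run_delayed_refs(trans, U64_MAX);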
2187 * 2188 * Returns 0 on success or if called with an aborted transaction 2189 * Returns <0 on error and aborts the transaction 2190 */ 2191 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, u64 min_bytes) 2192 { 2193 struct btrfs_fs_info *fs_info = trans->fs_info; 2194 struct btrfs_delayed_ref_root *delayed_refs; 2195 int ret; 2196 2197 /* We'll clean this up in btrfs_cleanup_transaction */ 2198 if (TRANS_ABORTED(trans)) 2199 return 0; 2200 2201 if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags)) 2202 return 0; 2203 2204 delayed_refs = &trans->transaction->delayed_refs; 2205 again: 2206 #ifdef SCRAMBLE_DELAYED_REFS 2207 delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); 2208 #endif 2209 ret = __btrfs_run_delayed_refs(trans, min_bytes); 2210 if (ret < 0) { 2211 btrfs_abort_transaction(trans, ret); 2212 return ret; 2213 } 2214 2215 if (min_bytes == U64_MAX) { 2216 btrfs_create_pending_block_groups(trans); 2217 2218 spin_lock(&delayed_refs->lock); 2219 if (RB_EMPTY_ROOT(&delayed_refs->href_root.rb_root)) { 2220 spin_unlock(&delayed_refs->lock); 2221 return 0; 2222 } 2223 spin_unlock(&delayed_refs->lock); 2224 2225 cond_resched(); 2226 goto again; 2227 } 2228 2229 return 0; 2230 } 2231 2232 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, 2233 struct extent_buffer *eb, u64 flags) 2234 { 2235 struct btrfs_delayed_extent_op *extent_op; 2236 int level = btrfs_header_level(eb); 2237 int ret; 2238 2239 extent_op = btrfs_alloc_delayed_extent_op(); 2240 if (!extent_op) 2241 return -ENOMEM; 2242 2243 extent_op->flags_to_set = flags; 2244 extent_op->update_flags = true; 2245 extent_op->update_key = false; 2246 extent_op->level = level; 2247 2248 ret = btrfs_add_delayed_extent_op(trans, eb->start, eb->len, extent_op); 2249 if (ret) 2250 btrfs_free_delayed_extent_op(extent_op); 2251 return ret; 2252 } 2253 2254 static noinline int check_delayed_ref(struct btrfs_root *root, 2255 struct btrfs_path *path, 2256 u64 objectid, u64 offset, u64 bytenr) 2257 { 2258 struct btrfs_delayed_ref_head *head; 2259 struct btrfs_delayed_ref_node *ref; 2260 struct btrfs_delayed_ref_root *delayed_refs; 2261 struct btrfs_transaction *cur_trans; 2262 struct rb_node *node; 2263 int ret = 0; 2264 2265 spin_lock(&root->fs_info->trans_lock); 2266 cur_trans = root->fs_info->running_transaction; 2267 if (cur_trans) 2268 refcount_inc(&cur_trans->use_count); 2269 spin_unlock(&root->fs_info->trans_lock); 2270 if (!cur_trans) 2271 return 0; 2272 2273 delayed_refs = &cur_trans->delayed_refs; 2274 spin_lock(&delayed_refs->lock); 2275 head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); 2276 if (!head) { 2277 spin_unlock(&delayed_refs->lock); 2278 btrfs_put_transaction(cur_trans); 2279 return 0; 2280 } 2281 2282 if (!mutex_trylock(&head->mutex)) { 2283 if (path->nowait) { 2284 spin_unlock(&delayed_refs->lock); 2285 btrfs_put_transaction(cur_trans); 2286 return -EAGAIN; 2287 } 2288 2289 refcount_inc(&head->refs); 2290 spin_unlock(&delayed_refs->lock); 2291 2292 btrfs_release_path(path); 2293 2294 /* 2295 * Mutex was contended, block until it's released and let 2296 * caller try again 2297 */ 2298 mutex_lock(&head->mutex); 2299 mutex_unlock(&head->mutex); 2300 btrfs_put_delayed_ref_head(head); 2301 btrfs_put_transaction(cur_trans); 2302 return -EAGAIN; 2303 } 2304 spin_unlock(&delayed_refs->lock); 2305 2306 spin_lock(&head->lock); 2307 /* 2308 * XXX: We should replace this with a proper search function in the 2309 * future. 
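	 * For now this is a linear walk of every node in the head's ref
	 * rbtree; heads normally carry only a few pending refs, so this is
	 * presumably cheap enough in practice.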
2310 */ 2311 for (node = rb_first_cached(&head->ref_tree); node; 2312 node = rb_next(node)) { 2313 u64 ref_owner; 2314 u64 ref_offset; 2315 2316 ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node); 2317 /* If it's a shared ref we know a cross reference exists */ 2318 if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) { 2319 ret = 1; 2320 break; 2321 } 2322 2323 ref_owner = btrfs_delayed_ref_owner(ref); 2324 ref_offset = btrfs_delayed_ref_offset(ref); 2325 2326 /* 2327 * If our ref doesn't match the one we're currently looking at 2328 * then we have a cross reference. 2329 */ 2330 if (ref->ref_root != btrfs_root_id(root) || 2331 ref_owner != objectid || ref_offset != offset) { 2332 ret = 1; 2333 break; 2334 } 2335 } 2336 spin_unlock(&head->lock); 2337 mutex_unlock(&head->mutex); 2338 btrfs_put_transaction(cur_trans); 2339 return ret; 2340 } 2341 2342 static noinline int check_committed_ref(struct btrfs_root *root, 2343 struct btrfs_path *path, 2344 u64 objectid, u64 offset, u64 bytenr, 2345 bool strict) 2346 { 2347 struct btrfs_fs_info *fs_info = root->fs_info; 2348 struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bytenr); 2349 struct extent_buffer *leaf; 2350 struct btrfs_extent_data_ref *ref; 2351 struct btrfs_extent_inline_ref *iref; 2352 struct btrfs_extent_item *ei; 2353 struct btrfs_key key; 2354 u32 item_size; 2355 u32 expected_size; 2356 int type; 2357 int ret; 2358 2359 key.objectid = bytenr; 2360 key.offset = (u64)-1; 2361 key.type = BTRFS_EXTENT_ITEM_KEY; 2362 2363 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); 2364 if (ret < 0) 2365 goto out; 2366 if (ret == 0) { 2367 /* 2368 * Key with offset -1 found, there would have to exist an extent 2369 * item with such offset, but this is out of the valid range. 2370 */ 2371 ret = -EUCLEAN; 2372 goto out; 2373 } 2374 2375 ret = -ENOENT; 2376 if (path->slots[0] == 0) 2377 goto out; 2378 2379 path->slots[0]--; 2380 leaf = path->nodes[0]; 2381 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2382 2383 if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY) 2384 goto out; 2385 2386 ret = 1; 2387 item_size = btrfs_item_size(leaf, path->slots[0]); 2388 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 2389 expected_size = sizeof(*ei) + btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY); 2390 2391 /* No inline refs; we need to bail before checking for owner ref. */ 2392 if (item_size == sizeof(*ei)) 2393 goto out; 2394 2395 /* Check for an owner ref; skip over it to the real inline refs. */ 2396 iref = (struct btrfs_extent_inline_ref *)(ei + 1); 2397 type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA); 2398 if (btrfs_fs_incompat(fs_info, SIMPLE_QUOTA) && type == BTRFS_EXTENT_OWNER_REF_KEY) { 2399 expected_size += btrfs_extent_inline_ref_size(BTRFS_EXTENT_OWNER_REF_KEY); 2400 iref = (struct btrfs_extent_inline_ref *)(iref + 1); 2401 } 2402 2403 /* If extent item has more than 1 inline ref then it's shared */ 2404 if (item_size != expected_size) 2405 goto out; 2406 2407 /* 2408 * If extent created before last snapshot => it's shared unless the 2409 * snapshot has been deleted. Use the heuristic if strict is false. 
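	 * In strict mode we skip this generation shortcut and rely only on
	 * the backref checks below.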
2410 */ 2411 if (!strict && 2412 (btrfs_extent_generation(leaf, ei) <= 2413 btrfs_root_last_snapshot(&root->root_item))) 2414 goto out; 2415 2416 /* If this extent has SHARED_DATA_REF then it's shared */ 2417 type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA); 2418 if (type != BTRFS_EXTENT_DATA_REF_KEY) 2419 goto out; 2420 2421 ref = (struct btrfs_extent_data_ref *)(&iref->offset); 2422 if (btrfs_extent_refs(leaf, ei) != 2423 btrfs_extent_data_ref_count(leaf, ref) || 2424 btrfs_extent_data_ref_root(leaf, ref) != btrfs_root_id(root) || 2425 btrfs_extent_data_ref_objectid(leaf, ref) != objectid || 2426 btrfs_extent_data_ref_offset(leaf, ref) != offset) 2427 goto out; 2428 2429 ret = 0; 2430 out: 2431 return ret; 2432 } 2433 2434 int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset, 2435 u64 bytenr, bool strict, struct btrfs_path *path) 2436 { 2437 int ret; 2438 2439 do { 2440 ret = check_committed_ref(root, path, objectid, 2441 offset, bytenr, strict); 2442 if (ret && ret != -ENOENT) 2443 goto out; 2444 2445 ret = check_delayed_ref(root, path, objectid, offset, bytenr); 2446 } while (ret == -EAGAIN); 2447 2448 out: 2449 btrfs_release_path(path); 2450 if (btrfs_is_data_reloc_root(root)) 2451 WARN_ON(ret > 0); 2452 return ret; 2453 } 2454 2455 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, 2456 struct btrfs_root *root, 2457 struct extent_buffer *buf, 2458 int full_backref, int inc) 2459 { 2460 struct btrfs_fs_info *fs_info = root->fs_info; 2461 u64 parent; 2462 u64 ref_root; 2463 u32 nritems; 2464 struct btrfs_key key; 2465 struct btrfs_file_extent_item *fi; 2466 bool for_reloc = btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC); 2467 int i; 2468 int action; 2469 int level; 2470 int ret = 0; 2471 2472 if (btrfs_is_testing(fs_info)) 2473 return 0; 2474 2475 ref_root = btrfs_header_owner(buf); 2476 nritems = btrfs_header_nritems(buf); 2477 level = btrfs_header_level(buf); 2478 2479 if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && level == 0) 2480 return 0; 2481 2482 if (full_backref) 2483 parent = buf->start; 2484 else 2485 parent = 0; 2486 if (inc) 2487 action = BTRFS_ADD_DELAYED_REF; 2488 else 2489 action = BTRFS_DROP_DELAYED_REF; 2490 2491 for (i = 0; i < nritems; i++) { 2492 struct btrfs_ref ref = { 2493 .action = action, 2494 .parent = parent, 2495 .ref_root = ref_root, 2496 }; 2497 2498 if (level == 0) { 2499 btrfs_item_key_to_cpu(buf, &key, i); 2500 if (key.type != BTRFS_EXTENT_DATA_KEY) 2501 continue; 2502 fi = btrfs_item_ptr(buf, i, 2503 struct btrfs_file_extent_item); 2504 if (btrfs_file_extent_type(buf, fi) == 2505 BTRFS_FILE_EXTENT_INLINE) 2506 continue; 2507 ref.bytenr = btrfs_file_extent_disk_bytenr(buf, fi); 2508 if (ref.bytenr == 0) 2509 continue; 2510 2511 ref.num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi); 2512 ref.owning_root = ref_root; 2513 2514 key.offset -= btrfs_file_extent_offset(buf, fi); 2515 btrfs_init_data_ref(&ref, key.objectid, key.offset, 2516 btrfs_root_id(root), for_reloc); 2517 if (inc) 2518 ret = btrfs_inc_extent_ref(trans, &ref); 2519 else 2520 ret = btrfs_free_extent(trans, &ref); 2521 if (ret) 2522 goto fail; 2523 } else { 2524 /* We don't know the owning_root, leave as 0. 
*/ 2525 ref.bytenr = btrfs_node_blockptr(buf, i); 2526 ref.num_bytes = fs_info->nodesize; 2527 2528 btrfs_init_tree_ref(&ref, level - 1, 2529 btrfs_root_id(root), for_reloc); 2530 if (inc) 2531 ret = btrfs_inc_extent_ref(trans, &ref); 2532 else 2533 ret = btrfs_free_extent(trans, &ref); 2534 if (ret) 2535 goto fail; 2536 } 2537 } 2538 return 0; 2539 fail: 2540 return ret; 2541 } 2542 2543 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 2544 struct extent_buffer *buf, int full_backref) 2545 { 2546 return __btrfs_mod_ref(trans, root, buf, full_backref, 1); 2547 } 2548 2549 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 2550 struct extent_buffer *buf, int full_backref) 2551 { 2552 return __btrfs_mod_ref(trans, root, buf, full_backref, 0); 2553 } 2554 2555 static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data) 2556 { 2557 struct btrfs_fs_info *fs_info = root->fs_info; 2558 u64 flags; 2559 u64 ret; 2560 2561 if (data) 2562 flags = BTRFS_BLOCK_GROUP_DATA; 2563 else if (root == fs_info->chunk_root) 2564 flags = BTRFS_BLOCK_GROUP_SYSTEM; 2565 else 2566 flags = BTRFS_BLOCK_GROUP_METADATA; 2567 2568 ret = btrfs_get_alloc_profile(fs_info, flags); 2569 return ret; 2570 } 2571 2572 static u64 first_logical_byte(struct btrfs_fs_info *fs_info) 2573 { 2574 struct rb_node *leftmost; 2575 u64 bytenr = 0; 2576 2577 read_lock(&fs_info->block_group_cache_lock); 2578 /* Get the block group with the lowest logical start address. */ 2579 leftmost = rb_first_cached(&fs_info->block_group_cache_tree); 2580 if (leftmost) { 2581 struct btrfs_block_group *bg; 2582 2583 bg = rb_entry(leftmost, struct btrfs_block_group, cache_node); 2584 bytenr = bg->start; 2585 } 2586 read_unlock(&fs_info->block_group_cache_lock); 2587 2588 return bytenr; 2589 } 2590 2591 static int pin_down_extent(struct btrfs_trans_handle *trans, 2592 struct btrfs_block_group *cache, 2593 u64 bytenr, u64 num_bytes, int reserved) 2594 { 2595 struct btrfs_fs_info *fs_info = cache->fs_info; 2596 2597 spin_lock(&cache->space_info->lock); 2598 spin_lock(&cache->lock); 2599 cache->pinned += num_bytes; 2600 btrfs_space_info_update_bytes_pinned(fs_info, cache->space_info, 2601 num_bytes); 2602 if (reserved) { 2603 cache->reserved -= num_bytes; 2604 cache->space_info->bytes_reserved -= num_bytes; 2605 } 2606 spin_unlock(&cache->lock); 2607 spin_unlock(&cache->space_info->lock); 2608 2609 set_extent_bit(&trans->transaction->pinned_extents, bytenr, 2610 bytenr + num_bytes - 1, EXTENT_DIRTY, NULL); 2611 return 0; 2612 } 2613 2614 int btrfs_pin_extent(struct btrfs_trans_handle *trans, 2615 u64 bytenr, u64 num_bytes, int reserved) 2616 { 2617 struct btrfs_block_group *cache; 2618 2619 cache = btrfs_lookup_block_group(trans->fs_info, bytenr); 2620 BUG_ON(!cache); /* Logic error */ 2621 2622 pin_down_extent(trans, cache, bytenr, num_bytes, reserved); 2623 2624 btrfs_put_block_group(cache); 2625 return 0; 2626 } 2627 2628 int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, 2629 const struct extent_buffer *eb) 2630 { 2631 struct btrfs_block_group *cache; 2632 int ret; 2633 2634 cache = btrfs_lookup_block_group(trans->fs_info, eb->start); 2635 if (!cache) 2636 return -EINVAL; 2637 2638 /* 2639 * Fully cache the free space first so that our pin removes the free space 2640 * from the cache. 
2641 */ 2642 ret = btrfs_cache_block_group(cache, true); 2643 if (ret) 2644 goto out; 2645 2646 pin_down_extent(trans, cache, eb->start, eb->len, 0); 2647 2648 /* remove us from the free space cache (if we're there at all) */ 2649 ret = btrfs_remove_free_space(cache, eb->start, eb->len); 2650 out: 2651 btrfs_put_block_group(cache); 2652 return ret; 2653 } 2654 2655 static int __exclude_logged_extent(struct btrfs_fs_info *fs_info, 2656 u64 start, u64 num_bytes) 2657 { 2658 int ret; 2659 struct btrfs_block_group *block_group; 2660 2661 block_group = btrfs_lookup_block_group(fs_info, start); 2662 if (!block_group) 2663 return -EINVAL; 2664 2665 ret = btrfs_cache_block_group(block_group, true); 2666 if (ret) 2667 goto out; 2668 2669 ret = btrfs_remove_free_space(block_group, start, num_bytes); 2670 out: 2671 btrfs_put_block_group(block_group); 2672 return ret; 2673 } 2674 2675 int btrfs_exclude_logged_extents(struct extent_buffer *eb) 2676 { 2677 struct btrfs_fs_info *fs_info = eb->fs_info; 2678 struct btrfs_file_extent_item *item; 2679 struct btrfs_key key; 2680 int found_type; 2681 int i; 2682 int ret = 0; 2683 2684 if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) 2685 return 0; 2686 2687 for (i = 0; i < btrfs_header_nritems(eb); i++) { 2688 btrfs_item_key_to_cpu(eb, &key, i); 2689 if (key.type != BTRFS_EXTENT_DATA_KEY) 2690 continue; 2691 item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item); 2692 found_type = btrfs_file_extent_type(eb, item); 2693 if (found_type == BTRFS_FILE_EXTENT_INLINE) 2694 continue; 2695 if (btrfs_file_extent_disk_bytenr(eb, item) == 0) 2696 continue; 2697 key.objectid = btrfs_file_extent_disk_bytenr(eb, item); 2698 key.offset = btrfs_file_extent_disk_num_bytes(eb, item); 2699 ret = __exclude_logged_extent(fs_info, key.objectid, key.offset); 2700 if (ret) 2701 break; 2702 } 2703 2704 return ret; 2705 } 2706 2707 static void 2708 btrfs_inc_block_group_reservations(struct btrfs_block_group *bg) 2709 { 2710 atomic_inc(&bg->reservations); 2711 } 2712 2713 /* 2714 * Returns the free cluster for the given space info and sets empty_cluster to 2715 * what it should be based on the mount options. 
2716 */ 2717 static struct btrfs_free_cluster * 2718 fetch_cluster_info(struct btrfs_fs_info *fs_info, 2719 struct btrfs_space_info *space_info, u64 *empty_cluster) 2720 { 2721 struct btrfs_free_cluster *ret = NULL; 2722 2723 *empty_cluster = 0; 2724 if (btrfs_mixed_space_info(space_info)) 2725 return ret; 2726 2727 if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { 2728 ret = &fs_info->meta_alloc_cluster; 2729 if (btrfs_test_opt(fs_info, SSD)) 2730 *empty_cluster = SZ_2M; 2731 else 2732 *empty_cluster = SZ_64K; 2733 } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) && 2734 btrfs_test_opt(fs_info, SSD_SPREAD)) { 2735 *empty_cluster = SZ_2M; 2736 ret = &fs_info->data_alloc_cluster; 2737 } 2738 2739 return ret; 2740 } 2741 2742 static int unpin_extent_range(struct btrfs_fs_info *fs_info, 2743 u64 start, u64 end, 2744 const bool return_free_space) 2745 { 2746 struct btrfs_block_group *cache = NULL; 2747 struct btrfs_space_info *space_info; 2748 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 2749 struct btrfs_free_cluster *cluster = NULL; 2750 u64 len; 2751 u64 total_unpinned = 0; 2752 u64 empty_cluster = 0; 2753 bool readonly; 2754 int ret = 0; 2755 2756 while (start <= end) { 2757 readonly = false; 2758 if (!cache || 2759 start >= cache->start + cache->length) { 2760 if (cache) 2761 btrfs_put_block_group(cache); 2762 total_unpinned = 0; 2763 cache = btrfs_lookup_block_group(fs_info, start); 2764 if (cache == NULL) { 2765 /* Logic error, something removed the block group. */ 2766 ret = -EUCLEAN; 2767 goto out; 2768 } 2769 2770 cluster = fetch_cluster_info(fs_info, 2771 cache->space_info, 2772 &empty_cluster); 2773 empty_cluster <<= 1; 2774 } 2775 2776 len = cache->start + cache->length - start; 2777 len = min(len, end + 1 - start); 2778 2779 if (return_free_space) 2780 btrfs_add_free_space(cache, start, len); 2781 2782 start += len; 2783 total_unpinned += len; 2784 space_info = cache->space_info; 2785 2786 /* 2787 * If this space cluster has been marked as fragmented and we've 2788 * unpinned enough in this block group to potentially allow a 2789 * cluster to be created inside of it go ahead and clear the 2790 * fragmented check. 
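		 * (The fragmented flag is set by find_free_extent_unclustered()
		 * when clustered allocation has to be abandoned.)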
2791 */ 2792 if (cluster && cluster->fragmented && 2793 total_unpinned > empty_cluster) { 2794 spin_lock(&cluster->lock); 2795 cluster->fragmented = 0; 2796 spin_unlock(&cluster->lock); 2797 } 2798 2799 spin_lock(&space_info->lock); 2800 spin_lock(&cache->lock); 2801 cache->pinned -= len; 2802 btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len); 2803 space_info->max_extent_size = 0; 2804 if (cache->ro) { 2805 space_info->bytes_readonly += len; 2806 readonly = true; 2807 } else if (btrfs_is_zoned(fs_info)) { 2808 /* Need reset before reusing in a zoned block group */ 2809 space_info->bytes_zone_unusable += len; 2810 readonly = true; 2811 } 2812 spin_unlock(&cache->lock); 2813 if (!readonly && return_free_space && 2814 global_rsv->space_info == space_info) { 2815 spin_lock(&global_rsv->lock); 2816 if (!global_rsv->full) { 2817 u64 to_add = min(len, global_rsv->size - 2818 global_rsv->reserved); 2819 2820 global_rsv->reserved += to_add; 2821 btrfs_space_info_update_bytes_may_use(fs_info, 2822 space_info, to_add); 2823 if (global_rsv->reserved >= global_rsv->size) 2824 global_rsv->full = 1; 2825 len -= to_add; 2826 } 2827 spin_unlock(&global_rsv->lock); 2828 } 2829 /* Add to any tickets we may have */ 2830 if (!readonly && return_free_space && len) 2831 btrfs_try_granting_tickets(fs_info, space_info); 2832 spin_unlock(&space_info->lock); 2833 } 2834 2835 if (cache) 2836 btrfs_put_block_group(cache); 2837 out: 2838 return ret; 2839 } 2840 2841 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) 2842 { 2843 struct btrfs_fs_info *fs_info = trans->fs_info; 2844 struct btrfs_block_group *block_group, *tmp; 2845 struct list_head *deleted_bgs; 2846 struct extent_io_tree *unpin; 2847 u64 start; 2848 u64 end; 2849 int ret; 2850 2851 unpin = &trans->transaction->pinned_extents; 2852 2853 while (!TRANS_ABORTED(trans)) { 2854 struct extent_state *cached_state = NULL; 2855 2856 mutex_lock(&fs_info->unused_bg_unpin_mutex); 2857 if (!find_first_extent_bit(unpin, 0, &start, &end, 2858 EXTENT_DIRTY, &cached_state)) { 2859 mutex_unlock(&fs_info->unused_bg_unpin_mutex); 2860 break; 2861 } 2862 2863 if (btrfs_test_opt(fs_info, DISCARD_SYNC)) 2864 ret = btrfs_discard_extent(fs_info, start, 2865 end + 1 - start, NULL); 2866 2867 clear_extent_dirty(unpin, start, end, &cached_state); 2868 ret = unpin_extent_range(fs_info, start, end, true); 2869 BUG_ON(ret); 2870 mutex_unlock(&fs_info->unused_bg_unpin_mutex); 2871 free_extent_state(cached_state); 2872 cond_resched(); 2873 } 2874 2875 if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) { 2876 btrfs_discard_calc_delay(&fs_info->discard_ctl); 2877 btrfs_discard_schedule_work(&fs_info->discard_ctl, true); 2878 } 2879 2880 /* 2881 * Transaction is finished. We don't need the lock anymore. We 2882 * do need to clean up the block groups in case of a transaction 2883 * abort. 
2884 */ 2885 deleted_bgs = &trans->transaction->deleted_bgs; 2886 list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) { 2887 u64 trimmed = 0; 2888 2889 ret = -EROFS; 2890 if (!TRANS_ABORTED(trans)) 2891 ret = btrfs_discard_extent(fs_info, 2892 block_group->start, 2893 block_group->length, 2894 &trimmed); 2895 2896 list_del_init(&block_group->bg_list); 2897 btrfs_unfreeze_block_group(block_group); 2898 btrfs_put_block_group(block_group); 2899 2900 if (ret) { 2901 const char *errstr = btrfs_decode_error(ret); 2902 btrfs_warn(fs_info, 2903 "discard failed while removing blockgroup: errno=%d %s", 2904 ret, errstr); 2905 } 2906 } 2907 2908 return 0; 2909 } 2910 2911 /* 2912 * Parse an extent item's inline extents looking for a simple quotas owner ref. 2913 * 2914 * @fs_info: the btrfs_fs_info for this mount 2915 * @leaf: a leaf in the extent tree containing the extent item 2916 * @slot: the slot in the leaf where the extent item is found 2917 * 2918 * Returns the objectid of the root that originally allocated the extent item 2919 * if the inline owner ref is expected and present, otherwise 0. 2920 * 2921 * If an extent item has an owner ref item, it will be the first inline ref 2922 * item. Therefore the logic is to check whether there are any inline ref 2923 * items, then check the type of the first one. 2924 */ 2925 u64 btrfs_get_extent_owner_root(struct btrfs_fs_info *fs_info, 2926 struct extent_buffer *leaf, int slot) 2927 { 2928 struct btrfs_extent_item *ei; 2929 struct btrfs_extent_inline_ref *iref; 2930 struct btrfs_extent_owner_ref *oref; 2931 unsigned long ptr; 2932 unsigned long end; 2933 int type; 2934 2935 if (!btrfs_fs_incompat(fs_info, SIMPLE_QUOTA)) 2936 return 0; 2937 2938 ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); 2939 ptr = (unsigned long)(ei + 1); 2940 end = (unsigned long)ei + btrfs_item_size(leaf, slot); 2941 2942 /* No inline ref items of any kind, can't check type. */ 2943 if (ptr == end) 2944 return 0; 2945 2946 iref = (struct btrfs_extent_inline_ref *)ptr; 2947 type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY); 2948 2949 /* We found an owner ref, get the root out of it. */ 2950 if (type == BTRFS_EXTENT_OWNER_REF_KEY) { 2951 oref = (struct btrfs_extent_owner_ref *)(&iref->offset); 2952 return btrfs_extent_owner_ref_root_id(leaf, oref); 2953 } 2954 2955 /* We have inline refs, but not an owner ref. 
*/ 2956 return 0; 2957 } 2958 2959 static int do_free_extent_accounting(struct btrfs_trans_handle *trans, 2960 u64 bytenr, struct btrfs_squota_delta *delta) 2961 { 2962 int ret; 2963 u64 num_bytes = delta->num_bytes; 2964 2965 if (delta->is_data) { 2966 struct btrfs_root *csum_root; 2967 2968 csum_root = btrfs_csum_root(trans->fs_info, bytenr); 2969 ret = btrfs_del_csums(trans, csum_root, bytenr, num_bytes); 2970 if (ret) { 2971 btrfs_abort_transaction(trans, ret); 2972 return ret; 2973 } 2974 2975 ret = btrfs_delete_raid_extent(trans, bytenr, num_bytes); 2976 if (ret) { 2977 btrfs_abort_transaction(trans, ret); 2978 return ret; 2979 } 2980 } 2981 2982 ret = btrfs_record_squota_delta(trans->fs_info, delta); 2983 if (ret) { 2984 btrfs_abort_transaction(trans, ret); 2985 return ret; 2986 } 2987 2988 ret = add_to_free_space_tree(trans, bytenr, num_bytes); 2989 if (ret) { 2990 btrfs_abort_transaction(trans, ret); 2991 return ret; 2992 } 2993 2994 ret = btrfs_update_block_group(trans, bytenr, num_bytes, false); 2995 if (ret) 2996 btrfs_abort_transaction(trans, ret); 2997 2998 return ret; 2999 } 3000 3001 #define abort_and_dump(trans, path, fmt, args...) \ 3002 ({ \ 3003 btrfs_abort_transaction(trans, -EUCLEAN); \ 3004 btrfs_print_leaf(path->nodes[0]); \ 3005 btrfs_crit(trans->fs_info, fmt, ##args); \ 3006 }) 3007 3008 /* 3009 * Drop one or more refs of @node. 3010 * 3011 * 1. Locate the extent refs. 3012 * It's either inline in EXTENT/METADATA_ITEM or in keyed SHARED_* item. 3013 * Locate it, then reduce the refs number or remove the ref line completely. 3014 * 3015 * 2. Update the refs count in EXTENT/METADATA_ITEM 3016 * 3017 * Inline backref case: 3018 * 3019 * in extent tree we have: 3020 * 3021 * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82 3022 * refs 2 gen 6 flags DATA 3023 * extent data backref root FS_TREE objectid 258 offset 0 count 1 3024 * extent data backref root FS_TREE objectid 257 offset 0 count 1 3025 * 3026 * This function gets called with: 3027 * 3028 * node->bytenr = 13631488 3029 * node->num_bytes = 1048576 3030 * root_objectid = FS_TREE 3031 * owner_objectid = 257 3032 * owner_offset = 0 3033 * refs_to_drop = 1 3034 * 3035 * Then we should get some like: 3036 * 3037 * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82 3038 * refs 1 gen 6 flags DATA 3039 * extent data backref root FS_TREE objectid 258 offset 0 count 1 3040 * 3041 * Keyed backref case: 3042 * 3043 * in extent tree we have: 3044 * 3045 * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24 3046 * refs 754 gen 6 flags DATA 3047 * [...] 3048 * item 2 key (13631488 EXTENT_DATA_REF <HASH>) itemoff 3915 itemsize 28 3049 * extent data backref root FS_TREE objectid 866 offset 0 count 1 3050 * 3051 * This function get called with: 3052 * 3053 * node->bytenr = 13631488 3054 * node->num_bytes = 1048576 3055 * root_objectid = FS_TREE 3056 * owner_objectid = 866 3057 * owner_offset = 0 3058 * refs_to_drop = 1 3059 * 3060 * Then we should get some like: 3061 * 3062 * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24 3063 * refs 753 gen 6 flags DATA 3064 * 3065 * And that (13631488 EXTENT_DATA_REF <HASH>) gets removed. 
3066 */ 3067 static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 3068 struct btrfs_delayed_ref_head *href, 3069 struct btrfs_delayed_ref_node *node, 3070 struct btrfs_delayed_extent_op *extent_op) 3071 { 3072 struct btrfs_fs_info *info = trans->fs_info; 3073 struct btrfs_key key; 3074 struct btrfs_path *path; 3075 struct btrfs_root *extent_root; 3076 struct extent_buffer *leaf; 3077 struct btrfs_extent_item *ei; 3078 struct btrfs_extent_inline_ref *iref; 3079 int ret; 3080 int is_data; 3081 int extent_slot = 0; 3082 int found_extent = 0; 3083 int num_to_del = 1; 3084 int refs_to_drop = node->ref_mod; 3085 u32 item_size; 3086 u64 refs; 3087 u64 bytenr = node->bytenr; 3088 u64 num_bytes = node->num_bytes; 3089 u64 owner_objectid = btrfs_delayed_ref_owner(node); 3090 u64 owner_offset = btrfs_delayed_ref_offset(node); 3091 bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA); 3092 u64 delayed_ref_root = href->owning_root; 3093 3094 extent_root = btrfs_extent_root(info, bytenr); 3095 ASSERT(extent_root); 3096 3097 path = btrfs_alloc_path(); 3098 if (!path) 3099 return -ENOMEM; 3100 3101 is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID; 3102 3103 if (!is_data && refs_to_drop != 1) { 3104 btrfs_crit(info, 3105 "invalid refs_to_drop, dropping more than 1 refs for tree block %llu refs_to_drop %u", 3106 node->bytenr, refs_to_drop); 3107 ret = -EINVAL; 3108 btrfs_abort_transaction(trans, ret); 3109 goto out; 3110 } 3111 3112 if (is_data) 3113 skinny_metadata = false; 3114 3115 ret = lookup_extent_backref(trans, path, &iref, bytenr, num_bytes, 3116 node->parent, node->ref_root, owner_objectid, 3117 owner_offset); 3118 if (ret == 0) { 3119 /* 3120 * Either the inline backref or the SHARED_DATA_REF/ 3121 * SHARED_BLOCK_REF is found 3122 * 3123 * Here is a quick path to locate EXTENT/METADATA_ITEM. 3124 * It's possible the EXTENT/METADATA_ITEM is near current slot. 3125 */ 3126 extent_slot = path->slots[0]; 3127 while (extent_slot >= 0) { 3128 btrfs_item_key_to_cpu(path->nodes[0], &key, 3129 extent_slot); 3130 if (key.objectid != bytenr) 3131 break; 3132 if (key.type == BTRFS_EXTENT_ITEM_KEY && 3133 key.offset == num_bytes) { 3134 found_extent = 1; 3135 break; 3136 } 3137 if (key.type == BTRFS_METADATA_ITEM_KEY && 3138 key.offset == owner_objectid) { 3139 found_extent = 1; 3140 break; 3141 } 3142 3143 /* Quick path didn't find the EXTEMT/METADATA_ITEM */ 3144 if (path->slots[0] - extent_slot > 5) 3145 break; 3146 extent_slot--; 3147 } 3148 3149 if (!found_extent) { 3150 if (iref) { 3151 abort_and_dump(trans, path, 3152 "invalid iref slot %u, no EXTENT/METADATA_ITEM found but has inline extent ref", 3153 path->slots[0]); 3154 ret = -EUCLEAN; 3155 goto out; 3156 } 3157 /* Must be SHARED_* item, remove the backref first */ 3158 ret = remove_extent_backref(trans, extent_root, path, 3159 NULL, refs_to_drop, is_data); 3160 if (ret) { 3161 btrfs_abort_transaction(trans, ret); 3162 goto out; 3163 } 3164 btrfs_release_path(path); 3165 3166 /* Slow path to locate EXTENT/METADATA_ITEM */ 3167 key.objectid = bytenr; 3168 key.type = BTRFS_EXTENT_ITEM_KEY; 3169 key.offset = num_bytes; 3170 3171 if (!is_data && skinny_metadata) { 3172 key.type = BTRFS_METADATA_ITEM_KEY; 3173 key.offset = owner_objectid; 3174 } 3175 3176 ret = btrfs_search_slot(trans, extent_root, 3177 &key, path, -1, 1); 3178 if (ret > 0 && skinny_metadata && path->slots[0]) { 3179 /* 3180 * Couldn't find our skinny metadata item, 3181 * see if we have ye olde extent item. 
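				 * (Skinny items are keyed (bytenr,
				 * METADATA_ITEM, level) while old style ones
				 * use (bytenr, EXTENT_ITEM, nodesize), which is
				 * what we fall back to below.)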
3182 */ 3183 path->slots[0]--; 3184 btrfs_item_key_to_cpu(path->nodes[0], &key, 3185 path->slots[0]); 3186 if (key.objectid == bytenr && 3187 key.type == BTRFS_EXTENT_ITEM_KEY && 3188 key.offset == num_bytes) 3189 ret = 0; 3190 } 3191 3192 if (ret > 0 && skinny_metadata) { 3193 skinny_metadata = false; 3194 key.objectid = bytenr; 3195 key.type = BTRFS_EXTENT_ITEM_KEY; 3196 key.offset = num_bytes; 3197 btrfs_release_path(path); 3198 ret = btrfs_search_slot(trans, extent_root, 3199 &key, path, -1, 1); 3200 } 3201 3202 if (ret) { 3203 if (ret > 0) 3204 btrfs_print_leaf(path->nodes[0]); 3205 btrfs_err(info, 3206 "umm, got %d back from search, was looking for %llu, slot %d", 3207 ret, bytenr, path->slots[0]); 3208 } 3209 if (ret < 0) { 3210 btrfs_abort_transaction(trans, ret); 3211 goto out; 3212 } 3213 extent_slot = path->slots[0]; 3214 } 3215 } else if (WARN_ON(ret == -ENOENT)) { 3216 abort_and_dump(trans, path, 3217 "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu slot %d", 3218 bytenr, node->parent, node->ref_root, owner_objectid, 3219 owner_offset, path->slots[0]); 3220 goto out; 3221 } else { 3222 btrfs_abort_transaction(trans, ret); 3223 goto out; 3224 } 3225 3226 leaf = path->nodes[0]; 3227 item_size = btrfs_item_size(leaf, extent_slot); 3228 if (unlikely(item_size < sizeof(*ei))) { 3229 ret = -EUCLEAN; 3230 btrfs_err(trans->fs_info, 3231 "unexpected extent item size, has %u expect >= %zu", 3232 item_size, sizeof(*ei)); 3233 btrfs_abort_transaction(trans, ret); 3234 goto out; 3235 } 3236 ei = btrfs_item_ptr(leaf, extent_slot, 3237 struct btrfs_extent_item); 3238 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID && 3239 key.type == BTRFS_EXTENT_ITEM_KEY) { 3240 struct btrfs_tree_block_info *bi; 3241 3242 if (item_size < sizeof(*ei) + sizeof(*bi)) { 3243 abort_and_dump(trans, path, 3244 "invalid extent item size for key (%llu, %u, %llu) slot %u owner %llu, has %u expect >= %zu", 3245 key.objectid, key.type, key.offset, 3246 path->slots[0], owner_objectid, item_size, 3247 sizeof(*ei) + sizeof(*bi)); 3248 ret = -EUCLEAN; 3249 goto out; 3250 } 3251 bi = (struct btrfs_tree_block_info *)(ei + 1); 3252 WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi)); 3253 } 3254 3255 refs = btrfs_extent_refs(leaf, ei); 3256 if (refs < refs_to_drop) { 3257 abort_and_dump(trans, path, 3258 "trying to drop %d refs but we only have %llu for bytenr %llu slot %u", 3259 refs_to_drop, refs, bytenr, path->slots[0]); 3260 ret = -EUCLEAN; 3261 goto out; 3262 } 3263 refs -= refs_to_drop; 3264 3265 if (refs > 0) { 3266 if (extent_op) 3267 __run_delayed_extent_op(extent_op, leaf, ei); 3268 /* 3269 * In the case of inline back ref, reference count will 3270 * be updated by remove_extent_backref 3271 */ 3272 if (iref) { 3273 if (!found_extent) { 3274 abort_and_dump(trans, path, 3275 "invalid iref, got inlined extent ref but no EXTENT/METADATA_ITEM found, slot %u", 3276 path->slots[0]); 3277 ret = -EUCLEAN; 3278 goto out; 3279 } 3280 } else { 3281 btrfs_set_extent_refs(leaf, ei, refs); 3282 btrfs_mark_buffer_dirty(trans, leaf); 3283 } 3284 if (found_extent) { 3285 ret = remove_extent_backref(trans, extent_root, path, 3286 iref, refs_to_drop, is_data); 3287 if (ret) { 3288 btrfs_abort_transaction(trans, ret); 3289 goto out; 3290 } 3291 } 3292 } else { 3293 struct btrfs_squota_delta delta = { 3294 .root = delayed_ref_root, 3295 .num_bytes = num_bytes, 3296 .is_data = is_data, 3297 .is_inc = false, 3298 .generation = btrfs_extent_generation(leaf, ei), 3299 }; 3300 3301 /* In this branch refs 
== 1 */ 3302 if (found_extent) { 3303 if (is_data && refs_to_drop != 3304 extent_data_ref_count(path, iref)) { 3305 abort_and_dump(trans, path, 3306 "invalid refs_to_drop, current refs %u refs_to_drop %u slot %u", 3307 extent_data_ref_count(path, iref), 3308 refs_to_drop, path->slots[0]); 3309 ret = -EUCLEAN; 3310 goto out; 3311 } 3312 if (iref) { 3313 if (path->slots[0] != extent_slot) { 3314 abort_and_dump(trans, path, 3315 "invalid iref, extent item key (%llu %u %llu) slot %u doesn't have wanted iref", 3316 key.objectid, key.type, 3317 key.offset, path->slots[0]); 3318 ret = -EUCLEAN; 3319 goto out; 3320 } 3321 } else { 3322 /* 3323 * No inline ref, we must be at SHARED_* item, 3324 * And it's single ref, it must be: 3325 * | extent_slot ||extent_slot + 1| 3326 * [ EXTENT/METADATA_ITEM ][ SHARED_* ITEM ] 3327 */ 3328 if (path->slots[0] != extent_slot + 1) { 3329 abort_and_dump(trans, path, 3330 "invalid SHARED_* item slot %u, previous item is not EXTENT/METADATA_ITEM", 3331 path->slots[0]); 3332 ret = -EUCLEAN; 3333 goto out; 3334 } 3335 path->slots[0] = extent_slot; 3336 num_to_del = 2; 3337 } 3338 } 3339 /* 3340 * We can't infer the data owner from the delayed ref, so we need 3341 * to try to get it from the owning ref item. 3342 * 3343 * If it is not present, then that extent was not written under 3344 * simple quotas mode, so we don't need to account for its deletion. 3345 */ 3346 if (is_data) 3347 delta.root = btrfs_get_extent_owner_root(trans->fs_info, 3348 leaf, extent_slot); 3349 3350 ret = btrfs_del_items(trans, extent_root, path, path->slots[0], 3351 num_to_del); 3352 if (ret) { 3353 btrfs_abort_transaction(trans, ret); 3354 goto out; 3355 } 3356 btrfs_release_path(path); 3357 3358 ret = do_free_extent_accounting(trans, bytenr, &delta); 3359 } 3360 btrfs_release_path(path); 3361 3362 out: 3363 btrfs_free_path(path); 3364 return ret; 3365 } 3366 3367 /* 3368 * when we free an block, it is possible (and likely) that we free the last 3369 * delayed ref for that extent as well. This searches the delayed ref tree for 3370 * a given extent, and if there are no other delayed refs to be processed, it 3371 * removes it from the tree. 3372 */ 3373 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, 3374 u64 bytenr) 3375 { 3376 struct btrfs_delayed_ref_head *head; 3377 struct btrfs_delayed_ref_root *delayed_refs; 3378 int ret = 0; 3379 3380 delayed_refs = &trans->transaction->delayed_refs; 3381 spin_lock(&delayed_refs->lock); 3382 head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); 3383 if (!head) 3384 goto out_delayed_unlock; 3385 3386 spin_lock(&head->lock); 3387 if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root)) 3388 goto out; 3389 3390 if (cleanup_extent_op(head) != NULL) 3391 goto out; 3392 3393 /* 3394 * waiting for the lock here would deadlock. 
If someone else has it 3395 * locked they are already in the process of dropping it anyway 3396 */ 3397 if (!mutex_trylock(&head->mutex)) 3398 goto out; 3399 3400 btrfs_delete_ref_head(delayed_refs, head); 3401 head->processing = false; 3402 3403 spin_unlock(&head->lock); 3404 spin_unlock(&delayed_refs->lock); 3405 3406 BUG_ON(head->extent_op); 3407 if (head->must_insert_reserved) 3408 ret = 1; 3409 3410 btrfs_cleanup_ref_head_accounting(trans->fs_info, delayed_refs, head); 3411 mutex_unlock(&head->mutex); 3412 btrfs_put_delayed_ref_head(head); 3413 return ret; 3414 out: 3415 spin_unlock(&head->lock); 3416 3417 out_delayed_unlock: 3418 spin_unlock(&delayed_refs->lock); 3419 return 0; 3420 } 3421 3422 void btrfs_free_tree_block(struct btrfs_trans_handle *trans, 3423 u64 root_id, 3424 struct extent_buffer *buf, 3425 u64 parent, int last_ref) 3426 { 3427 struct btrfs_fs_info *fs_info = trans->fs_info; 3428 struct btrfs_block_group *bg; 3429 int ret; 3430 3431 if (root_id != BTRFS_TREE_LOG_OBJECTID) { 3432 struct btrfs_ref generic_ref = { 3433 .action = BTRFS_DROP_DELAYED_REF, 3434 .bytenr = buf->start, 3435 .num_bytes = buf->len, 3436 .parent = parent, 3437 .owning_root = btrfs_header_owner(buf), 3438 .ref_root = root_id, 3439 }; 3440 3441 /* 3442 * Assert that the extent buffer is not cleared due to 3443 * EXTENT_BUFFER_ZONED_ZEROOUT. Please refer 3444 * btrfs_clear_buffer_dirty() and btree_csum_one_bio() for 3445 * detail. 3446 */ 3447 ASSERT(btrfs_header_bytenr(buf) != 0); 3448 3449 btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf), 0, false); 3450 btrfs_ref_tree_mod(fs_info, &generic_ref); 3451 ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL); 3452 BUG_ON(ret); /* -ENOMEM */ 3453 } 3454 3455 if (!last_ref) 3456 return; 3457 3458 if (btrfs_header_generation(buf) != trans->transid) 3459 goto out; 3460 3461 if (root_id != BTRFS_TREE_LOG_OBJECTID) { 3462 ret = check_ref_cleanup(trans, buf->start); 3463 if (!ret) 3464 goto out; 3465 } 3466 3467 bg = btrfs_lookup_block_group(fs_info, buf->start); 3468 3469 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { 3470 pin_down_extent(trans, bg, buf->start, buf->len, 1); 3471 btrfs_put_block_group(bg); 3472 goto out; 3473 } 3474 3475 /* 3476 * If there are tree mod log users we may have recorded mod log 3477 * operations for this node. If we re-allocate this node we 3478 * could replay operations on this node that happened when it 3479 * existed in a completely different root. For example if it 3480 * was part of root A, then was reallocated to root B, and we 3481 * are doing a btrfs_old_search_slot(root b), we could replay 3482 * operations that happened when the block was part of root A, 3483 * giving us an inconsistent view of the btree. 3484 * 3485 * We are safe from races here because at this point no other 3486 * node or root points to this extent buffer, so if after this 3487 * check a new tree mod log user joins we will not have an 3488 * existing log of operations on this node that we have to 3489 * contend with. 
3490 */ 3491 3492 if (test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags) 3493 || btrfs_is_zoned(fs_info)) { 3494 pin_down_extent(trans, bg, buf->start, buf->len, 1); 3495 btrfs_put_block_group(bg); 3496 goto out; 3497 } 3498 3499 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); 3500 3501 btrfs_add_free_space(bg, buf->start, buf->len); 3502 btrfs_free_reserved_bytes(bg, buf->len, 0); 3503 btrfs_put_block_group(bg); 3504 trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len); 3505 3506 out: 3507 3508 /* 3509 * Deleting the buffer, clear the corrupt flag since it doesn't 3510 * matter anymore. 3511 */ 3512 clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags); 3513 } 3514 3515 /* Can return -ENOMEM */ 3516 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) 3517 { 3518 struct btrfs_fs_info *fs_info = trans->fs_info; 3519 int ret; 3520 3521 if (btrfs_is_testing(fs_info)) 3522 return 0; 3523 3524 /* 3525 * tree log blocks never actually go into the extent allocation 3526 * tree, just update pinning info and exit early. 3527 */ 3528 if (ref->ref_root == BTRFS_TREE_LOG_OBJECTID) { 3529 btrfs_pin_extent(trans, ref->bytenr, ref->num_bytes, 1); 3530 ret = 0; 3531 } else if (ref->type == BTRFS_REF_METADATA) { 3532 ret = btrfs_add_delayed_tree_ref(trans, ref, NULL); 3533 } else { 3534 ret = btrfs_add_delayed_data_ref(trans, ref, 0); 3535 } 3536 3537 if (ref->ref_root != BTRFS_TREE_LOG_OBJECTID) 3538 btrfs_ref_tree_mod(fs_info, ref); 3539 3540 return ret; 3541 } 3542 3543 enum btrfs_loop_type { 3544 /* 3545 * Start caching block groups but do not wait for progress or for them 3546 * to be done. 3547 */ 3548 LOOP_CACHING_NOWAIT, 3549 3550 /* 3551 * Wait for the block group free_space >= the space we're waiting for if 3552 * the block group isn't cached. 3553 */ 3554 LOOP_CACHING_WAIT, 3555 3556 /* 3557 * Allow allocations to happen from block groups that do not yet have a 3558 * size classification. 3559 */ 3560 LOOP_UNSET_SIZE_CLASS, 3561 3562 /* 3563 * Allocate a chunk and then retry the allocation. 3564 */ 3565 LOOP_ALLOC_CHUNK, 3566 3567 /* 3568 * Ignore the size class restrictions for this allocation. 3569 */ 3570 LOOP_WRONG_SIZE_CLASS, 3571 3572 /* 3573 * Ignore the empty size, only try to allocate the number of bytes 3574 * needed for this allocation. 3575 */ 3576 LOOP_NO_EMPTY_SIZE, 3577 }; 3578 3579 static inline void 3580 btrfs_lock_block_group(struct btrfs_block_group *cache, 3581 int delalloc) 3582 { 3583 if (delalloc) 3584 down_read(&cache->data_rwsem); 3585 } 3586 3587 static inline void btrfs_grab_block_group(struct btrfs_block_group *cache, 3588 int delalloc) 3589 { 3590 btrfs_get_block_group(cache); 3591 if (delalloc) 3592 down_read(&cache->data_rwsem); 3593 } 3594 3595 static struct btrfs_block_group *btrfs_lock_cluster( 3596 struct btrfs_block_group *block_group, 3597 struct btrfs_free_cluster *cluster, 3598 int delalloc) 3599 __acquires(&cluster->refill_lock) 3600 { 3601 struct btrfs_block_group *used_bg = NULL; 3602 3603 spin_lock(&cluster->refill_lock); 3604 while (1) { 3605 used_bg = cluster->block_group; 3606 if (!used_bg) 3607 return NULL; 3608 3609 if (used_bg == block_group) 3610 return used_bg; 3611 3612 btrfs_get_block_group(used_bg); 3613 3614 if (!delalloc) 3615 return used_bg; 3616 3617 if (down_read_trylock(&used_bg->data_rwsem)) 3618 return used_bg; 3619 3620 spin_unlock(&cluster->refill_lock); 3621 3622 /* We should only have one-level nested. 
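		 * ("Nested" means taking this group's data_rwsem while the
		 * caller may already hold another group's, hence the
		 * SINGLE_DEPTH_NESTING lockdep annotation right below.)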
		 */
		down_read_nested(&used_bg->data_rwsem, SINGLE_DEPTH_NESTING);

		spin_lock(&cluster->refill_lock);
		if (used_bg == cluster->block_group)
			return used_bg;

		up_read(&used_bg->data_rwsem);
		btrfs_put_block_group(used_bg);
	}
}

static inline void
btrfs_release_block_group(struct btrfs_block_group *cache,
			  int delalloc)
{
	if (delalloc)
		up_read(&cache->data_rwsem);
	btrfs_put_block_group(cache);
}

/*
 * Helper function for find_free_extent().
 *
 * Return -ENOENT to inform caller that we need to fall back to unclustered mode.
 * Return >0 to inform caller that we found nothing.
 * Return 0 means we have found a location and set ffe_ctl->found_offset.
 */
static int find_free_extent_clustered(struct btrfs_block_group *bg,
				      struct find_free_extent_ctl *ffe_ctl,
				      struct btrfs_block_group **cluster_bg_ret)
{
	struct btrfs_block_group *cluster_bg;
	struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
	u64 aligned_cluster;
	u64 offset;
	int ret;

	cluster_bg = btrfs_lock_cluster(bg, last_ptr, ffe_ctl->delalloc);
	if (!cluster_bg)
		goto refill_cluster;
	if (cluster_bg != bg && (cluster_bg->ro ||
	    !block_group_bits(cluster_bg, ffe_ctl->flags)))
		goto release_cluster;

	offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr,
					  ffe_ctl->num_bytes, cluster_bg->start,
					  &ffe_ctl->max_extent_size);
	if (offset) {
		/* We have a block, we're done */
		spin_unlock(&last_ptr->refill_lock);
		trace_btrfs_reserve_extent_cluster(cluster_bg, ffe_ctl);
		*cluster_bg_ret = cluster_bg;
		ffe_ctl->found_offset = offset;
		return 0;
	}
	WARN_ON(last_ptr->block_group != cluster_bg);

release_cluster:
	/*
	 * If we are on LOOP_NO_EMPTY_SIZE, we can't set up a new cluster, so
	 * let's just skip it and let the allocator find whatever block it can
	 * find. If we reach this point, we will have tried the cluster
	 * allocator plenty of times and not have found anything, so we are
	 * likely way too fragmented for the clustering stuff to find anything.
	 *
	 * However, if the cluster is taken from the current block group,
	 * release the cluster first, so that we stand a better chance of
	 * succeeding in the unclustered allocation.
	 */
	if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE && cluster_bg != bg) {
		spin_unlock(&last_ptr->refill_lock);
		btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
		return -ENOENT;
	}

	/* This cluster didn't work out, free it and start over */
	btrfs_return_cluster_to_free_space(NULL, last_ptr);

	if (cluster_bg != bg)
		btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);

refill_cluster:
	if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE) {
		spin_unlock(&last_ptr->refill_lock);
		return -ENOENT;
	}

	aligned_cluster = max_t(u64,
			ffe_ctl->empty_cluster + ffe_ctl->empty_size,
			bg->full_stripe_len);
	ret = btrfs_find_space_cluster(bg, last_ptr, ffe_ctl->search_start,
				       ffe_ctl->num_bytes, aligned_cluster);
	if (ret == 0) {
		/* Now pull our allocation out of this cluster */
		offset = btrfs_alloc_from_cluster(bg, last_ptr,
				ffe_ctl->num_bytes, ffe_ctl->search_start,
				&ffe_ctl->max_extent_size);
		if (offset) {
			/* We found one, proceed */
			spin_unlock(&last_ptr->refill_lock);
			ffe_ctl->found_offset = offset;
			trace_btrfs_reserve_extent_cluster(bg, ffe_ctl);
			return 0;
		}
	}
	/*
	 * At this point we either didn't find a cluster or we weren't able to
	 * allocate a block from our cluster. Free the cluster we've been
	 * trying to use, and go to the next block group.
	 */
	btrfs_return_cluster_to_free_space(NULL, last_ptr);
	spin_unlock(&last_ptr->refill_lock);
	return 1;
}

/*
 * Return >0 to inform caller that we found nothing.
 * Return 0 when we found a free extent and set ffe_ctl->found_offset.
 */
static int find_free_extent_unclustered(struct btrfs_block_group *bg,
					struct find_free_extent_ctl *ffe_ctl)
{
	struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
	u64 offset;

	/*
	 * We are doing an unclustered allocation, set the fragmented flag so
	 * we don't bother trying to set up a cluster again until we get more
	 * space.
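	 * (unpin_extent_range() clears the fragmented flag again once enough
	 * space has been unpinned in this block group.)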
3752 */ 3753 if (unlikely(last_ptr)) { 3754 spin_lock(&last_ptr->lock); 3755 last_ptr->fragmented = 1; 3756 spin_unlock(&last_ptr->lock); 3757 } 3758 if (ffe_ctl->cached) { 3759 struct btrfs_free_space_ctl *free_space_ctl; 3760 3761 free_space_ctl = bg->free_space_ctl; 3762 spin_lock(&free_space_ctl->tree_lock); 3763 if (free_space_ctl->free_space < 3764 ffe_ctl->num_bytes + ffe_ctl->empty_cluster + 3765 ffe_ctl->empty_size) { 3766 ffe_ctl->total_free_space = max_t(u64, 3767 ffe_ctl->total_free_space, 3768 free_space_ctl->free_space); 3769 spin_unlock(&free_space_ctl->tree_lock); 3770 return 1; 3771 } 3772 spin_unlock(&free_space_ctl->tree_lock); 3773 } 3774 3775 offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start, 3776 ffe_ctl->num_bytes, ffe_ctl->empty_size, 3777 &ffe_ctl->max_extent_size); 3778 if (!offset) 3779 return 1; 3780 ffe_ctl->found_offset = offset; 3781 return 0; 3782 } 3783 3784 static int do_allocation_clustered(struct btrfs_block_group *block_group, 3785 struct find_free_extent_ctl *ffe_ctl, 3786 struct btrfs_block_group **bg_ret) 3787 { 3788 int ret; 3789 3790 /* We want to try and use the cluster allocator, so lets look there */ 3791 if (ffe_ctl->last_ptr && ffe_ctl->use_cluster) { 3792 ret = find_free_extent_clustered(block_group, ffe_ctl, bg_ret); 3793 if (ret >= 0) 3794 return ret; 3795 /* ret == -ENOENT case falls through */ 3796 } 3797 3798 return find_free_extent_unclustered(block_group, ffe_ctl); 3799 } 3800 3801 /* 3802 * Tree-log block group locking 3803 * ============================ 3804 * 3805 * fs_info::treelog_bg_lock protects the fs_info::treelog_bg which 3806 * indicates the starting address of a block group, which is reserved only 3807 * for tree-log metadata. 3808 * 3809 * Lock nesting 3810 * ============ 3811 * 3812 * space_info::lock 3813 * block_group::lock 3814 * fs_info::treelog_bg_lock 3815 */ 3816 3817 /* 3818 * Simple allocator for sequential-only block group. It only allows sequential 3819 * allocation. No need to play with trees. This function also reserves the 3820 * bytes as in btrfs_add_reserved_bytes. 3821 */ 3822 static int do_allocation_zoned(struct btrfs_block_group *block_group, 3823 struct find_free_extent_ctl *ffe_ctl, 3824 struct btrfs_block_group **bg_ret) 3825 { 3826 struct btrfs_fs_info *fs_info = block_group->fs_info; 3827 struct btrfs_space_info *space_info = block_group->space_info; 3828 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3829 u64 start = block_group->start; 3830 u64 num_bytes = ffe_ctl->num_bytes; 3831 u64 avail; 3832 u64 bytenr = block_group->start; 3833 u64 log_bytenr; 3834 u64 data_reloc_bytenr; 3835 int ret = 0; 3836 bool skip = false; 3837 3838 ASSERT(btrfs_is_zoned(block_group->fs_info)); 3839 3840 /* 3841 * Do not allow non-tree-log blocks in the dedicated tree-log block 3842 * group, and vice versa. 3843 */ 3844 spin_lock(&fs_info->treelog_bg_lock); 3845 log_bytenr = fs_info->treelog_bg; 3846 if (log_bytenr && ((ffe_ctl->for_treelog && bytenr != log_bytenr) || 3847 (!ffe_ctl->for_treelog && bytenr == log_bytenr))) 3848 skip = true; 3849 spin_unlock(&fs_info->treelog_bg_lock); 3850 if (skip) 3851 return 1; 3852 3853 /* 3854 * Do not allow non-relocation blocks in the dedicated relocation block 3855 * group, and vice versa. 
3856 */ 3857 spin_lock(&fs_info->relocation_bg_lock); 3858 data_reloc_bytenr = fs_info->data_reloc_bg; 3859 if (data_reloc_bytenr && 3860 ((ffe_ctl->for_data_reloc && bytenr != data_reloc_bytenr) || 3861 (!ffe_ctl->for_data_reloc && bytenr == data_reloc_bytenr))) 3862 skip = true; 3863 spin_unlock(&fs_info->relocation_bg_lock); 3864 if (skip) 3865 return 1; 3866 3867 /* Check RO and no space case before trying to activate it */ 3868 spin_lock(&block_group->lock); 3869 if (block_group->ro || btrfs_zoned_bg_is_full(block_group)) { 3870 ret = 1; 3871 /* 3872 * May need to clear fs_info->{treelog,data_reloc}_bg. 3873 * Return the error after taking the locks. 3874 */ 3875 } 3876 spin_unlock(&block_group->lock); 3877 3878 /* Metadata block group is activated at write time. */ 3879 if (!ret && (block_group->flags & BTRFS_BLOCK_GROUP_DATA) && 3880 !btrfs_zone_activate(block_group)) { 3881 ret = 1; 3882 /* 3883 * May need to clear fs_info->{treelog,data_reloc}_bg. 3884 * Return the error after taking the locks. 3885 */ 3886 } 3887 3888 spin_lock(&space_info->lock); 3889 spin_lock(&block_group->lock); 3890 spin_lock(&fs_info->treelog_bg_lock); 3891 spin_lock(&fs_info->relocation_bg_lock); 3892 3893 if (ret) 3894 goto out; 3895 3896 ASSERT(!ffe_ctl->for_treelog || 3897 block_group->start == fs_info->treelog_bg || 3898 fs_info->treelog_bg == 0); 3899 ASSERT(!ffe_ctl->for_data_reloc || 3900 block_group->start == fs_info->data_reloc_bg || 3901 fs_info->data_reloc_bg == 0); 3902 3903 if (block_group->ro || 3904 (!ffe_ctl->for_data_reloc && 3905 test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags))) { 3906 ret = 1; 3907 goto out; 3908 } 3909 3910 /* 3911 * Do not allow currently using block group to be tree-log dedicated 3912 * block group. 3913 */ 3914 if (ffe_ctl->for_treelog && !fs_info->treelog_bg && 3915 (block_group->used || block_group->reserved)) { 3916 ret = 1; 3917 goto out; 3918 } 3919 3920 /* 3921 * Do not allow currently used block group to be the data relocation 3922 * dedicated block group. 3923 */ 3924 if (ffe_ctl->for_data_reloc && !fs_info->data_reloc_bg && 3925 (block_group->used || block_group->reserved)) { 3926 ret = 1; 3927 goto out; 3928 } 3929 3930 WARN_ON_ONCE(block_group->alloc_offset > block_group->zone_capacity); 3931 avail = block_group->zone_capacity - block_group->alloc_offset; 3932 if (avail < num_bytes) { 3933 if (ffe_ctl->max_extent_size < avail) { 3934 /* 3935 * With sequential allocator, free space is always 3936 * contiguous 3937 */ 3938 ffe_ctl->max_extent_size = avail; 3939 ffe_ctl->total_free_space = avail; 3940 } 3941 ret = 1; 3942 goto out; 3943 } 3944 3945 if (ffe_ctl->for_treelog && !fs_info->treelog_bg) 3946 fs_info->treelog_bg = block_group->start; 3947 3948 if (ffe_ctl->for_data_reloc) { 3949 if (!fs_info->data_reloc_bg) 3950 fs_info->data_reloc_bg = block_group->start; 3951 /* 3952 * Do not allow allocations from this block group, unless it is 3953 * for data relocation. Compared to increasing the ->ro, setting 3954 * the ->zoned_data_reloc_ongoing flag still allows nocow 3955 * writers to come in. See btrfs_inc_nocow_writers(). 3956 * 3957 * We need to disable an allocation to avoid an allocation of 3958 * regular (non-relocation data) extent. With mix of relocation 3959 * extents and regular extents, we can dispatch WRITE commands 3960 * (for relocation extents) and ZONE APPEND commands (for 3961 * regular extents) at the same time to the same zone, which 3962 * easily break the write pointer. 
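 * (On zoned devices each zone has a single write pointer: a regular WRITE
 * must be submitted exactly at that pointer, while the placement of a ZONE
 * APPEND is chosen by the device, so interleaving the two in one zone makes
 * the WRITE position unpredictable.)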
3963 * 3964 * Also, this flag avoids this block group to be zone finished. 3965 */ 3966 set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags); 3967 } 3968 3969 ffe_ctl->found_offset = start + block_group->alloc_offset; 3970 block_group->alloc_offset += num_bytes; 3971 spin_lock(&ctl->tree_lock); 3972 ctl->free_space -= num_bytes; 3973 spin_unlock(&ctl->tree_lock); 3974 3975 /* 3976 * We do not check if found_offset is aligned to stripesize. The 3977 * address is anyway rewritten when using zone append writing. 3978 */ 3979 3980 ffe_ctl->search_start = ffe_ctl->found_offset; 3981 3982 out: 3983 if (ret && ffe_ctl->for_treelog) 3984 fs_info->treelog_bg = 0; 3985 if (ret && ffe_ctl->for_data_reloc) 3986 fs_info->data_reloc_bg = 0; 3987 spin_unlock(&fs_info->relocation_bg_lock); 3988 spin_unlock(&fs_info->treelog_bg_lock); 3989 spin_unlock(&block_group->lock); 3990 spin_unlock(&space_info->lock); 3991 return ret; 3992 } 3993 3994 static int do_allocation(struct btrfs_block_group *block_group, 3995 struct find_free_extent_ctl *ffe_ctl, 3996 struct btrfs_block_group **bg_ret) 3997 { 3998 switch (ffe_ctl->policy) { 3999 case BTRFS_EXTENT_ALLOC_CLUSTERED: 4000 return do_allocation_clustered(block_group, ffe_ctl, bg_ret); 4001 case BTRFS_EXTENT_ALLOC_ZONED: 4002 return do_allocation_zoned(block_group, ffe_ctl, bg_ret); 4003 default: 4004 BUG(); 4005 } 4006 } 4007 4008 static void release_block_group(struct btrfs_block_group *block_group, 4009 struct find_free_extent_ctl *ffe_ctl, 4010 int delalloc) 4011 { 4012 switch (ffe_ctl->policy) { 4013 case BTRFS_EXTENT_ALLOC_CLUSTERED: 4014 ffe_ctl->retry_uncached = false; 4015 break; 4016 case BTRFS_EXTENT_ALLOC_ZONED: 4017 /* Nothing to do */ 4018 break; 4019 default: 4020 BUG(); 4021 } 4022 4023 BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) != 4024 ffe_ctl->index); 4025 btrfs_release_block_group(block_group, delalloc); 4026 } 4027 4028 static void found_extent_clustered(struct find_free_extent_ctl *ffe_ctl, 4029 struct btrfs_key *ins) 4030 { 4031 struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr; 4032 4033 if (!ffe_ctl->use_cluster && last_ptr) { 4034 spin_lock(&last_ptr->lock); 4035 last_ptr->window_start = ins->objectid; 4036 spin_unlock(&last_ptr->lock); 4037 } 4038 } 4039 4040 static void found_extent(struct find_free_extent_ctl *ffe_ctl, 4041 struct btrfs_key *ins) 4042 { 4043 switch (ffe_ctl->policy) { 4044 case BTRFS_EXTENT_ALLOC_CLUSTERED: 4045 found_extent_clustered(ffe_ctl, ins); 4046 break; 4047 case BTRFS_EXTENT_ALLOC_ZONED: 4048 /* Nothing to do */ 4049 break; 4050 default: 4051 BUG(); 4052 } 4053 } 4054 4055 static int can_allocate_chunk_zoned(struct btrfs_fs_info *fs_info, 4056 struct find_free_extent_ctl *ffe_ctl) 4057 { 4058 /* Block group's activeness is not a requirement for METADATA block groups. */ 4059 if (!(ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA)) 4060 return 0; 4061 4062 /* If we can activate new zone, just allocate a chunk and use it */ 4063 if (btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->flags)) 4064 return 0; 4065 4066 /* 4067 * We already reached the max active zones. Try to finish one block 4068 * group to make a room for a new block group. This is only possible 4069 * for a data block group because btrfs_zone_finish() may need to wait 4070 * for a running transaction which can cause a deadlock for metadata 4071 * allocation. 
4072 */
4073 if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) {
4074 int ret = btrfs_zone_finish_one_bg(fs_info);
4075
4076 if (ret == 1)
4077 return 0;
4078 else if (ret < 0)
4079 return ret;
4080 }
4081
4082 /*
4083 * If we have enough free space left in an already active block group
4084 * and we can't activate any other zone now, do not allow allocating a
4085 * new chunk and let find_free_extent() retry with a smaller size.
4086 */
4087 if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size)
4088 return -ENOSPC;
4089
4090 /*
4091 * Not even min_alloc_size is left in any block group. Since we cannot
4092 * activate a new block group, allocating one may not help. Let's tell
4093 * the caller to try again and hope it makes some progress by writing
4094 * out parts of the region. That is only possible for data block groups,
4095 * where a part of the region can be written.
4096 */
4097 if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA)
4098 return -EAGAIN;
4099
4100 /*
4101 * We cannot activate a new block group and there is not enough space
4102 * left in any block group. So, allocating a new block group may not
4103 * help. But there is nothing to do anyway, so let's go with it.
4104 */
4105 return 0;
4106 }
4107
4108 static int can_allocate_chunk(struct btrfs_fs_info *fs_info,
4109 struct find_free_extent_ctl *ffe_ctl)
4110 {
4111 switch (ffe_ctl->policy) {
4112 case BTRFS_EXTENT_ALLOC_CLUSTERED:
4113 return 0;
4114 case BTRFS_EXTENT_ALLOC_ZONED:
4115 return can_allocate_chunk_zoned(fs_info, ffe_ctl);
4116 default:
4117 BUG();
4118 }
4119 }
4120
4121 /*
4122 * Return >0 means the caller needs to re-search for a free extent
4123 * Return 0 means we have the needed free extent.
4124 * Return <0 means we failed to locate any free extent.
4125 */
4126 static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
4127 struct btrfs_key *ins,
4128 struct find_free_extent_ctl *ffe_ctl,
4129 bool full_search)
4130 {
4131 struct btrfs_root *root = fs_info->chunk_root;
4132 int ret;
4133
4134 if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) &&
4135 ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg)
4136 ffe_ctl->orig_have_caching_bg = true;
4137
4138 if (ins->objectid) {
4139 found_extent(ffe_ctl, ins);
4140 return 0;
4141 }
4142
4143 if (ffe_ctl->loop >= LOOP_CACHING_WAIT && ffe_ctl->have_caching_bg)
4144 return 1;
4145
4146 ffe_ctl->index++;
4147 if (ffe_ctl->index < BTRFS_NR_RAID_TYPES)
4148 return 1;
4149
4150 /* See the comments for btrfs_loop_type for an explanation of the phases. */
4151 if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) {
4152 ffe_ctl->index = 0;
4153 /*
4154 * We want to skip the LOOP_CACHING_WAIT step if we don't have
4155 * any uncached bgs and we've already done a full search
4156 * through.
4157 */
4158 if (ffe_ctl->loop == LOOP_CACHING_NOWAIT &&
4159 (!ffe_ctl->orig_have_caching_bg && full_search))
4160 ffe_ctl->loop++;
4161 ffe_ctl->loop++;
4162
4163 if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) {
4164 struct btrfs_trans_handle *trans;
4165 int exist = 0;
4166
4167 /* Check if the allocation policy allows creating a new chunk */
4168 ret = can_allocate_chunk(fs_info, ffe_ctl);
4169 if (ret)
4170 return ret;
4171
4172 trans = current->journal_info;
4173 if (trans)
4174 exist = 1;
4175 else
4176 trans = btrfs_join_transaction(root);
4177
4178 if (IS_ERR(trans)) {
4179 ret = PTR_ERR(trans);
4180 return ret;
4181 }
4182
4183 ret = btrfs_chunk_alloc(trans, ffe_ctl->flags,
4184 CHUNK_ALLOC_FORCE_FOR_EXTENT);
4185
4186 /* Do not bail out on ENOSPC since we can do more.
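 * -ENOSPC from btrfs_chunk_alloc() only means a new chunk could not be
 * created; the existing block groups may still satisfy the allocation in
 * the next loop stage (for the clustered policy, LOOP_NO_EMPTY_SIZE drops
 * the empty_size/empty_cluster padding below).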
*/ 4187 if (ret == -ENOSPC) { 4188 ret = 0; 4189 ffe_ctl->loop++; 4190 } 4191 else if (ret < 0) 4192 btrfs_abort_transaction(trans, ret); 4193 else 4194 ret = 0; 4195 if (!exist) 4196 btrfs_end_transaction(trans); 4197 if (ret) 4198 return ret; 4199 } 4200 4201 if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) { 4202 if (ffe_ctl->policy != BTRFS_EXTENT_ALLOC_CLUSTERED) 4203 return -ENOSPC; 4204 4205 /* 4206 * Don't loop again if we already have no empty_size and 4207 * no empty_cluster. 4208 */ 4209 if (ffe_ctl->empty_size == 0 && 4210 ffe_ctl->empty_cluster == 0) 4211 return -ENOSPC; 4212 ffe_ctl->empty_size = 0; 4213 ffe_ctl->empty_cluster = 0; 4214 } 4215 return 1; 4216 } 4217 return -ENOSPC; 4218 } 4219 4220 static bool find_free_extent_check_size_class(struct find_free_extent_ctl *ffe_ctl, 4221 struct btrfs_block_group *bg) 4222 { 4223 if (ffe_ctl->policy == BTRFS_EXTENT_ALLOC_ZONED) 4224 return true; 4225 if (!btrfs_block_group_should_use_size_class(bg)) 4226 return true; 4227 if (ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS) 4228 return true; 4229 if (ffe_ctl->loop >= LOOP_UNSET_SIZE_CLASS && 4230 bg->size_class == BTRFS_BG_SZ_NONE) 4231 return true; 4232 return ffe_ctl->size_class == bg->size_class; 4233 } 4234 4235 static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info, 4236 struct find_free_extent_ctl *ffe_ctl, 4237 struct btrfs_space_info *space_info, 4238 struct btrfs_key *ins) 4239 { 4240 /* 4241 * If our free space is heavily fragmented we may not be able to make 4242 * big contiguous allocations, so instead of doing the expensive search 4243 * for free space, simply return ENOSPC with our max_extent_size so we 4244 * can go ahead and search for a more manageable chunk. 4245 * 4246 * If our max_extent_size is large enough for our allocation simply 4247 * disable clustering since we will likely not be able to find enough 4248 * space to create a cluster and induce latency trying. 4249 */ 4250 if (space_info->max_extent_size) { 4251 spin_lock(&space_info->lock); 4252 if (space_info->max_extent_size && 4253 ffe_ctl->num_bytes > space_info->max_extent_size) { 4254 ins->offset = space_info->max_extent_size; 4255 spin_unlock(&space_info->lock); 4256 return -ENOSPC; 4257 } else if (space_info->max_extent_size) { 4258 ffe_ctl->use_cluster = false; 4259 } 4260 spin_unlock(&space_info->lock); 4261 } 4262 4263 ffe_ctl->last_ptr = fetch_cluster_info(fs_info, space_info, 4264 &ffe_ctl->empty_cluster); 4265 if (ffe_ctl->last_ptr) { 4266 struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr; 4267 4268 spin_lock(&last_ptr->lock); 4269 if (last_ptr->block_group) 4270 ffe_ctl->hint_byte = last_ptr->window_start; 4271 if (last_ptr->fragmented) { 4272 /* 4273 * We still set window_start so we can keep track of the 4274 * last place we found an allocation to try and save 4275 * some time. 
4276 */
4277 ffe_ctl->hint_byte = last_ptr->window_start;
4278 ffe_ctl->use_cluster = false;
4279 }
4280 spin_unlock(&last_ptr->lock);
4281 }
4282
4283 return 0;
4284 }
4285
4286 static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info,
4287 struct find_free_extent_ctl *ffe_ctl)
4288 {
4289 if (ffe_ctl->for_treelog) {
4290 spin_lock(&fs_info->treelog_bg_lock);
4291 if (fs_info->treelog_bg)
4292 ffe_ctl->hint_byte = fs_info->treelog_bg;
4293 spin_unlock(&fs_info->treelog_bg_lock);
4294 } else if (ffe_ctl->for_data_reloc) {
4295 spin_lock(&fs_info->relocation_bg_lock);
4296 if (fs_info->data_reloc_bg)
4297 ffe_ctl->hint_byte = fs_info->data_reloc_bg;
4298 spin_unlock(&fs_info->relocation_bg_lock);
4299 } else if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) {
4300 struct btrfs_block_group *block_group;
4301
4302 spin_lock(&fs_info->zone_active_bgs_lock);
4303 list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) {
4304 /*
4305 * No lock is OK here because avail is monotonically
4306 * decreasing, and this is just a hint.
4307 */
4308 u64 avail = block_group->zone_capacity - block_group->alloc_offset;
4309
4310 if (block_group_bits(block_group, ffe_ctl->flags) &&
4311 avail >= ffe_ctl->num_bytes) {
4312 ffe_ctl->hint_byte = block_group->start;
4313 break;
4314 }
4315 }
4316 spin_unlock(&fs_info->zone_active_bgs_lock);
4317 }
4318
4319 return 0;
4320 }
4321
4322 static int prepare_allocation(struct btrfs_fs_info *fs_info,
4323 struct find_free_extent_ctl *ffe_ctl,
4324 struct btrfs_space_info *space_info,
4325 struct btrfs_key *ins)
4326 {
4327 switch (ffe_ctl->policy) {
4328 case BTRFS_EXTENT_ALLOC_CLUSTERED:
4329 return prepare_allocation_clustered(fs_info, ffe_ctl,
4330 space_info, ins);
4331 case BTRFS_EXTENT_ALLOC_ZONED:
4332 return prepare_allocation_zoned(fs_info, ffe_ctl);
4333 default:
4334 BUG();
4335 }
4336 }
4337
4338 /*
4339 * walks the btree of allocated extents and finds a hole of a given size.
4340 * The key ins is changed to record the hole:
4341 * ins->objectid == start position
4342 * ins->flags = BTRFS_EXTENT_ITEM_KEY
4343 * ins->offset == the size of the hole.
4344 * Any available blocks before search_start are skipped.
4345 *
4346 * If there is no suitable free space, we will record the max size of the
4347 * free space extent currently available.
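 *
 * For example, if the caller asked for 1MiB but the largest contiguous
 * free space found was 256KiB, ins->offset is set to 256KiB on -ENOSPC so
 * that btrfs_reserve_extent() can retry the search with that smaller size.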
4348 * 4349 * The overall logic and call chain: 4350 * 4351 * find_free_extent() 4352 * |- Iterate through all block groups 4353 * | |- Get a valid block group 4354 * | |- Try to do clustered allocation in that block group 4355 * | |- Try to do unclustered allocation in that block group 4356 * | |- Check if the result is valid 4357 * | | |- If valid, then exit 4358 * | |- Jump to next block group 4359 * | 4360 * |- Push harder to find free extents 4361 * |- If not found, re-iterate all block groups 4362 */ 4363 static noinline int find_free_extent(struct btrfs_root *root, 4364 struct btrfs_key *ins, 4365 struct find_free_extent_ctl *ffe_ctl) 4366 { 4367 struct btrfs_fs_info *fs_info = root->fs_info; 4368 int ret = 0; 4369 int cache_block_group_error = 0; 4370 struct btrfs_block_group *block_group = NULL; 4371 struct btrfs_space_info *space_info; 4372 bool full_search = false; 4373 4374 WARN_ON(ffe_ctl->num_bytes < fs_info->sectorsize); 4375 4376 ffe_ctl->search_start = 0; 4377 /* For clustered allocation */ 4378 ffe_ctl->empty_cluster = 0; 4379 ffe_ctl->last_ptr = NULL; 4380 ffe_ctl->use_cluster = true; 4381 ffe_ctl->have_caching_bg = false; 4382 ffe_ctl->orig_have_caching_bg = false; 4383 ffe_ctl->index = btrfs_bg_flags_to_raid_index(ffe_ctl->flags); 4384 ffe_ctl->loop = 0; 4385 ffe_ctl->retry_uncached = false; 4386 ffe_ctl->cached = 0; 4387 ffe_ctl->max_extent_size = 0; 4388 ffe_ctl->total_free_space = 0; 4389 ffe_ctl->found_offset = 0; 4390 ffe_ctl->policy = BTRFS_EXTENT_ALLOC_CLUSTERED; 4391 ffe_ctl->size_class = btrfs_calc_block_group_size_class(ffe_ctl->num_bytes); 4392 4393 if (btrfs_is_zoned(fs_info)) 4394 ffe_ctl->policy = BTRFS_EXTENT_ALLOC_ZONED; 4395 4396 ins->type = BTRFS_EXTENT_ITEM_KEY; 4397 ins->objectid = 0; 4398 ins->offset = 0; 4399 4400 trace_find_free_extent(root, ffe_ctl); 4401 4402 space_info = btrfs_find_space_info(fs_info, ffe_ctl->flags); 4403 if (!space_info) { 4404 btrfs_err(fs_info, "No space info for %llu", ffe_ctl->flags); 4405 return -ENOSPC; 4406 } 4407 4408 ret = prepare_allocation(fs_info, ffe_ctl, space_info, ins); 4409 if (ret < 0) 4410 return ret; 4411 4412 ffe_ctl->search_start = max(ffe_ctl->search_start, 4413 first_logical_byte(fs_info)); 4414 ffe_ctl->search_start = max(ffe_ctl->search_start, ffe_ctl->hint_byte); 4415 if (ffe_ctl->search_start == ffe_ctl->hint_byte) { 4416 block_group = btrfs_lookup_block_group(fs_info, 4417 ffe_ctl->search_start); 4418 /* 4419 * we don't want to use the block group if it doesn't match our 4420 * allocation bits, or if its not cached. 4421 * 4422 * However if we are re-searching with an ideal block group 4423 * picked out then we don't care that the block group is cached. 
4424 */ 4425 if (block_group && block_group_bits(block_group, ffe_ctl->flags) && 4426 block_group->cached != BTRFS_CACHE_NO) { 4427 down_read(&space_info->groups_sem); 4428 if (list_empty(&block_group->list) || 4429 block_group->ro) { 4430 /* 4431 * someone is removing this block group, 4432 * we can't jump into the have_block_group 4433 * target because our list pointers are not 4434 * valid 4435 */ 4436 btrfs_put_block_group(block_group); 4437 up_read(&space_info->groups_sem); 4438 } else { 4439 ffe_ctl->index = btrfs_bg_flags_to_raid_index( 4440 block_group->flags); 4441 btrfs_lock_block_group(block_group, 4442 ffe_ctl->delalloc); 4443 ffe_ctl->hinted = true; 4444 goto have_block_group; 4445 } 4446 } else if (block_group) { 4447 btrfs_put_block_group(block_group); 4448 } 4449 } 4450 search: 4451 trace_find_free_extent_search_loop(root, ffe_ctl); 4452 ffe_ctl->have_caching_bg = false; 4453 if (ffe_ctl->index == btrfs_bg_flags_to_raid_index(ffe_ctl->flags) || 4454 ffe_ctl->index == 0) 4455 full_search = true; 4456 down_read(&space_info->groups_sem); 4457 list_for_each_entry(block_group, 4458 &space_info->block_groups[ffe_ctl->index], list) { 4459 struct btrfs_block_group *bg_ret; 4460 4461 ffe_ctl->hinted = false; 4462 /* If the block group is read-only, we can skip it entirely. */ 4463 if (unlikely(block_group->ro)) { 4464 if (ffe_ctl->for_treelog) 4465 btrfs_clear_treelog_bg(block_group); 4466 if (ffe_ctl->for_data_reloc) 4467 btrfs_clear_data_reloc_bg(block_group); 4468 continue; 4469 } 4470 4471 btrfs_grab_block_group(block_group, ffe_ctl->delalloc); 4472 ffe_ctl->search_start = block_group->start; 4473 4474 /* 4475 * this can happen if we end up cycling through all the 4476 * raid types, but we want to make sure we only allocate 4477 * for the proper type. 4478 */ 4479 if (!block_group_bits(block_group, ffe_ctl->flags)) { 4480 u64 extra = BTRFS_BLOCK_GROUP_DUP | 4481 BTRFS_BLOCK_GROUP_RAID1_MASK | 4482 BTRFS_BLOCK_GROUP_RAID56_MASK | 4483 BTRFS_BLOCK_GROUP_RAID10; 4484 4485 /* 4486 * if they asked for extra copies and this block group 4487 * doesn't provide them, bail. This does allow us to 4488 * fill raid0 from raid1. 4489 */ 4490 if ((ffe_ctl->flags & extra) && !(block_group->flags & extra)) 4491 goto loop; 4492 4493 /* 4494 * This block group has different flags than we want. 4495 * It's possible that we have MIXED_GROUP flag but no 4496 * block group is mixed. Just skip such block group. 4497 */ 4498 btrfs_release_block_group(block_group, ffe_ctl->delalloc); 4499 continue; 4500 } 4501 4502 have_block_group: 4503 trace_find_free_extent_have_block_group(root, ffe_ctl, block_group); 4504 ffe_ctl->cached = btrfs_block_group_done(block_group); 4505 if (unlikely(!ffe_ctl->cached)) { 4506 ffe_ctl->have_caching_bg = true; 4507 ret = btrfs_cache_block_group(block_group, false); 4508 4509 /* 4510 * If we get ENOMEM here or something else we want to 4511 * try other block groups, because it may not be fatal. 4512 * However if we can't find anything else we need to 4513 * save our return here so that we return the actual 4514 * error that caused problems, not ENOSPC. 
4515 */ 4516 if (ret < 0) { 4517 if (!cache_block_group_error) 4518 cache_block_group_error = ret; 4519 ret = 0; 4520 goto loop; 4521 } 4522 ret = 0; 4523 } 4524 4525 if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) { 4526 if (!cache_block_group_error) 4527 cache_block_group_error = -EIO; 4528 goto loop; 4529 } 4530 4531 if (!find_free_extent_check_size_class(ffe_ctl, block_group)) 4532 goto loop; 4533 4534 bg_ret = NULL; 4535 ret = do_allocation(block_group, ffe_ctl, &bg_ret); 4536 if (ret > 0) 4537 goto loop; 4538 4539 if (bg_ret && bg_ret != block_group) { 4540 btrfs_release_block_group(block_group, ffe_ctl->delalloc); 4541 block_group = bg_ret; 4542 } 4543 4544 /* Checks */ 4545 ffe_ctl->search_start = round_up(ffe_ctl->found_offset, 4546 fs_info->stripesize); 4547 4548 /* move on to the next group */ 4549 if (ffe_ctl->search_start + ffe_ctl->num_bytes > 4550 block_group->start + block_group->length) { 4551 btrfs_add_free_space_unused(block_group, 4552 ffe_ctl->found_offset, 4553 ffe_ctl->num_bytes); 4554 goto loop; 4555 } 4556 4557 if (ffe_ctl->found_offset < ffe_ctl->search_start) 4558 btrfs_add_free_space_unused(block_group, 4559 ffe_ctl->found_offset, 4560 ffe_ctl->search_start - ffe_ctl->found_offset); 4561 4562 ret = btrfs_add_reserved_bytes(block_group, ffe_ctl->ram_bytes, 4563 ffe_ctl->num_bytes, 4564 ffe_ctl->delalloc, 4565 ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS); 4566 if (ret == -EAGAIN) { 4567 btrfs_add_free_space_unused(block_group, 4568 ffe_ctl->found_offset, 4569 ffe_ctl->num_bytes); 4570 goto loop; 4571 } 4572 btrfs_inc_block_group_reservations(block_group); 4573 4574 /* we are all good, lets return */ 4575 ins->objectid = ffe_ctl->search_start; 4576 ins->offset = ffe_ctl->num_bytes; 4577 4578 trace_btrfs_reserve_extent(block_group, ffe_ctl); 4579 btrfs_release_block_group(block_group, ffe_ctl->delalloc); 4580 break; 4581 loop: 4582 if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT && 4583 !ffe_ctl->retry_uncached) { 4584 ffe_ctl->retry_uncached = true; 4585 btrfs_wait_block_group_cache_progress(block_group, 4586 ffe_ctl->num_bytes + 4587 ffe_ctl->empty_cluster + 4588 ffe_ctl->empty_size); 4589 goto have_block_group; 4590 } 4591 release_block_group(block_group, ffe_ctl, ffe_ctl->delalloc); 4592 cond_resched(); 4593 } 4594 up_read(&space_info->groups_sem); 4595 4596 ret = find_free_extent_update_loop(fs_info, ins, ffe_ctl, full_search); 4597 if (ret > 0) 4598 goto search; 4599 4600 if (ret == -ENOSPC && !cache_block_group_error) { 4601 /* 4602 * Use ffe_ctl->total_free_space as fallback if we can't find 4603 * any contiguous hole. 4604 */ 4605 if (!ffe_ctl->max_extent_size) 4606 ffe_ctl->max_extent_size = ffe_ctl->total_free_space; 4607 spin_lock(&space_info->lock); 4608 space_info->max_extent_size = ffe_ctl->max_extent_size; 4609 spin_unlock(&space_info->lock); 4610 ins->offset = ffe_ctl->max_extent_size; 4611 } else if (ret == -ENOSPC) { 4612 ret = cache_block_group_error; 4613 } 4614 return ret; 4615 } 4616 4617 /* 4618 * Entry point to the extent allocator. Tries to find a hole that is at least 4619 * as big as @num_bytes. 4620 * 4621 * @root - The root that will contain this extent 4622 * 4623 * @ram_bytes - The amount of space in ram that @num_bytes take. This 4624 * is used for accounting purposes. This value differs 4625 * from @num_bytes only in the case of compressed extents. 4626 * 4627 * @num_bytes - Number of bytes to allocate on-disk. 
4628 * 4629 * @min_alloc_size - Indicates the minimum amount of space that the 4630 * allocator should try to satisfy. In some cases 4631 * @num_bytes may be larger than what is required and if 4632 * the filesystem is fragmented then allocation fails. 4633 * However, the presence of @min_alloc_size gives a 4634 * chance to try and satisfy the smaller allocation. 4635 * 4636 * @empty_size - A hint that you plan on doing more COW. This is the 4637 * size in bytes the allocator should try to find free 4638 * next to the block it returns. This is just a hint and 4639 * may be ignored by the allocator. 4640 * 4641 * @hint_byte - Hint to the allocator to start searching above the byte 4642 * address passed. It might be ignored. 4643 * 4644 * @ins - This key is modified to record the found hole. It will 4645 * have the following values: 4646 * ins->objectid == start position 4647 * ins->flags = BTRFS_EXTENT_ITEM_KEY 4648 * ins->offset == the size of the hole. 4649 * 4650 * @is_data - Boolean flag indicating whether an extent is 4651 * allocated for data (true) or metadata (false) 4652 * 4653 * @delalloc - Boolean flag indicating whether this allocation is for 4654 * delalloc or not. If 'true' data_rwsem of block groups 4655 * is going to be acquired. 4656 * 4657 * 4658 * Returns 0 when an allocation succeeded or < 0 when an error occurred. In 4659 * case -ENOSPC is returned then @ins->offset will contain the size of the 4660 * largest available hole the allocator managed to find. 4661 */ 4662 int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, 4663 u64 num_bytes, u64 min_alloc_size, 4664 u64 empty_size, u64 hint_byte, 4665 struct btrfs_key *ins, int is_data, int delalloc) 4666 { 4667 struct btrfs_fs_info *fs_info = root->fs_info; 4668 struct find_free_extent_ctl ffe_ctl = {}; 4669 bool final_tried = num_bytes == min_alloc_size; 4670 u64 flags; 4671 int ret; 4672 bool for_treelog = (btrfs_root_id(root) == BTRFS_TREE_LOG_OBJECTID); 4673 bool for_data_reloc = (btrfs_is_data_reloc_root(root) && is_data); 4674 4675 flags = get_alloc_profile_by_root(root, is_data); 4676 again: 4677 WARN_ON(num_bytes < fs_info->sectorsize); 4678 4679 ffe_ctl.ram_bytes = ram_bytes; 4680 ffe_ctl.num_bytes = num_bytes; 4681 ffe_ctl.min_alloc_size = min_alloc_size; 4682 ffe_ctl.empty_size = empty_size; 4683 ffe_ctl.flags = flags; 4684 ffe_ctl.delalloc = delalloc; 4685 ffe_ctl.hint_byte = hint_byte; 4686 ffe_ctl.for_treelog = for_treelog; 4687 ffe_ctl.for_data_reloc = for_data_reloc; 4688 4689 ret = find_free_extent(root, ins, &ffe_ctl); 4690 if (!ret && !is_data) { 4691 btrfs_dec_block_group_reservations(fs_info, ins->objectid); 4692 } else if (ret == -ENOSPC) { 4693 if (!final_tried && ins->offset) { 4694 num_bytes = min(num_bytes >> 1, ins->offset); 4695 num_bytes = round_down(num_bytes, 4696 fs_info->sectorsize); 4697 num_bytes = max(num_bytes, min_alloc_size); 4698 ram_bytes = num_bytes; 4699 if (num_bytes == min_alloc_size) 4700 final_tried = true; 4701 goto again; 4702 } else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { 4703 struct btrfs_space_info *sinfo; 4704 4705 sinfo = btrfs_find_space_info(fs_info, flags); 4706 btrfs_err(fs_info, 4707 "allocation failed flags %llu, wanted %llu tree-log %d, relocation: %d", 4708 flags, num_bytes, for_treelog, for_data_reloc); 4709 if (sinfo) 4710 btrfs_dump_space_info(fs_info, sinfo, 4711 num_bytes, 1); 4712 } 4713 } 4714 4715 return ret; 4716 } 4717 4718 int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, 4719 u64 start, u64 len, int delalloc) 4720 { 4721 
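/*
 * Illustrative usage sketch (not taken from the original source; "len" and
 * do_something_with() are placeholders): a caller typically pairs
 * btrfs_reserve_extent() with this helper when the reserved range ends up
 * unused, for example on an error path:
 *
 *	struct btrfs_key ins;
 *	int ret;
 *
 *	ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize,
 *				   0, 0, &ins, 1, 0);
 *	if (ret)
 *		return ret;
 *	ret = do_something_with(ins.objectid, ins.offset);
 *	if (ret)
 *		btrfs_free_reserved_extent(fs_info, ins.objectid,
 *					   ins.offset, 0);
 */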
struct btrfs_block_group *cache; 4722 4723 cache = btrfs_lookup_block_group(fs_info, start); 4724 if (!cache) { 4725 btrfs_err(fs_info, "Unable to find block group for %llu", 4726 start); 4727 return -ENOSPC; 4728 } 4729 4730 btrfs_add_free_space(cache, start, len); 4731 btrfs_free_reserved_bytes(cache, len, delalloc); 4732 trace_btrfs_reserved_extent_free(fs_info, start, len); 4733 4734 btrfs_put_block_group(cache); 4735 return 0; 4736 } 4737 4738 int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, 4739 const struct extent_buffer *eb) 4740 { 4741 struct btrfs_block_group *cache; 4742 int ret = 0; 4743 4744 cache = btrfs_lookup_block_group(trans->fs_info, eb->start); 4745 if (!cache) { 4746 btrfs_err(trans->fs_info, "unable to find block group for %llu", 4747 eb->start); 4748 return -ENOSPC; 4749 } 4750 4751 ret = pin_down_extent(trans, cache, eb->start, eb->len, 1); 4752 btrfs_put_block_group(cache); 4753 return ret; 4754 } 4755 4756 static int alloc_reserved_extent(struct btrfs_trans_handle *trans, u64 bytenr, 4757 u64 num_bytes) 4758 { 4759 struct btrfs_fs_info *fs_info = trans->fs_info; 4760 int ret; 4761 4762 ret = remove_from_free_space_tree(trans, bytenr, num_bytes); 4763 if (ret) 4764 return ret; 4765 4766 ret = btrfs_update_block_group(trans, bytenr, num_bytes, true); 4767 if (ret) { 4768 ASSERT(!ret); 4769 btrfs_err(fs_info, "update block group failed for %llu %llu", 4770 bytenr, num_bytes); 4771 return ret; 4772 } 4773 4774 trace_btrfs_reserved_extent_alloc(fs_info, bytenr, num_bytes); 4775 return 0; 4776 } 4777 4778 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 4779 u64 parent, u64 root_objectid, 4780 u64 flags, u64 owner, u64 offset, 4781 struct btrfs_key *ins, int ref_mod, u64 oref_root) 4782 { 4783 struct btrfs_fs_info *fs_info = trans->fs_info; 4784 struct btrfs_root *extent_root; 4785 int ret; 4786 struct btrfs_extent_item *extent_item; 4787 struct btrfs_extent_owner_ref *oref; 4788 struct btrfs_extent_inline_ref *iref; 4789 struct btrfs_path *path; 4790 struct extent_buffer *leaf; 4791 int type; 4792 u32 size; 4793 const bool simple_quota = (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE); 4794 4795 if (parent > 0) 4796 type = BTRFS_SHARED_DATA_REF_KEY; 4797 else 4798 type = BTRFS_EXTENT_DATA_REF_KEY; 4799 4800 size = sizeof(*extent_item); 4801 if (simple_quota) 4802 size += btrfs_extent_inline_ref_size(BTRFS_EXTENT_OWNER_REF_KEY); 4803 size += btrfs_extent_inline_ref_size(type); 4804 4805 path = btrfs_alloc_path(); 4806 if (!path) 4807 return -ENOMEM; 4808 4809 extent_root = btrfs_extent_root(fs_info, ins->objectid); 4810 ret = btrfs_insert_empty_item(trans, extent_root, path, ins, size); 4811 if (ret) { 4812 btrfs_free_path(path); 4813 return ret; 4814 } 4815 4816 leaf = path->nodes[0]; 4817 extent_item = btrfs_item_ptr(leaf, path->slots[0], 4818 struct btrfs_extent_item); 4819 btrfs_set_extent_refs(leaf, extent_item, ref_mod); 4820 btrfs_set_extent_generation(leaf, extent_item, trans->transid); 4821 btrfs_set_extent_flags(leaf, extent_item, 4822 flags | BTRFS_EXTENT_FLAG_DATA); 4823 4824 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); 4825 if (simple_quota) { 4826 btrfs_set_extent_inline_ref_type(leaf, iref, BTRFS_EXTENT_OWNER_REF_KEY); 4827 oref = (struct btrfs_extent_owner_ref *)(&iref->offset); 4828 btrfs_set_extent_owner_ref_root_id(leaf, oref, oref_root); 4829 iref = (struct btrfs_extent_inline_ref *)(oref + 1); 4830 } 4831 btrfs_set_extent_inline_ref_type(leaf, iref, type); 4832 4833 if (parent > 0) { 4834 
struct btrfs_shared_data_ref *ref; 4835 ref = (struct btrfs_shared_data_ref *)(iref + 1); 4836 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 4837 btrfs_set_shared_data_ref_count(leaf, ref, ref_mod); 4838 } else { 4839 struct btrfs_extent_data_ref *ref; 4840 ref = (struct btrfs_extent_data_ref *)(&iref->offset); 4841 btrfs_set_extent_data_ref_root(leaf, ref, root_objectid); 4842 btrfs_set_extent_data_ref_objectid(leaf, ref, owner); 4843 btrfs_set_extent_data_ref_offset(leaf, ref, offset); 4844 btrfs_set_extent_data_ref_count(leaf, ref, ref_mod); 4845 } 4846 4847 btrfs_mark_buffer_dirty(trans, path->nodes[0]); 4848 btrfs_free_path(path); 4849 4850 return alloc_reserved_extent(trans, ins->objectid, ins->offset); 4851 } 4852 4853 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, 4854 struct btrfs_delayed_ref_node *node, 4855 struct btrfs_delayed_extent_op *extent_op) 4856 { 4857 struct btrfs_fs_info *fs_info = trans->fs_info; 4858 struct btrfs_root *extent_root; 4859 int ret; 4860 struct btrfs_extent_item *extent_item; 4861 struct btrfs_key extent_key; 4862 struct btrfs_tree_block_info *block_info; 4863 struct btrfs_extent_inline_ref *iref; 4864 struct btrfs_path *path; 4865 struct extent_buffer *leaf; 4866 u32 size = sizeof(*extent_item) + sizeof(*iref); 4867 u64 flags = extent_op->flags_to_set; 4868 /* The owner of a tree block is the level. */ 4869 int level = btrfs_delayed_ref_owner(node); 4870 bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); 4871 4872 extent_key.objectid = node->bytenr; 4873 if (skinny_metadata) { 4874 /* The owner of a tree block is the level. */ 4875 extent_key.offset = level; 4876 extent_key.type = BTRFS_METADATA_ITEM_KEY; 4877 } else { 4878 extent_key.offset = node->num_bytes; 4879 extent_key.type = BTRFS_EXTENT_ITEM_KEY; 4880 size += sizeof(*block_info); 4881 } 4882 4883 path = btrfs_alloc_path(); 4884 if (!path) 4885 return -ENOMEM; 4886 4887 extent_root = btrfs_extent_root(fs_info, extent_key.objectid); 4888 ret = btrfs_insert_empty_item(trans, extent_root, path, &extent_key, 4889 size); 4890 if (ret) { 4891 btrfs_free_path(path); 4892 return ret; 4893 } 4894 4895 leaf = path->nodes[0]; 4896 extent_item = btrfs_item_ptr(leaf, path->slots[0], 4897 struct btrfs_extent_item); 4898 btrfs_set_extent_refs(leaf, extent_item, 1); 4899 btrfs_set_extent_generation(leaf, extent_item, trans->transid); 4900 btrfs_set_extent_flags(leaf, extent_item, 4901 flags | BTRFS_EXTENT_FLAG_TREE_BLOCK); 4902 4903 if (skinny_metadata) { 4904 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); 4905 } else { 4906 block_info = (struct btrfs_tree_block_info *)(extent_item + 1); 4907 btrfs_set_tree_block_key(leaf, block_info, &extent_op->key); 4908 btrfs_set_tree_block_level(leaf, block_info, level); 4909 iref = (struct btrfs_extent_inline_ref *)(block_info + 1); 4910 } 4911 4912 if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) { 4913 btrfs_set_extent_inline_ref_type(leaf, iref, 4914 BTRFS_SHARED_BLOCK_REF_KEY); 4915 btrfs_set_extent_inline_ref_offset(leaf, iref, node->parent); 4916 } else { 4917 btrfs_set_extent_inline_ref_type(leaf, iref, 4918 BTRFS_TREE_BLOCK_REF_KEY); 4919 btrfs_set_extent_inline_ref_offset(leaf, iref, node->ref_root); 4920 } 4921 4922 btrfs_mark_buffer_dirty(trans, leaf); 4923 btrfs_free_path(path); 4924 4925 return alloc_reserved_extent(trans, node->bytenr, fs_info->nodesize); 4926 } 4927 4928 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 4929 struct btrfs_root *root, u64 owner, 4930 u64 
offset, u64 ram_bytes, 4931 struct btrfs_key *ins) 4932 { 4933 struct btrfs_ref generic_ref = { 4934 .action = BTRFS_ADD_DELAYED_EXTENT, 4935 .bytenr = ins->objectid, 4936 .num_bytes = ins->offset, 4937 .owning_root = btrfs_root_id(root), 4938 .ref_root = btrfs_root_id(root), 4939 }; 4940 4941 ASSERT(generic_ref.ref_root != BTRFS_TREE_LOG_OBJECTID); 4942 4943 if (btrfs_is_data_reloc_root(root) && is_fstree(root->relocation_src_root)) 4944 generic_ref.owning_root = root->relocation_src_root; 4945 4946 btrfs_init_data_ref(&generic_ref, owner, offset, 0, false); 4947 btrfs_ref_tree_mod(root->fs_info, &generic_ref); 4948 4949 return btrfs_add_delayed_data_ref(trans, &generic_ref, ram_bytes); 4950 } 4951 4952 /* 4953 * this is used by the tree logging recovery code. It records that 4954 * an extent has been allocated and makes sure to clear the free 4955 * space cache bits as well 4956 */ 4957 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, 4958 u64 root_objectid, u64 owner, u64 offset, 4959 struct btrfs_key *ins) 4960 { 4961 struct btrfs_fs_info *fs_info = trans->fs_info; 4962 int ret; 4963 struct btrfs_block_group *block_group; 4964 struct btrfs_space_info *space_info; 4965 struct btrfs_squota_delta delta = { 4966 .root = root_objectid, 4967 .num_bytes = ins->offset, 4968 .generation = trans->transid, 4969 .is_data = true, 4970 .is_inc = true, 4971 }; 4972 4973 /* 4974 * Mixed block groups will exclude before processing the log so we only 4975 * need to do the exclude dance if this fs isn't mixed. 4976 */ 4977 if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) { 4978 ret = __exclude_logged_extent(fs_info, ins->objectid, 4979 ins->offset); 4980 if (ret) 4981 return ret; 4982 } 4983 4984 block_group = btrfs_lookup_block_group(fs_info, ins->objectid); 4985 if (!block_group) 4986 return -EINVAL; 4987 4988 space_info = block_group->space_info; 4989 spin_lock(&space_info->lock); 4990 spin_lock(&block_group->lock); 4991 space_info->bytes_reserved += ins->offset; 4992 block_group->reserved += ins->offset; 4993 spin_unlock(&block_group->lock); 4994 spin_unlock(&space_info->lock); 4995 4996 ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner, 4997 offset, ins, 1, root_objectid); 4998 if (ret) 4999 btrfs_pin_extent(trans, ins->objectid, ins->offset, 1); 5000 ret = btrfs_record_squota_delta(fs_info, &delta); 5001 btrfs_put_block_group(block_group); 5002 return ret; 5003 } 5004 5005 #ifdef CONFIG_BTRFS_DEBUG 5006 /* 5007 * Extra safety check in case the extent tree is corrupted and extent allocator 5008 * chooses to use a tree block which is already used and locked. 
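 * If that block is already write-locked by the current task, re-locking it
 * in btrfs_init_new_buffer() would deadlock, so return true and let the
 * caller fail with -EUCLEAN instead.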
5009 */ 5010 static bool check_eb_lock_owner(const struct extent_buffer *eb) 5011 { 5012 if (eb->lock_owner == current->pid) { 5013 btrfs_err_rl(eb->fs_info, 5014 "tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected", 5015 eb->start, btrfs_header_owner(eb), current->pid); 5016 return true; 5017 } 5018 return false; 5019 } 5020 #else 5021 static bool check_eb_lock_owner(struct extent_buffer *eb) 5022 { 5023 return false; 5024 } 5025 #endif 5026 5027 static struct extent_buffer * 5028 btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, 5029 u64 bytenr, int level, u64 owner, 5030 enum btrfs_lock_nesting nest) 5031 { 5032 struct btrfs_fs_info *fs_info = root->fs_info; 5033 struct extent_buffer *buf; 5034 u64 lockdep_owner = owner; 5035 5036 buf = btrfs_find_create_tree_block(fs_info, bytenr, owner, level); 5037 if (IS_ERR(buf)) 5038 return buf; 5039 5040 if (check_eb_lock_owner(buf)) { 5041 free_extent_buffer(buf); 5042 return ERR_PTR(-EUCLEAN); 5043 } 5044 5045 /* 5046 * The reloc trees are just snapshots, so we need them to appear to be 5047 * just like any other fs tree WRT lockdep. 5048 * 5049 * The exception however is in replace_path() in relocation, where we 5050 * hold the lock on the original fs root and then search for the reloc 5051 * root. At that point we need to make sure any reloc root buffers are 5052 * set to the BTRFS_TREE_RELOC_OBJECTID lockdep class in order to make 5053 * lockdep happy. 5054 */ 5055 if (lockdep_owner == BTRFS_TREE_RELOC_OBJECTID && 5056 !test_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &root->state)) 5057 lockdep_owner = BTRFS_FS_TREE_OBJECTID; 5058 5059 /* btrfs_clear_buffer_dirty() accesses generation field. */ 5060 btrfs_set_header_generation(buf, trans->transid); 5061 5062 /* 5063 * This needs to stay, because we could allocate a freed block from an 5064 * old tree into a new tree, so we need to make sure this new block is 5065 * set to the appropriate level and owner. 5066 */ 5067 btrfs_set_buffer_lockdep_class(lockdep_owner, buf, level); 5068 5069 btrfs_tree_lock_nested(buf, nest); 5070 btrfs_clear_buffer_dirty(trans, buf); 5071 clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); 5072 clear_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &buf->bflags); 5073 5074 set_extent_buffer_uptodate(buf); 5075 5076 memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header)); 5077 btrfs_set_header_level(buf, level); 5078 btrfs_set_header_bytenr(buf, buf->start); 5079 btrfs_set_header_generation(buf, trans->transid); 5080 btrfs_set_header_backref_rev(buf, BTRFS_MIXED_BACKREF_REV); 5081 btrfs_set_header_owner(buf, owner); 5082 write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid); 5083 write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid); 5084 if (btrfs_root_id(root) == BTRFS_TREE_LOG_OBJECTID) { 5085 buf->log_index = root->log_transid % 2; 5086 /* 5087 * we allow two log transactions at a time, use different 5088 * EXTENT bit to differentiate dirty pages. 
5089 */ 5090 if (buf->log_index == 0) 5091 set_extent_bit(&root->dirty_log_pages, buf->start, 5092 buf->start + buf->len - 1, 5093 EXTENT_DIRTY, NULL); 5094 else 5095 set_extent_bit(&root->dirty_log_pages, buf->start, 5096 buf->start + buf->len - 1, 5097 EXTENT_NEW, NULL); 5098 } else { 5099 buf->log_index = -1; 5100 set_extent_bit(&trans->transaction->dirty_pages, buf->start, 5101 buf->start + buf->len - 1, EXTENT_DIRTY, NULL); 5102 } 5103 /* this returns a buffer locked for blocking */ 5104 return buf; 5105 } 5106 5107 /* 5108 * finds a free extent and does all the dirty work required for allocation 5109 * returns the tree buffer or an ERR_PTR on error. 5110 */ 5111 struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, 5112 struct btrfs_root *root, 5113 u64 parent, u64 root_objectid, 5114 const struct btrfs_disk_key *key, 5115 int level, u64 hint, 5116 u64 empty_size, 5117 u64 reloc_src_root, 5118 enum btrfs_lock_nesting nest) 5119 { 5120 struct btrfs_fs_info *fs_info = root->fs_info; 5121 struct btrfs_key ins; 5122 struct btrfs_block_rsv *block_rsv; 5123 struct extent_buffer *buf; 5124 struct btrfs_delayed_extent_op *extent_op; 5125 u64 flags = 0; 5126 int ret; 5127 u32 blocksize = fs_info->nodesize; 5128 bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); 5129 u64 owning_root; 5130 5131 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 5132 if (btrfs_is_testing(fs_info)) { 5133 buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr, 5134 level, root_objectid, nest); 5135 if (!IS_ERR(buf)) 5136 root->alloc_bytenr += blocksize; 5137 return buf; 5138 } 5139 #endif 5140 5141 block_rsv = btrfs_use_block_rsv(trans, root, blocksize); 5142 if (IS_ERR(block_rsv)) 5143 return ERR_CAST(block_rsv); 5144 5145 ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize, 5146 empty_size, hint, &ins, 0, 0); 5147 if (ret) 5148 goto out_unuse; 5149 5150 buf = btrfs_init_new_buffer(trans, root, ins.objectid, level, 5151 root_objectid, nest); 5152 if (IS_ERR(buf)) { 5153 ret = PTR_ERR(buf); 5154 goto out_free_reserved; 5155 } 5156 owning_root = btrfs_header_owner(buf); 5157 5158 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { 5159 if (parent == 0) 5160 parent = ins.objectid; 5161 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; 5162 owning_root = reloc_src_root; 5163 } else 5164 BUG_ON(parent > 0); 5165 5166 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { 5167 struct btrfs_ref generic_ref = { 5168 .action = BTRFS_ADD_DELAYED_EXTENT, 5169 .bytenr = ins.objectid, 5170 .num_bytes = ins.offset, 5171 .parent = parent, 5172 .owning_root = owning_root, 5173 .ref_root = root_objectid, 5174 }; 5175 extent_op = btrfs_alloc_delayed_extent_op(); 5176 if (!extent_op) { 5177 ret = -ENOMEM; 5178 goto out_free_buf; 5179 } 5180 if (key) 5181 memcpy(&extent_op->key, key, sizeof(extent_op->key)); 5182 else 5183 memset(&extent_op->key, 0, sizeof(extent_op->key)); 5184 extent_op->flags_to_set = flags; 5185 extent_op->update_key = skinny_metadata ? 
false : true; 5186 extent_op->update_flags = true; 5187 extent_op->level = level; 5188 5189 btrfs_init_tree_ref(&generic_ref, level, btrfs_root_id(root), false); 5190 btrfs_ref_tree_mod(fs_info, &generic_ref); 5191 ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, extent_op); 5192 if (ret) 5193 goto out_free_delayed; 5194 } 5195 return buf; 5196 5197 out_free_delayed: 5198 btrfs_free_delayed_extent_op(extent_op); 5199 out_free_buf: 5200 btrfs_tree_unlock(buf); 5201 free_extent_buffer(buf); 5202 out_free_reserved: 5203 btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0); 5204 out_unuse: 5205 btrfs_unuse_block_rsv(fs_info, block_rsv, blocksize); 5206 return ERR_PTR(ret); 5207 } 5208 5209 struct walk_control { 5210 u64 refs[BTRFS_MAX_LEVEL]; 5211 u64 flags[BTRFS_MAX_LEVEL]; 5212 struct btrfs_key update_progress; 5213 struct btrfs_key drop_progress; 5214 int drop_level; 5215 int stage; 5216 int level; 5217 int shared_level; 5218 int update_ref; 5219 int keep_locks; 5220 int reada_slot; 5221 int reada_count; 5222 int restarted; 5223 }; 5224 5225 #define DROP_REFERENCE 1 5226 #define UPDATE_BACKREF 2 5227 5228 static noinline void reada_walk_down(struct btrfs_trans_handle *trans, 5229 struct btrfs_root *root, 5230 struct walk_control *wc, 5231 struct btrfs_path *path) 5232 { 5233 struct btrfs_fs_info *fs_info = root->fs_info; 5234 u64 bytenr; 5235 u64 generation; 5236 u64 refs; 5237 u64 flags; 5238 u32 nritems; 5239 struct btrfs_key key; 5240 struct extent_buffer *eb; 5241 int ret; 5242 int slot; 5243 int nread = 0; 5244 5245 if (path->slots[wc->level] < wc->reada_slot) { 5246 wc->reada_count = wc->reada_count * 2 / 3; 5247 wc->reada_count = max(wc->reada_count, 2); 5248 } else { 5249 wc->reada_count = wc->reada_count * 3 / 2; 5250 wc->reada_count = min_t(int, wc->reada_count, 5251 BTRFS_NODEPTRS_PER_BLOCK(fs_info)); 5252 } 5253 5254 eb = path->nodes[wc->level]; 5255 nritems = btrfs_header_nritems(eb); 5256 5257 for (slot = path->slots[wc->level]; slot < nritems; slot++) { 5258 if (nread >= wc->reada_count) 5259 break; 5260 5261 cond_resched(); 5262 bytenr = btrfs_node_blockptr(eb, slot); 5263 generation = btrfs_node_ptr_generation(eb, slot); 5264 5265 if (slot == path->slots[wc->level]) 5266 goto reada; 5267 5268 if (wc->stage == UPDATE_BACKREF && 5269 generation <= root->root_key.offset) 5270 continue; 5271 5272 /* We don't lock the tree block, it's OK to be racy here */ 5273 ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, 5274 wc->level - 1, 1, &refs, 5275 &flags, NULL); 5276 /* We don't care about errors in readahead. */ 5277 if (ret < 0) 5278 continue; 5279 BUG_ON(refs == 0); 5280 5281 if (wc->stage == DROP_REFERENCE) { 5282 if (refs == 1) 5283 goto reada; 5284 5285 if (wc->level == 1 && 5286 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 5287 continue; 5288 if (!wc->update_ref || 5289 generation <= root->root_key.offset) 5290 continue; 5291 btrfs_node_key_to_cpu(eb, &key, slot); 5292 ret = btrfs_comp_cpu_keys(&key, 5293 &wc->update_progress); 5294 if (ret < 0) 5295 continue; 5296 } else { 5297 if (wc->level == 1 && 5298 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 5299 continue; 5300 } 5301 reada: 5302 btrfs_readahead_node_child(eb, slot); 5303 nread++; 5304 } 5305 wc->reada_slot = slot; 5306 } 5307 5308 /* 5309 * helper to process tree block while walking down the tree. 5310 * 5311 * when wc->stage == UPDATE_BACKREF, this function updates 5312 * back refs for pointers in the block. 5313 * 5314 * NOTE: return value 1 means we should stop walking down. 
5315 */ 5316 static noinline int walk_down_proc(struct btrfs_trans_handle *trans, 5317 struct btrfs_root *root, 5318 struct btrfs_path *path, 5319 struct walk_control *wc, int lookup_info) 5320 { 5321 struct btrfs_fs_info *fs_info = root->fs_info; 5322 int level = wc->level; 5323 struct extent_buffer *eb = path->nodes[level]; 5324 u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF; 5325 int ret; 5326 5327 if (wc->stage == UPDATE_BACKREF && btrfs_header_owner(eb) != btrfs_root_id(root)) 5328 return 1; 5329 5330 /* 5331 * when reference count of tree block is 1, it won't increase 5332 * again. once full backref flag is set, we never clear it. 5333 */ 5334 if (lookup_info && 5335 ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) || 5336 (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) { 5337 BUG_ON(!path->locks[level]); 5338 ret = btrfs_lookup_extent_info(trans, fs_info, 5339 eb->start, level, 1, 5340 &wc->refs[level], 5341 &wc->flags[level], 5342 NULL); 5343 BUG_ON(ret == -ENOMEM); 5344 if (ret) 5345 return ret; 5346 BUG_ON(wc->refs[level] == 0); 5347 } 5348 5349 if (wc->stage == DROP_REFERENCE) { 5350 if (wc->refs[level] > 1) 5351 return 1; 5352 5353 if (path->locks[level] && !wc->keep_locks) { 5354 btrfs_tree_unlock_rw(eb, path->locks[level]); 5355 path->locks[level] = 0; 5356 } 5357 return 0; 5358 } 5359 5360 /* wc->stage == UPDATE_BACKREF */ 5361 if (!(wc->flags[level] & flag)) { 5362 BUG_ON(!path->locks[level]); 5363 ret = btrfs_inc_ref(trans, root, eb, 1); 5364 BUG_ON(ret); /* -ENOMEM */ 5365 ret = btrfs_dec_ref(trans, root, eb, 0); 5366 BUG_ON(ret); /* -ENOMEM */ 5367 ret = btrfs_set_disk_extent_flags(trans, eb, flag); 5368 BUG_ON(ret); /* -ENOMEM */ 5369 wc->flags[level] |= flag; 5370 } 5371 5372 /* 5373 * the block is shared by multiple trees, so it's not good to 5374 * keep the tree lock 5375 */ 5376 if (path->locks[level] && level > 0) { 5377 btrfs_tree_unlock_rw(eb, path->locks[level]); 5378 path->locks[level] = 0; 5379 } 5380 return 0; 5381 } 5382 5383 /* 5384 * This is used to verify a ref exists for this root to deal with a bug where we 5385 * would have a drop_progress key that hadn't been updated properly. 5386 */ 5387 static int check_ref_exists(struct btrfs_trans_handle *trans, 5388 struct btrfs_root *root, u64 bytenr, u64 parent, 5389 int level) 5390 { 5391 struct btrfs_path *path; 5392 struct btrfs_extent_inline_ref *iref; 5393 int ret; 5394 5395 path = btrfs_alloc_path(); 5396 if (!path) 5397 return -ENOMEM; 5398 5399 ret = lookup_extent_backref(trans, path, &iref, bytenr, 5400 root->fs_info->nodesize, parent, 5401 btrfs_root_id(root), level, 0); 5402 btrfs_free_path(path); 5403 if (ret == -ENOENT) 5404 return 0; 5405 if (ret < 0) 5406 return ret; 5407 return 1; 5408 } 5409 5410 /* 5411 * helper to process tree block pointer. 5412 * 5413 * when wc->stage == DROP_REFERENCE, this function checks 5414 * reference count of the block pointed to. if the block 5415 * is shared and we need update back refs for the subtree 5416 * rooted at the block, this function changes wc->stage to 5417 * UPDATE_BACKREF. if the block is shared and there is no 5418 * need to update back, this function drops the reference 5419 * to the block. 5420 * 5421 * NOTE: return value 1 means we should stop walking down. 
5422 */ 5423 static noinline int do_walk_down(struct btrfs_trans_handle *trans, 5424 struct btrfs_root *root, 5425 struct btrfs_path *path, 5426 struct walk_control *wc, int *lookup_info) 5427 { 5428 struct btrfs_fs_info *fs_info = root->fs_info; 5429 u64 bytenr; 5430 u64 generation; 5431 u64 owner_root = 0; 5432 struct btrfs_tree_parent_check check = { 0 }; 5433 struct btrfs_key key; 5434 struct extent_buffer *next; 5435 int level = wc->level; 5436 int reada = 0; 5437 int ret = 0; 5438 bool need_account = false; 5439 5440 generation = btrfs_node_ptr_generation(path->nodes[level], 5441 path->slots[level]); 5442 /* 5443 * if the lower level block was created before the snapshot 5444 * was created, we know there is no need to update back refs 5445 * for the subtree 5446 */ 5447 if (wc->stage == UPDATE_BACKREF && 5448 generation <= root->root_key.offset) { 5449 *lookup_info = 1; 5450 return 1; 5451 } 5452 5453 bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); 5454 5455 check.level = level - 1; 5456 check.transid = generation; 5457 check.owner_root = btrfs_root_id(root); 5458 check.has_first_key = true; 5459 btrfs_node_key_to_cpu(path->nodes[level], &check.first_key, 5460 path->slots[level]); 5461 5462 next = find_extent_buffer(fs_info, bytenr); 5463 if (!next) { 5464 next = btrfs_find_create_tree_block(fs_info, bytenr, 5465 btrfs_root_id(root), level - 1); 5466 if (IS_ERR(next)) 5467 return PTR_ERR(next); 5468 reada = 1; 5469 } 5470 btrfs_tree_lock(next); 5471 5472 ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1, 5473 &wc->refs[level - 1], 5474 &wc->flags[level - 1], 5475 &owner_root); 5476 if (ret < 0) 5477 goto out_unlock; 5478 5479 if (unlikely(wc->refs[level - 1] == 0)) { 5480 btrfs_err(fs_info, "Missing references."); 5481 ret = -EIO; 5482 goto out_unlock; 5483 } 5484 *lookup_info = 0; 5485 5486 if (wc->stage == DROP_REFERENCE) { 5487 if (wc->refs[level - 1] > 1) { 5488 need_account = true; 5489 if (level == 1 && 5490 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 5491 goto skip; 5492 5493 if (!wc->update_ref || 5494 generation <= root->root_key.offset) 5495 goto skip; 5496 5497 btrfs_node_key_to_cpu(path->nodes[level], &key, 5498 path->slots[level]); 5499 ret = btrfs_comp_cpu_keys(&key, &wc->update_progress); 5500 if (ret < 0) 5501 goto skip; 5502 5503 wc->stage = UPDATE_BACKREF; 5504 wc->shared_level = level - 1; 5505 } 5506 } else { 5507 if (level == 1 && 5508 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 5509 goto skip; 5510 } 5511 5512 if (!btrfs_buffer_uptodate(next, generation, 0)) { 5513 btrfs_tree_unlock(next); 5514 free_extent_buffer(next); 5515 next = NULL; 5516 *lookup_info = 1; 5517 } 5518 5519 if (!next) { 5520 if (reada && level == 1) 5521 reada_walk_down(trans, root, wc, path); 5522 next = read_tree_block(fs_info, bytenr, &check); 5523 if (IS_ERR(next)) { 5524 return PTR_ERR(next); 5525 } else if (!extent_buffer_uptodate(next)) { 5526 free_extent_buffer(next); 5527 return -EIO; 5528 } 5529 btrfs_tree_lock(next); 5530 } 5531 5532 level--; 5533 ASSERT(level == btrfs_header_level(next)); 5534 if (level != btrfs_header_level(next)) { 5535 btrfs_err(root->fs_info, "mismatched level"); 5536 ret = -EIO; 5537 goto out_unlock; 5538 } 5539 path->nodes[level] = next; 5540 path->slots[level] = 0; 5541 path->locks[level] = BTRFS_WRITE_LOCK; 5542 wc->level = level; 5543 if (wc->level == 1) 5544 wc->reada_slot = 0; 5545 return 0; 5546 skip: 5547 wc->refs[level - 1] = 0; 5548 wc->flags[level - 1] = 0; 5549 if (wc->stage == DROP_REFERENCE) { 5550 
struct btrfs_ref ref = { 5551 .action = BTRFS_DROP_DELAYED_REF, 5552 .bytenr = bytenr, 5553 .num_bytes = fs_info->nodesize, 5554 .owning_root = owner_root, 5555 .ref_root = btrfs_root_id(root), 5556 }; 5557 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) { 5558 ref.parent = path->nodes[level]->start; 5559 } else { 5560 ASSERT(btrfs_root_id(root) == 5561 btrfs_header_owner(path->nodes[level])); 5562 if (btrfs_root_id(root) != 5563 btrfs_header_owner(path->nodes[level])) { 5564 btrfs_err(root->fs_info, 5565 "mismatched block owner"); 5566 ret = -EIO; 5567 goto out_unlock; 5568 } 5569 } 5570 5571 /* 5572 * If we had a drop_progress we need to verify the refs are set 5573 * as expected. If we find our ref then we know that from here 5574 * on out everything should be correct, and we can clear the 5575 * ->restarted flag. 5576 */ 5577 if (wc->restarted) { 5578 ret = check_ref_exists(trans, root, bytenr, ref.parent, 5579 level - 1); 5580 if (ret < 0) 5581 goto out_unlock; 5582 if (ret == 0) 5583 goto no_delete; 5584 ret = 0; 5585 wc->restarted = 0; 5586 } 5587 5588 /* 5589 * Reloc tree doesn't contribute to qgroup numbers, and we have 5590 * already accounted them at merge time (replace_path), 5591 * thus we could skip expensive subtree trace here. 5592 */ 5593 if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID && need_account) { 5594 ret = btrfs_qgroup_trace_subtree(trans, next, 5595 generation, level - 1); 5596 if (ret) { 5597 btrfs_err_rl(fs_info, 5598 "Error %d accounting shared subtree. Quota is out of sync, rescan required.", 5599 ret); 5600 } 5601 } 5602 5603 /* 5604 * We need to update the next key in our walk control so we can 5605 * update the drop_progress key accordingly. We don't care if 5606 * find_next_key doesn't find a key because that means we're at 5607 * the end and are going to clean up now. 5608 */ 5609 wc->drop_level = level; 5610 find_next_key(path, level, &wc->drop_progress); 5611 5612 btrfs_init_tree_ref(&ref, level - 1, 0, false); 5613 ret = btrfs_free_extent(trans, &ref); 5614 if (ret) 5615 goto out_unlock; 5616 } 5617 no_delete: 5618 *lookup_info = 1; 5619 ret = 1; 5620 5621 out_unlock: 5622 btrfs_tree_unlock(next); 5623 free_extent_buffer(next); 5624 5625 return ret; 5626 } 5627 5628 /* 5629 * helper to process tree block while walking up the tree. 5630 * 5631 * when wc->stage == DROP_REFERENCE, this function drops 5632 * reference count on the block. 5633 * 5634 * when wc->stage == UPDATE_BACKREF, this function changes 5635 * wc->stage back to DROP_REFERENCE if we changed wc->stage 5636 * to UPDATE_BACKREF previously while processing the block. 5637 * 5638 * NOTE: return value 1 means we should stop walking up. 5639 */ 5640 static noinline int walk_up_proc(struct btrfs_trans_handle *trans, 5641 struct btrfs_root *root, 5642 struct btrfs_path *path, 5643 struct walk_control *wc) 5644 { 5645 struct btrfs_fs_info *fs_info = root->fs_info; 5646 int ret; 5647 int level = wc->level; 5648 struct extent_buffer *eb = path->nodes[level]; 5649 u64 parent = 0; 5650 5651 if (wc->stage == UPDATE_BACKREF) { 5652 BUG_ON(wc->shared_level < level); 5653 if (level < wc->shared_level) 5654 goto out; 5655 5656 ret = find_next_key(path, level + 1, &wc->update_progress); 5657 if (ret > 0) 5658 wc->update_ref = 0; 5659 5660 wc->stage = DROP_REFERENCE; 5661 wc->shared_level = -1; 5662 path->slots[level] = 0; 5663 5664 /* 5665 * check reference count again if the block isn't locked. 5666 * we should start walking down the tree again if reference 5667 * count is one. 
		 */
		if (!path->locks[level]) {
			BUG_ON(level == 0);
			btrfs_tree_lock(eb);
			path->locks[level] = BTRFS_WRITE_LOCK;

			ret = btrfs_lookup_extent_info(trans, fs_info,
						       eb->start, level, 1,
						       &wc->refs[level],
						       &wc->flags[level],
						       NULL);
			if (ret < 0) {
				btrfs_tree_unlock_rw(eb, path->locks[level]);
				path->locks[level] = 0;
				return ret;
			}
			BUG_ON(wc->refs[level] == 0);
			if (wc->refs[level] == 1) {
				btrfs_tree_unlock_rw(eb, path->locks[level]);
				path->locks[level] = 0;
				return 1;
			}
		}
	}

	/* wc->stage == DROP_REFERENCE */
	BUG_ON(wc->refs[level] > 1 && !path->locks[level]);

	if (wc->refs[level] == 1) {
		if (level == 0) {
			if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
				ret = btrfs_dec_ref(trans, root, eb, 1);
			else
				ret = btrfs_dec_ref(trans, root, eb, 0);
			BUG_ON(ret); /* -ENOMEM */
			if (is_fstree(btrfs_root_id(root))) {
				ret = btrfs_qgroup_trace_leaf_items(trans, eb);
				if (ret) {
					btrfs_err_rl(fs_info,
						     "error %d accounting leaf items, quota is out of sync, rescan required",
						     ret);
				}
			}
		}
		/* Make block locked assertion in btrfs_clear_buffer_dirty happy. */
		if (!path->locks[level]) {
			btrfs_tree_lock(eb);
			path->locks[level] = BTRFS_WRITE_LOCK;
		}
		btrfs_clear_buffer_dirty(trans, eb);
	}

	if (eb == root->node) {
		if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
			parent = eb->start;
		else if (btrfs_root_id(root) != btrfs_header_owner(eb))
			goto owner_mismatch;
	} else {
		if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
			parent = path->nodes[level + 1]->start;
		else if (btrfs_root_id(root) !=
			 btrfs_header_owner(path->nodes[level + 1]))
			goto owner_mismatch;
	}

	btrfs_free_tree_block(trans, btrfs_root_id(root), eb, parent,
			      wc->refs[level] == 1);
out:
	wc->refs[level] = 0;
	wc->flags[level] = 0;
	return 0;

owner_mismatch:
	btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu",
		     btrfs_header_owner(eb), btrfs_root_id(root));
	return -EUCLEAN;
}

static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct btrfs_path *path,
				   struct walk_control *wc)
{
	int level = wc->level;
	int lookup_info = 1;
	int ret = 0;

	while (level >= 0) {
		ret = walk_down_proc(trans, root, path, wc, lookup_info);
		if (ret)
			break;

		if (level == 0)
			break;

		if (path->slots[level] >=
		    btrfs_header_nritems(path->nodes[level]))
			break;

		ret = do_walk_down(trans, root, path, wc, &lookup_info);
		if (ret > 0) {
			path->slots[level]++;
			continue;
		} else if (ret < 0)
			break;
		level = wc->level;
	}
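	/*
	 * A return value of 1 from walk_down_proc() only means that we are
	 * done descending at this point (for example the block is shared and
	 * has already been dealt with); it is not an error, so report success
	 * and let the caller continue with walk_up_tree().
	 */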
	return (ret == 1) ? 0 : ret;
}

static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct walk_control *wc, int max_level)
{
	int level = wc->level;
	int ret;

	path->slots[level] = btrfs_header_nritems(path->nodes[level]);
	while (level < max_level && path->nodes[level]) {
		wc->level = level;
		if (path->slots[level] + 1 <
		    btrfs_header_nritems(path->nodes[level])) {
			path->slots[level]++;
			return 0;
		} else {
			ret = walk_up_proc(trans, root, path, wc);
			if (ret > 0)
				return 0;
			if (ret < 0)
				return ret;

			if (path->locks[level]) {
				btrfs_tree_unlock_rw(path->nodes[level],
						     path->locks[level]);
				path->locks[level] = 0;
			}
			free_extent_buffer(path->nodes[level]);
			path->nodes[level] = NULL;
			level++;
		}
	}
	return 1;
}

/*
 * drop a subvolume tree.
 *
 * this function traverses the tree, freeing any blocks that are only
 * referenced by this tree.
 *
 * when a shared tree block is found, this function decreases its
 * reference count by one. if update_ref is true, this function
 * also makes sure backrefs for the shared block and all lower level
 * blocks are properly updated.
 *
 * If called with for_reloc == 0, may exit early with -EAGAIN.
 */
int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
{
	const bool is_reloc_root = (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID);
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_path *path;
	struct btrfs_trans_handle *trans;
	struct btrfs_root *tree_root = fs_info->tree_root;
	struct btrfs_root_item *root_item = &root->root_item;
	struct walk_control *wc;
	struct btrfs_key key;
	int err = 0;
	int ret;
	int level;
	bool root_dropped = false;
	bool unfinished_drop = false;

	btrfs_debug(fs_info, "Drop subvolume %llu", btrfs_root_id(root));

	path = btrfs_alloc_path();
	if (!path) {
		err = -ENOMEM;
		goto out;
	}

	wc = kzalloc(sizeof(*wc), GFP_NOFS);
	if (!wc) {
		btrfs_free_path(path);
		err = -ENOMEM;
		goto out;
	}

	/*
	 * Use join to avoid potential EINTR from transaction start. See
	 * wait_reserve_ticket and the whole reservation callchain.
	 */
	if (for_reloc)
		trans = btrfs_join_transaction(tree_root);
	else
		trans = btrfs_start_transaction(tree_root, 0);
	if (IS_ERR(trans)) {
		err = PTR_ERR(trans);
		goto out_free;
	}

	err = btrfs_run_delayed_items(trans);
	if (err)
		goto out_end_trans;

	/*
	 * This will help us catch people modifying the fs tree while we're
	 * dropping it. It is unsafe to mess with the fs tree while it's being
	 * dropped as we unlock the root node and parent nodes as we walk down
	 * the tree, assuming nothing will change. If something does change
	 * then we'll have stale information and drop references to blocks we've
	 * already dropped.
	 */
	set_bit(BTRFS_ROOT_DELETING, &root->state);
	unfinished_drop = test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state);

	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
		level = btrfs_header_level(root->node);
		path->nodes[level] = btrfs_lock_root_node(root);
		path->slots[level] = 0;
		path->locks[level] = BTRFS_WRITE_LOCK;
		memset(&wc->update_progress, 0,
		       sizeof(wc->update_progress));
	} else {
		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
		memcpy(&wc->update_progress, &key,
		       sizeof(wc->update_progress));

		level = btrfs_root_drop_level(root_item);
		BUG_ON(level == 0);
		path->lowest_level = level;
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		path->lowest_level = 0;
		if (ret < 0) {
			err = ret;
			goto out_end_trans;
		}
		WARN_ON(ret > 0);

		/*
		 * unlock our path, this is safe because only this
		 * function is allowed to delete this snapshot
		 */
		btrfs_unlock_up_safe(path, 0);

		level = btrfs_header_level(root->node);
		while (1) {
			btrfs_tree_lock(path->nodes[level]);
			path->locks[level] = BTRFS_WRITE_LOCK;

			ret = btrfs_lookup_extent_info(trans, fs_info,
						       path->nodes[level]->start,
						       level, 1, &wc->refs[level],
						       &wc->flags[level], NULL);
			if (ret < 0) {
				err = ret;
				goto out_end_trans;
			}
			BUG_ON(wc->refs[level] == 0);

			if (level == btrfs_root_drop_level(root_item))
				break;

			btrfs_tree_unlock(path->nodes[level]);
			path->locks[level] = 0;
			WARN_ON(wc->refs[level] != 1);
			level--;
		}
	}

	wc->restarted = test_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
	wc->level = level;
	wc->shared_level = -1;
	wc->stage = DROP_REFERENCE;
	wc->update_ref = update_ref;
	wc->keep_locks = 0;
	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);

	while (1) {

		ret = walk_down_tree(trans, root, path, wc);
		if (ret < 0) {
			btrfs_abort_transaction(trans, ret);
			err = ret;
			break;
		}

		ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
		if (ret < 0) {
			btrfs_abort_transaction(trans, ret);
			err = ret;
			break;
		}

		if (ret > 0) {
			BUG_ON(wc->stage != DROP_REFERENCE);
			break;
		}

		if (wc->stage == DROP_REFERENCE) {
			wc->drop_level = wc->level;
			btrfs_node_key_to_cpu(path->nodes[wc->drop_level],
					      &wc->drop_progress,
					      path->slots[wc->drop_level]);
		}
		btrfs_cpu_key_to_disk(&root_item->drop_progress,
				      &wc->drop_progress);
		btrfs_set_root_drop_level(root_item, wc->drop_level);

		BUG_ON(wc->level == 0);
		if (btrfs_should_end_transaction(trans) ||
		    (!for_reloc && btrfs_need_cleaner_sleep(fs_info))) {
			ret = btrfs_update_root(trans, tree_root,
						&root->root_key,
						root_item);
			if (ret) {
				btrfs_abort_transaction(trans, ret);
				err = ret;
				goto out_end_trans;
			}

			if (!is_reloc_root)
				btrfs_set_last_root_drop_gen(fs_info, trans->transid);

			btrfs_end_transaction_throttle(trans);
			if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) {
				btrfs_debug(fs_info,
					    "drop snapshot early exit");
				err = -EAGAIN;
				goto out_free;
			}

			/*
			 * Use join to avoid potential EINTR from transaction
			 * start. See wait_reserve_ticket and the whole
			 * reservation callchain.
			 */
			if (for_reloc)
				trans = btrfs_join_transaction(tree_root);
			else
				trans = btrfs_start_transaction(tree_root, 0);
			if (IS_ERR(trans)) {
				err = PTR_ERR(trans);
				goto out_free;
			}
		}
	}
	btrfs_release_path(path);
	if (err)
		goto out_end_trans;

	ret = btrfs_del_root(trans, &root->root_key);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		err = ret;
		goto out_end_trans;
	}

	if (!is_reloc_root) {
		ret = btrfs_find_root(tree_root, &root->root_key, path,
				      NULL, NULL);
		if (ret < 0) {
			btrfs_abort_transaction(trans, ret);
			err = ret;
			goto out_end_trans;
		} else if (ret > 0) {
			/*
			 * If we fail to delete the orphan item this time
			 * around, it'll get picked up the next time.
			 *
			 * The most common failure here is just -ENOENT.
			 */
			btrfs_del_orphan_item(trans, tree_root, btrfs_root_id(root));
		}
	}

	/*
	 * This subvolume is going to be completely dropped, and won't be
	 * recorded as dirty roots, thus pertrans meta rsv will not be freed at
	 * commit transaction time. So free it here manually.
	 */
	btrfs_qgroup_convert_reserved_meta(root, INT_MAX);
	btrfs_qgroup_free_meta_all_pertrans(root);

	if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state))
		btrfs_add_dropped_root(trans, root);
	else
		btrfs_put_root(root);
	root_dropped = true;
out_end_trans:
	if (!is_reloc_root)
		btrfs_set_last_root_drop_gen(fs_info, trans->transid);

	btrfs_end_transaction_throttle(trans);
out_free:
	kfree(wc);
	btrfs_free_path(path);
out:
	/*
	 * If we were an unfinished drop root, check to see if there are any
	 * others still pending and, if not, clear the state and wake up any
	 * waiters.
	 */
	if (!err && unfinished_drop)
		btrfs_maybe_wake_unfinished_drop(fs_info);

	/*
	 * If we had to stop dropping the snapshot for whatever reason, make
	 * sure to add it back to the dead root list so that we keep trying to
	 * do the work later. This also cleans up roots we don't have in the
	 * radix (like when we recover after a power failure or unmount) so we
	 * don't leak memory.
	 */
	if (!for_reloc && !root_dropped)
		btrfs_add_dead_root(root);
	return err;
}
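
/*
 * Note on callers (an informational addition, not from the original source):
 * subvolume deletion is normally deferred -- the deletion path only puts the
 * root on the dead roots list, and the cleaner thread later calls
 * btrfs_drop_snapshot() for it, possibly resuming across mounts via the
 * orphan item.  That is why the function above keeps persisting its position
 * in root_item->drop_progress instead of requiring the whole drop to finish
 * in one transaction.
 */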
/*
 * drop subtree rooted at tree block 'node'.
 *
 * NOTE: this function will unlock and release tree block 'node'.
 *
 * Only used by relocation code.
 */
int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
			struct btrfs_root *root,
			struct extent_buffer *node,
			struct extent_buffer *parent)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_path *path;
	struct walk_control *wc;
	int level;
	int parent_level;
	int ret = 0;

	BUG_ON(btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	wc = kzalloc(sizeof(*wc), GFP_NOFS);
	if (!wc) {
		btrfs_free_path(path);
		return -ENOMEM;
	}

	btrfs_assert_tree_write_locked(parent);
	parent_level = btrfs_header_level(parent);
	atomic_inc(&parent->refs);
	path->nodes[parent_level] = parent;
	path->slots[parent_level] = btrfs_header_nritems(parent);

	btrfs_assert_tree_write_locked(node);
	level = btrfs_header_level(node);
	path->nodes[level] = node;
	path->slots[level] = 0;
	path->locks[level] = BTRFS_WRITE_LOCK;

	wc->refs[parent_level] = 1;
	wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
	wc->level = level;
	wc->shared_level = -1;
	wc->stage = DROP_REFERENCE;
	wc->update_ref = 0;
	wc->keep_locks = 1;
	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);

	while (1) {
		ret = walk_down_tree(trans, root, path, wc);
		if (ret < 0)
			break;

		ret = walk_up_tree(trans, root, path, wc, parent_level);
		if (ret) {
			if (ret > 0)
				ret = 0;
			break;
		}
	}

	kfree(wc);
	btrfs_free_path(path);
	return ret;
}

/*
 * Unpin the extent range in an error context and don't add the space back.
 * Errors are not propagated further.
 */
void btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end)
{
	unpin_extent_range(fs_info, start, end, false);
}

/*
 * It used to be that old block groups would be left around forever.
 * Iterating over them would be enough to trim unused space. Since we
 * now automatically remove them, we also need to iterate over unallocated
 * space.
 *
 * We don't want a transaction for this since the discard may take a
 * substantial amount of time. We don't require that a transaction be
 * running, but we do need to take a running transaction into account
 * to ensure that we're not discarding chunks that were released or
 * allocated in the current transaction.
 *
 * Holding the chunks lock will prevent other threads from allocating
 * or releasing chunks, but it won't prevent a running transaction
 * from committing and releasing the memory that the pending chunks
 * list head uses. For that, we need to take a reference to the
 * transaction and hold the commit root sem. We only need to hold
 * it while performing the free space search since we have already
 * held back allocations.
 */
static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
{
	u64 start = BTRFS_DEVICE_RANGE_RESERVED, len = 0, end = 0;
	int ret;

	*trimmed = 0;

	/* Discard not supported = nothing to do. */
	if (!bdev_max_discard_sectors(device->bdev))
		return 0;

	/* Not writable = nothing to do. */
	if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
		return 0;

	/* No free space = nothing to do. */
	if (device->total_bytes <= device->bytes_used)
		return 0;

	ret = 0;

	while (1) {
		struct btrfs_fs_info *fs_info = device->fs_info;
		u64 bytes;

		ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
		if (ret)
			break;

		find_first_clear_extent_bit(&device->alloc_state, start,
					    &start, &end,
					    CHUNK_TRIMMED | CHUNK_ALLOCATED);

		/* Check if there are any CHUNK_* bits left */
		if (start > device->total_bytes) {
			WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
			btrfs_warn_in_rcu(fs_info,
				"ignoring attempt to trim beyond device size: offset %llu length %llu device %s device size %llu",
					  start, end - start + 1,
					  btrfs_dev_name(device),
					  device->total_bytes);
			mutex_unlock(&fs_info->chunk_mutex);
			ret = 0;
			break;
		}

		/* Ensure we skip the reserved space on each device. */
		start = max_t(u64, start, BTRFS_DEVICE_RANGE_RESERVED);

		/*
		 * If find_first_clear_extent_bit() finds a range that spans the
		 * end of the device it will set end to -1; in this case it's up
		 * to the caller to trim the value to the size of the device.
		 */
		end = min(end, device->total_bytes - 1);

		len = end - start + 1;

		/* We didn't find any extents */
		if (!len) {
			mutex_unlock(&fs_info->chunk_mutex);
			ret = 0;
			break;
		}

		ret = btrfs_issue_discard(device->bdev, start, len,
					  &bytes);
		if (!ret)
			set_extent_bit(&device->alloc_state, start,
				       start + bytes - 1, CHUNK_TRIMMED, NULL);
		mutex_unlock(&fs_info->chunk_mutex);

		if (ret)
			break;

		start += len;
		*trimmed += bytes;

		if (fatal_signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}

		cond_resched();
	}

	return ret;
}

/*
 * Trim the whole filesystem by:
 * 1) trimming the free space in each block group
 * 2) trimming the unallocated space on each device
 *
 * This will also continue trimming even if a block group or device encounters
 * an error. The return value will be the last error, or 0 if nothing bad
 * happens.
 */
int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_block_group *cache = NULL;
	struct btrfs_device *device;
	u64 group_trimmed;
	u64 range_end = U64_MAX;
	u64 start;
	u64 end;
	u64 trimmed = 0;
	u64 bg_failed = 0;
	u64 dev_failed = 0;
	int bg_ret = 0;
	int dev_ret = 0;
	int ret = 0;

	if (range->start == U64_MAX)
		return -EINVAL;

	/*
	 * Check range overflow if range->len is set.
	 * The default range->len is U64_MAX.
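	 *
	 * For example, start = U64_MAX - 16M with len = 32M would wrap
	 * around; check_add_overflow() below catches this and we return
	 * -EINVAL instead of trimming a bogus range.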
	 */
	if (range->len != U64_MAX &&
	    check_add_overflow(range->start, range->len, &range_end))
		return -EINVAL;

	cache = btrfs_lookup_first_block_group(fs_info, range->start);
	for (; cache; cache = btrfs_next_block_group(cache)) {
		if (cache->start >= range_end) {
			btrfs_put_block_group(cache);
			break;
		}

		start = max(range->start, cache->start);
		end = min(range_end, cache->start + cache->length);

		if (end - start >= range->minlen) {
			if (!btrfs_block_group_done(cache)) {
				ret = btrfs_cache_block_group(cache, true);
				if (ret) {
					bg_failed++;
					bg_ret = ret;
					continue;
				}
			}
			ret = btrfs_trim_block_group(cache,
						     &group_trimmed,
						     start,
						     end,
						     range->minlen);

			trimmed += group_trimmed;
			if (ret) {
				bg_failed++;
				bg_ret = ret;
				continue;
			}
		}
	}

	if (bg_failed)
		btrfs_warn(fs_info,
			"failed to trim %llu block group(s), last error %d",
			bg_failed, bg_ret);

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
			continue;

		ret = btrfs_trim_free_extents(device, &group_trimmed);
		if (ret) {
			dev_failed++;
			dev_ret = ret;
			break;
		}

		trimmed += group_trimmed;
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	if (dev_failed)
		btrfs_warn(fs_info,
			"failed to trim %llu device(s), last error %d",
			dev_failed, dev_ret);
	range->len = trimmed;
	if (bg_ret)
		return bg_ret;
	return dev_ret;
}
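
/*
 * Illustrative sketch (an addition for exposition, not part of the btrfs
 * sources): btrfs_trim_fs() above is reached through the generic FITRIM
 * ioctl, and the fstrim_range it receives comes straight from user space;
 * the hypothetical trim_mount() helper below only shows that round trip.
 * A minimal caller looks roughly like this:
 *
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *	#include <sys/ioctl.h>
 *	#include <linux/fs.h>		// FITRIM, struct fstrim_range
 *
 *	int trim_mount(const char *mnt)
 *	{
 *		struct fstrim_range range = {
 *			.start = 0,
 *			.len = (__u64)-1,	// whole filesystem
 *			.minlen = 0,		// the ioctl clamps this upward
 *		};
 *		int fd = open(mnt, O_RDONLY);
 *		int ret;
 *
 *		if (fd < 0)
 *			return -1;
 *		ret = ioctl(fd, FITRIM, &range);
 *		if (ret == 0)
 *			// On success range.len holds the number of bytes
 *			// trimmed (range->len = trimmed in btrfs_trim_fs()).
 *			printf("trimmed %llu bytes\n",
 *			       (unsigned long long)range.len);
 *		close(fd);
 *		return ret;
 *	}
 */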