// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 */

#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/sort.h>
#include <linux/rcupdate.h>
#include <linux/kthread.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/percpu_counter.h>
#include <linux/lockdep.h>
#include <linux/crc32c.h>
#include "ctree.h"
#include "extent-tree.h"
#include "transaction.h"
#include "disk-io.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "locking.h"
#include "free-space-cache.h"
#include "free-space-tree.h"
#include "qgroup.h"
#include "ref-verify.h"
#include "space-info.h"
#include "block-rsv.h"
#include "discard.h"
#include "zoned.h"
#include "dev-replace.h"
#include "fs.h"
#include "accessors.h"
#include "root-tree.h"
#include "file-item.h"
#include "orphan.h"
#include "tree-checker.h"
#include "raid-stripe-tree.h"

#undef SCRAMBLE_DELAYED_REFS


static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
			       struct btrfs_delayed_ref_head *href,
			       struct btrfs_delayed_ref_node *node,
			       struct btrfs_delayed_extent_op *extra_op);
static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
				    struct extent_buffer *leaf,
				    struct btrfs_extent_item *ei);
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
				      u64 parent, u64 root_objectid,
				      u64 flags, u64 owner, u64 offset,
				      struct btrfs_key *ins, int ref_mod, u64 oref_root);
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
				     struct btrfs_delayed_ref_node *node,
				     struct btrfs_delayed_extent_op *extent_op);
static int find_next_key(struct btrfs_path *path, int level,
			 struct btrfs_key *key);

static int block_group_bits(struct btrfs_block_group *cache, u64 bits)
{
	return (cache->flags & bits) == bits;
}

/* simple helper to search for an existing data extent at a given offset */
int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
{
	struct btrfs_root *root = btrfs_extent_root(fs_info, start);
	struct btrfs_key key;
	BTRFS_PATH_AUTO_FREE(path);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = start;
	key.type = BTRFS_EXTENT_ITEM_KEY;
	key.offset = len;
	return btrfs_search_slot(NULL, root, &key, path, 0, 0);
}

/*
 * helper function to lookup reference count and flags of a tree block.
 *
 * the head node for delayed ref is used to store the sum of all the
 * reference count modifications queued up in the rbtree. the head
 * node may also store the extent flags to set. This way you can check
 * what the reference count and extent flags would be once all of the
 * queued delayed refs are processed, without actually running them.
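 *
 * Callers that only need some of these values can pass NULL for the
 * refs, flags or owning_root out parameters; only the non-NULL
 * pointers are filled in.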
94 */ 95 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, 96 struct btrfs_fs_info *fs_info, u64 bytenr, 97 u64 offset, int metadata, u64 *refs, u64 *flags, 98 u64 *owning_root) 99 { 100 struct btrfs_root *extent_root; 101 struct btrfs_delayed_ref_head *head; 102 struct btrfs_delayed_ref_root *delayed_refs; 103 BTRFS_PATH_AUTO_FREE(path); 104 struct btrfs_key key; 105 u64 num_refs; 106 u64 extent_flags; 107 u64 owner = 0; 108 int ret; 109 110 /* 111 * If we don't have skinny metadata, don't bother doing anything 112 * different 113 */ 114 if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) { 115 offset = fs_info->nodesize; 116 metadata = 0; 117 } 118 119 path = btrfs_alloc_path(); 120 if (!path) 121 return -ENOMEM; 122 123 search_again: 124 key.objectid = bytenr; 125 if (metadata) 126 key.type = BTRFS_METADATA_ITEM_KEY; 127 else 128 key.type = BTRFS_EXTENT_ITEM_KEY; 129 key.offset = offset; 130 131 extent_root = btrfs_extent_root(fs_info, bytenr); 132 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); 133 if (ret < 0) 134 return ret; 135 136 if (ret > 0 && key.type == BTRFS_METADATA_ITEM_KEY) { 137 if (path->slots[0]) { 138 path->slots[0]--; 139 btrfs_item_key_to_cpu(path->nodes[0], &key, 140 path->slots[0]); 141 if (key.objectid == bytenr && 142 key.type == BTRFS_EXTENT_ITEM_KEY && 143 key.offset == fs_info->nodesize) 144 ret = 0; 145 } 146 } 147 148 if (ret == 0) { 149 struct extent_buffer *leaf = path->nodes[0]; 150 struct btrfs_extent_item *ei; 151 const u32 item_size = btrfs_item_size(leaf, path->slots[0]); 152 153 if (unlikely(item_size < sizeof(*ei))) { 154 ret = -EUCLEAN; 155 btrfs_err(fs_info, 156 "unexpected extent item size, has %u expect >= %zu", 157 item_size, sizeof(*ei)); 158 btrfs_abort_transaction(trans, ret); 159 return ret; 160 } 161 162 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 163 num_refs = btrfs_extent_refs(leaf, ei); 164 if (unlikely(num_refs == 0)) { 165 ret = -EUCLEAN; 166 btrfs_err(fs_info, 167 "unexpected zero reference count for extent item (%llu %u %llu)", 168 key.objectid, key.type, key.offset); 169 btrfs_abort_transaction(trans, ret); 170 return ret; 171 } 172 extent_flags = btrfs_extent_flags(leaf, ei); 173 owner = btrfs_get_extent_owner_root(fs_info, leaf, path->slots[0]); 174 } else { 175 num_refs = 0; 176 extent_flags = 0; 177 ret = 0; 178 } 179 180 delayed_refs = &trans->transaction->delayed_refs; 181 spin_lock(&delayed_refs->lock); 182 head = btrfs_find_delayed_ref_head(fs_info, delayed_refs, bytenr); 183 if (head) { 184 if (!mutex_trylock(&head->mutex)) { 185 refcount_inc(&head->refs); 186 spin_unlock(&delayed_refs->lock); 187 188 btrfs_release_path(path); 189 190 /* 191 * Mutex was contended, block until it's released and try 192 * again 193 */ 194 mutex_lock(&head->mutex); 195 mutex_unlock(&head->mutex); 196 btrfs_put_delayed_ref_head(head); 197 goto search_again; 198 } 199 spin_lock(&head->lock); 200 if (head->extent_op && head->extent_op->update_flags) 201 extent_flags |= head->extent_op->flags_to_set; 202 203 num_refs += head->ref_mod; 204 spin_unlock(&head->lock); 205 mutex_unlock(&head->mutex); 206 } 207 spin_unlock(&delayed_refs->lock); 208 209 WARN_ON(num_refs == 0); 210 if (refs) 211 *refs = num_refs; 212 if (flags) 213 *flags = extent_flags; 214 if (owning_root) 215 *owning_root = owner; 216 217 return ret; 218 } 219 220 /* 221 * Back reference rules. 
 * Back refs have three main goals:
 *
 * 1) differentiate between all holders of references to an extent so that
 *    when a reference is dropped we can make sure it was a valid reference
 *    before freeing the extent.
 *
 * 2) Provide enough information to quickly find the holders of an extent
 *    if we notice a given block is corrupted or bad.
 *
 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
 *    maintenance. This is actually the same as #2, but with a slightly
 *    different use case.
 *
 * There are two kinds of back refs. Implicit back refs are optimized
 * for pointers in non-shared tree blocks. For a given pointer in a block,
 * back refs of this kind provide information about the block's owner tree
 * and the pointer's key. This information allows us to find the block by
 * b-tree searching. Full back refs are for pointers in tree blocks not
 * referenced by their owner trees. The location of the tree block is
 * recorded in the back ref. Full back refs are actually generic and can
 * be used in all cases where implicit back refs are used. The major
 * shortcoming of full back refs is their overhead: every time a tree
 * block gets COWed, the back ref entries for all pointers in it have to
 * be updated.
 *
 * For a newly allocated tree block, we use implicit back refs for
 * pointers in it. This means most tree related operations only involve
 * implicit back refs. For a tree block created in an old transaction,
 * the only way to drop a reference to it is to COW it. So we can detect
 * the event that a tree block loses its owner tree's reference and do
 * the back ref conversion.
 *
 * When a tree block is COWed through a tree, there are four cases:
 *
 * The reference count of the block is one and the tree is the block's
 * owner tree. Nothing to do in this case.
 *
 * The reference count of the block is one and the tree is not the
 * block's owner tree. In this case, full back refs are used for pointers
 * in the block. Remove these full back refs and add implicit back refs
 * for every pointer in the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * the block's owner tree. In this case, implicit back refs are used for
 * pointers in the block. Add full back refs for every pointer in the
 * block and increase the lower level extents' reference counts. The
 * original implicit back refs are inherited by the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * not the block's owner tree. Add implicit back refs for every pointer
 * in the new block and increase the lower level extents' reference
 * counts.
 *
 * Back Reference Key composing:
 *
 * The key objectid corresponds to the first byte in the extent, the key
 * type is used to differentiate between types of back refs, and the
 * meaning of the key offset depends on the type of back ref.
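 *
 * As an illustrative example (the numbers below are made up): a data
 * extent referenced by inode 257 at file offset 0 in the subvolume tree
 * with objectid 5 gets an implicit back ref keyed
 * (extent bytenr, BTRFS_EXTENT_DATA_REF_KEY, hash(5, 257, 0)), while a
 * reference held through a shared (snapshotted) leaf at bytenr P is
 * recorded as a full back ref keyed
 * (extent bytenr, BTRFS_SHARED_DATA_REF_KEY, P).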
 *
 * File extents can be referenced by:
 *
 * - multiple snapshots, subvolumes, or different generations in one subvol
 * - different files inside a single subvolume
 * - different offsets inside a file (bookend extents in file.c)
 *
 * The extent ref structure for the implicit back refs has fields for:
 *
 * - objectid of the subvolume root
 * - objectid of the file holding the reference
 * - original offset in the file
 * - how many bookend extents
 *
 * The key offset for the implicit back refs is the hash of the first
 * three fields.
 *
 * The extent ref structure for the full back refs has a field for:
 *
 * - number of pointers in the tree leaf
 *
 * The key offset for the full back refs is the first byte of
 * the tree leaf.
 *
 * When a file extent is allocated, the implicit back refs are used and
 * the fields are filled in as:
 *
 *     (root_key.objectid, inode objectid, offset in file, 1)
 *
 * When a file extent is removed by file truncation, we find the
 * corresponding implicit back refs and check the following fields:
 *
 *     (btrfs_header_owner(leaf), inode objectid, offset in file)
 *
 * Btree extents can be referenced by:
 *
 * - Different subvolumes
 *
 * Both the implicit back refs and the full back refs for tree blocks
 * only consist of a key. The key offset for the implicit back refs is
 * the objectid of the block's owner tree. The key offset for the full
 * back refs is the first byte of the parent block.
 *
 * When implicit back refs are used, information about the lowest key
 * and the level of the tree block is required. This information is
 * stored in the tree block info structure.
 */

/*
 * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required,
 * is_data == BTRFS_REF_TYPE_DATA, data type is required,
 * is_data == BTRFS_REF_TYPE_ANY, either type is OK.
 */
int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
				     struct btrfs_extent_inline_ref *iref,
				     enum btrfs_inline_ref_type is_data)
{
	struct btrfs_fs_info *fs_info = eb->fs_info;
	int type = btrfs_extent_inline_ref_type(eb, iref);
	u64 offset = btrfs_extent_inline_ref_offset(eb, iref);

	if (type == BTRFS_EXTENT_OWNER_REF_KEY) {
		ASSERT(btrfs_fs_incompat(fs_info, SIMPLE_QUOTA));
		return type;
	}

	if (type == BTRFS_TREE_BLOCK_REF_KEY ||
	    type == BTRFS_SHARED_BLOCK_REF_KEY ||
	    type == BTRFS_SHARED_DATA_REF_KEY ||
	    type == BTRFS_EXTENT_DATA_REF_KEY) {
		if (is_data == BTRFS_REF_TYPE_BLOCK) {
			if (type == BTRFS_TREE_BLOCK_REF_KEY)
				return type;
			if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
				ASSERT(fs_info);
				/*
				 * Every shared one has parent tree block,
				 * which must be aligned to sector size.
				 */
				if (offset && IS_ALIGNED(offset, fs_info->sectorsize))
					return type;
			}
		} else if (is_data == BTRFS_REF_TYPE_DATA) {
			if (type == BTRFS_EXTENT_DATA_REF_KEY)
				return type;
			if (type == BTRFS_SHARED_DATA_REF_KEY) {
				ASSERT(fs_info);
				/*
				 * Every shared one has parent tree block,
				 * which must be aligned to sector size.
368 */ 369 if (offset && 370 IS_ALIGNED(offset, fs_info->sectorsize)) 371 return type; 372 } 373 } else { 374 ASSERT(is_data == BTRFS_REF_TYPE_ANY); 375 return type; 376 } 377 } 378 379 WARN_ON(1); 380 btrfs_print_leaf(eb); 381 btrfs_err(fs_info, 382 "eb %llu iref 0x%lx invalid extent inline ref type %d", 383 eb->start, (unsigned long)iref, type); 384 385 return BTRFS_REF_TYPE_INVALID; 386 } 387 388 u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset) 389 { 390 u32 high_crc = ~(u32)0; 391 u32 low_crc = ~(u32)0; 392 __le64 lenum; 393 394 lenum = cpu_to_le64(root_objectid); 395 high_crc = crc32c(high_crc, &lenum, sizeof(lenum)); 396 lenum = cpu_to_le64(owner); 397 low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); 398 lenum = cpu_to_le64(offset); 399 low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); 400 401 return ((u64)high_crc << 31) ^ (u64)low_crc; 402 } 403 404 static u64 hash_extent_data_ref_item(struct extent_buffer *leaf, 405 struct btrfs_extent_data_ref *ref) 406 { 407 return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref), 408 btrfs_extent_data_ref_objectid(leaf, ref), 409 btrfs_extent_data_ref_offset(leaf, ref)); 410 } 411 412 static int match_extent_data_ref(struct extent_buffer *leaf, 413 struct btrfs_extent_data_ref *ref, 414 u64 root_objectid, u64 owner, u64 offset) 415 { 416 if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid || 417 btrfs_extent_data_ref_objectid(leaf, ref) != owner || 418 btrfs_extent_data_ref_offset(leaf, ref) != offset) 419 return 0; 420 return 1; 421 } 422 423 static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans, 424 struct btrfs_path *path, 425 u64 bytenr, u64 parent, 426 u64 root_objectid, 427 u64 owner, u64 offset) 428 { 429 struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr); 430 struct btrfs_key key; 431 struct btrfs_extent_data_ref *ref; 432 struct extent_buffer *leaf; 433 u32 nritems; 434 int recow; 435 int ret; 436 437 key.objectid = bytenr; 438 if (parent) { 439 key.type = BTRFS_SHARED_DATA_REF_KEY; 440 key.offset = parent; 441 } else { 442 key.type = BTRFS_EXTENT_DATA_REF_KEY; 443 key.offset = hash_extent_data_ref(root_objectid, 444 owner, offset); 445 } 446 again: 447 recow = 0; 448 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 449 if (ret < 0) 450 return ret; 451 452 if (parent) { 453 if (ret) 454 return -ENOENT; 455 return 0; 456 } 457 458 ret = -ENOENT; 459 leaf = path->nodes[0]; 460 nritems = btrfs_header_nritems(leaf); 461 while (1) { 462 if (path->slots[0] >= nritems) { 463 ret = btrfs_next_leaf(root, path); 464 if (ret) { 465 if (ret > 0) 466 return -ENOENT; 467 return ret; 468 } 469 470 leaf = path->nodes[0]; 471 nritems = btrfs_header_nritems(leaf); 472 recow = 1; 473 } 474 475 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 476 if (key.objectid != bytenr || 477 key.type != BTRFS_EXTENT_DATA_REF_KEY) 478 goto fail; 479 480 ref = btrfs_item_ptr(leaf, path->slots[0], 481 struct btrfs_extent_data_ref); 482 483 if (match_extent_data_ref(leaf, ref, root_objectid, 484 owner, offset)) { 485 if (recow) { 486 btrfs_release_path(path); 487 goto again; 488 } 489 ret = 0; 490 break; 491 } 492 path->slots[0]++; 493 } 494 fail: 495 return ret; 496 } 497 498 static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, 499 struct btrfs_path *path, 500 struct btrfs_delayed_ref_node *node, 501 u64 bytenr) 502 { 503 struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr); 504 struct btrfs_key key; 505 struct extent_buffer *leaf; 506 u64 owner = 
btrfs_delayed_ref_owner(node); 507 u64 offset = btrfs_delayed_ref_offset(node); 508 u32 size; 509 u32 num_refs; 510 int ret; 511 512 key.objectid = bytenr; 513 if (node->parent) { 514 key.type = BTRFS_SHARED_DATA_REF_KEY; 515 key.offset = node->parent; 516 size = sizeof(struct btrfs_shared_data_ref); 517 } else { 518 key.type = BTRFS_EXTENT_DATA_REF_KEY; 519 key.offset = hash_extent_data_ref(node->ref_root, owner, offset); 520 size = sizeof(struct btrfs_extent_data_ref); 521 } 522 523 ret = btrfs_insert_empty_item(trans, root, path, &key, size); 524 if (ret && ret != -EEXIST) 525 goto fail; 526 527 leaf = path->nodes[0]; 528 if (node->parent) { 529 struct btrfs_shared_data_ref *ref; 530 ref = btrfs_item_ptr(leaf, path->slots[0], 531 struct btrfs_shared_data_ref); 532 if (ret == 0) { 533 btrfs_set_shared_data_ref_count(leaf, ref, node->ref_mod); 534 } else { 535 num_refs = btrfs_shared_data_ref_count(leaf, ref); 536 num_refs += node->ref_mod; 537 btrfs_set_shared_data_ref_count(leaf, ref, num_refs); 538 } 539 } else { 540 struct btrfs_extent_data_ref *ref; 541 while (ret == -EEXIST) { 542 ref = btrfs_item_ptr(leaf, path->slots[0], 543 struct btrfs_extent_data_ref); 544 if (match_extent_data_ref(leaf, ref, node->ref_root, 545 owner, offset)) 546 break; 547 btrfs_release_path(path); 548 key.offset++; 549 ret = btrfs_insert_empty_item(trans, root, path, &key, 550 size); 551 if (ret && ret != -EEXIST) 552 goto fail; 553 554 leaf = path->nodes[0]; 555 } 556 ref = btrfs_item_ptr(leaf, path->slots[0], 557 struct btrfs_extent_data_ref); 558 if (ret == 0) { 559 btrfs_set_extent_data_ref_root(leaf, ref, node->ref_root); 560 btrfs_set_extent_data_ref_objectid(leaf, ref, owner); 561 btrfs_set_extent_data_ref_offset(leaf, ref, offset); 562 btrfs_set_extent_data_ref_count(leaf, ref, node->ref_mod); 563 } else { 564 num_refs = btrfs_extent_data_ref_count(leaf, ref); 565 num_refs += node->ref_mod; 566 btrfs_set_extent_data_ref_count(leaf, ref, num_refs); 567 } 568 } 569 ret = 0; 570 fail: 571 btrfs_release_path(path); 572 return ret; 573 } 574 575 static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, 576 struct btrfs_root *root, 577 struct btrfs_path *path, 578 int refs_to_drop) 579 { 580 struct btrfs_key key; 581 struct btrfs_extent_data_ref *ref1 = NULL; 582 struct btrfs_shared_data_ref *ref2 = NULL; 583 struct extent_buffer *leaf; 584 u32 num_refs = 0; 585 int ret = 0; 586 587 leaf = path->nodes[0]; 588 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 589 590 if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { 591 ref1 = btrfs_item_ptr(leaf, path->slots[0], 592 struct btrfs_extent_data_ref); 593 num_refs = btrfs_extent_data_ref_count(leaf, ref1); 594 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { 595 ref2 = btrfs_item_ptr(leaf, path->slots[0], 596 struct btrfs_shared_data_ref); 597 num_refs = btrfs_shared_data_ref_count(leaf, ref2); 598 } else { 599 btrfs_err(trans->fs_info, 600 "unrecognized backref key (%llu %u %llu)", 601 key.objectid, key.type, key.offset); 602 btrfs_abort_transaction(trans, -EUCLEAN); 603 return -EUCLEAN; 604 } 605 606 BUG_ON(num_refs < refs_to_drop); 607 num_refs -= refs_to_drop; 608 609 if (num_refs == 0) { 610 ret = btrfs_del_item(trans, root, path); 611 } else { 612 if (key.type == BTRFS_EXTENT_DATA_REF_KEY) 613 btrfs_set_extent_data_ref_count(leaf, ref1, num_refs); 614 else if (key.type == BTRFS_SHARED_DATA_REF_KEY) 615 btrfs_set_shared_data_ref_count(leaf, ref2, num_refs); 616 } 617 return ret; 618 } 619 620 static noinline u32 
extent_data_ref_count(struct btrfs_path *path, 621 struct btrfs_extent_inline_ref *iref) 622 { 623 struct btrfs_key key; 624 struct extent_buffer *leaf; 625 struct btrfs_extent_data_ref *ref1; 626 struct btrfs_shared_data_ref *ref2; 627 u32 num_refs = 0; 628 int type; 629 630 leaf = path->nodes[0]; 631 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 632 633 if (iref) { 634 /* 635 * If type is invalid, we should have bailed out earlier than 636 * this call. 637 */ 638 type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA); 639 ASSERT(type != BTRFS_REF_TYPE_INVALID); 640 if (type == BTRFS_EXTENT_DATA_REF_KEY) { 641 ref1 = (struct btrfs_extent_data_ref *)(&iref->offset); 642 num_refs = btrfs_extent_data_ref_count(leaf, ref1); 643 } else { 644 ref2 = (struct btrfs_shared_data_ref *)(iref + 1); 645 num_refs = btrfs_shared_data_ref_count(leaf, ref2); 646 } 647 } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { 648 ref1 = btrfs_item_ptr(leaf, path->slots[0], 649 struct btrfs_extent_data_ref); 650 num_refs = btrfs_extent_data_ref_count(leaf, ref1); 651 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { 652 ref2 = btrfs_item_ptr(leaf, path->slots[0], 653 struct btrfs_shared_data_ref); 654 num_refs = btrfs_shared_data_ref_count(leaf, ref2); 655 } else { 656 WARN_ON(1); 657 } 658 return num_refs; 659 } 660 661 static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans, 662 struct btrfs_path *path, 663 u64 bytenr, u64 parent, 664 u64 root_objectid) 665 { 666 struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr); 667 struct btrfs_key key; 668 int ret; 669 670 key.objectid = bytenr; 671 if (parent) { 672 key.type = BTRFS_SHARED_BLOCK_REF_KEY; 673 key.offset = parent; 674 } else { 675 key.type = BTRFS_TREE_BLOCK_REF_KEY; 676 key.offset = root_objectid; 677 } 678 679 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 680 if (ret > 0) 681 ret = -ENOENT; 682 return ret; 683 } 684 685 static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans, 686 struct btrfs_path *path, 687 struct btrfs_delayed_ref_node *node, 688 u64 bytenr) 689 { 690 struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr); 691 struct btrfs_key key; 692 int ret; 693 694 key.objectid = bytenr; 695 if (node->parent) { 696 key.type = BTRFS_SHARED_BLOCK_REF_KEY; 697 key.offset = node->parent; 698 } else { 699 key.type = BTRFS_TREE_BLOCK_REF_KEY; 700 key.offset = node->ref_root; 701 } 702 703 ret = btrfs_insert_empty_item(trans, root, path, &key, 0); 704 btrfs_release_path(path); 705 return ret; 706 } 707 708 static inline int extent_ref_type(u64 parent, u64 owner) 709 { 710 int type; 711 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 712 if (parent > 0) 713 type = BTRFS_SHARED_BLOCK_REF_KEY; 714 else 715 type = BTRFS_TREE_BLOCK_REF_KEY; 716 } else { 717 if (parent > 0) 718 type = BTRFS_SHARED_DATA_REF_KEY; 719 else 720 type = BTRFS_EXTENT_DATA_REF_KEY; 721 } 722 return type; 723 } 724 725 static int find_next_key(struct btrfs_path *path, int level, 726 struct btrfs_key *key) 727 728 { 729 for (; level < BTRFS_MAX_LEVEL; level++) { 730 if (!path->nodes[level]) 731 break; 732 if (path->slots[level] + 1 >= 733 btrfs_header_nritems(path->nodes[level])) 734 continue; 735 if (level == 0) 736 btrfs_item_key_to_cpu(path->nodes[level], key, 737 path->slots[level] + 1); 738 else 739 btrfs_node_key_to_cpu(path->nodes[level], key, 740 path->slots[level] + 1); 741 return 0; 742 } 743 return 1; 744 } 745 746 /* 747 * look for inline back ref. 
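 * An inline back ref is a backref record stored inside the extent item
 * itself rather than as a separate keyed backref item in the extent
 * tree.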
if back ref is found, *ref_ret is set 748 * to the address of inline back ref, and 0 is returned. 749 * 750 * if back ref isn't found, *ref_ret is set to the address where it 751 * should be inserted, and -ENOENT is returned. 752 * 753 * if insert is true and there are too many inline back refs, the path 754 * points to the extent item, and -EAGAIN is returned. 755 * 756 * NOTE: inline back refs are ordered in the same way that back ref 757 * items in the tree are ordered. 758 */ 759 static noinline_for_stack 760 int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, 761 struct btrfs_path *path, 762 struct btrfs_extent_inline_ref **ref_ret, 763 u64 bytenr, u64 num_bytes, 764 u64 parent, u64 root_objectid, 765 u64 owner, u64 offset, int insert) 766 { 767 struct btrfs_fs_info *fs_info = trans->fs_info; 768 struct btrfs_root *root = btrfs_extent_root(fs_info, bytenr); 769 struct btrfs_key key; 770 struct extent_buffer *leaf; 771 struct btrfs_extent_item *ei; 772 struct btrfs_extent_inline_ref *iref; 773 u64 flags; 774 u64 item_size; 775 unsigned long ptr; 776 unsigned long end; 777 int extra_size; 778 int type; 779 int want; 780 int ret; 781 bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); 782 int needed; 783 784 key.objectid = bytenr; 785 key.type = BTRFS_EXTENT_ITEM_KEY; 786 key.offset = num_bytes; 787 788 want = extent_ref_type(parent, owner); 789 if (insert) { 790 extra_size = btrfs_extent_inline_ref_size(want); 791 path->search_for_extension = 1; 792 } else 793 extra_size = -1; 794 795 /* 796 * Owner is our level, so we can just add one to get the level for the 797 * block we are interested in. 798 */ 799 if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) { 800 key.type = BTRFS_METADATA_ITEM_KEY; 801 key.offset = owner; 802 } 803 804 again: 805 ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1); 806 if (ret < 0) 807 goto out; 808 809 /* 810 * We may be a newly converted file system which still has the old fat 811 * extent entries for metadata, so try and see if we have one of those. 
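 *
 * With SKINNY_METADATA a tree block's extent item is keyed
 * (bytenr, BTRFS_METADATA_ITEM_KEY, level), while the old "fat" form is
 * keyed (bytenr, BTRFS_EXTENT_ITEM_KEY, num_bytes); the fallback below
 * searches for the latter.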
812 */ 813 if (ret > 0 && skinny_metadata) { 814 skinny_metadata = false; 815 if (path->slots[0]) { 816 path->slots[0]--; 817 btrfs_item_key_to_cpu(path->nodes[0], &key, 818 path->slots[0]); 819 if (key.objectid == bytenr && 820 key.type == BTRFS_EXTENT_ITEM_KEY && 821 key.offset == num_bytes) 822 ret = 0; 823 } 824 if (ret) { 825 key.objectid = bytenr; 826 key.type = BTRFS_EXTENT_ITEM_KEY; 827 key.offset = num_bytes; 828 btrfs_release_path(path); 829 goto again; 830 } 831 } 832 833 if (ret && !insert) { 834 ret = -ENOENT; 835 goto out; 836 } else if (WARN_ON(ret)) { 837 btrfs_print_leaf(path->nodes[0]); 838 btrfs_err(fs_info, 839 "extent item not found for insert, bytenr %llu num_bytes %llu parent %llu root_objectid %llu owner %llu offset %llu", 840 bytenr, num_bytes, parent, root_objectid, owner, 841 offset); 842 ret = -EUCLEAN; 843 goto out; 844 } 845 846 leaf = path->nodes[0]; 847 item_size = btrfs_item_size(leaf, path->slots[0]); 848 if (unlikely(item_size < sizeof(*ei))) { 849 ret = -EUCLEAN; 850 btrfs_err(fs_info, 851 "unexpected extent item size, has %llu expect >= %zu", 852 item_size, sizeof(*ei)); 853 btrfs_abort_transaction(trans, ret); 854 goto out; 855 } 856 857 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 858 flags = btrfs_extent_flags(leaf, ei); 859 860 ptr = (unsigned long)(ei + 1); 861 end = (unsigned long)ei + item_size; 862 863 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) { 864 ptr += sizeof(struct btrfs_tree_block_info); 865 BUG_ON(ptr > end); 866 } 867 868 if (owner >= BTRFS_FIRST_FREE_OBJECTID) 869 needed = BTRFS_REF_TYPE_DATA; 870 else 871 needed = BTRFS_REF_TYPE_BLOCK; 872 873 ret = -ENOENT; 874 while (ptr < end) { 875 iref = (struct btrfs_extent_inline_ref *)ptr; 876 type = btrfs_get_extent_inline_ref_type(leaf, iref, needed); 877 if (type == BTRFS_EXTENT_OWNER_REF_KEY) { 878 ASSERT(btrfs_fs_incompat(fs_info, SIMPLE_QUOTA)); 879 ptr += btrfs_extent_inline_ref_size(type); 880 continue; 881 } 882 if (type == BTRFS_REF_TYPE_INVALID) { 883 ret = -EUCLEAN; 884 goto out; 885 } 886 887 if (want < type) 888 break; 889 if (want > type) { 890 ptr += btrfs_extent_inline_ref_size(type); 891 continue; 892 } 893 894 if (type == BTRFS_EXTENT_DATA_REF_KEY) { 895 struct btrfs_extent_data_ref *dref; 896 dref = (struct btrfs_extent_data_ref *)(&iref->offset); 897 if (match_extent_data_ref(leaf, dref, root_objectid, 898 owner, offset)) { 899 ret = 0; 900 break; 901 } 902 if (hash_extent_data_ref_item(leaf, dref) < 903 hash_extent_data_ref(root_objectid, owner, offset)) 904 break; 905 } else { 906 u64 ref_offset; 907 ref_offset = btrfs_extent_inline_ref_offset(leaf, iref); 908 if (parent > 0) { 909 if (parent == ref_offset) { 910 ret = 0; 911 break; 912 } 913 if (ref_offset < parent) 914 break; 915 } else { 916 if (root_objectid == ref_offset) { 917 ret = 0; 918 break; 919 } 920 if (ref_offset < root_objectid) 921 break; 922 } 923 } 924 ptr += btrfs_extent_inline_ref_size(type); 925 } 926 927 if (unlikely(ptr > end)) { 928 ret = -EUCLEAN; 929 btrfs_print_leaf(path->nodes[0]); 930 btrfs_crit(fs_info, 931 "overrun extent record at slot %d while looking for inline extent for root %llu owner %llu offset %llu parent %llu", 932 path->slots[0], root_objectid, owner, offset, parent); 933 goto out; 934 } 935 936 if (ret == -ENOENT && insert) { 937 if (item_size + extra_size >= 938 BTRFS_MAX_EXTENT_ITEM_SIZE(root)) { 939 ret = -EAGAIN; 940 goto out; 941 } 942 943 if (path->slots[0] + 1 < btrfs_header_nritems(path->nodes[0])) { 944 struct btrfs_key 
tmp_key; 945 946 btrfs_item_key_to_cpu(path->nodes[0], &tmp_key, path->slots[0] + 1); 947 if (tmp_key.objectid == bytenr && 948 tmp_key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) { 949 ret = -EAGAIN; 950 goto out; 951 } 952 goto out_no_entry; 953 } 954 955 if (!path->keep_locks) { 956 btrfs_release_path(path); 957 path->keep_locks = 1; 958 goto again; 959 } 960 961 /* 962 * To add new inline back ref, we have to make sure 963 * there is no corresponding back ref item. 964 * For simplicity, we just do not add new inline back 965 * ref if there is any kind of item for this block 966 */ 967 if (find_next_key(path, 0, &key) == 0 && 968 key.objectid == bytenr && 969 key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) { 970 ret = -EAGAIN; 971 goto out; 972 } 973 } 974 out_no_entry: 975 *ref_ret = (struct btrfs_extent_inline_ref *)ptr; 976 out: 977 if (path->keep_locks) { 978 path->keep_locks = 0; 979 btrfs_unlock_up_safe(path, 1); 980 } 981 if (insert) 982 path->search_for_extension = 0; 983 return ret; 984 } 985 986 /* 987 * helper to add new inline back ref 988 */ 989 static noinline_for_stack 990 void setup_inline_extent_backref(struct btrfs_trans_handle *trans, 991 struct btrfs_path *path, 992 struct btrfs_extent_inline_ref *iref, 993 u64 parent, u64 root_objectid, 994 u64 owner, u64 offset, int refs_to_add, 995 struct btrfs_delayed_extent_op *extent_op) 996 { 997 struct extent_buffer *leaf; 998 struct btrfs_extent_item *ei; 999 unsigned long ptr; 1000 unsigned long end; 1001 unsigned long item_offset; 1002 u64 refs; 1003 int size; 1004 int type; 1005 1006 leaf = path->nodes[0]; 1007 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1008 item_offset = (unsigned long)iref - (unsigned long)ei; 1009 1010 type = extent_ref_type(parent, owner); 1011 size = btrfs_extent_inline_ref_size(type); 1012 1013 btrfs_extend_item(trans, path, size); 1014 1015 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1016 refs = btrfs_extent_refs(leaf, ei); 1017 refs += refs_to_add; 1018 btrfs_set_extent_refs(leaf, ei, refs); 1019 if (extent_op) 1020 __run_delayed_extent_op(extent_op, leaf, ei); 1021 1022 ptr = (unsigned long)ei + item_offset; 1023 end = (unsigned long)ei + btrfs_item_size(leaf, path->slots[0]); 1024 if (ptr < end - size) 1025 memmove_extent_buffer(leaf, ptr + size, ptr, 1026 end - size - ptr); 1027 1028 iref = (struct btrfs_extent_inline_ref *)ptr; 1029 btrfs_set_extent_inline_ref_type(leaf, iref, type); 1030 if (type == BTRFS_EXTENT_DATA_REF_KEY) { 1031 struct btrfs_extent_data_ref *dref; 1032 dref = (struct btrfs_extent_data_ref *)(&iref->offset); 1033 btrfs_set_extent_data_ref_root(leaf, dref, root_objectid); 1034 btrfs_set_extent_data_ref_objectid(leaf, dref, owner); 1035 btrfs_set_extent_data_ref_offset(leaf, dref, offset); 1036 btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add); 1037 } else if (type == BTRFS_SHARED_DATA_REF_KEY) { 1038 struct btrfs_shared_data_ref *sref; 1039 sref = (struct btrfs_shared_data_ref *)(iref + 1); 1040 btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add); 1041 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 1042 } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) { 1043 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 1044 } else { 1045 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); 1046 } 1047 } 1048 1049 static int lookup_extent_backref(struct btrfs_trans_handle *trans, 1050 struct btrfs_path *path, 1051 struct btrfs_extent_inline_ref **ref_ret, 1052 u64 bytenr, u64 num_bytes, u64 parent, 1053 u64 
root_objectid, u64 owner, u64 offset) 1054 { 1055 int ret; 1056 1057 ret = lookup_inline_extent_backref(trans, path, ref_ret, bytenr, 1058 num_bytes, parent, root_objectid, 1059 owner, offset, 0); 1060 if (ret != -ENOENT) 1061 return ret; 1062 1063 btrfs_release_path(path); 1064 *ref_ret = NULL; 1065 1066 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1067 ret = lookup_tree_block_ref(trans, path, bytenr, parent, 1068 root_objectid); 1069 } else { 1070 ret = lookup_extent_data_ref(trans, path, bytenr, parent, 1071 root_objectid, owner, offset); 1072 } 1073 return ret; 1074 } 1075 1076 /* 1077 * helper to update/remove inline back ref 1078 */ 1079 static noinline_for_stack int update_inline_extent_backref( 1080 struct btrfs_trans_handle *trans, 1081 struct btrfs_path *path, 1082 struct btrfs_extent_inline_ref *iref, 1083 int refs_to_mod, 1084 struct btrfs_delayed_extent_op *extent_op) 1085 { 1086 struct extent_buffer *leaf = path->nodes[0]; 1087 struct btrfs_fs_info *fs_info = leaf->fs_info; 1088 struct btrfs_extent_item *ei; 1089 struct btrfs_extent_data_ref *dref = NULL; 1090 struct btrfs_shared_data_ref *sref = NULL; 1091 unsigned long ptr; 1092 unsigned long end; 1093 u32 item_size; 1094 int size; 1095 int type; 1096 u64 refs; 1097 1098 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1099 refs = btrfs_extent_refs(leaf, ei); 1100 if (unlikely(refs_to_mod < 0 && refs + refs_to_mod <= 0)) { 1101 struct btrfs_key key; 1102 u32 extent_size; 1103 1104 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1105 if (key.type == BTRFS_METADATA_ITEM_KEY) 1106 extent_size = fs_info->nodesize; 1107 else 1108 extent_size = key.offset; 1109 btrfs_print_leaf(leaf); 1110 btrfs_err(fs_info, 1111 "invalid refs_to_mod for extent %llu num_bytes %u, has %d expect >= -%llu", 1112 key.objectid, extent_size, refs_to_mod, refs); 1113 return -EUCLEAN; 1114 } 1115 refs += refs_to_mod; 1116 btrfs_set_extent_refs(leaf, ei, refs); 1117 if (extent_op) 1118 __run_delayed_extent_op(extent_op, leaf, ei); 1119 1120 type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY); 1121 /* 1122 * Function btrfs_get_extent_inline_ref_type() has already printed 1123 * error messages. 1124 */ 1125 if (unlikely(type == BTRFS_REF_TYPE_INVALID)) 1126 return -EUCLEAN; 1127 1128 if (type == BTRFS_EXTENT_DATA_REF_KEY) { 1129 dref = (struct btrfs_extent_data_ref *)(&iref->offset); 1130 refs = btrfs_extent_data_ref_count(leaf, dref); 1131 } else if (type == BTRFS_SHARED_DATA_REF_KEY) { 1132 sref = (struct btrfs_shared_data_ref *)(iref + 1); 1133 refs = btrfs_shared_data_ref_count(leaf, sref); 1134 } else { 1135 refs = 1; 1136 /* 1137 * For tree blocks we can only drop one ref for it, and tree 1138 * blocks should not have refs > 1. 1139 * 1140 * Furthermore if we're inserting a new inline backref, we 1141 * won't reach this path either. That would be 1142 * setup_inline_extent_backref(). 
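 * Consequently refs_to_mod must be -1 for a tree block reference here;
 * any other value indicates corrupted backrefs and is rejected with
 * -EUCLEAN below.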
1143 */ 1144 if (unlikely(refs_to_mod != -1)) { 1145 struct btrfs_key key; 1146 1147 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1148 1149 btrfs_print_leaf(leaf); 1150 btrfs_err(fs_info, 1151 "invalid refs_to_mod for tree block %llu, has %d expect -1", 1152 key.objectid, refs_to_mod); 1153 return -EUCLEAN; 1154 } 1155 } 1156 1157 if (unlikely(refs_to_mod < 0 && refs < -refs_to_mod)) { 1158 struct btrfs_key key; 1159 u32 extent_size; 1160 1161 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1162 if (key.type == BTRFS_METADATA_ITEM_KEY) 1163 extent_size = fs_info->nodesize; 1164 else 1165 extent_size = key.offset; 1166 btrfs_print_leaf(leaf); 1167 btrfs_err(fs_info, 1168 "invalid refs_to_mod for backref entry, iref %lu extent %llu num_bytes %u, has %d expect >= -%llu", 1169 (unsigned long)iref, key.objectid, extent_size, 1170 refs_to_mod, refs); 1171 return -EUCLEAN; 1172 } 1173 refs += refs_to_mod; 1174 1175 if (refs > 0) { 1176 if (type == BTRFS_EXTENT_DATA_REF_KEY) 1177 btrfs_set_extent_data_ref_count(leaf, dref, refs); 1178 else 1179 btrfs_set_shared_data_ref_count(leaf, sref, refs); 1180 } else { 1181 size = btrfs_extent_inline_ref_size(type); 1182 item_size = btrfs_item_size(leaf, path->slots[0]); 1183 ptr = (unsigned long)iref; 1184 end = (unsigned long)ei + item_size; 1185 if (ptr + size < end) 1186 memmove_extent_buffer(leaf, ptr, ptr + size, 1187 end - ptr - size); 1188 item_size -= size; 1189 btrfs_truncate_item(trans, path, item_size, 1); 1190 } 1191 return 0; 1192 } 1193 1194 static noinline_for_stack 1195 int insert_inline_extent_backref(struct btrfs_trans_handle *trans, 1196 struct btrfs_path *path, 1197 u64 bytenr, u64 num_bytes, u64 parent, 1198 u64 root_objectid, u64 owner, 1199 u64 offset, int refs_to_add, 1200 struct btrfs_delayed_extent_op *extent_op) 1201 { 1202 struct btrfs_extent_inline_ref *iref; 1203 int ret; 1204 1205 ret = lookup_inline_extent_backref(trans, path, &iref, bytenr, 1206 num_bytes, parent, root_objectid, 1207 owner, offset, 1); 1208 if (ret == 0) { 1209 /* 1210 * We're adding refs to a tree block we already own, this 1211 * should not happen at all. 
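 * (Owner values below BTRFS_FIRST_FREE_OBJECTID denote tree blocks, so
 * finding an existing inline ref to bump here means the backrefs are
 * inconsistent.)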
1212 */ 1213 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1214 btrfs_print_leaf(path->nodes[0]); 1215 btrfs_crit(trans->fs_info, 1216 "adding refs to an existing tree ref, bytenr %llu num_bytes %llu root_objectid %llu slot %u", 1217 bytenr, num_bytes, root_objectid, path->slots[0]); 1218 return -EUCLEAN; 1219 } 1220 ret = update_inline_extent_backref(trans, path, iref, 1221 refs_to_add, extent_op); 1222 } else if (ret == -ENOENT) { 1223 setup_inline_extent_backref(trans, path, iref, parent, 1224 root_objectid, owner, offset, 1225 refs_to_add, extent_op); 1226 ret = 0; 1227 } 1228 return ret; 1229 } 1230 1231 static int remove_extent_backref(struct btrfs_trans_handle *trans, 1232 struct btrfs_root *root, 1233 struct btrfs_path *path, 1234 struct btrfs_extent_inline_ref *iref, 1235 int refs_to_drop, int is_data) 1236 { 1237 int ret = 0; 1238 1239 BUG_ON(!is_data && refs_to_drop != 1); 1240 if (iref) 1241 ret = update_inline_extent_backref(trans, path, iref, 1242 -refs_to_drop, NULL); 1243 else if (is_data) 1244 ret = remove_extent_data_ref(trans, root, path, refs_to_drop); 1245 else 1246 ret = btrfs_del_item(trans, root, path); 1247 return ret; 1248 } 1249 1250 static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, 1251 u64 *discarded_bytes) 1252 { 1253 int j, ret = 0; 1254 u64 bytes_left, end; 1255 u64 aligned_start = ALIGN(start, SECTOR_SIZE); 1256 1257 /* Adjust the range to be aligned to 512B sectors if necessary. */ 1258 if (start != aligned_start) { 1259 len -= aligned_start - start; 1260 len = round_down(len, SECTOR_SIZE); 1261 start = aligned_start; 1262 } 1263 1264 *discarded_bytes = 0; 1265 1266 if (!len) 1267 return 0; 1268 1269 end = start + len; 1270 bytes_left = len; 1271 1272 /* Skip any superblocks on this device. */ 1273 for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) { 1274 u64 sb_start = btrfs_sb_offset(j); 1275 u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE; 1276 u64 size = sb_start - start; 1277 1278 if (!in_range(sb_start, start, bytes_left) && 1279 !in_range(sb_end, start, bytes_left) && 1280 !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE)) 1281 continue; 1282 1283 /* 1284 * Superblock spans beginning of range. Adjust start and 1285 * try again. 
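 * (That is, start is advanced to sb_end and bytes_left is recomputed
 * as end - start before moving on to the next superblock copy.)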
1286 */ 1287 if (sb_start <= start) { 1288 start += sb_end - start; 1289 if (start > end) { 1290 bytes_left = 0; 1291 break; 1292 } 1293 bytes_left = end - start; 1294 continue; 1295 } 1296 1297 if (size) { 1298 ret = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT, 1299 size >> SECTOR_SHIFT, 1300 GFP_NOFS); 1301 if (!ret) 1302 *discarded_bytes += size; 1303 else if (ret != -EOPNOTSUPP) 1304 return ret; 1305 } 1306 1307 start = sb_end; 1308 if (start > end) { 1309 bytes_left = 0; 1310 break; 1311 } 1312 bytes_left = end - start; 1313 } 1314 1315 while (bytes_left) { 1316 u64 bytes_to_discard = min(BTRFS_MAX_DISCARD_CHUNK_SIZE, bytes_left); 1317 1318 ret = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT, 1319 bytes_to_discard >> SECTOR_SHIFT, 1320 GFP_NOFS); 1321 1322 if (ret) { 1323 if (ret != -EOPNOTSUPP) 1324 break; 1325 continue; 1326 } 1327 1328 start += bytes_to_discard; 1329 bytes_left -= bytes_to_discard; 1330 *discarded_bytes += bytes_to_discard; 1331 1332 if (btrfs_trim_interrupted()) { 1333 ret = -ERESTARTSYS; 1334 break; 1335 } 1336 } 1337 1338 return ret; 1339 } 1340 1341 static int do_discard_extent(struct btrfs_discard_stripe *stripe, u64 *bytes) 1342 { 1343 struct btrfs_device *dev = stripe->dev; 1344 struct btrfs_fs_info *fs_info = dev->fs_info; 1345 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 1346 u64 phys = stripe->physical; 1347 u64 len = stripe->length; 1348 u64 discarded = 0; 1349 int ret = 0; 1350 1351 /* Zone reset on a zoned filesystem */ 1352 if (btrfs_can_zone_reset(dev, phys, len)) { 1353 u64 src_disc; 1354 1355 ret = btrfs_reset_device_zone(dev, phys, len, &discarded); 1356 if (ret) 1357 goto out; 1358 1359 if (!btrfs_dev_replace_is_ongoing(dev_replace) || 1360 dev != dev_replace->srcdev) 1361 goto out; 1362 1363 src_disc = discarded; 1364 1365 /* Send to replace target as well */ 1366 ret = btrfs_reset_device_zone(dev_replace->tgtdev, phys, len, 1367 &discarded); 1368 discarded += src_disc; 1369 } else if (bdev_max_discard_sectors(stripe->dev->bdev)) { 1370 ret = btrfs_issue_discard(dev->bdev, phys, len, &discarded); 1371 } else { 1372 ret = 0; 1373 *bytes = 0; 1374 } 1375 1376 out: 1377 *bytes = discarded; 1378 return ret; 1379 } 1380 1381 int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, 1382 u64 num_bytes, u64 *actual_bytes) 1383 { 1384 int ret = 0; 1385 u64 discarded_bytes = 0; 1386 u64 end = bytenr + num_bytes; 1387 u64 cur = bytenr; 1388 1389 /* 1390 * Avoid races with device replace and make sure the devices in the 1391 * stripes don't go away while we are discarding. 1392 */ 1393 btrfs_bio_counter_inc_blocked(fs_info); 1394 while (cur < end) { 1395 struct btrfs_discard_stripe *stripes; 1396 unsigned int num_stripes; 1397 int i; 1398 1399 num_bytes = end - cur; 1400 stripes = btrfs_map_discard(fs_info, cur, &num_bytes, &num_stripes); 1401 if (IS_ERR(stripes)) { 1402 ret = PTR_ERR(stripes); 1403 if (ret == -EOPNOTSUPP) 1404 ret = 0; 1405 break; 1406 } 1407 1408 for (i = 0; i < num_stripes; i++) { 1409 struct btrfs_discard_stripe *stripe = stripes + i; 1410 u64 bytes; 1411 1412 if (!stripe->dev->bdev) { 1413 ASSERT(btrfs_test_opt(fs_info, DEGRADED)); 1414 continue; 1415 } 1416 1417 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, 1418 &stripe->dev->dev_state)) 1419 continue; 1420 1421 ret = do_discard_extent(stripe, &bytes); 1422 if (ret) { 1423 /* 1424 * Keep going if discard is not supported by the 1425 * device. 
1426 */ 1427 if (ret != -EOPNOTSUPP) 1428 break; 1429 ret = 0; 1430 } else { 1431 discarded_bytes += bytes; 1432 } 1433 } 1434 kfree(stripes); 1435 if (ret) 1436 break; 1437 cur += num_bytes; 1438 } 1439 btrfs_bio_counter_dec(fs_info); 1440 if (actual_bytes) 1441 *actual_bytes = discarded_bytes; 1442 return ret; 1443 } 1444 1445 /* Can return -ENOMEM */ 1446 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 1447 struct btrfs_ref *generic_ref) 1448 { 1449 struct btrfs_fs_info *fs_info = trans->fs_info; 1450 int ret; 1451 1452 ASSERT(generic_ref->type != BTRFS_REF_NOT_SET && 1453 generic_ref->action); 1454 BUG_ON(generic_ref->type == BTRFS_REF_METADATA && 1455 generic_ref->ref_root == BTRFS_TREE_LOG_OBJECTID); 1456 1457 if (generic_ref->type == BTRFS_REF_METADATA) 1458 ret = btrfs_add_delayed_tree_ref(trans, generic_ref, NULL); 1459 else 1460 ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0); 1461 1462 btrfs_ref_tree_mod(fs_info, generic_ref); 1463 1464 return ret; 1465 } 1466 1467 /* 1468 * Insert backreference for a given extent. 1469 * 1470 * The counterpart is in __btrfs_free_extent(), with examples and more details 1471 * how it works. 1472 * 1473 * @trans: Handle of transaction 1474 * 1475 * @node: The delayed ref node used to get the bytenr/length for 1476 * extent whose references are incremented. 1477 * 1478 * @extent_op Pointer to a structure, holding information necessary when 1479 * updating a tree block's flags 1480 * 1481 */ 1482 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 1483 struct btrfs_delayed_ref_node *node, 1484 struct btrfs_delayed_extent_op *extent_op) 1485 { 1486 BTRFS_PATH_AUTO_FREE(path); 1487 struct extent_buffer *leaf; 1488 struct btrfs_extent_item *item; 1489 struct btrfs_key key; 1490 u64 bytenr = node->bytenr; 1491 u64 num_bytes = node->num_bytes; 1492 u64 owner = btrfs_delayed_ref_owner(node); 1493 u64 offset = btrfs_delayed_ref_offset(node); 1494 u64 refs; 1495 int refs_to_add = node->ref_mod; 1496 int ret; 1497 1498 path = btrfs_alloc_path(); 1499 if (!path) 1500 return -ENOMEM; 1501 1502 /* this will setup the path even if it fails to insert the back ref */ 1503 ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes, 1504 node->parent, node->ref_root, owner, 1505 offset, refs_to_add, extent_op); 1506 if ((ret < 0 && ret != -EAGAIN) || !ret) 1507 return ret; 1508 1509 /* 1510 * Ok we had -EAGAIN which means we didn't have space to insert and 1511 * inline extent ref, so just update the reference count and add a 1512 * normal backref. 1513 */ 1514 leaf = path->nodes[0]; 1515 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1516 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1517 refs = btrfs_extent_refs(leaf, item); 1518 btrfs_set_extent_refs(leaf, item, refs + refs_to_add); 1519 if (extent_op) 1520 __run_delayed_extent_op(extent_op, leaf, item); 1521 1522 btrfs_release_path(path); 1523 1524 /* now insert the actual backref */ 1525 if (owner < BTRFS_FIRST_FREE_OBJECTID) 1526 ret = insert_tree_block_ref(trans, path, node, bytenr); 1527 else 1528 ret = insert_extent_data_ref(trans, path, node, bytenr); 1529 1530 if (ret) 1531 btrfs_abort_transaction(trans, ret); 1532 1533 return ret; 1534 } 1535 1536 static void free_head_ref_squota_rsv(struct btrfs_fs_info *fs_info, 1537 struct btrfs_delayed_ref_head *href) 1538 { 1539 u64 root = href->owning_root; 1540 1541 /* 1542 * Don't check must_insert_reserved, as this is called from contexts 1543 * where it has already been unset. 
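 * The reservation is only freed when simple quotas are enabled and the
 * head ref is for a data extent owned by an fs tree; all other cases
 * return early below.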
1544 */ 1545 if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE || 1546 !href->is_data || !is_fstree(root)) 1547 return; 1548 1549 btrfs_qgroup_free_refroot(fs_info, root, href->reserved_bytes, 1550 BTRFS_QGROUP_RSV_DATA); 1551 } 1552 1553 static int run_delayed_data_ref(struct btrfs_trans_handle *trans, 1554 struct btrfs_delayed_ref_head *href, 1555 struct btrfs_delayed_ref_node *node, 1556 struct btrfs_delayed_extent_op *extent_op, 1557 bool insert_reserved) 1558 { 1559 int ret = 0; 1560 u64 parent = 0; 1561 u64 flags = 0; 1562 1563 trace_run_delayed_data_ref(trans->fs_info, node); 1564 1565 if (node->type == BTRFS_SHARED_DATA_REF_KEY) 1566 parent = node->parent; 1567 1568 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { 1569 struct btrfs_key key; 1570 struct btrfs_squota_delta delta = { 1571 .root = href->owning_root, 1572 .num_bytes = node->num_bytes, 1573 .is_data = true, 1574 .is_inc = true, 1575 .generation = trans->transid, 1576 }; 1577 u64 owner = btrfs_delayed_ref_owner(node); 1578 u64 offset = btrfs_delayed_ref_offset(node); 1579 1580 if (extent_op) 1581 flags |= extent_op->flags_to_set; 1582 1583 key.objectid = node->bytenr; 1584 key.type = BTRFS_EXTENT_ITEM_KEY; 1585 key.offset = node->num_bytes; 1586 1587 ret = alloc_reserved_file_extent(trans, parent, node->ref_root, 1588 flags, owner, offset, &key, 1589 node->ref_mod, 1590 href->owning_root); 1591 free_head_ref_squota_rsv(trans->fs_info, href); 1592 if (!ret) 1593 ret = btrfs_record_squota_delta(trans->fs_info, &delta); 1594 } else if (node->action == BTRFS_ADD_DELAYED_REF) { 1595 ret = __btrfs_inc_extent_ref(trans, node, extent_op); 1596 } else if (node->action == BTRFS_DROP_DELAYED_REF) { 1597 ret = __btrfs_free_extent(trans, href, node, extent_op); 1598 } else { 1599 BUG(); 1600 } 1601 return ret; 1602 } 1603 1604 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, 1605 struct extent_buffer *leaf, 1606 struct btrfs_extent_item *ei) 1607 { 1608 u64 flags = btrfs_extent_flags(leaf, ei); 1609 if (extent_op->update_flags) { 1610 flags |= extent_op->flags_to_set; 1611 btrfs_set_extent_flags(leaf, ei, flags); 1612 } 1613 1614 if (extent_op->update_key) { 1615 struct btrfs_tree_block_info *bi; 1616 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)); 1617 bi = (struct btrfs_tree_block_info *)(ei + 1); 1618 btrfs_set_tree_block_key(leaf, bi, &extent_op->key); 1619 } 1620 } 1621 1622 static int run_delayed_extent_op(struct btrfs_trans_handle *trans, 1623 struct btrfs_delayed_ref_head *head, 1624 struct btrfs_delayed_extent_op *extent_op) 1625 { 1626 struct btrfs_fs_info *fs_info = trans->fs_info; 1627 struct btrfs_root *root; 1628 struct btrfs_key key; 1629 BTRFS_PATH_AUTO_FREE(path); 1630 struct btrfs_extent_item *ei; 1631 struct extent_buffer *leaf; 1632 u32 item_size; 1633 int ret; 1634 int metadata = 1; 1635 1636 if (TRANS_ABORTED(trans)) 1637 return 0; 1638 1639 if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA)) 1640 metadata = 0; 1641 1642 path = btrfs_alloc_path(); 1643 if (!path) 1644 return -ENOMEM; 1645 1646 key.objectid = head->bytenr; 1647 1648 if (metadata) { 1649 key.type = BTRFS_METADATA_ITEM_KEY; 1650 key.offset = head->level; 1651 } else { 1652 key.type = BTRFS_EXTENT_ITEM_KEY; 1653 key.offset = head->num_bytes; 1654 } 1655 1656 root = btrfs_extent_root(fs_info, key.objectid); 1657 again: 1658 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 1659 if (ret < 0) { 1660 return ret; 1661 } else if (ret > 0) { 1662 if (metadata) { 1663 if (path->slots[0] > 0) { 1664 
path->slots[0]--; 1665 btrfs_item_key_to_cpu(path->nodes[0], &key, 1666 path->slots[0]); 1667 if (key.objectid == head->bytenr && 1668 key.type == BTRFS_EXTENT_ITEM_KEY && 1669 key.offset == head->num_bytes) 1670 ret = 0; 1671 } 1672 if (ret > 0) { 1673 btrfs_release_path(path); 1674 metadata = 0; 1675 1676 key.objectid = head->bytenr; 1677 key.type = BTRFS_EXTENT_ITEM_KEY; 1678 key.offset = head->num_bytes; 1679 goto again; 1680 } 1681 } else { 1682 ret = -EUCLEAN; 1683 btrfs_err(fs_info, 1684 "missing extent item for extent %llu num_bytes %llu level %d", 1685 head->bytenr, head->num_bytes, head->level); 1686 return ret; 1687 } 1688 } 1689 1690 leaf = path->nodes[0]; 1691 item_size = btrfs_item_size(leaf, path->slots[0]); 1692 1693 if (unlikely(item_size < sizeof(*ei))) { 1694 ret = -EUCLEAN; 1695 btrfs_err(fs_info, 1696 "unexpected extent item size, has %u expect >= %zu", 1697 item_size, sizeof(*ei)); 1698 btrfs_abort_transaction(trans, ret); 1699 return ret; 1700 } 1701 1702 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1703 __run_delayed_extent_op(extent_op, leaf, ei); 1704 1705 return ret; 1706 } 1707 1708 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, 1709 struct btrfs_delayed_ref_head *href, 1710 struct btrfs_delayed_ref_node *node, 1711 struct btrfs_delayed_extent_op *extent_op, 1712 bool insert_reserved) 1713 { 1714 int ret = 0; 1715 struct btrfs_fs_info *fs_info = trans->fs_info; 1716 u64 parent = 0; 1717 u64 ref_root = 0; 1718 1719 trace_run_delayed_tree_ref(trans->fs_info, node); 1720 1721 if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) 1722 parent = node->parent; 1723 ref_root = node->ref_root; 1724 1725 if (unlikely(node->ref_mod != 1)) { 1726 btrfs_err(trans->fs_info, 1727 "btree block %llu has %d references rather than 1: action %d ref_root %llu parent %llu", 1728 node->bytenr, node->ref_mod, node->action, ref_root, 1729 parent); 1730 return -EUCLEAN; 1731 } 1732 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { 1733 struct btrfs_squota_delta delta = { 1734 .root = href->owning_root, 1735 .num_bytes = fs_info->nodesize, 1736 .is_data = false, 1737 .is_inc = true, 1738 .generation = trans->transid, 1739 }; 1740 1741 ret = alloc_reserved_tree_block(trans, node, extent_op); 1742 if (!ret) 1743 btrfs_record_squota_delta(fs_info, &delta); 1744 } else if (node->action == BTRFS_ADD_DELAYED_REF) { 1745 ret = __btrfs_inc_extent_ref(trans, node, extent_op); 1746 } else if (node->action == BTRFS_DROP_DELAYED_REF) { 1747 ret = __btrfs_free_extent(trans, href, node, extent_op); 1748 } else { 1749 BUG(); 1750 } 1751 return ret; 1752 } 1753 1754 /* helper function to actually process a single delayed ref entry */ 1755 static int run_one_delayed_ref(struct btrfs_trans_handle *trans, 1756 struct btrfs_delayed_ref_head *href, 1757 struct btrfs_delayed_ref_node *node, 1758 struct btrfs_delayed_extent_op *extent_op, 1759 bool insert_reserved) 1760 { 1761 int ret = 0; 1762 1763 if (TRANS_ABORTED(trans)) { 1764 if (insert_reserved) { 1765 btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1); 1766 free_head_ref_squota_rsv(trans->fs_info, href); 1767 } 1768 return 0; 1769 } 1770 1771 if (node->type == BTRFS_TREE_BLOCK_REF_KEY || 1772 node->type == BTRFS_SHARED_BLOCK_REF_KEY) 1773 ret = run_delayed_tree_ref(trans, href, node, extent_op, 1774 insert_reserved); 1775 else if (node->type == BTRFS_EXTENT_DATA_REF_KEY || 1776 node->type == BTRFS_SHARED_DATA_REF_KEY) 1777 ret = run_delayed_data_ref(trans, href, node, extent_op, 1778 
insert_reserved); 1779 else if (node->type == BTRFS_EXTENT_OWNER_REF_KEY) 1780 ret = 0; 1781 else 1782 BUG(); 1783 if (ret && insert_reserved) 1784 btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1); 1785 if (ret < 0) 1786 btrfs_err(trans->fs_info, 1787 "failed to run delayed ref for logical %llu num_bytes %llu type %u action %u ref_mod %d: %d", 1788 node->bytenr, node->num_bytes, node->type, 1789 node->action, node->ref_mod, ret); 1790 return ret; 1791 } 1792 1793 static struct btrfs_delayed_extent_op *cleanup_extent_op( 1794 struct btrfs_delayed_ref_head *head) 1795 { 1796 struct btrfs_delayed_extent_op *extent_op = head->extent_op; 1797 1798 if (!extent_op) 1799 return NULL; 1800 1801 if (head->must_insert_reserved) { 1802 head->extent_op = NULL; 1803 btrfs_free_delayed_extent_op(extent_op); 1804 return NULL; 1805 } 1806 return extent_op; 1807 } 1808 1809 static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans, 1810 struct btrfs_delayed_ref_head *head) 1811 { 1812 struct btrfs_delayed_extent_op *extent_op; 1813 int ret; 1814 1815 extent_op = cleanup_extent_op(head); 1816 if (!extent_op) 1817 return 0; 1818 head->extent_op = NULL; 1819 spin_unlock(&head->lock); 1820 ret = run_delayed_extent_op(trans, head, extent_op); 1821 btrfs_free_delayed_extent_op(extent_op); 1822 return ret ? ret : 1; 1823 } 1824 1825 u64 btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, 1826 struct btrfs_delayed_ref_root *delayed_refs, 1827 struct btrfs_delayed_ref_head *head) 1828 { 1829 u64 ret = 0; 1830 1831 /* 1832 * We had csum deletions accounted for in our delayed refs rsv, we need 1833 * to drop the csum leaves for this update from our delayed_refs_rsv. 1834 */ 1835 if (head->total_ref_mod < 0 && head->is_data) { 1836 int nr_csums; 1837 1838 spin_lock(&delayed_refs->lock); 1839 delayed_refs->pending_csums -= head->num_bytes; 1840 spin_unlock(&delayed_refs->lock); 1841 nr_csums = btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes); 1842 1843 btrfs_delayed_refs_rsv_release(fs_info, 0, nr_csums); 1844 1845 ret = btrfs_calc_delayed_ref_csum_bytes(fs_info, nr_csums); 1846 } 1847 /* must_insert_reserved can be set only if we didn't run the head ref. */ 1848 if (head->must_insert_reserved) 1849 free_head_ref_squota_rsv(fs_info, head); 1850 1851 return ret; 1852 } 1853 1854 static int cleanup_ref_head(struct btrfs_trans_handle *trans, 1855 struct btrfs_delayed_ref_head *head, 1856 u64 *bytes_released) 1857 { 1858 1859 struct btrfs_fs_info *fs_info = trans->fs_info; 1860 struct btrfs_delayed_ref_root *delayed_refs; 1861 int ret; 1862 1863 delayed_refs = &trans->transaction->delayed_refs; 1864 1865 ret = run_and_cleanup_extent_op(trans, head); 1866 if (ret < 0) { 1867 btrfs_unselect_ref_head(delayed_refs, head); 1868 btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret); 1869 return ret; 1870 } else if (ret) { 1871 return ret; 1872 } 1873 1874 /* 1875 * Need to drop our head ref lock and re-acquire the delayed ref lock 1876 * and then re-check to make sure nobody got added. 
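 * The lock order is delayed_refs->lock before head->lock, which is why
 * head->lock is dropped first and re-acquired after taking the delayed
 * refs lock below.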
1877 */ 1878 spin_unlock(&head->lock); 1879 spin_lock(&delayed_refs->lock); 1880 spin_lock(&head->lock); 1881 if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root) || head->extent_op) { 1882 spin_unlock(&head->lock); 1883 spin_unlock(&delayed_refs->lock); 1884 return 1; 1885 } 1886 btrfs_delete_ref_head(fs_info, delayed_refs, head); 1887 spin_unlock(&head->lock); 1888 spin_unlock(&delayed_refs->lock); 1889 1890 if (head->must_insert_reserved) { 1891 btrfs_pin_extent(trans, head->bytenr, head->num_bytes, 1); 1892 if (head->is_data) { 1893 struct btrfs_root *csum_root; 1894 1895 csum_root = btrfs_csum_root(fs_info, head->bytenr); 1896 ret = btrfs_del_csums(trans, csum_root, head->bytenr, 1897 head->num_bytes); 1898 } 1899 } 1900 1901 *bytes_released += btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); 1902 1903 trace_run_delayed_ref_head(fs_info, head, 0); 1904 btrfs_delayed_ref_unlock(head); 1905 btrfs_put_delayed_ref_head(head); 1906 return ret; 1907 } 1908 1909 static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, 1910 struct btrfs_delayed_ref_head *locked_ref, 1911 u64 *bytes_released) 1912 { 1913 struct btrfs_fs_info *fs_info = trans->fs_info; 1914 struct btrfs_delayed_ref_root *delayed_refs; 1915 struct btrfs_delayed_extent_op *extent_op; 1916 struct btrfs_delayed_ref_node *ref; 1917 bool must_insert_reserved; 1918 int ret; 1919 1920 delayed_refs = &trans->transaction->delayed_refs; 1921 1922 lockdep_assert_held(&locked_ref->mutex); 1923 lockdep_assert_held(&locked_ref->lock); 1924 1925 while ((ref = btrfs_select_delayed_ref(locked_ref))) { 1926 if (ref->seq && 1927 btrfs_check_delayed_seq(fs_info, ref->seq)) { 1928 spin_unlock(&locked_ref->lock); 1929 btrfs_unselect_ref_head(delayed_refs, locked_ref); 1930 return -EAGAIN; 1931 } 1932 1933 rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree); 1934 RB_CLEAR_NODE(&ref->ref_node); 1935 if (!list_empty(&ref->add_list)) 1936 list_del(&ref->add_list); 1937 /* 1938 * When we play the delayed ref, also correct the ref_mod on 1939 * head 1940 */ 1941 switch (ref->action) { 1942 case BTRFS_ADD_DELAYED_REF: 1943 case BTRFS_ADD_DELAYED_EXTENT: 1944 locked_ref->ref_mod -= ref->ref_mod; 1945 break; 1946 case BTRFS_DROP_DELAYED_REF: 1947 locked_ref->ref_mod += ref->ref_mod; 1948 break; 1949 default: 1950 WARN_ON(1); 1951 } 1952 1953 /* 1954 * Record the must_insert_reserved flag before we drop the 1955 * spin lock. 1956 */ 1957 must_insert_reserved = locked_ref->must_insert_reserved; 1958 /* 1959 * Unsetting this on the head ref relinquishes ownership of 1960 * the rsv_bytes, so it is critical that every possible code 1961 * path from here forward frees all reserves including qgroup 1962 * reserve. 
1963 */ 1964 locked_ref->must_insert_reserved = false; 1965 1966 extent_op = locked_ref->extent_op; 1967 locked_ref->extent_op = NULL; 1968 spin_unlock(&locked_ref->lock); 1969 1970 ret = run_one_delayed_ref(trans, locked_ref, ref, extent_op, 1971 must_insert_reserved); 1972 btrfs_delayed_refs_rsv_release(fs_info, 1, 0); 1973 *bytes_released += btrfs_calc_delayed_ref_bytes(fs_info, 1); 1974 1975 btrfs_free_delayed_extent_op(extent_op); 1976 if (ret) { 1977 btrfs_unselect_ref_head(delayed_refs, locked_ref); 1978 btrfs_put_delayed_ref(ref); 1979 return ret; 1980 } 1981 1982 btrfs_put_delayed_ref(ref); 1983 cond_resched(); 1984 1985 spin_lock(&locked_ref->lock); 1986 btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref); 1987 } 1988 1989 return 0; 1990 } 1991 1992 /* 1993 * Returns 0 on success or if called with an already aborted transaction. 1994 * Returns -ENOMEM or -EIO on failure and will abort the transaction. 1995 */ 1996 static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 1997 u64 min_bytes) 1998 { 1999 struct btrfs_fs_info *fs_info = trans->fs_info; 2000 struct btrfs_delayed_ref_root *delayed_refs; 2001 struct btrfs_delayed_ref_head *locked_ref = NULL; 2002 int ret; 2003 unsigned long count = 0; 2004 unsigned long max_count = 0; 2005 u64 bytes_processed = 0; 2006 2007 delayed_refs = &trans->transaction->delayed_refs; 2008 if (min_bytes == 0) { 2009 max_count = delayed_refs->num_heads_ready; 2010 min_bytes = U64_MAX; 2011 } 2012 2013 do { 2014 if (!locked_ref) { 2015 locked_ref = btrfs_select_ref_head(fs_info, delayed_refs); 2016 if (IS_ERR_OR_NULL(locked_ref)) { 2017 if (PTR_ERR(locked_ref) == -EAGAIN) { 2018 continue; 2019 } else { 2020 break; 2021 } 2022 } 2023 count++; 2024 } 2025 /* 2026 * We need to try and merge add/drops of the same ref since we 2027 * can run into issues with relocate dropping the implicit ref 2028 * and then it being added back again before the drop can 2029 * finish. If we merged anything we need to re-loop so we can 2030 * get a good ref. 2031 * Or we can get node references of the same type that weren't 2032 * merged when created due to bumps in the tree mod seq, and 2033 * we need to merge them to prevent adding an inline extent 2034 * backref before dropping it (triggering a BUG_ON at 2035 * insert_inline_extent_backref()). 2036 */ 2037 spin_lock(&locked_ref->lock); 2038 btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref); 2039 2040 ret = btrfs_run_delayed_refs_for_head(trans, locked_ref, &bytes_processed); 2041 if (ret < 0 && ret != -EAGAIN) { 2042 /* 2043 * Error, btrfs_run_delayed_refs_for_head already 2044 * unlocked everything so just bail out 2045 */ 2046 return ret; 2047 } else if (!ret) { 2048 /* 2049 * Success, perform the usual cleanup of a processed 2050 * head 2051 */ 2052 ret = cleanup_ref_head(trans, locked_ref, &bytes_processed); 2053 if (ret > 0 ) { 2054 /* We dropped our lock, we need to loop. */ 2055 ret = 0; 2056 continue; 2057 } else if (ret) { 2058 return ret; 2059 } 2060 } 2061 2062 /* 2063 * Either success case or btrfs_run_delayed_refs_for_head 2064 * returned -EAGAIN, meaning we need to select another head 2065 */ 2066 2067 locked_ref = NULL; 2068 cond_resched(); 2069 } while ((min_bytes != U64_MAX && bytes_processed < min_bytes) || 2070 (max_count > 0 && count < max_count) || 2071 locked_ref); 2072 2073 return 0; 2074 } 2075 2076 #ifdef SCRAMBLE_DELAYED_REFS 2077 /* 2078 * Normally delayed refs get processed in ascending bytenr order. 
This 2079 * correlates in most cases to the order added. To expose dependencies on this 2080 * order, we start to process the tree in the middle instead of the beginning 2081 */ 2082 static u64 find_middle(struct rb_root *root) 2083 { 2084 struct rb_node *n = root->rb_node; 2085 struct btrfs_delayed_ref_node *entry; 2086 int alt = 1; 2087 u64 middle; 2088 u64 first = 0, last = 0; 2089 2090 n = rb_first(root); 2091 if (n) { 2092 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 2093 first = entry->bytenr; 2094 } 2095 n = rb_last(root); 2096 if (n) { 2097 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 2098 last = entry->bytenr; 2099 } 2100 n = root->rb_node; 2101 2102 while (n) { 2103 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 2104 WARN_ON(!entry->in_tree); 2105 2106 middle = entry->bytenr; 2107 2108 if (alt) 2109 n = n->rb_left; 2110 else 2111 n = n->rb_right; 2112 2113 alt = 1 - alt; 2114 } 2115 return middle; 2116 } 2117 #endif 2118 2119 /* 2120 * Start processing the delayed reference count updates and extent insertions 2121 * we have queued up so far. 2122 * 2123 * @trans: Transaction handle. 2124 * @min_bytes: How many bytes of delayed references to process. After this 2125 * many bytes we stop processing delayed references if there are 2126 * any more. If 0 it means to run all existing delayed references, 2127 * but not new ones added after running all existing ones. 2128 * Use (u64)-1 (U64_MAX) to run all existing delayed references 2129 * plus any new ones that are added. 2130 * 2131 * Returns 0 on success or if called with an aborted transaction 2132 * Returns <0 on error and aborts the transaction 2133 */ 2134 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, u64 min_bytes) 2135 { 2136 struct btrfs_fs_info *fs_info = trans->fs_info; 2137 struct btrfs_delayed_ref_root *delayed_refs; 2138 int ret; 2139 2140 /* We'll clean this up in btrfs_cleanup_transaction */ 2141 if (TRANS_ABORTED(trans)) 2142 return 0; 2143 2144 if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags)) 2145 return 0; 2146 2147 delayed_refs = &trans->transaction->delayed_refs; 2148 again: 2149 #ifdef SCRAMBLE_DELAYED_REFS 2150 delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); 2151 #endif 2152 ret = __btrfs_run_delayed_refs(trans, min_bytes); 2153 if (ret < 0) { 2154 btrfs_abort_transaction(trans, ret); 2155 return ret; 2156 } 2157 2158 if (min_bytes == U64_MAX) { 2159 btrfs_create_pending_block_groups(trans); 2160 2161 spin_lock(&delayed_refs->lock); 2162 if (xa_empty(&delayed_refs->head_refs)) { 2163 spin_unlock(&delayed_refs->lock); 2164 return 0; 2165 } 2166 spin_unlock(&delayed_refs->lock); 2167 2168 cond_resched(); 2169 goto again; 2170 } 2171 2172 return 0; 2173 } 2174 2175 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, 2176 struct extent_buffer *eb, u64 flags) 2177 { 2178 struct btrfs_delayed_extent_op *extent_op; 2179 int ret; 2180 2181 extent_op = btrfs_alloc_delayed_extent_op(); 2182 if (!extent_op) 2183 return -ENOMEM; 2184 2185 extent_op->flags_to_set = flags; 2186 extent_op->update_flags = true; 2187 extent_op->update_key = false; 2188 2189 ret = btrfs_add_delayed_extent_op(trans, eb->start, eb->len, 2190 btrfs_header_level(eb), extent_op); 2191 if (ret) 2192 btrfs_free_delayed_extent_op(extent_op); 2193 return ret; 2194 } 2195 2196 static noinline int check_delayed_ref(struct btrfs_inode *inode, 2197 struct btrfs_path *path, 2198 u64 offset, u64 bytenr) 2199 { 2200 struct btrfs_root *root = 
inode->root; 2201 struct btrfs_delayed_ref_head *head; 2202 struct btrfs_delayed_ref_node *ref; 2203 struct btrfs_delayed_ref_root *delayed_refs; 2204 struct btrfs_transaction *cur_trans; 2205 struct rb_node *node; 2206 int ret = 0; 2207 2208 spin_lock(&root->fs_info->trans_lock); 2209 cur_trans = root->fs_info->running_transaction; 2210 if (cur_trans) 2211 refcount_inc(&cur_trans->use_count); 2212 spin_unlock(&root->fs_info->trans_lock); 2213 if (!cur_trans) 2214 return 0; 2215 2216 delayed_refs = &cur_trans->delayed_refs; 2217 spin_lock(&delayed_refs->lock); 2218 head = btrfs_find_delayed_ref_head(root->fs_info, delayed_refs, bytenr); 2219 if (!head) { 2220 spin_unlock(&delayed_refs->lock); 2221 btrfs_put_transaction(cur_trans); 2222 return 0; 2223 } 2224 2225 if (!mutex_trylock(&head->mutex)) { 2226 if (path->nowait) { 2227 spin_unlock(&delayed_refs->lock); 2228 btrfs_put_transaction(cur_trans); 2229 return -EAGAIN; 2230 } 2231 2232 refcount_inc(&head->refs); 2233 spin_unlock(&delayed_refs->lock); 2234 2235 btrfs_release_path(path); 2236 2237 /* 2238 * Mutex was contended, block until it's released and let 2239 * caller try again 2240 */ 2241 mutex_lock(&head->mutex); 2242 mutex_unlock(&head->mutex); 2243 btrfs_put_delayed_ref_head(head); 2244 btrfs_put_transaction(cur_trans); 2245 return -EAGAIN; 2246 } 2247 spin_unlock(&delayed_refs->lock); 2248 2249 spin_lock(&head->lock); 2250 /* 2251 * XXX: We should replace this with a proper search function in the 2252 * future. 2253 */ 2254 for (node = rb_first_cached(&head->ref_tree); node; 2255 node = rb_next(node)) { 2256 u64 ref_owner; 2257 u64 ref_offset; 2258 2259 ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node); 2260 /* If it's a shared ref we know a cross reference exists */ 2261 if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) { 2262 ret = 1; 2263 break; 2264 } 2265 2266 ref_owner = btrfs_delayed_ref_owner(ref); 2267 ref_offset = btrfs_delayed_ref_offset(ref); 2268 2269 /* 2270 * If our ref doesn't match the one we're currently looking at 2271 * then we have a cross reference. 2272 */ 2273 if (ref->ref_root != btrfs_root_id(root) || 2274 ref_owner != btrfs_ino(inode) || ref_offset != offset) { 2275 ret = 1; 2276 break; 2277 } 2278 } 2279 spin_unlock(&head->lock); 2280 mutex_unlock(&head->mutex); 2281 btrfs_put_transaction(cur_trans); 2282 return ret; 2283 } 2284 2285 /* 2286 * Check if there are references for a data extent other than the one belonging 2287 * to the given inode and offset. 2288 * 2289 * @inode: The only inode we expect to find associated with the data extent. 2290 * @path: A path to use for searching the extent tree. 2291 * @offset: The only offset we expect to find associated with the data extent. 2292 * @bytenr: The logical address of the data extent. 2293 * 2294 * When the extent does not have any other references other than the one we 2295 * expect to find, we always return a value of 0 with the path having a locked 2296 * leaf that contains the extent's extent item - this is necessary to ensure 2297 * we don't race with a task running delayed references, and our caller must 2298 * have such a path when calling check_delayed_ref() - it must lock a delayed 2299 * ref head while holding the leaf locked. 
In case the extent item is not found 2300 * in the extent tree, we return -ENOENT with the path having the leaf (locked) 2301 * where the extent item should be, in order to prevent races with another task 2302 * running delayed references, so that we don't miss any reference when calling 2303 * check_delayed_ref(). 2304 * 2305 * Note: this may return false positives, and this is because we want to be 2306 * quick here as we're called in write paths (when flushing delalloc and 2307 * in the direct IO write path). For example we can have an extent with 2308 * a single reference but that reference is not inlined, or we may have 2309 * many references in the extent tree but we also have delayed references 2310 * that cancel all the reference except the one for our inode and offset, 2311 * but it would be expensive to do such checks and complex due to all 2312 * locking to avoid races between the checks and flushing delayed refs, 2313 * plus non-inline references may be located on leaves other than the one 2314 * that contains the extent item in the extent tree. The important thing 2315 * here is to not return false negatives and that the false positives are 2316 * not very common. 2317 * 2318 * Returns: 0 if there are no cross references and with the path having a locked 2319 * leaf from the extent tree that contains the extent's extent item. 2320 * 2321 * 1 if there are cross references (false positives can happen). 2322 * 2323 * < 0 in case of an error. In case of -ENOENT the leaf in the extent 2324 * tree where the extent item should be located at is read locked and 2325 * accessible in the given path. 2326 */ 2327 static noinline int check_committed_ref(struct btrfs_inode *inode, 2328 struct btrfs_path *path, 2329 u64 offset, u64 bytenr) 2330 { 2331 struct btrfs_root *root = inode->root; 2332 struct btrfs_fs_info *fs_info = root->fs_info; 2333 struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bytenr); 2334 struct extent_buffer *leaf; 2335 struct btrfs_extent_data_ref *ref; 2336 struct btrfs_extent_inline_ref *iref; 2337 struct btrfs_extent_item *ei; 2338 struct btrfs_key key; 2339 u32 item_size; 2340 u32 expected_size; 2341 int type; 2342 int ret; 2343 2344 key.objectid = bytenr; 2345 key.type = BTRFS_EXTENT_ITEM_KEY; 2346 key.offset = (u64)-1; 2347 2348 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); 2349 if (ret < 0) 2350 return ret; 2351 if (ret == 0) { 2352 /* 2353 * Key with offset -1 found, there would have to exist an extent 2354 * item with such offset, but this is out of the valid range. 2355 */ 2356 return -EUCLEAN; 2357 } 2358 2359 if (path->slots[0] == 0) 2360 return -ENOENT; 2361 2362 path->slots[0]--; 2363 leaf = path->nodes[0]; 2364 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2365 2366 if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY) 2367 return -ENOENT; 2368 2369 item_size = btrfs_item_size(leaf, path->slots[0]); 2370 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 2371 expected_size = sizeof(*ei) + btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY); 2372 2373 /* No inline refs; we need to bail before checking for owner ref. */ 2374 if (item_size == sizeof(*ei)) 2375 return 1; 2376 2377 /* Check for an owner ref; skip over it to the real inline refs. 
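 * When a simple quotas owner ref is present it is always the first inline
 * item, so the single-reference layouts checked for below are
 *   [ extent item ][ owner ref ][ data ref ]    and
 *   [ extent item ][ data ref ]
 * (illustrative, sizes omitted).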
*/ 2378 iref = (struct btrfs_extent_inline_ref *)(ei + 1); 2379 type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA); 2380 if (btrfs_fs_incompat(fs_info, SIMPLE_QUOTA) && type == BTRFS_EXTENT_OWNER_REF_KEY) { 2381 expected_size += btrfs_extent_inline_ref_size(BTRFS_EXTENT_OWNER_REF_KEY); 2382 iref = (struct btrfs_extent_inline_ref *)(iref + 1); 2383 type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA); 2384 } 2385 2386 /* If extent item has more than 1 inline ref then it's shared */ 2387 if (item_size != expected_size) 2388 return 1; 2389 2390 /* If this extent has SHARED_DATA_REF then it's shared */ 2391 if (type != BTRFS_EXTENT_DATA_REF_KEY) 2392 return 1; 2393 2394 ref = (struct btrfs_extent_data_ref *)(&iref->offset); 2395 if (btrfs_extent_refs(leaf, ei) != 2396 btrfs_extent_data_ref_count(leaf, ref) || 2397 btrfs_extent_data_ref_root(leaf, ref) != btrfs_root_id(root) || 2398 btrfs_extent_data_ref_objectid(leaf, ref) != btrfs_ino(inode) || 2399 btrfs_extent_data_ref_offset(leaf, ref) != offset) 2400 return 1; 2401 2402 return 0; 2403 } 2404 2405 int btrfs_cross_ref_exist(struct btrfs_inode *inode, u64 offset, 2406 u64 bytenr, struct btrfs_path *path) 2407 { 2408 int ret; 2409 2410 do { 2411 ret = check_committed_ref(inode, path, offset, bytenr); 2412 if (ret && ret != -ENOENT) 2413 goto out; 2414 2415 /* 2416 * The path must have a locked leaf from the extent tree where 2417 * the extent item for our extent is located, in case it exists, 2418 * or where it should be located in case it doesn't exist yet 2419 * because it's new and its delayed ref was not yet flushed. 2420 * We need to lock the delayed ref head at check_delayed_ref(), 2421 * if one exists, while holding the leaf locked in order to not 2422 * race with delayed ref flushing, missing references and 2423 * incorrectly reporting that the extent is not shared. 
2424 */ 2425 if (IS_ENABLED(CONFIG_BTRFS_ASSERT)) { 2426 struct extent_buffer *leaf = path->nodes[0]; 2427 2428 ASSERT(leaf != NULL); 2429 btrfs_assert_tree_read_locked(leaf); 2430 2431 if (ret != -ENOENT) { 2432 struct btrfs_key key; 2433 2434 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2435 ASSERT(key.objectid == bytenr); 2436 ASSERT(key.type == BTRFS_EXTENT_ITEM_KEY); 2437 } 2438 } 2439 2440 ret = check_delayed_ref(inode, path, offset, bytenr); 2441 } while (ret == -EAGAIN && !path->nowait); 2442 2443 out: 2444 btrfs_release_path(path); 2445 if (btrfs_is_data_reloc_root(inode->root)) 2446 WARN_ON(ret > 0); 2447 return ret; 2448 } 2449 2450 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, 2451 struct btrfs_root *root, 2452 struct extent_buffer *buf, 2453 int full_backref, int inc) 2454 { 2455 struct btrfs_fs_info *fs_info = root->fs_info; 2456 u64 parent; 2457 u64 ref_root; 2458 u32 nritems; 2459 struct btrfs_key key; 2460 struct btrfs_file_extent_item *fi; 2461 bool for_reloc = btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC); 2462 int i; 2463 int action; 2464 int level; 2465 int ret = 0; 2466 2467 if (btrfs_is_testing(fs_info)) 2468 return 0; 2469 2470 ref_root = btrfs_header_owner(buf); 2471 nritems = btrfs_header_nritems(buf); 2472 level = btrfs_header_level(buf); 2473 2474 if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && level == 0) 2475 return 0; 2476 2477 if (full_backref) 2478 parent = buf->start; 2479 else 2480 parent = 0; 2481 if (inc) 2482 action = BTRFS_ADD_DELAYED_REF; 2483 else 2484 action = BTRFS_DROP_DELAYED_REF; 2485 2486 for (i = 0; i < nritems; i++) { 2487 struct btrfs_ref ref = { 2488 .action = action, 2489 .parent = parent, 2490 .ref_root = ref_root, 2491 }; 2492 2493 if (level == 0) { 2494 btrfs_item_key_to_cpu(buf, &key, i); 2495 if (key.type != BTRFS_EXTENT_DATA_KEY) 2496 continue; 2497 fi = btrfs_item_ptr(buf, i, 2498 struct btrfs_file_extent_item); 2499 if (btrfs_file_extent_type(buf, fi) == 2500 BTRFS_FILE_EXTENT_INLINE) 2501 continue; 2502 ref.bytenr = btrfs_file_extent_disk_bytenr(buf, fi); 2503 if (ref.bytenr == 0) 2504 continue; 2505 2506 ref.num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi); 2507 ref.owning_root = ref_root; 2508 2509 key.offset -= btrfs_file_extent_offset(buf, fi); 2510 btrfs_init_data_ref(&ref, key.objectid, key.offset, 2511 btrfs_root_id(root), for_reloc); 2512 if (inc) 2513 ret = btrfs_inc_extent_ref(trans, &ref); 2514 else 2515 ret = btrfs_free_extent(trans, &ref); 2516 if (ret) 2517 goto fail; 2518 } else { 2519 /* We don't know the owning_root, leave as 0. 
*/ 2520 ref.bytenr = btrfs_node_blockptr(buf, i); 2521 ref.num_bytes = fs_info->nodesize; 2522 2523 btrfs_init_tree_ref(&ref, level - 1, 2524 btrfs_root_id(root), for_reloc); 2525 if (inc) 2526 ret = btrfs_inc_extent_ref(trans, &ref); 2527 else 2528 ret = btrfs_free_extent(trans, &ref); 2529 if (ret) 2530 goto fail; 2531 } 2532 } 2533 return 0; 2534 fail: 2535 return ret; 2536 } 2537 2538 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 2539 struct extent_buffer *buf, int full_backref) 2540 { 2541 return __btrfs_mod_ref(trans, root, buf, full_backref, 1); 2542 } 2543 2544 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 2545 struct extent_buffer *buf, int full_backref) 2546 { 2547 return __btrfs_mod_ref(trans, root, buf, full_backref, 0); 2548 } 2549 2550 static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data) 2551 { 2552 struct btrfs_fs_info *fs_info = root->fs_info; 2553 u64 flags; 2554 u64 ret; 2555 2556 if (data) 2557 flags = BTRFS_BLOCK_GROUP_DATA; 2558 else if (root == fs_info->chunk_root) 2559 flags = BTRFS_BLOCK_GROUP_SYSTEM; 2560 else 2561 flags = BTRFS_BLOCK_GROUP_METADATA; 2562 2563 ret = btrfs_get_alloc_profile(fs_info, flags); 2564 return ret; 2565 } 2566 2567 static u64 first_logical_byte(struct btrfs_fs_info *fs_info) 2568 { 2569 struct rb_node *leftmost; 2570 u64 bytenr = 0; 2571 2572 read_lock(&fs_info->block_group_cache_lock); 2573 /* Get the block group with the lowest logical start address. */ 2574 leftmost = rb_first_cached(&fs_info->block_group_cache_tree); 2575 if (leftmost) { 2576 struct btrfs_block_group *bg; 2577 2578 bg = rb_entry(leftmost, struct btrfs_block_group, cache_node); 2579 bytenr = bg->start; 2580 } 2581 read_unlock(&fs_info->block_group_cache_lock); 2582 2583 return bytenr; 2584 } 2585 2586 static int pin_down_extent(struct btrfs_trans_handle *trans, 2587 struct btrfs_block_group *cache, 2588 u64 bytenr, u64 num_bytes, int reserved) 2589 { 2590 spin_lock(&cache->space_info->lock); 2591 spin_lock(&cache->lock); 2592 cache->pinned += num_bytes; 2593 btrfs_space_info_update_bytes_pinned(cache->space_info, num_bytes); 2594 if (reserved) { 2595 cache->reserved -= num_bytes; 2596 cache->space_info->bytes_reserved -= num_bytes; 2597 } 2598 spin_unlock(&cache->lock); 2599 spin_unlock(&cache->space_info->lock); 2600 2601 set_extent_bit(&trans->transaction->pinned_extents, bytenr, 2602 bytenr + num_bytes - 1, EXTENT_DIRTY, NULL); 2603 return 0; 2604 } 2605 2606 int btrfs_pin_extent(struct btrfs_trans_handle *trans, 2607 u64 bytenr, u64 num_bytes, int reserved) 2608 { 2609 struct btrfs_block_group *cache; 2610 2611 cache = btrfs_lookup_block_group(trans->fs_info, bytenr); 2612 BUG_ON(!cache); /* Logic error */ 2613 2614 pin_down_extent(trans, cache, bytenr, num_bytes, reserved); 2615 2616 btrfs_put_block_group(cache); 2617 return 0; 2618 } 2619 2620 int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, 2621 const struct extent_buffer *eb) 2622 { 2623 struct btrfs_block_group *cache; 2624 int ret; 2625 2626 cache = btrfs_lookup_block_group(trans->fs_info, eb->start); 2627 if (!cache) 2628 return -EINVAL; 2629 2630 /* 2631 * Fully cache the free space first so that our pin removes the free space 2632 * from the cache. 
2633 */ 2634 ret = btrfs_cache_block_group(cache, true); 2635 if (ret) 2636 goto out; 2637 2638 pin_down_extent(trans, cache, eb->start, eb->len, 0); 2639 2640 /* remove us from the free space cache (if we're there at all) */ 2641 ret = btrfs_remove_free_space(cache, eb->start, eb->len); 2642 out: 2643 btrfs_put_block_group(cache); 2644 return ret; 2645 } 2646 2647 static int __exclude_logged_extent(struct btrfs_fs_info *fs_info, 2648 u64 start, u64 num_bytes) 2649 { 2650 int ret; 2651 struct btrfs_block_group *block_group; 2652 2653 block_group = btrfs_lookup_block_group(fs_info, start); 2654 if (!block_group) 2655 return -EINVAL; 2656 2657 ret = btrfs_cache_block_group(block_group, true); 2658 if (ret) 2659 goto out; 2660 2661 ret = btrfs_remove_free_space(block_group, start, num_bytes); 2662 out: 2663 btrfs_put_block_group(block_group); 2664 return ret; 2665 } 2666 2667 int btrfs_exclude_logged_extents(struct extent_buffer *eb) 2668 { 2669 struct btrfs_fs_info *fs_info = eb->fs_info; 2670 struct btrfs_file_extent_item *item; 2671 struct btrfs_key key; 2672 int found_type; 2673 int i; 2674 int ret = 0; 2675 2676 if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) 2677 return 0; 2678 2679 for (i = 0; i < btrfs_header_nritems(eb); i++) { 2680 btrfs_item_key_to_cpu(eb, &key, i); 2681 if (key.type != BTRFS_EXTENT_DATA_KEY) 2682 continue; 2683 item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item); 2684 found_type = btrfs_file_extent_type(eb, item); 2685 if (found_type == BTRFS_FILE_EXTENT_INLINE) 2686 continue; 2687 if (btrfs_file_extent_disk_bytenr(eb, item) == 0) 2688 continue; 2689 key.objectid = btrfs_file_extent_disk_bytenr(eb, item); 2690 key.offset = btrfs_file_extent_disk_num_bytes(eb, item); 2691 ret = __exclude_logged_extent(fs_info, key.objectid, key.offset); 2692 if (ret) 2693 break; 2694 } 2695 2696 return ret; 2697 } 2698 2699 static void 2700 btrfs_inc_block_group_reservations(struct btrfs_block_group *bg) 2701 { 2702 atomic_inc(&bg->reservations); 2703 } 2704 2705 /* 2706 * Returns the free cluster for the given space info and sets empty_cluster to 2707 * what it should be based on the mount options. 
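 *
 * Summarized (mirroring the checks below):
 *
 *   metadata, ssd            -> meta_alloc_cluster, empty_cluster = 2M
 *   metadata, no ssd         -> meta_alloc_cluster, empty_cluster = 64K
 *   data, ssd_spread         -> data_alloc_cluster, empty_cluster = 2M
 *   mixed space info / other -> no cluster, empty_cluster = 0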
2708 */ 2709 static struct btrfs_free_cluster * 2710 fetch_cluster_info(struct btrfs_fs_info *fs_info, 2711 struct btrfs_space_info *space_info, u64 *empty_cluster) 2712 { 2713 struct btrfs_free_cluster *ret = NULL; 2714 2715 *empty_cluster = 0; 2716 if (btrfs_mixed_space_info(space_info)) 2717 return ret; 2718 2719 if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { 2720 ret = &fs_info->meta_alloc_cluster; 2721 if (btrfs_test_opt(fs_info, SSD)) 2722 *empty_cluster = SZ_2M; 2723 else 2724 *empty_cluster = SZ_64K; 2725 } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) && 2726 btrfs_test_opt(fs_info, SSD_SPREAD)) { 2727 *empty_cluster = SZ_2M; 2728 ret = &fs_info->data_alloc_cluster; 2729 } 2730 2731 return ret; 2732 } 2733 2734 static int unpin_extent_range(struct btrfs_fs_info *fs_info, 2735 u64 start, u64 end, 2736 const bool return_free_space) 2737 { 2738 struct btrfs_block_group *cache = NULL; 2739 struct btrfs_space_info *space_info; 2740 struct btrfs_free_cluster *cluster = NULL; 2741 u64 total_unpinned = 0; 2742 u64 empty_cluster = 0; 2743 bool readonly; 2744 int ret = 0; 2745 2746 while (start <= end) { 2747 u64 len; 2748 2749 readonly = false; 2750 if (!cache || 2751 start >= cache->start + cache->length) { 2752 if (cache) 2753 btrfs_put_block_group(cache); 2754 total_unpinned = 0; 2755 cache = btrfs_lookup_block_group(fs_info, start); 2756 if (cache == NULL) { 2757 /* Logic error, something removed the block group. */ 2758 ret = -EUCLEAN; 2759 goto out; 2760 } 2761 2762 cluster = fetch_cluster_info(fs_info, 2763 cache->space_info, 2764 &empty_cluster); 2765 empty_cluster <<= 1; 2766 } 2767 2768 len = cache->start + cache->length - start; 2769 len = min(len, end + 1 - start); 2770 2771 if (return_free_space) 2772 btrfs_add_free_space(cache, start, len); 2773 2774 start += len; 2775 total_unpinned += len; 2776 space_info = cache->space_info; 2777 2778 /* 2779 * If this space cluster has been marked as fragmented and we've 2780 * unpinned enough in this block group to potentially allow a 2781 * cluster to be created inside of it go ahead and clear the 2782 * fragmented check. 
2783 */ 2784 if (cluster && cluster->fragmented && 2785 total_unpinned > empty_cluster) { 2786 spin_lock(&cluster->lock); 2787 cluster->fragmented = 0; 2788 spin_unlock(&cluster->lock); 2789 } 2790 2791 spin_lock(&space_info->lock); 2792 spin_lock(&cache->lock); 2793 cache->pinned -= len; 2794 btrfs_space_info_update_bytes_pinned(space_info, -len); 2795 space_info->max_extent_size = 0; 2796 if (cache->ro) { 2797 space_info->bytes_readonly += len; 2798 readonly = true; 2799 } else if (btrfs_is_zoned(fs_info)) { 2800 /* Need reset before reusing in a zoned block group */ 2801 btrfs_space_info_update_bytes_zone_unusable(space_info, len); 2802 readonly = true; 2803 } 2804 spin_unlock(&cache->lock); 2805 if (!readonly && return_free_space) 2806 btrfs_return_free_space(space_info, len); 2807 spin_unlock(&space_info->lock); 2808 } 2809 2810 if (cache) 2811 btrfs_put_block_group(cache); 2812 out: 2813 return ret; 2814 } 2815 2816 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) 2817 { 2818 struct btrfs_fs_info *fs_info = trans->fs_info; 2819 struct btrfs_block_group *block_group, *tmp; 2820 struct list_head *deleted_bgs; 2821 struct extent_io_tree *unpin; 2822 u64 start; 2823 u64 end; 2824 int ret; 2825 2826 unpin = &trans->transaction->pinned_extents; 2827 2828 while (!TRANS_ABORTED(trans)) { 2829 struct extent_state *cached_state = NULL; 2830 2831 mutex_lock(&fs_info->unused_bg_unpin_mutex); 2832 if (!find_first_extent_bit(unpin, 0, &start, &end, 2833 EXTENT_DIRTY, &cached_state)) { 2834 mutex_unlock(&fs_info->unused_bg_unpin_mutex); 2835 break; 2836 } 2837 2838 if (btrfs_test_opt(fs_info, DISCARD_SYNC)) 2839 ret = btrfs_discard_extent(fs_info, start, 2840 end + 1 - start, NULL); 2841 2842 clear_extent_dirty(unpin, start, end, &cached_state); 2843 ret = unpin_extent_range(fs_info, start, end, true); 2844 BUG_ON(ret); 2845 mutex_unlock(&fs_info->unused_bg_unpin_mutex); 2846 free_extent_state(cached_state); 2847 cond_resched(); 2848 } 2849 2850 if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) { 2851 btrfs_discard_calc_delay(&fs_info->discard_ctl); 2852 btrfs_discard_schedule_work(&fs_info->discard_ctl, true); 2853 } 2854 2855 /* 2856 * Transaction is finished. We don't need the lock anymore. We 2857 * do need to clean up the block groups in case of a transaction 2858 * abort. 2859 */ 2860 deleted_bgs = &trans->transaction->deleted_bgs; 2861 list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) { 2862 u64 trimmed = 0; 2863 2864 ret = -EROFS; 2865 if (!TRANS_ABORTED(trans)) 2866 ret = btrfs_discard_extent(fs_info, 2867 block_group->start, 2868 block_group->length, 2869 &trimmed); 2870 2871 /* 2872 * Not strictly necessary to lock, as the block_group should be 2873 * read-only from btrfs_delete_unused_bgs(). 2874 */ 2875 ASSERT(block_group->ro); 2876 spin_lock(&fs_info->unused_bgs_lock); 2877 list_del_init(&block_group->bg_list); 2878 spin_unlock(&fs_info->unused_bgs_lock); 2879 2880 btrfs_unfreeze_block_group(block_group); 2881 btrfs_put_block_group(block_group); 2882 2883 if (ret) { 2884 const char *errstr = btrfs_decode_error(ret); 2885 btrfs_warn(fs_info, 2886 "discard failed while removing blockgroup: errno=%d %s", 2887 ret, errstr); 2888 } 2889 } 2890 2891 return 0; 2892 } 2893 2894 /* 2895 * Parse an extent item's inline extents looking for a simple quotas owner ref. 
2896 * 2897 * @fs_info: the btrfs_fs_info for this mount 2898 * @leaf: a leaf in the extent tree containing the extent item 2899 * @slot: the slot in the leaf where the extent item is found 2900 * 2901 * Returns the objectid of the root that originally allocated the extent item 2902 * if the inline owner ref is expected and present, otherwise 0. 2903 * 2904 * If an extent item has an owner ref item, it will be the first inline ref 2905 * item. Therefore the logic is to check whether there are any inline ref 2906 * items, then check the type of the first one. 2907 */ 2908 u64 btrfs_get_extent_owner_root(struct btrfs_fs_info *fs_info, 2909 struct extent_buffer *leaf, int slot) 2910 { 2911 struct btrfs_extent_item *ei; 2912 struct btrfs_extent_inline_ref *iref; 2913 struct btrfs_extent_owner_ref *oref; 2914 unsigned long ptr; 2915 unsigned long end; 2916 int type; 2917 2918 if (!btrfs_fs_incompat(fs_info, SIMPLE_QUOTA)) 2919 return 0; 2920 2921 ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); 2922 ptr = (unsigned long)(ei + 1); 2923 end = (unsigned long)ei + btrfs_item_size(leaf, slot); 2924 2925 /* No inline ref items of any kind, can't check type. */ 2926 if (ptr == end) 2927 return 0; 2928 2929 iref = (struct btrfs_extent_inline_ref *)ptr; 2930 type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY); 2931 2932 /* We found an owner ref, get the root out of it. */ 2933 if (type == BTRFS_EXTENT_OWNER_REF_KEY) { 2934 oref = (struct btrfs_extent_owner_ref *)(&iref->offset); 2935 return btrfs_extent_owner_ref_root_id(leaf, oref); 2936 } 2937 2938 /* We have inline refs, but not an owner ref. */ 2939 return 0; 2940 } 2941 2942 static int do_free_extent_accounting(struct btrfs_trans_handle *trans, 2943 u64 bytenr, struct btrfs_squota_delta *delta) 2944 { 2945 int ret; 2946 u64 num_bytes = delta->num_bytes; 2947 2948 if (delta->is_data) { 2949 struct btrfs_root *csum_root; 2950 2951 csum_root = btrfs_csum_root(trans->fs_info, bytenr); 2952 ret = btrfs_del_csums(trans, csum_root, bytenr, num_bytes); 2953 if (ret) { 2954 btrfs_abort_transaction(trans, ret); 2955 return ret; 2956 } 2957 2958 ret = btrfs_delete_raid_extent(trans, bytenr, num_bytes); 2959 if (ret) { 2960 btrfs_abort_transaction(trans, ret); 2961 return ret; 2962 } 2963 } 2964 2965 ret = btrfs_record_squota_delta(trans->fs_info, delta); 2966 if (ret) { 2967 btrfs_abort_transaction(trans, ret); 2968 return ret; 2969 } 2970 2971 ret = add_to_free_space_tree(trans, bytenr, num_bytes); 2972 if (ret) { 2973 btrfs_abort_transaction(trans, ret); 2974 return ret; 2975 } 2976 2977 ret = btrfs_update_block_group(trans, bytenr, num_bytes, false); 2978 if (ret) 2979 btrfs_abort_transaction(trans, ret); 2980 2981 return ret; 2982 } 2983 2984 #define abort_and_dump(trans, path, fmt, args...) \ 2985 ({ \ 2986 btrfs_abort_transaction(trans, -EUCLEAN); \ 2987 btrfs_print_leaf(path->nodes[0]); \ 2988 btrfs_crit(trans->fs_info, fmt, ##args); \ 2989 }) 2990 2991 /* 2992 * Drop one or more refs of @node. 2993 * 2994 * 1. Locate the extent refs. 2995 * It's either inline in EXTENT/METADATA_ITEM or in keyed SHARED_* item. 2996 * Locate it, then reduce the refs number or remove the ref line completely. 2997 * 2998 * 2. 
Update the refs count in EXTENT/METADATA_ITEM 2999 * 3000 * Inline backref case: 3001 * 3002 * in extent tree we have: 3003 * 3004 * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82 3005 * refs 2 gen 6 flags DATA 3006 * extent data backref root FS_TREE objectid 258 offset 0 count 1 3007 * extent data backref root FS_TREE objectid 257 offset 0 count 1 3008 * 3009 * This function gets called with: 3010 * 3011 * node->bytenr = 13631488 3012 * node->num_bytes = 1048576 3013 * root_objectid = FS_TREE 3014 * owner_objectid = 257 3015 * owner_offset = 0 3016 * refs_to_drop = 1 3017 * 3018 * Then we should get something like: 3019 * 3020 * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82 3021 * refs 1 gen 6 flags DATA 3022 * extent data backref root FS_TREE objectid 258 offset 0 count 1 3023 * 3024 * Keyed backref case: 3025 * 3026 * in extent tree we have: 3027 * 3028 * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24 3029 * refs 754 gen 6 flags DATA 3030 * [...] 3031 * item 2 key (13631488 EXTENT_DATA_REF <HASH>) itemoff 3915 itemsize 28 3032 * extent data backref root FS_TREE objectid 866 offset 0 count 1 3033 * 3034 * This function gets called with: 3035 * 3036 * node->bytenr = 13631488 3037 * node->num_bytes = 1048576 3038 * root_objectid = FS_TREE 3039 * owner_objectid = 866 3040 * owner_offset = 0 3041 * refs_to_drop = 1 3042 * 3043 * Then we should get something like: 3044 * 3045 * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24 3046 * refs 753 gen 6 flags DATA 3047 * 3048 * And that (13631488 EXTENT_DATA_REF <HASH>) gets removed. 3049 */ 3050 static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 3051 struct btrfs_delayed_ref_head *href, 3052 struct btrfs_delayed_ref_node *node, 3053 struct btrfs_delayed_extent_op *extent_op) 3054 { 3055 struct btrfs_fs_info *info = trans->fs_info; 3056 struct btrfs_key key; 3057 struct btrfs_path *path; 3058 struct btrfs_root *extent_root; 3059 struct extent_buffer *leaf; 3060 struct btrfs_extent_item *ei; 3061 struct btrfs_extent_inline_ref *iref; 3062 int ret; 3063 int is_data; 3064 int extent_slot = 0; 3065 int found_extent = 0; 3066 int num_to_del = 1; 3067 int refs_to_drop = node->ref_mod; 3068 u32 item_size; 3069 u64 refs; 3070 u64 bytenr = node->bytenr; 3071 u64 num_bytes = node->num_bytes; 3072 u64 owner_objectid = btrfs_delayed_ref_owner(node); 3073 u64 owner_offset = btrfs_delayed_ref_offset(node); 3074 bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA); 3075 u64 delayed_ref_root = href->owning_root; 3076 3077 extent_root = btrfs_extent_root(info, bytenr); 3078 ASSERT(extent_root); 3079 3080 path = btrfs_alloc_path(); 3081 if (!path) 3082 return -ENOMEM; 3083 3084 is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID; 3085 3086 if (!is_data && refs_to_drop != 1) { 3087 btrfs_crit(info, 3088 "invalid refs_to_drop, dropping more than 1 refs for tree block %llu refs_to_drop %u", 3089 node->bytenr, refs_to_drop); 3090 ret = -EINVAL; 3091 btrfs_abort_transaction(trans, ret); 3092 goto out; 3093 } 3094 3095 if (is_data) 3096 skinny_metadata = false; 3097 3098 ret = lookup_extent_backref(trans, path, &iref, bytenr, num_bytes, 3099 node->parent, node->ref_root, owner_objectid, 3100 owner_offset); 3101 if (ret == 0) { 3102 /* 3103 * Either the inline backref or the SHARED_DATA_REF/ 3104 * SHARED_BLOCK_REF is found 3105 * 3106 * Here is a quick path to locate EXTENT/METADATA_ITEM. 3107 * It's possible the EXTENT/METADATA_ITEM is near current slot.
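 * For a keyed backref the leaf typically looks like
 *   [ EXTENT/METADATA_ITEM ] ... [ SHARED_* or EXTENT_DATA_REF item ]
 *     (extent_slot)                  (path->slots[0])
 * so walking back at most a few slots from path->slots[0] is usually enough
 * to find it without doing a second tree search.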
3108 */ 3109 extent_slot = path->slots[0]; 3110 while (extent_slot >= 0) { 3111 btrfs_item_key_to_cpu(path->nodes[0], &key, 3112 extent_slot); 3113 if (key.objectid != bytenr) 3114 break; 3115 if (key.type == BTRFS_EXTENT_ITEM_KEY && 3116 key.offset == num_bytes) { 3117 found_extent = 1; 3118 break; 3119 } 3120 if (key.type == BTRFS_METADATA_ITEM_KEY && 3121 key.offset == owner_objectid) { 3122 found_extent = 1; 3123 break; 3124 } 3125 3126 /* Quick path didn't find the EXTENT/METADATA_ITEM */ 3127 if (path->slots[0] - extent_slot > 5) 3128 break; 3129 extent_slot--; 3130 } 3131 3132 if (!found_extent) { 3133 if (iref) { 3134 abort_and_dump(trans, path, 3135 "invalid iref slot %u, no EXTENT/METADATA_ITEM found but has inline extent ref", 3136 path->slots[0]); 3137 ret = -EUCLEAN; 3138 goto out; 3139 } 3140 /* Must be SHARED_* item, remove the backref first */ 3141 ret = remove_extent_backref(trans, extent_root, path, 3142 NULL, refs_to_drop, is_data); 3143 if (ret) { 3144 btrfs_abort_transaction(trans, ret); 3145 goto out; 3146 } 3147 btrfs_release_path(path); 3148 3149 /* Slow path to locate EXTENT/METADATA_ITEM */ 3150 key.objectid = bytenr; 3151 key.type = BTRFS_EXTENT_ITEM_KEY; 3152 key.offset = num_bytes; 3153 3154 if (!is_data && skinny_metadata) { 3155 key.type = BTRFS_METADATA_ITEM_KEY; 3156 key.offset = owner_objectid; 3157 } 3158 3159 ret = btrfs_search_slot(trans, extent_root, 3160 &key, path, -1, 1); 3161 if (ret > 0 && skinny_metadata && path->slots[0]) { 3162 /* 3163 * Couldn't find our skinny metadata item, 3164 * see if we have ye olde extent item. 3165 */ 3166 path->slots[0]--; 3167 btrfs_item_key_to_cpu(path->nodes[0], &key, 3168 path->slots[0]); 3169 if (key.objectid == bytenr && 3170 key.type == BTRFS_EXTENT_ITEM_KEY && 3171 key.offset == num_bytes) 3172 ret = 0; 3173 } 3174 3175 if (ret > 0 && skinny_metadata) { 3176 skinny_metadata = false; 3177 key.objectid = bytenr; 3178 key.type = BTRFS_EXTENT_ITEM_KEY; 3179 key.offset = num_bytes; 3180 btrfs_release_path(path); 3181 ret = btrfs_search_slot(trans, extent_root, 3182 &key, path, -1, 1); 3183 } 3184 3185 if (ret) { 3186 if (ret > 0) 3187 btrfs_print_leaf(path->nodes[0]); 3188 btrfs_err(info, 3189 "umm, got %d back from search, was looking for %llu, slot %d", 3190 ret, bytenr, path->slots[0]); 3191 } 3192 if (ret < 0) { 3193 btrfs_abort_transaction(trans, ret); 3194 goto out; 3195 } 3196 extent_slot = path->slots[0]; 3197 } 3198 } else if (WARN_ON(ret == -ENOENT)) { 3199 abort_and_dump(trans, path, 3200 "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu slot %d", 3201 bytenr, node->parent, node->ref_root, owner_objectid, 3202 owner_offset, path->slots[0]); 3203 goto out; 3204 } else { 3205 btrfs_abort_transaction(trans, ret); 3206 goto out; 3207 } 3208 3209 leaf = path->nodes[0]; 3210 item_size = btrfs_item_size(leaf, extent_slot); 3211 if (unlikely(item_size < sizeof(*ei))) { 3212 ret = -EUCLEAN; 3213 btrfs_err(trans->fs_info, 3214 "unexpected extent item size, has %u expect >= %zu", 3215 item_size, sizeof(*ei)); 3216 btrfs_abort_transaction(trans, ret); 3217 goto out; 3218 } 3219 ei = btrfs_item_ptr(leaf, extent_slot, 3220 struct btrfs_extent_item); 3221 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID && 3222 key.type == BTRFS_EXTENT_ITEM_KEY) { 3223 struct btrfs_tree_block_info *bi; 3224 3225 if (item_size < sizeof(*ei) + sizeof(*bi)) { 3226 abort_and_dump(trans, path, 3227 "invalid extent item size for key (%llu, %u, %llu) slot %u owner %llu, has %u expect >= %zu", 3228 key.objectid, 
key.type, key.offset, 3229 path->slots[0], owner_objectid, item_size, 3230 sizeof(*ei) + sizeof(*bi)); 3231 ret = -EUCLEAN; 3232 goto out; 3233 } 3234 bi = (struct btrfs_tree_block_info *)(ei + 1); 3235 WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi)); 3236 } 3237 3238 refs = btrfs_extent_refs(leaf, ei); 3239 if (refs < refs_to_drop) { 3240 abort_and_dump(trans, path, 3241 "trying to drop %d refs but we only have %llu for bytenr %llu slot %u", 3242 refs_to_drop, refs, bytenr, path->slots[0]); 3243 ret = -EUCLEAN; 3244 goto out; 3245 } 3246 refs -= refs_to_drop; 3247 3248 if (refs > 0) { 3249 if (extent_op) 3250 __run_delayed_extent_op(extent_op, leaf, ei); 3251 /* 3252 * In the case of inline back ref, reference count will 3253 * be updated by remove_extent_backref 3254 */ 3255 if (iref) { 3256 if (!found_extent) { 3257 abort_and_dump(trans, path, 3258 "invalid iref, got inlined extent ref but no EXTENT/METADATA_ITEM found, slot %u", 3259 path->slots[0]); 3260 ret = -EUCLEAN; 3261 goto out; 3262 } 3263 } else { 3264 btrfs_set_extent_refs(leaf, ei, refs); 3265 } 3266 if (found_extent) { 3267 ret = remove_extent_backref(trans, extent_root, path, 3268 iref, refs_to_drop, is_data); 3269 if (ret) { 3270 btrfs_abort_transaction(trans, ret); 3271 goto out; 3272 } 3273 } 3274 } else { 3275 struct btrfs_squota_delta delta = { 3276 .root = delayed_ref_root, 3277 .num_bytes = num_bytes, 3278 .is_data = is_data, 3279 .is_inc = false, 3280 .generation = btrfs_extent_generation(leaf, ei), 3281 }; 3282 3283 /* In this branch refs == 1 */ 3284 if (found_extent) { 3285 if (is_data && refs_to_drop != 3286 extent_data_ref_count(path, iref)) { 3287 abort_and_dump(trans, path, 3288 "invalid refs_to_drop, current refs %u refs_to_drop %u slot %u", 3289 extent_data_ref_count(path, iref), 3290 refs_to_drop, path->slots[0]); 3291 ret = -EUCLEAN; 3292 goto out; 3293 } 3294 if (iref) { 3295 if (path->slots[0] != extent_slot) { 3296 abort_and_dump(trans, path, 3297 "invalid iref, extent item key (%llu %u %llu) slot %u doesn't have wanted iref", 3298 key.objectid, key.type, 3299 key.offset, path->slots[0]); 3300 ret = -EUCLEAN; 3301 goto out; 3302 } 3303 } else { 3304 /* 3305 * No inline ref, we must be at SHARED_* item, 3306 * And it's single ref, it must be: 3307 * | extent_slot ||extent_slot + 1| 3308 * [ EXTENT/METADATA_ITEM ][ SHARED_* ITEM ] 3309 */ 3310 if (path->slots[0] != extent_slot + 1) { 3311 abort_and_dump(trans, path, 3312 "invalid SHARED_* item slot %u, previous item is not EXTENT/METADATA_ITEM", 3313 path->slots[0]); 3314 ret = -EUCLEAN; 3315 goto out; 3316 } 3317 path->slots[0] = extent_slot; 3318 num_to_del = 2; 3319 } 3320 } 3321 /* 3322 * We can't infer the data owner from the delayed ref, so we need 3323 * to try to get it from the owning ref item. 3324 * 3325 * If it is not present, then that extent was not written under 3326 * simple quotas mode, so we don't need to account for its deletion. 
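 * (btrfs_get_extent_owner_root() returns 0 when there is no inline owner
 * ref, so delta.root simply ends up as 0 in that case.)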
3327 */ 3328 if (is_data) 3329 delta.root = btrfs_get_extent_owner_root(trans->fs_info, 3330 leaf, extent_slot); 3331 3332 ret = btrfs_del_items(trans, extent_root, path, path->slots[0], 3333 num_to_del); 3334 if (ret) { 3335 btrfs_abort_transaction(trans, ret); 3336 goto out; 3337 } 3338 btrfs_release_path(path); 3339 3340 ret = do_free_extent_accounting(trans, bytenr, &delta); 3341 } 3342 btrfs_release_path(path); 3343 3344 out: 3345 btrfs_free_path(path); 3346 return ret; 3347 } 3348 3349 /* 3350 * when we free an block, it is possible (and likely) that we free the last 3351 * delayed ref for that extent as well. This searches the delayed ref tree for 3352 * a given extent, and if there are no other delayed refs to be processed, it 3353 * removes it from the tree. 3354 */ 3355 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, 3356 u64 bytenr) 3357 { 3358 struct btrfs_fs_info *fs_info = trans->fs_info; 3359 struct btrfs_delayed_ref_head *head; 3360 struct btrfs_delayed_ref_root *delayed_refs; 3361 int ret = 0; 3362 3363 delayed_refs = &trans->transaction->delayed_refs; 3364 spin_lock(&delayed_refs->lock); 3365 head = btrfs_find_delayed_ref_head(fs_info, delayed_refs, bytenr); 3366 if (!head) 3367 goto out_delayed_unlock; 3368 3369 spin_lock(&head->lock); 3370 if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root)) 3371 goto out; 3372 3373 if (cleanup_extent_op(head) != NULL) 3374 goto out; 3375 3376 /* 3377 * waiting for the lock here would deadlock. If someone else has it 3378 * locked they are already in the process of dropping it anyway 3379 */ 3380 if (!mutex_trylock(&head->mutex)) 3381 goto out; 3382 3383 btrfs_delete_ref_head(fs_info, delayed_refs, head); 3384 head->processing = false; 3385 3386 spin_unlock(&head->lock); 3387 spin_unlock(&delayed_refs->lock); 3388 3389 BUG_ON(head->extent_op); 3390 if (head->must_insert_reserved) 3391 ret = 1; 3392 3393 btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); 3394 mutex_unlock(&head->mutex); 3395 btrfs_put_delayed_ref_head(head); 3396 return ret; 3397 out: 3398 spin_unlock(&head->lock); 3399 3400 out_delayed_unlock: 3401 spin_unlock(&delayed_refs->lock); 3402 return 0; 3403 } 3404 3405 int btrfs_free_tree_block(struct btrfs_trans_handle *trans, 3406 u64 root_id, 3407 struct extent_buffer *buf, 3408 u64 parent, int last_ref) 3409 { 3410 struct btrfs_fs_info *fs_info = trans->fs_info; 3411 struct btrfs_block_group *bg; 3412 int ret; 3413 3414 if (root_id != BTRFS_TREE_LOG_OBJECTID) { 3415 struct btrfs_ref generic_ref = { 3416 .action = BTRFS_DROP_DELAYED_REF, 3417 .bytenr = buf->start, 3418 .num_bytes = buf->len, 3419 .parent = parent, 3420 .owning_root = btrfs_header_owner(buf), 3421 .ref_root = root_id, 3422 }; 3423 3424 /* 3425 * Assert that the extent buffer is not cleared due to 3426 * EXTENT_BUFFER_ZONED_ZEROOUT. Please refer 3427 * btrfs_clear_buffer_dirty() and btree_csum_one_bio() for 3428 * detail. 
3429 */ 3430 ASSERT(btrfs_header_bytenr(buf) != 0); 3431 3432 btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf), 0, false); 3433 btrfs_ref_tree_mod(fs_info, &generic_ref); 3434 ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL); 3435 if (ret < 0) 3436 return ret; 3437 } 3438 3439 if (!last_ref) 3440 return 0; 3441 3442 if (btrfs_header_generation(buf) != trans->transid) 3443 goto out; 3444 3445 if (root_id != BTRFS_TREE_LOG_OBJECTID) { 3446 ret = check_ref_cleanup(trans, buf->start); 3447 if (!ret) 3448 goto out; 3449 } 3450 3451 bg = btrfs_lookup_block_group(fs_info, buf->start); 3452 3453 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { 3454 pin_down_extent(trans, bg, buf->start, buf->len, 1); 3455 btrfs_put_block_group(bg); 3456 goto out; 3457 } 3458 3459 /* 3460 * If there are tree mod log users we may have recorded mod log 3461 * operations for this node. If we re-allocate this node we 3462 * could replay operations on this node that happened when it 3463 * existed in a completely different root. For example if it 3464 * was part of root A, then was reallocated to root B, and we 3465 * are doing a btrfs_old_search_slot(root b), we could replay 3466 * operations that happened when the block was part of root A, 3467 * giving us an inconsistent view of the btree. 3468 * 3469 * We are safe from races here because at this point no other 3470 * node or root points to this extent buffer, so if after this 3471 * check a new tree mod log user joins we will not have an 3472 * existing log of operations on this node that we have to 3473 * contend with. 3474 */ 3475 3476 if (test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags) 3477 || btrfs_is_zoned(fs_info)) { 3478 pin_down_extent(trans, bg, buf->start, buf->len, 1); 3479 btrfs_put_block_group(bg); 3480 goto out; 3481 } 3482 3483 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); 3484 3485 btrfs_add_free_space(bg, buf->start, buf->len); 3486 btrfs_free_reserved_bytes(bg, buf->len, 0); 3487 btrfs_put_block_group(bg); 3488 trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len); 3489 3490 out: 3491 3492 /* 3493 * Deleting the buffer, clear the corrupt flag since it doesn't 3494 * matter anymore. 3495 */ 3496 clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags); 3497 return 0; 3498 } 3499 3500 /* Can return -ENOMEM */ 3501 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) 3502 { 3503 struct btrfs_fs_info *fs_info = trans->fs_info; 3504 int ret; 3505 3506 if (btrfs_is_testing(fs_info)) 3507 return 0; 3508 3509 /* 3510 * tree log blocks never actually go into the extent allocation 3511 * tree, just update pinning info and exit early. 3512 */ 3513 if (ref->ref_root == BTRFS_TREE_LOG_OBJECTID) { 3514 btrfs_pin_extent(trans, ref->bytenr, ref->num_bytes, 1); 3515 ret = 0; 3516 } else if (ref->type == BTRFS_REF_METADATA) { 3517 ret = btrfs_add_delayed_tree_ref(trans, ref, NULL); 3518 } else { 3519 ret = btrfs_add_delayed_data_ref(trans, ref, 0); 3520 } 3521 3522 if (ref->ref_root != BTRFS_TREE_LOG_OBJECTID) 3523 btrfs_ref_tree_mod(fs_info, ref); 3524 3525 return ret; 3526 } 3527 3528 enum btrfs_loop_type { 3529 /* 3530 * Start caching block groups but do not wait for progress or for them 3531 * to be done. 3532 */ 3533 LOOP_CACHING_NOWAIT, 3534 3535 /* 3536 * Wait for the block group free_space >= the space we're waiting for if 3537 * the block group isn't cached. 
3538 */ 3539 LOOP_CACHING_WAIT, 3540 3541 /* 3542 * Allow allocations to happen from block groups that do not yet have a 3543 * size classification. 3544 */ 3545 LOOP_UNSET_SIZE_CLASS, 3546 3547 /* 3548 * Allocate a chunk and then retry the allocation. 3549 */ 3550 LOOP_ALLOC_CHUNK, 3551 3552 /* 3553 * Ignore the size class restrictions for this allocation. 3554 */ 3555 LOOP_WRONG_SIZE_CLASS, 3556 3557 /* 3558 * Ignore the empty size, only try to allocate the number of bytes 3559 * needed for this allocation. 3560 */ 3561 LOOP_NO_EMPTY_SIZE, 3562 }; 3563 3564 static inline void 3565 btrfs_lock_block_group(struct btrfs_block_group *cache, 3566 int delalloc) 3567 { 3568 if (delalloc) 3569 down_read(&cache->data_rwsem); 3570 } 3571 3572 static inline void btrfs_grab_block_group(struct btrfs_block_group *cache, 3573 int delalloc) 3574 { 3575 btrfs_get_block_group(cache); 3576 if (delalloc) 3577 down_read(&cache->data_rwsem); 3578 } 3579 3580 static struct btrfs_block_group *btrfs_lock_cluster( 3581 struct btrfs_block_group *block_group, 3582 struct btrfs_free_cluster *cluster, 3583 int delalloc) 3584 __acquires(&cluster->refill_lock) 3585 { 3586 struct btrfs_block_group *used_bg = NULL; 3587 3588 spin_lock(&cluster->refill_lock); 3589 while (1) { 3590 used_bg = cluster->block_group; 3591 if (!used_bg) 3592 return NULL; 3593 3594 if (used_bg == block_group) 3595 return used_bg; 3596 3597 btrfs_get_block_group(used_bg); 3598 3599 if (!delalloc) 3600 return used_bg; 3601 3602 if (down_read_trylock(&used_bg->data_rwsem)) 3603 return used_bg; 3604 3605 spin_unlock(&cluster->refill_lock); 3606 3607 /* We should only have one-level nested. */ 3608 down_read_nested(&used_bg->data_rwsem, SINGLE_DEPTH_NESTING); 3609 3610 spin_lock(&cluster->refill_lock); 3611 if (used_bg == cluster->block_group) 3612 return used_bg; 3613 3614 up_read(&used_bg->data_rwsem); 3615 btrfs_put_block_group(used_bg); 3616 } 3617 } 3618 3619 static inline void 3620 btrfs_release_block_group(struct btrfs_block_group *cache, 3621 int delalloc) 3622 { 3623 if (delalloc) 3624 up_read(&cache->data_rwsem); 3625 btrfs_put_block_group(cache); 3626 } 3627 3628 /* 3629 * Helper function for find_free_extent(). 3630 * 3631 * Return -ENOENT to inform caller that we need fallback to unclustered mode. 3632 * Return >0 to inform caller that we find nothing 3633 * Return 0 means we have found a location and set ffe_ctl->found_offset. 
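 *
 * do_allocation_clustered() below simply propagates 0 (ffe_ctl->found_offset
 * is set) and > 0 (nothing found, the caller moves on to another block
 * group), and falls back to find_free_extent_unclustered() on -ENOENT.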
3634 */ 3635 static int find_free_extent_clustered(struct btrfs_block_group *bg, 3636 struct find_free_extent_ctl *ffe_ctl, 3637 struct btrfs_block_group **cluster_bg_ret) 3638 { 3639 struct btrfs_block_group *cluster_bg; 3640 struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr; 3641 u64 aligned_cluster; 3642 u64 offset; 3643 int ret; 3644 3645 cluster_bg = btrfs_lock_cluster(bg, last_ptr, ffe_ctl->delalloc); 3646 if (!cluster_bg) 3647 goto refill_cluster; 3648 if (cluster_bg != bg && (cluster_bg->ro || 3649 !block_group_bits(cluster_bg, ffe_ctl->flags))) 3650 goto release_cluster; 3651 3652 offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr, 3653 ffe_ctl->num_bytes, cluster_bg->start, 3654 &ffe_ctl->max_extent_size); 3655 if (offset) { 3656 /* We have a block, we're done */ 3657 spin_unlock(&last_ptr->refill_lock); 3658 trace_btrfs_reserve_extent_cluster(cluster_bg, ffe_ctl); 3659 *cluster_bg_ret = cluster_bg; 3660 ffe_ctl->found_offset = offset; 3661 return 0; 3662 } 3663 WARN_ON(last_ptr->block_group != cluster_bg); 3664 3665 release_cluster: 3666 /* 3667 * If we are on LOOP_NO_EMPTY_SIZE, we can't set up a new clusters, so 3668 * lets just skip it and let the allocator find whatever block it can 3669 * find. If we reach this point, we will have tried the cluster 3670 * allocator plenty of times and not have found anything, so we are 3671 * likely way too fragmented for the clustering stuff to find anything. 3672 * 3673 * However, if the cluster is taken from the current block group, 3674 * release the cluster first, so that we stand a better chance of 3675 * succeeding in the unclustered allocation. 3676 */ 3677 if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE && cluster_bg != bg) { 3678 spin_unlock(&last_ptr->refill_lock); 3679 btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc); 3680 return -ENOENT; 3681 } 3682 3683 /* This cluster didn't work out, free it and start over */ 3684 btrfs_return_cluster_to_free_space(NULL, last_ptr); 3685 3686 if (cluster_bg != bg) 3687 btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc); 3688 3689 refill_cluster: 3690 if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE) { 3691 spin_unlock(&last_ptr->refill_lock); 3692 return -ENOENT; 3693 } 3694 3695 aligned_cluster = max_t(u64, 3696 ffe_ctl->empty_cluster + ffe_ctl->empty_size, 3697 bg->full_stripe_len); 3698 ret = btrfs_find_space_cluster(bg, last_ptr, ffe_ctl->search_start, 3699 ffe_ctl->num_bytes, aligned_cluster); 3700 if (ret == 0) { 3701 /* Now pull our allocation out of this cluster */ 3702 offset = btrfs_alloc_from_cluster(bg, last_ptr, 3703 ffe_ctl->num_bytes, ffe_ctl->search_start, 3704 &ffe_ctl->max_extent_size); 3705 if (offset) { 3706 /* We found one, proceed */ 3707 spin_unlock(&last_ptr->refill_lock); 3708 ffe_ctl->found_offset = offset; 3709 trace_btrfs_reserve_extent_cluster(bg, ffe_ctl); 3710 return 0; 3711 } 3712 } 3713 /* 3714 * At this point we either didn't find a cluster or we weren't able to 3715 * allocate a block from our cluster. Free the cluster we've been 3716 * trying to use, and go to the next block group. 
3717 */ 3718 btrfs_return_cluster_to_free_space(NULL, last_ptr); 3719 spin_unlock(&last_ptr->refill_lock); 3720 return 1; 3721 } 3722 3723 /* 3724 * Return >0 to inform caller that we find nothing 3725 * Return 0 when we found an free extent and set ffe_ctrl->found_offset 3726 */ 3727 static int find_free_extent_unclustered(struct btrfs_block_group *bg, 3728 struct find_free_extent_ctl *ffe_ctl) 3729 { 3730 struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr; 3731 u64 offset; 3732 3733 /* 3734 * We are doing an unclustered allocation, set the fragmented flag so 3735 * we don't bother trying to setup a cluster again until we get more 3736 * space. 3737 */ 3738 if (unlikely(last_ptr)) { 3739 spin_lock(&last_ptr->lock); 3740 last_ptr->fragmented = 1; 3741 spin_unlock(&last_ptr->lock); 3742 } 3743 if (ffe_ctl->cached) { 3744 struct btrfs_free_space_ctl *free_space_ctl; 3745 3746 free_space_ctl = bg->free_space_ctl; 3747 spin_lock(&free_space_ctl->tree_lock); 3748 if (free_space_ctl->free_space < 3749 ffe_ctl->num_bytes + ffe_ctl->empty_cluster + 3750 ffe_ctl->empty_size) { 3751 ffe_ctl->total_free_space = max_t(u64, 3752 ffe_ctl->total_free_space, 3753 free_space_ctl->free_space); 3754 spin_unlock(&free_space_ctl->tree_lock); 3755 return 1; 3756 } 3757 spin_unlock(&free_space_ctl->tree_lock); 3758 } 3759 3760 offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start, 3761 ffe_ctl->num_bytes, ffe_ctl->empty_size, 3762 &ffe_ctl->max_extent_size); 3763 if (!offset) 3764 return 1; 3765 ffe_ctl->found_offset = offset; 3766 return 0; 3767 } 3768 3769 static int do_allocation_clustered(struct btrfs_block_group *block_group, 3770 struct find_free_extent_ctl *ffe_ctl, 3771 struct btrfs_block_group **bg_ret) 3772 { 3773 int ret; 3774 3775 /* We want to try and use the cluster allocator, so lets look there */ 3776 if (ffe_ctl->last_ptr && ffe_ctl->use_cluster) { 3777 ret = find_free_extent_clustered(block_group, ffe_ctl, bg_ret); 3778 if (ret >= 0) 3779 return ret; 3780 /* ret == -ENOENT case falls through */ 3781 } 3782 3783 return find_free_extent_unclustered(block_group, ffe_ctl); 3784 } 3785 3786 /* 3787 * Tree-log block group locking 3788 * ============================ 3789 * 3790 * fs_info::treelog_bg_lock protects the fs_info::treelog_bg which 3791 * indicates the starting address of a block group, which is reserved only 3792 * for tree-log metadata. 3793 * 3794 * Lock nesting 3795 * ============ 3796 * 3797 * space_info::lock 3798 * block_group::lock 3799 * fs_info::treelog_bg_lock 3800 */ 3801 3802 /* 3803 * Simple allocator for sequential-only block group. It only allows sequential 3804 * allocation. No need to play with trees. This function also reserves the 3805 * bytes as in btrfs_add_reserved_bytes. 3806 */ 3807 static int do_allocation_zoned(struct btrfs_block_group *block_group, 3808 struct find_free_extent_ctl *ffe_ctl, 3809 struct btrfs_block_group **bg_ret) 3810 { 3811 struct btrfs_fs_info *fs_info = block_group->fs_info; 3812 struct btrfs_space_info *space_info = block_group->space_info; 3813 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3814 u64 start = block_group->start; 3815 u64 num_bytes = ffe_ctl->num_bytes; 3816 u64 avail; 3817 u64 bytenr = block_group->start; 3818 u64 log_bytenr; 3819 u64 data_reloc_bytenr; 3820 int ret = 0; 3821 bool skip = false; 3822 3823 ASSERT(btrfs_is_zoned(block_group->fs_info)); 3824 3825 /* 3826 * Do not allow non-tree-log blocks in the dedicated tree-log block 3827 * group, and vice versa. 
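 *
 * With a dedicated tree-log block group already set (log_bytenr != 0) the
 * checks below amount to:
 *
 *   for_treelog  && bytenr != log_bytenr  -> skip this block group
 *   !for_treelog && bytenr == log_bytenr  -> skip this block group
 *
 * otherwise fall through and try to allocate from it. If no tree-log block
 * group has been picked yet (log_bytenr == 0), nothing is skipped here.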
3828 */ 3829 spin_lock(&fs_info->treelog_bg_lock); 3830 log_bytenr = fs_info->treelog_bg; 3831 if (log_bytenr && ((ffe_ctl->for_treelog && bytenr != log_bytenr) || 3832 (!ffe_ctl->for_treelog && bytenr == log_bytenr))) 3833 skip = true; 3834 spin_unlock(&fs_info->treelog_bg_lock); 3835 if (skip) 3836 return 1; 3837 3838 /* 3839 * Do not allow non-relocation blocks in the dedicated relocation block 3840 * group, and vice versa. 3841 */ 3842 spin_lock(&fs_info->relocation_bg_lock); 3843 data_reloc_bytenr = fs_info->data_reloc_bg; 3844 if (data_reloc_bytenr && 3845 ((ffe_ctl->for_data_reloc && bytenr != data_reloc_bytenr) || 3846 (!ffe_ctl->for_data_reloc && bytenr == data_reloc_bytenr))) 3847 skip = true; 3848 spin_unlock(&fs_info->relocation_bg_lock); 3849 if (skip) 3850 return 1; 3851 3852 /* Check RO and no space case before trying to activate it */ 3853 spin_lock(&block_group->lock); 3854 if (block_group->ro || btrfs_zoned_bg_is_full(block_group)) { 3855 ret = 1; 3856 /* 3857 * May need to clear fs_info->{treelog,data_reloc}_bg. 3858 * Return the error after taking the locks. 3859 */ 3860 } 3861 spin_unlock(&block_group->lock); 3862 3863 /* Metadata block group is activated at write time. */ 3864 if (!ret && (block_group->flags & BTRFS_BLOCK_GROUP_DATA) && 3865 !btrfs_zone_activate(block_group)) { 3866 ret = 1; 3867 /* 3868 * May need to clear fs_info->{treelog,data_reloc}_bg. 3869 * Return the error after taking the locks. 3870 */ 3871 } 3872 3873 spin_lock(&space_info->lock); 3874 spin_lock(&block_group->lock); 3875 spin_lock(&fs_info->treelog_bg_lock); 3876 spin_lock(&fs_info->relocation_bg_lock); 3877 3878 if (ret) 3879 goto out; 3880 3881 ASSERT(!ffe_ctl->for_treelog || 3882 block_group->start == fs_info->treelog_bg || 3883 fs_info->treelog_bg == 0); 3884 ASSERT(!ffe_ctl->for_data_reloc || 3885 block_group->start == fs_info->data_reloc_bg || 3886 fs_info->data_reloc_bg == 0); 3887 3888 if (block_group->ro || 3889 (!ffe_ctl->for_data_reloc && 3890 test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags))) { 3891 ret = 1; 3892 goto out; 3893 } 3894 3895 /* 3896 * Do not allow currently using block group to be tree-log dedicated 3897 * block group. 3898 */ 3899 if (ffe_ctl->for_treelog && !fs_info->treelog_bg && 3900 (block_group->used || block_group->reserved)) { 3901 ret = 1; 3902 goto out; 3903 } 3904 3905 /* 3906 * Do not allow currently used block group to be the data relocation 3907 * dedicated block group. 3908 */ 3909 if (ffe_ctl->for_data_reloc && !fs_info->data_reloc_bg && 3910 (block_group->used || block_group->reserved)) { 3911 ret = 1; 3912 goto out; 3913 } 3914 3915 WARN_ON_ONCE(block_group->alloc_offset > block_group->zone_capacity); 3916 avail = block_group->zone_capacity - block_group->alloc_offset; 3917 if (avail < num_bytes) { 3918 if (ffe_ctl->max_extent_size < avail) { 3919 /* 3920 * With sequential allocator, free space is always 3921 * contiguous 3922 */ 3923 ffe_ctl->max_extent_size = avail; 3924 ffe_ctl->total_free_space = avail; 3925 } 3926 ret = 1; 3927 goto out; 3928 } 3929 3930 if (ffe_ctl->for_treelog && !fs_info->treelog_bg) 3931 fs_info->treelog_bg = block_group->start; 3932 3933 if (ffe_ctl->for_data_reloc) { 3934 if (!fs_info->data_reloc_bg) 3935 fs_info->data_reloc_bg = block_group->start; 3936 /* 3937 * Do not allow allocations from this block group, unless it is 3938 * for data relocation. Compared to increasing the ->ro, setting 3939 * the ->zoned_data_reloc_ongoing flag still allows nocow 3940 * writers to come in. 
See btrfs_inc_nocow_writers(). 3941 * 3942 * We need to disable an allocation to avoid an allocation of 3943 * regular (non-relocation data) extent. With mix of relocation 3944 * extents and regular extents, we can dispatch WRITE commands 3945 * (for relocation extents) and ZONE APPEND commands (for 3946 * regular extents) at the same time to the same zone, which 3947 * easily break the write pointer. 3948 * 3949 * Also, this flag avoids this block group to be zone finished. 3950 */ 3951 set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags); 3952 } 3953 3954 ffe_ctl->found_offset = start + block_group->alloc_offset; 3955 block_group->alloc_offset += num_bytes; 3956 spin_lock(&ctl->tree_lock); 3957 ctl->free_space -= num_bytes; 3958 spin_unlock(&ctl->tree_lock); 3959 3960 /* 3961 * We do not check if found_offset is aligned to stripesize. The 3962 * address is anyway rewritten when using zone append writing. 3963 */ 3964 3965 ffe_ctl->search_start = ffe_ctl->found_offset; 3966 3967 out: 3968 if (ret && ffe_ctl->for_treelog) 3969 fs_info->treelog_bg = 0; 3970 if (ret && ffe_ctl->for_data_reloc) 3971 fs_info->data_reloc_bg = 0; 3972 spin_unlock(&fs_info->relocation_bg_lock); 3973 spin_unlock(&fs_info->treelog_bg_lock); 3974 spin_unlock(&block_group->lock); 3975 spin_unlock(&space_info->lock); 3976 return ret; 3977 } 3978 3979 static int do_allocation(struct btrfs_block_group *block_group, 3980 struct find_free_extent_ctl *ffe_ctl, 3981 struct btrfs_block_group **bg_ret) 3982 { 3983 switch (ffe_ctl->policy) { 3984 case BTRFS_EXTENT_ALLOC_CLUSTERED: 3985 return do_allocation_clustered(block_group, ffe_ctl, bg_ret); 3986 case BTRFS_EXTENT_ALLOC_ZONED: 3987 return do_allocation_zoned(block_group, ffe_ctl, bg_ret); 3988 default: 3989 BUG(); 3990 } 3991 } 3992 3993 static void release_block_group(struct btrfs_block_group *block_group, 3994 struct find_free_extent_ctl *ffe_ctl, 3995 int delalloc) 3996 { 3997 switch (ffe_ctl->policy) { 3998 case BTRFS_EXTENT_ALLOC_CLUSTERED: 3999 ffe_ctl->retry_uncached = false; 4000 break; 4001 case BTRFS_EXTENT_ALLOC_ZONED: 4002 /* Nothing to do */ 4003 break; 4004 default: 4005 BUG(); 4006 } 4007 4008 BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) != 4009 ffe_ctl->index); 4010 btrfs_release_block_group(block_group, delalloc); 4011 } 4012 4013 static void found_extent_clustered(struct find_free_extent_ctl *ffe_ctl, 4014 struct btrfs_key *ins) 4015 { 4016 struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr; 4017 4018 if (!ffe_ctl->use_cluster && last_ptr) { 4019 spin_lock(&last_ptr->lock); 4020 last_ptr->window_start = ins->objectid; 4021 spin_unlock(&last_ptr->lock); 4022 } 4023 } 4024 4025 static void found_extent(struct find_free_extent_ctl *ffe_ctl, 4026 struct btrfs_key *ins) 4027 { 4028 switch (ffe_ctl->policy) { 4029 case BTRFS_EXTENT_ALLOC_CLUSTERED: 4030 found_extent_clustered(ffe_ctl, ins); 4031 break; 4032 case BTRFS_EXTENT_ALLOC_ZONED: 4033 /* Nothing to do */ 4034 break; 4035 default: 4036 BUG(); 4037 } 4038 } 4039 4040 static int can_allocate_chunk_zoned(struct btrfs_fs_info *fs_info, 4041 struct find_free_extent_ctl *ffe_ctl) 4042 { 4043 /* Block group's activeness is not a requirement for METADATA block groups. */ 4044 if (!(ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA)) 4045 return 0; 4046 4047 /* If we can activate new zone, just allocate a chunk and use it */ 4048 if (btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->flags)) 4049 return 0; 4050 4051 /* 4052 * We already reached the max active zones. 
Try to finish one block
4053  * group to make room for a new block group. This is only possible
4054  * for a data block group because btrfs_zone_finish() may need to wait
4055  * for a running transaction which can cause a deadlock for metadata
4056  * allocation.
4057  */
4058     if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) {
4059         int ret = btrfs_zone_finish_one_bg(fs_info);
4060 
4061         if (ret == 1)
4062             return 0;
4063         else if (ret < 0)
4064             return ret;
4065     }
4066 
4067     /*
4068      * If we have enough free space left in an already active block group
4069      * and we can't activate any other zone now, do not allow allocating a
4070      * new chunk and let find_free_extent() retry with a smaller size.
4071      */
4072     if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size)
4073         return -ENOSPC;
4074 
4075     /*
4076      * Even min_alloc_size is not left in any block groups. Since we cannot
4077      * activate a new block group, allocating it may not help. Let's tell
4078      * the caller to try again and hope it makes progress by writing some
4079      * parts of the region. That is only possible for data block groups,
4080      * where a part of the region can be written.
4081      */
4082     if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA)
4083         return -EAGAIN;
4084 
4085     /*
4086      * We cannot activate a new block group and there is not enough space
4087      * left in any block group. So, allocating a new block group may not
4088      * help. But there is nothing to do anyway, so let's go with it.
4089      */
4090     return 0;
4091 }
4092 
4093 static int can_allocate_chunk(struct btrfs_fs_info *fs_info,
4094                   struct find_free_extent_ctl *ffe_ctl)
4095 {
4096     switch (ffe_ctl->policy) {
4097     case BTRFS_EXTENT_ALLOC_CLUSTERED:
4098         return 0;
4099     case BTRFS_EXTENT_ALLOC_ZONED:
4100         return can_allocate_chunk_zoned(fs_info, ffe_ctl);
4101     default:
4102         BUG();
4103     }
4104 }
4105 
4106 /*
4107  * Return >0 means caller needs to re-search for free extent
4108  * Return 0 means we have the needed free extent.
4109  * Return <0 means we failed to locate any free extent.
4110  */
4111 static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
4112                     struct btrfs_key *ins,
4113                     struct find_free_extent_ctl *ffe_ctl,
4114                     bool full_search)
4115 {
4116     struct btrfs_root *root = fs_info->chunk_root;
4117     int ret;
4118 
4119     if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) &&
4120         ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg)
4121         ffe_ctl->orig_have_caching_bg = true;
4122 
4123     if (ins->objectid) {
4124         found_extent(ffe_ctl, ins);
4125         return 0;
4126     }
4127 
4128     if (ffe_ctl->loop >= LOOP_CACHING_WAIT && ffe_ctl->have_caching_bg)
4129         return 1;
4130 
4131     ffe_ctl->index++;
4132     if (ffe_ctl->index < BTRFS_NR_RAID_TYPES)
4133         return 1;
4134 
4135     /* See the comments for btrfs_loop_type for an explanation of the phases. */
4136     if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) {
4137         ffe_ctl->index = 0;
4138         /*
4139          * We want to skip the LOOP_CACHING_WAIT step if we don't have
4140          * any uncached bgs and we've already done a full search
4141          * through.
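          * In that case the loop counter is bumped twice below, going
          * straight from LOOP_CACHING_NOWAIT past LOOP_CACHING_WAIT.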
4142 */ 4143 if (ffe_ctl->loop == LOOP_CACHING_NOWAIT && 4144 (!ffe_ctl->orig_have_caching_bg && full_search)) 4145 ffe_ctl->loop++; 4146 ffe_ctl->loop++; 4147 4148 if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) { 4149 struct btrfs_trans_handle *trans; 4150 int exist = 0; 4151 4152 /* Check if allocation policy allows to create a new chunk */ 4153 ret = can_allocate_chunk(fs_info, ffe_ctl); 4154 if (ret) 4155 return ret; 4156 4157 trans = current->journal_info; 4158 if (trans) 4159 exist = 1; 4160 else 4161 trans = btrfs_join_transaction(root); 4162 4163 if (IS_ERR(trans)) { 4164 ret = PTR_ERR(trans); 4165 return ret; 4166 } 4167 4168 ret = btrfs_chunk_alloc(trans, ffe_ctl->flags, 4169 CHUNK_ALLOC_FORCE_FOR_EXTENT); 4170 4171 /* Do not bail out on ENOSPC since we can do more. */ 4172 if (ret == -ENOSPC) { 4173 ret = 0; 4174 ffe_ctl->loop++; 4175 } 4176 else if (ret < 0) 4177 btrfs_abort_transaction(trans, ret); 4178 else 4179 ret = 0; 4180 if (!exist) 4181 btrfs_end_transaction(trans); 4182 if (ret) 4183 return ret; 4184 } 4185 4186 if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) { 4187 if (ffe_ctl->policy != BTRFS_EXTENT_ALLOC_CLUSTERED) 4188 return -ENOSPC; 4189 4190 /* 4191 * Don't loop again if we already have no empty_size and 4192 * no empty_cluster. 4193 */ 4194 if (ffe_ctl->empty_size == 0 && 4195 ffe_ctl->empty_cluster == 0) 4196 return -ENOSPC; 4197 ffe_ctl->empty_size = 0; 4198 ffe_ctl->empty_cluster = 0; 4199 } 4200 return 1; 4201 } 4202 return -ENOSPC; 4203 } 4204 4205 static bool find_free_extent_check_size_class(struct find_free_extent_ctl *ffe_ctl, 4206 struct btrfs_block_group *bg) 4207 { 4208 if (ffe_ctl->policy == BTRFS_EXTENT_ALLOC_ZONED) 4209 return true; 4210 if (!btrfs_block_group_should_use_size_class(bg)) 4211 return true; 4212 if (ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS) 4213 return true; 4214 if (ffe_ctl->loop >= LOOP_UNSET_SIZE_CLASS && 4215 bg->size_class == BTRFS_BG_SZ_NONE) 4216 return true; 4217 return ffe_ctl->size_class == bg->size_class; 4218 } 4219 4220 static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info, 4221 struct find_free_extent_ctl *ffe_ctl, 4222 struct btrfs_space_info *space_info, 4223 struct btrfs_key *ins) 4224 { 4225 /* 4226 * If our free space is heavily fragmented we may not be able to make 4227 * big contiguous allocations, so instead of doing the expensive search 4228 * for free space, simply return ENOSPC with our max_extent_size so we 4229 * can go ahead and search for a more manageable chunk. 4230 * 4231 * If our max_extent_size is large enough for our allocation simply 4232 * disable clustering since we will likely not be able to find enough 4233 * space to create a cluster and induce latency trying. 
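      * Note the unlocked check of space_info->max_extent_size below is just
      * an optimistic fast path; the value is re-checked under
      * space_info->lock before it is actually used.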
4234 */ 4235 if (space_info->max_extent_size) { 4236 spin_lock(&space_info->lock); 4237 if (space_info->max_extent_size && 4238 ffe_ctl->num_bytes > space_info->max_extent_size) { 4239 ins->offset = space_info->max_extent_size; 4240 spin_unlock(&space_info->lock); 4241 return -ENOSPC; 4242 } else if (space_info->max_extent_size) { 4243 ffe_ctl->use_cluster = false; 4244 } 4245 spin_unlock(&space_info->lock); 4246 } 4247 4248 ffe_ctl->last_ptr = fetch_cluster_info(fs_info, space_info, 4249 &ffe_ctl->empty_cluster); 4250 if (ffe_ctl->last_ptr) { 4251 struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr; 4252 4253 spin_lock(&last_ptr->lock); 4254 if (last_ptr->block_group) 4255 ffe_ctl->hint_byte = last_ptr->window_start; 4256 if (last_ptr->fragmented) { 4257 /* 4258 * We still set window_start so we can keep track of the 4259 * last place we found an allocation to try and save 4260 * some time. 4261 */ 4262 ffe_ctl->hint_byte = last_ptr->window_start; 4263 ffe_ctl->use_cluster = false; 4264 } 4265 spin_unlock(&last_ptr->lock); 4266 } 4267 4268 return 0; 4269 } 4270 4271 static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info, 4272 struct find_free_extent_ctl *ffe_ctl) 4273 { 4274 if (ffe_ctl->for_treelog) { 4275 spin_lock(&fs_info->treelog_bg_lock); 4276 if (fs_info->treelog_bg) 4277 ffe_ctl->hint_byte = fs_info->treelog_bg; 4278 spin_unlock(&fs_info->treelog_bg_lock); 4279 } else if (ffe_ctl->for_data_reloc) { 4280 spin_lock(&fs_info->relocation_bg_lock); 4281 if (fs_info->data_reloc_bg) 4282 ffe_ctl->hint_byte = fs_info->data_reloc_bg; 4283 spin_unlock(&fs_info->relocation_bg_lock); 4284 } else if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) { 4285 struct btrfs_block_group *block_group; 4286 4287 spin_lock(&fs_info->zone_active_bgs_lock); 4288 list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) { 4289 /* 4290 * No lock is OK here because avail is monotinically 4291 * decreasing, and this is just a hint. 4292 */ 4293 u64 avail = block_group->zone_capacity - block_group->alloc_offset; 4294 4295 if (block_group_bits(block_group, ffe_ctl->flags) && 4296 avail >= ffe_ctl->num_bytes) { 4297 ffe_ctl->hint_byte = block_group->start; 4298 break; 4299 } 4300 } 4301 spin_unlock(&fs_info->zone_active_bgs_lock); 4302 } 4303 4304 return 0; 4305 } 4306 4307 static int prepare_allocation(struct btrfs_fs_info *fs_info, 4308 struct find_free_extent_ctl *ffe_ctl, 4309 struct btrfs_space_info *space_info, 4310 struct btrfs_key *ins) 4311 { 4312 switch (ffe_ctl->policy) { 4313 case BTRFS_EXTENT_ALLOC_CLUSTERED: 4314 return prepare_allocation_clustered(fs_info, ffe_ctl, 4315 space_info, ins); 4316 case BTRFS_EXTENT_ALLOC_ZONED: 4317 return prepare_allocation_zoned(fs_info, ffe_ctl); 4318 default: 4319 BUG(); 4320 } 4321 } 4322 4323 /* 4324 * walks the btree of allocated extents and find a hole of a given size. 4325 * The key ins is changed to record the hole: 4326 * ins->objectid == start position 4327 * ins->flags = BTRFS_EXTENT_ITEM_KEY 4328 * ins->offset == the size of the hole. 4329 * Any available blocks before search_start are skipped. 4330 * 4331 * If there is no suitable free space, we will record the max size of 4332 * the free space extent currently. 
4333 * 4334 * The overall logic and call chain: 4335 * 4336 * find_free_extent() 4337 * |- Iterate through all block groups 4338 * | |- Get a valid block group 4339 * | |- Try to do clustered allocation in that block group 4340 * | |- Try to do unclustered allocation in that block group 4341 * | |- Check if the result is valid 4342 * | | |- If valid, then exit 4343 * | |- Jump to next block group 4344 * | 4345 * |- Push harder to find free extents 4346 * |- If not found, re-iterate all block groups 4347 */ 4348 static noinline int find_free_extent(struct btrfs_root *root, 4349 struct btrfs_key *ins, 4350 struct find_free_extent_ctl *ffe_ctl) 4351 { 4352 struct btrfs_fs_info *fs_info = root->fs_info; 4353 int ret = 0; 4354 int cache_block_group_error = 0; 4355 struct btrfs_block_group *block_group = NULL; 4356 struct btrfs_space_info *space_info; 4357 bool full_search = false; 4358 4359 WARN_ON(ffe_ctl->num_bytes < fs_info->sectorsize); 4360 4361 ffe_ctl->search_start = 0; 4362 /* For clustered allocation */ 4363 ffe_ctl->empty_cluster = 0; 4364 ffe_ctl->last_ptr = NULL; 4365 ffe_ctl->use_cluster = true; 4366 ffe_ctl->have_caching_bg = false; 4367 ffe_ctl->orig_have_caching_bg = false; 4368 ffe_ctl->index = btrfs_bg_flags_to_raid_index(ffe_ctl->flags); 4369 ffe_ctl->loop = 0; 4370 ffe_ctl->retry_uncached = false; 4371 ffe_ctl->cached = 0; 4372 ffe_ctl->max_extent_size = 0; 4373 ffe_ctl->total_free_space = 0; 4374 ffe_ctl->found_offset = 0; 4375 ffe_ctl->policy = BTRFS_EXTENT_ALLOC_CLUSTERED; 4376 ffe_ctl->size_class = btrfs_calc_block_group_size_class(ffe_ctl->num_bytes); 4377 4378 if (btrfs_is_zoned(fs_info)) 4379 ffe_ctl->policy = BTRFS_EXTENT_ALLOC_ZONED; 4380 4381 ins->type = BTRFS_EXTENT_ITEM_KEY; 4382 ins->objectid = 0; 4383 ins->offset = 0; 4384 4385 trace_find_free_extent(root, ffe_ctl); 4386 4387 space_info = btrfs_find_space_info(fs_info, ffe_ctl->flags); 4388 if (!space_info) { 4389 btrfs_err(fs_info, "No space info for %llu", ffe_ctl->flags); 4390 return -ENOSPC; 4391 } 4392 4393 ret = prepare_allocation(fs_info, ffe_ctl, space_info, ins); 4394 if (ret < 0) 4395 return ret; 4396 4397 ffe_ctl->search_start = max(ffe_ctl->search_start, 4398 first_logical_byte(fs_info)); 4399 ffe_ctl->search_start = max(ffe_ctl->search_start, ffe_ctl->hint_byte); 4400 if (ffe_ctl->search_start == ffe_ctl->hint_byte) { 4401 block_group = btrfs_lookup_block_group(fs_info, 4402 ffe_ctl->search_start); 4403 /* 4404 * we don't want to use the block group if it doesn't match our 4405 * allocation bits, or if its not cached. 4406 * 4407 * However if we are re-searching with an ideal block group 4408 * picked out then we don't care that the block group is cached. 
4409 */ 4410 if (block_group && block_group_bits(block_group, ffe_ctl->flags) && 4411 block_group->cached != BTRFS_CACHE_NO) { 4412 down_read(&space_info->groups_sem); 4413 if (list_empty(&block_group->list) || 4414 block_group->ro) { 4415 /* 4416 * someone is removing this block group, 4417 * we can't jump into the have_block_group 4418 * target because our list pointers are not 4419 * valid 4420 */ 4421 btrfs_put_block_group(block_group); 4422 up_read(&space_info->groups_sem); 4423 } else { 4424 ffe_ctl->index = btrfs_bg_flags_to_raid_index( 4425 block_group->flags); 4426 btrfs_lock_block_group(block_group, 4427 ffe_ctl->delalloc); 4428 ffe_ctl->hinted = true; 4429 goto have_block_group; 4430 } 4431 } else if (block_group) { 4432 btrfs_put_block_group(block_group); 4433 } 4434 } 4435 search: 4436 trace_find_free_extent_search_loop(root, ffe_ctl); 4437 ffe_ctl->have_caching_bg = false; 4438 if (ffe_ctl->index == btrfs_bg_flags_to_raid_index(ffe_ctl->flags) || 4439 ffe_ctl->index == 0) 4440 full_search = true; 4441 down_read(&space_info->groups_sem); 4442 list_for_each_entry(block_group, 4443 &space_info->block_groups[ffe_ctl->index], list) { 4444 struct btrfs_block_group *bg_ret; 4445 4446 ffe_ctl->hinted = false; 4447 /* If the block group is read-only, we can skip it entirely. */ 4448 if (unlikely(block_group->ro)) { 4449 if (ffe_ctl->for_treelog) 4450 btrfs_clear_treelog_bg(block_group); 4451 if (ffe_ctl->for_data_reloc) 4452 btrfs_clear_data_reloc_bg(block_group); 4453 continue; 4454 } 4455 4456 btrfs_grab_block_group(block_group, ffe_ctl->delalloc); 4457 ffe_ctl->search_start = block_group->start; 4458 4459 /* 4460 * this can happen if we end up cycling through all the 4461 * raid types, but we want to make sure we only allocate 4462 * for the proper type. 4463 */ 4464 if (!block_group_bits(block_group, ffe_ctl->flags)) { 4465 u64 extra = BTRFS_BLOCK_GROUP_DUP | 4466 BTRFS_BLOCK_GROUP_RAID1_MASK | 4467 BTRFS_BLOCK_GROUP_RAID56_MASK | 4468 BTRFS_BLOCK_GROUP_RAID10; 4469 4470 /* 4471 * if they asked for extra copies and this block group 4472 * doesn't provide them, bail. This does allow us to 4473 * fill raid0 from raid1. 4474 */ 4475 if ((ffe_ctl->flags & extra) && !(block_group->flags & extra)) 4476 goto loop; 4477 4478 /* 4479 * This block group has different flags than we want. 4480 * It's possible that we have MIXED_GROUP flag but no 4481 * block group is mixed. Just skip such block group. 4482 */ 4483 btrfs_release_block_group(block_group, ffe_ctl->delalloc); 4484 continue; 4485 } 4486 4487 have_block_group: 4488 trace_find_free_extent_have_block_group(root, ffe_ctl, block_group); 4489 ffe_ctl->cached = btrfs_block_group_done(block_group); 4490 if (unlikely(!ffe_ctl->cached)) { 4491 ffe_ctl->have_caching_bg = true; 4492 ret = btrfs_cache_block_group(block_group, false); 4493 4494 /* 4495 * If we get ENOMEM here or something else we want to 4496 * try other block groups, because it may not be fatal. 4497 * However if we can't find anything else we need to 4498 * save our return here so that we return the actual 4499 * error that caused problems, not ENOSPC. 
4500 */ 4501 if (ret < 0) { 4502 if (!cache_block_group_error) 4503 cache_block_group_error = ret; 4504 ret = 0; 4505 goto loop; 4506 } 4507 ret = 0; 4508 } 4509 4510 if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) { 4511 if (!cache_block_group_error) 4512 cache_block_group_error = -EIO; 4513 goto loop; 4514 } 4515 4516 if (!find_free_extent_check_size_class(ffe_ctl, block_group)) 4517 goto loop; 4518 4519 bg_ret = NULL; 4520 ret = do_allocation(block_group, ffe_ctl, &bg_ret); 4521 if (ret > 0) 4522 goto loop; 4523 4524 if (bg_ret && bg_ret != block_group) { 4525 btrfs_release_block_group(block_group, ffe_ctl->delalloc); 4526 block_group = bg_ret; 4527 } 4528 4529 /* Checks */ 4530 ffe_ctl->search_start = round_up(ffe_ctl->found_offset, 4531 fs_info->stripesize); 4532 4533 /* move on to the next group */ 4534 if (ffe_ctl->search_start + ffe_ctl->num_bytes > 4535 block_group->start + block_group->length) { 4536 btrfs_add_free_space_unused(block_group, 4537 ffe_ctl->found_offset, 4538 ffe_ctl->num_bytes); 4539 goto loop; 4540 } 4541 4542 if (ffe_ctl->found_offset < ffe_ctl->search_start) 4543 btrfs_add_free_space_unused(block_group, 4544 ffe_ctl->found_offset, 4545 ffe_ctl->search_start - ffe_ctl->found_offset); 4546 4547 ret = btrfs_add_reserved_bytes(block_group, ffe_ctl->ram_bytes, 4548 ffe_ctl->num_bytes, 4549 ffe_ctl->delalloc, 4550 ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS); 4551 if (ret == -EAGAIN) { 4552 btrfs_add_free_space_unused(block_group, 4553 ffe_ctl->found_offset, 4554 ffe_ctl->num_bytes); 4555 goto loop; 4556 } 4557 btrfs_inc_block_group_reservations(block_group); 4558 4559 /* we are all good, lets return */ 4560 ins->objectid = ffe_ctl->search_start; 4561 ins->offset = ffe_ctl->num_bytes; 4562 4563 trace_btrfs_reserve_extent(block_group, ffe_ctl); 4564 btrfs_release_block_group(block_group, ffe_ctl->delalloc); 4565 break; 4566 loop: 4567 if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT && 4568 !ffe_ctl->retry_uncached) { 4569 ffe_ctl->retry_uncached = true; 4570 btrfs_wait_block_group_cache_progress(block_group, 4571 ffe_ctl->num_bytes + 4572 ffe_ctl->empty_cluster + 4573 ffe_ctl->empty_size); 4574 goto have_block_group; 4575 } 4576 release_block_group(block_group, ffe_ctl, ffe_ctl->delalloc); 4577 cond_resched(); 4578 } 4579 up_read(&space_info->groups_sem); 4580 4581 ret = find_free_extent_update_loop(fs_info, ins, ffe_ctl, full_search); 4582 if (ret > 0) 4583 goto search; 4584 4585 if (ret == -ENOSPC && !cache_block_group_error) { 4586 /* 4587 * Use ffe_ctl->total_free_space as fallback if we can't find 4588 * any contiguous hole. 4589 */ 4590 if (!ffe_ctl->max_extent_size) 4591 ffe_ctl->max_extent_size = ffe_ctl->total_free_space; 4592 spin_lock(&space_info->lock); 4593 space_info->max_extent_size = ffe_ctl->max_extent_size; 4594 spin_unlock(&space_info->lock); 4595 ins->offset = ffe_ctl->max_extent_size; 4596 } else if (ret == -ENOSPC) { 4597 ret = cache_block_group_error; 4598 } 4599 return ret; 4600 } 4601 4602 /* 4603 * Entry point to the extent allocator. Tries to find a hole that is at least 4604 * as big as @num_bytes. 4605 * 4606 * @root - The root that will contain this extent 4607 * 4608 * @ram_bytes - The amount of space in ram that @num_bytes take. This 4609 * is used for accounting purposes. This value differs 4610 * from @num_bytes only in the case of compressed extents. 4611 * 4612 * @num_bytes - Number of bytes to allocate on-disk. 
4613 * 4614 * @min_alloc_size - Indicates the minimum amount of space that the 4615 * allocator should try to satisfy. In some cases 4616 * @num_bytes may be larger than what is required and if 4617 * the filesystem is fragmented then allocation fails. 4618 * However, the presence of @min_alloc_size gives a 4619 * chance to try and satisfy the smaller allocation. 4620 * 4621 * @empty_size - A hint that you plan on doing more COW. This is the 4622 * size in bytes the allocator should try to find free 4623 * next to the block it returns. This is just a hint and 4624 * may be ignored by the allocator. 4625 * 4626 * @hint_byte - Hint to the allocator to start searching above the byte 4627 * address passed. It might be ignored. 4628 * 4629 * @ins - This key is modified to record the found hole. It will 4630 * have the following values: 4631 * ins->objectid == start position 4632 * ins->flags = BTRFS_EXTENT_ITEM_KEY 4633 * ins->offset == the size of the hole. 4634 * 4635 * @is_data - Boolean flag indicating whether an extent is 4636 * allocated for data (true) or metadata (false) 4637 * 4638 * @delalloc - Boolean flag indicating whether this allocation is for 4639 * delalloc or not. If 'true' data_rwsem of block groups 4640 * is going to be acquired. 4641 * 4642 * 4643 * Returns 0 when an allocation succeeded or < 0 when an error occurred. In 4644 * case -ENOSPC is returned then @ins->offset will contain the size of the 4645 * largest available hole the allocator managed to find. 4646 */ 4647 int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, 4648 u64 num_bytes, u64 min_alloc_size, 4649 u64 empty_size, u64 hint_byte, 4650 struct btrfs_key *ins, int is_data, int delalloc) 4651 { 4652 struct btrfs_fs_info *fs_info = root->fs_info; 4653 struct find_free_extent_ctl ffe_ctl = {}; 4654 bool final_tried = num_bytes == min_alloc_size; 4655 u64 flags; 4656 int ret; 4657 bool for_treelog = (btrfs_root_id(root) == BTRFS_TREE_LOG_OBJECTID); 4658 bool for_data_reloc = (btrfs_is_data_reloc_root(root) && is_data); 4659 4660 flags = get_alloc_profile_by_root(root, is_data); 4661 again: 4662 WARN_ON(num_bytes < fs_info->sectorsize); 4663 4664 ffe_ctl.ram_bytes = ram_bytes; 4665 ffe_ctl.num_bytes = num_bytes; 4666 ffe_ctl.min_alloc_size = min_alloc_size; 4667 ffe_ctl.empty_size = empty_size; 4668 ffe_ctl.flags = flags; 4669 ffe_ctl.delalloc = delalloc; 4670 ffe_ctl.hint_byte = hint_byte; 4671 ffe_ctl.for_treelog = for_treelog; 4672 ffe_ctl.for_data_reloc = for_data_reloc; 4673 4674 ret = find_free_extent(root, ins, &ffe_ctl); 4675 if (!ret && !is_data) { 4676 btrfs_dec_block_group_reservations(fs_info, ins->objectid); 4677 } else if (ret == -ENOSPC) { 4678 if (!final_tried && ins->offset) { 4679 num_bytes = min(num_bytes >> 1, ins->offset); 4680 num_bytes = round_down(num_bytes, 4681 fs_info->sectorsize); 4682 num_bytes = max(num_bytes, min_alloc_size); 4683 ram_bytes = num_bytes; 4684 if (num_bytes == min_alloc_size) 4685 final_tried = true; 4686 goto again; 4687 } else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { 4688 struct btrfs_space_info *sinfo; 4689 4690 sinfo = btrfs_find_space_info(fs_info, flags); 4691 btrfs_err(fs_info, 4692 "allocation failed flags %llu, wanted %llu tree-log %d, relocation: %d", 4693 flags, num_bytes, for_treelog, for_data_reloc); 4694 if (sinfo) 4695 btrfs_dump_space_info(fs_info, sinfo, 4696 num_bytes, 1); 4697 } 4698 } 4699 4700 return ret; 4701 } 4702 4703 int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, 4704 u64 start, u64 len, int delalloc) 4705 { 4706 
struct btrfs_block_group *cache; 4707 4708 cache = btrfs_lookup_block_group(fs_info, start); 4709 if (!cache) { 4710 btrfs_err(fs_info, "Unable to find block group for %llu", 4711 start); 4712 return -ENOSPC; 4713 } 4714 4715 btrfs_add_free_space(cache, start, len); 4716 btrfs_free_reserved_bytes(cache, len, delalloc); 4717 trace_btrfs_reserved_extent_free(fs_info, start, len); 4718 4719 btrfs_put_block_group(cache); 4720 return 0; 4721 } 4722 4723 int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, 4724 const struct extent_buffer *eb) 4725 { 4726 struct btrfs_block_group *cache; 4727 int ret = 0; 4728 4729 cache = btrfs_lookup_block_group(trans->fs_info, eb->start); 4730 if (!cache) { 4731 btrfs_err(trans->fs_info, "unable to find block group for %llu", 4732 eb->start); 4733 return -ENOSPC; 4734 } 4735 4736 ret = pin_down_extent(trans, cache, eb->start, eb->len, 1); 4737 btrfs_put_block_group(cache); 4738 return ret; 4739 } 4740 4741 static int alloc_reserved_extent(struct btrfs_trans_handle *trans, u64 bytenr, 4742 u64 num_bytes) 4743 { 4744 struct btrfs_fs_info *fs_info = trans->fs_info; 4745 int ret; 4746 4747 ret = remove_from_free_space_tree(trans, bytenr, num_bytes); 4748 if (ret) 4749 return ret; 4750 4751 ret = btrfs_update_block_group(trans, bytenr, num_bytes, true); 4752 if (ret) { 4753 ASSERT(!ret); 4754 btrfs_err(fs_info, "update block group failed for %llu %llu", 4755 bytenr, num_bytes); 4756 return ret; 4757 } 4758 4759 trace_btrfs_reserved_extent_alloc(fs_info, bytenr, num_bytes); 4760 return 0; 4761 } 4762 4763 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 4764 u64 parent, u64 root_objectid, 4765 u64 flags, u64 owner, u64 offset, 4766 struct btrfs_key *ins, int ref_mod, u64 oref_root) 4767 { 4768 struct btrfs_fs_info *fs_info = trans->fs_info; 4769 struct btrfs_root *extent_root; 4770 int ret; 4771 struct btrfs_extent_item *extent_item; 4772 struct btrfs_extent_owner_ref *oref; 4773 struct btrfs_extent_inline_ref *iref; 4774 struct btrfs_path *path; 4775 struct extent_buffer *leaf; 4776 int type; 4777 u32 size; 4778 const bool simple_quota = (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE); 4779 4780 if (parent > 0) 4781 type = BTRFS_SHARED_DATA_REF_KEY; 4782 else 4783 type = BTRFS_EXTENT_DATA_REF_KEY; 4784 4785 size = sizeof(*extent_item); 4786 if (simple_quota) 4787 size += btrfs_extent_inline_ref_size(BTRFS_EXTENT_OWNER_REF_KEY); 4788 size += btrfs_extent_inline_ref_size(type); 4789 4790 path = btrfs_alloc_path(); 4791 if (!path) 4792 return -ENOMEM; 4793 4794 extent_root = btrfs_extent_root(fs_info, ins->objectid); 4795 ret = btrfs_insert_empty_item(trans, extent_root, path, ins, size); 4796 if (ret) { 4797 btrfs_free_path(path); 4798 return ret; 4799 } 4800 4801 leaf = path->nodes[0]; 4802 extent_item = btrfs_item_ptr(leaf, path->slots[0], 4803 struct btrfs_extent_item); 4804 btrfs_set_extent_refs(leaf, extent_item, ref_mod); 4805 btrfs_set_extent_generation(leaf, extent_item, trans->transid); 4806 btrfs_set_extent_flags(leaf, extent_item, 4807 flags | BTRFS_EXTENT_FLAG_DATA); 4808 4809 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); 4810 if (simple_quota) { 4811 btrfs_set_extent_inline_ref_type(leaf, iref, BTRFS_EXTENT_OWNER_REF_KEY); 4812 oref = (struct btrfs_extent_owner_ref *)(&iref->offset); 4813 btrfs_set_extent_owner_ref_root_id(leaf, oref, oref_root); 4814 iref = (struct btrfs_extent_inline_ref *)(oref + 1); 4815 } 4816 btrfs_set_extent_inline_ref_type(leaf, iref, type); 4817 4818 if (parent > 0) { 4819 
struct btrfs_shared_data_ref *ref; 4820 ref = (struct btrfs_shared_data_ref *)(iref + 1); 4821 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 4822 btrfs_set_shared_data_ref_count(leaf, ref, ref_mod); 4823 } else { 4824 struct btrfs_extent_data_ref *ref; 4825 ref = (struct btrfs_extent_data_ref *)(&iref->offset); 4826 btrfs_set_extent_data_ref_root(leaf, ref, root_objectid); 4827 btrfs_set_extent_data_ref_objectid(leaf, ref, owner); 4828 btrfs_set_extent_data_ref_offset(leaf, ref, offset); 4829 btrfs_set_extent_data_ref_count(leaf, ref, ref_mod); 4830 } 4831 4832 btrfs_free_path(path); 4833 4834 return alloc_reserved_extent(trans, ins->objectid, ins->offset); 4835 } 4836 4837 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, 4838 struct btrfs_delayed_ref_node *node, 4839 struct btrfs_delayed_extent_op *extent_op) 4840 { 4841 struct btrfs_fs_info *fs_info = trans->fs_info; 4842 struct btrfs_root *extent_root; 4843 int ret; 4844 struct btrfs_extent_item *extent_item; 4845 struct btrfs_key extent_key; 4846 struct btrfs_tree_block_info *block_info; 4847 struct btrfs_extent_inline_ref *iref; 4848 struct btrfs_path *path; 4849 struct extent_buffer *leaf; 4850 u32 size = sizeof(*extent_item) + sizeof(*iref); 4851 const u64 flags = (extent_op ? extent_op->flags_to_set : 0); 4852 /* The owner of a tree block is the level. */ 4853 int level = btrfs_delayed_ref_owner(node); 4854 bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); 4855 4856 extent_key.objectid = node->bytenr; 4857 if (skinny_metadata) { 4858 /* The owner of a tree block is the level. */ 4859 extent_key.offset = level; 4860 extent_key.type = BTRFS_METADATA_ITEM_KEY; 4861 } else { 4862 extent_key.offset = node->num_bytes; 4863 extent_key.type = BTRFS_EXTENT_ITEM_KEY; 4864 size += sizeof(*block_info); 4865 } 4866 4867 path = btrfs_alloc_path(); 4868 if (!path) 4869 return -ENOMEM; 4870 4871 extent_root = btrfs_extent_root(fs_info, extent_key.objectid); 4872 ret = btrfs_insert_empty_item(trans, extent_root, path, &extent_key, 4873 size); 4874 if (ret) { 4875 btrfs_free_path(path); 4876 return ret; 4877 } 4878 4879 leaf = path->nodes[0]; 4880 extent_item = btrfs_item_ptr(leaf, path->slots[0], 4881 struct btrfs_extent_item); 4882 btrfs_set_extent_refs(leaf, extent_item, 1); 4883 btrfs_set_extent_generation(leaf, extent_item, trans->transid); 4884 btrfs_set_extent_flags(leaf, extent_item, 4885 flags | BTRFS_EXTENT_FLAG_TREE_BLOCK); 4886 4887 if (skinny_metadata) { 4888 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); 4889 } else { 4890 block_info = (struct btrfs_tree_block_info *)(extent_item + 1); 4891 btrfs_set_tree_block_key(leaf, block_info, &extent_op->key); 4892 btrfs_set_tree_block_level(leaf, block_info, level); 4893 iref = (struct btrfs_extent_inline_ref *)(block_info + 1); 4894 } 4895 4896 if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) { 4897 btrfs_set_extent_inline_ref_type(leaf, iref, 4898 BTRFS_SHARED_BLOCK_REF_KEY); 4899 btrfs_set_extent_inline_ref_offset(leaf, iref, node->parent); 4900 } else { 4901 btrfs_set_extent_inline_ref_type(leaf, iref, 4902 BTRFS_TREE_BLOCK_REF_KEY); 4903 btrfs_set_extent_inline_ref_offset(leaf, iref, node->ref_root); 4904 } 4905 4906 btrfs_free_path(path); 4907 4908 return alloc_reserved_extent(trans, node->bytenr, fs_info->nodesize); 4909 } 4910 4911 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 4912 struct btrfs_root *root, u64 owner, 4913 u64 offset, u64 ram_bytes, 4914 struct btrfs_key *ins) 4915 { 4916 struct 
btrfs_ref generic_ref = { 4917 .action = BTRFS_ADD_DELAYED_EXTENT, 4918 .bytenr = ins->objectid, 4919 .num_bytes = ins->offset, 4920 .owning_root = btrfs_root_id(root), 4921 .ref_root = btrfs_root_id(root), 4922 }; 4923 4924 ASSERT(generic_ref.ref_root != BTRFS_TREE_LOG_OBJECTID); 4925 4926 if (btrfs_is_data_reloc_root(root) && is_fstree(root->relocation_src_root)) 4927 generic_ref.owning_root = root->relocation_src_root; 4928 4929 btrfs_init_data_ref(&generic_ref, owner, offset, 0, false); 4930 btrfs_ref_tree_mod(root->fs_info, &generic_ref); 4931 4932 return btrfs_add_delayed_data_ref(trans, &generic_ref, ram_bytes); 4933 } 4934 4935 /* 4936 * this is used by the tree logging recovery code. It records that 4937 * an extent has been allocated and makes sure to clear the free 4938 * space cache bits as well 4939 */ 4940 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, 4941 u64 root_objectid, u64 owner, u64 offset, 4942 struct btrfs_key *ins) 4943 { 4944 struct btrfs_fs_info *fs_info = trans->fs_info; 4945 int ret; 4946 struct btrfs_block_group *block_group; 4947 struct btrfs_space_info *space_info; 4948 struct btrfs_squota_delta delta = { 4949 .root = root_objectid, 4950 .num_bytes = ins->offset, 4951 .generation = trans->transid, 4952 .is_data = true, 4953 .is_inc = true, 4954 }; 4955 4956 /* 4957 * Mixed block groups will exclude before processing the log so we only 4958 * need to do the exclude dance if this fs isn't mixed. 4959 */ 4960 if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) { 4961 ret = __exclude_logged_extent(fs_info, ins->objectid, 4962 ins->offset); 4963 if (ret) 4964 return ret; 4965 } 4966 4967 block_group = btrfs_lookup_block_group(fs_info, ins->objectid); 4968 if (!block_group) 4969 return -EINVAL; 4970 4971 space_info = block_group->space_info; 4972 spin_lock(&space_info->lock); 4973 spin_lock(&block_group->lock); 4974 space_info->bytes_reserved += ins->offset; 4975 block_group->reserved += ins->offset; 4976 spin_unlock(&block_group->lock); 4977 spin_unlock(&space_info->lock); 4978 4979 ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner, 4980 offset, ins, 1, root_objectid); 4981 if (ret) 4982 btrfs_pin_extent(trans, ins->objectid, ins->offset, 1); 4983 ret = btrfs_record_squota_delta(fs_info, &delta); 4984 btrfs_put_block_group(block_group); 4985 return ret; 4986 } 4987 4988 #ifdef CONFIG_BTRFS_DEBUG 4989 /* 4990 * Extra safety check in case the extent tree is corrupted and extent allocator 4991 * chooses to use a tree block which is already used and locked. 
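  * The check is only built in under CONFIG_BTRFS_DEBUG; the stub below
  * always returns false.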
4992 */ 4993 static bool check_eb_lock_owner(const struct extent_buffer *eb) 4994 { 4995 if (eb->lock_owner == current->pid) { 4996 btrfs_err_rl(eb->fs_info, 4997 "tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected", 4998 eb->start, btrfs_header_owner(eb), current->pid); 4999 return true; 5000 } 5001 return false; 5002 } 5003 #else 5004 static bool check_eb_lock_owner(struct extent_buffer *eb) 5005 { 5006 return false; 5007 } 5008 #endif 5009 5010 static struct extent_buffer * 5011 btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, 5012 u64 bytenr, int level, u64 owner, 5013 enum btrfs_lock_nesting nest) 5014 { 5015 struct btrfs_fs_info *fs_info = root->fs_info; 5016 struct extent_buffer *buf; 5017 u64 lockdep_owner = owner; 5018 5019 buf = btrfs_find_create_tree_block(fs_info, bytenr, owner, level); 5020 if (IS_ERR(buf)) 5021 return buf; 5022 5023 if (check_eb_lock_owner(buf)) { 5024 free_extent_buffer(buf); 5025 return ERR_PTR(-EUCLEAN); 5026 } 5027 5028 /* 5029 * The reloc trees are just snapshots, so we need them to appear to be 5030 * just like any other fs tree WRT lockdep. 5031 * 5032 * The exception however is in replace_path() in relocation, where we 5033 * hold the lock on the original fs root and then search for the reloc 5034 * root. At that point we need to make sure any reloc root buffers are 5035 * set to the BTRFS_TREE_RELOC_OBJECTID lockdep class in order to make 5036 * lockdep happy. 5037 */ 5038 if (lockdep_owner == BTRFS_TREE_RELOC_OBJECTID && 5039 !test_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &root->state)) 5040 lockdep_owner = BTRFS_FS_TREE_OBJECTID; 5041 5042 /* btrfs_clear_buffer_dirty() accesses generation field. */ 5043 btrfs_set_header_generation(buf, trans->transid); 5044 5045 /* 5046 * This needs to stay, because we could allocate a freed block from an 5047 * old tree into a new tree, so we need to make sure this new block is 5048 * set to the appropriate level and owner. 5049 */ 5050 btrfs_set_buffer_lockdep_class(lockdep_owner, buf, level); 5051 5052 btrfs_tree_lock_nested(buf, nest); 5053 btrfs_clear_buffer_dirty(trans, buf); 5054 clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); 5055 clear_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &buf->bflags); 5056 5057 set_extent_buffer_uptodate(buf); 5058 5059 memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header)); 5060 btrfs_set_header_level(buf, level); 5061 btrfs_set_header_bytenr(buf, buf->start); 5062 btrfs_set_header_generation(buf, trans->transid); 5063 btrfs_set_header_backref_rev(buf, BTRFS_MIXED_BACKREF_REV); 5064 btrfs_set_header_owner(buf, owner); 5065 write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid); 5066 write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid); 5067 if (btrfs_root_id(root) == BTRFS_TREE_LOG_OBJECTID) { 5068 buf->log_index = root->log_transid % 2; 5069 /* 5070 * we allow two log transactions at a time, use different 5071 * EXTENT bit to differentiate dirty pages. 
5072 */ 5073 if (buf->log_index == 0) 5074 set_extent_bit(&root->dirty_log_pages, buf->start, 5075 buf->start + buf->len - 1, 5076 EXTENT_DIRTY, NULL); 5077 else 5078 set_extent_bit(&root->dirty_log_pages, buf->start, 5079 buf->start + buf->len - 1, 5080 EXTENT_NEW, NULL); 5081 } else { 5082 buf->log_index = -1; 5083 set_extent_bit(&trans->transaction->dirty_pages, buf->start, 5084 buf->start + buf->len - 1, EXTENT_DIRTY, NULL); 5085 } 5086 /* this returns a buffer locked for blocking */ 5087 return buf; 5088 } 5089 5090 /* 5091 * finds a free extent and does all the dirty work required for allocation 5092 * returns the tree buffer or an ERR_PTR on error. 5093 */ 5094 struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, 5095 struct btrfs_root *root, 5096 u64 parent, u64 root_objectid, 5097 const struct btrfs_disk_key *key, 5098 int level, u64 hint, 5099 u64 empty_size, 5100 u64 reloc_src_root, 5101 enum btrfs_lock_nesting nest) 5102 { 5103 struct btrfs_fs_info *fs_info = root->fs_info; 5104 struct btrfs_key ins; 5105 struct btrfs_block_rsv *block_rsv; 5106 struct extent_buffer *buf; 5107 u64 flags = 0; 5108 int ret; 5109 u32 blocksize = fs_info->nodesize; 5110 bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); 5111 u64 owning_root; 5112 5113 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 5114 if (btrfs_is_testing(fs_info)) { 5115 buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr, 5116 level, root_objectid, nest); 5117 if (!IS_ERR(buf)) 5118 root->alloc_bytenr += blocksize; 5119 return buf; 5120 } 5121 #endif 5122 5123 block_rsv = btrfs_use_block_rsv(trans, root, blocksize); 5124 if (IS_ERR(block_rsv)) 5125 return ERR_CAST(block_rsv); 5126 5127 ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize, 5128 empty_size, hint, &ins, 0, 0); 5129 if (ret) 5130 goto out_unuse; 5131 5132 buf = btrfs_init_new_buffer(trans, root, ins.objectid, level, 5133 root_objectid, nest); 5134 if (IS_ERR(buf)) { 5135 ret = PTR_ERR(buf); 5136 goto out_free_reserved; 5137 } 5138 owning_root = btrfs_header_owner(buf); 5139 5140 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { 5141 if (parent == 0) 5142 parent = ins.objectid; 5143 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; 5144 owning_root = reloc_src_root; 5145 } else 5146 BUG_ON(parent > 0); 5147 5148 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { 5149 struct btrfs_delayed_extent_op *extent_op; 5150 struct btrfs_ref generic_ref = { 5151 .action = BTRFS_ADD_DELAYED_EXTENT, 5152 .bytenr = ins.objectid, 5153 .num_bytes = ins.offset, 5154 .parent = parent, 5155 .owning_root = owning_root, 5156 .ref_root = root_objectid, 5157 }; 5158 5159 if (!skinny_metadata || flags != 0) { 5160 extent_op = btrfs_alloc_delayed_extent_op(); 5161 if (!extent_op) { 5162 ret = -ENOMEM; 5163 goto out_free_buf; 5164 } 5165 if (key) 5166 memcpy(&extent_op->key, key, sizeof(extent_op->key)); 5167 else 5168 memset(&extent_op->key, 0, sizeof(extent_op->key)); 5169 extent_op->flags_to_set = flags; 5170 extent_op->update_key = (skinny_metadata ? 
false : true); 5171 extent_op->update_flags = (flags != 0); 5172 } else { 5173 extent_op = NULL; 5174 } 5175 5176 btrfs_init_tree_ref(&generic_ref, level, btrfs_root_id(root), false); 5177 btrfs_ref_tree_mod(fs_info, &generic_ref); 5178 ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, extent_op); 5179 if (ret) { 5180 btrfs_free_delayed_extent_op(extent_op); 5181 goto out_free_buf; 5182 } 5183 } 5184 return buf; 5185 5186 out_free_buf: 5187 btrfs_tree_unlock(buf); 5188 free_extent_buffer(buf); 5189 out_free_reserved: 5190 btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0); 5191 out_unuse: 5192 btrfs_unuse_block_rsv(fs_info, block_rsv, blocksize); 5193 return ERR_PTR(ret); 5194 } 5195 5196 struct walk_control { 5197 u64 refs[BTRFS_MAX_LEVEL]; 5198 u64 flags[BTRFS_MAX_LEVEL]; 5199 struct btrfs_key update_progress; 5200 struct btrfs_key drop_progress; 5201 int drop_level; 5202 int stage; 5203 int level; 5204 int shared_level; 5205 int update_ref; 5206 int keep_locks; 5207 int reada_slot; 5208 int reada_count; 5209 int restarted; 5210 /* Indicate that extent info needs to be looked up when walking the tree. */ 5211 int lookup_info; 5212 }; 5213 5214 /* 5215 * This is our normal stage. We are traversing blocks the current snapshot owns 5216 * and we are dropping any of our references to any children we are able to, and 5217 * then freeing the block once we've processed all of the children. 5218 */ 5219 #define DROP_REFERENCE 1 5220 5221 /* 5222 * We enter this stage when we have to walk into a child block (meaning we can't 5223 * simply drop our reference to it from our current parent node) and there are 5224 * more than one reference on it. If we are the owner of any of the children 5225 * blocks from the current parent node then we have to do the FULL_BACKREF dance 5226 * on them in order to drop our normal ref and add the shared ref. 5227 */ 5228 #define UPDATE_BACKREF 2 5229 5230 /* 5231 * Decide if we need to walk down into this node to adjust the references. 5232 * 5233 * @root: the root we are currently deleting 5234 * @wc: the walk control for this deletion 5235 * @eb: the parent eb that we're currently visiting 5236 * @refs: the number of refs for wc->level - 1 5237 * @flags: the flags for wc->level - 1 5238 * @slot: the slot in the eb that we're currently checking 5239 * 5240 * This is meant to be called when we're evaluating if a node we point to at 5241 * wc->level should be read and walked into, or if we can simply delete our 5242 * reference to it. We return true if we should walk into the node, false if we 5243 * can skip it. 5244 * 5245 * We have assertions in here to make sure this is called correctly. We assume 5246 * that sanity checking on the blocks read to this point has been done, so any 5247 * corrupted file systems must have been caught before calling this function. 5248 */ 5249 static bool visit_node_for_delete(struct btrfs_root *root, struct walk_control *wc, 5250 struct extent_buffer *eb, u64 flags, int slot) 5251 { 5252 struct btrfs_key key; 5253 u64 generation; 5254 int level = wc->level; 5255 5256 ASSERT(level > 0); 5257 ASSERT(wc->refs[level - 1] > 0); 5258 5259 /* 5260 * The update backref stage we only want to skip if we already have 5261 * FULL_BACKREF set, otherwise we need to read. 
5262 */ 5263 if (wc->stage == UPDATE_BACKREF) { 5264 if (level == 1 && flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) 5265 return false; 5266 return true; 5267 } 5268 5269 /* 5270 * We're the last ref on this block, we must walk into it and process 5271 * any refs it's pointing at. 5272 */ 5273 if (wc->refs[level - 1] == 1) 5274 return true; 5275 5276 /* 5277 * If we're already FULL_BACKREF then we know we can just drop our 5278 * current reference. 5279 */ 5280 if (level == 1 && flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) 5281 return false; 5282 5283 /* 5284 * This block is older than our creation generation, we can drop our 5285 * reference to it. 5286 */ 5287 generation = btrfs_node_ptr_generation(eb, slot); 5288 if (!wc->update_ref || generation <= btrfs_root_origin_generation(root)) 5289 return false; 5290 5291 /* 5292 * This block was processed from a previous snapshot deletion run, we 5293 * can skip it. 5294 */ 5295 btrfs_node_key_to_cpu(eb, &key, slot); 5296 if (btrfs_comp_cpu_keys(&key, &wc->update_progress) < 0) 5297 return false; 5298 5299 /* All other cases we need to wander into the node. */ 5300 return true; 5301 } 5302 5303 static noinline void reada_walk_down(struct btrfs_trans_handle *trans, 5304 struct btrfs_root *root, 5305 struct walk_control *wc, 5306 struct btrfs_path *path) 5307 { 5308 struct btrfs_fs_info *fs_info = root->fs_info; 5309 u64 bytenr; 5310 u64 generation; 5311 u64 refs; 5312 u64 flags; 5313 u32 nritems; 5314 struct extent_buffer *eb; 5315 int ret; 5316 int slot; 5317 int nread = 0; 5318 5319 if (path->slots[wc->level] < wc->reada_slot) { 5320 wc->reada_count = wc->reada_count * 2 / 3; 5321 wc->reada_count = max(wc->reada_count, 2); 5322 } else { 5323 wc->reada_count = wc->reada_count * 3 / 2; 5324 wc->reada_count = min_t(int, wc->reada_count, 5325 BTRFS_NODEPTRS_PER_BLOCK(fs_info)); 5326 } 5327 5328 eb = path->nodes[wc->level]; 5329 nritems = btrfs_header_nritems(eb); 5330 5331 for (slot = path->slots[wc->level]; slot < nritems; slot++) { 5332 if (nread >= wc->reada_count) 5333 break; 5334 5335 cond_resched(); 5336 bytenr = btrfs_node_blockptr(eb, slot); 5337 generation = btrfs_node_ptr_generation(eb, slot); 5338 5339 if (slot == path->slots[wc->level]) 5340 goto reada; 5341 5342 if (wc->stage == UPDATE_BACKREF && 5343 generation <= btrfs_root_origin_generation(root)) 5344 continue; 5345 5346 /* We don't lock the tree block, it's OK to be racy here */ 5347 ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, 5348 wc->level - 1, 1, &refs, 5349 &flags, NULL); 5350 /* We don't care about errors in readahead. */ 5351 if (ret < 0) 5352 continue; 5353 5354 /* 5355 * This could be racey, it's conceivable that we raced and end 5356 * up with a bogus refs count, if that's the case just skip, if 5357 * we are actually corrupt we will notice when we look up 5358 * everything again with our locks. 5359 */ 5360 if (refs == 0) 5361 continue; 5362 5363 /* If we don't need to visit this node don't reada. */ 5364 if (!visit_node_for_delete(root, wc, eb, flags, slot)) 5365 continue; 5366 reada: 5367 btrfs_readahead_node_child(eb, slot); 5368 nread++; 5369 } 5370 wc->reada_slot = slot; 5371 } 5372 5373 /* 5374 * helper to process tree block while walking down the tree. 5375 * 5376 * when wc->stage == UPDATE_BACKREF, this function updates 5377 * back refs for pointers in the block. 5378 * 5379 * NOTE: return value 1 means we should stop walking down. 
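  * That happens when the block is still shared (refcount > 1) during
  * DROP_REFERENCE, or when it is not owned by this root during
  * UPDATE_BACKREF.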
5380 */ 5381 static noinline int walk_down_proc(struct btrfs_trans_handle *trans, 5382 struct btrfs_root *root, 5383 struct btrfs_path *path, 5384 struct walk_control *wc) 5385 { 5386 struct btrfs_fs_info *fs_info = root->fs_info; 5387 int level = wc->level; 5388 struct extent_buffer *eb = path->nodes[level]; 5389 u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF; 5390 int ret; 5391 5392 if (wc->stage == UPDATE_BACKREF && btrfs_header_owner(eb) != btrfs_root_id(root)) 5393 return 1; 5394 5395 /* 5396 * when reference count of tree block is 1, it won't increase 5397 * again. once full backref flag is set, we never clear it. 5398 */ 5399 if (wc->lookup_info && 5400 ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) || 5401 (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) { 5402 ASSERT(path->locks[level]); 5403 ret = btrfs_lookup_extent_info(trans, fs_info, 5404 eb->start, level, 1, 5405 &wc->refs[level], 5406 &wc->flags[level], 5407 NULL); 5408 if (ret) 5409 return ret; 5410 if (unlikely(wc->refs[level] == 0)) { 5411 btrfs_err(fs_info, "bytenr %llu has 0 references, expect > 0", 5412 eb->start); 5413 return -EUCLEAN; 5414 } 5415 } 5416 5417 if (wc->stage == DROP_REFERENCE) { 5418 if (wc->refs[level] > 1) 5419 return 1; 5420 5421 if (path->locks[level] && !wc->keep_locks) { 5422 btrfs_tree_unlock_rw(eb, path->locks[level]); 5423 path->locks[level] = 0; 5424 } 5425 return 0; 5426 } 5427 5428 /* wc->stage == UPDATE_BACKREF */ 5429 if (!(wc->flags[level] & flag)) { 5430 ASSERT(path->locks[level]); 5431 ret = btrfs_inc_ref(trans, root, eb, 1); 5432 if (ret) { 5433 btrfs_abort_transaction(trans, ret); 5434 return ret; 5435 } 5436 ret = btrfs_dec_ref(trans, root, eb, 0); 5437 if (ret) { 5438 btrfs_abort_transaction(trans, ret); 5439 return ret; 5440 } 5441 ret = btrfs_set_disk_extent_flags(trans, eb, flag); 5442 if (ret) { 5443 btrfs_abort_transaction(trans, ret); 5444 return ret; 5445 } 5446 wc->flags[level] |= flag; 5447 } 5448 5449 /* 5450 * the block is shared by multiple trees, so it's not good to 5451 * keep the tree lock 5452 */ 5453 if (path->locks[level] && level > 0) { 5454 btrfs_tree_unlock_rw(eb, path->locks[level]); 5455 path->locks[level] = 0; 5456 } 5457 return 0; 5458 } 5459 5460 /* 5461 * This is used to verify a ref exists for this root to deal with a bug where we 5462 * would have a drop_progress key that hadn't been updated properly. 5463 */ 5464 static int check_ref_exists(struct btrfs_trans_handle *trans, 5465 struct btrfs_root *root, u64 bytenr, u64 parent, 5466 int level) 5467 { 5468 struct btrfs_delayed_ref_root *delayed_refs; 5469 struct btrfs_delayed_ref_head *head; 5470 BTRFS_PATH_AUTO_FREE(path); 5471 struct btrfs_extent_inline_ref *iref; 5472 int ret; 5473 bool exists = false; 5474 5475 path = btrfs_alloc_path(); 5476 if (!path) 5477 return -ENOMEM; 5478 again: 5479 ret = lookup_extent_backref(trans, path, &iref, bytenr, 5480 root->fs_info->nodesize, parent, 5481 btrfs_root_id(root), level, 0); 5482 if (ret != -ENOENT) { 5483 /* 5484 * If we get 0 then we found our reference, return 1, else 5485 * return the error if it's not -ENOENT; 5486 */ 5487 return (ret < 0 ) ? ret : 1; 5488 } 5489 5490 /* 5491 * We could have a delayed ref with this reference, so look it up while 5492 * we're holding the path open to make sure we don't race with the 5493 * delayed ref running. 
5494 */ 5495 delayed_refs = &trans->transaction->delayed_refs; 5496 spin_lock(&delayed_refs->lock); 5497 head = btrfs_find_delayed_ref_head(root->fs_info, delayed_refs, bytenr); 5498 if (!head) 5499 goto out; 5500 if (!mutex_trylock(&head->mutex)) { 5501 /* 5502 * We're contended, means that the delayed ref is running, get a 5503 * reference and wait for the ref head to be complete and then 5504 * try again. 5505 */ 5506 refcount_inc(&head->refs); 5507 spin_unlock(&delayed_refs->lock); 5508 5509 btrfs_release_path(path); 5510 5511 mutex_lock(&head->mutex); 5512 mutex_unlock(&head->mutex); 5513 btrfs_put_delayed_ref_head(head); 5514 goto again; 5515 } 5516 5517 exists = btrfs_find_delayed_tree_ref(head, root->root_key.objectid, parent); 5518 mutex_unlock(&head->mutex); 5519 out: 5520 spin_unlock(&delayed_refs->lock); 5521 return exists ? 1 : 0; 5522 } 5523 5524 /* 5525 * We may not have an uptodate block, so if we are going to walk down into this 5526 * block we need to drop the lock, read it off of the disk, re-lock it and 5527 * return to continue dropping the snapshot. 5528 */ 5529 static int check_next_block_uptodate(struct btrfs_trans_handle *trans, 5530 struct btrfs_root *root, 5531 struct btrfs_path *path, 5532 struct walk_control *wc, 5533 struct extent_buffer *next) 5534 { 5535 struct btrfs_tree_parent_check check = { 0 }; 5536 u64 generation; 5537 int level = wc->level; 5538 int ret; 5539 5540 btrfs_assert_tree_write_locked(next); 5541 5542 generation = btrfs_node_ptr_generation(path->nodes[level], path->slots[level]); 5543 5544 if (btrfs_buffer_uptodate(next, generation, 0)) 5545 return 0; 5546 5547 check.level = level - 1; 5548 check.transid = generation; 5549 check.owner_root = btrfs_root_id(root); 5550 check.has_first_key = true; 5551 btrfs_node_key_to_cpu(path->nodes[level], &check.first_key, path->slots[level]); 5552 5553 btrfs_tree_unlock(next); 5554 if (level == 1) 5555 reada_walk_down(trans, root, wc, path); 5556 ret = btrfs_read_extent_buffer(next, &check); 5557 if (ret) { 5558 free_extent_buffer(next); 5559 return ret; 5560 } 5561 btrfs_tree_lock(next); 5562 wc->lookup_info = 1; 5563 return 0; 5564 } 5565 5566 /* 5567 * If we determine that we don't have to visit wc->level - 1 then we need to 5568 * determine if we can drop our reference. 5569 * 5570 * If we are UPDATE_BACKREF then we will not, we need to update our backrefs. 5571 * 5572 * If we are DROP_REFERENCE this will figure out if we need to drop our current 5573 * reference, skipping it if we dropped it from a previous incompleted drop, or 5574 * dropping it if we still have a reference to it. 5575 */ 5576 static int maybe_drop_reference(struct btrfs_trans_handle *trans, struct btrfs_root *root, 5577 struct btrfs_path *path, struct walk_control *wc, 5578 struct extent_buffer *next, u64 owner_root) 5579 { 5580 struct btrfs_ref ref = { 5581 .action = BTRFS_DROP_DELAYED_REF, 5582 .bytenr = next->start, 5583 .num_bytes = root->fs_info->nodesize, 5584 .owning_root = owner_root, 5585 .ref_root = btrfs_root_id(root), 5586 }; 5587 int level = wc->level; 5588 int ret; 5589 5590 /* We are UPDATE_BACKREF, we're not dropping anything. 
*/ 5591 if (wc->stage == UPDATE_BACKREF) 5592 return 0; 5593 5594 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) { 5595 ref.parent = path->nodes[level]->start; 5596 } else { 5597 ASSERT(btrfs_root_id(root) == btrfs_header_owner(path->nodes[level])); 5598 if (btrfs_root_id(root) != btrfs_header_owner(path->nodes[level])) { 5599 btrfs_err(root->fs_info, "mismatched block owner"); 5600 return -EIO; 5601 } 5602 } 5603 5604 /* 5605 * If we had a drop_progress we need to verify the refs are set as 5606 * expected. If we find our ref then we know that from here on out 5607 * everything should be correct, and we can clear the 5608 * ->restarted flag. 5609 */ 5610 if (wc->restarted) { 5611 ret = check_ref_exists(trans, root, next->start, ref.parent, 5612 level - 1); 5613 if (ret <= 0) 5614 return ret; 5615 ret = 0; 5616 wc->restarted = 0; 5617 } 5618 5619 /* 5620 * Reloc tree doesn't contribute to qgroup numbers, and we have already 5621 * accounted them at merge time (replace_path), thus we could skip 5622 * expensive subtree trace here. 5623 */ 5624 if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID && 5625 wc->refs[level - 1] > 1) { 5626 u64 generation = btrfs_node_ptr_generation(path->nodes[level], 5627 path->slots[level]); 5628 5629 ret = btrfs_qgroup_trace_subtree(trans, next, generation, level - 1); 5630 if (ret) { 5631 btrfs_err_rl(root->fs_info, 5632 "error %d accounting shared subtree, quota is out of sync, rescan required", 5633 ret); 5634 } 5635 } 5636 5637 /* 5638 * We need to update the next key in our walk control so we can update 5639 * the drop_progress key accordingly. We don't care if find_next_key 5640 * doesn't find a key because that means we're at the end and are going 5641 * to clean up now. 5642 */ 5643 wc->drop_level = level; 5644 find_next_key(path, level, &wc->drop_progress); 5645 5646 btrfs_init_tree_ref(&ref, level - 1, 0, false); 5647 return btrfs_free_extent(trans, &ref); 5648 } 5649 5650 /* 5651 * helper to process tree block pointer. 5652 * 5653 * when wc->stage == DROP_REFERENCE, this function checks 5654 * the reference count of the block pointed to. if the block 5655 * is shared and we need to update back refs for the subtree 5656 * rooted at the block, this function changes wc->stage to 5657 * UPDATE_BACKREF. if the block is shared and there is no 5658 * need to update back refs, this function drops the reference 5659 * to the block. 5660 * 5661 * NOTE: return value 1 means we should stop walking down.
5662 */ 5663 static noinline int do_walk_down(struct btrfs_trans_handle *trans, 5664 struct btrfs_root *root, 5665 struct btrfs_path *path, 5666 struct walk_control *wc) 5667 { 5668 struct btrfs_fs_info *fs_info = root->fs_info; 5669 u64 bytenr; 5670 u64 generation; 5671 u64 owner_root = 0; 5672 struct extent_buffer *next; 5673 int level = wc->level; 5674 int ret = 0; 5675 5676 generation = btrfs_node_ptr_generation(path->nodes[level], 5677 path->slots[level]); 5678 /* 5679 * if the lower level block was created before the snapshot 5680 * was created, we know there is no need to update back refs 5681 * for the subtree 5682 */ 5683 if (wc->stage == UPDATE_BACKREF && 5684 generation <= btrfs_root_origin_generation(root)) { 5685 wc->lookup_info = 1; 5686 return 1; 5687 } 5688 5689 bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); 5690 5691 next = btrfs_find_create_tree_block(fs_info, bytenr, btrfs_root_id(root), 5692 level - 1); 5693 if (IS_ERR(next)) 5694 return PTR_ERR(next); 5695 5696 btrfs_tree_lock(next); 5697 5698 ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1, 5699 &wc->refs[level - 1], 5700 &wc->flags[level - 1], 5701 &owner_root); 5702 if (ret < 0) 5703 goto out_unlock; 5704 5705 if (unlikely(wc->refs[level - 1] == 0)) { 5706 btrfs_err(fs_info, "bytenr %llu has 0 references, expect > 0", 5707 bytenr); 5708 ret = -EUCLEAN; 5709 goto out_unlock; 5710 } 5711 wc->lookup_info = 0; 5712 5713 /* If we don't have to walk into this node skip it. */ 5714 if (!visit_node_for_delete(root, wc, path->nodes[level], 5715 wc->flags[level - 1], path->slots[level])) 5716 goto skip; 5717 5718 /* 5719 * We have to walk down into this node, and if we're currently at the 5720 * DROP_REFERNCE stage and this block is shared then we need to switch 5721 * to the UPDATE_BACKREF stage in order to convert to FULL_BACKREF. 5722 */ 5723 if (wc->stage == DROP_REFERENCE && wc->refs[level - 1] > 1) { 5724 wc->stage = UPDATE_BACKREF; 5725 wc->shared_level = level - 1; 5726 } 5727 5728 ret = check_next_block_uptodate(trans, root, path, wc, next); 5729 if (ret) 5730 return ret; 5731 5732 level--; 5733 ASSERT(level == btrfs_header_level(next)); 5734 if (level != btrfs_header_level(next)) { 5735 btrfs_err(root->fs_info, "mismatched level"); 5736 ret = -EIO; 5737 goto out_unlock; 5738 } 5739 path->nodes[level] = next; 5740 path->slots[level] = 0; 5741 path->locks[level] = BTRFS_WRITE_LOCK; 5742 wc->level = level; 5743 if (wc->level == 1) 5744 wc->reada_slot = 0; 5745 return 0; 5746 skip: 5747 ret = maybe_drop_reference(trans, root, path, wc, next, owner_root); 5748 if (ret) 5749 goto out_unlock; 5750 wc->refs[level - 1] = 0; 5751 wc->flags[level - 1] = 0; 5752 wc->lookup_info = 1; 5753 ret = 1; 5754 5755 out_unlock: 5756 btrfs_tree_unlock(next); 5757 free_extent_buffer(next); 5758 5759 return ret; 5760 } 5761 5762 /* 5763 * helper to process tree block while walking up the tree. 5764 * 5765 * when wc->stage == DROP_REFERENCE, this function drops 5766 * reference count on the block. 5767 * 5768 * when wc->stage == UPDATE_BACKREF, this function changes 5769 * wc->stage back to DROP_REFERENCE if we changed wc->stage 5770 * to UPDATE_BACKREF previously while processing the block. 5771 * 5772 * NOTE: return value 1 means we should stop walking up. 
5773 */ 5774 static noinline int walk_up_proc(struct btrfs_trans_handle *trans, 5775 struct btrfs_root *root, 5776 struct btrfs_path *path, 5777 struct walk_control *wc) 5778 { 5779 struct btrfs_fs_info *fs_info = root->fs_info; 5780 int ret = 0; 5781 int level = wc->level; 5782 struct extent_buffer *eb = path->nodes[level]; 5783 u64 parent = 0; 5784 5785 if (wc->stage == UPDATE_BACKREF) { 5786 ASSERT(wc->shared_level >= level); 5787 if (level < wc->shared_level) 5788 goto out; 5789 5790 ret = find_next_key(path, level + 1, &wc->update_progress); 5791 if (ret > 0) 5792 wc->update_ref = 0; 5793 5794 wc->stage = DROP_REFERENCE; 5795 wc->shared_level = -1; 5796 path->slots[level] = 0; 5797 5798 /* 5799 * check reference count again if the block isn't locked. 5800 * we should start walking down the tree again if reference 5801 * count is one. 5802 */ 5803 if (!path->locks[level]) { 5804 ASSERT(level > 0); 5805 btrfs_tree_lock(eb); 5806 path->locks[level] = BTRFS_WRITE_LOCK; 5807 5808 ret = btrfs_lookup_extent_info(trans, fs_info, 5809 eb->start, level, 1, 5810 &wc->refs[level], 5811 &wc->flags[level], 5812 NULL); 5813 if (ret < 0) { 5814 btrfs_tree_unlock_rw(eb, path->locks[level]); 5815 path->locks[level] = 0; 5816 return ret; 5817 } 5818 if (unlikely(wc->refs[level] == 0)) { 5819 btrfs_tree_unlock_rw(eb, path->locks[level]); 5820 btrfs_err(fs_info, "bytenr %llu has 0 references, expect > 0", 5821 eb->start); 5822 return -EUCLEAN; 5823 } 5824 if (wc->refs[level] == 1) { 5825 btrfs_tree_unlock_rw(eb, path->locks[level]); 5826 path->locks[level] = 0; 5827 return 1; 5828 } 5829 } 5830 } 5831 5832 /* wc->stage == DROP_REFERENCE */ 5833 ASSERT(path->locks[level] || wc->refs[level] == 1); 5834 5835 if (wc->refs[level] == 1) { 5836 if (level == 0) { 5837 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 5838 ret = btrfs_dec_ref(trans, root, eb, 1); 5839 else 5840 ret = btrfs_dec_ref(trans, root, eb, 0); 5841 if (ret) { 5842 btrfs_abort_transaction(trans, ret); 5843 return ret; 5844 } 5845 if (is_fstree(btrfs_root_id(root))) { 5846 ret = btrfs_qgroup_trace_leaf_items(trans, eb); 5847 if (ret) { 5848 btrfs_err_rl(fs_info, 5849 "error %d accounting leaf items, quota is out of sync, rescan required", 5850 ret); 5851 } 5852 } 5853 } 5854 /* Make block locked assertion in btrfs_clear_buffer_dirty happy. */ 5855 if (!path->locks[level]) { 5856 btrfs_tree_lock(eb); 5857 path->locks[level] = BTRFS_WRITE_LOCK; 5858 } 5859 btrfs_clear_buffer_dirty(trans, eb); 5860 } 5861 5862 if (eb == root->node) { 5863 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 5864 parent = eb->start; 5865 else if (btrfs_root_id(root) != btrfs_header_owner(eb)) 5866 goto owner_mismatch; 5867 } else { 5868 if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 5869 parent = path->nodes[level + 1]->start; 5870 else if (btrfs_root_id(root) != 5871 btrfs_header_owner(path->nodes[level + 1])) 5872 goto owner_mismatch; 5873 } 5874 5875 ret = btrfs_free_tree_block(trans, btrfs_root_id(root), eb, parent, 5876 wc->refs[level] == 1); 5877 if (ret < 0) 5878 btrfs_abort_transaction(trans, ret); 5879 out: 5880 wc->refs[level] = 0; 5881 wc->flags[level] = 0; 5882 return ret; 5883 5884 owner_mismatch: 5885 btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu", 5886 btrfs_header_owner(eb), btrfs_root_id(root)); 5887 return -EUCLEAN; 5888 } 5889 5890 /* 5891 * walk_down_tree consists of two steps. 5892 * 5893 * walk_down_proc(). Look up the reference count and reference of our current 5894 * wc->level. 
At this point path->nodes[wc->level] should be populated and 5895 * uptodate, and in most cases should already be locked. If we are in 5896 * DROP_REFERENCE and our refcount is > 1 then we've entered a shared node and 5897 * we can walk back up the tree. If we are UPDATE_BACKREF we have to set 5898 * FULL_BACKREF on this node if it's not already set, and then do the 5899 * FULL_BACKREF conversion dance, which is to drop the root reference and add 5900 * the shared reference to all of this node's children. 5901 * 5902 * do_walk_down(). This is where we actually start iterating on the children of 5903 * our current path->nodes[wc->level]. For DROP_REFERENCE that means dropping 5904 * our reference to the children that return false from visit_node_for_delete(), 5905 * which has various conditions where we know we can just drop our reference 5906 * without visiting the node. For UPDATE_BACKREF we will skip any children that 5907 * visit_node_for_delete() returns false for, only walking down when necessary. 5908 * The bulk of the work for UPDATE_BACKREF occurs in the walk_up_tree() part of 5909 * snapshot deletion. 5910 */ 5911 static noinline int walk_down_tree(struct btrfs_trans_handle *trans, 5912 struct btrfs_root *root, 5913 struct btrfs_path *path, 5914 struct walk_control *wc) 5915 { 5916 int level = wc->level; 5917 int ret = 0; 5918 5919 wc->lookup_info = 1; 5920 while (level >= 0) { 5921 ret = walk_down_proc(trans, root, path, wc); 5922 if (ret) 5923 break; 5924 5925 if (level == 0) 5926 break; 5927 5928 if (path->slots[level] >= 5929 btrfs_header_nritems(path->nodes[level])) 5930 break; 5931 5932 ret = do_walk_down(trans, root, path, wc); 5933 if (ret > 0) { 5934 path->slots[level]++; 5935 continue; 5936 } else if (ret < 0) 5937 break; 5938 level = wc->level; 5939 } 5940 return (ret == 1) ? 0 : ret; 5941 } 5942 5943 /* 5944 * walk_up_tree() is responsible for making sure we visit every slot on our 5945 * current node, and if we're at the end of that node then we call 5946 * walk_up_proc() on our current node which will do one of a few things based on 5947 * our stage. 5948 * 5949 * UPDATE_BACKREF. If our wc->level is currently less than our wc->shared_level 5950 * then we need to walk back up the tree, and then go back down into the 5951 * other slots via walk_down_tree to update any other children from our original 5952 * wc->shared_level. Once we're at or above our wc->shared_level we can switch 5953 * back to DROP_REFERENCE, look up the current node's refs and flags, and carry on. 5954 * 5955 * DROP_REFERENCE. If our refs == 1 then we're going to free this tree block. 5956 * If we're level 0 then we need to call btrfs_dec_ref() on all of the data extents 5957 * in our current leaf. After that we call btrfs_free_tree_block() on the 5958 * current node and walk up to the next node to walk down the next slot.
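 *
 * (Editor's sketch, not kernel code.)  Stripped of locking, reference counts
 * and the two stages, the walk_down_tree()/walk_up_tree() pair is an iterative
 * post-order walk over an explicit path stack.  With a hypothetical
 * struct toy_node { int nr_children; struct toy_node *children[]; }, callbacks
 * down() ("may I enter this node's children?") and up() ("this node is
 * finished"), and nodes[]/slots[] zero-initialized with room for
 * root_level + 2 entries, the shape is roughly:
 *
 *	int level = root_level;
 *	nodes[level] = root;
 *	slots[level] = 0;
 *
 *	while (1) {
 *		// walk down: descend while the callback lets us
 *		while (level > 0 && !down(nodes[level], level) &&
 *		       slots[level] < nodes[level]->nr_children) {
 *			nodes[level - 1] = nodes[level]->children[slots[level]];
 *			slots[level - 1] = 0;
 *			level--;
 *		}
 *		// whatever stopped the descent, this node is now finished
 *		slots[level] = nodes[level]->nr_children;
 *
 *		// walk up: finish nodes until one still has an unvisited child
 *		while (nodes[level]) {
 *			if (slots[level] + 1 < nodes[level]->nr_children) {
 *				slots[level]++;		// more children: go back down
 *				break;
 *			}
 *			up(nodes[level], level);	// post-order visit
 *			nodes[level] = NULL;
 *			level++;
 *		}
 *		if (!nodes[level])
 *			break;				// popped past the root: done
 *	}
 *
 * walk_down_proc()/do_walk_down() play the role of down(), deciding whether a
 * child needs to be entered at all, and walk_up_proc() plays the role of up(),
 * freeing or converting the block once all of its children have been handled.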
5959 */ 5960 static noinline int walk_up_tree(struct btrfs_trans_handle *trans, 5961 struct btrfs_root *root, 5962 struct btrfs_path *path, 5963 struct walk_control *wc, int max_level) 5964 { 5965 int level = wc->level; 5966 int ret; 5967 5968 path->slots[level] = btrfs_header_nritems(path->nodes[level]); 5969 while (level < max_level && path->nodes[level]) { 5970 wc->level = level; 5971 if (path->slots[level] + 1 < 5972 btrfs_header_nritems(path->nodes[level])) { 5973 path->slots[level]++; 5974 return 0; 5975 } else { 5976 ret = walk_up_proc(trans, root, path, wc); 5977 if (ret > 0) 5978 return 0; 5979 if (ret < 0) 5980 return ret; 5981 5982 if (path->locks[level]) { 5983 btrfs_tree_unlock_rw(path->nodes[level], 5984 path->locks[level]); 5985 path->locks[level] = 0; 5986 } 5987 free_extent_buffer(path->nodes[level]); 5988 path->nodes[level] = NULL; 5989 level++; 5990 } 5991 } 5992 return 1; 5993 } 5994 5995 /* 5996 * drop a subvolume tree. 5997 * 5998 * this function traverses the tree freeing any blocks that are only 5999 * referenced by the tree. 6000 * 6001 * when a shared tree block is found, this function decreases its 6002 * reference count by one. if update_ref is true, this function 6003 * also makes sure backrefs for the shared block and all lower level 6004 * blocks are properly updated. 6005 * 6006 * If called with for_reloc == 0, this function may exit early with -EAGAIN 6007 */ 6008 int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) 6009 { 6010 const bool is_reloc_root = (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID); 6011 struct btrfs_fs_info *fs_info = root->fs_info; 6012 struct btrfs_path *path; 6013 struct btrfs_trans_handle *trans; 6014 struct btrfs_root *tree_root = fs_info->tree_root; 6015 struct btrfs_root_item *root_item = &root->root_item; 6016 struct walk_control *wc; 6017 struct btrfs_key key; 6018 const u64 rootid = btrfs_root_id(root); 6019 int ret = 0; 6020 int level; 6021 bool root_dropped = false; 6022 bool unfinished_drop = false; 6023 6024 btrfs_debug(fs_info, "Drop subvolume %llu", btrfs_root_id(root)); 6025 6026 path = btrfs_alloc_path(); 6027 if (!path) { 6028 ret = -ENOMEM; 6029 goto out; 6030 } 6031 6032 wc = kzalloc(sizeof(*wc), GFP_NOFS); 6033 if (!wc) { 6034 btrfs_free_path(path); 6035 ret = -ENOMEM; 6036 goto out; 6037 } 6038 6039 /* 6040 * Use join to avoid potential EINTR from transaction start. See 6041 * wait_reserve_ticket and the whole reservation callchain. 6042 */ 6043 if (for_reloc) 6044 trans = btrfs_join_transaction(tree_root); 6045 else 6046 trans = btrfs_start_transaction(tree_root, 0); 6047 if (IS_ERR(trans)) { 6048 ret = PTR_ERR(trans); 6049 goto out_free; 6050 } 6051 6052 ret = btrfs_run_delayed_items(trans); 6053 if (ret) 6054 goto out_end_trans; 6055 6056 /* 6057 * This will help us catch people modifying the fs tree while we're 6058 * dropping it. It is unsafe to mess with the fs tree while it's being 6059 * dropped as we unlock the root node and parent nodes as we walk down 6060 * the tree, assuming nothing will change. If something does change 6061 * then we'll have stale information and drop references to blocks we've 6062 * already dropped.
6063 */ 6064 set_bit(BTRFS_ROOT_DELETING, &root->state); 6065 unfinished_drop = test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state); 6066 6067 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { 6068 level = btrfs_header_level(root->node); 6069 path->nodes[level] = btrfs_lock_root_node(root); 6070 path->slots[level] = 0; 6071 path->locks[level] = BTRFS_WRITE_LOCK; 6072 memset(&wc->update_progress, 0, 6073 sizeof(wc->update_progress)); 6074 } else { 6075 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress); 6076 memcpy(&wc->update_progress, &key, 6077 sizeof(wc->update_progress)); 6078 6079 level = btrfs_root_drop_level(root_item); 6080 BUG_ON(level == 0); 6081 path->lowest_level = level; 6082 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 6083 path->lowest_level = 0; 6084 if (ret < 0) 6085 goto out_end_trans; 6086 6087 WARN_ON(ret > 0); 6088 ret = 0; 6089 6090 /* 6091 * unlock our path, this is safe because only this 6092 * function is allowed to delete this snapshot 6093 */ 6094 btrfs_unlock_up_safe(path, 0); 6095 6096 level = btrfs_header_level(root->node); 6097 while (1) { 6098 btrfs_tree_lock(path->nodes[level]); 6099 path->locks[level] = BTRFS_WRITE_LOCK; 6100 6101 /* 6102 * btrfs_lookup_extent_info() returns 0 for success, 6103 * or < 0 for error. 6104 */ 6105 ret = btrfs_lookup_extent_info(trans, fs_info, 6106 path->nodes[level]->start, 6107 level, 1, &wc->refs[level], 6108 &wc->flags[level], NULL); 6109 if (ret < 0) 6110 goto out_end_trans; 6111 6112 BUG_ON(wc->refs[level] == 0); 6113 6114 if (level == btrfs_root_drop_level(root_item)) 6115 break; 6116 6117 btrfs_tree_unlock(path->nodes[level]); 6118 path->locks[level] = 0; 6119 WARN_ON(wc->refs[level] != 1); 6120 level--; 6121 } 6122 } 6123 6124 wc->restarted = test_bit(BTRFS_ROOT_DEAD_TREE, &root->state); 6125 wc->level = level; 6126 wc->shared_level = -1; 6127 wc->stage = DROP_REFERENCE; 6128 wc->update_ref = update_ref; 6129 wc->keep_locks = 0; 6130 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info); 6131 6132 while (1) { 6133 6134 ret = walk_down_tree(trans, root, path, wc); 6135 if (ret < 0) { 6136 btrfs_abort_transaction(trans, ret); 6137 break; 6138 } 6139 6140 ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL); 6141 if (ret < 0) { 6142 btrfs_abort_transaction(trans, ret); 6143 break; 6144 } 6145 6146 if (ret > 0) { 6147 BUG_ON(wc->stage != DROP_REFERENCE); 6148 ret = 0; 6149 break; 6150 } 6151 6152 if (wc->stage == DROP_REFERENCE) { 6153 wc->drop_level = wc->level; 6154 btrfs_node_key_to_cpu(path->nodes[wc->drop_level], 6155 &wc->drop_progress, 6156 path->slots[wc->drop_level]); 6157 } 6158 btrfs_cpu_key_to_disk(&root_item->drop_progress, 6159 &wc->drop_progress); 6160 btrfs_set_root_drop_level(root_item, wc->drop_level); 6161 6162 BUG_ON(wc->level == 0); 6163 if (btrfs_should_end_transaction(trans) || 6164 (!for_reloc && btrfs_need_cleaner_sleep(fs_info))) { 6165 ret = btrfs_update_root(trans, tree_root, 6166 &root->root_key, 6167 root_item); 6168 if (ret) { 6169 btrfs_abort_transaction(trans, ret); 6170 goto out_end_trans; 6171 } 6172 6173 if (!is_reloc_root) 6174 btrfs_set_last_root_drop_gen(fs_info, trans->transid); 6175 6176 btrfs_end_transaction_throttle(trans); 6177 if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) { 6178 btrfs_debug(fs_info, 6179 "drop snapshot early exit"); 6180 ret = -EAGAIN; 6181 goto out_free; 6182 } 6183 6184 /* 6185 * Use join to avoid potential EINTR from transaction 6186 * start. See wait_reserve_ticket and the whole 6187 * reservation callchain. 
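 *
 * (Editor's note, illustrative only.)  Together with the drop_progress /
 * drop_level checkpoint written just above, this is the usual shape of
 * long-running, restartable work driven by a persisted cursor:
 *
 *	while (more_to_drop()) {
 *		advance(&cursor);			// walk_down_tree()/walk_up_tree()
 *		store_cursor_in_root_item(cursor);	// drop_progress + drop_level
 *		if (should_yield())			// transaction full, cleaner wants to sleep
 *			end_transaction_and_reopen();
 *	}
 *
 * The names above are hypothetical; on resume, the saved cursor is what the
 * btrfs_search_slot() call with path->lowest_level earlier in this function
 * uses to re-seek the tree and continue where the previous attempt left off.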
6188 */ 6189 if (for_reloc) 6190 trans = btrfs_join_transaction(tree_root); 6191 else 6192 trans = btrfs_start_transaction(tree_root, 0); 6193 if (IS_ERR(trans)) { 6194 ret = PTR_ERR(trans); 6195 goto out_free; 6196 } 6197 } 6198 } 6199 btrfs_release_path(path); 6200 if (ret) 6201 goto out_end_trans; 6202 6203 ret = btrfs_del_root(trans, &root->root_key); 6204 if (ret) { 6205 btrfs_abort_transaction(trans, ret); 6206 goto out_end_trans; 6207 } 6208 6209 if (!is_reloc_root) { 6210 ret = btrfs_find_root(tree_root, &root->root_key, path, 6211 NULL, NULL); 6212 if (ret < 0) { 6213 btrfs_abort_transaction(trans, ret); 6214 goto out_end_trans; 6215 } else if (ret > 0) { 6216 ret = 0; 6217 /* 6218 * If we fail to delete the orphan item this time 6219 * around, it'll get picked up the next time. 6220 * 6221 * The most common failure here is just -ENOENT. 6222 */ 6223 btrfs_del_orphan_item(trans, tree_root, btrfs_root_id(root)); 6224 } 6225 } 6226 6227 /* 6228 * This subvolume is going to be completely dropped, and won't be 6229 * recorded as dirty roots, thus pertrans meta rsv will not be freed at 6230 * commit transaction time. So free it here manually. 6231 */ 6232 btrfs_qgroup_convert_reserved_meta(root, INT_MAX); 6233 btrfs_qgroup_free_meta_all_pertrans(root); 6234 6235 if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) 6236 btrfs_add_dropped_root(trans, root); 6237 else 6238 btrfs_put_root(root); 6239 root_dropped = true; 6240 out_end_trans: 6241 if (!is_reloc_root) 6242 btrfs_set_last_root_drop_gen(fs_info, trans->transid); 6243 6244 btrfs_end_transaction_throttle(trans); 6245 out_free: 6246 kfree(wc); 6247 btrfs_free_path(path); 6248 out: 6249 if (!ret && root_dropped) { 6250 ret = btrfs_qgroup_cleanup_dropped_subvolume(fs_info, rootid); 6251 if (ret < 0) 6252 btrfs_warn_rl(fs_info, 6253 "failed to cleanup qgroup 0/%llu: %d", 6254 rootid, ret); 6255 ret = 0; 6256 } 6257 /* 6258 * We were an unfinished drop root, check to see if there are any 6259 * pending, and if not clear and wake up any waiters. 6260 */ 6261 if (!ret && unfinished_drop) 6262 btrfs_maybe_wake_unfinished_drop(fs_info); 6263 6264 /* 6265 * So if we need to stop dropping the snapshot for whatever reason we 6266 * need to make sure to add it back to the dead root list so that we 6267 * keep trying to do the work later. This also cleans up roots if we 6268 * don't have it in the radix (like when we recover after a power fail 6269 * or unmount) so we don't leak memory. 6270 */ 6271 if (!for_reloc && !root_dropped) 6272 btrfs_add_dead_root(root); 6273 return ret; 6274 } 6275 6276 /* 6277 * drop subtree rooted at tree block 'node'. 
6278 * 6279 * NOTE: this function will unlock and release tree block 'node' 6280 * only used by relocation code 6281 */ 6282 int btrfs_drop_subtree(struct btrfs_trans_handle *trans, 6283 struct btrfs_root *root, 6284 struct extent_buffer *node, 6285 struct extent_buffer *parent) 6286 { 6287 struct btrfs_fs_info *fs_info = root->fs_info; 6288 BTRFS_PATH_AUTO_FREE(path); 6289 struct walk_control *wc; 6290 int level; 6291 int parent_level; 6292 int ret = 0; 6293 6294 BUG_ON(btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID); 6295 6296 path = btrfs_alloc_path(); 6297 if (!path) 6298 return -ENOMEM; 6299 6300 wc = kzalloc(sizeof(*wc), GFP_NOFS); 6301 if (!wc) 6302 return -ENOMEM; 6303 6304 btrfs_assert_tree_write_locked(parent); 6305 parent_level = btrfs_header_level(parent); 6306 atomic_inc(&parent->refs); 6307 path->nodes[parent_level] = parent; 6308 path->slots[parent_level] = btrfs_header_nritems(parent); 6309 6310 btrfs_assert_tree_write_locked(node); 6311 level = btrfs_header_level(node); 6312 path->nodes[level] = node; 6313 path->slots[level] = 0; 6314 path->locks[level] = BTRFS_WRITE_LOCK; 6315 6316 wc->refs[parent_level] = 1; 6317 wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF; 6318 wc->level = level; 6319 wc->shared_level = -1; 6320 wc->stage = DROP_REFERENCE; 6321 wc->update_ref = 0; 6322 wc->keep_locks = 1; 6323 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info); 6324 6325 while (1) { 6326 ret = walk_down_tree(trans, root, path, wc); 6327 if (ret < 0) 6328 break; 6329 6330 ret = walk_up_tree(trans, root, path, wc, parent_level); 6331 if (ret) { 6332 if (ret > 0) 6333 ret = 0; 6334 break; 6335 } 6336 } 6337 6338 kfree(wc); 6339 return ret; 6340 } 6341 6342 /* 6343 * Unpin the extent range in an error context and don't add the space back. 6344 * Errors are not propagated further. 6345 */ 6346 void btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end) 6347 { 6348 unpin_extent_range(fs_info, start, end, false); 6349 } 6350 6351 /* 6352 * It used to be that old block groups would be left around forever. 6353 * Iterating over them would be enough to trim unused space. Since we 6354 * now automatically remove them, we also need to iterate over unallocated 6355 * space. 6356 * 6357 * We don't want a transaction for this since the discard may take a 6358 * substantial amount of time. We don't require that a transaction be 6359 * running, but we do need to take a running transaction into account 6360 * to ensure that we're not discarding chunks that were released or 6361 * allocated in the current transaction. 6362 * 6363 * Holding the chunks lock will prevent other threads from allocating 6364 * or releasing chunks, but it won't prevent a running transaction 6365 * from committing and releasing the memory that the pending chunks 6366 * list head uses. For that, we need to take a reference to the 6367 * transaction and hold the commit root sem. We only need to hold 6368 * it while performing the free space search since we have already 6369 * held back allocations. 6370 */ 6371 static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) 6372 { 6373 u64 start = BTRFS_DEVICE_RANGE_RESERVED, len = 0, end = 0; 6374 int ret; 6375 6376 *trimmed = 0; 6377 6378 /* Discard not supported = nothing to do. */ 6379 if (!bdev_max_discard_sectors(device->bdev)) 6380 return 0; 6381 6382 /* Not writable = nothing to do. */ 6383 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) 6384 return 0; 6385 6386 /* No free space = nothing to do. 
*/ 6387 if (device->total_bytes <= device->bytes_used) 6388 return 0; 6389 6390 ret = 0; 6391 6392 while (1) { 6393 struct btrfs_fs_info *fs_info = device->fs_info; 6394 u64 bytes; 6395 6396 ret = mutex_lock_interruptible(&fs_info->chunk_mutex); 6397 if (ret) 6398 break; 6399 6400 find_first_clear_extent_bit(&device->alloc_state, start, 6401 &start, &end, 6402 CHUNK_TRIMMED | CHUNK_ALLOCATED); 6403 6404 /* Check if there are any CHUNK_* bits left */ 6405 if (start > device->total_bytes) { 6406 WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); 6407 btrfs_warn_in_rcu(fs_info, 6408 "ignoring attempt to trim beyond device size: offset %llu length %llu device %s device size %llu", 6409 start, end - start + 1, 6410 btrfs_dev_name(device), 6411 device->total_bytes); 6412 mutex_unlock(&fs_info->chunk_mutex); 6413 ret = 0; 6414 break; 6415 } 6416 6417 /* Ensure we skip the reserved space on each device. */ 6418 start = max_t(u64, start, BTRFS_DEVICE_RANGE_RESERVED); 6419 6420 /* 6421 * If find_first_clear_extent_bit find a range that spans the 6422 * end of the device it will set end to -1, in this case it's up 6423 * to the caller to trim the value to the size of the device. 6424 */ 6425 end = min(end, device->total_bytes - 1); 6426 6427 len = end - start + 1; 6428 6429 /* We didn't find any extents */ 6430 if (!len) { 6431 mutex_unlock(&fs_info->chunk_mutex); 6432 ret = 0; 6433 break; 6434 } 6435 6436 ret = btrfs_issue_discard(device->bdev, start, len, 6437 &bytes); 6438 if (!ret) 6439 set_extent_bit(&device->alloc_state, start, 6440 start + bytes - 1, CHUNK_TRIMMED, NULL); 6441 mutex_unlock(&fs_info->chunk_mutex); 6442 6443 if (ret) 6444 break; 6445 6446 start += len; 6447 *trimmed += bytes; 6448 6449 if (btrfs_trim_interrupted()) { 6450 ret = -ERESTARTSYS; 6451 break; 6452 } 6453 6454 cond_resched(); 6455 } 6456 6457 return ret; 6458 } 6459 6460 /* 6461 * Trim the whole filesystem by: 6462 * 1) trimming the free space in each block group 6463 * 2) trimming the unallocated space on each device 6464 * 6465 * This will also continue trimming even if a block group or device encounters 6466 * an error. The return value will be the last error, or 0 if nothing bad 6467 * happens. 6468 */ 6469 int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) 6470 { 6471 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 6472 struct btrfs_block_group *cache = NULL; 6473 struct btrfs_device *device; 6474 u64 group_trimmed; 6475 u64 range_end = U64_MAX; 6476 u64 start; 6477 u64 end; 6478 u64 trimmed = 0; 6479 u64 bg_failed = 0; 6480 u64 dev_failed = 0; 6481 int bg_ret = 0; 6482 int dev_ret = 0; 6483 int ret = 0; 6484 6485 if (range->start == U64_MAX) 6486 return -EINVAL; 6487 6488 /* 6489 * Check range overflow if range->len is set. 6490 * The default range->len is U64_MAX. 
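 *
 * (Editor's example.)  With range->start close to U64_MAX, e.g.
 * range->start == U64_MAX - SZ_4K and range->len == SZ_1M, the sum wraps
 * around; check_add_overflow() detects exactly that, so we return -EINVAL
 * below instead of computing a bogus range_end.  Minimal standalone usage of
 * the helper from <linux/overflow.h>:
 *
 *	u64 end;
 *
 *	if (check_add_overflow(start, len, &end))
 *		return -EINVAL;		// start + len does not fit in u64
 *	// otherwise end == start + len, with no truncation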
6491 */ 6492 if (range->len != U64_MAX && 6493 check_add_overflow(range->start, range->len, &range_end)) 6494 return -EINVAL; 6495 6496 cache = btrfs_lookup_first_block_group(fs_info, range->start); 6497 for (; cache; cache = btrfs_next_block_group(cache)) { 6498 if (cache->start >= range_end) { 6499 btrfs_put_block_group(cache); 6500 break; 6501 } 6502 6503 start = max(range->start, cache->start); 6504 end = min(range_end, cache->start + cache->length); 6505 6506 if (end - start >= range->minlen) { 6507 if (!btrfs_block_group_done(cache)) { 6508 ret = btrfs_cache_block_group(cache, true); 6509 if (ret) { 6510 bg_failed++; 6511 bg_ret = ret; 6512 continue; 6513 } 6514 } 6515 ret = btrfs_trim_block_group(cache, 6516 &group_trimmed, 6517 start, 6518 end, 6519 range->minlen); 6520 6521 trimmed += group_trimmed; 6522 if (ret) { 6523 bg_failed++; 6524 bg_ret = ret; 6525 continue; 6526 } 6527 } 6528 } 6529 6530 if (bg_failed) 6531 btrfs_warn(fs_info, 6532 "failed to trim %llu block group(s), last error %d", 6533 bg_failed, bg_ret); 6534 6535 mutex_lock(&fs_devices->device_list_mutex); 6536 list_for_each_entry(device, &fs_devices->devices, dev_list) { 6537 if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) 6538 continue; 6539 6540 ret = btrfs_trim_free_extents(device, &group_trimmed); 6541 6542 trimmed += group_trimmed; 6543 if (ret) { 6544 dev_failed++; 6545 dev_ret = ret; 6546 break; 6547 } 6548 } 6549 mutex_unlock(&fs_devices->device_list_mutex); 6550 6551 if (dev_failed) 6552 btrfs_warn(fs_info, 6553 "failed to trim %llu device(s), last error %d", 6554 dev_failed, dev_ret); 6555 range->len = trimmed; 6556 if (bg_ret) 6557 return bg_ret; 6558 return dev_ret; 6559 } 6560
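/*
 * Editor's appendix (illustrative, not part of the kernel sources): the
 * per-block-group clamping in btrfs_trim_fs() above is plain interval
 * intersection.  For a requested range [range->start, range_end) and a block
 * group occupying [bg_start, bg_start + bg_length):
 *
 *	u64 start = max(range->start, bg_start);
 *	u64 end = min(range_end, bg_start + bg_length);
 *
 *	if (end > start && end - start >= range->minlen)
 *		// trim [start, end) within this block group
 *
 * For example, a 1 GiB block group starting at 10 GiB combined with a
 * requested range starting at 10.5 GiB is trimmed only over its upper half,
 * i.e. [10.5 GiB, 11 GiB).
 */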