// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2011 Fujitsu.  All rights reserved.
 * Written by Miao Xie <miaox@cn.fujitsu.com>
 */

#include <linux/slab.h>
#include <linux/iversion.h>
#include "ctree.h"
#include "fs.h"
#include "messages.h"
#include "misc.h"
#include "delayed-inode.h"
#include "disk-io.h"
#include "transaction.h"
#include "qgroup.h"
#include "locking.h"
#include "inode-item.h"
#include "space-info.h"
#include "accessors.h"
#include "file-item.h"

#define BTRFS_DELAYED_WRITEBACK		512
#define BTRFS_DELAYED_BACKGROUND	128
#define BTRFS_DELAYED_BATCH		16

static struct kmem_cache *delayed_node_cache;

int __init btrfs_delayed_inode_init(void)
{
	delayed_node_cache = KMEM_CACHE(btrfs_delayed_node, 0);
	if (!delayed_node_cache)
		return -ENOMEM;
	return 0;
}

void __cold btrfs_delayed_inode_exit(void)
{
	kmem_cache_destroy(delayed_node_cache);
}

void btrfs_init_delayed_root(struct btrfs_delayed_root *delayed_root)
{
	atomic_set(&delayed_root->items, 0);
	atomic_set(&delayed_root->items_seq, 0);
	delayed_root->nodes = 0;
	spin_lock_init(&delayed_root->lock);
	init_waitqueue_head(&delayed_root->wait);
	INIT_LIST_HEAD(&delayed_root->node_list);
	INIT_LIST_HEAD(&delayed_root->prepare_list);
}

static inline void btrfs_init_delayed_node(
				struct btrfs_delayed_node *delayed_node,
				struct btrfs_root *root, u64 inode_id)
{
	delayed_node->root = root;
	delayed_node->inode_id = inode_id;
	refcount_set(&delayed_node->refs, 0);
	delayed_node->ins_root = RB_ROOT_CACHED;
	delayed_node->del_root = RB_ROOT_CACHED;
	mutex_init(&delayed_node->mutex);
	INIT_LIST_HEAD(&delayed_node->n_list);
	INIT_LIST_HEAD(&delayed_node->p_list);
}

static struct btrfs_delayed_node *btrfs_get_delayed_node(
		struct btrfs_inode *btrfs_inode)
{
	struct btrfs_root *root = btrfs_inode->root;
	u64 ino = btrfs_ino(btrfs_inode);
	struct btrfs_delayed_node *node;

	node = READ_ONCE(btrfs_inode->delayed_node);
	if (node) {
		refcount_inc(&node->refs);
		return node;
	}

	xa_lock(&root->delayed_nodes);
	node = xa_load(&root->delayed_nodes, ino);

	if (node) {
		if (btrfs_inode->delayed_node) {
			refcount_inc(&node->refs);	/* can be accessed */
			BUG_ON(btrfs_inode->delayed_node != node);
			xa_unlock(&root->delayed_nodes);
			return node;
		}

		/*
		 * It's possible that we're racing into the middle of removing
		 * this node from the xarray. In this case, the refcount
		 * was zero and it should never go back to one. Just return
		 * NULL like it was never in the xarray at all; our release
		 * function is in the process of removing it.
		 *
		 * Some implementations of refcount_inc refuse to bump the
		 * refcount once it has hit zero. If we don't do this dance
		 * here, refcount_inc() may decide to just WARN_ONCE() instead
		 * of actually bumping the refcount.
		 *
		 * If this node is properly in the xarray, we want to bump the
		 * refcount twice, once for the inode and once for this get
		 * operation.
		 */
		if (refcount_inc_not_zero(&node->refs)) {
			refcount_inc(&node->refs);
			btrfs_inode->delayed_node = node;
		} else {
			node = NULL;
		}

		xa_unlock(&root->delayed_nodes);
		return node;
	}
	xa_unlock(&root->delayed_nodes);

	return NULL;
}

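/*
 * Reference counting rules for delayed nodes, as implemented above and in
 * __btrfs_release_delayed_node(): the inode's cached delayed_node pointer
 * holds one reference, and every successful btrfs_get_delayed_node() or
 * btrfs_get_or_create_delayed_node() call returns with an extra reference
 * that the caller must drop with btrfs_release_delayed_node(). The xarray
 * itself holds no reference, which is why the lookup above uses
 * refcount_inc_not_zero() to back off when it races with a release that is
 * about to erase the node.
 */
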
/* Will return either the node or PTR_ERR(-ENOMEM) */
static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node(
		struct btrfs_inode *btrfs_inode)
{
	struct btrfs_delayed_node *node;
	struct btrfs_root *root = btrfs_inode->root;
	u64 ino = btrfs_ino(btrfs_inode);
	int ret;
	void *ptr;

again:
	node = btrfs_get_delayed_node(btrfs_inode);
	if (node)
		return node;

	node = kmem_cache_zalloc(delayed_node_cache, GFP_NOFS);
	if (!node)
		return ERR_PTR(-ENOMEM);
	btrfs_init_delayed_node(node, root, ino);

	/* Cached in the inode and can be accessed. */
	refcount_set(&node->refs, 2);

	/* Allocate and reserve the slot, from now it can return a NULL from xa_load(). */
	ret = xa_reserve(&root->delayed_nodes, ino, GFP_NOFS);
	if (ret == -ENOMEM) {
		kmem_cache_free(delayed_node_cache, node);
		return ERR_PTR(-ENOMEM);
	}
	xa_lock(&root->delayed_nodes);
	ptr = xa_load(&root->delayed_nodes, ino);
	if (ptr) {
		/* Somebody inserted it, go back and read it. */
		xa_unlock(&root->delayed_nodes);
		kmem_cache_free(delayed_node_cache, node);
		node = NULL;
		goto again;
	}
	ptr = __xa_store(&root->delayed_nodes, ino, node, GFP_ATOMIC);
	ASSERT(xa_err(ptr) != -EINVAL);
	ASSERT(xa_err(ptr) != -ENOMEM);
	ASSERT(ptr == NULL);
	btrfs_inode->delayed_node = node;
	xa_unlock(&root->delayed_nodes);

	return node;
}

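/*
 * Note on the insertion protocol used above: the slot is reserved with
 * xa_reserve() outside the xarray lock (it may sleep with GFP_NOFS), and only
 * then, under xa_lock(), do we recheck for a concurrent insertion and publish
 * the new node with __xa_store(). Storing into a reserved slot does not need
 * to allocate, so GFP_ATOMIC is safe there and the asserts only guard against
 * programming errors.
 */
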
/*
 * Call this while holding delayed_node->mutex.
 *
 * If mod = 1, add this node into the prepared list.
 */
static void btrfs_queue_delayed_node(struct btrfs_delayed_root *root,
				     struct btrfs_delayed_node *node,
				     int mod)
{
	spin_lock(&root->lock);
	if (test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) {
		if (!list_empty(&node->p_list))
			list_move_tail(&node->p_list, &root->prepare_list);
		else if (mod)
			list_add_tail(&node->p_list, &root->prepare_list);
	} else {
		list_add_tail(&node->n_list, &root->node_list);
		list_add_tail(&node->p_list, &root->prepare_list);
		refcount_inc(&node->refs);	/* inserted into list */
		root->nodes++;
		set_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags);
	}
	spin_unlock(&root->lock);
}

/* Call this while holding delayed_node->mutex. */
static void btrfs_dequeue_delayed_node(struct btrfs_delayed_root *root,
				       struct btrfs_delayed_node *node)
{
	spin_lock(&root->lock);
	if (test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) {
		root->nodes--;
		refcount_dec(&node->refs);	/* not in the list */
		list_del_init(&node->n_list);
		if (!list_empty(&node->p_list))
			list_del_init(&node->p_list);
		clear_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags);
	}
	spin_unlock(&root->lock);
}

static struct btrfs_delayed_node *btrfs_first_delayed_node(
			struct btrfs_delayed_root *delayed_root)
{
	struct list_head *p;
	struct btrfs_delayed_node *node = NULL;

	spin_lock(&delayed_root->lock);
	if (list_empty(&delayed_root->node_list))
		goto out;

	p = delayed_root->node_list.next;
	node = list_entry(p, struct btrfs_delayed_node, n_list);
	refcount_inc(&node->refs);
out:
	spin_unlock(&delayed_root->lock);

	return node;
}

static struct btrfs_delayed_node *btrfs_next_delayed_node(
						struct btrfs_delayed_node *node)
{
	struct btrfs_delayed_root *delayed_root;
	struct list_head *p;
	struct btrfs_delayed_node *next = NULL;

	delayed_root = node->root->fs_info->delayed_root;
	spin_lock(&delayed_root->lock);
	if (!test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) {
		/* not in the list */
		if (list_empty(&delayed_root->node_list))
			goto out;
		p = delayed_root->node_list.next;
	} else if (list_is_last(&node->n_list, &delayed_root->node_list))
		goto out;
	else
		p = node->n_list.next;

	next = list_entry(p, struct btrfs_delayed_node, n_list);
	refcount_inc(&next->refs);
out:
	spin_unlock(&delayed_root->lock);

	return next;
}

static void __btrfs_release_delayed_node(
				struct btrfs_delayed_node *delayed_node,
				int mod)
{
	struct btrfs_delayed_root *delayed_root;

	if (!delayed_node)
		return;

	delayed_root = delayed_node->root->fs_info->delayed_root;

	mutex_lock(&delayed_node->mutex);
	if (delayed_node->count)
		btrfs_queue_delayed_node(delayed_root, delayed_node, mod);
	else
		btrfs_dequeue_delayed_node(delayed_root, delayed_node);
	mutex_unlock(&delayed_node->mutex);

	if (refcount_dec_and_test(&delayed_node->refs)) {
		struct btrfs_root *root = delayed_node->root;

		xa_erase(&root->delayed_nodes, delayed_node->inode_id);
		/*
		 * Once our refcount goes to zero, nobody is allowed to bump it
		 * back up. We can delete it now.
		 */
		ASSERT(refcount_read(&delayed_node->refs) == 0);
		kmem_cache_free(delayed_node_cache, delayed_node);
	}
}

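/*
 * Dropping a reference (__btrfs_release_delayed_node() above): while the node
 * still has pending work (count != 0) it is kept or requeued on the delayed
 * root so that background work or the next transaction commit can flush it;
 * once count reaches zero it is removed from the node/prepare lists. Only
 * when the last reference goes away is the node erased from the per-root
 * xarray and freed.
 */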
static inline void btrfs_release_delayed_node(struct btrfs_delayed_node *node)
{
	__btrfs_release_delayed_node(node, 0);
}

static struct btrfs_delayed_node *btrfs_first_prepared_delayed_node(
					struct btrfs_delayed_root *delayed_root)
{
	struct list_head *p;
	struct btrfs_delayed_node *node = NULL;

	spin_lock(&delayed_root->lock);
	if (list_empty(&delayed_root->prepare_list))
		goto out;

	p = delayed_root->prepare_list.next;
	list_del_init(p);
	node = list_entry(p, struct btrfs_delayed_node, p_list);
	refcount_inc(&node->refs);
out:
	spin_unlock(&delayed_root->lock);

	return node;
}

static inline void btrfs_release_prepared_delayed_node(
					struct btrfs_delayed_node *node)
{
	__btrfs_release_delayed_node(node, 1);
}

static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u16 data_len,
					struct btrfs_delayed_node *node,
					enum btrfs_delayed_item_type type)
{
	struct btrfs_delayed_item *item;

	item = kmalloc(struct_size(item, data, data_len), GFP_NOFS);
	if (item) {
		item->data_len = data_len;
		item->type = type;
		item->bytes_reserved = 0;
		item->delayed_node = node;
		RB_CLEAR_NODE(&item->rb_node);
		INIT_LIST_HEAD(&item->log_list);
		item->logged = false;
		refcount_set(&item->refs, 1);
	}
	return item;
}

/*
 * Look up a delayed item by its dir index.
 *
 * @root:   the rbtree to search (a delayed node's ins_root or del_root)
 * @index:  the dir index value to look up (offset of a dir index key)
 *
 * Returns the matching delayed item, or NULL if no item with that index
 * exists in the tree.
 */
static struct btrfs_delayed_item *__btrfs_lookup_delayed_item(
				struct rb_root *root,
				u64 index)
{
	struct rb_node *node = root->rb_node;
	struct btrfs_delayed_item *delayed_item = NULL;

	while (node) {
		delayed_item = rb_entry(node, struct btrfs_delayed_item,
					rb_node);
		if (delayed_item->index < index)
			node = node->rb_right;
		else if (delayed_item->index > index)
			node = node->rb_left;
		else
			return delayed_item;
	}

	return NULL;
}

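/*
 * Each delayed node keeps two rbtrees of delayed items, both ordered by dir
 * index: ins_root for dir index keys that still have to be inserted into the
 * subvolume tree, and del_root for dir index keys that have to be deleted
 * from it. Insertions also maintain index_cnt as one past the largest queued
 * index, which is what btrfs_inode_delayed_dir_index_count() later reports
 * back to the directory inode.
 */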
static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node,
				    struct btrfs_delayed_item *ins)
{
	struct rb_node **p, *node;
	struct rb_node *parent_node = NULL;
	struct rb_root_cached *root;
	struct btrfs_delayed_item *item;
	bool leftmost = true;

	if (ins->type == BTRFS_DELAYED_INSERTION_ITEM)
		root = &delayed_node->ins_root;
	else
		root = &delayed_node->del_root;

	p = &root->rb_root.rb_node;
	node = &ins->rb_node;

	while (*p) {
		parent_node = *p;
		item = rb_entry(parent_node, struct btrfs_delayed_item,
				rb_node);

		if (item->index < ins->index) {
			p = &(*p)->rb_right;
			leftmost = false;
		} else if (item->index > ins->index) {
			p = &(*p)->rb_left;
		} else {
			return -EEXIST;
		}
	}

	rb_link_node(node, parent_node, p);
	rb_insert_color_cached(node, root, leftmost);

	if (ins->type == BTRFS_DELAYED_INSERTION_ITEM &&
	    ins->index >= delayed_node->index_cnt)
		delayed_node->index_cnt = ins->index + 1;

	delayed_node->count++;
	atomic_inc(&delayed_node->root->fs_info->delayed_root->items);
	return 0;
}

static void finish_one_item(struct btrfs_delayed_root *delayed_root)
{
	int seq = atomic_inc_return(&delayed_root->items_seq);

	/* atomic_dec_return implies a barrier */
	if ((atomic_dec_return(&delayed_root->items) <
	    BTRFS_DELAYED_BACKGROUND || seq % BTRFS_DELAYED_BATCH == 0))
		cond_wake_up_nomb(&delayed_root->wait);
}

static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item)
{
	struct btrfs_delayed_node *delayed_node = delayed_item->delayed_node;
	struct rb_root_cached *root;
	struct btrfs_delayed_root *delayed_root;

	/* Not inserted, ignore it. */
	if (RB_EMPTY_NODE(&delayed_item->rb_node))
		return;

	/* If it's in an rbtree, then we need to have the delayed node locked. */
	lockdep_assert_held(&delayed_node->mutex);

	delayed_root = delayed_node->root->fs_info->delayed_root;

	if (delayed_item->type == BTRFS_DELAYED_INSERTION_ITEM)
		root = &delayed_node->ins_root;
	else
		root = &delayed_node->del_root;

	rb_erase_cached(&delayed_item->rb_node, root);
	RB_CLEAR_NODE(&delayed_item->rb_node);
	delayed_node->count--;

	finish_one_item(delayed_root);
}

static void btrfs_release_delayed_item(struct btrfs_delayed_item *item)
{
	if (item) {
		__btrfs_remove_delayed_item(item);
		if (refcount_dec_and_test(&item->refs))
			kfree(item);
	}
}

static struct btrfs_delayed_item *__btrfs_first_delayed_insertion_item(
					struct btrfs_delayed_node *delayed_node)
{
	struct rb_node *p;
	struct btrfs_delayed_item *item = NULL;

	p = rb_first_cached(&delayed_node->ins_root);
	if (p)
		item = rb_entry(p, struct btrfs_delayed_item, rb_node);

	return item;
}

static struct btrfs_delayed_item *__btrfs_first_delayed_deletion_item(
					struct btrfs_delayed_node *delayed_node)
{
	struct rb_node *p;
	struct btrfs_delayed_item *item = NULL;

	p = rb_first_cached(&delayed_node->del_root);
	if (p)
		item = rb_entry(p, struct btrfs_delayed_item, rb_node);

	return item;
}

static struct btrfs_delayed_item *__btrfs_next_delayed_item(
						struct btrfs_delayed_item *item)
{
	struct rb_node *p;
	struct btrfs_delayed_item *next = NULL;

	p = rb_next(&item->rb_node);
	if (p)
		next = rb_entry(p, struct btrfs_delayed_item, rb_node);

	return next;
}

static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
					       struct btrfs_delayed_item *item)
{
	struct btrfs_block_rsv *src_rsv;
	struct btrfs_block_rsv *dst_rsv;
	struct btrfs_fs_info *fs_info = trans->fs_info;
	u64 num_bytes;
	int ret;

	if (!trans->bytes_reserved)
		return 0;

	src_rsv = trans->block_rsv;
	dst_rsv = &fs_info->delayed_block_rsv;

	num_bytes = btrfs_calc_insert_metadata_size(fs_info, 1);

	/*
	 * Here we migrate space rsv from the transaction rsv, since we have
	 * already reserved space when starting a transaction. So no need to
	 * reserve qgroup space here.
	 */
	ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, true);
	if (!ret) {
		trace_btrfs_space_reservation(fs_info, "delayed_item",
					      item->delayed_node->inode_id,
					      num_bytes, 1);
		/*
		 * For insertions we track reserved metadata space by accounting
		 * for the number of leaves that will be used, based on the delayed
		 * node's curr_index_batch_size and index_item_leaves fields.
		 */
		if (item->type == BTRFS_DELAYED_DELETION_ITEM)
			item->bytes_reserved = num_bytes;
	}

	return ret;
}

static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
						struct btrfs_delayed_item *item)
{
	struct btrfs_block_rsv *rsv;
	struct btrfs_fs_info *fs_info = root->fs_info;

	if (!item->bytes_reserved)
		return;

	rsv = &fs_info->delayed_block_rsv;
	/*
	 * Check btrfs_delayed_item_reserve_metadata() to see why we don't need
	 * to release/reserve qgroup space.
	 */
	trace_btrfs_space_reservation(fs_info, "delayed_item",
				      item->delayed_node->inode_id,
				      item->bytes_reserved, 0);
	btrfs_block_rsv_release(fs_info, rsv, item->bytes_reserved, NULL);
}

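/*
 * Metadata reservation accounting for delayed items: deletion items migrate
 * one insert unit from the transaction's reserve into delayed_block_rsv and
 * remember it in item->bytes_reserved. Insertion items never set
 * bytes_reserved; their space is tracked per leaf through the delayed node's
 * curr_index_batch_size and index_item_leaves counters and is returned with
 * the helper below once a leaf's worth of items has been flushed or dropped.
 */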
static void btrfs_delayed_item_release_leaves(struct btrfs_delayed_node *node,
					      unsigned int num_leaves)
{
	struct btrfs_fs_info *fs_info = node->root->fs_info;
	const u64 bytes = btrfs_calc_insert_metadata_size(fs_info, num_leaves);

	/* There are no space reservations during log replay, bail out. */
	if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
		return;

	trace_btrfs_space_reservation(fs_info, "delayed_item", node->inode_id,
				      bytes, 0);
	btrfs_block_rsv_release(fs_info, &fs_info->delayed_block_rsv, bytes, NULL);
}

static int btrfs_delayed_inode_reserve_metadata(
					struct btrfs_trans_handle *trans,
					struct btrfs_root *root,
					struct btrfs_delayed_node *node)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_block_rsv *src_rsv;
	struct btrfs_block_rsv *dst_rsv;
	u64 num_bytes;
	int ret;

	src_rsv = trans->block_rsv;
	dst_rsv = &fs_info->delayed_block_rsv;

	num_bytes = btrfs_calc_metadata_size(fs_info, 1);

	/*
	 * btrfs_dirty_inode will update the inode under btrfs_join_transaction
	 * which doesn't reserve space for speed. This is a problem since we
	 * still need to reserve space for this update, so try to reserve the
	 * space.
	 *
	 * Now if src_rsv == delalloc_block_rsv we'll let it just steal since
	 * we always reserve enough to update the inode item.
	 */
	if (!src_rsv || (!trans->bytes_reserved &&
			 src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) {
		ret = btrfs_qgroup_reserve_meta(root, num_bytes,
					BTRFS_QGROUP_RSV_META_PREALLOC, true);
		if (ret < 0)
			return ret;
		ret = btrfs_block_rsv_add(fs_info, dst_rsv, num_bytes,
					  BTRFS_RESERVE_NO_FLUSH);
		/* NO_FLUSH could only fail with -ENOSPC */
		ASSERT(ret == 0 || ret == -ENOSPC);
		if (ret)
			btrfs_qgroup_free_meta_prealloc(root, num_bytes);
	} else {
		ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, true);
	}

	if (!ret) {
		trace_btrfs_space_reservation(fs_info, "delayed_inode",
					      node->inode_id, num_bytes, 1);
		node->bytes_reserved = num_bytes;
	}

	return ret;
}

static void btrfs_delayed_inode_release_metadata(struct btrfs_fs_info *fs_info,
						 struct btrfs_delayed_node *node,
						 bool qgroup_free)
{
	struct btrfs_block_rsv *rsv;

	if (!node->bytes_reserved)
		return;

	rsv = &fs_info->delayed_block_rsv;
	trace_btrfs_space_reservation(fs_info, "delayed_inode",
				      node->inode_id, node->bytes_reserved, 0);
	btrfs_block_rsv_release(fs_info, rsv, node->bytes_reserved, NULL);
	if (qgroup_free)
		btrfs_qgroup_free_meta_prealloc(node->root,
						node->bytes_reserved);
	else
		btrfs_qgroup_convert_reserved_meta(node->root,
						   node->bytes_reserved);
	node->bytes_reserved = 0;
}

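/*
 * The two helpers above handle the reservation for the delayed inode item
 * itself: when the transaction was started with a proper reservation (or the
 * source is the delalloc reserve) the space is migrated into
 * delayed_block_rsv, otherwise a fresh NO_FLUSH reservation plus a qgroup
 * prealloc is attempted. On release the qgroup reservation is either freed
 * (error paths) or converted to a pertrans reservation, depending on
 * qgroup_free.
 */
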
/*
 * Insert a single delayed item or a batch of delayed items, as many as possible
 * that fit in a leaf. The delayed items (dir index keys) are sorted by their key
 * in the rbtree, and if there's a gap between two consecutive dir index items,
 * then it means at some point we had delayed dir indexes to add but they got
 * removed (by btrfs_delete_delayed_dir_index()) before we attempted to flush them
 * into the subvolume tree. Dir index keys also have their offsets coming from a
 * monotonically increasing counter, so we can't get new keys with an offset that
 * fits within a gap between delayed dir index items.
 */
static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root,
				     struct btrfs_path *path,
				     struct btrfs_delayed_item *first_item)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_delayed_node *node = first_item->delayed_node;
	LIST_HEAD(item_list);
	struct btrfs_delayed_item *curr;
	struct btrfs_delayed_item *next;
	const int max_size = BTRFS_LEAF_DATA_SIZE(fs_info);
	struct btrfs_item_batch batch;
	struct btrfs_key first_key;
	const u32 first_data_size = first_item->data_len;
	int total_size;
	char *ins_data = NULL;
	int ret;
	bool continuous_keys_only = false;

	lockdep_assert_held(&node->mutex);

	/*
	 * During normal operation the delayed index offset is continuously
	 * increasing, so we can batch insert all items as there will not be any
	 * overlapping keys in the tree.
	 *
	 * The exception to this is log replay, where we may have interleaved
	 * offsets in the tree, so our batch needs to be continuous keys only in
	 * order to ensure we do not end up with out of order items in our leaf.
	 */
	if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
		continuous_keys_only = true;

	/*
	 * For delayed items to insert, we track reserved metadata bytes based
	 * on the number of leaves that we will use.
	 * See btrfs_insert_delayed_dir_index() and
	 * btrfs_delayed_item_reserve_metadata().
	 */
	ASSERT(first_item->bytes_reserved == 0);

	list_add_tail(&first_item->tree_list, &item_list);
	batch.total_data_size = first_data_size;
	batch.nr = 1;
	total_size = first_data_size + sizeof(struct btrfs_item);
	curr = first_item;

	while (true) {
		int next_size;

		next = __btrfs_next_delayed_item(curr);
		if (!next)
			break;

		/*
		 * We cannot allow gaps in the key space if we're doing log
		 * replay.
		 */
		if (continuous_keys_only && (next->index != curr->index + 1))
			break;

		ASSERT(next->bytes_reserved == 0);

		next_size = next->data_len + sizeof(struct btrfs_item);
		if (total_size + next_size > max_size)
			break;

		list_add_tail(&next->tree_list, &item_list);
		batch.nr++;
		total_size += next_size;
		batch.total_data_size += next->data_len;
		curr = next;
	}

	if (batch.nr == 1) {
		first_key.objectid = node->inode_id;
		first_key.type = BTRFS_DIR_INDEX_KEY;
		first_key.offset = first_item->index;
		batch.keys = &first_key;
		batch.data_sizes = &first_data_size;
	} else {
		struct btrfs_key *ins_keys;
		u32 *ins_sizes;
		int i = 0;

		ins_data = kmalloc(batch.nr * sizeof(u32) +
				   batch.nr * sizeof(struct btrfs_key), GFP_NOFS);
		if (!ins_data) {
			ret = -ENOMEM;
			goto out;
		}
		ins_sizes = (u32 *)ins_data;
		ins_keys = (struct btrfs_key *)(ins_data + batch.nr * sizeof(u32));
		batch.keys = ins_keys;
		batch.data_sizes = ins_sizes;
		list_for_each_entry(curr, &item_list, tree_list) {
			ins_keys[i].objectid = node->inode_id;
			ins_keys[i].type = BTRFS_DIR_INDEX_KEY;
			ins_keys[i].offset = curr->index;
			ins_sizes[i] = curr->data_len;
			i++;
		}
	}

	ret = btrfs_insert_empty_items(trans, root, path, &batch);
	if (ret)
		goto out;

	list_for_each_entry(curr, &item_list, tree_list) {
		char *data_ptr;

		data_ptr = btrfs_item_ptr(path->nodes[0], path->slots[0], char);
		write_extent_buffer(path->nodes[0], &curr->data,
				    (unsigned long)data_ptr, curr->data_len);
		path->slots[0]++;
	}

	/*
	 * Now release our path before releasing the delayed items and their
	 * metadata reservations, so that we don't block other tasks for more
	 * time than needed.
	 */
	btrfs_release_path(path);

	ASSERT(node->index_item_leaves > 0);

	/*
	 * For normal operations we will batch an entire leaf's worth of delayed
	 * items, so if there are more items to process we can decrement
	 * index_item_leaves by 1 as we inserted 1 leaf's worth of items.
	 *
	 * However for log replay we may not have inserted an entire leaf's
	 * worth of items, we may have not had continuous items, so decrementing
	 * here would mess up the index_item_leaves accounting. For this case
	 * only clean up the accounting when there are no items left.
	 */
	if (next && !continuous_keys_only) {
		/*
		 * We inserted one batch of items into a leaf and there are more
		 * items to flush in a future batch, now release one unit of
		 * metadata space from the delayed block reserve, corresponding
		 * to the leaf we just flushed to.
		 */
		btrfs_delayed_item_release_leaves(node, 1);
		node->index_item_leaves--;
	} else if (!next) {
		/*
		 * There are no more items to insert. We can have a number of
		 * reserved leaves > 1 here - this happens when many dir index
		 * items are added and then removed before they are flushed (file
		 * names with a very short life, never span a transaction). So
		 * release all remaining leaves.
		 */
		btrfs_delayed_item_release_leaves(node, node->index_item_leaves);
		node->index_item_leaves = 0;
	}

	list_for_each_entry_safe(curr, next, &item_list, tree_list) {
		list_del(&curr->tree_list);
		btrfs_release_delayed_item(curr);
	}
out:
	kfree(ins_data);
	return ret;
}

static int btrfs_insert_delayed_items(struct btrfs_trans_handle *trans,
				      struct btrfs_path *path,
				      struct btrfs_root *root,
				      struct btrfs_delayed_node *node)
{
	int ret = 0;

	while (ret == 0) {
		struct btrfs_delayed_item *curr;

		mutex_lock(&node->mutex);
		curr = __btrfs_first_delayed_insertion_item(node);
		if (!curr) {
			mutex_unlock(&node->mutex);
			break;
		}
		ret = btrfs_insert_delayed_item(trans, root, path, curr);
		mutex_unlock(&node->mutex);
	}

	return ret;
}

static int btrfs_batch_delete_items(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct btrfs_path *path,
				    struct btrfs_delayed_item *item)
{
	const u64 ino = item->delayed_node->inode_id;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_delayed_item *curr, *next;
	struct extent_buffer *leaf = path->nodes[0];
	LIST_HEAD(batch_list);
	int nitems, slot, last_slot;
	int ret;
	u64 total_reserved_size = item->bytes_reserved;

	ASSERT(leaf != NULL);

	slot = path->slots[0];
	last_slot = btrfs_header_nritems(leaf) - 1;
	/*
	 * Our caller always gives us a path pointing to an existing item, so
	 * this can not happen.
	 */
	ASSERT(slot <= last_slot);
	if (WARN_ON(slot > last_slot))
		return -ENOENT;

	nitems = 1;
	curr = item;
	list_add_tail(&curr->tree_list, &batch_list);

	/*
	 * Keep checking if the next delayed item matches the next item in the
	 * leaf - if so, we can add it to the batch of items to delete from the
	 * leaf.
	 */
	while (slot < last_slot) {
		struct btrfs_key key;

		next = __btrfs_next_delayed_item(curr);
		if (!next)
			break;

		slot++;
		btrfs_item_key_to_cpu(leaf, &key, slot);
		if (key.objectid != ino ||
		    key.type != BTRFS_DIR_INDEX_KEY ||
		    key.offset != next->index)
			break;
		nitems++;
		curr = next;
		list_add_tail(&curr->tree_list, &batch_list);
		total_reserved_size += curr->bytes_reserved;
	}

	ret = btrfs_del_items(trans, root, path, path->slots[0], nitems);
	if (ret)
		return ret;

	/* In case of BTRFS_FS_LOG_RECOVERING items won't have reserved space */
	if (total_reserved_size > 0) {
		/*
		 * Check btrfs_delayed_item_reserve_metadata() to see why we
		 * don't need to release/reserve qgroup space.
		 */
		trace_btrfs_space_reservation(fs_info, "delayed_item", ino,
					      total_reserved_size, 0);
		btrfs_block_rsv_release(fs_info, &fs_info->delayed_block_rsv,
					total_reserved_size, NULL);
	}

	list_for_each_entry_safe(curr, next, &batch_list, tree_list) {
		list_del(&curr->tree_list);
		btrfs_release_delayed_item(curr);
	}

	return 0;
}

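/*
 * Illustrative example of the batching done by btrfs_batch_delete_items():
 * if the leaf found by the search below has dir index keys with offsets
 * 10, 11 and 12 in consecutive slots and the delayed deletion items queued
 * for the directory are 10, 11 and 20, then 10 and 11 are removed with a
 * single btrfs_del_items() call, while 20 does not match the next slot and
 * is left for the next loop iteration, which searches again from its key.
 */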
static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans,
				      struct btrfs_path *path,
				      struct btrfs_root *root,
				      struct btrfs_delayed_node *node)
{
	struct btrfs_key key;
	int ret = 0;

	key.objectid = node->inode_id;
	key.type = BTRFS_DIR_INDEX_KEY;

	while (ret == 0) {
		struct btrfs_delayed_item *item;

		mutex_lock(&node->mutex);
		item = __btrfs_first_delayed_deletion_item(node);
		if (!item) {
			mutex_unlock(&node->mutex);
			break;
		}

		key.offset = item->index;
		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
		if (ret > 0) {
			/*
			 * There's no matching item in the leaf. This means we
			 * have already deleted this item in a past run of the
			 * delayed items. We ignore errors when running delayed
			 * items from an async context, through a work queue job
			 * running btrfs_async_run_delayed_root(), and don't
			 * release delayed items that failed to complete. This
			 * is because we will retry later, and at transaction
			 * commit time we always run delayed items and will
			 * then deal with errors if they fail to run again.
			 *
			 * So just release delayed items for which we can't find
			 * an item in the tree, and move to the next item.
			 */
			btrfs_release_path(path);
			btrfs_release_delayed_item(item);
			ret = 0;
		} else if (ret == 0) {
			ret = btrfs_batch_delete_items(trans, root, path, item);
			btrfs_release_path(path);
		}

		/*
		 * We unlock and relock on each iteration, this is to prevent
		 * blocking other tasks for too long while we are being run from
		 * the async context (work queue job). Those tasks are typically
		 * running system calls like creat/mkdir/rename/unlink/etc which
		 * need to add delayed items to this delayed node.
		 */
		mutex_unlock(&node->mutex);
	}

	return ret;
}

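/*
 * delayed_node->count tracks everything that still has to be flushed for the
 * node: each queued insertion/deletion item contributes one, and so do a
 * dirty delayed inode (INODE_DIRTY) and a pending inode ref deletion
 * (DEL_IREF). The two helpers below drop the latter two contributions;
 * finish_one_item() is called for every decrement and wakes up waiters on
 * the delayed root once enough progress has been made.
 */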
static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node)
{
	struct btrfs_delayed_root *delayed_root;

	if (delayed_node &&
	    test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
		ASSERT(delayed_node->root);
		clear_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags);
		delayed_node->count--;

		delayed_root = delayed_node->root->fs_info->delayed_root;
		finish_one_item(delayed_root);
	}
}

static void btrfs_release_delayed_iref(struct btrfs_delayed_node *delayed_node)
{
	if (test_and_clear_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags)) {
		struct btrfs_delayed_root *delayed_root;

		ASSERT(delayed_node->root);
		delayed_node->count--;

		delayed_root = delayed_node->root->fs_info->delayed_root;
		finish_one_item(delayed_root);
	}
}

static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
					struct btrfs_root *root,
					struct btrfs_path *path,
					struct btrfs_delayed_node *node)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_key key;
	struct btrfs_inode_item *inode_item;
	struct extent_buffer *leaf;
	int mod;
	int ret;

	key.objectid = node->inode_id;
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;

	if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags))
		mod = -1;
	else
		mod = 1;

	ret = btrfs_lookup_inode(trans, root, path, &key, mod);
	if (ret > 0)
		ret = -ENOENT;
	if (ret < 0)
		goto out;

	leaf = path->nodes[0];
	inode_item = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_inode_item);
	write_extent_buffer(leaf, &node->inode_item, (unsigned long)inode_item,
			    sizeof(struct btrfs_inode_item));
	btrfs_mark_buffer_dirty(trans, leaf);

	if (!test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags))
		goto out;

	/*
	 * Now we're going to delete the INODE_REF/EXTREF, which should be the
	 * only ref left.  Check if the next item is an INODE_REF/EXTREF.
	 *
	 * But if we're the last item already, release and search for the last
	 * INODE_REF/EXTREF.
	 */
	if (path->slots[0] + 1 >= btrfs_header_nritems(leaf)) {
		key.objectid = node->inode_id;
		key.type = BTRFS_INODE_EXTREF_KEY;
		key.offset = (u64)-1;

		btrfs_release_path(path);
		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
		if (ret < 0)
			goto err_out;
		ASSERT(ret > 0);
		ASSERT(path->slots[0] > 0);
		ret = 0;
		path->slots[0]--;
		leaf = path->nodes[0];
	} else {
		path->slots[0]++;
	}
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	if (key.objectid != node->inode_id)
		goto out;
	if (key.type != BTRFS_INODE_REF_KEY &&
	    key.type != BTRFS_INODE_EXTREF_KEY)
		goto out;

	/*
	 * Delayed iref deletion is for an inode that has only one link,
	 * so there is only one iref. The case that several irefs are
	 * in the same item doesn't exist.
	 */
	ret = btrfs_del_item(trans, root, path);
out:
	btrfs_release_delayed_iref(node);
	btrfs_release_path(path);
err_out:
	btrfs_delayed_inode_release_metadata(fs_info, node, (ret < 0));
	btrfs_release_delayed_inode(node);

	/*
	 * If we fail to update the delayed inode we need to abort the
	 * transaction, because we could leave the inode with the improper
	 * counts behind.
	 */
	if (ret && ret != -ENOENT)
		btrfs_abort_transaction(trans, ret);

	return ret;
}

static inline int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
					     struct btrfs_root *root,
					     struct btrfs_path *path,
					     struct btrfs_delayed_node *node)
{
	int ret;

	mutex_lock(&node->mutex);
	if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &node->flags)) {
		mutex_unlock(&node->mutex);
		return 0;
	}

	ret = __btrfs_update_delayed_inode(trans, root, path, node);
	mutex_unlock(&node->mutex);
	return ret;
}

static inline int
__btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
				   struct btrfs_path *path,
				   struct btrfs_delayed_node *node)
{
	int ret;

	ret = btrfs_insert_delayed_items(trans, path, node->root, node);
	if (ret)
		return ret;

	ret = btrfs_delete_delayed_items(trans, path, node->root, node);
	if (ret)
		return ret;

	ret = btrfs_record_root_in_trans(trans, node->root);
	if (ret)
		return ret;
	ret = btrfs_update_delayed_inode(trans, node->root, path, node);
	return ret;
}

/*
 * Called when committing the transaction.
 * Returns 0 on success.
 * Returns < 0 on error and returns with an aborted transaction with any
 * outstanding delayed items cleaned up.
 */
static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_delayed_root *delayed_root;
	struct btrfs_delayed_node *curr_node, *prev_node;
	struct btrfs_path *path;
	struct btrfs_block_rsv *block_rsv;
	int ret = 0;
	bool count = (nr > 0);

	if (TRANS_ABORTED(trans))
		return -EIO;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	block_rsv = trans->block_rsv;
	trans->block_rsv = &fs_info->delayed_block_rsv;

	delayed_root = fs_info->delayed_root;

	curr_node = btrfs_first_delayed_node(delayed_root);
	while (curr_node && (!count || nr--)) {
		ret = __btrfs_commit_inode_delayed_items(trans, path,
							 curr_node);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			break;
		}

		prev_node = curr_node;
		curr_node = btrfs_next_delayed_node(curr_node);
		/*
		 * See the comment below about releasing path before releasing
		 * node. If the commit of delayed items was successful the path
		 * should always be released, but in case of an error, it may
		 * point to locked extent buffers (a leaf at the very least).
		 */
		ASSERT(path->nodes[0] == NULL);
		btrfs_release_delayed_node(prev_node);
	}

	/*
	 * Release the path to avoid a potential deadlock and lockdep splat when
	 * releasing the delayed node, as that requires taking the delayed node's
	 * mutex. If another task starts running delayed items before we take
	 * the mutex, it will first lock the mutex and then it may try to lock
	 * the same btree path (leaf).
	 */
	btrfs_free_path(path);

	if (curr_node)
		btrfs_release_delayed_node(curr_node);
	trans->block_rsv = block_rsv;

	return ret;
}

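/*
 * The two exported wrappers below run either all queued delayed items
 * (nr == -1, used at transaction commit time) or at most nr delayed nodes,
 * for callers that only need to make some bounded amount of progress.
 */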
int btrfs_run_delayed_items(struct btrfs_trans_handle *trans)
{
	return __btrfs_run_delayed_items(trans, -1);
}

int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans, int nr)
{
	return __btrfs_run_delayed_items(trans, nr);
}

int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
				     struct btrfs_inode *inode)
{
	struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
	struct btrfs_path *path;
	struct btrfs_block_rsv *block_rsv;
	int ret;

	if (!delayed_node)
		return 0;

	mutex_lock(&delayed_node->mutex);
	if (!delayed_node->count) {
		mutex_unlock(&delayed_node->mutex);
		btrfs_release_delayed_node(delayed_node);
		return 0;
	}
	mutex_unlock(&delayed_node->mutex);

	path = btrfs_alloc_path();
	if (!path) {
		btrfs_release_delayed_node(delayed_node);
		return -ENOMEM;
	}

	block_rsv = trans->block_rsv;
	trans->block_rsv = &delayed_node->root->fs_info->delayed_block_rsv;

	ret = __btrfs_commit_inode_delayed_items(trans, path, delayed_node);

	btrfs_release_delayed_node(delayed_node);
	btrfs_free_path(path);
	trans->block_rsv = block_rsv;

	return ret;
}

int btrfs_commit_inode_delayed_inode(struct btrfs_inode *inode)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct btrfs_trans_handle *trans;
	struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
	struct btrfs_path *path;
	struct btrfs_block_rsv *block_rsv;
	int ret;

	if (!delayed_node)
		return 0;

	mutex_lock(&delayed_node->mutex);
	if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
		mutex_unlock(&delayed_node->mutex);
		btrfs_release_delayed_node(delayed_node);
		return 0;
	}
	mutex_unlock(&delayed_node->mutex);

	trans = btrfs_join_transaction(delayed_node->root);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out;
	}

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto trans_out;
	}

	block_rsv = trans->block_rsv;
	trans->block_rsv = &fs_info->delayed_block_rsv;

	mutex_lock(&delayed_node->mutex);
	if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags))
		ret = __btrfs_update_delayed_inode(trans, delayed_node->root,
						   path, delayed_node);
	else
		ret = 0;
	mutex_unlock(&delayed_node->mutex);

	btrfs_free_path(path);
	trans->block_rsv = block_rsv;
trans_out:
	btrfs_end_transaction(trans);
	btrfs_btree_balance_dirty(fs_info);
out:
	btrfs_release_delayed_node(delayed_node);

	return ret;
}

void btrfs_remove_delayed_node(struct btrfs_inode *inode)
{
	struct btrfs_delayed_node *delayed_node;

	delayed_node = READ_ONCE(inode->delayed_node);
	if (!delayed_node)
		return;

	inode->delayed_node = NULL;
	btrfs_release_delayed_node(delayed_node);
}

struct btrfs_async_delayed_work {
	struct btrfs_delayed_root *delayed_root;
	int nr;
	struct btrfs_work work;
};

static void btrfs_async_run_delayed_root(struct btrfs_work *work)
{
	struct btrfs_async_delayed_work *async_work;
	struct btrfs_delayed_root *delayed_root;
	struct btrfs_trans_handle *trans;
	struct btrfs_path *path;
	struct btrfs_delayed_node *delayed_node = NULL;
	struct btrfs_root *root;
	struct btrfs_block_rsv *block_rsv;
	int total_done = 0;

	async_work = container_of(work, struct btrfs_async_delayed_work, work);
	delayed_root = async_work->delayed_root;

	path = btrfs_alloc_path();
	if (!path)
		goto out;

	do {
		if (atomic_read(&delayed_root->items) <
		    BTRFS_DELAYED_BACKGROUND / 2)
			break;

		delayed_node = btrfs_first_prepared_delayed_node(delayed_root);
		if (!delayed_node)
			break;

		root = delayed_node->root;

		trans = btrfs_join_transaction(root);
		if (IS_ERR(trans)) {
			btrfs_release_path(path);
			btrfs_release_prepared_delayed_node(delayed_node);
			total_done++;
			continue;
		}

		block_rsv = trans->block_rsv;
		trans->block_rsv = &root->fs_info->delayed_block_rsv;

		__btrfs_commit_inode_delayed_items(trans, path, delayed_node);

		trans->block_rsv = block_rsv;
		btrfs_end_transaction(trans);
		btrfs_btree_balance_dirty_nodelay(root->fs_info);

		btrfs_release_path(path);
		btrfs_release_prepared_delayed_node(delayed_node);
		total_done++;

	} while ((async_work->nr == 0 && total_done < BTRFS_DELAYED_WRITEBACK)
		 || total_done < async_work->nr);

	btrfs_free_path(path);
out:
	wake_up(&delayed_root->wait);
	kfree(async_work);
}

static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root,
				     struct btrfs_fs_info *fs_info, int nr)
{
	struct btrfs_async_delayed_work *async_work;

	async_work = kmalloc(sizeof(*async_work), GFP_NOFS);
	if (!async_work)
		return -ENOMEM;

	async_work->delayed_root = delayed_root;
	btrfs_init_work(&async_work->work, btrfs_async_run_delayed_root, NULL);
	async_work->nr = nr;

	btrfs_queue_work(fs_info->delayed_workers, &async_work->work);
	return 0;
}

void btrfs_assert_delayed_root_empty(struct btrfs_fs_info *fs_info)
{
	WARN_ON(btrfs_first_delayed_node(fs_info->delayed_root));
}

static int could_end_wait(struct btrfs_delayed_root *delayed_root, int seq)
{
	int val = atomic_read(&delayed_root->items_seq);

	if (val < seq || val >= seq + BTRFS_DELAYED_BATCH)
		return 1;

	if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND)
		return 1;

	return 0;
}

void btrfs_balance_delayed_items(struct btrfs_fs_info *fs_info)
{
	struct btrfs_delayed_root *delayed_root = fs_info->delayed_root;

	if ((atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND) ||
	    btrfs_workqueue_normal_congested(fs_info->delayed_workers))
		return;

	if (atomic_read(&delayed_root->items) >= BTRFS_DELAYED_WRITEBACK) {
		int seq;
		int ret;

		seq = atomic_read(&delayed_root->items_seq);

		ret = btrfs_wq_run_delayed_node(delayed_root, fs_info, 0);
		if (ret)
			return;

		wait_event_interruptible(delayed_root->wait,
					 could_end_wait(delayed_root, seq));
		return;
	}

	btrfs_wq_run_delayed_node(delayed_root, fs_info, BTRFS_DELAYED_BATCH);
}

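/*
 * Background flushing thresholds used above: once the global number of
 * delayed items reaches BTRFS_DELAYED_BACKGROUND an async worker is kicked
 * off (the worker itself stops once the count drops below half of that); at
 * BTRFS_DELAYED_WRITEBACK the caller additionally waits until either the
 * backlog shrinks below BTRFS_DELAYED_BACKGROUND or BTRFS_DELAYED_BATCH more
 * items have completed since it started waiting.
 */
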
static void btrfs_release_dir_index_item_space(struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	const u64 bytes = btrfs_calc_insert_metadata_size(fs_info, 1);

	if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
		return;

	/*
	 * Adding the new dir index item does not require touching another
	 * leaf, so we can release 1 unit of metadata that was previously
	 * reserved when starting the transaction. This applies only to
	 * the case where we had a transaction start and excludes the
	 * transaction join case (when replaying log trees).
	 */
	trace_btrfs_space_reservation(fs_info, "transaction",
				      trans->transid, bytes, 0);
	btrfs_block_rsv_release(fs_info, trans->block_rsv, bytes, NULL);
	ASSERT(trans->bytes_reserved >= bytes);
	trans->bytes_reserved -= bytes;
}

/* Will return 0, -ENOMEM or -EEXIST (index number collision, unexpected). */
int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
				   const char *name, int name_len,
				   struct btrfs_inode *dir,
				   const struct btrfs_disk_key *disk_key, u8 flags,
				   u64 index)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	const unsigned int leaf_data_size = BTRFS_LEAF_DATA_SIZE(fs_info);
	struct btrfs_delayed_node *delayed_node;
	struct btrfs_delayed_item *delayed_item;
	struct btrfs_dir_item *dir_item;
	bool reserve_leaf_space;
	u32 data_len;
	int ret;

	delayed_node = btrfs_get_or_create_delayed_node(dir);
	if (IS_ERR(delayed_node))
		return PTR_ERR(delayed_node);

	delayed_item = btrfs_alloc_delayed_item(sizeof(*dir_item) + name_len,
						delayed_node,
						BTRFS_DELAYED_INSERTION_ITEM);
	if (!delayed_item) {
		ret = -ENOMEM;
		goto release_node;
	}

	delayed_item->index = index;

	dir_item = (struct btrfs_dir_item *)delayed_item->data;
	dir_item->location = *disk_key;
	btrfs_set_stack_dir_transid(dir_item, trans->transid);
	btrfs_set_stack_dir_data_len(dir_item, 0);
	btrfs_set_stack_dir_name_len(dir_item, name_len);
	btrfs_set_stack_dir_flags(dir_item, flags);
	memcpy((char *)(dir_item + 1), name, name_len);

	data_len = delayed_item->data_len + sizeof(struct btrfs_item);

	mutex_lock(&delayed_node->mutex);

	/*
	 * First attempt to insert the delayed item. This is to make the error
	 * handling path simpler in case we fail (-EEXIST). There's no risk of
	 * any other task coming in and running the delayed item before we do
	 * the metadata space reservation below, because we are holding the
	 * delayed node's mutex and that mutex must also be locked before the
	 * node's delayed items can be run.
	 */
	ret = __btrfs_add_delayed_item(delayed_node, delayed_item);
	if (unlikely(ret)) {
		btrfs_err(trans->fs_info,
"error adding delayed dir index item, name: %.*s, index: %llu, root: %llu, dir: %llu, dir->index_cnt: %llu, delayed_node->index_cnt: %llu, error: %d",
			  name_len, name, index, btrfs_root_id(delayed_node->root),
			  delayed_node->inode_id, dir->index_cnt,
			  delayed_node->index_cnt, ret);
		btrfs_release_delayed_item(delayed_item);
		btrfs_release_dir_index_item_space(trans);
		mutex_unlock(&delayed_node->mutex);
		goto release_node;
	}

	if (delayed_node->index_item_leaves == 0 ||
	    delayed_node->curr_index_batch_size + data_len > leaf_data_size) {
		delayed_node->curr_index_batch_size = data_len;
		reserve_leaf_space = true;
	} else {
		delayed_node->curr_index_batch_size += data_len;
		reserve_leaf_space = false;
	}

	if (reserve_leaf_space) {
		ret = btrfs_delayed_item_reserve_metadata(trans, delayed_item);
		/*
		 * Space was reserved for a dir index item insertion when we
		 * started the transaction, so getting a failure here should be
		 * impossible.
		 */
		if (WARN_ON(ret)) {
			btrfs_release_delayed_item(delayed_item);
			mutex_unlock(&delayed_node->mutex);
			goto release_node;
		}

		delayed_node->index_item_leaves++;
	} else {
		btrfs_release_dir_index_item_space(trans);
	}
	mutex_unlock(&delayed_node->mutex);

release_node:
	btrfs_release_delayed_node(delayed_node);
	return ret;
}

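/*
 * Illustrative example of the per-leaf accounting done above: when the first
 * dir index item is queued on a directory, curr_index_batch_size is reset to
 * that item's size, one leaf worth of metadata is moved into
 * delayed_block_rsv and index_item_leaves becomes 1. Every following item
 * that still fits in the same leaf only grows curr_index_batch_size and
 * instead gives back, via btrfs_release_dir_index_item_space(), the unit
 * that was reserved when the transaction was started; a new leaf is
 * accounted only once the running batch size would exceed
 * BTRFS_LEAF_DATA_SIZE().
 */
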
static int btrfs_delete_delayed_insertion_item(struct btrfs_fs_info *fs_info,
					       struct btrfs_delayed_node *node,
					       u64 index)
{
	struct btrfs_delayed_item *item;

	mutex_lock(&node->mutex);
	item = __btrfs_lookup_delayed_item(&node->ins_root.rb_root, index);
	if (!item) {
		mutex_unlock(&node->mutex);
		return 1;
	}

	/*
	 * For delayed items to insert, we track reserved metadata bytes based
	 * on the number of leaves that we will use.
	 * See btrfs_insert_delayed_dir_index() and
	 * btrfs_delayed_item_reserve_metadata().
	 */
	ASSERT(item->bytes_reserved == 0);
	ASSERT(node->index_item_leaves > 0);

	/*
	 * If there's only one leaf reserved, we can decrement this item from the
	 * current batch, otherwise we can not because we don't know which leaf
	 * it belongs to. With the current limit on delayed items, we rarely
	 * accumulate enough dir index items to fill more than one leaf (even
	 * when using a leaf size of 4K).
	 */
	if (node->index_item_leaves == 1) {
		const u32 data_len = item->data_len + sizeof(struct btrfs_item);

		ASSERT(node->curr_index_batch_size >= data_len);
		node->curr_index_batch_size -= data_len;
	}

	btrfs_release_delayed_item(item);

	/* If we now have no more dir index items, we can release all leaves. */
	if (RB_EMPTY_ROOT(&node->ins_root.rb_root)) {
		btrfs_delayed_item_release_leaves(node, node->index_item_leaves);
		node->index_item_leaves = 0;
	}

	mutex_unlock(&node->mutex);
	return 0;
}

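/*
 * Deleting a dir index goes through two stages: if the index still sits in
 * the delayed node's insertion tree (it was never flushed), the helper above
 * simply cancels the pending insertion and we are done. Only when it returns
 * 1 (no pending insertion found) does the function below queue a delayed
 * deletion item that will later remove the key from the subvolume tree.
 */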
int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
				   struct btrfs_inode *dir, u64 index)
{
	struct btrfs_delayed_node *node;
	struct btrfs_delayed_item *item;
	int ret;

	node = btrfs_get_or_create_delayed_node(dir);
	if (IS_ERR(node))
		return PTR_ERR(node);

	ret = btrfs_delete_delayed_insertion_item(trans->fs_info, node, index);
	if (!ret)
		goto end;

	item = btrfs_alloc_delayed_item(0, node, BTRFS_DELAYED_DELETION_ITEM);
	if (!item) {
		ret = -ENOMEM;
		goto end;
	}

	item->index = index;

	ret = btrfs_delayed_item_reserve_metadata(trans, item);
	/*
	 * We have reserved enough space when starting a new transaction, so a
	 * metadata reservation failure here is impossible.
	 */
	if (ret < 0) {
		btrfs_err(trans->fs_info,
"metadata reservation failed for delayed dir item deletion, should have been reserved");
		btrfs_release_delayed_item(item);
		goto end;
	}

	mutex_lock(&node->mutex);
	ret = __btrfs_add_delayed_item(node, item);
	if (unlikely(ret)) {
		btrfs_err(trans->fs_info,
"failed to add delayed dir index item (index: %llu) into the deletion tree of the delayed node (root id: %llu, inode id: %llu, errno: %d)",
			  index, btrfs_root_id(node->root),
			  node->inode_id, ret);
		btrfs_delayed_item_release_metadata(dir->root, item);
		btrfs_release_delayed_item(item);
	}
	mutex_unlock(&node->mutex);
end:
	btrfs_release_delayed_node(node);
	return ret;
}

int btrfs_inode_delayed_dir_index_count(struct btrfs_inode *inode)
{
	struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);

	if (!delayed_node)
		return -ENOENT;

	/*
	 * Since we have held i_mutex of this directory, it is impossible that
	 * a new directory index is added into the delayed node and index_cnt
	 * is updated now. So we needn't lock the delayed node.
	 */
	if (!delayed_node->index_cnt) {
		btrfs_release_delayed_node(delayed_node);
		return -EINVAL;
	}

	inode->index_cnt = delayed_node->index_cnt;
	btrfs_release_delayed_node(delayed_node);
	return 0;
}

bool btrfs_readdir_get_delayed_items(struct btrfs_inode *inode,
				     u64 last_index,
				     struct list_head *ins_list,
				     struct list_head *del_list)
{
	struct btrfs_delayed_node *delayed_node;
	struct btrfs_delayed_item *item;

	delayed_node = btrfs_get_delayed_node(inode);
	if (!delayed_node)
		return false;

	/*
	 * We can only do one readdir with delayed items at a time because of
	 * item->readdir_list.
	 */
	btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
	btrfs_inode_lock(inode, 0);

	mutex_lock(&delayed_node->mutex);
	item = __btrfs_first_delayed_insertion_item(delayed_node);
	while (item && item->index <= last_index) {
		refcount_inc(&item->refs);
		list_add_tail(&item->readdir_list, ins_list);
		item = __btrfs_next_delayed_item(item);
	}

	item = __btrfs_first_delayed_deletion_item(delayed_node);
	while (item && item->index <= last_index) {
		refcount_inc(&item->refs);
		list_add_tail(&item->readdir_list, del_list);
		item = __btrfs_next_delayed_item(item);
	}
	mutex_unlock(&delayed_node->mutex);
	/*
	 * This delayed node is still cached in the btrfs inode, so refs
	 * must be > 1 now, and we needn't check it is going to be freed
	 * or not.
	 *
	 * Besides that, this function is used to read dir, we do not
	 * insert/delete delayed items in this period. So we also needn't
	 * requeue or dequeue this delayed node.
	 */
	refcount_dec(&delayed_node->refs);

	return true;
}

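/*
 * btrfs_readdir_get_delayed_items() and btrfs_readdir_put_delayed_items()
 * must always be paired: the get side trades the shared inode lock for the
 * exclusive one (a delayed item can only sit on one readdir_list at a time)
 * and takes a reference on every item it collects, while the put side drops
 * those references and downgrades the lock back for the VFS.
 */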
void btrfs_readdir_put_delayed_items(struct btrfs_inode *inode,
				     struct list_head *ins_list,
				     struct list_head *del_list)
{
	struct btrfs_delayed_item *curr, *next;

	list_for_each_entry_safe(curr, next, ins_list, readdir_list) {
		list_del(&curr->readdir_list);
		if (refcount_dec_and_test(&curr->refs))
			kfree(curr);
	}

	list_for_each_entry_safe(curr, next, del_list, readdir_list) {
		list_del(&curr->readdir_list);
		if (refcount_dec_and_test(&curr->refs))
			kfree(curr);
	}

	/*
	 * The VFS is going to do up_read(), so we need to downgrade back to a
	 * read lock.
	 */
	downgrade_write(&inode->vfs_inode.i_rwsem);
}

int btrfs_should_delete_dir_index(const struct list_head *del_list,
				  u64 index)
{
	struct btrfs_delayed_item *curr;
	int ret = 0;

	list_for_each_entry(curr, del_list, readdir_list) {
		if (curr->index > index)
			break;
		if (curr->index == index) {
			ret = 1;
			break;
		}
	}
	return ret;
}

/*
 * Read dir info stored in the delayed tree.
 */
int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
				    const struct list_head *ins_list)
{
	struct btrfs_dir_item *di;
	struct btrfs_delayed_item *curr, *next;
	struct btrfs_key location;
	char *name;
	int name_len;
	int over = 0;
	unsigned char d_type;

	/*
	 * Changing the data of the delayed item is impossible. So
	 * we needn't lock them. And we have held i_mutex of the
	 * directory, nobody can delete any directory indexes now.
	 */
	list_for_each_entry_safe(curr, next, ins_list, readdir_list) {
		list_del(&curr->readdir_list);

		if (curr->index < ctx->pos) {
			if (refcount_dec_and_test(&curr->refs))
				kfree(curr);
			continue;
		}

		ctx->pos = curr->index;

		di = (struct btrfs_dir_item *)curr->data;
		name = (char *)(di + 1);
		name_len = btrfs_stack_dir_name_len(di);

		d_type = fs_ftype_to_dtype(btrfs_dir_flags_to_ftype(di->type));
		btrfs_disk_key_to_cpu(&location, &di->location);

		over = !dir_emit(ctx, name, name_len,
				 location.objectid, d_type);

		if (refcount_dec_and_test(&curr->refs))
			kfree(curr);

		if (over)
			return 1;
		ctx->pos++;
	}
	return 0;
}

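/*
 * The next two helpers convert between the in-memory VFS inode and the
 * on-stack btrfs_inode_item kept in the delayed node: fill_stack_inode_item()
 * captures the current inode state when a delayed update is queued, and
 * btrfs_fill_inode() copies a still-pending delayed update back into a
 * freshly read inode, so an inode read from disk picks up metadata changes
 * that have not been written back to the btree yet.
 */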
static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
				  struct btrfs_inode_item *inode_item,
				  struct inode *inode)
{
	u64 flags;

	btrfs_set_stack_inode_uid(inode_item, i_uid_read(inode));
	btrfs_set_stack_inode_gid(inode_item, i_gid_read(inode));
	btrfs_set_stack_inode_size(inode_item, BTRFS_I(inode)->disk_i_size);
	btrfs_set_stack_inode_mode(inode_item, inode->i_mode);
	btrfs_set_stack_inode_nlink(inode_item, inode->i_nlink);
	btrfs_set_stack_inode_nbytes(inode_item, inode_get_bytes(inode));
	btrfs_set_stack_inode_generation(inode_item,
					 BTRFS_I(inode)->generation);
	btrfs_set_stack_inode_sequence(inode_item,
				       inode_peek_iversion(inode));
	btrfs_set_stack_inode_transid(inode_item, trans->transid);
	btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev);
	flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
					  BTRFS_I(inode)->ro_flags);
	btrfs_set_stack_inode_flags(inode_item, flags);
	btrfs_set_stack_inode_block_group(inode_item, 0);

	btrfs_set_stack_timespec_sec(&inode_item->atime,
				     inode_get_atime_sec(inode));
	btrfs_set_stack_timespec_nsec(&inode_item->atime,
				      inode_get_atime_nsec(inode));

	btrfs_set_stack_timespec_sec(&inode_item->mtime,
				     inode_get_mtime_sec(inode));
	btrfs_set_stack_timespec_nsec(&inode_item->mtime,
				      inode_get_mtime_nsec(inode));

	btrfs_set_stack_timespec_sec(&inode_item->ctime,
				     inode_get_ctime_sec(inode));
	btrfs_set_stack_timespec_nsec(&inode_item->ctime,
				      inode_get_ctime_nsec(inode));

	btrfs_set_stack_timespec_sec(&inode_item->otime, BTRFS_I(inode)->i_otime_sec);
	btrfs_set_stack_timespec_nsec(&inode_item->otime, BTRFS_I(inode)->i_otime_nsec);
}

int btrfs_fill_inode(struct inode *inode, u32 *rdev)
{
	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
	struct btrfs_delayed_node *delayed_node;
	struct btrfs_inode_item *inode_item;

	delayed_node = btrfs_get_delayed_node(BTRFS_I(inode));
	if (!delayed_node)
		return -ENOENT;

	mutex_lock(&delayed_node->mutex);
	if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
		mutex_unlock(&delayed_node->mutex);
		btrfs_release_delayed_node(delayed_node);
		return -ENOENT;
	}

	inode_item = &delayed_node->inode_item;

	i_uid_write(inode, btrfs_stack_inode_uid(inode_item));
	i_gid_write(inode, btrfs_stack_inode_gid(inode_item));
	btrfs_i_size_write(BTRFS_I(inode), btrfs_stack_inode_size(inode_item));
	btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0,
			round_up(i_size_read(inode), fs_info->sectorsize));
	inode->i_mode = btrfs_stack_inode_mode(inode_item);
int btrfs_fill_inode(struct inode *inode, u32 *rdev)
{
	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
	struct btrfs_delayed_node *delayed_node;
	struct btrfs_inode_item *inode_item;

	delayed_node = btrfs_get_delayed_node(BTRFS_I(inode));
	if (!delayed_node)
		return -ENOENT;

	mutex_lock(&delayed_node->mutex);
	if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
		mutex_unlock(&delayed_node->mutex);
		btrfs_release_delayed_node(delayed_node);
		return -ENOENT;
	}

	inode_item = &delayed_node->inode_item;

	i_uid_write(inode, btrfs_stack_inode_uid(inode_item));
	i_gid_write(inode, btrfs_stack_inode_gid(inode_item));
	btrfs_i_size_write(BTRFS_I(inode), btrfs_stack_inode_size(inode_item));
	btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0,
			round_up(i_size_read(inode), fs_info->sectorsize));
	inode->i_mode = btrfs_stack_inode_mode(inode_item);
	set_nlink(inode, btrfs_stack_inode_nlink(inode_item));
	inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item));
	BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item);
	BTRFS_I(inode)->last_trans = btrfs_stack_inode_transid(inode_item);

	inode_set_iversion_queried(inode,
				   btrfs_stack_inode_sequence(inode_item));
	inode->i_rdev = 0;
	*rdev = btrfs_stack_inode_rdev(inode_item);
	btrfs_inode_split_flags(btrfs_stack_inode_flags(inode_item),
				&BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags);

	inode_set_atime(inode, btrfs_stack_timespec_sec(&inode_item->atime),
			btrfs_stack_timespec_nsec(&inode_item->atime));

	inode_set_mtime(inode, btrfs_stack_timespec_sec(&inode_item->mtime),
			btrfs_stack_timespec_nsec(&inode_item->mtime));

	inode_set_ctime(inode, btrfs_stack_timespec_sec(&inode_item->ctime),
			btrfs_stack_timespec_nsec(&inode_item->ctime));

	BTRFS_I(inode)->i_otime_sec = btrfs_stack_timespec_sec(&inode_item->otime);
	BTRFS_I(inode)->i_otime_nsec = btrfs_stack_timespec_nsec(&inode_item->otime);

	inode->i_generation = BTRFS_I(inode)->generation;
	if (S_ISDIR(inode->i_mode))
		BTRFS_I(inode)->index_cnt = (u64)-1;

	mutex_unlock(&delayed_node->mutex);
	btrfs_release_delayed_node(delayed_node);
	return 0;
}

int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
			       struct btrfs_inode *inode)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_delayed_node *delayed_node;
	int ret = 0;

	delayed_node = btrfs_get_or_create_delayed_node(inode);
	if (IS_ERR(delayed_node))
		return PTR_ERR(delayed_node);

	mutex_lock(&delayed_node->mutex);
	if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
		fill_stack_inode_item(trans, &delayed_node->inode_item,
				      &inode->vfs_inode);
		goto release_node;
	}

	ret = btrfs_delayed_inode_reserve_metadata(trans, root, delayed_node);
	if (ret)
		goto release_node;

	fill_stack_inode_item(trans, &delayed_node->inode_item, &inode->vfs_inode);
	set_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags);
	delayed_node->count++;
	atomic_inc(&root->fs_info->delayed_root->items);
release_node:
	mutex_unlock(&delayed_node->mutex);
	btrfs_release_delayed_node(delayed_node);
	return ret;
}

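/*
 * Queue a delayed deletion of the inode's INODE_REF item.
 *
 * Instead of deleting the ref from the btree right away, set the
 * BTRFS_DELAYED_NODE_DEL_IREF flag on the inode's delayed node so the ref is
 * dropped later, when the delayed inode item is run.  As explained in the
 * comment below, this is only done for inodes with a single link
 * (i_nlink == 1), i.e. a single inode ref, so no extra metadata space is
 * reserved here.
 *
 * Returns -EAGAIN during log recovery, where delayed inode updates are not
 * allowed; callers are then expected to fall back to a direct btree deletion,
 * roughly (a sketch, not the exact caller code):
 *
 *	if (btrfs_delayed_delete_inode_ref(inode)) {
 *		// fall back: delete the INODE_REF item synchronously
 *	}
 */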
int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct btrfs_delayed_node *delayed_node;

	/*
	 * We don't do delayed inode updates during log recovery because it
	 * leads to enospc problems.  This means we also can't do delayed
	 * inode refs.
	 */
	if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
		return -EAGAIN;

	delayed_node = btrfs_get_or_create_delayed_node(inode);
	if (IS_ERR(delayed_node))
		return PTR_ERR(delayed_node);

	/*
	 * We don't reserve space for inode ref deletion because:
	 * - We ONLY do async inode ref deletion for an inode that has only
	 *   one link (i_nlink == 1), which means there is only one inode ref.
	 *   In most cases the inode ref and the inode item are in the same
	 *   leaf and we will deal with them at the same time.  Since we are
	 *   sure we will reserve the space for the inode item, it is
	 *   unnecessary to reserve space for inode ref deletion.
	 * - If the inode ref and the inode item are not in the same leaf,
	 *   we also needn't worry about an enospc problem, because we reserve
	 *   much more space for the inode update than it needs.
	 * - At the worst, we can steal some space from the global reservation.
	 *   It is very rare.
	 */
	mutex_lock(&delayed_node->mutex);
	if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags))
		goto release_node;

	set_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags);
	delayed_node->count++;
	atomic_inc(&fs_info->delayed_root->items);
release_node:
	mutex_unlock(&delayed_node->mutex);
	btrfs_release_delayed_node(delayed_node);
	return 0;
}

static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node)
{
	struct btrfs_root *root = delayed_node->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_delayed_item *curr_item, *prev_item;

	mutex_lock(&delayed_node->mutex);
	curr_item = __btrfs_first_delayed_insertion_item(delayed_node);
	while (curr_item) {
		prev_item = curr_item;
		curr_item = __btrfs_next_delayed_item(prev_item);
		btrfs_release_delayed_item(prev_item);
	}

	if (delayed_node->index_item_leaves > 0) {
		btrfs_delayed_item_release_leaves(delayed_node,
					delayed_node->index_item_leaves);
		delayed_node->index_item_leaves = 0;
	}

	curr_item = __btrfs_first_delayed_deletion_item(delayed_node);
	while (curr_item) {
		btrfs_delayed_item_release_metadata(root, curr_item);
		prev_item = curr_item;
		curr_item = __btrfs_next_delayed_item(prev_item);
		btrfs_release_delayed_item(prev_item);
	}

	btrfs_release_delayed_iref(delayed_node);

	if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
		btrfs_delayed_inode_release_metadata(fs_info, delayed_node, false);
		btrfs_release_delayed_inode(delayed_node);
	}
	mutex_unlock(&delayed_node->mutex);
}

void btrfs_kill_delayed_inode_items(struct btrfs_inode *inode)
{
	struct btrfs_delayed_node *delayed_node;

	delayed_node = btrfs_get_delayed_node(inode);
	if (!delayed_node)
		return;

	__btrfs_kill_delayed_node(delayed_node);
	btrfs_release_delayed_node(delayed_node);
}

void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
{
	unsigned long index = 0;
	struct btrfs_delayed_node *delayed_nodes[8];

	while (1) {
		struct btrfs_delayed_node *node;
		int count;

		xa_lock(&root->delayed_nodes);
		if (xa_empty(&root->delayed_nodes)) {
			xa_unlock(&root->delayed_nodes);
			return;
		}

		count = 0;
		xa_for_each_start(&root->delayed_nodes, index, node, index) {
			/*
			 * Don't increase refs in case the node is dead and
			 * about to be removed from the tree in the loop below.
			 */
			if (refcount_inc_not_zero(&node->refs)) {
				delayed_nodes[count] = node;
				count++;
			}
			if (count >= ARRAY_SIZE(delayed_nodes))
				break;
		}
		xa_unlock(&root->delayed_nodes);
		index++;

		for (int i = 0; i < count; i++) {
			__btrfs_kill_delayed_node(delayed_nodes[i]);
			btrfs_release_delayed_node(delayed_nodes[i]);
		}
	}
}

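/*
 * Drop every delayed node still queued on the fs-wide delayed root.
 *
 * Walk the delayed root's node list and kill each node in turn, releasing all
 * pending insertion/deletion items, their reserved metadata and any dirty
 * delayed inode item without writing anything to the btree.  Meant for
 * teardown/error paths (for example when a transaction is aborted), where the
 * queued delayed work has to be thrown away instead of being run.
 */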
void btrfs_destroy_delayed_inodes(struct btrfs_fs_info *fs_info)
{
	struct btrfs_delayed_node *curr_node, *prev_node;

	curr_node = btrfs_first_delayed_node(fs_info->delayed_root);
	while (curr_node) {
		__btrfs_kill_delayed_node(curr_node);

		prev_node = curr_node;
		curr_node = btrfs_next_delayed_node(curr_node);
		btrfs_release_delayed_node(prev_node);
	}
}

void btrfs_log_get_delayed_items(struct btrfs_inode *inode,
				 struct list_head *ins_list,
				 struct list_head *del_list)
{
	struct btrfs_delayed_node *node;
	struct btrfs_delayed_item *item;

	node = btrfs_get_delayed_node(inode);
	if (!node)
		return;

	mutex_lock(&node->mutex);
	item = __btrfs_first_delayed_insertion_item(node);
	while (item) {
		/*
		 * It's possible that the item is already in a log list. This
		 * can happen in case two tasks are trying to log the same
		 * directory. For example, if we have tasks A and B:
		 *
		 * Task A collected the delayed items into a log list while
		 * under the inode's log_mutex (at btrfs_log_inode()), but it
		 * only releases the items after logging the inodes they point
		 * to (if they are new inodes), which happens after unlocking
		 * the log mutex;
		 *
		 * Task B enters btrfs_log_inode() and acquires the log_mutex
		 * of the same directory inode, before task A releases the
		 * delayed items. This can happen for example when logging some
		 * inode we need to trigger logging of its parent directory, so
		 * logging two files that have the same parent directory can
		 * lead to this.
		 *
		 * If this happens, just ignore delayed items already in a log
		 * list. All the tasks logging the directory are under a log
		 * transaction and whichever finishes first can not sync the log
		 * before the other completes and leaves the log transaction.
		 */
		if (!item->logged && list_empty(&item->log_list)) {
			refcount_inc(&item->refs);
			list_add_tail(&item->log_list, ins_list);
		}
		item = __btrfs_next_delayed_item(item);
	}

	item = __btrfs_first_delayed_deletion_item(node);
	while (item) {
		/* It may be non-empty, for the same reason mentioned above. */
		if (!item->logged && list_empty(&item->log_list)) {
			refcount_inc(&item->refs);
			list_add_tail(&item->log_list, del_list);
		}
		item = __btrfs_next_delayed_item(item);
	}
	mutex_unlock(&node->mutex);

	/*
	 * We are called during inode logging, which means the inode is in use
	 * and can not be evicted before we finish logging the inode. So we never
	 * have the last reference on the delayed inode.
	 * Also, we don't use btrfs_release_delayed_node() because that would
	 * requeue the delayed inode (change its order in the list of prepared
	 * nodes) and we don't want to do such a change because we don't create
	 * or delete delayed items.
	 */
	ASSERT(refcount_read(&node->refs) > 1);
	refcount_dec(&node->refs);
}

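/*
 * Release the delayed items collected by btrfs_log_get_delayed_items().
 *
 * Mark every item on @ins_list and @del_list as logged, unlink it from its
 * log list and drop the extra reference that was taken when the item was
 * collected.  Intended to be called once the directory's delayed items have
 * been logged, while the task is still inside the same log transaction.
 */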
void btrfs_log_put_delayed_items(struct btrfs_inode *inode,
				 struct list_head *ins_list,
				 struct list_head *del_list)
{
	struct btrfs_delayed_node *node;
	struct btrfs_delayed_item *item;
	struct btrfs_delayed_item *next;

	node = btrfs_get_delayed_node(inode);
	if (!node)
		return;

	mutex_lock(&node->mutex);

	list_for_each_entry_safe(item, next, ins_list, log_list) {
		item->logged = true;
		list_del_init(&item->log_list);
		if (refcount_dec_and_test(&item->refs))
			kfree(item);
	}

	list_for_each_entry_safe(item, next, del_list, log_list) {
		item->logged = true;
		list_del_init(&item->log_list);
		if (refcount_dec_and_test(&item->refs))
			kfree(item);
	}

	mutex_unlock(&node->mutex);

	/*
	 * We are called during inode logging, which means the inode is in use
	 * and can not be evicted before we finish logging the inode. So we never
	 * have the last reference on the delayed inode.
	 * Also, we don't use btrfs_release_delayed_node() because that would
	 * requeue the delayed inode (change its order in the list of prepared
	 * nodes) and we don't want to do such a change because we don't create
	 * or delete delayed items.
	 */
	ASSERT(refcount_read(&node->refs) > 1);
	refcount_dec(&node->refs);
}