1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2011 Fujitsu. All rights reserved. 4 * Written by Miao Xie <miaox@cn.fujitsu.com> 5 */ 6 7 #include <linux/slab.h> 8 #include <linux/iversion.h> 9 #include "ctree.h" 10 #include "fs.h" 11 #include "messages.h" 12 #include "misc.h" 13 #include "delayed-inode.h" 14 #include "disk-io.h" 15 #include "transaction.h" 16 #include "qgroup.h" 17 #include "locking.h" 18 #include "inode-item.h" 19 #include "space-info.h" 20 #include "accessors.h" 21 #include "file-item.h" 22 23 #define BTRFS_DELAYED_WRITEBACK 512 24 #define BTRFS_DELAYED_BACKGROUND 128 25 #define BTRFS_DELAYED_BATCH 16 26 27 static struct kmem_cache *delayed_node_cache; 28 29 int __init btrfs_delayed_inode_init(void) 30 { 31 delayed_node_cache = KMEM_CACHE(btrfs_delayed_node, 0); 32 if (!delayed_node_cache) 33 return -ENOMEM; 34 return 0; 35 } 36 37 void __cold btrfs_delayed_inode_exit(void) 38 { 39 kmem_cache_destroy(delayed_node_cache); 40 } 41 42 void btrfs_init_delayed_root(struct btrfs_delayed_root *delayed_root) 43 { 44 atomic_set(&delayed_root->items, 0); 45 atomic_set(&delayed_root->items_seq, 0); 46 delayed_root->nodes = 0; 47 spin_lock_init(&delayed_root->lock); 48 init_waitqueue_head(&delayed_root->wait); 49 INIT_LIST_HEAD(&delayed_root->node_list); 50 INIT_LIST_HEAD(&delayed_root->prepare_list); 51 } 52 53 static inline void btrfs_init_delayed_node( 54 struct btrfs_delayed_node *delayed_node, 55 struct btrfs_root *root, u64 inode_id) 56 { 57 delayed_node->root = root; 58 delayed_node->inode_id = inode_id; 59 refcount_set(&delayed_node->refs, 0); 60 btrfs_delayed_node_ref_tracker_dir_init(delayed_node); 61 delayed_node->ins_root = RB_ROOT_CACHED; 62 delayed_node->del_root = RB_ROOT_CACHED; 63 mutex_init(&delayed_node->mutex); 64 INIT_LIST_HEAD(&delayed_node->n_list); 65 INIT_LIST_HEAD(&delayed_node->p_list); 66 } 67 68 static struct btrfs_delayed_node *btrfs_get_delayed_node( 69 struct btrfs_inode *btrfs_inode, 70 struct btrfs_ref_tracker *tracker) 71 { 72 struct btrfs_root *root = btrfs_inode->root; 73 u64 ino = btrfs_ino(btrfs_inode); 74 struct btrfs_delayed_node *node; 75 76 node = READ_ONCE(btrfs_inode->delayed_node); 77 if (node) { 78 refcount_inc(&node->refs); 79 btrfs_delayed_node_ref_tracker_alloc(node, tracker, GFP_NOFS); 80 return node; 81 } 82 83 xa_lock(&root->delayed_nodes); 84 node = xa_load(&root->delayed_nodes, ino); 85 86 if (node) { 87 if (btrfs_inode->delayed_node) { 88 refcount_inc(&node->refs); /* can be accessed */ 89 btrfs_delayed_node_ref_tracker_alloc(node, tracker, GFP_ATOMIC); 90 BUG_ON(btrfs_inode->delayed_node != node); 91 xa_unlock(&root->delayed_nodes); 92 return node; 93 } 94 95 /* 96 * It's possible that we're racing into the middle of removing 97 * this node from the xarray. In this case, the refcount 98 * was zero and it should never go back to one. Just return 99 * NULL like it was never in the xarray at all; our release 100 * function is in the process of removing it. 101 * 102 * Some implementations of refcount_inc refuse to bump the 103 * refcount once it has hit zero. If we don't do this dance 104 * here, refcount_inc() may decide to just WARN_ONCE() instead 105 * of actually bumping the refcount. 106 * 107 * If this node is properly in the xarray, we want to bump the 108 * refcount twice, once for the inode and once for this get 109 * operation. 110 */ 111 if (refcount_inc_not_zero(&node->refs)) { 112 refcount_inc(&node->refs); 113 btrfs_delayed_node_ref_tracker_alloc(node, tracker, GFP_ATOMIC); 114 btrfs_delayed_node_ref_tracker_alloc(node, &node->inode_cache_tracker, 115 GFP_ATOMIC); 116 btrfs_inode->delayed_node = node; 117 } else { 118 node = NULL; 119 } 120 121 xa_unlock(&root->delayed_nodes); 122 return node; 123 } 124 xa_unlock(&root->delayed_nodes); 125 126 return NULL; 127 } 128 129 /* 130 * Look up an existing delayed node associated with @btrfs_inode or create a new 131 * one and insert it to the delayed nodes of the root. 132 * 133 * Return the delayed node, or error pointer on failure. 134 */ 135 static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node( 136 struct btrfs_inode *btrfs_inode, 137 struct btrfs_ref_tracker *tracker) 138 { 139 struct btrfs_delayed_node *node; 140 struct btrfs_root *root = btrfs_inode->root; 141 u64 ino = btrfs_ino(btrfs_inode); 142 int ret; 143 void *ptr; 144 145 again: 146 node = btrfs_get_delayed_node(btrfs_inode, tracker); 147 if (node) 148 return node; 149 150 node = kmem_cache_zalloc(delayed_node_cache, GFP_NOFS); 151 if (!node) 152 return ERR_PTR(-ENOMEM); 153 btrfs_init_delayed_node(node, root, ino); 154 155 /* Cached in the inode and can be accessed. */ 156 refcount_set(&node->refs, 2); 157 btrfs_delayed_node_ref_tracker_alloc(node, tracker, GFP_NOFS); 158 btrfs_delayed_node_ref_tracker_alloc(node, &node->inode_cache_tracker, GFP_NOFS); 159 160 /* Allocate and reserve the slot, from now it can return a NULL from xa_load(). */ 161 ret = xa_reserve(&root->delayed_nodes, ino, GFP_NOFS); 162 if (ret == -ENOMEM) 163 goto cleanup; 164 165 xa_lock(&root->delayed_nodes); 166 ptr = xa_load(&root->delayed_nodes, ino); 167 if (ptr) { 168 /* Somebody inserted it, go back and read it. */ 169 xa_unlock(&root->delayed_nodes); 170 goto cleanup; 171 } 172 ptr = __xa_store(&root->delayed_nodes, ino, node, GFP_ATOMIC); 173 ASSERT(xa_err(ptr) != -EINVAL); 174 ASSERT(xa_err(ptr) != -ENOMEM); 175 ASSERT(ptr == NULL); 176 btrfs_inode->delayed_node = node; 177 xa_unlock(&root->delayed_nodes); 178 179 return node; 180 cleanup: 181 btrfs_delayed_node_ref_tracker_free(node, tracker); 182 btrfs_delayed_node_ref_tracker_free(node, &node->inode_cache_tracker); 183 btrfs_delayed_node_ref_tracker_dir_exit(node); 184 kmem_cache_free(delayed_node_cache, node); 185 if (ret) 186 return ERR_PTR(ret); 187 goto again; 188 } 189 190 /* 191 * Call it when holding delayed_node->mutex 192 * 193 * If mod = 1, add this node into the prepared list. 194 */ 195 static void btrfs_queue_delayed_node(struct btrfs_delayed_root *root, 196 struct btrfs_delayed_node *node, 197 int mod) 198 { 199 spin_lock(&root->lock); 200 if (test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) { 201 if (!list_empty(&node->p_list)) 202 list_move_tail(&node->p_list, &root->prepare_list); 203 else if (mod) 204 list_add_tail(&node->p_list, &root->prepare_list); 205 } else { 206 list_add_tail(&node->n_list, &root->node_list); 207 list_add_tail(&node->p_list, &root->prepare_list); 208 refcount_inc(&node->refs); /* inserted into list */ 209 btrfs_delayed_node_ref_tracker_alloc(node, &node->node_list_tracker, 210 GFP_ATOMIC); 211 root->nodes++; 212 set_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags); 213 } 214 spin_unlock(&root->lock); 215 } 216 217 /* Call it when holding delayed_node->mutex */ 218 static void btrfs_dequeue_delayed_node(struct btrfs_delayed_root *root, 219 struct btrfs_delayed_node *node) 220 { 221 spin_lock(&root->lock); 222 if (test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) { 223 root->nodes--; 224 btrfs_delayed_node_ref_tracker_free(node, &node->node_list_tracker); 225 refcount_dec(&node->refs); /* not in the list */ 226 list_del_init(&node->n_list); 227 if (!list_empty(&node->p_list)) 228 list_del_init(&node->p_list); 229 clear_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags); 230 } 231 spin_unlock(&root->lock); 232 } 233 234 static struct btrfs_delayed_node *btrfs_first_delayed_node( 235 struct btrfs_fs_info *fs_info, 236 struct btrfs_ref_tracker *tracker) 237 { 238 struct btrfs_delayed_node *node; 239 240 spin_lock(&fs_info->delayed_root.lock); 241 node = list_first_entry_or_null(&fs_info->delayed_root.node_list, 242 struct btrfs_delayed_node, n_list); 243 if (node) { 244 refcount_inc(&node->refs); 245 btrfs_delayed_node_ref_tracker_alloc(node, tracker, GFP_ATOMIC); 246 } 247 spin_unlock(&fs_info->delayed_root.lock); 248 249 return node; 250 } 251 252 static struct btrfs_delayed_node *btrfs_next_delayed_node( 253 struct btrfs_delayed_node *node, 254 struct btrfs_ref_tracker *tracker) 255 { 256 struct btrfs_delayed_root *delayed_root; 257 struct list_head *p; 258 struct btrfs_delayed_node *next = NULL; 259 260 delayed_root = &node->root->fs_info->delayed_root; 261 spin_lock(&delayed_root->lock); 262 if (!test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) { 263 /* not in the list */ 264 if (list_empty(&delayed_root->node_list)) 265 goto out; 266 p = delayed_root->node_list.next; 267 } else if (list_is_last(&node->n_list, &delayed_root->node_list)) 268 goto out; 269 else 270 p = node->n_list.next; 271 272 next = list_entry(p, struct btrfs_delayed_node, n_list); 273 refcount_inc(&next->refs); 274 btrfs_delayed_node_ref_tracker_alloc(next, tracker, GFP_ATOMIC); 275 out: 276 spin_unlock(&delayed_root->lock); 277 278 return next; 279 } 280 281 static void __btrfs_release_delayed_node( 282 struct btrfs_delayed_node *delayed_node, 283 int mod, struct btrfs_ref_tracker *tracker) 284 { 285 struct btrfs_delayed_root *delayed_root; 286 287 if (!delayed_node) 288 return; 289 290 delayed_root = &delayed_node->root->fs_info->delayed_root; 291 292 mutex_lock(&delayed_node->mutex); 293 if (delayed_node->count) 294 btrfs_queue_delayed_node(delayed_root, delayed_node, mod); 295 else 296 btrfs_dequeue_delayed_node(delayed_root, delayed_node); 297 mutex_unlock(&delayed_node->mutex); 298 299 btrfs_delayed_node_ref_tracker_free(delayed_node, tracker); 300 if (refcount_dec_and_test(&delayed_node->refs)) { 301 struct btrfs_root *root = delayed_node->root; 302 303 xa_erase(&root->delayed_nodes, delayed_node->inode_id); 304 /* 305 * Once our refcount goes to zero, nobody is allowed to bump it 306 * back up. We can delete it now. 307 */ 308 ASSERT(refcount_read(&delayed_node->refs) == 0); 309 btrfs_delayed_node_ref_tracker_dir_exit(delayed_node); 310 kmem_cache_free(delayed_node_cache, delayed_node); 311 } 312 } 313 314 static inline void btrfs_release_delayed_node(struct btrfs_delayed_node *node, 315 struct btrfs_ref_tracker *tracker) 316 { 317 __btrfs_release_delayed_node(node, 0, tracker); 318 } 319 320 static struct btrfs_delayed_node *btrfs_first_prepared_delayed_node( 321 struct btrfs_delayed_root *delayed_root, 322 struct btrfs_ref_tracker *tracker) 323 { 324 struct btrfs_delayed_node *node; 325 326 spin_lock(&delayed_root->lock); 327 node = list_first_entry_or_null(&delayed_root->prepare_list, 328 struct btrfs_delayed_node, p_list); 329 if (node) { 330 list_del_init(&node->p_list); 331 refcount_inc(&node->refs); 332 btrfs_delayed_node_ref_tracker_alloc(node, tracker, GFP_ATOMIC); 333 } 334 spin_unlock(&delayed_root->lock); 335 336 return node; 337 } 338 339 static inline void btrfs_release_prepared_delayed_node( 340 struct btrfs_delayed_node *node, 341 struct btrfs_ref_tracker *tracker) 342 { 343 __btrfs_release_delayed_node(node, 1, tracker); 344 } 345 346 static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u16 data_len, 347 struct btrfs_delayed_node *node, 348 enum btrfs_delayed_item_type type) 349 { 350 struct btrfs_delayed_item *item; 351 352 item = kmalloc_flex(*item, data, data_len, GFP_NOFS); 353 if (item) { 354 item->data_len = data_len; 355 item->type = type; 356 item->bytes_reserved = 0; 357 item->delayed_node = node; 358 RB_CLEAR_NODE(&item->rb_node); 359 INIT_LIST_HEAD(&item->log_list); 360 item->logged = false; 361 refcount_set(&item->refs, 1); 362 } 363 return item; 364 } 365 366 static int delayed_item_index_cmp(const void *key, const struct rb_node *node) 367 { 368 const u64 *index = key; 369 const struct btrfs_delayed_item *delayed_item = rb_entry(node, 370 struct btrfs_delayed_item, rb_node); 371 372 if (delayed_item->index < *index) 373 return 1; 374 else if (delayed_item->index > *index) 375 return -1; 376 377 return 0; 378 } 379 380 /* 381 * Look up the delayed item by key. 382 * 383 * @delayed_node: pointer to the delayed node 384 * @index: the dir index value to lookup (offset of a dir index key) 385 * 386 * Note: if we don't find the right item, we will return the prev item and 387 * the next item. 388 */ 389 static struct btrfs_delayed_item *__btrfs_lookup_delayed_item( 390 struct rb_root *root, 391 u64 index) 392 { 393 struct rb_node *node; 394 395 node = rb_find(&index, root, delayed_item_index_cmp); 396 return rb_entry_safe(node, struct btrfs_delayed_item, rb_node); 397 } 398 399 static int btrfs_delayed_item_cmp(const struct rb_node *new, 400 const struct rb_node *exist) 401 { 402 const struct btrfs_delayed_item *new_item = 403 rb_entry(new, struct btrfs_delayed_item, rb_node); 404 405 return delayed_item_index_cmp(&new_item->index, exist); 406 } 407 408 static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node, 409 struct btrfs_delayed_item *ins) 410 { 411 struct rb_root_cached *root; 412 struct rb_node *exist; 413 414 if (ins->type == BTRFS_DELAYED_INSERTION_ITEM) 415 root = &delayed_node->ins_root; 416 else 417 root = &delayed_node->del_root; 418 419 exist = rb_find_add_cached(&ins->rb_node, root, btrfs_delayed_item_cmp); 420 if (exist) 421 return -EEXIST; 422 423 if (ins->type == BTRFS_DELAYED_INSERTION_ITEM && 424 ins->index >= delayed_node->index_cnt) 425 delayed_node->index_cnt = ins->index + 1; 426 427 delayed_node->count++; 428 atomic_inc(&delayed_node->root->fs_info->delayed_root.items); 429 return 0; 430 } 431 432 static void finish_one_item(struct btrfs_delayed_root *delayed_root) 433 { 434 int seq = atomic_inc_return(&delayed_root->items_seq); 435 436 /* atomic_dec_return implies a barrier */ 437 if ((atomic_dec_return(&delayed_root->items) < 438 BTRFS_DELAYED_BACKGROUND || seq % BTRFS_DELAYED_BATCH == 0)) 439 cond_wake_up_nomb(&delayed_root->wait); 440 } 441 442 static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item) 443 { 444 struct btrfs_delayed_node *delayed_node = delayed_item->delayed_node; 445 struct rb_root_cached *root; 446 447 /* Not inserted, ignore it. */ 448 if (RB_EMPTY_NODE(&delayed_item->rb_node)) 449 return; 450 451 /* If it's in a rbtree, then we need to have delayed node locked. */ 452 lockdep_assert_held(&delayed_node->mutex); 453 454 if (delayed_item->type == BTRFS_DELAYED_INSERTION_ITEM) 455 root = &delayed_node->ins_root; 456 else 457 root = &delayed_node->del_root; 458 459 rb_erase_cached(&delayed_item->rb_node, root); 460 RB_CLEAR_NODE(&delayed_item->rb_node); 461 delayed_node->count--; 462 finish_one_item(&delayed_node->root->fs_info->delayed_root); 463 } 464 465 static void btrfs_release_delayed_item(struct btrfs_delayed_item *item) 466 { 467 if (item) { 468 __btrfs_remove_delayed_item(item); 469 if (refcount_dec_and_test(&item->refs)) 470 kfree(item); 471 } 472 } 473 474 static struct btrfs_delayed_item *__btrfs_first_delayed_insertion_item( 475 struct btrfs_delayed_node *delayed_node) 476 { 477 struct rb_node *p = rb_first_cached(&delayed_node->ins_root); 478 479 return rb_entry_safe(p, struct btrfs_delayed_item, rb_node); 480 } 481 482 static struct btrfs_delayed_item *__btrfs_first_delayed_deletion_item( 483 struct btrfs_delayed_node *delayed_node) 484 { 485 struct rb_node *p = rb_first_cached(&delayed_node->del_root); 486 487 return rb_entry_safe(p, struct btrfs_delayed_item, rb_node); 488 } 489 490 static struct btrfs_delayed_item *__btrfs_next_delayed_item( 491 struct btrfs_delayed_item *item) 492 { 493 struct rb_node *p = rb_next(&item->rb_node); 494 495 return rb_entry_safe(p, struct btrfs_delayed_item, rb_node); 496 } 497 498 static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, 499 struct btrfs_delayed_item *item) 500 { 501 struct btrfs_block_rsv *src_rsv; 502 struct btrfs_block_rsv *dst_rsv; 503 struct btrfs_fs_info *fs_info = trans->fs_info; 504 u64 num_bytes; 505 int ret; 506 507 if (!trans->bytes_reserved) 508 return 0; 509 510 src_rsv = trans->block_rsv; 511 dst_rsv = &fs_info->delayed_block_rsv; 512 513 num_bytes = btrfs_calc_insert_metadata_size(fs_info, 1); 514 515 /* 516 * Here we migrate space rsv from transaction rsv, since have already 517 * reserved space when starting a transaction. So no need to reserve 518 * qgroup space here. 519 */ 520 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, true); 521 if (!ret) { 522 trace_btrfs_space_reservation(fs_info, "delayed_item", 523 item->delayed_node->inode_id, 524 num_bytes, 1); 525 /* 526 * For insertions we track reserved metadata space by accounting 527 * for the number of leaves that will be used, based on the delayed 528 * node's curr_index_batch_size and index_item_leaves fields. 529 */ 530 if (item->type == BTRFS_DELAYED_DELETION_ITEM) 531 item->bytes_reserved = num_bytes; 532 } 533 534 return ret; 535 } 536 537 static void btrfs_delayed_item_release_metadata(struct btrfs_root *root, 538 struct btrfs_delayed_item *item) 539 { 540 struct btrfs_block_rsv *rsv; 541 struct btrfs_fs_info *fs_info = root->fs_info; 542 543 if (!item->bytes_reserved) 544 return; 545 546 rsv = &fs_info->delayed_block_rsv; 547 /* 548 * Check btrfs_delayed_item_reserve_metadata() to see why we don't need 549 * to release/reserve qgroup space. 550 */ 551 trace_btrfs_space_reservation(fs_info, "delayed_item", 552 item->delayed_node->inode_id, 553 item->bytes_reserved, 0); 554 btrfs_block_rsv_release(fs_info, rsv, item->bytes_reserved, NULL); 555 } 556 557 static void btrfs_delayed_item_release_leaves(struct btrfs_delayed_node *node, 558 unsigned int num_leaves) 559 { 560 struct btrfs_fs_info *fs_info = node->root->fs_info; 561 const u64 bytes = btrfs_calc_insert_metadata_size(fs_info, num_leaves); 562 563 /* There are no space reservations during log replay, bail out. */ 564 if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) 565 return; 566 567 trace_btrfs_space_reservation(fs_info, "delayed_item", node->inode_id, 568 bytes, 0); 569 btrfs_block_rsv_release(fs_info, &fs_info->delayed_block_rsv, bytes, NULL); 570 } 571 572 static int btrfs_delayed_inode_reserve_metadata( 573 struct btrfs_trans_handle *trans, 574 struct btrfs_root *root, 575 struct btrfs_delayed_node *node) 576 { 577 struct btrfs_fs_info *fs_info = root->fs_info; 578 struct btrfs_block_rsv *src_rsv; 579 struct btrfs_block_rsv *dst_rsv; 580 u64 num_bytes; 581 int ret; 582 583 src_rsv = trans->block_rsv; 584 dst_rsv = &fs_info->delayed_block_rsv; 585 586 num_bytes = btrfs_calc_metadata_size(fs_info, 1); 587 588 /* 589 * btrfs_dirty_inode will update the inode under btrfs_join_transaction 590 * which doesn't reserve space for speed. This is a problem since we 591 * still need to reserve space for this update, so try to reserve the 592 * space. 593 * 594 * Now if src_rsv == delalloc_block_rsv we'll let it just steal since 595 * we always reserve enough to update the inode item. 596 */ 597 if (!src_rsv || (!trans->bytes_reserved && 598 src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) { 599 ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true, true); 600 if (ret < 0) 601 return ret; 602 ret = btrfs_block_rsv_add(fs_info, dst_rsv, num_bytes, 603 BTRFS_RESERVE_NO_FLUSH); 604 /* NO_FLUSH could only fail with -ENOSPC */ 605 ASSERT(ret == 0 || ret == -ENOSPC); 606 if (ret) 607 btrfs_qgroup_free_meta_prealloc(root, num_bytes); 608 } else { 609 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, true); 610 } 611 612 if (!ret) { 613 trace_btrfs_space_reservation(fs_info, "delayed_inode", 614 node->inode_id, num_bytes, 1); 615 node->bytes_reserved = num_bytes; 616 } 617 618 return ret; 619 } 620 621 static void btrfs_delayed_inode_release_metadata(struct btrfs_fs_info *fs_info, 622 struct btrfs_delayed_node *node, 623 bool qgroup_free) 624 { 625 struct btrfs_block_rsv *rsv; 626 627 if (!node->bytes_reserved) 628 return; 629 630 rsv = &fs_info->delayed_block_rsv; 631 trace_btrfs_space_reservation(fs_info, "delayed_inode", 632 node->inode_id, node->bytes_reserved, 0); 633 btrfs_block_rsv_release(fs_info, rsv, node->bytes_reserved, NULL); 634 if (qgroup_free) 635 btrfs_qgroup_free_meta_prealloc(node->root, 636 node->bytes_reserved); 637 else 638 btrfs_qgroup_convert_reserved_meta(node->root, 639 node->bytes_reserved); 640 node->bytes_reserved = 0; 641 } 642 643 /* 644 * Insert a single delayed item or a batch of delayed items, as many as possible 645 * that fit in a leaf. The delayed items (dir index keys) are sorted by their key 646 * in the rbtree, and if there's a gap between two consecutive dir index items, 647 * then it means at some point we had delayed dir indexes to add but they got 648 * removed (by btrfs_delete_delayed_dir_index()) before we attempted to flush them 649 * into the subvolume tree. Dir index keys also have their offsets coming from a 650 * monotonically increasing counter, so we can't get new keys with an offset that 651 * fits within a gap between delayed dir index items. 652 */ 653 static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, 654 struct btrfs_root *root, 655 struct btrfs_path *path, 656 struct btrfs_delayed_item *first_item) 657 { 658 struct btrfs_fs_info *fs_info = root->fs_info; 659 struct btrfs_delayed_node *node = first_item->delayed_node; 660 LIST_HEAD(item_list); 661 struct btrfs_delayed_item *curr; 662 struct btrfs_delayed_item *next; 663 const int max_size = BTRFS_LEAF_DATA_SIZE(fs_info); 664 struct btrfs_item_batch batch; 665 struct btrfs_key first_key; 666 const u32 first_data_size = first_item->data_len; 667 int total_size; 668 char AUTO_KFREE(ins_data); 669 int ret; 670 bool continuous_keys_only = false; 671 672 lockdep_assert_held(&node->mutex); 673 674 /* 675 * During normal operation the delayed index offset is continuously 676 * increasing, so we can batch insert all items as there will not be any 677 * overlapping keys in the tree. 678 * 679 * The exception to this is log replay, where we may have interleaved 680 * offsets in the tree, so our batch needs to be continuous keys only in 681 * order to ensure we do not end up with out of order items in our leaf. 682 */ 683 if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) 684 continuous_keys_only = true; 685 686 /* 687 * For delayed items to insert, we track reserved metadata bytes based 688 * on the number of leaves that we will use. 689 * See btrfs_insert_delayed_dir_index() and 690 * btrfs_delayed_item_reserve_metadata()). 691 */ 692 ASSERT(first_item->bytes_reserved == 0); 693 694 list_add_tail(&first_item->tree_list, &item_list); 695 batch.total_data_size = first_data_size; 696 batch.nr = 1; 697 total_size = first_data_size + sizeof(struct btrfs_item); 698 curr = first_item; 699 700 while (true) { 701 int next_size; 702 703 next = __btrfs_next_delayed_item(curr); 704 if (!next) 705 break; 706 707 /* 708 * We cannot allow gaps in the key space if we're doing log 709 * replay. 710 */ 711 if (continuous_keys_only && (next->index != curr->index + 1)) 712 break; 713 714 ASSERT(next->bytes_reserved == 0); 715 716 next_size = next->data_len + sizeof(struct btrfs_item); 717 if (total_size + next_size > max_size) 718 break; 719 720 list_add_tail(&next->tree_list, &item_list); 721 batch.nr++; 722 total_size += next_size; 723 batch.total_data_size += next->data_len; 724 curr = next; 725 } 726 727 if (batch.nr == 1) { 728 first_key.objectid = node->inode_id; 729 first_key.type = BTRFS_DIR_INDEX_KEY; 730 first_key.offset = first_item->index; 731 batch.keys = &first_key; 732 batch.data_sizes = &first_data_size; 733 } else { 734 struct btrfs_key *ins_keys; 735 u32 *ins_sizes; 736 int i = 0; 737 738 ins_data = kmalloc_array(batch.nr, 739 sizeof(u32) + sizeof(struct btrfs_key), GFP_NOFS); 740 if (!ins_data) 741 return -ENOMEM; 742 ins_sizes = (u32 *)ins_data; 743 ins_keys = (struct btrfs_key *)(ins_data + batch.nr * sizeof(u32)); 744 batch.keys = ins_keys; 745 batch.data_sizes = ins_sizes; 746 list_for_each_entry(curr, &item_list, tree_list) { 747 ins_keys[i].objectid = node->inode_id; 748 ins_keys[i].type = BTRFS_DIR_INDEX_KEY; 749 ins_keys[i].offset = curr->index; 750 ins_sizes[i] = curr->data_len; 751 i++; 752 } 753 } 754 755 ret = btrfs_insert_empty_items(trans, root, path, &batch); 756 if (ret) 757 return ret; 758 759 list_for_each_entry(curr, &item_list, tree_list) { 760 char *data_ptr; 761 762 data_ptr = btrfs_item_ptr(path->nodes[0], path->slots[0], char); 763 write_extent_buffer(path->nodes[0], &curr->data, 764 (unsigned long)data_ptr, curr->data_len); 765 path->slots[0]++; 766 } 767 768 /* 769 * Now release our path before releasing the delayed items and their 770 * metadata reservations, so that we don't block other tasks for more 771 * time than needed. 772 */ 773 btrfs_release_path(path); 774 775 ASSERT(node->index_item_leaves > 0); 776 777 /* 778 * For normal operations we will batch an entire leaf's worth of delayed 779 * items, so if there are more items to process we can decrement 780 * index_item_leaves by 1 as we inserted 1 leaf's worth of items. 781 * 782 * However for log replay we may not have inserted an entire leaf's 783 * worth of items, we may have not had continuous items, so decrementing 784 * here would mess up the index_item_leaves accounting. For this case 785 * only clean up the accounting when there are no items left. 786 */ 787 if (next && !continuous_keys_only) { 788 /* 789 * We inserted one batch of items into a leaf a there are more 790 * items to flush in a future batch, now release one unit of 791 * metadata space from the delayed block reserve, corresponding 792 * the leaf we just flushed to. 793 */ 794 btrfs_delayed_item_release_leaves(node, 1); 795 node->index_item_leaves--; 796 } else if (!next) { 797 /* 798 * There are no more items to insert. We can have a number of 799 * reserved leaves > 1 here - this happens when many dir index 800 * items are added and then removed before they are flushed (file 801 * names with a very short life, never span a transaction). So 802 * release all remaining leaves. 803 */ 804 btrfs_delayed_item_release_leaves(node, node->index_item_leaves); 805 node->index_item_leaves = 0; 806 } 807 808 list_for_each_entry_safe(curr, next, &item_list, tree_list) { 809 list_del(&curr->tree_list); 810 btrfs_release_delayed_item(curr); 811 } 812 813 return 0; 814 } 815 816 static int btrfs_insert_delayed_items(struct btrfs_trans_handle *trans, 817 struct btrfs_path *path, 818 struct btrfs_root *root, 819 struct btrfs_delayed_node *node) 820 { 821 int ret = 0; 822 823 while (ret == 0) { 824 struct btrfs_delayed_item *curr; 825 826 mutex_lock(&node->mutex); 827 curr = __btrfs_first_delayed_insertion_item(node); 828 if (!curr) { 829 mutex_unlock(&node->mutex); 830 break; 831 } 832 ret = btrfs_insert_delayed_item(trans, root, path, curr); 833 mutex_unlock(&node->mutex); 834 } 835 836 return ret; 837 } 838 839 static int btrfs_batch_delete_items(struct btrfs_trans_handle *trans, 840 struct btrfs_root *root, 841 struct btrfs_path *path, 842 struct btrfs_delayed_item *item) 843 { 844 const u64 ino = item->delayed_node->inode_id; 845 struct btrfs_fs_info *fs_info = root->fs_info; 846 struct btrfs_delayed_item *curr, *next; 847 struct extent_buffer *leaf = path->nodes[0]; 848 LIST_HEAD(batch_list); 849 int nitems, slot, last_slot; 850 int ret; 851 u64 total_reserved_size = item->bytes_reserved; 852 853 ASSERT(leaf != NULL); 854 855 slot = path->slots[0]; 856 last_slot = btrfs_header_nritems(leaf) - 1; 857 /* 858 * Our caller always gives us a path pointing to an existing item, so 859 * this can not happen. 860 */ 861 ASSERT(slot <= last_slot); 862 if (WARN_ON(slot > last_slot)) 863 return -ENOENT; 864 865 nitems = 1; 866 curr = item; 867 list_add_tail(&curr->tree_list, &batch_list); 868 869 /* 870 * Keep checking if the next delayed item matches the next item in the 871 * leaf - if so, we can add it to the batch of items to delete from the 872 * leaf. 873 */ 874 while (slot < last_slot) { 875 struct btrfs_key key; 876 877 next = __btrfs_next_delayed_item(curr); 878 if (!next) 879 break; 880 881 slot++; 882 btrfs_item_key_to_cpu(leaf, &key, slot); 883 if (key.objectid != ino || 884 key.type != BTRFS_DIR_INDEX_KEY || 885 key.offset != next->index) 886 break; 887 nitems++; 888 curr = next; 889 list_add_tail(&curr->tree_list, &batch_list); 890 total_reserved_size += curr->bytes_reserved; 891 } 892 893 ret = btrfs_del_items(trans, root, path, path->slots[0], nitems); 894 if (ret) 895 return ret; 896 897 /* In case of BTRFS_FS_LOG_RECOVERING items won't have reserved space */ 898 if (total_reserved_size > 0) { 899 /* 900 * Check btrfs_delayed_item_reserve_metadata() to see why we 901 * don't need to release/reserve qgroup space. 902 */ 903 trace_btrfs_space_reservation(fs_info, "delayed_item", ino, 904 total_reserved_size, 0); 905 btrfs_block_rsv_release(fs_info, &fs_info->delayed_block_rsv, 906 total_reserved_size, NULL); 907 } 908 909 list_for_each_entry_safe(curr, next, &batch_list, tree_list) { 910 list_del(&curr->tree_list); 911 btrfs_release_delayed_item(curr); 912 } 913 914 return 0; 915 } 916 917 static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans, 918 struct btrfs_path *path, 919 struct btrfs_root *root, 920 struct btrfs_delayed_node *node) 921 { 922 struct btrfs_key key; 923 int ret = 0; 924 925 key.objectid = node->inode_id; 926 key.type = BTRFS_DIR_INDEX_KEY; 927 928 while (ret == 0) { 929 struct btrfs_delayed_item *item; 930 931 mutex_lock(&node->mutex); 932 item = __btrfs_first_delayed_deletion_item(node); 933 if (!item) { 934 mutex_unlock(&node->mutex); 935 break; 936 } 937 938 key.offset = item->index; 939 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 940 if (ret > 0) { 941 /* 942 * There's no matching item in the leaf. This means we 943 * have already deleted this item in a past run of the 944 * delayed items. We ignore errors when running delayed 945 * items from an async context, through a work queue job 946 * running btrfs_async_run_delayed_root(), and don't 947 * release delayed items that failed to complete. This 948 * is because we will retry later, and at transaction 949 * commit time we always run delayed items and will 950 * then deal with errors if they fail to run again. 951 * 952 * So just release delayed items for which we can't find 953 * an item in the tree, and move to the next item. 954 */ 955 btrfs_release_path(path); 956 btrfs_release_delayed_item(item); 957 ret = 0; 958 } else if (ret == 0) { 959 ret = btrfs_batch_delete_items(trans, root, path, item); 960 btrfs_release_path(path); 961 } 962 963 /* 964 * We unlock and relock on each iteration, this is to prevent 965 * blocking other tasks for too long while we are being run from 966 * the async context (work queue job). Those tasks are typically 967 * running system calls like creat/mkdir/rename/unlink/etc which 968 * need to add delayed items to this delayed node. 969 */ 970 mutex_unlock(&node->mutex); 971 } 972 973 return ret; 974 } 975 976 static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node) 977 { 978 if (delayed_node && 979 test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { 980 ASSERT(delayed_node->root); 981 clear_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags); 982 delayed_node->count--; 983 finish_one_item(&delayed_node->root->fs_info->delayed_root); 984 } 985 } 986 987 static void btrfs_release_delayed_iref(struct btrfs_delayed_node *delayed_node) 988 { 989 if (test_and_clear_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags)) { 990 ASSERT(delayed_node->root); 991 delayed_node->count--; 992 finish_one_item(&delayed_node->root->fs_info->delayed_root); 993 } 994 } 995 996 static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, 997 struct btrfs_root *root, 998 struct btrfs_path *path, 999 struct btrfs_delayed_node *node) 1000 { 1001 struct btrfs_fs_info *fs_info = root->fs_info; 1002 struct btrfs_key key; 1003 struct btrfs_inode_item *inode_item; 1004 struct extent_buffer *leaf; 1005 int mod; 1006 int ret; 1007 1008 key.objectid = node->inode_id; 1009 key.type = BTRFS_INODE_ITEM_KEY; 1010 key.offset = 0; 1011 1012 if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags)) 1013 mod = -1; 1014 else 1015 mod = 1; 1016 1017 ret = btrfs_lookup_inode(trans, root, path, &key, mod); 1018 if (ret > 0) 1019 ret = -ENOENT; 1020 if (ret < 0) { 1021 /* 1022 * If we fail to update the delayed inode we need to abort the 1023 * transaction, because we could leave the inode with the 1024 * improper counts behind. 1025 */ 1026 if (unlikely(ret != -ENOENT)) 1027 btrfs_abort_transaction(trans, ret); 1028 goto out; 1029 } 1030 1031 leaf = path->nodes[0]; 1032 inode_item = btrfs_item_ptr(leaf, path->slots[0], 1033 struct btrfs_inode_item); 1034 write_extent_buffer(leaf, &node->inode_item, (unsigned long)inode_item, 1035 sizeof(struct btrfs_inode_item)); 1036 1037 if (!test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags)) 1038 goto out; 1039 1040 /* 1041 * Now we're going to delete the INODE_REF/EXTREF, which should be the 1042 * only one ref left. Check if the next item is an INODE_REF/EXTREF. 1043 * 1044 * But if we're the last item already, release and search for the last 1045 * INODE_REF/EXTREF. 1046 */ 1047 if (path->slots[0] + 1 >= btrfs_header_nritems(leaf)) { 1048 key.objectid = node->inode_id; 1049 key.type = BTRFS_INODE_EXTREF_KEY; 1050 key.offset = (u64)-1; 1051 1052 btrfs_release_path(path); 1053 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1054 if (unlikely(ret < 0)) { 1055 btrfs_abort_transaction(trans, ret); 1056 goto err_out; 1057 } 1058 ASSERT(ret > 0); 1059 ASSERT(path->slots[0] > 0); 1060 ret = 0; 1061 path->slots[0]--; 1062 leaf = path->nodes[0]; 1063 } else { 1064 path->slots[0]++; 1065 } 1066 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1067 if (key.objectid != node->inode_id) 1068 goto out; 1069 if (key.type != BTRFS_INODE_REF_KEY && 1070 key.type != BTRFS_INODE_EXTREF_KEY) 1071 goto out; 1072 1073 /* 1074 * Delayed iref deletion is for the inode who has only one link, 1075 * so there is only one iref. The case that several irefs are 1076 * in the same item doesn't exist. 1077 */ 1078 ret = btrfs_del_item(trans, root, path); 1079 if (ret < 0) 1080 btrfs_abort_transaction(trans, ret); 1081 out: 1082 btrfs_release_delayed_iref(node); 1083 btrfs_release_path(path); 1084 err_out: 1085 btrfs_delayed_inode_release_metadata(fs_info, node, (ret < 0)); 1086 btrfs_release_delayed_inode(node); 1087 return ret; 1088 } 1089 1090 static inline int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, 1091 struct btrfs_root *root, 1092 struct btrfs_path *path, 1093 struct btrfs_delayed_node *node) 1094 { 1095 int ret; 1096 1097 mutex_lock(&node->mutex); 1098 if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &node->flags)) { 1099 mutex_unlock(&node->mutex); 1100 return 0; 1101 } 1102 1103 ret = __btrfs_update_delayed_inode(trans, root, path, node); 1104 mutex_unlock(&node->mutex); 1105 return ret; 1106 } 1107 1108 static inline int 1109 __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, 1110 struct btrfs_path *path, 1111 struct btrfs_delayed_node *node) 1112 { 1113 int ret; 1114 1115 ret = btrfs_insert_delayed_items(trans, path, node->root, node); 1116 if (ret) 1117 return ret; 1118 1119 ret = btrfs_delete_delayed_items(trans, path, node->root, node); 1120 if (ret) 1121 return ret; 1122 1123 ret = btrfs_record_root_in_trans(trans, node->root); 1124 if (ret) 1125 return ret; 1126 1127 return btrfs_update_delayed_inode(trans, node->root, path, node); 1128 } 1129 1130 /* 1131 * Called when committing the transaction. 1132 * Returns 0 on success. 1133 * Returns < 0 on error and returns with an aborted transaction with any 1134 * outstanding delayed items cleaned up. 1135 */ 1136 static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr) 1137 { 1138 struct btrfs_fs_info *fs_info = trans->fs_info; 1139 struct btrfs_delayed_node *curr_node, *prev_node; 1140 struct btrfs_ref_tracker curr_delayed_node_tracker, prev_delayed_node_tracker; 1141 struct btrfs_path *path; 1142 struct btrfs_block_rsv *block_rsv; 1143 int ret = 0; 1144 bool count = (nr > 0); 1145 1146 if (TRANS_ABORTED(trans)) 1147 return -EIO; 1148 1149 path = btrfs_alloc_path(); 1150 if (!path) 1151 return -ENOMEM; 1152 1153 block_rsv = trans->block_rsv; 1154 trans->block_rsv = &fs_info->delayed_block_rsv; 1155 1156 curr_node = btrfs_first_delayed_node(fs_info, &curr_delayed_node_tracker); 1157 while (curr_node && (!count || nr--)) { 1158 ret = __btrfs_commit_inode_delayed_items(trans, path, 1159 curr_node); 1160 if (unlikely(ret)) { 1161 btrfs_abort_transaction(trans, ret); 1162 break; 1163 } 1164 1165 prev_node = curr_node; 1166 prev_delayed_node_tracker = curr_delayed_node_tracker; 1167 curr_node = btrfs_next_delayed_node(curr_node, &curr_delayed_node_tracker); 1168 /* 1169 * See the comment below about releasing path before releasing 1170 * node. If the commit of delayed items was successful the path 1171 * should always be released, but in case of an error, it may 1172 * point to locked extent buffers (a leaf at the very least). 1173 */ 1174 ASSERT(path->nodes[0] == NULL); 1175 btrfs_release_delayed_node(prev_node, &prev_delayed_node_tracker); 1176 } 1177 1178 /* 1179 * Release the path to avoid a potential deadlock and lockdep splat when 1180 * releasing the delayed node, as that requires taking the delayed node's 1181 * mutex. If another task starts running delayed items before we take 1182 * the mutex, it will first lock the mutex and then it may try to lock 1183 * the same btree path (leaf). 1184 */ 1185 btrfs_free_path(path); 1186 1187 if (curr_node) 1188 btrfs_release_delayed_node(curr_node, &curr_delayed_node_tracker); 1189 trans->block_rsv = block_rsv; 1190 1191 return ret; 1192 } 1193 1194 int btrfs_run_delayed_items(struct btrfs_trans_handle *trans) 1195 { 1196 return __btrfs_run_delayed_items(trans, -1); 1197 } 1198 1199 int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans, int nr) 1200 { 1201 return __btrfs_run_delayed_items(trans, nr); 1202 } 1203 1204 int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, 1205 struct btrfs_inode *inode) 1206 { 1207 struct btrfs_ref_tracker delayed_node_tracker; 1208 struct btrfs_delayed_node *delayed_node = 1209 btrfs_get_delayed_node(inode, &delayed_node_tracker); 1210 BTRFS_PATH_AUTO_FREE(path); 1211 struct btrfs_block_rsv *block_rsv; 1212 int ret; 1213 1214 if (!delayed_node) 1215 return 0; 1216 1217 mutex_lock(&delayed_node->mutex); 1218 if (!delayed_node->count) { 1219 mutex_unlock(&delayed_node->mutex); 1220 btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); 1221 return 0; 1222 } 1223 mutex_unlock(&delayed_node->mutex); 1224 1225 path = btrfs_alloc_path(); 1226 if (!path) { 1227 btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); 1228 return -ENOMEM; 1229 } 1230 1231 block_rsv = trans->block_rsv; 1232 trans->block_rsv = &delayed_node->root->fs_info->delayed_block_rsv; 1233 1234 ret = __btrfs_commit_inode_delayed_items(trans, path, delayed_node); 1235 1236 btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); 1237 trans->block_rsv = block_rsv; 1238 1239 return ret; 1240 } 1241 1242 int btrfs_commit_inode_delayed_inode(struct btrfs_inode *inode) 1243 { 1244 struct btrfs_fs_info *fs_info = inode->root->fs_info; 1245 struct btrfs_trans_handle *trans; 1246 struct btrfs_ref_tracker delayed_node_tracker; 1247 struct btrfs_delayed_node *delayed_node; 1248 struct btrfs_path *path; 1249 struct btrfs_block_rsv *block_rsv; 1250 int ret; 1251 1252 delayed_node = btrfs_get_delayed_node(inode, &delayed_node_tracker); 1253 if (!delayed_node) 1254 return 0; 1255 1256 mutex_lock(&delayed_node->mutex); 1257 if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { 1258 mutex_unlock(&delayed_node->mutex); 1259 btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); 1260 return 0; 1261 } 1262 mutex_unlock(&delayed_node->mutex); 1263 1264 trans = btrfs_join_transaction(delayed_node->root); 1265 if (IS_ERR(trans)) { 1266 ret = PTR_ERR(trans); 1267 goto out; 1268 } 1269 1270 path = btrfs_alloc_path(); 1271 if (!path) { 1272 ret = -ENOMEM; 1273 goto trans_out; 1274 } 1275 1276 block_rsv = trans->block_rsv; 1277 trans->block_rsv = &fs_info->delayed_block_rsv; 1278 1279 mutex_lock(&delayed_node->mutex); 1280 if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) 1281 ret = __btrfs_update_delayed_inode(trans, delayed_node->root, 1282 path, delayed_node); 1283 else 1284 ret = 0; 1285 mutex_unlock(&delayed_node->mutex); 1286 1287 btrfs_free_path(path); 1288 trans->block_rsv = block_rsv; 1289 trans_out: 1290 btrfs_end_transaction(trans); 1291 btrfs_btree_balance_dirty(fs_info); 1292 out: 1293 btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); 1294 1295 return ret; 1296 } 1297 1298 void btrfs_remove_delayed_node(struct btrfs_inode *inode) 1299 { 1300 struct btrfs_delayed_node *delayed_node; 1301 1302 delayed_node = READ_ONCE(inode->delayed_node); 1303 if (!delayed_node) 1304 return; 1305 1306 inode->delayed_node = NULL; 1307 1308 btrfs_release_delayed_node(delayed_node, &delayed_node->inode_cache_tracker); 1309 } 1310 1311 struct btrfs_async_delayed_work { 1312 struct btrfs_delayed_root *delayed_root; 1313 int nr; 1314 struct btrfs_work work; 1315 }; 1316 1317 static void btrfs_async_run_delayed_root(struct btrfs_work *work) 1318 { 1319 struct btrfs_async_delayed_work *async_work; 1320 struct btrfs_delayed_root *delayed_root; 1321 struct btrfs_trans_handle *trans; 1322 struct btrfs_path *path; 1323 struct btrfs_delayed_node *delayed_node = NULL; 1324 struct btrfs_ref_tracker delayed_node_tracker; 1325 struct btrfs_root *root; 1326 struct btrfs_block_rsv *block_rsv; 1327 int total_done = 0; 1328 1329 async_work = container_of(work, struct btrfs_async_delayed_work, work); 1330 delayed_root = async_work->delayed_root; 1331 1332 path = btrfs_alloc_path(); 1333 if (!path) 1334 goto out; 1335 1336 do { 1337 if (atomic_read(&delayed_root->items) < 1338 BTRFS_DELAYED_BACKGROUND / 2) 1339 break; 1340 1341 delayed_node = btrfs_first_prepared_delayed_node(delayed_root, 1342 &delayed_node_tracker); 1343 if (!delayed_node) 1344 break; 1345 1346 root = delayed_node->root; 1347 1348 trans = btrfs_join_transaction(root); 1349 if (IS_ERR(trans)) { 1350 btrfs_release_path(path); 1351 btrfs_release_prepared_delayed_node(delayed_node, 1352 &delayed_node_tracker); 1353 total_done++; 1354 continue; 1355 } 1356 1357 block_rsv = trans->block_rsv; 1358 trans->block_rsv = &root->fs_info->delayed_block_rsv; 1359 1360 __btrfs_commit_inode_delayed_items(trans, path, delayed_node); 1361 1362 trans->block_rsv = block_rsv; 1363 btrfs_end_transaction(trans); 1364 btrfs_btree_balance_dirty_nodelay(root->fs_info); 1365 1366 btrfs_release_path(path); 1367 btrfs_release_prepared_delayed_node(delayed_node, 1368 &delayed_node_tracker); 1369 total_done++; 1370 1371 } while ((async_work->nr == 0 && total_done < BTRFS_DELAYED_WRITEBACK) 1372 || total_done < async_work->nr); 1373 1374 btrfs_free_path(path); 1375 out: 1376 wake_up(&delayed_root->wait); 1377 kfree(async_work); 1378 } 1379 1380 1381 static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root, 1382 struct btrfs_fs_info *fs_info, int nr) 1383 { 1384 struct btrfs_async_delayed_work *async_work; 1385 1386 async_work = kmalloc_obj(*async_work, GFP_NOFS); 1387 if (!async_work) 1388 return -ENOMEM; 1389 1390 async_work->delayed_root = delayed_root; 1391 btrfs_init_work(&async_work->work, btrfs_async_run_delayed_root, NULL); 1392 async_work->nr = nr; 1393 1394 btrfs_queue_work(fs_info->delayed_workers, &async_work->work); 1395 return 0; 1396 } 1397 1398 void btrfs_assert_delayed_root_empty(struct btrfs_fs_info *fs_info) 1399 { 1400 struct btrfs_ref_tracker delayed_node_tracker; 1401 struct btrfs_delayed_node *node; 1402 1403 node = btrfs_first_delayed_node(fs_info, &delayed_node_tracker); 1404 if (WARN_ON(node)) { 1405 btrfs_delayed_node_ref_tracker_free(node, 1406 &delayed_node_tracker); 1407 refcount_dec(&node->refs); 1408 } 1409 } 1410 1411 static bool could_end_wait(struct btrfs_delayed_root *delayed_root, int seq) 1412 { 1413 int val = atomic_read(&delayed_root->items_seq); 1414 1415 if (val < seq || val >= seq + BTRFS_DELAYED_BATCH) 1416 return true; 1417 1418 if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND) 1419 return true; 1420 1421 return false; 1422 } 1423 1424 void btrfs_balance_delayed_items(struct btrfs_fs_info *fs_info) 1425 { 1426 struct btrfs_delayed_root *delayed_root = &fs_info->delayed_root; 1427 1428 if ((atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND) || 1429 btrfs_workqueue_normal_congested(fs_info->delayed_workers)) 1430 return; 1431 1432 if (atomic_read(&delayed_root->items) >= BTRFS_DELAYED_WRITEBACK) { 1433 int seq; 1434 int ret; 1435 1436 seq = atomic_read(&delayed_root->items_seq); 1437 1438 ret = btrfs_wq_run_delayed_node(delayed_root, fs_info, 0); 1439 if (ret) 1440 return; 1441 1442 wait_event_interruptible(delayed_root->wait, 1443 could_end_wait(delayed_root, seq)); 1444 return; 1445 } 1446 1447 btrfs_wq_run_delayed_node(delayed_root, fs_info, BTRFS_DELAYED_BATCH); 1448 } 1449 1450 static void btrfs_release_dir_index_item_space(struct btrfs_trans_handle *trans) 1451 { 1452 struct btrfs_fs_info *fs_info = trans->fs_info; 1453 const u64 bytes = btrfs_calc_insert_metadata_size(fs_info, 1); 1454 1455 if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) 1456 return; 1457 1458 /* 1459 * Adding the new dir index item does not require touching another 1460 * leaf, so we can release 1 unit of metadata that was previously 1461 * reserved when starting the transaction. This applies only to 1462 * the case where we had a transaction start and excludes the 1463 * transaction join case (when replaying log trees). 1464 */ 1465 trace_btrfs_space_reservation(fs_info, "transaction", 1466 trans->transid, bytes, 0); 1467 btrfs_block_rsv_release(fs_info, trans->block_rsv, bytes, NULL); 1468 ASSERT(trans->bytes_reserved >= bytes); 1469 trans->bytes_reserved -= bytes; 1470 } 1471 1472 /* Will return 0, -ENOMEM or -EEXIST (index number collision, unexpected). */ 1473 int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, 1474 const char *name, int name_len, 1475 struct btrfs_inode *dir, 1476 const struct btrfs_disk_key *disk_key, u8 flags, 1477 u64 index) 1478 { 1479 struct btrfs_fs_info *fs_info = trans->fs_info; 1480 const unsigned int leaf_data_size = BTRFS_LEAF_DATA_SIZE(fs_info); 1481 struct btrfs_delayed_node *delayed_node; 1482 struct btrfs_ref_tracker delayed_node_tracker; 1483 struct btrfs_delayed_item *delayed_item; 1484 struct btrfs_dir_item *dir_item; 1485 bool reserve_leaf_space; 1486 u32 data_len; 1487 int ret; 1488 1489 delayed_node = btrfs_get_or_create_delayed_node(dir, &delayed_node_tracker); 1490 if (IS_ERR(delayed_node)) 1491 return PTR_ERR(delayed_node); 1492 1493 delayed_item = btrfs_alloc_delayed_item(sizeof(*dir_item) + name_len, 1494 delayed_node, 1495 BTRFS_DELAYED_INSERTION_ITEM); 1496 if (!delayed_item) { 1497 ret = -ENOMEM; 1498 goto release_node; 1499 } 1500 1501 delayed_item->index = index; 1502 1503 dir_item = (struct btrfs_dir_item *)delayed_item->data; 1504 dir_item->location = *disk_key; 1505 btrfs_set_stack_dir_transid(dir_item, trans->transid); 1506 btrfs_set_stack_dir_data_len(dir_item, 0); 1507 btrfs_set_stack_dir_name_len(dir_item, name_len); 1508 btrfs_set_stack_dir_flags(dir_item, flags); 1509 memcpy((char *)(dir_item + 1), name, name_len); 1510 1511 data_len = delayed_item->data_len + sizeof(struct btrfs_item); 1512 1513 mutex_lock(&delayed_node->mutex); 1514 1515 /* 1516 * First attempt to insert the delayed item. This is to make the error 1517 * handling path simpler in case we fail (-EEXIST). There's no risk of 1518 * any other task coming in and running the delayed item before we do 1519 * the metadata space reservation below, because we are holding the 1520 * delayed node's mutex and that mutex must also be locked before the 1521 * node's delayed items can be run. 1522 */ 1523 ret = __btrfs_add_delayed_item(delayed_node, delayed_item); 1524 if (unlikely(ret)) { 1525 btrfs_err(trans->fs_info, 1526 "error adding delayed dir index item, name: %.*s, index: %llu, root: %llu, dir: %llu, dir->index_cnt: %llu, delayed_node->index_cnt: %llu, error: %d", 1527 name_len, name, index, btrfs_root_id(delayed_node->root), 1528 delayed_node->inode_id, dir->index_cnt, 1529 delayed_node->index_cnt, ret); 1530 btrfs_release_delayed_item(delayed_item); 1531 btrfs_release_dir_index_item_space(trans); 1532 mutex_unlock(&delayed_node->mutex); 1533 goto release_node; 1534 } 1535 1536 if (delayed_node->index_item_leaves == 0 || 1537 delayed_node->curr_index_batch_size + data_len > leaf_data_size) { 1538 delayed_node->curr_index_batch_size = data_len; 1539 reserve_leaf_space = true; 1540 } else { 1541 delayed_node->curr_index_batch_size += data_len; 1542 reserve_leaf_space = false; 1543 } 1544 1545 if (reserve_leaf_space) { 1546 ret = btrfs_delayed_item_reserve_metadata(trans, delayed_item); 1547 /* 1548 * Space was reserved for a dir index item insertion when we 1549 * started the transaction, so getting a failure here should be 1550 * impossible. 1551 */ 1552 if (WARN_ON(ret)) { 1553 btrfs_release_delayed_item(delayed_item); 1554 mutex_unlock(&delayed_node->mutex); 1555 goto release_node; 1556 } 1557 1558 delayed_node->index_item_leaves++; 1559 } else { 1560 btrfs_release_dir_index_item_space(trans); 1561 } 1562 mutex_unlock(&delayed_node->mutex); 1563 1564 release_node: 1565 btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); 1566 return ret; 1567 } 1568 1569 static bool btrfs_delete_delayed_insertion_item(struct btrfs_delayed_node *node, 1570 u64 index) 1571 { 1572 struct btrfs_delayed_item *item; 1573 1574 mutex_lock(&node->mutex); 1575 item = __btrfs_lookup_delayed_item(&node->ins_root.rb_root, index); 1576 if (!item) { 1577 mutex_unlock(&node->mutex); 1578 return false; 1579 } 1580 1581 /* 1582 * For delayed items to insert, we track reserved metadata bytes based 1583 * on the number of leaves that we will use. 1584 * See btrfs_insert_delayed_dir_index() and 1585 * btrfs_delayed_item_reserve_metadata()). 1586 */ 1587 ASSERT(item->bytes_reserved == 0); 1588 ASSERT(node->index_item_leaves > 0); 1589 1590 /* 1591 * If there's only one leaf reserved, we can decrement this item from the 1592 * current batch, otherwise we can not because we don't know which leaf 1593 * it belongs to. With the current limit on delayed items, we rarely 1594 * accumulate enough dir index items to fill more than one leaf (even 1595 * when using a leaf size of 4K). 1596 */ 1597 if (node->index_item_leaves == 1) { 1598 const u32 data_len = item->data_len + sizeof(struct btrfs_item); 1599 1600 ASSERT(node->curr_index_batch_size >= data_len); 1601 node->curr_index_batch_size -= data_len; 1602 } 1603 1604 btrfs_release_delayed_item(item); 1605 1606 /* If we now have no more dir index items, we can release all leaves. */ 1607 if (RB_EMPTY_ROOT(&node->ins_root.rb_root)) { 1608 btrfs_delayed_item_release_leaves(node, node->index_item_leaves); 1609 node->index_item_leaves = 0; 1610 } 1611 1612 mutex_unlock(&node->mutex); 1613 return true; 1614 } 1615 1616 int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, 1617 struct btrfs_inode *dir, u64 index) 1618 { 1619 struct btrfs_delayed_node *node; 1620 struct btrfs_ref_tracker delayed_node_tracker; 1621 struct btrfs_delayed_item *item; 1622 int ret; 1623 1624 node = btrfs_get_or_create_delayed_node(dir, &delayed_node_tracker); 1625 if (IS_ERR(node)) 1626 return PTR_ERR(node); 1627 1628 if (btrfs_delete_delayed_insertion_item(node, index)) { 1629 ret = 0; 1630 goto end; 1631 } 1632 1633 item = btrfs_alloc_delayed_item(0, node, BTRFS_DELAYED_DELETION_ITEM); 1634 if (!item) { 1635 ret = -ENOMEM; 1636 goto end; 1637 } 1638 1639 item->index = index; 1640 1641 ret = btrfs_delayed_item_reserve_metadata(trans, item); 1642 /* 1643 * we have reserved enough space when we start a new transaction, 1644 * so reserving metadata failure is impossible. 1645 */ 1646 if (ret < 0) { 1647 btrfs_err(trans->fs_info, 1648 "metadata reservation failed for delayed dir item deletion, index: %llu, root: %llu, inode: %llu, error: %d", 1649 index, btrfs_root_id(node->root), node->inode_id, ret); 1650 btrfs_release_delayed_item(item); 1651 goto end; 1652 } 1653 1654 mutex_lock(&node->mutex); 1655 ret = __btrfs_add_delayed_item(node, item); 1656 if (unlikely(ret)) { 1657 btrfs_err(trans->fs_info, 1658 "failed to add delayed dir index item, root: %llu, inode: %llu, index: %llu, error: %d", 1659 btrfs_root_id(node->root), node->inode_id, index, ret); 1660 btrfs_delayed_item_release_metadata(dir->root, item); 1661 btrfs_release_delayed_item(item); 1662 } 1663 mutex_unlock(&node->mutex); 1664 end: 1665 btrfs_release_delayed_node(node, &delayed_node_tracker); 1666 return ret; 1667 } 1668 1669 int btrfs_inode_delayed_dir_index_count(struct btrfs_inode *inode) 1670 { 1671 struct btrfs_ref_tracker delayed_node_tracker; 1672 struct btrfs_delayed_node *delayed_node; 1673 1674 delayed_node = btrfs_get_delayed_node(inode, &delayed_node_tracker); 1675 if (!delayed_node) 1676 return -ENOENT; 1677 1678 /* 1679 * Since we have held i_mutex of this directory, it is impossible that 1680 * a new directory index is added into the delayed node and index_cnt 1681 * is updated now. So we needn't lock the delayed node. 1682 */ 1683 if (!delayed_node->index_cnt) { 1684 btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); 1685 return -EINVAL; 1686 } 1687 1688 inode->index_cnt = delayed_node->index_cnt; 1689 btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); 1690 return 0; 1691 } 1692 1693 bool btrfs_readdir_get_delayed_items(struct btrfs_inode *inode, 1694 u64 last_index, 1695 struct list_head *ins_list, 1696 struct list_head *del_list) 1697 { 1698 struct btrfs_delayed_node *delayed_node; 1699 struct btrfs_delayed_item *item; 1700 struct btrfs_ref_tracker delayed_node_tracker; 1701 1702 delayed_node = btrfs_get_delayed_node(inode, &delayed_node_tracker); 1703 if (!delayed_node) 1704 return false; 1705 1706 /* 1707 * We can only do one readdir with delayed items at a time because of 1708 * item->readdir_list. 1709 */ 1710 btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); 1711 btrfs_inode_lock(inode, 0); 1712 1713 mutex_lock(&delayed_node->mutex); 1714 item = __btrfs_first_delayed_insertion_item(delayed_node); 1715 while (item && item->index <= last_index) { 1716 refcount_inc(&item->refs); 1717 list_add_tail(&item->readdir_list, ins_list); 1718 item = __btrfs_next_delayed_item(item); 1719 } 1720 1721 item = __btrfs_first_delayed_deletion_item(delayed_node); 1722 while (item && item->index <= last_index) { 1723 refcount_inc(&item->refs); 1724 list_add_tail(&item->readdir_list, del_list); 1725 item = __btrfs_next_delayed_item(item); 1726 } 1727 mutex_unlock(&delayed_node->mutex); 1728 /* 1729 * This delayed node is still cached in the btrfs inode, so refs 1730 * must be > 1 now, and we needn't check it is going to be freed 1731 * or not. 1732 * 1733 * Besides that, this function is used to read dir, we do not 1734 * insert/delete delayed items in this period. So we also needn't 1735 * requeue or dequeue this delayed node. 1736 */ 1737 btrfs_delayed_node_ref_tracker_free(delayed_node, &delayed_node_tracker); 1738 refcount_dec(&delayed_node->refs); 1739 1740 return true; 1741 } 1742 1743 void btrfs_readdir_put_delayed_items(struct btrfs_inode *inode, 1744 struct list_head *ins_list, 1745 struct list_head *del_list) 1746 { 1747 struct btrfs_delayed_item *curr, *next; 1748 1749 list_for_each_entry_safe(curr, next, ins_list, readdir_list) { 1750 list_del(&curr->readdir_list); 1751 if (refcount_dec_and_test(&curr->refs)) 1752 kfree(curr); 1753 } 1754 1755 list_for_each_entry_safe(curr, next, del_list, readdir_list) { 1756 list_del(&curr->readdir_list); 1757 if (refcount_dec_and_test(&curr->refs)) 1758 kfree(curr); 1759 } 1760 1761 /* 1762 * The VFS is going to do up_read(), so we need to downgrade back to a 1763 * read lock. 1764 */ 1765 downgrade_write(&inode->vfs_inode.i_rwsem); 1766 } 1767 1768 bool btrfs_should_delete_dir_index(const struct list_head *del_list, u64 index) 1769 { 1770 struct btrfs_delayed_item *curr; 1771 bool ret = false; 1772 1773 list_for_each_entry(curr, del_list, readdir_list) { 1774 if (curr->index > index) 1775 break; 1776 if (curr->index == index) { 1777 ret = true; 1778 break; 1779 } 1780 } 1781 return ret; 1782 } 1783 1784 /* 1785 * Read dir info stored in the delayed tree. 1786 */ 1787 bool btrfs_readdir_delayed_dir_index(struct dir_context *ctx, 1788 const struct list_head *ins_list) 1789 { 1790 struct btrfs_dir_item *di; 1791 struct btrfs_delayed_item *curr, *next; 1792 struct btrfs_key location; 1793 char *name; 1794 int name_len; 1795 unsigned char d_type; 1796 1797 /* 1798 * Changing the data of the delayed item is impossible. So 1799 * we needn't lock them. And we have held i_mutex of the 1800 * directory, nobody can delete any directory indexes now. 1801 */ 1802 list_for_each_entry_safe(curr, next, ins_list, readdir_list) { 1803 bool over; 1804 1805 list_del(&curr->readdir_list); 1806 1807 if (curr->index < ctx->pos) { 1808 if (refcount_dec_and_test(&curr->refs)) 1809 kfree(curr); 1810 continue; 1811 } 1812 1813 ctx->pos = curr->index; 1814 1815 di = (struct btrfs_dir_item *)curr->data; 1816 name = (char *)(di + 1); 1817 name_len = btrfs_stack_dir_name_len(di); 1818 1819 d_type = fs_ftype_to_dtype(btrfs_dir_flags_to_ftype(di->type)); 1820 btrfs_disk_key_to_cpu(&location, &di->location); 1821 1822 over = !dir_emit(ctx, name, name_len, location.objectid, d_type); 1823 1824 if (refcount_dec_and_test(&curr->refs)) 1825 kfree(curr); 1826 1827 if (over) 1828 return true; 1829 ctx->pos++; 1830 } 1831 return false; 1832 } 1833 1834 static void fill_stack_inode_item(struct btrfs_trans_handle *trans, 1835 struct btrfs_inode_item *inode_item, 1836 struct btrfs_inode *inode) 1837 { 1838 struct inode *vfs_inode = &inode->vfs_inode; 1839 u64 flags; 1840 1841 btrfs_set_stack_inode_uid(inode_item, i_uid_read(vfs_inode)); 1842 btrfs_set_stack_inode_gid(inode_item, i_gid_read(vfs_inode)); 1843 btrfs_set_stack_inode_size(inode_item, inode->disk_i_size); 1844 btrfs_set_stack_inode_mode(inode_item, vfs_inode->i_mode); 1845 btrfs_set_stack_inode_nlink(inode_item, vfs_inode->i_nlink); 1846 btrfs_set_stack_inode_nbytes(inode_item, inode_get_bytes(vfs_inode)); 1847 btrfs_set_stack_inode_generation(inode_item, inode->generation); 1848 btrfs_set_stack_inode_sequence(inode_item, 1849 inode_peek_iversion(vfs_inode)); 1850 btrfs_set_stack_inode_transid(inode_item, trans->transid); 1851 btrfs_set_stack_inode_rdev(inode_item, vfs_inode->i_rdev); 1852 flags = btrfs_inode_combine_flags(inode->flags, inode->ro_flags); 1853 btrfs_set_stack_inode_flags(inode_item, flags); 1854 btrfs_set_stack_inode_block_group(inode_item, 0); 1855 1856 btrfs_set_stack_timespec_sec(&inode_item->atime, 1857 inode_get_atime_sec(vfs_inode)); 1858 btrfs_set_stack_timespec_nsec(&inode_item->atime, 1859 inode_get_atime_nsec(vfs_inode)); 1860 1861 btrfs_set_stack_timespec_sec(&inode_item->mtime, 1862 inode_get_mtime_sec(vfs_inode)); 1863 btrfs_set_stack_timespec_nsec(&inode_item->mtime, 1864 inode_get_mtime_nsec(vfs_inode)); 1865 1866 btrfs_set_stack_timespec_sec(&inode_item->ctime, 1867 inode_get_ctime_sec(vfs_inode)); 1868 btrfs_set_stack_timespec_nsec(&inode_item->ctime, 1869 inode_get_ctime_nsec(vfs_inode)); 1870 1871 btrfs_set_stack_timespec_sec(&inode_item->otime, inode->i_otime_sec); 1872 btrfs_set_stack_timespec_nsec(&inode_item->otime, inode->i_otime_nsec); 1873 } 1874 1875 int btrfs_fill_inode(struct btrfs_inode *inode, u32 *rdev) 1876 { 1877 struct btrfs_delayed_node *delayed_node; 1878 struct btrfs_ref_tracker delayed_node_tracker; 1879 struct btrfs_inode_item *inode_item; 1880 struct inode *vfs_inode = &inode->vfs_inode; 1881 1882 delayed_node = btrfs_get_delayed_node(inode, &delayed_node_tracker); 1883 if (!delayed_node) 1884 return -ENOENT; 1885 1886 mutex_lock(&delayed_node->mutex); 1887 if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { 1888 mutex_unlock(&delayed_node->mutex); 1889 btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); 1890 return -ENOENT; 1891 } 1892 1893 inode_item = &delayed_node->inode_item; 1894 1895 i_uid_write(vfs_inode, btrfs_stack_inode_uid(inode_item)); 1896 i_gid_write(vfs_inode, btrfs_stack_inode_gid(inode_item)); 1897 btrfs_i_size_write(inode, btrfs_stack_inode_size(inode_item)); 1898 vfs_inode->i_mode = btrfs_stack_inode_mode(inode_item); 1899 set_nlink(vfs_inode, btrfs_stack_inode_nlink(inode_item)); 1900 inode_set_bytes(vfs_inode, btrfs_stack_inode_nbytes(inode_item)); 1901 inode->generation = btrfs_stack_inode_generation(inode_item); 1902 inode->last_trans = btrfs_stack_inode_transid(inode_item); 1903 1904 inode_set_iversion_queried(vfs_inode, btrfs_stack_inode_sequence(inode_item)); 1905 vfs_inode->i_rdev = 0; 1906 *rdev = btrfs_stack_inode_rdev(inode_item); 1907 btrfs_inode_split_flags(btrfs_stack_inode_flags(inode_item), 1908 &inode->flags, &inode->ro_flags); 1909 1910 inode_set_atime(vfs_inode, btrfs_stack_timespec_sec(&inode_item->atime), 1911 btrfs_stack_timespec_nsec(&inode_item->atime)); 1912 1913 inode_set_mtime(vfs_inode, btrfs_stack_timespec_sec(&inode_item->mtime), 1914 btrfs_stack_timespec_nsec(&inode_item->mtime)); 1915 1916 inode_set_ctime(vfs_inode, btrfs_stack_timespec_sec(&inode_item->ctime), 1917 btrfs_stack_timespec_nsec(&inode_item->ctime)); 1918 1919 inode->i_otime_sec = btrfs_stack_timespec_sec(&inode_item->otime); 1920 inode->i_otime_nsec = btrfs_stack_timespec_nsec(&inode_item->otime); 1921 1922 vfs_inode->i_generation = inode->generation; 1923 if (S_ISDIR(vfs_inode->i_mode)) 1924 inode->index_cnt = (u64)-1; 1925 1926 mutex_unlock(&delayed_node->mutex); 1927 btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); 1928 return 0; 1929 } 1930 1931 int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, 1932 struct btrfs_inode *inode) 1933 { 1934 struct btrfs_root *root = inode->root; 1935 struct btrfs_delayed_node *delayed_node; 1936 struct btrfs_ref_tracker delayed_node_tracker; 1937 int ret = 0; 1938 1939 delayed_node = btrfs_get_or_create_delayed_node(inode, &delayed_node_tracker); 1940 if (IS_ERR(delayed_node)) 1941 return PTR_ERR(delayed_node); 1942 1943 mutex_lock(&delayed_node->mutex); 1944 if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { 1945 fill_stack_inode_item(trans, &delayed_node->inode_item, inode); 1946 goto release_node; 1947 } 1948 1949 ret = btrfs_delayed_inode_reserve_metadata(trans, root, delayed_node); 1950 if (ret) 1951 goto release_node; 1952 1953 fill_stack_inode_item(trans, &delayed_node->inode_item, inode); 1954 set_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags); 1955 delayed_node->count++; 1956 atomic_inc(&root->fs_info->delayed_root.items); 1957 release_node: 1958 mutex_unlock(&delayed_node->mutex); 1959 btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); 1960 return ret; 1961 } 1962 1963 int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode) 1964 { 1965 struct btrfs_fs_info *fs_info = inode->root->fs_info; 1966 struct btrfs_delayed_node *delayed_node; 1967 struct btrfs_ref_tracker delayed_node_tracker; 1968 1969 /* 1970 * we don't do delayed inode updates during log recovery because it 1971 * leads to enospc problems. This means we also can't do 1972 * delayed inode refs 1973 */ 1974 if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) 1975 return -EAGAIN; 1976 1977 delayed_node = btrfs_get_or_create_delayed_node(inode, &delayed_node_tracker); 1978 if (IS_ERR(delayed_node)) 1979 return PTR_ERR(delayed_node); 1980 1981 /* 1982 * We don't reserve space for inode ref deletion is because: 1983 * - We ONLY do async inode ref deletion for the inode who has only 1984 * one link(i_nlink == 1), it means there is only one inode ref. 1985 * And in most case, the inode ref and the inode item are in the 1986 * same leaf, and we will deal with them at the same time. 1987 * Since we are sure we will reserve the space for the inode item, 1988 * it is unnecessary to reserve space for inode ref deletion. 1989 * - If the inode ref and the inode item are not in the same leaf, 1990 * We also needn't worry about enospc problem, because we reserve 1991 * much more space for the inode update than it needs. 1992 * - At the worst, we can steal some space from the global reservation. 1993 * It is very rare. 1994 */ 1995 mutex_lock(&delayed_node->mutex); 1996 if (!test_and_set_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags)) { 1997 delayed_node->count++; 1998 atomic_inc(&fs_info->delayed_root.items); 1999 } 2000 mutex_unlock(&delayed_node->mutex); 2001 btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); 2002 return 0; 2003 } 2004 2005 static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node) 2006 { 2007 struct btrfs_root *root = delayed_node->root; 2008 struct btrfs_fs_info *fs_info = root->fs_info; 2009 struct btrfs_delayed_item *curr_item, *prev_item; 2010 2011 mutex_lock(&delayed_node->mutex); 2012 curr_item = __btrfs_first_delayed_insertion_item(delayed_node); 2013 while (curr_item) { 2014 prev_item = curr_item; 2015 curr_item = __btrfs_next_delayed_item(prev_item); 2016 btrfs_release_delayed_item(prev_item); 2017 } 2018 2019 if (delayed_node->index_item_leaves > 0) { 2020 btrfs_delayed_item_release_leaves(delayed_node, 2021 delayed_node->index_item_leaves); 2022 delayed_node->index_item_leaves = 0; 2023 } 2024 2025 curr_item = __btrfs_first_delayed_deletion_item(delayed_node); 2026 while (curr_item) { 2027 btrfs_delayed_item_release_metadata(root, curr_item); 2028 prev_item = curr_item; 2029 curr_item = __btrfs_next_delayed_item(prev_item); 2030 btrfs_release_delayed_item(prev_item); 2031 } 2032 2033 btrfs_release_delayed_iref(delayed_node); 2034 2035 if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { 2036 btrfs_delayed_inode_release_metadata(fs_info, delayed_node, false); 2037 btrfs_release_delayed_inode(delayed_node); 2038 } 2039 mutex_unlock(&delayed_node->mutex); 2040 } 2041 2042 void btrfs_kill_delayed_inode_items(struct btrfs_inode *inode) 2043 { 2044 struct btrfs_delayed_node *delayed_node; 2045 struct btrfs_ref_tracker delayed_node_tracker; 2046 2047 delayed_node = btrfs_get_delayed_node(inode, &delayed_node_tracker); 2048 if (!delayed_node) 2049 return; 2050 2051 __btrfs_kill_delayed_node(delayed_node); 2052 btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); 2053 } 2054 2055 void btrfs_kill_all_delayed_nodes(struct btrfs_root *root) 2056 { 2057 unsigned long index = 0; 2058 struct btrfs_delayed_node *delayed_nodes[8]; 2059 struct btrfs_ref_tracker delayed_node_trackers[8]; 2060 2061 while (1) { 2062 struct btrfs_delayed_node *node; 2063 int count; 2064 2065 xa_lock(&root->delayed_nodes); 2066 if (xa_empty(&root->delayed_nodes)) { 2067 xa_unlock(&root->delayed_nodes); 2068 return; 2069 } 2070 2071 count = 0; 2072 xa_for_each_start(&root->delayed_nodes, index, node, index) { 2073 /* 2074 * Don't increase refs in case the node is dead and 2075 * about to be removed from the tree in the loop below 2076 */ 2077 if (refcount_inc_not_zero(&node->refs)) { 2078 btrfs_delayed_node_ref_tracker_alloc(node, 2079 &delayed_node_trackers[count], 2080 GFP_ATOMIC); 2081 delayed_nodes[count] = node; 2082 count++; 2083 } 2084 if (count >= ARRAY_SIZE(delayed_nodes)) 2085 break; 2086 } 2087 xa_unlock(&root->delayed_nodes); 2088 index++; 2089 2090 for (int i = 0; i < count; i++) { 2091 __btrfs_kill_delayed_node(delayed_nodes[i]); 2092 btrfs_delayed_node_ref_tracker_dir_print(delayed_nodes[i]); 2093 btrfs_release_delayed_node(delayed_nodes[i], 2094 &delayed_node_trackers[i]); 2095 } 2096 } 2097 } 2098 2099 void btrfs_destroy_delayed_inodes(struct btrfs_fs_info *fs_info) 2100 { 2101 struct btrfs_delayed_node *curr_node, *prev_node; 2102 struct btrfs_ref_tracker curr_delayed_node_tracker, prev_delayed_node_tracker; 2103 2104 curr_node = btrfs_first_delayed_node(fs_info, &curr_delayed_node_tracker); 2105 while (curr_node) { 2106 __btrfs_kill_delayed_node(curr_node); 2107 2108 prev_node = curr_node; 2109 prev_delayed_node_tracker = curr_delayed_node_tracker; 2110 curr_node = btrfs_next_delayed_node(curr_node, &curr_delayed_node_tracker); 2111 btrfs_release_delayed_node(prev_node, &prev_delayed_node_tracker); 2112 } 2113 } 2114 2115 void btrfs_log_get_delayed_items(struct btrfs_inode *inode, 2116 struct list_head *ins_list, 2117 struct list_head *del_list) 2118 { 2119 struct btrfs_delayed_node *node; 2120 struct btrfs_delayed_item *item; 2121 struct btrfs_ref_tracker delayed_node_tracker; 2122 2123 node = btrfs_get_delayed_node(inode, &delayed_node_tracker); 2124 if (!node) 2125 return; 2126 2127 mutex_lock(&node->mutex); 2128 item = __btrfs_first_delayed_insertion_item(node); 2129 while (item) { 2130 /* 2131 * It's possible that the item is already in a log list. This 2132 * can happen in case two tasks are trying to log the same 2133 * directory. For example if we have tasks A and task B: 2134 * 2135 * Task A collected the delayed items into a log list while 2136 * under the inode's log_mutex (at btrfs_log_inode()), but it 2137 * only releases the items after logging the inodes they point 2138 * to (if they are new inodes), which happens after unlocking 2139 * the log mutex; 2140 * 2141 * Task B enters btrfs_log_inode() and acquires the log_mutex 2142 * of the same directory inode, before task B releases the 2143 * delayed items. This can happen for example when logging some 2144 * inode we need to trigger logging of its parent directory, so 2145 * logging two files that have the same parent directory can 2146 * lead to this. 2147 * 2148 * If this happens, just ignore delayed items already in a log 2149 * list. All the tasks logging the directory are under a log 2150 * transaction and whichever finishes first can not sync the log 2151 * before the other completes and leaves the log transaction. 2152 */ 2153 if (!item->logged && list_empty(&item->log_list)) { 2154 refcount_inc(&item->refs); 2155 list_add_tail(&item->log_list, ins_list); 2156 } 2157 item = __btrfs_next_delayed_item(item); 2158 } 2159 2160 item = __btrfs_first_delayed_deletion_item(node); 2161 while (item) { 2162 /* It may be non-empty, for the same reason mentioned above. */ 2163 if (!item->logged && list_empty(&item->log_list)) { 2164 refcount_inc(&item->refs); 2165 list_add_tail(&item->log_list, del_list); 2166 } 2167 item = __btrfs_next_delayed_item(item); 2168 } 2169 mutex_unlock(&node->mutex); 2170 2171 /* 2172 * We are called during inode logging, which means the inode is in use 2173 * and can not be evicted before we finish logging the inode. So we never 2174 * have the last reference on the delayed inode. 2175 * Also, we don't use btrfs_release_delayed_node() because that would 2176 * requeue the delayed inode (change its order in the list of prepared 2177 * nodes) and we don't want to do such change because we don't create or 2178 * delete delayed items. 2179 */ 2180 ASSERT(refcount_read(&node->refs) > 1); 2181 btrfs_delayed_node_ref_tracker_free(node, &delayed_node_tracker); 2182 refcount_dec(&node->refs); 2183 } 2184 2185 void btrfs_log_put_delayed_items(struct btrfs_inode *inode, 2186 struct list_head *ins_list, 2187 struct list_head *del_list) 2188 { 2189 struct btrfs_delayed_node *node; 2190 struct btrfs_delayed_item *item; 2191 struct btrfs_delayed_item *next; 2192 struct btrfs_ref_tracker delayed_node_tracker; 2193 2194 node = btrfs_get_delayed_node(inode, &delayed_node_tracker); 2195 if (!node) 2196 return; 2197 2198 mutex_lock(&node->mutex); 2199 2200 list_for_each_entry_safe(item, next, ins_list, log_list) { 2201 item->logged = true; 2202 list_del_init(&item->log_list); 2203 if (refcount_dec_and_test(&item->refs)) 2204 kfree(item); 2205 } 2206 2207 list_for_each_entry_safe(item, next, del_list, log_list) { 2208 item->logged = true; 2209 list_del_init(&item->log_list); 2210 if (refcount_dec_and_test(&item->refs)) 2211 kfree(item); 2212 } 2213 2214 mutex_unlock(&node->mutex); 2215 2216 /* 2217 * We are called during inode logging, which means the inode is in use 2218 * and can not be evicted before we finish logging the inode. So we never 2219 * have the last reference on the delayed inode. 2220 * Also, we don't use btrfs_release_delayed_node() because that would 2221 * requeue the delayed inode (change its order in the list of prepared 2222 * nodes) and we don't want to do such change because we don't create or 2223 * delete delayed items. 2224 */ 2225 ASSERT(refcount_read(&node->refs) > 1); 2226 btrfs_delayed_node_ref_tracker_free(node, &delayed_node_tracker); 2227 refcount_dec(&node->refs); 2228 } 2229