/*
 * Copyright (C) 2009 Oracle. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/sort.h>
#include "ctree.h"
#include "delayed-ref.h"
#include "transaction.h"

/*
 * delayed back reference update tracking. For subvolume trees
 * we queue up extent allocations and backref maintenance for
 * delayed processing. This avoids deep call chains where we
 * add extents in the middle of btrfs_search_slot, and it allows
 * us to buffer up frequently modified backrefs in an rb tree instead
 * of hammering updates on the extent allocation tree.
 */

/*
 * compare two delayed tree backrefs with same bytenr and type
 */
static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref2,
			  struct btrfs_delayed_tree_ref *ref1)
{
	if (ref1->node.type == BTRFS_TREE_BLOCK_REF_KEY) {
		if (ref1->root < ref2->root)
			return -1;
		if (ref1->root > ref2->root)
			return 1;
	} else {
		if (ref1->parent < ref2->parent)
			return -1;
		if (ref1->parent > ref2->parent)
			return 1;
	}
	return 0;
}

/*
 * compare two delayed data backrefs with same bytenr and type
 */
static int comp_data_refs(struct btrfs_delayed_data_ref *ref2,
			  struct btrfs_delayed_data_ref *ref1)
{
	if (ref1->node.type == BTRFS_EXTENT_DATA_REF_KEY) {
		if (ref1->root < ref2->root)
			return -1;
		if (ref1->root > ref2->root)
			return 1;
		if (ref1->objectid < ref2->objectid)
			return -1;
		if (ref1->objectid > ref2->objectid)
			return 1;
		if (ref1->offset < ref2->offset)
			return -1;
		if (ref1->offset > ref2->offset)
			return 1;
	} else {
		if (ref1->parent < ref2->parent)
			return -1;
		if (ref1->parent > ref2->parent)
			return 1;
	}
	return 0;
}

/*
 * entries in the rb tree are ordered by the byte number of the extent,
 * type of the delayed backrefs and content of delayed backrefs.
 */
static int comp_entry(struct btrfs_delayed_ref_node *ref2,
		      struct btrfs_delayed_ref_node *ref1)
{
	if (ref1->bytenr < ref2->bytenr)
		return -1;
	if (ref1->bytenr > ref2->bytenr)
		return 1;
	if (ref1->is_head && ref2->is_head)
		return 0;
	if (ref2->is_head)
		return -1;
	if (ref1->is_head)
		return 1;
	if (ref1->type < ref2->type)
		return -1;
	if (ref1->type > ref2->type)
		return 1;
	if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY ||
	    ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) {
		return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2),
				      btrfs_delayed_node_to_tree_ref(ref1));
	} else if (ref1->type == BTRFS_EXTENT_DATA_REF_KEY ||
		   ref1->type == BTRFS_SHARED_DATA_REF_KEY) {
		return comp_data_refs(btrfs_delayed_node_to_data_ref(ref2),
				      btrfs_delayed_node_to_data_ref(ref1));
	}
	BUG();
	return 0;
}
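/*
 * Worked example of the resulting ordering (hypothetical values, not
 * from the original file): every node for bytenr 4096 sorts before
 * any node for bytenr 8192. Within one bytenr, the head node sorts
 * after all of the individual ref updates (see the is_head checks
 * above), so walking forward with rb_next() from the first entry of
 * a bytenr group visits every queued update and then its head. Note
 * the argument order: tree_insert() below calls comp_entry(entry, ins),
 * so a negative return means the node being inserted sorts before the
 * existing entry and the walk descends left.
 */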
/*
 * insert a new ref into the rbtree. This returns any existing refs
 * for the same (bytenr,parent) tuple, or NULL if the new node was properly
 * inserted.
 */
static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root,
						  struct rb_node *node)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent_node = NULL;
	struct btrfs_delayed_ref_node *entry;
	struct btrfs_delayed_ref_node *ins;
	int cmp;

	ins = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
	while (*p) {
		parent_node = *p;
		entry = rb_entry(parent_node, struct btrfs_delayed_ref_node,
				 rb_node);

		cmp = comp_entry(entry, ins);
		if (cmp < 0)
			p = &(*p)->rb_left;
		else if (cmp > 0)
			p = &(*p)->rb_right;
		else
			return entry;
	}

	rb_link_node(node, parent_node, p);
	rb_insert_color(node, root);
	return NULL;
}

/*
 * find a head entry based on bytenr. This returns the delayed ref
 * head if it was able to find one, or NULL if nothing was in that spot
 */
static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root,
						    u64 bytenr,
						    struct btrfs_delayed_ref_node **last)
{
	struct rb_node *n = root->rb_node;
	struct btrfs_delayed_ref_node *entry;
	int cmp;

	while (n) {
		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
		WARN_ON(!entry->in_tree);
		if (last)
			*last = entry;

		if (bytenr < entry->bytenr)
			cmp = -1;
		else if (bytenr > entry->bytenr)
			cmp = 1;
		else if (!btrfs_delayed_ref_is_head(entry))
			cmp = 1;
		else
			cmp = 0;

		if (cmp < 0)
			n = n->rb_left;
		else if (cmp > 0)
			n = n->rb_right;
		else
			return entry;
	}
	return NULL;
}

int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
			   struct btrfs_delayed_ref_head *head)
{
	struct btrfs_delayed_ref_root *delayed_refs;

	delayed_refs = &trans->transaction->delayed_refs;
	assert_spin_locked(&delayed_refs->lock);
	if (mutex_trylock(&head->mutex))
		return 0;

	atomic_inc(&head->node.refs);
	spin_unlock(&delayed_refs->lock);

	mutex_lock(&head->mutex);
	spin_lock(&delayed_refs->lock);
	if (!head->node.in_tree) {
		mutex_unlock(&head->mutex);
		btrfs_put_delayed_ref(&head->node);
		return -EAGAIN;
	}
	btrfs_put_delayed_ref(&head->node);
	return 0;
}
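/*
 * Sketch of the expected calling pattern for btrfs_delayed_ref_lock()
 * (illustrative, not part of the original file). The caller holds
 * delayed_refs->lock on entry; because the spinlock is dropped while
 * sleeping on the mutex, the head may be run and removed from the
 * tree in the meantime, which is reported as -EAGAIN:
 *
 *	ret = btrfs_delayed_ref_lock(trans, head);
 *	if (ret == -EAGAIN) {
 *		// the head was run while we slept on the mutex;
 *		// look it up again (or give up)
 *		goto search_again;
 *	}
 *
 * The temporary reference taken with atomic_inc() keeps the head's
 * memory valid across the unlocked window even if the head is removed
 * from the tree.
 */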
int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
			   struct list_head *cluster, u64 start)
{
	int count = 0;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct rb_node *node;
	struct btrfs_delayed_ref_node *ref;
	struct btrfs_delayed_ref_head *head;

	delayed_refs = &trans->transaction->delayed_refs;
	if (start == 0) {
		node = rb_first(&delayed_refs->root);
	} else {
		ref = NULL;
		find_ref_head(&delayed_refs->root, start, &ref);
		if (ref) {
			struct btrfs_delayed_ref_node *tmp;

			node = rb_prev(&ref->rb_node);
			while (node) {
				tmp = rb_entry(node,
					       struct btrfs_delayed_ref_node,
					       rb_node);
				if (tmp->bytenr < start)
					break;
				ref = tmp;
				node = rb_prev(&ref->rb_node);
			}
			node = &ref->rb_node;
		} else
			node = rb_first(&delayed_refs->root);
	}
again:
	while (node && count < 32) {
		ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
		if (btrfs_delayed_ref_is_head(ref)) {
			head = btrfs_delayed_node_to_head(ref);
			if (list_empty(&head->cluster)) {
				list_add_tail(&head->cluster, cluster);
				delayed_refs->run_delayed_start =
					head->node.bytenr;
				count++;

				WARN_ON(delayed_refs->num_heads_ready == 0);
				delayed_refs->num_heads_ready--;
			} else if (count) {
				/* the goal of the clustering is to find extents
				 * that are likely to end up in the same extent
				 * leaf on disk. So, we don't want them spread
				 * all over the tree. Stop now if we've hit
				 * a head that was already in use
				 */
				break;
			}
		}
		node = rb_next(node);
	}
	if (count) {
		return 0;
	} else if (start) {
		/*
		 * we've gone to the end of the rbtree without finding any
		 * clusters. start from the beginning and try again
		 */
		start = 0;
		node = rb_first(&delayed_refs->root);
		goto again;
	}
	return 1;
}

/*
 * helper function to update an extent delayed ref in the
 * rbtree. existing and update must both have the same
 * bytenr and parent
 *
 * This may free existing if the update cancels out whatever
 * operation it was doing.
 */
static noinline void
update_existing_ref(struct btrfs_trans_handle *trans,
		    struct btrfs_delayed_ref_root *delayed_refs,
		    struct btrfs_delayed_ref_node *existing,
		    struct btrfs_delayed_ref_node *update)
{
	if (update->action != existing->action) {
		/*
		 * this is effectively undoing either an add or a
		 * drop. We decrement the ref_mod, and if it goes
		 * down to zero we just delete the entry without
		 * ever changing the extent allocation tree.
		 */
		existing->ref_mod--;
		if (existing->ref_mod == 0) {
			rb_erase(&existing->rb_node,
				 &delayed_refs->root);
			existing->in_tree = 0;
			btrfs_put_delayed_ref(existing);
			delayed_refs->num_entries--;
			if (trans->delayed_ref_updates)
				trans->delayed_ref_updates--;
		} else {
			WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY ||
				existing->type == BTRFS_SHARED_BLOCK_REF_KEY);
		}
	} else {
		WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY ||
			existing->type == BTRFS_SHARED_BLOCK_REF_KEY);
		/*
		 * the action on the existing ref matches
		 * the action on the ref we're trying to add.
		 * Bump the ref_mod by one so the backref that
		 * is eventually added/removed has the correct
		 * reference count
		 */
		existing->ref_mod += update->ref_mod;
	}
}
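/*
 * Worked example for update_existing_ref() (hypothetical sequence,
 * not from the original file): queue a BTRFS_ADD_DELAYED_REF for a
 * data extent (the new node starts with ref_mod == 1), then queue a
 * BTRFS_DROP_DELAYED_REF for the same bytenr and parent before the
 * first ref is run. The actions differ, so ref_mod drops to 0, the
 * node is erased from the rbtree, and the extent allocation tree is
 * never touched for this add/drop pair.
 */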
/*
 * helper function to update the accounting in the head ref
 * existing and update must have the same bytenr
 */
static noinline void
update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
			 struct btrfs_delayed_ref_node *update)
{
	struct btrfs_delayed_ref_head *existing_ref;
	struct btrfs_delayed_ref_head *ref;

	existing_ref = btrfs_delayed_node_to_head(existing);
	ref = btrfs_delayed_node_to_head(update);
	BUG_ON(existing_ref->is_data != ref->is_data);

	if (ref->must_insert_reserved) {
		/* if the extent was freed and then
		 * reallocated before the delayed ref
		 * entries were processed, we can end up
		 * with an existing head ref without
		 * the must_insert_reserved flag set.
		 * Set it again here
		 */
		existing_ref->must_insert_reserved = ref->must_insert_reserved;

		/*
		 * update the num_bytes so we make sure the accounting
		 * is done correctly
		 */
		existing->num_bytes = update->num_bytes;

	}

	if (ref->extent_op) {
		if (!existing_ref->extent_op) {
			existing_ref->extent_op = ref->extent_op;
		} else {
			if (ref->extent_op->update_key) {
				memcpy(&existing_ref->extent_op->key,
				       &ref->extent_op->key,
				       sizeof(ref->extent_op->key));
				existing_ref->extent_op->update_key = 1;
			}
			if (ref->extent_op->update_flags) {
				existing_ref->extent_op->flags_to_set |=
					ref->extent_op->flags_to_set;
				existing_ref->extent_op->update_flags = 1;
			}
			kfree(ref->extent_op);
		}
	}
	/*
	 * update the reference mod on the head to reflect this new operation
	 */
	existing->ref_mod += update->ref_mod;
}

/*
 * helper function to actually insert a head node into the rbtree.
 * this does all the dirty work in terms of maintaining the correct
 * overall modification count.
 */
static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans,
					 struct btrfs_delayed_ref_node *ref,
					 u64 bytenr, u64 num_bytes,
					 int action, int is_data)
{
	struct btrfs_delayed_ref_node *existing;
	struct btrfs_delayed_ref_head *head_ref = NULL;
	struct btrfs_delayed_ref_root *delayed_refs;
	int count_mod = 1;
	int must_insert_reserved = 0;

	/*
	 * the head node stores the sum of all the mods, so dropping a ref
	 * should drop the sum in the head node by one.
	 */
	if (action == BTRFS_UPDATE_DELAYED_HEAD)
		count_mod = 0;
	else if (action == BTRFS_DROP_DELAYED_REF)
		count_mod = -1;

	/*
	 * BTRFS_ADD_DELAYED_EXTENT means that we need to update
	 * the reserved accounting when the extent is finally added, or
	 * if a later modification deletes the delayed ref without ever
	 * inserting the extent into the extent allocation tree.
	 * ref->must_insert_reserved is the flag used to record
	 * that accounting mods are required.
	 *
	 * Once we record must_insert_reserved, switch the action to
	 * BTRFS_ADD_DELAYED_REF because other special casing is not required.
	 */
	if (action == BTRFS_ADD_DELAYED_EXTENT)
		must_insert_reserved = 1;
	else
		must_insert_reserved = 0;

	delayed_refs = &trans->transaction->delayed_refs;

	/* first set the basic ref node struct up */
	atomic_set(&ref->refs, 1);
	ref->bytenr = bytenr;
	ref->num_bytes = num_bytes;
	ref->ref_mod = count_mod;
	ref->type = 0;
	ref->action = 0;
	ref->is_head = 1;
	ref->in_tree = 1;

	head_ref = btrfs_delayed_node_to_head(ref);
	head_ref->must_insert_reserved = must_insert_reserved;
	head_ref->is_data = is_data;

	INIT_LIST_HEAD(&head_ref->cluster);
	mutex_init(&head_ref->mutex);

	trace_btrfs_delayed_ref_head(ref, head_ref, action);

	existing = tree_insert(&delayed_refs->root, &ref->rb_node);

	if (existing) {
		update_existing_head_ref(existing, ref);
		/*
		 * we've updated the existing ref, free the newly
		 * allocated ref
		 */
		kfree(ref);
	} else {
		delayed_refs->num_heads++;
		delayed_refs->num_heads_ready++;
		delayed_refs->num_entries++;
		trans->delayed_ref_updates++;
	}
	return 0;
}
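/*
 * Worked example of the head accounting (hypothetical sequence): a
 * BTRFS_ADD_DELAYED_EXTENT followed by one BTRFS_ADD_DELAYED_REF and
 * one BTRFS_DROP_DELAYED_REF on the same bytenr leaves the head with
 * ref_mod == 1 + 1 - 1 == 1, with must_insert_reserved set by the
 * first insert. A BTRFS_UPDATE_DELAYED_HEAD contributes 0, since it
 * only attaches an extent_op and does not change the reference count.
 */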
/*
 * helper to insert a delayed tree ref into the rbtree.
 */
static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans,
					 struct btrfs_delayed_ref_node *ref,
					 u64 bytenr, u64 num_bytes, u64 parent,
					 u64 ref_root, int level, int action)
{
	struct btrfs_delayed_ref_node *existing;
	struct btrfs_delayed_tree_ref *full_ref;
	struct btrfs_delayed_ref_root *delayed_refs;

	if (action == BTRFS_ADD_DELAYED_EXTENT)
		action = BTRFS_ADD_DELAYED_REF;

	delayed_refs = &trans->transaction->delayed_refs;

	/* first set the basic ref node struct up */
	atomic_set(&ref->refs, 1);
	ref->bytenr = bytenr;
	ref->num_bytes = num_bytes;
	ref->ref_mod = 1;
	ref->action = action;
	ref->is_head = 0;
	ref->in_tree = 1;

	full_ref = btrfs_delayed_node_to_tree_ref(ref);
	if (parent) {
		full_ref->parent = parent;
		ref->type = BTRFS_SHARED_BLOCK_REF_KEY;
	} else {
		full_ref->root = ref_root;
		ref->type = BTRFS_TREE_BLOCK_REF_KEY;
	}
	full_ref->level = level;

	trace_btrfs_delayed_tree_ref(ref, full_ref, action);

	existing = tree_insert(&delayed_refs->root, &ref->rb_node);

	if (existing) {
		update_existing_ref(trans, delayed_refs, existing, ref);
		/*
		 * we've updated the existing ref, free the newly
		 * allocated ref
		 */
		kfree(ref);
	} else {
		delayed_refs->num_entries++;
		trans->delayed_ref_updates++;
	}
	return 0;
}

/*
 * helper to insert a delayed data ref into the rbtree.
 */
static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans,
					 struct btrfs_delayed_ref_node *ref,
					 u64 bytenr, u64 num_bytes, u64 parent,
					 u64 ref_root, u64 owner, u64 offset,
					 int action)
{
	struct btrfs_delayed_ref_node *existing;
	struct btrfs_delayed_data_ref *full_ref;
	struct btrfs_delayed_ref_root *delayed_refs;

	if (action == BTRFS_ADD_DELAYED_EXTENT)
		action = BTRFS_ADD_DELAYED_REF;

	delayed_refs = &trans->transaction->delayed_refs;

	/* first set the basic ref node struct up */
	atomic_set(&ref->refs, 1);
	ref->bytenr = bytenr;
	ref->num_bytes = num_bytes;
	ref->ref_mod = 1;
	ref->action = action;
	ref->is_head = 0;
	ref->in_tree = 1;

	full_ref = btrfs_delayed_node_to_data_ref(ref);
	if (parent) {
		full_ref->parent = parent;
		ref->type = BTRFS_SHARED_DATA_REF_KEY;
	} else {
		full_ref->root = ref_root;
		ref->type = BTRFS_EXTENT_DATA_REF_KEY;
	}
	full_ref->objectid = owner;
	full_ref->offset = offset;

	trace_btrfs_delayed_data_ref(ref, full_ref, action);

	existing = tree_insert(&delayed_refs->root, &ref->rb_node);

	if (existing) {
		update_existing_ref(trans, delayed_refs, existing, ref);
		/*
		 * we've updated the existing ref, free the newly
		 * allocated ref
		 */
		kfree(ref);
	} else {
		delayed_refs->num_entries++;
		trans->delayed_ref_updates++;
	}
	return 0;
}
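/*
 * In both helpers above, a nonzero parent selects the shared backref
 * flavor (BTRFS_SHARED_BLOCK_REF_KEY / BTRFS_SHARED_DATA_REF_KEY,
 * keyed on the parent block's bytenr), while parent == 0 selects the
 * full backref keyed on the owning root (BTRFS_TREE_BLOCK_REF_KEY /
 * BTRFS_EXTENT_DATA_REF_KEY). The type feeds into comp_entry(), so
 * the two flavors for the same extent never merge with each other.
 */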
/*
 * add a delayed tree ref. This does all of the accounting required
 * to make sure the delayed ref is eventually processed before this
 * transaction commits.
 */
int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
			       u64 bytenr, u64 num_bytes, u64 parent,
			       u64 ref_root, int level, int action,
			       struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_delayed_tree_ref *ref;
	struct btrfs_delayed_ref_head *head_ref;
	struct btrfs_delayed_ref_root *delayed_refs;
	int ret;

	BUG_ON(extent_op && extent_op->is_data);
	ref = kmalloc(sizeof(*ref), GFP_NOFS);
	if (!ref)
		return -ENOMEM;

	head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
	if (!head_ref) {
		kfree(ref);
		return -ENOMEM;
	}

	head_ref->extent_op = extent_op;

	delayed_refs = &trans->transaction->delayed_refs;
	spin_lock(&delayed_refs->lock);

	/*
	 * insert both the head node and the new ref without dropping
	 * the spin lock
	 */
	ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes,
				   action, 0);
	BUG_ON(ret);

	ret = add_delayed_tree_ref(trans, &ref->node, bytenr, num_bytes,
				   parent, ref_root, level, action);
	BUG_ON(ret);
	spin_unlock(&delayed_refs->lock);
	return 0;
}

/*
 * add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref.
 */
int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
			       u64 bytenr, u64 num_bytes,
			       u64 parent, u64 ref_root,
			       u64 owner, u64 offset, int action,
			       struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_delayed_data_ref *ref;
	struct btrfs_delayed_ref_head *head_ref;
	struct btrfs_delayed_ref_root *delayed_refs;
	int ret;

	BUG_ON(extent_op && !extent_op->is_data);
	ref = kmalloc(sizeof(*ref), GFP_NOFS);
	if (!ref)
		return -ENOMEM;

	head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
	if (!head_ref) {
		kfree(ref);
		return -ENOMEM;
	}

	head_ref->extent_op = extent_op;

	delayed_refs = &trans->transaction->delayed_refs;
	spin_lock(&delayed_refs->lock);

	/*
	 * insert both the head node and the new ref without dropping
	 * the spin lock
	 */
	ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes,
				   action, 1);
	BUG_ON(ret);

	ret = add_delayed_data_ref(trans, &ref->node, bytenr, num_bytes,
				   parent, ref_root, owner, offset, action);
	BUG_ON(ret);
	spin_unlock(&delayed_refs->lock);
	return 0;
}

int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
				u64 bytenr, u64 num_bytes,
				struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_delayed_ref_head *head_ref;
	struct btrfs_delayed_ref_root *delayed_refs;
	int ret;

	head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
	if (!head_ref)
		return -ENOMEM;

	head_ref->extent_op = extent_op;

	delayed_refs = &trans->transaction->delayed_refs;
	spin_lock(&delayed_refs->lock);

	ret = add_delayed_ref_head(trans, &head_ref->node, bytenr,
				   num_bytes, BTRFS_UPDATE_DELAYED_HEAD,
				   extent_op->is_data);
	BUG_ON(ret);

	spin_unlock(&delayed_refs->lock);
	return 0;
}
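/*
 * Hedged usage sketch for btrfs_add_delayed_extent_op() (illustrative
 * only; the field names come from this file, the surrounding caller
 * is hypothetical). A caller that only wants to set flags on an
 * extent item allocates the op and hands ownership to the delayed ref
 * code, which may kfree() it in update_existing_head_ref() when
 * merging into an existing head:
 *
 *	extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
 *	if (!extent_op)
 *		return -ENOMEM;
 *	extent_op->flags_to_set = flags;
 *	extent_op->update_flags = 1;
 *	extent_op->update_key = 0;
 *	extent_op->is_data = is_data;
 *	ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes,
 *					  extent_op);
 */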
/*
 * this does a simple search for the head node for a given extent.
 * It must be called with the delayed ref spinlock held, and it returns
 * the head node if one was found, or NULL if not.
 */
struct btrfs_delayed_ref_head *
btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
{
	struct btrfs_delayed_ref_node *ref;
	struct btrfs_delayed_ref_root *delayed_refs;

	delayed_refs = &trans->transaction->delayed_refs;
	ref = find_ref_head(&delayed_refs->root, bytenr, NULL);
	if (ref)
		return btrfs_delayed_node_to_head(ref);
	return NULL;
}
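/*
 * Hedged usage sketch (illustrative, not part of the original file):
 * a caller takes delayed_refs->lock, looks up the head, and pins it
 * with btrfs_delayed_ref_lock() before dropping the spinlock:
 *
 *	spin_lock(&delayed_refs->lock);
 *	head = btrfs_find_delayed_ref_head(trans, bytenr);
 *	if (head && !btrfs_delayed_ref_lock(trans, head)) {
 *		// head->mutex is now held; safe to inspect the
 *		// updates queued against this extent
 *	}
 *	spin_unlock(&delayed_refs->lock);
 */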