1 /* -*- mode: c; c-basic-offset: 8; -*- 2 * vim: noexpandtab sw=8 ts=8 sts=0: 3 * 4 * refcounttree.c 5 * 6 * Copyright (C) 2009 Oracle. All rights reserved. 7 * 8 * This program is free software; you can redistribute it and/or 9 * modify it under the terms of the GNU General Public 10 * License version 2 as published by the Free Software Foundation. 11 * 12 * This program is distributed in the hope that it will be useful, 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 * General Public License for more details. 16 */ 17 18 #include <linux/sort.h> 19 #define MLOG_MASK_PREFIX ML_REFCOUNT 20 #include <cluster/masklog.h> 21 #include "ocfs2.h" 22 #include "inode.h" 23 #include "alloc.h" 24 #include "suballoc.h" 25 #include "journal.h" 26 #include "uptodate.h" 27 #include "super.h" 28 #include "buffer_head_io.h" 29 #include "blockcheck.h" 30 #include "refcounttree.h" 31 #include "sysfile.h" 32 #include "dlmglue.h" 33 #include "extent_map.h" 34 #include "aops.h" 35 #include "xattr.h" 36 #include "namei.h" 37 38 #include <linux/bio.h> 39 #include <linux/blkdev.h> 40 #include <linux/gfp.h> 41 #include <linux/slab.h> 42 #include <linux/writeback.h> 43 #include <linux/pagevec.h> 44 #include <linux/swap.h> 45 #include <linux/security.h> 46 #include <linux/fsnotify.h> 47 #include <linux/quotaops.h> 48 #include <linux/namei.h> 49 #include <linux/mount.h> 50 51 struct ocfs2_cow_context { 52 struct inode *inode; 53 u32 cow_start; 54 u32 cow_len; 55 struct ocfs2_extent_tree data_et; 56 struct ocfs2_refcount_tree *ref_tree; 57 struct buffer_head *ref_root_bh; 58 struct ocfs2_alloc_context *meta_ac; 59 struct ocfs2_alloc_context *data_ac; 60 struct ocfs2_cached_dealloc_ctxt dealloc; 61 void *cow_object; 62 struct ocfs2_post_refcount *post_refcount; 63 int extra_credits; 64 int (*get_clusters)(struct ocfs2_cow_context *context, 65 u32 v_cluster, u32 *p_cluster, 66 u32 *num_clusters, 67 unsigned int *extent_flags); 68 int (*cow_duplicate_clusters)(handle_t *handle, 69 struct ocfs2_cow_context *context, 70 u32 cpos, u32 old_cluster, 71 u32 new_cluster, u32 new_len); 72 }; 73 74 static inline struct ocfs2_refcount_tree * 75 cache_info_to_refcount(struct ocfs2_caching_info *ci) 76 { 77 return container_of(ci, struct ocfs2_refcount_tree, rf_ci); 78 } 79 80 static int ocfs2_validate_refcount_block(struct super_block *sb, 81 struct buffer_head *bh) 82 { 83 int rc; 84 struct ocfs2_refcount_block *rb = 85 (struct ocfs2_refcount_block *)bh->b_data; 86 87 mlog(0, "Validating refcount block %llu\n", 88 (unsigned long long)bh->b_blocknr); 89 90 BUG_ON(!buffer_uptodate(bh)); 91 92 /* 93 * If the ecc fails, we return the error but otherwise 94 * leave the filesystem running. We know any error is 95 * local to this block. 96 */ 97 rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &rb->rf_check); 98 if (rc) { 99 mlog(ML_ERROR, "Checksum failed for refcount block %llu\n", 100 (unsigned long long)bh->b_blocknr); 101 return rc; 102 } 103 104 105 if (!OCFS2_IS_VALID_REFCOUNT_BLOCK(rb)) { 106 ocfs2_error(sb, 107 "Refcount block #%llu has bad signature %.*s", 108 (unsigned long long)bh->b_blocknr, 7, 109 rb->rf_signature); 110 return -EINVAL; 111 } 112 113 if (le64_to_cpu(rb->rf_blkno) != bh->b_blocknr) { 114 ocfs2_error(sb, 115 "Refcount block #%llu has an invalid rf_blkno " 116 "of %llu", 117 (unsigned long long)bh->b_blocknr, 118 (unsigned long long)le64_to_cpu(rb->rf_blkno)); 119 return -EINVAL; 120 } 121 122 if (le32_to_cpu(rb->rf_fs_generation) != OCFS2_SB(sb)->fs_generation) { 123 ocfs2_error(sb, 124 "Refcount block #%llu has an invalid " 125 "rf_fs_generation of #%u", 126 (unsigned long long)bh->b_blocknr, 127 le32_to_cpu(rb->rf_fs_generation)); 128 return -EINVAL; 129 } 130 131 return 0; 132 } 133 134 static int ocfs2_read_refcount_block(struct ocfs2_caching_info *ci, 135 u64 rb_blkno, 136 struct buffer_head **bh) 137 { 138 int rc; 139 struct buffer_head *tmp = *bh; 140 141 rc = ocfs2_read_block(ci, rb_blkno, &tmp, 142 ocfs2_validate_refcount_block); 143 144 /* If ocfs2_read_block() got us a new bh, pass it up. */ 145 if (!rc && !*bh) 146 *bh = tmp; 147 148 return rc; 149 } 150 151 static u64 ocfs2_refcount_cache_owner(struct ocfs2_caching_info *ci) 152 { 153 struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci); 154 155 return rf->rf_blkno; 156 } 157 158 static struct super_block * 159 ocfs2_refcount_cache_get_super(struct ocfs2_caching_info *ci) 160 { 161 struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci); 162 163 return rf->rf_sb; 164 } 165 166 static void ocfs2_refcount_cache_lock(struct ocfs2_caching_info *ci) 167 { 168 struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci); 169 170 spin_lock(&rf->rf_lock); 171 } 172 173 static void ocfs2_refcount_cache_unlock(struct ocfs2_caching_info *ci) 174 { 175 struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci); 176 177 spin_unlock(&rf->rf_lock); 178 } 179 180 static void ocfs2_refcount_cache_io_lock(struct ocfs2_caching_info *ci) 181 { 182 struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci); 183 184 mutex_lock(&rf->rf_io_mutex); 185 } 186 187 static void ocfs2_refcount_cache_io_unlock(struct ocfs2_caching_info *ci) 188 { 189 struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci); 190 191 mutex_unlock(&rf->rf_io_mutex); 192 } 193 194 static const struct ocfs2_caching_operations ocfs2_refcount_caching_ops = { 195 .co_owner = ocfs2_refcount_cache_owner, 196 .co_get_super = ocfs2_refcount_cache_get_super, 197 .co_cache_lock = ocfs2_refcount_cache_lock, 198 .co_cache_unlock = ocfs2_refcount_cache_unlock, 199 .co_io_lock = ocfs2_refcount_cache_io_lock, 200 .co_io_unlock = ocfs2_refcount_cache_io_unlock, 201 }; 202 203 static struct ocfs2_refcount_tree * 204 ocfs2_find_refcount_tree(struct ocfs2_super *osb, u64 blkno) 205 { 206 struct rb_node *n = osb->osb_rf_lock_tree.rb_node; 207 struct ocfs2_refcount_tree *tree = NULL; 208 209 while (n) { 210 tree = rb_entry(n, struct ocfs2_refcount_tree, rf_node); 211 212 if (blkno < tree->rf_blkno) 213 n = n->rb_left; 214 else if (blkno > tree->rf_blkno) 215 n = n->rb_right; 216 else 217 return tree; 218 } 219 220 return NULL; 221 } 222 223 /* osb_lock is already locked. */ 224 static void ocfs2_insert_refcount_tree(struct ocfs2_super *osb, 225 struct ocfs2_refcount_tree *new) 226 { 227 u64 rf_blkno = new->rf_blkno; 228 struct rb_node *parent = NULL; 229 struct rb_node **p = &osb->osb_rf_lock_tree.rb_node; 230 struct ocfs2_refcount_tree *tmp; 231 232 while (*p) { 233 parent = *p; 234 235 tmp = rb_entry(parent, struct ocfs2_refcount_tree, 236 rf_node); 237 238 if (rf_blkno < tmp->rf_blkno) 239 p = &(*p)->rb_left; 240 else if (rf_blkno > tmp->rf_blkno) 241 p = &(*p)->rb_right; 242 else { 243 /* This should never happen! */ 244 mlog(ML_ERROR, "Duplicate refcount block %llu found!\n", 245 (unsigned long long)rf_blkno); 246 BUG(); 247 } 248 } 249 250 rb_link_node(&new->rf_node, parent, p); 251 rb_insert_color(&new->rf_node, &osb->osb_rf_lock_tree); 252 } 253 254 static void ocfs2_free_refcount_tree(struct ocfs2_refcount_tree *tree) 255 { 256 ocfs2_metadata_cache_exit(&tree->rf_ci); 257 ocfs2_simple_drop_lockres(OCFS2_SB(tree->rf_sb), &tree->rf_lockres); 258 ocfs2_lock_res_free(&tree->rf_lockres); 259 kfree(tree); 260 } 261 262 static inline void 263 ocfs2_erase_refcount_tree_from_list_no_lock(struct ocfs2_super *osb, 264 struct ocfs2_refcount_tree *tree) 265 { 266 rb_erase(&tree->rf_node, &osb->osb_rf_lock_tree); 267 if (osb->osb_ref_tree_lru && osb->osb_ref_tree_lru == tree) 268 osb->osb_ref_tree_lru = NULL; 269 } 270 271 static void ocfs2_erase_refcount_tree_from_list(struct ocfs2_super *osb, 272 struct ocfs2_refcount_tree *tree) 273 { 274 spin_lock(&osb->osb_lock); 275 ocfs2_erase_refcount_tree_from_list_no_lock(osb, tree); 276 spin_unlock(&osb->osb_lock); 277 } 278 279 static void ocfs2_kref_remove_refcount_tree(struct kref *kref) 280 { 281 struct ocfs2_refcount_tree *tree = 282 container_of(kref, struct ocfs2_refcount_tree, rf_getcnt); 283 284 ocfs2_free_refcount_tree(tree); 285 } 286 287 static inline void 288 ocfs2_refcount_tree_get(struct ocfs2_refcount_tree *tree) 289 { 290 kref_get(&tree->rf_getcnt); 291 } 292 293 static inline void 294 ocfs2_refcount_tree_put(struct ocfs2_refcount_tree *tree) 295 { 296 kref_put(&tree->rf_getcnt, ocfs2_kref_remove_refcount_tree); 297 } 298 299 static inline void ocfs2_init_refcount_tree_ci(struct ocfs2_refcount_tree *new, 300 struct super_block *sb) 301 { 302 ocfs2_metadata_cache_init(&new->rf_ci, &ocfs2_refcount_caching_ops); 303 mutex_init(&new->rf_io_mutex); 304 new->rf_sb = sb; 305 spin_lock_init(&new->rf_lock); 306 } 307 308 static inline void ocfs2_init_refcount_tree_lock(struct ocfs2_super *osb, 309 struct ocfs2_refcount_tree *new, 310 u64 rf_blkno, u32 generation) 311 { 312 init_rwsem(&new->rf_sem); 313 ocfs2_refcount_lock_res_init(&new->rf_lockres, osb, 314 rf_blkno, generation); 315 } 316 317 static struct ocfs2_refcount_tree* 318 ocfs2_allocate_refcount_tree(struct ocfs2_super *osb, u64 rf_blkno) 319 { 320 struct ocfs2_refcount_tree *new; 321 322 new = kzalloc(sizeof(struct ocfs2_refcount_tree), GFP_NOFS); 323 if (!new) 324 return NULL; 325 326 new->rf_blkno = rf_blkno; 327 kref_init(&new->rf_getcnt); 328 ocfs2_init_refcount_tree_ci(new, osb->sb); 329 330 return new; 331 } 332 333 static int ocfs2_get_refcount_tree(struct ocfs2_super *osb, u64 rf_blkno, 334 struct ocfs2_refcount_tree **ret_tree) 335 { 336 int ret = 0; 337 struct ocfs2_refcount_tree *tree, *new = NULL; 338 struct buffer_head *ref_root_bh = NULL; 339 struct ocfs2_refcount_block *ref_rb; 340 341 spin_lock(&osb->osb_lock); 342 if (osb->osb_ref_tree_lru && 343 osb->osb_ref_tree_lru->rf_blkno == rf_blkno) 344 tree = osb->osb_ref_tree_lru; 345 else 346 tree = ocfs2_find_refcount_tree(osb, rf_blkno); 347 if (tree) 348 goto out; 349 350 spin_unlock(&osb->osb_lock); 351 352 new = ocfs2_allocate_refcount_tree(osb, rf_blkno); 353 if (!new) { 354 ret = -ENOMEM; 355 mlog_errno(ret); 356 return ret; 357 } 358 /* 359 * We need the generation to create the refcount tree lock and since 360 * it isn't changed during the tree modification, we are safe here to 361 * read without protection. 362 * We also have to purge the cache after we create the lock since the 363 * refcount block may have the stale data. It can only be trusted when 364 * we hold the refcount lock. 365 */ 366 ret = ocfs2_read_refcount_block(&new->rf_ci, rf_blkno, &ref_root_bh); 367 if (ret) { 368 mlog_errno(ret); 369 ocfs2_metadata_cache_exit(&new->rf_ci); 370 kfree(new); 371 return ret; 372 } 373 374 ref_rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; 375 new->rf_generation = le32_to_cpu(ref_rb->rf_generation); 376 ocfs2_init_refcount_tree_lock(osb, new, rf_blkno, 377 new->rf_generation); 378 ocfs2_metadata_cache_purge(&new->rf_ci); 379 380 spin_lock(&osb->osb_lock); 381 tree = ocfs2_find_refcount_tree(osb, rf_blkno); 382 if (tree) 383 goto out; 384 385 ocfs2_insert_refcount_tree(osb, new); 386 387 tree = new; 388 new = NULL; 389 390 out: 391 *ret_tree = tree; 392 393 osb->osb_ref_tree_lru = tree; 394 395 spin_unlock(&osb->osb_lock); 396 397 if (new) 398 ocfs2_free_refcount_tree(new); 399 400 brelse(ref_root_bh); 401 return ret; 402 } 403 404 static int ocfs2_get_refcount_block(struct inode *inode, u64 *ref_blkno) 405 { 406 int ret; 407 struct buffer_head *di_bh = NULL; 408 struct ocfs2_dinode *di; 409 410 ret = ocfs2_read_inode_block(inode, &di_bh); 411 if (ret) { 412 mlog_errno(ret); 413 goto out; 414 } 415 416 BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); 417 418 di = (struct ocfs2_dinode *)di_bh->b_data; 419 *ref_blkno = le64_to_cpu(di->i_refcount_loc); 420 brelse(di_bh); 421 out: 422 return ret; 423 } 424 425 static int __ocfs2_lock_refcount_tree(struct ocfs2_super *osb, 426 struct ocfs2_refcount_tree *tree, int rw) 427 { 428 int ret; 429 430 ret = ocfs2_refcount_lock(tree, rw); 431 if (ret) { 432 mlog_errno(ret); 433 goto out; 434 } 435 436 if (rw) 437 down_write(&tree->rf_sem); 438 else 439 down_read(&tree->rf_sem); 440 441 out: 442 return ret; 443 } 444 445 /* 446 * Lock the refcount tree pointed by ref_blkno and return the tree. 447 * In most case, we lock the tree and read the refcount block. 448 * So read it here if the caller really needs it. 449 * 450 * If the tree has been re-created by other node, it will free the 451 * old one and re-create it. 452 */ 453 int ocfs2_lock_refcount_tree(struct ocfs2_super *osb, 454 u64 ref_blkno, int rw, 455 struct ocfs2_refcount_tree **ret_tree, 456 struct buffer_head **ref_bh) 457 { 458 int ret, delete_tree = 0; 459 struct ocfs2_refcount_tree *tree = NULL; 460 struct buffer_head *ref_root_bh = NULL; 461 struct ocfs2_refcount_block *rb; 462 463 again: 464 ret = ocfs2_get_refcount_tree(osb, ref_blkno, &tree); 465 if (ret) { 466 mlog_errno(ret); 467 return ret; 468 } 469 470 ocfs2_refcount_tree_get(tree); 471 472 ret = __ocfs2_lock_refcount_tree(osb, tree, rw); 473 if (ret) { 474 mlog_errno(ret); 475 ocfs2_refcount_tree_put(tree); 476 goto out; 477 } 478 479 ret = ocfs2_read_refcount_block(&tree->rf_ci, tree->rf_blkno, 480 &ref_root_bh); 481 if (ret) { 482 mlog_errno(ret); 483 ocfs2_unlock_refcount_tree(osb, tree, rw); 484 ocfs2_refcount_tree_put(tree); 485 goto out; 486 } 487 488 rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; 489 /* 490 * If the refcount block has been freed and re-created, we may need 491 * to recreate the refcount tree also. 492 * 493 * Here we just remove the tree from the rb-tree, and the last 494 * kref holder will unlock and delete this refcount_tree. 495 * Then we goto "again" and ocfs2_get_refcount_tree will create 496 * the new refcount tree for us. 497 */ 498 if (tree->rf_generation != le32_to_cpu(rb->rf_generation)) { 499 if (!tree->rf_removed) { 500 ocfs2_erase_refcount_tree_from_list(osb, tree); 501 tree->rf_removed = 1; 502 delete_tree = 1; 503 } 504 505 ocfs2_unlock_refcount_tree(osb, tree, rw); 506 /* 507 * We get an extra reference when we create the refcount 508 * tree, so another put will destroy it. 509 */ 510 if (delete_tree) 511 ocfs2_refcount_tree_put(tree); 512 brelse(ref_root_bh); 513 ref_root_bh = NULL; 514 goto again; 515 } 516 517 *ret_tree = tree; 518 if (ref_bh) { 519 *ref_bh = ref_root_bh; 520 ref_root_bh = NULL; 521 } 522 out: 523 brelse(ref_root_bh); 524 return ret; 525 } 526 527 void ocfs2_unlock_refcount_tree(struct ocfs2_super *osb, 528 struct ocfs2_refcount_tree *tree, int rw) 529 { 530 if (rw) 531 up_write(&tree->rf_sem); 532 else 533 up_read(&tree->rf_sem); 534 535 ocfs2_refcount_unlock(tree, rw); 536 ocfs2_refcount_tree_put(tree); 537 } 538 539 void ocfs2_purge_refcount_trees(struct ocfs2_super *osb) 540 { 541 struct rb_node *node; 542 struct ocfs2_refcount_tree *tree; 543 struct rb_root *root = &osb->osb_rf_lock_tree; 544 545 while ((node = rb_last(root)) != NULL) { 546 tree = rb_entry(node, struct ocfs2_refcount_tree, rf_node); 547 548 mlog(0, "Purge tree %llu\n", 549 (unsigned long long) tree->rf_blkno); 550 551 rb_erase(&tree->rf_node, root); 552 ocfs2_free_refcount_tree(tree); 553 } 554 } 555 556 /* 557 * Create a refcount tree for an inode. 558 * We take for granted that the inode is already locked. 559 */ 560 static int ocfs2_create_refcount_tree(struct inode *inode, 561 struct buffer_head *di_bh) 562 { 563 int ret; 564 handle_t *handle = NULL; 565 struct ocfs2_alloc_context *meta_ac = NULL; 566 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 567 struct ocfs2_inode_info *oi = OCFS2_I(inode); 568 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 569 struct buffer_head *new_bh = NULL; 570 struct ocfs2_refcount_block *rb; 571 struct ocfs2_refcount_tree *new_tree = NULL, *tree = NULL; 572 u16 suballoc_bit_start; 573 u32 num_got; 574 u64 first_blkno; 575 576 BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL); 577 578 mlog(0, "create tree for inode %lu\n", inode->i_ino); 579 580 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac); 581 if (ret) { 582 mlog_errno(ret); 583 goto out; 584 } 585 586 handle = ocfs2_start_trans(osb, OCFS2_REFCOUNT_TREE_CREATE_CREDITS); 587 if (IS_ERR(handle)) { 588 ret = PTR_ERR(handle); 589 mlog_errno(ret); 590 goto out; 591 } 592 593 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, 594 OCFS2_JOURNAL_ACCESS_WRITE); 595 if (ret) { 596 mlog_errno(ret); 597 goto out_commit; 598 } 599 600 ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1, 601 &suballoc_bit_start, &num_got, 602 &first_blkno); 603 if (ret) { 604 mlog_errno(ret); 605 goto out_commit; 606 } 607 608 new_tree = ocfs2_allocate_refcount_tree(osb, first_blkno); 609 if (!new_tree) { 610 ret = -ENOMEM; 611 mlog_errno(ret); 612 goto out_commit; 613 } 614 615 new_bh = sb_getblk(inode->i_sb, first_blkno); 616 ocfs2_set_new_buffer_uptodate(&new_tree->rf_ci, new_bh); 617 618 ret = ocfs2_journal_access_rb(handle, &new_tree->rf_ci, new_bh, 619 OCFS2_JOURNAL_ACCESS_CREATE); 620 if (ret) { 621 mlog_errno(ret); 622 goto out_commit; 623 } 624 625 /* Initialize ocfs2_refcount_block. */ 626 rb = (struct ocfs2_refcount_block *)new_bh->b_data; 627 memset(rb, 0, inode->i_sb->s_blocksize); 628 strcpy((void *)rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE); 629 rb->rf_suballoc_slot = cpu_to_le16(osb->slot_num); 630 rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); 631 rb->rf_fs_generation = cpu_to_le32(osb->fs_generation); 632 rb->rf_blkno = cpu_to_le64(first_blkno); 633 rb->rf_count = cpu_to_le32(1); 634 rb->rf_records.rl_count = 635 cpu_to_le16(ocfs2_refcount_recs_per_rb(osb->sb)); 636 spin_lock(&osb->osb_lock); 637 rb->rf_generation = osb->s_next_generation++; 638 spin_unlock(&osb->osb_lock); 639 640 ocfs2_journal_dirty(handle, new_bh); 641 642 spin_lock(&oi->ip_lock); 643 oi->ip_dyn_features |= OCFS2_HAS_REFCOUNT_FL; 644 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); 645 di->i_refcount_loc = cpu_to_le64(first_blkno); 646 spin_unlock(&oi->ip_lock); 647 648 mlog(0, "created tree for inode %lu, refblock %llu\n", 649 inode->i_ino, (unsigned long long)first_blkno); 650 651 ocfs2_journal_dirty(handle, di_bh); 652 653 /* 654 * We have to init the tree lock here since it will use 655 * the generation number to create it. 656 */ 657 new_tree->rf_generation = le32_to_cpu(rb->rf_generation); 658 ocfs2_init_refcount_tree_lock(osb, new_tree, first_blkno, 659 new_tree->rf_generation); 660 661 spin_lock(&osb->osb_lock); 662 tree = ocfs2_find_refcount_tree(osb, first_blkno); 663 664 /* 665 * We've just created a new refcount tree in this block. If 666 * we found a refcount tree on the ocfs2_super, it must be 667 * one we just deleted. We free the old tree before 668 * inserting the new tree. 669 */ 670 BUG_ON(tree && tree->rf_generation == new_tree->rf_generation); 671 if (tree) 672 ocfs2_erase_refcount_tree_from_list_no_lock(osb, tree); 673 ocfs2_insert_refcount_tree(osb, new_tree); 674 spin_unlock(&osb->osb_lock); 675 new_tree = NULL; 676 if (tree) 677 ocfs2_refcount_tree_put(tree); 678 679 out_commit: 680 ocfs2_commit_trans(osb, handle); 681 682 out: 683 if (new_tree) { 684 ocfs2_metadata_cache_exit(&new_tree->rf_ci); 685 kfree(new_tree); 686 } 687 688 brelse(new_bh); 689 if (meta_ac) 690 ocfs2_free_alloc_context(meta_ac); 691 692 return ret; 693 } 694 695 static int ocfs2_set_refcount_tree(struct inode *inode, 696 struct buffer_head *di_bh, 697 u64 refcount_loc) 698 { 699 int ret; 700 handle_t *handle = NULL; 701 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 702 struct ocfs2_inode_info *oi = OCFS2_I(inode); 703 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 704 struct buffer_head *ref_root_bh = NULL; 705 struct ocfs2_refcount_block *rb; 706 struct ocfs2_refcount_tree *ref_tree; 707 708 BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL); 709 710 ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1, 711 &ref_tree, &ref_root_bh); 712 if (ret) { 713 mlog_errno(ret); 714 return ret; 715 } 716 717 handle = ocfs2_start_trans(osb, OCFS2_REFCOUNT_TREE_SET_CREDITS); 718 if (IS_ERR(handle)) { 719 ret = PTR_ERR(handle); 720 mlog_errno(ret); 721 goto out; 722 } 723 724 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, 725 OCFS2_JOURNAL_ACCESS_WRITE); 726 if (ret) { 727 mlog_errno(ret); 728 goto out_commit; 729 } 730 731 ret = ocfs2_journal_access_rb(handle, &ref_tree->rf_ci, ref_root_bh, 732 OCFS2_JOURNAL_ACCESS_WRITE); 733 if (ret) { 734 mlog_errno(ret); 735 goto out_commit; 736 } 737 738 rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; 739 le32_add_cpu(&rb->rf_count, 1); 740 741 ocfs2_journal_dirty(handle, ref_root_bh); 742 743 spin_lock(&oi->ip_lock); 744 oi->ip_dyn_features |= OCFS2_HAS_REFCOUNT_FL; 745 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); 746 di->i_refcount_loc = cpu_to_le64(refcount_loc); 747 spin_unlock(&oi->ip_lock); 748 ocfs2_journal_dirty(handle, di_bh); 749 750 out_commit: 751 ocfs2_commit_trans(osb, handle); 752 out: 753 ocfs2_unlock_refcount_tree(osb, ref_tree, 1); 754 brelse(ref_root_bh); 755 756 return ret; 757 } 758 759 int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh) 760 { 761 int ret, delete_tree = 0; 762 handle_t *handle = NULL; 763 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 764 struct ocfs2_inode_info *oi = OCFS2_I(inode); 765 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 766 struct ocfs2_refcount_block *rb; 767 struct inode *alloc_inode = NULL; 768 struct buffer_head *alloc_bh = NULL; 769 struct buffer_head *blk_bh = NULL; 770 struct ocfs2_refcount_tree *ref_tree; 771 int credits = OCFS2_REFCOUNT_TREE_REMOVE_CREDITS; 772 u64 blk = 0, bg_blkno = 0, ref_blkno = le64_to_cpu(di->i_refcount_loc); 773 u16 bit = 0; 774 775 if (!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)) 776 return 0; 777 778 BUG_ON(!ref_blkno); 779 ret = ocfs2_lock_refcount_tree(osb, ref_blkno, 1, &ref_tree, &blk_bh); 780 if (ret) { 781 mlog_errno(ret); 782 return ret; 783 } 784 785 rb = (struct ocfs2_refcount_block *)blk_bh->b_data; 786 787 /* 788 * If we are the last user, we need to free the block. 789 * So lock the allocator ahead. 790 */ 791 if (le32_to_cpu(rb->rf_count) == 1) { 792 blk = le64_to_cpu(rb->rf_blkno); 793 bit = le16_to_cpu(rb->rf_suballoc_bit); 794 bg_blkno = ocfs2_which_suballoc_group(blk, bit); 795 796 alloc_inode = ocfs2_get_system_file_inode(osb, 797 EXTENT_ALLOC_SYSTEM_INODE, 798 le16_to_cpu(rb->rf_suballoc_slot)); 799 if (!alloc_inode) { 800 ret = -ENOMEM; 801 mlog_errno(ret); 802 goto out; 803 } 804 mutex_lock(&alloc_inode->i_mutex); 805 806 ret = ocfs2_inode_lock(alloc_inode, &alloc_bh, 1); 807 if (ret) { 808 mlog_errno(ret); 809 goto out_mutex; 810 } 811 812 credits += OCFS2_SUBALLOC_FREE; 813 } 814 815 handle = ocfs2_start_trans(osb, credits); 816 if (IS_ERR(handle)) { 817 ret = PTR_ERR(handle); 818 mlog_errno(ret); 819 goto out_unlock; 820 } 821 822 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, 823 OCFS2_JOURNAL_ACCESS_WRITE); 824 if (ret) { 825 mlog_errno(ret); 826 goto out_commit; 827 } 828 829 ret = ocfs2_journal_access_rb(handle, &ref_tree->rf_ci, blk_bh, 830 OCFS2_JOURNAL_ACCESS_WRITE); 831 if (ret) { 832 mlog_errno(ret); 833 goto out_commit; 834 } 835 836 spin_lock(&oi->ip_lock); 837 oi->ip_dyn_features &= ~OCFS2_HAS_REFCOUNT_FL; 838 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); 839 di->i_refcount_loc = 0; 840 spin_unlock(&oi->ip_lock); 841 ocfs2_journal_dirty(handle, di_bh); 842 843 le32_add_cpu(&rb->rf_count , -1); 844 ocfs2_journal_dirty(handle, blk_bh); 845 846 if (!rb->rf_count) { 847 delete_tree = 1; 848 ocfs2_erase_refcount_tree_from_list(osb, ref_tree); 849 ret = ocfs2_free_suballoc_bits(handle, alloc_inode, 850 alloc_bh, bit, bg_blkno, 1); 851 if (ret) 852 mlog_errno(ret); 853 } 854 855 out_commit: 856 ocfs2_commit_trans(osb, handle); 857 out_unlock: 858 if (alloc_inode) { 859 ocfs2_inode_unlock(alloc_inode, 1); 860 brelse(alloc_bh); 861 } 862 out_mutex: 863 if (alloc_inode) { 864 mutex_unlock(&alloc_inode->i_mutex); 865 iput(alloc_inode); 866 } 867 out: 868 ocfs2_unlock_refcount_tree(osb, ref_tree, 1); 869 if (delete_tree) 870 ocfs2_refcount_tree_put(ref_tree); 871 brelse(blk_bh); 872 873 return ret; 874 } 875 876 static void ocfs2_find_refcount_rec_in_rl(struct ocfs2_caching_info *ci, 877 struct buffer_head *ref_leaf_bh, 878 u64 cpos, unsigned int len, 879 struct ocfs2_refcount_rec *ret_rec, 880 int *index) 881 { 882 int i = 0; 883 struct ocfs2_refcount_block *rb = 884 (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; 885 struct ocfs2_refcount_rec *rec = NULL; 886 887 for (; i < le16_to_cpu(rb->rf_records.rl_used); i++) { 888 rec = &rb->rf_records.rl_recs[i]; 889 890 if (le64_to_cpu(rec->r_cpos) + 891 le32_to_cpu(rec->r_clusters) <= cpos) 892 continue; 893 else if (le64_to_cpu(rec->r_cpos) > cpos) 894 break; 895 896 /* ok, cpos fail in this rec. Just return. */ 897 if (ret_rec) 898 *ret_rec = *rec; 899 goto out; 900 } 901 902 if (ret_rec) { 903 /* We meet with a hole here, so fake the rec. */ 904 ret_rec->r_cpos = cpu_to_le64(cpos); 905 ret_rec->r_refcount = 0; 906 if (i < le16_to_cpu(rb->rf_records.rl_used) && 907 le64_to_cpu(rec->r_cpos) < cpos + len) 908 ret_rec->r_clusters = 909 cpu_to_le32(le64_to_cpu(rec->r_cpos) - cpos); 910 else 911 ret_rec->r_clusters = cpu_to_le32(len); 912 } 913 914 out: 915 *index = i; 916 } 917 918 /* 919 * Try to remove refcount tree. The mechanism is: 920 * 1) Check whether i_clusters == 0, if no, exit. 921 * 2) check whether we have i_xattr_loc in dinode. if yes, exit. 922 * 3) Check whether we have inline xattr stored outside, if yes, exit. 923 * 4) Remove the tree. 924 */ 925 int ocfs2_try_remove_refcount_tree(struct inode *inode, 926 struct buffer_head *di_bh) 927 { 928 int ret; 929 struct ocfs2_inode_info *oi = OCFS2_I(inode); 930 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 931 932 down_write(&oi->ip_xattr_sem); 933 down_write(&oi->ip_alloc_sem); 934 935 if (oi->ip_clusters) 936 goto out; 937 938 if ((oi->ip_dyn_features & OCFS2_HAS_XATTR_FL) && di->i_xattr_loc) 939 goto out; 940 941 if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL && 942 ocfs2_has_inline_xattr_value_outside(inode, di)) 943 goto out; 944 945 ret = ocfs2_remove_refcount_tree(inode, di_bh); 946 if (ret) 947 mlog_errno(ret); 948 out: 949 up_write(&oi->ip_alloc_sem); 950 up_write(&oi->ip_xattr_sem); 951 return 0; 952 } 953 954 /* 955 * Find the end range for a leaf refcount block indicated by 956 * el->l_recs[index].e_blkno. 957 */ 958 static int ocfs2_get_refcount_cpos_end(struct ocfs2_caching_info *ci, 959 struct buffer_head *ref_root_bh, 960 struct ocfs2_extent_block *eb, 961 struct ocfs2_extent_list *el, 962 int index, u32 *cpos_end) 963 { 964 int ret, i, subtree_root; 965 u32 cpos; 966 u64 blkno; 967 struct super_block *sb = ocfs2_metadata_cache_get_super(ci); 968 struct ocfs2_path *left_path = NULL, *right_path = NULL; 969 struct ocfs2_extent_tree et; 970 struct ocfs2_extent_list *tmp_el; 971 972 if (index < le16_to_cpu(el->l_next_free_rec) - 1) { 973 /* 974 * We have a extent rec after index, so just use the e_cpos 975 * of the next extent rec. 976 */ 977 *cpos_end = le32_to_cpu(el->l_recs[index+1].e_cpos); 978 return 0; 979 } 980 981 if (!eb || (eb && !eb->h_next_leaf_blk)) { 982 /* 983 * We are the last extent rec, so any high cpos should 984 * be stored in this leaf refcount block. 985 */ 986 *cpos_end = UINT_MAX; 987 return 0; 988 } 989 990 /* 991 * If the extent block isn't the last one, we have to find 992 * the subtree root between this extent block and the next 993 * leaf extent block and get the corresponding e_cpos from 994 * the subroot. Otherwise we may corrupt the b-tree. 995 */ 996 ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh); 997 998 left_path = ocfs2_new_path_from_et(&et); 999 if (!left_path) { 1000 ret = -ENOMEM; 1001 mlog_errno(ret); 1002 goto out; 1003 } 1004 1005 cpos = le32_to_cpu(eb->h_list.l_recs[index].e_cpos); 1006 ret = ocfs2_find_path(ci, left_path, cpos); 1007 if (ret) { 1008 mlog_errno(ret); 1009 goto out; 1010 } 1011 1012 right_path = ocfs2_new_path_from_path(left_path); 1013 if (!right_path) { 1014 ret = -ENOMEM; 1015 mlog_errno(ret); 1016 goto out; 1017 } 1018 1019 ret = ocfs2_find_cpos_for_right_leaf(sb, left_path, &cpos); 1020 if (ret) { 1021 mlog_errno(ret); 1022 goto out; 1023 } 1024 1025 ret = ocfs2_find_path(ci, right_path, cpos); 1026 if (ret) { 1027 mlog_errno(ret); 1028 goto out; 1029 } 1030 1031 subtree_root = ocfs2_find_subtree_root(&et, left_path, 1032 right_path); 1033 1034 tmp_el = left_path->p_node[subtree_root].el; 1035 blkno = left_path->p_node[subtree_root+1].bh->b_blocknr; 1036 for (i = 0; i < le32_to_cpu(tmp_el->l_next_free_rec); i++) { 1037 if (le64_to_cpu(tmp_el->l_recs[i].e_blkno) == blkno) { 1038 *cpos_end = le32_to_cpu(tmp_el->l_recs[i+1].e_cpos); 1039 break; 1040 } 1041 } 1042 1043 BUG_ON(i == le32_to_cpu(tmp_el->l_next_free_rec)); 1044 1045 out: 1046 ocfs2_free_path(left_path); 1047 ocfs2_free_path(right_path); 1048 return ret; 1049 } 1050 1051 /* 1052 * Given a cpos and len, try to find the refcount record which contains cpos. 1053 * 1. If cpos can be found in one refcount record, return the record. 1054 * 2. If cpos can't be found, return a fake record which start from cpos 1055 * and end at a small value between cpos+len and start of the next record. 1056 * This fake record has r_refcount = 0. 1057 */ 1058 static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci, 1059 struct buffer_head *ref_root_bh, 1060 u64 cpos, unsigned int len, 1061 struct ocfs2_refcount_rec *ret_rec, 1062 int *index, 1063 struct buffer_head **ret_bh) 1064 { 1065 int ret = 0, i, found; 1066 u32 low_cpos, uninitialized_var(cpos_end); 1067 struct ocfs2_extent_list *el; 1068 struct ocfs2_extent_rec *rec = NULL; 1069 struct ocfs2_extent_block *eb = NULL; 1070 struct buffer_head *eb_bh = NULL, *ref_leaf_bh = NULL; 1071 struct super_block *sb = ocfs2_metadata_cache_get_super(ci); 1072 struct ocfs2_refcount_block *rb = 1073 (struct ocfs2_refcount_block *)ref_root_bh->b_data; 1074 1075 if (!(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)) { 1076 ocfs2_find_refcount_rec_in_rl(ci, ref_root_bh, cpos, len, 1077 ret_rec, index); 1078 *ret_bh = ref_root_bh; 1079 get_bh(ref_root_bh); 1080 return 0; 1081 } 1082 1083 el = &rb->rf_list; 1084 low_cpos = cpos & OCFS2_32BIT_POS_MASK; 1085 1086 if (el->l_tree_depth) { 1087 ret = ocfs2_find_leaf(ci, el, low_cpos, &eb_bh); 1088 if (ret) { 1089 mlog_errno(ret); 1090 goto out; 1091 } 1092 1093 eb = (struct ocfs2_extent_block *) eb_bh->b_data; 1094 el = &eb->h_list; 1095 1096 if (el->l_tree_depth) { 1097 ocfs2_error(sb, 1098 "refcount tree %llu has non zero tree " 1099 "depth in leaf btree tree block %llu\n", 1100 (unsigned long long)ocfs2_metadata_cache_owner(ci), 1101 (unsigned long long)eb_bh->b_blocknr); 1102 ret = -EROFS; 1103 goto out; 1104 } 1105 } 1106 1107 found = 0; 1108 for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) { 1109 rec = &el->l_recs[i]; 1110 1111 if (le32_to_cpu(rec->e_cpos) <= low_cpos) { 1112 found = 1; 1113 break; 1114 } 1115 } 1116 1117 if (found) { 1118 ret = ocfs2_get_refcount_cpos_end(ci, ref_root_bh, 1119 eb, el, i, &cpos_end); 1120 if (ret) { 1121 mlog_errno(ret); 1122 goto out; 1123 } 1124 1125 if (cpos_end < low_cpos + len) 1126 len = cpos_end - low_cpos; 1127 } 1128 1129 ret = ocfs2_read_refcount_block(ci, le64_to_cpu(rec->e_blkno), 1130 &ref_leaf_bh); 1131 if (ret) { 1132 mlog_errno(ret); 1133 goto out; 1134 } 1135 1136 ocfs2_find_refcount_rec_in_rl(ci, ref_leaf_bh, cpos, len, 1137 ret_rec, index); 1138 *ret_bh = ref_leaf_bh; 1139 out: 1140 brelse(eb_bh); 1141 return ret; 1142 } 1143 1144 enum ocfs2_ref_rec_contig { 1145 REF_CONTIG_NONE = 0, 1146 REF_CONTIG_LEFT, 1147 REF_CONTIG_RIGHT, 1148 REF_CONTIG_LEFTRIGHT, 1149 }; 1150 1151 static enum ocfs2_ref_rec_contig 1152 ocfs2_refcount_rec_adjacent(struct ocfs2_refcount_block *rb, 1153 int index) 1154 { 1155 if ((rb->rf_records.rl_recs[index].r_refcount == 1156 rb->rf_records.rl_recs[index + 1].r_refcount) && 1157 (le64_to_cpu(rb->rf_records.rl_recs[index].r_cpos) + 1158 le32_to_cpu(rb->rf_records.rl_recs[index].r_clusters) == 1159 le64_to_cpu(rb->rf_records.rl_recs[index + 1].r_cpos))) 1160 return REF_CONTIG_RIGHT; 1161 1162 return REF_CONTIG_NONE; 1163 } 1164 1165 static enum ocfs2_ref_rec_contig 1166 ocfs2_refcount_rec_contig(struct ocfs2_refcount_block *rb, 1167 int index) 1168 { 1169 enum ocfs2_ref_rec_contig ret = REF_CONTIG_NONE; 1170 1171 if (index < le16_to_cpu(rb->rf_records.rl_used) - 1) 1172 ret = ocfs2_refcount_rec_adjacent(rb, index); 1173 1174 if (index > 0) { 1175 enum ocfs2_ref_rec_contig tmp; 1176 1177 tmp = ocfs2_refcount_rec_adjacent(rb, index - 1); 1178 1179 if (tmp == REF_CONTIG_RIGHT) { 1180 if (ret == REF_CONTIG_RIGHT) 1181 ret = REF_CONTIG_LEFTRIGHT; 1182 else 1183 ret = REF_CONTIG_LEFT; 1184 } 1185 } 1186 1187 return ret; 1188 } 1189 1190 static void ocfs2_rotate_refcount_rec_left(struct ocfs2_refcount_block *rb, 1191 int index) 1192 { 1193 BUG_ON(rb->rf_records.rl_recs[index].r_refcount != 1194 rb->rf_records.rl_recs[index+1].r_refcount); 1195 1196 le32_add_cpu(&rb->rf_records.rl_recs[index].r_clusters, 1197 le32_to_cpu(rb->rf_records.rl_recs[index+1].r_clusters)); 1198 1199 if (index < le16_to_cpu(rb->rf_records.rl_used) - 2) 1200 memmove(&rb->rf_records.rl_recs[index + 1], 1201 &rb->rf_records.rl_recs[index + 2], 1202 sizeof(struct ocfs2_refcount_rec) * 1203 (le16_to_cpu(rb->rf_records.rl_used) - index - 2)); 1204 1205 memset(&rb->rf_records.rl_recs[le16_to_cpu(rb->rf_records.rl_used) - 1], 1206 0, sizeof(struct ocfs2_refcount_rec)); 1207 le16_add_cpu(&rb->rf_records.rl_used, -1); 1208 } 1209 1210 /* 1211 * Merge the refcount rec if we are contiguous with the adjacent recs. 1212 */ 1213 static void ocfs2_refcount_rec_merge(struct ocfs2_refcount_block *rb, 1214 int index) 1215 { 1216 enum ocfs2_ref_rec_contig contig = 1217 ocfs2_refcount_rec_contig(rb, index); 1218 1219 if (contig == REF_CONTIG_NONE) 1220 return; 1221 1222 if (contig == REF_CONTIG_LEFT || contig == REF_CONTIG_LEFTRIGHT) { 1223 BUG_ON(index == 0); 1224 index--; 1225 } 1226 1227 ocfs2_rotate_refcount_rec_left(rb, index); 1228 1229 if (contig == REF_CONTIG_LEFTRIGHT) 1230 ocfs2_rotate_refcount_rec_left(rb, index); 1231 } 1232 1233 /* 1234 * Change the refcount indexed by "index" in ref_bh. 1235 * If refcount reaches 0, remove it. 1236 */ 1237 static int ocfs2_change_refcount_rec(handle_t *handle, 1238 struct ocfs2_caching_info *ci, 1239 struct buffer_head *ref_leaf_bh, 1240 int index, int merge, int change) 1241 { 1242 int ret; 1243 struct ocfs2_refcount_block *rb = 1244 (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; 1245 struct ocfs2_refcount_list *rl = &rb->rf_records; 1246 struct ocfs2_refcount_rec *rec = &rl->rl_recs[index]; 1247 1248 ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh, 1249 OCFS2_JOURNAL_ACCESS_WRITE); 1250 if (ret) { 1251 mlog_errno(ret); 1252 goto out; 1253 } 1254 1255 mlog(0, "change index %d, old count %u, change %d\n", index, 1256 le32_to_cpu(rec->r_refcount), change); 1257 le32_add_cpu(&rec->r_refcount, change); 1258 1259 if (!rec->r_refcount) { 1260 if (index != le16_to_cpu(rl->rl_used) - 1) { 1261 memmove(rec, rec + 1, 1262 (le16_to_cpu(rl->rl_used) - index - 1) * 1263 sizeof(struct ocfs2_refcount_rec)); 1264 memset(&rl->rl_recs[le16_to_cpu(rl->rl_used) - 1], 1265 0, sizeof(struct ocfs2_refcount_rec)); 1266 } 1267 1268 le16_add_cpu(&rl->rl_used, -1); 1269 } else if (merge) 1270 ocfs2_refcount_rec_merge(rb, index); 1271 1272 ret = ocfs2_journal_dirty(handle, ref_leaf_bh); 1273 if (ret) 1274 mlog_errno(ret); 1275 out: 1276 return ret; 1277 } 1278 1279 static int ocfs2_expand_inline_ref_root(handle_t *handle, 1280 struct ocfs2_caching_info *ci, 1281 struct buffer_head *ref_root_bh, 1282 struct buffer_head **ref_leaf_bh, 1283 struct ocfs2_alloc_context *meta_ac) 1284 { 1285 int ret; 1286 u16 suballoc_bit_start; 1287 u32 num_got; 1288 u64 blkno; 1289 struct super_block *sb = ocfs2_metadata_cache_get_super(ci); 1290 struct buffer_head *new_bh = NULL; 1291 struct ocfs2_refcount_block *new_rb; 1292 struct ocfs2_refcount_block *root_rb = 1293 (struct ocfs2_refcount_block *)ref_root_bh->b_data; 1294 1295 ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh, 1296 OCFS2_JOURNAL_ACCESS_WRITE); 1297 if (ret) { 1298 mlog_errno(ret); 1299 goto out; 1300 } 1301 1302 ret = ocfs2_claim_metadata(OCFS2_SB(sb), handle, meta_ac, 1, 1303 &suballoc_bit_start, &num_got, 1304 &blkno); 1305 if (ret) { 1306 mlog_errno(ret); 1307 goto out; 1308 } 1309 1310 new_bh = sb_getblk(sb, blkno); 1311 if (new_bh == NULL) { 1312 ret = -EIO; 1313 mlog_errno(ret); 1314 goto out; 1315 } 1316 ocfs2_set_new_buffer_uptodate(ci, new_bh); 1317 1318 ret = ocfs2_journal_access_rb(handle, ci, new_bh, 1319 OCFS2_JOURNAL_ACCESS_CREATE); 1320 if (ret) { 1321 mlog_errno(ret); 1322 goto out; 1323 } 1324 1325 /* 1326 * Initialize ocfs2_refcount_block. 1327 * It should contain the same information as the old root. 1328 * so just memcpy it and change the corresponding field. 1329 */ 1330 memcpy(new_bh->b_data, ref_root_bh->b_data, sb->s_blocksize); 1331 1332 new_rb = (struct ocfs2_refcount_block *)new_bh->b_data; 1333 new_rb->rf_suballoc_slot = cpu_to_le16(OCFS2_SB(sb)->slot_num); 1334 new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); 1335 new_rb->rf_blkno = cpu_to_le64(blkno); 1336 new_rb->rf_cpos = cpu_to_le32(0); 1337 new_rb->rf_parent = cpu_to_le64(ref_root_bh->b_blocknr); 1338 new_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_LEAF_FL); 1339 ocfs2_journal_dirty(handle, new_bh); 1340 1341 /* Now change the root. */ 1342 memset(&root_rb->rf_list, 0, sb->s_blocksize - 1343 offsetof(struct ocfs2_refcount_block, rf_list)); 1344 root_rb->rf_list.l_count = cpu_to_le16(ocfs2_extent_recs_per_rb(sb)); 1345 root_rb->rf_clusters = cpu_to_le32(1); 1346 root_rb->rf_list.l_next_free_rec = cpu_to_le16(1); 1347 root_rb->rf_list.l_recs[0].e_blkno = cpu_to_le64(blkno); 1348 root_rb->rf_list.l_recs[0].e_leaf_clusters = cpu_to_le16(1); 1349 root_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_TREE_FL); 1350 1351 ocfs2_journal_dirty(handle, ref_root_bh); 1352 1353 mlog(0, "new leaf block %llu, used %u\n", (unsigned long long)blkno, 1354 le16_to_cpu(new_rb->rf_records.rl_used)); 1355 1356 *ref_leaf_bh = new_bh; 1357 new_bh = NULL; 1358 out: 1359 brelse(new_bh); 1360 return ret; 1361 } 1362 1363 static int ocfs2_refcount_rec_no_intersect(struct ocfs2_refcount_rec *prev, 1364 struct ocfs2_refcount_rec *next) 1365 { 1366 if (ocfs2_get_ref_rec_low_cpos(prev) + le32_to_cpu(prev->r_clusters) <= 1367 ocfs2_get_ref_rec_low_cpos(next)) 1368 return 1; 1369 1370 return 0; 1371 } 1372 1373 static int cmp_refcount_rec_by_low_cpos(const void *a, const void *b) 1374 { 1375 const struct ocfs2_refcount_rec *l = a, *r = b; 1376 u32 l_cpos = ocfs2_get_ref_rec_low_cpos(l); 1377 u32 r_cpos = ocfs2_get_ref_rec_low_cpos(r); 1378 1379 if (l_cpos > r_cpos) 1380 return 1; 1381 if (l_cpos < r_cpos) 1382 return -1; 1383 return 0; 1384 } 1385 1386 static int cmp_refcount_rec_by_cpos(const void *a, const void *b) 1387 { 1388 const struct ocfs2_refcount_rec *l = a, *r = b; 1389 u64 l_cpos = le64_to_cpu(l->r_cpos); 1390 u64 r_cpos = le64_to_cpu(r->r_cpos); 1391 1392 if (l_cpos > r_cpos) 1393 return 1; 1394 if (l_cpos < r_cpos) 1395 return -1; 1396 return 0; 1397 } 1398 1399 static void swap_refcount_rec(void *a, void *b, int size) 1400 { 1401 struct ocfs2_refcount_rec *l = a, *r = b, tmp; 1402 1403 tmp = *(struct ocfs2_refcount_rec *)l; 1404 *(struct ocfs2_refcount_rec *)l = 1405 *(struct ocfs2_refcount_rec *)r; 1406 *(struct ocfs2_refcount_rec *)r = tmp; 1407 } 1408 1409 /* 1410 * The refcount cpos are ordered by their 64bit cpos, 1411 * But we will use the low 32 bit to be the e_cpos in the b-tree. 1412 * So we need to make sure that this pos isn't intersected with others. 1413 * 1414 * Note: The refcount block is already sorted by their low 32 bit cpos, 1415 * So just try the middle pos first, and we will exit when we find 1416 * the good position. 1417 */ 1418 static int ocfs2_find_refcount_split_pos(struct ocfs2_refcount_list *rl, 1419 u32 *split_pos, int *split_index) 1420 { 1421 int num_used = le16_to_cpu(rl->rl_used); 1422 int delta, middle = num_used / 2; 1423 1424 for (delta = 0; delta < middle; delta++) { 1425 /* Let's check delta earlier than middle */ 1426 if (ocfs2_refcount_rec_no_intersect( 1427 &rl->rl_recs[middle - delta - 1], 1428 &rl->rl_recs[middle - delta])) { 1429 *split_index = middle - delta; 1430 break; 1431 } 1432 1433 /* For even counts, don't walk off the end */ 1434 if ((middle + delta + 1) == num_used) 1435 continue; 1436 1437 /* Now try delta past middle */ 1438 if (ocfs2_refcount_rec_no_intersect( 1439 &rl->rl_recs[middle + delta], 1440 &rl->rl_recs[middle + delta + 1])) { 1441 *split_index = middle + delta + 1; 1442 break; 1443 } 1444 } 1445 1446 if (delta >= middle) 1447 return -ENOSPC; 1448 1449 *split_pos = ocfs2_get_ref_rec_low_cpos(&rl->rl_recs[*split_index]); 1450 return 0; 1451 } 1452 1453 static int ocfs2_divide_leaf_refcount_block(struct buffer_head *ref_leaf_bh, 1454 struct buffer_head *new_bh, 1455 u32 *split_cpos) 1456 { 1457 int split_index = 0, num_moved, ret; 1458 u32 cpos = 0; 1459 struct ocfs2_refcount_block *rb = 1460 (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; 1461 struct ocfs2_refcount_list *rl = &rb->rf_records; 1462 struct ocfs2_refcount_block *new_rb = 1463 (struct ocfs2_refcount_block *)new_bh->b_data; 1464 struct ocfs2_refcount_list *new_rl = &new_rb->rf_records; 1465 1466 mlog(0, "split old leaf refcount block %llu, count = %u, used = %u\n", 1467 (unsigned long long)ref_leaf_bh->b_blocknr, 1468 le32_to_cpu(rl->rl_count), le32_to_cpu(rl->rl_used)); 1469 1470 /* 1471 * XXX: Improvement later. 1472 * If we know all the high 32 bit cpos is the same, no need to sort. 1473 * 1474 * In order to make the whole process safe, we do: 1475 * 1. sort the entries by their low 32 bit cpos first so that we can 1476 * find the split cpos easily. 1477 * 2. call ocfs2_insert_extent to insert the new refcount block. 1478 * 3. move the refcount rec to the new block. 1479 * 4. sort the entries by their 64 bit cpos. 1480 * 5. dirty the new_rb and rb. 1481 */ 1482 sort(&rl->rl_recs, le16_to_cpu(rl->rl_used), 1483 sizeof(struct ocfs2_refcount_rec), 1484 cmp_refcount_rec_by_low_cpos, swap_refcount_rec); 1485 1486 ret = ocfs2_find_refcount_split_pos(rl, &cpos, &split_index); 1487 if (ret) { 1488 mlog_errno(ret); 1489 return ret; 1490 } 1491 1492 new_rb->rf_cpos = cpu_to_le32(cpos); 1493 1494 /* move refcount records starting from split_index to the new block. */ 1495 num_moved = le16_to_cpu(rl->rl_used) - split_index; 1496 memcpy(new_rl->rl_recs, &rl->rl_recs[split_index], 1497 num_moved * sizeof(struct ocfs2_refcount_rec)); 1498 1499 /*ok, remove the entries we just moved over to the other block. */ 1500 memset(&rl->rl_recs[split_index], 0, 1501 num_moved * sizeof(struct ocfs2_refcount_rec)); 1502 1503 /* change old and new rl_used accordingly. */ 1504 le16_add_cpu(&rl->rl_used, -num_moved); 1505 new_rl->rl_used = cpu_to_le16(num_moved); 1506 1507 sort(&rl->rl_recs, le16_to_cpu(rl->rl_used), 1508 sizeof(struct ocfs2_refcount_rec), 1509 cmp_refcount_rec_by_cpos, swap_refcount_rec); 1510 1511 sort(&new_rl->rl_recs, le16_to_cpu(new_rl->rl_used), 1512 sizeof(struct ocfs2_refcount_rec), 1513 cmp_refcount_rec_by_cpos, swap_refcount_rec); 1514 1515 *split_cpos = cpos; 1516 return 0; 1517 } 1518 1519 static int ocfs2_new_leaf_refcount_block(handle_t *handle, 1520 struct ocfs2_caching_info *ci, 1521 struct buffer_head *ref_root_bh, 1522 struct buffer_head *ref_leaf_bh, 1523 struct ocfs2_alloc_context *meta_ac) 1524 { 1525 int ret; 1526 u16 suballoc_bit_start; 1527 u32 num_got, new_cpos; 1528 u64 blkno; 1529 struct super_block *sb = ocfs2_metadata_cache_get_super(ci); 1530 struct ocfs2_refcount_block *root_rb = 1531 (struct ocfs2_refcount_block *)ref_root_bh->b_data; 1532 struct buffer_head *new_bh = NULL; 1533 struct ocfs2_refcount_block *new_rb; 1534 struct ocfs2_extent_tree ref_et; 1535 1536 BUG_ON(!(le32_to_cpu(root_rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)); 1537 1538 ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh, 1539 OCFS2_JOURNAL_ACCESS_WRITE); 1540 if (ret) { 1541 mlog_errno(ret); 1542 goto out; 1543 } 1544 1545 ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh, 1546 OCFS2_JOURNAL_ACCESS_WRITE); 1547 if (ret) { 1548 mlog_errno(ret); 1549 goto out; 1550 } 1551 1552 ret = ocfs2_claim_metadata(OCFS2_SB(sb), handle, meta_ac, 1, 1553 &suballoc_bit_start, &num_got, 1554 &blkno); 1555 if (ret) { 1556 mlog_errno(ret); 1557 goto out; 1558 } 1559 1560 new_bh = sb_getblk(sb, blkno); 1561 if (new_bh == NULL) { 1562 ret = -EIO; 1563 mlog_errno(ret); 1564 goto out; 1565 } 1566 ocfs2_set_new_buffer_uptodate(ci, new_bh); 1567 1568 ret = ocfs2_journal_access_rb(handle, ci, new_bh, 1569 OCFS2_JOURNAL_ACCESS_CREATE); 1570 if (ret) { 1571 mlog_errno(ret); 1572 goto out; 1573 } 1574 1575 /* Initialize ocfs2_refcount_block. */ 1576 new_rb = (struct ocfs2_refcount_block *)new_bh->b_data; 1577 memset(new_rb, 0, sb->s_blocksize); 1578 strcpy((void *)new_rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE); 1579 new_rb->rf_suballoc_slot = cpu_to_le16(OCFS2_SB(sb)->slot_num); 1580 new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); 1581 new_rb->rf_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); 1582 new_rb->rf_blkno = cpu_to_le64(blkno); 1583 new_rb->rf_parent = cpu_to_le64(ref_root_bh->b_blocknr); 1584 new_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_LEAF_FL); 1585 new_rb->rf_records.rl_count = 1586 cpu_to_le16(ocfs2_refcount_recs_per_rb(sb)); 1587 new_rb->rf_generation = root_rb->rf_generation; 1588 1589 ret = ocfs2_divide_leaf_refcount_block(ref_leaf_bh, new_bh, &new_cpos); 1590 if (ret) { 1591 mlog_errno(ret); 1592 goto out; 1593 } 1594 1595 ocfs2_journal_dirty(handle, ref_leaf_bh); 1596 ocfs2_journal_dirty(handle, new_bh); 1597 1598 ocfs2_init_refcount_extent_tree(&ref_et, ci, ref_root_bh); 1599 1600 mlog(0, "insert new leaf block %llu at %u\n", 1601 (unsigned long long)new_bh->b_blocknr, new_cpos); 1602 1603 /* Insert the new leaf block with the specific offset cpos. */ 1604 ret = ocfs2_insert_extent(handle, &ref_et, new_cpos, new_bh->b_blocknr, 1605 1, 0, meta_ac); 1606 if (ret) 1607 mlog_errno(ret); 1608 1609 out: 1610 brelse(new_bh); 1611 return ret; 1612 } 1613 1614 static int ocfs2_expand_refcount_tree(handle_t *handle, 1615 struct ocfs2_caching_info *ci, 1616 struct buffer_head *ref_root_bh, 1617 struct buffer_head *ref_leaf_bh, 1618 struct ocfs2_alloc_context *meta_ac) 1619 { 1620 int ret; 1621 struct buffer_head *expand_bh = NULL; 1622 1623 if (ref_root_bh == ref_leaf_bh) { 1624 /* 1625 * the old root bh hasn't been expanded to a b-tree, 1626 * so expand it first. 1627 */ 1628 ret = ocfs2_expand_inline_ref_root(handle, ci, ref_root_bh, 1629 &expand_bh, meta_ac); 1630 if (ret) { 1631 mlog_errno(ret); 1632 goto out; 1633 } 1634 } else { 1635 expand_bh = ref_leaf_bh; 1636 get_bh(expand_bh); 1637 } 1638 1639 1640 /* Now add a new refcount block into the tree.*/ 1641 ret = ocfs2_new_leaf_refcount_block(handle, ci, ref_root_bh, 1642 expand_bh, meta_ac); 1643 if (ret) 1644 mlog_errno(ret); 1645 out: 1646 brelse(expand_bh); 1647 return ret; 1648 } 1649 1650 /* 1651 * Adjust the extent rec in b-tree representing ref_leaf_bh. 1652 * 1653 * Only called when we have inserted a new refcount rec at index 0 1654 * which means ocfs2_extent_rec.e_cpos may need some change. 1655 */ 1656 static int ocfs2_adjust_refcount_rec(handle_t *handle, 1657 struct ocfs2_caching_info *ci, 1658 struct buffer_head *ref_root_bh, 1659 struct buffer_head *ref_leaf_bh, 1660 struct ocfs2_refcount_rec *rec) 1661 { 1662 int ret = 0, i; 1663 u32 new_cpos, old_cpos; 1664 struct ocfs2_path *path = NULL; 1665 struct ocfs2_extent_tree et; 1666 struct ocfs2_refcount_block *rb = 1667 (struct ocfs2_refcount_block *)ref_root_bh->b_data; 1668 struct ocfs2_extent_list *el; 1669 1670 if (!(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)) 1671 goto out; 1672 1673 rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; 1674 old_cpos = le32_to_cpu(rb->rf_cpos); 1675 new_cpos = le64_to_cpu(rec->r_cpos) & OCFS2_32BIT_POS_MASK; 1676 if (old_cpos <= new_cpos) 1677 goto out; 1678 1679 ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh); 1680 1681 path = ocfs2_new_path_from_et(&et); 1682 if (!path) { 1683 ret = -ENOMEM; 1684 mlog_errno(ret); 1685 goto out; 1686 } 1687 1688 ret = ocfs2_find_path(ci, path, old_cpos); 1689 if (ret) { 1690 mlog_errno(ret); 1691 goto out; 1692 } 1693 1694 /* 1695 * 2 more credits, one for the leaf refcount block, one for 1696 * the extent block contains the extent rec. 1697 */ 1698 ret = ocfs2_extend_trans(handle, handle->h_buffer_credits + 2); 1699 if (ret < 0) { 1700 mlog_errno(ret); 1701 goto out; 1702 } 1703 1704 ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh, 1705 OCFS2_JOURNAL_ACCESS_WRITE); 1706 if (ret < 0) { 1707 mlog_errno(ret); 1708 goto out; 1709 } 1710 1711 ret = ocfs2_journal_access_eb(handle, ci, path_leaf_bh(path), 1712 OCFS2_JOURNAL_ACCESS_WRITE); 1713 if (ret < 0) { 1714 mlog_errno(ret); 1715 goto out; 1716 } 1717 1718 /* change the leaf extent block first. */ 1719 el = path_leaf_el(path); 1720 1721 for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) 1722 if (le32_to_cpu(el->l_recs[i].e_cpos) == old_cpos) 1723 break; 1724 1725 BUG_ON(i == le16_to_cpu(el->l_next_free_rec)); 1726 1727 el->l_recs[i].e_cpos = cpu_to_le32(new_cpos); 1728 1729 /* change the r_cpos in the leaf block. */ 1730 rb->rf_cpos = cpu_to_le32(new_cpos); 1731 1732 ocfs2_journal_dirty(handle, path_leaf_bh(path)); 1733 ocfs2_journal_dirty(handle, ref_leaf_bh); 1734 1735 out: 1736 ocfs2_free_path(path); 1737 return ret; 1738 } 1739 1740 static int ocfs2_insert_refcount_rec(handle_t *handle, 1741 struct ocfs2_caching_info *ci, 1742 struct buffer_head *ref_root_bh, 1743 struct buffer_head *ref_leaf_bh, 1744 struct ocfs2_refcount_rec *rec, 1745 int index, int merge, 1746 struct ocfs2_alloc_context *meta_ac) 1747 { 1748 int ret; 1749 struct ocfs2_refcount_block *rb = 1750 (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; 1751 struct ocfs2_refcount_list *rf_list = &rb->rf_records; 1752 struct buffer_head *new_bh = NULL; 1753 1754 BUG_ON(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL); 1755 1756 if (rf_list->rl_used == rf_list->rl_count) { 1757 u64 cpos = le64_to_cpu(rec->r_cpos); 1758 u32 len = le32_to_cpu(rec->r_clusters); 1759 1760 ret = ocfs2_expand_refcount_tree(handle, ci, ref_root_bh, 1761 ref_leaf_bh, meta_ac); 1762 if (ret) { 1763 mlog_errno(ret); 1764 goto out; 1765 } 1766 1767 ret = ocfs2_get_refcount_rec(ci, ref_root_bh, 1768 cpos, len, NULL, &index, 1769 &new_bh); 1770 if (ret) { 1771 mlog_errno(ret); 1772 goto out; 1773 } 1774 1775 ref_leaf_bh = new_bh; 1776 rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; 1777 rf_list = &rb->rf_records; 1778 } 1779 1780 ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh, 1781 OCFS2_JOURNAL_ACCESS_WRITE); 1782 if (ret) { 1783 mlog_errno(ret); 1784 goto out; 1785 } 1786 1787 if (index < le16_to_cpu(rf_list->rl_used)) 1788 memmove(&rf_list->rl_recs[index + 1], 1789 &rf_list->rl_recs[index], 1790 (le16_to_cpu(rf_list->rl_used) - index) * 1791 sizeof(struct ocfs2_refcount_rec)); 1792 1793 mlog(0, "insert refcount record start %llu, len %u, count %u " 1794 "to leaf block %llu at index %d\n", 1795 (unsigned long long)le64_to_cpu(rec->r_cpos), 1796 le32_to_cpu(rec->r_clusters), le32_to_cpu(rec->r_refcount), 1797 (unsigned long long)ref_leaf_bh->b_blocknr, index); 1798 1799 rf_list->rl_recs[index] = *rec; 1800 1801 le16_add_cpu(&rf_list->rl_used, 1); 1802 1803 if (merge) 1804 ocfs2_refcount_rec_merge(rb, index); 1805 1806 ret = ocfs2_journal_dirty(handle, ref_leaf_bh); 1807 if (ret) { 1808 mlog_errno(ret); 1809 goto out; 1810 } 1811 1812 if (index == 0) { 1813 ret = ocfs2_adjust_refcount_rec(handle, ci, 1814 ref_root_bh, 1815 ref_leaf_bh, rec); 1816 if (ret) 1817 mlog_errno(ret); 1818 } 1819 out: 1820 brelse(new_bh); 1821 return ret; 1822 } 1823 1824 /* 1825 * Split the refcount_rec indexed by "index" in ref_leaf_bh. 1826 * This is much simple than our b-tree code. 1827 * split_rec is the new refcount rec we want to insert. 1828 * If split_rec->r_refcount > 0, we are changing the refcount(in case we 1829 * increase refcount or decrease a refcount to non-zero). 1830 * If split_rec->r_refcount == 0, we are punching a hole in current refcount 1831 * rec( in case we decrease a refcount to zero). 1832 */ 1833 static int ocfs2_split_refcount_rec(handle_t *handle, 1834 struct ocfs2_caching_info *ci, 1835 struct buffer_head *ref_root_bh, 1836 struct buffer_head *ref_leaf_bh, 1837 struct ocfs2_refcount_rec *split_rec, 1838 int index, int merge, 1839 struct ocfs2_alloc_context *meta_ac, 1840 struct ocfs2_cached_dealloc_ctxt *dealloc) 1841 { 1842 int ret, recs_need; 1843 u32 len; 1844 struct ocfs2_refcount_block *rb = 1845 (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; 1846 struct ocfs2_refcount_list *rf_list = &rb->rf_records; 1847 struct ocfs2_refcount_rec *orig_rec = &rf_list->rl_recs[index]; 1848 struct ocfs2_refcount_rec *tail_rec = NULL; 1849 struct buffer_head *new_bh = NULL; 1850 1851 BUG_ON(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL); 1852 1853 mlog(0, "original r_pos %llu, cluster %u, split %llu, cluster %u\n", 1854 le64_to_cpu(orig_rec->r_cpos), le32_to_cpu(orig_rec->r_clusters), 1855 le64_to_cpu(split_rec->r_cpos), 1856 le32_to_cpu(split_rec->r_clusters)); 1857 1858 /* 1859 * If we just need to split the header or tail clusters, 1860 * no more recs are needed, just split is OK. 1861 * Otherwise we at least need one new recs. 1862 */ 1863 if (!split_rec->r_refcount && 1864 (split_rec->r_cpos == orig_rec->r_cpos || 1865 le64_to_cpu(split_rec->r_cpos) + 1866 le32_to_cpu(split_rec->r_clusters) == 1867 le64_to_cpu(orig_rec->r_cpos) + le32_to_cpu(orig_rec->r_clusters))) 1868 recs_need = 0; 1869 else 1870 recs_need = 1; 1871 1872 /* 1873 * We need one more rec if we split in the middle and the new rec have 1874 * some refcount in it. 1875 */ 1876 if (split_rec->r_refcount && 1877 (split_rec->r_cpos != orig_rec->r_cpos && 1878 le64_to_cpu(split_rec->r_cpos) + 1879 le32_to_cpu(split_rec->r_clusters) != 1880 le64_to_cpu(orig_rec->r_cpos) + le32_to_cpu(orig_rec->r_clusters))) 1881 recs_need++; 1882 1883 /* If the leaf block don't have enough record, expand it. */ 1884 if (le16_to_cpu(rf_list->rl_used) + recs_need > 1885 le16_to_cpu(rf_list->rl_count)) { 1886 struct ocfs2_refcount_rec tmp_rec; 1887 u64 cpos = le64_to_cpu(orig_rec->r_cpos); 1888 len = le32_to_cpu(orig_rec->r_clusters); 1889 ret = ocfs2_expand_refcount_tree(handle, ci, ref_root_bh, 1890 ref_leaf_bh, meta_ac); 1891 if (ret) { 1892 mlog_errno(ret); 1893 goto out; 1894 } 1895 1896 /* 1897 * We have to re-get it since now cpos may be moved to 1898 * another leaf block. 1899 */ 1900 ret = ocfs2_get_refcount_rec(ci, ref_root_bh, 1901 cpos, len, &tmp_rec, &index, 1902 &new_bh); 1903 if (ret) { 1904 mlog_errno(ret); 1905 goto out; 1906 } 1907 1908 ref_leaf_bh = new_bh; 1909 rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; 1910 rf_list = &rb->rf_records; 1911 orig_rec = &rf_list->rl_recs[index]; 1912 } 1913 1914 ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh, 1915 OCFS2_JOURNAL_ACCESS_WRITE); 1916 if (ret) { 1917 mlog_errno(ret); 1918 goto out; 1919 } 1920 1921 /* 1922 * We have calculated out how many new records we need and store 1923 * in recs_need, so spare enough space first by moving the records 1924 * after "index" to the end. 1925 */ 1926 if (index != le16_to_cpu(rf_list->rl_used) - 1) 1927 memmove(&rf_list->rl_recs[index + 1 + recs_need], 1928 &rf_list->rl_recs[index + 1], 1929 (le16_to_cpu(rf_list->rl_used) - index - 1) * 1930 sizeof(struct ocfs2_refcount_rec)); 1931 1932 len = (le64_to_cpu(orig_rec->r_cpos) + 1933 le32_to_cpu(orig_rec->r_clusters)) - 1934 (le64_to_cpu(split_rec->r_cpos) + 1935 le32_to_cpu(split_rec->r_clusters)); 1936 1937 /* 1938 * If we have "len", the we will split in the tail and move it 1939 * to the end of the space we have just spared. 1940 */ 1941 if (len) { 1942 tail_rec = &rf_list->rl_recs[index + recs_need]; 1943 1944 memcpy(tail_rec, orig_rec, sizeof(struct ocfs2_refcount_rec)); 1945 le64_add_cpu(&tail_rec->r_cpos, 1946 le32_to_cpu(tail_rec->r_clusters) - len); 1947 tail_rec->r_clusters = cpu_to_le32(len); 1948 } 1949 1950 /* 1951 * If the split pos isn't the same as the original one, we need to 1952 * split in the head. 1953 * 1954 * Note: We have the chance that split_rec.r_refcount = 0, 1955 * recs_need = 0 and len > 0, which means we just cut the head from 1956 * the orig_rec and in that case we have done some modification in 1957 * orig_rec above, so the check for r_cpos is faked. 1958 */ 1959 if (split_rec->r_cpos != orig_rec->r_cpos && tail_rec != orig_rec) { 1960 len = le64_to_cpu(split_rec->r_cpos) - 1961 le64_to_cpu(orig_rec->r_cpos); 1962 orig_rec->r_clusters = cpu_to_le32(len); 1963 index++; 1964 } 1965 1966 le16_add_cpu(&rf_list->rl_used, recs_need); 1967 1968 if (split_rec->r_refcount) { 1969 rf_list->rl_recs[index] = *split_rec; 1970 mlog(0, "insert refcount record start %llu, len %u, count %u " 1971 "to leaf block %llu at index %d\n", 1972 (unsigned long long)le64_to_cpu(split_rec->r_cpos), 1973 le32_to_cpu(split_rec->r_clusters), 1974 le32_to_cpu(split_rec->r_refcount), 1975 (unsigned long long)ref_leaf_bh->b_blocknr, index); 1976 1977 if (merge) 1978 ocfs2_refcount_rec_merge(rb, index); 1979 } 1980 1981 ret = ocfs2_journal_dirty(handle, ref_leaf_bh); 1982 if (ret) 1983 mlog_errno(ret); 1984 1985 out: 1986 brelse(new_bh); 1987 return ret; 1988 } 1989 1990 static int __ocfs2_increase_refcount(handle_t *handle, 1991 struct ocfs2_caching_info *ci, 1992 struct buffer_head *ref_root_bh, 1993 u64 cpos, u32 len, int merge, 1994 struct ocfs2_alloc_context *meta_ac, 1995 struct ocfs2_cached_dealloc_ctxt *dealloc) 1996 { 1997 int ret = 0, index; 1998 struct buffer_head *ref_leaf_bh = NULL; 1999 struct ocfs2_refcount_rec rec; 2000 unsigned int set_len = 0; 2001 2002 mlog(0, "Tree owner %llu, add refcount start %llu, len %u\n", 2003 (unsigned long long)ocfs2_metadata_cache_owner(ci), 2004 (unsigned long long)cpos, len); 2005 2006 while (len) { 2007 ret = ocfs2_get_refcount_rec(ci, ref_root_bh, 2008 cpos, len, &rec, &index, 2009 &ref_leaf_bh); 2010 if (ret) { 2011 mlog_errno(ret); 2012 goto out; 2013 } 2014 2015 set_len = le32_to_cpu(rec.r_clusters); 2016 2017 /* 2018 * Here we may meet with 3 situations: 2019 * 2020 * 1. If we find an already existing record, and the length 2021 * is the same, cool, we just need to increase the r_refcount 2022 * and it is OK. 2023 * 2. If we find a hole, just insert it with r_refcount = 1. 2024 * 3. If we are in the middle of one extent record, split 2025 * it. 2026 */ 2027 if (rec.r_refcount && le64_to_cpu(rec.r_cpos) == cpos && 2028 set_len <= len) { 2029 mlog(0, "increase refcount rec, start %llu, len %u, " 2030 "count %u\n", (unsigned long long)cpos, set_len, 2031 le32_to_cpu(rec.r_refcount)); 2032 ret = ocfs2_change_refcount_rec(handle, ci, 2033 ref_leaf_bh, index, 2034 merge, 1); 2035 if (ret) { 2036 mlog_errno(ret); 2037 goto out; 2038 } 2039 } else if (!rec.r_refcount) { 2040 rec.r_refcount = cpu_to_le32(1); 2041 2042 mlog(0, "insert refcount rec, start %llu, len %u\n", 2043 (unsigned long long)le64_to_cpu(rec.r_cpos), 2044 set_len); 2045 ret = ocfs2_insert_refcount_rec(handle, ci, ref_root_bh, 2046 ref_leaf_bh, 2047 &rec, index, 2048 merge, meta_ac); 2049 if (ret) { 2050 mlog_errno(ret); 2051 goto out; 2052 } 2053 } else { 2054 set_len = min((u64)(cpos + len), 2055 le64_to_cpu(rec.r_cpos) + set_len) - cpos; 2056 rec.r_cpos = cpu_to_le64(cpos); 2057 rec.r_clusters = cpu_to_le32(set_len); 2058 le32_add_cpu(&rec.r_refcount, 1); 2059 2060 mlog(0, "split refcount rec, start %llu, " 2061 "len %u, count %u\n", 2062 (unsigned long long)le64_to_cpu(rec.r_cpos), 2063 set_len, le32_to_cpu(rec.r_refcount)); 2064 ret = ocfs2_split_refcount_rec(handle, ci, 2065 ref_root_bh, ref_leaf_bh, 2066 &rec, index, merge, 2067 meta_ac, dealloc); 2068 if (ret) { 2069 mlog_errno(ret); 2070 goto out; 2071 } 2072 } 2073 2074 cpos += set_len; 2075 len -= set_len; 2076 brelse(ref_leaf_bh); 2077 ref_leaf_bh = NULL; 2078 } 2079 2080 out: 2081 brelse(ref_leaf_bh); 2082 return ret; 2083 } 2084 2085 static int ocfs2_remove_refcount_extent(handle_t *handle, 2086 struct ocfs2_caching_info *ci, 2087 struct buffer_head *ref_root_bh, 2088 struct buffer_head *ref_leaf_bh, 2089 struct ocfs2_alloc_context *meta_ac, 2090 struct ocfs2_cached_dealloc_ctxt *dealloc) 2091 { 2092 int ret; 2093 struct super_block *sb = ocfs2_metadata_cache_get_super(ci); 2094 struct ocfs2_refcount_block *rb = 2095 (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; 2096 struct ocfs2_extent_tree et; 2097 2098 BUG_ON(rb->rf_records.rl_used); 2099 2100 ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh); 2101 ret = ocfs2_remove_extent(handle, &et, le32_to_cpu(rb->rf_cpos), 2102 1, meta_ac, dealloc); 2103 if (ret) { 2104 mlog_errno(ret); 2105 goto out; 2106 } 2107 2108 ocfs2_remove_from_cache(ci, ref_leaf_bh); 2109 2110 /* 2111 * add the freed block to the dealloc so that it will be freed 2112 * when we run dealloc. 2113 */ 2114 ret = ocfs2_cache_block_dealloc(dealloc, EXTENT_ALLOC_SYSTEM_INODE, 2115 le16_to_cpu(rb->rf_suballoc_slot), 2116 le64_to_cpu(rb->rf_blkno), 2117 le16_to_cpu(rb->rf_suballoc_bit)); 2118 if (ret) { 2119 mlog_errno(ret); 2120 goto out; 2121 } 2122 2123 ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh, 2124 OCFS2_JOURNAL_ACCESS_WRITE); 2125 if (ret) { 2126 mlog_errno(ret); 2127 goto out; 2128 } 2129 2130 rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; 2131 2132 le32_add_cpu(&rb->rf_clusters, -1); 2133 2134 /* 2135 * check whether we need to restore the root refcount block if 2136 * there is no leaf extent block at atll. 2137 */ 2138 if (!rb->rf_list.l_next_free_rec) { 2139 BUG_ON(rb->rf_clusters); 2140 2141 mlog(0, "reset refcount tree root %llu to be a record block.\n", 2142 (unsigned long long)ref_root_bh->b_blocknr); 2143 2144 rb->rf_flags = 0; 2145 rb->rf_parent = 0; 2146 rb->rf_cpos = 0; 2147 memset(&rb->rf_records, 0, sb->s_blocksize - 2148 offsetof(struct ocfs2_refcount_block, rf_records)); 2149 rb->rf_records.rl_count = 2150 cpu_to_le16(ocfs2_refcount_recs_per_rb(sb)); 2151 } 2152 2153 ocfs2_journal_dirty(handle, ref_root_bh); 2154 2155 out: 2156 return ret; 2157 } 2158 2159 int ocfs2_increase_refcount(handle_t *handle, 2160 struct ocfs2_caching_info *ci, 2161 struct buffer_head *ref_root_bh, 2162 u64 cpos, u32 len, 2163 struct ocfs2_alloc_context *meta_ac, 2164 struct ocfs2_cached_dealloc_ctxt *dealloc) 2165 { 2166 return __ocfs2_increase_refcount(handle, ci, ref_root_bh, 2167 cpos, len, 1, 2168 meta_ac, dealloc); 2169 } 2170 2171 static int ocfs2_decrease_refcount_rec(handle_t *handle, 2172 struct ocfs2_caching_info *ci, 2173 struct buffer_head *ref_root_bh, 2174 struct buffer_head *ref_leaf_bh, 2175 int index, u64 cpos, unsigned int len, 2176 struct ocfs2_alloc_context *meta_ac, 2177 struct ocfs2_cached_dealloc_ctxt *dealloc) 2178 { 2179 int ret; 2180 struct ocfs2_refcount_block *rb = 2181 (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; 2182 struct ocfs2_refcount_rec *rec = &rb->rf_records.rl_recs[index]; 2183 2184 BUG_ON(cpos < le64_to_cpu(rec->r_cpos)); 2185 BUG_ON(cpos + len > 2186 le64_to_cpu(rec->r_cpos) + le32_to_cpu(rec->r_clusters)); 2187 2188 if (cpos == le64_to_cpu(rec->r_cpos) && 2189 len == le32_to_cpu(rec->r_clusters)) 2190 ret = ocfs2_change_refcount_rec(handle, ci, 2191 ref_leaf_bh, index, 1, -1); 2192 else { 2193 struct ocfs2_refcount_rec split = *rec; 2194 split.r_cpos = cpu_to_le64(cpos); 2195 split.r_clusters = cpu_to_le32(len); 2196 2197 le32_add_cpu(&split.r_refcount, -1); 2198 2199 mlog(0, "split refcount rec, start %llu, " 2200 "len %u, count %u, original start %llu, len %u\n", 2201 (unsigned long long)le64_to_cpu(split.r_cpos), 2202 len, le32_to_cpu(split.r_refcount), 2203 (unsigned long long)le64_to_cpu(rec->r_cpos), 2204 le32_to_cpu(rec->r_clusters)); 2205 ret = ocfs2_split_refcount_rec(handle, ci, 2206 ref_root_bh, ref_leaf_bh, 2207 &split, index, 1, 2208 meta_ac, dealloc); 2209 } 2210 2211 if (ret) { 2212 mlog_errno(ret); 2213 goto out; 2214 } 2215 2216 /* Remove the leaf refcount block if it contains no refcount record. */ 2217 if (!rb->rf_records.rl_used && ref_leaf_bh != ref_root_bh) { 2218 ret = ocfs2_remove_refcount_extent(handle, ci, ref_root_bh, 2219 ref_leaf_bh, meta_ac, 2220 dealloc); 2221 if (ret) 2222 mlog_errno(ret); 2223 } 2224 2225 out: 2226 return ret; 2227 } 2228 2229 static int __ocfs2_decrease_refcount(handle_t *handle, 2230 struct ocfs2_caching_info *ci, 2231 struct buffer_head *ref_root_bh, 2232 u64 cpos, u32 len, 2233 struct ocfs2_alloc_context *meta_ac, 2234 struct ocfs2_cached_dealloc_ctxt *dealloc, 2235 int delete) 2236 { 2237 int ret = 0, index = 0; 2238 struct ocfs2_refcount_rec rec; 2239 unsigned int r_count = 0, r_len; 2240 struct super_block *sb = ocfs2_metadata_cache_get_super(ci); 2241 struct buffer_head *ref_leaf_bh = NULL; 2242 2243 mlog(0, "Tree owner %llu, decrease refcount start %llu, " 2244 "len %u, delete %u\n", 2245 (unsigned long long)ocfs2_metadata_cache_owner(ci), 2246 (unsigned long long)cpos, len, delete); 2247 2248 while (len) { 2249 ret = ocfs2_get_refcount_rec(ci, ref_root_bh, 2250 cpos, len, &rec, &index, 2251 &ref_leaf_bh); 2252 if (ret) { 2253 mlog_errno(ret); 2254 goto out; 2255 } 2256 2257 r_count = le32_to_cpu(rec.r_refcount); 2258 BUG_ON(r_count == 0); 2259 if (!delete) 2260 BUG_ON(r_count > 1); 2261 2262 r_len = min((u64)(cpos + len), le64_to_cpu(rec.r_cpos) + 2263 le32_to_cpu(rec.r_clusters)) - cpos; 2264 2265 ret = ocfs2_decrease_refcount_rec(handle, ci, ref_root_bh, 2266 ref_leaf_bh, index, 2267 cpos, r_len, 2268 meta_ac, dealloc); 2269 if (ret) { 2270 mlog_errno(ret); 2271 goto out; 2272 } 2273 2274 if (le32_to_cpu(rec.r_refcount) == 1 && delete) { 2275 ret = ocfs2_cache_cluster_dealloc(dealloc, 2276 ocfs2_clusters_to_blocks(sb, cpos), 2277 r_len); 2278 if (ret) { 2279 mlog_errno(ret); 2280 goto out; 2281 } 2282 } 2283 2284 cpos += r_len; 2285 len -= r_len; 2286 brelse(ref_leaf_bh); 2287 ref_leaf_bh = NULL; 2288 } 2289 2290 out: 2291 brelse(ref_leaf_bh); 2292 return ret; 2293 } 2294 2295 /* Caller must hold refcount tree lock. */ 2296 int ocfs2_decrease_refcount(struct inode *inode, 2297 handle_t *handle, u32 cpos, u32 len, 2298 struct ocfs2_alloc_context *meta_ac, 2299 struct ocfs2_cached_dealloc_ctxt *dealloc, 2300 int delete) 2301 { 2302 int ret; 2303 u64 ref_blkno; 2304 struct ocfs2_inode_info *oi = OCFS2_I(inode); 2305 struct buffer_head *ref_root_bh = NULL; 2306 struct ocfs2_refcount_tree *tree; 2307 2308 BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); 2309 2310 ret = ocfs2_get_refcount_block(inode, &ref_blkno); 2311 if (ret) { 2312 mlog_errno(ret); 2313 goto out; 2314 } 2315 2316 ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb), ref_blkno, &tree); 2317 if (ret) { 2318 mlog_errno(ret); 2319 goto out; 2320 } 2321 2322 ret = ocfs2_read_refcount_block(&tree->rf_ci, tree->rf_blkno, 2323 &ref_root_bh); 2324 if (ret) { 2325 mlog_errno(ret); 2326 goto out; 2327 } 2328 2329 ret = __ocfs2_decrease_refcount(handle, &tree->rf_ci, ref_root_bh, 2330 cpos, len, meta_ac, dealloc, delete); 2331 if (ret) 2332 mlog_errno(ret); 2333 out: 2334 brelse(ref_root_bh); 2335 return ret; 2336 } 2337 2338 /* 2339 * Mark the already-existing extent at cpos as refcounted for len clusters. 2340 * This adds the refcount extent flag. 2341 * 2342 * If the existing extent is larger than the request, initiate a 2343 * split. An attempt will be made at merging with adjacent extents. 2344 * 2345 * The caller is responsible for passing down meta_ac if we'll need it. 2346 */ 2347 static int ocfs2_mark_extent_refcounted(struct inode *inode, 2348 struct ocfs2_extent_tree *et, 2349 handle_t *handle, u32 cpos, 2350 u32 len, u32 phys, 2351 struct ocfs2_alloc_context *meta_ac, 2352 struct ocfs2_cached_dealloc_ctxt *dealloc) 2353 { 2354 int ret; 2355 2356 mlog(0, "Inode %lu refcount tree cpos %u, len %u, phys cluster %u\n", 2357 inode->i_ino, cpos, len, phys); 2358 2359 if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { 2360 ocfs2_error(inode->i_sb, "Inode %lu want to use refcount " 2361 "tree, but the feature bit is not set in the " 2362 "super block.", inode->i_ino); 2363 ret = -EROFS; 2364 goto out; 2365 } 2366 2367 ret = ocfs2_change_extent_flag(handle, et, cpos, 2368 len, phys, meta_ac, dealloc, 2369 OCFS2_EXT_REFCOUNTED, 0); 2370 if (ret) 2371 mlog_errno(ret); 2372 2373 out: 2374 return ret; 2375 } 2376 2377 /* 2378 * Given some contiguous physical clusters, calculate what we need 2379 * for modifying their refcount. 2380 */ 2381 static int ocfs2_calc_refcount_meta_credits(struct super_block *sb, 2382 struct ocfs2_caching_info *ci, 2383 struct buffer_head *ref_root_bh, 2384 u64 start_cpos, 2385 u32 clusters, 2386 int *meta_add, 2387 int *credits) 2388 { 2389 int ret = 0, index, ref_blocks = 0, recs_add = 0; 2390 u64 cpos = start_cpos; 2391 struct ocfs2_refcount_block *rb; 2392 struct ocfs2_refcount_rec rec; 2393 struct buffer_head *ref_leaf_bh = NULL, *prev_bh = NULL; 2394 u32 len; 2395 2396 mlog(0, "start_cpos %llu, clusters %u\n", 2397 (unsigned long long)start_cpos, clusters); 2398 while (clusters) { 2399 ret = ocfs2_get_refcount_rec(ci, ref_root_bh, 2400 cpos, clusters, &rec, 2401 &index, &ref_leaf_bh); 2402 if (ret) { 2403 mlog_errno(ret); 2404 goto out; 2405 } 2406 2407 if (ref_leaf_bh != prev_bh) { 2408 /* 2409 * Now we encounter a new leaf block, so calculate 2410 * whether we need to extend the old leaf. 2411 */ 2412 if (prev_bh) { 2413 rb = (struct ocfs2_refcount_block *) 2414 prev_bh->b_data; 2415 2416 if (le64_to_cpu(rb->rf_records.rl_used) + 2417 recs_add > 2418 le16_to_cpu(rb->rf_records.rl_count)) 2419 ref_blocks++; 2420 } 2421 2422 recs_add = 0; 2423 *credits += 1; 2424 brelse(prev_bh); 2425 prev_bh = ref_leaf_bh; 2426 get_bh(prev_bh); 2427 } 2428 2429 rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; 2430 2431 mlog(0, "recs_add %d,cpos %llu, clusters %u, rec->r_cpos %llu," 2432 "rec->r_clusters %u, rec->r_refcount %u, index %d\n", 2433 recs_add, (unsigned long long)cpos, clusters, 2434 (unsigned long long)le64_to_cpu(rec.r_cpos), 2435 le32_to_cpu(rec.r_clusters), 2436 le32_to_cpu(rec.r_refcount), index); 2437 2438 len = min((u64)cpos + clusters, le64_to_cpu(rec.r_cpos) + 2439 le32_to_cpu(rec.r_clusters)) - cpos; 2440 /* 2441 * If the refcount rec already exist, cool. We just need 2442 * to check whether there is a split. Otherwise we just need 2443 * to increase the refcount. 2444 * If we will insert one, increases recs_add. 2445 * 2446 * We record all the records which will be inserted to the 2447 * same refcount block, so that we can tell exactly whether 2448 * we need a new refcount block or not. 2449 */ 2450 if (rec.r_refcount) { 2451 /* Check whether we need a split at the beginning. */ 2452 if (cpos == start_cpos && 2453 cpos != le64_to_cpu(rec.r_cpos)) 2454 recs_add++; 2455 2456 /* Check whether we need a split in the end. */ 2457 if (cpos + clusters < le64_to_cpu(rec.r_cpos) + 2458 le32_to_cpu(rec.r_clusters)) 2459 recs_add++; 2460 } else 2461 recs_add++; 2462 2463 brelse(ref_leaf_bh); 2464 ref_leaf_bh = NULL; 2465 clusters -= len; 2466 cpos += len; 2467 } 2468 2469 if (prev_bh) { 2470 rb = (struct ocfs2_refcount_block *)prev_bh->b_data; 2471 2472 if (le64_to_cpu(rb->rf_records.rl_used) + recs_add > 2473 le16_to_cpu(rb->rf_records.rl_count)) 2474 ref_blocks++; 2475 2476 *credits += 1; 2477 } 2478 2479 if (!ref_blocks) 2480 goto out; 2481 2482 mlog(0, "we need ref_blocks %d\n", ref_blocks); 2483 *meta_add += ref_blocks; 2484 *credits += ref_blocks; 2485 2486 /* 2487 * So we may need ref_blocks to insert into the tree. 2488 * That also means we need to change the b-tree and add that number 2489 * of records since we never merge them. 2490 * We need one more block for expansion since the new created leaf 2491 * block is also full and needs split. 2492 */ 2493 rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; 2494 if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL) { 2495 struct ocfs2_extent_tree et; 2496 2497 ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh); 2498 *meta_add += ocfs2_extend_meta_needed(et.et_root_el); 2499 *credits += ocfs2_calc_extend_credits(sb, 2500 et.et_root_el, 2501 ref_blocks); 2502 } else { 2503 *credits += OCFS2_EXPAND_REFCOUNT_TREE_CREDITS; 2504 *meta_add += 1; 2505 } 2506 2507 out: 2508 brelse(ref_leaf_bh); 2509 brelse(prev_bh); 2510 return ret; 2511 } 2512 2513 /* 2514 * For refcount tree, we will decrease some contiguous clusters 2515 * refcount count, so just go through it to see how many blocks 2516 * we gonna touch and whether we need to create new blocks. 2517 * 2518 * Normally the refcount blocks store these refcount should be 2519 * contiguous also, so that we can get the number easily. 2520 * As for meta_ac, we will at most add split 2 refcount record and 2521 * 2 more refcount block, so just check it in a rough way. 2522 * 2523 * Caller must hold refcount tree lock. 2524 */ 2525 int ocfs2_prepare_refcount_change_for_del(struct inode *inode, 2526 struct buffer_head *di_bh, 2527 u64 phys_blkno, 2528 u32 clusters, 2529 int *credits, 2530 struct ocfs2_alloc_context **meta_ac) 2531 { 2532 int ret, ref_blocks = 0; 2533 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 2534 struct ocfs2_inode_info *oi = OCFS2_I(inode); 2535 struct buffer_head *ref_root_bh = NULL; 2536 struct ocfs2_refcount_tree *tree; 2537 u64 start_cpos = ocfs2_blocks_to_clusters(inode->i_sb, phys_blkno); 2538 2539 if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { 2540 ocfs2_error(inode->i_sb, "Inode %lu want to use refcount " 2541 "tree, but the feature bit is not set in the " 2542 "super block.", inode->i_ino); 2543 ret = -EROFS; 2544 goto out; 2545 } 2546 2547 BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); 2548 2549 ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb), 2550 le64_to_cpu(di->i_refcount_loc), &tree); 2551 if (ret) { 2552 mlog_errno(ret); 2553 goto out; 2554 } 2555 2556 ret = ocfs2_read_refcount_block(&tree->rf_ci, 2557 le64_to_cpu(di->i_refcount_loc), 2558 &ref_root_bh); 2559 if (ret) { 2560 mlog_errno(ret); 2561 goto out; 2562 } 2563 2564 ret = ocfs2_calc_refcount_meta_credits(inode->i_sb, 2565 &tree->rf_ci, 2566 ref_root_bh, 2567 start_cpos, clusters, 2568 &ref_blocks, credits); 2569 if (ret) { 2570 mlog_errno(ret); 2571 goto out; 2572 } 2573 2574 mlog(0, "reserve new metadata %d, credits = %d\n", 2575 ref_blocks, *credits); 2576 2577 if (ref_blocks) { 2578 ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb), 2579 ref_blocks, meta_ac); 2580 if (ret) 2581 mlog_errno(ret); 2582 } 2583 2584 out: 2585 brelse(ref_root_bh); 2586 return ret; 2587 } 2588 2589 #define MAX_CONTIG_BYTES 1048576 2590 2591 static inline unsigned int ocfs2_cow_contig_clusters(struct super_block *sb) 2592 { 2593 return ocfs2_clusters_for_bytes(sb, MAX_CONTIG_BYTES); 2594 } 2595 2596 static inline unsigned int ocfs2_cow_contig_mask(struct super_block *sb) 2597 { 2598 return ~(ocfs2_cow_contig_clusters(sb) - 1); 2599 } 2600 2601 /* 2602 * Given an extent that starts at 'start' and an I/O that starts at 'cpos', 2603 * find an offset (start + (n * contig_clusters)) that is closest to cpos 2604 * while still being less than or equal to it. 2605 * 2606 * The goal is to break the extent at a multiple of contig_clusters. 2607 */ 2608 static inline unsigned int ocfs2_cow_align_start(struct super_block *sb, 2609 unsigned int start, 2610 unsigned int cpos) 2611 { 2612 BUG_ON(start > cpos); 2613 2614 return start + ((cpos - start) & ocfs2_cow_contig_mask(sb)); 2615 } 2616 2617 /* 2618 * Given a cluster count of len, pad it out so that it is a multiple 2619 * of contig_clusters. 2620 */ 2621 static inline unsigned int ocfs2_cow_align_length(struct super_block *sb, 2622 unsigned int len) 2623 { 2624 unsigned int padded = 2625 (len + (ocfs2_cow_contig_clusters(sb) - 1)) & 2626 ocfs2_cow_contig_mask(sb); 2627 2628 /* Did we wrap? */ 2629 if (padded < len) 2630 padded = UINT_MAX; 2631 2632 return padded; 2633 } 2634 2635 /* 2636 * Calculate out the start and number of virtual clusters we need to to CoW. 2637 * 2638 * cpos is vitual start cluster position we want to do CoW in a 2639 * file and write_len is the cluster length. 2640 * max_cpos is the place where we want to stop CoW intentionally. 2641 * 2642 * Normal we will start CoW from the beginning of extent record cotaining cpos. 2643 * We try to break up extents on boundaries of MAX_CONTIG_BYTES so that we 2644 * get good I/O from the resulting extent tree. 2645 */ 2646 static int ocfs2_refcount_cal_cow_clusters(struct inode *inode, 2647 struct ocfs2_extent_list *el, 2648 u32 cpos, 2649 u32 write_len, 2650 u32 max_cpos, 2651 u32 *cow_start, 2652 u32 *cow_len) 2653 { 2654 int ret = 0; 2655 int tree_height = le16_to_cpu(el->l_tree_depth), i; 2656 struct buffer_head *eb_bh = NULL; 2657 struct ocfs2_extent_block *eb = NULL; 2658 struct ocfs2_extent_rec *rec; 2659 unsigned int want_clusters, rec_end = 0; 2660 int contig_clusters = ocfs2_cow_contig_clusters(inode->i_sb); 2661 int leaf_clusters; 2662 2663 BUG_ON(cpos + write_len > max_cpos); 2664 2665 if (tree_height > 0) { 2666 ret = ocfs2_find_leaf(INODE_CACHE(inode), el, cpos, &eb_bh); 2667 if (ret) { 2668 mlog_errno(ret); 2669 goto out; 2670 } 2671 2672 eb = (struct ocfs2_extent_block *) eb_bh->b_data; 2673 el = &eb->h_list; 2674 2675 if (el->l_tree_depth) { 2676 ocfs2_error(inode->i_sb, 2677 "Inode %lu has non zero tree depth in " 2678 "leaf block %llu\n", inode->i_ino, 2679 (unsigned long long)eb_bh->b_blocknr); 2680 ret = -EROFS; 2681 goto out; 2682 } 2683 } 2684 2685 *cow_len = 0; 2686 for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { 2687 rec = &el->l_recs[i]; 2688 2689 if (ocfs2_is_empty_extent(rec)) { 2690 mlog_bug_on_msg(i != 0, "Inode %lu has empty record in " 2691 "index %d\n", inode->i_ino, i); 2692 continue; 2693 } 2694 2695 if (le32_to_cpu(rec->e_cpos) + 2696 le16_to_cpu(rec->e_leaf_clusters) <= cpos) 2697 continue; 2698 2699 if (*cow_len == 0) { 2700 /* 2701 * We should find a refcounted record in the 2702 * first pass. 2703 */ 2704 BUG_ON(!(rec->e_flags & OCFS2_EXT_REFCOUNTED)); 2705 *cow_start = le32_to_cpu(rec->e_cpos); 2706 } 2707 2708 /* 2709 * If we encounter a hole, a non-refcounted record or 2710 * pass the max_cpos, stop the search. 2711 */ 2712 if ((!(rec->e_flags & OCFS2_EXT_REFCOUNTED)) || 2713 (*cow_len && rec_end != le32_to_cpu(rec->e_cpos)) || 2714 (max_cpos <= le32_to_cpu(rec->e_cpos))) 2715 break; 2716 2717 leaf_clusters = le16_to_cpu(rec->e_leaf_clusters); 2718 rec_end = le32_to_cpu(rec->e_cpos) + leaf_clusters; 2719 if (rec_end > max_cpos) { 2720 rec_end = max_cpos; 2721 leaf_clusters = rec_end - le32_to_cpu(rec->e_cpos); 2722 } 2723 2724 /* 2725 * How many clusters do we actually need from 2726 * this extent? First we see how many we actually 2727 * need to complete the write. If that's smaller 2728 * than contig_clusters, we try for contig_clusters. 2729 */ 2730 if (!*cow_len) 2731 want_clusters = write_len; 2732 else 2733 want_clusters = (cpos + write_len) - 2734 (*cow_start + *cow_len); 2735 if (want_clusters < contig_clusters) 2736 want_clusters = contig_clusters; 2737 2738 /* 2739 * If the write does not cover the whole extent, we 2740 * need to calculate how we're going to split the extent. 2741 * We try to do it on contig_clusters boundaries. 2742 * 2743 * Any extent smaller than contig_clusters will be 2744 * CoWed in its entirety. 2745 */ 2746 if (leaf_clusters <= contig_clusters) 2747 *cow_len += leaf_clusters; 2748 else if (*cow_len || (*cow_start == cpos)) { 2749 /* 2750 * This extent needs to be CoW'd from its 2751 * beginning, so all we have to do is compute 2752 * how many clusters to grab. We align 2753 * want_clusters to the edge of contig_clusters 2754 * to get better I/O. 2755 */ 2756 want_clusters = ocfs2_cow_align_length(inode->i_sb, 2757 want_clusters); 2758 2759 if (leaf_clusters < want_clusters) 2760 *cow_len += leaf_clusters; 2761 else 2762 *cow_len += want_clusters; 2763 } else if ((*cow_start + contig_clusters) >= 2764 (cpos + write_len)) { 2765 /* 2766 * Breaking off contig_clusters at the front 2767 * of the extent will cover our write. That's 2768 * easy. 2769 */ 2770 *cow_len = contig_clusters; 2771 } else if ((rec_end - cpos) <= contig_clusters) { 2772 /* 2773 * Breaking off contig_clusters at the tail of 2774 * this extent will cover cpos. 2775 */ 2776 *cow_start = rec_end - contig_clusters; 2777 *cow_len = contig_clusters; 2778 } else if ((rec_end - cpos) <= want_clusters) { 2779 /* 2780 * While we can't fit the entire write in this 2781 * extent, we know that the write goes from cpos 2782 * to the end of the extent. Break that off. 2783 * We try to break it at some multiple of 2784 * contig_clusters from the front of the extent. 2785 * Failing that (ie, cpos is within 2786 * contig_clusters of the front), we'll CoW the 2787 * entire extent. 2788 */ 2789 *cow_start = ocfs2_cow_align_start(inode->i_sb, 2790 *cow_start, cpos); 2791 *cow_len = rec_end - *cow_start; 2792 } else { 2793 /* 2794 * Ok, the entire write lives in the middle of 2795 * this extent. Let's try to slice the extent up 2796 * nicely. Optimally, our CoW region starts at 2797 * m*contig_clusters from the beginning of the 2798 * extent and goes for n*contig_clusters, 2799 * covering the entire write. 2800 */ 2801 *cow_start = ocfs2_cow_align_start(inode->i_sb, 2802 *cow_start, cpos); 2803 2804 want_clusters = (cpos + write_len) - *cow_start; 2805 want_clusters = ocfs2_cow_align_length(inode->i_sb, 2806 want_clusters); 2807 if (*cow_start + want_clusters <= rec_end) 2808 *cow_len = want_clusters; 2809 else 2810 *cow_len = rec_end - *cow_start; 2811 } 2812 2813 /* Have we covered our entire write yet? */ 2814 if ((*cow_start + *cow_len) >= (cpos + write_len)) 2815 break; 2816 2817 /* 2818 * If we reach the end of the extent block and don't get enough 2819 * clusters, continue with the next extent block if possible. 2820 */ 2821 if (i + 1 == le16_to_cpu(el->l_next_free_rec) && 2822 eb && eb->h_next_leaf_blk) { 2823 brelse(eb_bh); 2824 eb_bh = NULL; 2825 2826 ret = ocfs2_read_extent_block(INODE_CACHE(inode), 2827 le64_to_cpu(eb->h_next_leaf_blk), 2828 &eb_bh); 2829 if (ret) { 2830 mlog_errno(ret); 2831 goto out; 2832 } 2833 2834 eb = (struct ocfs2_extent_block *) eb_bh->b_data; 2835 el = &eb->h_list; 2836 i = -1; 2837 } 2838 } 2839 2840 out: 2841 brelse(eb_bh); 2842 return ret; 2843 } 2844 2845 /* 2846 * Prepare meta_ac, data_ac and calculate credits when we want to add some 2847 * num_clusters in data_tree "et" and change the refcount for the old 2848 * clusters(starting form p_cluster) in the refcount tree. 2849 * 2850 * Note: 2851 * 1. since we may split the old tree, so we at most will need num_clusters + 2 2852 * more new leaf records. 2853 * 2. In some case, we may not need to reserve new clusters(e.g, reflink), so 2854 * just give data_ac = NULL. 2855 */ 2856 static int ocfs2_lock_refcount_allocators(struct super_block *sb, 2857 u32 p_cluster, u32 num_clusters, 2858 struct ocfs2_extent_tree *et, 2859 struct ocfs2_caching_info *ref_ci, 2860 struct buffer_head *ref_root_bh, 2861 struct ocfs2_alloc_context **meta_ac, 2862 struct ocfs2_alloc_context **data_ac, 2863 int *credits) 2864 { 2865 int ret = 0, meta_add = 0; 2866 int num_free_extents = ocfs2_num_free_extents(OCFS2_SB(sb), et); 2867 2868 if (num_free_extents < 0) { 2869 ret = num_free_extents; 2870 mlog_errno(ret); 2871 goto out; 2872 } 2873 2874 if (num_free_extents < num_clusters + 2) 2875 meta_add = 2876 ocfs2_extend_meta_needed(et->et_root_el); 2877 2878 *credits += ocfs2_calc_extend_credits(sb, et->et_root_el, 2879 num_clusters + 2); 2880 2881 ret = ocfs2_calc_refcount_meta_credits(sb, ref_ci, ref_root_bh, 2882 p_cluster, num_clusters, 2883 &meta_add, credits); 2884 if (ret) { 2885 mlog_errno(ret); 2886 goto out; 2887 } 2888 2889 mlog(0, "reserve new metadata %d, clusters %u, credits = %d\n", 2890 meta_add, num_clusters, *credits); 2891 ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(sb), meta_add, 2892 meta_ac); 2893 if (ret) { 2894 mlog_errno(ret); 2895 goto out; 2896 } 2897 2898 if (data_ac) { 2899 ret = ocfs2_reserve_clusters(OCFS2_SB(sb), num_clusters, 2900 data_ac); 2901 if (ret) 2902 mlog_errno(ret); 2903 } 2904 2905 out: 2906 if (ret) { 2907 if (*meta_ac) { 2908 ocfs2_free_alloc_context(*meta_ac); 2909 *meta_ac = NULL; 2910 } 2911 } 2912 2913 return ret; 2914 } 2915 2916 static int ocfs2_clear_cow_buffer(handle_t *handle, struct buffer_head *bh) 2917 { 2918 BUG_ON(buffer_dirty(bh)); 2919 2920 clear_buffer_mapped(bh); 2921 2922 return 0; 2923 } 2924 2925 static int ocfs2_duplicate_clusters_by_page(handle_t *handle, 2926 struct ocfs2_cow_context *context, 2927 u32 cpos, u32 old_cluster, 2928 u32 new_cluster, u32 new_len) 2929 { 2930 int ret = 0, partial; 2931 struct ocfs2_caching_info *ci = context->data_et.et_ci; 2932 struct super_block *sb = ocfs2_metadata_cache_get_super(ci); 2933 u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster); 2934 struct page *page; 2935 pgoff_t page_index; 2936 unsigned int from, to; 2937 loff_t offset, end, map_end; 2938 struct address_space *mapping = context->inode->i_mapping; 2939 2940 mlog(0, "old_cluster %u, new %u, len %u at offset %u\n", old_cluster, 2941 new_cluster, new_len, cpos); 2942 2943 offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits; 2944 end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits); 2945 2946 while (offset < end) { 2947 page_index = offset >> PAGE_CACHE_SHIFT; 2948 map_end = ((loff_t)page_index + 1) << PAGE_CACHE_SHIFT; 2949 if (map_end > end) 2950 map_end = end; 2951 2952 /* from, to is the offset within the page. */ 2953 from = offset & (PAGE_CACHE_SIZE - 1); 2954 to = PAGE_CACHE_SIZE; 2955 if (map_end & (PAGE_CACHE_SIZE - 1)) 2956 to = map_end & (PAGE_CACHE_SIZE - 1); 2957 2958 page = grab_cache_page(mapping, page_index); 2959 2960 /* 2961 * In case PAGE_CACHE_SIZE <= CLUSTER_SIZE, This page 2962 * can't be dirtied before we CoW it out. 2963 */ 2964 if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize) 2965 BUG_ON(PageDirty(page)); 2966 2967 if (!PageUptodate(page)) { 2968 ret = block_read_full_page(page, ocfs2_get_block); 2969 if (ret) { 2970 mlog_errno(ret); 2971 goto unlock; 2972 } 2973 lock_page(page); 2974 } 2975 2976 if (page_has_buffers(page)) { 2977 ret = walk_page_buffers(handle, page_buffers(page), 2978 from, to, &partial, 2979 ocfs2_clear_cow_buffer); 2980 if (ret) { 2981 mlog_errno(ret); 2982 goto unlock; 2983 } 2984 } 2985 2986 ocfs2_map_and_dirty_page(context->inode, 2987 handle, from, to, 2988 page, 0, &new_block); 2989 mark_page_accessed(page); 2990 unlock: 2991 unlock_page(page); 2992 page_cache_release(page); 2993 page = NULL; 2994 offset = map_end; 2995 if (ret) 2996 break; 2997 } 2998 2999 return ret; 3000 } 3001 3002 static int ocfs2_duplicate_clusters_by_jbd(handle_t *handle, 3003 struct ocfs2_cow_context *context, 3004 u32 cpos, u32 old_cluster, 3005 u32 new_cluster, u32 new_len) 3006 { 3007 int ret = 0; 3008 struct super_block *sb = context->inode->i_sb; 3009 struct ocfs2_caching_info *ci = context->data_et.et_ci; 3010 int i, blocks = ocfs2_clusters_to_blocks(sb, new_len); 3011 u64 old_block = ocfs2_clusters_to_blocks(sb, old_cluster); 3012 u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster); 3013 struct ocfs2_super *osb = OCFS2_SB(sb); 3014 struct buffer_head *old_bh = NULL; 3015 struct buffer_head *new_bh = NULL; 3016 3017 mlog(0, "old_cluster %u, new %u, len %u\n", old_cluster, 3018 new_cluster, new_len); 3019 3020 for (i = 0; i < blocks; i++, old_block++, new_block++) { 3021 new_bh = sb_getblk(osb->sb, new_block); 3022 if (new_bh == NULL) { 3023 ret = -EIO; 3024 mlog_errno(ret); 3025 break; 3026 } 3027 3028 ocfs2_set_new_buffer_uptodate(ci, new_bh); 3029 3030 ret = ocfs2_read_block(ci, old_block, &old_bh, NULL); 3031 if (ret) { 3032 mlog_errno(ret); 3033 break; 3034 } 3035 3036 ret = ocfs2_journal_access(handle, ci, new_bh, 3037 OCFS2_JOURNAL_ACCESS_CREATE); 3038 if (ret) { 3039 mlog_errno(ret); 3040 break; 3041 } 3042 3043 memcpy(new_bh->b_data, old_bh->b_data, sb->s_blocksize); 3044 ret = ocfs2_journal_dirty(handle, new_bh); 3045 if (ret) { 3046 mlog_errno(ret); 3047 break; 3048 } 3049 3050 brelse(new_bh); 3051 brelse(old_bh); 3052 new_bh = NULL; 3053 old_bh = NULL; 3054 } 3055 3056 brelse(new_bh); 3057 brelse(old_bh); 3058 return ret; 3059 } 3060 3061 static int ocfs2_clear_ext_refcount(handle_t *handle, 3062 struct ocfs2_extent_tree *et, 3063 u32 cpos, u32 p_cluster, u32 len, 3064 unsigned int ext_flags, 3065 struct ocfs2_alloc_context *meta_ac, 3066 struct ocfs2_cached_dealloc_ctxt *dealloc) 3067 { 3068 int ret, index; 3069 struct ocfs2_extent_rec replace_rec; 3070 struct ocfs2_path *path = NULL; 3071 struct ocfs2_extent_list *el; 3072 struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); 3073 u64 ino = ocfs2_metadata_cache_owner(et->et_ci); 3074 3075 mlog(0, "inode %llu cpos %u, len %u, p_cluster %u, ext_flags %u\n", 3076 (unsigned long long)ino, cpos, len, p_cluster, ext_flags); 3077 3078 memset(&replace_rec, 0, sizeof(replace_rec)); 3079 replace_rec.e_cpos = cpu_to_le32(cpos); 3080 replace_rec.e_leaf_clusters = cpu_to_le16(len); 3081 replace_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(sb, 3082 p_cluster)); 3083 replace_rec.e_flags = ext_flags; 3084 replace_rec.e_flags &= ~OCFS2_EXT_REFCOUNTED; 3085 3086 path = ocfs2_new_path_from_et(et); 3087 if (!path) { 3088 ret = -ENOMEM; 3089 mlog_errno(ret); 3090 goto out; 3091 } 3092 3093 ret = ocfs2_find_path(et->et_ci, path, cpos); 3094 if (ret) { 3095 mlog_errno(ret); 3096 goto out; 3097 } 3098 3099 el = path_leaf_el(path); 3100 3101 index = ocfs2_search_extent_list(el, cpos); 3102 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { 3103 ocfs2_error(sb, 3104 "Inode %llu has an extent at cpos %u which can no " 3105 "longer be found.\n", 3106 (unsigned long long)ino, cpos); 3107 ret = -EROFS; 3108 goto out; 3109 } 3110 3111 ret = ocfs2_split_extent(handle, et, path, index, 3112 &replace_rec, meta_ac, dealloc); 3113 if (ret) 3114 mlog_errno(ret); 3115 3116 out: 3117 ocfs2_free_path(path); 3118 return ret; 3119 } 3120 3121 static int ocfs2_replace_clusters(handle_t *handle, 3122 struct ocfs2_cow_context *context, 3123 u32 cpos, u32 old, 3124 u32 new, u32 len, 3125 unsigned int ext_flags) 3126 { 3127 int ret; 3128 struct ocfs2_caching_info *ci = context->data_et.et_ci; 3129 u64 ino = ocfs2_metadata_cache_owner(ci); 3130 3131 mlog(0, "inode %llu, cpos %u, old %u, new %u, len %u, ext_flags %u\n", 3132 (unsigned long long)ino, cpos, old, new, len, ext_flags); 3133 3134 /*If the old clusters is unwritten, no need to duplicate. */ 3135 if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) { 3136 ret = context->cow_duplicate_clusters(handle, context, cpos, 3137 old, new, len); 3138 if (ret) { 3139 mlog_errno(ret); 3140 goto out; 3141 } 3142 } 3143 3144 ret = ocfs2_clear_ext_refcount(handle, &context->data_et, 3145 cpos, new, len, ext_flags, 3146 context->meta_ac, &context->dealloc); 3147 if (ret) 3148 mlog_errno(ret); 3149 out: 3150 return ret; 3151 } 3152 3153 static int ocfs2_cow_sync_writeback(struct super_block *sb, 3154 struct ocfs2_cow_context *context, 3155 u32 cpos, u32 num_clusters) 3156 { 3157 int ret = 0; 3158 loff_t offset, end, map_end; 3159 pgoff_t page_index; 3160 struct page *page; 3161 3162 if (ocfs2_should_order_data(context->inode)) 3163 return 0; 3164 3165 offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits; 3166 end = offset + (num_clusters << OCFS2_SB(sb)->s_clustersize_bits); 3167 3168 ret = filemap_fdatawrite_range(context->inode->i_mapping, 3169 offset, end - 1); 3170 if (ret < 0) { 3171 mlog_errno(ret); 3172 return ret; 3173 } 3174 3175 while (offset < end) { 3176 page_index = offset >> PAGE_CACHE_SHIFT; 3177 map_end = ((loff_t)page_index + 1) << PAGE_CACHE_SHIFT; 3178 if (map_end > end) 3179 map_end = end; 3180 3181 page = grab_cache_page(context->inode->i_mapping, page_index); 3182 BUG_ON(!page); 3183 3184 wait_on_page_writeback(page); 3185 if (PageError(page)) { 3186 ret = -EIO; 3187 mlog_errno(ret); 3188 } else 3189 mark_page_accessed(page); 3190 3191 unlock_page(page); 3192 page_cache_release(page); 3193 page = NULL; 3194 offset = map_end; 3195 if (ret) 3196 break; 3197 } 3198 3199 return ret; 3200 } 3201 3202 static int ocfs2_di_get_clusters(struct ocfs2_cow_context *context, 3203 u32 v_cluster, u32 *p_cluster, 3204 u32 *num_clusters, 3205 unsigned int *extent_flags) 3206 { 3207 return ocfs2_get_clusters(context->inode, v_cluster, p_cluster, 3208 num_clusters, extent_flags); 3209 } 3210 3211 static int ocfs2_make_clusters_writable(struct super_block *sb, 3212 struct ocfs2_cow_context *context, 3213 u32 cpos, u32 p_cluster, 3214 u32 num_clusters, unsigned int e_flags) 3215 { 3216 int ret, delete, index, credits = 0; 3217 u32 new_bit, new_len; 3218 unsigned int set_len; 3219 struct ocfs2_super *osb = OCFS2_SB(sb); 3220 handle_t *handle; 3221 struct buffer_head *ref_leaf_bh = NULL; 3222 struct ocfs2_caching_info *ref_ci = &context->ref_tree->rf_ci; 3223 struct ocfs2_refcount_rec rec; 3224 3225 mlog(0, "cpos %u, p_cluster %u, num_clusters %u, e_flags %u\n", 3226 cpos, p_cluster, num_clusters, e_flags); 3227 3228 ret = ocfs2_lock_refcount_allocators(sb, p_cluster, num_clusters, 3229 &context->data_et, 3230 ref_ci, 3231 context->ref_root_bh, 3232 &context->meta_ac, 3233 &context->data_ac, &credits); 3234 if (ret) { 3235 mlog_errno(ret); 3236 return ret; 3237 } 3238 3239 if (context->post_refcount) 3240 credits += context->post_refcount->credits; 3241 3242 credits += context->extra_credits; 3243 handle = ocfs2_start_trans(osb, credits); 3244 if (IS_ERR(handle)) { 3245 ret = PTR_ERR(handle); 3246 mlog_errno(ret); 3247 goto out; 3248 } 3249 3250 while (num_clusters) { 3251 ret = ocfs2_get_refcount_rec(ref_ci, context->ref_root_bh, 3252 p_cluster, num_clusters, 3253 &rec, &index, &ref_leaf_bh); 3254 if (ret) { 3255 mlog_errno(ret); 3256 goto out_commit; 3257 } 3258 3259 BUG_ON(!rec.r_refcount); 3260 set_len = min((u64)p_cluster + num_clusters, 3261 le64_to_cpu(rec.r_cpos) + 3262 le32_to_cpu(rec.r_clusters)) - p_cluster; 3263 3264 /* 3265 * There are many different situation here. 3266 * 1. If refcount == 1, remove the flag and don't COW. 3267 * 2. If refcount > 1, allocate clusters. 3268 * Here we may not allocate r_len once at a time, so continue 3269 * until we reach num_clusters. 3270 */ 3271 if (le32_to_cpu(rec.r_refcount) == 1) { 3272 delete = 0; 3273 ret = ocfs2_clear_ext_refcount(handle, 3274 &context->data_et, 3275 cpos, p_cluster, 3276 set_len, e_flags, 3277 context->meta_ac, 3278 &context->dealloc); 3279 if (ret) { 3280 mlog_errno(ret); 3281 goto out_commit; 3282 } 3283 } else { 3284 delete = 1; 3285 3286 ret = __ocfs2_claim_clusters(osb, handle, 3287 context->data_ac, 3288 1, set_len, 3289 &new_bit, &new_len); 3290 if (ret) { 3291 mlog_errno(ret); 3292 goto out_commit; 3293 } 3294 3295 ret = ocfs2_replace_clusters(handle, context, 3296 cpos, p_cluster, new_bit, 3297 new_len, e_flags); 3298 if (ret) { 3299 mlog_errno(ret); 3300 goto out_commit; 3301 } 3302 set_len = new_len; 3303 } 3304 3305 ret = __ocfs2_decrease_refcount(handle, ref_ci, 3306 context->ref_root_bh, 3307 p_cluster, set_len, 3308 context->meta_ac, 3309 &context->dealloc, delete); 3310 if (ret) { 3311 mlog_errno(ret); 3312 goto out_commit; 3313 } 3314 3315 cpos += set_len; 3316 p_cluster += set_len; 3317 num_clusters -= set_len; 3318 brelse(ref_leaf_bh); 3319 ref_leaf_bh = NULL; 3320 } 3321 3322 /* handle any post_cow action. */ 3323 if (context->post_refcount && context->post_refcount->func) { 3324 ret = context->post_refcount->func(context->inode, handle, 3325 context->post_refcount->para); 3326 if (ret) { 3327 mlog_errno(ret); 3328 goto out_commit; 3329 } 3330 } 3331 3332 /* 3333 * Here we should write the new page out first if we are 3334 * in write-back mode. 3335 */ 3336 if (context->get_clusters == ocfs2_di_get_clusters) { 3337 ret = ocfs2_cow_sync_writeback(sb, context, cpos, num_clusters); 3338 if (ret) 3339 mlog_errno(ret); 3340 } 3341 3342 out_commit: 3343 ocfs2_commit_trans(osb, handle); 3344 3345 out: 3346 if (context->data_ac) { 3347 ocfs2_free_alloc_context(context->data_ac); 3348 context->data_ac = NULL; 3349 } 3350 if (context->meta_ac) { 3351 ocfs2_free_alloc_context(context->meta_ac); 3352 context->meta_ac = NULL; 3353 } 3354 brelse(ref_leaf_bh); 3355 3356 return ret; 3357 } 3358 3359 static int ocfs2_replace_cow(struct ocfs2_cow_context *context) 3360 { 3361 int ret = 0; 3362 struct inode *inode = context->inode; 3363 u32 cow_start = context->cow_start, cow_len = context->cow_len; 3364 u32 p_cluster, num_clusters; 3365 unsigned int ext_flags; 3366 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 3367 3368 if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { 3369 ocfs2_error(inode->i_sb, "Inode %lu want to use refcount " 3370 "tree, but the feature bit is not set in the " 3371 "super block.", inode->i_ino); 3372 return -EROFS; 3373 } 3374 3375 ocfs2_init_dealloc_ctxt(&context->dealloc); 3376 3377 while (cow_len) { 3378 ret = context->get_clusters(context, cow_start, &p_cluster, 3379 &num_clusters, &ext_flags); 3380 if (ret) { 3381 mlog_errno(ret); 3382 break; 3383 } 3384 3385 BUG_ON(!(ext_flags & OCFS2_EXT_REFCOUNTED)); 3386 3387 if (cow_len < num_clusters) 3388 num_clusters = cow_len; 3389 3390 ret = ocfs2_make_clusters_writable(inode->i_sb, context, 3391 cow_start, p_cluster, 3392 num_clusters, ext_flags); 3393 if (ret) { 3394 mlog_errno(ret); 3395 break; 3396 } 3397 3398 cow_len -= num_clusters; 3399 cow_start += num_clusters; 3400 } 3401 3402 if (ocfs2_dealloc_has_cluster(&context->dealloc)) { 3403 ocfs2_schedule_truncate_log_flush(osb, 1); 3404 ocfs2_run_deallocs(osb, &context->dealloc); 3405 } 3406 3407 return ret; 3408 } 3409 3410 /* 3411 * Starting at cpos, try to CoW write_len clusters. Don't CoW 3412 * past max_cpos. This will stop when it runs into a hole or an 3413 * unrefcounted extent. 3414 */ 3415 static int ocfs2_refcount_cow_hunk(struct inode *inode, 3416 struct buffer_head *di_bh, 3417 u32 cpos, u32 write_len, u32 max_cpos) 3418 { 3419 int ret; 3420 u32 cow_start = 0, cow_len = 0; 3421 struct ocfs2_inode_info *oi = OCFS2_I(inode); 3422 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 3423 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 3424 struct buffer_head *ref_root_bh = NULL; 3425 struct ocfs2_refcount_tree *ref_tree; 3426 struct ocfs2_cow_context *context = NULL; 3427 3428 BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); 3429 3430 ret = ocfs2_refcount_cal_cow_clusters(inode, &di->id2.i_list, 3431 cpos, write_len, max_cpos, 3432 &cow_start, &cow_len); 3433 if (ret) { 3434 mlog_errno(ret); 3435 goto out; 3436 } 3437 3438 mlog(0, "CoW inode %lu, cpos %u, write_len %u, cow_start %u, " 3439 "cow_len %u\n", inode->i_ino, 3440 cpos, write_len, cow_start, cow_len); 3441 3442 BUG_ON(cow_len == 0); 3443 3444 context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS); 3445 if (!context) { 3446 ret = -ENOMEM; 3447 mlog_errno(ret); 3448 goto out; 3449 } 3450 3451 ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc), 3452 1, &ref_tree, &ref_root_bh); 3453 if (ret) { 3454 mlog_errno(ret); 3455 goto out; 3456 } 3457 3458 context->inode = inode; 3459 context->cow_start = cow_start; 3460 context->cow_len = cow_len; 3461 context->ref_tree = ref_tree; 3462 context->ref_root_bh = ref_root_bh; 3463 context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_page; 3464 context->get_clusters = ocfs2_di_get_clusters; 3465 3466 ocfs2_init_dinode_extent_tree(&context->data_et, 3467 INODE_CACHE(inode), di_bh); 3468 3469 ret = ocfs2_replace_cow(context); 3470 if (ret) 3471 mlog_errno(ret); 3472 3473 /* 3474 * truncate the extent map here since no matter whether we meet with 3475 * any error during the action, we shouldn't trust cached extent map 3476 * any more. 3477 */ 3478 ocfs2_extent_map_trunc(inode, cow_start); 3479 3480 ocfs2_unlock_refcount_tree(osb, ref_tree, 1); 3481 brelse(ref_root_bh); 3482 out: 3483 kfree(context); 3484 return ret; 3485 } 3486 3487 /* 3488 * CoW any and all clusters between cpos and cpos+write_len. 3489 * Don't CoW past max_cpos. If this returns successfully, all 3490 * clusters between cpos and cpos+write_len are safe to modify. 3491 */ 3492 int ocfs2_refcount_cow(struct inode *inode, 3493 struct buffer_head *di_bh, 3494 u32 cpos, u32 write_len, u32 max_cpos) 3495 { 3496 int ret = 0; 3497 u32 p_cluster, num_clusters; 3498 unsigned int ext_flags; 3499 3500 while (write_len) { 3501 ret = ocfs2_get_clusters(inode, cpos, &p_cluster, 3502 &num_clusters, &ext_flags); 3503 if (ret) { 3504 mlog_errno(ret); 3505 break; 3506 } 3507 3508 if (write_len < num_clusters) 3509 num_clusters = write_len; 3510 3511 if (ext_flags & OCFS2_EXT_REFCOUNTED) { 3512 ret = ocfs2_refcount_cow_hunk(inode, di_bh, cpos, 3513 num_clusters, max_cpos); 3514 if (ret) { 3515 mlog_errno(ret); 3516 break; 3517 } 3518 } 3519 3520 write_len -= num_clusters; 3521 cpos += num_clusters; 3522 } 3523 3524 return ret; 3525 } 3526 3527 static int ocfs2_xattr_value_get_clusters(struct ocfs2_cow_context *context, 3528 u32 v_cluster, u32 *p_cluster, 3529 u32 *num_clusters, 3530 unsigned int *extent_flags) 3531 { 3532 struct inode *inode = context->inode; 3533 struct ocfs2_xattr_value_root *xv = context->cow_object; 3534 3535 return ocfs2_xattr_get_clusters(inode, v_cluster, p_cluster, 3536 num_clusters, &xv->xr_list, 3537 extent_flags); 3538 } 3539 3540 /* 3541 * Given a xattr value root, calculate the most meta/credits we need for 3542 * refcount tree change if we truncate it to 0. 3543 */ 3544 int ocfs2_refcounted_xattr_delete_need(struct inode *inode, 3545 struct ocfs2_caching_info *ref_ci, 3546 struct buffer_head *ref_root_bh, 3547 struct ocfs2_xattr_value_root *xv, 3548 int *meta_add, int *credits) 3549 { 3550 int ret = 0, index, ref_blocks = 0; 3551 u32 p_cluster, num_clusters; 3552 u32 cpos = 0, clusters = le32_to_cpu(xv->xr_clusters); 3553 struct ocfs2_refcount_block *rb; 3554 struct ocfs2_refcount_rec rec; 3555 struct buffer_head *ref_leaf_bh = NULL; 3556 3557 while (cpos < clusters) { 3558 ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster, 3559 &num_clusters, &xv->xr_list, 3560 NULL); 3561 if (ret) { 3562 mlog_errno(ret); 3563 goto out; 3564 } 3565 3566 cpos += num_clusters; 3567 3568 while (num_clusters) { 3569 ret = ocfs2_get_refcount_rec(ref_ci, ref_root_bh, 3570 p_cluster, num_clusters, 3571 &rec, &index, 3572 &ref_leaf_bh); 3573 if (ret) { 3574 mlog_errno(ret); 3575 goto out; 3576 } 3577 3578 BUG_ON(!rec.r_refcount); 3579 3580 rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; 3581 3582 /* 3583 * We really don't know whether the other clusters is in 3584 * this refcount block or not, so just take the worst 3585 * case that all the clusters are in this block and each 3586 * one will split a refcount rec, so totally we need 3587 * clusters * 2 new refcount rec. 3588 */ 3589 if (le64_to_cpu(rb->rf_records.rl_used) + clusters * 2 > 3590 le16_to_cpu(rb->rf_records.rl_count)) 3591 ref_blocks++; 3592 3593 *credits += 1; 3594 brelse(ref_leaf_bh); 3595 ref_leaf_bh = NULL; 3596 3597 if (num_clusters <= le32_to_cpu(rec.r_clusters)) 3598 break; 3599 else 3600 num_clusters -= le32_to_cpu(rec.r_clusters); 3601 p_cluster += num_clusters; 3602 } 3603 } 3604 3605 *meta_add += ref_blocks; 3606 if (!ref_blocks) 3607 goto out; 3608 3609 rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; 3610 if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL) 3611 *credits += OCFS2_EXPAND_REFCOUNT_TREE_CREDITS; 3612 else { 3613 struct ocfs2_extent_tree et; 3614 3615 ocfs2_init_refcount_extent_tree(&et, ref_ci, ref_root_bh); 3616 *credits += ocfs2_calc_extend_credits(inode->i_sb, 3617 et.et_root_el, 3618 ref_blocks); 3619 } 3620 3621 out: 3622 brelse(ref_leaf_bh); 3623 return ret; 3624 } 3625 3626 /* 3627 * Do CoW for xattr. 3628 */ 3629 int ocfs2_refcount_cow_xattr(struct inode *inode, 3630 struct ocfs2_dinode *di, 3631 struct ocfs2_xattr_value_buf *vb, 3632 struct ocfs2_refcount_tree *ref_tree, 3633 struct buffer_head *ref_root_bh, 3634 u32 cpos, u32 write_len, 3635 struct ocfs2_post_refcount *post) 3636 { 3637 int ret; 3638 struct ocfs2_xattr_value_root *xv = vb->vb_xv; 3639 struct ocfs2_inode_info *oi = OCFS2_I(inode); 3640 struct ocfs2_cow_context *context = NULL; 3641 u32 cow_start, cow_len; 3642 3643 BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); 3644 3645 ret = ocfs2_refcount_cal_cow_clusters(inode, &xv->xr_list, 3646 cpos, write_len, UINT_MAX, 3647 &cow_start, &cow_len); 3648 if (ret) { 3649 mlog_errno(ret); 3650 goto out; 3651 } 3652 3653 BUG_ON(cow_len == 0); 3654 3655 context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS); 3656 if (!context) { 3657 ret = -ENOMEM; 3658 mlog_errno(ret); 3659 goto out; 3660 } 3661 3662 context->inode = inode; 3663 context->cow_start = cow_start; 3664 context->cow_len = cow_len; 3665 context->ref_tree = ref_tree; 3666 context->ref_root_bh = ref_root_bh;; 3667 context->cow_object = xv; 3668 3669 context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_jbd; 3670 /* We need the extra credits for duplicate_clusters by jbd. */ 3671 context->extra_credits = 3672 ocfs2_clusters_to_blocks(inode->i_sb, 1) * cow_len; 3673 context->get_clusters = ocfs2_xattr_value_get_clusters; 3674 context->post_refcount = post; 3675 3676 ocfs2_init_xattr_value_extent_tree(&context->data_et, 3677 INODE_CACHE(inode), vb); 3678 3679 ret = ocfs2_replace_cow(context); 3680 if (ret) 3681 mlog_errno(ret); 3682 3683 out: 3684 kfree(context); 3685 return ret; 3686 } 3687 3688 /* 3689 * Insert a new extent into refcount tree and mark a extent rec 3690 * as refcounted in the dinode tree. 3691 */ 3692 int ocfs2_add_refcount_flag(struct inode *inode, 3693 struct ocfs2_extent_tree *data_et, 3694 struct ocfs2_caching_info *ref_ci, 3695 struct buffer_head *ref_root_bh, 3696 u32 cpos, u32 p_cluster, u32 num_clusters, 3697 struct ocfs2_cached_dealloc_ctxt *dealloc, 3698 struct ocfs2_post_refcount *post) 3699 { 3700 int ret; 3701 handle_t *handle; 3702 int credits = 1, ref_blocks = 0; 3703 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 3704 struct ocfs2_alloc_context *meta_ac = NULL; 3705 3706 ret = ocfs2_calc_refcount_meta_credits(inode->i_sb, 3707 ref_ci, ref_root_bh, 3708 p_cluster, num_clusters, 3709 &ref_blocks, &credits); 3710 if (ret) { 3711 mlog_errno(ret); 3712 goto out; 3713 } 3714 3715 mlog(0, "reserve new metadata %d, credits = %d\n", 3716 ref_blocks, credits); 3717 3718 if (ref_blocks) { 3719 ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb), 3720 ref_blocks, &meta_ac); 3721 if (ret) { 3722 mlog_errno(ret); 3723 goto out; 3724 } 3725 } 3726 3727 if (post) 3728 credits += post->credits; 3729 3730 handle = ocfs2_start_trans(osb, credits); 3731 if (IS_ERR(handle)) { 3732 ret = PTR_ERR(handle); 3733 mlog_errno(ret); 3734 goto out; 3735 } 3736 3737 ret = ocfs2_mark_extent_refcounted(inode, data_et, handle, 3738 cpos, num_clusters, p_cluster, 3739 meta_ac, dealloc); 3740 if (ret) { 3741 mlog_errno(ret); 3742 goto out_commit; 3743 } 3744 3745 ret = __ocfs2_increase_refcount(handle, ref_ci, ref_root_bh, 3746 p_cluster, num_clusters, 0, 3747 meta_ac, dealloc); 3748 if (ret) { 3749 mlog_errno(ret); 3750 goto out_commit; 3751 } 3752 3753 if (post && post->func) { 3754 ret = post->func(inode, handle, post->para); 3755 if (ret) 3756 mlog_errno(ret); 3757 } 3758 3759 out_commit: 3760 ocfs2_commit_trans(osb, handle); 3761 out: 3762 if (meta_ac) 3763 ocfs2_free_alloc_context(meta_ac); 3764 return ret; 3765 } 3766 3767 static int ocfs2_change_ctime(struct inode *inode, 3768 struct buffer_head *di_bh) 3769 { 3770 int ret; 3771 handle_t *handle; 3772 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 3773 3774 handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb), 3775 OCFS2_INODE_UPDATE_CREDITS); 3776 if (IS_ERR(handle)) { 3777 ret = PTR_ERR(handle); 3778 mlog_errno(ret); 3779 goto out; 3780 } 3781 3782 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, 3783 OCFS2_JOURNAL_ACCESS_WRITE); 3784 if (ret) { 3785 mlog_errno(ret); 3786 goto out_commit; 3787 } 3788 3789 inode->i_ctime = CURRENT_TIME; 3790 di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); 3791 di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); 3792 3793 ocfs2_journal_dirty(handle, di_bh); 3794 3795 out_commit: 3796 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); 3797 out: 3798 return ret; 3799 } 3800 3801 static int ocfs2_attach_refcount_tree(struct inode *inode, 3802 struct buffer_head *di_bh) 3803 { 3804 int ret, data_changed = 0; 3805 struct buffer_head *ref_root_bh = NULL; 3806 struct ocfs2_inode_info *oi = OCFS2_I(inode); 3807 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 3808 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 3809 struct ocfs2_refcount_tree *ref_tree; 3810 unsigned int ext_flags; 3811 loff_t size; 3812 u32 cpos, num_clusters, clusters, p_cluster; 3813 struct ocfs2_cached_dealloc_ctxt dealloc; 3814 struct ocfs2_extent_tree di_et; 3815 3816 ocfs2_init_dealloc_ctxt(&dealloc); 3817 3818 if (!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)) { 3819 ret = ocfs2_create_refcount_tree(inode, di_bh); 3820 if (ret) { 3821 mlog_errno(ret); 3822 goto out; 3823 } 3824 } 3825 3826 BUG_ON(!di->i_refcount_loc); 3827 ret = ocfs2_lock_refcount_tree(osb, 3828 le64_to_cpu(di->i_refcount_loc), 1, 3829 &ref_tree, &ref_root_bh); 3830 if (ret) { 3831 mlog_errno(ret); 3832 goto out; 3833 } 3834 3835 if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) 3836 goto attach_xattr; 3837 3838 ocfs2_init_dinode_extent_tree(&di_et, INODE_CACHE(inode), di_bh); 3839 3840 size = i_size_read(inode); 3841 clusters = ocfs2_clusters_for_bytes(inode->i_sb, size); 3842 3843 cpos = 0; 3844 while (cpos < clusters) { 3845 ret = ocfs2_get_clusters(inode, cpos, &p_cluster, 3846 &num_clusters, &ext_flags); 3847 3848 if (p_cluster && !(ext_flags & OCFS2_EXT_REFCOUNTED)) { 3849 ret = ocfs2_add_refcount_flag(inode, &di_et, 3850 &ref_tree->rf_ci, 3851 ref_root_bh, cpos, 3852 p_cluster, num_clusters, 3853 &dealloc, NULL); 3854 if (ret) { 3855 mlog_errno(ret); 3856 goto unlock; 3857 } 3858 3859 data_changed = 1; 3860 } 3861 cpos += num_clusters; 3862 } 3863 3864 attach_xattr: 3865 if (oi->ip_dyn_features & OCFS2_HAS_XATTR_FL) { 3866 ret = ocfs2_xattr_attach_refcount_tree(inode, di_bh, 3867 &ref_tree->rf_ci, 3868 ref_root_bh, 3869 &dealloc); 3870 if (ret) { 3871 mlog_errno(ret); 3872 goto unlock; 3873 } 3874 } 3875 3876 if (data_changed) { 3877 ret = ocfs2_change_ctime(inode, di_bh); 3878 if (ret) 3879 mlog_errno(ret); 3880 } 3881 3882 unlock: 3883 ocfs2_unlock_refcount_tree(osb, ref_tree, 1); 3884 brelse(ref_root_bh); 3885 3886 if (!ret && ocfs2_dealloc_has_cluster(&dealloc)) { 3887 ocfs2_schedule_truncate_log_flush(osb, 1); 3888 ocfs2_run_deallocs(osb, &dealloc); 3889 } 3890 out: 3891 /* 3892 * Empty the extent map so that we may get the right extent 3893 * record from the disk. 3894 */ 3895 ocfs2_extent_map_trunc(inode, 0); 3896 3897 return ret; 3898 } 3899 3900 static int ocfs2_add_refcounted_extent(struct inode *inode, 3901 struct ocfs2_extent_tree *et, 3902 struct ocfs2_caching_info *ref_ci, 3903 struct buffer_head *ref_root_bh, 3904 u32 cpos, u32 p_cluster, u32 num_clusters, 3905 unsigned int ext_flags, 3906 struct ocfs2_cached_dealloc_ctxt *dealloc) 3907 { 3908 int ret; 3909 handle_t *handle; 3910 int credits = 0; 3911 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 3912 struct ocfs2_alloc_context *meta_ac = NULL; 3913 3914 ret = ocfs2_lock_refcount_allocators(inode->i_sb, 3915 p_cluster, num_clusters, 3916 et, ref_ci, 3917 ref_root_bh, &meta_ac, 3918 NULL, &credits); 3919 if (ret) { 3920 mlog_errno(ret); 3921 goto out; 3922 } 3923 3924 handle = ocfs2_start_trans(osb, credits); 3925 if (IS_ERR(handle)) { 3926 ret = PTR_ERR(handle); 3927 mlog_errno(ret); 3928 goto out; 3929 } 3930 3931 ret = ocfs2_insert_extent(handle, et, cpos, 3932 ocfs2_clusters_to_blocks(inode->i_sb, p_cluster), 3933 num_clusters, ext_flags, meta_ac); 3934 if (ret) { 3935 mlog_errno(ret); 3936 goto out_commit; 3937 } 3938 3939 ret = ocfs2_increase_refcount(handle, ref_ci, ref_root_bh, 3940 p_cluster, num_clusters, 3941 meta_ac, dealloc); 3942 if (ret) 3943 mlog_errno(ret); 3944 3945 out_commit: 3946 ocfs2_commit_trans(osb, handle); 3947 out: 3948 if (meta_ac) 3949 ocfs2_free_alloc_context(meta_ac); 3950 return ret; 3951 } 3952 3953 static int ocfs2_duplicate_inline_data(struct inode *s_inode, 3954 struct buffer_head *s_bh, 3955 struct inode *t_inode, 3956 struct buffer_head *t_bh) 3957 { 3958 int ret; 3959 handle_t *handle; 3960 struct ocfs2_super *osb = OCFS2_SB(s_inode->i_sb); 3961 struct ocfs2_dinode *s_di = (struct ocfs2_dinode *)s_bh->b_data; 3962 struct ocfs2_dinode *t_di = (struct ocfs2_dinode *)t_bh->b_data; 3963 3964 BUG_ON(!(OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)); 3965 3966 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 3967 if (IS_ERR(handle)) { 3968 ret = PTR_ERR(handle); 3969 mlog_errno(ret); 3970 goto out; 3971 } 3972 3973 ret = ocfs2_journal_access_di(handle, INODE_CACHE(t_inode), t_bh, 3974 OCFS2_JOURNAL_ACCESS_WRITE); 3975 if (ret) { 3976 mlog_errno(ret); 3977 goto out_commit; 3978 } 3979 3980 t_di->id2.i_data.id_count = s_di->id2.i_data.id_count; 3981 memcpy(t_di->id2.i_data.id_data, s_di->id2.i_data.id_data, 3982 le16_to_cpu(s_di->id2.i_data.id_count)); 3983 spin_lock(&OCFS2_I(t_inode)->ip_lock); 3984 OCFS2_I(t_inode)->ip_dyn_features |= OCFS2_INLINE_DATA_FL; 3985 t_di->i_dyn_features = cpu_to_le16(OCFS2_I(t_inode)->ip_dyn_features); 3986 spin_unlock(&OCFS2_I(t_inode)->ip_lock); 3987 3988 ocfs2_journal_dirty(handle, t_bh); 3989 3990 out_commit: 3991 ocfs2_commit_trans(osb, handle); 3992 out: 3993 return ret; 3994 } 3995 3996 static int ocfs2_duplicate_extent_list(struct inode *s_inode, 3997 struct inode *t_inode, 3998 struct buffer_head *t_bh, 3999 struct ocfs2_caching_info *ref_ci, 4000 struct buffer_head *ref_root_bh, 4001 struct ocfs2_cached_dealloc_ctxt *dealloc) 4002 { 4003 int ret = 0; 4004 u32 p_cluster, num_clusters, clusters, cpos; 4005 loff_t size; 4006 unsigned int ext_flags; 4007 struct ocfs2_extent_tree et; 4008 4009 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(t_inode), t_bh); 4010 4011 size = i_size_read(s_inode); 4012 clusters = ocfs2_clusters_for_bytes(s_inode->i_sb, size); 4013 4014 cpos = 0; 4015 while (cpos < clusters) { 4016 ret = ocfs2_get_clusters(s_inode, cpos, &p_cluster, 4017 &num_clusters, &ext_flags); 4018 4019 if (p_cluster) { 4020 ret = ocfs2_add_refcounted_extent(t_inode, &et, 4021 ref_ci, ref_root_bh, 4022 cpos, p_cluster, 4023 num_clusters, 4024 ext_flags, 4025 dealloc); 4026 if (ret) { 4027 mlog_errno(ret); 4028 goto out; 4029 } 4030 } 4031 4032 cpos += num_clusters; 4033 } 4034 4035 out: 4036 return ret; 4037 } 4038 4039 /* 4040 * change the new file's attributes to the src. 4041 * 4042 * reflink creates a snapshot of a file, that means the attributes 4043 * must be identical except for three exceptions - nlink, ino, and ctime. 4044 */ 4045 static int ocfs2_complete_reflink(struct inode *s_inode, 4046 struct buffer_head *s_bh, 4047 struct inode *t_inode, 4048 struct buffer_head *t_bh, 4049 bool preserve) 4050 { 4051 int ret; 4052 handle_t *handle; 4053 struct ocfs2_dinode *s_di = (struct ocfs2_dinode *)s_bh->b_data; 4054 struct ocfs2_dinode *di = (struct ocfs2_dinode *)t_bh->b_data; 4055 loff_t size = i_size_read(s_inode); 4056 4057 handle = ocfs2_start_trans(OCFS2_SB(t_inode->i_sb), 4058 OCFS2_INODE_UPDATE_CREDITS); 4059 if (IS_ERR(handle)) { 4060 ret = PTR_ERR(handle); 4061 mlog_errno(ret); 4062 return ret; 4063 } 4064 4065 ret = ocfs2_journal_access_di(handle, INODE_CACHE(t_inode), t_bh, 4066 OCFS2_JOURNAL_ACCESS_WRITE); 4067 if (ret) { 4068 mlog_errno(ret); 4069 goto out_commit; 4070 } 4071 4072 spin_lock(&OCFS2_I(t_inode)->ip_lock); 4073 OCFS2_I(t_inode)->ip_clusters = OCFS2_I(s_inode)->ip_clusters; 4074 OCFS2_I(t_inode)->ip_attr = OCFS2_I(s_inode)->ip_attr; 4075 OCFS2_I(t_inode)->ip_dyn_features = OCFS2_I(s_inode)->ip_dyn_features; 4076 spin_unlock(&OCFS2_I(t_inode)->ip_lock); 4077 i_size_write(t_inode, size); 4078 4079 di->i_xattr_inline_size = s_di->i_xattr_inline_size; 4080 di->i_clusters = s_di->i_clusters; 4081 di->i_size = s_di->i_size; 4082 di->i_dyn_features = s_di->i_dyn_features; 4083 di->i_attr = s_di->i_attr; 4084 4085 if (preserve) { 4086 di->i_uid = s_di->i_uid; 4087 di->i_gid = s_di->i_gid; 4088 di->i_mode = s_di->i_mode; 4089 4090 /* 4091 * update time. 4092 * we want mtime to appear identical to the source and 4093 * update ctime. 4094 */ 4095 t_inode->i_ctime = CURRENT_TIME; 4096 4097 di->i_ctime = cpu_to_le64(t_inode->i_ctime.tv_sec); 4098 di->i_ctime_nsec = cpu_to_le32(t_inode->i_ctime.tv_nsec); 4099 4100 t_inode->i_mtime = s_inode->i_mtime; 4101 di->i_mtime = s_di->i_mtime; 4102 di->i_mtime_nsec = s_di->i_mtime_nsec; 4103 } 4104 4105 ocfs2_journal_dirty(handle, t_bh); 4106 4107 out_commit: 4108 ocfs2_commit_trans(OCFS2_SB(t_inode->i_sb), handle); 4109 return ret; 4110 } 4111 4112 static int ocfs2_create_reflink_node(struct inode *s_inode, 4113 struct buffer_head *s_bh, 4114 struct inode *t_inode, 4115 struct buffer_head *t_bh, 4116 bool preserve) 4117 { 4118 int ret; 4119 struct buffer_head *ref_root_bh = NULL; 4120 struct ocfs2_cached_dealloc_ctxt dealloc; 4121 struct ocfs2_super *osb = OCFS2_SB(s_inode->i_sb); 4122 struct ocfs2_refcount_block *rb; 4123 struct ocfs2_dinode *di = (struct ocfs2_dinode *)s_bh->b_data; 4124 struct ocfs2_refcount_tree *ref_tree; 4125 4126 ocfs2_init_dealloc_ctxt(&dealloc); 4127 4128 ret = ocfs2_set_refcount_tree(t_inode, t_bh, 4129 le64_to_cpu(di->i_refcount_loc)); 4130 if (ret) { 4131 mlog_errno(ret); 4132 goto out; 4133 } 4134 4135 if (OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 4136 ret = ocfs2_duplicate_inline_data(s_inode, s_bh, 4137 t_inode, t_bh); 4138 if (ret) 4139 mlog_errno(ret); 4140 goto out; 4141 } 4142 4143 ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc), 4144 1, &ref_tree, &ref_root_bh); 4145 if (ret) { 4146 mlog_errno(ret); 4147 goto out; 4148 } 4149 rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; 4150 4151 ret = ocfs2_duplicate_extent_list(s_inode, t_inode, t_bh, 4152 &ref_tree->rf_ci, ref_root_bh, 4153 &dealloc); 4154 if (ret) { 4155 mlog_errno(ret); 4156 goto out_unlock_refcount; 4157 } 4158 4159 out_unlock_refcount: 4160 ocfs2_unlock_refcount_tree(osb, ref_tree, 1); 4161 brelse(ref_root_bh); 4162 out: 4163 if (ocfs2_dealloc_has_cluster(&dealloc)) { 4164 ocfs2_schedule_truncate_log_flush(osb, 1); 4165 ocfs2_run_deallocs(osb, &dealloc); 4166 } 4167 4168 return ret; 4169 } 4170 4171 static int __ocfs2_reflink(struct dentry *old_dentry, 4172 struct buffer_head *old_bh, 4173 struct inode *new_inode, 4174 bool preserve) 4175 { 4176 int ret; 4177 struct inode *inode = old_dentry->d_inode; 4178 struct buffer_head *new_bh = NULL; 4179 4180 ret = filemap_fdatawrite(inode->i_mapping); 4181 if (ret) { 4182 mlog_errno(ret); 4183 goto out; 4184 } 4185 4186 ret = ocfs2_attach_refcount_tree(inode, old_bh); 4187 if (ret) { 4188 mlog_errno(ret); 4189 goto out; 4190 } 4191 4192 mutex_lock(&new_inode->i_mutex); 4193 ret = ocfs2_inode_lock(new_inode, &new_bh, 1); 4194 if (ret) { 4195 mlog_errno(ret); 4196 goto out_unlock; 4197 } 4198 4199 ret = ocfs2_create_reflink_node(inode, old_bh, 4200 new_inode, new_bh, preserve); 4201 if (ret) { 4202 mlog_errno(ret); 4203 goto inode_unlock; 4204 } 4205 4206 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_XATTR_FL) { 4207 ret = ocfs2_reflink_xattrs(inode, old_bh, 4208 new_inode, new_bh, 4209 preserve); 4210 if (ret) { 4211 mlog_errno(ret); 4212 goto inode_unlock; 4213 } 4214 } 4215 4216 ret = ocfs2_complete_reflink(inode, old_bh, 4217 new_inode, new_bh, preserve); 4218 if (ret) 4219 mlog_errno(ret); 4220 4221 inode_unlock: 4222 ocfs2_inode_unlock(new_inode, 1); 4223 brelse(new_bh); 4224 out_unlock: 4225 mutex_unlock(&new_inode->i_mutex); 4226 out: 4227 if (!ret) { 4228 ret = filemap_fdatawait(inode->i_mapping); 4229 if (ret) 4230 mlog_errno(ret); 4231 } 4232 return ret; 4233 } 4234 4235 static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, 4236 struct dentry *new_dentry, bool preserve) 4237 { 4238 int error; 4239 struct inode *inode = old_dentry->d_inode; 4240 struct buffer_head *old_bh = NULL; 4241 struct inode *new_orphan_inode = NULL; 4242 4243 if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) 4244 return -EOPNOTSUPP; 4245 4246 error = ocfs2_create_inode_in_orphan(dir, inode->i_mode, 4247 &new_orphan_inode); 4248 if (error) { 4249 mlog_errno(error); 4250 goto out; 4251 } 4252 4253 error = ocfs2_inode_lock(inode, &old_bh, 1); 4254 if (error) { 4255 mlog_errno(error); 4256 goto out; 4257 } 4258 4259 down_write(&OCFS2_I(inode)->ip_xattr_sem); 4260 down_write(&OCFS2_I(inode)->ip_alloc_sem); 4261 error = __ocfs2_reflink(old_dentry, old_bh, 4262 new_orphan_inode, preserve); 4263 up_write(&OCFS2_I(inode)->ip_alloc_sem); 4264 up_write(&OCFS2_I(inode)->ip_xattr_sem); 4265 4266 ocfs2_inode_unlock(inode, 1); 4267 brelse(old_bh); 4268 4269 if (error) { 4270 mlog_errno(error); 4271 goto out; 4272 } 4273 4274 /* If the security isn't preserved, we need to re-initialize them. */ 4275 if (!preserve) { 4276 error = ocfs2_init_security_and_acl(dir, new_orphan_inode); 4277 if (error) 4278 mlog_errno(error); 4279 } 4280 out: 4281 if (!error) { 4282 error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode, 4283 new_dentry); 4284 if (error) 4285 mlog_errno(error); 4286 } 4287 4288 if (new_orphan_inode) { 4289 /* 4290 * We need to open_unlock the inode no matter whether we 4291 * succeed or not, so that other nodes can delete it later. 4292 */ 4293 ocfs2_open_unlock(new_orphan_inode); 4294 if (error) 4295 iput(new_orphan_inode); 4296 } 4297 4298 return error; 4299 } 4300 4301 /* 4302 * Below here are the bits used by OCFS2_IOC_REFLINK() to fake 4303 * sys_reflink(). This will go away when vfs_reflink() exists in 4304 * fs/namei.c. 4305 */ 4306 4307 /* copied from may_create in VFS. */ 4308 static inline int ocfs2_may_create(struct inode *dir, struct dentry *child) 4309 { 4310 if (child->d_inode) 4311 return -EEXIST; 4312 if (IS_DEADDIR(dir)) 4313 return -ENOENT; 4314 return inode_permission(dir, MAY_WRITE | MAY_EXEC); 4315 } 4316 4317 /* copied from user_path_parent. */ 4318 static int ocfs2_user_path_parent(const char __user *path, 4319 struct nameidata *nd, char **name) 4320 { 4321 char *s = getname(path); 4322 int error; 4323 4324 if (IS_ERR(s)) 4325 return PTR_ERR(s); 4326 4327 error = path_lookup(s, LOOKUP_PARENT, nd); 4328 if (error) 4329 putname(s); 4330 else 4331 *name = s; 4332 4333 return error; 4334 } 4335 4336 /** 4337 * ocfs2_vfs_reflink - Create a reference-counted link 4338 * 4339 * @old_dentry: source dentry + inode 4340 * @dir: directory to create the target 4341 * @new_dentry: target dentry 4342 * @preserve: if true, preserve all file attributes 4343 */ 4344 static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir, 4345 struct dentry *new_dentry, bool preserve) 4346 { 4347 struct inode *inode = old_dentry->d_inode; 4348 int error; 4349 4350 if (!inode) 4351 return -ENOENT; 4352 4353 error = ocfs2_may_create(dir, new_dentry); 4354 if (error) 4355 return error; 4356 4357 if (dir->i_sb != inode->i_sb) 4358 return -EXDEV; 4359 4360 /* 4361 * A reflink to an append-only or immutable file cannot be created. 4362 */ 4363 if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) 4364 return -EPERM; 4365 4366 /* Only regular files can be reflinked. */ 4367 if (!S_ISREG(inode->i_mode)) 4368 return -EPERM; 4369 4370 /* 4371 * If the caller wants to preserve ownership, they require the 4372 * rights to do so. 4373 */ 4374 if (preserve) { 4375 if ((current_fsuid() != inode->i_uid) && !capable(CAP_CHOWN)) 4376 return -EPERM; 4377 if (!in_group_p(inode->i_gid) && !capable(CAP_CHOWN)) 4378 return -EPERM; 4379 } 4380 4381 /* 4382 * If the caller is modifying any aspect of the attributes, they 4383 * are not creating a snapshot. They need read permission on the 4384 * file. 4385 */ 4386 if (!preserve) { 4387 error = inode_permission(inode, MAY_READ); 4388 if (error) 4389 return error; 4390 } 4391 4392 mutex_lock(&inode->i_mutex); 4393 vfs_dq_init(dir); 4394 error = ocfs2_reflink(old_dentry, dir, new_dentry, preserve); 4395 mutex_unlock(&inode->i_mutex); 4396 if (!error) 4397 fsnotify_create(dir, new_dentry); 4398 return error; 4399 } 4400 /* 4401 * Most codes are copied from sys_linkat. 4402 */ 4403 int ocfs2_reflink_ioctl(struct inode *inode, 4404 const char __user *oldname, 4405 const char __user *newname, 4406 bool preserve) 4407 { 4408 struct dentry *new_dentry; 4409 struct nameidata nd; 4410 struct path old_path; 4411 int error; 4412 char *to = NULL; 4413 4414 if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) 4415 return -EOPNOTSUPP; 4416 4417 error = user_path_at(AT_FDCWD, oldname, 0, &old_path); 4418 if (error) { 4419 mlog_errno(error); 4420 return error; 4421 } 4422 4423 error = ocfs2_user_path_parent(newname, &nd, &to); 4424 if (error) { 4425 mlog_errno(error); 4426 goto out; 4427 } 4428 4429 error = -EXDEV; 4430 if (old_path.mnt != nd.path.mnt) 4431 goto out_release; 4432 new_dentry = lookup_create(&nd, 0); 4433 error = PTR_ERR(new_dentry); 4434 if (IS_ERR(new_dentry)) { 4435 mlog_errno(error); 4436 goto out_unlock; 4437 } 4438 4439 error = mnt_want_write(nd.path.mnt); 4440 if (error) { 4441 mlog_errno(error); 4442 goto out_dput; 4443 } 4444 4445 error = ocfs2_vfs_reflink(old_path.dentry, 4446 nd.path.dentry->d_inode, 4447 new_dentry, preserve); 4448 mnt_drop_write(nd.path.mnt); 4449 out_dput: 4450 dput(new_dentry); 4451 out_unlock: 4452 mutex_unlock(&nd.path.dentry->d_inode->i_mutex); 4453 out_release: 4454 path_put(&nd.path); 4455 putname(to); 4456 out: 4457 path_put(&old_path); 4458 4459 return error; 4460 } 4461