/*
 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
 * Written by Alex Tomas <alex@clusterfs.com>
 *
 * Architecture independence:
 *   Copyright (c) 2005, Bull S.A.
 *   Written by Pierre Peiffer <pierre.peiffer@bull.net>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
 */

/*
 * Extents support for EXT4
 *
 * TODO:
 *   - ext4*_error() should be used in some situations
 *   - analyze all BUG()/BUG_ON(), use -EIO where appropriate
 *   - smart tree reduction
 */

#include <linux/module.h>
#include <linux/fs.h>
#include <linux/time.h>
#include <linux/jbd2.h>
#include <linux/highuid.h>
#include <linux/pagemap.h>
#include <linux/quotaops.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/falloc.h>
#include <asm/uaccess.h>
#include <linux/fiemap.h>
#include "ext4_jbd2.h"
#include "ext4_extents.h"


/*
 * ext_pblock:
 * combine low and high parts of physical block number into ext4_fsblk_t
 */
ext4_fsblk_t ext_pblock(struct ext4_extent *ex)
{
	ext4_fsblk_t block;

	block = le32_to_cpu(ex->ee_start_lo);
	block |= ((ext4_fsblk_t) le16_to_cpu(ex->ee_start_hi) << 31) << 1;
	return block;
}

/*
 * idx_pblock:
 * combine low and high parts of a leaf physical block number into ext4_fsblk_t
 */
ext4_fsblk_t idx_pblock(struct ext4_extent_idx *ix)
{
	ext4_fsblk_t block;

	block = le32_to_cpu(ix->ei_leaf_lo);
	block |= ((ext4_fsblk_t) le16_to_cpu(ix->ei_leaf_hi) << 31) << 1;
	return block;
}

/*
 * ext4_ext_store_pblock:
 * stores a large physical block number into an extent struct,
 * breaking it into parts
 */
void ext4_ext_store_pblock(struct ext4_extent *ex, ext4_fsblk_t pb)
{
	ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
	ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
}

/*
 * ext4_idx_store_pblock:
 * stores a large physical block number into an index struct,
 * breaking it into parts
 */
static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb)
{
	ix->ei_leaf_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
	ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
}

static int ext4_ext_truncate_extend_restart(handle_t *handle,
					    struct inode *inode,
					    int needed)
{
	int err;

	if (!ext4_handle_valid(handle))
		return 0;
	if (handle->h_buffer_credits > needed)
		return 0;
	err = ext4_journal_extend(handle, needed);
	if (err <= 0)
		return err;
	err = ext4_truncate_restart_trans(handle, inode, needed);
	if (err == 0)
		err = -EAGAIN;

	return err;
}

/*
 * could return:
 *  - EROFS
 *  - ENOMEM
 */
static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
				struct ext4_ext_path *path)
{
	if (path->p_bh) {
		/* path points to block */
		return ext4_journal_get_write_access(handle, path->p_bh);
	}
	/* path points to leaf/index in inode body */
	/* we use in-core data, no need to protect them */
	return 0;
}

/*
 * could return:
 *  - EROFS
 *  - ENOMEM
 *  - EIO
 */
static int ext4_ext_dirty(handle_t *handle, struct inode *inode,
				struct ext4_ext_path *path)
{
	int err;
	if (path->p_bh) {
		/* path points to block */
		err = ext4_handle_dirty_metadata(handle, inode, path->p_bh);
	} else {
		/* path points to leaf/index in inode body */
		err = ext4_mark_inode_dirty(handle, inode);
	}
	return err;
}

static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
			      struct ext4_ext_path *path,
			      ext4_lblk_t block)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	ext4_fsblk_t bg_start;
	ext4_fsblk_t last_block;
	ext4_grpblk_t colour;
	ext4_group_t block_group;
	int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
	int depth;

	if (path) {
		struct ext4_extent *ex;
		depth = path->p_depth;

		/* try to predict block placement */
		ex = path[depth].p_ext;
		if (ex)
			return ext_pblock(ex) + (block - le32_to_cpu(ex->ee_block));

		/* it looks like index is empty;
		 * try to find starting block from index itself */
		if (path[depth].p_bh)
			return path[depth].p_bh->b_blocknr;
	}

	/* OK. use inode's group */
	block_group = ei->i_block_group;
	if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
		/*
		 * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME
		 * block groups per flexgroup, reserve the first block
		 * group for directories and special files.  Regular
		 * files will start at the second block group.  This
		 * tends to speed up directory access and improves
		 * fsck times.
		 */
		block_group &= ~(flex_size - 1);
		if (S_ISREG(inode->i_mode))
			block_group++;
	}
	bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
	last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;

	/*
	 * If we are doing delayed allocation, we don't need to take
	 * colour into account.
	 */
	if (test_opt(inode->i_sb, DELALLOC))
		return bg_start;

	if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
		colour = (current->pid % 16) *
			(EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
	else
		colour = (current->pid % 16) * ((last_block - bg_start) / 16);
	return bg_start + colour + block;
}

/*
 * Allocation for a meta data block
 */
static ext4_fsblk_t
ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
			struct ext4_ext_path *path,
			struct ext4_extent *ex, int *err)
{
	ext4_fsblk_t goal, newblock;

	goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
	newblock = ext4_new_meta_blocks(handle, inode, goal, NULL, err);
	return newblock;
}

static inline int ext4_ext_space_block(struct inode *inode, int check)
{
	int size;

	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
			/ sizeof(struct ext4_extent);
	if (!check) {
#ifdef AGGRESSIVE_TEST
		if (size > 6)
			size = 6;
#endif
	}
	return size;
}

static inline int ext4_ext_space_block_idx(struct inode *inode, int check)
{
	int size;

	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
			/ sizeof(struct ext4_extent_idx);
	if (!check) {
#ifdef AGGRESSIVE_TEST
		if (size > 5)
			size = 5;
#endif
	}
	return size;
}

static inline int ext4_ext_space_root(struct inode *inode, int check)
{
	int size;

	size = sizeof(EXT4_I(inode)->i_data);
	size -= sizeof(struct ext4_extent_header);
	size /= sizeof(struct ext4_extent);
	if (!check) {
#ifdef AGGRESSIVE_TEST
		if (size > 3)
			size = 3;
#endif
	}
	return size;
}

static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
{
	int size;

	size = sizeof(EXT4_I(inode)->i_data);
	size -= sizeof(struct ext4_extent_header);
	size /= sizeof(struct ext4_extent_idx);
	if (!check) {
#ifdef AGGRESSIVE_TEST
		if (size > 4)
			size = 4;
#endif
	}
	return size;
}

/*
 * Calculate the number of metadata blocks needed
 * to allocate @blocks
 * Worst case is one block per extent
 */
int ext4_ext_calc_metadata_amount(struct inode *inode, sector_t lblock)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	int idxs, num = 0;

	idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
		/ sizeof(struct ext4_extent_idx));

	/*
	 * If the new delayed allocation block is contiguous with the
	 * previous da block, it can share index blocks with the
	 * previous block, so we only need to allocate a new index
	 * block every idxs leaf blocks.  At idxs**2 blocks, we need
	 * an additional index block, and at idxs**3 blocks, yet
	 * another index block.
	 */
	if (ei->i_da_metadata_calc_len &&
	    ei->i_da_metadata_calc_last_lblock+1 == lblock) {
		if ((ei->i_da_metadata_calc_len % idxs) == 0)
			num++;
		if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0)
			num++;
		if ((ei->i_da_metadata_calc_len % (idxs*idxs*idxs)) == 0) {
			num++;
			ei->i_da_metadata_calc_len = 0;
		} else
			ei->i_da_metadata_calc_len++;
		ei->i_da_metadata_calc_last_lblock++;
		return num;
	}

	/*
	 * In the worst case we need a new set of index blocks at
	 * every level of the inode's extent tree.
	 */
	ei->i_da_metadata_calc_len = 1;
	ei->i_da_metadata_calc_last_lblock = lblock;
	return ext_depth(inode) + 1;
}

static int
ext4_ext_max_entries(struct inode *inode, int depth)
{
	int max;

	if (depth == ext_depth(inode)) {
		if (depth == 0)
			max = ext4_ext_space_root(inode, 1);
		else
			max = ext4_ext_space_root_idx(inode, 1);
	} else {
		if (depth == 0)
			max = ext4_ext_space_block(inode, 1);
		else
			max = ext4_ext_space_block_idx(inode, 1);
	}

	return max;
}

static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
{
	ext4_fsblk_t block = ext_pblock(ext);
	int len = ext4_ext_get_actual_len(ext);

	return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
}

static int ext4_valid_extent_idx(struct inode *inode,
				struct ext4_extent_idx *ext_idx)
{
	ext4_fsblk_t block = idx_pblock(ext_idx);

	return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1);
}

static int ext4_valid_extent_entries(struct inode *inode,
				struct ext4_extent_header *eh,
				int depth)
{
	struct ext4_extent *ext;
	struct ext4_extent_idx *ext_idx;
	unsigned short entries;
	if (eh->eh_entries == 0)
		return 1;

	entries = le16_to_cpu(eh->eh_entries);

	if (depth == 0) {
		/* leaf entries */
		ext = EXT_FIRST_EXTENT(eh);
		while (entries) {
			if (!ext4_valid_extent(inode, ext))
				return 0;
			ext++;
			entries--;
		}
	} else {
		ext_idx = EXT_FIRST_INDEX(eh);
		while (entries) {
			if (!ext4_valid_extent_idx(inode, ext_idx))
				return 0;
			ext_idx++;
			entries--;
		}
	}
	return 1;
}

static int __ext4_ext_check(const char *function, struct inode *inode,
					struct ext4_extent_header *eh,
					int depth)
{
	const char *error_msg;
	int max = 0;

	if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) {
		error_msg = "invalid magic";
		goto corrupted;
	}
	if (unlikely(le16_to_cpu(eh->eh_depth) != depth)) {
		error_msg = "unexpected eh_depth";
		goto corrupted;
	}
	if (unlikely(eh->eh_max == 0)) {
		error_msg = "invalid eh_max";
		goto corrupted;
	}
	max = ext4_ext_max_entries(inode, depth);
	if (unlikely(le16_to_cpu(eh->eh_max) > max)) {
		error_msg = "too large eh_max";
		goto corrupted;
	}
	if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) {
		error_msg = "invalid eh_entries";
		goto corrupted;
	}
	if (!ext4_valid_extent_entries(inode, eh, depth)) {
		error_msg = "invalid extent entries";
		goto corrupted;
	}
	return 0;

corrupted:
	ext4_error_inode(function, inode,
			"bad header/extent: %s - magic %x, "
			"entries %u, max %u(%u), depth %u(%u)",
			error_msg, le16_to_cpu(eh->eh_magic),
			le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
			max, le16_to_cpu(eh->eh_depth), depth);

	return -EIO;
}

#define ext4_ext_check(inode, eh, depth)	\
	__ext4_ext_check(__func__, inode, eh, depth)

int ext4_ext_check_inode(struct inode *inode)
{
	return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode));
}

#ifdef EXT_DEBUG
static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
{
	int k, l = path->p_depth;

	ext_debug("path:");
	for (k = 0; k <= l; k++, path++) {
		if (path->p_idx) {
			ext_debug(" %d->%llu", le32_to_cpu(path->p_idx->ei_block),
				  idx_pblock(path->p_idx));
		} else if (path->p_ext) {
			ext_debug(" %d:[%d]%d:%llu ",
				  le32_to_cpu(path->p_ext->ee_block),
				  ext4_ext_is_uninitialized(path->p_ext),
				  ext4_ext_get_actual_len(path->p_ext),
				  ext_pblock(path->p_ext));
		} else
			ext_debug(" []");
	}
	ext_debug("\n");
}

static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
{
	int depth = ext_depth(inode);
	struct ext4_extent_header *eh;
	struct ext4_extent *ex;
	int i;

	if (!path)
		return;

	eh = path[depth].p_hdr;
	ex = EXT_FIRST_EXTENT(eh);

	ext_debug("Displaying leaf extents for inode %lu\n", inode->i_ino);

	for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
		ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
			  ext4_ext_is_uninitialized(ex),
			  ext4_ext_get_actual_len(ex), ext_pblock(ex));
	}
	ext_debug("\n");
}
#else
#define ext4_ext_show_path(inode, path)
#define ext4_ext_show_leaf(inode, path)
#endif

void ext4_ext_drop_refs(struct ext4_ext_path *path)
{
	int depth = path->p_depth;
	int i;

	for (i = 0; i <= depth; i++, path++)
		if (path->p_bh) {
			brelse(path->p_bh);
			path->p_bh = NULL;
		}
}

/*
 * ext4_ext_binsearch_idx:
 * binary search for the closest index of the given block
 * the header must be checked before calling this
 */
static void
ext4_ext_binsearch_idx(struct inode *inode,
			struct ext4_ext_path *path, ext4_lblk_t block)
{
	struct ext4_extent_header *eh = path->p_hdr;
	struct ext4_extent_idx *r, *l, *m;


	ext_debug("binsearch for %u(idx): ", block);

	l = EXT_FIRST_INDEX(eh) + 1;
	r = EXT_LAST_INDEX(eh);
	while (l <= r) {
		m = l + (r - l) / 2;
		if (block < le32_to_cpu(m->ei_block))
			r = m - 1;
		else
			l = m + 1;
		ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ei_block),
				m, le32_to_cpu(m->ei_block),
				r, le32_to_cpu(r->ei_block));
	}

	path->p_idx = l - 1;
	ext_debug(" -> %d->%lld ", le32_to_cpu(path->p_idx->ei_block),
		  idx_pblock(path->p_idx));

#ifdef CHECK_BINSEARCH
	{
		struct ext4_extent_idx *chix, *ix;
		int k;

		chix = ix = EXT_FIRST_INDEX(eh);
		for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) {
			if (k != 0 &&
			    le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) {
				printk(KERN_DEBUG "k=%d, ix=0x%p, "
				       "first=0x%p\n", k,
				       ix, EXT_FIRST_INDEX(eh));
				printk(KERN_DEBUG "%u <= %u\n",
				       le32_to_cpu(ix->ei_block),
				       le32_to_cpu(ix[-1].ei_block));
			}
			BUG_ON(k && le32_to_cpu(ix->ei_block)
					   <= le32_to_cpu(ix[-1].ei_block));
			if (block < le32_to_cpu(ix->ei_block))
				break;
			chix = ix;
		}
		BUG_ON(chix != path->p_idx);
	}
#endif

}

/*
 * ext4_ext_binsearch:
 * binary search for closest extent of the given block
 * the header must be checked before calling this
 */
static void
ext4_ext_binsearch(struct inode *inode,
		struct ext4_ext_path *path, ext4_lblk_t block)
{
	struct ext4_extent_header *eh = path->p_hdr;
	struct ext4_extent *r, *l, *m;

	if (eh->eh_entries == 0) {
		/*
		 * this leaf is empty:
		 * we get such a leaf in split/add case
		 */
		return;
	}

	ext_debug("binsearch for %u: ", block);

	l = EXT_FIRST_EXTENT(eh) + 1;
	r = EXT_LAST_EXTENT(eh);

	while (l <= r) {
		m = l + (r - l) / 2;
		if (block < le32_to_cpu(m->ee_block))
			r = m - 1;
		else
			l = m + 1;
ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ee_block), 610 m, le32_to_cpu(m->ee_block), 611 r, le32_to_cpu(r->ee_block)); 612 } 613 614 path->p_ext = l - 1; 615 ext_debug(" -> %d:%llu:[%d]%d ", 616 le32_to_cpu(path->p_ext->ee_block), 617 ext_pblock(path->p_ext), 618 ext4_ext_is_uninitialized(path->p_ext), 619 ext4_ext_get_actual_len(path->p_ext)); 620 621 #ifdef CHECK_BINSEARCH 622 { 623 struct ext4_extent *chex, *ex; 624 int k; 625 626 chex = ex = EXT_FIRST_EXTENT(eh); 627 for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) { 628 BUG_ON(k && le32_to_cpu(ex->ee_block) 629 <= le32_to_cpu(ex[-1].ee_block)); 630 if (block < le32_to_cpu(ex->ee_block)) 631 break; 632 chex = ex; 633 } 634 BUG_ON(chex != path->p_ext); 635 } 636 #endif 637 638 } 639 640 int ext4_ext_tree_init(handle_t *handle, struct inode *inode) 641 { 642 struct ext4_extent_header *eh; 643 644 eh = ext_inode_hdr(inode); 645 eh->eh_depth = 0; 646 eh->eh_entries = 0; 647 eh->eh_magic = EXT4_EXT_MAGIC; 648 eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0)); 649 ext4_mark_inode_dirty(handle, inode); 650 ext4_ext_invalidate_cache(inode); 651 return 0; 652 } 653 654 struct ext4_ext_path * 655 ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, 656 struct ext4_ext_path *path) 657 { 658 struct ext4_extent_header *eh; 659 struct buffer_head *bh; 660 short int depth, i, ppos = 0, alloc = 0; 661 662 eh = ext_inode_hdr(inode); 663 depth = ext_depth(inode); 664 665 /* account possible depth increase */ 666 if (!path) { 667 path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 2), 668 GFP_NOFS); 669 if (!path) 670 return ERR_PTR(-ENOMEM); 671 alloc = 1; 672 } 673 path[0].p_hdr = eh; 674 path[0].p_bh = NULL; 675 676 i = depth; 677 /* walk through the tree */ 678 while (i) { 679 int need_to_validate = 0; 680 681 ext_debug("depth %d: num %d, max %d\n", 682 ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); 683 684 ext4_ext_binsearch_idx(inode, path + ppos, block); 685 path[ppos].p_block = idx_pblock(path[ppos].p_idx); 686 path[ppos].p_depth = i; 687 path[ppos].p_ext = NULL; 688 689 bh = sb_getblk(inode->i_sb, path[ppos].p_block); 690 if (unlikely(!bh)) 691 goto err; 692 if (!bh_uptodate_or_lock(bh)) { 693 if (bh_submit_read(bh) < 0) { 694 put_bh(bh); 695 goto err; 696 } 697 /* validate the extent entries */ 698 need_to_validate = 1; 699 } 700 eh = ext_block_hdr(bh); 701 ppos++; 702 if (unlikely(ppos > depth)) { 703 put_bh(bh); 704 EXT4_ERROR_INODE(inode, 705 "ppos %d > depth %d", ppos, depth); 706 goto err; 707 } 708 path[ppos].p_bh = bh; 709 path[ppos].p_hdr = eh; 710 i--; 711 712 if (need_to_validate && ext4_ext_check(inode, eh, i)) 713 goto err; 714 } 715 716 path[ppos].p_depth = i; 717 path[ppos].p_ext = NULL; 718 path[ppos].p_idx = NULL; 719 720 /* find extent */ 721 ext4_ext_binsearch(inode, path + ppos, block); 722 /* if not an empty leaf */ 723 if (path[ppos].p_ext) 724 path[ppos].p_block = ext_pblock(path[ppos].p_ext); 725 726 ext4_ext_show_path(inode, path); 727 728 return path; 729 730 err: 731 ext4_ext_drop_refs(path); 732 if (alloc) 733 kfree(path); 734 return ERR_PTR(-EIO); 735 } 736 737 /* 738 * ext4_ext_insert_index: 739 * insert new index [@logical;@ptr] into the block at @curp; 740 * check where to insert: before @curp or after @curp 741 */ 742 int ext4_ext_insert_index(handle_t *handle, struct inode *inode, 743 struct ext4_ext_path *curp, 744 int logical, ext4_fsblk_t ptr) 745 { 746 struct ext4_extent_idx *ix; 747 int len, err; 748 749 err = ext4_ext_get_access(handle, inode, curp); 750 if 
	if (err)
		return err;

	if (unlikely(logical == le32_to_cpu(curp->p_idx->ei_block))) {
		EXT4_ERROR_INODE(inode,
				 "logical %d == ei_block %d!",
				 logical, le32_to_cpu(curp->p_idx->ei_block));
		return -EIO;
	}
	len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx;
	if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
		/* insert after */
		if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) {
			len = (len - 1) * sizeof(struct ext4_extent_idx);
			len = len < 0 ? 0 : len;
			ext_debug("insert new index %d after: %llu. "
					"move %d from 0x%p to 0x%p\n",
					logical, ptr, len,
					(curp->p_idx + 1), (curp->p_idx + 2));
			memmove(curp->p_idx + 2, curp->p_idx + 1, len);
		}
		ix = curp->p_idx + 1;
	} else {
		/* insert before */
		len = len * sizeof(struct ext4_extent_idx);
		len = len < 0 ? 0 : len;
		ext_debug("insert new index %d before: %llu. "
				"move %d from 0x%p to 0x%p\n",
				logical, ptr, len,
				curp->p_idx, (curp->p_idx + 1));
		memmove(curp->p_idx + 1, curp->p_idx, len);
		ix = curp->p_idx;
	}

	ix->ei_block = cpu_to_le32(logical);
	ext4_idx_store_pblock(ix, ptr);
	le16_add_cpu(&curp->p_hdr->eh_entries, 1);

	if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries)
			     > le16_to_cpu(curp->p_hdr->eh_max))) {
		EXT4_ERROR_INODE(inode,
				 "eh_entries %d > eh_max %d!",
				 le16_to_cpu(curp->p_hdr->eh_entries),
				 le16_to_cpu(curp->p_hdr->eh_max));
		return -EIO;
	}
	if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) {
		EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!");
		return -EIO;
	}

	err = ext4_ext_dirty(handle, inode, curp);
	ext4_std_error(inode->i_sb, err);

	return err;
}

/*
 * ext4_ext_split:
 * inserts new subtree into the path, using free index entry
 * at depth @at:
 * - allocates all needed blocks (new leaf and all intermediate index blocks)
 * - makes decision where to split
 * - moves remaining extents and index entries (right to the split point)
 *   into the newly allocated blocks
 * - initializes subtree
 */
static int ext4_ext_split(handle_t *handle, struct inode *inode,
				struct ext4_ext_path *path,
				struct ext4_extent *newext, int at)
{
	struct buffer_head *bh = NULL;
	int depth = ext_depth(inode);
	struct ext4_extent_header *neh;
	struct ext4_extent_idx *fidx;
	struct ext4_extent *ex;
	int i = at, k, m, a;
	ext4_fsblk_t newblock, oldblock;
	__le32 border;
	ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */
	int err = 0;

	/* make decision: where to split? */
	/* FIXME: now decision is simplest: at current extent */

	/* if current leaf will be split, then we should use
	 * border from split point */
	if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) {
		EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!");
		return -EIO;
	}
	if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
		border = path[depth].p_ext[1].ee_block;
		ext_debug("leaf will be split."
				" next leaf starts at %d\n",
				  le32_to_cpu(border));
	} else {
		border = newext->ee_block;
		ext_debug("leaf will be added."
				" next leaf starts at %d\n",
				le32_to_cpu(border));
	}

	/*
	 * If an error occurs, we break processing
	 * and mark the filesystem read-only.  The index won't
	 * be inserted and the tree will be in a consistent
	 * state.  The next mount will repair buffers too.
	 */

	/*
	 * Get an array to track all allocated blocks.
	 * We need this to handle errors and free blocks
	 * on error.
	 */
	ablocks = kzalloc(sizeof(ext4_fsblk_t) * depth, GFP_NOFS);
	if (!ablocks)
		return -ENOMEM;

	/* allocate all needed blocks */
	ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
	for (a = 0; a < depth - at; a++) {
		newblock = ext4_ext_new_meta_block(handle, inode, path,
						   newext, &err);
		if (newblock == 0)
			goto cleanup;
		ablocks[a] = newblock;
	}

	/* initialize new leaf */
	newblock = ablocks[--a];
	if (unlikely(newblock == 0)) {
		EXT4_ERROR_INODE(inode, "newblock == 0!");
		err = -EIO;
		goto cleanup;
	}
	bh = sb_getblk(inode->i_sb, newblock);
	if (!bh) {
		err = -EIO;
		goto cleanup;
	}
	lock_buffer(bh);

	err = ext4_journal_get_create_access(handle, bh);
	if (err)
		goto cleanup;

	neh = ext_block_hdr(bh);
	neh->eh_entries = 0;
	neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
	neh->eh_magic = EXT4_EXT_MAGIC;
	neh->eh_depth = 0;
	ex = EXT_FIRST_EXTENT(neh);

	/* move remainder of path[depth] to the new leaf */
	if (unlikely(path[depth].p_hdr->eh_entries !=
		     path[depth].p_hdr->eh_max)) {
		EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!",
				 path[depth].p_hdr->eh_entries,
				 path[depth].p_hdr->eh_max);
		err = -EIO;
		goto cleanup;
	}
	/* start copy from next extent */
	/* TODO: we could do it by single memmove */
	m = 0;
	path[depth].p_ext++;
	while (path[depth].p_ext <=
			EXT_MAX_EXTENT(path[depth].p_hdr)) {
		ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
				le32_to_cpu(path[depth].p_ext->ee_block),
				ext_pblock(path[depth].p_ext),
				ext4_ext_is_uninitialized(path[depth].p_ext),
				ext4_ext_get_actual_len(path[depth].p_ext),
				newblock);
		/*memmove(ex++, path[depth].p_ext++,
				sizeof(struct ext4_extent));
		neh->eh_entries++;*/
		path[depth].p_ext++;
		m++;
	}
	if (m) {
		memmove(ex, path[depth].p_ext-m, sizeof(struct ext4_extent)*m);
		le16_add_cpu(&neh->eh_entries, m);
	}

	set_buffer_uptodate(bh);
	unlock_buffer(bh);

	err = ext4_handle_dirty_metadata(handle, inode, bh);
	if (err)
		goto cleanup;
	brelse(bh);
	bh = NULL;

	/* correct old leaf */
	if (m) {
		err = ext4_ext_get_access(handle, inode, path + depth);
		if (err)
			goto cleanup;
		le16_add_cpu(&path[depth].p_hdr->eh_entries, -m);
		err = ext4_ext_dirty(handle, inode, path + depth);
		if (err)
			goto cleanup;

	}

	/* create intermediate indexes */
	k = depth - at - 1;
	if (unlikely(k < 0)) {
		EXT4_ERROR_INODE(inode, "k %d < 0!", k);
		err = -EIO;
		goto cleanup;
	}
	if (k)
		ext_debug("create %d intermediate indices\n", k);
	/* insert new index into current index block */
	/* current depth stored in i var */
	i = depth - 1;
	while (k--) {
		oldblock = newblock;
		newblock = ablocks[--a];
		bh = sb_getblk(inode->i_sb, newblock);
		if (!bh) {
			err = -EIO;
			goto cleanup;
		}
		lock_buffer(bh);

		err = ext4_journal_get_create_access(handle, bh);
		if (err)
			goto cleanup;

		neh = ext_block_hdr(bh);
		neh->eh_entries = cpu_to_le16(1);
		neh->eh_magic = EXT4_EXT_MAGIC;
		neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
		neh->eh_depth = cpu_to_le16(depth - i);
		fidx = EXT_FIRST_INDEX(neh);
		fidx->ei_block = border;
		ext4_idx_store_pblock(fidx, oldblock);
		ext_debug("int.index at %d (block %llu): %u -> %llu\n",
				i, newblock, le32_to_cpu(border), oldblock);
		/* copy indexes */
		m = 0;
		path[i].p_idx++;

		ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
				EXT_MAX_INDEX(path[i].p_hdr));
		if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) !=
					EXT_LAST_INDEX(path[i].p_hdr))) {
			EXT4_ERROR_INODE(inode,
					 "EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!",
					 le32_to_cpu(path[i].p_ext->ee_block));
			err = -EIO;
			goto cleanup;
		}
		while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) {
			ext_debug("%d: move %d:%llu in new index %llu\n", i,
					le32_to_cpu(path[i].p_idx->ei_block),
					idx_pblock(path[i].p_idx),
					newblock);
			/*memmove(++fidx, path[i].p_idx++,
					sizeof(struct ext4_extent_idx));
			neh->eh_entries++;
			BUG_ON(neh->eh_entries > neh->eh_max);*/
			path[i].p_idx++;
			m++;
		}
		if (m) {
			memmove(++fidx, path[i].p_idx - m,
				sizeof(struct ext4_extent_idx) * m);
			le16_add_cpu(&neh->eh_entries, m);
		}
		set_buffer_uptodate(bh);
		unlock_buffer(bh);

		err = ext4_handle_dirty_metadata(handle, inode, bh);
		if (err)
			goto cleanup;
		brelse(bh);
		bh = NULL;

		/* correct old index */
		if (m) {
			err = ext4_ext_get_access(handle, inode, path + i);
			if (err)
				goto cleanup;
			le16_add_cpu(&path[i].p_hdr->eh_entries, -m);
			err = ext4_ext_dirty(handle, inode, path + i);
			if (err)
				goto cleanup;
		}

		i--;
	}

	/* insert new index */
	err = ext4_ext_insert_index(handle, inode, path + at,
				    le32_to_cpu(border), newblock);

cleanup:
	if (bh) {
		if (buffer_locked(bh))
			unlock_buffer(bh);
		brelse(bh);
	}

	if (err) {
		/* free all allocated blocks in error case */
		for (i = 0; i < depth; i++) {
			if (!ablocks[i])
				continue;
			ext4_free_blocks(handle, inode, 0, ablocks[i], 1,
					 EXT4_FREE_BLOCKS_METADATA);
		}
	}
	kfree(ablocks);

	return err;
}

/*
 * ext4_ext_grow_indepth:
 * implements tree growing procedure:
 * - allocates new block
 * - moves top-level data (index block or leaf) into the new block
 * - initializes new top-level, creating index that points to the
 *   just created block
 */
static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
					struct ext4_ext_path *path,
					struct ext4_extent *newext)
{
	struct ext4_ext_path *curp = path;
	struct ext4_extent_header *neh;
	struct ext4_extent_idx *fidx;
	struct buffer_head *bh;
	ext4_fsblk_t newblock;
	int err = 0;

	newblock = ext4_ext_new_meta_block(handle, inode, path, newext, &err);
	if (newblock == 0)
		return err;

	bh = sb_getblk(inode->i_sb, newblock);
	if (!bh) {
		err = -EIO;
		ext4_std_error(inode->i_sb, err);
		return err;
	}
	lock_buffer(bh);

	err = ext4_journal_get_create_access(handle, bh);
	if (err) {
		unlock_buffer(bh);
		goto out;
	}

	/* move top-level index/leaf into new block */
	memmove(bh->b_data, curp->p_hdr, sizeof(EXT4_I(inode)->i_data));

	/* set size of new block */
	neh = ext_block_hdr(bh);
	/* old root could have indexes or leaves
	 * so calculate eh_max the right way */
	if (ext_depth(inode))
		neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
	else
		neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
	neh->eh_magic = EXT4_EXT_MAGIC;
	set_buffer_uptodate(bh);
	unlock_buffer(bh);

	err = ext4_handle_dirty_metadata(handle, inode, bh);
	if (err)
		goto out;

	/* create index in new top-level index: num,max,pointer */
	err = ext4_ext_get_access(handle, inode, curp);
	if (err)
		goto out;

	curp->p_hdr->eh_magic = EXT4_EXT_MAGIC;
	curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0));
	curp->p_hdr->eh_entries = cpu_to_le16(1);
	curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr);

	if (path[0].p_hdr->eh_depth)
		curp->p_idx->ei_block =
			EXT_FIRST_INDEX(path[0].p_hdr)->ei_block;
	else
		curp->p_idx->ei_block =
			EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block;
	ext4_idx_store_pblock(curp->p_idx, newblock);

	neh = ext_inode_hdr(inode);
	fidx = EXT_FIRST_INDEX(neh);
	ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",
		  le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
		  le32_to_cpu(fidx->ei_block), idx_pblock(fidx));

	neh->eh_depth = cpu_to_le16(path->p_depth + 1);
	err = ext4_ext_dirty(handle, inode, curp);
out:
	brelse(bh);

	return err;
}

/*
 * ext4_ext_create_new_leaf:
 * finds empty index and adds new leaf.
 * if no free index is found, then it requests in-depth growing.
 */
static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
					struct ext4_ext_path *path,
					struct ext4_extent *newext)
{
	struct ext4_ext_path *curp;
	int depth, i, err = 0;

repeat:
	i = depth = ext_depth(inode);

	/* walk up to the tree and look for free index entry */
	curp = path + depth;
	while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) {
		i--;
		curp--;
	}

	/* we use already allocated block for index block,
	 * so subsequent data blocks should be contiguous */
	if (EXT_HAS_FREE_INDEX(curp)) {
		/* if we found index with free entry, then use that
		 * entry: create all needed subtree and add new leaf */
		err = ext4_ext_split(handle, inode, path, newext, i);
		if (err)
			goto out;

		/* refill path */
		ext4_ext_drop_refs(path);
		path = ext4_ext_find_extent(inode,
				    (ext4_lblk_t)le32_to_cpu(newext->ee_block),
				    path);
		if (IS_ERR(path))
			err = PTR_ERR(path);
	} else {
		/* tree is full, time to grow in depth */
		err = ext4_ext_grow_indepth(handle, inode, path, newext);
		if (err)
			goto out;

		/* refill path */
		ext4_ext_drop_refs(path);
		path = ext4_ext_find_extent(inode,
				   (ext4_lblk_t)le32_to_cpu(newext->ee_block),
				    path);
		if (IS_ERR(path)) {
			err = PTR_ERR(path);
			goto out;
		}

		/*
		 * only first (depth 0 -> 1) produces free space;
		 * in all other cases we have to split the grown tree
		 */
		depth = ext_depth(inode);
		if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) {
			/* now we need to split */
			goto repeat;
		}
	}

out:
	return err;
}

/*
 * search the closest allocated block to the left for *logical
 * and returns it at @logical + its physical address at @phys.
 * if *logical is the smallest allocated block, the function
 * returns 0 at @phys
 * return value contains 0 (success) or error code
 */
int
ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path,
			ext4_lblk_t *logical, ext4_fsblk_t *phys)
{
	struct ext4_extent_idx *ix;
	struct ext4_extent *ex;
	int depth, ee_len;

	if (unlikely(path == NULL)) {
		EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
		return -EIO;
	}
	depth = path->p_depth;
	*phys = 0;

	if (depth == 0 && path->p_ext == NULL)
		return 0;

	/* usually extent in the path covers blocks smaller
	 * than *logical, but it can be that extent is the
	 * first one in the file */

	ex = path[depth].p_ext;
	ee_len = ext4_ext_get_actual_len(ex);
	if (*logical < le32_to_cpu(ex->ee_block)) {
		if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
			EXT4_ERROR_INODE(inode,
					 "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!",
					 *logical, le32_to_cpu(ex->ee_block));
			return -EIO;
		}
		while (--depth >= 0) {
			ix = path[depth].p_idx;
			if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
				EXT4_ERROR_INODE(inode,
				  "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!",
				  ix != NULL ? ix->ei_block : 0,
				  EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ?
				    EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block : 0,
				  depth);
				return -EIO;
			}
		}
		return 0;
	}

	if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
		EXT4_ERROR_INODE(inode,
				 "logical %d < ee_block %d + ee_len %d!",
				 *logical, le32_to_cpu(ex->ee_block), ee_len);
		return -EIO;
	}

	*logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
	*phys = ext_pblock(ex) + ee_len - 1;
	return 0;
}

/*
 * search the closest allocated block to the right for *logical
 * and returns it at @logical + its physical address at @phys.
 * if *logical is the largest allocated block, the function
 * returns 0 at @phys
 * return value contains 0 (success) or error code
 */
int
ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
			ext4_lblk_t *logical, ext4_fsblk_t *phys)
{
	struct buffer_head *bh = NULL;
	struct ext4_extent_header *eh;
	struct ext4_extent_idx *ix;
	struct ext4_extent *ex;
	ext4_fsblk_t block;
	int depth;	/* Note, NOT eh_depth; depth from top of tree */
	int ee_len;

	if (unlikely(path == NULL)) {
		EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
		return -EIO;
	}
	depth = path->p_depth;
	*phys = 0;

	if (depth == 0 && path->p_ext == NULL)
		return 0;

	/* usually extent in the path covers blocks smaller
	 * than *logical, but it can be that extent is the
	 * first one in the file */

	ex = path[depth].p_ext;
	ee_len = ext4_ext_get_actual_len(ex);
	if (*logical < le32_to_cpu(ex->ee_block)) {
		if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
			EXT4_ERROR_INODE(inode,
					 "first_extent(path[%d].p_hdr) != ex",
					 depth);
			return -EIO;
		}
		while (--depth >= 0) {
			ix = path[depth].p_idx;
			if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
				EXT4_ERROR_INODE(inode,
						 "ix != EXT_FIRST_INDEX *logical %d!",
						 *logical);
				return -EIO;
			}
		}
		*logical = le32_to_cpu(ex->ee_block);
		*phys = ext_pblock(ex);
		return 0;
	}

	if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
		EXT4_ERROR_INODE(inode,
				 "logical %d < ee_block %d + ee_len %d!",
				 *logical, le32_to_cpu(ex->ee_block), ee_len);
		return -EIO;
	}

	if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
		/* next allocated block in this leaf */
		ex++;
		*logical = le32_to_cpu(ex->ee_block);
		*phys = ext_pblock(ex);
		return 0;
	}

	/* go up and search for index to the right */
	while (--depth >= 0) {
		ix = path[depth].p_idx;
		if (ix != EXT_LAST_INDEX(path[depth].p_hdr))
			goto got_index;
	}

	/* we've gone up to the root and found no index to the right */
	return 0;

got_index:
	/* we've found index to the right, let's
	 * follow it and find the closest allocated
	 * block to the right */
	ix++;
	block = idx_pblock(ix);
	while (++depth < path->p_depth) {
		bh = sb_bread(inode->i_sb, block);
		if (bh == NULL)
			return -EIO;
		eh = ext_block_hdr(bh);
		/* subtract from p_depth to get proper eh_depth */
		if (ext4_ext_check(inode, eh, path->p_depth - depth)) {
			put_bh(bh);
			return -EIO;
		}
		ix = EXT_FIRST_INDEX(eh);
		block = idx_pblock(ix);
		put_bh(bh);
	}

	bh = sb_bread(inode->i_sb, block);
	if (bh == NULL)
		return -EIO;
	eh = ext_block_hdr(bh);
	if (ext4_ext_check(inode, eh, path->p_depth - depth)) {
		put_bh(bh);
		return -EIO;
	}
	ex = EXT_FIRST_EXTENT(eh);
	*logical = le32_to_cpu(ex->ee_block);
	*phys = ext_pblock(ex);
	put_bh(bh);
	return 0;
}

/*
 * ext4_ext_next_allocated_block:
 * returns allocated block in subsequent extent or EXT_MAX_BLOCK.
 * NOTE: it considers block number from index entry as
 * allocated block. Thus, index entries have to be consistent
 * with leaves.
 */
static ext4_lblk_t
ext4_ext_next_allocated_block(struct ext4_ext_path *path)
{
	int depth;

	BUG_ON(path == NULL);
	depth = path->p_depth;

	if (depth == 0 && path->p_ext == NULL)
		return EXT_MAX_BLOCK;

	while (depth >= 0) {
		if (depth == path->p_depth) {
			/* leaf */
			if (path[depth].p_ext !=
					EXT_LAST_EXTENT(path[depth].p_hdr))
				return le32_to_cpu(path[depth].p_ext[1].ee_block);
		} else {
			/* index */
			if (path[depth].p_idx !=
					EXT_LAST_INDEX(path[depth].p_hdr))
				return le32_to_cpu(path[depth].p_idx[1].ei_block);
		}
		depth--;
	}

	return EXT_MAX_BLOCK;
}

/*
 * ext4_ext_next_leaf_block:
 * returns first allocated block from next leaf or EXT_MAX_BLOCK
 */
static ext4_lblk_t ext4_ext_next_leaf_block(struct inode *inode,
					struct ext4_ext_path *path)
{
	int depth;

	BUG_ON(path == NULL);
	depth = path->p_depth;

	/* zero-tree has no leaf blocks at all */
	if (depth == 0)
		return EXT_MAX_BLOCK;

	/* go to index block */
	depth--;

	while (depth >= 0) {
		if (path[depth].p_idx !=
				EXT_LAST_INDEX(path[depth].p_hdr))
			return (ext4_lblk_t)
				le32_to_cpu(path[depth].p_idx[1].ei_block);
		depth--;
	}

	return EXT_MAX_BLOCK;
}

/*
 * ext4_ext_correct_indexes:
 * if leaf gets modified and modified extent is first in the leaf,
 * then we have to correct all indexes above.
 * TODO: do we need to correct tree in all cases?
 */
static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
				struct ext4_ext_path *path)
{
	struct ext4_extent_header *eh;
	int depth = ext_depth(inode);
	struct ext4_extent *ex;
	__le32 border;
	int k, err = 0;

	eh = path[depth].p_hdr;
	ex = path[depth].p_ext;

	if (unlikely(ex == NULL || eh == NULL)) {
		EXT4_ERROR_INODE(inode,
				 "ex %p == NULL or eh %p == NULL", ex, eh);
		return -EIO;
	}

	if (depth == 0) {
		/* there is no tree at all */
		return 0;
	}

	if (ex != EXT_FIRST_EXTENT(eh)) {
		/* we correct tree if first leaf got modified only */
		return 0;
	}

	/*
	 * TODO: we need correction if border is smaller than current one
	 */
	k = depth - 1;
	border = path[depth].p_ext->ee_block;
	err = ext4_ext_get_access(handle, inode, path + k);
	if (err)
		return err;
	path[k].p_idx->ei_block = border;
	err = ext4_ext_dirty(handle, inode, path + k);
	if (err)
		return err;

	while (k--) {
		/* change all left-side indexes */
		if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr))
			break;
		err = ext4_ext_get_access(handle, inode, path + k);
		if (err)
			break;
		path[k].p_idx->ei_block = border;
		err = ext4_ext_dirty(handle, inode, path + k);
		if (err)
			break;
	}

	return err;
}

int
ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
				struct ext4_extent *ex2)
{
	unsigned short ext1_ee_len, ext2_ee_len, max_len;

	/*
	 * Make sure that either both extents are uninitialized, or
	 * both are _not_.
	 */
	if (ext4_ext_is_uninitialized(ex1) ^ ext4_ext_is_uninitialized(ex2))
		return 0;

	if (ext4_ext_is_uninitialized(ex1))
		max_len = EXT_UNINIT_MAX_LEN;
	else
		max_len = EXT_INIT_MAX_LEN;

	ext1_ee_len = ext4_ext_get_actual_len(ex1);
	ext2_ee_len = ext4_ext_get_actual_len(ex2);

	if (le32_to_cpu(ex1->ee_block) + ext1_ee_len !=
			le32_to_cpu(ex2->ee_block))
		return 0;

	/*
	 * To allow future support for preallocated extents to be added
	 * as an RO_COMPAT feature, refuse to merge two extents if
	 * this can result in the top bit of ee_len being set.
	 */
	if (ext1_ee_len + ext2_ee_len > max_len)
		return 0;
#ifdef AGGRESSIVE_TEST
	if (ext1_ee_len >= 4)
		return 0;
#endif

	if (ext_pblock(ex1) + ext1_ee_len == ext_pblock(ex2))
		return 1;
	return 0;
}

/*
 * This function tries to merge the "ex" extent to the next extent in the tree.
 * It always tries to merge towards right. If you want to merge towards
 * left, pass "ex - 1" as argument instead of "ex".
 * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
 * 1 if they got merged.
 */
int ext4_ext_try_to_merge(struct inode *inode,
			  struct ext4_ext_path *path,
			  struct ext4_extent *ex)
{
	struct ext4_extent_header *eh;
	unsigned int depth, len;
	int merge_done = 0;
	int uninitialized = 0;

	depth = ext_depth(inode);
	BUG_ON(path[depth].p_hdr == NULL);
	eh = path[depth].p_hdr;

	while (ex < EXT_LAST_EXTENT(eh)) {
		if (!ext4_can_extents_be_merged(inode, ex, ex + 1))
			break;
		/* merge with next extent! */
		if (ext4_ext_is_uninitialized(ex))
			uninitialized = 1;
		ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
				+ ext4_ext_get_actual_len(ex + 1));
		if (uninitialized)
			ext4_ext_mark_uninitialized(ex);

		if (ex + 1 < EXT_LAST_EXTENT(eh)) {
			len = (EXT_LAST_EXTENT(eh) - ex - 1)
				* sizeof(struct ext4_extent);
			memmove(ex + 1, ex + 2, len);
		}
		le16_add_cpu(&eh->eh_entries, -1);
		merge_done = 1;
		WARN_ON(eh->eh_entries == 0);
		if (!eh->eh_entries)
			EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!");
	}

	return merge_done;
}

/*
 * check if a portion of the "newext" extent overlaps with an
 * existing extent.
 *
 * If there is an overlap discovered, it updates the length of the newext
 * such that there will be no overlap, and then returns 1.
 * If there is no overlap found, it returns 0.
 */
unsigned int ext4_ext_check_overlap(struct inode *inode,
				    struct ext4_extent *newext,
				    struct ext4_ext_path *path)
{
	ext4_lblk_t b1, b2;
	unsigned int depth, len1;
	unsigned int ret = 0;

	b1 = le32_to_cpu(newext->ee_block);
	len1 = ext4_ext_get_actual_len(newext);
	depth = ext_depth(inode);
	if (!path[depth].p_ext)
		goto out;
	b2 = le32_to_cpu(path[depth].p_ext->ee_block);

	/*
	 * get the next allocated block if the extent in the path
	 * is before the requested block(s)
	 */
	if (b2 < b1) {
		b2 = ext4_ext_next_allocated_block(path);
		if (b2 == EXT_MAX_BLOCK)
			goto out;
	}

	/* check for wrap through zero on extent logical start block*/
	if (b1 + len1 < b1) {
		len1 = EXT_MAX_BLOCK - b1;
		newext->ee_len = cpu_to_le16(len1);
		ret = 1;
	}

	/* check for overlap */
	if (b1 + len1 > b2) {
		newext->ee_len = cpu_to_le16(b2 - b1);
		ret = 1;
	}
out:
	return ret;
}

/*
 * ext4_ext_insert_extent:
 * tries to merge requested extent into the existing extent or
 * inserts requested extent as new one into the tree,
 * creating new leaf in the no-space case.
 */
int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
				struct ext4_ext_path *path,
				struct ext4_extent *newext, int flag)
{
	struct ext4_extent_header *eh;
	struct ext4_extent *ex, *fex;
	struct ext4_extent *nearex; /* nearest extent */
	struct ext4_ext_path *npath = NULL;
	int depth, len, err;
	ext4_lblk_t next;
	unsigned uninitialized = 0;

	if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
		EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
		return -EIO;
	}
	depth = ext_depth(inode);
	ex = path[depth].p_ext;
	if (unlikely(path[depth].p_hdr == NULL)) {
		EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
		return -EIO;
	}

	/* try to insert block into found extent and return */
	if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)
		&& ext4_can_extents_be_merged(inode, ex, newext)) {
		ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
				ext4_ext_is_uninitialized(newext),
				ext4_ext_get_actual_len(newext),
				le32_to_cpu(ex->ee_block),
				ext4_ext_is_uninitialized(ex),
				ext4_ext_get_actual_len(ex), ext_pblock(ex));
		err = ext4_ext_get_access(handle, inode, path + depth);
		if (err)
			return err;

		/*
		 * ext4_can_extents_be_merged should have checked that either
		 * both extents are uninitialized, or both aren't. Thus we
		 * need to check only one of them here.
		 */
		if (ext4_ext_is_uninitialized(ex))
			uninitialized = 1;
		ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
					+ ext4_ext_get_actual_len(newext));
		if (uninitialized)
			ext4_ext_mark_uninitialized(ex);
		eh = path[depth].p_hdr;
		nearex = ex;
		goto merge;
	}

repeat:
	depth = ext_depth(inode);
	eh = path[depth].p_hdr;
	if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max))
		goto has_space;

	/* probably next leaf has space for us? */
	fex = EXT_LAST_EXTENT(eh);
	next = ext4_ext_next_leaf_block(inode, path);
	if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block)
	    && next != EXT_MAX_BLOCK) {
		ext_debug("next leaf block - %d\n", next);
		BUG_ON(npath != NULL);
		npath = ext4_ext_find_extent(inode, next, NULL);
		if (IS_ERR(npath))
			return PTR_ERR(npath);
		BUG_ON(npath->p_depth != path->p_depth);
		eh = npath[depth].p_hdr;
		if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) {
			ext_debug("next leaf isn't full(%d)\n",
				  le16_to_cpu(eh->eh_entries));
			path = npath;
			goto repeat;
		}
		ext_debug("next leaf has no free space(%d,%d)\n",
			  le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
	}

	/*
	 * There is no free space in the found leaf.
	 * We're gonna add a new leaf in the tree.
	 */
	err = ext4_ext_create_new_leaf(handle, inode, path, newext);
	if (err)
		goto cleanup;
	depth = ext_depth(inode);
	eh = path[depth].p_hdr;

has_space:
	nearex = path[depth].p_ext;

	err = ext4_ext_get_access(handle, inode, path + depth);
	if (err)
		goto cleanup;

	if (!nearex) {
		/* there is no extent in this leaf, create first one */
		ext_debug("first extent in the leaf: %d:%llu:[%d]%d\n",
				le32_to_cpu(newext->ee_block),
				ext_pblock(newext),
				ext4_ext_is_uninitialized(newext),
				ext4_ext_get_actual_len(newext));
		path[depth].p_ext = EXT_FIRST_EXTENT(eh);
	} else if (le32_to_cpu(newext->ee_block)
			   > le32_to_cpu(nearex->ee_block)) {
/*		BUG_ON(newext->ee_block == nearex->ee_block); */
		if (nearex != EXT_LAST_EXTENT(eh)) {
			len = EXT_MAX_EXTENT(eh) - nearex;
			len = (len - 1) * sizeof(struct ext4_extent);
			len = len < 0 ? 0 : len;
			ext_debug("insert %d:%llu:[%d]%d after: nearest 0x%p, "
					"move %d from 0x%p to 0x%p\n",
					le32_to_cpu(newext->ee_block),
					ext_pblock(newext),
					ext4_ext_is_uninitialized(newext),
					ext4_ext_get_actual_len(newext),
					nearex, len, nearex + 1, nearex + 2);
			memmove(nearex + 2, nearex + 1, len);
		}
		path[depth].p_ext = nearex + 1;
	} else {
		BUG_ON(newext->ee_block == nearex->ee_block);
		len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext4_extent);
		len = len < 0 ? 0 : len;
		ext_debug("insert %d:%llu:[%d]%d before: nearest 0x%p, "
				"move %d from 0x%p to 0x%p\n",
				le32_to_cpu(newext->ee_block),
				ext_pblock(newext),
				ext4_ext_is_uninitialized(newext),
				ext4_ext_get_actual_len(newext),
				nearex, len, nearex + 1, nearex + 2);
		memmove(nearex + 1, nearex, len);
		path[depth].p_ext = nearex;
	}

	le16_add_cpu(&eh->eh_entries, 1);
	nearex = path[depth].p_ext;
	nearex->ee_block = newext->ee_block;
	ext4_ext_store_pblock(nearex, ext_pblock(newext));
	nearex->ee_len = newext->ee_len;

merge:
	/* try to merge extents to the right */
	if (!(flag & EXT4_GET_BLOCKS_PRE_IO))
		ext4_ext_try_to_merge(inode, path, nearex);

	/* try to merge extents to the left */

	/* time to correct all indexes above */
	err = ext4_ext_correct_indexes(handle, inode, path);
	if (err)
		goto cleanup;

	err = ext4_ext_dirty(handle, inode, path + depth);

cleanup:
	if (npath) {
		ext4_ext_drop_refs(npath);
		kfree(npath);
	}
	ext4_ext_invalidate_cache(inode);
	return err;
}

int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
			ext4_lblk_t num, ext_prepare_callback func,
			void *cbdata)
{
	struct ext4_ext_path *path = NULL;
	struct ext4_ext_cache cbex;
	struct ext4_extent *ex;
	ext4_lblk_t next, start = 0, end = 0;
	ext4_lblk_t last = block + num;
	int depth, exists, err = 0;

	BUG_ON(func == NULL);
	BUG_ON(inode == NULL);

	while (block < last && block != EXT_MAX_BLOCK) {
		num = last - block;
		/* find extent for this block */
		down_read(&EXT4_I(inode)->i_data_sem);
		path = ext4_ext_find_extent(inode, block, path);
		up_read(&EXT4_I(inode)->i_data_sem);
		if (IS_ERR(path)) {
			err = PTR_ERR(path);
			path = NULL;
			break;
		}

		depth = ext_depth(inode);
		if (unlikely(path[depth].p_hdr == NULL)) {
			EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
			err = -EIO;
			break;
		}
		ex = path[depth].p_ext;
		next = ext4_ext_next_allocated_block(path);

		exists = 0;
		if (!ex) {
			/* there is no extent yet, so try to allocate
			 * all requested space */
			start = block;
			end = block + num;
		} else if (le32_to_cpu(ex->ee_block) > block) {
			/* need to allocate space before found extent */
			start = block;
			end = le32_to_cpu(ex->ee_block);
			if (block + num < end)
				end = block + num;
		} else if (block >= le32_to_cpu(ex->ee_block)
					+ ext4_ext_get_actual_len(ex)) {
			/* need to allocate space after found extent */
			start = block;
			end = block + num;
			if (end >= next)
				end = next;
		} else if (block >= le32_to_cpu(ex->ee_block)) {
			/*
			 * some part of requested space is covered
			 * by found extent
			 */
			start = block;
			end = le32_to_cpu(ex->ee_block)
				+ ext4_ext_get_actual_len(ex);
			if (block + num < end)
				end = block + num;
			exists = 1;
		} else {
			BUG();
		}
		BUG_ON(end <= start);

		if (!exists) {
			cbex.ec_block = start;
			cbex.ec_len = end - start;
			cbex.ec_start = 0;
			cbex.ec_type = EXT4_EXT_CACHE_GAP;
		} else {
			cbex.ec_block = le32_to_cpu(ex->ee_block);
			cbex.ec_len = ext4_ext_get_actual_len(ex);
			cbex.ec_start = ext_pblock(ex);
			cbex.ec_type = EXT4_EXT_CACHE_EXTENT;
		}

		if (unlikely(cbex.ec_len == 0)) {
			EXT4_ERROR_INODE(inode, "cbex.ec_len == 0");
			err = -EIO;
			break;
		}
		err = func(inode, path, &cbex, ex, cbdata);
		ext4_ext_drop_refs(path);

		if (err < 0)
			break;

		if (err == EXT_REPEAT)
			continue;
		else if (err == EXT_BREAK) {
			err = 0;
			break;
		}

		if (ext_depth(inode) != depth) {
			/* depth was changed. we have to realloc path */
			kfree(path);
			path = NULL;
		}

		block = cbex.ec_block + cbex.ec_len;
	}

	if (path) {
		ext4_ext_drop_refs(path);
		kfree(path);
	}

	return err;
}

static void
ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block,
			__u32 len, ext4_fsblk_t start, int type)
{
	struct ext4_ext_cache *cex;
	BUG_ON(len == 0);
	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
	cex = &EXT4_I(inode)->i_cached_extent;
	cex->ec_type = type;
	cex->ec_block = block;
	cex->ec_len = len;
	cex->ec_start = start;
	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
}

/*
 * ext4_ext_put_gap_in_cache:
 * calculate boundaries of the gap that the requested block fits into
 * and cache this gap
 */
static void
ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
				ext4_lblk_t block)
{
	int depth = ext_depth(inode);
	unsigned long len;
	ext4_lblk_t lblock;
	struct ext4_extent *ex;

	ex = path[depth].p_ext;
	if (ex == NULL) {
		/* there is no extent yet, so gap is [0;-] */
		lblock = 0;
		len = EXT_MAX_BLOCK;
		ext_debug("cache gap(whole file):");
	} else if (block < le32_to_cpu(ex->ee_block)) {
		lblock = block;
		len = le32_to_cpu(ex->ee_block) - block;
		ext_debug("cache gap(before): %u [%u:%u]",
				block,
				le32_to_cpu(ex->ee_block),
				 ext4_ext_get_actual_len(ex));
	} else if (block >= le32_to_cpu(ex->ee_block)
			+ ext4_ext_get_actual_len(ex)) {
		ext4_lblk_t next;
		lblock = le32_to_cpu(ex->ee_block)
			+ ext4_ext_get_actual_len(ex);

		next = ext4_ext_next_allocated_block(path);
		ext_debug("cache gap(after): [%u:%u] %u",
				le32_to_cpu(ex->ee_block),
				ext4_ext_get_actual_len(ex),
				block);
		BUG_ON(next == lblock);
		len = next - lblock;
	} else {
		lblock = len = 0;
		BUG();
	}

	ext_debug(" -> %u:%lu\n", lblock, len);
	ext4_ext_put_in_cache(inode, lblock, len, 0, EXT4_EXT_CACHE_GAP);
}

static int
ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
			struct ext4_extent *ex)
{
	struct ext4_ext_cache *cex;
	int ret = EXT4_EXT_CACHE_NO;

	/*
	 * We borrow i_block_reservation_lock to protect i_cached_extent
	 */
	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
	cex = &EXT4_I(inode)->i_cached_extent;

	/* has cache valid data? */
	if (cex->ec_type == EXT4_EXT_CACHE_NO)
		goto errout;

	BUG_ON(cex->ec_type != EXT4_EXT_CACHE_GAP &&
			cex->ec_type != EXT4_EXT_CACHE_EXTENT);
	if (in_range(block, cex->ec_block, cex->ec_len)) {
		ex->ee_block = cpu_to_le32(cex->ec_block);
		ext4_ext_store_pblock(ex, cex->ec_start);
		ex->ee_len = cpu_to_le16(cex->ec_len);
		ext_debug("%u cached by %u:%u:%llu\n",
				block,
				cex->ec_block, cex->ec_len, cex->ec_start);
		ret = cex->ec_type;
	}
errout:
	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
	return ret;
}

/*
 * ext4_ext_rm_idx:
 * removes index from the index block.
 * It's used in truncate case only, thus all requests are for
 * last index in the block only.
 */
static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
			struct ext4_ext_path *path)
{
	int err;
	ext4_fsblk_t leaf;

	/* free index block */
	path--;
	leaf = idx_pblock(path->p_idx);
	if (unlikely(path->p_hdr->eh_entries == 0)) {
		EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0");
		return -EIO;
	}
	err = ext4_ext_get_access(handle, inode, path);
	if (err)
		return err;
	le16_add_cpu(&path->p_hdr->eh_entries, -1);
	err = ext4_ext_dirty(handle, inode, path);
	if (err)
		return err;
	ext_debug("index is empty, remove it, free block %llu\n", leaf);
	ext4_free_blocks(handle, inode, 0, leaf, 1,
			 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
	return err;
}

/*
 * ext4_ext_calc_credits_for_single_extent:
 * This routine returns the maximum number of credits needed to insert
 * an extent into the extent tree.
 * When the actual path is passed, the caller should calculate credits
 * under i_data_sem.
 */
int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
						struct ext4_ext_path *path)
{
	if (path) {
		int depth = ext_depth(inode);
		int ret = 0;

		/* probably there is space in the leaf? */
		if (le16_to_cpu(path[depth].p_hdr->eh_entries)
				< le16_to_cpu(path[depth].p_hdr->eh_max)) {

			/*
			 * There is space left in the leaf, so there is no
			 * need to account for a leaf block credit.
			 *
			 * Bitmaps, block group descriptor blocks and other
			 * metadata blocks still need to be accounted for.
			 */
			/* 1 bitmap, 1 block group descriptor */
			ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb);
			return ret;
		}
	}

	return ext4_chunk_trans_blocks(inode, nrblocks);
}

/*
 * How many index/leaf blocks need to be changed/allocated to modify nrblocks?
 *
 * If nrblocks fit in a single extent (the chunk flag is 1), then in the
 * worst case one index/leaf block per tree level needs to be changed;
 * if the tree splits because of the new extent, the old index/leaf
 * blocks need to be updated as well.
 *
 * If the nrblocks are discontiguous, they could cause
 * the whole tree to split more than once, but this is really rare.
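 *
 * For example, at tree depth 2 a single contiguous chunk is budgeted at
 * 2 * 2 = 4 modified index/leaf blocks, while a discontiguous request is
 * budgeted at 2 * 3 = 6 (see ext4_ext_index_trans_blocks() below).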
2140 */ 2141 int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) 2142 { 2143 int index; 2144 int depth = ext_depth(inode); 2145 2146 if (chunk) 2147 index = depth * 2; 2148 else 2149 index = depth * 3; 2150 2151 return index; 2152 } 2153 2154 static int ext4_remove_blocks(handle_t *handle, struct inode *inode, 2155 struct ext4_extent *ex, 2156 ext4_lblk_t from, ext4_lblk_t to) 2157 { 2158 unsigned short ee_len = ext4_ext_get_actual_len(ex); 2159 int flags = EXT4_FREE_BLOCKS_FORGET; 2160 2161 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) 2162 flags |= EXT4_FREE_BLOCKS_METADATA; 2163 #ifdef EXTENTS_STATS 2164 { 2165 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 2166 spin_lock(&sbi->s_ext_stats_lock); 2167 sbi->s_ext_blocks += ee_len; 2168 sbi->s_ext_extents++; 2169 if (ee_len < sbi->s_ext_min) 2170 sbi->s_ext_min = ee_len; 2171 if (ee_len > sbi->s_ext_max) 2172 sbi->s_ext_max = ee_len; 2173 if (ext_depth(inode) > sbi->s_depth_max) 2174 sbi->s_depth_max = ext_depth(inode); 2175 spin_unlock(&sbi->s_ext_stats_lock); 2176 } 2177 #endif 2178 if (from >= le32_to_cpu(ex->ee_block) 2179 && to == le32_to_cpu(ex->ee_block) + ee_len - 1) { 2180 /* tail removal */ 2181 ext4_lblk_t num; 2182 ext4_fsblk_t start; 2183 2184 num = le32_to_cpu(ex->ee_block) + ee_len - from; 2185 start = ext_pblock(ex) + ee_len - num; 2186 ext_debug("free last %u blocks starting %llu\n", num, start); 2187 ext4_free_blocks(handle, inode, 0, start, num, flags); 2188 } else if (from == le32_to_cpu(ex->ee_block) 2189 && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) { 2190 printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n", 2191 from, to, le32_to_cpu(ex->ee_block), ee_len); 2192 } else { 2193 printk(KERN_INFO "strange request: removal(2) " 2194 "%u-%u from %u:%u\n", 2195 from, to, le32_to_cpu(ex->ee_block), ee_len); 2196 } 2197 return 0; 2198 } 2199 2200 static int 2201 ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, 2202 struct ext4_ext_path *path, ext4_lblk_t start) 2203 { 2204 int err = 0, correct_index = 0; 2205 int depth = ext_depth(inode), credits; 2206 struct ext4_extent_header *eh; 2207 ext4_lblk_t a, b, block; 2208 unsigned num; 2209 ext4_lblk_t ex_ee_block; 2210 unsigned short ex_ee_len; 2211 unsigned uninitialized = 0; 2212 struct ext4_extent *ex; 2213 2214 /* the header must be checked already in ext4_ext_remove_space() */ 2215 ext_debug("truncate since %u in leaf\n", start); 2216 if (!path[depth].p_hdr) 2217 path[depth].p_hdr = ext_block_hdr(path[depth].p_bh); 2218 eh = path[depth].p_hdr; 2219 if (unlikely(path[depth].p_hdr == NULL)) { 2220 EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); 2221 return -EIO; 2222 } 2223 /* find where to start removing */ 2224 ex = EXT_LAST_EXTENT(eh); 2225 2226 ex_ee_block = le32_to_cpu(ex->ee_block); 2227 ex_ee_len = ext4_ext_get_actual_len(ex); 2228 2229 while (ex >= EXT_FIRST_EXTENT(eh) && 2230 ex_ee_block + ex_ee_len > start) { 2231 2232 if (ext4_ext_is_uninitialized(ex)) 2233 uninitialized = 1; 2234 else 2235 uninitialized = 0; 2236 2237 ext_debug("remove ext %u:[%d]%d\n", ex_ee_block, 2238 uninitialized, ex_ee_len); 2239 path[depth].p_ext = ex; 2240 2241 a = ex_ee_block > start ? ex_ee_block : start; 2242 b = ex_ee_block + ex_ee_len - 1 < EXT_MAX_BLOCK ? 
2243 ex_ee_block + ex_ee_len - 1 : EXT_MAX_BLOCK; 2244 2245 ext_debug(" border %u:%u\n", a, b); 2246 2247 if (a != ex_ee_block && b != ex_ee_block + ex_ee_len - 1) { 2248 block = 0; 2249 num = 0; 2250 BUG(); 2251 } else if (a != ex_ee_block) { 2252 /* remove tail of the extent */ 2253 block = ex_ee_block; 2254 num = a - block; 2255 } else if (b != ex_ee_block + ex_ee_len - 1) { 2256 /* remove head of the extent */ 2257 block = a; 2258 num = b - a; 2259 /* there is no "make a hole" API yet */ 2260 BUG(); 2261 } else { 2262 /* remove whole extent: excellent! */ 2263 block = ex_ee_block; 2264 num = 0; 2265 BUG_ON(a != ex_ee_block); 2266 BUG_ON(b != ex_ee_block + ex_ee_len - 1); 2267 } 2268 2269 /* 2270 * 3 for leaf, sb, and inode plus 2 (bmap and group 2271 * descriptor) for each block group; assume two block 2272 * groups plus ex_ee_len/blocks_per_block_group for 2273 * the worst case 2274 */ 2275 credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb)); 2276 if (ex == EXT_FIRST_EXTENT(eh)) { 2277 correct_index = 1; 2278 credits += (ext_depth(inode)) + 1; 2279 } 2280 credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); 2281 2282 err = ext4_ext_truncate_extend_restart(handle, inode, credits); 2283 if (err) 2284 goto out; 2285 2286 err = ext4_ext_get_access(handle, inode, path + depth); 2287 if (err) 2288 goto out; 2289 2290 err = ext4_remove_blocks(handle, inode, ex, a, b); 2291 if (err) 2292 goto out; 2293 2294 if (num == 0) { 2295 /* this extent is removed; mark slot entirely unused */ 2296 ext4_ext_store_pblock(ex, 0); 2297 le16_add_cpu(&eh->eh_entries, -1); 2298 } 2299 2300 ex->ee_block = cpu_to_le32(block); 2301 ex->ee_len = cpu_to_le16(num); 2302 /* 2303 * Do not mark uninitialized if all the blocks in the 2304 * extent have been removed. 2305 */ 2306 if (uninitialized && num) 2307 ext4_ext_mark_uninitialized(ex); 2308 2309 err = ext4_ext_dirty(handle, inode, path + depth); 2310 if (err) 2311 goto out; 2312 2313 ext_debug("new extent: %u:%u:%llu\n", block, num, 2314 ext_pblock(ex)); 2315 ex--; 2316 ex_ee_block = le32_to_cpu(ex->ee_block); 2317 ex_ee_len = ext4_ext_get_actual_len(ex); 2318 } 2319 2320 if (correct_index && eh->eh_entries) 2321 err = ext4_ext_correct_indexes(handle, inode, path); 2322 2323 /* if this leaf is free, then we should 2324 * remove it from index block above */ 2325 if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL) 2326 err = ext4_ext_rm_idx(handle, inode, path + depth); 2327 2328 out: 2329 return err; 2330 } 2331 2332 /* 2333 * ext4_ext_more_to_rm: 2334 * returns 1 if current index has to be freed (even partial) 2335 */ 2336 static int 2337 ext4_ext_more_to_rm(struct ext4_ext_path *path) 2338 { 2339 BUG_ON(path->p_idx == NULL); 2340 2341 if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr)) 2342 return 0; 2343 2344 /* 2345 * if truncate on deeper level happened, it wasn't partial, 2346 * so we have to consider current index for truncation 2347 */ 2348 if (le16_to_cpu(path->p_hdr->eh_entries) == path->p_block) 2349 return 0; 2350 return 1; 2351 } 2352 2353 static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) 2354 { 2355 struct super_block *sb = inode->i_sb; 2356 int depth = ext_depth(inode); 2357 struct ext4_ext_path *path; 2358 handle_t *handle; 2359 int i, err; 2360 2361 ext_debug("truncate since %u\n", start); 2362 2363 /* probably first extent we're gonna free will be last in block */ 2364 handle = ext4_journal_start(inode, depth + 1); 2365 if (IS_ERR(handle)) 2366 return PTR_ERR(handle); 2367 2368 again: 2369 
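	/*
	 * Restart point: ext4_ext_rm_leaf() propagates -EAGAIN from
	 * ext4_ext_truncate_extend_restart() after the transaction has been
	 * restarted; the path was freed at "out:" below, so the scan is
	 * rebuilt from scratch here.
	 */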
ext4_ext_invalidate_cache(inode); 2370 2371 /* 2372 * We start scanning from right side, freeing all the blocks 2373 * after i_size and walking into the tree depth-wise. 2374 */ 2375 depth = ext_depth(inode); 2376 path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS); 2377 if (path == NULL) { 2378 ext4_journal_stop(handle); 2379 return -ENOMEM; 2380 } 2381 path[0].p_depth = depth; 2382 path[0].p_hdr = ext_inode_hdr(inode); 2383 if (ext4_ext_check(inode, path[0].p_hdr, depth)) { 2384 err = -EIO; 2385 goto out; 2386 } 2387 i = err = 0; 2388 2389 while (i >= 0 && err == 0) { 2390 if (i == depth) { 2391 /* this is leaf block */ 2392 err = ext4_ext_rm_leaf(handle, inode, path, start); 2393 /* root level has p_bh == NULL, brelse() eats this */ 2394 brelse(path[i].p_bh); 2395 path[i].p_bh = NULL; 2396 i--; 2397 continue; 2398 } 2399 2400 /* this is index block */ 2401 if (!path[i].p_hdr) { 2402 ext_debug("initialize header\n"); 2403 path[i].p_hdr = ext_block_hdr(path[i].p_bh); 2404 } 2405 2406 if (!path[i].p_idx) { 2407 /* this level hasn't been touched yet */ 2408 path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr); 2409 path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1; 2410 ext_debug("init index ptr: hdr 0x%p, num %d\n", 2411 path[i].p_hdr, 2412 le16_to_cpu(path[i].p_hdr->eh_entries)); 2413 } else { 2414 /* we were already here, see at next index */ 2415 path[i].p_idx--; 2416 } 2417 2418 ext_debug("level %d - index, first 0x%p, cur 0x%p\n", 2419 i, EXT_FIRST_INDEX(path[i].p_hdr), 2420 path[i].p_idx); 2421 if (ext4_ext_more_to_rm(path + i)) { 2422 struct buffer_head *bh; 2423 /* go to the next level */ 2424 ext_debug("move to level %d (block %llu)\n", 2425 i + 1, idx_pblock(path[i].p_idx)); 2426 memset(path + i + 1, 0, sizeof(*path)); 2427 bh = sb_bread(sb, idx_pblock(path[i].p_idx)); 2428 if (!bh) { 2429 /* should we reset i_size? 
*/ 2430 err = -EIO; 2431 break; 2432 } 2433 if (WARN_ON(i + 1 > depth)) { 2434 err = -EIO; 2435 break; 2436 } 2437 if (ext4_ext_check(inode, ext_block_hdr(bh), 2438 depth - i - 1)) { 2439 err = -EIO; 2440 break; 2441 } 2442 path[i + 1].p_bh = bh; 2443 2444 /* save actual number of indexes since this 2445 * number is changed at the next iteration */ 2446 path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries); 2447 i++; 2448 } else { 2449 /* we finished processing this index, go up */ 2450 if (path[i].p_hdr->eh_entries == 0 && i > 0) { 2451 /* index is empty, remove it; 2452 * handle must be already prepared by the 2453 * truncatei_leaf() */ 2454 err = ext4_ext_rm_idx(handle, inode, path + i); 2455 } 2456 /* root level has p_bh == NULL, brelse() eats this */ 2457 brelse(path[i].p_bh); 2458 path[i].p_bh = NULL; 2459 i--; 2460 ext_debug("return to level %d\n", i); 2461 } 2462 } 2463 2464 /* TODO: flexible tree reduction should be here */ 2465 if (path->p_hdr->eh_entries == 0) { 2466 /* 2467 * truncate to zero freed all the tree, 2468 * so we need to correct eh_depth 2469 */ 2470 err = ext4_ext_get_access(handle, inode, path); 2471 if (err == 0) { 2472 ext_inode_hdr(inode)->eh_depth = 0; 2473 ext_inode_hdr(inode)->eh_max = 2474 cpu_to_le16(ext4_ext_space_root(inode, 0)); 2475 err = ext4_ext_dirty(handle, inode, path); 2476 } 2477 } 2478 out: 2479 ext4_ext_drop_refs(path); 2480 kfree(path); 2481 if (err == -EAGAIN) 2482 goto again; 2483 ext4_journal_stop(handle); 2484 2485 return err; 2486 } 2487 2488 /* 2489 * called at mount time 2490 */ 2491 void ext4_ext_init(struct super_block *sb) 2492 { 2493 /* 2494 * possible initialization would be here 2495 */ 2496 2497 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { 2498 #if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS) 2499 printk(KERN_INFO "EXT4-fs: file extents enabled"); 2500 #ifdef AGGRESSIVE_TEST 2501 printk(", aggressive tests"); 2502 #endif 2503 #ifdef CHECK_BINSEARCH 2504 printk(", check binsearch"); 2505 #endif 2506 #ifdef EXTENTS_STATS 2507 printk(", stats"); 2508 #endif 2509 printk("\n"); 2510 #endif 2511 #ifdef EXTENTS_STATS 2512 spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock); 2513 EXT4_SB(sb)->s_ext_min = 1 << 30; 2514 EXT4_SB(sb)->s_ext_max = 0; 2515 #endif 2516 } 2517 } 2518 2519 /* 2520 * called at umount time 2521 */ 2522 void ext4_ext_release(struct super_block *sb) 2523 { 2524 if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) 2525 return; 2526 2527 #ifdef EXTENTS_STATS 2528 if (EXT4_SB(sb)->s_ext_blocks && EXT4_SB(sb)->s_ext_extents) { 2529 struct ext4_sb_info *sbi = EXT4_SB(sb); 2530 printk(KERN_ERR "EXT4-fs: %lu blocks in %lu extents (%lu ave)\n", 2531 sbi->s_ext_blocks, sbi->s_ext_extents, 2532 sbi->s_ext_blocks / sbi->s_ext_extents); 2533 printk(KERN_ERR "EXT4-fs: extents: %lu min, %lu max, max depth %lu\n", 2534 sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max); 2535 } 2536 #endif 2537 } 2538 2539 static void bi_complete(struct bio *bio, int error) 2540 { 2541 complete((struct completion *)bio->bi_private); 2542 } 2543 2544 /* FIXME!! 
we need to try to merge to left or right after zero-out */ 2545 static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) 2546 { 2547 int ret; 2548 struct bio *bio; 2549 int blkbits, blocksize; 2550 sector_t ee_pblock; 2551 struct completion event; 2552 unsigned int ee_len, len, done, offset; 2553 2554 2555 blkbits = inode->i_blkbits; 2556 blocksize = inode->i_sb->s_blocksize; 2557 ee_len = ext4_ext_get_actual_len(ex); 2558 ee_pblock = ext_pblock(ex); 2559 2560 /* convert ee_pblock to 512 byte sectors */ 2561 ee_pblock = ee_pblock << (blkbits - 9); 2562 2563 while (ee_len > 0) { 2564 2565 if (ee_len > BIO_MAX_PAGES) 2566 len = BIO_MAX_PAGES; 2567 else 2568 len = ee_len; 2569 2570 bio = bio_alloc(GFP_NOIO, len); 2571 if (!bio) 2572 return -ENOMEM; 2573 2574 bio->bi_sector = ee_pblock; 2575 bio->bi_bdev = inode->i_sb->s_bdev; 2576 2577 done = 0; 2578 offset = 0; 2579 while (done < len) { 2580 ret = bio_add_page(bio, ZERO_PAGE(0), 2581 blocksize, offset); 2582 if (ret != blocksize) { 2583 /* 2584 * We can't add any more pages because of 2585 * hardware limitations. Start a new bio. 2586 */ 2587 break; 2588 } 2589 done++; 2590 offset += blocksize; 2591 if (offset >= PAGE_CACHE_SIZE) 2592 offset = 0; 2593 } 2594 2595 init_completion(&event); 2596 bio->bi_private = &event; 2597 bio->bi_end_io = bi_complete; 2598 submit_bio(WRITE, bio); 2599 wait_for_completion(&event); 2600 2601 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { 2602 bio_put(bio); 2603 return -EIO; 2604 } 2605 bio_put(bio); 2606 ee_len -= done; 2607 ee_pblock += done << (blkbits - 9); 2608 } 2609 return 0; 2610 } 2611 2612 #define EXT4_EXT_ZERO_LEN 7 2613 /* 2614 * This function is called by ext4_ext_map_blocks() if someone tries to write 2615 * to an uninitialized extent. It may result in splitting the uninitialized 2616 * extent into multiple extents (upto three - one initialized and two 2617 * uninitialized). 
2618 * There are three possibilities: 2619 * a> There is no split required: Entire extent should be initialized 2620 * b> Splits in two extents: Write is happening at either end of the extent 2621 * c> Splits in three extents: Somone is writing in middle of the extent 2622 */ 2623 static int ext4_ext_convert_to_initialized(handle_t *handle, 2624 struct inode *inode, 2625 struct ext4_map_blocks *map, 2626 struct ext4_ext_path *path) 2627 { 2628 struct ext4_extent *ex, newex, orig_ex; 2629 struct ext4_extent *ex1 = NULL; 2630 struct ext4_extent *ex2 = NULL; 2631 struct ext4_extent *ex3 = NULL; 2632 struct ext4_extent_header *eh; 2633 ext4_lblk_t ee_block, eof_block; 2634 unsigned int allocated, ee_len, depth; 2635 ext4_fsblk_t newblock; 2636 int err = 0; 2637 int ret = 0; 2638 int may_zeroout; 2639 2640 ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical" 2641 "block %llu, max_blocks %u\n", inode->i_ino, 2642 (unsigned long long)map->m_lblk, map->m_len); 2643 2644 eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >> 2645 inode->i_sb->s_blocksize_bits; 2646 if (eof_block < map->m_lblk + map->m_len) 2647 eof_block = map->m_lblk + map->m_len; 2648 2649 depth = ext_depth(inode); 2650 eh = path[depth].p_hdr; 2651 ex = path[depth].p_ext; 2652 ee_block = le32_to_cpu(ex->ee_block); 2653 ee_len = ext4_ext_get_actual_len(ex); 2654 allocated = ee_len - (map->m_lblk - ee_block); 2655 newblock = map->m_lblk - ee_block + ext_pblock(ex); 2656 2657 ex2 = ex; 2658 orig_ex.ee_block = ex->ee_block; 2659 orig_ex.ee_len = cpu_to_le16(ee_len); 2660 ext4_ext_store_pblock(&orig_ex, ext_pblock(ex)); 2661 2662 /* 2663 * It is safe to convert extent to initialized via explicit 2664 * zeroout only if extent is fully insde i_size or new_size. 2665 */ 2666 may_zeroout = ee_block + ee_len <= eof_block; 2667 2668 err = ext4_ext_get_access(handle, inode, path + depth); 2669 if (err) 2670 goto out; 2671 /* If extent has less than 2*EXT4_EXT_ZERO_LEN zerout directly */ 2672 if (ee_len <= 2*EXT4_EXT_ZERO_LEN && may_zeroout) { 2673 err = ext4_ext_zeroout(inode, &orig_ex); 2674 if (err) 2675 goto fix_extent_len; 2676 /* update the extent length and mark as initialized */ 2677 ex->ee_block = orig_ex.ee_block; 2678 ex->ee_len = orig_ex.ee_len; 2679 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 2680 ext4_ext_dirty(handle, inode, path + depth); 2681 /* zeroed the full extent */ 2682 return allocated; 2683 } 2684 2685 /* ex1: ee_block to map->m_lblk - 1 : uninitialized */ 2686 if (map->m_lblk > ee_block) { 2687 ex1 = ex; 2688 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block); 2689 ext4_ext_mark_uninitialized(ex1); 2690 ex2 = &newex; 2691 } 2692 /* 2693 * for sanity, update the length of the ex2 extent before 2694 * we insert ex3, if ex1 is NULL. This is to avoid temporary 2695 * overlap of blocks. 2696 */ 2697 if (!ex1 && allocated > map->m_len) 2698 ex2->ee_len = cpu_to_le16(map->m_len); 2699 /* ex3: to ee_block + ee_len : uninitialised */ 2700 if (allocated > map->m_len) { 2701 unsigned int newdepth; 2702 /* If extent has less than EXT4_EXT_ZERO_LEN zerout directly */ 2703 if (allocated <= EXT4_EXT_ZERO_LEN && may_zeroout) { 2704 /* 2705 * map->m_lblk == ee_block is handled by the zerouout 2706 * at the beginning. 2707 * Mark first half uninitialized. 
2708 * Mark second half initialized and zero out the 2709 * initialized extent 2710 */ 2711 ex->ee_block = orig_ex.ee_block; 2712 ex->ee_len = cpu_to_le16(ee_len - allocated); 2713 ext4_ext_mark_uninitialized(ex); 2714 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 2715 ext4_ext_dirty(handle, inode, path + depth); 2716 2717 ex3 = &newex; 2718 ex3->ee_block = cpu_to_le32(map->m_lblk); 2719 ext4_ext_store_pblock(ex3, newblock); 2720 ex3->ee_len = cpu_to_le16(allocated); 2721 err = ext4_ext_insert_extent(handle, inode, path, 2722 ex3, 0); 2723 if (err == -ENOSPC) { 2724 err = ext4_ext_zeroout(inode, &orig_ex); 2725 if (err) 2726 goto fix_extent_len; 2727 ex->ee_block = orig_ex.ee_block; 2728 ex->ee_len = orig_ex.ee_len; 2729 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 2730 ext4_ext_dirty(handle, inode, path + depth); 2731 /* blocks available from map->m_lblk */ 2732 return allocated; 2733 2734 } else if (err) 2735 goto fix_extent_len; 2736 2737 /* 2738 * We need to zero out the second half because 2739 * an fallocate request can update file size and 2740 * converting the second half to initialized extent 2741 * implies that we can leak some junk data to user 2742 * space. 2743 */ 2744 err = ext4_ext_zeroout(inode, ex3); 2745 if (err) { 2746 /* 2747 * We should actually mark the 2748 * second half as uninit and return error 2749 * Insert would have changed the extent 2750 */ 2751 depth = ext_depth(inode); 2752 ext4_ext_drop_refs(path); 2753 path = ext4_ext_find_extent(inode, map->m_lblk, 2754 path); 2755 if (IS_ERR(path)) { 2756 err = PTR_ERR(path); 2757 return err; 2758 } 2759 /* get the second half extent details */ 2760 ex = path[depth].p_ext; 2761 err = ext4_ext_get_access(handle, inode, 2762 path + depth); 2763 if (err) 2764 return err; 2765 ext4_ext_mark_uninitialized(ex); 2766 ext4_ext_dirty(handle, inode, path + depth); 2767 return err; 2768 } 2769 2770 /* zeroed the second half */ 2771 return allocated; 2772 } 2773 ex3 = &newex; 2774 ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len); 2775 ext4_ext_store_pblock(ex3, newblock + map->m_len); 2776 ex3->ee_len = cpu_to_le16(allocated - map->m_len); 2777 ext4_ext_mark_uninitialized(ex3); 2778 err = ext4_ext_insert_extent(handle, inode, path, ex3, 0); 2779 if (err == -ENOSPC && may_zeroout) { 2780 err = ext4_ext_zeroout(inode, &orig_ex); 2781 if (err) 2782 goto fix_extent_len; 2783 /* update the extent length and mark as initialized */ 2784 ex->ee_block = orig_ex.ee_block; 2785 ex->ee_len = orig_ex.ee_len; 2786 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 2787 ext4_ext_dirty(handle, inode, path + depth); 2788 /* zeroed the full extent */ 2789 /* blocks available from map->m_lblk */ 2790 return allocated; 2791 2792 } else if (err) 2793 goto fix_extent_len; 2794 /* 2795 * The depth, and hence eh & ex might change 2796 * as part of the insert above. 
2797 */ 2798 newdepth = ext_depth(inode); 2799 /* 2800 * update the extent length after successful insert of the 2801 * split extent 2802 */ 2803 ee_len -= ext4_ext_get_actual_len(ex3); 2804 orig_ex.ee_len = cpu_to_le16(ee_len); 2805 may_zeroout = ee_block + ee_len <= eof_block; 2806 2807 depth = newdepth; 2808 ext4_ext_drop_refs(path); 2809 path = ext4_ext_find_extent(inode, map->m_lblk, path); 2810 if (IS_ERR(path)) { 2811 err = PTR_ERR(path); 2812 goto out; 2813 } 2814 eh = path[depth].p_hdr; 2815 ex = path[depth].p_ext; 2816 if (ex2 != &newex) 2817 ex2 = ex; 2818 2819 err = ext4_ext_get_access(handle, inode, path + depth); 2820 if (err) 2821 goto out; 2822 2823 allocated = map->m_len; 2824 2825 /* If extent has less than EXT4_EXT_ZERO_LEN and we are trying 2826 * to insert a extent in the middle zerout directly 2827 * otherwise give the extent a chance to merge to left 2828 */ 2829 if (le16_to_cpu(orig_ex.ee_len) <= EXT4_EXT_ZERO_LEN && 2830 map->m_lblk != ee_block && may_zeroout) { 2831 err = ext4_ext_zeroout(inode, &orig_ex); 2832 if (err) 2833 goto fix_extent_len; 2834 /* update the extent length and mark as initialized */ 2835 ex->ee_block = orig_ex.ee_block; 2836 ex->ee_len = orig_ex.ee_len; 2837 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 2838 ext4_ext_dirty(handle, inode, path + depth); 2839 /* zero out the first half */ 2840 /* blocks available from map->m_lblk */ 2841 return allocated; 2842 } 2843 } 2844 /* 2845 * If there was a change of depth as part of the 2846 * insertion of ex3 above, we need to update the length 2847 * of the ex1 extent again here 2848 */ 2849 if (ex1 && ex1 != ex) { 2850 ex1 = ex; 2851 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block); 2852 ext4_ext_mark_uninitialized(ex1); 2853 ex2 = &newex; 2854 } 2855 /* ex2: map->m_lblk to map->m_lblk + maxblocks-1 : initialised */ 2856 ex2->ee_block = cpu_to_le32(map->m_lblk); 2857 ext4_ext_store_pblock(ex2, newblock); 2858 ex2->ee_len = cpu_to_le16(allocated); 2859 if (ex2 != ex) 2860 goto insert; 2861 /* 2862 * New (initialized) extent starts from the first block 2863 * in the current extent. i.e., ex2 == ex 2864 * We have to see if it can be merged with the extent 2865 * on the left. 2866 */ 2867 if (ex2 > EXT_FIRST_EXTENT(eh)) { 2868 /* 2869 * To merge left, pass "ex2 - 1" to try_to_merge(), 2870 * since it merges towards right _only_. 2871 */ 2872 ret = ext4_ext_try_to_merge(inode, path, ex2 - 1); 2873 if (ret) { 2874 err = ext4_ext_correct_indexes(handle, inode, path); 2875 if (err) 2876 goto out; 2877 depth = ext_depth(inode); 2878 ex2--; 2879 } 2880 } 2881 /* 2882 * Try to Merge towards right. This might be required 2883 * only when the whole extent is being written to. 2884 * i.e. ex2 == ex and ex3 == NULL. 
 */
	if (!ex3) {
		ret = ext4_ext_try_to_merge(inode, path, ex2);
		if (ret) {
			err = ext4_ext_correct_indexes(handle, inode, path);
			if (err)
				goto out;
		}
	}
	/* Mark modified extent as dirty */
	err = ext4_ext_dirty(handle, inode, path + depth);
	goto out;
insert:
	err = ext4_ext_insert_extent(handle, inode, path, &newex, 0);
	if (err == -ENOSPC && may_zeroout) {
		err = ext4_ext_zeroout(inode, &orig_ex);
		if (err)
			goto fix_extent_len;
		/* update the extent length and mark as initialized */
		ex->ee_block = orig_ex.ee_block;
		ex->ee_len = orig_ex.ee_len;
		ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
		ext4_ext_dirty(handle, inode, path + depth);
		/* zero out the first half */
		return allocated;
	} else if (err)
		goto fix_extent_len;
out:
	ext4_ext_show_leaf(inode, path);
	return err ? err : allocated;

fix_extent_len:
	ex->ee_block = orig_ex.ee_block;
	ex->ee_len = orig_ex.ee_len;
	ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
	ext4_ext_mark_uninitialized(ex);
	ext4_ext_dirty(handle, inode, path + depth);
	return err;
}

/*
 * This function is called by ext4_ext_map_blocks() from
 * ext4_get_blocks_dio_write() when DIO writes
 * to an uninitialized extent.
 *
 * Writing to an uninitialized extent may result in splitting it into
 * multiple uninitialized extents (up to three).
 * There are three possibilities:
 *   a> No split is required: the entire extent stays uninitialized
 *   b> Split into two extents: the write happens at either end of the extent
 *   c> Split into three extents: someone is writing in the middle of the extent
 *
 * One or more index blocks may be needed if the extent tree grows after
 * the uninitialized extent is split. To prevent ENOSPC from occurring at
 * IO completion time, we split the uninitialized extent before the DIO
 * is submitted. The uninitialized extent handled here is split into at
 * most three uninitialized extents. After the IO completes, the part that
 * was actually written is converted to initialized by the end_io callback
 * via ext4_convert_unwritten_extents().
 *
 * Returns the size of the uninitialized extent to be written on success.
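 *
 * For example, a DIO write into the middle of a larger fallocated
 * (uninitialized) extent is case c>: it leaves an uninitialized head, an
 * uninitialized middle piece covering exactly the blocks being written,
 * and an uninitialized tail; only the middle piece is converted to
 * initialized by the end_io callback once the write completes.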
2946 */ 2947 static int ext4_split_unwritten_extents(handle_t *handle, 2948 struct inode *inode, 2949 struct ext4_map_blocks *map, 2950 struct ext4_ext_path *path, 2951 int flags) 2952 { 2953 struct ext4_extent *ex, newex, orig_ex; 2954 struct ext4_extent *ex1 = NULL; 2955 struct ext4_extent *ex2 = NULL; 2956 struct ext4_extent *ex3 = NULL; 2957 struct ext4_extent_header *eh; 2958 ext4_lblk_t ee_block, eof_block; 2959 unsigned int allocated, ee_len, depth; 2960 ext4_fsblk_t newblock; 2961 int err = 0; 2962 int may_zeroout; 2963 2964 ext_debug("ext4_split_unwritten_extents: inode %lu, logical" 2965 "block %llu, max_blocks %u\n", inode->i_ino, 2966 (unsigned long long)map->m_lblk, map->m_len); 2967 2968 eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >> 2969 inode->i_sb->s_blocksize_bits; 2970 if (eof_block < map->m_lblk + map->m_len) 2971 eof_block = map->m_lblk + map->m_len; 2972 2973 depth = ext_depth(inode); 2974 eh = path[depth].p_hdr; 2975 ex = path[depth].p_ext; 2976 ee_block = le32_to_cpu(ex->ee_block); 2977 ee_len = ext4_ext_get_actual_len(ex); 2978 allocated = ee_len - (map->m_lblk - ee_block); 2979 newblock = map->m_lblk - ee_block + ext_pblock(ex); 2980 2981 ex2 = ex; 2982 orig_ex.ee_block = ex->ee_block; 2983 orig_ex.ee_len = cpu_to_le16(ee_len); 2984 ext4_ext_store_pblock(&orig_ex, ext_pblock(ex)); 2985 2986 /* 2987 * It is safe to convert extent to initialized via explicit 2988 * zeroout only if extent is fully insde i_size or new_size. 2989 */ 2990 may_zeroout = ee_block + ee_len <= eof_block; 2991 2992 /* 2993 * If the uninitialized extent begins at the same logical 2994 * block where the write begins, and the write completely 2995 * covers the extent, then we don't need to split it. 2996 */ 2997 if ((map->m_lblk == ee_block) && (allocated <= map->m_len)) 2998 return allocated; 2999 3000 err = ext4_ext_get_access(handle, inode, path + depth); 3001 if (err) 3002 goto out; 3003 /* ex1: ee_block to map->m_lblk - 1 : uninitialized */ 3004 if (map->m_lblk > ee_block) { 3005 ex1 = ex; 3006 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block); 3007 ext4_ext_mark_uninitialized(ex1); 3008 ex2 = &newex; 3009 } 3010 /* 3011 * for sanity, update the length of the ex2 extent before 3012 * we insert ex3, if ex1 is NULL. This is to avoid temporary 3013 * overlap of blocks. 3014 */ 3015 if (!ex1 && allocated > map->m_len) 3016 ex2->ee_len = cpu_to_le16(map->m_len); 3017 /* ex3: to ee_block + ee_len : uninitialised */ 3018 if (allocated > map->m_len) { 3019 unsigned int newdepth; 3020 ex3 = &newex; 3021 ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len); 3022 ext4_ext_store_pblock(ex3, newblock + map->m_len); 3023 ex3->ee_len = cpu_to_le16(allocated - map->m_len); 3024 ext4_ext_mark_uninitialized(ex3); 3025 err = ext4_ext_insert_extent(handle, inode, path, ex3, flags); 3026 if (err == -ENOSPC && may_zeroout) { 3027 err = ext4_ext_zeroout(inode, &orig_ex); 3028 if (err) 3029 goto fix_extent_len; 3030 /* update the extent length and mark as initialized */ 3031 ex->ee_block = orig_ex.ee_block; 3032 ex->ee_len = orig_ex.ee_len; 3033 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 3034 ext4_ext_dirty(handle, inode, path + depth); 3035 /* zeroed the full extent */ 3036 /* blocks available from map->m_lblk */ 3037 return allocated; 3038 3039 } else if (err) 3040 goto fix_extent_len; 3041 /* 3042 * The depth, and hence eh & ex might change 3043 * as part of the insert above. 
3044 */ 3045 newdepth = ext_depth(inode); 3046 /* 3047 * update the extent length after successful insert of the 3048 * split extent 3049 */ 3050 ee_len -= ext4_ext_get_actual_len(ex3); 3051 orig_ex.ee_len = cpu_to_le16(ee_len); 3052 may_zeroout = ee_block + ee_len <= eof_block; 3053 3054 depth = newdepth; 3055 ext4_ext_drop_refs(path); 3056 path = ext4_ext_find_extent(inode, map->m_lblk, path); 3057 if (IS_ERR(path)) { 3058 err = PTR_ERR(path); 3059 goto out; 3060 } 3061 eh = path[depth].p_hdr; 3062 ex = path[depth].p_ext; 3063 if (ex2 != &newex) 3064 ex2 = ex; 3065 3066 err = ext4_ext_get_access(handle, inode, path + depth); 3067 if (err) 3068 goto out; 3069 3070 allocated = map->m_len; 3071 } 3072 /* 3073 * If there was a change of depth as part of the 3074 * insertion of ex3 above, we need to update the length 3075 * of the ex1 extent again here 3076 */ 3077 if (ex1 && ex1 != ex) { 3078 ex1 = ex; 3079 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block); 3080 ext4_ext_mark_uninitialized(ex1); 3081 ex2 = &newex; 3082 } 3083 /* 3084 * ex2: map->m_lblk to map->m_lblk + map->m_len-1 : to be written 3085 * using direct I/O, uninitialised still. 3086 */ 3087 ex2->ee_block = cpu_to_le32(map->m_lblk); 3088 ext4_ext_store_pblock(ex2, newblock); 3089 ex2->ee_len = cpu_to_le16(allocated); 3090 ext4_ext_mark_uninitialized(ex2); 3091 if (ex2 != ex) 3092 goto insert; 3093 /* Mark modified extent as dirty */ 3094 err = ext4_ext_dirty(handle, inode, path + depth); 3095 ext_debug("out here\n"); 3096 goto out; 3097 insert: 3098 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); 3099 if (err == -ENOSPC && may_zeroout) { 3100 err = ext4_ext_zeroout(inode, &orig_ex); 3101 if (err) 3102 goto fix_extent_len; 3103 /* update the extent length and mark as initialized */ 3104 ex->ee_block = orig_ex.ee_block; 3105 ex->ee_len = orig_ex.ee_len; 3106 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 3107 ext4_ext_dirty(handle, inode, path + depth); 3108 /* zero out the first half */ 3109 return allocated; 3110 } else if (err) 3111 goto fix_extent_len; 3112 out: 3113 ext4_ext_show_leaf(inode, path); 3114 return err ? err : allocated; 3115 3116 fix_extent_len: 3117 ex->ee_block = orig_ex.ee_block; 3118 ex->ee_len = orig_ex.ee_len; 3119 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 3120 ext4_ext_mark_uninitialized(ex); 3121 ext4_ext_dirty(handle, inode, path + depth); 3122 return err; 3123 } 3124 static int ext4_convert_unwritten_extents_endio(handle_t *handle, 3125 struct inode *inode, 3126 struct ext4_ext_path *path) 3127 { 3128 struct ext4_extent *ex; 3129 struct ext4_extent_header *eh; 3130 int depth; 3131 int err = 0; 3132 int ret = 0; 3133 3134 depth = ext_depth(inode); 3135 eh = path[depth].p_hdr; 3136 ex = path[depth].p_ext; 3137 3138 err = ext4_ext_get_access(handle, inode, path + depth); 3139 if (err) 3140 goto out; 3141 /* first mark the extent as initialized */ 3142 ext4_ext_mark_initialized(ex); 3143 3144 /* 3145 * We have to see if it can be merged with the extent 3146 * on the left. 3147 */ 3148 if (ex > EXT_FIRST_EXTENT(eh)) { 3149 /* 3150 * To merge left, pass "ex - 1" to try_to_merge(), 3151 * since it merges towards right _only_. 3152 */ 3153 ret = ext4_ext_try_to_merge(inode, path, ex - 1); 3154 if (ret) { 3155 err = ext4_ext_correct_indexes(handle, inode, path); 3156 if (err) 3157 goto out; 3158 depth = ext_depth(inode); 3159 ex--; 3160 } 3161 } 3162 /* 3163 * Try to Merge towards right. 
3164 */ 3165 ret = ext4_ext_try_to_merge(inode, path, ex); 3166 if (ret) { 3167 err = ext4_ext_correct_indexes(handle, inode, path); 3168 if (err) 3169 goto out; 3170 depth = ext_depth(inode); 3171 } 3172 /* Mark modified extent as dirty */ 3173 err = ext4_ext_dirty(handle, inode, path + depth); 3174 out: 3175 ext4_ext_show_leaf(inode, path); 3176 return err; 3177 } 3178 3179 static void unmap_underlying_metadata_blocks(struct block_device *bdev, 3180 sector_t block, int count) 3181 { 3182 int i; 3183 for (i = 0; i < count; i++) 3184 unmap_underlying_metadata(bdev, block + i); 3185 } 3186 3187 static int 3188 ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, 3189 struct ext4_map_blocks *map, 3190 struct ext4_ext_path *path, int flags, 3191 unsigned int allocated, ext4_fsblk_t newblock) 3192 { 3193 int ret = 0; 3194 int err = 0; 3195 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; 3196 3197 ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical" 3198 "block %llu, max_blocks %u, flags %d, allocated %u", 3199 inode->i_ino, (unsigned long long)map->m_lblk, map->m_len, 3200 flags, allocated); 3201 ext4_ext_show_leaf(inode, path); 3202 3203 /* get_block() before submit the IO, split the extent */ 3204 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { 3205 ret = ext4_split_unwritten_extents(handle, inode, map, 3206 path, flags); 3207 /* 3208 * Flag the inode(non aio case) or end_io struct (aio case) 3209 * that this IO needs to convertion to written when IO is 3210 * completed 3211 */ 3212 if (io) 3213 io->flag = EXT4_IO_UNWRITTEN; 3214 else 3215 ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); 3216 if (ext4_should_dioread_nolock(inode)) 3217 map->m_flags |= EXT4_MAP_UNINIT; 3218 goto out; 3219 } 3220 /* IO end_io complete, convert the filled extent to written */ 3221 if ((flags & EXT4_GET_BLOCKS_CONVERT)) { 3222 ret = ext4_convert_unwritten_extents_endio(handle, inode, 3223 path); 3224 if (ret >= 0) 3225 ext4_update_inode_fsync_trans(handle, inode, 1); 3226 goto out2; 3227 } 3228 /* buffered IO case */ 3229 /* 3230 * repeat fallocate creation request 3231 * we already have an unwritten extent 3232 */ 3233 if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) 3234 goto map_out; 3235 3236 /* buffered READ or buffered write_begin() lookup */ 3237 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { 3238 /* 3239 * We have blocks reserved already. We 3240 * return allocated blocks so that delalloc 3241 * won't do block reservation for us. But 3242 * the buffer head will be unmapped so that 3243 * a read from the block returns 0s. 3244 */ 3245 map->m_flags |= EXT4_MAP_UNWRITTEN; 3246 goto out1; 3247 } 3248 3249 /* buffered write, writepage time, convert*/ 3250 ret = ext4_ext_convert_to_initialized(handle, inode, map, path); 3251 if (ret >= 0) 3252 ext4_update_inode_fsync_trans(handle, inode, 1); 3253 out: 3254 if (ret <= 0) { 3255 err = ret; 3256 goto out2; 3257 } else 3258 allocated = ret; 3259 map->m_flags |= EXT4_MAP_NEW; 3260 /* 3261 * if we allocated more blocks than requested 3262 * we need to make sure we unmap the extra block 3263 * allocated. The actual needed block will get 3264 * unmapped later when we find the buffer_head marked 3265 * new. 
3266 */ 3267 if (allocated > map->m_len) { 3268 unmap_underlying_metadata_blocks(inode->i_sb->s_bdev, 3269 newblock + map->m_len, 3270 allocated - map->m_len); 3271 allocated = map->m_len; 3272 } 3273 3274 /* 3275 * If we have done fallocate with the offset that is already 3276 * delayed allocated, we would have block reservation 3277 * and quota reservation done in the delayed write path. 3278 * But fallocate would have already updated quota and block 3279 * count for this offset. So cancel these reservation 3280 */ 3281 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) 3282 ext4_da_update_reserve_space(inode, allocated, 0); 3283 3284 map_out: 3285 map->m_flags |= EXT4_MAP_MAPPED; 3286 out1: 3287 if (allocated > map->m_len) 3288 allocated = map->m_len; 3289 ext4_ext_show_leaf(inode, path); 3290 map->m_pblk = newblock; 3291 map->m_len = allocated; 3292 out2: 3293 if (path) { 3294 ext4_ext_drop_refs(path); 3295 kfree(path); 3296 } 3297 return err ? err : allocated; 3298 } 3299 /* 3300 * Block allocation/map/preallocation routine for extents based files 3301 * 3302 * 3303 * Need to be called with 3304 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block 3305 * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem) 3306 * 3307 * return > 0, number of of blocks already mapped/allocated 3308 * if create == 0 and these are pre-allocated blocks 3309 * buffer head is unmapped 3310 * otherwise blocks are mapped 3311 * 3312 * return = 0, if plain look up failed (blocks have not been allocated) 3313 * buffer head is unmapped 3314 * 3315 * return < 0, error case. 3316 */ 3317 int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, 3318 struct ext4_map_blocks *map, int flags) 3319 { 3320 struct ext4_ext_path *path = NULL; 3321 struct ext4_extent_header *eh; 3322 struct ext4_extent newex, *ex, *last_ex; 3323 ext4_fsblk_t newblock; 3324 int i, err = 0, depth, ret, cache_type; 3325 unsigned int allocated = 0; 3326 struct ext4_allocation_request ar; 3327 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; 3328 3329 ext_debug("blocks %u/%u requested for inode %lu\n", 3330 map->m_lblk, map->m_len, inode->i_ino); 3331 3332 /* check in cache */ 3333 cache_type = ext4_ext_in_cache(inode, map->m_lblk, &newex); 3334 if (cache_type) { 3335 if (cache_type == EXT4_EXT_CACHE_GAP) { 3336 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { 3337 /* 3338 * block isn't allocated yet and 3339 * user doesn't want to allocate it 3340 */ 3341 goto out2; 3342 } 3343 /* we should allocate requested block */ 3344 } else if (cache_type == EXT4_EXT_CACHE_EXTENT) { 3345 /* block is already allocated */ 3346 newblock = map->m_lblk 3347 - le32_to_cpu(newex.ee_block) 3348 + ext_pblock(&newex); 3349 /* number of remaining blocks in the extent */ 3350 allocated = ext4_ext_get_actual_len(&newex) - 3351 (map->m_lblk - le32_to_cpu(newex.ee_block)); 3352 goto out; 3353 } else { 3354 BUG(); 3355 } 3356 } 3357 3358 /* find extent for this block */ 3359 path = ext4_ext_find_extent(inode, map->m_lblk, NULL); 3360 if (IS_ERR(path)) { 3361 err = PTR_ERR(path); 3362 path = NULL; 3363 goto out2; 3364 } 3365 3366 depth = ext_depth(inode); 3367 3368 /* 3369 * consistent leaf must not be empty; 3370 * this situation is possible, though, _during_ tree modification; 3371 * this is why assert can't be put in ext4_ext_find_extent() 3372 */ 3373 if (unlikely(path[depth].p_ext == NULL && depth != 0)) { 3374 EXT4_ERROR_INODE(inode, "bad extent address " 3375 "lblock: %lu, depth: %d pblock %lld", 3376 (unsigned long) 
map->m_lblk, depth, 3377 path[depth].p_block); 3378 err = -EIO; 3379 goto out2; 3380 } 3381 eh = path[depth].p_hdr; 3382 3383 ex = path[depth].p_ext; 3384 if (ex) { 3385 ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block); 3386 ext4_fsblk_t ee_start = ext_pblock(ex); 3387 unsigned short ee_len; 3388 3389 /* 3390 * Uninitialized extents are treated as holes, except that 3391 * we split out initialized portions during a write. 3392 */ 3393 ee_len = ext4_ext_get_actual_len(ex); 3394 /* if found extent covers block, simply return it */ 3395 if (in_range(map->m_lblk, ee_block, ee_len)) { 3396 newblock = map->m_lblk - ee_block + ee_start; 3397 /* number of remaining blocks in the extent */ 3398 allocated = ee_len - (map->m_lblk - ee_block); 3399 ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk, 3400 ee_block, ee_len, newblock); 3401 3402 /* Do not put uninitialized extent in the cache */ 3403 if (!ext4_ext_is_uninitialized(ex)) { 3404 ext4_ext_put_in_cache(inode, ee_block, 3405 ee_len, ee_start, 3406 EXT4_EXT_CACHE_EXTENT); 3407 goto out; 3408 } 3409 ret = ext4_ext_handle_uninitialized_extents(handle, 3410 inode, map, path, flags, allocated, 3411 newblock); 3412 return ret; 3413 } 3414 } 3415 3416 /* 3417 * requested block isn't allocated yet; 3418 * we couldn't try to create block if create flag is zero 3419 */ 3420 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { 3421 /* 3422 * put just found gap into cache to speed up 3423 * subsequent requests 3424 */ 3425 ext4_ext_put_gap_in_cache(inode, path, map->m_lblk); 3426 goto out2; 3427 } 3428 /* 3429 * Okay, we need to do block allocation. 3430 */ 3431 3432 /* find neighbour allocated blocks */ 3433 ar.lleft = map->m_lblk; 3434 err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft); 3435 if (err) 3436 goto out2; 3437 ar.lright = map->m_lblk; 3438 err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright); 3439 if (err) 3440 goto out2; 3441 3442 /* 3443 * See if request is beyond maximum number of blocks we can have in 3444 * a single extent. For an initialized extent this limit is 3445 * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is 3446 * EXT_UNINIT_MAX_LEN. 
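 * (EXT_INIT_MAX_LEN is 1 << 15 blocks and EXT_UNINIT_MAX_LEN is one block
 * less, because the top bit of the 16-bit ee_len field marks an extent as
 * uninitialized; with 4 KiB blocks this caps a single extent at 128 MiB.)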
 */
	if (map->m_len > EXT_INIT_MAX_LEN &&
	    !(flags & EXT4_GET_BLOCKS_UNINIT_EXT))
		map->m_len = EXT_INIT_MAX_LEN;
	else if (map->m_len > EXT_UNINIT_MAX_LEN &&
		 (flags & EXT4_GET_BLOCKS_UNINIT_EXT))
		map->m_len = EXT_UNINIT_MAX_LEN;

	/* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */
	newex.ee_block = cpu_to_le32(map->m_lblk);
	newex.ee_len = cpu_to_le16(map->m_len);
	err = ext4_ext_check_overlap(inode, &newex, path);
	if (err)
		allocated = ext4_ext_get_actual_len(&newex);
	else
		allocated = map->m_len;

	/* allocate new block */
	ar.inode = inode;
	ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk);
	ar.logical = map->m_lblk;
	ar.len = allocated;
	if (S_ISREG(inode->i_mode))
		ar.flags = EXT4_MB_HINT_DATA;
	else
		/* disable in-core preallocation for non-regular files */
		ar.flags = 0;
	newblock = ext4_mb_new_blocks(handle, &ar, &err);
	if (!newblock)
		goto out2;
	ext_debug("allocate new block: goal %llu, found %llu/%u\n",
		  ar.goal, newblock, allocated);

	/* try to insert new extent into found leaf and return */
	ext4_ext_store_pblock(&newex, newblock);
	newex.ee_len = cpu_to_le16(ar.len);
	/* Mark uninitialized */
	if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) {
		ext4_ext_mark_uninitialized(&newex);
		/*
		 * An io_end structure is created for every IO write to an
		 * uninitialized extent. To avoid unnecessary conversion,
		 * here we flag the IO that really needs the conversion.
		 * For the non-async direct IO case, flag the inode state
		 * that we need to perform the conversion when IO is done.
		 */
		if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
			if (io)
				io->flag = EXT4_IO_UNWRITTEN;
			else
				ext4_set_inode_state(inode,
						     EXT4_STATE_DIO_UNWRITTEN);
		}
		if (ext4_should_dioread_nolock(inode))
			map->m_flags |= EXT4_MAP_UNINIT;
	}

	if (unlikely(ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))) {
		if (unlikely(!eh->eh_entries)) {
			EXT4_ERROR_INODE(inode,
					 "eh->eh_entries == 0 and "
					 "EOFBLOCKS_FL set");
			err = -EIO;
			goto out2;
		}
		last_ex = EXT_LAST_EXTENT(eh);
		/*
		 * If the current leaf block was reached by looking at
		 * the last index block all the way down the tree, and
		 * we are extending the inode beyond the last extent
		 * in the current leaf block, then clear the
		 * EOFBLOCKS_FL flag.
3519 */ 3520 for (i = depth-1; i >= 0; i--) { 3521 if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr)) 3522 break; 3523 } 3524 if ((i < 0) && 3525 (map->m_lblk + ar.len > le32_to_cpu(last_ex->ee_block) + 3526 ext4_ext_get_actual_len(last_ex))) 3527 ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS); 3528 } 3529 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); 3530 if (err) { 3531 /* free data blocks we just allocated */ 3532 /* not a good idea to call discard here directly, 3533 * but otherwise we'd need to call it every free() */ 3534 ext4_discard_preallocations(inode); 3535 ext4_free_blocks(handle, inode, 0, ext_pblock(&newex), 3536 ext4_ext_get_actual_len(&newex), 0); 3537 goto out2; 3538 } 3539 3540 /* previous routine could use block we allocated */ 3541 newblock = ext_pblock(&newex); 3542 allocated = ext4_ext_get_actual_len(&newex); 3543 if (allocated > map->m_len) 3544 allocated = map->m_len; 3545 map->m_flags |= EXT4_MAP_NEW; 3546 3547 /* 3548 * Update reserved blocks/metadata blocks after successful 3549 * block allocation which had been deferred till now. 3550 */ 3551 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) 3552 ext4_da_update_reserve_space(inode, allocated, 1); 3553 3554 /* 3555 * Cache the extent and update transaction to commit on fdatasync only 3556 * when it is _not_ an uninitialized extent. 3557 */ 3558 if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) { 3559 ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock, 3560 EXT4_EXT_CACHE_EXTENT); 3561 ext4_update_inode_fsync_trans(handle, inode, 1); 3562 } else 3563 ext4_update_inode_fsync_trans(handle, inode, 0); 3564 out: 3565 if (allocated > map->m_len) 3566 allocated = map->m_len; 3567 ext4_ext_show_leaf(inode, path); 3568 map->m_flags |= EXT4_MAP_MAPPED; 3569 map->m_pblk = newblock; 3570 map->m_len = allocated; 3571 out2: 3572 if (path) { 3573 ext4_ext_drop_refs(path); 3574 kfree(path); 3575 } 3576 return err ? err : allocated; 3577 } 3578 3579 void ext4_ext_truncate(struct inode *inode) 3580 { 3581 struct address_space *mapping = inode->i_mapping; 3582 struct super_block *sb = inode->i_sb; 3583 ext4_lblk_t last_block; 3584 handle_t *handle; 3585 int err = 0; 3586 3587 /* 3588 * probably first extent we're gonna free will be last in block 3589 */ 3590 err = ext4_writepage_trans_blocks(inode); 3591 handle = ext4_journal_start(inode, err); 3592 if (IS_ERR(handle)) 3593 return; 3594 3595 if (inode->i_size & (sb->s_blocksize - 1)) 3596 ext4_block_truncate_page(handle, mapping, inode->i_size); 3597 3598 if (ext4_orphan_add(handle, inode)) 3599 goto out_stop; 3600 3601 down_write(&EXT4_I(inode)->i_data_sem); 3602 ext4_ext_invalidate_cache(inode); 3603 3604 ext4_discard_preallocations(inode); 3605 3606 /* 3607 * TODO: optimization is possible here. 3608 * Probably we need not scan at all, 3609 * because page truncation is enough. 3610 */ 3611 3612 /* we have to know where to truncate from in crash case */ 3613 EXT4_I(inode)->i_disksize = inode->i_size; 3614 ext4_mark_inode_dirty(handle, inode); 3615 3616 last_block = (inode->i_size + sb->s_blocksize - 1) 3617 >> EXT4_BLOCK_SIZE_BITS(sb); 3618 err = ext4_ext_remove_space(inode, last_block); 3619 3620 /* In a multi-transaction truncate, we only make the final 3621 * transaction synchronous. 
3622 */ 3623 if (IS_SYNC(inode)) 3624 ext4_handle_sync(handle); 3625 3626 out_stop: 3627 up_write(&EXT4_I(inode)->i_data_sem); 3628 /* 3629 * If this was a simple ftruncate() and the file will remain alive, 3630 * then we need to clear up the orphan record which we created above. 3631 * However, if this was a real unlink then we were called by 3632 * ext4_delete_inode(), and we allow that function to clean up the 3633 * orphan info for us. 3634 */ 3635 if (inode->i_nlink) 3636 ext4_orphan_del(handle, inode); 3637 3638 inode->i_mtime = inode->i_ctime = ext4_current_time(inode); 3639 ext4_mark_inode_dirty(handle, inode); 3640 ext4_journal_stop(handle); 3641 } 3642 3643 static void ext4_falloc_update_inode(struct inode *inode, 3644 int mode, loff_t new_size, int update_ctime) 3645 { 3646 struct timespec now; 3647 3648 if (update_ctime) { 3649 now = current_fs_time(inode->i_sb); 3650 if (!timespec_equal(&inode->i_ctime, &now)) 3651 inode->i_ctime = now; 3652 } 3653 /* 3654 * Update only when preallocation was requested beyond 3655 * the file size. 3656 */ 3657 if (!(mode & FALLOC_FL_KEEP_SIZE)) { 3658 if (new_size > i_size_read(inode)) 3659 i_size_write(inode, new_size); 3660 if (new_size > EXT4_I(inode)->i_disksize) 3661 ext4_update_i_disksize(inode, new_size); 3662 } else { 3663 /* 3664 * Mark that we allocate beyond EOF so the subsequent truncate 3665 * can proceed even if the new size is the same as i_size. 3666 */ 3667 if (new_size > i_size_read(inode)) 3668 ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS); 3669 } 3670 3671 } 3672 3673 /* 3674 * preallocate space for a file. This implements ext4's fallocate inode 3675 * operation, which gets called from sys_fallocate system call. 3676 * For block-mapped files, posix_fallocate should fall back to the method 3677 * of writing zeroes to the required new blocks (the same behavior which is 3678 * expected for file systems which do not support fallocate() system call). 
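 *
 * The byte range is converted to a block range by rounding the start down
 * and the end up: with a 4096-byte block size, offset 3072 and len 2048
 * cover bytes 3072..5119, i.e. logical blocks 0 and 1, so two blocks are
 * preallocated even though len is smaller than one block.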
3679 */ 3680 long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) 3681 { 3682 handle_t *handle; 3683 loff_t new_size; 3684 unsigned int max_blocks; 3685 int ret = 0; 3686 int ret2 = 0; 3687 int retries = 0; 3688 struct ext4_map_blocks map; 3689 unsigned int credits, blkbits = inode->i_blkbits; 3690 3691 /* 3692 * currently supporting (pre)allocate mode for extent-based 3693 * files _only_ 3694 */ 3695 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 3696 return -EOPNOTSUPP; 3697 3698 /* preallocation to directories is currently not supported */ 3699 if (S_ISDIR(inode->i_mode)) 3700 return -ENODEV; 3701 3702 map.m_lblk = offset >> blkbits; 3703 /* 3704 * We can't just convert len to max_blocks because 3705 * If blocksize = 4096 offset = 3072 and len = 2048 3706 */ 3707 max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) 3708 - map.m_lblk; 3709 /* 3710 * credits to insert 1 extent into extent tree 3711 */ 3712 credits = ext4_chunk_trans_blocks(inode, max_blocks); 3713 mutex_lock(&inode->i_mutex); 3714 ret = inode_newsize_ok(inode, (len + offset)); 3715 if (ret) { 3716 mutex_unlock(&inode->i_mutex); 3717 return ret; 3718 } 3719 retry: 3720 while (ret >= 0 && ret < max_blocks) { 3721 map.m_lblk = map.m_lblk + ret; 3722 map.m_len = max_blocks = max_blocks - ret; 3723 handle = ext4_journal_start(inode, credits); 3724 if (IS_ERR(handle)) { 3725 ret = PTR_ERR(handle); 3726 break; 3727 } 3728 ret = ext4_map_blocks(handle, inode, &map, 3729 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT); 3730 if (ret <= 0) { 3731 #ifdef EXT4FS_DEBUG 3732 WARN_ON(ret <= 0); 3733 printk(KERN_ERR "%s: ext4_ext_map_blocks " 3734 "returned error inode#%lu, block=%u, " 3735 "max_blocks=%u", __func__, 3736 inode->i_ino, block, max_blocks); 3737 #endif 3738 ext4_mark_inode_dirty(handle, inode); 3739 ret2 = ext4_journal_stop(handle); 3740 break; 3741 } 3742 if ((map.m_lblk + ret) >= (EXT4_BLOCK_ALIGN(offset + len, 3743 blkbits) >> blkbits)) 3744 new_size = offset + len; 3745 else 3746 new_size = (map.m_lblk + ret) << blkbits; 3747 3748 ext4_falloc_update_inode(inode, mode, new_size, 3749 (map.m_flags & EXT4_MAP_NEW)); 3750 ext4_mark_inode_dirty(handle, inode); 3751 ret2 = ext4_journal_stop(handle); 3752 if (ret2) 3753 break; 3754 } 3755 if (ret == -ENOSPC && 3756 ext4_should_retry_alloc(inode->i_sb, &retries)) { 3757 ret = 0; 3758 goto retry; 3759 } 3760 mutex_unlock(&inode->i_mutex); 3761 return ret > 0 ? ret2 : ret; 3762 } 3763 3764 /* 3765 * This function convert a range of blocks to written extents 3766 * The caller of this function will pass the start offset and the size. 3767 * all unwritten extents within this range will be converted to 3768 * written extents. 3769 * 3770 * This function is called from the direct IO end io call back 3771 * function, to convert the fallocated extents after IO is completed. 3772 * Returns 0 on success. 
3773 */ 3774 int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, 3775 ssize_t len) 3776 { 3777 handle_t *handle; 3778 unsigned int max_blocks; 3779 int ret = 0; 3780 int ret2 = 0; 3781 struct ext4_map_blocks map; 3782 unsigned int credits, blkbits = inode->i_blkbits; 3783 3784 map.m_lblk = offset >> blkbits; 3785 /* 3786 * We can't just convert len to max_blocks because 3787 * If blocksize = 4096 offset = 3072 and len = 2048 3788 */ 3789 max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) - 3790 map.m_lblk); 3791 /* 3792 * credits to insert 1 extent into extent tree 3793 */ 3794 credits = ext4_chunk_trans_blocks(inode, max_blocks); 3795 while (ret >= 0 && ret < max_blocks) { 3796 map.m_lblk += ret; 3797 map.m_len = (max_blocks -= ret); 3798 handle = ext4_journal_start(inode, credits); 3799 if (IS_ERR(handle)) { 3800 ret = PTR_ERR(handle); 3801 break; 3802 } 3803 ret = ext4_map_blocks(handle, inode, &map, 3804 EXT4_GET_BLOCKS_IO_CONVERT_EXT); 3805 if (ret <= 0) { 3806 WARN_ON(ret <= 0); 3807 printk(KERN_ERR "%s: ext4_ext_map_blocks " 3808 "returned error inode#%lu, block=%u, " 3809 "max_blocks=%u", __func__, 3810 inode->i_ino, map.m_lblk, map.m_len); 3811 } 3812 ext4_mark_inode_dirty(handle, inode); 3813 ret2 = ext4_journal_stop(handle); 3814 if (ret <= 0 || ret2 ) 3815 break; 3816 } 3817 return ret > 0 ? ret2 : ret; 3818 } 3819 /* 3820 * Callback function called for each extent to gather FIEMAP information. 3821 */ 3822 static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, 3823 struct ext4_ext_cache *newex, struct ext4_extent *ex, 3824 void *data) 3825 { 3826 struct fiemap_extent_info *fieinfo = data; 3827 unsigned char blksize_bits = inode->i_sb->s_blocksize_bits; 3828 __u64 logical; 3829 __u64 physical; 3830 __u64 length; 3831 __u32 flags = 0; 3832 int error; 3833 3834 logical = (__u64)newex->ec_block << blksize_bits; 3835 3836 if (newex->ec_type == EXT4_EXT_CACHE_GAP) { 3837 pgoff_t offset; 3838 struct page *page; 3839 struct buffer_head *bh = NULL; 3840 3841 offset = logical >> PAGE_SHIFT; 3842 page = find_get_page(inode->i_mapping, offset); 3843 if (!page || !page_has_buffers(page)) 3844 return EXT_CONTINUE; 3845 3846 bh = page_buffers(page); 3847 3848 if (!bh) 3849 return EXT_CONTINUE; 3850 3851 if (buffer_delay(bh)) { 3852 flags |= FIEMAP_EXTENT_DELALLOC; 3853 page_cache_release(page); 3854 } else { 3855 page_cache_release(page); 3856 return EXT_CONTINUE; 3857 } 3858 } 3859 3860 physical = (__u64)newex->ec_start << blksize_bits; 3861 length = (__u64)newex->ec_len << blksize_bits; 3862 3863 if (ex && ext4_ext_is_uninitialized(ex)) 3864 flags |= FIEMAP_EXTENT_UNWRITTEN; 3865 3866 /* 3867 * If this extent reaches EXT_MAX_BLOCK, it must be last. 3868 * 3869 * Or if ext4_ext_next_allocated_block is EXT_MAX_BLOCK, 3870 * this also indicates no more allocated blocks. 
3871 * 3872 * XXX this might miss a single-block extent at EXT_MAX_BLOCK 3873 */ 3874 if (ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK || 3875 newex->ec_block + newex->ec_len - 1 == EXT_MAX_BLOCK) { 3876 loff_t size = i_size_read(inode); 3877 loff_t bs = EXT4_BLOCK_SIZE(inode->i_sb); 3878 3879 flags |= FIEMAP_EXTENT_LAST; 3880 if ((flags & FIEMAP_EXTENT_DELALLOC) && 3881 logical+length > size) 3882 length = (size - logical + bs - 1) & ~(bs-1); 3883 } 3884 3885 error = fiemap_fill_next_extent(fieinfo, logical, physical, 3886 length, flags); 3887 if (error < 0) 3888 return error; 3889 if (error == 1) 3890 return EXT_BREAK; 3891 3892 return EXT_CONTINUE; 3893 } 3894 3895 /* fiemap flags we can handle specified here */ 3896 #define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) 3897 3898 static int ext4_xattr_fiemap(struct inode *inode, 3899 struct fiemap_extent_info *fieinfo) 3900 { 3901 __u64 physical = 0; 3902 __u64 length; 3903 __u32 flags = FIEMAP_EXTENT_LAST; 3904 int blockbits = inode->i_sb->s_blocksize_bits; 3905 int error = 0; 3906 3907 /* in-inode? */ 3908 if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { 3909 struct ext4_iloc iloc; 3910 int offset; /* offset of xattr in inode */ 3911 3912 error = ext4_get_inode_loc(inode, &iloc); 3913 if (error) 3914 return error; 3915 physical = iloc.bh->b_blocknr << blockbits; 3916 offset = EXT4_GOOD_OLD_INODE_SIZE + 3917 EXT4_I(inode)->i_extra_isize; 3918 physical += offset; 3919 length = EXT4_SB(inode->i_sb)->s_inode_size - offset; 3920 flags |= FIEMAP_EXTENT_DATA_INLINE; 3921 brelse(iloc.bh); 3922 } else { /* external block */ 3923 physical = EXT4_I(inode)->i_file_acl << blockbits; 3924 length = inode->i_sb->s_blocksize; 3925 } 3926 3927 if (physical) 3928 error = fiemap_fill_next_extent(fieinfo, 0, physical, 3929 length, flags); 3930 return (error < 0 ? error : 0); 3931 } 3932 3933 int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 3934 __u64 start, __u64 len) 3935 { 3936 ext4_lblk_t start_blk; 3937 int error = 0; 3938 3939 /* fallback to generic here if not in extents fmt */ 3940 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 3941 return generic_block_fiemap(inode, fieinfo, start, len, 3942 ext4_get_block); 3943 3944 if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS)) 3945 return -EBADR; 3946 3947 if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) { 3948 error = ext4_xattr_fiemap(inode, fieinfo); 3949 } else { 3950 ext4_lblk_t len_blks; 3951 __u64 last_blk; 3952 3953 start_blk = start >> inode->i_sb->s_blocksize_bits; 3954 last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits; 3955 if (last_blk >= EXT_MAX_BLOCK) 3956 last_blk = EXT_MAX_BLOCK-1; 3957 len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1; 3958 3959 /* 3960 * Walk the extent tree gathering extent information. 3961 * ext4_ext_fiemap_cb will push extents back to user. 3962 */ 3963 error = ext4_ext_walk_space(inode, start_blk, len_blks, 3964 ext4_ext_fiemap_cb, fieinfo); 3965 } 3966 3967 return error; 3968 } 3969 3970
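/*
 * Illustrative only: a minimal userspace sketch (kept under "#if 0" so it is
 * not part of the kernel build) of how the two entry points above are
 * normally exercised.  fallocate(2) reaches ext4_fallocate() and creates
 * uninitialized extents, and the FS_IOC_FIEMAP ioctl reaches ext4_fiemap(),
 * which walks the extent tree through ext4_ext_fiemap_cb().  The mount
 * point and file name used below are arbitrary.
 */
#if 0
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/falloc.h>
#include <linux/fs.h>
#include <linux/fiemap.h>

int main(void)
{
	struct fiemap *fm;
	unsigned int i;
	int fd;

	fd = open("/mnt/ext4/testfile", O_RDWR | O_CREAT, 0644);
	if (fd < 0)
		return 1;

	/* Preallocate 1 MiB beyond the current EOF without changing i_size. */
	if (fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 1024 * 1024) < 0)
		perror("fallocate");

	/* Ask for up to 32 extents covering the whole file. */
	fm = calloc(1, sizeof(*fm) + 32 * sizeof(struct fiemap_extent));
	if (!fm) {
		close(fd);
		return 1;
	}
	fm->fm_start = 0;
	fm->fm_length = FIEMAP_MAX_OFFSET;
	fm->fm_flags = FIEMAP_FLAG_SYNC;
	fm->fm_extent_count = 32;

	if (ioctl(fd, FS_IOC_FIEMAP, fm) == 0)
		for (i = 0; i < fm->fm_mapped_extents; i++)
			printf("logical %llu physical %llu len %llu flags 0x%x\n",
			       (unsigned long long)fm->fm_extents[i].fe_logical,
			       (unsigned long long)fm->fm_extents[i].fe_physical,
			       (unsigned long long)fm->fm_extents[i].fe_length,
			       fm->fm_extents[i].fe_flags);

	free(fm);
	close(fd);
	return 0;
}
#endif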