/*
 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
 * Written by Alex Tomas <alex@clusterfs.com>
 *
 * Architecture independence:
 * Copyright (c) 2005, Bull S.A.
 * Written by Pierre Peiffer <pierre.peiffer@bull.net>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

/*
 * Extents support for EXT4
 *
 * TODO:
 *   - ext4*_error() should be used in some situations
 *   - analyze all BUG()/BUG_ON(), use -EIO where appropriate
 *   - smart tree reduction
 */

#include <linux/module.h>
#include <linux/fs.h>
#include <linux/time.h>
#include <linux/jbd2.h>
#include <linux/highuid.h>
#include <linux/pagemap.h>
#include <linux/quotaops.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/falloc.h>
#include <asm/uaccess.h>
#include <linux/fiemap.h>
#include "ext4_jbd2.h"
#include "ext4_extents.h"


/*
 * ext_pblock:
 * combine low and high parts of physical block number into ext4_fsblk_t
 */
ext4_fsblk_t ext_pblock(struct ext4_extent *ex)
{
	ext4_fsblk_t block;

	block = le32_to_cpu(ex->ee_start_lo);
	block |= ((ext4_fsblk_t) le16_to_cpu(ex->ee_start_hi) << 31) << 1;
	return block;
}

/*
 * idx_pblock:
 * combine low and high parts of a leaf physical block number into ext4_fsblk_t
 */
ext4_fsblk_t idx_pblock(struct ext4_extent_idx *ix)
{
	ext4_fsblk_t block;

	block = le32_to_cpu(ix->ei_leaf_lo);
	block |= ((ext4_fsblk_t) le16_to_cpu(ix->ei_leaf_hi) << 31) << 1;
	return block;
}

/*
 * ext4_ext_store_pblock:
 * stores a large physical block number into an extent struct,
 * breaking it into parts
 */
void ext4_ext_store_pblock(struct ext4_extent *ex, ext4_fsblk_t pb)
{
	ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
	ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
}

/*
 * ext4_idx_store_pblock:
 * stores a large physical block number into an index struct,
 * breaking it into parts
 */
static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb)
{
	ix->ei_leaf_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
	ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
}
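/*
 * Editor's note (illustrative, not part of the original source): the four
 * helpers above split and reassemble a 48-bit physical block number using
 * a 32-bit little-endian "lo" field plus a 16-bit "hi" field.  Assuming a
 * 64-bit ext4_fsblk_t, a worked example:
 *
 *	pb          = 0x123456789A
 *	lo field    = cpu_to_le32(pb & 0xffffffff)  = 0x3456789A
 *	hi field    = cpu_to_le16((pb >> 31) >> 1)  = 0x0012
 *	reassembled = 0x3456789A | (0x12ULL << 32)  = 0x123456789A
 *
 * The 32-bit shift is written as two shifts (31 then 1) so the expression
 * stays well-defined even if the operand type were ever only 32 bits wide.
 */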
static int ext4_ext_truncate_extend_restart(handle_t *handle,
					    struct inode *inode,
					    int needed)
{
	int err;

	if (!ext4_handle_valid(handle))
		return 0;
	if (handle->h_buffer_credits > needed)
		return 0;
	err = ext4_journal_extend(handle, needed);
	if (err <= 0)
		return err;
	err = ext4_truncate_restart_trans(handle, inode, needed);
	if (err == 0)
		err = -EAGAIN;

	return err;
}

/*
 * could return:
 * - EROFS
 * - ENOMEM
 */
static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
				struct ext4_ext_path *path)
{
	if (path->p_bh) {
		/* path points to block */
		return ext4_journal_get_write_access(handle, path->p_bh);
	}
	/* path points to leaf/index in inode body */
	/* we use in-core data, no need to protect them */
	return 0;
}

/*
 * could return:
 * - EROFS
 * - ENOMEM
 * - EIO
 */
static int ext4_ext_dirty(handle_t *handle, struct inode *inode,
				struct ext4_ext_path *path)
{
	int err;
	if (path->p_bh) {
		/* path points to block */
		err = ext4_handle_dirty_metadata(handle, inode, path->p_bh);
	} else {
		/* path points to leaf/index in inode body */
		err = ext4_mark_inode_dirty(handle, inode);
	}
	return err;
}

static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
				struct ext4_ext_path *path,
				ext4_lblk_t block)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	ext4_fsblk_t bg_start;
	ext4_fsblk_t last_block;
	ext4_grpblk_t colour;
	ext4_group_t block_group;
	int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
	int depth;

	if (path) {
		struct ext4_extent *ex;
		depth = path->p_depth;

		/* try to predict block placement */
		ex = path[depth].p_ext;
		if (ex)
			return ext_pblock(ex)+(block-le32_to_cpu(ex->ee_block));

		/* it looks like index is empty;
		 * try to find starting block from index itself */
		if (path[depth].p_bh)
			return path[depth].p_bh->b_blocknr;
	}

	/* OK. use inode's group */
	block_group = ei->i_block_group;
	if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
		/*
		 * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME
		 * block groups per flexgroup, reserve the first block
		 * group for directories and special files.  Regular
		 * files will start at the second block group.  This
		 * tends to speed up directory access and improves
		 * fsck times.
		 */
		block_group &= ~(flex_size-1);
		if (S_ISREG(inode->i_mode))
			block_group++;
	}
	bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
	last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;

	/*
	 * If we are doing delayed allocation, we don't need to take
	 * colour into account.
	 */
	if (test_opt(inode->i_sb, DELALLOC))
		return bg_start;

	if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
		colour = (current->pid % 16) *
			(EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
	else
		colour = (current->pid % 16) * ((last_block - bg_start) / 16);
	return bg_start + colour + block;
}

/*
 * Allocation for a metadata block
 */
static ext4_fsblk_t
ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
			struct ext4_ext_path *path,
			struct ext4_extent *ex, int *err)
{
	ext4_fsblk_t goal, newblock;

	goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
	newblock = ext4_new_meta_blocks(handle, inode, goal, NULL, err);
	return newblock;
}

static inline int ext4_ext_space_block(struct inode *inode, int check)
{
	int size;

	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
			/ sizeof(struct ext4_extent);
	if (!check) {
#ifdef AGGRESSIVE_TEST
		if (size > 6)
			size = 6;
#endif
	}
	return size;
}

static inline int ext4_ext_space_block_idx(struct inode *inode, int check)
{
	int size;

	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
			/ sizeof(struct ext4_extent_idx);
	if (!check) {
#ifdef AGGRESSIVE_TEST
		if (size > 5)
			size = 5;
#endif
	}
	return size;
}

static inline int ext4_ext_space_root(struct inode *inode, int check)
{
	int size;

	size = sizeof(EXT4_I(inode)->i_data);
	size -= sizeof(struct ext4_extent_header);
	size /= sizeof(struct ext4_extent);
	if (!check) {
#ifdef AGGRESSIVE_TEST
		if (size > 3)
			size = 3;
#endif
	}
	return size;
}

static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
{
	int size;

	size = sizeof(EXT4_I(inode)->i_data);
	size -= sizeof(struct ext4_extent_header);
	size /= sizeof(struct ext4_extent_idx);
	if (!check) {
#ifdef AGGRESSIVE_TEST
		if (size > 4)
			size = 4;
#endif
	}
	return size;
}

/*
 * Calculate the number of metadata blocks needed
 * to allocate @blocks
 * Worst case is one block per extent
 */
int ext4_ext_calc_metadata_amount(struct inode *inode, sector_t lblock)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	int idxs, num = 0;

	idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
		/ sizeof(struct ext4_extent_idx));

	/*
	 * If the new delayed allocation block is contiguous with the
	 * previous da block, it can share index blocks with the
	 * previous block, so we only need to allocate a new index
	 * block every idxs leaf blocks.  At idxs**2 blocks, we need
	 * an additional index block, and at idxs**3 blocks, yet
	 * another index block.
	 */
	if (ei->i_da_metadata_calc_len &&
	    ei->i_da_metadata_calc_last_lblock+1 == lblock) {
		if ((ei->i_da_metadata_calc_len % idxs) == 0)
			num++;
		if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0)
			num++;
		if ((ei->i_da_metadata_calc_len % (idxs*idxs*idxs)) == 0) {
			num++;
			ei->i_da_metadata_calc_len = 0;
		} else
			ei->i_da_metadata_calc_len++;
		ei->i_da_metadata_calc_last_lblock++;
		return num;
	}

	/*
	 * In the worst case we need a new set of index blocks at
	 * every level of the inode's extent tree.
	 */
	ei->i_da_metadata_calc_len = 1;
	ei->i_da_metadata_calc_last_lblock = lblock;
	return ext_depth(inode) + 1;
}
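/*
 * Editor's note (worked example, not part of the original source): with a
 * 4KiB block size, sizeof(struct ext4_extent_header) == 12 and
 * sizeof(struct ext4_extent_idx) == 12, so idxs = (4096 - 12) / 12 = 340.
 * For a run of contiguous delayed-allocation blocks the function above
 * therefore charges one extra metadata block every 340 blocks, one more
 * every 340^2 blocks, and one more every 340^3 blocks; a non-contiguous
 * block resets the running length and is charged the full "depth + 1"
 * worst case.
 */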
static int
ext4_ext_max_entries(struct inode *inode, int depth)
{
	int max;

	if (depth == ext_depth(inode)) {
		if (depth == 0)
			max = ext4_ext_space_root(inode, 1);
		else
			max = ext4_ext_space_root_idx(inode, 1);
	} else {
		if (depth == 0)
			max = ext4_ext_space_block(inode, 1);
		else
			max = ext4_ext_space_block_idx(inode, 1);
	}

	return max;
}

static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
{
	ext4_fsblk_t block = ext_pblock(ext);
	int len = ext4_ext_get_actual_len(ext);

	return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
}

static int ext4_valid_extent_idx(struct inode *inode,
				struct ext4_extent_idx *ext_idx)
{
	ext4_fsblk_t block = idx_pblock(ext_idx);

	return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1);
}

static int ext4_valid_extent_entries(struct inode *inode,
				struct ext4_extent_header *eh,
				int depth)
{
	struct ext4_extent *ext;
	struct ext4_extent_idx *ext_idx;
	unsigned short entries;
	if (eh->eh_entries == 0)
		return 1;

	entries = le16_to_cpu(eh->eh_entries);

	if (depth == 0) {
		/* leaf entries */
		ext = EXT_FIRST_EXTENT(eh);
		while (entries) {
			if (!ext4_valid_extent(inode, ext))
				return 0;
			ext++;
			entries--;
		}
	} else {
		ext_idx = EXT_FIRST_INDEX(eh);
		while (entries) {
			if (!ext4_valid_extent_idx(inode, ext_idx))
				return 0;
			ext_idx++;
			entries--;
		}
	}
	return 1;
}

static int __ext4_ext_check(const char *function, unsigned int line,
			    struct inode *inode, struct ext4_extent_header *eh,
			    int depth)
{
	const char *error_msg;
	int max = 0;

	if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) {
		error_msg = "invalid magic";
		goto corrupted;
	}
	if (unlikely(le16_to_cpu(eh->eh_depth) != depth)) {
		error_msg = "unexpected eh_depth";
		goto corrupted;
	}
	if (unlikely(eh->eh_max == 0)) {
		error_msg = "invalid eh_max";
		goto corrupted;
	}
	max = ext4_ext_max_entries(inode, depth);
	if (unlikely(le16_to_cpu(eh->eh_max) > max)) {
		error_msg = "too large eh_max";
		goto corrupted;
	}
	if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) {
		error_msg = "invalid eh_entries";
		goto corrupted;
	}
	if (!ext4_valid_extent_entries(inode, eh, depth)) {
		error_msg = "invalid extent entries";
		goto corrupted;
	}
	return 0;

corrupted:
	ext4_error_inode(inode, function, line, 0,
			"bad header/extent: %s - magic %x, "
			"entries %u, max %u(%u), depth %u(%u)",
			error_msg, le16_to_cpu(eh->eh_magic),
			le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
			max, le16_to_cpu(eh->eh_depth), depth);

	return -EIO;
}

#define ext4_ext_check(inode, eh, depth)	\
	__ext4_ext_check(__func__, __LINE__, inode, eh, depth)

int ext4_ext_check_inode(struct inode *inode)
{
	return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode));
}

#ifdef EXT_DEBUG
static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
{
	int k, l = path->p_depth;

	ext_debug("path:");
	for (k = 0; k <= l; k++, path++) {
		if (path->p_idx) {
			ext_debug(" %d->%llu",
				  le32_to_cpu(path->p_idx->ei_block),
				  idx_pblock(path->p_idx));
		} else if (path->p_ext) {
			ext_debug(" %d:[%d]%d:%llu ",
				  le32_to_cpu(path->p_ext->ee_block),
				  ext4_ext_is_uninitialized(path->p_ext),
				  ext4_ext_get_actual_len(path->p_ext),
				  ext_pblock(path->p_ext));
		} else
			ext_debug(" []");
	}
	ext_debug("\n");
}

static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
{
	int depth = ext_depth(inode);
	struct ext4_extent_header *eh;
	struct ext4_extent *ex;
	int i;

	if (!path)
		return;

	eh = path[depth].p_hdr;
	ex = EXT_FIRST_EXTENT(eh);

	ext_debug("Displaying leaf extents for inode %lu\n", inode->i_ino);

	for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
		ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
			  ext4_ext_is_uninitialized(ex),
			  ext4_ext_get_actual_len(ex), ext_pblock(ex));
	}
	ext_debug("\n");
}
#else
#define ext4_ext_show_path(inode, path)
#define ext4_ext_show_leaf(inode, path)
#endif

void ext4_ext_drop_refs(struct ext4_ext_path *path)
{
	int depth = path->p_depth;
	int i;

	for (i = 0; i <= depth; i++, path++)
		if (path->p_bh) {
			brelse(path->p_bh);
			path->p_bh = NULL;
		}
}

/*
 * ext4_ext_binsearch_idx:
 * binary search for the closest index of the given block
 * the header must be checked before calling this
 */
static void
ext4_ext_binsearch_idx(struct inode *inode,
			struct ext4_ext_path *path, ext4_lblk_t block)
{
	struct ext4_extent_header *eh = path->p_hdr;
	struct ext4_extent_idx *r, *l, *m;


	ext_debug("binsearch for %u(idx): ", block);

	l = EXT_FIRST_INDEX(eh) + 1;
	r = EXT_LAST_INDEX(eh);
	while (l <= r) {
		m = l + (r - l) / 2;
		if (block < le32_to_cpu(m->ei_block))
			r = m - 1;
		else
			l = m + 1;
		ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ei_block),
				m, le32_to_cpu(m->ei_block),
				r, le32_to_cpu(r->ei_block));
	}

	path->p_idx = l - 1;
	ext_debug(" -> %d->%lld ", le32_to_cpu(path->p_idx->ei_block),
		  idx_pblock(path->p_idx));

#ifdef CHECK_BINSEARCH
	{
		struct ext4_extent_idx *chix, *ix;
		int k;

		chix = ix = EXT_FIRST_INDEX(eh);
		for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) {
			if (k != 0 &&
			    le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) {
				printk(KERN_DEBUG "k=%d, ix=0x%p, "
				       "first=0x%p\n", k,
				       ix, EXT_FIRST_INDEX(eh));
				printk(KERN_DEBUG "%u <= %u\n",
				       le32_to_cpu(ix->ei_block),
				       le32_to_cpu(ix[-1].ei_block));
			}
			BUG_ON(k && le32_to_cpu(ix->ei_block)
					<= le32_to_cpu(ix[-1].ei_block));
			if (block < le32_to_cpu(ix->ei_block))
				break;
			chix = ix;
		}
		BUG_ON(chix != path->p_idx);
	}
#endif

}
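/*
 * Editor's note (not part of the original source): this helper and
 * ext4_ext_binsearch() below share one convention: the scan starts at
 * FIRST+1 and ends with l pointing at the first entry whose start block
 * is greater than @block, so the result "l - 1" is the last entry whose
 * start block is <= @block (or the very first entry when none is).  For
 * example, with index entries starting at blocks 0, 100 and 200, a lookup
 * for block 150 leaves path->p_idx pointing at the entry for block 100.
 */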
/*
 * ext4_ext_binsearch:
 * binary search for closest extent of the given block
 * the header must be checked before calling this
 */
static void
ext4_ext_binsearch(struct inode *inode,
		struct ext4_ext_path *path, ext4_lblk_t block)
{
	struct ext4_extent_header *eh = path->p_hdr;
	struct ext4_extent *r, *l, *m;

	if (eh->eh_entries == 0) {
		/*
		 * this leaf is empty:
		 * we get such a leaf in split/add case
		 */
		return;
	}

	ext_debug("binsearch for %u: ", block);

	l = EXT_FIRST_EXTENT(eh) + 1;
	r = EXT_LAST_EXTENT(eh);

	while (l <= r) {
		m = l + (r - l) / 2;
		if (block < le32_to_cpu(m->ee_block))
			r = m - 1;
		else
			l = m + 1;
		ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ee_block),
				m, le32_to_cpu(m->ee_block),
				r, le32_to_cpu(r->ee_block));
	}

	path->p_ext = l - 1;
	ext_debug(" -> %d:%llu:[%d]%d ",
			le32_to_cpu(path->p_ext->ee_block),
			ext_pblock(path->p_ext),
			ext4_ext_is_uninitialized(path->p_ext),
			ext4_ext_get_actual_len(path->p_ext));

#ifdef CHECK_BINSEARCH
	{
		struct ext4_extent *chex, *ex;
		int k;

		chex = ex = EXT_FIRST_EXTENT(eh);
		for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) {
			BUG_ON(k && le32_to_cpu(ex->ee_block)
					<= le32_to_cpu(ex[-1].ee_block));
			if (block < le32_to_cpu(ex->ee_block))
				break;
			chex = ex;
		}
		BUG_ON(chex != path->p_ext);
	}
#endif

}

int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
{
	struct ext4_extent_header *eh;

	eh = ext_inode_hdr(inode);
	eh->eh_depth = 0;
	eh->eh_entries = 0;
	eh->eh_magic = EXT4_EXT_MAGIC;
	eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0));
	ext4_mark_inode_dirty(handle, inode);
	ext4_ext_invalidate_cache(inode);
	return 0;
}

struct ext4_ext_path *
ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
					struct ext4_ext_path *path)
{
	struct ext4_extent_header *eh;
	struct buffer_head *bh;
	short int depth, i, ppos = 0, alloc = 0;

	eh = ext_inode_hdr(inode);
	depth = ext_depth(inode);

	/* account possible depth increase */
	if (!path) {
		path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 2),
				GFP_NOFS);
		if (!path)
			return ERR_PTR(-ENOMEM);
		alloc = 1;
	}
	path[0].p_hdr = eh;
	path[0].p_bh = NULL;

	i = depth;
	/* walk through the tree */
	while (i) {
		int need_to_validate = 0;

		ext_debug("depth %d: num %d, max %d\n",
			  ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));

		ext4_ext_binsearch_idx(inode, path + ppos, block);
		path[ppos].p_block = idx_pblock(path[ppos].p_idx);
		path[ppos].p_depth = i;
		path[ppos].p_ext = NULL;

		bh = sb_getblk(inode->i_sb, path[ppos].p_block);
		if (unlikely(!bh))
			goto err;
		if (!bh_uptodate_or_lock(bh)) {
			if (bh_submit_read(bh) < 0) {
				put_bh(bh);
				goto err;
			}
			/* validate the extent entries */
			need_to_validate = 1;
		}
		eh = ext_block_hdr(bh);
		ppos++;
		if (unlikely(ppos > depth)) {
			put_bh(bh);
			EXT4_ERROR_INODE(inode,
					 "ppos %d > depth %d", ppos, depth);
			goto err;
		}
		path[ppos].p_bh = bh;
		path[ppos].p_hdr = eh;
		i--;

		if (need_to_validate && ext4_ext_check(inode, eh, i))
			goto err;
	}

	path[ppos].p_depth = i;
	path[ppos].p_ext = NULL;
	path[ppos].p_idx = NULL;

	/* find extent */
	ext4_ext_binsearch(inode, path + ppos, block);
	/* if not an empty leaf */
	if (path[ppos].p_ext)
		path[ppos].p_block = ext_pblock(path[ppos].p_ext);

	ext4_ext_show_path(inode, path);

	return path;

err:
	ext4_ext_drop_refs(path);
	if (alloc)
		kfree(path);
	return ERR_PTR(-EIO);
}
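/*
 * Editor's note (usage reminder, not part of the original source): the
 * path returned by ext4_ext_find_extent() holds a buffer_head reference
 * for every index/leaf block it read.  Callers are expected to release
 * those references with ext4_ext_drop_refs() and, unless they passed in a
 * preallocated path array, kfree() it afterwards; ext4_ext_walk_space()
 * below shows the typical pattern.
 */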
/*
 * ext4_ext_insert_index:
 * insert new index [@logical;@ptr] into the block at @curp;
 * check where to insert: before @curp or after @curp
 */
int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
				struct ext4_ext_path *curp,
				int logical, ext4_fsblk_t ptr)
{
	struct ext4_extent_idx *ix;
	int len, err;

	err = ext4_ext_get_access(handle, inode, curp);
	if (err)
		return err;

	if (unlikely(logical == le32_to_cpu(curp->p_idx->ei_block))) {
		EXT4_ERROR_INODE(inode,
				 "logical %d == ei_block %d!",
				 logical, le32_to_cpu(curp->p_idx->ei_block));
		return -EIO;
	}
	len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx;
	if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
		/* insert after */
		if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) {
			len = (len - 1) * sizeof(struct ext4_extent_idx);
			len = len < 0 ? 0 : len;
			ext_debug("insert new index %d after: %llu. "
					"move %d from 0x%p to 0x%p\n",
					logical, ptr, len,
					(curp->p_idx + 1), (curp->p_idx + 2));
			memmove(curp->p_idx + 2, curp->p_idx + 1, len);
		}
		ix = curp->p_idx + 1;
	} else {
		/* insert before */
		len = len * sizeof(struct ext4_extent_idx);
		len = len < 0 ? 0 : len;
		ext_debug("insert new index %d before: %llu. "
				"move %d from 0x%p to 0x%p\n",
				logical, ptr, len,
				curp->p_idx, (curp->p_idx + 1));
		memmove(curp->p_idx + 1, curp->p_idx, len);
		ix = curp->p_idx;
	}

	ix->ei_block = cpu_to_le32(logical);
	ext4_idx_store_pblock(ix, ptr);
	le16_add_cpu(&curp->p_hdr->eh_entries, 1);

	if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries)
			     > le16_to_cpu(curp->p_hdr->eh_max))) {
		EXT4_ERROR_INODE(inode,
				 "eh_entries %d > eh_max %d!",
				 le16_to_cpu(curp->p_hdr->eh_entries),
				 le16_to_cpu(curp->p_hdr->eh_max));
		return -EIO;
	}
	if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) {
		EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!");
		return -EIO;
	}

	err = ext4_ext_dirty(handle, inode, curp);
	ext4_std_error(inode->i_sb, err);

	return err;
}

/*
 * ext4_ext_split:
 * inserts new subtree into the path, using free index entry
 * at depth @at:
 * - allocates all needed blocks (new leaf and all intermediate index blocks)
 * - makes decision where to split
 * - moves remaining extents and index entries (right to the split point)
 *   into the newly allocated blocks
 * - initializes subtree
 */
static int ext4_ext_split(handle_t *handle, struct inode *inode,
				struct ext4_ext_path *path,
				struct ext4_extent *newext, int at)
{
	struct buffer_head *bh = NULL;
	int depth = ext_depth(inode);
	struct ext4_extent_header *neh;
	struct ext4_extent_idx *fidx;
	struct ext4_extent *ex;
	int i = at, k, m, a;
	ext4_fsblk_t newblock, oldblock;
	__le32 border;
	ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */
	int err = 0;

	/* make decision: where to split? */
	/* FIXME: now decision is simplest: at current extent */

	/* if current leaf will be split, then we should use
	 * border from split point */
	if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) {
		EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!");
		return -EIO;
	}
	if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
		border = path[depth].p_ext[1].ee_block;
		ext_debug("leaf will be split."
				" next leaf starts at %d\n",
				le32_to_cpu(border));
	} else {
		border = newext->ee_block;
		ext_debug("leaf will be added."
				" next leaf starts at %d\n",
				le32_to_cpu(border));
	}

	/*
	 * If an error occurs, then we break processing
	 * and mark the filesystem read-only.  The index won't
	 * be inserted and the tree will be in a consistent
	 * state.  The next mount will repair buffers too.
	 */

	/*
	 * Get array to track all allocated blocks.
	 * We need this to handle errors and free blocks
	 * upon them.
	 */
	ablocks = kzalloc(sizeof(ext4_fsblk_t) * depth, GFP_NOFS);
	if (!ablocks)
		return -ENOMEM;

	/* allocate all needed blocks */
	ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
	for (a = 0; a < depth - at; a++) {
		newblock = ext4_ext_new_meta_block(handle, inode, path,
						   newext, &err);
		if (newblock == 0)
			goto cleanup;
		ablocks[a] = newblock;
	}

	/* initialize new leaf */
	newblock = ablocks[--a];
	if (unlikely(newblock == 0)) {
		EXT4_ERROR_INODE(inode, "newblock == 0!");
		err = -EIO;
		goto cleanup;
	}
	bh = sb_getblk(inode->i_sb, newblock);
	if (!bh) {
		err = -EIO;
		goto cleanup;
	}
	lock_buffer(bh);

	err = ext4_journal_get_create_access(handle, bh);
	if (err)
		goto cleanup;

	neh = ext_block_hdr(bh);
	neh->eh_entries = 0;
	neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
	neh->eh_magic = EXT4_EXT_MAGIC;
	neh->eh_depth = 0;
	ex = EXT_FIRST_EXTENT(neh);

	/* move remainder of path[depth] to the new leaf */
	if (unlikely(path[depth].p_hdr->eh_entries !=
		     path[depth].p_hdr->eh_max)) {
		EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!",
				 path[depth].p_hdr->eh_entries,
				 path[depth].p_hdr->eh_max);
		err = -EIO;
		goto cleanup;
	}
	/* start copy from next extent */
	/* TODO: we could do it by single memmove */
	m = 0;
	path[depth].p_ext++;
	while (path[depth].p_ext <=
			EXT_MAX_EXTENT(path[depth].p_hdr)) {
		ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
				le32_to_cpu(path[depth].p_ext->ee_block),
				ext_pblock(path[depth].p_ext),
				ext4_ext_is_uninitialized(path[depth].p_ext),
				ext4_ext_get_actual_len(path[depth].p_ext),
				newblock);
		/*memmove(ex++, path[depth].p_ext++,
				sizeof(struct ext4_extent));
		neh->eh_entries++;*/
		path[depth].p_ext++;
		m++;
	}
	if (m) {
		memmove(ex, path[depth].p_ext-m, sizeof(struct ext4_extent)*m);
		le16_add_cpu(&neh->eh_entries, m);
	}

	set_buffer_uptodate(bh);
	unlock_buffer(bh);

	err = ext4_handle_dirty_metadata(handle, inode, bh);
	if (err)
		goto cleanup;
	brelse(bh);
	bh = NULL;

	/* correct old leaf */
	if (m) {
		err = ext4_ext_get_access(handle, inode, path + depth);
		if (err)
			goto cleanup;
		le16_add_cpu(&path[depth].p_hdr->eh_entries, -m);
		err = ext4_ext_dirty(handle, inode, path + depth);
		if (err)
			goto cleanup;

	}

	/* create intermediate indexes */
	k = depth - at - 1;
	if (unlikely(k < 0)) {
		EXT4_ERROR_INODE(inode, "k %d < 0!", k);
		err = -EIO;
		goto cleanup;
	}
	if (k)
		ext_debug("create %d intermediate indices\n", k);
	/* insert new index into current index block */
	/* current depth stored in i var */
	i = depth - 1;
	while (k--) {
		oldblock = newblock;
		newblock = ablocks[--a];
		bh = sb_getblk(inode->i_sb, newblock);
		if (!bh) {
			err = -EIO;
			goto cleanup;
		}
		lock_buffer(bh);

		err = ext4_journal_get_create_access(handle, bh);
		if (err)
			goto cleanup;

		neh = ext_block_hdr(bh);
		neh->eh_entries = cpu_to_le16(1);
		neh->eh_magic = EXT4_EXT_MAGIC;
		neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
		neh->eh_depth = cpu_to_le16(depth - i);
		fidx = EXT_FIRST_INDEX(neh);
		fidx->ei_block = border;
		ext4_idx_store_pblock(fidx, oldblock);

		ext_debug("int.index at %d (block %llu): %u -> %llu\n",
				i, newblock, le32_to_cpu(border), oldblock);
		/* copy indexes */
		m = 0;
		path[i].p_idx++;

		ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
				EXT_MAX_INDEX(path[i].p_hdr));
		if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) !=
					EXT_LAST_INDEX(path[i].p_hdr))) {
			EXT4_ERROR_INODE(inode,
					 "EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!",
					 le32_to_cpu(path[i].p_ext->ee_block));
			err = -EIO;
			goto cleanup;
		}
		while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) {
			ext_debug("%d: move %d:%llu in new index %llu\n", i,
					le32_to_cpu(path[i].p_idx->ei_block),
					idx_pblock(path[i].p_idx),
					newblock);
			/*memmove(++fidx, path[i].p_idx++,
					sizeof(struct ext4_extent_idx));
			neh->eh_entries++;
			BUG_ON(neh->eh_entries > neh->eh_max);*/
			path[i].p_idx++;
			m++;
		}
		if (m) {
			memmove(++fidx, path[i].p_idx - m,
				sizeof(struct ext4_extent_idx) * m);
			le16_add_cpu(&neh->eh_entries, m);
		}
		set_buffer_uptodate(bh);
		unlock_buffer(bh);

		err = ext4_handle_dirty_metadata(handle, inode, bh);
		if (err)
			goto cleanup;
		brelse(bh);
		bh = NULL;

		/* correct old index */
		if (m) {
			err = ext4_ext_get_access(handle, inode, path + i);
			if (err)
				goto cleanup;
			le16_add_cpu(&path[i].p_hdr->eh_entries, -m);
			err = ext4_ext_dirty(handle, inode, path + i);
			if (err)
				goto cleanup;
		}

		i--;
	}

	/* insert new index */
	err = ext4_ext_insert_index(handle, inode, path + at,
				    le32_to_cpu(border), newblock);

cleanup:
	if (bh) {
		if (buffer_locked(bh))
			unlock_buffer(bh);
		brelse(bh);
	}

	if (err) {
		/* free all allocated blocks in error case */
		for (i = 0; i < depth; i++) {
			if (!ablocks[i])
				continue;
			ext4_free_blocks(handle, inode, 0, ablocks[i], 1,
					 EXT4_FREE_BLOCKS_METADATA);
		}
	}
	kfree(ablocks);

	return err;
}

/*
 * ext4_ext_grow_indepth:
 * implements tree growing procedure:
 * - allocates new block
 * - moves top-level data (index block or leaf) into the new block
 * - initializes new top-level, creating index that points to the
 *   just created block
 */
static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
					struct ext4_ext_path *path,
					struct ext4_extent *newext)
{
	struct ext4_ext_path *curp = path;
	struct ext4_extent_header *neh;
	struct buffer_head *bh;
	ext4_fsblk_t newblock;
	int err = 0;

	newblock = ext4_ext_new_meta_block(handle, inode, path, newext, &err);
	if (newblock == 0)
		return err;

	bh = sb_getblk(inode->i_sb, newblock);
	if (!bh) {
		err = -EIO;
		ext4_std_error(inode->i_sb, err);
		return err;
	}
	lock_buffer(bh);

	err = ext4_journal_get_create_access(handle, bh);
	if (err) {
		unlock_buffer(bh);
		goto out;
	}

	/* move top-level index/leaf into new block */
	memmove(bh->b_data, curp->p_hdr, sizeof(EXT4_I(inode)->i_data));

	/* set size of new block */
	neh = ext_block_hdr(bh);
	/* old root could have indexes or leaves
	 * so calculate eh_max the right way */
	if (ext_depth(inode))
		neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
	else
		neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
	neh->eh_magic = EXT4_EXT_MAGIC;
	set_buffer_uptodate(bh);
	unlock_buffer(bh);

	err = ext4_handle_dirty_metadata(handle, inode, bh);
	if (err)
		goto out;

	/* create index in new top-level index: num,max,pointer */
	err = ext4_ext_get_access(handle, inode, curp);
	if (err)
		goto out;

	curp->p_hdr->eh_magic = EXT4_EXT_MAGIC;
	curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0));
	curp->p_hdr->eh_entries = cpu_to_le16(1);
	curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr);

	if (path[0].p_hdr->eh_depth)
		curp->p_idx->ei_block =
			EXT_FIRST_INDEX(path[0].p_hdr)->ei_block;
	else
		curp->p_idx->ei_block =
			EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block;
	ext4_idx_store_pblock(curp->p_idx, newblock);

	neh = ext_inode_hdr(inode);
	ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",
		  le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
		  le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
		  idx_pblock(EXT_FIRST_INDEX(neh)));

	neh->eh_depth = cpu_to_le16(path->p_depth + 1);
	err = ext4_ext_dirty(handle, inode, curp);
out:
	brelse(bh);

	return err;
}

/*
 * ext4_ext_create_new_leaf:
 * finds empty index and adds new leaf.
 * if no free index is found, then it requests in-depth growing.
 */
static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
					struct ext4_ext_path *path,
					struct ext4_extent *newext)
{
	struct ext4_ext_path *curp;
	int depth, i, err = 0;

repeat:
	i = depth = ext_depth(inode);

	/* walk up the tree and look for a free index entry */
	curp = path + depth;
	while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) {
		i--;
		curp--;
	}

	/* we use already allocated block for index block,
	 * so subsequent data blocks should be contiguous */
	if (EXT_HAS_FREE_INDEX(curp)) {
		/* if we found index with free entry, then use that
		 * entry: create all needed subtree and add new leaf */
		err = ext4_ext_split(handle, inode, path, newext, i);
		if (err)
			goto out;

		/* refill path */
		ext4_ext_drop_refs(path);
		path = ext4_ext_find_extent(inode,
				    (ext4_lblk_t)le32_to_cpu(newext->ee_block),
				    path);
		if (IS_ERR(path))
			err = PTR_ERR(path);
	} else {
		/* tree is full, time to grow in depth */
		err = ext4_ext_grow_indepth(handle, inode, path, newext);
		if (err)
			goto out;

		/* refill path */
		ext4_ext_drop_refs(path);
		path = ext4_ext_find_extent(inode,
				    (ext4_lblk_t)le32_to_cpu(newext->ee_block),
				    path);
		if (IS_ERR(path)) {
			err = PTR_ERR(path);
			goto out;
		}

		/*
		 * only first (depth 0 -> 1) produces free space;
		 * in all other cases we have to split the grown tree
		 */
		depth = ext_depth(inode);
		if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) {
			/* now we need to split */
			goto repeat;
		}
	}

out:
	return err;
}

/*
 * search the closest allocated block to the left for *logical
 * and returns it at @logical + its physical address at @phys
 * if *logical is the smallest allocated block, the function
 * returns 0 at @phys
 * return value contains 0 (success) or error code
 */
int
ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path,
			ext4_lblk_t *logical, ext4_fsblk_t *phys)
{
	struct ext4_extent_idx *ix;
	struct ext4_extent *ex;
	int depth, ee_len;

	if (unlikely(path == NULL)) {
		EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
		return -EIO;
	}
	depth = path->p_depth;
	*phys = 0;

	if (depth == 0 && path->p_ext == NULL)
		return 0;

	/* usually extent in the path covers blocks smaller
	 * than *logical, but it can be that extent is the
	 * first one in the file */

	ex = path[depth].p_ext;
	ee_len = ext4_ext_get_actual_len(ex);
	if (*logical < le32_to_cpu(ex->ee_block)) {
		if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
			EXT4_ERROR_INODE(inode,
					 "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!",
					 *logical, le32_to_cpu(ex->ee_block));
			return -EIO;
		}
		while (--depth >= 0) {
			ix = path[depth].p_idx;
			if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
				EXT4_ERROR_INODE(inode,
				  "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!",
				  ix != NULL ? ix->ei_block : 0,
				  EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ?
				    EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block : 0,
				  depth);
				return -EIO;
			}
		}
		return 0;
	}

	if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
		EXT4_ERROR_INODE(inode,
				 "logical %d < ee_block %d + ee_len %d!",
				 *logical, le32_to_cpu(ex->ee_block), ee_len);
		return -EIO;
	}

	*logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
	*phys = ext_pblock(ex) + ee_len - 1;
	return 0;
}

/*
 * search the closest allocated block to the right for *logical
 * and returns it at @logical + its physical address at @phys
 * if *logical is the largest allocated block, the function
 * returns 0 at @phys
 * return value contains 0 (success) or error code
 */
int
ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
			ext4_lblk_t *logical, ext4_fsblk_t *phys)
{
	struct buffer_head *bh = NULL;
	struct ext4_extent_header *eh;
	struct ext4_extent_idx *ix;
	struct ext4_extent *ex;
	ext4_fsblk_t block;
	int depth;	/* Note, NOT eh_depth; depth from top of tree */
	int ee_len;

	if (unlikely(path == NULL)) {
		EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
		return -EIO;
	}
	depth = path->p_depth;
	*phys = 0;

	if (depth == 0 && path->p_ext == NULL)
		return 0;

	/* usually extent in the path covers blocks smaller
	 * than *logical, but it can be that extent is the
	 * first one in the file */

	ex = path[depth].p_ext;
	ee_len = ext4_ext_get_actual_len(ex);
	if (*logical < le32_to_cpu(ex->ee_block)) {
		if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
			EXT4_ERROR_INODE(inode,
					 "first_extent(path[%d].p_hdr) != ex",
					 depth);
			return -EIO;
		}
		while (--depth >= 0) {
			ix = path[depth].p_idx;
			if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
				EXT4_ERROR_INODE(inode,
						 "ix != EXT_FIRST_INDEX *logical %d!",
						 *logical);
				return -EIO;
			}
		}
		*logical = le32_to_cpu(ex->ee_block);
		*phys = ext_pblock(ex);
		return 0;
	}

	if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
		EXT4_ERROR_INODE(inode,
				 "logical %d < ee_block %d + ee_len %d!",
				 *logical, le32_to_cpu(ex->ee_block), ee_len);
		return -EIO;
	}

	if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
		/* next allocated block in this leaf */
		ex++;
		*logical = le32_to_cpu(ex->ee_block);
		*phys = ext_pblock(ex);
		return 0;
	}

	/* go up and search for index to the right */
	while (--depth >= 0) {
		ix = path[depth].p_idx;
		if (ix != EXT_LAST_INDEX(path[depth].p_hdr))
			goto got_index;
	}

	/* we've gone up to the root and found no index to the right */
	return 0;

got_index:
	/* we've found index to the right, let's
	 * follow it and find the closest allocated
	 * block to the right */
	ix++;
	block = idx_pblock(ix);
	while (++depth < path->p_depth) {
		bh = sb_bread(inode->i_sb, block);
		if (bh == NULL)
			return -EIO;
		eh = ext_block_hdr(bh);
		/* subtract from p_depth to get proper eh_depth */
		if (ext4_ext_check(inode, eh, path->p_depth - depth)) {
			put_bh(bh);
			return -EIO;
		}
		ix = EXT_FIRST_INDEX(eh);
		block = idx_pblock(ix);
		put_bh(bh);
	}

	bh = sb_bread(inode->i_sb, block);
	if (bh == NULL)
		return -EIO;
	eh = ext_block_hdr(bh);
	if (ext4_ext_check(inode, eh, path->p_depth - depth)) {
		put_bh(bh);
		return -EIO;
	}
	ex = EXT_FIRST_EXTENT(eh);
	*logical = le32_to_cpu(ex->ee_block);
	*phys = ext_pblock(ex);
	put_bh(bh);
	return 0;
}

/*
 * ext4_ext_next_allocated_block:
 * returns allocated block in subsequent extent or EXT_MAX_BLOCK.
 * NOTE: it considers block number from index entry as
 * allocated block. Thus, index entries have to be consistent
 * with leaves.
 */
static ext4_lblk_t
ext4_ext_next_allocated_block(struct ext4_ext_path *path)
{
	int depth;

	BUG_ON(path == NULL);
	depth = path->p_depth;

	if (depth == 0 && path->p_ext == NULL)
		return EXT_MAX_BLOCK;

	while (depth >= 0) {
		if (depth == path->p_depth) {
			/* leaf */
			if (path[depth].p_ext !=
					EXT_LAST_EXTENT(path[depth].p_hdr))
				return le32_to_cpu(path[depth].p_ext[1].ee_block);
		} else {
			/* index */
			if (path[depth].p_idx !=
					EXT_LAST_INDEX(path[depth].p_hdr))
				return le32_to_cpu(path[depth].p_idx[1].ei_block);
		}
		depth--;
	}

	return EXT_MAX_BLOCK;
}

/*
 * ext4_ext_next_leaf_block:
 * returns first allocated block from next leaf or EXT_MAX_BLOCK
 */
static ext4_lblk_t ext4_ext_next_leaf_block(struct inode *inode,
					struct ext4_ext_path *path)
{
	int depth;

	BUG_ON(path == NULL);
	depth = path->p_depth;

	/* zero-tree has no leaf blocks at all */
	if (depth == 0)
		return EXT_MAX_BLOCK;

	/* go to index block */
	depth--;

	while (depth >= 0) {
		if (path[depth].p_idx !=
				EXT_LAST_INDEX(path[depth].p_hdr))
			return (ext4_lblk_t)
				le32_to_cpu(path[depth].p_idx[1].ei_block);
		depth--;
	}

	return EXT_MAX_BLOCK;
}

/*
 * ext4_ext_correct_indexes:
 * if leaf gets modified and modified extent is first in the leaf,
 * then we have to correct all indexes above.
 * TODO: do we need to correct tree in all cases?
 */
static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
				struct ext4_ext_path *path)
{
	struct ext4_extent_header *eh;
	int depth = ext_depth(inode);
	struct ext4_extent *ex;
	__le32 border;
	int k, err = 0;

	eh = path[depth].p_hdr;
	ex = path[depth].p_ext;

	if (unlikely(ex == NULL || eh == NULL)) {
		EXT4_ERROR_INODE(inode,
				 "ex %p == NULL or eh %p == NULL", ex, eh);
		return -EIO;
	}

	if (depth == 0) {
		/* there is no tree at all */
		return 0;
	}

	if (ex != EXT_FIRST_EXTENT(eh)) {
		/* we correct tree if first leaf got modified only */
		return 0;
	}

	/*
	 * TODO: we need correction if border is smaller than current one
	 */
	k = depth - 1;
	border = path[depth].p_ext->ee_block;
	err = ext4_ext_get_access(handle, inode, path + k);
	if (err)
		return err;
	path[k].p_idx->ei_block = border;
	err = ext4_ext_dirty(handle, inode, path + k);
	if (err)
		return err;

	while (k--) {
		/* change all left-side indexes */
		if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr))
			break;
		err = ext4_ext_get_access(handle, inode, path + k);
		if (err)
			break;
		path[k].p_idx->ei_block = border;
		err = ext4_ext_dirty(handle, inode, path + k);
		if (err)
			break;
	}

	return err;
}

int
ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
				struct ext4_extent *ex2)
{
	unsigned short ext1_ee_len, ext2_ee_len, max_len;

	/*
	 * Make sure that either both extents are uninitialized, or
	 * both are _not_.
	 */
	if (ext4_ext_is_uninitialized(ex1) ^ ext4_ext_is_uninitialized(ex2))
		return 0;

	if (ext4_ext_is_uninitialized(ex1))
		max_len = EXT_UNINIT_MAX_LEN;
	else
		max_len = EXT_INIT_MAX_LEN;

	ext1_ee_len = ext4_ext_get_actual_len(ex1);
	ext2_ee_len = ext4_ext_get_actual_len(ex2);

	if (le32_to_cpu(ex1->ee_block) + ext1_ee_len !=
			le32_to_cpu(ex2->ee_block))
		return 0;

	/*
	 * To allow future support for preallocated extents to be added
	 * as an RO_COMPAT feature, refuse to merge two extents if
	 * this can result in the top bit of ee_len being set.
	 */
	if (ext1_ee_len + ext2_ee_len > max_len)
		return 0;
#ifdef AGGRESSIVE_TEST
	if (ext1_ee_len >= 4)
		return 0;
#endif

	if (ext_pblock(ex1) + ext1_ee_len == ext_pblock(ex2))
		return 1;
	return 0;
}
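/*
 * Editor's note (illustrative, not part of the original source): two
 * extents qualify for merging only if they agree on the uninitialized
 * flag, are logically adjacent, are physically adjacent, and the combined
 * length still fits in ee_len.  For example, [lblk 100, len 8, pblk 5000]
 * and [lblk 108, len 4, pblk 5008] merge into [100, 12, 5000]; the same
 * pair with the second extent at pblk 6000 does not merge.
 */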
/*
 * This function tries to merge the "ex" extent to the next extent in the tree.
 * It always tries to merge towards right. If you want to merge towards
 * left, pass "ex - 1" as argument instead of "ex".
 * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
 * 1 if they got merged.
 */
int ext4_ext_try_to_merge(struct inode *inode,
			  struct ext4_ext_path *path,
			  struct ext4_extent *ex)
{
	struct ext4_extent_header *eh;
	unsigned int depth, len;
	int merge_done = 0;
	int uninitialized = 0;

	depth = ext_depth(inode);
	BUG_ON(path[depth].p_hdr == NULL);
	eh = path[depth].p_hdr;

	while (ex < EXT_LAST_EXTENT(eh)) {
		if (!ext4_can_extents_be_merged(inode, ex, ex + 1))
			break;
		/* merge with next extent! */
		if (ext4_ext_is_uninitialized(ex))
			uninitialized = 1;
		ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
				+ ext4_ext_get_actual_len(ex + 1));
		if (uninitialized)
			ext4_ext_mark_uninitialized(ex);

		if (ex + 1 < EXT_LAST_EXTENT(eh)) {
			len = (EXT_LAST_EXTENT(eh) - ex - 1)
				* sizeof(struct ext4_extent);
			memmove(ex + 1, ex + 2, len);
		}
		le16_add_cpu(&eh->eh_entries, -1);
		merge_done = 1;
		WARN_ON(eh->eh_entries == 0);
		if (!eh->eh_entries)
			EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!");
	}

	return merge_done;
}

/*
 * check if a portion of the "newext" extent overlaps with an
 * existing extent.
 *
 * If there is an overlap discovered, it updates the length of the newext
 * such that there will be no overlap, and then returns 1.
 * If there is no overlap found, it returns 0.
 */
unsigned int ext4_ext_check_overlap(struct inode *inode,
				    struct ext4_extent *newext,
				    struct ext4_ext_path *path)
{
	ext4_lblk_t b1, b2;
	unsigned int depth, len1;
	unsigned int ret = 0;

	b1 = le32_to_cpu(newext->ee_block);
	len1 = ext4_ext_get_actual_len(newext);
	depth = ext_depth(inode);
	if (!path[depth].p_ext)
		goto out;
	b2 = le32_to_cpu(path[depth].p_ext->ee_block);

	/*
	 * get the next allocated block if the extent in the path
	 * is before the requested block(s)
	 */
	if (b2 < b1) {
		b2 = ext4_ext_next_allocated_block(path);
		if (b2 == EXT_MAX_BLOCK)
			goto out;
	}

	/* check for wrap through zero on extent logical start block*/
	if (b1 + len1 < b1) {
		len1 = EXT_MAX_BLOCK - b1;
		newext->ee_len = cpu_to_le16(len1);
		ret = 1;
	}

	/* check for overlap */
	if (b1 + len1 > b2) {
		newext->ee_len = cpu_to_le16(b2 - b1);
		ret = 1;
	}
out:
	return ret;
}
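/*
 * Editor's note (illustrative, not part of the original source): the
 * overlap check above only ever shrinks the new extent.  If @newext covers
 * logical blocks [100, 110) but the next allocated block is 105, ee_len is
 * trimmed to 5 and 1 is returned, so the caller maps just the
 * non-overlapping head of the request.
 */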
/*
 * ext4_ext_insert_extent:
 * tries to merge requested extent into the existing extent or
 * inserts requested extent as new one into the tree,
 * creating new leaf in the no-space case.
 */
int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
				struct ext4_ext_path *path,
				struct ext4_extent *newext, int flag)
{
	struct ext4_extent_header *eh;
	struct ext4_extent *ex, *fex;
	struct ext4_extent *nearex; /* nearest extent */
	struct ext4_ext_path *npath = NULL;
	int depth, len, err;
	ext4_lblk_t next;
	unsigned uninitialized = 0;

	if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
		EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
		return -EIO;
	}
	depth = ext_depth(inode);
	ex = path[depth].p_ext;
	if (unlikely(path[depth].p_hdr == NULL)) {
		EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
		return -EIO;
	}

	/* try to insert block into found extent and return */
	if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)
		&& ext4_can_extents_be_merged(inode, ex, newext)) {
		ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
				ext4_ext_is_uninitialized(newext),
				ext4_ext_get_actual_len(newext),
				le32_to_cpu(ex->ee_block),
				ext4_ext_is_uninitialized(ex),
				ext4_ext_get_actual_len(ex), ext_pblock(ex));
		err = ext4_ext_get_access(handle, inode, path + depth);
		if (err)
			return err;

		/*
		 * ext4_can_extents_be_merged should have checked that either
		 * both extents are uninitialized, or both aren't. Thus we
		 * need to check only one of them here.
		 */
		if (ext4_ext_is_uninitialized(ex))
			uninitialized = 1;
		ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
					+ ext4_ext_get_actual_len(newext));
		if (uninitialized)
			ext4_ext_mark_uninitialized(ex);
		eh = path[depth].p_hdr;
		nearex = ex;
		goto merge;
	}

repeat:
	depth = ext_depth(inode);
	eh = path[depth].p_hdr;
	if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max))
		goto has_space;

	/* probably next leaf has space for us? */
	fex = EXT_LAST_EXTENT(eh);
	next = ext4_ext_next_leaf_block(inode, path);
	if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block)
	    && next != EXT_MAX_BLOCK) {
		ext_debug("next leaf block - %d\n", next);
		BUG_ON(npath != NULL);
		npath = ext4_ext_find_extent(inode, next, NULL);
		if (IS_ERR(npath))
			return PTR_ERR(npath);
		BUG_ON(npath->p_depth != path->p_depth);
		eh = npath[depth].p_hdr;
		if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) {
			ext_debug("next leaf isn't full(%d)\n",
				  le16_to_cpu(eh->eh_entries));
			path = npath;
			goto repeat;
		}
		ext_debug("next leaf has no free space(%d,%d)\n",
			  le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
	}

	/*
	 * There is no free space in the found leaf.
	 * We're gonna add a new leaf in the tree.
	 */
	err = ext4_ext_create_new_leaf(handle, inode, path, newext);
	if (err)
		goto cleanup;
	depth = ext_depth(inode);
	eh = path[depth].p_hdr;

has_space:
	nearex = path[depth].p_ext;

	err = ext4_ext_get_access(handle, inode, path + depth);
	if (err)
		goto cleanup;

	if (!nearex) {
		/* there is no extent in this leaf, create first one */
		ext_debug("first extent in the leaf: %d:%llu:[%d]%d\n",
				le32_to_cpu(newext->ee_block),
				ext_pblock(newext),
				ext4_ext_is_uninitialized(newext),
				ext4_ext_get_actual_len(newext));
		path[depth].p_ext = EXT_FIRST_EXTENT(eh);
	} else if (le32_to_cpu(newext->ee_block)
			   > le32_to_cpu(nearex->ee_block)) {
		/* BUG_ON(newext->ee_block == nearex->ee_block); */
		if (nearex != EXT_LAST_EXTENT(eh)) {
			len = EXT_MAX_EXTENT(eh) - nearex;
			len = (len - 1) * sizeof(struct ext4_extent);
			len = len < 0 ? 0 : len;
			ext_debug("insert %d:%llu:[%d]%d after: nearest 0x%p, "
					"move %d from 0x%p to 0x%p\n",
					le32_to_cpu(newext->ee_block),
					ext_pblock(newext),
					ext4_ext_is_uninitialized(newext),
					ext4_ext_get_actual_len(newext),
					nearex, len, nearex + 1, nearex + 2);
			memmove(nearex + 2, nearex + 1, len);
		}
		path[depth].p_ext = nearex + 1;
	} else {
		BUG_ON(newext->ee_block == nearex->ee_block);
		len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext4_extent);
		len = len < 0 ? 0 : len;
		ext_debug("insert %d:%llu:[%d]%d before: nearest 0x%p, "
				"move %d from 0x%p to 0x%p\n",
				le32_to_cpu(newext->ee_block),
				ext_pblock(newext),
				ext4_ext_is_uninitialized(newext),
				ext4_ext_get_actual_len(newext),
				nearex, len, nearex + 1, nearex + 2);
		memmove(nearex + 1, nearex, len);
		path[depth].p_ext = nearex;
	}

	le16_add_cpu(&eh->eh_entries, 1);
	nearex = path[depth].p_ext;
	nearex->ee_block = newext->ee_block;
	ext4_ext_store_pblock(nearex, ext_pblock(newext));
	nearex->ee_len = newext->ee_len;

merge:
	/* try to merge extents to the right */
	if (!(flag & EXT4_GET_BLOCKS_PRE_IO))
		ext4_ext_try_to_merge(inode, path, nearex);

	/* try to merge extents to the left */

	/* time to correct all indexes above */
	err = ext4_ext_correct_indexes(handle, inode, path);
	if (err)
		goto cleanup;

	err = ext4_ext_dirty(handle, inode, path + depth);

cleanup:
	if (npath) {
		ext4_ext_drop_refs(npath);
		kfree(npath);
	}
	ext4_ext_invalidate_cache(inode);
	return err;
}

int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
			ext4_lblk_t num, ext_prepare_callback func,
			void *cbdata)
{
	struct ext4_ext_path *path = NULL;
	struct ext4_ext_cache cbex;
	struct ext4_extent *ex;
	ext4_lblk_t next, start = 0, end = 0;
	ext4_lblk_t last = block + num;
	int depth, exists, err = 0;

	BUG_ON(func == NULL);
	BUG_ON(inode == NULL);

	while (block < last && block != EXT_MAX_BLOCK) {
		num = last - block;
		/* find extent for this block */
		down_read(&EXT4_I(inode)->i_data_sem);
		path = ext4_ext_find_extent(inode, block, path);
		up_read(&EXT4_I(inode)->i_data_sem);
		if (IS_ERR(path)) {
			err = PTR_ERR(path);
			path = NULL;
			break;
		}

		depth = ext_depth(inode);
		if (unlikely(path[depth].p_hdr == NULL)) {
			EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
			err = -EIO;
			break;
		}
		ex = path[depth].p_ext;
		next = ext4_ext_next_allocated_block(path);

		exists = 0;
		if (!ex) {
			/* there is no extent yet, so try to allocate
			 * all requested space */
			start = block;
			end = block + num;
		} else if (le32_to_cpu(ex->ee_block) > block) {
			/* need to allocate space before found extent */
			start = block;
			end = le32_to_cpu(ex->ee_block);
			if (block + num < end)
				end = block + num;
		} else if (block >= le32_to_cpu(ex->ee_block)
					+ ext4_ext_get_actual_len(ex)) {
			/* need to allocate space after found extent */
			start = block;
			end = block + num;
			if (end >= next)
				end = next;
		} else if (block >= le32_to_cpu(ex->ee_block)) {
			/*
			 * some part of requested space is covered
			 * by found extent
			 */
			start = block;
			end = le32_to_cpu(ex->ee_block)
				+ ext4_ext_get_actual_len(ex);
			if (block + num < end)
				end = block + num;
			exists = 1;
		} else {
			BUG();
		}
		BUG_ON(end <= start);

		if (!exists) {
			cbex.ec_block = start;
			cbex.ec_len = end - start;
			cbex.ec_start = 0;
			cbex.ec_type = EXT4_EXT_CACHE_GAP;
		} else {
			cbex.ec_block = le32_to_cpu(ex->ee_block);
			cbex.ec_len = ext4_ext_get_actual_len(ex);
			cbex.ec_start = ext_pblock(ex);
			cbex.ec_type = EXT4_EXT_CACHE_EXTENT;
		}

		if (unlikely(cbex.ec_len == 0)) {
			EXT4_ERROR_INODE(inode, "cbex.ec_len == 0");
			err = -EIO;
			break;
		}
		err = func(inode, path, &cbex, ex, cbdata);
		ext4_ext_drop_refs(path);

		if (err < 0)
			break;

		if (err == EXT_REPEAT)
			continue;
		else if (err == EXT_BREAK) {
			err = 0;
			break;
		}

		if (ext_depth(inode) != depth) {
			/* depth was changed. we have to realloc path */
			kfree(path);
			path = NULL;
		}

		block = cbex.ec_block + cbex.ec_len;
	}

	if (path) {
		ext4_ext_drop_refs(path);
		kfree(path);
	}

	return err;
}

static void
ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block,
			__u32 len, ext4_fsblk_t start, int type)
{
	struct ext4_ext_cache *cex;
	BUG_ON(len == 0);
	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
	cex = &EXT4_I(inode)->i_cached_extent;
	cex->ec_type = type;
	cex->ec_block = block;
	cex->ec_len = len;
	cex->ec_start = start;
	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
}

/*
 * ext4_ext_put_gap_in_cache:
 * calculate boundaries of the gap that the requested block fits into
 * and cache this gap
 */
static void
ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
				ext4_lblk_t block)
{
	int depth = ext_depth(inode);
	unsigned long len;
	ext4_lblk_t lblock;
	struct ext4_extent *ex;

	ex = path[depth].p_ext;
	if (ex == NULL) {
		/* there is no extent yet, so gap is [0;-] */
		lblock = 0;
		len = EXT_MAX_BLOCK;
		ext_debug("cache gap(whole file):");
	} else if (block < le32_to_cpu(ex->ee_block)) {
		lblock = block;
		len = le32_to_cpu(ex->ee_block) - block;
		ext_debug("cache gap(before): %u [%u:%u]",
				block,
				le32_to_cpu(ex->ee_block),
				ext4_ext_get_actual_len(ex));
	} else if (block >= le32_to_cpu(ex->ee_block)
			+ ext4_ext_get_actual_len(ex)) {
		ext4_lblk_t next;
		lblock = le32_to_cpu(ex->ee_block)
			+ ext4_ext_get_actual_len(ex);

		next = ext4_ext_next_allocated_block(path);
		ext_debug("cache gap(after): [%u:%u] %u",
				le32_to_cpu(ex->ee_block),
				ext4_ext_get_actual_len(ex),
				block);
		BUG_ON(next == lblock);
		len = next - lblock;
	} else {
		lblock = len = 0;
		BUG();
	}

	ext_debug(" -> %u:%lu\n", lblock, len);
	ext4_ext_put_in_cache(inode, lblock, len, 0, EXT4_EXT_CACHE_GAP);
}

static int
ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
			struct ext4_extent *ex)
{
	struct ext4_ext_cache *cex;
	int ret = EXT4_EXT_CACHE_NO;

	/*
	 * We borrow i_block_reservation_lock to protect i_cached_extent
	 */
	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
	cex = &EXT4_I(inode)->i_cached_extent;

	/* has cache valid data? */
	if (cex->ec_type == EXT4_EXT_CACHE_NO)
		goto errout;

	BUG_ON(cex->ec_type != EXT4_EXT_CACHE_GAP &&
			cex->ec_type != EXT4_EXT_CACHE_EXTENT);
	if (in_range(block, cex->ec_block, cex->ec_len)) {
		ex->ee_block = cpu_to_le32(cex->ec_block);
		ext4_ext_store_pblock(ex, cex->ec_start);
		ex->ee_len = cpu_to_le16(cex->ec_len);
		ext_debug("%u cached by %u:%u:%llu\n",
				block,
				cex->ec_block, cex->ec_len, cex->ec_start);
		ret = cex->ec_type;
	}
errout:
	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
	return ret;
}
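/*
 * Editor's note (not part of the original source): i_cached_extent is a
 * single-entry per-inode cache guarded by i_block_reservation_lock.  It
 * holds either a real mapping (EXT4_EXT_CACHE_EXTENT, ec_start is a
 * physical block) or a known hole (EXT4_EXT_CACHE_GAP, ec_start is 0), so
 * a later lookup that falls inside [ec_block, ec_block + ec_len) can be
 * answered without re-walking the tree.
 */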
2067 */ 2068 static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, 2069 struct ext4_ext_path *path) 2070 { 2071 int err; 2072 ext4_fsblk_t leaf; 2073 2074 /* free index block */ 2075 path--; 2076 leaf = idx_pblock(path->p_idx); 2077 if (unlikely(path->p_hdr->eh_entries == 0)) { 2078 EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0"); 2079 return -EIO; 2080 } 2081 err = ext4_ext_get_access(handle, inode, path); 2082 if (err) 2083 return err; 2084 le16_add_cpu(&path->p_hdr->eh_entries, -1); 2085 err = ext4_ext_dirty(handle, inode, path); 2086 if (err) 2087 return err; 2088 ext_debug("index is empty, remove it, free block %llu\n", leaf); 2089 ext4_free_blocks(handle, inode, 0, leaf, 1, 2090 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); 2091 return err; 2092 } 2093 2094 /* 2095 * ext4_ext_calc_credits_for_single_extent: 2096 * This routine returns max. credits that needed to insert an extent 2097 * to the extent tree. 2098 * When pass the actual path, the caller should calculate credits 2099 * under i_data_sem. 2100 */ 2101 int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks, 2102 struct ext4_ext_path *path) 2103 { 2104 if (path) { 2105 int depth = ext_depth(inode); 2106 int ret = 0; 2107 2108 /* probably there is space in leaf? */ 2109 if (le16_to_cpu(path[depth].p_hdr->eh_entries) 2110 < le16_to_cpu(path[depth].p_hdr->eh_max)) { 2111 2112 /* 2113 * There are some space in the leaf tree, no 2114 * need to account for leaf block credit 2115 * 2116 * bitmaps and block group descriptor blocks 2117 * and other metadat blocks still need to be 2118 * accounted. 2119 */ 2120 /* 1 bitmap, 1 block group descriptor */ 2121 ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb); 2122 return ret; 2123 } 2124 } 2125 2126 return ext4_chunk_trans_blocks(inode, nrblocks); 2127 } 2128 2129 /* 2130 * How many index/leaf blocks need to change/allocate to modify nrblocks? 2131 * 2132 * if nrblocks are fit in a single extent (chunk flag is 1), then 2133 * in the worse case, each tree level index/leaf need to be changed 2134 * if the tree split due to insert a new extent, then the old tree 2135 * index/leaf need to be updated too 2136 * 2137 * If the nrblocks are discontiguous, they could cause 2138 * the whole tree split more than once, but this is really rare. 
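 * For example, with a tree of depth 2 a single contiguous chunk (chunk == 1)
 * is charged 2 * 2 = 4 index/leaf blocks, while a discontiguous request
 * (chunk == 0) is charged 2 * 3 = 6 to leave room for an extra split per
 * level; this is a rough worst-case estimate, not an exact bound.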
2139 */ 2140 int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) 2141 { 2142 int index; 2143 int depth = ext_depth(inode); 2144 2145 if (chunk) 2146 index = depth * 2; 2147 else 2148 index = depth * 3; 2149 2150 return index; 2151 } 2152 2153 static int ext4_remove_blocks(handle_t *handle, struct inode *inode, 2154 struct ext4_extent *ex, 2155 ext4_lblk_t from, ext4_lblk_t to) 2156 { 2157 unsigned short ee_len = ext4_ext_get_actual_len(ex); 2158 int flags = EXT4_FREE_BLOCKS_FORGET; 2159 2160 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) 2161 flags |= EXT4_FREE_BLOCKS_METADATA; 2162 #ifdef EXTENTS_STATS 2163 { 2164 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 2165 spin_lock(&sbi->s_ext_stats_lock); 2166 sbi->s_ext_blocks += ee_len; 2167 sbi->s_ext_extents++; 2168 if (ee_len < sbi->s_ext_min) 2169 sbi->s_ext_min = ee_len; 2170 if (ee_len > sbi->s_ext_max) 2171 sbi->s_ext_max = ee_len; 2172 if (ext_depth(inode) > sbi->s_depth_max) 2173 sbi->s_depth_max = ext_depth(inode); 2174 spin_unlock(&sbi->s_ext_stats_lock); 2175 } 2176 #endif 2177 if (from >= le32_to_cpu(ex->ee_block) 2178 && to == le32_to_cpu(ex->ee_block) + ee_len - 1) { 2179 /* tail removal */ 2180 ext4_lblk_t num; 2181 ext4_fsblk_t start; 2182 2183 num = le32_to_cpu(ex->ee_block) + ee_len - from; 2184 start = ext_pblock(ex) + ee_len - num; 2185 ext_debug("free last %u blocks starting %llu\n", num, start); 2186 ext4_free_blocks(handle, inode, 0, start, num, flags); 2187 } else if (from == le32_to_cpu(ex->ee_block) 2188 && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) { 2189 printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n", 2190 from, to, le32_to_cpu(ex->ee_block), ee_len); 2191 } else { 2192 printk(KERN_INFO "strange request: removal(2) " 2193 "%u-%u from %u:%u\n", 2194 from, to, le32_to_cpu(ex->ee_block), ee_len); 2195 } 2196 return 0; 2197 } 2198 2199 static int 2200 ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, 2201 struct ext4_ext_path *path, ext4_lblk_t start) 2202 { 2203 int err = 0, correct_index = 0; 2204 int depth = ext_depth(inode), credits; 2205 struct ext4_extent_header *eh; 2206 ext4_lblk_t a, b, block; 2207 unsigned num; 2208 ext4_lblk_t ex_ee_block; 2209 unsigned short ex_ee_len; 2210 unsigned uninitialized = 0; 2211 struct ext4_extent *ex; 2212 2213 /* the header must be checked already in ext4_ext_remove_space() */ 2214 ext_debug("truncate since %u in leaf\n", start); 2215 if (!path[depth].p_hdr) 2216 path[depth].p_hdr = ext_block_hdr(path[depth].p_bh); 2217 eh = path[depth].p_hdr; 2218 if (unlikely(path[depth].p_hdr == NULL)) { 2219 EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); 2220 return -EIO; 2221 } 2222 /* find where to start removing */ 2223 ex = EXT_LAST_EXTENT(eh); 2224 2225 ex_ee_block = le32_to_cpu(ex->ee_block); 2226 ex_ee_len = ext4_ext_get_actual_len(ex); 2227 2228 while (ex >= EXT_FIRST_EXTENT(eh) && 2229 ex_ee_block + ex_ee_len > start) { 2230 2231 if (ext4_ext_is_uninitialized(ex)) 2232 uninitialized = 1; 2233 else 2234 uninitialized = 0; 2235 2236 ext_debug("remove ext %u:[%d]%d\n", ex_ee_block, 2237 uninitialized, ex_ee_len); 2238 path[depth].p_ext = ex; 2239 2240 a = ex_ee_block > start ? ex_ee_block : start; 2241 b = ex_ee_block + ex_ee_len - 1 < EXT_MAX_BLOCK ? 
2242 ex_ee_block + ex_ee_len - 1 : EXT_MAX_BLOCK; 2243 2244 ext_debug(" border %u:%u\n", a, b); 2245 2246 if (a != ex_ee_block && b != ex_ee_block + ex_ee_len - 1) { 2247 block = 0; 2248 num = 0; 2249 BUG(); 2250 } else if (a != ex_ee_block) { 2251 /* remove tail of the extent */ 2252 block = ex_ee_block; 2253 num = a - block; 2254 } else if (b != ex_ee_block + ex_ee_len - 1) { 2255 /* remove head of the extent */ 2256 block = a; 2257 num = b - a; 2258 /* there is no "make a hole" API yet */ 2259 BUG(); 2260 } else { 2261 /* remove whole extent: excellent! */ 2262 block = ex_ee_block; 2263 num = 0; 2264 BUG_ON(a != ex_ee_block); 2265 BUG_ON(b != ex_ee_block + ex_ee_len - 1); 2266 } 2267 2268 /* 2269 * 3 for leaf, sb, and inode plus 2 (bmap and group 2270 * descriptor) for each block group; assume two block 2271 * groups plus ex_ee_len/blocks_per_block_group for 2272 * the worst case 2273 */ 2274 credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb)); 2275 if (ex == EXT_FIRST_EXTENT(eh)) { 2276 correct_index = 1; 2277 credits += (ext_depth(inode)) + 1; 2278 } 2279 credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); 2280 2281 err = ext4_ext_truncate_extend_restart(handle, inode, credits); 2282 if (err) 2283 goto out; 2284 2285 err = ext4_ext_get_access(handle, inode, path + depth); 2286 if (err) 2287 goto out; 2288 2289 err = ext4_remove_blocks(handle, inode, ex, a, b); 2290 if (err) 2291 goto out; 2292 2293 if (num == 0) { 2294 /* this extent is removed; mark slot entirely unused */ 2295 ext4_ext_store_pblock(ex, 0); 2296 le16_add_cpu(&eh->eh_entries, -1); 2297 } 2298 2299 ex->ee_block = cpu_to_le32(block); 2300 ex->ee_len = cpu_to_le16(num); 2301 /* 2302 * Do not mark uninitialized if all the blocks in the 2303 * extent have been removed. 2304 */ 2305 if (uninitialized && num) 2306 ext4_ext_mark_uninitialized(ex); 2307 2308 err = ext4_ext_dirty(handle, inode, path + depth); 2309 if (err) 2310 goto out; 2311 2312 ext_debug("new extent: %u:%u:%llu\n", block, num, 2313 ext_pblock(ex)); 2314 ex--; 2315 ex_ee_block = le32_to_cpu(ex->ee_block); 2316 ex_ee_len = ext4_ext_get_actual_len(ex); 2317 } 2318 2319 if (correct_index && eh->eh_entries) 2320 err = ext4_ext_correct_indexes(handle, inode, path); 2321 2322 /* if this leaf is free, then we should 2323 * remove it from index block above */ 2324 if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL) 2325 err = ext4_ext_rm_idx(handle, inode, path + depth); 2326 2327 out: 2328 return err; 2329 } 2330 2331 /* 2332 * ext4_ext_more_to_rm: 2333 * returns 1 if current index has to be freed (even partial) 2334 */ 2335 static int 2336 ext4_ext_more_to_rm(struct ext4_ext_path *path) 2337 { 2338 BUG_ON(path->p_idx == NULL); 2339 2340 if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr)) 2341 return 0; 2342 2343 /* 2344 * if truncate on deeper level happened, it wasn't partial, 2345 * so we have to consider current index for truncation 2346 */ 2347 if (le16_to_cpu(path->p_hdr->eh_entries) == path->p_block) 2348 return 0; 2349 return 1; 2350 } 2351 2352 static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) 2353 { 2354 struct super_block *sb = inode->i_sb; 2355 int depth = ext_depth(inode); 2356 struct ext4_ext_path *path; 2357 handle_t *handle; 2358 int i, err; 2359 2360 ext_debug("truncate since %u\n", start); 2361 2362 /* probably first extent we're gonna free will be last in block */ 2363 handle = ext4_journal_start(inode, depth + 1); 2364 if (IS_ERR(handle)) 2365 return PTR_ERR(handle); 2366 2367 again: 2368 
ext4_ext_invalidate_cache(inode); 2369 2370 /* 2371 * We start scanning from right side, freeing all the blocks 2372 * after i_size and walking into the tree depth-wise. 2373 */ 2374 depth = ext_depth(inode); 2375 path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS); 2376 if (path == NULL) { 2377 ext4_journal_stop(handle); 2378 return -ENOMEM; 2379 } 2380 path[0].p_depth = depth; 2381 path[0].p_hdr = ext_inode_hdr(inode); 2382 if (ext4_ext_check(inode, path[0].p_hdr, depth)) { 2383 err = -EIO; 2384 goto out; 2385 } 2386 i = err = 0; 2387 2388 while (i >= 0 && err == 0) { 2389 if (i == depth) { 2390 /* this is leaf block */ 2391 err = ext4_ext_rm_leaf(handle, inode, path, start); 2392 /* root level has p_bh == NULL, brelse() eats this */ 2393 brelse(path[i].p_bh); 2394 path[i].p_bh = NULL; 2395 i--; 2396 continue; 2397 } 2398 2399 /* this is index block */ 2400 if (!path[i].p_hdr) { 2401 ext_debug("initialize header\n"); 2402 path[i].p_hdr = ext_block_hdr(path[i].p_bh); 2403 } 2404 2405 if (!path[i].p_idx) { 2406 /* this level hasn't been touched yet */ 2407 path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr); 2408 path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1; 2409 ext_debug("init index ptr: hdr 0x%p, num %d\n", 2410 path[i].p_hdr, 2411 le16_to_cpu(path[i].p_hdr->eh_entries)); 2412 } else { 2413 /* we were already here, see at next index */ 2414 path[i].p_idx--; 2415 } 2416 2417 ext_debug("level %d - index, first 0x%p, cur 0x%p\n", 2418 i, EXT_FIRST_INDEX(path[i].p_hdr), 2419 path[i].p_idx); 2420 if (ext4_ext_more_to_rm(path + i)) { 2421 struct buffer_head *bh; 2422 /* go to the next level */ 2423 ext_debug("move to level %d (block %llu)\n", 2424 i + 1, idx_pblock(path[i].p_idx)); 2425 memset(path + i + 1, 0, sizeof(*path)); 2426 bh = sb_bread(sb, idx_pblock(path[i].p_idx)); 2427 if (!bh) { 2428 /* should we reset i_size? 
*/ 2429 err = -EIO; 2430 break; 2431 } 2432 if (WARN_ON(i + 1 > depth)) { 2433 err = -EIO; 2434 break; 2435 } 2436 if (ext4_ext_check(inode, ext_block_hdr(bh), 2437 depth - i - 1)) { 2438 err = -EIO; 2439 break; 2440 } 2441 path[i + 1].p_bh = bh; 2442 2443 /* save actual number of indexes since this 2444 * number is changed at the next iteration */ 2445 path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries); 2446 i++; 2447 } else { 2448 /* we finished processing this index, go up */ 2449 if (path[i].p_hdr->eh_entries == 0 && i > 0) { 2450 /* index is empty, remove it; 2451 * handle must be already prepared by the 2452 * truncatei_leaf() */ 2453 err = ext4_ext_rm_idx(handle, inode, path + i); 2454 } 2455 /* root level has p_bh == NULL, brelse() eats this */ 2456 brelse(path[i].p_bh); 2457 path[i].p_bh = NULL; 2458 i--; 2459 ext_debug("return to level %d\n", i); 2460 } 2461 } 2462 2463 /* TODO: flexible tree reduction should be here */ 2464 if (path->p_hdr->eh_entries == 0) { 2465 /* 2466 * truncate to zero freed all the tree, 2467 * so we need to correct eh_depth 2468 */ 2469 err = ext4_ext_get_access(handle, inode, path); 2470 if (err == 0) { 2471 ext_inode_hdr(inode)->eh_depth = 0; 2472 ext_inode_hdr(inode)->eh_max = 2473 cpu_to_le16(ext4_ext_space_root(inode, 0)); 2474 err = ext4_ext_dirty(handle, inode, path); 2475 } 2476 } 2477 out: 2478 ext4_ext_drop_refs(path); 2479 kfree(path); 2480 if (err == -EAGAIN) 2481 goto again; 2482 ext4_journal_stop(handle); 2483 2484 return err; 2485 } 2486 2487 /* 2488 * called at mount time 2489 */ 2490 void ext4_ext_init(struct super_block *sb) 2491 { 2492 /* 2493 * possible initialization would be here 2494 */ 2495 2496 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { 2497 #if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS) 2498 printk(KERN_INFO "EXT4-fs: file extents enabled"); 2499 #ifdef AGGRESSIVE_TEST 2500 printk(", aggressive tests"); 2501 #endif 2502 #ifdef CHECK_BINSEARCH 2503 printk(", check binsearch"); 2504 #endif 2505 #ifdef EXTENTS_STATS 2506 printk(", stats"); 2507 #endif 2508 printk("\n"); 2509 #endif 2510 #ifdef EXTENTS_STATS 2511 spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock); 2512 EXT4_SB(sb)->s_ext_min = 1 << 30; 2513 EXT4_SB(sb)->s_ext_max = 0; 2514 #endif 2515 } 2516 } 2517 2518 /* 2519 * called at umount time 2520 */ 2521 void ext4_ext_release(struct super_block *sb) 2522 { 2523 if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) 2524 return; 2525 2526 #ifdef EXTENTS_STATS 2527 if (EXT4_SB(sb)->s_ext_blocks && EXT4_SB(sb)->s_ext_extents) { 2528 struct ext4_sb_info *sbi = EXT4_SB(sb); 2529 printk(KERN_ERR "EXT4-fs: %lu blocks in %lu extents (%lu ave)\n", 2530 sbi->s_ext_blocks, sbi->s_ext_extents, 2531 sbi->s_ext_blocks / sbi->s_ext_extents); 2532 printk(KERN_ERR "EXT4-fs: extents: %lu min, %lu max, max depth %lu\n", 2533 sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max); 2534 } 2535 #endif 2536 } 2537 2538 static void bi_complete(struct bio *bio, int error) 2539 { 2540 complete((struct completion *)bio->bi_private); 2541 } 2542 2543 /* FIXME!! 
we need to try to merge to left or right after zero-out */ 2544 static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) 2545 { 2546 int ret; 2547 struct bio *bio; 2548 int blkbits, blocksize; 2549 sector_t ee_pblock; 2550 struct completion event; 2551 unsigned int ee_len, len, done, offset; 2552 2553 2554 blkbits = inode->i_blkbits; 2555 blocksize = inode->i_sb->s_blocksize; 2556 ee_len = ext4_ext_get_actual_len(ex); 2557 ee_pblock = ext_pblock(ex); 2558 2559 /* convert ee_pblock to 512 byte sectors */ 2560 ee_pblock = ee_pblock << (blkbits - 9); 2561 2562 while (ee_len > 0) { 2563 2564 if (ee_len > BIO_MAX_PAGES) 2565 len = BIO_MAX_PAGES; 2566 else 2567 len = ee_len; 2568 2569 bio = bio_alloc(GFP_NOIO, len); 2570 if (!bio) 2571 return -ENOMEM; 2572 2573 bio->bi_sector = ee_pblock; 2574 bio->bi_bdev = inode->i_sb->s_bdev; 2575 2576 done = 0; 2577 offset = 0; 2578 while (done < len) { 2579 ret = bio_add_page(bio, ZERO_PAGE(0), 2580 blocksize, offset); 2581 if (ret != blocksize) { 2582 /* 2583 * We can't add any more pages because of 2584 * hardware limitations. Start a new bio. 2585 */ 2586 break; 2587 } 2588 done++; 2589 offset += blocksize; 2590 if (offset >= PAGE_CACHE_SIZE) 2591 offset = 0; 2592 } 2593 2594 init_completion(&event); 2595 bio->bi_private = &event; 2596 bio->bi_end_io = bi_complete; 2597 submit_bio(WRITE, bio); 2598 wait_for_completion(&event); 2599 2600 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { 2601 bio_put(bio); 2602 return -EIO; 2603 } 2604 bio_put(bio); 2605 ee_len -= done; 2606 ee_pblock += done << (blkbits - 9); 2607 } 2608 return 0; 2609 } 2610 2611 #define EXT4_EXT_ZERO_LEN 7 2612 /* 2613 * This function is called by ext4_ext_map_blocks() if someone tries to write 2614 * to an uninitialized extent. It may result in splitting the uninitialized 2615 * extent into multiple extents (upto three - one initialized and two 2616 * uninitialized). 
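 * (For example, a write into the middle of a 100-block uninitialized extent
 * leaves an uninitialized head, an initialized range covering the written
 * blocks, and an uninitialized tail.)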
2617 * There are three possibilities: 2618 * a> There is no split required: Entire extent should be initialized 2619 * b> Splits in two extents: Write is happening at either end of the extent 2620 * c> Splits in three extents: Somone is writing in middle of the extent 2621 */ 2622 static int ext4_ext_convert_to_initialized(handle_t *handle, 2623 struct inode *inode, 2624 struct ext4_map_blocks *map, 2625 struct ext4_ext_path *path) 2626 { 2627 struct ext4_extent *ex, newex, orig_ex; 2628 struct ext4_extent *ex1 = NULL; 2629 struct ext4_extent *ex2 = NULL; 2630 struct ext4_extent *ex3 = NULL; 2631 struct ext4_extent_header *eh; 2632 ext4_lblk_t ee_block, eof_block; 2633 unsigned int allocated, ee_len, depth; 2634 ext4_fsblk_t newblock; 2635 int err = 0; 2636 int ret = 0; 2637 int may_zeroout; 2638 2639 ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical" 2640 "block %llu, max_blocks %u\n", inode->i_ino, 2641 (unsigned long long)map->m_lblk, map->m_len); 2642 2643 eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >> 2644 inode->i_sb->s_blocksize_bits; 2645 if (eof_block < map->m_lblk + map->m_len) 2646 eof_block = map->m_lblk + map->m_len; 2647 2648 depth = ext_depth(inode); 2649 eh = path[depth].p_hdr; 2650 ex = path[depth].p_ext; 2651 ee_block = le32_to_cpu(ex->ee_block); 2652 ee_len = ext4_ext_get_actual_len(ex); 2653 allocated = ee_len - (map->m_lblk - ee_block); 2654 newblock = map->m_lblk - ee_block + ext_pblock(ex); 2655 2656 ex2 = ex; 2657 orig_ex.ee_block = ex->ee_block; 2658 orig_ex.ee_len = cpu_to_le16(ee_len); 2659 ext4_ext_store_pblock(&orig_ex, ext_pblock(ex)); 2660 2661 /* 2662 * It is safe to convert extent to initialized via explicit 2663 * zeroout only if extent is fully insde i_size or new_size. 2664 */ 2665 may_zeroout = ee_block + ee_len <= eof_block; 2666 2667 err = ext4_ext_get_access(handle, inode, path + depth); 2668 if (err) 2669 goto out; 2670 /* If extent has less than 2*EXT4_EXT_ZERO_LEN zerout directly */ 2671 if (ee_len <= 2*EXT4_EXT_ZERO_LEN && may_zeroout) { 2672 err = ext4_ext_zeroout(inode, &orig_ex); 2673 if (err) 2674 goto fix_extent_len; 2675 /* update the extent length and mark as initialized */ 2676 ex->ee_block = orig_ex.ee_block; 2677 ex->ee_len = orig_ex.ee_len; 2678 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 2679 ext4_ext_dirty(handle, inode, path + depth); 2680 /* zeroed the full extent */ 2681 return allocated; 2682 } 2683 2684 /* ex1: ee_block to map->m_lblk - 1 : uninitialized */ 2685 if (map->m_lblk > ee_block) { 2686 ex1 = ex; 2687 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block); 2688 ext4_ext_mark_uninitialized(ex1); 2689 ex2 = &newex; 2690 } 2691 /* 2692 * for sanity, update the length of the ex2 extent before 2693 * we insert ex3, if ex1 is NULL. This is to avoid temporary 2694 * overlap of blocks. 2695 */ 2696 if (!ex1 && allocated > map->m_len) 2697 ex2->ee_len = cpu_to_le16(map->m_len); 2698 /* ex3: to ee_block + ee_len : uninitialised */ 2699 if (allocated > map->m_len) { 2700 unsigned int newdepth; 2701 /* If extent has less than EXT4_EXT_ZERO_LEN zerout directly */ 2702 if (allocated <= EXT4_EXT_ZERO_LEN && may_zeroout) { 2703 /* 2704 * map->m_lblk == ee_block is handled by the zerouout 2705 * at the beginning. 2706 * Mark first half uninitialized. 
2707 * Mark second half initialized and zero out the 2708 * initialized extent 2709 */ 2710 ex->ee_block = orig_ex.ee_block; 2711 ex->ee_len = cpu_to_le16(ee_len - allocated); 2712 ext4_ext_mark_uninitialized(ex); 2713 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 2714 ext4_ext_dirty(handle, inode, path + depth); 2715 2716 ex3 = &newex; 2717 ex3->ee_block = cpu_to_le32(map->m_lblk); 2718 ext4_ext_store_pblock(ex3, newblock); 2719 ex3->ee_len = cpu_to_le16(allocated); 2720 err = ext4_ext_insert_extent(handle, inode, path, 2721 ex3, 0); 2722 if (err == -ENOSPC) { 2723 err = ext4_ext_zeroout(inode, &orig_ex); 2724 if (err) 2725 goto fix_extent_len; 2726 ex->ee_block = orig_ex.ee_block; 2727 ex->ee_len = orig_ex.ee_len; 2728 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 2729 ext4_ext_dirty(handle, inode, path + depth); 2730 /* blocks available from map->m_lblk */ 2731 return allocated; 2732 2733 } else if (err) 2734 goto fix_extent_len; 2735 2736 /* 2737 * We need to zero out the second half because 2738 * an fallocate request can update file size and 2739 * converting the second half to initialized extent 2740 * implies that we can leak some junk data to user 2741 * space. 2742 */ 2743 err = ext4_ext_zeroout(inode, ex3); 2744 if (err) { 2745 /* 2746 * We should actually mark the 2747 * second half as uninit and return error 2748 * Insert would have changed the extent 2749 */ 2750 depth = ext_depth(inode); 2751 ext4_ext_drop_refs(path); 2752 path = ext4_ext_find_extent(inode, map->m_lblk, 2753 path); 2754 if (IS_ERR(path)) { 2755 err = PTR_ERR(path); 2756 return err; 2757 } 2758 /* get the second half extent details */ 2759 ex = path[depth].p_ext; 2760 err = ext4_ext_get_access(handle, inode, 2761 path + depth); 2762 if (err) 2763 return err; 2764 ext4_ext_mark_uninitialized(ex); 2765 ext4_ext_dirty(handle, inode, path + depth); 2766 return err; 2767 } 2768 2769 /* zeroed the second half */ 2770 return allocated; 2771 } 2772 ex3 = &newex; 2773 ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len); 2774 ext4_ext_store_pblock(ex3, newblock + map->m_len); 2775 ex3->ee_len = cpu_to_le16(allocated - map->m_len); 2776 ext4_ext_mark_uninitialized(ex3); 2777 err = ext4_ext_insert_extent(handle, inode, path, ex3, 0); 2778 if (err == -ENOSPC && may_zeroout) { 2779 err = ext4_ext_zeroout(inode, &orig_ex); 2780 if (err) 2781 goto fix_extent_len; 2782 /* update the extent length and mark as initialized */ 2783 ex->ee_block = orig_ex.ee_block; 2784 ex->ee_len = orig_ex.ee_len; 2785 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 2786 ext4_ext_dirty(handle, inode, path + depth); 2787 /* zeroed the full extent */ 2788 /* blocks available from map->m_lblk */ 2789 return allocated; 2790 2791 } else if (err) 2792 goto fix_extent_len; 2793 /* 2794 * The depth, and hence eh & ex might change 2795 * as part of the insert above. 
2796 */ 2797 newdepth = ext_depth(inode); 2798 /* 2799 * update the extent length after successful insert of the 2800 * split extent 2801 */ 2802 ee_len -= ext4_ext_get_actual_len(ex3); 2803 orig_ex.ee_len = cpu_to_le16(ee_len); 2804 may_zeroout = ee_block + ee_len <= eof_block; 2805 2806 depth = newdepth; 2807 ext4_ext_drop_refs(path); 2808 path = ext4_ext_find_extent(inode, map->m_lblk, path); 2809 if (IS_ERR(path)) { 2810 err = PTR_ERR(path); 2811 goto out; 2812 } 2813 eh = path[depth].p_hdr; 2814 ex = path[depth].p_ext; 2815 if (ex2 != &newex) 2816 ex2 = ex; 2817 2818 err = ext4_ext_get_access(handle, inode, path + depth); 2819 if (err) 2820 goto out; 2821 2822 allocated = map->m_len; 2823 2824 /* If extent has less than EXT4_EXT_ZERO_LEN and we are trying 2825 * to insert a extent in the middle zerout directly 2826 * otherwise give the extent a chance to merge to left 2827 */ 2828 if (le16_to_cpu(orig_ex.ee_len) <= EXT4_EXT_ZERO_LEN && 2829 map->m_lblk != ee_block && may_zeroout) { 2830 err = ext4_ext_zeroout(inode, &orig_ex); 2831 if (err) 2832 goto fix_extent_len; 2833 /* update the extent length and mark as initialized */ 2834 ex->ee_block = orig_ex.ee_block; 2835 ex->ee_len = orig_ex.ee_len; 2836 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 2837 ext4_ext_dirty(handle, inode, path + depth); 2838 /* zero out the first half */ 2839 /* blocks available from map->m_lblk */ 2840 return allocated; 2841 } 2842 } 2843 /* 2844 * If there was a change of depth as part of the 2845 * insertion of ex3 above, we need to update the length 2846 * of the ex1 extent again here 2847 */ 2848 if (ex1 && ex1 != ex) { 2849 ex1 = ex; 2850 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block); 2851 ext4_ext_mark_uninitialized(ex1); 2852 ex2 = &newex; 2853 } 2854 /* ex2: map->m_lblk to map->m_lblk + maxblocks-1 : initialised */ 2855 ex2->ee_block = cpu_to_le32(map->m_lblk); 2856 ext4_ext_store_pblock(ex2, newblock); 2857 ex2->ee_len = cpu_to_le16(allocated); 2858 if (ex2 != ex) 2859 goto insert; 2860 /* 2861 * New (initialized) extent starts from the first block 2862 * in the current extent. i.e., ex2 == ex 2863 * We have to see if it can be merged with the extent 2864 * on the left. 2865 */ 2866 if (ex2 > EXT_FIRST_EXTENT(eh)) { 2867 /* 2868 * To merge left, pass "ex2 - 1" to try_to_merge(), 2869 * since it merges towards right _only_. 2870 */ 2871 ret = ext4_ext_try_to_merge(inode, path, ex2 - 1); 2872 if (ret) { 2873 err = ext4_ext_correct_indexes(handle, inode, path); 2874 if (err) 2875 goto out; 2876 depth = ext_depth(inode); 2877 ex2--; 2878 } 2879 } 2880 /* 2881 * Try to Merge towards right. This might be required 2882 * only when the whole extent is being written to. 2883 * i.e. ex2 == ex and ex3 == NULL. 
2884 */ 2885 if (!ex3) { 2886 ret = ext4_ext_try_to_merge(inode, path, ex2); 2887 if (ret) { 2888 err = ext4_ext_correct_indexes(handle, inode, path); 2889 if (err) 2890 goto out; 2891 } 2892 } 2893 /* Mark modified extent as dirty */ 2894 err = ext4_ext_dirty(handle, inode, path + depth); 2895 goto out; 2896 insert: 2897 err = ext4_ext_insert_extent(handle, inode, path, &newex, 0); 2898 if (err == -ENOSPC && may_zeroout) { 2899 err = ext4_ext_zeroout(inode, &orig_ex); 2900 if (err) 2901 goto fix_extent_len; 2902 /* update the extent length and mark as initialized */ 2903 ex->ee_block = orig_ex.ee_block; 2904 ex->ee_len = orig_ex.ee_len; 2905 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 2906 ext4_ext_dirty(handle, inode, path + depth); 2907 /* zero out the first half */ 2908 return allocated; 2909 } else if (err) 2910 goto fix_extent_len; 2911 out: 2912 ext4_ext_show_leaf(inode, path); 2913 return err ? err : allocated; 2914 2915 fix_extent_len: 2916 ex->ee_block = orig_ex.ee_block; 2917 ex->ee_len = orig_ex.ee_len; 2918 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 2919 ext4_ext_mark_uninitialized(ex); 2920 ext4_ext_dirty(handle, inode, path + depth); 2921 return err; 2922 } 2923 2924 /* 2925 * This function is called by ext4_ext_map_blocks() from 2926 * ext4_get_blocks_dio_write() when DIO is used to write 2927 * to an uninitialized extent. 2928 * 2929 * Writing to an uninitialized extent may result in splitting the uninitialized 2930 * extent into multiple initialized/uninitialized extents (up to three). 2931 * There are three possibilities: 2932 * a> There is no split required: Entire extent should be uninitialized 2933 * b> Splits in two extents: Write is happening at either end of the extent 2934 * c> Splits in three extents: Someone is writing in the middle of the extent 2935 * 2936 * One or more index blocks may be needed if the extent tree grows after 2937 * the uninitialized extent split. To prevent ENOSPC from occurring at IO 2938 * completion, we need to split the uninitialized extent before DIO submits 2939 * the IO. The uninitialized extent found at this time will be split 2940 * into three uninitialized extents (at most). After IO completes, the part 2941 * being filled will be converted to initialized by the end_io callback function 2942 * via ext4_convert_unwritten_extents(). 2943 * 2944 * Returns the size of the uninitialized extent to be written on success.
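 *
 * As an illustrative sketch: a DIO write covering 8 blocks in the middle of
 * a 24-block uninitialized extent is split here into three 8-block
 * uninitialized extents before the IO is submitted; only the middle one is
 * later converted to initialized by the end_io callback.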
2945 */ 2946 static int ext4_split_unwritten_extents(handle_t *handle, 2947 struct inode *inode, 2948 struct ext4_map_blocks *map, 2949 struct ext4_ext_path *path, 2950 int flags) 2951 { 2952 struct ext4_extent *ex, newex, orig_ex; 2953 struct ext4_extent *ex1 = NULL; 2954 struct ext4_extent *ex2 = NULL; 2955 struct ext4_extent *ex3 = NULL; 2956 ext4_lblk_t ee_block, eof_block; 2957 unsigned int allocated, ee_len, depth; 2958 ext4_fsblk_t newblock; 2959 int err = 0; 2960 int may_zeroout; 2961 2962 ext_debug("ext4_split_unwritten_extents: inode %lu, logical" 2963 "block %llu, max_blocks %u\n", inode->i_ino, 2964 (unsigned long long)map->m_lblk, map->m_len); 2965 2966 eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >> 2967 inode->i_sb->s_blocksize_bits; 2968 if (eof_block < map->m_lblk + map->m_len) 2969 eof_block = map->m_lblk + map->m_len; 2970 2971 depth = ext_depth(inode); 2972 ex = path[depth].p_ext; 2973 ee_block = le32_to_cpu(ex->ee_block); 2974 ee_len = ext4_ext_get_actual_len(ex); 2975 allocated = ee_len - (map->m_lblk - ee_block); 2976 newblock = map->m_lblk - ee_block + ext_pblock(ex); 2977 2978 ex2 = ex; 2979 orig_ex.ee_block = ex->ee_block; 2980 orig_ex.ee_len = cpu_to_le16(ee_len); 2981 ext4_ext_store_pblock(&orig_ex, ext_pblock(ex)); 2982 2983 /* 2984 * It is safe to convert extent to initialized via explicit 2985 * zeroout only if extent is fully insde i_size or new_size. 2986 */ 2987 may_zeroout = ee_block + ee_len <= eof_block; 2988 2989 /* 2990 * If the uninitialized extent begins at the same logical 2991 * block where the write begins, and the write completely 2992 * covers the extent, then we don't need to split it. 2993 */ 2994 if ((map->m_lblk == ee_block) && (allocated <= map->m_len)) 2995 return allocated; 2996 2997 err = ext4_ext_get_access(handle, inode, path + depth); 2998 if (err) 2999 goto out; 3000 /* ex1: ee_block to map->m_lblk - 1 : uninitialized */ 3001 if (map->m_lblk > ee_block) { 3002 ex1 = ex; 3003 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block); 3004 ext4_ext_mark_uninitialized(ex1); 3005 ex2 = &newex; 3006 } 3007 /* 3008 * for sanity, update the length of the ex2 extent before 3009 * we insert ex3, if ex1 is NULL. This is to avoid temporary 3010 * overlap of blocks. 3011 */ 3012 if (!ex1 && allocated > map->m_len) 3013 ex2->ee_len = cpu_to_le16(map->m_len); 3014 /* ex3: to ee_block + ee_len : uninitialised */ 3015 if (allocated > map->m_len) { 3016 unsigned int newdepth; 3017 ex3 = &newex; 3018 ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len); 3019 ext4_ext_store_pblock(ex3, newblock + map->m_len); 3020 ex3->ee_len = cpu_to_le16(allocated - map->m_len); 3021 ext4_ext_mark_uninitialized(ex3); 3022 err = ext4_ext_insert_extent(handle, inode, path, ex3, flags); 3023 if (err == -ENOSPC && may_zeroout) { 3024 err = ext4_ext_zeroout(inode, &orig_ex); 3025 if (err) 3026 goto fix_extent_len; 3027 /* update the extent length and mark as initialized */ 3028 ex->ee_block = orig_ex.ee_block; 3029 ex->ee_len = orig_ex.ee_len; 3030 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 3031 ext4_ext_dirty(handle, inode, path + depth); 3032 /* zeroed the full extent */ 3033 /* blocks available from map->m_lblk */ 3034 return allocated; 3035 3036 } else if (err) 3037 goto fix_extent_len; 3038 /* 3039 * The depth, and hence eh & ex might change 3040 * as part of the insert above. 
3041 */ 3042 newdepth = ext_depth(inode); 3043 /* 3044 * update the extent length after successful insert of the 3045 * split extent 3046 */ 3047 ee_len -= ext4_ext_get_actual_len(ex3); 3048 orig_ex.ee_len = cpu_to_le16(ee_len); 3049 may_zeroout = ee_block + ee_len <= eof_block; 3050 3051 depth = newdepth; 3052 ext4_ext_drop_refs(path); 3053 path = ext4_ext_find_extent(inode, map->m_lblk, path); 3054 if (IS_ERR(path)) { 3055 err = PTR_ERR(path); 3056 goto out; 3057 } 3058 ex = path[depth].p_ext; 3059 if (ex2 != &newex) 3060 ex2 = ex; 3061 3062 err = ext4_ext_get_access(handle, inode, path + depth); 3063 if (err) 3064 goto out; 3065 3066 allocated = map->m_len; 3067 } 3068 /* 3069 * If there was a change of depth as part of the 3070 * insertion of ex3 above, we need to update the length 3071 * of the ex1 extent again here 3072 */ 3073 if (ex1 && ex1 != ex) { 3074 ex1 = ex; 3075 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block); 3076 ext4_ext_mark_uninitialized(ex1); 3077 ex2 = &newex; 3078 } 3079 /* 3080 * ex2: map->m_lblk to map->m_lblk + map->m_len-1 : to be written 3081 * using direct I/O, uninitialised still. 3082 */ 3083 ex2->ee_block = cpu_to_le32(map->m_lblk); 3084 ext4_ext_store_pblock(ex2, newblock); 3085 ex2->ee_len = cpu_to_le16(allocated); 3086 ext4_ext_mark_uninitialized(ex2); 3087 if (ex2 != ex) 3088 goto insert; 3089 /* Mark modified extent as dirty */ 3090 err = ext4_ext_dirty(handle, inode, path + depth); 3091 ext_debug("out here\n"); 3092 goto out; 3093 insert: 3094 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); 3095 if (err == -ENOSPC && may_zeroout) { 3096 err = ext4_ext_zeroout(inode, &orig_ex); 3097 if (err) 3098 goto fix_extent_len; 3099 /* update the extent length and mark as initialized */ 3100 ex->ee_block = orig_ex.ee_block; 3101 ex->ee_len = orig_ex.ee_len; 3102 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 3103 ext4_ext_dirty(handle, inode, path + depth); 3104 /* zero out the first half */ 3105 return allocated; 3106 } else if (err) 3107 goto fix_extent_len; 3108 out: 3109 ext4_ext_show_leaf(inode, path); 3110 return err ? err : allocated; 3111 3112 fix_extent_len: 3113 ex->ee_block = orig_ex.ee_block; 3114 ex->ee_len = orig_ex.ee_len; 3115 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 3116 ext4_ext_mark_uninitialized(ex); 3117 ext4_ext_dirty(handle, inode, path + depth); 3118 return err; 3119 } 3120 static int ext4_convert_unwritten_extents_endio(handle_t *handle, 3121 struct inode *inode, 3122 struct ext4_ext_path *path) 3123 { 3124 struct ext4_extent *ex; 3125 struct ext4_extent_header *eh; 3126 int depth; 3127 int err = 0; 3128 int ret = 0; 3129 3130 depth = ext_depth(inode); 3131 eh = path[depth].p_hdr; 3132 ex = path[depth].p_ext; 3133 3134 err = ext4_ext_get_access(handle, inode, path + depth); 3135 if (err) 3136 goto out; 3137 /* first mark the extent as initialized */ 3138 ext4_ext_mark_initialized(ex); 3139 3140 /* 3141 * We have to see if it can be merged with the extent 3142 * on the left. 3143 */ 3144 if (ex > EXT_FIRST_EXTENT(eh)) { 3145 /* 3146 * To merge left, pass "ex - 1" to try_to_merge(), 3147 * since it merges towards right _only_. 3148 */ 3149 ret = ext4_ext_try_to_merge(inode, path, ex - 1); 3150 if (ret) { 3151 err = ext4_ext_correct_indexes(handle, inode, path); 3152 if (err) 3153 goto out; 3154 depth = ext_depth(inode); 3155 ex--; 3156 } 3157 } 3158 /* 3159 * Try to Merge towards right. 
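 * ext4_ext_try_to_merge() merges an extent only with its right neighbour,
 * which is why the left merge above is attempted by passing "ex - 1" while
 * this call passes "ex" itself.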
3160 */ 3161 ret = ext4_ext_try_to_merge(inode, path, ex); 3162 if (ret) { 3163 err = ext4_ext_correct_indexes(handle, inode, path); 3164 if (err) 3165 goto out; 3166 depth = ext_depth(inode); 3167 } 3168 /* Mark modified extent as dirty */ 3169 err = ext4_ext_dirty(handle, inode, path + depth); 3170 out: 3171 ext4_ext_show_leaf(inode, path); 3172 return err; 3173 } 3174 3175 static void unmap_underlying_metadata_blocks(struct block_device *bdev, 3176 sector_t block, int count) 3177 { 3178 int i; 3179 for (i = 0; i < count; i++) 3180 unmap_underlying_metadata(bdev, block + i); 3181 } 3182 3183 static int 3184 ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, 3185 struct ext4_map_blocks *map, 3186 struct ext4_ext_path *path, int flags, 3187 unsigned int allocated, ext4_fsblk_t newblock) 3188 { 3189 int ret = 0; 3190 int err = 0; 3191 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; 3192 3193 ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical" 3194 "block %llu, max_blocks %u, flags %d, allocated %u", 3195 inode->i_ino, (unsigned long long)map->m_lblk, map->m_len, 3196 flags, allocated); 3197 ext4_ext_show_leaf(inode, path); 3198 3199 /* get_block() before submit the IO, split the extent */ 3200 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { 3201 ret = ext4_split_unwritten_extents(handle, inode, map, 3202 path, flags); 3203 /* 3204 * Flag the inode(non aio case) or end_io struct (aio case) 3205 * that this IO needs to convertion to written when IO is 3206 * completed 3207 */ 3208 if (io) 3209 io->flag = EXT4_IO_UNWRITTEN; 3210 else 3211 ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); 3212 if (ext4_should_dioread_nolock(inode)) 3213 map->m_flags |= EXT4_MAP_UNINIT; 3214 goto out; 3215 } 3216 /* IO end_io complete, convert the filled extent to written */ 3217 if ((flags & EXT4_GET_BLOCKS_CONVERT)) { 3218 ret = ext4_convert_unwritten_extents_endio(handle, inode, 3219 path); 3220 if (ret >= 0) 3221 ext4_update_inode_fsync_trans(handle, inode, 1); 3222 goto out2; 3223 } 3224 /* buffered IO case */ 3225 /* 3226 * repeat fallocate creation request 3227 * we already have an unwritten extent 3228 */ 3229 if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) 3230 goto map_out; 3231 3232 /* buffered READ or buffered write_begin() lookup */ 3233 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { 3234 /* 3235 * We have blocks reserved already. We 3236 * return allocated blocks so that delalloc 3237 * won't do block reservation for us. But 3238 * the buffer head will be unmapped so that 3239 * a read from the block returns 0s. 3240 */ 3241 map->m_flags |= EXT4_MAP_UNWRITTEN; 3242 goto out1; 3243 } 3244 3245 /* buffered write, writepage time, convert*/ 3246 ret = ext4_ext_convert_to_initialized(handle, inode, map, path); 3247 if (ret >= 0) 3248 ext4_update_inode_fsync_trans(handle, inode, 1); 3249 out: 3250 if (ret <= 0) { 3251 err = ret; 3252 goto out2; 3253 } else 3254 allocated = ret; 3255 map->m_flags |= EXT4_MAP_NEW; 3256 /* 3257 * if we allocated more blocks than requested 3258 * we need to make sure we unmap the extra block 3259 * allocated. The actual needed block will get 3260 * unmapped later when we find the buffer_head marked 3261 * new. 
3262 */ 3263 if (allocated > map->m_len) { 3264 unmap_underlying_metadata_blocks(inode->i_sb->s_bdev, 3265 newblock + map->m_len, 3266 allocated - map->m_len); 3267 allocated = map->m_len; 3268 } 3269 3270 /* 3271 * If we have done fallocate with the offset that is already 3272 * delayed allocated, we would have block reservation 3273 * and quota reservation done in the delayed write path. 3274 * But fallocate would have already updated quota and block 3275 * count for this offset. So cancel these reservation 3276 */ 3277 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) 3278 ext4_da_update_reserve_space(inode, allocated, 0); 3279 3280 map_out: 3281 map->m_flags |= EXT4_MAP_MAPPED; 3282 out1: 3283 if (allocated > map->m_len) 3284 allocated = map->m_len; 3285 ext4_ext_show_leaf(inode, path); 3286 map->m_pblk = newblock; 3287 map->m_len = allocated; 3288 out2: 3289 if (path) { 3290 ext4_ext_drop_refs(path); 3291 kfree(path); 3292 } 3293 return err ? err : allocated; 3294 } 3295 /* 3296 * Block allocation/map/preallocation routine for extents based files 3297 * 3298 * 3299 * Need to be called with 3300 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block 3301 * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem) 3302 * 3303 * return > 0, number of of blocks already mapped/allocated 3304 * if create == 0 and these are pre-allocated blocks 3305 * buffer head is unmapped 3306 * otherwise blocks are mapped 3307 * 3308 * return = 0, if plain look up failed (blocks have not been allocated) 3309 * buffer head is unmapped 3310 * 3311 * return < 0, error case. 3312 */ 3313 int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, 3314 struct ext4_map_blocks *map, int flags) 3315 { 3316 struct ext4_ext_path *path = NULL; 3317 struct ext4_extent_header *eh; 3318 struct ext4_extent newex, *ex, *last_ex; 3319 ext4_fsblk_t newblock; 3320 int i, err = 0, depth, ret, cache_type; 3321 unsigned int allocated = 0; 3322 struct ext4_allocation_request ar; 3323 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; 3324 3325 ext_debug("blocks %u/%u requested for inode %lu\n", 3326 map->m_lblk, map->m_len, inode->i_ino); 3327 3328 /* check in cache */ 3329 cache_type = ext4_ext_in_cache(inode, map->m_lblk, &newex); 3330 if (cache_type) { 3331 if (cache_type == EXT4_EXT_CACHE_GAP) { 3332 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { 3333 /* 3334 * block isn't allocated yet and 3335 * user doesn't want to allocate it 3336 */ 3337 goto out2; 3338 } 3339 /* we should allocate requested block */ 3340 } else if (cache_type == EXT4_EXT_CACHE_EXTENT) { 3341 /* block is already allocated */ 3342 newblock = map->m_lblk 3343 - le32_to_cpu(newex.ee_block) 3344 + ext_pblock(&newex); 3345 /* number of remaining blocks in the extent */ 3346 allocated = ext4_ext_get_actual_len(&newex) - 3347 (map->m_lblk - le32_to_cpu(newex.ee_block)); 3348 goto out; 3349 } else { 3350 BUG(); 3351 } 3352 } 3353 3354 /* find extent for this block */ 3355 path = ext4_ext_find_extent(inode, map->m_lblk, NULL); 3356 if (IS_ERR(path)) { 3357 err = PTR_ERR(path); 3358 path = NULL; 3359 goto out2; 3360 } 3361 3362 depth = ext_depth(inode); 3363 3364 /* 3365 * consistent leaf must not be empty; 3366 * this situation is possible, though, _during_ tree modification; 3367 * this is why assert can't be put in ext4_ext_find_extent() 3368 */ 3369 if (unlikely(path[depth].p_ext == NULL && depth != 0)) { 3370 EXT4_ERROR_INODE(inode, "bad extent address " 3371 "lblock: %lu, depth: %d pblock %lld", 3372 (unsigned long) 
map->m_lblk, depth, 3373 path[depth].p_block); 3374 err = -EIO; 3375 goto out2; 3376 } 3377 eh = path[depth].p_hdr; 3378 3379 ex = path[depth].p_ext; 3380 if (ex) { 3381 ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block); 3382 ext4_fsblk_t ee_start = ext_pblock(ex); 3383 unsigned short ee_len; 3384 3385 /* 3386 * Uninitialized extents are treated as holes, except that 3387 * we split out initialized portions during a write. 3388 */ 3389 ee_len = ext4_ext_get_actual_len(ex); 3390 /* if found extent covers block, simply return it */ 3391 if (in_range(map->m_lblk, ee_block, ee_len)) { 3392 newblock = map->m_lblk - ee_block + ee_start; 3393 /* number of remaining blocks in the extent */ 3394 allocated = ee_len - (map->m_lblk - ee_block); 3395 ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk, 3396 ee_block, ee_len, newblock); 3397 3398 /* Do not put uninitialized extent in the cache */ 3399 if (!ext4_ext_is_uninitialized(ex)) { 3400 ext4_ext_put_in_cache(inode, ee_block, 3401 ee_len, ee_start, 3402 EXT4_EXT_CACHE_EXTENT); 3403 goto out; 3404 } 3405 ret = ext4_ext_handle_uninitialized_extents(handle, 3406 inode, map, path, flags, allocated, 3407 newblock); 3408 return ret; 3409 } 3410 } 3411 3412 /* 3413 * requested block isn't allocated yet; 3414 * we couldn't try to create block if create flag is zero 3415 */ 3416 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { 3417 /* 3418 * put just found gap into cache to speed up 3419 * subsequent requests 3420 */ 3421 ext4_ext_put_gap_in_cache(inode, path, map->m_lblk); 3422 goto out2; 3423 } 3424 /* 3425 * Okay, we need to do block allocation. 3426 */ 3427 3428 /* find neighbour allocated blocks */ 3429 ar.lleft = map->m_lblk; 3430 err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft); 3431 if (err) 3432 goto out2; 3433 ar.lright = map->m_lblk; 3434 err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright); 3435 if (err) 3436 goto out2; 3437 3438 /* 3439 * See if request is beyond maximum number of blocks we can have in 3440 * a single extent. For an initialized extent this limit is 3441 * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is 3442 * EXT_UNINIT_MAX_LEN. 
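 * (EXT_INIT_MAX_LEN is 1 << 15 = 32768 blocks, and EXT_UNINIT_MAX_LEN is one
 * block less because the top bit of ee_len is used to mark an extent
 * uninitialized; with 4KiB blocks that caps a single extent at 128MiB.)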
3443 */ 3444 if (map->m_len > EXT_INIT_MAX_LEN && 3445 !(flags & EXT4_GET_BLOCKS_UNINIT_EXT)) 3446 map->m_len = EXT_INIT_MAX_LEN; 3447 else if (map->m_len > EXT_UNINIT_MAX_LEN && 3448 (flags & EXT4_GET_BLOCKS_UNINIT_EXT)) 3449 map->m_len = EXT_UNINIT_MAX_LEN; 3450 3451 /* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */ 3452 newex.ee_block = cpu_to_le32(map->m_lblk); 3453 newex.ee_len = cpu_to_le16(map->m_len); 3454 err = ext4_ext_check_overlap(inode, &newex, path); 3455 if (err) 3456 allocated = ext4_ext_get_actual_len(&newex); 3457 else 3458 allocated = map->m_len; 3459 3460 /* allocate new block */ 3461 ar.inode = inode; 3462 ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk); 3463 ar.logical = map->m_lblk; 3464 ar.len = allocated; 3465 if (S_ISREG(inode->i_mode)) 3466 ar.flags = EXT4_MB_HINT_DATA; 3467 else 3468 /* disable in-core preallocation for non-regular files */ 3469 ar.flags = 0; 3470 newblock = ext4_mb_new_blocks(handle, &ar, &err); 3471 if (!newblock) 3472 goto out2; 3473 ext_debug("allocate new block: goal %llu, found %llu/%u\n", 3474 ar.goal, newblock, allocated); 3475 3476 /* try to insert new extent into found leaf and return */ 3477 ext4_ext_store_pblock(&newex, newblock); 3478 newex.ee_len = cpu_to_le16(ar.len); 3479 /* Mark uninitialized */ 3480 if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){ 3481 ext4_ext_mark_uninitialized(&newex); 3482 /* 3483 * io_end structure was created for every IO write to an 3484 * uninitialized extent. To avoid unecessary conversion, 3485 * here we flag the IO that really needs the conversion. 3486 * For non asycn direct IO case, flag the inode state 3487 * that we need to perform convertion when IO is done. 3488 */ 3489 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { 3490 if (io) 3491 io->flag = EXT4_IO_UNWRITTEN; 3492 else 3493 ext4_set_inode_state(inode, 3494 EXT4_STATE_DIO_UNWRITTEN); 3495 } 3496 if (ext4_should_dioread_nolock(inode)) 3497 map->m_flags |= EXT4_MAP_UNINIT; 3498 } 3499 3500 if (unlikely(ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))) { 3501 if (unlikely(!eh->eh_entries)) { 3502 EXT4_ERROR_INODE(inode, 3503 "eh->eh_entries == 0 and " 3504 "EOFBLOCKS_FL set"); 3505 err = -EIO; 3506 goto out2; 3507 } 3508 last_ex = EXT_LAST_EXTENT(eh); 3509 /* 3510 * If the current leaf block was reached by looking at 3511 * the last index block all the way down the tree, and 3512 * we are extending the inode beyond the last extent 3513 * in the current leaf block, then clear the 3514 * EOFBLOCKS_FL flag. 
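 * The loop below walks the saved path from the leaf back towards the root
 * and gives up as soon as some level did not use its last index; only if
 * every level did (i reaches -1) can this leaf be the rightmost one.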
3515 */ 3516 for (i = depth-1; i >= 0; i--) { 3517 if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr)) 3518 break; 3519 } 3520 if ((i < 0) && 3521 (map->m_lblk + ar.len > le32_to_cpu(last_ex->ee_block) + 3522 ext4_ext_get_actual_len(last_ex))) 3523 ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS); 3524 } 3525 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); 3526 if (err) { 3527 /* free data blocks we just allocated */ 3528 /* not a good idea to call discard here directly, 3529 * but otherwise we'd need to call it every free() */ 3530 ext4_discard_preallocations(inode); 3531 ext4_free_blocks(handle, inode, 0, ext_pblock(&newex), 3532 ext4_ext_get_actual_len(&newex), 0); 3533 goto out2; 3534 } 3535 3536 /* previous routine could use block we allocated */ 3537 newblock = ext_pblock(&newex); 3538 allocated = ext4_ext_get_actual_len(&newex); 3539 if (allocated > map->m_len) 3540 allocated = map->m_len; 3541 map->m_flags |= EXT4_MAP_NEW; 3542 3543 /* 3544 * Update reserved blocks/metadata blocks after successful 3545 * block allocation which had been deferred till now. 3546 */ 3547 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) 3548 ext4_da_update_reserve_space(inode, allocated, 1); 3549 3550 /* 3551 * Cache the extent and update transaction to commit on fdatasync only 3552 * when it is _not_ an uninitialized extent. 3553 */ 3554 if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) { 3555 ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock, 3556 EXT4_EXT_CACHE_EXTENT); 3557 ext4_update_inode_fsync_trans(handle, inode, 1); 3558 } else 3559 ext4_update_inode_fsync_trans(handle, inode, 0); 3560 out: 3561 if (allocated > map->m_len) 3562 allocated = map->m_len; 3563 ext4_ext_show_leaf(inode, path); 3564 map->m_flags |= EXT4_MAP_MAPPED; 3565 map->m_pblk = newblock; 3566 map->m_len = allocated; 3567 out2: 3568 if (path) { 3569 ext4_ext_drop_refs(path); 3570 kfree(path); 3571 } 3572 return err ? err : allocated; 3573 } 3574 3575 void ext4_ext_truncate(struct inode *inode) 3576 { 3577 struct address_space *mapping = inode->i_mapping; 3578 struct super_block *sb = inode->i_sb; 3579 ext4_lblk_t last_block; 3580 handle_t *handle; 3581 int err = 0; 3582 3583 /* 3584 * probably first extent we're gonna free will be last in block 3585 */ 3586 err = ext4_writepage_trans_blocks(inode); 3587 handle = ext4_journal_start(inode, err); 3588 if (IS_ERR(handle)) 3589 return; 3590 3591 if (inode->i_size & (sb->s_blocksize - 1)) 3592 ext4_block_truncate_page(handle, mapping, inode->i_size); 3593 3594 if (ext4_orphan_add(handle, inode)) 3595 goto out_stop; 3596 3597 down_write(&EXT4_I(inode)->i_data_sem); 3598 ext4_ext_invalidate_cache(inode); 3599 3600 ext4_discard_preallocations(inode); 3601 3602 /* 3603 * TODO: optimization is possible here. 3604 * Probably we need not scan at all, 3605 * because page truncation is enough. 3606 */ 3607 3608 /* we have to know where to truncate from in crash case */ 3609 EXT4_I(inode)->i_disksize = inode->i_size; 3610 ext4_mark_inode_dirty(handle, inode); 3611 3612 last_block = (inode->i_size + sb->s_blocksize - 1) 3613 >> EXT4_BLOCK_SIZE_BITS(sb); 3614 err = ext4_ext_remove_space(inode, last_block); 3615 3616 /* In a multi-transaction truncate, we only make the final 3617 * transaction synchronous. 
3618 */ 3619 if (IS_SYNC(inode)) 3620 ext4_handle_sync(handle); 3621 3622 out_stop: 3623 up_write(&EXT4_I(inode)->i_data_sem); 3624 /* 3625 * If this was a simple ftruncate() and the file will remain alive, 3626 * then we need to clear up the orphan record which we created above. 3627 * However, if this was a real unlink then we were called by 3628 * ext4_delete_inode(), and we allow that function to clean up the 3629 * orphan info for us. 3630 */ 3631 if (inode->i_nlink) 3632 ext4_orphan_del(handle, inode); 3633 3634 inode->i_mtime = inode->i_ctime = ext4_current_time(inode); 3635 ext4_mark_inode_dirty(handle, inode); 3636 ext4_journal_stop(handle); 3637 } 3638 3639 static void ext4_falloc_update_inode(struct inode *inode, 3640 int mode, loff_t new_size, int update_ctime) 3641 { 3642 struct timespec now; 3643 3644 if (update_ctime) { 3645 now = current_fs_time(inode->i_sb); 3646 if (!timespec_equal(&inode->i_ctime, &now)) 3647 inode->i_ctime = now; 3648 } 3649 /* 3650 * Update only when preallocation was requested beyond 3651 * the file size. 3652 */ 3653 if (!(mode & FALLOC_FL_KEEP_SIZE)) { 3654 if (new_size > i_size_read(inode)) 3655 i_size_write(inode, new_size); 3656 if (new_size > EXT4_I(inode)->i_disksize) 3657 ext4_update_i_disksize(inode, new_size); 3658 } else { 3659 /* 3660 * Mark that we allocate beyond EOF so the subsequent truncate 3661 * can proceed even if the new size is the same as i_size. 3662 */ 3663 if (new_size > i_size_read(inode)) 3664 ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS); 3665 } 3666 3667 } 3668 3669 /* 3670 * preallocate space for a file. This implements ext4's fallocate inode 3671 * operation, which gets called from sys_fallocate system call. 3672 * For block-mapped files, posix_fallocate should fall back to the method 3673 * of writing zeroes to the required new blocks (the same behavior which is 3674 * expected for file systems which do not support fallocate() system call). 
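 *
 * As a worked example of the block-range computation done below: with 4KiB
 * blocks, offset = 3072 and len = 2048 touch bytes 3072..5119, so
 * map.m_lblk = 3072 >> 12 = 0 and
 * max_blocks = (EXT4_BLOCK_ALIGN(5120, 12) >> 12) - 0 = 2.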
3675 */ 3676 long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) 3677 { 3678 handle_t *handle; 3679 loff_t new_size; 3680 unsigned int max_blocks; 3681 int ret = 0; 3682 int ret2 = 0; 3683 int retries = 0; 3684 struct ext4_map_blocks map; 3685 unsigned int credits, blkbits = inode->i_blkbits; 3686 3687 /* 3688 * currently supporting (pre)allocate mode for extent-based 3689 * files _only_ 3690 */ 3691 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 3692 return -EOPNOTSUPP; 3693 3694 /* preallocation to directories is currently not supported */ 3695 if (S_ISDIR(inode->i_mode)) 3696 return -ENODEV; 3697 3698 map.m_lblk = offset >> blkbits; 3699 /* 3700 * We can't just convert len to max_blocks because 3701 * If blocksize = 4096 offset = 3072 and len = 2048 3702 */ 3703 max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) 3704 - map.m_lblk; 3705 /* 3706 * credits to insert 1 extent into extent tree 3707 */ 3708 credits = ext4_chunk_trans_blocks(inode, max_blocks); 3709 mutex_lock(&inode->i_mutex); 3710 ret = inode_newsize_ok(inode, (len + offset)); 3711 if (ret) { 3712 mutex_unlock(&inode->i_mutex); 3713 return ret; 3714 } 3715 retry: 3716 while (ret >= 0 && ret < max_blocks) { 3717 map.m_lblk = map.m_lblk + ret; 3718 map.m_len = max_blocks = max_blocks - ret; 3719 handle = ext4_journal_start(inode, credits); 3720 if (IS_ERR(handle)) { 3721 ret = PTR_ERR(handle); 3722 break; 3723 } 3724 ret = ext4_map_blocks(handle, inode, &map, 3725 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT); 3726 if (ret <= 0) { 3727 #ifdef EXT4FS_DEBUG 3728 WARN_ON(ret <= 0); 3729 printk(KERN_ERR "%s: ext4_ext_map_blocks " 3730 "returned error inode#%lu, block=%u, " 3731 "max_blocks=%u", __func__, 3732 inode->i_ino, block, max_blocks); 3733 #endif 3734 ext4_mark_inode_dirty(handle, inode); 3735 ret2 = ext4_journal_stop(handle); 3736 break; 3737 } 3738 if ((map.m_lblk + ret) >= (EXT4_BLOCK_ALIGN(offset + len, 3739 blkbits) >> blkbits)) 3740 new_size = offset + len; 3741 else 3742 new_size = (map.m_lblk + ret) << blkbits; 3743 3744 ext4_falloc_update_inode(inode, mode, new_size, 3745 (map.m_flags & EXT4_MAP_NEW)); 3746 ext4_mark_inode_dirty(handle, inode); 3747 ret2 = ext4_journal_stop(handle); 3748 if (ret2) 3749 break; 3750 } 3751 if (ret == -ENOSPC && 3752 ext4_should_retry_alloc(inode->i_sb, &retries)) { 3753 ret = 0; 3754 goto retry; 3755 } 3756 mutex_unlock(&inode->i_mutex); 3757 return ret > 0 ? ret2 : ret; 3758 } 3759 3760 /* 3761 * This function convert a range of blocks to written extents 3762 * The caller of this function will pass the start offset and the size. 3763 * all unwritten extents within this range will be converted to 3764 * written extents. 3765 * 3766 * This function is called from the direct IO end io call back 3767 * function, to convert the fallocated extents after IO is completed. 3768 * Returns 0 on success. 
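 *
 * For instance (illustrative only), when a 1MiB direct IO write into
 * preallocated space completes, the end_io path ends up doing roughly
 *
 *	ext4_convert_unwritten_extents(inode, offset, 1 << 20);
 *
 * after which every unwritten extent overlapping those 256 blocks (with
 * 4KiB blocks) has been marked written.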
3769 */ 3770 int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, 3771 ssize_t len) 3772 { 3773 handle_t *handle; 3774 unsigned int max_blocks; 3775 int ret = 0; 3776 int ret2 = 0; 3777 struct ext4_map_blocks map; 3778 unsigned int credits, blkbits = inode->i_blkbits; 3779 3780 map.m_lblk = offset >> blkbits; 3781 /* 3782 * We can't just convert len to max_blocks because 3783 * If blocksize = 4096 offset = 3072 and len = 2048 3784 */ 3785 max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) - 3786 map.m_lblk); 3787 /* 3788 * credits to insert 1 extent into extent tree 3789 */ 3790 credits = ext4_chunk_trans_blocks(inode, max_blocks); 3791 while (ret >= 0 && ret < max_blocks) { 3792 map.m_lblk += ret; 3793 map.m_len = (max_blocks -= ret); 3794 handle = ext4_journal_start(inode, credits); 3795 if (IS_ERR(handle)) { 3796 ret = PTR_ERR(handle); 3797 break; 3798 } 3799 ret = ext4_map_blocks(handle, inode, &map, 3800 EXT4_GET_BLOCKS_IO_CONVERT_EXT); 3801 if (ret <= 0) { 3802 WARN_ON(ret <= 0); 3803 printk(KERN_ERR "%s: ext4_ext_map_blocks " 3804 "returned error inode#%lu, block=%u, " 3805 "max_blocks=%u", __func__, 3806 inode->i_ino, map.m_lblk, map.m_len); 3807 } 3808 ext4_mark_inode_dirty(handle, inode); 3809 ret2 = ext4_journal_stop(handle); 3810 if (ret <= 0 || ret2 ) 3811 break; 3812 } 3813 return ret > 0 ? ret2 : ret; 3814 } 3815 /* 3816 * Callback function called for each extent to gather FIEMAP information. 3817 */ 3818 static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, 3819 struct ext4_ext_cache *newex, struct ext4_extent *ex, 3820 void *data) 3821 { 3822 struct fiemap_extent_info *fieinfo = data; 3823 unsigned char blksize_bits = inode->i_sb->s_blocksize_bits; 3824 __u64 logical; 3825 __u64 physical; 3826 __u64 length; 3827 __u32 flags = 0; 3828 int error; 3829 3830 logical = (__u64)newex->ec_block << blksize_bits; 3831 3832 if (newex->ec_type == EXT4_EXT_CACHE_GAP) { 3833 pgoff_t offset; 3834 struct page *page; 3835 struct buffer_head *bh = NULL; 3836 3837 offset = logical >> PAGE_SHIFT; 3838 page = find_get_page(inode->i_mapping, offset); 3839 if (!page || !page_has_buffers(page)) 3840 return EXT_CONTINUE; 3841 3842 bh = page_buffers(page); 3843 3844 if (!bh) 3845 return EXT_CONTINUE; 3846 3847 if (buffer_delay(bh)) { 3848 flags |= FIEMAP_EXTENT_DELALLOC; 3849 page_cache_release(page); 3850 } else { 3851 page_cache_release(page); 3852 return EXT_CONTINUE; 3853 } 3854 } 3855 3856 physical = (__u64)newex->ec_start << blksize_bits; 3857 length = (__u64)newex->ec_len << blksize_bits; 3858 3859 if (ex && ext4_ext_is_uninitialized(ex)) 3860 flags |= FIEMAP_EXTENT_UNWRITTEN; 3861 3862 /* 3863 * If this extent reaches EXT_MAX_BLOCK, it must be last. 3864 * 3865 * Or if ext4_ext_next_allocated_block is EXT_MAX_BLOCK, 3866 * this also indicates no more allocated blocks. 
3867 * 3868 * XXX this might miss a single-block extent at EXT_MAX_BLOCK 3869 */ 3870 if (ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK || 3871 newex->ec_block + newex->ec_len - 1 == EXT_MAX_BLOCK) { 3872 loff_t size = i_size_read(inode); 3873 loff_t bs = EXT4_BLOCK_SIZE(inode->i_sb); 3874 3875 flags |= FIEMAP_EXTENT_LAST; 3876 if ((flags & FIEMAP_EXTENT_DELALLOC) && 3877 logical+length > size) 3878 length = (size - logical + bs - 1) & ~(bs-1); 3879 } 3880 3881 error = fiemap_fill_next_extent(fieinfo, logical, physical, 3882 length, flags); 3883 if (error < 0) 3884 return error; 3885 if (error == 1) 3886 return EXT_BREAK; 3887 3888 return EXT_CONTINUE; 3889 } 3890 3891 /* fiemap flags we can handle specified here */ 3892 #define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) 3893 3894 static int ext4_xattr_fiemap(struct inode *inode, 3895 struct fiemap_extent_info *fieinfo) 3896 { 3897 __u64 physical = 0; 3898 __u64 length; 3899 __u32 flags = FIEMAP_EXTENT_LAST; 3900 int blockbits = inode->i_sb->s_blocksize_bits; 3901 int error = 0; 3902 3903 /* in-inode? */ 3904 if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { 3905 struct ext4_iloc iloc; 3906 int offset; /* offset of xattr in inode */ 3907 3908 error = ext4_get_inode_loc(inode, &iloc); 3909 if (error) 3910 return error; 3911 physical = iloc.bh->b_blocknr << blockbits; 3912 offset = EXT4_GOOD_OLD_INODE_SIZE + 3913 EXT4_I(inode)->i_extra_isize; 3914 physical += offset; 3915 length = EXT4_SB(inode->i_sb)->s_inode_size - offset; 3916 flags |= FIEMAP_EXTENT_DATA_INLINE; 3917 brelse(iloc.bh); 3918 } else { /* external block */ 3919 physical = EXT4_I(inode)->i_file_acl << blockbits; 3920 length = inode->i_sb->s_blocksize; 3921 } 3922 3923 if (physical) 3924 error = fiemap_fill_next_extent(fieinfo, 0, physical, 3925 length, flags); 3926 return (error < 0 ? error : 0); 3927 } 3928 3929 int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 3930 __u64 start, __u64 len) 3931 { 3932 ext4_lblk_t start_blk; 3933 int error = 0; 3934 3935 /* fallback to generic here if not in extents fmt */ 3936 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 3937 return generic_block_fiemap(inode, fieinfo, start, len, 3938 ext4_get_block); 3939 3940 if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS)) 3941 return -EBADR; 3942 3943 if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) { 3944 error = ext4_xattr_fiemap(inode, fieinfo); 3945 } else { 3946 ext4_lblk_t len_blks; 3947 __u64 last_blk; 3948 3949 start_blk = start >> inode->i_sb->s_blocksize_bits; 3950 last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits; 3951 if (last_blk >= EXT_MAX_BLOCK) 3952 last_blk = EXT_MAX_BLOCK-1; 3953 len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1; 3954 3955 /* 3956 * Walk the extent tree gathering extent information. 3957 * ext4_ext_fiemap_cb will push extents back to user. 3958 */ 3959 error = ext4_ext_walk_space(inode, start_blk, len_blks, 3960 ext4_ext_fiemap_cb, fieinfo); 3961 } 3962 3963 return error; 3964 } 3965 3966
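/*
 * Example (user-space sketch, not part of the kernel build): ext4_fiemap()
 * above is reached through the FS_IOC_FIEMAP ioctl.  Assuming a file that
 * lives on an ext4 filesystem, a minimal caller looks roughly like this
 * (error handling mostly omitted):
 *
 *	#include <stdio.h>
 *	#include <stdlib.h>
 *	#include <fcntl.h>
 *	#include <sys/ioctl.h>
 *	#include <linux/fs.h>
 *	#include <linux/fiemap.h>
 *
 *	int main(int argc, char **argv)
 *	{
 *		struct fiemap *fm;
 *		unsigned int i;
 *		int fd = open(argv[1], O_RDONLY);
 *
 *		if (fd < 0)
 *			return 1;
 *		fm = calloc(1, sizeof(*fm) + 32 * sizeof(struct fiemap_extent));
 *		fm->fm_start = 0;
 *		fm->fm_length = ~0ULL;		/* whole file */
 *		fm->fm_flags = FIEMAP_FLAG_SYNC;
 *		fm->fm_extent_count = 32;
 *		if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0)
 *			return 1;
 *		for (i = 0; i < fm->fm_mapped_extents; i++)
 *			printf("logical %llu physical %llu length %llu\n",
 *			       (unsigned long long)fm->fm_extents[i].fe_logical,
 *			       (unsigned long long)fm->fm_extents[i].fe_physical,
 *			       (unsigned long long)fm->fm_extents[i].fe_length);
 *		return 0;
 *	}
 */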