1 /* 2 * linux/fs/ext4/inode.c 3 * 4 * Copyright (C) 1992, 1993, 1994, 1995 5 * Remy Card (card@masi.ibp.fr) 6 * Laboratoire MASI - Institut Blaise Pascal 7 * Universite Pierre et Marie Curie (Paris VI) 8 * 9 * from 10 * 11 * linux/fs/minix/inode.c 12 * 13 * Copyright (C) 1991, 1992 Linus Torvalds 14 * 15 * Goal-directed block allocation by Stephen Tweedie 16 * (sct@redhat.com), 1993, 1998 17 * Big-endian to little-endian byte-swapping/bitmaps by 18 * David S. Miller (davem@caip.rutgers.edu), 1995 19 * 64-bit file support on 64-bit platforms by Jakub Jelinek 20 * (jj@sunsite.ms.mff.cuni.cz) 21 * 22 * Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000 23 */ 24 25 #include <linux/module.h> 26 #include <linux/fs.h> 27 #include <linux/time.h> 28 #include <linux/jbd2.h> 29 #include <linux/highuid.h> 30 #include <linux/pagemap.h> 31 #include <linux/quotaops.h> 32 #include <linux/string.h> 33 #include <linux/buffer_head.h> 34 #include <linux/writeback.h> 35 #include <linux/pagevec.h> 36 #include <linux/mpage.h> 37 #include <linux/namei.h> 38 #include <linux/uio.h> 39 #include <linux/bio.h> 40 41 #include "ext4_jbd2.h" 42 #include "xattr.h" 43 #include "acl.h" 44 #include "ext4_extents.h" 45 46 #include <trace/events/ext4.h> 47 48 #define MPAGE_DA_EXTENT_TAIL 0x01 49 50 static inline int ext4_begin_ordered_truncate(struct inode *inode, 51 loff_t new_size) 52 { 53 return jbd2_journal_begin_ordered_truncate( 54 EXT4_SB(inode->i_sb)->s_journal, 55 &EXT4_I(inode)->jinode, 56 new_size); 57 } 58 59 static void ext4_invalidatepage(struct page *page, unsigned long offset); 60 61 /* 62 * Test whether an inode is a fast symlink. 63 */ 64 static int ext4_inode_is_fast_symlink(struct inode *inode) 65 { 66 int ea_blocks = EXT4_I(inode)->i_file_acl ? 
67 (inode->i_sb->s_blocksize >> 9) : 0; 68 69 return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0); 70 } 71 72 /* 73 * The ext4 forget function must perform a revoke if we are freeing data 74 * which has been journaled. Metadata (eg. indirect blocks) must be 75 * revoked in all cases. 76 * 77 * "bh" may be NULL: a metadata block may have been freed from memory 78 * but there may still be a record of it in the journal, and that record 79 * still needs to be revoked. 80 * 81 * If the handle isn't valid we're not journaling so there's nothing to do. 82 */ 83 int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, 84 struct buffer_head *bh, ext4_fsblk_t blocknr) 85 { 86 int err; 87 88 if (!ext4_handle_valid(handle)) 89 return 0; 90 91 might_sleep(); 92 93 BUFFER_TRACE(bh, "enter"); 94 95 jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, " 96 "data mode %x\n", 97 bh, is_metadata, inode->i_mode, 98 test_opt(inode->i_sb, DATA_FLAGS)); 99 100 /* Never use the revoke function if we are doing full data 101 * journaling: there is no need to, and a V1 superblock won't 102 * support it. Otherwise, only skip the revoke on un-journaled 103 * data blocks. */ 104 105 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || 106 (!is_metadata && !ext4_should_journal_data(inode))) { 107 if (bh) { 108 BUFFER_TRACE(bh, "call jbd2_journal_forget"); 109 return ext4_journal_forget(handle, bh); 110 } 111 return 0; 112 } 113 114 /* 115 * data!=journal && (is_metadata || should_journal_data(inode)) 116 */ 117 BUFFER_TRACE(bh, "call ext4_journal_revoke"); 118 err = ext4_journal_revoke(handle, blocknr, bh); 119 if (err) 120 ext4_abort(inode->i_sb, __func__, 121 "error %d when attempting revoke", err); 122 BUFFER_TRACE(bh, "exit"); 123 return err; 124 } 125 126 /* 127 * Work out how many blocks we need to proceed with the next chunk of a 128 * truncate transaction. 
129 */ 130 static unsigned long blocks_for_truncate(struct inode *inode) 131 { 132 ext4_lblk_t needed; 133 134 needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9); 135 136 /* Give ourselves just enough room to cope with inodes in which 137 * i_blocks is corrupt: we've seen disk corruptions in the past 138 * which resulted in random data in an inode which looked enough 139 * like a regular file for ext4 to try to delete it. Things 140 * will go a bit crazy if that happens, but at least we should 141 * try not to panic the whole kernel. */ 142 if (needed < 2) 143 needed = 2; 144 145 /* But we need to bound the transaction so we don't overflow the 146 * journal. */ 147 if (needed > EXT4_MAX_TRANS_DATA) 148 needed = EXT4_MAX_TRANS_DATA; 149 150 return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed; 151 } 152 153 /* 154 * Truncate transactions can be complex and absolutely huge. So we need to 155 * be able to restart the transaction at a conventient checkpoint to make 156 * sure we don't overflow the journal. 157 * 158 * start_transaction gets us a new handle for a truncate transaction, 159 * and extend_transaction tries to extend the existing one a bit. If 160 * extend fails, we need to propagate the failure up and restart the 161 * transaction in the top-level truncate loop. --sct 162 */ 163 static handle_t *start_transaction(struct inode *inode) 164 { 165 handle_t *result; 166 167 result = ext4_journal_start(inode, blocks_for_truncate(inode)); 168 if (!IS_ERR(result)) 169 return result; 170 171 ext4_std_error(inode->i_sb, PTR_ERR(result)); 172 return result; 173 } 174 175 /* 176 * Try to extend this transaction for the purposes of truncation. 177 * 178 * Returns 0 if we managed to create more room. If we can't create more 179 * room, and the transaction must be restarted we return 1. 
180 */ 181 static int try_to_extend_transaction(handle_t *handle, struct inode *inode) 182 { 183 if (!ext4_handle_valid(handle)) 184 return 0; 185 if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1)) 186 return 0; 187 if (!ext4_journal_extend(handle, blocks_for_truncate(inode))) 188 return 0; 189 return 1; 190 } 191 192 /* 193 * Restart the transaction associated with *handle. This does a commit, 194 * so before we call here everything must be consistently dirtied against 195 * this transaction. 196 */ 197 static int ext4_journal_test_restart(handle_t *handle, struct inode *inode) 198 { 199 BUG_ON(EXT4_JOURNAL(inode) == NULL); 200 jbd_debug(2, "restarting handle %p\n", handle); 201 return ext4_journal_restart(handle, blocks_for_truncate(inode)); 202 } 203 204 /* 205 * Called at the last iput() if i_nlink is zero. 206 */ 207 void ext4_delete_inode(struct inode *inode) 208 { 209 handle_t *handle; 210 int err; 211 212 if (ext4_should_order_data(inode)) 213 ext4_begin_ordered_truncate(inode, 0); 214 truncate_inode_pages(&inode->i_data, 0); 215 216 if (is_bad_inode(inode)) 217 goto no_delete; 218 219 handle = ext4_journal_start(inode, blocks_for_truncate(inode)+3); 220 if (IS_ERR(handle)) { 221 ext4_std_error(inode->i_sb, PTR_ERR(handle)); 222 /* 223 * If we're going to skip the normal cleanup, we still need to 224 * make sure that the in-core orphan linked list is properly 225 * cleaned up. 
226 */ 227 ext4_orphan_del(NULL, inode); 228 goto no_delete; 229 } 230 231 if (IS_SYNC(inode)) 232 ext4_handle_sync(handle); 233 inode->i_size = 0; 234 err = ext4_mark_inode_dirty(handle, inode); 235 if (err) { 236 ext4_warning(inode->i_sb, __func__, 237 "couldn't mark inode dirty (err %d)", err); 238 goto stop_handle; 239 } 240 if (inode->i_blocks) 241 ext4_truncate(inode); 242 243 /* 244 * ext4_ext_truncate() doesn't reserve any slop when it 245 * restarts journal transactions; therefore there may not be 246 * enough credits left in the handle to remove the inode from 247 * the orphan list and set the dtime field. 248 */ 249 if (!ext4_handle_has_enough_credits(handle, 3)) { 250 err = ext4_journal_extend(handle, 3); 251 if (err > 0) 252 err = ext4_journal_restart(handle, 3); 253 if (err != 0) { 254 ext4_warning(inode->i_sb, __func__, 255 "couldn't extend journal (err %d)", err); 256 stop_handle: 257 ext4_journal_stop(handle); 258 goto no_delete; 259 } 260 } 261 262 /* 263 * Kill off the orphan record which ext4_truncate created. 264 * AKPM: I think this can be inside the above `if'. 265 * Note that ext4_orphan_del() has to be able to cope with the 266 * deletion of a non-existent orphan - this is because we don't 267 * know if ext4_truncate() actually created an orphan record. 268 * (Well, we could do this if we need to, but heck - it works) 269 */ 270 ext4_orphan_del(handle, inode); 271 EXT4_I(inode)->i_dtime = get_seconds(); 272 273 /* 274 * One subtle ordering requirement: if anything has gone wrong 275 * (transaction abort, IO errors, whatever), then we can still 276 * do these next steps (the fs will already have been marked as 277 * having errors), but we can't free the inode if the mark_dirty 278 * fails. 279 */ 280 if (ext4_mark_inode_dirty(handle, inode)) 281 /* If that failed, just do the required in-core inode clear. 
*/ 282 clear_inode(inode); 283 else 284 ext4_free_inode(handle, inode); 285 ext4_journal_stop(handle); 286 return; 287 no_delete: 288 clear_inode(inode); /* We must guarantee clearing of inode... */ 289 } 290 291 typedef struct { 292 __le32 *p; 293 __le32 key; 294 struct buffer_head *bh; 295 } Indirect; 296 297 static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v) 298 { 299 p->key = *(p->p = v); 300 p->bh = bh; 301 } 302 303 /** 304 * ext4_block_to_path - parse the block number into array of offsets 305 * @inode: inode in question (we are only interested in its superblock) 306 * @i_block: block number to be parsed 307 * @offsets: array to store the offsets in 308 * @boundary: set this non-zero if the referred-to block is likely to be 309 * followed (on disk) by an indirect block. 310 * 311 * To store the locations of file's data ext4 uses a data structure common 312 * for UNIX filesystems - tree of pointers anchored in the inode, with 313 * data blocks at leaves and indirect blocks in intermediate nodes. 314 * This function translates the block number into path in that tree - 315 * return value is the path length and @offsets[n] is the offset of 316 * pointer to (n+1)th node in the nth one. If @block is out of range 317 * (negative or too large) warning is printed and zero returned. 318 * 319 * Note: function doesn't find node addresses, so no IO is needed. All 320 * we need to know is the capacity of indirect blocks (taken from the 321 * inode->i_sb). 322 */ 323 324 /* 325 * Portability note: the last comparison (check that we fit into triple 326 * indirect block) is spelled differently, because otherwise on an 327 * architecture with 32-bit longs and 8Kb pages we might get into trouble 328 * if our filesystem had 8Kb blocks. We might use long long, but that would 329 * kill us on x86. 
Oh, well, at least the sign propagation does not matter - 330 * i_block would have to be negative in the very beginning, so we would not 331 * get there at all. 332 */ 333 334 static int ext4_block_to_path(struct inode *inode, 335 ext4_lblk_t i_block, 336 ext4_lblk_t offsets[4], int *boundary) 337 { 338 int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb); 339 int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb); 340 const long direct_blocks = EXT4_NDIR_BLOCKS, 341 indirect_blocks = ptrs, 342 double_blocks = (1 << (ptrs_bits * 2)); 343 int n = 0; 344 int final = 0; 345 346 if (i_block < 0) { 347 ext4_warning(inode->i_sb, "ext4_block_to_path", "block < 0"); 348 } else if (i_block < direct_blocks) { 349 offsets[n++] = i_block; 350 final = direct_blocks; 351 } else if ((i_block -= direct_blocks) < indirect_blocks) { 352 offsets[n++] = EXT4_IND_BLOCK; 353 offsets[n++] = i_block; 354 final = ptrs; 355 } else if ((i_block -= indirect_blocks) < double_blocks) { 356 offsets[n++] = EXT4_DIND_BLOCK; 357 offsets[n++] = i_block >> ptrs_bits; 358 offsets[n++] = i_block & (ptrs - 1); 359 final = ptrs; 360 } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) { 361 offsets[n++] = EXT4_TIND_BLOCK; 362 offsets[n++] = i_block >> (ptrs_bits * 2); 363 offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1); 364 offsets[n++] = i_block & (ptrs - 1); 365 final = ptrs; 366 } else { 367 ext4_warning(inode->i_sb, "ext4_block_to_path", 368 "block %lu > max in inode %lu", 369 i_block + direct_blocks + 370 indirect_blocks + double_blocks, inode->i_ino); 371 } 372 if (boundary) 373 *boundary = final - 1 - (i_block & (ptrs - 1)); 374 return n; 375 } 376 377 static int __ext4_check_blockref(const char *function, struct inode *inode, 378 __le32 *p, unsigned int max) 379 { 380 __le32 *bref = p; 381 unsigned int blk; 382 383 while (bref < p+max) { 384 blk = le32_to_cpu(*bref++); 385 if (blk && 386 unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb), 387 blk, 1))) { 388 ext4_error(inode->i_sb, 
function, 389 "invalid block reference %u " 390 "in inode #%lu", blk, inode->i_ino); 391 return -EIO; 392 } 393 } 394 return 0; 395 } 396 397 398 #define ext4_check_indirect_blockref(inode, bh) \ 399 __ext4_check_blockref(__func__, inode, (__le32 *)(bh)->b_data, \ 400 EXT4_ADDR_PER_BLOCK((inode)->i_sb)) 401 402 #define ext4_check_inode_blockref(inode) \ 403 __ext4_check_blockref(__func__, inode, EXT4_I(inode)->i_data, \ 404 EXT4_NDIR_BLOCKS) 405 406 /** 407 * ext4_get_branch - read the chain of indirect blocks leading to data 408 * @inode: inode in question 409 * @depth: depth of the chain (1 - direct pointer, etc.) 410 * @offsets: offsets of pointers in inode/indirect blocks 411 * @chain: place to store the result 412 * @err: here we store the error value 413 * 414 * Function fills the array of triples <key, p, bh> and returns %NULL 415 * if everything went OK or the pointer to the last filled triple 416 * (incomplete one) otherwise. Upon the return chain[i].key contains 417 * the number of (i+1)-th block in the chain (as it is stored in memory, 418 * i.e. little-endian 32-bit), chain[i].p contains the address of that 419 * number (it points into struct inode for i==0 and into the bh->b_data 420 * for i>0) and chain[i].bh points to the buffer_head of i-th indirect 421 * block for i>0 and NULL for i==0. In other words, it holds the block 422 * numbers of the chain, addresses they were taken from (and where we can 423 * verify that chain did not change) and buffer_heads hosting these 424 * numbers. 425 * 426 * Function stops when it stumbles upon zero pointer (absent block) 427 * (pointer to last triple returned, *@err == 0) 428 * or when it gets an IO error reading an indirect block 429 * (ditto, *@err == -EIO) 430 * or when it reads all @depth-1 indirect blocks successfully and finds 431 * the whole chain, all way to the data (returns %NULL, *err == 0). 
432 * 433 * Need to be called with 434 * down_read(&EXT4_I(inode)->i_data_sem) 435 */ 436 static Indirect *ext4_get_branch(struct inode *inode, int depth, 437 ext4_lblk_t *offsets, 438 Indirect chain[4], int *err) 439 { 440 struct super_block *sb = inode->i_sb; 441 Indirect *p = chain; 442 struct buffer_head *bh; 443 444 *err = 0; 445 /* i_data is not going away, no lock needed */ 446 add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets); 447 if (!p->key) 448 goto no_block; 449 while (--depth) { 450 bh = sb_getblk(sb, le32_to_cpu(p->key)); 451 if (unlikely(!bh)) 452 goto failure; 453 454 if (!bh_uptodate_or_lock(bh)) { 455 if (bh_submit_read(bh) < 0) { 456 put_bh(bh); 457 goto failure; 458 } 459 /* validate block references */ 460 if (ext4_check_indirect_blockref(inode, bh)) { 461 put_bh(bh); 462 goto failure; 463 } 464 } 465 466 add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets); 467 /* Reader: end */ 468 if (!p->key) 469 goto no_block; 470 } 471 return NULL; 472 473 failure: 474 *err = -EIO; 475 no_block: 476 return p; 477 } 478 479 /** 480 * ext4_find_near - find a place for allocation with sufficient locality 481 * @inode: owner 482 * @ind: descriptor of indirect block. 483 * 484 * This function returns the preferred place for block allocation. 485 * It is used when heuristic for sequential allocation fails. 486 * Rules are: 487 * + if there is a block to the left of our position - allocate near it. 488 * + if pointer will live in indirect block - allocate near that block. 489 * + if pointer will live in inode - allocate in the same 490 * cylinder group. 491 * 492 * In the latter case we colour the starting block by the callers PID to 493 * prevent it from clashing with concurrent allocations for a different inode 494 * in the same block group. The PID is used here so that functionally related 495 * files will be close-by on-disk. 496 * 497 * Caller must make sure that @ind is valid and will stay that way. 
498 */ 499 static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) 500 { 501 struct ext4_inode_info *ei = EXT4_I(inode); 502 __le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data; 503 __le32 *p; 504 ext4_fsblk_t bg_start; 505 ext4_fsblk_t last_block; 506 ext4_grpblk_t colour; 507 ext4_group_t block_group; 508 int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb)); 509 510 /* Try to find previous block */ 511 for (p = ind->p - 1; p >= start; p--) { 512 if (*p) 513 return le32_to_cpu(*p); 514 } 515 516 /* No such thing, so let's try location of indirect block */ 517 if (ind->bh) 518 return ind->bh->b_blocknr; 519 520 /* 521 * It is going to be referred to from the inode itself? OK, just put it 522 * into the same cylinder group then. 523 */ 524 block_group = ei->i_block_group; 525 if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) { 526 block_group &= ~(flex_size-1); 527 if (S_ISREG(inode->i_mode)) 528 block_group++; 529 } 530 bg_start = ext4_group_first_block_no(inode->i_sb, block_group); 531 last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; 532 533 /* 534 * If we are doing delayed allocation, we don't need take 535 * colour into account. 536 */ 537 if (test_opt(inode->i_sb, DELALLOC)) 538 return bg_start; 539 540 if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) 541 colour = (current->pid % 16) * 542 (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); 543 else 544 colour = (current->pid % 16) * ((last_block - bg_start) / 16); 545 return bg_start + colour; 546 } 547 548 /** 549 * ext4_find_goal - find a preferred place for allocation. 550 * @inode: owner 551 * @block: block we want 552 * @partial: pointer to the last triple within a chain 553 * 554 * Normally this function find the preferred place for block allocation, 555 * returns it. 
556 */ 557 static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, 558 Indirect *partial) 559 { 560 /* 561 * XXX need to get goal block from mballoc's data structures 562 */ 563 564 return ext4_find_near(inode, partial); 565 } 566 567 /** 568 * ext4_blks_to_allocate: Look up the block map and count the number 569 * of direct blocks need to be allocated for the given branch. 570 * 571 * @branch: chain of indirect blocks 572 * @k: number of blocks need for indirect blocks 573 * @blks: number of data blocks to be mapped. 574 * @blocks_to_boundary: the offset in the indirect block 575 * 576 * return the total number of blocks to be allocate, including the 577 * direct and indirect blocks. 578 */ 579 static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks, 580 int blocks_to_boundary) 581 { 582 unsigned int count = 0; 583 584 /* 585 * Simple case, [t,d]Indirect block(s) has not allocated yet 586 * then it's clear blocks on that path have not allocated 587 */ 588 if (k > 0) { 589 /* right now we don't handle cross boundary allocation */ 590 if (blks < blocks_to_boundary + 1) 591 count += blks; 592 else 593 count += blocks_to_boundary + 1; 594 return count; 595 } 596 597 count++; 598 while (count < blks && count <= blocks_to_boundary && 599 le32_to_cpu(*(branch[0].p + count)) == 0) { 600 count++; 601 } 602 return count; 603 } 604 605 /** 606 * ext4_alloc_blocks: multiple allocate blocks needed for a branch 607 * @indirect_blks: the number of blocks need to allocate for indirect 608 * blocks 609 * 610 * @new_blocks: on return it will store the new block numbers for 611 * the indirect blocks(if needed) and the first direct block, 612 * @blks: on return it will store the total number of allocated 613 * direct blocks 614 */ 615 static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, 616 ext4_lblk_t iblock, ext4_fsblk_t goal, 617 int indirect_blks, int blks, 618 ext4_fsblk_t new_blocks[4], int *err) 619 { 620 struct 
ext4_allocation_request ar; 621 int target, i; 622 unsigned long count = 0, blk_allocated = 0; 623 int index = 0; 624 ext4_fsblk_t current_block = 0; 625 int ret = 0; 626 627 /* 628 * Here we try to allocate the requested multiple blocks at once, 629 * on a best-effort basis. 630 * To build a branch, we should allocate blocks for 631 * the indirect blocks(if not allocated yet), and at least 632 * the first direct block of this branch. That's the 633 * minimum number of blocks need to allocate(required) 634 */ 635 /* first we try to allocate the indirect blocks */ 636 target = indirect_blks; 637 while (target > 0) { 638 count = target; 639 /* allocating blocks for indirect blocks and direct blocks */ 640 current_block = ext4_new_meta_blocks(handle, inode, 641 goal, &count, err); 642 if (*err) 643 goto failed_out; 644 645 target -= count; 646 /* allocate blocks for indirect blocks */ 647 while (index < indirect_blks && count) { 648 new_blocks[index++] = current_block++; 649 count--; 650 } 651 if (count > 0) { 652 /* 653 * save the new block number 654 * for the first direct block 655 */ 656 new_blocks[index] = current_block; 657 printk(KERN_INFO "%s returned more blocks than " 658 "requested\n", __func__); 659 WARN_ON(1); 660 break; 661 } 662 } 663 664 target = blks - count ; 665 blk_allocated = count; 666 if (!target) 667 goto allocated; 668 /* Now allocate data blocks */ 669 memset(&ar, 0, sizeof(ar)); 670 ar.inode = inode; 671 ar.goal = goal; 672 ar.len = target; 673 ar.logical = iblock; 674 if (S_ISREG(inode->i_mode)) 675 /* enable in-core preallocation only for regular files */ 676 ar.flags = EXT4_MB_HINT_DATA; 677 678 current_block = ext4_mb_new_blocks(handle, &ar, err); 679 680 if (*err && (target == blks)) { 681 /* 682 * if the allocation failed and we didn't allocate 683 * any blocks before 684 */ 685 goto failed_out; 686 } 687 if (!*err) { 688 if (target == blks) { 689 /* 690 * save the new block number 691 * for the first direct block 692 */ 693 
new_blocks[index] = current_block; 694 } 695 blk_allocated += ar.len; 696 } 697 allocated: 698 /* total number of blocks allocated for direct blocks */ 699 ret = blk_allocated; 700 *err = 0; 701 return ret; 702 failed_out: 703 for (i = 0; i < index; i++) 704 ext4_free_blocks(handle, inode, new_blocks[i], 1, 0); 705 return ret; 706 } 707 708 /** 709 * ext4_alloc_branch - allocate and set up a chain of blocks. 710 * @inode: owner 711 * @indirect_blks: number of allocated indirect blocks 712 * @blks: number of allocated direct blocks 713 * @offsets: offsets (in the blocks) to store the pointers to next. 714 * @branch: place to store the chain in. 715 * 716 * This function allocates blocks, zeroes out all but the last one, 717 * links them into chain and (if we are synchronous) writes them to disk. 718 * In other words, it prepares a branch that can be spliced onto the 719 * inode. It stores the information about that chain in the branch[], in 720 * the same format as ext4_get_branch() would do. We are calling it after 721 * we had read the existing part of chain and partial points to the last 722 * triple of that (one with zero ->key). Upon the exit we have the same 723 * picture as after the successful ext4_get_block(), except that in one 724 * place chain is disconnected - *branch->p is still zero (we did not 725 * set the last link), but branch->key contains the number that should 726 * be placed into *branch->p to fill that gap. 727 * 728 * If allocation fails we free all blocks we've allocated (and forget 729 * their buffer_heads) and return the error value the from failed 730 * ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain 731 * as described above and return 0. 
732 */ 733 static int ext4_alloc_branch(handle_t *handle, struct inode *inode, 734 ext4_lblk_t iblock, int indirect_blks, 735 int *blks, ext4_fsblk_t goal, 736 ext4_lblk_t *offsets, Indirect *branch) 737 { 738 int blocksize = inode->i_sb->s_blocksize; 739 int i, n = 0; 740 int err = 0; 741 struct buffer_head *bh; 742 int num; 743 ext4_fsblk_t new_blocks[4]; 744 ext4_fsblk_t current_block; 745 746 num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks, 747 *blks, new_blocks, &err); 748 if (err) 749 return err; 750 751 branch[0].key = cpu_to_le32(new_blocks[0]); 752 /* 753 * metadata blocks and data blocks are allocated. 754 */ 755 for (n = 1; n <= indirect_blks; n++) { 756 /* 757 * Get buffer_head for parent block, zero it out 758 * and set the pointer to new one, then send 759 * parent to disk. 760 */ 761 bh = sb_getblk(inode->i_sb, new_blocks[n-1]); 762 branch[n].bh = bh; 763 lock_buffer(bh); 764 BUFFER_TRACE(bh, "call get_create_access"); 765 err = ext4_journal_get_create_access(handle, bh); 766 if (err) { 767 unlock_buffer(bh); 768 brelse(bh); 769 goto failed; 770 } 771 772 memset(bh->b_data, 0, blocksize); 773 branch[n].p = (__le32 *) bh->b_data + offsets[n]; 774 branch[n].key = cpu_to_le32(new_blocks[n]); 775 *branch[n].p = branch[n].key; 776 if (n == indirect_blks) { 777 current_block = new_blocks[n]; 778 /* 779 * End of chain, update the last new metablock of 780 * the chain to point to the new allocated 781 * data blocks numbers 782 */ 783 for (i = 1; i < num; i++) 784 *(branch[n].p + i) = cpu_to_le32(++current_block); 785 } 786 BUFFER_TRACE(bh, "marking uptodate"); 787 set_buffer_uptodate(bh); 788 unlock_buffer(bh); 789 790 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 791 err = ext4_handle_dirty_metadata(handle, inode, bh); 792 if (err) 793 goto failed; 794 } 795 *blks = num; 796 return err; 797 failed: 798 /* Allocation failed, free what we already allocated */ 799 for (i = 1; i <= n ; i++) { 800 BUFFER_TRACE(branch[i].bh, "call 
jbd2_journal_forget"); 801 ext4_journal_forget(handle, branch[i].bh); 802 } 803 for (i = 0; i < indirect_blks; i++) 804 ext4_free_blocks(handle, inode, new_blocks[i], 1, 0); 805 806 ext4_free_blocks(handle, inode, new_blocks[i], num, 0); 807 808 return err; 809 } 810 811 /** 812 * ext4_splice_branch - splice the allocated branch onto inode. 813 * @inode: owner 814 * @block: (logical) number of block we are adding 815 * @chain: chain of indirect blocks (with a missing link - see 816 * ext4_alloc_branch) 817 * @where: location of missing link 818 * @num: number of indirect blocks we are adding 819 * @blks: number of direct blocks we are adding 820 * 821 * This function fills the missing link and does all housekeeping needed in 822 * inode (->i_blocks, etc.). In case of success we end up with the full 823 * chain to new block and return 0. 824 */ 825 static int ext4_splice_branch(handle_t *handle, struct inode *inode, 826 ext4_lblk_t block, Indirect *where, int num, 827 int blks) 828 { 829 int i; 830 int err = 0; 831 ext4_fsblk_t current_block; 832 833 /* 834 * If we're splicing into a [td]indirect block (as opposed to the 835 * inode) then we need to get write access to the [td]indirect block 836 * before the splice. 837 */ 838 if (where->bh) { 839 BUFFER_TRACE(where->bh, "get_write_access"); 840 err = ext4_journal_get_write_access(handle, where->bh); 841 if (err) 842 goto err_out; 843 } 844 /* That's it */ 845 846 *where->p = where->key; 847 848 /* 849 * Update the host buffer_head or inode to point to more just allocated 850 * direct blocks blocks 851 */ 852 if (num == 0 && blks > 1) { 853 current_block = le32_to_cpu(where->key) + 1; 854 for (i = 1; i < blks; i++) 855 *(where->p + i) = cpu_to_le32(current_block++); 856 } 857 858 /* We are done with atomic stuff, now do the rest of housekeeping */ 859 /* had we spliced it onto indirect block? */ 860 if (where->bh) { 861 /* 862 * If we spliced it onto an indirect block, we haven't 863 * altered the inode. 
Note however that if it is being spliced 864 * onto an indirect block at the very end of the file (the 865 * file is growing) then we *will* alter the inode to reflect 866 * the new i_size. But that is not done here - it is done in 867 * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode. 868 */ 869 jbd_debug(5, "splicing indirect only\n"); 870 BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata"); 871 err = ext4_handle_dirty_metadata(handle, inode, where->bh); 872 if (err) 873 goto err_out; 874 } else { 875 /* 876 * OK, we spliced it into the inode itself on a direct block. 877 */ 878 ext4_mark_inode_dirty(handle, inode); 879 jbd_debug(5, "splicing direct\n"); 880 } 881 return err; 882 883 err_out: 884 for (i = 1; i <= num; i++) { 885 BUFFER_TRACE(where[i].bh, "call jbd2_journal_forget"); 886 ext4_journal_forget(handle, where[i].bh); 887 ext4_free_blocks(handle, inode, 888 le32_to_cpu(where[i-1].key), 1, 0); 889 } 890 ext4_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks, 0); 891 892 return err; 893 } 894 895 /* 896 * The ext4_ind_get_blocks() function handles non-extents inodes 897 * (i.e., using the traditional indirect/double-indirect i_blocks 898 * scheme) for ext4_get_blocks(). 899 * 900 * Allocation strategy is simple: if we have to allocate something, we will 901 * have to go the whole way to leaf. So let's do it before attaching anything 902 * to tree, set linkage between the newborn blocks, write them if sync is 903 * required, recheck the path, free and repeat if check fails, otherwise 904 * set the last missing link (that will protect us from any truncate-generated 905 * removals - all blocks on the path are immune now) and possibly force the 906 * write on the parent block. 907 * That has a nice additional property: no special recovery from the failed 908 * allocations is needed - we simply release blocks and do not touch anything 909 * reachable from inode. 910 * 911 * `handle' can be NULL if create == 0. 
912 * 913 * return > 0, # of blocks mapped or allocated. 914 * return = 0, if plain lookup failed. 915 * return < 0, error case. 916 * 917 * The ext4_ind_get_blocks() function should be called with 918 * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem 919 * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or 920 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system 921 * blocks. 922 */ 923 static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, 924 ext4_lblk_t iblock, unsigned int maxblocks, 925 struct buffer_head *bh_result, 926 int flags) 927 { 928 int err = -EIO; 929 ext4_lblk_t offsets[4]; 930 Indirect chain[4]; 931 Indirect *partial; 932 ext4_fsblk_t goal; 933 int indirect_blks; 934 int blocks_to_boundary = 0; 935 int depth; 936 int count = 0; 937 ext4_fsblk_t first_block = 0; 938 939 J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)); 940 J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); 941 depth = ext4_block_to_path(inode, iblock, offsets, 942 &blocks_to_boundary); 943 944 if (depth == 0) 945 goto out; 946 947 partial = ext4_get_branch(inode, depth, offsets, chain, &err); 948 949 /* Simplest case - block found, no allocation needed */ 950 if (!partial) { 951 first_block = le32_to_cpu(chain[depth - 1].key); 952 clear_buffer_new(bh_result); 953 count++; 954 /*map more blocks*/ 955 while (count < maxblocks && count <= blocks_to_boundary) { 956 ext4_fsblk_t blk; 957 958 blk = le32_to_cpu(*(chain[depth-1].p + count)); 959 960 if (blk == first_block + count) 961 count++; 962 else 963 break; 964 } 965 goto got_it; 966 } 967 968 /* Next simple case - plain lookup or failed read of indirect block */ 969 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO) 970 goto cleanup; 971 972 /* 973 * Okay, we need to do block allocation. 
974 */ 975 goal = ext4_find_goal(inode, iblock, partial); 976 977 /* the number of blocks need to allocate for [d,t]indirect blocks */ 978 indirect_blks = (chain + depth) - partial - 1; 979 980 /* 981 * Next look up the indirect map to count the totoal number of 982 * direct blocks to allocate for this branch. 983 */ 984 count = ext4_blks_to_allocate(partial, indirect_blks, 985 maxblocks, blocks_to_boundary); 986 /* 987 * Block out ext4_truncate while we alter the tree 988 */ 989 err = ext4_alloc_branch(handle, inode, iblock, indirect_blks, 990 &count, goal, 991 offsets + (partial - chain), partial); 992 993 /* 994 * The ext4_splice_branch call will free and forget any buffers 995 * on the new chain if there is a failure, but that risks using 996 * up transaction credits, especially for bitmaps where the 997 * credits cannot be returned. Can we handle this somehow? We 998 * may need to return -EAGAIN upwards in the worst case. --sct 999 */ 1000 if (!err) 1001 err = ext4_splice_branch(handle, inode, iblock, 1002 partial, indirect_blks, count); 1003 else 1004 goto cleanup; 1005 1006 set_buffer_new(bh_result); 1007 got_it: 1008 map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key)); 1009 if (count > blocks_to_boundary) 1010 set_buffer_boundary(bh_result); 1011 err = count; 1012 /* Clean up and exit */ 1013 partial = chain + depth - 1; /* the whole chain */ 1014 cleanup: 1015 while (partial > chain) { 1016 BUFFER_TRACE(partial->bh, "call brelse"); 1017 brelse(partial->bh); 1018 partial--; 1019 } 1020 BUFFER_TRACE(bh_result, "returned"); 1021 out: 1022 return err; 1023 } 1024 1025 qsize_t ext4_get_reserved_space(struct inode *inode) 1026 { 1027 unsigned long long total; 1028 1029 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 1030 total = EXT4_I(inode)->i_reserved_data_blocks + 1031 EXT4_I(inode)->i_reserved_meta_blocks; 1032 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1033 1034 return total; 1035 } 1036 /* 1037 * Calculate the number of 
metadata blocks need to reserve 1038 * to allocate @blocks for non extent file based file 1039 */ 1040 static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks) 1041 { 1042 int icap = EXT4_ADDR_PER_BLOCK(inode->i_sb); 1043 int ind_blks, dind_blks, tind_blks; 1044 1045 /* number of new indirect blocks needed */ 1046 ind_blks = (blocks + icap - 1) / icap; 1047 1048 dind_blks = (ind_blks + icap - 1) / icap; 1049 1050 tind_blks = 1; 1051 1052 return ind_blks + dind_blks + tind_blks; 1053 } 1054 1055 /* 1056 * Calculate the number of metadata blocks need to reserve 1057 * to allocate given number of blocks 1058 */ 1059 static int ext4_calc_metadata_amount(struct inode *inode, int blocks) 1060 { 1061 if (!blocks) 1062 return 0; 1063 1064 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) 1065 return ext4_ext_calc_metadata_amount(inode, blocks); 1066 1067 return ext4_indirect_calc_metadata_amount(inode, blocks); 1068 } 1069 1070 static void ext4_da_update_reserve_space(struct inode *inode, int used) 1071 { 1072 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1073 int total, mdb, mdb_free; 1074 1075 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 1076 /* recalculate the number of metablocks still need to be reserved */ 1077 total = EXT4_I(inode)->i_reserved_data_blocks - used; 1078 mdb = ext4_calc_metadata_amount(inode, total); 1079 1080 /* figure out how many metablocks to release */ 1081 BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); 1082 mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb; 1083 1084 if (mdb_free) { 1085 /* Account for allocated meta_blocks */ 1086 mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks; 1087 1088 /* update fs dirty blocks counter */ 1089 percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free); 1090 EXT4_I(inode)->i_allocated_meta_blocks = 0; 1091 EXT4_I(inode)->i_reserved_meta_blocks = mdb; 1092 } 1093 1094 /* update per-inode reservations */ 1095 BUG_ON(used > EXT4_I(inode)->i_reserved_data_blocks); 1096 
EXT4_I(inode)->i_reserved_data_blocks -= used; 1097 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1098 1099 /* 1100 * free those over-booking quota for metadata blocks 1101 */ 1102 if (mdb_free) 1103 vfs_dq_release_reservation_block(inode, mdb_free); 1104 1105 /* 1106 * If we have done all the pending block allocations and if 1107 * there aren't any writers on the inode, we can discard the 1108 * inode's preallocations. 1109 */ 1110 if (!total && (atomic_read(&inode->i_writecount) == 0)) 1111 ext4_discard_preallocations(inode); 1112 } 1113 1114 static int check_block_validity(struct inode *inode, sector_t logical, 1115 sector_t phys, int len) 1116 { 1117 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) { 1118 ext4_error(inode->i_sb, "check_block_validity", 1119 "inode #%lu logical block %llu mapped to %llu " 1120 "(size %d)", inode->i_ino, 1121 (unsigned long long) logical, 1122 (unsigned long long) phys, len); 1123 WARN_ON(1); 1124 return -EIO; 1125 } 1126 return 0; 1127 } 1128 1129 /* 1130 * The ext4_get_blocks() function tries to look up the requested blocks, 1131 * and returns if the blocks are already mapped. 1132 * 1133 * Otherwise it takes the write lock of the i_data_sem and allocate blocks 1134 * and store the allocated blocks in the result buffer head and mark it 1135 * mapped. 1136 * 1137 * If file type is extents based, it will call ext4_ext_get_blocks(), 1138 * Otherwise, call with ext4_ind_get_blocks() to handle indirect mapping 1139 * based files 1140 * 1141 * On success, it returns the number of blocks being mapped or allocate. 1142 * if create==0 and the blocks are pre-allocated and uninitialized block, 1143 * the result buffer head is unmapped. If the create ==1, it will make sure 1144 * the buffer head is mapped. 1145 * 1146 * It returns 0 if plain look up failed (blocks have not been allocated), in 1147 * that casem, buffer head is unmapped 1148 * 1149 * It returns the error in case of allocation failure. 
1150 */ 1151 int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, 1152 unsigned int max_blocks, struct buffer_head *bh, 1153 int flags) 1154 { 1155 int retval; 1156 1157 clear_buffer_mapped(bh); 1158 clear_buffer_unwritten(bh); 1159 1160 /* 1161 * Try to see if we can get the block without requesting a new 1162 * file system block. 1163 */ 1164 down_read((&EXT4_I(inode)->i_data_sem)); 1165 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { 1166 retval = ext4_ext_get_blocks(handle, inode, block, max_blocks, 1167 bh, 0); 1168 } else { 1169 retval = ext4_ind_get_blocks(handle, inode, block, max_blocks, 1170 bh, 0); 1171 } 1172 up_read((&EXT4_I(inode)->i_data_sem)); 1173 1174 if (retval > 0 && buffer_mapped(bh)) { 1175 int ret = check_block_validity(inode, block, 1176 bh->b_blocknr, retval); 1177 if (ret != 0) 1178 return ret; 1179 } 1180 1181 /* If it is only a block(s) look up */ 1182 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) 1183 return retval; 1184 1185 /* 1186 * Returns if the blocks have already allocated 1187 * 1188 * Note that if blocks have been preallocated 1189 * ext4_ext_get_block() returns th create = 0 1190 * with buffer head unmapped. 1191 */ 1192 if (retval > 0 && buffer_mapped(bh)) 1193 return retval; 1194 1195 /* 1196 * When we call get_blocks without the create flag, the 1197 * BH_Unwritten flag could have gotten set if the blocks 1198 * requested were part of a uninitialized extent. We need to 1199 * clear this flag now that we are committed to convert all or 1200 * part of the uninitialized extent to be an initialized 1201 * extent. This is because we need to avoid the combination 1202 * of BH_Unwritten and BH_Mapped flags being simultaneously 1203 * set on the buffer_head. 
1204 */ 1205 clear_buffer_unwritten(bh); 1206 1207 /* 1208 * New blocks allocate and/or writing to uninitialized extent 1209 * will possibly result in updating i_data, so we take 1210 * the write lock of i_data_sem, and call get_blocks() 1211 * with create == 1 flag. 1212 */ 1213 down_write((&EXT4_I(inode)->i_data_sem)); 1214 1215 /* 1216 * if the caller is from delayed allocation writeout path 1217 * we have already reserved fs blocks for allocation 1218 * let the underlying get_block() function know to 1219 * avoid double accounting 1220 */ 1221 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) 1222 EXT4_I(inode)->i_delalloc_reserved_flag = 1; 1223 /* 1224 * We need to check for EXT4 here because migrate 1225 * could have changed the inode type in between 1226 */ 1227 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { 1228 retval = ext4_ext_get_blocks(handle, inode, block, max_blocks, 1229 bh, flags); 1230 } else { 1231 retval = ext4_ind_get_blocks(handle, inode, block, 1232 max_blocks, bh, flags); 1233 1234 if (retval > 0 && buffer_new(bh)) { 1235 /* 1236 * We allocated new blocks which will result in 1237 * i_data's format changing. Force the migrate 1238 * to fail by clearing migrate flags 1239 */ 1240 EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags & 1241 ~EXT4_EXT_MIGRATE; 1242 } 1243 } 1244 1245 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) 1246 EXT4_I(inode)->i_delalloc_reserved_flag = 0; 1247 1248 /* 1249 * Update reserved blocks/metadata blocks after successful 1250 * block allocation which had been deferred till now. 1251 */ 1252 if ((retval > 0) && (flags & EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE)) 1253 ext4_da_update_reserve_space(inode, retval); 1254 1255 up_write((&EXT4_I(inode)->i_data_sem)); 1256 if (retval > 0 && buffer_mapped(bh)) { 1257 int ret = check_block_validity(inode, block, 1258 bh->b_blocknr, retval); 1259 if (ret != 0) 1260 return ret; 1261 } 1262 return retval; 1263 } 1264 1265 /* Maximum number of blocks we map for direct IO at once. 
*/ 1266 #define DIO_MAX_BLOCKS 4096 1267 1268 int ext4_get_block(struct inode *inode, sector_t iblock, 1269 struct buffer_head *bh_result, int create) 1270 { 1271 handle_t *handle = ext4_journal_current_handle(); 1272 int ret = 0, started = 0; 1273 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; 1274 int dio_credits; 1275 1276 if (create && !handle) { 1277 /* Direct IO write... */ 1278 if (max_blocks > DIO_MAX_BLOCKS) 1279 max_blocks = DIO_MAX_BLOCKS; 1280 dio_credits = ext4_chunk_trans_blocks(inode, max_blocks); 1281 handle = ext4_journal_start(inode, dio_credits); 1282 if (IS_ERR(handle)) { 1283 ret = PTR_ERR(handle); 1284 goto out; 1285 } 1286 started = 1; 1287 } 1288 1289 ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result, 1290 create ? EXT4_GET_BLOCKS_CREATE : 0); 1291 if (ret > 0) { 1292 bh_result->b_size = (ret << inode->i_blkbits); 1293 ret = 0; 1294 } 1295 if (started) 1296 ext4_journal_stop(handle); 1297 out: 1298 return ret; 1299 } 1300 1301 /* 1302 * `handle' can be NULL if create is zero 1303 */ 1304 struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, 1305 ext4_lblk_t block, int create, int *errp) 1306 { 1307 struct buffer_head dummy; 1308 int fatal = 0, err; 1309 int flags = 0; 1310 1311 J_ASSERT(handle != NULL || create == 0); 1312 1313 dummy.b_state = 0; 1314 dummy.b_blocknr = -1000; 1315 buffer_trace_init(&dummy.b_history); 1316 if (create) 1317 flags |= EXT4_GET_BLOCKS_CREATE; 1318 err = ext4_get_blocks(handle, inode, block, 1, &dummy, flags); 1319 /* 1320 * ext4_get_blocks() returns number of blocks mapped. 0 in 1321 * case of a HOLE. 
1322 */ 1323 if (err > 0) { 1324 if (err > 1) 1325 WARN_ON(1); 1326 err = 0; 1327 } 1328 *errp = err; 1329 if (!err && buffer_mapped(&dummy)) { 1330 struct buffer_head *bh; 1331 bh = sb_getblk(inode->i_sb, dummy.b_blocknr); 1332 if (!bh) { 1333 *errp = -EIO; 1334 goto err; 1335 } 1336 if (buffer_new(&dummy)) { 1337 J_ASSERT(create != 0); 1338 J_ASSERT(handle != NULL); 1339 1340 /* 1341 * Now that we do not always journal data, we should 1342 * keep in mind whether this should always journal the 1343 * new buffer as metadata. For now, regular file 1344 * writes use ext4_get_block instead, so it's not a 1345 * problem. 1346 */ 1347 lock_buffer(bh); 1348 BUFFER_TRACE(bh, "call get_create_access"); 1349 fatal = ext4_journal_get_create_access(handle, bh); 1350 if (!fatal && !buffer_uptodate(bh)) { 1351 memset(bh->b_data, 0, inode->i_sb->s_blocksize); 1352 set_buffer_uptodate(bh); 1353 } 1354 unlock_buffer(bh); 1355 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 1356 err = ext4_handle_dirty_metadata(handle, inode, bh); 1357 if (!fatal) 1358 fatal = err; 1359 } else { 1360 BUFFER_TRACE(bh, "not a new buffer"); 1361 } 1362 if (fatal) { 1363 *errp = fatal; 1364 brelse(bh); 1365 bh = NULL; 1366 } 1367 return bh; 1368 } 1369 err: 1370 return NULL; 1371 } 1372 1373 struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, 1374 ext4_lblk_t block, int create, int *err) 1375 { 1376 struct buffer_head *bh; 1377 1378 bh = ext4_getblk(handle, inode, block, create, err); 1379 if (!bh) 1380 return bh; 1381 if (buffer_uptodate(bh)) 1382 return bh; 1383 ll_rw_block(READ_META, 1, &bh); 1384 wait_on_buffer(bh); 1385 if (buffer_uptodate(bh)) 1386 return bh; 1387 put_bh(bh); 1388 *err = -EIO; 1389 return NULL; 1390 } 1391 1392 static int walk_page_buffers(handle_t *handle, 1393 struct buffer_head *head, 1394 unsigned from, 1395 unsigned to, 1396 int *partial, 1397 int (*fn)(handle_t *handle, 1398 struct buffer_head *bh)) 1399 { 1400 struct buffer_head *bh; 1401 unsigned 
block_start, block_end; 1402 unsigned blocksize = head->b_size; 1403 int err, ret = 0; 1404 struct buffer_head *next; 1405 1406 for (bh = head, block_start = 0; 1407 ret == 0 && (bh != head || !block_start); 1408 block_start = block_end, bh = next) { 1409 next = bh->b_this_page; 1410 block_end = block_start + blocksize; 1411 if (block_end <= from || block_start >= to) { 1412 if (partial && !buffer_uptodate(bh)) 1413 *partial = 1; 1414 continue; 1415 } 1416 err = (*fn)(handle, bh); 1417 if (!ret) 1418 ret = err; 1419 } 1420 return ret; 1421 } 1422 1423 /* 1424 * To preserve ordering, it is essential that the hole instantiation and 1425 * the data write be encapsulated in a single transaction. We cannot 1426 * close off a transaction and start a new one between the ext4_get_block() 1427 * and the commit_write(). So doing the jbd2_journal_start at the start of 1428 * prepare_write() is the right place. 1429 * 1430 * Also, this function can nest inside ext4_writepage() -> 1431 * block_write_full_page(). In that case, we *know* that ext4_writepage() 1432 * has generated enough buffer credits to do the whole page. So we won't 1433 * block on the journal in that case, which is good, because the caller may 1434 * be PF_MEMALLOC. 1435 * 1436 * By accident, ext4 can be reentered when a transaction is open via 1437 * quota file writes. If we were to commit the transaction while thus 1438 * reentered, there can be a deadlock - we would be holding a quota 1439 * lock, and the commit would never complete if another thread had a 1440 * transaction open and was blocking on the quota lock - a ranking 1441 * violation. 1442 * 1443 * So what we do is to rely on the fact that jbd2_journal_stop/journal_start 1444 * will _not_ run commit under these circumstances because handle->h_ref 1445 * is elevated. We'll still have enough credits for the tiny quotafile 1446 * write. 
1447 */ 1448 static int do_journal_get_write_access(handle_t *handle, 1449 struct buffer_head *bh) 1450 { 1451 if (!buffer_mapped(bh) || buffer_freed(bh)) 1452 return 0; 1453 return ext4_journal_get_write_access(handle, bh); 1454 } 1455 1456 static int ext4_write_begin(struct file *file, struct address_space *mapping, 1457 loff_t pos, unsigned len, unsigned flags, 1458 struct page **pagep, void **fsdata) 1459 { 1460 struct inode *inode = mapping->host; 1461 int ret, needed_blocks; 1462 handle_t *handle; 1463 int retries = 0; 1464 struct page *page; 1465 pgoff_t index; 1466 unsigned from, to; 1467 1468 trace_ext4_write_begin(inode, pos, len, flags); 1469 /* 1470 * Reserve one block more for addition to orphan list in case 1471 * we allocate blocks but write fails for some reason 1472 */ 1473 needed_blocks = ext4_writepage_trans_blocks(inode) + 1; 1474 index = pos >> PAGE_CACHE_SHIFT; 1475 from = pos & (PAGE_CACHE_SIZE - 1); 1476 to = from + len; 1477 1478 retry: 1479 handle = ext4_journal_start(inode, needed_blocks); 1480 if (IS_ERR(handle)) { 1481 ret = PTR_ERR(handle); 1482 goto out; 1483 } 1484 1485 /* We cannot recurse into the filesystem as the transaction is already 1486 * started */ 1487 flags |= AOP_FLAG_NOFS; 1488 1489 page = grab_cache_page_write_begin(mapping, index, flags); 1490 if (!page) { 1491 ext4_journal_stop(handle); 1492 ret = -ENOMEM; 1493 goto out; 1494 } 1495 *pagep = page; 1496 1497 ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 1498 ext4_get_block); 1499 1500 if (!ret && ext4_should_journal_data(inode)) { 1501 ret = walk_page_buffers(handle, page_buffers(page), 1502 from, to, NULL, do_journal_get_write_access); 1503 } 1504 1505 if (ret) { 1506 unlock_page(page); 1507 page_cache_release(page); 1508 /* 1509 * block_write_begin may have instantiated a few blocks 1510 * outside i_size. Trim these off again. Don't need 1511 * i_size_read because we hold i_mutex. 
1512 * 1513 * Add inode to orphan list in case we crash before 1514 * truncate finishes 1515 */ 1516 if (pos + len > inode->i_size) 1517 ext4_orphan_add(handle, inode); 1518 1519 ext4_journal_stop(handle); 1520 if (pos + len > inode->i_size) { 1521 vmtruncate(inode, inode->i_size); 1522 /* 1523 * If vmtruncate failed early the inode might 1524 * still be on the orphan list; we need to 1525 * make sure the inode is removed from the 1526 * orphan list in that case. 1527 */ 1528 if (inode->i_nlink) 1529 ext4_orphan_del(NULL, inode); 1530 } 1531 } 1532 1533 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 1534 goto retry; 1535 out: 1536 return ret; 1537 } 1538 1539 /* For write_end() in data=journal mode */ 1540 static int write_end_fn(handle_t *handle, struct buffer_head *bh) 1541 { 1542 if (!buffer_mapped(bh) || buffer_freed(bh)) 1543 return 0; 1544 set_buffer_uptodate(bh); 1545 return ext4_handle_dirty_metadata(handle, NULL, bh); 1546 } 1547 1548 static int ext4_generic_write_end(struct file *file, 1549 struct address_space *mapping, 1550 loff_t pos, unsigned len, unsigned copied, 1551 struct page *page, void *fsdata) 1552 { 1553 int i_size_changed = 0; 1554 struct inode *inode = mapping->host; 1555 handle_t *handle = ext4_journal_current_handle(); 1556 1557 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); 1558 1559 /* 1560 * No need to use i_size_read() here, the i_size 1561 * cannot change under us because we hold i_mutex. 1562 * 1563 * But it's important to update i_size while still holding page lock: 1564 * page writeout could otherwise come in and zero beyond i_size. 
1565 */ 1566 if (pos + copied > inode->i_size) { 1567 i_size_write(inode, pos + copied); 1568 i_size_changed = 1; 1569 } 1570 1571 if (pos + copied > EXT4_I(inode)->i_disksize) { 1572 /* We need to mark inode dirty even if 1573 * new_i_size is less that inode->i_size 1574 * bu greater than i_disksize.(hint delalloc) 1575 */ 1576 ext4_update_i_disksize(inode, (pos + copied)); 1577 i_size_changed = 1; 1578 } 1579 unlock_page(page); 1580 page_cache_release(page); 1581 1582 /* 1583 * Don't mark the inode dirty under page lock. First, it unnecessarily 1584 * makes the holding time of page lock longer. Second, it forces lock 1585 * ordering of page lock and transaction start for journaling 1586 * filesystems. 1587 */ 1588 if (i_size_changed) 1589 ext4_mark_inode_dirty(handle, inode); 1590 1591 return copied; 1592 } 1593 1594 /* 1595 * We need to pick up the new inode size which generic_commit_write gave us 1596 * `file' can be NULL - eg, when called from page_symlink(). 1597 * 1598 * ext4 never places buffers on inode->i_mapping->private_list. metadata 1599 * buffers are managed internally. 1600 */ 1601 static int ext4_ordered_write_end(struct file *file, 1602 struct address_space *mapping, 1603 loff_t pos, unsigned len, unsigned copied, 1604 struct page *page, void *fsdata) 1605 { 1606 handle_t *handle = ext4_journal_current_handle(); 1607 struct inode *inode = mapping->host; 1608 int ret = 0, ret2; 1609 1610 trace_ext4_ordered_write_end(inode, pos, len, copied); 1611 ret = ext4_jbd2_file_inode(handle, inode); 1612 1613 if (ret == 0) { 1614 ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, 1615 page, fsdata); 1616 copied = ret2; 1617 if (pos + len > inode->i_size) 1618 /* if we have allocated more blocks and copied 1619 * less. We will have blocks allocated outside 1620 * inode->i_size. 
So truncate them 1621 */ 1622 ext4_orphan_add(handle, inode); 1623 if (ret2 < 0) 1624 ret = ret2; 1625 } 1626 ret2 = ext4_journal_stop(handle); 1627 if (!ret) 1628 ret = ret2; 1629 1630 if (pos + len > inode->i_size) { 1631 vmtruncate(inode, inode->i_size); 1632 /* 1633 * If vmtruncate failed early the inode might still be 1634 * on the orphan list; we need to make sure the inode 1635 * is removed from the orphan list in that case. 1636 */ 1637 if (inode->i_nlink) 1638 ext4_orphan_del(NULL, inode); 1639 } 1640 1641 1642 return ret ? ret : copied; 1643 } 1644 1645 static int ext4_writeback_write_end(struct file *file, 1646 struct address_space *mapping, 1647 loff_t pos, unsigned len, unsigned copied, 1648 struct page *page, void *fsdata) 1649 { 1650 handle_t *handle = ext4_journal_current_handle(); 1651 struct inode *inode = mapping->host; 1652 int ret = 0, ret2; 1653 1654 trace_ext4_writeback_write_end(inode, pos, len, copied); 1655 ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, 1656 page, fsdata); 1657 copied = ret2; 1658 if (pos + len > inode->i_size) 1659 /* if we have allocated more blocks and copied 1660 * less. We will have blocks allocated outside 1661 * inode->i_size. So truncate them 1662 */ 1663 ext4_orphan_add(handle, inode); 1664 1665 if (ret2 < 0) 1666 ret = ret2; 1667 1668 ret2 = ext4_journal_stop(handle); 1669 if (!ret) 1670 ret = ret2; 1671 1672 if (pos + len > inode->i_size) { 1673 vmtruncate(inode, inode->i_size); 1674 /* 1675 * If vmtruncate failed early the inode might still be 1676 * on the orphan list; we need to make sure the inode 1677 * is removed from the orphan list in that case. 1678 */ 1679 if (inode->i_nlink) 1680 ext4_orphan_del(NULL, inode); 1681 } 1682 1683 return ret ? 
ret : copied; 1684 } 1685 1686 static int ext4_journalled_write_end(struct file *file, 1687 struct address_space *mapping, 1688 loff_t pos, unsigned len, unsigned copied, 1689 struct page *page, void *fsdata) 1690 { 1691 handle_t *handle = ext4_journal_current_handle(); 1692 struct inode *inode = mapping->host; 1693 int ret = 0, ret2; 1694 int partial = 0; 1695 unsigned from, to; 1696 loff_t new_i_size; 1697 1698 trace_ext4_journalled_write_end(inode, pos, len, copied); 1699 from = pos & (PAGE_CACHE_SIZE - 1); 1700 to = from + len; 1701 1702 if (copied < len) { 1703 if (!PageUptodate(page)) 1704 copied = 0; 1705 page_zero_new_buffers(page, from+copied, to); 1706 } 1707 1708 ret = walk_page_buffers(handle, page_buffers(page), from, 1709 to, &partial, write_end_fn); 1710 if (!partial) 1711 SetPageUptodate(page); 1712 new_i_size = pos + copied; 1713 if (new_i_size > inode->i_size) 1714 i_size_write(inode, pos+copied); 1715 EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; 1716 if (new_i_size > EXT4_I(inode)->i_disksize) { 1717 ext4_update_i_disksize(inode, new_i_size); 1718 ret2 = ext4_mark_inode_dirty(handle, inode); 1719 if (!ret) 1720 ret = ret2; 1721 } 1722 1723 unlock_page(page); 1724 page_cache_release(page); 1725 if (pos + len > inode->i_size) 1726 /* if we have allocated more blocks and copied 1727 * less. We will have blocks allocated outside 1728 * inode->i_size. So truncate them 1729 */ 1730 ext4_orphan_add(handle, inode); 1731 1732 ret2 = ext4_journal_stop(handle); 1733 if (!ret) 1734 ret = ret2; 1735 if (pos + len > inode->i_size) { 1736 vmtruncate(inode, inode->i_size); 1737 /* 1738 * If vmtruncate failed early the inode might still be 1739 * on the orphan list; we need to make sure the inode 1740 * is removed from the orphan list in that case. 1741 */ 1742 if (inode->i_nlink) 1743 ext4_orphan_del(NULL, inode); 1744 } 1745 1746 return ret ? 
ret : copied; 1747 } 1748 1749 static int ext4_da_reserve_space(struct inode *inode, int nrblocks) 1750 { 1751 int retries = 0; 1752 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1753 unsigned long md_needed, mdblocks, total = 0; 1754 1755 /* 1756 * recalculate the amount of metadata blocks to reserve 1757 * in order to allocate nrblocks 1758 * worse case is one extent per block 1759 */ 1760 repeat: 1761 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 1762 total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks; 1763 mdblocks = ext4_calc_metadata_amount(inode, total); 1764 BUG_ON(mdblocks < EXT4_I(inode)->i_reserved_meta_blocks); 1765 1766 md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks; 1767 total = md_needed + nrblocks; 1768 1769 /* 1770 * Make quota reservation here to prevent quota overflow 1771 * later. Real quota accounting is done at pages writeout 1772 * time. 1773 */ 1774 if (vfs_dq_reserve_block(inode, total)) { 1775 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1776 return -EDQUOT; 1777 } 1778 1779 if (ext4_claim_free_blocks(sbi, total)) { 1780 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1781 if (ext4_should_retry_alloc(inode->i_sb, &retries)) { 1782 yield(); 1783 goto repeat; 1784 } 1785 vfs_dq_release_reservation_block(inode, total); 1786 return -ENOSPC; 1787 } 1788 EXT4_I(inode)->i_reserved_data_blocks += nrblocks; 1789 EXT4_I(inode)->i_reserved_meta_blocks = mdblocks; 1790 1791 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1792 return 0; /* success */ 1793 } 1794 1795 static void ext4_da_release_space(struct inode *inode, int to_free) 1796 { 1797 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1798 int total, mdb, mdb_free, release; 1799 1800 if (!to_free) 1801 return; /* Nothing to release, exit */ 1802 1803 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 1804 1805 if (!EXT4_I(inode)->i_reserved_data_blocks) { 1806 /* 1807 * if there is no reserved blocks, but we try to free some 1808 * then 
the counter is messed up somewhere. 1809 * but since this function is called from invalidate 1810 * page, it's harmless to return without any action 1811 */ 1812 printk(KERN_INFO "ext4 delalloc try to release %d reserved " 1813 "blocks for inode %lu, but there is no reserved " 1814 "data blocks\n", to_free, inode->i_ino); 1815 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1816 return; 1817 } 1818 1819 /* recalculate the number of metablocks still need to be reserved */ 1820 total = EXT4_I(inode)->i_reserved_data_blocks - to_free; 1821 mdb = ext4_calc_metadata_amount(inode, total); 1822 1823 /* figure out how many metablocks to release */ 1824 BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); 1825 mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb; 1826 1827 release = to_free + mdb_free; 1828 1829 /* update fs dirty blocks counter for truncate case */ 1830 percpu_counter_sub(&sbi->s_dirtyblocks_counter, release); 1831 1832 /* update per-inode reservations */ 1833 BUG_ON(to_free > EXT4_I(inode)->i_reserved_data_blocks); 1834 EXT4_I(inode)->i_reserved_data_blocks -= to_free; 1835 1836 BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); 1837 EXT4_I(inode)->i_reserved_meta_blocks = mdb; 1838 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1839 1840 vfs_dq_release_reservation_block(inode, release); 1841 } 1842 1843 static void ext4_da_page_release_reservation(struct page *page, 1844 unsigned long offset) 1845 { 1846 int to_release = 0; 1847 struct buffer_head *head, *bh; 1848 unsigned int curr_off = 0; 1849 1850 head = page_buffers(page); 1851 bh = head; 1852 do { 1853 unsigned int next_off = curr_off + bh->b_size; 1854 1855 if ((offset <= curr_off) && (buffer_delay(bh))) { 1856 to_release++; 1857 clear_buffer_delay(bh); 1858 } 1859 curr_off = next_off; 1860 } while ((bh = bh->b_this_page) != head); 1861 ext4_da_release_space(page->mapping->host, to_release); 1862 } 1863 1864 /* 1865 * Delayed allocation stuff 1866 */ 1867 1868 struct 
mpage_da_data { 1869 struct inode *inode; 1870 sector_t b_blocknr; /* start block number of extent */ 1871 size_t b_size; /* size of extent */ 1872 unsigned long b_state; /* state of the extent */ 1873 unsigned long first_page, next_page; /* extent of pages */ 1874 struct writeback_control *wbc; 1875 int io_done; 1876 int pages_written; 1877 int retval; 1878 }; 1879 1880 /* 1881 * mpage_da_submit_io - walks through extent of pages and try to write 1882 * them with writepage() call back 1883 * 1884 * @mpd->inode: inode 1885 * @mpd->first_page: first page of the extent 1886 * @mpd->next_page: page after the last page of the extent 1887 * 1888 * By the time mpage_da_submit_io() is called we expect all blocks 1889 * to be allocated. this may be wrong if allocation failed. 1890 * 1891 * As pages are already locked by write_cache_pages(), we can't use it 1892 */ 1893 static int mpage_da_submit_io(struct mpage_da_data *mpd) 1894 { 1895 long pages_skipped; 1896 struct pagevec pvec; 1897 unsigned long index, end; 1898 int ret = 0, err, nr_pages, i; 1899 struct inode *inode = mpd->inode; 1900 struct address_space *mapping = inode->i_mapping; 1901 1902 BUG_ON(mpd->next_page <= mpd->first_page); 1903 /* 1904 * We need to start from the first_page to the next_page - 1 1905 * to make sure we also write the mapped dirty buffer_heads. 1906 * If we look at mpd->b_blocknr we would only be looking 1907 * at the currently mapped buffer_heads. 
1908 */ 1909 index = mpd->first_page; 1910 end = mpd->next_page - 1; 1911 1912 pagevec_init(&pvec, 0); 1913 while (index <= end) { 1914 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); 1915 if (nr_pages == 0) 1916 break; 1917 for (i = 0; i < nr_pages; i++) { 1918 struct page *page = pvec.pages[i]; 1919 1920 index = page->index; 1921 if (index > end) 1922 break; 1923 index++; 1924 1925 BUG_ON(!PageLocked(page)); 1926 BUG_ON(PageWriteback(page)); 1927 1928 pages_skipped = mpd->wbc->pages_skipped; 1929 err = mapping->a_ops->writepage(page, mpd->wbc); 1930 if (!err && (pages_skipped == mpd->wbc->pages_skipped)) 1931 /* 1932 * have successfully written the page 1933 * without skipping the same 1934 */ 1935 mpd->pages_written++; 1936 /* 1937 * In error case, we have to continue because 1938 * remaining pages are still locked 1939 * XXX: unlock and re-dirty them? 1940 */ 1941 if (ret == 0) 1942 ret = err; 1943 } 1944 pagevec_release(&pvec); 1945 } 1946 return ret; 1947 } 1948 1949 /* 1950 * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers 1951 * 1952 * @mpd->inode - inode to walk through 1953 * @exbh->b_blocknr - first block on a disk 1954 * @exbh->b_size - amount of space in bytes 1955 * @logical - first logical block to start assignment with 1956 * 1957 * the function goes through all passed space and put actual disk 1958 * block numbers into buffer heads, dropping BH_Delay and BH_Unwritten 1959 */ 1960 static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical, 1961 struct buffer_head *exbh) 1962 { 1963 struct inode *inode = mpd->inode; 1964 struct address_space *mapping = inode->i_mapping; 1965 int blocks = exbh->b_size >> inode->i_blkbits; 1966 sector_t pblock = exbh->b_blocknr, cur_logical; 1967 struct buffer_head *head, *bh; 1968 pgoff_t index, end; 1969 struct pagevec pvec; 1970 int nr_pages, i; 1971 1972 index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits); 1973 end = (logical + blocks - 1) >> 
(PAGE_CACHE_SHIFT - inode->i_blkbits); 1974 cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); 1975 1976 pagevec_init(&pvec, 0); 1977 1978 while (index <= end) { 1979 /* XXX: optimize tail */ 1980 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); 1981 if (nr_pages == 0) 1982 break; 1983 for (i = 0; i < nr_pages; i++) { 1984 struct page *page = pvec.pages[i]; 1985 1986 index = page->index; 1987 if (index > end) 1988 break; 1989 index++; 1990 1991 BUG_ON(!PageLocked(page)); 1992 BUG_ON(PageWriteback(page)); 1993 BUG_ON(!page_has_buffers(page)); 1994 1995 bh = page_buffers(page); 1996 head = bh; 1997 1998 /* skip blocks out of the range */ 1999 do { 2000 if (cur_logical >= logical) 2001 break; 2002 cur_logical++; 2003 } while ((bh = bh->b_this_page) != head); 2004 2005 do { 2006 if (cur_logical >= logical + blocks) 2007 break; 2008 2009 if (buffer_delay(bh) || 2010 buffer_unwritten(bh)) { 2011 2012 BUG_ON(bh->b_bdev != inode->i_sb->s_bdev); 2013 2014 if (buffer_delay(bh)) { 2015 clear_buffer_delay(bh); 2016 bh->b_blocknr = pblock; 2017 } else { 2018 /* 2019 * unwritten already should have 2020 * blocknr assigned. 
Verify that 2021 */ 2022 clear_buffer_unwritten(bh); 2023 BUG_ON(bh->b_blocknr != pblock); 2024 } 2025 2026 } else if (buffer_mapped(bh)) 2027 BUG_ON(bh->b_blocknr != pblock); 2028 2029 cur_logical++; 2030 pblock++; 2031 } while ((bh = bh->b_this_page) != head); 2032 } 2033 pagevec_release(&pvec); 2034 } 2035 } 2036 2037 2038 /* 2039 * __unmap_underlying_blocks - just a helper function to unmap 2040 * set of blocks described by @bh 2041 */ 2042 static inline void __unmap_underlying_blocks(struct inode *inode, 2043 struct buffer_head *bh) 2044 { 2045 struct block_device *bdev = inode->i_sb->s_bdev; 2046 int blocks, i; 2047 2048 blocks = bh->b_size >> inode->i_blkbits; 2049 for (i = 0; i < blocks; i++) 2050 unmap_underlying_metadata(bdev, bh->b_blocknr + i); 2051 } 2052 2053 static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd, 2054 sector_t logical, long blk_cnt) 2055 { 2056 int nr_pages, i; 2057 pgoff_t index, end; 2058 struct pagevec pvec; 2059 struct inode *inode = mpd->inode; 2060 struct address_space *mapping = inode->i_mapping; 2061 2062 index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits); 2063 end = (logical + blk_cnt - 1) >> 2064 (PAGE_CACHE_SHIFT - inode->i_blkbits); 2065 while (index <= end) { 2066 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); 2067 if (nr_pages == 0) 2068 break; 2069 for (i = 0; i < nr_pages; i++) { 2070 struct page *page = pvec.pages[i]; 2071 index = page->index; 2072 if (index > end) 2073 break; 2074 index++; 2075 2076 BUG_ON(!PageLocked(page)); 2077 BUG_ON(PageWriteback(page)); 2078 block_invalidatepage(page, 0); 2079 ClearPageUptodate(page); 2080 unlock_page(page); 2081 } 2082 } 2083 return; 2084 } 2085 2086 static void ext4_print_free_blocks(struct inode *inode) 2087 { 2088 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 2089 printk(KERN_EMERG "Total free blocks count %lld\n", 2090 ext4_count_free_blocks(inode->i_sb)); 2091 printk(KERN_EMERG "Free/Dirty block details\n"); 2092 printk(KERN_EMERG 
"free_blocks=%lld\n", 2093 (long long)percpu_counter_sum(&sbi->s_freeblocks_counter)); 2094 printk(KERN_EMERG "dirty_blocks=%lld\n", 2095 (long long)percpu_counter_sum(&sbi->s_dirtyblocks_counter)); 2096 printk(KERN_EMERG "Block reservation details\n"); 2097 printk(KERN_EMERG "i_reserved_data_blocks=%u\n", 2098 EXT4_I(inode)->i_reserved_data_blocks); 2099 printk(KERN_EMERG "i_reserved_meta_blocks=%u\n", 2100 EXT4_I(inode)->i_reserved_meta_blocks); 2101 return; 2102 } 2103 2104 /* 2105 * mpage_da_map_blocks - go through given space 2106 * 2107 * @mpd - bh describing space 2108 * 2109 * The function skips space we know is already mapped to disk blocks. 2110 * 2111 */ 2112 static int mpage_da_map_blocks(struct mpage_da_data *mpd) 2113 { 2114 int err, blks, get_blocks_flags; 2115 struct buffer_head new; 2116 sector_t next = mpd->b_blocknr; 2117 unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits; 2118 loff_t disksize = EXT4_I(mpd->inode)->i_disksize; 2119 handle_t *handle = NULL; 2120 2121 /* 2122 * We consider only non-mapped and non-allocated blocks 2123 */ 2124 if ((mpd->b_state & (1 << BH_Mapped)) && 2125 !(mpd->b_state & (1 << BH_Delay)) && 2126 !(mpd->b_state & (1 << BH_Unwritten))) 2127 return 0; 2128 2129 /* 2130 * If we didn't accumulate anything to write simply return 2131 */ 2132 if (!mpd->b_size) 2133 return 0; 2134 2135 handle = ext4_journal_current_handle(); 2136 BUG_ON(!handle); 2137 2138 /* 2139 * Call ext4_get_blocks() to allocate any delayed allocation 2140 * blocks, or to convert an uninitialized extent to be 2141 * initialized (in the case where we have written into 2142 * one or more preallocated blocks). 2143 * 2144 * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE to 2145 * indicate that we are on the delayed allocation path. This 2146 * affects functions in many different parts of the allocation 2147 * call path. 
This flag exists primarily because we don't 2148 * want to change *many* call functions, so ext4_get_blocks() 2149 * will set the magic i_delalloc_reserved_flag once the 2150 * inode's allocation semaphore is taken. 2151 * 2152 * If the blocks in questions were delalloc blocks, set 2153 * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting 2154 * variables are updated after the blocks have been allocated. 2155 */ 2156 new.b_state = 0; 2157 get_blocks_flags = (EXT4_GET_BLOCKS_CREATE | 2158 EXT4_GET_BLOCKS_DELALLOC_RESERVE); 2159 if (mpd->b_state & (1 << BH_Delay)) 2160 get_blocks_flags |= EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE; 2161 blks = ext4_get_blocks(handle, mpd->inode, next, max_blocks, 2162 &new, get_blocks_flags); 2163 if (blks < 0) { 2164 err = blks; 2165 /* 2166 * If get block returns with error we simply 2167 * return. Later writepage will redirty the page and 2168 * writepages will find the dirty page again 2169 */ 2170 if (err == -EAGAIN) 2171 return 0; 2172 2173 if (err == -ENOSPC && 2174 ext4_count_free_blocks(mpd->inode->i_sb)) { 2175 mpd->retval = err; 2176 return 0; 2177 } 2178 2179 /* 2180 * get block failure will cause us to loop in 2181 * writepages, because a_ops->writepage won't be able 2182 * to make progress. The page will be redirtied by 2183 * writepage and writepages will again try to write 2184 * the same. 2185 */ 2186 printk(KERN_EMERG "%s block allocation failed for inode %lu " 2187 "at logical offset %llu with max blocks " 2188 "%zd with error %d\n", 2189 __func__, mpd->inode->i_ino, 2190 (unsigned long long)next, 2191 mpd->b_size >> mpd->inode->i_blkbits, err); 2192 printk(KERN_EMERG "This should not happen.!! 
" 2193 "Data will be lost\n"); 2194 if (err == -ENOSPC) { 2195 ext4_print_free_blocks(mpd->inode); 2196 } 2197 /* invalidate all the pages */ 2198 ext4_da_block_invalidatepages(mpd, next, 2199 mpd->b_size >> mpd->inode->i_blkbits); 2200 return err; 2201 } 2202 BUG_ON(blks == 0); 2203 2204 new.b_size = (blks << mpd->inode->i_blkbits); 2205 2206 if (buffer_new(&new)) 2207 __unmap_underlying_blocks(mpd->inode, &new); 2208 2209 /* 2210 * If blocks are delayed marked, we need to 2211 * put actual blocknr and drop delayed bit 2212 */ 2213 if ((mpd->b_state & (1 << BH_Delay)) || 2214 (mpd->b_state & (1 << BH_Unwritten))) 2215 mpage_put_bnr_to_bhs(mpd, next, &new); 2216 2217 if (ext4_should_order_data(mpd->inode)) { 2218 err = ext4_jbd2_file_inode(handle, mpd->inode); 2219 if (err) 2220 return err; 2221 } 2222 2223 /* 2224 * Update on-disk size along with block allocation. 2225 */ 2226 disksize = ((loff_t) next + blks) << mpd->inode->i_blkbits; 2227 if (disksize > i_size_read(mpd->inode)) 2228 disksize = i_size_read(mpd->inode); 2229 if (disksize > EXT4_I(mpd->inode)->i_disksize) { 2230 ext4_update_i_disksize(mpd->inode, disksize); 2231 return ext4_mark_inode_dirty(handle, mpd->inode); 2232 } 2233 2234 return 0; 2235 } 2236 2237 #define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \ 2238 (1 << BH_Delay) | (1 << BH_Unwritten)) 2239 2240 /* 2241 * mpage_add_bh_to_extent - try to add one more block to extent of blocks 2242 * 2243 * @mpd->lbh - extent of blocks 2244 * @logical - logical number of the block in the file 2245 * @bh - bh of the block (used to access block's state) 2246 * 2247 * the function is used to collect contig. 
blocks in same state 2248 */ 2249 static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, 2250 sector_t logical, size_t b_size, 2251 unsigned long b_state) 2252 { 2253 sector_t next; 2254 int nrblocks = mpd->b_size >> mpd->inode->i_blkbits; 2255 2256 /* check if thereserved journal credits might overflow */ 2257 if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) { 2258 if (nrblocks >= EXT4_MAX_TRANS_DATA) { 2259 /* 2260 * With non-extent format we are limited by the journal 2261 * credit available. Total credit needed to insert 2262 * nrblocks contiguous blocks is dependent on the 2263 * nrblocks. So limit nrblocks. 2264 */ 2265 goto flush_it; 2266 } else if ((nrblocks + (b_size >> mpd->inode->i_blkbits)) > 2267 EXT4_MAX_TRANS_DATA) { 2268 /* 2269 * Adding the new buffer_head would make it cross the 2270 * allowed limit for which we have journal credit 2271 * reserved. So limit the new bh->b_size 2272 */ 2273 b_size = (EXT4_MAX_TRANS_DATA - nrblocks) << 2274 mpd->inode->i_blkbits; 2275 /* we will do mpage_da_submit_io in the next loop */ 2276 } 2277 } 2278 /* 2279 * First block in the extent 2280 */ 2281 if (mpd->b_size == 0) { 2282 mpd->b_blocknr = logical; 2283 mpd->b_size = b_size; 2284 mpd->b_state = b_state & BH_FLAGS; 2285 return; 2286 } 2287 2288 next = mpd->b_blocknr + nrblocks; 2289 /* 2290 * Can we merge the block to our big extent? 2291 */ 2292 if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) { 2293 mpd->b_size += b_size; 2294 return; 2295 } 2296 2297 flush_it: 2298 /* 2299 * We couldn't merge the block to our extent, so we 2300 * need to flush current extent and start new one 2301 */ 2302 if (mpage_da_map_blocks(mpd) == 0) 2303 mpage_da_submit_io(mpd); 2304 mpd->io_done = 1; 2305 return; 2306 } 2307 2308 static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh) 2309 { 2310 /* 2311 * unmapped buffer is possible for holes. 2312 * delay buffer is possible with delayed allocation. 
2313 * We also need to consider unwritten buffer as unmapped. 2314 */ 2315 return (!buffer_mapped(bh) || buffer_delay(bh) || 2316 buffer_unwritten(bh)) && buffer_dirty(bh); 2317 } 2318 2319 /* 2320 * __mpage_da_writepage - finds extent of pages and blocks 2321 * 2322 * @page: page to consider 2323 * @wbc: not used, we just follow rules 2324 * @data: context 2325 * 2326 * The function finds extents of pages and scan them for all blocks. 2327 */ 2328 static int __mpage_da_writepage(struct page *page, 2329 struct writeback_control *wbc, void *data) 2330 { 2331 struct mpage_da_data *mpd = data; 2332 struct inode *inode = mpd->inode; 2333 struct buffer_head *bh, *head; 2334 sector_t logical; 2335 2336 if (mpd->io_done) { 2337 /* 2338 * Rest of the page in the page_vec 2339 * redirty then and skip then. We will 2340 * try to to write them again after 2341 * starting a new transaction 2342 */ 2343 redirty_page_for_writepage(wbc, page); 2344 unlock_page(page); 2345 return MPAGE_DA_EXTENT_TAIL; 2346 } 2347 /* 2348 * Can we merge this page to current extent? 2349 */ 2350 if (mpd->next_page != page->index) { 2351 /* 2352 * Nope, we can't. So, we map non-allocated blocks 2353 * and start IO on them using writepage() 2354 */ 2355 if (mpd->next_page != mpd->first_page) { 2356 if (mpage_da_map_blocks(mpd) == 0) 2357 mpage_da_submit_io(mpd); 2358 /* 2359 * skip rest of the page in the page_vec 2360 */ 2361 mpd->io_done = 1; 2362 redirty_page_for_writepage(wbc, page); 2363 unlock_page(page); 2364 return MPAGE_DA_EXTENT_TAIL; 2365 } 2366 2367 /* 2368 * Start next extent of pages ... 2369 */ 2370 mpd->first_page = page->index; 2371 2372 /* 2373 * ... 
and blocks 2374 */ 2375 mpd->b_size = 0; 2376 mpd->b_state = 0; 2377 mpd->b_blocknr = 0; 2378 } 2379 2380 mpd->next_page = page->index + 1; 2381 logical = (sector_t) page->index << 2382 (PAGE_CACHE_SHIFT - inode->i_blkbits); 2383 2384 if (!page_has_buffers(page)) { 2385 mpage_add_bh_to_extent(mpd, logical, PAGE_CACHE_SIZE, 2386 (1 << BH_Dirty) | (1 << BH_Uptodate)); 2387 if (mpd->io_done) 2388 return MPAGE_DA_EXTENT_TAIL; 2389 } else { 2390 /* 2391 * Page with regular buffer heads, just add all dirty ones 2392 */ 2393 head = page_buffers(page); 2394 bh = head; 2395 do { 2396 BUG_ON(buffer_locked(bh)); 2397 /* 2398 * We need to try to allocate 2399 * unmapped blocks in the same page. 2400 * Otherwise we won't make progress 2401 * with the page in ext4_da_writepage 2402 */ 2403 if (ext4_bh_unmapped_or_delay(NULL, bh)) { 2404 mpage_add_bh_to_extent(mpd, logical, 2405 bh->b_size, 2406 bh->b_state); 2407 if (mpd->io_done) 2408 return MPAGE_DA_EXTENT_TAIL; 2409 } else if (buffer_dirty(bh) && (buffer_mapped(bh))) { 2410 /* 2411 * mapped dirty buffer. We need to update 2412 * the b_state because we look at 2413 * b_state in mpage_da_map_blocks. We don't 2414 * update b_size because if we find an 2415 * unmapped buffer_head later we need to 2416 * use the b_state flag of that buffer_head. 2417 */ 2418 if (mpd->b_size == 0) 2419 mpd->b_state = bh->b_state & BH_FLAGS; 2420 } 2421 logical++; 2422 } while ((bh = bh->b_this_page) != head); 2423 } 2424 2425 return 0; 2426 } 2427 2428 /* 2429 * This is a special get_blocks_t callback which is used by 2430 * ext4_da_write_begin(). It will either return mapped block or 2431 * reserve space for a single block. 2432 * 2433 * For delayed buffer_head we have BH_Mapped, BH_New, BH_Delay set. 2434 * We also have b_blocknr = -1 and b_bdev initialized properly 2435 * 2436 * For unwritten buffer_head we have BH_Mapped, BH_New, BH_Unwritten set. 
2437 * We also have b_blocknr = physicalblock mapping unwritten extent and b_bdev 2438 * initialized properly. 2439 */ 2440 static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, 2441 struct buffer_head *bh_result, int create) 2442 { 2443 int ret = 0; 2444 sector_t invalid_block = ~((sector_t) 0xffff); 2445 2446 if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es)) 2447 invalid_block = ~0; 2448 2449 BUG_ON(create == 0); 2450 BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize); 2451 2452 /* 2453 * first, we need to know whether the block is allocated already 2454 * preallocated blocks are unmapped but should treated 2455 * the same as allocated blocks. 2456 */ 2457 ret = ext4_get_blocks(NULL, inode, iblock, 1, bh_result, 0); 2458 if ((ret == 0) && !buffer_delay(bh_result)) { 2459 /* the block isn't (pre)allocated yet, let's reserve space */ 2460 /* 2461 * XXX: __block_prepare_write() unmaps passed block, 2462 * is it OK? 2463 */ 2464 ret = ext4_da_reserve_space(inode, 1); 2465 if (ret) 2466 /* not enough space to reserve */ 2467 return ret; 2468 2469 map_bh(bh_result, inode->i_sb, invalid_block); 2470 set_buffer_new(bh_result); 2471 set_buffer_delay(bh_result); 2472 } else if (ret > 0) { 2473 bh_result->b_size = (ret << inode->i_blkbits); 2474 if (buffer_unwritten(bh_result)) { 2475 /* A delayed write to unwritten bh should 2476 * be marked new and mapped. Mapped ensures 2477 * that we don't do get_block multiple times 2478 * when we write to the same offset and new 2479 * ensures that we do proper zero out for 2480 * partial write. 2481 */ 2482 set_buffer_new(bh_result); 2483 set_buffer_mapped(bh_result); 2484 } 2485 ret = 0; 2486 } 2487 2488 return ret; 2489 } 2490 2491 /* 2492 * This function is used as a standard get_block_t calback function 2493 * when there is no desire to allocate any blocks. It is used as a 2494 * callback function for block_prepare_write(), nobh_writepage(), and 2495 * block_write_full_page(). 
These functions should only try to map a 2496 * single block at a time. 2497 * 2498 * Since this function doesn't do block allocations even if the caller 2499 * requests it by passing in create=1, it is critically important that 2500 * any caller checks to make sure that any buffer heads are returned 2501 * by this function are either all already mapped or marked for 2502 * delayed allocation before calling nobh_writepage() or 2503 * block_write_full_page(). Otherwise, b_blocknr could be left 2504 * unitialized, and the page write functions will be taken by 2505 * surprise. 2506 */ 2507 static int noalloc_get_block_write(struct inode *inode, sector_t iblock, 2508 struct buffer_head *bh_result, int create) 2509 { 2510 int ret = 0; 2511 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; 2512 2513 BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize); 2514 2515 /* 2516 * we don't want to do block allocation in writepage 2517 * so call get_block_wrap with create = 0 2518 */ 2519 ret = ext4_get_blocks(NULL, inode, iblock, max_blocks, bh_result, 0); 2520 BUG_ON(create && ret == 0); 2521 if (ret > 0) { 2522 bh_result->b_size = (ret << inode->i_blkbits); 2523 ret = 0; 2524 } 2525 return ret; 2526 } 2527 2528 /* 2529 * This function can get called via... 
2530 * - ext4_da_writepages after taking page lock (have journal handle) 2531 * - journal_submit_inode_data_buffers (no journal handle) 2532 * - shrink_page_list via pdflush (no journal handle) 2533 * - grab_page_cache when doing write_begin (have journal handle) 2534 */ 2535 static int ext4_da_writepage(struct page *page, 2536 struct writeback_control *wbc) 2537 { 2538 int ret = 0; 2539 loff_t size; 2540 unsigned int len; 2541 struct buffer_head *page_bufs; 2542 struct inode *inode = page->mapping->host; 2543 2544 trace_ext4_da_writepage(inode, page); 2545 size = i_size_read(inode); 2546 if (page->index == size >> PAGE_CACHE_SHIFT) 2547 len = size & ~PAGE_CACHE_MASK; 2548 else 2549 len = PAGE_CACHE_SIZE; 2550 2551 if (page_has_buffers(page)) { 2552 page_bufs = page_buffers(page); 2553 if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, 2554 ext4_bh_unmapped_or_delay)) { 2555 /* 2556 * We don't want to do block allocation 2557 * So redirty the page and return 2558 * We may reach here when we do a journal commit 2559 * via journal_submit_inode_data_buffers. 2560 * If we don't have mapping block we just ignore 2561 * them. We can also reach here via shrink_page_list 2562 */ 2563 redirty_page_for_writepage(wbc, page); 2564 unlock_page(page); 2565 return 0; 2566 } 2567 } else { 2568 /* 2569 * The test for page_has_buffers() is subtle: 2570 * We know the page is dirty but it lost buffers. That means 2571 * that at some moment in time after write_begin()/write_end() 2572 * has been called all buffers have been clean and thus they 2573 * must have been written at least once. So they are all 2574 * mapped and we can happily proceed with mapping them 2575 * and writing the page. 2576 * 2577 * Try to initialize the buffer_heads and check whether 2578 * all are mapped and non delay. We don't want to 2579 * do block allocation here. 
2580 */ 2581 ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, 2582 noalloc_get_block_write); 2583 if (!ret) { 2584 page_bufs = page_buffers(page); 2585 /* check whether all are mapped and non delay */ 2586 if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, 2587 ext4_bh_unmapped_or_delay)) { 2588 redirty_page_for_writepage(wbc, page); 2589 unlock_page(page); 2590 return 0; 2591 } 2592 } else { 2593 /* 2594 * We can't do block allocation here 2595 * so just redity the page and unlock 2596 * and return 2597 */ 2598 redirty_page_for_writepage(wbc, page); 2599 unlock_page(page); 2600 return 0; 2601 } 2602 /* now mark the buffer_heads as dirty and uptodate */ 2603 block_commit_write(page, 0, PAGE_CACHE_SIZE); 2604 } 2605 2606 if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) 2607 ret = nobh_writepage(page, noalloc_get_block_write, wbc); 2608 else 2609 ret = block_write_full_page(page, noalloc_get_block_write, 2610 wbc); 2611 2612 return ret; 2613 } 2614 2615 /* 2616 * This is called via ext4_da_writepages() to 2617 * calulate the total number of credits to reserve to fit 2618 * a single extent allocation into a single transaction, 2619 * ext4_da_writpeages() will loop calling this before 2620 * the block allocation. 2621 */ 2622 2623 static int ext4_da_writepages_trans_blocks(struct inode *inode) 2624 { 2625 int max_blocks = EXT4_I(inode)->i_reserved_data_blocks; 2626 2627 /* 2628 * With non-extent format the journal credit needed to 2629 * insert nrblocks contiguous block is dependent on 2630 * number of contiguous block. 
So we will limit 2631 * number of contiguous block to a sane value 2632 */ 2633 if (!(inode->i_flags & EXT4_EXTENTS_FL) && 2634 (max_blocks > EXT4_MAX_TRANS_DATA)) 2635 max_blocks = EXT4_MAX_TRANS_DATA; 2636 2637 return ext4_chunk_trans_blocks(inode, max_blocks); 2638 } 2639 2640 static int ext4_da_writepages(struct address_space *mapping, 2641 struct writeback_control *wbc) 2642 { 2643 pgoff_t index; 2644 int range_whole = 0; 2645 handle_t *handle = NULL; 2646 struct mpage_da_data mpd; 2647 struct inode *inode = mapping->host; 2648 int no_nrwrite_index_update; 2649 int pages_written = 0; 2650 long pages_skipped; 2651 int range_cyclic, cycled = 1, io_done = 0; 2652 int needed_blocks, ret = 0, nr_to_writebump = 0; 2653 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); 2654 2655 trace_ext4_da_writepages(inode, wbc); 2656 2657 /* 2658 * No pages to write? This is mainly a kludge to avoid starting 2659 * a transaction for special inodes like journal inode on last iput() 2660 * because that could violate lock ordering on umount 2661 */ 2662 if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) 2663 return 0; 2664 2665 /* 2666 * If the filesystem has aborted, it is read-only, so return 2667 * right away instead of dumping stack traces later on that 2668 * will obscure the real source of the problem. We test 2669 * EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because 2670 * the latter could be true if the filesystem is mounted 2671 * read-only, and in that case, ext4_da_writepages should 2672 * *never* be called, so if that ever happens, we would want 2673 * the stack trace. 2674 */ 2675 if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) 2676 return -EROFS; 2677 2678 /* 2679 * Make sure nr_to_write is >= sbi->s_mb_stream_request 2680 * This make sure small files blocks are allocated in 2681 * single attempt. This ensure that small files 2682 * get less fragmented. 
2683 */ 2684 if (wbc->nr_to_write < sbi->s_mb_stream_request) { 2685 nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write; 2686 wbc->nr_to_write = sbi->s_mb_stream_request; 2687 } 2688 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) 2689 range_whole = 1; 2690 2691 range_cyclic = wbc->range_cyclic; 2692 if (wbc->range_cyclic) { 2693 index = mapping->writeback_index; 2694 if (index) 2695 cycled = 0; 2696 wbc->range_start = index << PAGE_CACHE_SHIFT; 2697 wbc->range_end = LLONG_MAX; 2698 wbc->range_cyclic = 0; 2699 } else 2700 index = wbc->range_start >> PAGE_CACHE_SHIFT; 2701 2702 mpd.wbc = wbc; 2703 mpd.inode = mapping->host; 2704 2705 /* 2706 * we don't want write_cache_pages to update 2707 * nr_to_write and writeback_index 2708 */ 2709 no_nrwrite_index_update = wbc->no_nrwrite_index_update; 2710 wbc->no_nrwrite_index_update = 1; 2711 pages_skipped = wbc->pages_skipped; 2712 2713 retry: 2714 while (!ret && wbc->nr_to_write > 0) { 2715 2716 /* 2717 * we insert one extent at a time. So we need 2718 * credit needed for single extent allocation. 2719 * journalled mode is currently not supported 2720 * by delalloc 2721 */ 2722 BUG_ON(ext4_should_journal_data(inode)); 2723 needed_blocks = ext4_da_writepages_trans_blocks(inode); 2724 2725 /* start a new transaction*/ 2726 handle = ext4_journal_start(inode, needed_blocks); 2727 if (IS_ERR(handle)) { 2728 ret = PTR_ERR(handle); 2729 printk(KERN_CRIT "%s: jbd2_start: " 2730 "%ld pages, ino %lu; err %d\n", __func__, 2731 wbc->nr_to_write, inode->i_ino, ret); 2732 dump_stack(); 2733 goto out_writepages; 2734 } 2735 2736 /* 2737 * Now call __mpage_da_writepage to find the next 2738 * contiguous region of logical blocks that need 2739 * blocks to be allocated by ext4. We don't actually 2740 * submit the blocks for I/O here, even though 2741 * write_cache_pages thinks it will, and will set the 2742 * pages as clean for write before calling 2743 * __mpage_da_writepage(). 
2744 */ 2745 mpd.b_size = 0; 2746 mpd.b_state = 0; 2747 mpd.b_blocknr = 0; 2748 mpd.first_page = 0; 2749 mpd.next_page = 0; 2750 mpd.io_done = 0; 2751 mpd.pages_written = 0; 2752 mpd.retval = 0; 2753 ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, 2754 &mpd); 2755 /* 2756 * If we have a contigous extent of pages and we 2757 * haven't done the I/O yet, map the blocks and submit 2758 * them for I/O. 2759 */ 2760 if (!mpd.io_done && mpd.next_page != mpd.first_page) { 2761 if (mpage_da_map_blocks(&mpd) == 0) 2762 mpage_da_submit_io(&mpd); 2763 mpd.io_done = 1; 2764 ret = MPAGE_DA_EXTENT_TAIL; 2765 } 2766 wbc->nr_to_write -= mpd.pages_written; 2767 2768 ext4_journal_stop(handle); 2769 2770 if ((mpd.retval == -ENOSPC) && sbi->s_journal) { 2771 /* commit the transaction which would 2772 * free blocks released in the transaction 2773 * and try again 2774 */ 2775 jbd2_journal_force_commit_nested(sbi->s_journal); 2776 wbc->pages_skipped = pages_skipped; 2777 ret = 0; 2778 } else if (ret == MPAGE_DA_EXTENT_TAIL) { 2779 /* 2780 * got one extent now try with 2781 * rest of the pages 2782 */ 2783 pages_written += mpd.pages_written; 2784 wbc->pages_skipped = pages_skipped; 2785 ret = 0; 2786 io_done = 1; 2787 } else if (wbc->nr_to_write) 2788 /* 2789 * There is no more writeout needed 2790 * or we requested for a noblocking writeout 2791 * and we found the device congested 2792 */ 2793 break; 2794 } 2795 if (!io_done && !cycled) { 2796 cycled = 1; 2797 index = 0; 2798 wbc->range_start = index << PAGE_CACHE_SHIFT; 2799 wbc->range_end = mapping->writeback_index - 1; 2800 goto retry; 2801 } 2802 if (pages_skipped != wbc->pages_skipped) 2803 printk(KERN_EMERG "This should not happen leaving %s " 2804 "with nr_to_write = %ld ret = %d\n", 2805 __func__, wbc->nr_to_write, ret); 2806 2807 /* Update index */ 2808 index += pages_written; 2809 wbc->range_cyclic = range_cyclic; 2810 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) 2811 /* 2812 * set the 
writeback_index so that range_cyclic 2813 * mode will write it back later 2814 */ 2815 mapping->writeback_index = index; 2816 2817 out_writepages: 2818 if (!no_nrwrite_index_update) 2819 wbc->no_nrwrite_index_update = 0; 2820 wbc->nr_to_write -= nr_to_writebump; 2821 trace_ext4_da_writepages_result(inode, wbc, ret, pages_written); 2822 return ret; 2823 } 2824 2825 #define FALL_BACK_TO_NONDELALLOC 1 2826 static int ext4_nonda_switch(struct super_block *sb) 2827 { 2828 s64 free_blocks, dirty_blocks; 2829 struct ext4_sb_info *sbi = EXT4_SB(sb); 2830 2831 /* 2832 * switch to non delalloc mode if we are running low 2833 * on free block. The free block accounting via percpu 2834 * counters can get slightly wrong with percpu_counter_batch getting 2835 * accumulated on each CPU without updating global counters 2836 * Delalloc need an accurate free block accounting. So switch 2837 * to non delalloc when we are near to error range. 2838 */ 2839 free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); 2840 dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyblocks_counter); 2841 if (2 * free_blocks < 3 * dirty_blocks || 2842 free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) { 2843 /* 2844 * free block count is less that 150% of dirty blocks 2845 * or free blocks is less that watermark 2846 */ 2847 return 1; 2848 } 2849 return 0; 2850 } 2851 2852 static int ext4_da_write_begin(struct file *file, struct address_space *mapping, 2853 loff_t pos, unsigned len, unsigned flags, 2854 struct page **pagep, void **fsdata) 2855 { 2856 int ret, retries = 0; 2857 struct page *page; 2858 pgoff_t index; 2859 unsigned from, to; 2860 struct inode *inode = mapping->host; 2861 handle_t *handle; 2862 2863 index = pos >> PAGE_CACHE_SHIFT; 2864 from = pos & (PAGE_CACHE_SIZE - 1); 2865 to = from + len; 2866 2867 if (ext4_nonda_switch(inode->i_sb)) { 2868 *fsdata = (void *)FALL_BACK_TO_NONDELALLOC; 2869 return ext4_write_begin(file, mapping, pos, 2870 len, flags, pagep, 
fsdata); 2871 } 2872 *fsdata = (void *)0; 2873 trace_ext4_da_write_begin(inode, pos, len, flags); 2874 retry: 2875 /* 2876 * With delayed allocation, we don't log the i_disksize update 2877 * if there is delayed block allocation. But we still need 2878 * to journalling the i_disksize update if writes to the end 2879 * of file which has an already mapped buffer. 2880 */ 2881 handle = ext4_journal_start(inode, 1); 2882 if (IS_ERR(handle)) { 2883 ret = PTR_ERR(handle); 2884 goto out; 2885 } 2886 /* We cannot recurse into the filesystem as the transaction is already 2887 * started */ 2888 flags |= AOP_FLAG_NOFS; 2889 2890 page = grab_cache_page_write_begin(mapping, index, flags); 2891 if (!page) { 2892 ext4_journal_stop(handle); 2893 ret = -ENOMEM; 2894 goto out; 2895 } 2896 *pagep = page; 2897 2898 ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 2899 ext4_da_get_block_prep); 2900 if (ret < 0) { 2901 unlock_page(page); 2902 ext4_journal_stop(handle); 2903 page_cache_release(page); 2904 /* 2905 * block_write_begin may have instantiated a few blocks 2906 * outside i_size. Trim these off again. Don't need 2907 * i_size_read because we hold i_mutex. 
2908 */ 2909 if (pos + len > inode->i_size) 2910 vmtruncate(inode, inode->i_size); 2911 } 2912 2913 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 2914 goto retry; 2915 out: 2916 return ret; 2917 } 2918 2919 /* 2920 * Check if we should update i_disksize 2921 * when write to the end of file but not require block allocation 2922 */ 2923 static int ext4_da_should_update_i_disksize(struct page *page, 2924 unsigned long offset) 2925 { 2926 struct buffer_head *bh; 2927 struct inode *inode = page->mapping->host; 2928 unsigned int idx; 2929 int i; 2930 2931 bh = page_buffers(page); 2932 idx = offset >> inode->i_blkbits; 2933 2934 for (i = 0; i < idx; i++) 2935 bh = bh->b_this_page; 2936 2937 if (!buffer_mapped(bh) || (buffer_delay(bh)) || buffer_unwritten(bh)) 2938 return 0; 2939 return 1; 2940 } 2941 2942 static int ext4_da_write_end(struct file *file, 2943 struct address_space *mapping, 2944 loff_t pos, unsigned len, unsigned copied, 2945 struct page *page, void *fsdata) 2946 { 2947 struct inode *inode = mapping->host; 2948 int ret = 0, ret2; 2949 handle_t *handle = ext4_journal_current_handle(); 2950 loff_t new_i_size; 2951 unsigned long start, end; 2952 int write_mode = (int)(unsigned long)fsdata; 2953 2954 if (write_mode == FALL_BACK_TO_NONDELALLOC) { 2955 if (ext4_should_order_data(inode)) { 2956 return ext4_ordered_write_end(file, mapping, pos, 2957 len, copied, page, fsdata); 2958 } else if (ext4_should_writeback_data(inode)) { 2959 return ext4_writeback_write_end(file, mapping, pos, 2960 len, copied, page, fsdata); 2961 } else { 2962 BUG(); 2963 } 2964 } 2965 2966 trace_ext4_da_write_end(inode, pos, len, copied); 2967 start = pos & (PAGE_CACHE_SIZE - 1); 2968 end = start + copied - 1; 2969 2970 /* 2971 * generic_write_end() will run mark_inode_dirty() if i_size 2972 * changes. So let's piggyback the i_disksize mark_inode_dirty 2973 * into that. 
2974 */ 2975 2976 new_i_size = pos + copied; 2977 if (new_i_size > EXT4_I(inode)->i_disksize) { 2978 if (ext4_da_should_update_i_disksize(page, end)) { 2979 down_write(&EXT4_I(inode)->i_data_sem); 2980 if (new_i_size > EXT4_I(inode)->i_disksize) { 2981 /* 2982 * Updating i_disksize when extending file 2983 * without needing block allocation 2984 */ 2985 if (ext4_should_order_data(inode)) 2986 ret = ext4_jbd2_file_inode(handle, 2987 inode); 2988 2989 EXT4_I(inode)->i_disksize = new_i_size; 2990 } 2991 up_write(&EXT4_I(inode)->i_data_sem); 2992 /* We need to mark inode dirty even if 2993 * new_i_size is less that inode->i_size 2994 * bu greater than i_disksize.(hint delalloc) 2995 */ 2996 ext4_mark_inode_dirty(handle, inode); 2997 } 2998 } 2999 ret2 = generic_write_end(file, mapping, pos, len, copied, 3000 page, fsdata); 3001 copied = ret2; 3002 if (ret2 < 0) 3003 ret = ret2; 3004 ret2 = ext4_journal_stop(handle); 3005 if (!ret) 3006 ret = ret2; 3007 3008 return ret ? ret : copied; 3009 } 3010 3011 static void ext4_da_invalidatepage(struct page *page, unsigned long offset) 3012 { 3013 /* 3014 * Drop reserved blocks 3015 */ 3016 BUG_ON(!PageLocked(page)); 3017 if (!page_has_buffers(page)) 3018 goto out; 3019 3020 ext4_da_page_release_reservation(page, offset); 3021 3022 out: 3023 ext4_invalidatepage(page, offset); 3024 3025 return; 3026 } 3027 3028 /* 3029 * Force all delayed allocation blocks to be allocated for a given inode. 3030 */ 3031 int ext4_alloc_da_blocks(struct inode *inode) 3032 { 3033 if (!EXT4_I(inode)->i_reserved_data_blocks && 3034 !EXT4_I(inode)->i_reserved_meta_blocks) 3035 return 0; 3036 3037 /* 3038 * We do something simple for now. The filemap_flush() will 3039 * also start triggering a write of the data blocks, which is 3040 * not strictly speaking necessary (and for users of 3041 * laptop_mode, not even desirable). 
However, to do otherwise 3042 * would require replicating code paths in: 3043 * 3044 * ext4_da_writepages() -> 3045 * write_cache_pages() ---> (via passed in callback function) 3046 * __mpage_da_writepage() --> 3047 * mpage_add_bh_to_extent() 3048 * mpage_da_map_blocks() 3049 * 3050 * The problem is that write_cache_pages(), located in 3051 * mm/page-writeback.c, marks pages clean in preparation for 3052 * doing I/O, which is not desirable if we're not planning on 3053 * doing I/O at all. 3054 * 3055 * We could call write_cache_pages(), and then redirty all of 3056 * the pages by calling redirty_page_for_writeback() but that 3057 * would be ugly in the extreme. So instead we would need to 3058 * replicate parts of the code in the above functions, 3059 * simplifying them becuase we wouldn't actually intend to 3060 * write out the pages, but rather only collect contiguous 3061 * logical block extents, call the multi-block allocator, and 3062 * then update the buffer heads with the block allocations. 3063 * 3064 * For now, though, we'll cheat by calling filemap_flush(), 3065 * which will map the blocks, and start the I/O, but not 3066 * actually wait for the I/O to complete. 3067 */ 3068 return filemap_flush(inode->i_mapping); 3069 } 3070 3071 /* 3072 * bmap() is special. It gets used by applications such as lilo and by 3073 * the swapper to find the on-disk block of a specific piece of data. 3074 * 3075 * Naturally, this is dangerous if the block concerned is still in the 3076 * journal. If somebody makes a swapfile on an ext4 data-journaling 3077 * filesystem and enables swap, then they may get a nasty shock when the 3078 * data getting swapped to that swapfile suddenly gets overwritten by 3079 * the original zero's written out previously to the journal and 3080 * awaiting writeback in the kernel's buffer cache. 3081 * 3082 * So, if we see any bmap calls here on a modified, data-journaled file, 3083 * take extra steps to flush any blocks which might be in the cache. 
3084 */ 3085 static sector_t ext4_bmap(struct address_space *mapping, sector_t block) 3086 { 3087 struct inode *inode = mapping->host; 3088 journal_t *journal; 3089 int err; 3090 3091 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && 3092 test_opt(inode->i_sb, DELALLOC)) { 3093 /* 3094 * With delalloc we want to sync the file 3095 * so that we can make sure we allocate 3096 * blocks for file 3097 */ 3098 filemap_write_and_wait(mapping); 3099 } 3100 3101 if (EXT4_JOURNAL(inode) && EXT4_I(inode)->i_state & EXT4_STATE_JDATA) { 3102 /* 3103 * This is a REALLY heavyweight approach, but the use of 3104 * bmap on dirty files is expected to be extremely rare: 3105 * only if we run lilo or swapon on a freshly made file 3106 * do we expect this to happen. 3107 * 3108 * (bmap requires CAP_SYS_RAWIO so this does not 3109 * represent an unprivileged user DOS attack --- we'd be 3110 * in trouble if mortal users could trigger this path at 3111 * will.) 3112 * 3113 * NB. EXT4_STATE_JDATA is not set on files other than 3114 * regular files. If somebody wants to bmap a directory 3115 * or symlink and gets confused because the buffer 3116 * hasn't yet been flushed to disk, they deserve 3117 * everything they get. 3118 */ 3119 3120 EXT4_I(inode)->i_state &= ~EXT4_STATE_JDATA; 3121 journal = EXT4_JOURNAL(inode); 3122 jbd2_journal_lock_updates(journal); 3123 err = jbd2_journal_flush(journal); 3124 jbd2_journal_unlock_updates(journal); 3125 3126 if (err) 3127 return 0; 3128 } 3129 3130 return generic_block_bmap(mapping, block, ext4_get_block); 3131 } 3132 3133 static int bget_one(handle_t *handle, struct buffer_head *bh) 3134 { 3135 get_bh(bh); 3136 return 0; 3137 } 3138 3139 static int bput_one(handle_t *handle, struct buffer_head *bh) 3140 { 3141 put_bh(bh); 3142 return 0; 3143 } 3144 3145 /* 3146 * Note that we don't need to start a transaction unless we're journaling data 3147 * because we should have holes filled from ext4_page_mkwrite(). 
 * We don't even need to file the inode to the transaction's list in ordered
 * mode because if we are writing back data added by write(), the inode is
 * already there and if we are writing back data modified via mmap(), no one
 * guarantees in which transaction the data will hit the disk. In case we are
 * journaling data, we cannot start a transaction directly because transaction
 * start ranks above page lock so we have to do some magic.
 *
 * In all journaling modes block_write_full_page() will start the I/O.
 *
 * Problem:
 *
 *	ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
 *		ext4_writepage()
 *
 * Similar for:
 *
 *	ext4_file_write() -> generic_file_write() -> __alloc_pages() -> ...
 *
 * Same applies to ext4_get_block().  We will deadlock on various things like
 * lock_journal and i_data_sem
 *
 * Setting PF_MEMALLOC here doesn't work - too many internal memory
 * allocations fail.
 *
 * 16May01: If we're reentered then journal_current_handle() will be
 *	    non-zero. We simply *return*.
 *
 * 1 July 2001: @@@ FIXME:
 *   In journalled data mode, a data buffer may be metadata against the
 *   current transaction.  But the same file is part of a shared mapping
 *   and someone does a writepage() on it.
 *
 *   We will move the buffer onto the async_data list, but *after* it has
 *   been dirtied. So there's a small window where we have dirty data on
 *   BJ_Metadata.
 *
 *   Note that this only applies to the last partial page in the file.  The
 *   bit which block_write_full_page() uses prepare/commit for.  (That's
 *   broken code anyway: it's wrong for msync()).
 *
 *   It's a rare case: affects the final partial page, for journalled data
 *   where the file is subject to both write() and writepage() in the same
 *   transaction.  To fix it we'll need a custom block_write_full_page().
3191 * We'll probably need that anyway for journalling writepage() output. 3192 * 3193 * We don't honour synchronous mounts for writepage(). That would be 3194 * disastrous. Any write() or metadata operation will sync the fs for 3195 * us. 3196 * 3197 */ 3198 static int __ext4_normal_writepage(struct page *page, 3199 struct writeback_control *wbc) 3200 { 3201 struct inode *inode = page->mapping->host; 3202 3203 if (test_opt(inode->i_sb, NOBH)) 3204 return nobh_writepage(page, noalloc_get_block_write, wbc); 3205 else 3206 return block_write_full_page(page, noalloc_get_block_write, 3207 wbc); 3208 } 3209 3210 static int ext4_normal_writepage(struct page *page, 3211 struct writeback_control *wbc) 3212 { 3213 struct inode *inode = page->mapping->host; 3214 loff_t size = i_size_read(inode); 3215 loff_t len; 3216 3217 trace_ext4_normal_writepage(inode, page); 3218 J_ASSERT(PageLocked(page)); 3219 if (page->index == size >> PAGE_CACHE_SHIFT) 3220 len = size & ~PAGE_CACHE_MASK; 3221 else 3222 len = PAGE_CACHE_SIZE; 3223 3224 if (page_has_buffers(page)) { 3225 /* if page has buffers it should all be mapped 3226 * and allocated. If there are not buffers attached 3227 * to the page we know the page is dirty but it lost 3228 * buffers. That means that at some moment in time 3229 * after write_begin() / write_end() has been called 3230 * all buffers have been clean and thus they must have been 3231 * written at least once. So they are all mapped and we can 3232 * happily proceed with mapping them and writing the page. 
3233 */ 3234 BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, 3235 ext4_bh_unmapped_or_delay)); 3236 } 3237 3238 if (!ext4_journal_current_handle()) 3239 return __ext4_normal_writepage(page, wbc); 3240 3241 redirty_page_for_writepage(wbc, page); 3242 unlock_page(page); 3243 return 0; 3244 } 3245 3246 static int __ext4_journalled_writepage(struct page *page, 3247 struct writeback_control *wbc) 3248 { 3249 struct address_space *mapping = page->mapping; 3250 struct inode *inode = mapping->host; 3251 struct buffer_head *page_bufs; 3252 handle_t *handle = NULL; 3253 int ret = 0; 3254 int err; 3255 3256 ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, 3257 noalloc_get_block_write); 3258 if (ret != 0) 3259 goto out_unlock; 3260 3261 page_bufs = page_buffers(page); 3262 walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL, 3263 bget_one); 3264 /* As soon as we unlock the page, it can go away, but we have 3265 * references to buffers so we are safe */ 3266 unlock_page(page); 3267 3268 handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); 3269 if (IS_ERR(handle)) { 3270 ret = PTR_ERR(handle); 3271 goto out; 3272 } 3273 3274 ret = walk_page_buffers(handle, page_bufs, 0, 3275 PAGE_CACHE_SIZE, NULL, do_journal_get_write_access); 3276 3277 err = walk_page_buffers(handle, page_bufs, 0, 3278 PAGE_CACHE_SIZE, NULL, write_end_fn); 3279 if (ret == 0) 3280 ret = err; 3281 err = ext4_journal_stop(handle); 3282 if (!ret) 3283 ret = err; 3284 3285 walk_page_buffers(handle, page_bufs, 0, 3286 PAGE_CACHE_SIZE, NULL, bput_one); 3287 EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; 3288 goto out; 3289 3290 out_unlock: 3291 unlock_page(page); 3292 out: 3293 return ret; 3294 } 3295 3296 static int ext4_journalled_writepage(struct page *page, 3297 struct writeback_control *wbc) 3298 { 3299 struct inode *inode = page->mapping->host; 3300 loff_t size = i_size_read(inode); 3301 loff_t len; 3302 3303 trace_ext4_journalled_writepage(inode, page); 3304 
J_ASSERT(PageLocked(page)); 3305 if (page->index == size >> PAGE_CACHE_SHIFT) 3306 len = size & ~PAGE_CACHE_MASK; 3307 else 3308 len = PAGE_CACHE_SIZE; 3309 3310 if (page_has_buffers(page)) { 3311 /* if page has buffers it should all be mapped 3312 * and allocated. If there are not buffers attached 3313 * to the page we know the page is dirty but it lost 3314 * buffers. That means that at some moment in time 3315 * after write_begin() / write_end() has been called 3316 * all buffers have been clean and thus they must have been 3317 * written at least once. So they are all mapped and we can 3318 * happily proceed with mapping them and writing the page. 3319 */ 3320 BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, 3321 ext4_bh_unmapped_or_delay)); 3322 } 3323 3324 if (ext4_journal_current_handle()) 3325 goto no_write; 3326 3327 if (PageChecked(page)) { 3328 /* 3329 * It's mmapped pagecache. Add buffers and journal it. There 3330 * doesn't seem much point in redirtying the page here. 3331 */ 3332 ClearPageChecked(page); 3333 return __ext4_journalled_writepage(page, wbc); 3334 } else { 3335 /* 3336 * It may be a page full of checkpoint-mode buffers. We don't 3337 * really know unless we go poke around in the buffer_heads. 3338 * But block_write_full_page will do the right thing. 
3339 */ 3340 return block_write_full_page(page, noalloc_get_block_write, 3341 wbc); 3342 } 3343 no_write: 3344 redirty_page_for_writepage(wbc, page); 3345 unlock_page(page); 3346 return 0; 3347 } 3348 3349 static int ext4_readpage(struct file *file, struct page *page) 3350 { 3351 return mpage_readpage(page, ext4_get_block); 3352 } 3353 3354 static int 3355 ext4_readpages(struct file *file, struct address_space *mapping, 3356 struct list_head *pages, unsigned nr_pages) 3357 { 3358 return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); 3359 } 3360 3361 static void ext4_invalidatepage(struct page *page, unsigned long offset) 3362 { 3363 journal_t *journal = EXT4_JOURNAL(page->mapping->host); 3364 3365 /* 3366 * If it's a full truncate we just forget about the pending dirtying 3367 */ 3368 if (offset == 0) 3369 ClearPageChecked(page); 3370 3371 if (journal) 3372 jbd2_journal_invalidatepage(journal, page, offset); 3373 else 3374 block_invalidatepage(page, offset); 3375 } 3376 3377 static int ext4_releasepage(struct page *page, gfp_t wait) 3378 { 3379 journal_t *journal = EXT4_JOURNAL(page->mapping->host); 3380 3381 WARN_ON(PageChecked(page)); 3382 if (!page_has_buffers(page)) 3383 return 0; 3384 if (journal) 3385 return jbd2_journal_try_to_free_buffers(journal, page, wait); 3386 else 3387 return try_to_free_buffers(page); 3388 } 3389 3390 /* 3391 * If the O_DIRECT write will extend the file then add this inode to the 3392 * orphan list. So recovery will truncate it back to the original size 3393 * if the machine crashes during the write. 3394 * 3395 * If the O_DIRECT write is intantiating holes inside i_size and the machine 3396 * crashes then stale disk data _may_ be exposed inside the file. But current 3397 * VFS code falls back into buffered path in that case so we are safe. 
 */
static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
			      const struct iovec *iov, loff_t offset,
			      unsigned long nr_segs)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file->f_mapping->host;
	struct ext4_inode_info *ei = EXT4_I(inode);
	handle_t *handle;
	ssize_t ret;
	int orphan = 0;		/* set once the inode is on the orphan list */
	size_t count = iov_length(iov, nr_segs);

	if (rw == WRITE) {
		loff_t final_size = offset + count;

		/* Extending write: protect it with an orphan list entry. */
		if (final_size > inode->i_size) {
			/* Credits for sb + inode write */
			handle = ext4_journal_start(inode, 2);
			if (IS_ERR(handle)) {
				ret = PTR_ERR(handle);
				goto out;
			}
			ret = ext4_orphan_add(handle, inode);
			if (ret) {
				ext4_journal_stop(handle);
				goto out;
			}
			orphan = 1;
			ei->i_disksize = inode->i_size;
			ext4_journal_stop(handle);
		}
	}

	ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
				 offset, nr_segs,
				 ext4_get_block, NULL);

	if (orphan) {
		int err;

		/* Credits for sb + inode write */
		handle = ext4_journal_start(inode, 2);
		if (IS_ERR(handle)) {
			/* This is really bad luck. We've written the data
			 * but cannot extend i_size. Bail out and pretend
			 * the write failed... */
			/*
			 * NOTE(review): this path returns without calling
			 * ext4_orphan_del(), so the entry added above is
			 * not removed here.
			 */
			ret = PTR_ERR(handle);
			goto out;
		}
		/* Take the inode off the orphan list unless it was unlinked. */
		if (inode->i_nlink)
			ext4_orphan_del(handle, inode);
		if (ret > 0) {
			loff_t end = offset + ret;
			if (end > inode->i_size) {
				ei->i_disksize = end;
				i_size_write(inode, end);
				/*
				 * We're going to return a positive `ret'
				 * here due to non-zero-length I/O, so there's
				 * no way of reporting error returns from
				 * ext4_mark_inode_dirty() to userspace.  So
				 * ignore it.
				 */
				ext4_mark_inode_dirty(handle, inode);
			}
		}
		err = ext4_journal_stop(handle);
		/* Only report the journal-stop error if no I/O result exists. */
		if (ret == 0)
			ret = err;
	}
out:
	return ret;
}

/*
 * Pages can be marked dirty completely asynchronously from ext4's journalling
 * activity.  By filemap_sync_pte(), try_to_unmap_one(), etc.  We cannot do
 * much here because ->set_page_dirty is called under VFS locks.  The page is
 * not necessarily locked.
 *
 * We cannot just dirty the page and leave attached buffers clean, because the
 * buffers' dirty state is "definitive".  We cannot just set the buffers dirty
 * or jbddirty because all the journalling code will explode.
 *
 * So what we do is to mark the page "pending dirty" and next time writepage
 * is called, propagate that into the buffers appropriately.
 */
static int ext4_journalled_set_page_dirty(struct page *page)
{
	/* PageChecked is the "pending dirty" marker tested by writepage. */
	SetPageChecked(page);
	return __set_page_dirty_nobuffers(page);
}

/* Address space operations installed by ext4_set_aops() for ordered data. */
static const struct address_space_operations ext4_ordered_aops = {
	.readpage		= ext4_readpage,
	.readpages		= ext4_readpages,
	.writepage		= ext4_normal_writepage,
	.sync_page		= block_sync_page,
	.write_begin		= ext4_write_begin,
	.write_end		= ext4_ordered_write_end,
	.bmap			= ext4_bmap,
	.invalidatepage		= ext4_invalidatepage,
	.releasepage		= ext4_releasepage,
	.direct_IO		= ext4_direct_IO,
	.migratepage		= buffer_migrate_page,
	.is_partially_uptodate  = block_is_partially_uptodate,
};

/* Address space operations installed by ext4_set_aops() for writeback data. */
static const struct address_space_operations ext4_writeback_aops = {
	.readpage		= ext4_readpage,
	.readpages		= ext4_readpages,
	.writepage		= ext4_normal_writepage,
	.sync_page		= block_sync_page,
	.write_begin		= ext4_write_begin,
	.write_end		= ext4_writeback_write_end,
	.bmap			= ext4_bmap,
	.invalidatepage		= ext4_invalidatepage,
	.releasepage		= ext4_releasepage,
	.direct_IO		= ext4_direct_IO,
	.migratepage		= buffer_migrate_page,
	.is_partially_uptodate  = block_is_partially_uptodate,
};

/*
 * Address space operations installed by ext4_set_aops() as the fallback
 * (neither ordered nor writeback).  Unlike the other tables this one sets
 * .set_page_dirty and has no .direct_IO or .migratepage entry.
 */
static const struct address_space_operations ext4_journalled_aops = {
	.readpage		= ext4_readpage,
	.readpages		= ext4_readpages,
	.writepage		= ext4_journalled_writepage,
	.sync_page		= block_sync_page,
	.write_begin		= ext4_write_begin,
	.write_end		= ext4_journalled_write_end,
	.set_page_dirty		= ext4_journalled_set_page_dirty,
	.bmap			= ext4_bmap,
	.invalidatepage		= ext4_invalidatepage,
	.releasepage		= ext4_releasepage,
	.is_partially_uptodate  = block_is_partially_uptodate,
};

/*
 * Address space operations installed by ext4_set_aops() when the DELALLOC
 * mount option is set (ordered or writeback data).
 */
static const struct address_space_operations ext4_da_aops = {
	.readpage		= ext4_readpage,
	.readpages		= ext4_readpages,
	.writepage		= ext4_da_writepage,
	.writepages		= ext4_da_writepages,
	.sync_page		= block_sync_page,
	.write_begin		= ext4_da_write_begin,
	.write_end		= ext4_da_write_end,
	.bmap			= ext4_bmap,
	.invalidatepage		= ext4_da_invalidatepage,
	.releasepage		= ext4_releasepage,
	.direct_IO		= ext4_direct_IO,
	.migratepage		= buffer_migrate_page,
	.is_partially_uptodate  = block_is_partially_uptodate,
};

/*
 * Select the address_space_operations table for @inode.
 *
 * Ordered and writeback inodes get ext4_da_aops when DELALLOC is set and
 * their own table otherwise; everything else gets ext4_journalled_aops.
 */
void ext4_set_aops(struct inode *inode)
{
	if (ext4_should_order_data(inode) &&
		test_opt(inode->i_sb, DELALLOC))
		inode->i_mapping->a_ops = &ext4_da_aops;
	else if (ext4_should_order_data(inode))
		inode->i_mapping->a_ops = &ext4_ordered_aops;
	else if (ext4_should_writeback_data(inode) &&
		 test_opt(inode->i_sb, DELALLOC))
		inode->i_mapping->a_ops = &ext4_da_aops;
	else if (ext4_should_writeback_data(inode))
		inode->i_mapping->a_ops = &ext4_writeback_aops;
	else
		inode->i_mapping->a_ops = &ext4_journalled_aops;
}

/*
 * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
 * up to the end of the block which corresponds to `from'.
3571 * This required during truncate. We need to physically zero the tail end 3572 * of that block so it doesn't yield old data if the file is later grown. 3573 */ 3574 int ext4_block_truncate_page(handle_t *handle, 3575 struct address_space *mapping, loff_t from) 3576 { 3577 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; 3578 unsigned offset = from & (PAGE_CACHE_SIZE-1); 3579 unsigned blocksize, length, pos; 3580 ext4_lblk_t iblock; 3581 struct inode *inode = mapping->host; 3582 struct buffer_head *bh; 3583 struct page *page; 3584 int err = 0; 3585 3586 page = grab_cache_page(mapping, from >> PAGE_CACHE_SHIFT); 3587 if (!page) 3588 return -EINVAL; 3589 3590 blocksize = inode->i_sb->s_blocksize; 3591 length = blocksize - (offset & (blocksize - 1)); 3592 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); 3593 3594 /* 3595 * For "nobh" option, we can only work if we don't need to 3596 * read-in the page - otherwise we create buffers to do the IO. 3597 */ 3598 if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) && 3599 ext4_should_writeback_data(inode) && PageUptodate(page)) { 3600 zero_user(page, offset, length); 3601 set_page_dirty(page); 3602 goto unlock; 3603 } 3604 3605 if (!page_has_buffers(page)) 3606 create_empty_buffers(page, blocksize, 0); 3607 3608 /* Find the buffer that contains "offset" */ 3609 bh = page_buffers(page); 3610 pos = blocksize; 3611 while (offset >= pos) { 3612 bh = bh->b_this_page; 3613 iblock++; 3614 pos += blocksize; 3615 } 3616 3617 err = 0; 3618 if (buffer_freed(bh)) { 3619 BUFFER_TRACE(bh, "freed: skip"); 3620 goto unlock; 3621 } 3622 3623 if (!buffer_mapped(bh)) { 3624 BUFFER_TRACE(bh, "unmapped"); 3625 ext4_get_block(inode, iblock, bh, 0); 3626 /* unmapped? It's a hole - nothing to do */ 3627 if (!buffer_mapped(bh)) { 3628 BUFFER_TRACE(bh, "still unmapped"); 3629 goto unlock; 3630 } 3631 } 3632 3633 /* Ok, it's mapped. 
Make sure it's up-to-date */ 3634 if (PageUptodate(page)) 3635 set_buffer_uptodate(bh); 3636 3637 if (!buffer_uptodate(bh)) { 3638 err = -EIO; 3639 ll_rw_block(READ, 1, &bh); 3640 wait_on_buffer(bh); 3641 /* Uhhuh. Read error. Complain and punt. */ 3642 if (!buffer_uptodate(bh)) 3643 goto unlock; 3644 } 3645 3646 if (ext4_should_journal_data(inode)) { 3647 BUFFER_TRACE(bh, "get write access"); 3648 err = ext4_journal_get_write_access(handle, bh); 3649 if (err) 3650 goto unlock; 3651 } 3652 3653 zero_user(page, offset, length); 3654 3655 BUFFER_TRACE(bh, "zeroed end of block"); 3656 3657 err = 0; 3658 if (ext4_should_journal_data(inode)) { 3659 err = ext4_handle_dirty_metadata(handle, inode, bh); 3660 } else { 3661 if (ext4_should_order_data(inode)) 3662 err = ext4_jbd2_file_inode(handle, inode); 3663 mark_buffer_dirty(bh); 3664 } 3665 3666 unlock: 3667 unlock_page(page); 3668 page_cache_release(page); 3669 return err; 3670 } 3671 3672 /* 3673 * Probably it should be a library function... search for first non-zero word 3674 * or memcmp with zero_page, whatever is better for particular architecture. 3675 * Linus? 3676 */ 3677 static inline int all_zeroes(__le32 *p, __le32 *q) 3678 { 3679 while (p < q) 3680 if (*p++) 3681 return 0; 3682 return 1; 3683 } 3684 3685 /** 3686 * ext4_find_shared - find the indirect blocks for partial truncation. 3687 * @inode: inode in question 3688 * @depth: depth of the affected branch 3689 * @offsets: offsets of pointers in that branch (see ext4_block_to_path) 3690 * @chain: place to store the pointers to partial indirect blocks 3691 * @top: place to the (detached) top of branch 3692 * 3693 * This is a helper function used by ext4_truncate(). 3694 * 3695 * When we do truncate() we may have to clean the ends of several 3696 * indirect blocks but leave the blocks themselves alive. 
 *	Block is partially truncated if some data below the new i_size is
 *	referred from it (and it is on the path to the first completely
 *	truncated data block, indeed).  We have to free the top of that path
 *	along with everything to the right of the path. Since no allocation
 *	past the truncation point is possible until ext4_truncate()
 *	finishes, we may safely do the latter, but top of branch may
 *	require special attention - pageout below the truncation point
 *	might try to populate it.
 *
 *	We atomically detach the top of branch from the tree, store the
 *	block number of its root in *@top, pointers to buffer_heads of
 *	partially truncated blocks - in @chain[].bh and pointers to
 *	their last elements that should not be removed - in
 *	@chain[].p. Return value is the pointer to last filled element
 *	of @chain.
 *
 *	The work left to caller to do the actual freeing of subtrees:
 *		a) free the subtree starting from *@top
 *		b) free the subtrees whose roots are stored in
 *			(@chain[i].p+1 .. end of @chain[i].bh->b_data)
 *		c) free the subtrees growing from the inode past the @chain[0].
 *			(no partially truncated stuff there).  */

static Indirect *ext4_find_shared(struct inode *inode, int depth,
			ext4_lblk_t offsets[4], Indirect chain[4], __le32 *top)
{
	Indirect *partial, *p;
	int k, err;

	*top = 0;
	/* Make k index the deepest non-null offset + 1 */
	for (k = depth; k > 1 && !offsets[k-1]; k--)
		;
	partial = ext4_get_branch(inode, k, offsets, chain, &err);
	/* Writer: pointers */
	/* A NULL result is replaced by the last element of the k-deep chain. */
	if (!partial)
		partial = chain + k-1;
	/*
	 * If the branch acquired continuation since we've looked at it -
	 * fine, it should all survive and (new) top doesn't belong to us.
	 */
	if (!partial->key && *partial->p)
		/* Writer: end */
		goto no_top;
	/* Walk up while everything left of p->p in that block is zero. */
	for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--)
		;
	/*
	 * OK, we've found the last block that must survive. The rest of our
	 * branch should be detached before unlocking. However, if that rest
	 * of branch is all ours and does not grow immediately from the inode
	 * it's easier to cheat and just decrement partial->p.
	 */
	if (p == chain + k - 1 && p > chain) {
		p->p--;
	} else {
		*top = *p->p;
		/* Nope, don't do this in ext4.  Must leave the tree intact */
#if 0
		*p->p = 0;
#endif
	}
	/* Writer: end */

	/* Release the buffers below the surviving block. */
	while (partial > p) {
		brelse(partial->bh);
		partial--;
	}
no_top:
	return partial;
}

/*
 * Zero a number of block pointers in either an inode or an indirect block.
 * If we restart the transaction we must again get write access to the
 * indirect block for further modification.
 *
 * We release `count' blocks on disk, but (last - first) may be greater
 * than `count' because there can be holes in there.
 *
 * @bh is the indirect block holding [@first, @last); it is treated as
 * optional (every use is guarded by "if (bh)").
 */
static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
		struct buffer_head *bh, ext4_fsblk_t block_to_free,
		unsigned long count, __le32 *first, __le32 *last)
{
	__le32 *p;
	if (try_to_extend_transaction(handle, inode)) {
		/* Flush what we have before the handle is restarted. */
		if (bh) {
			BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
			ext4_handle_dirty_metadata(handle, inode, bh);
		}
		ext4_mark_inode_dirty(handle, inode);
		ext4_journal_test_restart(handle, inode);
		if (bh) {
			BUFFER_TRACE(bh, "retaking write access");
			ext4_journal_get_write_access(handle, bh);
		}
	}

	/*
	 * Any buffers which are on the journal will be in memory. We
	 * find them on the hash table so jbd2_journal_revoke() will
	 * run jbd2_journal_forget() on them.  We've already detached
	 * each block from the file, so bforget() in
	 * jbd2_journal_forget() should be safe.
	 *
	 * AKPM: turn on bforget in jbd2_journal_forget()!!!
	 */
	for (p = first; p < last; p++) {
		u32 nr = le32_to_cpu(*p);
		if (nr) {
			struct buffer_head *tbh;

			/* Clear the pointer, then forget any cached buffer. */
			*p = 0;
			tbh = sb_find_get_block(inode->i_sb, nr);
			ext4_forget(handle, 0, inode, tbh, nr);
		}
	}

	ext4_free_blocks(handle, inode, block_to_free, count, 0);
}

/**
 * ext4_free_data - free a list of data blocks
 * @handle:	handle for this transaction
 * @inode:	inode we are dealing with
 * @this_bh:	indirect buffer_head which contains *@first and *@last
 * @first:	array of block numbers
 * @last:	points immediately past the end of array
 *
 * We are freeing all blocks referred from that array (numbers are stored as
 * little-endian 32-bit) and updating @inode->i_blocks appropriately.
 *
 * We accumulate contiguous runs of blocks to free.  Conveniently, if these
 * blocks are contiguous then releasing them at one time will only affect one
 * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't
 * actually use a lot of journal space.
 *
 * @this_bh will be %NULL if @first and @last point into the inode's direct
 * block pointers.
 */
static void ext4_free_data(handle_t *handle, struct inode *inode,
			   struct buffer_head *this_bh,
			   __le32 *first, __le32 *last)
{
	ext4_fsblk_t block_to_free = 0;    /* Starting block # of a run */
	unsigned long count = 0;	    /* Number of blocks in the run */
	__le32 *block_to_free_p = NULL;	    /* Pointer into inode/ind
					       corresponding to
					       block_to_free */
	ext4_fsblk_t nr;		    /* Current block # */
	__le32 *p;			    /* Pointer into inode/ind
					       for current block */
	int err;

	if (this_bh) {				/* For indirect block */
		BUFFER_TRACE(this_bh, "get_write_access");
		err = ext4_journal_get_write_access(handle, this_bh);
		/* Important: if we can't update the indirect pointers
		 * to the blocks, we can't free them. */
		if (err)
			return;
	}

	for (p = first; p < last; p++) {
		nr = le32_to_cpu(*p);
		if (nr) {
			/* accumulate blocks to free if they're contiguous */
			if (count == 0) {
				block_to_free = nr;
				block_to_free_p = p;
				count = 1;
			} else if (nr == block_to_free + count) {
				count++;
			} else {
				/* Run broken: release it and start a new one. */
				ext4_clear_blocks(handle, inode, this_bh,
						  block_to_free,
						  count, block_to_free_p, p);
				block_to_free = nr;
				block_to_free_p = p;
				count = 1;
			}
		}
	}

	/* Release the final run, if any (p == last here). */
	if (count > 0)
		ext4_clear_blocks(handle, inode, this_bh, block_to_free,
				  count, block_to_free_p, p);

	if (this_bh) {
		BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata");

		/*
		 * The buffer head should have an attached journal head at this
		 * point. However, if the data is corrupted and an indirect
		 * block pointed to itself, it would have been detached when
		 * the block was cleared. Check for this instead of OOPSing.
		 */
		if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh))
			ext4_handle_dirty_metadata(handle, inode, this_bh);
		else
			ext4_error(inode->i_sb, __func__,
				   "circular indirect block detected, "
				   "inode=%lu, block=%llu",
				   inode->i_ino,
				   (unsigned long long) this_bh->b_blocknr);
	}
}

/**
 *	ext4_free_branches - free an array of branches
 *	@handle: JBD handle for this transaction
 *	@inode:	inode we are dealing with
 *	@parent_bh: the buffer_head which contains *@first and *@last
 *	@first:	array of block numbers
 *	@last:	pointer immediately past the end of array
 *	@depth:	depth of the branches to free
 *
 *	We are freeing all blocks referred from these branches (numbers are
 *	stored as little-endian 32-bit) and updating @inode->i_blocks
 *	appropriately.
 *
 *	Returns immediately, doing nothing further, if @handle is aborted.
 *	A @depth of 0 means [@first, @last) are data blocks, which are
 *	passed to ext4_free_data().
 */
static void ext4_free_branches(handle_t *handle, struct inode *inode,
			       struct buffer_head *parent_bh,
			       __le32 *first, __le32 *last, int depth)
{
	ext4_fsblk_t nr;
	__le32 *p;

	if (ext4_handle_is_aborted(handle))
		return;

	if (depth--) {
		struct buffer_head *bh;
		int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
		/* Walk the array from its last entry down to its first. */
		p = last;
		while (--p >= first) {
			nr = le32_to_cpu(*p);
			if (!nr)
				continue;		/* A hole */

			/* Go read the buffer for the next level down */
			bh = sb_bread(inode->i_sb, nr);

			/*
			 * A read failure? Report error and clear slot
			 * (should be rare).
			 *
			 * NOTE(review): the slot is not actually cleared
			 * below; the code only reports and moves on.
			 */
			if (!bh) {
				ext4_error(inode->i_sb, "ext4_free_branches",
					   "Read failure, inode=%lu, block=%llu",
					   inode->i_ino, nr);
				continue;
			}

			/* This zaps the entire block.  Bottom up. */
			BUFFER_TRACE(bh, "free child branches");
			ext4_free_branches(handle, inode, bh,
					(__le32 *) bh->b_data,
					(__le32 *) bh->b_data + addr_per_block,
					depth);

			/*
			 * We've probably journalled the indirect block several
			 * times during the truncate.  But it's no longer
			 * needed and we now drop it from the transaction via
			 * jbd2_journal_revoke().
			 *
			 * That's easy if it's exclusively part of this
			 * transaction.  But if it's part of the committing
			 * transaction then jbd2_journal_forget() will simply
			 * brelse() it.  That means that if the underlying
			 * block is reallocated in ext4_get_block(),
			 * unmap_underlying_metadata() will find this block
			 * and will try to get rid of it.  damn, damn.
			 *
			 * If this block has already been committed to the
			 * journal, a revoke record will be written.  And
			 * revoke records must be emitted *before* clearing
			 * this block's bit in the bitmaps.
			 */
			ext4_forget(handle, 1, inode, bh, bh->b_blocknr);

			/*
			 * Everything below this pointer has been
			 * released.  Now let this top-of-subtree go.
			 *
			 * We want the freeing of this indirect block to be
			 * atomic in the journal with the updating of the
			 * bitmap block which owns it.  So make some room in
			 * the journal.
			 *
			 * We zero the parent pointer *after* freeing its
			 * pointee in the bitmaps, so if extend_transaction()
			 * for some reason fails to put the bitmap changes and
			 * the release into the same transaction, recovery
			 * will merely complain about releasing a free block,
			 * rather than leaking blocks.
			 */
			if (ext4_handle_is_aborted(handle))
				return;
			if (try_to_extend_transaction(handle, inode)) {
				ext4_mark_inode_dirty(handle, inode);
				ext4_journal_test_restart(handle, inode);
			}

			ext4_free_blocks(handle, inode, nr, 1, 1);

			if (parent_bh) {
				/*
				 * The block which we have just freed is
				 * pointed to by an indirect block: journal it
				 */
				BUFFER_TRACE(parent_bh, "get_write_access");
				if (!ext4_journal_get_write_access(handle,
								   parent_bh)){
					*p = 0;
					BUFFER_TRACE(parent_bh,
					"call ext4_handle_dirty_metadata");
					ext4_handle_dirty_metadata(handle,
								   inode,
								   parent_bh);
				}
			}
		}
	} else {
		/* We have reached the bottom of the tree. */
		BUFFER_TRACE(parent_bh, "free data blocks");
		ext4_free_data(handle, inode, parent_bh, first, last);
	}
}

/*
 * Returns 1 if @inode may be truncated, 0 otherwise: never for append-only
 * or immutable inodes; otherwise only for regular files, directories and
 * symlinks that are not fast symlinks.
 */
int ext4_can_truncate(struct inode *inode)
{
	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
		return 0;
	if (S_ISREG(inode->i_mode))
		return 1;
	if (S_ISDIR(inode->i_mode))
		return 1;
	if (S_ISLNK(inode->i_mode))
		return !ext4_inode_is_fast_symlink(inode);
	return 0;
}

/*
 * ext4_truncate()
 *
 * We block out ext4_get_block() block instantiations across the entire
 * transaction, and VFS/VM ensures that ext4_truncate() cannot run
 * simultaneously on behalf of the same inode.
 *
 * As we work through the truncate and commit bits of it to the journal there
 * is one core, guiding principle: the file's tree must always be consistent on
 * disk.  We must be able to restart the truncate after a crash.
 *
 * The file's tree may be transiently inconsistent in memory (although it
 * probably isn't), but whenever we close off and commit a journal transaction,
 * the contents of (the filesystem + the journal) must be consistent and
 * restartable.
/*
 * ext4_truncate()
 *
 * Release every block of a block-mapped (non-extent) inode that lies beyond
 * inode->i_size.  Extent-mapped inodes are handed to ext4_ext_truncate().
 *
 * We block out ext4_get_block() block instantiations across the entire
 * transaction, and VFS/VM ensures that ext4_truncate() cannot run
 * simultaneously on behalf of the same inode.
 *
 * As we work through the truncate and commit bits of it to the journal there
 * is one core, guiding principle: the file's tree must always be consistent on
 * disk.  We must be able to restart the truncate after a crash.
 *
 * The file's tree may be transiently inconsistent in memory (although it
 * probably isn't), but whenever we close off and commit a journal transaction,
 * the contents of (the filesystem + the journal) must be consistent and
 * restartable.  It's pretty simple, really: bottom up, right to left (although
 * left-to-right works OK too).
 *
 * Note that at recovery time, journal replay occurs *before* the restart of
 * truncate against the orphan inode list.
 *
 * The committed inode has the new, desired i_size (which is the same as
 * i_disksize in this case).  After a crash, ext4_orphan_cleanup() will see
 * that this inode's truncate did not complete and it will again call
 * ext4_truncate() to have another go.  So there will be instantiated blocks
 * to the right of the truncation point in a crashed ext4 filesystem.  But
 * that's fine - as long as they are linked from the inode, the post-crash
 * ext4_truncate() run will find them and release them.
 *
 * The function returns nothing; every failure (no handle, path lookup error,
 * orphan-add failure) simply ends the truncate early.
 */
void ext4_truncate(struct inode *inode)
{
	handle_t *handle;
	struct ext4_inode_info *ei = EXT4_I(inode);
	__le32 *i_data = ei->i_data;
	int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
	struct address_space *mapping = inode->i_mapping;
	ext4_lblk_t offsets[4];
	Indirect chain[4];
	Indirect *partial;
	__le32 nr = 0;
	int n;
	ext4_lblk_t last_block;
	unsigned blocksize = inode->i_sb->s_blocksize;

	if (!ext4_can_truncate(inode))
		return;

	/*
	 * A file that had on-disk contents and is being truncated to zero
	 * gets EXT4_STATE_DA_ALLOC_CLOSE set, unless the NO_AUTO_DA_ALLOC
	 * mount option is in effect.
	 */
	if (ei->i_disksize && inode->i_size == 0 &&
	    !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
		ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE;

	/* Extent-mapped files have their own truncate implementation. */
	if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
		ext4_ext_truncate(inode);
		return;
	}

	handle = start_transaction(inode);
	if (IS_ERR(handle))
		return;		/* AKPM: return what? */

	/* First logical block that lies entirely beyond the new i_size. */
	last_block = (inode->i_size + blocksize-1)
					>> EXT4_BLOCK_SIZE_BITS(inode->i_sb);

	/* New size is not block aligned: deal with the partial tail block. */
	if (inode->i_size & (blocksize - 1))
		if (ext4_block_truncate_page(handle, mapping, inode->i_size))
			goto out_stop;

	/* n is the depth of the path to last_block; 0 signals failure. */
	n = ext4_block_to_path(inode, last_block, offsets, NULL);
	if (n == 0)
		goto out_stop;	/* error */

	/*
	 * OK.  This truncate is going to happen.  We add the inode to the
	 * orphan list, so that if this truncate spans multiple transactions,
	 * and we crash, we will resume the truncate when the filesystem
	 * recovers.  It also marks the inode dirty, to catch the new size.
	 *
	 * Implication: the file must always be in a sane, consistent
	 * truncatable state while each transaction commits.
	 */
	if (ext4_orphan_add(handle, inode))
		goto out_stop;

	/*
	 * From here we block out all ext4_get_block() callers who want to
	 * modify the block allocation tree.
	 */
	down_write(&ei->i_data_sem);

	ext4_discard_preallocations(inode);

	/*
	 * The orphan list entry will now protect us from any crash which
	 * occurs before the truncate completes, so it is now safe to propagate
	 * the new, shorter inode size (held for now in i_size) into the
	 * on-disk inode. We do this via i_disksize, which is the value which
	 * ext4 *really* writes onto the disk inode.
	 */
	ei->i_disksize = inode->i_size;

	if (n == 1) {		/* direct blocks */
		ext4_free_data(handle, inode, NULL, i_data+offsets[0],
			       i_data + EXT4_NDIR_BLOCKS);
		goto do_indirects;
	}

	partial = ext4_find_shared(inode, n, offsets, chain, &nr);
	/* Kill the top of shared branch (not detached) */
	if (nr) {
		if (partial == chain) {
			/* Shared branch grows from the inode */
			ext4_free_branches(handle, inode, NULL,
					   &nr, &nr+1, (chain+n-1) - partial);
			*partial->p = 0;
			/*
			 * We mark the inode dirty prior to restart,
			 * and prior to stop.  No need for it here.
			 */
		} else {
			/* Shared branch grows from an indirect block */
			BUFFER_TRACE(partial->bh, "get_write_access");
			ext4_free_branches(handle, inode, partial->bh,
					partial->p,
					partial->p+1, (chain+n-1) - partial);
		}
	}
	/* Clear the ends of indirect blocks on the shared branch */
	while (partial > chain) {
		ext4_free_branches(handle, inode, partial->bh, partial->p + 1,
				   (__le32*)partial->bh->b_data+addr_per_block,
				   (chain+n-1) - partial);
		BUFFER_TRACE(partial->bh, "call brelse");
		brelse(partial->bh);
		partial--;
	}
do_indirects:
	/*
	 * Kill the remaining (whole) subtrees.  offsets[0] is the i_data slot
	 * the truncation point lives under; every indirection level to its
	 * right is freed wholesale.  The cases deliberately fall through.
	 */
	switch (offsets[0]) {
	default:
		nr = i_data[EXT4_IND_BLOCK];
		if (nr) {
			ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
			i_data[EXT4_IND_BLOCK] = 0;
		}
		/* fall through */
	case EXT4_IND_BLOCK:
		nr = i_data[EXT4_DIND_BLOCK];
		if (nr) {
			ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
			i_data[EXT4_DIND_BLOCK] = 0;
		}
		/* fall through */
	case EXT4_DIND_BLOCK:
		nr = i_data[EXT4_TIND_BLOCK];
		if (nr) {
			ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
			i_data[EXT4_TIND_BLOCK] = 0;
		}
		/* fall through */
	case EXT4_TIND_BLOCK:
		;
	}

	up_write(&ei->i_data_sem);
	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
	ext4_mark_inode_dirty(handle, inode);

	/*
	 * In a multi-transaction truncate, we only make the final transaction
	 * synchronous
	 */
	if (IS_SYNC(inode))
		ext4_handle_sync(handle);
out_stop:
	/*
	 * If this was a simple ftruncate(), and the file will remain alive
	 * then we need to clear up the orphan record which we created above.
	 * However, if this was a real unlink then we were called by
	 * ext4_delete_inode(), and we allow that function to clean up the
	 * orphan info for us.
	 *
	 * NOTE(review): this label is also reached from the failure paths
	 * that run before ext4_orphan_add(); presumably ext4_orphan_del()
	 * tolerates an inode that is not on the orphan list - confirm.
	 */
	if (inode->i_nlink)
		ext4_orphan_del(handle, inode);

	ext4_journal_stop(handle);
}
/*
 * ext4_get_inode_loc returns with an extra refcount against the inode's
 * underlying buffer_head on success. If 'in_mem' is true, we have all
 * data in memory that is needed to recreate the on-disk version of this
 * inode.
 *
 * On entry iloc->bh is cleared; iloc->block_group and iloc->offset are
 * filled in from the inode number.  Returns 0 with iloc->bh set, or -EIO
 * (invalid inode number, missing group descriptor, no buffer, read error)
 * with iloc->bh left NULL.
 */
static int __ext4_get_inode_loc(struct inode *inode,
				struct ext4_iloc *iloc, int in_mem)
{
	struct ext4_group_desc *gdp;
	struct buffer_head *bh;
	struct super_block *sb = inode->i_sb;
	ext4_fsblk_t block;
	int inodes_per_block, inode_offset;

	iloc->bh = NULL;
	if (!ext4_valid_inum(sb, inode->i_ino))
		return -EIO;

	iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb);
	gdp = ext4_get_group_desc(sb, iloc->block_group, NULL);
	if (!gdp)
		return -EIO;

	/*
	 * Figure out the offset within the block group inode table
	 */
	inodes_per_block = (EXT4_BLOCK_SIZE(sb) / EXT4_INODE_SIZE(sb));
	inode_offset = ((inode->i_ino - 1) %
			EXT4_INODES_PER_GROUP(sb));
	block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);
	iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);

	bh = sb_getblk(sb, block);
	if (!bh) {
		ext4_error(sb, "ext4_get_inode_loc", "unable to read "
			   "inode block - inode=%lu, block=%llu",
			   inode->i_ino, block);
		return -EIO;
	}
	if (!buffer_uptodate(bh)) {
		lock_buffer(bh);

		/*
		 * If the buffer has the write error flag, we have failed
		 * to write out another inode in the same block.  In this
		 * case, we don't have to read the block because we may
		 * read the old inode data successfully.
		 */
		if (buffer_write_io_error(bh) && !buffer_uptodate(bh))
			set_buffer_uptodate(bh);

		if (buffer_uptodate(bh)) {
			/* someone brought it uptodate while we waited */
			unlock_buffer(bh);
			goto has_buffer;
		}

		/*
		 * If we have all information of the inode in memory and this
		 * is the only valid inode in the block, we need not read the
		 * block.
		 */
		if (in_mem) {
			struct buffer_head *bitmap_bh;
			int i, start;

			/*
			 * First inode index sharing this table block.
			 * NOTE(review): the mask assumes inodes_per_block is
			 * a power of two - confirm.
			 */
			start = inode_offset & ~(inodes_per_block - 1);

			/* Is the inode bitmap in cache? */
			bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp));
			if (!bitmap_bh)
				goto make_io;

			/*
			 * If the inode bitmap isn't in cache then the
			 * optimisation may end up performing two reads instead
			 * of one, so skip it.
			 */
			if (!buffer_uptodate(bitmap_bh)) {
				brelse(bitmap_bh);
				goto make_io;
			}
			/* Look for any other in-use inode in the same block. */
			for (i = start; i < start + inodes_per_block; i++) {
				if (i == inode_offset)
					continue;
				if (ext4_test_bit(i, bitmap_bh->b_data))
					break;
			}
			brelse(bitmap_bh);
			if (i == start + inodes_per_block) {
				/* all other inodes are free, so skip I/O */
				memset(bh->b_data, 0, bh->b_size);
				set_buffer_uptodate(bh);
				unlock_buffer(bh);
				goto has_buffer;
			}
		}

make_io:
		/*
		 * If we need to do any I/O, try to pre-readahead extra
		 * blocks from the inode table.
		 */
		if (EXT4_SB(sb)->s_inode_readahead_blks) {
			ext4_fsblk_t b, end, table;
			unsigned num;

			table = ext4_inode_table(sb, gdp);
			/* s_inode_readahead_blks is always a power of 2 */
			b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1);
			if (table > b)
				b = table;
			end = b + EXT4_SB(sb)->s_inode_readahead_blks;
			num = EXT4_INODES_PER_GROUP(sb);
			if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
				       EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
				num -= ext4_itable_unused_count(sb, gdp);
			table += num / inodes_per_block;
			if (end > table)
				end = table;
			/*
			 * NOTE(review): the bound is inclusive, so block
			 * 'end' itself is also submitted - confirm intended.
			 */
			while (b <= end)
				sb_breadahead(sb, b++);
		}

		/*
		 * There are other valid inodes in the buffer, this inode
		 * has in-inode xattrs, or we don't have this inode in memory.
		 * Read the block from disk.
		 */
		get_bh(bh);
		bh->b_end_io = end_buffer_read_sync;
		submit_bh(READ_META, bh);
		wait_on_buffer(bh);
		if (!buffer_uptodate(bh)) {
			ext4_error(sb, __func__,
				   "unable to read inode block - inode=%lu, "
				   "block=%llu", inode->i_ino, block);
			brelse(bh);
			return -EIO;
		}
	}
has_buffer:
	iloc->bh = bh;
	return 0;
}
4329 */ 4330 if (EXT4_SB(sb)->s_inode_readahead_blks) { 4331 ext4_fsblk_t b, end, table; 4332 unsigned num; 4333 4334 table = ext4_inode_table(sb, gdp); 4335 /* s_inode_readahead_blks is always a power of 2 */ 4336 b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1); 4337 if (table > b) 4338 b = table; 4339 end = b + EXT4_SB(sb)->s_inode_readahead_blks; 4340 num = EXT4_INODES_PER_GROUP(sb); 4341 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, 4342 EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) 4343 num -= ext4_itable_unused_count(sb, gdp); 4344 table += num / inodes_per_block; 4345 if (end > table) 4346 end = table; 4347 while (b <= end) 4348 sb_breadahead(sb, b++); 4349 } 4350 4351 /* 4352 * There are other valid inodes in the buffer, this inode 4353 * has in-inode xattrs, or we don't have this inode in memory. 4354 * Read the block from disk. 4355 */ 4356 get_bh(bh); 4357 bh->b_end_io = end_buffer_read_sync; 4358 submit_bh(READ_META, bh); 4359 wait_on_buffer(bh); 4360 if (!buffer_uptodate(bh)) { 4361 ext4_error(sb, __func__, 4362 "unable to read inode block - inode=%lu, " 4363 "block=%llu", inode->i_ino, block); 4364 brelse(bh); 4365 return -EIO; 4366 } 4367 } 4368 has_buffer: 4369 iloc->bh = bh; 4370 return 0; 4371 } 4372 4373 int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc) 4374 { 4375 /* We have all inode data except xattrs in memory here. 
*/ 4376 return __ext4_get_inode_loc(inode, iloc, 4377 !(EXT4_I(inode)->i_state & EXT4_STATE_XATTR)); 4378 } 4379 4380 void ext4_set_inode_flags(struct inode *inode) 4381 { 4382 unsigned int flags = EXT4_I(inode)->i_flags; 4383 4384 inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); 4385 if (flags & EXT4_SYNC_FL) 4386 inode->i_flags |= S_SYNC; 4387 if (flags & EXT4_APPEND_FL) 4388 inode->i_flags |= S_APPEND; 4389 if (flags & EXT4_IMMUTABLE_FL) 4390 inode->i_flags |= S_IMMUTABLE; 4391 if (flags & EXT4_NOATIME_FL) 4392 inode->i_flags |= S_NOATIME; 4393 if (flags & EXT4_DIRSYNC_FL) 4394 inode->i_flags |= S_DIRSYNC; 4395 } 4396 4397 /* Propagate flags from i_flags to EXT4_I(inode)->i_flags */ 4398 void ext4_get_inode_flags(struct ext4_inode_info *ei) 4399 { 4400 unsigned int flags = ei->vfs_inode.i_flags; 4401 4402 ei->i_flags &= ~(EXT4_SYNC_FL|EXT4_APPEND_FL| 4403 EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL|EXT4_DIRSYNC_FL); 4404 if (flags & S_SYNC) 4405 ei->i_flags |= EXT4_SYNC_FL; 4406 if (flags & S_APPEND) 4407 ei->i_flags |= EXT4_APPEND_FL; 4408 if (flags & S_IMMUTABLE) 4409 ei->i_flags |= EXT4_IMMUTABLE_FL; 4410 if (flags & S_NOATIME) 4411 ei->i_flags |= EXT4_NOATIME_FL; 4412 if (flags & S_DIRSYNC) 4413 ei->i_flags |= EXT4_DIRSYNC_FL; 4414 } 4415 4416 static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, 4417 struct ext4_inode_info *ei) 4418 { 4419 blkcnt_t i_blocks ; 4420 struct inode *inode = &(ei->vfs_inode); 4421 struct super_block *sb = inode->i_sb; 4422 4423 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, 4424 EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) { 4425 /* we are using combined 48 bit field */ 4426 i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 | 4427 le32_to_cpu(raw_inode->i_blocks_lo); 4428 if (ei->i_flags & EXT4_HUGE_FILE_FL) { 4429 /* i_blocks represent file system block size */ 4430 return i_blocks << (inode->i_blkbits - 9); 4431 } else { 4432 return i_blocks; 4433 } 4434 } else { 4435 return le32_to_cpu(raw_inode->i_blocks_lo); 4436 
} 4437 } 4438 4439 struct inode *ext4_iget(struct super_block *sb, unsigned long ino) 4440 { 4441 struct ext4_iloc iloc; 4442 struct ext4_inode *raw_inode; 4443 struct ext4_inode_info *ei; 4444 struct buffer_head *bh; 4445 struct inode *inode; 4446 long ret; 4447 int block; 4448 4449 inode = iget_locked(sb, ino); 4450 if (!inode) 4451 return ERR_PTR(-ENOMEM); 4452 if (!(inode->i_state & I_NEW)) 4453 return inode; 4454 4455 ei = EXT4_I(inode); 4456 #ifdef CONFIG_EXT4_FS_POSIX_ACL 4457 ei->i_acl = EXT4_ACL_NOT_CACHED; 4458 ei->i_default_acl = EXT4_ACL_NOT_CACHED; 4459 #endif 4460 4461 ret = __ext4_get_inode_loc(inode, &iloc, 0); 4462 if (ret < 0) 4463 goto bad_inode; 4464 bh = iloc.bh; 4465 raw_inode = ext4_raw_inode(&iloc); 4466 inode->i_mode = le16_to_cpu(raw_inode->i_mode); 4467 inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); 4468 inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); 4469 if (!(test_opt(inode->i_sb, NO_UID32))) { 4470 inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; 4471 inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; 4472 } 4473 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); 4474 4475 ei->i_state = 0; 4476 ei->i_dir_start_lookup = 0; 4477 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); 4478 /* We now have enough fields to check if the inode was active or not. 4479 * This is needed because nfsd might try to access dead inodes 4480 * the test is that same one that e2fsck uses 4481 * NeilBrown 1999oct15 4482 */ 4483 if (inode->i_nlink == 0) { 4484 if (inode->i_mode == 0 || 4485 !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) { 4486 /* this inode is deleted */ 4487 brelse(bh); 4488 ret = -ESTALE; 4489 goto bad_inode; 4490 } 4491 /* The only unlinked inodes we let through here have 4492 * valid i_mode and are being read by the orphan 4493 * recovery code: that's fine, we're about to complete 4494 * the process of deleting those. 
*/ 4495 } 4496 ei->i_flags = le32_to_cpu(raw_inode->i_flags); 4497 inode->i_blocks = ext4_inode_blocks(raw_inode, ei); 4498 ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo); 4499 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT)) 4500 ei->i_file_acl |= 4501 ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; 4502 inode->i_size = ext4_isize(raw_inode); 4503 ei->i_disksize = inode->i_size; 4504 inode->i_generation = le32_to_cpu(raw_inode->i_generation); 4505 ei->i_block_group = iloc.block_group; 4506 ei->i_last_alloc_group = ~0; 4507 /* 4508 * NOTE! The in-memory inode i_data array is in little-endian order 4509 * even on big-endian machines: we do NOT byteswap the block numbers! 4510 */ 4511 for (block = 0; block < EXT4_N_BLOCKS; block++) 4512 ei->i_data[block] = raw_inode->i_block[block]; 4513 INIT_LIST_HEAD(&ei->i_orphan); 4514 4515 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { 4516 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); 4517 if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > 4518 EXT4_INODE_SIZE(inode->i_sb)) { 4519 brelse(bh); 4520 ret = -EIO; 4521 goto bad_inode; 4522 } 4523 if (ei->i_extra_isize == 0) { 4524 /* The extra space is currently unused. Use it. 
*/ 4525 ei->i_extra_isize = sizeof(struct ext4_inode) - 4526 EXT4_GOOD_OLD_INODE_SIZE; 4527 } else { 4528 __le32 *magic = (void *)raw_inode + 4529 EXT4_GOOD_OLD_INODE_SIZE + 4530 ei->i_extra_isize; 4531 if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) 4532 ei->i_state |= EXT4_STATE_XATTR; 4533 } 4534 } else 4535 ei->i_extra_isize = 0; 4536 4537 EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode); 4538 EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode); 4539 EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode); 4540 EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode); 4541 4542 inode->i_version = le32_to_cpu(raw_inode->i_disk_version); 4543 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { 4544 if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) 4545 inode->i_version |= 4546 (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; 4547 } 4548 4549 ret = 0; 4550 if (ei->i_file_acl && 4551 ((ei->i_file_acl < 4552 (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) + 4553 EXT4_SB(sb)->s_gdb_count)) || 4554 (ei->i_file_acl >= ext4_blocks_count(EXT4_SB(sb)->s_es)))) { 4555 ext4_error(sb, __func__, 4556 "bad extended attribute block %llu in inode #%lu", 4557 ei->i_file_acl, inode->i_ino); 4558 ret = -EIO; 4559 goto bad_inode; 4560 } else if (ei->i_flags & EXT4_EXTENTS_FL) { 4561 if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 4562 (S_ISLNK(inode->i_mode) && 4563 !ext4_inode_is_fast_symlink(inode))) 4564 /* Validate extent which is part of inode */ 4565 ret = ext4_ext_check_inode(inode); 4566 } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 4567 (S_ISLNK(inode->i_mode) && 4568 !ext4_inode_is_fast_symlink(inode))) { 4569 /* Validate block references which are part of inode */ 4570 ret = ext4_check_inode_blockref(inode); 4571 } 4572 if (ret) { 4573 brelse(bh); 4574 goto bad_inode; 4575 } 4576 4577 if (S_ISREG(inode->i_mode)) { 4578 inode->i_op = &ext4_file_inode_operations; 4579 inode->i_fop = &ext4_file_operations; 4580 ext4_set_aops(inode); 4581 } else if 
(S_ISDIR(inode->i_mode)) { 4582 inode->i_op = &ext4_dir_inode_operations; 4583 inode->i_fop = &ext4_dir_operations; 4584 } else if (S_ISLNK(inode->i_mode)) { 4585 if (ext4_inode_is_fast_symlink(inode)) { 4586 inode->i_op = &ext4_fast_symlink_inode_operations; 4587 nd_terminate_link(ei->i_data, inode->i_size, 4588 sizeof(ei->i_data) - 1); 4589 } else { 4590 inode->i_op = &ext4_symlink_inode_operations; 4591 ext4_set_aops(inode); 4592 } 4593 } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || 4594 S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { 4595 inode->i_op = &ext4_special_inode_operations; 4596 if (raw_inode->i_block[0]) 4597 init_special_inode(inode, inode->i_mode, 4598 old_decode_dev(le32_to_cpu(raw_inode->i_block[0]))); 4599 else 4600 init_special_inode(inode, inode->i_mode, 4601 new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); 4602 } else { 4603 brelse(bh); 4604 ret = -EIO; 4605 ext4_error(inode->i_sb, __func__, 4606 "bogus i_mode (%o) for inode=%lu", 4607 inode->i_mode, inode->i_ino); 4608 goto bad_inode; 4609 } 4610 brelse(iloc.bh); 4611 ext4_set_inode_flags(inode); 4612 unlock_new_inode(inode); 4613 return inode; 4614 4615 bad_inode: 4616 iget_failed(inode); 4617 return ERR_PTR(ret); 4618 } 4619 4620 static int ext4_inode_blocks_set(handle_t *handle, 4621 struct ext4_inode *raw_inode, 4622 struct ext4_inode_info *ei) 4623 { 4624 struct inode *inode = &(ei->vfs_inode); 4625 u64 i_blocks = inode->i_blocks; 4626 struct super_block *sb = inode->i_sb; 4627 4628 if (i_blocks <= ~0U) { 4629 /* 4630 * i_blocks can be represnted in a 32 bit variable 4631 * as multiple of 512 bytes 4632 */ 4633 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); 4634 raw_inode->i_blocks_high = 0; 4635 ei->i_flags &= ~EXT4_HUGE_FILE_FL; 4636 return 0; 4637 } 4638 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) 4639 return -EFBIG; 4640 4641 if (i_blocks <= 0xffffffffffffULL) { 4642 /* 4643 * i_blocks can be represented in a 48 bit variable 4644 
* as multiple of 512 bytes 4645 */ 4646 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); 4647 raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); 4648 ei->i_flags &= ~EXT4_HUGE_FILE_FL; 4649 } else { 4650 ei->i_flags |= EXT4_HUGE_FILE_FL; 4651 /* i_block is stored in file system block size */ 4652 i_blocks = i_blocks >> (inode->i_blkbits - 9); 4653 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); 4654 raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); 4655 } 4656 return 0; 4657 } 4658 4659 /* 4660 * Post the struct inode info into an on-disk inode location in the 4661 * buffer-cache. This gobbles the caller's reference to the 4662 * buffer_head in the inode location struct. 4663 * 4664 * The caller must have write access to iloc->bh. 4665 */ 4666 static int ext4_do_update_inode(handle_t *handle, 4667 struct inode *inode, 4668 struct ext4_iloc *iloc) 4669 { 4670 struct ext4_inode *raw_inode = ext4_raw_inode(iloc); 4671 struct ext4_inode_info *ei = EXT4_I(inode); 4672 struct buffer_head *bh = iloc->bh; 4673 int err = 0, rc, block; 4674 4675 /* For fields not not tracking in the in-memory inode, 4676 * initialise them to zero for new inodes. */ 4677 if (ei->i_state & EXT4_STATE_NEW) 4678 memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); 4679 4680 ext4_get_inode_flags(ei); 4681 raw_inode->i_mode = cpu_to_le16(inode->i_mode); 4682 if (!(test_opt(inode->i_sb, NO_UID32))) { 4683 raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid)); 4684 raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid)); 4685 /* 4686 * Fix up interoperability with old kernels. 
Otherwise, old inodes get 4687 * re-used with the upper 16 bits of the uid/gid intact 4688 */ 4689 if (!ei->i_dtime) { 4690 raw_inode->i_uid_high = 4691 cpu_to_le16(high_16_bits(inode->i_uid)); 4692 raw_inode->i_gid_high = 4693 cpu_to_le16(high_16_bits(inode->i_gid)); 4694 } else { 4695 raw_inode->i_uid_high = 0; 4696 raw_inode->i_gid_high = 0; 4697 } 4698 } else { 4699 raw_inode->i_uid_low = 4700 cpu_to_le16(fs_high2lowuid(inode->i_uid)); 4701 raw_inode->i_gid_low = 4702 cpu_to_le16(fs_high2lowgid(inode->i_gid)); 4703 raw_inode->i_uid_high = 0; 4704 raw_inode->i_gid_high = 0; 4705 } 4706 raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); 4707 4708 EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode); 4709 EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode); 4710 EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode); 4711 EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode); 4712 4713 if (ext4_inode_blocks_set(handle, raw_inode, ei)) 4714 goto out_brelse; 4715 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); 4716 /* clear the migrate flag in the raw_inode */ 4717 raw_inode->i_flags = cpu_to_le32(ei->i_flags & ~EXT4_EXT_MIGRATE); 4718 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != 4719 cpu_to_le32(EXT4_OS_HURD)) 4720 raw_inode->i_file_acl_high = 4721 cpu_to_le16(ei->i_file_acl >> 32); 4722 raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); 4723 ext4_isize_set(raw_inode, ei->i_disksize); 4724 if (ei->i_disksize > 0x7fffffffULL) { 4725 struct super_block *sb = inode->i_sb; 4726 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, 4727 EXT4_FEATURE_RO_COMPAT_LARGE_FILE) || 4728 EXT4_SB(sb)->s_es->s_rev_level == 4729 cpu_to_le32(EXT4_GOOD_OLD_REV)) { 4730 /* If this is the first large file 4731 * created, add a flag to the superblock. 
4732 */ 4733 err = ext4_journal_get_write_access(handle, 4734 EXT4_SB(sb)->s_sbh); 4735 if (err) 4736 goto out_brelse; 4737 ext4_update_dynamic_rev(sb); 4738 EXT4_SET_RO_COMPAT_FEATURE(sb, 4739 EXT4_FEATURE_RO_COMPAT_LARGE_FILE); 4740 sb->s_dirt = 1; 4741 ext4_handle_sync(handle); 4742 err = ext4_handle_dirty_metadata(handle, inode, 4743 EXT4_SB(sb)->s_sbh); 4744 } 4745 } 4746 raw_inode->i_generation = cpu_to_le32(inode->i_generation); 4747 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { 4748 if (old_valid_dev(inode->i_rdev)) { 4749 raw_inode->i_block[0] = 4750 cpu_to_le32(old_encode_dev(inode->i_rdev)); 4751 raw_inode->i_block[1] = 0; 4752 } else { 4753 raw_inode->i_block[0] = 0; 4754 raw_inode->i_block[1] = 4755 cpu_to_le32(new_encode_dev(inode->i_rdev)); 4756 raw_inode->i_block[2] = 0; 4757 } 4758 } else 4759 for (block = 0; block < EXT4_N_BLOCKS; block++) 4760 raw_inode->i_block[block] = ei->i_data[block]; 4761 4762 raw_inode->i_disk_version = cpu_to_le32(inode->i_version); 4763 if (ei->i_extra_isize) { 4764 if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) 4765 raw_inode->i_version_hi = 4766 cpu_to_le32(inode->i_version >> 32); 4767 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); 4768 } 4769 4770 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 4771 rc = ext4_handle_dirty_metadata(handle, inode, bh); 4772 if (!err) 4773 err = rc; 4774 ei->i_state &= ~EXT4_STATE_NEW; 4775 4776 out_brelse: 4777 brelse(bh); 4778 ext4_std_error(inode->i_sb, err); 4779 return err; 4780 } 4781 4782 /* 4783 * ext4_write_inode() 4784 * 4785 * We are called from a few places: 4786 * 4787 * - Within generic_file_write() for O_SYNC files. 4788 * Here, there will be no transaction running. We wait for any running 4789 * trasnaction to commit. 4790 * 4791 * - Within sys_sync(), kupdate and such. 4792 * We wait on commit, if tol to. 4793 * 4794 * - Within prune_icache() (PF_MEMALLOC == true) 4795 * Here we simply return. 
We can't afford to block kswapd on the 4796 * journal commit. 4797 * 4798 * In all cases it is actually safe for us to return without doing anything, 4799 * because the inode has been copied into a raw inode buffer in 4800 * ext4_mark_inode_dirty(). This is a correctness thing for O_SYNC and for 4801 * knfsd. 4802 * 4803 * Note that we are absolutely dependent upon all inode dirtiers doing the 4804 * right thing: they *must* call mark_inode_dirty() after dirtying info in 4805 * which we are interested. 4806 * 4807 * It would be a bug for them to not do this. The code: 4808 * 4809 * mark_inode_dirty(inode) 4810 * stuff(); 4811 * inode->i_size = expr; 4812 * 4813 * is in error because a kswapd-driven write_inode() could occur while 4814 * `stuff()' is running, and the new i_size will be lost. Plus the inode 4815 * will no longer be on the superblock's dirty inode list. 4816 */ 4817 int ext4_write_inode(struct inode *inode, int wait) 4818 { 4819 if (current->flags & PF_MEMALLOC) 4820 return 0; 4821 4822 if (ext4_journal_current_handle()) { 4823 jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n"); 4824 dump_stack(); 4825 return -EIO; 4826 } 4827 4828 if (!wait) 4829 return 0; 4830 4831 return ext4_force_commit(inode->i_sb); 4832 } 4833 4834 /* 4835 * ext4_setattr() 4836 * 4837 * Called from notify_change. 4838 * 4839 * We want to trap VFS attempts to truncate the file as soon as 4840 * possible. In particular, we want to make sure that when the VFS 4841 * shrinks i_size, we put the inode on the orphan list and modify 4842 * i_disksize immediately, so that during the subsequent flushing of 4843 * dirty pages and freeing of disk blocks, we can guarantee that any 4844 * commit will leave the blocks being flushed in an unused state on 4845 * disk. (On recovery, the inode will get truncated and the blocks will 4846 * be freed, so we have a strong guarantee that no future commit will 4847 * leave these blocks visible to the user.) 
4848 * 4849 * Another thing we have to assure is that if we are in ordered mode 4850 * and inode is still attached to the committing transaction, we must 4851 * we start writeout of all the dirty pages which are being truncated. 4852 * This way we are sure that all the data written in the previous 4853 * transaction are already on disk (truncate waits for pages under 4854 * writeback). 4855 * 4856 * Called with inode->i_mutex down. 4857 */ 4858 int ext4_setattr(struct dentry *dentry, struct iattr *attr) 4859 { 4860 struct inode *inode = dentry->d_inode; 4861 int error, rc = 0; 4862 const unsigned int ia_valid = attr->ia_valid; 4863 4864 error = inode_change_ok(inode, attr); 4865 if (error) 4866 return error; 4867 4868 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || 4869 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { 4870 handle_t *handle; 4871 4872 /* (user+group)*(old+new) structure, inode write (sb, 4873 * inode block, ? - but truncate inode update has it) */ 4874 handle = ext4_journal_start(inode, 2*(EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)+ 4875 EXT4_QUOTA_DEL_BLOCKS(inode->i_sb))+3); 4876 if (IS_ERR(handle)) { 4877 error = PTR_ERR(handle); 4878 goto err_out; 4879 } 4880 error = vfs_dq_transfer(inode, attr) ? 
-EDQUOT : 0; 4881 if (error) { 4882 ext4_journal_stop(handle); 4883 return error; 4884 } 4885 /* Update corresponding info in inode so that everything is in 4886 * one transaction */ 4887 if (attr->ia_valid & ATTR_UID) 4888 inode->i_uid = attr->ia_uid; 4889 if (attr->ia_valid & ATTR_GID) 4890 inode->i_gid = attr->ia_gid; 4891 error = ext4_mark_inode_dirty(handle, inode); 4892 ext4_journal_stop(handle); 4893 } 4894 4895 if (attr->ia_valid & ATTR_SIZE) { 4896 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) { 4897 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 4898 4899 if (attr->ia_size > sbi->s_bitmap_maxbytes) { 4900 error = -EFBIG; 4901 goto err_out; 4902 } 4903 } 4904 } 4905 4906 if (S_ISREG(inode->i_mode) && 4907 attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) { 4908 handle_t *handle; 4909 4910 handle = ext4_journal_start(inode, 3); 4911 if (IS_ERR(handle)) { 4912 error = PTR_ERR(handle); 4913 goto err_out; 4914 } 4915 4916 error = ext4_orphan_add(handle, inode); 4917 EXT4_I(inode)->i_disksize = attr->ia_size; 4918 rc = ext4_mark_inode_dirty(handle, inode); 4919 if (!error) 4920 error = rc; 4921 ext4_journal_stop(handle); 4922 4923 if (ext4_should_order_data(inode)) { 4924 error = ext4_begin_ordered_truncate(inode, 4925 attr->ia_size); 4926 if (error) { 4927 /* Do as much error cleanup as possible */ 4928 handle = ext4_journal_start(inode, 3); 4929 if (IS_ERR(handle)) { 4930 ext4_orphan_del(NULL, inode); 4931 goto err_out; 4932 } 4933 ext4_orphan_del(handle, inode); 4934 ext4_journal_stop(handle); 4935 goto err_out; 4936 } 4937 } 4938 } 4939 4940 rc = inode_setattr(inode, attr); 4941 4942 /* If inode_setattr's call to ext4_truncate failed to get a 4943 * transaction handle at all, we need to clean up the in-core 4944 * orphan list manually. 
*/ 4945 if (inode->i_nlink) 4946 ext4_orphan_del(NULL, inode); 4947 4948 if (!rc && (ia_valid & ATTR_MODE)) 4949 rc = ext4_acl_chmod(inode); 4950 4951 err_out: 4952 ext4_std_error(inode->i_sb, error); 4953 if (!error) 4954 error = rc; 4955 return error; 4956 } 4957 4958 int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, 4959 struct kstat *stat) 4960 { 4961 struct inode *inode; 4962 unsigned long delalloc_blocks; 4963 4964 inode = dentry->d_inode; 4965 generic_fillattr(inode, stat); 4966 4967 /* 4968 * We can't update i_blocks if the block allocation is delayed 4969 * otherwise in the case of system crash before the real block 4970 * allocation is done, we will have i_blocks inconsistent with 4971 * on-disk file blocks. 4972 * We always keep i_blocks updated together with real 4973 * allocation. But to not confuse with user, stat 4974 * will return the blocks that include the delayed allocation 4975 * blocks for this file. 4976 */ 4977 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 4978 delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks; 4979 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 4980 4981 stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9; 4982 return 0; 4983 } 4984 4985 static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks, 4986 int chunk) 4987 { 4988 int indirects; 4989 4990 /* if nrblocks are contiguous */ 4991 if (chunk) { 4992 /* 4993 * With N contiguous data blocks, it need at most 4994 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) indirect blocks 4995 * 2 dindirect blocks 4996 * 1 tindirect block 4997 */ 4998 indirects = nrblocks / EXT4_ADDR_PER_BLOCK(inode->i_sb); 4999 return indirects + 3; 5000 } 5001 /* 5002 * if nrblocks are not contiguous, worse case, each block touch 5003 * a indirect block, and each indirect block touch a double indirect 5004 * block, plus a triple indirect block 5005 */ 5006 indirects = nrblocks * 2 + 1; 5007 return indirects; 5008 } 5009 5010 static int 
ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) 5011 { 5012 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) 5013 return ext4_indirect_trans_blocks(inode, nrblocks, chunk); 5014 return ext4_ext_index_trans_blocks(inode, nrblocks, chunk); 5015 } 5016 5017 /* 5018 * Account for index blocks, block groups bitmaps and block group 5019 * descriptor blocks if modify datablocks and index blocks 5020 * worse case, the indexs blocks spread over different block groups 5021 * 5022 * If datablocks are discontiguous, they are possible to spread over 5023 * different block groups too. If they are contiugous, with flexbg, 5024 * they could still across block group boundary. 5025 * 5026 * Also account for superblock, inode, quota and xattr blocks 5027 */ 5028 int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) 5029 { 5030 ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); 5031 int gdpblocks; 5032 int idxblocks; 5033 int ret = 0; 5034 5035 /* 5036 * How many index blocks need to touch to modify nrblocks? 
5037 * The "Chunk" flag indicating whether the nrblocks is 5038 * physically contiguous on disk 5039 * 5040 * For Direct IO and fallocate, they calls get_block to allocate 5041 * one single extent at a time, so they could set the "Chunk" flag 5042 */ 5043 idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk); 5044 5045 ret = idxblocks; 5046 5047 /* 5048 * Now let's see how many group bitmaps and group descriptors need 5049 * to account 5050 */ 5051 groups = idxblocks; 5052 if (chunk) 5053 groups += 1; 5054 else 5055 groups += nrblocks; 5056 5057 gdpblocks = groups; 5058 if (groups > ngroups) 5059 groups = ngroups; 5060 if (groups > EXT4_SB(inode->i_sb)->s_gdb_count) 5061 gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count; 5062 5063 /* bitmaps and block group descriptor blocks */ 5064 ret += groups + gdpblocks; 5065 5066 /* Blocks for super block, inode, quota and xattr blocks */ 5067 ret += EXT4_META_TRANS_BLOCKS(inode->i_sb); 5068 5069 return ret; 5070 } 5071 5072 /* 5073 * Calulate the total number of credits to reserve to fit 5074 * the modification of a single pages into a single transaction, 5075 * which may include multiple chunks of block allocations. 5076 * 5077 * This could be called via ext4_write_begin() 5078 * 5079 * We need to consider the worse case, when 5080 * one new block per extent. 5081 */ 5082 int ext4_writepage_trans_blocks(struct inode *inode) 5083 { 5084 int bpp = ext4_journal_blocks_per_page(inode); 5085 int ret; 5086 5087 ret = ext4_meta_trans_blocks(inode, bpp, 0); 5088 5089 /* Account for data blocks for journalled mode */ 5090 if (ext4_should_journal_data(inode)) 5091 ret += bpp; 5092 return ret; 5093 } 5094 5095 /* 5096 * Calculate the journal credits for a chunk of data modification. 5097 * 5098 * This is called from DIO, fallocate or whoever calling 5099 * ext4_get_blocks() to map/allocate a chunk of contigous disk blocks. 
5100 * 5101 * journal buffers for data blocks are not included here, as DIO 5102 * and fallocate do no need to journal data buffers. 5103 */ 5104 int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks) 5105 { 5106 return ext4_meta_trans_blocks(inode, nrblocks, 1); 5107 } 5108 5109 /* 5110 * The caller must have previously called ext4_reserve_inode_write(). 5111 * Give this, we know that the caller already has write access to iloc->bh. 5112 */ 5113 int ext4_mark_iloc_dirty(handle_t *handle, 5114 struct inode *inode, struct ext4_iloc *iloc) 5115 { 5116 int err = 0; 5117 5118 if (test_opt(inode->i_sb, I_VERSION)) 5119 inode_inc_iversion(inode); 5120 5121 /* the do_update_inode consumes one bh->b_count */ 5122 get_bh(iloc->bh); 5123 5124 /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */ 5125 err = ext4_do_update_inode(handle, inode, iloc); 5126 put_bh(iloc->bh); 5127 return err; 5128 } 5129 5130 /* 5131 * On success, We end up with an outstanding reference count against 5132 * iloc->bh. This _must_ be cleaned up later. 5133 */ 5134 5135 int 5136 ext4_reserve_inode_write(handle_t *handle, struct inode *inode, 5137 struct ext4_iloc *iloc) 5138 { 5139 int err; 5140 5141 err = ext4_get_inode_loc(inode, iloc); 5142 if (!err) { 5143 BUFFER_TRACE(iloc->bh, "get_write_access"); 5144 err = ext4_journal_get_write_access(handle, iloc->bh); 5145 if (err) { 5146 brelse(iloc->bh); 5147 iloc->bh = NULL; 5148 } 5149 } 5150 ext4_std_error(inode->i_sb, err); 5151 return err; 5152 } 5153 5154 /* 5155 * Expand an inode by new_extra_isize bytes. 5156 * Returns 0 on success or negative error number on failure. 
5157 */ 5158 static int ext4_expand_extra_isize(struct inode *inode, 5159 unsigned int new_extra_isize, 5160 struct ext4_iloc iloc, 5161 handle_t *handle) 5162 { 5163 struct ext4_inode *raw_inode; 5164 struct ext4_xattr_ibody_header *header; 5165 struct ext4_xattr_entry *entry; 5166 5167 if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) 5168 return 0; 5169 5170 raw_inode = ext4_raw_inode(&iloc); 5171 5172 header = IHDR(inode, raw_inode); 5173 entry = IFIRST(header); 5174 5175 /* No extended attributes present */ 5176 if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR) || 5177 header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { 5178 memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0, 5179 new_extra_isize); 5180 EXT4_I(inode)->i_extra_isize = new_extra_isize; 5181 return 0; 5182 } 5183 5184 /* try to expand with EAs present */ 5185 return ext4_expand_extra_isize_ea(inode, new_extra_isize, 5186 raw_inode, handle); 5187 } 5188 5189 /* 5190 * What we do here is to mark the in-core inode as clean with respect to inode 5191 * dirtiness (it may still be data-dirty). 5192 * This means that the in-core inode may be reaped by prune_icache 5193 * without having to perform any I/O. This is a very good thing, 5194 * because *any* task may call prune_icache - even ones which 5195 * have a transaction open against a different journal. 5196 * 5197 * Is this cheating? Not really. Sure, we haven't written the 5198 * inode out, but prune_icache isn't a user-visible syncing function. 5199 * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync) 5200 * we start and wait on commits. 5201 * 5202 * Is this efficient/effective? Well, we're being nice to the system 5203 * by cleaning up our inodes proactively so they can be reaped 5204 * without I/O. But we are potentially leaving up to five seconds' 5205 * worth of inodes floating about which prune_icache wants us to 5206 * write out. 
One way to fix that would be to get prune_icache() 5207 * to do a write_super() to free up some memory. It has the desired 5208 * effect. 5209 */ 5210 int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) 5211 { 5212 struct ext4_iloc iloc; 5213 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 5214 static unsigned int mnt_count; 5215 int err, ret; 5216 5217 might_sleep(); 5218 err = ext4_reserve_inode_write(handle, inode, &iloc); 5219 if (ext4_handle_valid(handle) && 5220 EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && 5221 !(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) { 5222 /* 5223 * We need extra buffer credits since we may write into EA block 5224 * with this same handle. If journal_extend fails, then it will 5225 * only result in a minor loss of functionality for that inode. 5226 * If this is felt to be critical, then e2fsck should be run to 5227 * force a large enough s_min_extra_isize. 5228 */ 5229 if ((jbd2_journal_extend(handle, 5230 EXT4_DATA_TRANS_BLOCKS(inode->i_sb))) == 0) { 5231 ret = ext4_expand_extra_isize(inode, 5232 sbi->s_want_extra_isize, 5233 iloc, handle); 5234 if (ret) { 5235 EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND; 5236 if (mnt_count != 5237 le16_to_cpu(sbi->s_es->s_mnt_count)) { 5238 ext4_warning(inode->i_sb, __func__, 5239 "Unable to expand inode %lu. Delete" 5240 " some EAs or run e2fsck.", 5241 inode->i_ino); 5242 mnt_count = 5243 le16_to_cpu(sbi->s_es->s_mnt_count); 5244 } 5245 } 5246 } 5247 } 5248 if (!err) 5249 err = ext4_mark_iloc_dirty(handle, inode, &iloc); 5250 return err; 5251 } 5252 5253 /* 5254 * ext4_dirty_inode() is called from __mark_inode_dirty() 5255 * 5256 * We're really interested in the case where a file is being extended. 5257 * i_size has been changed by generic_commit_write() and we thus need 5258 * to include the updated inode in the current transaction. 5259 * 5260 * Also, vfs_dq_alloc_block() will always dirty the inode when blocks 5261 * are allocated to the file. 
5262 * 5263 * If the inode is marked synchronous, we don't honour that here - doing 5264 * so would cause a commit on atime updates, which we don't bother doing. 5265 * We handle synchronous inodes at the highest possible level. 5266 */ 5267 void ext4_dirty_inode(struct inode *inode) 5268 { 5269 handle_t *current_handle = ext4_journal_current_handle(); 5270 handle_t *handle; 5271 5272 if (!ext4_handle_valid(current_handle)) { 5273 ext4_mark_inode_dirty(current_handle, inode); 5274 return; 5275 } 5276 5277 handle = ext4_journal_start(inode, 2); 5278 if (IS_ERR(handle)) 5279 goto out; 5280 if (current_handle && 5281 current_handle->h_transaction != handle->h_transaction) { 5282 /* This task has a transaction open against a different fs */ 5283 printk(KERN_EMERG "%s: transactions do not match!\n", 5284 __func__); 5285 } else { 5286 jbd_debug(5, "marking dirty. outer handle=%p\n", 5287 current_handle); 5288 ext4_mark_inode_dirty(handle, inode); 5289 } 5290 ext4_journal_stop(handle); 5291 out: 5292 return; 5293 } 5294 5295 #if 0 5296 /* 5297 * Bind an inode's backing buffer_head into this transaction, to prevent 5298 * it from being flushed to disk early. Unlike 5299 * ext4_reserve_inode_write, this leaves behind no bh reference and 5300 * returns no iloc structure, so the caller needs to repeat the iloc 5301 * lookup to mark the inode dirty later. 
5302 */ 5303 static int ext4_pin_inode(handle_t *handle, struct inode *inode) 5304 { 5305 struct ext4_iloc iloc; 5306 5307 int err = 0; 5308 if (handle) { 5309 err = ext4_get_inode_loc(inode, &iloc); 5310 if (!err) { 5311 BUFFER_TRACE(iloc.bh, "get_write_access"); 5312 err = jbd2_journal_get_write_access(handle, iloc.bh); 5313 if (!err) 5314 err = ext4_handle_dirty_metadata(handle, 5315 inode, 5316 iloc.bh); 5317 brelse(iloc.bh); 5318 } 5319 } 5320 ext4_std_error(inode->i_sb, err); 5321 return err; 5322 } 5323 #endif 5324 5325 int ext4_change_inode_journal_flag(struct inode *inode, int val) 5326 { 5327 journal_t *journal; 5328 handle_t *handle; 5329 int err; 5330 5331 /* 5332 * We have to be very careful here: changing a data block's 5333 * journaling status dynamically is dangerous. If we write a 5334 * data block to the journal, change the status and then delete 5335 * that block, we risk forgetting to revoke the old log record 5336 * from the journal and so a subsequent replay can corrupt data. 5337 * So, first we make sure that the journal is empty and that 5338 * nobody is changing anything. 5339 */ 5340 5341 journal = EXT4_JOURNAL(inode); 5342 if (!journal) 5343 return 0; 5344 if (is_journal_aborted(journal)) 5345 return -EROFS; 5346 5347 jbd2_journal_lock_updates(journal); 5348 jbd2_journal_flush(journal); 5349 5350 /* 5351 * OK, there are no updates running now, and all cached data is 5352 * synced to disk. We are now in a completely consistent state 5353 * which doesn't have anything in the journal, and we know that 5354 * no filesystem updates are running, so it is safe to modify 5355 * the inode's in-core data-journaling state flag now. 5356 */ 5357 5358 if (val) 5359 EXT4_I(inode)->i_flags |= EXT4_JOURNAL_DATA_FL; 5360 else 5361 EXT4_I(inode)->i_flags &= ~EXT4_JOURNAL_DATA_FL; 5362 ext4_set_aops(inode); 5363 5364 jbd2_journal_unlock_updates(journal); 5365 5366 /* Finally we can mark the inode as dirty. 
*/ 5367 5368 handle = ext4_journal_start(inode, 1); 5369 if (IS_ERR(handle)) 5370 return PTR_ERR(handle); 5371 5372 err = ext4_mark_inode_dirty(handle, inode); 5373 ext4_handle_sync(handle); 5374 ext4_journal_stop(handle); 5375 ext4_std_error(inode->i_sb, err); 5376 5377 return err; 5378 } 5379 5380 static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh) 5381 { 5382 return !buffer_mapped(bh); 5383 } 5384 5385 int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) 5386 { 5387 struct page *page = vmf->page; 5388 loff_t size; 5389 unsigned long len; 5390 int ret = -EINVAL; 5391 void *fsdata; 5392 struct file *file = vma->vm_file; 5393 struct inode *inode = file->f_path.dentry->d_inode; 5394 struct address_space *mapping = inode->i_mapping; 5395 5396 /* 5397 * Get i_alloc_sem to stop truncates messing with the inode. We cannot 5398 * get i_mutex because we are already holding mmap_sem. 5399 */ 5400 down_read(&inode->i_alloc_sem); 5401 size = i_size_read(inode); 5402 if (page->mapping != mapping || size <= page_offset(page) 5403 || !PageUptodate(page)) { 5404 /* page got truncated from under us? */ 5405 goto out_unlock; 5406 } 5407 ret = 0; 5408 if (PageMappedToDisk(page)) 5409 goto out_unlock; 5410 5411 if (page->index == size >> PAGE_CACHE_SHIFT) 5412 len = size & ~PAGE_CACHE_MASK; 5413 else 5414 len = PAGE_CACHE_SIZE; 5415 5416 if (page_has_buffers(page)) { 5417 /* return if we have all the buffers mapped */ 5418 if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, 5419 ext4_bh_unmapped)) 5420 goto out_unlock; 5421 } 5422 /* 5423 * OK, we need to fill the hole... Do write_begin write_end 5424 * to do block allocation/reservation.We are not holding 5425 * inode.i__mutex here. That allow * parallel write_begin, 5426 * write_end call. 
lock_page prevent this from happening 5427 * on the same page though 5428 */ 5429 ret = mapping->a_ops->write_begin(file, mapping, page_offset(page), 5430 len, AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata); 5431 if (ret < 0) 5432 goto out_unlock; 5433 ret = mapping->a_ops->write_end(file, mapping, page_offset(page), 5434 len, len, page, fsdata); 5435 if (ret < 0) 5436 goto out_unlock; 5437 ret = 0; 5438 out_unlock: 5439 if (ret) 5440 ret = VM_FAULT_SIGBUS; 5441 up_read(&inode->i_alloc_sem); 5442 return ret; 5443 } 5444