1 /* 2 * linux/fs/ext4/inode.c 3 * 4 * Copyright (C) 1992, 1993, 1994, 1995 5 * Remy Card (card@masi.ibp.fr) 6 * Laboratoire MASI - Institut Blaise Pascal 7 * Universite Pierre et Marie Curie (Paris VI) 8 * 9 * from 10 * 11 * linux/fs/minix/inode.c 12 * 13 * Copyright (C) 1991, 1992 Linus Torvalds 14 * 15 * Goal-directed block allocation by Stephen Tweedie 16 * (sct@redhat.com), 1993, 1998 17 * Big-endian to little-endian byte-swapping/bitmaps by 18 * David S. Miller (davem@caip.rutgers.edu), 1995 19 * 64-bit file support on 64-bit platforms by Jakub Jelinek 20 * (jj@sunsite.ms.mff.cuni.cz) 21 * 22 * Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000 23 */ 24 25 #include <linux/module.h> 26 #include <linux/fs.h> 27 #include <linux/time.h> 28 #include <linux/jbd2.h> 29 #include <linux/highuid.h> 30 #include <linux/pagemap.h> 31 #include <linux/quotaops.h> 32 #include <linux/string.h> 33 #include <linux/buffer_head.h> 34 #include <linux/writeback.h> 35 #include <linux/pagevec.h> 36 #include <linux/mpage.h> 37 #include <linux/namei.h> 38 #include <linux/uio.h> 39 #include <linux/bio.h> 40 #include "ext4_jbd2.h" 41 #include "xattr.h" 42 #include "acl.h" 43 #include "ext4_extents.h" 44 45 #define MPAGE_DA_EXTENT_TAIL 0x01 46 47 static inline int ext4_begin_ordered_truncate(struct inode *inode, 48 loff_t new_size) 49 { 50 return jbd2_journal_begin_ordered_truncate( 51 EXT4_SB(inode->i_sb)->s_journal, 52 &EXT4_I(inode)->jinode, 53 new_size); 54 } 55 56 static void ext4_invalidatepage(struct page *page, unsigned long offset); 57 58 /* 59 * Test whether an inode is a fast symlink. 60 */ 61 static int ext4_inode_is_fast_symlink(struct inode *inode) 62 { 63 int ea_blocks = EXT4_I(inode)->i_file_acl ? 64 (inode->i_sb->s_blocksize >> 9) : 0; 65 66 return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0); 67 } 68 69 /* 70 * The ext4 forget function must perform a revoke if we are freeing data 71 * which has been journaled. Metadata (eg. indirect blocks) must be 72 * revoked in all cases. 73 * 74 * "bh" may be NULL: a metadata block may have been freed from memory 75 * but there may still be a record of it in the journal, and that record 76 * still needs to be revoked. 77 * 78 * If the handle isn't valid we're not journaling so there's nothing to do. 79 */ 80 int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, 81 struct buffer_head *bh, ext4_fsblk_t blocknr) 82 { 83 int err; 84 85 if (!ext4_handle_valid(handle)) 86 return 0; 87 88 might_sleep(); 89 90 BUFFER_TRACE(bh, "enter"); 91 92 jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, " 93 "data mode %lx\n", 94 bh, is_metadata, inode->i_mode, 95 test_opt(inode->i_sb, DATA_FLAGS)); 96 97 /* Never use the revoke function if we are doing full data 98 * journaling: there is no need to, and a V1 superblock won't 99 * support it. Otherwise, only skip the revoke on un-journaled 100 * data blocks. 
*/ 101 102 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || 103 (!is_metadata && !ext4_should_journal_data(inode))) { 104 if (bh) { 105 BUFFER_TRACE(bh, "call jbd2_journal_forget"); 106 return ext4_journal_forget(handle, bh); 107 } 108 return 0; 109 } 110 111 /* 112 * data!=journal && (is_metadata || should_journal_data(inode)) 113 */ 114 BUFFER_TRACE(bh, "call ext4_journal_revoke"); 115 err = ext4_journal_revoke(handle, blocknr, bh); 116 if (err) 117 ext4_abort(inode->i_sb, __func__, 118 "error %d when attempting revoke", err); 119 BUFFER_TRACE(bh, "exit"); 120 return err; 121 } 122 123 /* 124 * Work out how many blocks we need to proceed with the next chunk of a 125 * truncate transaction. 126 */ 127 static unsigned long blocks_for_truncate(struct inode *inode) 128 { 129 ext4_lblk_t needed; 130 131 needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9); 132 133 /* Give ourselves just enough room to cope with inodes in which 134 * i_blocks is corrupt: we've seen disk corruptions in the past 135 * which resulted in random data in an inode which looked enough 136 * like a regular file for ext4 to try to delete it. Things 137 * will go a bit crazy if that happens, but at least we should 138 * try not to panic the whole kernel. */ 139 if (needed < 2) 140 needed = 2; 141 142 /* But we need to bound the transaction so we don't overflow the 143 * journal. */ 144 if (needed > EXT4_MAX_TRANS_DATA) 145 needed = EXT4_MAX_TRANS_DATA; 146 147 return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed; 148 } 149 150 /* 151 * Truncate transactions can be complex and absolutely huge. So we need to 152 * be able to restart the transaction at a conventient checkpoint to make 153 * sure we don't overflow the journal. 154 * 155 * start_transaction gets us a new handle for a truncate transaction, 156 * and extend_transaction tries to extend the existing one a bit. If 157 * extend fails, we need to propagate the failure up and restart the 158 * transaction in the top-level truncate loop. --sct 159 */ 160 static handle_t *start_transaction(struct inode *inode) 161 { 162 handle_t *result; 163 164 result = ext4_journal_start(inode, blocks_for_truncate(inode)); 165 if (!IS_ERR(result)) 166 return result; 167 168 ext4_std_error(inode->i_sb, PTR_ERR(result)); 169 return result; 170 } 171 172 /* 173 * Try to extend this transaction for the purposes of truncation. 174 * 175 * Returns 0 if we managed to create more room. If we can't create more 176 * room, and the transaction must be restarted we return 1. 177 */ 178 static int try_to_extend_transaction(handle_t *handle, struct inode *inode) 179 { 180 if (!ext4_handle_valid(handle)) 181 return 0; 182 if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1)) 183 return 0; 184 if (!ext4_journal_extend(handle, blocks_for_truncate(inode))) 185 return 0; 186 return 1; 187 } 188 189 /* 190 * Restart the transaction associated with *handle. This does a commit, 191 * so before we call here everything must be consistently dirtied against 192 * this transaction. 193 */ 194 static int ext4_journal_test_restart(handle_t *handle, struct inode *inode) 195 { 196 BUG_ON(EXT4_JOURNAL(inode) == NULL); 197 jbd_debug(2, "restarting handle %p\n", handle); 198 return ext4_journal_restart(handle, blocks_for_truncate(inode)); 199 } 200 201 /* 202 * Called at the last iput() if i_nlink is zero. 
203 */ 204 void ext4_delete_inode(struct inode *inode) 205 { 206 handle_t *handle; 207 int err; 208 209 if (ext4_should_order_data(inode)) 210 ext4_begin_ordered_truncate(inode, 0); 211 truncate_inode_pages(&inode->i_data, 0); 212 213 if (is_bad_inode(inode)) 214 goto no_delete; 215 216 handle = ext4_journal_start(inode, blocks_for_truncate(inode)+3); 217 if (IS_ERR(handle)) { 218 ext4_std_error(inode->i_sb, PTR_ERR(handle)); 219 /* 220 * If we're going to skip the normal cleanup, we still need to 221 * make sure that the in-core orphan linked list is properly 222 * cleaned up. 223 */ 224 ext4_orphan_del(NULL, inode); 225 goto no_delete; 226 } 227 228 if (IS_SYNC(inode)) 229 ext4_handle_sync(handle); 230 inode->i_size = 0; 231 err = ext4_mark_inode_dirty(handle, inode); 232 if (err) { 233 ext4_warning(inode->i_sb, __func__, 234 "couldn't mark inode dirty (err %d)", err); 235 goto stop_handle; 236 } 237 if (inode->i_blocks) 238 ext4_truncate(inode); 239 240 /* 241 * ext4_ext_truncate() doesn't reserve any slop when it 242 * restarts journal transactions; therefore there may not be 243 * enough credits left in the handle to remove the inode from 244 * the orphan list and set the dtime field. 245 */ 246 if (!ext4_handle_has_enough_credits(handle, 3)) { 247 err = ext4_journal_extend(handle, 3); 248 if (err > 0) 249 err = ext4_journal_restart(handle, 3); 250 if (err != 0) { 251 ext4_warning(inode->i_sb, __func__, 252 "couldn't extend journal (err %d)", err); 253 stop_handle: 254 ext4_journal_stop(handle); 255 goto no_delete; 256 } 257 } 258 259 /* 260 * Kill off the orphan record which ext4_truncate created. 261 * AKPM: I think this can be inside the above `if'. 262 * Note that ext4_orphan_del() has to be able to cope with the 263 * deletion of a non-existent orphan - this is because we don't 264 * know if ext4_truncate() actually created an orphan record. 265 * (Well, we could do this if we need to, but heck - it works) 266 */ 267 ext4_orphan_del(handle, inode); 268 EXT4_I(inode)->i_dtime = get_seconds(); 269 270 /* 271 * One subtle ordering requirement: if anything has gone wrong 272 * (transaction abort, IO errors, whatever), then we can still 273 * do these next steps (the fs will already have been marked as 274 * having errors), but we can't free the inode if the mark_dirty 275 * fails. 276 */ 277 if (ext4_mark_inode_dirty(handle, inode)) 278 /* If that failed, just do the required in-core inode clear. */ 279 clear_inode(inode); 280 else 281 ext4_free_inode(handle, inode); 282 ext4_journal_stop(handle); 283 return; 284 no_delete: 285 clear_inode(inode); /* We must guarantee clearing of inode... */ 286 } 287 288 typedef struct { 289 __le32 *p; 290 __le32 key; 291 struct buffer_head *bh; 292 } Indirect; 293 294 static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v) 295 { 296 p->key = *(p->p = v); 297 p->bh = bh; 298 } 299 300 /** 301 * ext4_block_to_path - parse the block number into array of offsets 302 * @inode: inode in question (we are only interested in its superblock) 303 * @i_block: block number to be parsed 304 * @offsets: array to store the offsets in 305 * @boundary: set this non-zero if the referred-to block is likely to be 306 * followed (on disk) by an indirect block. 307 * 308 * To store the locations of file's data ext4 uses a data structure common 309 * for UNIX filesystems - tree of pointers anchored in the inode, with 310 * data blocks at leaves and indirect blocks in intermediate nodes. 
311 * This function translates the block number into path in that tree - 312 * return value is the path length and @offsets[n] is the offset of 313 * pointer to (n+1)th node in the nth one. If @block is out of range 314 * (negative or too large) warning is printed and zero returned. 315 * 316 * Note: function doesn't find node addresses, so no IO is needed. All 317 * we need to know is the capacity of indirect blocks (taken from the 318 * inode->i_sb). 319 */ 320 321 /* 322 * Portability note: the last comparison (check that we fit into triple 323 * indirect block) is spelled differently, because otherwise on an 324 * architecture with 32-bit longs and 8Kb pages we might get into trouble 325 * if our filesystem had 8Kb blocks. We might use long long, but that would 326 * kill us on x86. Oh, well, at least the sign propagation does not matter - 327 * i_block would have to be negative in the very beginning, so we would not 328 * get there at all. 329 */ 330 331 static int ext4_block_to_path(struct inode *inode, 332 ext4_lblk_t i_block, 333 ext4_lblk_t offsets[4], int *boundary) 334 { 335 int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb); 336 int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb); 337 const long direct_blocks = EXT4_NDIR_BLOCKS, 338 indirect_blocks = ptrs, 339 double_blocks = (1 << (ptrs_bits * 2)); 340 int n = 0; 341 int final = 0; 342 343 if (i_block < 0) { 344 ext4_warning(inode->i_sb, "ext4_block_to_path", "block < 0"); 345 } else if (i_block < direct_blocks) { 346 offsets[n++] = i_block; 347 final = direct_blocks; 348 } else if ((i_block -= direct_blocks) < indirect_blocks) { 349 offsets[n++] = EXT4_IND_BLOCK; 350 offsets[n++] = i_block; 351 final = ptrs; 352 } else if ((i_block -= indirect_blocks) < double_blocks) { 353 offsets[n++] = EXT4_DIND_BLOCK; 354 offsets[n++] = i_block >> ptrs_bits; 355 offsets[n++] = i_block & (ptrs - 1); 356 final = ptrs; 357 } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) { 358 offsets[n++] = EXT4_TIND_BLOCK; 359 offsets[n++] = i_block >> (ptrs_bits * 2); 360 offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1); 361 offsets[n++] = i_block & (ptrs - 1); 362 final = ptrs; 363 } else { 364 ext4_warning(inode->i_sb, "ext4_block_to_path", 365 "block %lu > max in inode %lu", 366 i_block + direct_blocks + 367 indirect_blocks + double_blocks, inode->i_ino); 368 } 369 if (boundary) 370 *boundary = final - 1 - (i_block & (ptrs - 1)); 371 return n; 372 } 373 374 /** 375 * ext4_get_branch - read the chain of indirect blocks leading to data 376 * @inode: inode in question 377 * @depth: depth of the chain (1 - direct pointer, etc.) 378 * @offsets: offsets of pointers in inode/indirect blocks 379 * @chain: place to store the result 380 * @err: here we store the error value 381 * 382 * Function fills the array of triples <key, p, bh> and returns %NULL 383 * if everything went OK or the pointer to the last filled triple 384 * (incomplete one) otherwise. Upon the return chain[i].key contains 385 * the number of (i+1)-th block in the chain (as it is stored in memory, 386 * i.e. little-endian 32-bit), chain[i].p contains the address of that 387 * number (it points into struct inode for i==0 and into the bh->b_data 388 * for i>0) and chain[i].bh points to the buffer_head of i-th indirect 389 * block for i>0 and NULL for i==0. In other words, it holds the block 390 * numbers of the chain, addresses they were taken from (and where we can 391 * verify that chain did not change) and buffer_heads hosting these 392 * numbers. 
393 * 394 * Function stops when it stumbles upon zero pointer (absent block) 395 * (pointer to last triple returned, *@err == 0) 396 * or when it gets an IO error reading an indirect block 397 * (ditto, *@err == -EIO) 398 * or when it reads all @depth-1 indirect blocks successfully and finds 399 * the whole chain, all way to the data (returns %NULL, *err == 0). 400 * 401 * Need to be called with 402 * down_read(&EXT4_I(inode)->i_data_sem) 403 */ 404 static Indirect *ext4_get_branch(struct inode *inode, int depth, 405 ext4_lblk_t *offsets, 406 Indirect chain[4], int *err) 407 { 408 struct super_block *sb = inode->i_sb; 409 Indirect *p = chain; 410 struct buffer_head *bh; 411 412 *err = 0; 413 /* i_data is not going away, no lock needed */ 414 add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets); 415 if (!p->key) 416 goto no_block; 417 while (--depth) { 418 bh = sb_bread(sb, le32_to_cpu(p->key)); 419 if (!bh) 420 goto failure; 421 add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets); 422 /* Reader: end */ 423 if (!p->key) 424 goto no_block; 425 } 426 return NULL; 427 428 failure: 429 *err = -EIO; 430 no_block: 431 return p; 432 } 433 434 /** 435 * ext4_find_near - find a place for allocation with sufficient locality 436 * @inode: owner 437 * @ind: descriptor of indirect block. 438 * 439 * This function returns the preferred place for block allocation. 440 * It is used when heuristic for sequential allocation fails. 441 * Rules are: 442 * + if there is a block to the left of our position - allocate near it. 443 * + if pointer will live in indirect block - allocate near that block. 444 * + if pointer will live in inode - allocate in the same 445 * cylinder group. 446 * 447 * In the latter case we colour the starting block by the callers PID to 448 * prevent it from clashing with concurrent allocations for a different inode 449 * in the same block group. The PID is used here so that functionally related 450 * files will be close-by on-disk. 451 * 452 * Caller must make sure that @ind is valid and will stay that way. 453 */ 454 static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) 455 { 456 struct ext4_inode_info *ei = EXT4_I(inode); 457 __le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data; 458 __le32 *p; 459 ext4_fsblk_t bg_start; 460 ext4_fsblk_t last_block; 461 ext4_grpblk_t colour; 462 463 /* Try to find previous block */ 464 for (p = ind->p - 1; p >= start; p--) { 465 if (*p) 466 return le32_to_cpu(*p); 467 } 468 469 /* No such thing, so let's try location of indirect block */ 470 if (ind->bh) 471 return ind->bh->b_blocknr; 472 473 /* 474 * It is going to be referred to from the inode itself? OK, just put it 475 * into the same cylinder group then. 476 */ 477 bg_start = ext4_group_first_block_no(inode->i_sb, ei->i_block_group); 478 last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; 479 480 if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) 481 colour = (current->pid % 16) * 482 (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); 483 else 484 colour = (current->pid % 16) * ((last_block - bg_start) / 16); 485 return bg_start + colour; 486 } 487 488 /** 489 * ext4_find_goal - find a preferred place for allocation. 490 * @inode: owner 491 * @block: block we want 492 * @partial: pointer to the last triple within a chain 493 * 494 * Normally this function find the preferred place for block allocation, 495 * returns it. 
496 */ 497 static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, 498 Indirect *partial) 499 { 500 /* 501 * XXX need to get goal block from mballoc's data structures 502 */ 503 504 return ext4_find_near(inode, partial); 505 } 506 507 /** 508 * ext4_blks_to_allocate: Look up the block map and count the number 509 * of direct blocks need to be allocated for the given branch. 510 * 511 * @branch: chain of indirect blocks 512 * @k: number of blocks need for indirect blocks 513 * @blks: number of data blocks to be mapped. 514 * @blocks_to_boundary: the offset in the indirect block 515 * 516 * return the total number of blocks to be allocate, including the 517 * direct and indirect blocks. 518 */ 519 static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks, 520 int blocks_to_boundary) 521 { 522 unsigned int count = 0; 523 524 /* 525 * Simple case, [t,d]Indirect block(s) has not allocated yet 526 * then it's clear blocks on that path have not allocated 527 */ 528 if (k > 0) { 529 /* right now we don't handle cross boundary allocation */ 530 if (blks < blocks_to_boundary + 1) 531 count += blks; 532 else 533 count += blocks_to_boundary + 1; 534 return count; 535 } 536 537 count++; 538 while (count < blks && count <= blocks_to_boundary && 539 le32_to_cpu(*(branch[0].p + count)) == 0) { 540 count++; 541 } 542 return count; 543 } 544 545 /** 546 * ext4_alloc_blocks: multiple allocate blocks needed for a branch 547 * @indirect_blks: the number of blocks need to allocate for indirect 548 * blocks 549 * 550 * @new_blocks: on return it will store the new block numbers for 551 * the indirect blocks(if needed) and the first direct block, 552 * @blks: on return it will store the total number of allocated 553 * direct blocks 554 */ 555 static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, 556 ext4_lblk_t iblock, ext4_fsblk_t goal, 557 int indirect_blks, int blks, 558 ext4_fsblk_t new_blocks[4], int *err) 559 { 560 struct ext4_allocation_request ar; 561 int target, i; 562 unsigned long count = 0, blk_allocated = 0; 563 int index = 0; 564 ext4_fsblk_t current_block = 0; 565 int ret = 0; 566 567 /* 568 * Here we try to allocate the requested multiple blocks at once, 569 * on a best-effort basis. 570 * To build a branch, we should allocate blocks for 571 * the indirect blocks(if not allocated yet), and at least 572 * the first direct block of this branch. 
That's the 573 * minimum number of blocks need to allocate(required) 574 */ 575 /* first we try to allocate the indirect blocks */ 576 target = indirect_blks; 577 while (target > 0) { 578 count = target; 579 /* allocating blocks for indirect blocks and direct blocks */ 580 current_block = ext4_new_meta_blocks(handle, inode, 581 goal, &count, err); 582 if (*err) 583 goto failed_out; 584 585 target -= count; 586 /* allocate blocks for indirect blocks */ 587 while (index < indirect_blks && count) { 588 new_blocks[index++] = current_block++; 589 count--; 590 } 591 if (count > 0) { 592 /* 593 * save the new block number 594 * for the first direct block 595 */ 596 new_blocks[index] = current_block; 597 printk(KERN_INFO "%s returned more blocks than " 598 "requested\n", __func__); 599 WARN_ON(1); 600 break; 601 } 602 } 603 604 target = blks - count ; 605 blk_allocated = count; 606 if (!target) 607 goto allocated; 608 /* Now allocate data blocks */ 609 memset(&ar, 0, sizeof(ar)); 610 ar.inode = inode; 611 ar.goal = goal; 612 ar.len = target; 613 ar.logical = iblock; 614 if (S_ISREG(inode->i_mode)) 615 /* enable in-core preallocation only for regular files */ 616 ar.flags = EXT4_MB_HINT_DATA; 617 618 current_block = ext4_mb_new_blocks(handle, &ar, err); 619 620 if (*err && (target == blks)) { 621 /* 622 * if the allocation failed and we didn't allocate 623 * any blocks before 624 */ 625 goto failed_out; 626 } 627 if (!*err) { 628 if (target == blks) { 629 /* 630 * save the new block number 631 * for the first direct block 632 */ 633 new_blocks[index] = current_block; 634 } 635 blk_allocated += ar.len; 636 } 637 allocated: 638 /* total number of blocks allocated for direct blocks */ 639 ret = blk_allocated; 640 *err = 0; 641 return ret; 642 failed_out: 643 for (i = 0; i < index; i++) 644 ext4_free_blocks(handle, inode, new_blocks[i], 1, 0); 645 return ret; 646 } 647 648 /** 649 * ext4_alloc_branch - allocate and set up a chain of blocks. 650 * @inode: owner 651 * @indirect_blks: number of allocated indirect blocks 652 * @blks: number of allocated direct blocks 653 * @offsets: offsets (in the blocks) to store the pointers to next. 654 * @branch: place to store the chain in. 655 * 656 * This function allocates blocks, zeroes out all but the last one, 657 * links them into chain and (if we are synchronous) writes them to disk. 658 * In other words, it prepares a branch that can be spliced onto the 659 * inode. It stores the information about that chain in the branch[], in 660 * the same format as ext4_get_branch() would do. We are calling it after 661 * we had read the existing part of chain and partial points to the last 662 * triple of that (one with zero ->key). Upon the exit we have the same 663 * picture as after the successful ext4_get_block(), except that in one 664 * place chain is disconnected - *branch->p is still zero (we did not 665 * set the last link), but branch->key contains the number that should 666 * be placed into *branch->p to fill that gap. 667 * 668 * If allocation fails we free all blocks we've allocated (and forget 669 * their buffer_heads) and return the error value the from failed 670 * ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain 671 * as described above and return 0. 
672 */ 673 static int ext4_alloc_branch(handle_t *handle, struct inode *inode, 674 ext4_lblk_t iblock, int indirect_blks, 675 int *blks, ext4_fsblk_t goal, 676 ext4_lblk_t *offsets, Indirect *branch) 677 { 678 int blocksize = inode->i_sb->s_blocksize; 679 int i, n = 0; 680 int err = 0; 681 struct buffer_head *bh; 682 int num; 683 ext4_fsblk_t new_blocks[4]; 684 ext4_fsblk_t current_block; 685 686 num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks, 687 *blks, new_blocks, &err); 688 if (err) 689 return err; 690 691 branch[0].key = cpu_to_le32(new_blocks[0]); 692 /* 693 * metadata blocks and data blocks are allocated. 694 */ 695 for (n = 1; n <= indirect_blks; n++) { 696 /* 697 * Get buffer_head for parent block, zero it out 698 * and set the pointer to new one, then send 699 * parent to disk. 700 */ 701 bh = sb_getblk(inode->i_sb, new_blocks[n-1]); 702 branch[n].bh = bh; 703 lock_buffer(bh); 704 BUFFER_TRACE(bh, "call get_create_access"); 705 err = ext4_journal_get_create_access(handle, bh); 706 if (err) { 707 unlock_buffer(bh); 708 brelse(bh); 709 goto failed; 710 } 711 712 memset(bh->b_data, 0, blocksize); 713 branch[n].p = (__le32 *) bh->b_data + offsets[n]; 714 branch[n].key = cpu_to_le32(new_blocks[n]); 715 *branch[n].p = branch[n].key; 716 if (n == indirect_blks) { 717 current_block = new_blocks[n]; 718 /* 719 * End of chain, update the last new metablock of 720 * the chain to point to the new allocated 721 * data blocks numbers 722 */ 723 for (i=1; i < num; i++) 724 *(branch[n].p + i) = cpu_to_le32(++current_block); 725 } 726 BUFFER_TRACE(bh, "marking uptodate"); 727 set_buffer_uptodate(bh); 728 unlock_buffer(bh); 729 730 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 731 err = ext4_handle_dirty_metadata(handle, inode, bh); 732 if (err) 733 goto failed; 734 } 735 *blks = num; 736 return err; 737 failed: 738 /* Allocation failed, free what we already allocated */ 739 for (i = 1; i <= n ; i++) { 740 BUFFER_TRACE(branch[i].bh, "call jbd2_journal_forget"); 741 ext4_journal_forget(handle, branch[i].bh); 742 } 743 for (i = 0; i < indirect_blks; i++) 744 ext4_free_blocks(handle, inode, new_blocks[i], 1, 0); 745 746 ext4_free_blocks(handle, inode, new_blocks[i], num, 0); 747 748 return err; 749 } 750 751 /** 752 * ext4_splice_branch - splice the allocated branch onto inode. 753 * @inode: owner 754 * @block: (logical) number of block we are adding 755 * @chain: chain of indirect blocks (with a missing link - see 756 * ext4_alloc_branch) 757 * @where: location of missing link 758 * @num: number of indirect blocks we are adding 759 * @blks: number of direct blocks we are adding 760 * 761 * This function fills the missing link and does all housekeeping needed in 762 * inode (->i_blocks, etc.). In case of success we end up with the full 763 * chain to new block and return 0. 764 */ 765 static int ext4_splice_branch(handle_t *handle, struct inode *inode, 766 ext4_lblk_t block, Indirect *where, int num, int blks) 767 { 768 int i; 769 int err = 0; 770 ext4_fsblk_t current_block; 771 772 /* 773 * If we're splicing into a [td]indirect block (as opposed to the 774 * inode) then we need to get write access to the [td]indirect block 775 * before the splice. 
776 */ 777 if (where->bh) { 778 BUFFER_TRACE(where->bh, "get_write_access"); 779 err = ext4_journal_get_write_access(handle, where->bh); 780 if (err) 781 goto err_out; 782 } 783 /* That's it */ 784 785 *where->p = where->key; 786 787 /* 788 * Update the host buffer_head or inode to point to more just allocated 789 * direct blocks blocks 790 */ 791 if (num == 0 && blks > 1) { 792 current_block = le32_to_cpu(where->key) + 1; 793 for (i = 1; i < blks; i++) 794 *(where->p + i) = cpu_to_le32(current_block++); 795 } 796 797 /* We are done with atomic stuff, now do the rest of housekeeping */ 798 799 inode->i_ctime = ext4_current_time(inode); 800 ext4_mark_inode_dirty(handle, inode); 801 802 /* had we spliced it onto indirect block? */ 803 if (where->bh) { 804 /* 805 * If we spliced it onto an indirect block, we haven't 806 * altered the inode. Note however that if it is being spliced 807 * onto an indirect block at the very end of the file (the 808 * file is growing) then we *will* alter the inode to reflect 809 * the new i_size. But that is not done here - it is done in 810 * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode. 811 */ 812 jbd_debug(5, "splicing indirect only\n"); 813 BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata"); 814 err = ext4_handle_dirty_metadata(handle, inode, where->bh); 815 if (err) 816 goto err_out; 817 } else { 818 /* 819 * OK, we spliced it into the inode itself on a direct block. 820 * Inode was dirtied above. 821 */ 822 jbd_debug(5, "splicing direct\n"); 823 } 824 return err; 825 826 err_out: 827 for (i = 1; i <= num; i++) { 828 BUFFER_TRACE(where[i].bh, "call jbd2_journal_forget"); 829 ext4_journal_forget(handle, where[i].bh); 830 ext4_free_blocks(handle, inode, 831 le32_to_cpu(where[i-1].key), 1, 0); 832 } 833 ext4_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks, 0); 834 835 return err; 836 } 837 838 /* 839 * Allocation strategy is simple: if we have to allocate something, we will 840 * have to go the whole way to leaf. So let's do it before attaching anything 841 * to tree, set linkage between the newborn blocks, write them if sync is 842 * required, recheck the path, free and repeat if check fails, otherwise 843 * set the last missing link (that will protect us from any truncate-generated 844 * removals - all blocks on the path are immune now) and possibly force the 845 * write on the parent block. 846 * That has a nice additional property: no special recovery from the failed 847 * allocations is needed - we simply release blocks and do not touch anything 848 * reachable from inode. 849 * 850 * `handle' can be NULL if create == 0. 851 * 852 * return > 0, # of blocks mapped or allocated. 853 * return = 0, if plain lookup failed. 854 * return < 0, error case. 855 * 856 * 857 * Need to be called with 858 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block 859 * (ie, create is zero). 
Otherwise down_write(&EXT4_I(inode)->i_data_sem) 860 */ 861 static int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, 862 ext4_lblk_t iblock, unsigned int maxblocks, 863 struct buffer_head *bh_result, 864 int create, int extend_disksize) 865 { 866 int err = -EIO; 867 ext4_lblk_t offsets[4]; 868 Indirect chain[4]; 869 Indirect *partial; 870 ext4_fsblk_t goal; 871 int indirect_blks; 872 int blocks_to_boundary = 0; 873 int depth; 874 struct ext4_inode_info *ei = EXT4_I(inode); 875 int count = 0; 876 ext4_fsblk_t first_block = 0; 877 loff_t disksize; 878 879 880 J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)); 881 J_ASSERT(handle != NULL || create == 0); 882 depth = ext4_block_to_path(inode, iblock, offsets, 883 &blocks_to_boundary); 884 885 if (depth == 0) 886 goto out; 887 888 partial = ext4_get_branch(inode, depth, offsets, chain, &err); 889 890 /* Simplest case - block found, no allocation needed */ 891 if (!partial) { 892 first_block = le32_to_cpu(chain[depth - 1].key); 893 clear_buffer_new(bh_result); 894 count++; 895 /*map more blocks*/ 896 while (count < maxblocks && count <= blocks_to_boundary) { 897 ext4_fsblk_t blk; 898 899 blk = le32_to_cpu(*(chain[depth-1].p + count)); 900 901 if (blk == first_block + count) 902 count++; 903 else 904 break; 905 } 906 goto got_it; 907 } 908 909 /* Next simple case - plain lookup or failed read of indirect block */ 910 if (!create || err == -EIO) 911 goto cleanup; 912 913 /* 914 * Okay, we need to do block allocation. 915 */ 916 goal = ext4_find_goal(inode, iblock, partial); 917 918 /* the number of blocks need to allocate for [d,t]indirect blocks */ 919 indirect_blks = (chain + depth) - partial - 1; 920 921 /* 922 * Next look up the indirect map to count the totoal number of 923 * direct blocks to allocate for this branch. 924 */ 925 count = ext4_blks_to_allocate(partial, indirect_blks, 926 maxblocks, blocks_to_boundary); 927 /* 928 * Block out ext4_truncate while we alter the tree 929 */ 930 err = ext4_alloc_branch(handle, inode, iblock, indirect_blks, 931 &count, goal, 932 offsets + (partial - chain), partial); 933 934 /* 935 * The ext4_splice_branch call will free and forget any buffers 936 * on the new chain if there is a failure, but that risks using 937 * up transaction credits, especially for bitmaps where the 938 * credits cannot be returned. Can we handle this somehow? We 939 * may need to return -EAGAIN upwards in the worst case. --sct 940 */ 941 if (!err) 942 err = ext4_splice_branch(handle, inode, iblock, 943 partial, indirect_blks, count); 944 /* 945 * i_disksize growing is protected by i_data_sem. 
Don't forget to 946 * protect it if you're about to implement concurrent 947 * ext4_get_block() -bzzz 948 */ 949 if (!err && extend_disksize) { 950 disksize = ((loff_t) iblock + count) << inode->i_blkbits; 951 if (disksize > i_size_read(inode)) 952 disksize = i_size_read(inode); 953 if (disksize > ei->i_disksize) 954 ei->i_disksize = disksize; 955 } 956 if (err) 957 goto cleanup; 958 959 set_buffer_new(bh_result); 960 got_it: 961 map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key)); 962 if (count > blocks_to_boundary) 963 set_buffer_boundary(bh_result); 964 err = count; 965 /* Clean up and exit */ 966 partial = chain + depth - 1; /* the whole chain */ 967 cleanup: 968 while (partial > chain) { 969 BUFFER_TRACE(partial->bh, "call brelse"); 970 brelse(partial->bh); 971 partial--; 972 } 973 BUFFER_TRACE(bh_result, "returned"); 974 out: 975 return err; 976 } 977 978 /* 979 * Calculate the number of metadata blocks need to reserve 980 * to allocate @blocks for non extent file based file 981 */ 982 static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks) 983 { 984 int icap = EXT4_ADDR_PER_BLOCK(inode->i_sb); 985 int ind_blks, dind_blks, tind_blks; 986 987 /* number of new indirect blocks needed */ 988 ind_blks = (blocks + icap - 1) / icap; 989 990 dind_blks = (ind_blks + icap - 1) / icap; 991 992 tind_blks = 1; 993 994 return ind_blks + dind_blks + tind_blks; 995 } 996 997 /* 998 * Calculate the number of metadata blocks need to reserve 999 * to allocate given number of blocks 1000 */ 1001 static int ext4_calc_metadata_amount(struct inode *inode, int blocks) 1002 { 1003 if (!blocks) 1004 return 0; 1005 1006 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) 1007 return ext4_ext_calc_metadata_amount(inode, blocks); 1008 1009 return ext4_indirect_calc_metadata_amount(inode, blocks); 1010 } 1011 1012 static void ext4_da_update_reserve_space(struct inode *inode, int used) 1013 { 1014 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1015 int total, mdb, mdb_free; 1016 1017 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 1018 /* recalculate the number of metablocks still need to be reserved */ 1019 total = EXT4_I(inode)->i_reserved_data_blocks - used; 1020 mdb = ext4_calc_metadata_amount(inode, total); 1021 1022 /* figure out how many metablocks to release */ 1023 BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); 1024 mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb; 1025 1026 if (mdb_free) { 1027 /* Account for allocated meta_blocks */ 1028 mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks; 1029 1030 /* update fs dirty blocks counter */ 1031 percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free); 1032 EXT4_I(inode)->i_allocated_meta_blocks = 0; 1033 EXT4_I(inode)->i_reserved_meta_blocks = mdb; 1034 } 1035 1036 /* update per-inode reservations */ 1037 BUG_ON(used > EXT4_I(inode)->i_reserved_data_blocks); 1038 EXT4_I(inode)->i_reserved_data_blocks -= used; 1039 1040 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1041 } 1042 1043 /* 1044 * The ext4_get_blocks_wrap() function try to look up the requested blocks, 1045 * and returns if the blocks are already mapped. 1046 * 1047 * Otherwise it takes the write lock of the i_data_sem and allocate blocks 1048 * and store the allocated blocks in the result buffer head and mark it 1049 * mapped. 
1050 * 1051 * If file type is extents based, it will call ext4_ext_get_blocks(), 1052 * Otherwise, call with ext4_get_blocks_handle() to handle indirect mapping 1053 * based files 1054 * 1055 * On success, it returns the number of blocks being mapped or allocate. 1056 * if create==0 and the blocks are pre-allocated and uninitialized block, 1057 * the result buffer head is unmapped. If the create ==1, it will make sure 1058 * the buffer head is mapped. 1059 * 1060 * It returns 0 if plain look up failed (blocks have not been allocated), in 1061 * that casem, buffer head is unmapped 1062 * 1063 * It returns the error in case of allocation failure. 1064 */ 1065 int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, 1066 unsigned int max_blocks, struct buffer_head *bh, 1067 int create, int extend_disksize, int flag) 1068 { 1069 int retval; 1070 1071 clear_buffer_mapped(bh); 1072 1073 /* 1074 * Try to see if we can get the block without requesting 1075 * for new file system block. 1076 */ 1077 down_read((&EXT4_I(inode)->i_data_sem)); 1078 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { 1079 retval = ext4_ext_get_blocks(handle, inode, block, max_blocks, 1080 bh, 0, 0); 1081 } else { 1082 retval = ext4_get_blocks_handle(handle, 1083 inode, block, max_blocks, bh, 0, 0); 1084 } 1085 up_read((&EXT4_I(inode)->i_data_sem)); 1086 1087 /* If it is only a block(s) look up */ 1088 if (!create) 1089 return retval; 1090 1091 /* 1092 * Returns if the blocks have already allocated 1093 * 1094 * Note that if blocks have been preallocated 1095 * ext4_ext_get_block() returns th create = 0 1096 * with buffer head unmapped. 1097 */ 1098 if (retval > 0 && buffer_mapped(bh)) 1099 return retval; 1100 1101 /* 1102 * New blocks allocate and/or writing to uninitialized extent 1103 * will possibly result in updating i_data, so we take 1104 * the write lock of i_data_sem, and call get_blocks() 1105 * with create == 1 flag. 1106 */ 1107 down_write((&EXT4_I(inode)->i_data_sem)); 1108 1109 /* 1110 * if the caller is from delayed allocation writeout path 1111 * we have already reserved fs blocks for allocation 1112 * let the underlying get_block() function know to 1113 * avoid double accounting 1114 */ 1115 if (flag) 1116 EXT4_I(inode)->i_delalloc_reserved_flag = 1; 1117 /* 1118 * We need to check for EXT4 here because migrate 1119 * could have changed the inode type in between 1120 */ 1121 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { 1122 retval = ext4_ext_get_blocks(handle, inode, block, max_blocks, 1123 bh, create, extend_disksize); 1124 } else { 1125 retval = ext4_get_blocks_handle(handle, inode, block, 1126 max_blocks, bh, create, extend_disksize); 1127 1128 if (retval > 0 && buffer_new(bh)) { 1129 /* 1130 * We allocated new blocks which will result in 1131 * i_data's format changing. Force the migrate 1132 * to fail by clearing migrate flags 1133 */ 1134 EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags & 1135 ~EXT4_EXT_MIGRATE; 1136 } 1137 } 1138 1139 if (flag) { 1140 EXT4_I(inode)->i_delalloc_reserved_flag = 0; 1141 /* 1142 * Update reserved blocks/metadata blocks 1143 * after successful block allocation 1144 * which were deferred till now 1145 */ 1146 if ((retval > 0) && buffer_delay(bh)) 1147 ext4_da_update_reserve_space(inode, retval); 1148 } 1149 1150 up_write((&EXT4_I(inode)->i_data_sem)); 1151 return retval; 1152 } 1153 1154 /* Maximum number of blocks we map for direct IO at once. 
*/ 1155 #define DIO_MAX_BLOCKS 4096 1156 1157 int ext4_get_block(struct inode *inode, sector_t iblock, 1158 struct buffer_head *bh_result, int create) 1159 { 1160 handle_t *handle = ext4_journal_current_handle(); 1161 int ret = 0, started = 0; 1162 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; 1163 int dio_credits; 1164 1165 if (create && !handle) { 1166 /* Direct IO write... */ 1167 if (max_blocks > DIO_MAX_BLOCKS) 1168 max_blocks = DIO_MAX_BLOCKS; 1169 dio_credits = ext4_chunk_trans_blocks(inode, max_blocks); 1170 handle = ext4_journal_start(inode, dio_credits); 1171 if (IS_ERR(handle)) { 1172 ret = PTR_ERR(handle); 1173 goto out; 1174 } 1175 started = 1; 1176 } 1177 1178 ret = ext4_get_blocks_wrap(handle, inode, iblock, 1179 max_blocks, bh_result, create, 0, 0); 1180 if (ret > 0) { 1181 bh_result->b_size = (ret << inode->i_blkbits); 1182 ret = 0; 1183 } 1184 if (started) 1185 ext4_journal_stop(handle); 1186 out: 1187 return ret; 1188 } 1189 1190 /* 1191 * `handle' can be NULL if create is zero 1192 */ 1193 struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, 1194 ext4_lblk_t block, int create, int *errp) 1195 { 1196 struct buffer_head dummy; 1197 int fatal = 0, err; 1198 1199 J_ASSERT(handle != NULL || create == 0); 1200 1201 dummy.b_state = 0; 1202 dummy.b_blocknr = -1000; 1203 buffer_trace_init(&dummy.b_history); 1204 err = ext4_get_blocks_wrap(handle, inode, block, 1, 1205 &dummy, create, 1, 0); 1206 /* 1207 * ext4_get_blocks_handle() returns number of blocks 1208 * mapped. 0 in case of a HOLE. 1209 */ 1210 if (err > 0) { 1211 if (err > 1) 1212 WARN_ON(1); 1213 err = 0; 1214 } 1215 *errp = err; 1216 if (!err && buffer_mapped(&dummy)) { 1217 struct buffer_head *bh; 1218 bh = sb_getblk(inode->i_sb, dummy.b_blocknr); 1219 if (!bh) { 1220 *errp = -EIO; 1221 goto err; 1222 } 1223 if (buffer_new(&dummy)) { 1224 J_ASSERT(create != 0); 1225 J_ASSERT(handle != NULL); 1226 1227 /* 1228 * Now that we do not always journal data, we should 1229 * keep in mind whether this should always journal the 1230 * new buffer as metadata. For now, regular file 1231 * writes use ext4_get_block instead, so it's not a 1232 * problem. 
1233 */ 1234 lock_buffer(bh); 1235 BUFFER_TRACE(bh, "call get_create_access"); 1236 fatal = ext4_journal_get_create_access(handle, bh); 1237 if (!fatal && !buffer_uptodate(bh)) { 1238 memset(bh->b_data, 0, inode->i_sb->s_blocksize); 1239 set_buffer_uptodate(bh); 1240 } 1241 unlock_buffer(bh); 1242 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 1243 err = ext4_handle_dirty_metadata(handle, inode, bh); 1244 if (!fatal) 1245 fatal = err; 1246 } else { 1247 BUFFER_TRACE(bh, "not a new buffer"); 1248 } 1249 if (fatal) { 1250 *errp = fatal; 1251 brelse(bh); 1252 bh = NULL; 1253 } 1254 return bh; 1255 } 1256 err: 1257 return NULL; 1258 } 1259 1260 struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, 1261 ext4_lblk_t block, int create, int *err) 1262 { 1263 struct buffer_head *bh; 1264 1265 bh = ext4_getblk(handle, inode, block, create, err); 1266 if (!bh) 1267 return bh; 1268 if (buffer_uptodate(bh)) 1269 return bh; 1270 ll_rw_block(READ_META, 1, &bh); 1271 wait_on_buffer(bh); 1272 if (buffer_uptodate(bh)) 1273 return bh; 1274 put_bh(bh); 1275 *err = -EIO; 1276 return NULL; 1277 } 1278 1279 static int walk_page_buffers(handle_t *handle, 1280 struct buffer_head *head, 1281 unsigned from, 1282 unsigned to, 1283 int *partial, 1284 int (*fn)(handle_t *handle, 1285 struct buffer_head *bh)) 1286 { 1287 struct buffer_head *bh; 1288 unsigned block_start, block_end; 1289 unsigned blocksize = head->b_size; 1290 int err, ret = 0; 1291 struct buffer_head *next; 1292 1293 for (bh = head, block_start = 0; 1294 ret == 0 && (bh != head || !block_start); 1295 block_start = block_end, bh = next) 1296 { 1297 next = bh->b_this_page; 1298 block_end = block_start + blocksize; 1299 if (block_end <= from || block_start >= to) { 1300 if (partial && !buffer_uptodate(bh)) 1301 *partial = 1; 1302 continue; 1303 } 1304 err = (*fn)(handle, bh); 1305 if (!ret) 1306 ret = err; 1307 } 1308 return ret; 1309 } 1310 1311 /* 1312 * To preserve ordering, it is essential that the hole instantiation and 1313 * the data write be encapsulated in a single transaction. We cannot 1314 * close off a transaction and start a new one between the ext4_get_block() 1315 * and the commit_write(). So doing the jbd2_journal_start at the start of 1316 * prepare_write() is the right place. 1317 * 1318 * Also, this function can nest inside ext4_writepage() -> 1319 * block_write_full_page(). In that case, we *know* that ext4_writepage() 1320 * has generated enough buffer credits to do the whole page. So we won't 1321 * block on the journal in that case, which is good, because the caller may 1322 * be PF_MEMALLOC. 1323 * 1324 * By accident, ext4 can be reentered when a transaction is open via 1325 * quota file writes. If we were to commit the transaction while thus 1326 * reentered, there can be a deadlock - we would be holding a quota 1327 * lock, and the commit would never complete if another thread had a 1328 * transaction open and was blocking on the quota lock - a ranking 1329 * violation. 1330 * 1331 * So what we do is to rely on the fact that jbd2_journal_stop/journal_start 1332 * will _not_ run commit under these circumstances because handle->h_ref 1333 * is elevated. We'll still have enough credits for the tiny quotafile 1334 * write. 
1335 */ 1336 static int do_journal_get_write_access(handle_t *handle, 1337 struct buffer_head *bh) 1338 { 1339 if (!buffer_mapped(bh) || buffer_freed(bh)) 1340 return 0; 1341 return ext4_journal_get_write_access(handle, bh); 1342 } 1343 1344 static int ext4_write_begin(struct file *file, struct address_space *mapping, 1345 loff_t pos, unsigned len, unsigned flags, 1346 struct page **pagep, void **fsdata) 1347 { 1348 struct inode *inode = mapping->host; 1349 int ret, needed_blocks = ext4_writepage_trans_blocks(inode); 1350 handle_t *handle; 1351 int retries = 0; 1352 struct page *page; 1353 pgoff_t index; 1354 unsigned from, to; 1355 1356 trace_mark(ext4_write_begin, 1357 "dev %s ino %lu pos %llu len %u flags %u", 1358 inode->i_sb->s_id, inode->i_ino, 1359 (unsigned long long) pos, len, flags); 1360 index = pos >> PAGE_CACHE_SHIFT; 1361 from = pos & (PAGE_CACHE_SIZE - 1); 1362 to = from + len; 1363 1364 retry: 1365 handle = ext4_journal_start(inode, needed_blocks); 1366 if (IS_ERR(handle)) { 1367 ret = PTR_ERR(handle); 1368 goto out; 1369 } 1370 1371 /* We cannot recurse into the filesystem as the transaction is already 1372 * started */ 1373 flags |= AOP_FLAG_NOFS; 1374 1375 page = grab_cache_page_write_begin(mapping, index, flags); 1376 if (!page) { 1377 ext4_journal_stop(handle); 1378 ret = -ENOMEM; 1379 goto out; 1380 } 1381 *pagep = page; 1382 1383 ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 1384 ext4_get_block); 1385 1386 if (!ret && ext4_should_journal_data(inode)) { 1387 ret = walk_page_buffers(handle, page_buffers(page), 1388 from, to, NULL, do_journal_get_write_access); 1389 } 1390 1391 if (ret) { 1392 unlock_page(page); 1393 ext4_journal_stop(handle); 1394 page_cache_release(page); 1395 /* 1396 * block_write_begin may have instantiated a few blocks 1397 * outside i_size. Trim these off again. Don't need 1398 * i_size_read because we hold i_mutex. 1399 */ 1400 if (pos + len > inode->i_size) 1401 vmtruncate(inode, inode->i_size); 1402 } 1403 1404 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 1405 goto retry; 1406 out: 1407 return ret; 1408 } 1409 1410 /* For write_end() in data=journal mode */ 1411 static int write_end_fn(handle_t *handle, struct buffer_head *bh) 1412 { 1413 if (!buffer_mapped(bh) || buffer_freed(bh)) 1414 return 0; 1415 set_buffer_uptodate(bh); 1416 return ext4_handle_dirty_metadata(handle, NULL, bh); 1417 } 1418 1419 /* 1420 * We need to pick up the new inode size which generic_commit_write gave us 1421 * `file' can be NULL - eg, when called from page_symlink(). 1422 * 1423 * ext4 never places buffers on inode->i_mapping->private_list. metadata 1424 * buffers are managed internally. 
1425 */ 1426 static int ext4_ordered_write_end(struct file *file, 1427 struct address_space *mapping, 1428 loff_t pos, unsigned len, unsigned copied, 1429 struct page *page, void *fsdata) 1430 { 1431 handle_t *handle = ext4_journal_current_handle(); 1432 struct inode *inode = mapping->host; 1433 int ret = 0, ret2; 1434 1435 trace_mark(ext4_ordered_write_end, 1436 "dev %s ino %lu pos %llu len %u copied %u", 1437 inode->i_sb->s_id, inode->i_ino, 1438 (unsigned long long) pos, len, copied); 1439 ret = ext4_jbd2_file_inode(handle, inode); 1440 1441 if (ret == 0) { 1442 loff_t new_i_size; 1443 1444 new_i_size = pos + copied; 1445 if (new_i_size > EXT4_I(inode)->i_disksize) { 1446 ext4_update_i_disksize(inode, new_i_size); 1447 /* We need to mark inode dirty even if 1448 * new_i_size is less that inode->i_size 1449 * bu greater than i_disksize.(hint delalloc) 1450 */ 1451 ext4_mark_inode_dirty(handle, inode); 1452 } 1453 1454 ret2 = generic_write_end(file, mapping, pos, len, copied, 1455 page, fsdata); 1456 copied = ret2; 1457 if (ret2 < 0) 1458 ret = ret2; 1459 } 1460 ret2 = ext4_journal_stop(handle); 1461 if (!ret) 1462 ret = ret2; 1463 1464 return ret ? ret : copied; 1465 } 1466 1467 static int ext4_writeback_write_end(struct file *file, 1468 struct address_space *mapping, 1469 loff_t pos, unsigned len, unsigned copied, 1470 struct page *page, void *fsdata) 1471 { 1472 handle_t *handle = ext4_journal_current_handle(); 1473 struct inode *inode = mapping->host; 1474 int ret = 0, ret2; 1475 loff_t new_i_size; 1476 1477 trace_mark(ext4_writeback_write_end, 1478 "dev %s ino %lu pos %llu len %u copied %u", 1479 inode->i_sb->s_id, inode->i_ino, 1480 (unsigned long long) pos, len, copied); 1481 new_i_size = pos + copied; 1482 if (new_i_size > EXT4_I(inode)->i_disksize) { 1483 ext4_update_i_disksize(inode, new_i_size); 1484 /* We need to mark inode dirty even if 1485 * new_i_size is less that inode->i_size 1486 * bu greater than i_disksize.(hint delalloc) 1487 */ 1488 ext4_mark_inode_dirty(handle, inode); 1489 } 1490 1491 ret2 = generic_write_end(file, mapping, pos, len, copied, 1492 page, fsdata); 1493 copied = ret2; 1494 if (ret2 < 0) 1495 ret = ret2; 1496 1497 ret2 = ext4_journal_stop(handle); 1498 if (!ret) 1499 ret = ret2; 1500 1501 return ret ? 
ret : copied; 1502 } 1503 1504 static int ext4_journalled_write_end(struct file *file, 1505 struct address_space *mapping, 1506 loff_t pos, unsigned len, unsigned copied, 1507 struct page *page, void *fsdata) 1508 { 1509 handle_t *handle = ext4_journal_current_handle(); 1510 struct inode *inode = mapping->host; 1511 int ret = 0, ret2; 1512 int partial = 0; 1513 unsigned from, to; 1514 loff_t new_i_size; 1515 1516 trace_mark(ext4_journalled_write_end, 1517 "dev %s ino %lu pos %llu len %u copied %u", 1518 inode->i_sb->s_id, inode->i_ino, 1519 (unsigned long long) pos, len, copied); 1520 from = pos & (PAGE_CACHE_SIZE - 1); 1521 to = from + len; 1522 1523 if (copied < len) { 1524 if (!PageUptodate(page)) 1525 copied = 0; 1526 page_zero_new_buffers(page, from+copied, to); 1527 } 1528 1529 ret = walk_page_buffers(handle, page_buffers(page), from, 1530 to, &partial, write_end_fn); 1531 if (!partial) 1532 SetPageUptodate(page); 1533 new_i_size = pos + copied; 1534 if (new_i_size > inode->i_size) 1535 i_size_write(inode, pos+copied); 1536 EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; 1537 if (new_i_size > EXT4_I(inode)->i_disksize) { 1538 ext4_update_i_disksize(inode, new_i_size); 1539 ret2 = ext4_mark_inode_dirty(handle, inode); 1540 if (!ret) 1541 ret = ret2; 1542 } 1543 1544 unlock_page(page); 1545 ret2 = ext4_journal_stop(handle); 1546 if (!ret) 1547 ret = ret2; 1548 page_cache_release(page); 1549 1550 return ret ? ret : copied; 1551 } 1552 1553 static int ext4_da_reserve_space(struct inode *inode, int nrblocks) 1554 { 1555 int retries = 0; 1556 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1557 unsigned long md_needed, mdblocks, total = 0; 1558 1559 /* 1560 * recalculate the amount of metadata blocks to reserve 1561 * in order to allocate nrblocks 1562 * worse case is one extent per block 1563 */ 1564 repeat: 1565 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 1566 total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks; 1567 mdblocks = ext4_calc_metadata_amount(inode, total); 1568 BUG_ON(mdblocks < EXT4_I(inode)->i_reserved_meta_blocks); 1569 1570 md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks; 1571 total = md_needed + nrblocks; 1572 1573 if (ext4_claim_free_blocks(sbi, total)) { 1574 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1575 if (ext4_should_retry_alloc(inode->i_sb, &retries)) { 1576 yield(); 1577 goto repeat; 1578 } 1579 return -ENOSPC; 1580 } 1581 EXT4_I(inode)->i_reserved_data_blocks += nrblocks; 1582 EXT4_I(inode)->i_reserved_meta_blocks = mdblocks; 1583 1584 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1585 return 0; /* success */ 1586 } 1587 1588 static void ext4_da_release_space(struct inode *inode, int to_free) 1589 { 1590 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1591 int total, mdb, mdb_free, release; 1592 1593 if (!to_free) 1594 return; /* Nothing to release, exit */ 1595 1596 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 1597 1598 if (!EXT4_I(inode)->i_reserved_data_blocks) { 1599 /* 1600 * if there is no reserved blocks, but we try to free some 1601 * then the counter is messed up somewhere. 
1602 * but since this function is called from invalidate 1603 * page, it's harmless to return without any action 1604 */ 1605 printk(KERN_INFO "ext4 delalloc try to release %d reserved " 1606 "blocks for inode %lu, but there is no reserved " 1607 "data blocks\n", to_free, inode->i_ino); 1608 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1609 return; 1610 } 1611 1612 /* recalculate the number of metablocks still need to be reserved */ 1613 total = EXT4_I(inode)->i_reserved_data_blocks - to_free; 1614 mdb = ext4_calc_metadata_amount(inode, total); 1615 1616 /* figure out how many metablocks to release */ 1617 BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); 1618 mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb; 1619 1620 release = to_free + mdb_free; 1621 1622 /* update fs dirty blocks counter for truncate case */ 1623 percpu_counter_sub(&sbi->s_dirtyblocks_counter, release); 1624 1625 /* update per-inode reservations */ 1626 BUG_ON(to_free > EXT4_I(inode)->i_reserved_data_blocks); 1627 EXT4_I(inode)->i_reserved_data_blocks -= to_free; 1628 1629 BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); 1630 EXT4_I(inode)->i_reserved_meta_blocks = mdb; 1631 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1632 } 1633 1634 static void ext4_da_page_release_reservation(struct page *page, 1635 unsigned long offset) 1636 { 1637 int to_release = 0; 1638 struct buffer_head *head, *bh; 1639 unsigned int curr_off = 0; 1640 1641 head = page_buffers(page); 1642 bh = head; 1643 do { 1644 unsigned int next_off = curr_off + bh->b_size; 1645 1646 if ((offset <= curr_off) && (buffer_delay(bh))) { 1647 to_release++; 1648 clear_buffer_delay(bh); 1649 } 1650 curr_off = next_off; 1651 } while ((bh = bh->b_this_page) != head); 1652 ext4_da_release_space(page->mapping->host, to_release); 1653 } 1654 1655 /* 1656 * Delayed allocation stuff 1657 */ 1658 1659 struct mpage_da_data { 1660 struct inode *inode; 1661 struct buffer_head lbh; /* extent of blocks */ 1662 unsigned long first_page, next_page; /* extent of pages */ 1663 get_block_t *get_block; 1664 struct writeback_control *wbc; 1665 int io_done; 1666 int pages_written; 1667 int retval; 1668 }; 1669 1670 /* 1671 * mpage_da_submit_io - walks through extent of pages and try to write 1672 * them with writepage() call back 1673 * 1674 * @mpd->inode: inode 1675 * @mpd->first_page: first page of the extent 1676 * @mpd->next_page: page after the last page of the extent 1677 * @mpd->get_block: the filesystem's block mapper function 1678 * 1679 * By the time mpage_da_submit_io() is called we expect all blocks 1680 * to be allocated. this may be wrong if allocation failed. 1681 * 1682 * As pages are already locked by write_cache_pages(), we can't use it 1683 */ 1684 static int mpage_da_submit_io(struct mpage_da_data *mpd) 1685 { 1686 long pages_skipped; 1687 struct pagevec pvec; 1688 unsigned long index, end; 1689 int ret = 0, err, nr_pages, i; 1690 struct inode *inode = mpd->inode; 1691 struct address_space *mapping = inode->i_mapping; 1692 1693 BUG_ON(mpd->next_page <= mpd->first_page); 1694 /* 1695 * We need to start from the first_page to the next_page - 1 1696 * to make sure we also write the mapped dirty buffer_heads. 1697 * If we look at mpd->lbh.b_blocknr we would only be looking 1698 * at the currently mapped buffer_heads. 
1699 */ 1700 index = mpd->first_page; 1701 end = mpd->next_page - 1; 1702 1703 pagevec_init(&pvec, 0); 1704 while (index <= end) { 1705 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); 1706 if (nr_pages == 0) 1707 break; 1708 for (i = 0; i < nr_pages; i++) { 1709 struct page *page = pvec.pages[i]; 1710 1711 index = page->index; 1712 if (index > end) 1713 break; 1714 index++; 1715 1716 BUG_ON(!PageLocked(page)); 1717 BUG_ON(PageWriteback(page)); 1718 1719 pages_skipped = mpd->wbc->pages_skipped; 1720 err = mapping->a_ops->writepage(page, mpd->wbc); 1721 if (!err && (pages_skipped == mpd->wbc->pages_skipped)) 1722 /* 1723 * have successfully written the page 1724 * without skipping the same 1725 */ 1726 mpd->pages_written++; 1727 /* 1728 * In error case, we have to continue because 1729 * remaining pages are still locked 1730 * XXX: unlock and re-dirty them? 1731 */ 1732 if (ret == 0) 1733 ret = err; 1734 } 1735 pagevec_release(&pvec); 1736 } 1737 return ret; 1738 } 1739 1740 /* 1741 * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers 1742 * 1743 * @mpd->inode - inode to walk through 1744 * @exbh->b_blocknr - first block on a disk 1745 * @exbh->b_size - amount of space in bytes 1746 * @logical - first logical block to start assignment with 1747 * 1748 * the function goes through all passed space and put actual disk 1749 * block numbers into buffer heads, dropping BH_Delay 1750 */ 1751 static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical, 1752 struct buffer_head *exbh) 1753 { 1754 struct inode *inode = mpd->inode; 1755 struct address_space *mapping = inode->i_mapping; 1756 int blocks = exbh->b_size >> inode->i_blkbits; 1757 sector_t pblock = exbh->b_blocknr, cur_logical; 1758 struct buffer_head *head, *bh; 1759 pgoff_t index, end; 1760 struct pagevec pvec; 1761 int nr_pages, i; 1762 1763 index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits); 1764 end = (logical + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits); 1765 cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); 1766 1767 pagevec_init(&pvec, 0); 1768 1769 while (index <= end) { 1770 /* XXX: optimize tail */ 1771 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); 1772 if (nr_pages == 0) 1773 break; 1774 for (i = 0; i < nr_pages; i++) { 1775 struct page *page = pvec.pages[i]; 1776 1777 index = page->index; 1778 if (index > end) 1779 break; 1780 index++; 1781 1782 BUG_ON(!PageLocked(page)); 1783 BUG_ON(PageWriteback(page)); 1784 BUG_ON(!page_has_buffers(page)); 1785 1786 bh = page_buffers(page); 1787 head = bh; 1788 1789 /* skip blocks out of the range */ 1790 do { 1791 if (cur_logical >= logical) 1792 break; 1793 cur_logical++; 1794 } while ((bh = bh->b_this_page) != head); 1795 1796 do { 1797 if (cur_logical >= logical + blocks) 1798 break; 1799 if (buffer_delay(bh)) { 1800 bh->b_blocknr = pblock; 1801 clear_buffer_delay(bh); 1802 bh->b_bdev = inode->i_sb->s_bdev; 1803 } else if (buffer_unwritten(bh)) { 1804 bh->b_blocknr = pblock; 1805 clear_buffer_unwritten(bh); 1806 set_buffer_mapped(bh); 1807 set_buffer_new(bh); 1808 bh->b_bdev = inode->i_sb->s_bdev; 1809 } else if (buffer_mapped(bh)) 1810 BUG_ON(bh->b_blocknr != pblock); 1811 1812 cur_logical++; 1813 pblock++; 1814 } while ((bh = bh->b_this_page) != head); 1815 } 1816 pagevec_release(&pvec); 1817 } 1818 } 1819 1820 1821 /* 1822 * __unmap_underlying_blocks - just a helper function to unmap 1823 * set of blocks described by @bh 1824 */ 1825 static inline void __unmap_underlying_blocks(struct inode 
*inode, 1826 struct buffer_head *bh) 1827 { 1828 struct block_device *bdev = inode->i_sb->s_bdev; 1829 int blocks, i; 1830 1831 blocks = bh->b_size >> inode->i_blkbits; 1832 for (i = 0; i < blocks; i++) 1833 unmap_underlying_metadata(bdev, bh->b_blocknr + i); 1834 } 1835 1836 static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd, 1837 sector_t logical, long blk_cnt) 1838 { 1839 int nr_pages, i; 1840 pgoff_t index, end; 1841 struct pagevec pvec; 1842 struct inode *inode = mpd->inode; 1843 struct address_space *mapping = inode->i_mapping; 1844 1845 index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits); 1846 end = (logical + blk_cnt - 1) >> 1847 (PAGE_CACHE_SHIFT - inode->i_blkbits); 1848 while (index <= end) { 1849 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); 1850 if (nr_pages == 0) 1851 break; 1852 for (i = 0; i < nr_pages; i++) { 1853 struct page *page = pvec.pages[i]; 1854 index = page->index; 1855 if (index > end) 1856 break; 1857 index++; 1858 1859 BUG_ON(!PageLocked(page)); 1860 BUG_ON(PageWriteback(page)); 1861 block_invalidatepage(page, 0); 1862 ClearPageUptodate(page); 1863 unlock_page(page); 1864 } 1865 } 1866 return; 1867 } 1868 1869 static void ext4_print_free_blocks(struct inode *inode) 1870 { 1871 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1872 printk(KERN_EMERG "Total free blocks count %lld\n", 1873 ext4_count_free_blocks(inode->i_sb)); 1874 printk(KERN_EMERG "Free/Dirty block details\n"); 1875 printk(KERN_EMERG "free_blocks=%lld\n", 1876 (long long)percpu_counter_sum(&sbi->s_freeblocks_counter)); 1877 printk(KERN_EMERG "dirty_blocks=%lld\n", 1878 (long long)percpu_counter_sum(&sbi->s_dirtyblocks_counter)); 1879 printk(KERN_EMERG "Block reservation details\n"); 1880 printk(KERN_EMERG "i_reserved_data_blocks=%u\n", 1881 EXT4_I(inode)->i_reserved_data_blocks); 1882 printk(KERN_EMERG "i_reserved_meta_blocks=%u\n", 1883 EXT4_I(inode)->i_reserved_meta_blocks); 1884 return; 1885 } 1886 1887 /* 1888 * mpage_da_map_blocks - go through given space 1889 * 1890 * @mpd->lbh - bh describing space 1891 * @mpd->get_block - the filesystem's block mapper function 1892 * 1893 * The function skips space we know is already mapped to disk blocks. 1894 * 1895 */ 1896 static int mpage_da_map_blocks(struct mpage_da_data *mpd) 1897 { 1898 int err = 0; 1899 struct buffer_head new; 1900 struct buffer_head *lbh = &mpd->lbh; 1901 sector_t next; 1902 1903 /* 1904 * We consider only non-mapped and non-allocated blocks 1905 */ 1906 if (buffer_mapped(lbh) && !buffer_delay(lbh)) 1907 return 0; 1908 new.b_state = lbh->b_state; 1909 new.b_blocknr = 0; 1910 new.b_size = lbh->b_size; 1911 next = lbh->b_blocknr; 1912 /* 1913 * If we didn't accumulate anything 1914 * to write simply return 1915 */ 1916 if (!new.b_size) 1917 return 0; 1918 err = mpd->get_block(mpd->inode, next, &new, 1); 1919 if (err) { 1920 1921 /* If get block returns with error 1922 * we simply return. Later writepage 1923 * will redirty the page and writepages 1924 * will find the dirty page again 1925 */ 1926 if (err == -EAGAIN) 1927 return 0; 1928 1929 if (err == -ENOSPC && 1930 ext4_count_free_blocks(mpd->inode->i_sb)) { 1931 mpd->retval = err; 1932 return 0; 1933 } 1934 1935 /* 1936 * get block failure will cause us 1937 * to loop in writepages. Because 1938 * a_ops->writepage won't be able to 1939 * make progress. The page will be redirtied 1940 * by writepage and writepages will again 1941 * try to write the same. 
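 * For any other error we log the failure below, invalidate the pages covered by this extent and return the error, so the data in that range is dropped rather than retried forever.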
1942 */ 1943 printk(KERN_EMERG "%s block allocation failed for inode %lu " 1944 "at logical offset %llu with max blocks " 1945 "%zd with error %d\n", 1946 __func__, mpd->inode->i_ino, 1947 (unsigned long long)next, 1948 lbh->b_size >> mpd->inode->i_blkbits, err); 1949 printk(KERN_EMERG "This should not happen!! " 1950 "Data will be lost\n"); 1951 if (err == -ENOSPC) { 1952 ext4_print_free_blocks(mpd->inode); 1953 } 1954 /* invalidate all the pages */ 1955 ext4_da_block_invalidatepages(mpd, next, 1956 lbh->b_size >> mpd->inode->i_blkbits); 1957 return err; 1958 } 1959 BUG_ON(new.b_size == 0); 1960 1961 if (buffer_new(&new)) 1962 __unmap_underlying_blocks(mpd->inode, &new); 1963 1964 /* 1965 * If blocks are marked delayed, we need to 1966 * put in the actual blocknr and drop the delayed bit 1967 */ 1968 if (buffer_delay(lbh) || buffer_unwritten(lbh)) 1969 mpage_put_bnr_to_bhs(mpd, next, &new); 1970 1971 return 0; 1972 } 1973 1974 #define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \ 1975 (1 << BH_Delay) | (1 << BH_Unwritten)) 1976 1977 /* 1978 * mpage_add_bh_to_extent - try to add one more block to the extent of blocks 1979 * 1980 * @mpd->lbh - extent of blocks 1981 * @logical - logical number of the block in the file 1982 * @bh - bh of the block (used to access block's state) 1983 * 1984 * the function is used to collect contiguous blocks in the same state 1985 */ 1986 static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, 1987 sector_t logical, struct buffer_head *bh) 1988 { 1989 sector_t next; 1990 size_t b_size = bh->b_size; 1991 struct buffer_head *lbh = &mpd->lbh; 1992 int nrblocks = lbh->b_size >> mpd->inode->i_blkbits; 1993 1994 /* check if the reserved journal credits might overflow */ 1995 if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) { 1996 if (nrblocks >= EXT4_MAX_TRANS_DATA) { 1997 /* 1998 * With non-extent format we are limited by the journal 1999 * credit available. Total credit needed to insert 2000 * nrblocks contiguous blocks is dependent on the 2001 * nrblocks. So limit nrblocks. 2002 */ 2003 goto flush_it; 2004 } else if ((nrblocks + (b_size >> mpd->inode->i_blkbits)) > 2005 EXT4_MAX_TRANS_DATA) { 2006 /* 2007 * Adding the new buffer_head would make it cross the 2008 * allowed limit for which we have journal credit 2009 * reserved. So limit the new bh->b_size 2010 */ 2011 b_size = (EXT4_MAX_TRANS_DATA - nrblocks) << 2012 mpd->inode->i_blkbits; 2013 /* we will do mpage_da_submit_io in the next loop */ 2014 } 2015 } 2016 /* 2017 * First block in the extent 2018 */ 2019 if (lbh->b_size == 0) { 2020 lbh->b_blocknr = logical; 2021 lbh->b_size = b_size; 2022 lbh->b_state = bh->b_state & BH_FLAGS; 2023 return; 2024 } 2025 2026 next = lbh->b_blocknr + nrblocks; 2027 /* 2028 * Can we merge the block into our big extent? 2029 */ 2030 if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) { 2031 lbh->b_size += b_size; 2032 return; 2033 } 2034 2035 flush_it: 2036 /* 2037 * We couldn't merge the block into our extent, so we 2038 * need to flush the current extent and start a new one 2039 */ 2040 if (mpage_da_map_blocks(mpd) == 0) 2041 mpage_da_submit_io(mpd); 2042 mpd->io_done = 1; 2043 return; 2044 } 2045 2046 /* 2047 * __mpage_da_writepage - finds extent of pages and blocks 2048 * 2049 * @page: page to consider 2050 * @wbc: not used, we just follow rules 2051 * @data: context 2052 * 2053 * The function finds extents of pages and scans them for all blocks.
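 * Returns MPAGE_DA_EXTENT_TAIL once mpd->io_done is set (the current extent has been flushed), which stops write_cache_pages(); the remaining dirty pages are handled on the next pass of the writepages loop.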
2054 */ 2055 static int __mpage_da_writepage(struct page *page, 2056 struct writeback_control *wbc, void *data) 2057 { 2058 struct mpage_da_data *mpd = data; 2059 struct inode *inode = mpd->inode; 2060 struct buffer_head *bh, *head, fake; 2061 sector_t logical; 2062 2063 if (mpd->io_done) { 2064 /* 2065 * Redirty the rest of the pages in the page_vec 2066 * and skip them. We will 2067 * try to write them again after 2068 * starting a new transaction 2069 */ 2070 redirty_page_for_writepage(wbc, page); 2071 unlock_page(page); 2072 return MPAGE_DA_EXTENT_TAIL; 2073 } 2074 /* 2075 * Can we merge this page into the current extent? 2076 */ 2077 if (mpd->next_page != page->index) { 2078 /* 2079 * Nope, we can't. So, we map non-allocated blocks 2080 * and start IO on them using writepage() 2081 */ 2082 if (mpd->next_page != mpd->first_page) { 2083 if (mpage_da_map_blocks(mpd) == 0) 2084 mpage_da_submit_io(mpd); 2085 /* 2086 * skip the rest of the pages in the page_vec 2087 */ 2088 mpd->io_done = 1; 2089 redirty_page_for_writepage(wbc, page); 2090 unlock_page(page); 2091 return MPAGE_DA_EXTENT_TAIL; 2092 } 2093 2094 /* 2095 * Start next extent of pages ... 2096 */ 2097 mpd->first_page = page->index; 2098 2099 /* 2100 * ... and blocks 2101 */ 2102 mpd->lbh.b_size = 0; 2103 mpd->lbh.b_state = 0; 2104 mpd->lbh.b_blocknr = 0; 2105 } 2106 2107 mpd->next_page = page->index + 1; 2108 logical = (sector_t) page->index << 2109 (PAGE_CACHE_SHIFT - inode->i_blkbits); 2110 2111 if (!page_has_buffers(page)) { 2112 /* 2113 * There are no attached buffer heads yet (mmap?), 2114 * so we treat the page as full of dirty blocks 2115 */ 2116 bh = &fake; 2117 bh->b_size = PAGE_CACHE_SIZE; 2118 bh->b_state = 0; 2119 set_buffer_dirty(bh); 2120 set_buffer_uptodate(bh); 2121 mpage_add_bh_to_extent(mpd, logical, bh); 2122 if (mpd->io_done) 2123 return MPAGE_DA_EXTENT_TAIL; 2124 } else { 2125 /* 2126 * Page with regular buffer heads, just add all dirty ones 2127 */ 2128 head = page_buffers(page); 2129 bh = head; 2130 do { 2131 BUG_ON(buffer_locked(bh)); 2132 /* 2133 * We need to try to allocate 2134 * unmapped blocks in the same page. 2135 * Otherwise we won't make progress 2136 * with the page in ext4_da_writepage 2137 */ 2138 if (buffer_dirty(bh) && 2139 (!buffer_mapped(bh) || buffer_delay(bh))) { 2140 mpage_add_bh_to_extent(mpd, logical, bh); 2141 if (mpd->io_done) 2142 return MPAGE_DA_EXTENT_TAIL; 2143 } else if (buffer_dirty(bh) && (buffer_mapped(bh))) { 2144 /* 2145 * mapped dirty buffer. We need to update 2146 * the b_state because we look at 2147 * b_state in mpage_da_map_blocks. We don't 2148 * update b_size because if we find an 2149 * unmapped buffer_head later we need to 2150 * use the b_state flag of that buffer_head. 2151 */ 2152 if (mpd->lbh.b_size == 0) 2153 mpd->lbh.b_state = 2154 bh->b_state & BH_FLAGS; 2155 } 2156 logical++; 2157 } while ((bh = bh->b_this_page) != head); 2158 } 2159 2160 return 0; 2161 } 2162 2163 /* 2164 * mpage_da_writepages - walk the list of dirty pages of the given 2165 * address space, allocate non-allocated blocks, map newly-allocated 2166 * blocks to existing bhs and issue IO on them 2167 * 2168 * @mapping: address space structure to write 2169 * @wbc: subtract the number of written pages from *@wbc->nr_to_write 2170 * @get_block: the filesystem's block mapper function. 2171 * 2172 * This is a library function, which implements the writepages() 2173 * address_space_operation.
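 * Note: the caller is expected to have started a journal handle, since mpd->get_block may need to allocate blocks (ext4_da_get_block_write asserts that a current handle exists). If mpd->get_block is NULL we simply fall back to generic_writepages().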
2174 */ 2175 static int mpage_da_writepages(struct address_space *mapping, 2176 struct writeback_control *wbc, 2177 struct mpage_da_data *mpd) 2178 { 2179 int ret; 2180 2181 if (!mpd->get_block) 2182 return generic_writepages(mapping, wbc); 2183 2184 mpd->lbh.b_size = 0; 2185 mpd->lbh.b_state = 0; 2186 mpd->lbh.b_blocknr = 0; 2187 mpd->first_page = 0; 2188 mpd->next_page = 0; 2189 mpd->io_done = 0; 2190 mpd->pages_written = 0; 2191 mpd->retval = 0; 2192 2193 ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, mpd); 2194 /* 2195 * Handle last extent of pages 2196 */ 2197 if (!mpd->io_done && mpd->next_page != mpd->first_page) { 2198 if (mpage_da_map_blocks(mpd) == 0) 2199 mpage_da_submit_io(mpd); 2200 2201 mpd->io_done = 1; 2202 ret = MPAGE_DA_EXTENT_TAIL; 2203 } 2204 wbc->nr_to_write -= mpd->pages_written; 2205 return ret; 2206 } 2207 2208 /* 2209 * this is a special callback for ->write_begin() only 2210 * it's intention is to return mapped block or reserve space 2211 */ 2212 static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, 2213 struct buffer_head *bh_result, int create) 2214 { 2215 int ret = 0; 2216 2217 BUG_ON(create == 0); 2218 BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize); 2219 2220 /* 2221 * first, we need to know whether the block is allocated already 2222 * preallocated blocks are unmapped but should treated 2223 * the same as allocated blocks. 2224 */ 2225 ret = ext4_get_blocks_wrap(NULL, inode, iblock, 1, bh_result, 0, 0, 0); 2226 if ((ret == 0) && !buffer_delay(bh_result)) { 2227 /* the block isn't (pre)allocated yet, let's reserve space */ 2228 /* 2229 * XXX: __block_prepare_write() unmaps passed block, 2230 * is it OK? 2231 */ 2232 ret = ext4_da_reserve_space(inode, 1); 2233 if (ret) 2234 /* not enough space to reserve */ 2235 return ret; 2236 2237 map_bh(bh_result, inode->i_sb, 0); 2238 set_buffer_new(bh_result); 2239 set_buffer_delay(bh_result); 2240 } else if (ret > 0) { 2241 bh_result->b_size = (ret << inode->i_blkbits); 2242 ret = 0; 2243 } 2244 2245 return ret; 2246 } 2247 #define EXT4_DELALLOC_RSVED 1 2248 static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, 2249 struct buffer_head *bh_result, int create) 2250 { 2251 int ret; 2252 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; 2253 loff_t disksize = EXT4_I(inode)->i_disksize; 2254 handle_t *handle = NULL; 2255 2256 handle = ext4_journal_current_handle(); 2257 BUG_ON(!handle); 2258 ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks, 2259 bh_result, create, 0, EXT4_DELALLOC_RSVED); 2260 if (ret > 0) { 2261 2262 bh_result->b_size = (ret << inode->i_blkbits); 2263 2264 if (ext4_should_order_data(inode)) { 2265 int retval; 2266 retval = ext4_jbd2_file_inode(handle, inode); 2267 if (retval) 2268 /* 2269 * Failed to add inode for ordered 2270 * mode. 
Don't update file size 2271 */ 2272 return retval; 2273 } 2274 2275 /* 2276 * Update on-disk size along with block allocation 2277 * we don't use 'extend_disksize' as size may change 2278 * within already allocated block -bzzz 2279 */ 2280 disksize = ((loff_t) iblock + ret) << inode->i_blkbits; 2281 if (disksize > i_size_read(inode)) 2282 disksize = i_size_read(inode); 2283 if (disksize > EXT4_I(inode)->i_disksize) { 2284 ext4_update_i_disksize(inode, disksize); 2285 ret = ext4_mark_inode_dirty(handle, inode); 2286 return ret; 2287 } 2288 ret = 0; 2289 } 2290 return ret; 2291 } 2292 2293 static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh) 2294 { 2295 /* 2296 * unmapped buffer is possible for holes. 2297 * delay buffer is possible with delayed allocation 2298 */ 2299 return ((!buffer_mapped(bh) || buffer_delay(bh)) && buffer_dirty(bh)); 2300 } 2301 2302 static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock, 2303 struct buffer_head *bh_result, int create) 2304 { 2305 int ret = 0; 2306 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; 2307 2308 /* 2309 * we don't want to do block allocation in writepage 2310 * so call get_block_wrap with create = 0 2311 */ 2312 ret = ext4_get_blocks_wrap(NULL, inode, iblock, max_blocks, 2313 bh_result, 0, 0, 0); 2314 if (ret > 0) { 2315 bh_result->b_size = (ret << inode->i_blkbits); 2316 ret = 0; 2317 } 2318 return ret; 2319 } 2320 2321 /* 2322 * get called vi ext4_da_writepages after taking page lock (have journal handle) 2323 * get called via journal_submit_inode_data_buffers (no journal handle) 2324 * get called via shrink_page_list via pdflush (no journal handle) 2325 * or grab_page_cache when doing write_begin (have journal handle) 2326 */ 2327 static int ext4_da_writepage(struct page *page, 2328 struct writeback_control *wbc) 2329 { 2330 int ret = 0; 2331 loff_t size; 2332 unsigned int len; 2333 struct buffer_head *page_bufs; 2334 struct inode *inode = page->mapping->host; 2335 2336 trace_mark(ext4_da_writepage, 2337 "dev %s ino %lu page_index %lu", 2338 inode->i_sb->s_id, inode->i_ino, page->index); 2339 size = i_size_read(inode); 2340 if (page->index == size >> PAGE_CACHE_SHIFT) 2341 len = size & ~PAGE_CACHE_MASK; 2342 else 2343 len = PAGE_CACHE_SIZE; 2344 2345 if (page_has_buffers(page)) { 2346 page_bufs = page_buffers(page); 2347 if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, 2348 ext4_bh_unmapped_or_delay)) { 2349 /* 2350 * We don't want to do block allocation 2351 * So redirty the page and return 2352 * We may reach here when we do a journal commit 2353 * via journal_submit_inode_data_buffers. 2354 * If we don't have mapping block we just ignore 2355 * them. We can also reach here via shrink_page_list 2356 */ 2357 redirty_page_for_writepage(wbc, page); 2358 unlock_page(page); 2359 return 0; 2360 } 2361 } else { 2362 /* 2363 * The test for page_has_buffers() is subtle: 2364 * We know the page is dirty but it lost buffers. That means 2365 * that at some moment in time after write_begin()/write_end() 2366 * has been called all buffers have been clean and thus they 2367 * must have been written at least once. So they are all 2368 * mapped and we can happily proceed with mapping them 2369 * and writing the page. 2370 * 2371 * Try to initialize the buffer_heads and check whether 2372 * all are mapped and non delay. We don't want to 2373 * do block allocation here. 
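 * (block_prepare_write() below uses ext4_normal_get_block_write, which only looks up existing blocks, so no allocation can happen on this path.)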
2374 */ 2375 ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, 2376 ext4_normal_get_block_write); 2377 if (!ret) { 2378 page_bufs = page_buffers(page); 2379 /* check whether all are mapped and not delayed */ 2380 if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, 2381 ext4_bh_unmapped_or_delay)) { 2382 redirty_page_for_writepage(wbc, page); 2383 unlock_page(page); 2384 return 0; 2385 } 2386 } else { 2387 /* 2388 * We can't do block allocation here 2389 * so just redirty the page and unlock 2390 * and return 2391 */ 2392 redirty_page_for_writepage(wbc, page); 2393 unlock_page(page); 2394 return 0; 2395 } 2396 /* now mark the buffer_heads as dirty and uptodate */ 2397 block_commit_write(page, 0, PAGE_CACHE_SIZE); 2398 } 2399 2400 if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) 2401 ret = nobh_writepage(page, ext4_normal_get_block_write, wbc); 2402 else 2403 ret = block_write_full_page(page, 2404 ext4_normal_get_block_write, 2405 wbc); 2406 2407 return ret; 2408 } 2409 2410 /* 2411 * This is called via ext4_da_writepages() to 2412 * calculate the total number of credits to reserve to fit 2413 * a single extent allocation into a single transaction, 2414 * ext4_da_writepages() will loop calling this before 2415 * the block allocation. 2416 */ 2417 2418 static int ext4_da_writepages_trans_blocks(struct inode *inode) 2419 { 2420 int max_blocks = EXT4_I(inode)->i_reserved_data_blocks; 2421 2422 /* 2423 * With non-extent format the journal credit needed to 2424 * insert nrblocks contiguous blocks is dependent on the 2425 * number of contiguous blocks. So we will limit the 2426 * number of contiguous blocks to a sane value 2427 */ 2428 if (!(inode->i_flags & EXT4_EXTENTS_FL) && 2429 (max_blocks > EXT4_MAX_TRANS_DATA)) 2430 max_blocks = EXT4_MAX_TRANS_DATA; 2431 2432 return ext4_chunk_trans_blocks(inode, max_blocks); 2433 } 2434 2435 static int ext4_da_writepages(struct address_space *mapping, 2436 struct writeback_control *wbc) 2437 { 2438 pgoff_t index; 2439 int range_whole = 0; 2440 handle_t *handle = NULL; 2441 struct mpage_da_data mpd; 2442 struct inode *inode = mapping->host; 2443 int no_nrwrite_index_update; 2444 int pages_written = 0; 2445 long pages_skipped; 2446 int range_cyclic, cycled = 1, io_done = 0; 2447 int needed_blocks, ret = 0, nr_to_writebump = 0; 2448 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); 2449 2450 trace_mark(ext4_da_writepages, 2451 "dev %s ino %lu nr_t_write %ld " 2452 "pages_skipped %ld range_start %llu " 2453 "range_end %llu nonblocking %d " 2454 "for_kupdate %d for_reclaim %d " 2455 "for_writepages %d range_cyclic %d", 2456 inode->i_sb->s_id, inode->i_ino, 2457 wbc->nr_to_write, wbc->pages_skipped, 2458 (unsigned long long) wbc->range_start, 2459 (unsigned long long) wbc->range_end, 2460 wbc->nonblocking, wbc->for_kupdate, 2461 wbc->for_reclaim, wbc->for_writepages, 2462 wbc->range_cyclic); 2463 2464 /* 2465 * No pages to write? This is mainly a kludge to avoid starting 2466 * a transaction for special inodes like the journal inode on last iput() 2467 * because that could violate lock ordering on umount 2468 */ 2469 if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) 2470 return 0; 2471 2472 /* 2473 * If the filesystem has aborted, it is read-only, so return 2474 * right away instead of dumping stack traces later on that 2475 * will obscure the real source of the problem.
We test 2476 * EXT4_MOUNT_ABORT instead of sb->s_flag's MS_RDONLY because 2477 * the latter could be true if the filesystem is mounted 2478 * read-only, and in that case, ext4_da_writepages should 2479 * *never* be called, so if that ever happens, we would want 2480 * the stack trace. 2481 */ 2482 if (unlikely(sbi->s_mount_opt & EXT4_MOUNT_ABORT)) 2483 return -EROFS; 2484 2485 /* 2486 * Make sure nr_to_write is >= sbi->s_mb_stream_request 2487 * This makes sure small files' blocks are allocated in a 2488 * single attempt. This ensures that small files 2489 * get less fragmented. 2490 */ 2491 if (wbc->nr_to_write < sbi->s_mb_stream_request) { 2492 nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write; 2493 wbc->nr_to_write = sbi->s_mb_stream_request; 2494 } 2495 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) 2496 range_whole = 1; 2497 2498 range_cyclic = wbc->range_cyclic; 2499 if (wbc->range_cyclic) { 2500 index = mapping->writeback_index; 2501 if (index) 2502 cycled = 0; 2503 wbc->range_start = index << PAGE_CACHE_SHIFT; 2504 wbc->range_end = LLONG_MAX; 2505 wbc->range_cyclic = 0; 2506 } else 2507 index = wbc->range_start >> PAGE_CACHE_SHIFT; 2508 2509 mpd.wbc = wbc; 2510 mpd.inode = mapping->host; 2511 2512 /* 2513 * we don't want write_cache_pages to update 2514 * nr_to_write and writeback_index 2515 */ 2516 no_nrwrite_index_update = wbc->no_nrwrite_index_update; 2517 wbc->no_nrwrite_index_update = 1; 2518 pages_skipped = wbc->pages_skipped; 2519 2520 retry: 2521 while (!ret && wbc->nr_to_write > 0) { 2522 2523 /* 2524 * we insert one extent at a time. So we need 2525 * the credits needed for a single extent allocation. 2526 * journalled mode is currently not supported 2527 * by delalloc 2528 */ 2529 BUG_ON(ext4_should_journal_data(inode)); 2530 needed_blocks = ext4_da_writepages_trans_blocks(inode); 2531 2532 /* start a new transaction */ 2533 handle = ext4_journal_start(inode, needed_blocks); 2534 if (IS_ERR(handle)) { 2535 ret = PTR_ERR(handle); 2536 printk(KERN_CRIT "%s: jbd2_start: " 2537 "%ld pages, ino %lu; err %d\n", __func__, 2538 wbc->nr_to_write, inode->i_ino, ret); 2539 dump_stack(); 2540 goto out_writepages; 2541 } 2542 mpd.get_block = ext4_da_get_block_write; 2543 ret = mpage_da_writepages(mapping, wbc, &mpd); 2544 2545 ext4_journal_stop(handle); 2546 2547 if ((mpd.retval == -ENOSPC) && sbi->s_journal) { 2548 /* commit the transaction which would 2549 * free the blocks released in the transaction 2550 * and try again 2551 */ 2552 jbd2_journal_force_commit_nested(sbi->s_journal); 2553 wbc->pages_skipped = pages_skipped; 2554 ret = 0; 2555 } else if (ret == MPAGE_DA_EXTENT_TAIL) { 2556 /* 2557 * got one extent, now try with the 2558 * rest of the pages 2559 */ 2560 pages_written += mpd.pages_written; 2561 wbc->pages_skipped = pages_skipped; 2562 ret = 0; 2563 io_done = 1; 2564 } else if (wbc->nr_to_write) 2565 /* 2566 * There is no more writeout needed 2567 * or we requested a nonblocking writeout 2568 * and we found the device congested 2569 */ 2570 break; 2571 } 2572 if (!io_done && !cycled) { 2573 cycled = 1; 2574 index = 0; 2575 wbc->range_start = index << PAGE_CACHE_SHIFT; 2576 wbc->range_end = mapping->writeback_index - 1; 2577 goto retry; 2578 } 2579 if (pages_skipped != wbc->pages_skipped) 2580 printk(KERN_EMERG "This should not happen leaving %s " 2581 "with nr_to_write = %ld ret = %d\n", 2582 __func__, wbc->nr_to_write, ret); 2583 2584 /* Update index */ 2585 index += pages_written; 2586 wbc->range_cyclic = range_cyclic; 2587 if (wbc->range_cyclic || (range_whole
&& wbc->nr_to_write > 0)) 2588 /* 2589 * set the writeback_index so that range_cyclic 2590 * mode will write it back later 2591 */ 2592 mapping->writeback_index = index; 2593 2594 out_writepages: 2595 if (!no_nrwrite_index_update) 2596 wbc->no_nrwrite_index_update = 0; 2597 wbc->nr_to_write -= nr_to_writebump; 2598 trace_mark(ext4_da_writepage_result, 2599 "dev %s ino %lu ret %d pages_written %d " 2600 "pages_skipped %ld congestion %d " 2601 "more_io %d no_nrwrite_index_update %d", 2602 inode->i_sb->s_id, inode->i_ino, ret, 2603 pages_written, wbc->pages_skipped, 2604 wbc->encountered_congestion, wbc->more_io, 2605 wbc->no_nrwrite_index_update); 2606 return ret; 2607 } 2608 2609 #define FALL_BACK_TO_NONDELALLOC 1 2610 static int ext4_nonda_switch(struct super_block *sb) 2611 { 2612 s64 free_blocks, dirty_blocks; 2613 struct ext4_sb_info *sbi = EXT4_SB(sb); 2614 2615 /* 2616 * switch to non delalloc mode if we are running low 2617 * on free blocks. The free block accounting via percpu 2618 * counters can get slightly wrong with percpu_counter_batch getting 2619 * accumulated on each CPU without updating global counters. 2620 * Delalloc needs an accurate free block accounting. So switch 2621 * to non delalloc when we are near the error range. 2622 */ 2623 free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); 2624 dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyblocks_counter); 2625 if (2 * free_blocks < 3 * dirty_blocks || 2626 free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) { 2627 /* 2628 * free block count is less than 150% of dirty blocks 2629 * or free blocks are less than the watermark 2630 */ 2631 return 1; 2632 } 2633 return 0; 2634 } 2635 2636 static int ext4_da_write_begin(struct file *file, struct address_space *mapping, 2637 loff_t pos, unsigned len, unsigned flags, 2638 struct page **pagep, void **fsdata) 2639 { 2640 int ret, retries = 0; 2641 struct page *page; 2642 pgoff_t index; 2643 unsigned from, to; 2644 struct inode *inode = mapping->host; 2645 handle_t *handle; 2646 2647 index = pos >> PAGE_CACHE_SHIFT; 2648 from = pos & (PAGE_CACHE_SIZE - 1); 2649 to = from + len; 2650 2651 if (ext4_nonda_switch(inode->i_sb)) { 2652 *fsdata = (void *)FALL_BACK_TO_NONDELALLOC; 2653 return ext4_write_begin(file, mapping, pos, 2654 len, flags, pagep, fsdata); 2655 } 2656 *fsdata = (void *)0; 2657 2658 trace_mark(ext4_da_write_begin, 2659 "dev %s ino %lu pos %llu len %u flags %u", 2660 inode->i_sb->s_id, inode->i_ino, 2661 (unsigned long long) pos, len, flags); 2662 retry: 2663 /* 2664 * With delayed allocation, we don't log the i_disksize update 2665 * if there is delayed block allocation. But we still need 2666 * to journal the i_disksize update if we write to the end 2667 * of a file which has an already mapped buffer. 2668 */ 2669 handle = ext4_journal_start(inode, 1); 2670 if (IS_ERR(handle)) { 2671 ret = PTR_ERR(handle); 2672 goto out; 2673 } 2674 /* We cannot recurse into the filesystem as the transaction is already 2675 * started */ 2676 flags |= AOP_FLAG_NOFS; 2677 2678 page = grab_cache_page_write_begin(mapping, index, flags); 2679 if (!page) { 2680 ext4_journal_stop(handle); 2681 ret = -ENOMEM; 2682 goto out; 2683 } 2684 *pagep = page; 2685 2686 ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 2687 ext4_da_get_block_prep); 2688 if (ret < 0) { 2689 unlock_page(page); 2690 ext4_journal_stop(handle); 2691 page_cache_release(page); 2692 /* 2693 * block_write_begin may have instantiated a few blocks 2694 * outside i_size.
Trim these off again. Don't need 2695 * i_size_read because we hold i_mutex. 2696 */ 2697 if (pos + len > inode->i_size) 2698 vmtruncate(inode, inode->i_size); 2699 } 2700 2701 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 2702 goto retry; 2703 out: 2704 return ret; 2705 } 2706 2707 /* 2708 * Check if we should update i_disksize 2709 * when write to the end of file but not require block allocation 2710 */ 2711 static int ext4_da_should_update_i_disksize(struct page *page, 2712 unsigned long offset) 2713 { 2714 struct buffer_head *bh; 2715 struct inode *inode = page->mapping->host; 2716 unsigned int idx; 2717 int i; 2718 2719 bh = page_buffers(page); 2720 idx = offset >> inode->i_blkbits; 2721 2722 for (i = 0; i < idx; i++) 2723 bh = bh->b_this_page; 2724 2725 if (!buffer_mapped(bh) || (buffer_delay(bh))) 2726 return 0; 2727 return 1; 2728 } 2729 2730 static int ext4_da_write_end(struct file *file, 2731 struct address_space *mapping, 2732 loff_t pos, unsigned len, unsigned copied, 2733 struct page *page, void *fsdata) 2734 { 2735 struct inode *inode = mapping->host; 2736 int ret = 0, ret2; 2737 handle_t *handle = ext4_journal_current_handle(); 2738 loff_t new_i_size; 2739 unsigned long start, end; 2740 int write_mode = (int)(unsigned long)fsdata; 2741 2742 if (write_mode == FALL_BACK_TO_NONDELALLOC) { 2743 if (ext4_should_order_data(inode)) { 2744 return ext4_ordered_write_end(file, mapping, pos, 2745 len, copied, page, fsdata); 2746 } else if (ext4_should_writeback_data(inode)) { 2747 return ext4_writeback_write_end(file, mapping, pos, 2748 len, copied, page, fsdata); 2749 } else { 2750 BUG(); 2751 } 2752 } 2753 2754 trace_mark(ext4_da_write_end, 2755 "dev %s ino %lu pos %llu len %u copied %u", 2756 inode->i_sb->s_id, inode->i_ino, 2757 (unsigned long long) pos, len, copied); 2758 start = pos & (PAGE_CACHE_SIZE - 1); 2759 end = start + copied - 1; 2760 2761 /* 2762 * generic_write_end() will run mark_inode_dirty() if i_size 2763 * changes. So let's piggyback the i_disksize mark_inode_dirty 2764 * into that. 2765 */ 2766 2767 new_i_size = pos + copied; 2768 if (new_i_size > EXT4_I(inode)->i_disksize) { 2769 if (ext4_da_should_update_i_disksize(page, end)) { 2770 down_write(&EXT4_I(inode)->i_data_sem); 2771 if (new_i_size > EXT4_I(inode)->i_disksize) { 2772 /* 2773 * Updating i_disksize when extending file 2774 * without needing block allocation 2775 */ 2776 if (ext4_should_order_data(inode)) 2777 ret = ext4_jbd2_file_inode(handle, 2778 inode); 2779 2780 EXT4_I(inode)->i_disksize = new_i_size; 2781 } 2782 up_write(&EXT4_I(inode)->i_data_sem); 2783 /* We need to mark inode dirty even if 2784 * new_i_size is less that inode->i_size 2785 * bu greater than i_disksize.(hint delalloc) 2786 */ 2787 ext4_mark_inode_dirty(handle, inode); 2788 } 2789 } 2790 ret2 = generic_write_end(file, mapping, pos, len, copied, 2791 page, fsdata); 2792 copied = ret2; 2793 if (ret2 < 0) 2794 ret = ret2; 2795 ret2 = ext4_journal_stop(handle); 2796 if (!ret) 2797 ret = ret2; 2798 2799 return ret ? ret : copied; 2800 } 2801 2802 static void ext4_da_invalidatepage(struct page *page, unsigned long offset) 2803 { 2804 /* 2805 * Drop reserved blocks 2806 */ 2807 BUG_ON(!PageLocked(page)); 2808 if (!page_has_buffers(page)) 2809 goto out; 2810 2811 ext4_da_page_release_reservation(page, offset); 2812 2813 out: 2814 ext4_invalidatepage(page, offset); 2815 2816 return; 2817 } 2818 2819 2820 /* 2821 * bmap() is special. 
It gets used by applications such as lilo and by 2822 * the swapper to find the on-disk block of a specific piece of data. 2823 * 2824 * Naturally, this is dangerous if the block concerned is still in the 2825 * journal. If somebody makes a swapfile on an ext4 data-journaling 2826 * filesystem and enables swap, then they may get a nasty shock when the 2827 * data getting swapped to that swapfile suddenly gets overwritten by 2828 * the original zeros written out previously to the journal and 2829 * awaiting writeback in the kernel's buffer cache. 2830 * 2831 * So, if we see any bmap calls here on a modified, data-journaled file, 2832 * take extra steps to flush any blocks which might be in the cache. 2833 */ 2834 static sector_t ext4_bmap(struct address_space *mapping, sector_t block) 2835 { 2836 struct inode *inode = mapping->host; 2837 journal_t *journal; 2838 int err; 2839 2840 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && 2841 test_opt(inode->i_sb, DELALLOC)) { 2842 /* 2843 * With delalloc we want to sync the file 2844 * so that we can make sure we allocate 2845 * blocks for the file 2846 */ 2847 filemap_write_and_wait(mapping); 2848 } 2849 2850 if (EXT4_JOURNAL(inode) && EXT4_I(inode)->i_state & EXT4_STATE_JDATA) { 2851 /* 2852 * This is a REALLY heavyweight approach, but the use of 2853 * bmap on dirty files is expected to be extremely rare: 2854 * only if we run lilo or swapon on a freshly made file 2855 * do we expect this to happen. 2856 * 2857 * (bmap requires CAP_SYS_RAWIO so this does not 2858 * represent an unprivileged user DOS attack --- we'd be 2859 * in trouble if mortal users could trigger this path at 2860 * will.) 2861 * 2862 * NB. EXT4_STATE_JDATA is not set on files other than 2863 * regular files. If somebody wants to bmap a directory 2864 * or symlink and gets confused because the buffer 2865 * hasn't yet been flushed to disk, they deserve 2866 * everything they get. 2867 */ 2868 2869 EXT4_I(inode)->i_state &= ~EXT4_STATE_JDATA; 2870 journal = EXT4_JOURNAL(inode); 2871 jbd2_journal_lock_updates(journal); 2872 err = jbd2_journal_flush(journal); 2873 jbd2_journal_unlock_updates(journal); 2874 2875 if (err) 2876 return 0; 2877 } 2878 2879 return generic_block_bmap(mapping, block, ext4_get_block); 2880 } 2881 2882 static int bget_one(handle_t *handle, struct buffer_head *bh) 2883 { 2884 get_bh(bh); 2885 return 0; 2886 } 2887 2888 static int bput_one(handle_t *handle, struct buffer_head *bh) 2889 { 2890 put_bh(bh); 2891 return 0; 2892 } 2893 2894 /* 2895 * Note that we don't need to start a transaction unless we're journaling data 2896 * because we should have holes filled from ext4_page_mkwrite(). We don't even 2897 * need to file the inode to the transaction's list in ordered mode because if 2898 * we are writing back data added by write(), the inode is already there and if 2899 * we are writing back data modified via mmap(), no one guarantees in which 2900 * transaction the data will hit the disk. In case we are journaling data, we 2901 * cannot start a transaction directly because transaction start ranks above page 2902 * lock so we have to do some magic. 2903 * 2904 * In all journaling modes block_write_full_page() will start the I/O. 2905 * 2906 * Problem: 2907 * 2908 * ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() -> 2909 * ext4_writepage() 2910 * 2911 * Similar for: 2912 * 2913 * ext4_file_write() -> generic_file_write() -> __alloc_pages() -> ... 2914 * 2915 * Same applies to ext4_get_block().
We will deadlock on various things like 2916 * lock_journal and i_data_sem 2917 * 2918 * Setting PF_MEMALLOC here doesn't work - too many internal memory 2919 * allocations fail. 2920 * 2921 * 16May01: If we're reentered then journal_current_handle() will be 2922 * non-zero. We simply *return*. 2923 * 2924 * 1 July 2001: @@@ FIXME: 2925 * In journalled data mode, a data buffer may be metadata against the 2926 * current transaction. But the same file is part of a shared mapping 2927 * and someone does a writepage() on it. 2928 * 2929 * We will move the buffer onto the async_data list, but *after* it has 2930 * been dirtied. So there's a small window where we have dirty data on 2931 * BJ_Metadata. 2932 * 2933 * Note that this only applies to the last partial page in the file. The 2934 * bit which block_write_full_page() uses prepare/commit for. (That's 2935 * broken code anyway: it's wrong for msync()). 2936 * 2937 * It's a rare case: affects the final partial page, for journalled data 2938 * where the file is subject to both write() and writepage() in the same 2939 * transaction. To fix it we'll need a custom block_write_full_page(). 2940 * We'll probably need that anyway for journalling writepage() output. 2941 * 2942 * We don't honour synchronous mounts for writepage(). That would be 2943 * disastrous. Any write() or metadata operation will sync the fs for 2944 * us. 2945 * 2946 */ 2947 static int __ext4_normal_writepage(struct page *page, 2948 struct writeback_control *wbc) 2949 { 2950 struct inode *inode = page->mapping->host; 2951 2952 if (test_opt(inode->i_sb, NOBH)) 2953 return nobh_writepage(page, 2954 ext4_normal_get_block_write, wbc); 2955 else 2956 return block_write_full_page(page, 2957 ext4_normal_get_block_write, 2958 wbc); 2959 } 2960 2961 static int ext4_normal_writepage(struct page *page, 2962 struct writeback_control *wbc) 2963 { 2964 struct inode *inode = page->mapping->host; 2965 loff_t size = i_size_read(inode); 2966 loff_t len; 2967 2968 trace_mark(ext4_normal_writepage, 2969 "dev %s ino %lu page_index %lu", 2970 inode->i_sb->s_id, inode->i_ino, page->index); 2971 J_ASSERT(PageLocked(page)); 2972 if (page->index == size >> PAGE_CACHE_SHIFT) 2973 len = size & ~PAGE_CACHE_MASK; 2974 else 2975 len = PAGE_CACHE_SIZE; 2976 2977 if (page_has_buffers(page)) { 2978 /* if the page has buffers they should all be mapped 2979 * and allocated. If there are no buffers attached 2980 * to the page we know the page is dirty but it lost 2981 * buffers. That means that at some moment in time 2982 * after write_begin() / write_end() has been called 2983 * all buffers have been clean and thus they must have been 2984 * written at least once. So they are all mapped and we can 2985 * happily proceed with mapping them and writing the page.
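 * The BUG_ON() below asserts exactly that: no unmapped or delayed buffer may be seen on this path.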
2986 */ 2987 BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, 2988 ext4_bh_unmapped_or_delay)); 2989 } 2990 2991 if (!ext4_journal_current_handle()) 2992 return __ext4_normal_writepage(page, wbc); 2993 2994 redirty_page_for_writepage(wbc, page); 2995 unlock_page(page); 2996 return 0; 2997 } 2998 2999 static int __ext4_journalled_writepage(struct page *page, 3000 struct writeback_control *wbc) 3001 { 3002 struct address_space *mapping = page->mapping; 3003 struct inode *inode = mapping->host; 3004 struct buffer_head *page_bufs; 3005 handle_t *handle = NULL; 3006 int ret = 0; 3007 int err; 3008 3009 ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, 3010 ext4_normal_get_block_write); 3011 if (ret != 0) 3012 goto out_unlock; 3013 3014 page_bufs = page_buffers(page); 3015 walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL, 3016 bget_one); 3017 /* As soon as we unlock the page, it can go away, but we have 3018 * references to buffers so we are safe */ 3019 unlock_page(page); 3020 3021 handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); 3022 if (IS_ERR(handle)) { 3023 ret = PTR_ERR(handle); 3024 goto out; 3025 } 3026 3027 ret = walk_page_buffers(handle, page_bufs, 0, 3028 PAGE_CACHE_SIZE, NULL, do_journal_get_write_access); 3029 3030 err = walk_page_buffers(handle, page_bufs, 0, 3031 PAGE_CACHE_SIZE, NULL, write_end_fn); 3032 if (ret == 0) 3033 ret = err; 3034 err = ext4_journal_stop(handle); 3035 if (!ret) 3036 ret = err; 3037 3038 walk_page_buffers(handle, page_bufs, 0, 3039 PAGE_CACHE_SIZE, NULL, bput_one); 3040 EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; 3041 goto out; 3042 3043 out_unlock: 3044 unlock_page(page); 3045 out: 3046 return ret; 3047 } 3048 3049 static int ext4_journalled_writepage(struct page *page, 3050 struct writeback_control *wbc) 3051 { 3052 struct inode *inode = page->mapping->host; 3053 loff_t size = i_size_read(inode); 3054 loff_t len; 3055 3056 trace_mark(ext4_journalled_writepage, 3057 "dev %s ino %lu page_index %lu", 3058 inode->i_sb->s_id, inode->i_ino, page->index); 3059 J_ASSERT(PageLocked(page)); 3060 if (page->index == size >> PAGE_CACHE_SHIFT) 3061 len = size & ~PAGE_CACHE_MASK; 3062 else 3063 len = PAGE_CACHE_SIZE; 3064 3065 if (page_has_buffers(page)) { 3066 /* if page has buffers it should all be mapped 3067 * and allocated. If there are not buffers attached 3068 * to the page we know the page is dirty but it lost 3069 * buffers. That means that at some moment in time 3070 * after write_begin() / write_end() has been called 3071 * all buffers have been clean and thus they must have been 3072 * written at least once. So they are all mapped and we can 3073 * happily proceed with mapping them and writing the page. 3074 */ 3075 BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, 3076 ext4_bh_unmapped_or_delay)); 3077 } 3078 3079 if (ext4_journal_current_handle()) 3080 goto no_write; 3081 3082 if (PageChecked(page)) { 3083 /* 3084 * It's mmapped pagecache. Add buffers and journal it. There 3085 * doesn't seem much point in redirtying the page here. 3086 */ 3087 ClearPageChecked(page); 3088 return __ext4_journalled_writepage(page, wbc); 3089 } else { 3090 /* 3091 * It may be a page full of checkpoint-mode buffers. We don't 3092 * really know unless we go poke around in the buffer_heads. 3093 * But block_write_full_page will do the right thing. 
3094 */ 3095 return block_write_full_page(page, 3096 ext4_normal_get_block_write, 3097 wbc); 3098 } 3099 no_write: 3100 redirty_page_for_writepage(wbc, page); 3101 unlock_page(page); 3102 return 0; 3103 } 3104 3105 static int ext4_readpage(struct file *file, struct page *page) 3106 { 3107 return mpage_readpage(page, ext4_get_block); 3108 } 3109 3110 static int 3111 ext4_readpages(struct file *file, struct address_space *mapping, 3112 struct list_head *pages, unsigned nr_pages) 3113 { 3114 return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); 3115 } 3116 3117 static void ext4_invalidatepage(struct page *page, unsigned long offset) 3118 { 3119 journal_t *journal = EXT4_JOURNAL(page->mapping->host); 3120 3121 /* 3122 * If it's a full truncate we just forget about the pending dirtying 3123 */ 3124 if (offset == 0) 3125 ClearPageChecked(page); 3126 3127 if (journal) 3128 jbd2_journal_invalidatepage(journal, page, offset); 3129 else 3130 block_invalidatepage(page, offset); 3131 } 3132 3133 static int ext4_releasepage(struct page *page, gfp_t wait) 3134 { 3135 journal_t *journal = EXT4_JOURNAL(page->mapping->host); 3136 3137 WARN_ON(PageChecked(page)); 3138 if (!page_has_buffers(page)) 3139 return 0; 3140 if (journal) 3141 return jbd2_journal_try_to_free_buffers(journal, page, wait); 3142 else 3143 return try_to_free_buffers(page); 3144 } 3145 3146 /* 3147 * If the O_DIRECT write will extend the file then add this inode to the 3148 * orphan list. So recovery will truncate it back to the original size 3149 * if the machine crashes during the write. 3150 * 3151 * If the O_DIRECT write is intantiating holes inside i_size and the machine 3152 * crashes then stale disk data _may_ be exposed inside the file. But current 3153 * VFS code falls back into buffered path in that case so we are safe. 3154 */ 3155 static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, 3156 const struct iovec *iov, loff_t offset, 3157 unsigned long nr_segs) 3158 { 3159 struct file *file = iocb->ki_filp; 3160 struct inode *inode = file->f_mapping->host; 3161 struct ext4_inode_info *ei = EXT4_I(inode); 3162 handle_t *handle; 3163 ssize_t ret; 3164 int orphan = 0; 3165 size_t count = iov_length(iov, nr_segs); 3166 3167 if (rw == WRITE) { 3168 loff_t final_size = offset + count; 3169 3170 if (final_size > inode->i_size) { 3171 /* Credits for sb + inode write */ 3172 handle = ext4_journal_start(inode, 2); 3173 if (IS_ERR(handle)) { 3174 ret = PTR_ERR(handle); 3175 goto out; 3176 } 3177 ret = ext4_orphan_add(handle, inode); 3178 if (ret) { 3179 ext4_journal_stop(handle); 3180 goto out; 3181 } 3182 orphan = 1; 3183 ei->i_disksize = inode->i_size; 3184 ext4_journal_stop(handle); 3185 } 3186 } 3187 3188 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 3189 offset, nr_segs, 3190 ext4_get_block, NULL); 3191 3192 if (orphan) { 3193 int err; 3194 3195 /* Credits for sb + inode write */ 3196 handle = ext4_journal_start(inode, 2); 3197 if (IS_ERR(handle)) { 3198 /* This is really bad luck. We've written the data 3199 * but cannot extend i_size. Bail out and pretend 3200 * the write failed... 
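 * (The orphan list entry added before the write should still let a post-crash recovery truncate the file back to its original size.)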
*/ 3201 ret = PTR_ERR(handle); 3202 goto out; 3203 } 3204 if (inode->i_nlink) 3205 ext4_orphan_del(handle, inode); 3206 if (ret > 0) { 3207 loff_t end = offset + ret; 3208 if (end > inode->i_size) { 3209 ei->i_disksize = end; 3210 i_size_write(inode, end); 3211 /* 3212 * We're going to return a positive `ret' 3213 * here due to non-zero-length I/O, so there's 3214 * no way of reporting error returns from 3215 * ext4_mark_inode_dirty() to userspace. So 3216 * ignore it. 3217 */ 3218 ext4_mark_inode_dirty(handle, inode); 3219 } 3220 } 3221 err = ext4_journal_stop(handle); 3222 if (ret == 0) 3223 ret = err; 3224 } 3225 out: 3226 return ret; 3227 } 3228 3229 /* 3230 * Pages can be marked dirty completely asynchronously from ext4's journalling 3231 * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do 3232 * much here because ->set_page_dirty is called under VFS locks. The page is 3233 * not necessarily locked. 3234 * 3235 * We cannot just dirty the page and leave attached buffers clean, because the 3236 * buffers' dirty state is "definitive". We cannot just set the buffers dirty 3237 * or jbddirty because all the journalling code will explode. 3238 * 3239 * So what we do is to mark the page "pending dirty" and next time writepage 3240 * is called, propagate that into the buffers appropriately. 3241 */ 3242 static int ext4_journalled_set_page_dirty(struct page *page) 3243 { 3244 SetPageChecked(page); 3245 return __set_page_dirty_nobuffers(page); 3246 } 3247 3248 static const struct address_space_operations ext4_ordered_aops = { 3249 .readpage = ext4_readpage, 3250 .readpages = ext4_readpages, 3251 .writepage = ext4_normal_writepage, 3252 .sync_page = block_sync_page, 3253 .write_begin = ext4_write_begin, 3254 .write_end = ext4_ordered_write_end, 3255 .bmap = ext4_bmap, 3256 .invalidatepage = ext4_invalidatepage, 3257 .releasepage = ext4_releasepage, 3258 .direct_IO = ext4_direct_IO, 3259 .migratepage = buffer_migrate_page, 3260 .is_partially_uptodate = block_is_partially_uptodate, 3261 }; 3262 3263 static const struct address_space_operations ext4_writeback_aops = { 3264 .readpage = ext4_readpage, 3265 .readpages = ext4_readpages, 3266 .writepage = ext4_normal_writepage, 3267 .sync_page = block_sync_page, 3268 .write_begin = ext4_write_begin, 3269 .write_end = ext4_writeback_write_end, 3270 .bmap = ext4_bmap, 3271 .invalidatepage = ext4_invalidatepage, 3272 .releasepage = ext4_releasepage, 3273 .direct_IO = ext4_direct_IO, 3274 .migratepage = buffer_migrate_page, 3275 .is_partially_uptodate = block_is_partially_uptodate, 3276 }; 3277 3278 static const struct address_space_operations ext4_journalled_aops = { 3279 .readpage = ext4_readpage, 3280 .readpages = ext4_readpages, 3281 .writepage = ext4_journalled_writepage, 3282 .sync_page = block_sync_page, 3283 .write_begin = ext4_write_begin, 3284 .write_end = ext4_journalled_write_end, 3285 .set_page_dirty = ext4_journalled_set_page_dirty, 3286 .bmap = ext4_bmap, 3287 .invalidatepage = ext4_invalidatepage, 3288 .releasepage = ext4_releasepage, 3289 .is_partially_uptodate = block_is_partially_uptodate, 3290 }; 3291 3292 static const struct address_space_operations ext4_da_aops = { 3293 .readpage = ext4_readpage, 3294 .readpages = ext4_readpages, 3295 .writepage = ext4_da_writepage, 3296 .writepages = ext4_da_writepages, 3297 .sync_page = block_sync_page, 3298 .write_begin = ext4_da_write_begin, 3299 .write_end = ext4_da_write_end, 3300 .bmap = ext4_bmap, 3301 .invalidatepage = ext4_da_invalidatepage, 3302 .releasepage = 
ext4_releasepage, 3303 .direct_IO = ext4_direct_IO, 3304 .migratepage = buffer_migrate_page, 3305 .is_partially_uptodate = block_is_partially_uptodate, 3306 }; 3307 3308 void ext4_set_aops(struct inode *inode) 3309 { 3310 if (ext4_should_order_data(inode) && 3311 test_opt(inode->i_sb, DELALLOC)) 3312 inode->i_mapping->a_ops = &ext4_da_aops; 3313 else if (ext4_should_order_data(inode)) 3314 inode->i_mapping->a_ops = &ext4_ordered_aops; 3315 else if (ext4_should_writeback_data(inode) && 3316 test_opt(inode->i_sb, DELALLOC)) 3317 inode->i_mapping->a_ops = &ext4_da_aops; 3318 else if (ext4_should_writeback_data(inode)) 3319 inode->i_mapping->a_ops = &ext4_writeback_aops; 3320 else 3321 inode->i_mapping->a_ops = &ext4_journalled_aops; 3322 } 3323 3324 /* 3325 * ext4_block_truncate_page() zeroes out a mapping from file offset `from' 3326 * up to the end of the block which corresponds to `from'. 3327 * This required during truncate. We need to physically zero the tail end 3328 * of that block so it doesn't yield old data if the file is later grown. 3329 */ 3330 int ext4_block_truncate_page(handle_t *handle, 3331 struct address_space *mapping, loff_t from) 3332 { 3333 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; 3334 unsigned offset = from & (PAGE_CACHE_SIZE-1); 3335 unsigned blocksize, length, pos; 3336 ext4_lblk_t iblock; 3337 struct inode *inode = mapping->host; 3338 struct buffer_head *bh; 3339 struct page *page; 3340 int err = 0; 3341 3342 page = grab_cache_page(mapping, from >> PAGE_CACHE_SHIFT); 3343 if (!page) 3344 return -EINVAL; 3345 3346 blocksize = inode->i_sb->s_blocksize; 3347 length = blocksize - (offset & (blocksize - 1)); 3348 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); 3349 3350 /* 3351 * For "nobh" option, we can only work if we don't need to 3352 * read-in the page - otherwise we create buffers to do the IO. 3353 */ 3354 if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) && 3355 ext4_should_writeback_data(inode) && PageUptodate(page)) { 3356 zero_user(page, offset, length); 3357 set_page_dirty(page); 3358 goto unlock; 3359 } 3360 3361 if (!page_has_buffers(page)) 3362 create_empty_buffers(page, blocksize, 0); 3363 3364 /* Find the buffer that contains "offset" */ 3365 bh = page_buffers(page); 3366 pos = blocksize; 3367 while (offset >= pos) { 3368 bh = bh->b_this_page; 3369 iblock++; 3370 pos += blocksize; 3371 } 3372 3373 err = 0; 3374 if (buffer_freed(bh)) { 3375 BUFFER_TRACE(bh, "freed: skip"); 3376 goto unlock; 3377 } 3378 3379 if (!buffer_mapped(bh)) { 3380 BUFFER_TRACE(bh, "unmapped"); 3381 ext4_get_block(inode, iblock, bh, 0); 3382 /* unmapped? It's a hole - nothing to do */ 3383 if (!buffer_mapped(bh)) { 3384 BUFFER_TRACE(bh, "still unmapped"); 3385 goto unlock; 3386 } 3387 } 3388 3389 /* Ok, it's mapped. Make sure it's up-to-date */ 3390 if (PageUptodate(page)) 3391 set_buffer_uptodate(bh); 3392 3393 if (!buffer_uptodate(bh)) { 3394 err = -EIO; 3395 ll_rw_block(READ, 1, &bh); 3396 wait_on_buffer(bh); 3397 /* Uhhuh. Read error. Complain and punt. 
*/ 3398 if (!buffer_uptodate(bh)) 3399 goto unlock; 3400 } 3401 3402 if (ext4_should_journal_data(inode)) { 3403 BUFFER_TRACE(bh, "get write access"); 3404 err = ext4_journal_get_write_access(handle, bh); 3405 if (err) 3406 goto unlock; 3407 } 3408 3409 zero_user(page, offset, length); 3410 3411 BUFFER_TRACE(bh, "zeroed end of block"); 3412 3413 err = 0; 3414 if (ext4_should_journal_data(inode)) { 3415 err = ext4_handle_dirty_metadata(handle, inode, bh); 3416 } else { 3417 if (ext4_should_order_data(inode)) 3418 err = ext4_jbd2_file_inode(handle, inode); 3419 mark_buffer_dirty(bh); 3420 } 3421 3422 unlock: 3423 unlock_page(page); 3424 page_cache_release(page); 3425 return err; 3426 } 3427 3428 /* 3429 * Probably it should be a library function... search for the first non-zero word 3430 * or memcmp with zero_page, whatever is better for a particular architecture. 3431 * Linus? 3432 */ 3433 static inline int all_zeroes(__le32 *p, __le32 *q) 3434 { 3435 while (p < q) 3436 if (*p++) 3437 return 0; 3438 return 1; 3439 } 3440 3441 /** 3442 * ext4_find_shared - find the indirect blocks for partial truncation. 3443 * @inode: inode in question 3444 * @depth: depth of the affected branch 3445 * @offsets: offsets of pointers in that branch (see ext4_block_to_path) 3446 * @chain: place to store the pointers to partial indirect blocks 3447 * @top: place to store the (detached) top of branch 3448 * 3449 * This is a helper function used by ext4_truncate(). 3450 * 3451 * When we do truncate() we may have to clean the ends of several 3452 * indirect blocks but leave the blocks themselves alive. A block is 3453 * partially truncated if some data below the new i_size is referred to 3454 * from it (and it is on the path to the first completely truncated 3455 * data block, indeed). We have to free the top of that path along 3456 * with everything to the right of the path. Since no allocation 3457 * past the truncation point is possible until ext4_truncate() 3458 * finishes, we may safely do the latter, but the top of the branch may 3459 * require special attention - pageout below the truncation point 3460 * might try to populate it. 3461 * 3462 * We atomically detach the top of the branch from the tree, store the 3463 * block number of its root in *@top, pointers to buffer_heads of 3464 * partially truncated blocks - in @chain[].bh and pointers to 3465 * their last elements that should not be removed - in 3466 * @chain[].p. The return value is the pointer to the last filled element 3467 * of @chain. 3468 * 3469 * The work left to the caller is the actual freeing of subtrees: 3470 * a) free the subtree starting from *@top 3471 * b) free the subtrees whose roots are stored in 3472 * (@chain[i].p+1 .. end of @chain[i].bh->b_data) 3473 * c) free the subtrees growing from the inode past the @chain[0]. 3474 * (no partially truncated stuff there). */ 3475 3476 static Indirect *ext4_find_shared(struct inode *inode, int depth, 3477 ext4_lblk_t offsets[4], Indirect chain[4], __le32 *top) 3478 { 3479 Indirect *partial, *p; 3480 int k, err; 3481 3482 *top = 0; 3483 /* Make k index the deepest non-null offset + 1 */ 3484 for (k = depth; k > 1 && !offsets[k-1]; k--) 3485 ; 3486 partial = ext4_get_branch(inode, k, offsets, chain, &err); 3487 /* Writer: pointers */ 3488 if (!partial) 3489 partial = chain + k-1; 3490 /* 3491 * If the branch acquired continuation since we've looked at it - 3492 * fine, it should all survive and (new) top doesn't belong to us.
3493 */ 3494 if (!partial->key && *partial->p) 3495 /* Writer: end */ 3496 goto no_top; 3497 for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--) 3498 ; 3499 /* 3500 * OK, we've found the last block that must survive. The rest of our 3501 * branch should be detached before unlocking. However, if that rest 3502 * of branch is all ours and does not grow immediately from the inode 3503 * it's easier to cheat and just decrement partial->p. 3504 */ 3505 if (p == chain + k - 1 && p > chain) { 3506 p->p--; 3507 } else { 3508 *top = *p->p; 3509 /* Nope, don't do this in ext4. Must leave the tree intact */ 3510 #if 0 3511 *p->p = 0; 3512 #endif 3513 } 3514 /* Writer: end */ 3515 3516 while (partial > p) { 3517 brelse(partial->bh); 3518 partial--; 3519 } 3520 no_top: 3521 return partial; 3522 } 3523 3524 /* 3525 * Zero a number of block pointers in either an inode or an indirect block. 3526 * If we restart the transaction we must again get write access to the 3527 * indirect block for further modification. 3528 * 3529 * We release `count' blocks on disk, but (last - first) may be greater 3530 * than `count' because there can be holes in there. 3531 */ 3532 static void ext4_clear_blocks(handle_t *handle, struct inode *inode, 3533 struct buffer_head *bh, ext4_fsblk_t block_to_free, 3534 unsigned long count, __le32 *first, __le32 *last) 3535 { 3536 __le32 *p; 3537 if (try_to_extend_transaction(handle, inode)) { 3538 if (bh) { 3539 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 3540 ext4_handle_dirty_metadata(handle, inode, bh); 3541 } 3542 ext4_mark_inode_dirty(handle, inode); 3543 ext4_journal_test_restart(handle, inode); 3544 if (bh) { 3545 BUFFER_TRACE(bh, "retaking write access"); 3546 ext4_journal_get_write_access(handle, bh); 3547 } 3548 } 3549 3550 /* 3551 * Any buffers which are on the journal will be in memory. We find 3552 * them on the hash table so jbd2_journal_revoke() will run jbd2_journal_forget() 3553 * on them. We've already detached each block from the file, so 3554 * bforget() in jbd2_journal_forget() should be safe. 3555 * 3556 * AKPM: turn on bforget in jbd2_journal_forget()!!! 3557 */ 3558 for (p = first; p < last; p++) { 3559 u32 nr = le32_to_cpu(*p); 3560 if (nr) { 3561 struct buffer_head *tbh; 3562 3563 *p = 0; 3564 tbh = sb_find_get_block(inode->i_sb, nr); 3565 ext4_forget(handle, 0, inode, tbh, nr); 3566 } 3567 } 3568 3569 ext4_free_blocks(handle, inode, block_to_free, count, 0); 3570 } 3571 3572 /** 3573 * ext4_free_data - free a list of data blocks 3574 * @handle: handle for this transaction 3575 * @inode: inode we are dealing with 3576 * @this_bh: indirect buffer_head which contains *@first and *@last 3577 * @first: array of block numbers 3578 * @last: points immediately past the end of array 3579 * 3580 * We are freeing all blocks refered from that array (numbers are stored as 3581 * little-endian 32-bit) and updating @inode->i_blocks appropriately. 3582 * 3583 * We accumulate contiguous runs of blocks to free. Conveniently, if these 3584 * blocks are contiguous then releasing them at one time will only affect one 3585 * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't 3586 * actually use a lot of journal space. 3587 * 3588 * @this_bh will be %NULL if @first and @last point into the inode's direct 3589 * block pointers. 
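 * Journal credits are topped up as we go: ext4_clear_blocks() extends or restarts the handle via try_to_extend_transaction() before each contiguous run of blocks is released.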
3590 */ 3591 static void ext4_free_data(handle_t *handle, struct inode *inode, 3592 struct buffer_head *this_bh, 3593 __le32 *first, __le32 *last) 3594 { 3595 ext4_fsblk_t block_to_free = 0; /* Starting block # of a run */ 3596 unsigned long count = 0; /* Number of blocks in the run */ 3597 __le32 *block_to_free_p = NULL; /* Pointer into inode/ind 3598 corresponding to 3599 block_to_free */ 3600 ext4_fsblk_t nr; /* Current block # */ 3601 __le32 *p; /* Pointer into inode/ind 3602 for current block */ 3603 int err; 3604 3605 if (this_bh) { /* For indirect block */ 3606 BUFFER_TRACE(this_bh, "get_write_access"); 3607 err = ext4_journal_get_write_access(handle, this_bh); 3608 /* Important: if we can't update the indirect pointers 3609 * to the blocks, we can't free them. */ 3610 if (err) 3611 return; 3612 } 3613 3614 for (p = first; p < last; p++) { 3615 nr = le32_to_cpu(*p); 3616 if (nr) { 3617 /* accumulate blocks to free if they're contiguous */ 3618 if (count == 0) { 3619 block_to_free = nr; 3620 block_to_free_p = p; 3621 count = 1; 3622 } else if (nr == block_to_free + count) { 3623 count++; 3624 } else { 3625 ext4_clear_blocks(handle, inode, this_bh, 3626 block_to_free, 3627 count, block_to_free_p, p); 3628 block_to_free = nr; 3629 block_to_free_p = p; 3630 count = 1; 3631 } 3632 } 3633 } 3634 3635 if (count > 0) 3636 ext4_clear_blocks(handle, inode, this_bh, block_to_free, 3637 count, block_to_free_p, p); 3638 3639 if (this_bh) { 3640 BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata"); 3641 3642 /* 3643 * The buffer head should have an attached journal head at this 3644 * point. However, if the data is corrupted and an indirect 3645 * block pointed to itself, it would have been detached when 3646 * the block was cleared. Check for this instead of OOPSing. 3647 */ 3648 if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh)) 3649 ext4_handle_dirty_metadata(handle, inode, this_bh); 3650 else 3651 ext4_error(inode->i_sb, __func__, 3652 "circular indirect block detected, " 3653 "inode=%lu, block=%llu", 3654 inode->i_ino, 3655 (unsigned long long) this_bh->b_blocknr); 3656 } 3657 } 3658 3659 /** 3660 * ext4_free_branches - free an array of branches 3661 * @handle: JBD handle for this transaction 3662 * @inode: inode we are dealing with 3663 * @parent_bh: the buffer_head which contains *@first and *@last 3664 * @first: array of block numbers 3665 * @last: pointer immediately past the end of array 3666 * @depth: depth of the branches to free 3667 * 3668 * We are freeing all blocks refered from these branches (numbers are 3669 * stored as little-endian 32-bit) and updating @inode->i_blocks 3670 * appropriately. 3671 */ 3672 static void ext4_free_branches(handle_t *handle, struct inode *inode, 3673 struct buffer_head *parent_bh, 3674 __le32 *first, __le32 *last, int depth) 3675 { 3676 ext4_fsblk_t nr; 3677 __le32 *p; 3678 3679 if (ext4_handle_is_aborted(handle)) 3680 return; 3681 3682 if (depth--) { 3683 struct buffer_head *bh; 3684 int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); 3685 p = last; 3686 while (--p >= first) { 3687 nr = le32_to_cpu(*p); 3688 if (!nr) 3689 continue; /* A hole */ 3690 3691 /* Go read the buffer for the next level down */ 3692 bh = sb_bread(inode->i_sb, nr); 3693 3694 /* 3695 * A read failure? Report error and clear slot 3696 * (should be rare). 3697 */ 3698 if (!bh) { 3699 ext4_error(inode->i_sb, "ext4_free_branches", 3700 "Read failure, inode=%lu, block=%llu", 3701 inode->i_ino, nr); 3702 continue; 3703 } 3704 3705 /* This zaps the entire block. Bottom up. 
*/ 3706 BUFFER_TRACE(bh, "free child branches"); 3707 ext4_free_branches(handle, inode, bh, 3708 (__le32 *) bh->b_data, 3709 (__le32 *) bh->b_data + addr_per_block, 3710 depth); 3711 3712 /* 3713 * We've probably journalled the indirect block several 3714 * times during the truncate. But it's no longer 3715 * needed and we now drop it from the transaction via 3716 * jbd2_journal_revoke(). 3717 * 3718 * That's easy if it's exclusively part of this 3719 * transaction. But if it's part of the committing 3720 * transaction then jbd2_journal_forget() will simply 3721 * brelse() it. That means that if the underlying 3722 * block is reallocated in ext4_get_block(), 3723 * unmap_underlying_metadata() will find this block 3724 * and will try to get rid of it. damn, damn. 3725 * 3726 * If this block has already been committed to the 3727 * journal, a revoke record will be written. And 3728 * revoke records must be emitted *before* clearing 3729 * this block's bit in the bitmaps. 3730 */ 3731 ext4_forget(handle, 1, inode, bh, bh->b_blocknr); 3732 3733 /* 3734 * Everything below this this pointer has been 3735 * released. Now let this top-of-subtree go. 3736 * 3737 * We want the freeing of this indirect block to be 3738 * atomic in the journal with the updating of the 3739 * bitmap block which owns it. So make some room in 3740 * the journal. 3741 * 3742 * We zero the parent pointer *after* freeing its 3743 * pointee in the bitmaps, so if extend_transaction() 3744 * for some reason fails to put the bitmap changes and 3745 * the release into the same transaction, recovery 3746 * will merely complain about releasing a free block, 3747 * rather than leaking blocks. 3748 */ 3749 if (ext4_handle_is_aborted(handle)) 3750 return; 3751 if (try_to_extend_transaction(handle, inode)) { 3752 ext4_mark_inode_dirty(handle, inode); 3753 ext4_journal_test_restart(handle, inode); 3754 } 3755 3756 ext4_free_blocks(handle, inode, nr, 1, 1); 3757 3758 if (parent_bh) { 3759 /* 3760 * The block which we have just freed is 3761 * pointed to by an indirect block: journal it 3762 */ 3763 BUFFER_TRACE(parent_bh, "get_write_access"); 3764 if (!ext4_journal_get_write_access(handle, 3765 parent_bh)){ 3766 *p = 0; 3767 BUFFER_TRACE(parent_bh, 3768 "call ext4_handle_dirty_metadata"); 3769 ext4_handle_dirty_metadata(handle, 3770 inode, 3771 parent_bh); 3772 } 3773 } 3774 } 3775 } else { 3776 /* We have reached the bottom of the tree. */ 3777 BUFFER_TRACE(parent_bh, "free data blocks"); 3778 ext4_free_data(handle, inode, parent_bh, first, last); 3779 } 3780 } 3781 3782 int ext4_can_truncate(struct inode *inode) 3783 { 3784 if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) 3785 return 0; 3786 if (S_ISREG(inode->i_mode)) 3787 return 1; 3788 if (S_ISDIR(inode->i_mode)) 3789 return 1; 3790 if (S_ISLNK(inode->i_mode)) 3791 return !ext4_inode_is_fast_symlink(inode); 3792 return 0; 3793 } 3794 3795 /* 3796 * ext4_truncate() 3797 * 3798 * We block out ext4_get_block() block instantiations across the entire 3799 * transaction, and VFS/VM ensures that ext4_truncate() cannot run 3800 * simultaneously on behalf of the same inode. 3801 * 3802 * As we work through the truncate and commmit bits of it to the journal there 3803 * is one core, guiding principle: the file's tree must always be consistent on 3804 * disk. We must be able to restart the truncate after a crash. 
3805 * 3806 * The file's tree may be transiently inconsistent in memory (although it 3807 * probably isn't), but whenever we close off and commit a journal transaction, 3808 * the contents of (the filesystem + the journal) must be consistent and 3809 * restartable. It's pretty simple, really: bottom up, right to left (although 3810 * left-to-right works OK too). 3811 * 3812 * Note that at recovery time, journal replay occurs *before* the restart of 3813 * truncate against the orphan inode list. 3814 * 3815 * The committed inode has the new, desired i_size (which is the same as 3816 * i_disksize in this case). After a crash, ext4_orphan_cleanup() will see 3817 * that this inode's truncate did not complete and it will again call 3818 * ext4_truncate() to have another go. So there will be instantiated blocks 3819 * to the right of the truncation point in a crashed ext4 filesystem. But 3820 * that's fine - as long as they are linked from the inode, the post-crash 3821 * ext4_truncate() run will find them and release them. 3822 */ 3823 void ext4_truncate(struct inode *inode) 3824 { 3825 handle_t *handle; 3826 struct ext4_inode_info *ei = EXT4_I(inode); 3827 __le32 *i_data = ei->i_data; 3828 int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); 3829 struct address_space *mapping = inode->i_mapping; 3830 ext4_lblk_t offsets[4]; 3831 Indirect chain[4]; 3832 Indirect *partial; 3833 __le32 nr = 0; 3834 int n; 3835 ext4_lblk_t last_block; 3836 unsigned blocksize = inode->i_sb->s_blocksize; 3837 3838 if (!ext4_can_truncate(inode)) 3839 return; 3840 3841 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { 3842 ext4_ext_truncate(inode); 3843 return; 3844 } 3845 3846 handle = start_transaction(inode); 3847 if (IS_ERR(handle)) 3848 return; /* AKPM: return what? */ 3849 3850 last_block = (inode->i_size + blocksize-1) 3851 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); 3852 3853 if (inode->i_size & (blocksize - 1)) 3854 if (ext4_block_truncate_page(handle, mapping, inode->i_size)) 3855 goto out_stop; 3856 3857 n = ext4_block_to_path(inode, last_block, offsets, NULL); 3858 if (n == 0) 3859 goto out_stop; /* error */ 3860 3861 /* 3862 * OK. This truncate is going to happen. We add the inode to the 3863 * orphan list, so that if this truncate spans multiple transactions, 3864 * and we crash, we will resume the truncate when the filesystem 3865 * recovers. It also marks the inode dirty, to catch the new size. 3866 * 3867 * Implication: the file must always be in a sane, consistent 3868 * truncatable state while each transaction commits. 3869 */ 3870 if (ext4_orphan_add(handle, inode)) 3871 goto out_stop; 3872 3873 /* 3874 * From here we block out all ext4_get_block() callers who want to 3875 * modify the block allocation tree. 3876 */ 3877 down_write(&ei->i_data_sem); 3878 3879 ext4_discard_preallocations(inode); 3880 3881 /* 3882 * The orphan list entry will now protect us from any crash which 3883 * occurs before the truncate completes, so it is now safe to propagate 3884 * the new, shorter inode size (held for now in i_size) into the 3885 * on-disk inode. We do this via i_disksize, which is the value which 3886 * ext4 *really* writes onto the disk inode. 
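 * (It is i_disksize, not i_size, that ext4_do_update_inode() later copies
 * into the raw on-disk inode via ext4_isize_set().)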
3887 */ 3888 ei->i_disksize = inode->i_size; 3889 3890 if (n == 1) { /* direct blocks */ 3891 ext4_free_data(handle, inode, NULL, i_data+offsets[0], 3892 i_data + EXT4_NDIR_BLOCKS); 3893 goto do_indirects; 3894 } 3895 3896 partial = ext4_find_shared(inode, n, offsets, chain, &nr); 3897 /* Kill the top of shared branch (not detached) */ 3898 if (nr) { 3899 if (partial == chain) { 3900 /* Shared branch grows from the inode */ 3901 ext4_free_branches(handle, inode, NULL, 3902 &nr, &nr+1, (chain+n-1) - partial); 3903 *partial->p = 0; 3904 /* 3905 * We mark the inode dirty prior to restart, 3906 * and prior to stop. No need for it here. 3907 */ 3908 } else { 3909 /* Shared branch grows from an indirect block */ 3910 BUFFER_TRACE(partial->bh, "get_write_access"); 3911 ext4_free_branches(handle, inode, partial->bh, 3912 partial->p, 3913 partial->p+1, (chain+n-1) - partial); 3914 } 3915 } 3916 /* Clear the ends of indirect blocks on the shared branch */ 3917 while (partial > chain) { 3918 ext4_free_branches(handle, inode, partial->bh, partial->p + 1, 3919 (__le32*)partial->bh->b_data+addr_per_block, 3920 (chain+n-1) - partial); 3921 BUFFER_TRACE(partial->bh, "call brelse"); 3922 brelse (partial->bh); 3923 partial--; 3924 } 3925 do_indirects: 3926 /* Kill the remaining (whole) subtrees */ 3927 switch (offsets[0]) { 3928 default: 3929 nr = i_data[EXT4_IND_BLOCK]; 3930 if (nr) { 3931 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1); 3932 i_data[EXT4_IND_BLOCK] = 0; 3933 } 3934 case EXT4_IND_BLOCK: 3935 nr = i_data[EXT4_DIND_BLOCK]; 3936 if (nr) { 3937 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2); 3938 i_data[EXT4_DIND_BLOCK] = 0; 3939 } 3940 case EXT4_DIND_BLOCK: 3941 nr = i_data[EXT4_TIND_BLOCK]; 3942 if (nr) { 3943 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3); 3944 i_data[EXT4_TIND_BLOCK] = 0; 3945 } 3946 case EXT4_TIND_BLOCK: 3947 ; 3948 } 3949 3950 up_write(&ei->i_data_sem); 3951 inode->i_mtime = inode->i_ctime = ext4_current_time(inode); 3952 ext4_mark_inode_dirty(handle, inode); 3953 3954 /* 3955 * In a multi-transaction truncate, we only make the final transaction 3956 * synchronous 3957 */ 3958 if (IS_SYNC(inode)) 3959 ext4_handle_sync(handle); 3960 out_stop: 3961 /* 3962 * If this was a simple ftruncate(), and the file will remain alive 3963 * then we need to clear up the orphan record which we created above. 3964 * However, if this was a real unlink then we were called by 3965 * ext4_delete_inode(), and we allow that function to clean up the 3966 * orphan info for us. 3967 */ 3968 if (inode->i_nlink) 3969 ext4_orphan_del(handle, inode); 3970 3971 ext4_journal_stop(handle); 3972 } 3973 3974 /* 3975 * ext4_get_inode_loc returns with an extra refcount against the inode's 3976 * underlying buffer_head on success. If 'in_mem' is true, we have all 3977 * data in memory that is needed to recreate the on-disk version of this 3978 * inode. 
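 * (ext4_get_inode_loc() below passes in_mem based on whether the inode
 * carries in-inode extended attributes, since those are the only on-disk
 * data not held in the in-core inode.)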
3979 */ 3980 static int __ext4_get_inode_loc(struct inode *inode, 3981 struct ext4_iloc *iloc, int in_mem) 3982 { 3983 struct ext4_group_desc *gdp; 3984 struct buffer_head *bh; 3985 struct super_block *sb = inode->i_sb; 3986 ext4_fsblk_t block; 3987 int inodes_per_block, inode_offset; 3988 3989 iloc->bh = NULL; 3990 if (!ext4_valid_inum(sb, inode->i_ino)) 3991 return -EIO; 3992 3993 iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb); 3994 gdp = ext4_get_group_desc(sb, iloc->block_group, NULL); 3995 if (!gdp) 3996 return -EIO; 3997 3998 /* 3999 * Figure out the offset within the block group inode table 4000 */ 4001 inodes_per_block = (EXT4_BLOCK_SIZE(sb) / EXT4_INODE_SIZE(sb)); 4002 inode_offset = ((inode->i_ino - 1) % 4003 EXT4_INODES_PER_GROUP(sb)); 4004 block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block); 4005 iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb); 4006 4007 bh = sb_getblk(sb, block); 4008 if (!bh) { 4009 ext4_error(sb, "ext4_get_inode_loc", "unable to read " 4010 "inode block - inode=%lu, block=%llu", 4011 inode->i_ino, block); 4012 return -EIO; 4013 } 4014 if (!buffer_uptodate(bh)) { 4015 lock_buffer(bh); 4016 4017 /* 4018 * If the buffer has the write error flag, we have failed 4019 * to write out another inode in the same block. In this 4020 * case, we don't have to read the block because we may 4021 * read the old inode data successfully. 4022 */ 4023 if (buffer_write_io_error(bh) && !buffer_uptodate(bh)) 4024 set_buffer_uptodate(bh); 4025 4026 if (buffer_uptodate(bh)) { 4027 /* someone brought it uptodate while we waited */ 4028 unlock_buffer(bh); 4029 goto has_buffer; 4030 } 4031 4032 /* 4033 * If we have all information of the inode in memory and this 4034 * is the only valid inode in the block, we need not read the 4035 * block. 4036 */ 4037 if (in_mem) { 4038 struct buffer_head *bitmap_bh; 4039 int i, start; 4040 4041 start = inode_offset & ~(inodes_per_block - 1); 4042 4043 /* Is the inode bitmap in cache? */ 4044 bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp)); 4045 if (!bitmap_bh) 4046 goto make_io; 4047 4048 /* 4049 * If the inode bitmap isn't in cache then the 4050 * optimisation may end up performing two reads instead 4051 * of one, so skip it. 4052 */ 4053 if (!buffer_uptodate(bitmap_bh)) { 4054 brelse(bitmap_bh); 4055 goto make_io; 4056 } 4057 for (i = start; i < start + inodes_per_block; i++) { 4058 if (i == inode_offset) 4059 continue; 4060 if (ext4_test_bit(i, bitmap_bh->b_data)) 4061 break; 4062 } 4063 brelse(bitmap_bh); 4064 if (i == start + inodes_per_block) { 4065 /* all other inodes are free, so skip I/O */ 4066 memset(bh->b_data, 0, bh->b_size); 4067 set_buffer_uptodate(bh); 4068 unlock_buffer(bh); 4069 goto has_buffer; 4070 } 4071 } 4072 4073 make_io: 4074 /* 4075 * If we need to do any I/O, try to pre-readahead extra 4076 * blocks from the inode table. 
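 * For example, with s_inode_readahead_blks = 32 the window starts at the
 * target block rounded down to a 32-block boundary (but never before the
 * start of the inode table) and covers roughly 32 blocks, clamped so that
 * we never read beyond this block group's portion of the inode table.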
4077 */ 4078 if (EXT4_SB(sb)->s_inode_readahead_blks) { 4079 ext4_fsblk_t b, end, table; 4080 unsigned num; 4081 4082 table = ext4_inode_table(sb, gdp); 4083 /* Make sure s_inode_readahead_blks is a power of 2 */ 4084 while (EXT4_SB(sb)->s_inode_readahead_blks & 4085 (EXT4_SB(sb)->s_inode_readahead_blks-1)) 4086 EXT4_SB(sb)->s_inode_readahead_blks = 4087 (EXT4_SB(sb)->s_inode_readahead_blks & 4088 (EXT4_SB(sb)->s_inode_readahead_blks-1)); 4089 b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1); 4090 if (table > b) 4091 b = table; 4092 end = b + EXT4_SB(sb)->s_inode_readahead_blks; 4093 num = EXT4_INODES_PER_GROUP(sb); 4094 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, 4095 EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) 4096 num -= ext4_itable_unused_count(sb, gdp); 4097 table += num / inodes_per_block; 4098 if (end > table) 4099 end = table; 4100 while (b <= end) 4101 sb_breadahead(sb, b++); 4102 } 4103 4104 /* 4105 * There are other valid inodes in the buffer, this inode 4106 * has in-inode xattrs, or we don't have this inode in memory. 4107 * Read the block from disk. 4108 */ 4109 get_bh(bh); 4110 bh->b_end_io = end_buffer_read_sync; 4111 submit_bh(READ_META, bh); 4112 wait_on_buffer(bh); 4113 if (!buffer_uptodate(bh)) { 4114 ext4_error(sb, __func__, 4115 "unable to read inode block - inode=%lu, " 4116 "block=%llu", inode->i_ino, block); 4117 brelse(bh); 4118 return -EIO; 4119 } 4120 } 4121 has_buffer: 4122 iloc->bh = bh; 4123 return 0; 4124 } 4125 4126 int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc) 4127 { 4128 /* We have all inode data except xattrs in memory here. */ 4129 return __ext4_get_inode_loc(inode, iloc, 4130 !(EXT4_I(inode)->i_state & EXT4_STATE_XATTR)); 4131 } 4132 4133 void ext4_set_inode_flags(struct inode *inode) 4134 { 4135 unsigned int flags = EXT4_I(inode)->i_flags; 4136 4137 inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); 4138 if (flags & EXT4_SYNC_FL) 4139 inode->i_flags |= S_SYNC; 4140 if (flags & EXT4_APPEND_FL) 4141 inode->i_flags |= S_APPEND; 4142 if (flags & EXT4_IMMUTABLE_FL) 4143 inode->i_flags |= S_IMMUTABLE; 4144 if (flags & EXT4_NOATIME_FL) 4145 inode->i_flags |= S_NOATIME; 4146 if (flags & EXT4_DIRSYNC_FL) 4147 inode->i_flags |= S_DIRSYNC; 4148 } 4149 4150 /* Propagate flags from i_flags to EXT4_I(inode)->i_flags */ 4151 void ext4_get_inode_flags(struct ext4_inode_info *ei) 4152 { 4153 unsigned int flags = ei->vfs_inode.i_flags; 4154 4155 ei->i_flags &= ~(EXT4_SYNC_FL|EXT4_APPEND_FL| 4156 EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL|EXT4_DIRSYNC_FL); 4157 if (flags & S_SYNC) 4158 ei->i_flags |= EXT4_SYNC_FL; 4159 if (flags & S_APPEND) 4160 ei->i_flags |= EXT4_APPEND_FL; 4161 if (flags & S_IMMUTABLE) 4162 ei->i_flags |= EXT4_IMMUTABLE_FL; 4163 if (flags & S_NOATIME) 4164 ei->i_flags |= EXT4_NOATIME_FL; 4165 if (flags & S_DIRSYNC) 4166 ei->i_flags |= EXT4_DIRSYNC_FL; 4167 } 4168 static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, 4169 struct ext4_inode_info *ei) 4170 { 4171 blkcnt_t i_blocks ; 4172 struct inode *inode = &(ei->vfs_inode); 4173 struct super_block *sb = inode->i_sb; 4174 4175 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, 4176 EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) { 4177 /* we are using combined 48 bit field */ 4178 i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 | 4179 le32_to_cpu(raw_inode->i_blocks_lo); 4180 if (ei->i_flags & EXT4_HUGE_FILE_FL) { 4181 /* i_blocks represent file system block size */ 4182 return i_blocks << (inode->i_blkbits - 9); 4183 } else { 4184 return i_blocks; 4185 } 4186 } else { 4187 return 
le32_to_cpu(raw_inode->i_blocks_lo); 4188 } 4189 } 4190 4191 struct inode *ext4_iget(struct super_block *sb, unsigned long ino) 4192 { 4193 struct ext4_iloc iloc; 4194 struct ext4_inode *raw_inode; 4195 struct ext4_inode_info *ei; 4196 struct buffer_head *bh; 4197 struct inode *inode; 4198 long ret; 4199 int block; 4200 4201 inode = iget_locked(sb, ino); 4202 if (!inode) 4203 return ERR_PTR(-ENOMEM); 4204 if (!(inode->i_state & I_NEW)) 4205 return inode; 4206 4207 ei = EXT4_I(inode); 4208 #ifdef CONFIG_EXT4_FS_POSIX_ACL 4209 ei->i_acl = EXT4_ACL_NOT_CACHED; 4210 ei->i_default_acl = EXT4_ACL_NOT_CACHED; 4211 #endif 4212 4213 ret = __ext4_get_inode_loc(inode, &iloc, 0); 4214 if (ret < 0) 4215 goto bad_inode; 4216 bh = iloc.bh; 4217 raw_inode = ext4_raw_inode(&iloc); 4218 inode->i_mode = le16_to_cpu(raw_inode->i_mode); 4219 inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); 4220 inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); 4221 if (!(test_opt(inode->i_sb, NO_UID32))) { 4222 inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; 4223 inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; 4224 } 4225 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); 4226 4227 ei->i_state = 0; 4228 ei->i_dir_start_lookup = 0; 4229 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); 4230 /* We now have enough fields to check if the inode was active or not. 4231 * This is needed because nfsd might try to access dead inodes 4232 * the test is that same one that e2fsck uses 4233 * NeilBrown 1999oct15 4234 */ 4235 if (inode->i_nlink == 0) { 4236 if (inode->i_mode == 0 || 4237 !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) { 4238 /* this inode is deleted */ 4239 brelse(bh); 4240 ret = -ESTALE; 4241 goto bad_inode; 4242 } 4243 /* The only unlinked inodes we let through here have 4244 * valid i_mode and are being read by the orphan 4245 * recovery code: that's fine, we're about to complete 4246 * the process of deleting those. */ 4247 } 4248 ei->i_flags = le32_to_cpu(raw_inode->i_flags); 4249 inode->i_blocks = ext4_inode_blocks(raw_inode, ei); 4250 ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo); 4251 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != 4252 cpu_to_le32(EXT4_OS_HURD)) { 4253 ei->i_file_acl |= 4254 ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; 4255 } 4256 inode->i_size = ext4_isize(raw_inode); 4257 ei->i_disksize = inode->i_size; 4258 inode->i_generation = le32_to_cpu(raw_inode->i_generation); 4259 ei->i_block_group = iloc.block_group; 4260 /* 4261 * NOTE! The in-memory inode i_data array is in little-endian order 4262 * even on big-endian machines: we do NOT byteswap the block numbers! 4263 */ 4264 for (block = 0; block < EXT4_N_BLOCKS; block++) 4265 ei->i_data[block] = raw_inode->i_block[block]; 4266 INIT_LIST_HEAD(&ei->i_orphan); 4267 4268 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { 4269 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); 4270 if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > 4271 EXT4_INODE_SIZE(inode->i_sb)) { 4272 brelse(bh); 4273 ret = -EIO; 4274 goto bad_inode; 4275 } 4276 if (ei->i_extra_isize == 0) { 4277 /* The extra space is currently unused. Use it. 
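 * (i_extra_isize then covers the whole fixed part of the large inode,
 * i.e. sizeof(struct ext4_inode) minus the good-old 128 bytes.)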
*/ 4278 ei->i_extra_isize = sizeof(struct ext4_inode) - 4279 EXT4_GOOD_OLD_INODE_SIZE; 4280 } else { 4281 __le32 *magic = (void *)raw_inode + 4282 EXT4_GOOD_OLD_INODE_SIZE + 4283 ei->i_extra_isize; 4284 if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) 4285 ei->i_state |= EXT4_STATE_XATTR; 4286 } 4287 } else 4288 ei->i_extra_isize = 0; 4289 4290 EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode); 4291 EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode); 4292 EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode); 4293 EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode); 4294 4295 inode->i_version = le32_to_cpu(raw_inode->i_disk_version); 4296 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { 4297 if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) 4298 inode->i_version |= 4299 (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; 4300 } 4301 4302 if (S_ISREG(inode->i_mode)) { 4303 inode->i_op = &ext4_file_inode_operations; 4304 inode->i_fop = &ext4_file_operations; 4305 ext4_set_aops(inode); 4306 } else if (S_ISDIR(inode->i_mode)) { 4307 inode->i_op = &ext4_dir_inode_operations; 4308 inode->i_fop = &ext4_dir_operations; 4309 } else if (S_ISLNK(inode->i_mode)) { 4310 if (ext4_inode_is_fast_symlink(inode)) { 4311 inode->i_op = &ext4_fast_symlink_inode_operations; 4312 nd_terminate_link(ei->i_data, inode->i_size, 4313 sizeof(ei->i_data) - 1); 4314 } else { 4315 inode->i_op = &ext4_symlink_inode_operations; 4316 ext4_set_aops(inode); 4317 } 4318 } else { 4319 inode->i_op = &ext4_special_inode_operations; 4320 if (raw_inode->i_block[0]) 4321 init_special_inode(inode, inode->i_mode, 4322 old_decode_dev(le32_to_cpu(raw_inode->i_block[0]))); 4323 else 4324 init_special_inode(inode, inode->i_mode, 4325 new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); 4326 } 4327 brelse(iloc.bh); 4328 ext4_set_inode_flags(inode); 4329 unlock_new_inode(inode); 4330 return inode; 4331 4332 bad_inode: 4333 iget_failed(inode); 4334 return ERR_PTR(ret); 4335 } 4336 4337 static int ext4_inode_blocks_set(handle_t *handle, 4338 struct ext4_inode *raw_inode, 4339 struct ext4_inode_info *ei) 4340 { 4341 struct inode *inode = &(ei->vfs_inode); 4342 u64 i_blocks = inode->i_blocks; 4343 struct super_block *sb = inode->i_sb; 4344 4345 if (i_blocks <= ~0U) { 4346 /* 4347 * i_blocks can be represnted in a 32 bit variable 4348 * as multiple of 512 bytes 4349 */ 4350 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); 4351 raw_inode->i_blocks_high = 0; 4352 ei->i_flags &= ~EXT4_HUGE_FILE_FL; 4353 return 0; 4354 } 4355 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) 4356 return -EFBIG; 4357 4358 if (i_blocks <= 0xffffffffffffULL) { 4359 /* 4360 * i_blocks can be represented in a 48 bit variable 4361 * as multiple of 512 bytes 4362 */ 4363 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); 4364 raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); 4365 ei->i_flags &= ~EXT4_HUGE_FILE_FL; 4366 } else { 4367 ei->i_flags |= EXT4_HUGE_FILE_FL; 4368 /* i_block is stored in file system block size */ 4369 i_blocks = i_blocks >> (inode->i_blkbits - 9); 4370 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); 4371 raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); 4372 } 4373 return 0; 4374 } 4375 4376 /* 4377 * Post the struct inode info into an on-disk inode location in the 4378 * buffer-cache. This gobbles the caller's reference to the 4379 * buffer_head in the inode location struct. 4380 * 4381 * The caller must have write access to iloc->bh. 
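 * (This is why ext4_mark_iloc_dirty() takes an extra get_bh() reference
 * before calling in here, and why every exit path below goes through
 * brelse(bh).)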
4382 */ 4383 static int ext4_do_update_inode(handle_t *handle, 4384 struct inode *inode, 4385 struct ext4_iloc *iloc) 4386 { 4387 struct ext4_inode *raw_inode = ext4_raw_inode(iloc); 4388 struct ext4_inode_info *ei = EXT4_I(inode); 4389 struct buffer_head *bh = iloc->bh; 4390 int err = 0, rc, block; 4391 4392 /* For fields not tracked in the in-memory inode, 4393 * initialise them to zero for new inodes. */ 4394 if (ei->i_state & EXT4_STATE_NEW) 4395 memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); 4396 4397 ext4_get_inode_flags(ei); 4398 raw_inode->i_mode = cpu_to_le16(inode->i_mode); 4399 if (!(test_opt(inode->i_sb, NO_UID32))) { 4400 raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid)); 4401 raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid)); 4402 /* 4403 * Fix up interoperability with old kernels. Otherwise, old inodes get 4404 * re-used with the upper 16 bits of the uid/gid intact 4405 */ 4406 if (!ei->i_dtime) { 4407 raw_inode->i_uid_high = 4408 cpu_to_le16(high_16_bits(inode->i_uid)); 4409 raw_inode->i_gid_high = 4410 cpu_to_le16(high_16_bits(inode->i_gid)); 4411 } else { 4412 raw_inode->i_uid_high = 0; 4413 raw_inode->i_gid_high = 0; 4414 } 4415 } else { 4416 raw_inode->i_uid_low = 4417 cpu_to_le16(fs_high2lowuid(inode->i_uid)); 4418 raw_inode->i_gid_low = 4419 cpu_to_le16(fs_high2lowgid(inode->i_gid)); 4420 raw_inode->i_uid_high = 0; 4421 raw_inode->i_gid_high = 0; 4422 } 4423 raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); 4424 4425 EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode); 4426 EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode); 4427 EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode); 4428 EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode); 4429 4430 if (ext4_inode_blocks_set(handle, raw_inode, ei)) 4431 goto out_brelse; 4432 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); 4433 /* clear the migrate flag in the raw_inode */ 4434 raw_inode->i_flags = cpu_to_le32(ei->i_flags & ~EXT4_EXT_MIGRATE); 4435 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != 4436 cpu_to_le32(EXT4_OS_HURD)) 4437 raw_inode->i_file_acl_high = 4438 cpu_to_le16(ei->i_file_acl >> 32); 4439 raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); 4440 ext4_isize_set(raw_inode, ei->i_disksize); 4441 if (ei->i_disksize > 0x7fffffffULL) { 4442 struct super_block *sb = inode->i_sb; 4443 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, 4444 EXT4_FEATURE_RO_COMPAT_LARGE_FILE) || 4445 EXT4_SB(sb)->s_es->s_rev_level == 4446 cpu_to_le32(EXT4_GOOD_OLD_REV)) { 4447 /* If this is the first large file 4448 * created, add a flag to the superblock.
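 * (The EXT4_FEATURE_RO_COMPAT_LARGE_FILE flag is set and the superblock
 * change is committed synchronously below, so older kernels that do not
 * understand large files will refuse to mount the filesystem read-write.)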
4449 */ 4450 err = ext4_journal_get_write_access(handle, 4451 EXT4_SB(sb)->s_sbh); 4452 if (err) 4453 goto out_brelse; 4454 ext4_update_dynamic_rev(sb); 4455 EXT4_SET_RO_COMPAT_FEATURE(sb, 4456 EXT4_FEATURE_RO_COMPAT_LARGE_FILE); 4457 sb->s_dirt = 1; 4458 ext4_handle_sync(handle); 4459 err = ext4_handle_dirty_metadata(handle, inode, 4460 EXT4_SB(sb)->s_sbh); 4461 } 4462 } 4463 raw_inode->i_generation = cpu_to_le32(inode->i_generation); 4464 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { 4465 if (old_valid_dev(inode->i_rdev)) { 4466 raw_inode->i_block[0] = 4467 cpu_to_le32(old_encode_dev(inode->i_rdev)); 4468 raw_inode->i_block[1] = 0; 4469 } else { 4470 raw_inode->i_block[0] = 0; 4471 raw_inode->i_block[1] = 4472 cpu_to_le32(new_encode_dev(inode->i_rdev)); 4473 raw_inode->i_block[2] = 0; 4474 } 4475 } else for (block = 0; block < EXT4_N_BLOCKS; block++) 4476 raw_inode->i_block[block] = ei->i_data[block]; 4477 4478 raw_inode->i_disk_version = cpu_to_le32(inode->i_version); 4479 if (ei->i_extra_isize) { 4480 if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) 4481 raw_inode->i_version_hi = 4482 cpu_to_le32(inode->i_version >> 32); 4483 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); 4484 } 4485 4486 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 4487 rc = ext4_handle_dirty_metadata(handle, inode, bh); 4488 if (!err) 4489 err = rc; 4490 ei->i_state &= ~EXT4_STATE_NEW; 4491 4492 out_brelse: 4493 brelse(bh); 4494 ext4_std_error(inode->i_sb, err); 4495 return err; 4496 } 4497 4498 /* 4499 * ext4_write_inode() 4500 * 4501 * We are called from a few places: 4502 * 4503 * - Within generic_file_write() for O_SYNC files. 4504 * Here, there will be no transaction running. We wait for any running 4505 * transaction to commit. 4506 * 4507 * - Within sys_sync(), kupdate and such. 4508 * We wait on commit, if told to. 4509 * 4510 * - Within prune_icache() (PF_MEMALLOC == true) 4511 * Here we simply return. We can't afford to block kswapd on the 4512 * journal commit. 4513 * 4514 * In all cases it is actually safe for us to return without doing anything, 4515 * because the inode has been copied into a raw inode buffer in 4516 * ext4_mark_inode_dirty(). This is a correctness thing for O_SYNC and for 4517 * knfsd. 4518 * 4519 * Note that we are absolutely dependent upon all inode dirtiers doing the 4520 * right thing: they *must* call mark_inode_dirty() after dirtying info in 4521 * which we are interested. 4522 * 4523 * It would be a bug for them to not do this. The code: 4524 * 4525 * mark_inode_dirty(inode) 4526 * stuff(); 4527 * inode->i_size = expr; 4528 * 4529 * is in error because a kswapd-driven write_inode() could occur while 4530 * `stuff()' is running, and the new i_size will be lost. Plus the inode 4531 * will no longer be on the superblock's dirty inode list.
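 * The correct ordering is, of course:
 *
 *	stuff();
 *	inode->i_size = expr;
 *	mark_inode_dirty(inode);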
4532 */ 4533 int ext4_write_inode(struct inode *inode, int wait) 4534 { 4535 if (current->flags & PF_MEMALLOC) 4536 return 0; 4537 4538 if (ext4_journal_current_handle()) { 4539 jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n"); 4540 dump_stack(); 4541 return -EIO; 4542 } 4543 4544 if (!wait) 4545 return 0; 4546 4547 return ext4_force_commit(inode->i_sb); 4548 } 4549 4550 int __ext4_write_dirty_metadata(struct inode *inode, struct buffer_head *bh) 4551 { 4552 int err = 0; 4553 4554 mark_buffer_dirty(bh); 4555 if (inode && inode_needs_sync(inode)) { 4556 sync_dirty_buffer(bh); 4557 if (buffer_req(bh) && !buffer_uptodate(bh)) { 4558 ext4_error(inode->i_sb, __func__, 4559 "IO error syncing inode, " 4560 "inode=%lu, block=%llu", 4561 inode->i_ino, 4562 (unsigned long long)bh->b_blocknr); 4563 err = -EIO; 4564 } 4565 } 4566 return err; 4567 } 4568 4569 /* 4570 * ext4_setattr() 4571 * 4572 * Called from notify_change. 4573 * 4574 * We want to trap VFS attempts to truncate the file as soon as 4575 * possible. In particular, we want to make sure that when the VFS 4576 * shrinks i_size, we put the inode on the orphan list and modify 4577 * i_disksize immediately, so that during the subsequent flushing of 4578 * dirty pages and freeing of disk blocks, we can guarantee that any 4579 * commit will leave the blocks being flushed in an unused state on 4580 * disk. (On recovery, the inode will get truncated and the blocks will 4581 * be freed, so we have a strong guarantee that no future commit will 4582 * leave these blocks visible to the user.) 4583 * 4584 * Another thing we have to assure is that if we are in ordered mode 4585 * and inode is still attached to the committing transaction, we must 4586 * we start writeout of all the dirty pages which are being truncated. 4587 * This way we are sure that all the data written in the previous 4588 * transaction are already on disk (truncate waits for pages under 4589 * writeback). 4590 * 4591 * Called with inode->i_mutex down. 4592 */ 4593 int ext4_setattr(struct dentry *dentry, struct iattr *attr) 4594 { 4595 struct inode *inode = dentry->d_inode; 4596 int error, rc = 0; 4597 const unsigned int ia_valid = attr->ia_valid; 4598 4599 error = inode_change_ok(inode, attr); 4600 if (error) 4601 return error; 4602 4603 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || 4604 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { 4605 handle_t *handle; 4606 4607 /* (user+group)*(old+new) structure, inode write (sb, 4608 * inode block, ? - but truncate inode update has it) */ 4609 handle = ext4_journal_start(inode, 2*(EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)+ 4610 EXT4_QUOTA_DEL_BLOCKS(inode->i_sb))+3); 4611 if (IS_ERR(handle)) { 4612 error = PTR_ERR(handle); 4613 goto err_out; 4614 } 4615 error = DQUOT_TRANSFER(inode, attr) ? 
-EDQUOT : 0; 4616 if (error) { 4617 ext4_journal_stop(handle); 4618 return error; 4619 } 4620 /* Update corresponding info in inode so that everything is in 4621 * one transaction */ 4622 if (attr->ia_valid & ATTR_UID) 4623 inode->i_uid = attr->ia_uid; 4624 if (attr->ia_valid & ATTR_GID) 4625 inode->i_gid = attr->ia_gid; 4626 error = ext4_mark_inode_dirty(handle, inode); 4627 ext4_journal_stop(handle); 4628 } 4629 4630 if (attr->ia_valid & ATTR_SIZE) { 4631 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) { 4632 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 4633 4634 if (attr->ia_size > sbi->s_bitmap_maxbytes) { 4635 error = -EFBIG; 4636 goto err_out; 4637 } 4638 } 4639 } 4640 4641 if (S_ISREG(inode->i_mode) && 4642 attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) { 4643 handle_t *handle; 4644 4645 handle = ext4_journal_start(inode, 3); 4646 if (IS_ERR(handle)) { 4647 error = PTR_ERR(handle); 4648 goto err_out; 4649 } 4650 4651 error = ext4_orphan_add(handle, inode); 4652 EXT4_I(inode)->i_disksize = attr->ia_size; 4653 rc = ext4_mark_inode_dirty(handle, inode); 4654 if (!error) 4655 error = rc; 4656 ext4_journal_stop(handle); 4657 4658 if (ext4_should_order_data(inode)) { 4659 error = ext4_begin_ordered_truncate(inode, 4660 attr->ia_size); 4661 if (error) { 4662 /* Do as much error cleanup as possible */ 4663 handle = ext4_journal_start(inode, 3); 4664 if (IS_ERR(handle)) { 4665 ext4_orphan_del(NULL, inode); 4666 goto err_out; 4667 } 4668 ext4_orphan_del(handle, inode); 4669 ext4_journal_stop(handle); 4670 goto err_out; 4671 } 4672 } 4673 } 4674 4675 rc = inode_setattr(inode, attr); 4676 4677 /* If inode_setattr's call to ext4_truncate failed to get a 4678 * transaction handle at all, we need to clean up the in-core 4679 * orphan list manually. */ 4680 if (inode->i_nlink) 4681 ext4_orphan_del(NULL, inode); 4682 4683 if (!rc && (ia_valid & ATTR_MODE)) 4684 rc = ext4_acl_chmod(inode); 4685 4686 err_out: 4687 ext4_std_error(inode->i_sb, error); 4688 if (!error) 4689 error = rc; 4690 return error; 4691 } 4692 4693 int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, 4694 struct kstat *stat) 4695 { 4696 struct inode *inode; 4697 unsigned long delalloc_blocks; 4698 4699 inode = dentry->d_inode; 4700 generic_fillattr(inode, stat); 4701 4702 /* 4703 * We can't update i_blocks if the block allocation is delayed; 4704 * otherwise, in the case of a system crash before the real block 4705 * allocation is done, we would have i_blocks inconsistent with 4706 * the on-disk file blocks. 4707 * We always keep i_blocks updated together with the real 4708 * allocation. But to avoid confusing userspace, stat 4709 * will return the blocks that include the delayed allocation 4710 * blocks for this file.
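 * (The reserved blocks are converted below from filesystem blocks to the
 * 512-byte units that stat reports; with 4k blocks, for example, each
 * reserved block adds 8 sectors to stat->blocks.)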
4711 */ 4712 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 4713 delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks; 4714 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 4715 4716 stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9; 4717 return 0; 4718 } 4719 4720 static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks, 4721 int chunk) 4722 { 4723 int indirects; 4724 4725 /* if nrblocks are contiguous */ 4726 if (chunk) { 4727 /* 4728 * With N contiguous data blocks, we need at most 4729 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) indirect blocks, 4730 * 2 dindirect blocks, and 4731 * 1 tindirect block 4732 */ 4733 indirects = nrblocks / EXT4_ADDR_PER_BLOCK(inode->i_sb); 4734 return indirects + 3; 4735 } 4736 /* 4737 * If nrblocks are not contiguous, then, in the worst case, each block 4738 * touches an indirect block, and each indirect block touches a double 4739 * indirect block, plus a triple indirect block 4740 */ 4741 indirects = nrblocks * 2 + 1; 4742 return indirects; 4743 } 4744 4745 static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) 4746 { 4747 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) 4748 return ext4_indirect_trans_blocks(inode, nrblocks, chunk); 4749 return ext4_ext_index_trans_blocks(inode, nrblocks, chunk); 4750 } 4751 4752 /* 4753 * Account for index blocks, block group bitmaps and block group 4754 * descriptor blocks if we modify data blocks and index blocks. 4755 * In the worst case, the index blocks are spread over different block groups. 4756 * 4757 * If data blocks are discontiguous, they may spread over 4758 * different block groups too. If they are contiguous, with flexbg, 4759 * they could still cross a block group boundary. 4760 * 4761 * Also account for superblock, inode, quota and xattr blocks 4762 */ 4763 int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) 4764 { 4765 int groups, gdpblocks; 4766 int idxblocks; 4767 int ret = 0; 4768 4769 /* 4770 * How many index blocks do we need to touch to modify nrblocks? 4771 * The "chunk" flag indicates whether the nrblocks are 4772 * physically contiguous on disk 4773 * 4774 * Direct IO and fallocate call get_block to allocate 4775 * one single extent at a time, so they can set the "chunk" flag 4776 */ 4777 idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk); 4778 4779 ret = idxblocks; 4780 4781 /* 4782 * Now let's see how many group bitmaps and group descriptors we need 4783 * to account for 4784 */ 4785 groups = idxblocks; 4786 if (chunk) 4787 groups += 1; 4788 else 4789 groups += nrblocks; 4790 4791 gdpblocks = groups; 4792 if (groups > EXT4_SB(inode->i_sb)->s_groups_count) 4793 groups = EXT4_SB(inode->i_sb)->s_groups_count; 4794 if (groups > EXT4_SB(inode->i_sb)->s_gdb_count) 4795 gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count; 4796 4797 /* bitmaps and block group descriptor blocks */ 4798 ret += groups + gdpblocks; 4799 4800 /* Blocks for super block, inode, quota and xattr blocks */ 4801 ret += EXT4_META_TRANS_BLOCKS(inode->i_sb); 4802 4803 return ret; 4804 } 4805 4806 /* 4807 * Calculate the total number of credits to reserve to fit 4808 * the modification of a single page into a single transaction, 4809 * which may include multiple chunks of block allocations. 4810 * 4811 * This could be called via ext4_write_begin() 4812 * 4813 * We need to consider the worst case, when 4814 * there is one new block per extent.
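 * As a rough worked example: for an indirect-mapped file on a journalled
 * filesystem with 1k blocks and 4k pages, bpp = 4 and chunk = 0, so
 * ext4_indirect_trans_blocks() returns 4 * 2 + 1 = 9 index blocks, and
 * ext4_meta_trans_blocks() then adds (before the per-filesystem clamps)
 * 9 + 4 = 13 bitmap blocks and 13 group descriptor blocks, plus
 * EXT4_META_TRANS_BLOCKS() for the superblock, inode, quota and xattr
 * blocks.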
4815 */ 4816 int ext4_writepage_trans_blocks(struct inode *inode) 4817 { 4818 int bpp = ext4_journal_blocks_per_page(inode); 4819 int ret; 4820 4821 ret = ext4_meta_trans_blocks(inode, bpp, 0); 4822 4823 /* Account for data blocks for journalled mode */ 4824 if (ext4_should_journal_data(inode)) 4825 ret += bpp; 4826 return ret; 4827 } 4828 4829 /* 4830 * Calculate the journal credits for a chunk of data modification. 4831 * 4832 * This is called from DIO, fallocate or whatever else calls 4833 * ext4_get_blocks_wrap() to map/allocate a chunk of contiguous disk blocks. 4834 * 4835 * Journal buffers for data blocks are not included here, as DIO 4836 * and fallocate do not need to journal data buffers. 4837 */ 4838 int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks) 4839 { 4840 return ext4_meta_trans_blocks(inode, nrblocks, 1); 4841 } 4842 4843 /* 4844 * The caller must have previously called ext4_reserve_inode_write(). 4845 * Given this, we know that the caller already has write access to iloc->bh. 4846 */ 4847 int ext4_mark_iloc_dirty(handle_t *handle, 4848 struct inode *inode, struct ext4_iloc *iloc) 4849 { 4850 int err = 0; 4851 4852 if (test_opt(inode->i_sb, I_VERSION)) 4853 inode_inc_iversion(inode); 4854 4855 /* the do_update_inode consumes one bh->b_count */ 4856 get_bh(iloc->bh); 4857 4858 /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */ 4859 err = ext4_do_update_inode(handle, inode, iloc); 4860 put_bh(iloc->bh); 4861 return err; 4862 } 4863 4864 /* 4865 * On success, we end up with an outstanding reference count against 4866 * iloc->bh. This _must_ be cleaned up later. 4867 */ 4868 4869 int 4870 ext4_reserve_inode_write(handle_t *handle, struct inode *inode, 4871 struct ext4_iloc *iloc) 4872 { 4873 int err; 4874 4875 err = ext4_get_inode_loc(inode, iloc); 4876 if (!err) { 4877 BUFFER_TRACE(iloc->bh, "get_write_access"); 4878 err = ext4_journal_get_write_access(handle, iloc->bh); 4879 if (err) { 4880 brelse(iloc->bh); 4881 iloc->bh = NULL; 4882 } 4883 } 4884 ext4_std_error(inode->i_sb, err); 4885 return err; 4886 } 4887 4888 /* 4889 * Expand an inode by new_extra_isize bytes. 4890 * Returns 0 on success or negative error number on failure. 4891 */ 4892 static int ext4_expand_extra_isize(struct inode *inode, 4893 unsigned int new_extra_isize, 4894 struct ext4_iloc iloc, 4895 handle_t *handle) 4896 { 4897 struct ext4_inode *raw_inode; 4898 struct ext4_xattr_ibody_header *header; 4899 struct ext4_xattr_entry *entry; 4900 4901 if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) 4902 return 0; 4903 4904 raw_inode = ext4_raw_inode(&iloc); 4905 4906 header = IHDR(inode, raw_inode); 4907 entry = IFIRST(header); 4908 4909 /* No extended attributes present */ 4910 if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR) || 4911 header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { 4912 memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0, 4913 new_extra_isize); 4914 EXT4_I(inode)->i_extra_isize = new_extra_isize; 4915 return 0; 4916 } 4917 4918 /* try to expand with EAs present */ 4919 return ext4_expand_extra_isize_ea(inode, new_extra_isize, 4920 raw_inode, handle); 4921 } 4922 4923 /* 4924 * What we do here is to mark the in-core inode as clean with respect to inode 4925 * dirtiness (it may still be data-dirty). 4926 * This means that the in-core inode may be reaped by prune_icache 4927 * without having to perform any I/O. This is a very good thing, 4928 * because *any* task may call prune_icache - even ones which 4929 * have a transaction open against a different journal.
4930 * 4931 * Is this cheating? Not really. Sure, we haven't written the 4932 * inode out, but prune_icache isn't a user-visible syncing function. 4933 * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync) 4934 * we start and wait on commits. 4935 * 4936 * Is this efficient/effective? Well, we're being nice to the system 4937 * by cleaning up our inodes proactively so they can be reaped 4938 * without I/O. But we are potentially leaving up to five seconds' 4939 * worth of inodes floating about which prune_icache wants us to 4940 * write out. One way to fix that would be to get prune_icache() 4941 * to do a write_super() to free up some memory. It has the desired 4942 * effect. 4943 */ 4944 int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) 4945 { 4946 struct ext4_iloc iloc; 4947 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 4948 static unsigned int mnt_count; 4949 int err, ret; 4950 4951 might_sleep(); 4952 err = ext4_reserve_inode_write(handle, inode, &iloc); 4953 if (ext4_handle_valid(handle) && 4954 EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && 4955 !(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) { 4956 /* 4957 * We need extra buffer credits since we may write into EA block 4958 * with this same handle. If journal_extend fails, then it will 4959 * only result in a minor loss of functionality for that inode. 4960 * If this is felt to be critical, then e2fsck should be run to 4961 * force a large enough s_min_extra_isize. 4962 */ 4963 if ((jbd2_journal_extend(handle, 4964 EXT4_DATA_TRANS_BLOCKS(inode->i_sb))) == 0) { 4965 ret = ext4_expand_extra_isize(inode, 4966 sbi->s_want_extra_isize, 4967 iloc, handle); 4968 if (ret) { 4969 EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND; 4970 if (mnt_count != 4971 le16_to_cpu(sbi->s_es->s_mnt_count)) { 4972 ext4_warning(inode->i_sb, __func__, 4973 "Unable to expand inode %lu. Delete" 4974 " some EAs or run e2fsck.", 4975 inode->i_ino); 4976 mnt_count = 4977 le16_to_cpu(sbi->s_es->s_mnt_count); 4978 } 4979 } 4980 } 4981 } 4982 if (!err) 4983 err = ext4_mark_iloc_dirty(handle, inode, &iloc); 4984 return err; 4985 } 4986 4987 /* 4988 * ext4_dirty_inode() is called from __mark_inode_dirty() 4989 * 4990 * We're really interested in the case where a file is being extended. 4991 * i_size has been changed by generic_commit_write() and we thus need 4992 * to include the updated inode in the current transaction. 4993 * 4994 * Also, DQUOT_ALLOC_SPACE() will always dirty the inode when blocks 4995 * are allocated to the file. 4996 * 4997 * If the inode is marked synchronous, we don't honour that here - doing 4998 * so would cause a commit on atime updates, which we don't bother doing. 4999 * We handle synchronous inodes at the highest possible level. 5000 */ 5001 void ext4_dirty_inode(struct inode *inode) 5002 { 5003 handle_t *current_handle = ext4_journal_current_handle(); 5004 handle_t *handle; 5005 5006 if (!ext4_handle_valid(current_handle)) { 5007 ext4_mark_inode_dirty(current_handle, inode); 5008 return; 5009 } 5010 5011 handle = ext4_journal_start(inode, 2); 5012 if (IS_ERR(handle)) 5013 goto out; 5014 if (current_handle && 5015 current_handle->h_transaction != handle->h_transaction) { 5016 /* This task has a transaction open against a different fs */ 5017 printk(KERN_EMERG "%s: transactions do not match!\n", 5018 __func__); 5019 } else { 5020 jbd_debug(5, "marking dirty. 
outer handle=%p\n", 5021 current_handle); 5022 ext4_mark_inode_dirty(handle, inode); 5023 } 5024 ext4_journal_stop(handle); 5025 out: 5026 return; 5027 } 5028 5029 #if 0 5030 /* 5031 * Bind an inode's backing buffer_head into this transaction, to prevent 5032 * it from being flushed to disk early. Unlike 5033 * ext4_reserve_inode_write, this leaves behind no bh reference and 5034 * returns no iloc structure, so the caller needs to repeat the iloc 5035 * lookup to mark the inode dirty later. 5036 */ 5037 static int ext4_pin_inode(handle_t *handle, struct inode *inode) 5038 { 5039 struct ext4_iloc iloc; 5040 5041 int err = 0; 5042 if (handle) { 5043 err = ext4_get_inode_loc(inode, &iloc); 5044 if (!err) { 5045 BUFFER_TRACE(iloc.bh, "get_write_access"); 5046 err = jbd2_journal_get_write_access(handle, iloc.bh); 5047 if (!err) 5048 err = ext4_handle_dirty_metadata(handle, 5049 inode, 5050 iloc.bh); 5051 brelse(iloc.bh); 5052 } 5053 } 5054 ext4_std_error(inode->i_sb, err); 5055 return err; 5056 } 5057 #endif 5058 5059 int ext4_change_inode_journal_flag(struct inode *inode, int val) 5060 { 5061 journal_t *journal; 5062 handle_t *handle; 5063 int err; 5064 5065 /* 5066 * We have to be very careful here: changing a data block's 5067 * journaling status dynamically is dangerous. If we write a 5068 * data block to the journal, change the status and then delete 5069 * that block, we risk forgetting to revoke the old log record 5070 * from the journal and so a subsequent replay can corrupt data. 5071 * So, first we make sure that the journal is empty and that 5072 * nobody is changing anything. 5073 */ 5074 5075 journal = EXT4_JOURNAL(inode); 5076 if (!journal) 5077 return 0; 5078 if (is_journal_aborted(journal)) 5079 return -EROFS; 5080 5081 jbd2_journal_lock_updates(journal); 5082 jbd2_journal_flush(journal); 5083 5084 /* 5085 * OK, there are no updates running now, and all cached data is 5086 * synced to disk. We are now in a completely consistent state 5087 * which doesn't have anything in the journal, and we know that 5088 * no filesystem updates are running, so it is safe to modify 5089 * the inode's in-core data-journaling state flag now. 5090 */ 5091 5092 if (val) 5093 EXT4_I(inode)->i_flags |= EXT4_JOURNAL_DATA_FL; 5094 else 5095 EXT4_I(inode)->i_flags &= ~EXT4_JOURNAL_DATA_FL; 5096 ext4_set_aops(inode); 5097 5098 jbd2_journal_unlock_updates(journal); 5099 5100 /* Finally we can mark the inode as dirty. */ 5101 5102 handle = ext4_journal_start(inode, 1); 5103 if (IS_ERR(handle)) 5104 return PTR_ERR(handle); 5105 5106 err = ext4_mark_inode_dirty(handle, inode); 5107 ext4_handle_sync(handle); 5108 ext4_journal_stop(handle); 5109 ext4_std_error(inode->i_sb, err); 5110 5111 return err; 5112 } 5113 5114 static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh) 5115 { 5116 return !buffer_mapped(bh); 5117 } 5118 5119 int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page) 5120 { 5121 loff_t size; 5122 unsigned long len; 5123 int ret = -EINVAL; 5124 void *fsdata; 5125 struct file *file = vma->vm_file; 5126 struct inode *inode = file->f_path.dentry->d_inode; 5127 struct address_space *mapping = inode->i_mapping; 5128 5129 /* 5130 * Get i_alloc_sem to stop truncates messing with the inode. We cannot 5131 * get i_mutex because we are already holding mmap_sem. 5132 */ 5133 down_read(&inode->i_alloc_sem); 5134 size = i_size_read(inode); 5135 if (page->mapping != mapping || size <= page_offset(page) 5136 || !PageUptodate(page)) { 5137 /* page got truncated from under us? 
*/ 5138 goto out_unlock; 5139 } 5140 ret = 0; 5141 if (PageMappedToDisk(page)) 5142 goto out_unlock; 5143 5144 if (page->index == size >> PAGE_CACHE_SHIFT) 5145 len = size & ~PAGE_CACHE_MASK; 5146 else 5147 len = PAGE_CACHE_SIZE; 5148 5149 if (page_has_buffers(page)) { 5150 /* return if we have all the buffers mapped */ 5151 if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, 5152 ext4_bh_unmapped)) 5153 goto out_unlock; 5154 } 5155 /* 5156 * OK, we need to fill the hole... Do write_begin/write_end 5157 * to do the block allocation/reservation. We are not holding 5158 * inode->i_mutex here, which allows parallel write_begin and 5159 * write_end calls. lock_page prevents that from happening 5160 * on the same page, though. 5161 */ 5162 ret = mapping->a_ops->write_begin(file, mapping, page_offset(page), 5163 len, AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata); 5164 if (ret < 0) 5165 goto out_unlock; 5166 ret = mapping->a_ops->write_end(file, mapping, page_offset(page), 5167 len, len, page, fsdata); 5168 if (ret < 0) 5169 goto out_unlock; 5170 ret = 0; 5171 out_unlock: 5172 up_read(&inode->i_alloc_sem); 5173 return ret; 5174 } 5175