1 /* 2 * linux/fs/ext4/inode.c 3 * 4 * Copyright (C) 1992, 1993, 1994, 1995 5 * Remy Card (card@masi.ibp.fr) 6 * Laboratoire MASI - Institut Blaise Pascal 7 * Universite Pierre et Marie Curie (Paris VI) 8 * 9 * from 10 * 11 * linux/fs/minix/inode.c 12 * 13 * Copyright (C) 1991, 1992 Linus Torvalds 14 * 15 * Goal-directed block allocation by Stephen Tweedie 16 * (sct@redhat.com), 1993, 1998 17 * Big-endian to little-endian byte-swapping/bitmaps by 18 * David S. Miller (davem@caip.rutgers.edu), 1995 19 * 64-bit file support on 64-bit platforms by Jakub Jelinek 20 * (jj@sunsite.ms.mff.cuni.cz) 21 * 22 * Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000 23 */ 24 25 #include <linux/module.h> 26 #include <linux/fs.h> 27 #include <linux/time.h> 28 #include <linux/jbd2.h> 29 #include <linux/highuid.h> 30 #include <linux/pagemap.h> 31 #include <linux/quotaops.h> 32 #include <linux/string.h> 33 #include <linux/buffer_head.h> 34 #include <linux/writeback.h> 35 #include <linux/pagevec.h> 36 #include <linux/mpage.h> 37 #include <linux/namei.h> 38 #include <linux/uio.h> 39 #include <linux/bio.h> 40 #include "ext4_jbd2.h" 41 #include "xattr.h" 42 #include "acl.h" 43 #include "ext4_extents.h" 44 45 #define MPAGE_DA_EXTENT_TAIL 0x01 46 47 static inline int ext4_begin_ordered_truncate(struct inode *inode, 48 loff_t new_size) 49 { 50 return jbd2_journal_begin_ordered_truncate( 51 EXT4_SB(inode->i_sb)->s_journal, 52 &EXT4_I(inode)->jinode, 53 new_size); 54 } 55 56 static void ext4_invalidatepage(struct page *page, unsigned long offset); 57 58 /* 59 * Test whether an inode is a fast symlink. 60 */ 61 static int ext4_inode_is_fast_symlink(struct inode *inode) 62 { 63 int ea_blocks = EXT4_I(inode)->i_file_acl ? 64 (inode->i_sb->s_blocksize >> 9) : 0; 65 66 return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0); 67 } 68 69 /* 70 * The ext4 forget function must perform a revoke if we are freeing data 71 * which has been journaled. Metadata (eg. 
indirect blocks) must be 72 * revoked in all cases. 73 * 74 * "bh" may be NULL: a metadata block may have been freed from memory 75 * but there may still be a record of it in the journal, and that record 76 * still needs to be revoked. 77 * 78 * If the handle isn't valid we're not journaling so there's nothing to do. 79 */ 80 int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, 81 struct buffer_head *bh, ext4_fsblk_t blocknr) 82 { 83 int err; 84 85 if (!ext4_handle_valid(handle)) 86 return 0; 87 88 might_sleep(); 89 90 BUFFER_TRACE(bh, "enter"); 91 92 jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, " 93 "data mode %lx\n", 94 bh, is_metadata, inode->i_mode, 95 test_opt(inode->i_sb, DATA_FLAGS)); 96 97 /* Never use the revoke function if we are doing full data 98 * journaling: there is no need to, and a V1 superblock won't 99 * support it. Otherwise, only skip the revoke on un-journaled 100 * data blocks. */ 101 102 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || 103 (!is_metadata && !ext4_should_journal_data(inode))) { 104 if (bh) { 105 BUFFER_TRACE(bh, "call jbd2_journal_forget"); 106 return ext4_journal_forget(handle, bh); 107 } 108 return 0; 109 } 110 111 /* 112 * data!=journal && (is_metadata || should_journal_data(inode)) 113 */ 114 BUFFER_TRACE(bh, "call ext4_journal_revoke"); 115 err = ext4_journal_revoke(handle, blocknr, bh); 116 if (err) 117 ext4_abort(inode->i_sb, __func__, 118 "error %d when attempting revoke", err); 119 BUFFER_TRACE(bh, "exit"); 120 return err; 121 } 122 123 /* 124 * Work out how many blocks we need to proceed with the next chunk of a 125 * truncate transaction. 
126 */ 127 static unsigned long blocks_for_truncate(struct inode *inode) 128 { 129 ext4_lblk_t needed; 130 131 needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9); 132 133 /* Give ourselves just enough room to cope with inodes in which 134 * i_blocks is corrupt: we've seen disk corruptions in the past 135 * which resulted in random data in an inode which looked enough 136 * like a regular file for ext4 to try to delete it. Things 137 * will go a bit crazy if that happens, but at least we should 138 * try not to panic the whole kernel. */ 139 if (needed < 2) 140 needed = 2; 141 142 /* But we need to bound the transaction so we don't overflow the 143 * journal. */ 144 if (needed > EXT4_MAX_TRANS_DATA) 145 needed = EXT4_MAX_TRANS_DATA; 146 147 return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed; 148 } 149 150 /* 151 * Truncate transactions can be complex and absolutely huge. So we need to 152 * be able to restart the transaction at a conventient checkpoint to make 153 * sure we don't overflow the journal. 154 * 155 * start_transaction gets us a new handle for a truncate transaction, 156 * and extend_transaction tries to extend the existing one a bit. If 157 * extend fails, we need to propagate the failure up and restart the 158 * transaction in the top-level truncate loop. --sct 159 */ 160 static handle_t *start_transaction(struct inode *inode) 161 { 162 handle_t *result; 163 164 result = ext4_journal_start(inode, blocks_for_truncate(inode)); 165 if (!IS_ERR(result)) 166 return result; 167 168 ext4_std_error(inode->i_sb, PTR_ERR(result)); 169 return result; 170 } 171 172 /* 173 * Try to extend this transaction for the purposes of truncation. 174 * 175 * Returns 0 if we managed to create more room. If we can't create more 176 * room, and the transaction must be restarted we return 1. 
177 */ 178 static int try_to_extend_transaction(handle_t *handle, struct inode *inode) 179 { 180 if (!ext4_handle_valid(handle)) 181 return 0; 182 if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1)) 183 return 0; 184 if (!ext4_journal_extend(handle, blocks_for_truncate(inode))) 185 return 0; 186 return 1; 187 } 188 189 /* 190 * Restart the transaction associated with *handle. This does a commit, 191 * so before we call here everything must be consistently dirtied against 192 * this transaction. 193 */ 194 static int ext4_journal_test_restart(handle_t *handle, struct inode *inode) 195 { 196 BUG_ON(EXT4_JOURNAL(inode) == NULL); 197 jbd_debug(2, "restarting handle %p\n", handle); 198 return ext4_journal_restart(handle, blocks_for_truncate(inode)); 199 } 200 201 /* 202 * Called at the last iput() if i_nlink is zero. 203 */ 204 void ext4_delete_inode(struct inode *inode) 205 { 206 handle_t *handle; 207 int err; 208 209 if (ext4_should_order_data(inode)) 210 ext4_begin_ordered_truncate(inode, 0); 211 truncate_inode_pages(&inode->i_data, 0); 212 213 if (is_bad_inode(inode)) 214 goto no_delete; 215 216 handle = ext4_journal_start(inode, blocks_for_truncate(inode)+3); 217 if (IS_ERR(handle)) { 218 ext4_std_error(inode->i_sb, PTR_ERR(handle)); 219 /* 220 * If we're going to skip the normal cleanup, we still need to 221 * make sure that the in-core orphan linked list is properly 222 * cleaned up. 
223 */ 224 ext4_orphan_del(NULL, inode); 225 goto no_delete; 226 } 227 228 if (IS_SYNC(inode)) 229 ext4_handle_sync(handle); 230 inode->i_size = 0; 231 err = ext4_mark_inode_dirty(handle, inode); 232 if (err) { 233 ext4_warning(inode->i_sb, __func__, 234 "couldn't mark inode dirty (err %d)", err); 235 goto stop_handle; 236 } 237 if (inode->i_blocks) 238 ext4_truncate(inode); 239 240 /* 241 * ext4_ext_truncate() doesn't reserve any slop when it 242 * restarts journal transactions; therefore there may not be 243 * enough credits left in the handle to remove the inode from 244 * the orphan list and set the dtime field. 245 */ 246 if (!ext4_handle_has_enough_credits(handle, 3)) { 247 err = ext4_journal_extend(handle, 3); 248 if (err > 0) 249 err = ext4_journal_restart(handle, 3); 250 if (err != 0) { 251 ext4_warning(inode->i_sb, __func__, 252 "couldn't extend journal (err %d)", err); 253 stop_handle: 254 ext4_journal_stop(handle); 255 goto no_delete; 256 } 257 } 258 259 /* 260 * Kill off the orphan record which ext4_truncate created. 261 * AKPM: I think this can be inside the above `if'. 262 * Note that ext4_orphan_del() has to be able to cope with the 263 * deletion of a non-existent orphan - this is because we don't 264 * know if ext4_truncate() actually created an orphan record. 265 * (Well, we could do this if we need to, but heck - it works) 266 */ 267 ext4_orphan_del(handle, inode); 268 EXT4_I(inode)->i_dtime = get_seconds(); 269 270 /* 271 * One subtle ordering requirement: if anything has gone wrong 272 * (transaction abort, IO errors, whatever), then we can still 273 * do these next steps (the fs will already have been marked as 274 * having errors), but we can't free the inode if the mark_dirty 275 * fails. 276 */ 277 if (ext4_mark_inode_dirty(handle, inode)) 278 /* If that failed, just do the required in-core inode clear. 
*/ 279 clear_inode(inode); 280 else 281 ext4_free_inode(handle, inode); 282 ext4_journal_stop(handle); 283 return; 284 no_delete: 285 clear_inode(inode); /* We must guarantee clearing of inode... */ 286 } 287 288 typedef struct { 289 __le32 *p; 290 __le32 key; 291 struct buffer_head *bh; 292 } Indirect; 293 294 static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v) 295 { 296 p->key = *(p->p = v); 297 p->bh = bh; 298 } 299 300 /** 301 * ext4_block_to_path - parse the block number into array of offsets 302 * @inode: inode in question (we are only interested in its superblock) 303 * @i_block: block number to be parsed 304 * @offsets: array to store the offsets in 305 * @boundary: set this non-zero if the referred-to block is likely to be 306 * followed (on disk) by an indirect block. 307 * 308 * To store the locations of file's data ext4 uses a data structure common 309 * for UNIX filesystems - tree of pointers anchored in the inode, with 310 * data blocks at leaves and indirect blocks in intermediate nodes. 311 * This function translates the block number into path in that tree - 312 * return value is the path length and @offsets[n] is the offset of 313 * pointer to (n+1)th node in the nth one. If @block is out of range 314 * (negative or too large) warning is printed and zero returned. 315 * 316 * Note: function doesn't find node addresses, so no IO is needed. All 317 * we need to know is the capacity of indirect blocks (taken from the 318 * inode->i_sb). 319 */ 320 321 /* 322 * Portability note: the last comparison (check that we fit into triple 323 * indirect block) is spelled differently, because otherwise on an 324 * architecture with 32-bit longs and 8Kb pages we might get into trouble 325 * if our filesystem had 8Kb blocks. We might use long long, but that would 326 * kill us on x86. 
Oh, well, at least the sign propagation does not matter - 327 * i_block would have to be negative in the very beginning, so we would not 328 * get there at all. 329 */ 330 331 static int ext4_block_to_path(struct inode *inode, 332 ext4_lblk_t i_block, 333 ext4_lblk_t offsets[4], int *boundary) 334 { 335 int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb); 336 int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb); 337 const long direct_blocks = EXT4_NDIR_BLOCKS, 338 indirect_blocks = ptrs, 339 double_blocks = (1 << (ptrs_bits * 2)); 340 int n = 0; 341 int final = 0; 342 343 if (i_block < 0) { 344 ext4_warning(inode->i_sb, "ext4_block_to_path", "block < 0"); 345 } else if (i_block < direct_blocks) { 346 offsets[n++] = i_block; 347 final = direct_blocks; 348 } else if ((i_block -= direct_blocks) < indirect_blocks) { 349 offsets[n++] = EXT4_IND_BLOCK; 350 offsets[n++] = i_block; 351 final = ptrs; 352 } else if ((i_block -= indirect_blocks) < double_blocks) { 353 offsets[n++] = EXT4_DIND_BLOCK; 354 offsets[n++] = i_block >> ptrs_bits; 355 offsets[n++] = i_block & (ptrs - 1); 356 final = ptrs; 357 } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) { 358 offsets[n++] = EXT4_TIND_BLOCK; 359 offsets[n++] = i_block >> (ptrs_bits * 2); 360 offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1); 361 offsets[n++] = i_block & (ptrs - 1); 362 final = ptrs; 363 } else { 364 ext4_warning(inode->i_sb, "ext4_block_to_path", 365 "block %lu > max in inode %lu", 366 i_block + direct_blocks + 367 indirect_blocks + double_blocks, inode->i_ino); 368 } 369 if (boundary) 370 *boundary = final - 1 - (i_block & (ptrs - 1)); 371 return n; 372 } 373 374 /** 375 * ext4_get_branch - read the chain of indirect blocks leading to data 376 * @inode: inode in question 377 * @depth: depth of the chain (1 - direct pointer, etc.) 
378 * @offsets: offsets of pointers in inode/indirect blocks 379 * @chain: place to store the result 380 * @err: here we store the error value 381 * 382 * Function fills the array of triples <key, p, bh> and returns %NULL 383 * if everything went OK or the pointer to the last filled triple 384 * (incomplete one) otherwise. Upon the return chain[i].key contains 385 * the number of (i+1)-th block in the chain (as it is stored in memory, 386 * i.e. little-endian 32-bit), chain[i].p contains the address of that 387 * number (it points into struct inode for i==0 and into the bh->b_data 388 * for i>0) and chain[i].bh points to the buffer_head of i-th indirect 389 * block for i>0 and NULL for i==0. In other words, it holds the block 390 * numbers of the chain, addresses they were taken from (and where we can 391 * verify that chain did not change) and buffer_heads hosting these 392 * numbers. 393 * 394 * Function stops when it stumbles upon zero pointer (absent block) 395 * (pointer to last triple returned, *@err == 0) 396 * or when it gets an IO error reading an indirect block 397 * (ditto, *@err == -EIO) 398 * or when it reads all @depth-1 indirect blocks successfully and finds 399 * the whole chain, all way to the data (returns %NULL, *err == 0). 
400 * 401 * Need to be called with 402 * down_read(&EXT4_I(inode)->i_data_sem) 403 */ 404 static Indirect *ext4_get_branch(struct inode *inode, int depth, 405 ext4_lblk_t *offsets, 406 Indirect chain[4], int *err) 407 { 408 struct super_block *sb = inode->i_sb; 409 Indirect *p = chain; 410 struct buffer_head *bh; 411 412 *err = 0; 413 /* i_data is not going away, no lock needed */ 414 add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets); 415 if (!p->key) 416 goto no_block; 417 while (--depth) { 418 bh = sb_bread(sb, le32_to_cpu(p->key)); 419 if (!bh) 420 goto failure; 421 add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets); 422 /* Reader: end */ 423 if (!p->key) 424 goto no_block; 425 } 426 return NULL; 427 428 failure: 429 *err = -EIO; 430 no_block: 431 return p; 432 } 433 434 /** 435 * ext4_find_near - find a place for allocation with sufficient locality 436 * @inode: owner 437 * @ind: descriptor of indirect block. 438 * 439 * This function returns the preferred place for block allocation. 440 * It is used when heuristic for sequential allocation fails. 441 * Rules are: 442 * + if there is a block to the left of our position - allocate near it. 443 * + if pointer will live in indirect block - allocate near that block. 444 * + if pointer will live in inode - allocate in the same 445 * cylinder group. 446 * 447 * In the latter case we colour the starting block by the callers PID to 448 * prevent it from clashing with concurrent allocations for a different inode 449 * in the same block group. The PID is used here so that functionally related 450 * files will be close-by on-disk. 451 * 452 * Caller must make sure that @ind is valid and will stay that way. 453 */ 454 static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) 455 { 456 struct ext4_inode_info *ei = EXT4_I(inode); 457 __le32 *start = ind->bh ? 
(__le32 *) ind->bh->b_data : ei->i_data; 458 __le32 *p; 459 ext4_fsblk_t bg_start; 460 ext4_fsblk_t last_block; 461 ext4_grpblk_t colour; 462 463 /* Try to find previous block */ 464 for (p = ind->p - 1; p >= start; p--) { 465 if (*p) 466 return le32_to_cpu(*p); 467 } 468 469 /* No such thing, so let's try location of indirect block */ 470 if (ind->bh) 471 return ind->bh->b_blocknr; 472 473 /* 474 * It is going to be referred to from the inode itself? OK, just put it 475 * into the same cylinder group then. 476 */ 477 bg_start = ext4_group_first_block_no(inode->i_sb, ei->i_block_group); 478 last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; 479 480 if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) 481 colour = (current->pid % 16) * 482 (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); 483 else 484 colour = (current->pid % 16) * ((last_block - bg_start) / 16); 485 return bg_start + colour; 486 } 487 488 /** 489 * ext4_find_goal - find a preferred place for allocation. 490 * @inode: owner 491 * @block: block we want 492 * @partial: pointer to the last triple within a chain 493 * 494 * Normally this function find the preferred place for block allocation, 495 * returns it. 496 */ 497 static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, 498 Indirect *partial) 499 { 500 /* 501 * XXX need to get goal block from mballoc's data structures 502 */ 503 504 return ext4_find_near(inode, partial); 505 } 506 507 /** 508 * ext4_blks_to_allocate: Look up the block map and count the number 509 * of direct blocks need to be allocated for the given branch. 510 * 511 * @branch: chain of indirect blocks 512 * @k: number of blocks need for indirect blocks 513 * @blks: number of data blocks to be mapped. 514 * @blocks_to_boundary: the offset in the indirect block 515 * 516 * return the total number of blocks to be allocate, including the 517 * direct and indirect blocks. 
518 */ 519 static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks, 520 int blocks_to_boundary) 521 { 522 unsigned int count = 0; 523 524 /* 525 * Simple case, [t,d]Indirect block(s) has not allocated yet 526 * then it's clear blocks on that path have not allocated 527 */ 528 if (k > 0) { 529 /* right now we don't handle cross boundary allocation */ 530 if (blks < blocks_to_boundary + 1) 531 count += blks; 532 else 533 count += blocks_to_boundary + 1; 534 return count; 535 } 536 537 count++; 538 while (count < blks && count <= blocks_to_boundary && 539 le32_to_cpu(*(branch[0].p + count)) == 0) { 540 count++; 541 } 542 return count; 543 } 544 545 /** 546 * ext4_alloc_blocks: multiple allocate blocks needed for a branch 547 * @indirect_blks: the number of blocks need to allocate for indirect 548 * blocks 549 * 550 * @new_blocks: on return it will store the new block numbers for 551 * the indirect blocks(if needed) and the first direct block, 552 * @blks: on return it will store the total number of allocated 553 * direct blocks 554 */ 555 static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, 556 ext4_lblk_t iblock, ext4_fsblk_t goal, 557 int indirect_blks, int blks, 558 ext4_fsblk_t new_blocks[4], int *err) 559 { 560 struct ext4_allocation_request ar; 561 int target, i; 562 unsigned long count = 0, blk_allocated = 0; 563 int index = 0; 564 ext4_fsblk_t current_block = 0; 565 int ret = 0; 566 567 /* 568 * Here we try to allocate the requested multiple blocks at once, 569 * on a best-effort basis. 570 * To build a branch, we should allocate blocks for 571 * the indirect blocks(if not allocated yet), and at least 572 * the first direct block of this branch. 
That's the 573 * minimum number of blocks need to allocate(required) 574 */ 575 /* first we try to allocate the indirect blocks */ 576 target = indirect_blks; 577 while (target > 0) { 578 count = target; 579 /* allocating blocks for indirect blocks and direct blocks */ 580 current_block = ext4_new_meta_blocks(handle, inode, 581 goal, &count, err); 582 if (*err) 583 goto failed_out; 584 585 target -= count; 586 /* allocate blocks for indirect blocks */ 587 while (index < indirect_blks && count) { 588 new_blocks[index++] = current_block++; 589 count--; 590 } 591 if (count > 0) { 592 /* 593 * save the new block number 594 * for the first direct block 595 */ 596 new_blocks[index] = current_block; 597 printk(KERN_INFO "%s returned more blocks than " 598 "requested\n", __func__); 599 WARN_ON(1); 600 break; 601 } 602 } 603 604 target = blks - count ; 605 blk_allocated = count; 606 if (!target) 607 goto allocated; 608 /* Now allocate data blocks */ 609 memset(&ar, 0, sizeof(ar)); 610 ar.inode = inode; 611 ar.goal = goal; 612 ar.len = target; 613 ar.logical = iblock; 614 if (S_ISREG(inode->i_mode)) 615 /* enable in-core preallocation only for regular files */ 616 ar.flags = EXT4_MB_HINT_DATA; 617 618 current_block = ext4_mb_new_blocks(handle, &ar, err); 619 620 if (*err && (target == blks)) { 621 /* 622 * if the allocation failed and we didn't allocate 623 * any blocks before 624 */ 625 goto failed_out; 626 } 627 if (!*err) { 628 if (target == blks) { 629 /* 630 * save the new block number 631 * for the first direct block 632 */ 633 new_blocks[index] = current_block; 634 } 635 blk_allocated += ar.len; 636 } 637 allocated: 638 /* total number of blocks allocated for direct blocks */ 639 ret = blk_allocated; 640 *err = 0; 641 return ret; 642 failed_out: 643 for (i = 0; i < index; i++) 644 ext4_free_blocks(handle, inode, new_blocks[i], 1, 0); 645 return ret; 646 } 647 648 /** 649 * ext4_alloc_branch - allocate and set up a chain of blocks. 
650 * @inode: owner 651 * @indirect_blks: number of allocated indirect blocks 652 * @blks: number of allocated direct blocks 653 * @offsets: offsets (in the blocks) to store the pointers to next. 654 * @branch: place to store the chain in. 655 * 656 * This function allocates blocks, zeroes out all but the last one, 657 * links them into chain and (if we are synchronous) writes them to disk. 658 * In other words, it prepares a branch that can be spliced onto the 659 * inode. It stores the information about that chain in the branch[], in 660 * the same format as ext4_get_branch() would do. We are calling it after 661 * we had read the existing part of chain and partial points to the last 662 * triple of that (one with zero ->key). Upon the exit we have the same 663 * picture as after the successful ext4_get_block(), except that in one 664 * place chain is disconnected - *branch->p is still zero (we did not 665 * set the last link), but branch->key contains the number that should 666 * be placed into *branch->p to fill that gap. 667 * 668 * If allocation fails we free all blocks we've allocated (and forget 669 * their buffer_heads) and return the error value the from failed 670 * ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain 671 * as described above and return 0. 672 */ 673 static int ext4_alloc_branch(handle_t *handle, struct inode *inode, 674 ext4_lblk_t iblock, int indirect_blks, 675 int *blks, ext4_fsblk_t goal, 676 ext4_lblk_t *offsets, Indirect *branch) 677 { 678 int blocksize = inode->i_sb->s_blocksize; 679 int i, n = 0; 680 int err = 0; 681 struct buffer_head *bh; 682 int num; 683 ext4_fsblk_t new_blocks[4]; 684 ext4_fsblk_t current_block; 685 686 num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks, 687 *blks, new_blocks, &err); 688 if (err) 689 return err; 690 691 branch[0].key = cpu_to_le32(new_blocks[0]); 692 /* 693 * metadata blocks and data blocks are allocated. 
694 */ 695 for (n = 1; n <= indirect_blks; n++) { 696 /* 697 * Get buffer_head for parent block, zero it out 698 * and set the pointer to new one, then send 699 * parent to disk. 700 */ 701 bh = sb_getblk(inode->i_sb, new_blocks[n-1]); 702 branch[n].bh = bh; 703 lock_buffer(bh); 704 BUFFER_TRACE(bh, "call get_create_access"); 705 err = ext4_journal_get_create_access(handle, bh); 706 if (err) { 707 unlock_buffer(bh); 708 brelse(bh); 709 goto failed; 710 } 711 712 memset(bh->b_data, 0, blocksize); 713 branch[n].p = (__le32 *) bh->b_data + offsets[n]; 714 branch[n].key = cpu_to_le32(new_blocks[n]); 715 *branch[n].p = branch[n].key; 716 if (n == indirect_blks) { 717 current_block = new_blocks[n]; 718 /* 719 * End of chain, update the last new metablock of 720 * the chain to point to the new allocated 721 * data blocks numbers 722 */ 723 for (i=1; i < num; i++) 724 *(branch[n].p + i) = cpu_to_le32(++current_block); 725 } 726 BUFFER_TRACE(bh, "marking uptodate"); 727 set_buffer_uptodate(bh); 728 unlock_buffer(bh); 729 730 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 731 err = ext4_handle_dirty_metadata(handle, inode, bh); 732 if (err) 733 goto failed; 734 } 735 *blks = num; 736 return err; 737 failed: 738 /* Allocation failed, free what we already allocated */ 739 for (i = 1; i <= n ; i++) { 740 BUFFER_TRACE(branch[i].bh, "call jbd2_journal_forget"); 741 ext4_journal_forget(handle, branch[i].bh); 742 } 743 for (i = 0; i < indirect_blks; i++) 744 ext4_free_blocks(handle, inode, new_blocks[i], 1, 0); 745 746 ext4_free_blocks(handle, inode, new_blocks[i], num, 0); 747 748 return err; 749 } 750 751 /** 752 * ext4_splice_branch - splice the allocated branch onto inode. 
753 * @inode: owner 754 * @block: (logical) number of block we are adding 755 * @chain: chain of indirect blocks (with a missing link - see 756 * ext4_alloc_branch) 757 * @where: location of missing link 758 * @num: number of indirect blocks we are adding 759 * @blks: number of direct blocks we are adding 760 * 761 * This function fills the missing link and does all housekeeping needed in 762 * inode (->i_blocks, etc.). In case of success we end up with the full 763 * chain to new block and return 0. 764 */ 765 static int ext4_splice_branch(handle_t *handle, struct inode *inode, 766 ext4_lblk_t block, Indirect *where, int num, int blks) 767 { 768 int i; 769 int err = 0; 770 ext4_fsblk_t current_block; 771 772 /* 773 * If we're splicing into a [td]indirect block (as opposed to the 774 * inode) then we need to get write access to the [td]indirect block 775 * before the splice. 776 */ 777 if (where->bh) { 778 BUFFER_TRACE(where->bh, "get_write_access"); 779 err = ext4_journal_get_write_access(handle, where->bh); 780 if (err) 781 goto err_out; 782 } 783 /* That's it */ 784 785 *where->p = where->key; 786 787 /* 788 * Update the host buffer_head or inode to point to more just allocated 789 * direct blocks blocks 790 */ 791 if (num == 0 && blks > 1) { 792 current_block = le32_to_cpu(where->key) + 1; 793 for (i = 1; i < blks; i++) 794 *(where->p + i) = cpu_to_le32(current_block++); 795 } 796 797 /* We are done with atomic stuff, now do the rest of housekeeping */ 798 799 inode->i_ctime = ext4_current_time(inode); 800 ext4_mark_inode_dirty(handle, inode); 801 802 /* had we spliced it onto indirect block? */ 803 if (where->bh) { 804 /* 805 * If we spliced it onto an indirect block, we haven't 806 * altered the inode. Note however that if it is being spliced 807 * onto an indirect block at the very end of the file (the 808 * file is growing) then we *will* alter the inode to reflect 809 * the new i_size. 
But that is not done here - it is done in 810 * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode. 811 */ 812 jbd_debug(5, "splicing indirect only\n"); 813 BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata"); 814 err = ext4_handle_dirty_metadata(handle, inode, where->bh); 815 if (err) 816 goto err_out; 817 } else { 818 /* 819 * OK, we spliced it into the inode itself on a direct block. 820 * Inode was dirtied above. 821 */ 822 jbd_debug(5, "splicing direct\n"); 823 } 824 return err; 825 826 err_out: 827 for (i = 1; i <= num; i++) { 828 BUFFER_TRACE(where[i].bh, "call jbd2_journal_forget"); 829 ext4_journal_forget(handle, where[i].bh); 830 ext4_free_blocks(handle, inode, 831 le32_to_cpu(where[i-1].key), 1, 0); 832 } 833 ext4_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks, 0); 834 835 return err; 836 } 837 838 /* 839 * Allocation strategy is simple: if we have to allocate something, we will 840 * have to go the whole way to leaf. So let's do it before attaching anything 841 * to tree, set linkage between the newborn blocks, write them if sync is 842 * required, recheck the path, free and repeat if check fails, otherwise 843 * set the last missing link (that will protect us from any truncate-generated 844 * removals - all blocks on the path are immune now) and possibly force the 845 * write on the parent block. 846 * That has a nice additional property: no special recovery from the failed 847 * allocations is needed - we simply release blocks and do not touch anything 848 * reachable from inode. 849 * 850 * `handle' can be NULL if create == 0. 851 * 852 * return > 0, # of blocks mapped or allocated. 853 * return = 0, if plain lookup failed. 854 * return < 0, error case. 855 * 856 * 857 * Need to be called with 858 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block 859 * (ie, create is zero). 
Otherwise down_write(&EXT4_I(inode)->i_data_sem)
 */
static int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
				  ext4_lblk_t iblock, unsigned int maxblocks,
				  struct buffer_head *bh_result,
				  int create, int extend_disksize)
{
	/* -EIO is what gets returned when iblock cannot be turned into a path */
	int err = -EIO;
	ext4_lblk_t offsets[4];
	Indirect chain[4];
	Indirect *partial;
	ext4_fsblk_t goal;
	int indirect_blks;
	int blocks_to_boundary = 0;
	int depth;
	struct ext4_inode_info *ei = EXT4_I(inode);
	int count = 0;
	ext4_fsblk_t first_block = 0;
	loff_t disksize;


	/* This path is for block-mapped (non-extent) inodes only */
	J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL));
	J_ASSERT(handle != NULL || create == 0);
	depth = ext4_block_to_path(inode, iblock, offsets,
					&blocks_to_boundary);

	if (depth == 0)
		goto out;

	/* ext4_get_branch() resets err to 0 before walking the chain */
	partial = ext4_get_branch(inode, depth, offsets, chain, &err);

	/* Simplest case - block found, no allocation needed */
	if (!partial) {
		first_block = le32_to_cpu(chain[depth - 1].key);
		clear_buffer_new(bh_result);
		count++;
		/* map more blocks, as long as they are physically contiguous */
		while (count < maxblocks && count <= blocks_to_boundary) {
			ext4_fsblk_t blk;

			blk = le32_to_cpu(*(chain[depth-1].p + count));

			if (blk == first_block + count)
				count++;
			else
				break;
		}
		goto got_it;
	}

	/* Next simple case - plain lookup or failed read of indirect block */
	if (!create || err == -EIO)
		goto cleanup;

	/*
	 * Okay, we need to do block allocation.
	 */
	goal = ext4_find_goal(inode, iblock, partial);

	/* the number of blocks need to allocate for [d,t]indirect blocks */
	indirect_blks = (chain + depth) - partial - 1;

	/*
	 * Next look up the indirect map to count the total number of
	 * direct blocks to allocate for this branch.
	 */
	count = ext4_blks_to_allocate(partial, indirect_blks,
					maxblocks, blocks_to_boundary);
	/*
	 * Block out ext4_truncate while we alter the tree
	 */
	err = ext4_alloc_branch(handle, inode, iblock, indirect_blks,
					&count, goal,
					offsets + (partial - chain), partial);

	/*
	 * The ext4_splice_branch call will free and forget any buffers
	 * on the new chain if there is a failure, but that risks using
	 * up transaction credits, especially for bitmaps where the
	 * credits cannot be returned.  Can we handle this somehow?  We
	 * may need to return -EAGAIN upwards in the worst case.  --sct
	 */
	if (!err)
		err = ext4_splice_branch(handle, inode, iblock,
					partial, indirect_blks, count);
	/*
	 * i_disksize growing is protected by i_data_sem.  Don't forget to
	 * protect it if you're about to implement concurrent
	 * ext4_get_block() -bzzz
	 */
	if (!err && extend_disksize) {
		/* end of the newly mapped range, capped at i_size */
		disksize = ((loff_t) iblock + count) << inode->i_blkbits;
		if (disksize > i_size_read(inode))
			disksize = i_size_read(inode);
		/* i_disksize is only ever moved forward here */
		if (disksize > ei->i_disksize)
			ei->i_disksize = disksize;
	}
	if (err)
		goto cleanup;

	set_buffer_new(bh_result);
got_it:
	map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
	if (count > blocks_to_boundary)
		set_buffer_boundary(bh_result);
	/* success: the return value is the number of blocks mapped */
	err = count;
	/* Clean up and exit */
	partial = chain + depth - 1;	/* the whole chain */
cleanup:
	/* chain[0] lives in the inode and has no buffer_head to release */
	while (partial > chain) {
		BUFFER_TRACE(partial->bh, "call brelse");
		brelse(partial->bh);
		partial--;
	}
	BUFFER_TRACE(bh_result, "returned");
out:
	return err;
}

/*
 * Return the inode's reserved data blocks plus reserved metadata blocks,
 * read together under i_block_reservation_lock.
 */
qsize_t ext4_get_reserved_space(struct inode *inode)
{
	unsigned long long total;

	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
	total = EXT4_I(inode)->i_reserved_data_blocks +
		EXT4_I(inode)->i_reserved_meta_blocks;
	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);

	return total;
}
/*
 * Calculate the number of metadata blocks need to reserve
 * to allocate @blocks for non extent file based file
 */
static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks)
{
	/* block pointers per indirect block */
	int icap = EXT4_ADDR_PER_BLOCK(inode->i_sb);
	int ind_blks, dind_blks, tind_blks;

	/* number of new indirect blocks needed (rounded up) */
	ind_blks = (blocks + icap - 1) / icap;

	/* double-indirect blocks needed to point at those (rounded up) */
	dind_blks = (ind_blks + icap - 1) / icap;

	/* always budget one triple-indirect block */
	tind_blks = 1;

	return ind_blks + dind_blks + tind_blks;
}

/*
 * Calculate the number of metadata blocks need to reserve
 * to allocate given number of blocks
 */
static int ext4_calc_metadata_amount(struct inode *inode, int blocks)
{
	if (!blocks)
		return 0;

	/* extent-mapped and block-mapped inodes are budgeted differently */
	if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
		return ext4_ext_calc_metadata_amount(inode, blocks);

	return ext4_indirect_calc_metadata_amount(inode, blocks);
}

/*
 * Account for @used of the inode's reserved data blocks having been
 * consumed: shrink the metadata reservation to what the remaining data
 * reservation needs, subtract the surplus from the dirty-blocks counter
 * and release the matching quota reservation.
 */
static void ext4_da_update_reserve_space(struct inode *inode, int used)
{
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	int total, mdb, mdb_free;

	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
	/* recalculate the number of metablocks still need to be reserved */
	total = EXT4_I(inode)->i_reserved_data_blocks - used;
	mdb = ext4_calc_metadata_amount(inode, total);

	/* figure out how many metablocks to release */
	BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
	mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;

	if (mdb_free) {
		/* Account for allocated meta_blocks */
		mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks;

		/* update fs dirty blocks counter */
		percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free);
		EXT4_I(inode)->i_allocated_meta_blocks = 0;
		EXT4_I(inode)->i_reserved_meta_blocks = mdb;
	}

	/* update per-inode reservations */
	BUG_ON(used  > EXT4_I(inode)->i_reserved_data_blocks);
	EXT4_I(inode)->i_reserved_data_blocks -= used;
	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);

	/*
	 * free those over-booking quota for metadata blocks
	 * (done after the spinlock has been dropped)
	 */

	if (mdb_free)
		vfs_dq_release_reservation_block(inode, mdb_free);
}

/*
 * The ext4_get_blocks_wrap() function try to look up the requested blocks,
 * and returns if the blocks are already mapped.
 *
 * Otherwise it takes the write lock of the i_data_sem and allocate blocks
 * and store the allocated blocks in the result buffer head and mark it
 * mapped.
 *
 * If file type is extents based, it will call ext4_ext_get_blocks(),
 * Otherwise, call with ext4_get_blocks_handle() to handle indirect mapping
 * based files
 *
 * On success, it returns the number of blocks being mapped or allocate.
 * if create==0 and the blocks are pre-allocated and uninitialized block,
 * the result buffer head is unmapped. If the create ==1, it will make sure
 * the buffer head is mapped.
 *
 * It returns 0 if plain look up failed (blocks have not been allocated), in
 * that casem, buffer head is unmapped
 *
 * It returns the error in case of allocation failure.
 */
int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
			unsigned int max_blocks, struct buffer_head *bh,
			int create, int extend_disksize, int flag)
{
	int retval;

	clear_buffer_mapped(bh);

	/*
	 * Try to see if we can get the block without requesting
	 * for new file system block.
1093 */ 1094 down_read((&EXT4_I(inode)->i_data_sem)); 1095 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { 1096 retval = ext4_ext_get_blocks(handle, inode, block, max_blocks, 1097 bh, 0, 0); 1098 } else { 1099 retval = ext4_get_blocks_handle(handle, 1100 inode, block, max_blocks, bh, 0, 0); 1101 } 1102 up_read((&EXT4_I(inode)->i_data_sem)); 1103 1104 /* If it is only a block(s) look up */ 1105 if (!create) 1106 return retval; 1107 1108 /* 1109 * Returns if the blocks have already allocated 1110 * 1111 * Note that if blocks have been preallocated 1112 * ext4_ext_get_block() returns th create = 0 1113 * with buffer head unmapped. 1114 */ 1115 if (retval > 0 && buffer_mapped(bh)) 1116 return retval; 1117 1118 /* 1119 * New blocks allocate and/or writing to uninitialized extent 1120 * will possibly result in updating i_data, so we take 1121 * the write lock of i_data_sem, and call get_blocks() 1122 * with create == 1 flag. 1123 */ 1124 down_write((&EXT4_I(inode)->i_data_sem)); 1125 1126 /* 1127 * if the caller is from delayed allocation writeout path 1128 * we have already reserved fs blocks for allocation 1129 * let the underlying get_block() function know to 1130 * avoid double accounting 1131 */ 1132 if (flag) 1133 EXT4_I(inode)->i_delalloc_reserved_flag = 1; 1134 /* 1135 * We need to check for EXT4 here because migrate 1136 * could have changed the inode type in between 1137 */ 1138 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { 1139 retval = ext4_ext_get_blocks(handle, inode, block, max_blocks, 1140 bh, create, extend_disksize); 1141 } else { 1142 retval = ext4_get_blocks_handle(handle, inode, block, 1143 max_blocks, bh, create, extend_disksize); 1144 1145 if (retval > 0 && buffer_new(bh)) { 1146 /* 1147 * We allocated new blocks which will result in 1148 * i_data's format changing. 
Force the migrate 1149 * to fail by clearing migrate flags 1150 */ 1151 EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags & 1152 ~EXT4_EXT_MIGRATE; 1153 } 1154 } 1155 1156 if (flag) { 1157 EXT4_I(inode)->i_delalloc_reserved_flag = 0; 1158 /* 1159 * Update reserved blocks/metadata blocks 1160 * after successful block allocation 1161 * which were deferred till now 1162 */ 1163 if ((retval > 0) && buffer_delay(bh)) 1164 ext4_da_update_reserve_space(inode, retval); 1165 } 1166 1167 up_write((&EXT4_I(inode)->i_data_sem)); 1168 return retval; 1169 } 1170 1171 /* Maximum number of blocks we map for direct IO at once. */ 1172 #define DIO_MAX_BLOCKS 4096 1173 1174 int ext4_get_block(struct inode *inode, sector_t iblock, 1175 struct buffer_head *bh_result, int create) 1176 { 1177 handle_t *handle = ext4_journal_current_handle(); 1178 int ret = 0, started = 0; 1179 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; 1180 int dio_credits; 1181 1182 if (create && !handle) { 1183 /* Direct IO write... 
*/ 1184 if (max_blocks > DIO_MAX_BLOCKS) 1185 max_blocks = DIO_MAX_BLOCKS; 1186 dio_credits = ext4_chunk_trans_blocks(inode, max_blocks); 1187 handle = ext4_journal_start(inode, dio_credits); 1188 if (IS_ERR(handle)) { 1189 ret = PTR_ERR(handle); 1190 goto out; 1191 } 1192 started = 1; 1193 } 1194 1195 ret = ext4_get_blocks_wrap(handle, inode, iblock, 1196 max_blocks, bh_result, create, 0, 0); 1197 if (ret > 0) { 1198 bh_result->b_size = (ret << inode->i_blkbits); 1199 ret = 0; 1200 } 1201 if (started) 1202 ext4_journal_stop(handle); 1203 out: 1204 return ret; 1205 } 1206 1207 /* 1208 * `handle' can be NULL if create is zero 1209 */ 1210 struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, 1211 ext4_lblk_t block, int create, int *errp) 1212 { 1213 struct buffer_head dummy; 1214 int fatal = 0, err; 1215 1216 J_ASSERT(handle != NULL || create == 0); 1217 1218 dummy.b_state = 0; 1219 dummy.b_blocknr = -1000; 1220 buffer_trace_init(&dummy.b_history); 1221 err = ext4_get_blocks_wrap(handle, inode, block, 1, 1222 &dummy, create, 1, 0); 1223 /* 1224 * ext4_get_blocks_handle() returns number of blocks 1225 * mapped. 0 in case of a HOLE. 1226 */ 1227 if (err > 0) { 1228 if (err > 1) 1229 WARN_ON(1); 1230 err = 0; 1231 } 1232 *errp = err; 1233 if (!err && buffer_mapped(&dummy)) { 1234 struct buffer_head *bh; 1235 bh = sb_getblk(inode->i_sb, dummy.b_blocknr); 1236 if (!bh) { 1237 *errp = -EIO; 1238 goto err; 1239 } 1240 if (buffer_new(&dummy)) { 1241 J_ASSERT(create != 0); 1242 J_ASSERT(handle != NULL); 1243 1244 /* 1245 * Now that we do not always journal data, we should 1246 * keep in mind whether this should always journal the 1247 * new buffer as metadata. For now, regular file 1248 * writes use ext4_get_block instead, so it's not a 1249 * problem. 
1250 */ 1251 lock_buffer(bh); 1252 BUFFER_TRACE(bh, "call get_create_access"); 1253 fatal = ext4_journal_get_create_access(handle, bh); 1254 if (!fatal && !buffer_uptodate(bh)) { 1255 memset(bh->b_data, 0, inode->i_sb->s_blocksize); 1256 set_buffer_uptodate(bh); 1257 } 1258 unlock_buffer(bh); 1259 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 1260 err = ext4_handle_dirty_metadata(handle, inode, bh); 1261 if (!fatal) 1262 fatal = err; 1263 } else { 1264 BUFFER_TRACE(bh, "not a new buffer"); 1265 } 1266 if (fatal) { 1267 *errp = fatal; 1268 brelse(bh); 1269 bh = NULL; 1270 } 1271 return bh; 1272 } 1273 err: 1274 return NULL; 1275 } 1276 1277 struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, 1278 ext4_lblk_t block, int create, int *err) 1279 { 1280 struct buffer_head *bh; 1281 1282 bh = ext4_getblk(handle, inode, block, create, err); 1283 if (!bh) 1284 return bh; 1285 if (buffer_uptodate(bh)) 1286 return bh; 1287 ll_rw_block(READ_META, 1, &bh); 1288 wait_on_buffer(bh); 1289 if (buffer_uptodate(bh)) 1290 return bh; 1291 put_bh(bh); 1292 *err = -EIO; 1293 return NULL; 1294 } 1295 1296 static int walk_page_buffers(handle_t *handle, 1297 struct buffer_head *head, 1298 unsigned from, 1299 unsigned to, 1300 int *partial, 1301 int (*fn)(handle_t *handle, 1302 struct buffer_head *bh)) 1303 { 1304 struct buffer_head *bh; 1305 unsigned block_start, block_end; 1306 unsigned blocksize = head->b_size; 1307 int err, ret = 0; 1308 struct buffer_head *next; 1309 1310 for (bh = head, block_start = 0; 1311 ret == 0 && (bh != head || !block_start); 1312 block_start = block_end, bh = next) 1313 { 1314 next = bh->b_this_page; 1315 block_end = block_start + blocksize; 1316 if (block_end <= from || block_start >= to) { 1317 if (partial && !buffer_uptodate(bh)) 1318 *partial = 1; 1319 continue; 1320 } 1321 err = (*fn)(handle, bh); 1322 if (!ret) 1323 ret = err; 1324 } 1325 return ret; 1326 } 1327 1328 /* 1329 * To preserve ordering, it is essential that the hole 
instantiation and 1330 * the data write be encapsulated in a single transaction. We cannot 1331 * close off a transaction and start a new one between the ext4_get_block() 1332 * and the commit_write(). So doing the jbd2_journal_start at the start of 1333 * prepare_write() is the right place. 1334 * 1335 * Also, this function can nest inside ext4_writepage() -> 1336 * block_write_full_page(). In that case, we *know* that ext4_writepage() 1337 * has generated enough buffer credits to do the whole page. So we won't 1338 * block on the journal in that case, which is good, because the caller may 1339 * be PF_MEMALLOC. 1340 * 1341 * By accident, ext4 can be reentered when a transaction is open via 1342 * quota file writes. If we were to commit the transaction while thus 1343 * reentered, there can be a deadlock - we would be holding a quota 1344 * lock, and the commit would never complete if another thread had a 1345 * transaction open and was blocking on the quota lock - a ranking 1346 * violation. 1347 * 1348 * So what we do is to rely on the fact that jbd2_journal_stop/journal_start 1349 * will _not_ run commit under these circumstances because handle->h_ref 1350 * is elevated. We'll still have enough credits for the tiny quotafile 1351 * write. 
1352 */ 1353 static int do_journal_get_write_access(handle_t *handle, 1354 struct buffer_head *bh) 1355 { 1356 if (!buffer_mapped(bh) || buffer_freed(bh)) 1357 return 0; 1358 return ext4_journal_get_write_access(handle, bh); 1359 } 1360 1361 static int ext4_write_begin(struct file *file, struct address_space *mapping, 1362 loff_t pos, unsigned len, unsigned flags, 1363 struct page **pagep, void **fsdata) 1364 { 1365 struct inode *inode = mapping->host; 1366 int ret, needed_blocks = ext4_writepage_trans_blocks(inode); 1367 handle_t *handle; 1368 int retries = 0; 1369 struct page *page; 1370 pgoff_t index; 1371 unsigned from, to; 1372 1373 trace_mark(ext4_write_begin, 1374 "dev %s ino %lu pos %llu len %u flags %u", 1375 inode->i_sb->s_id, inode->i_ino, 1376 (unsigned long long) pos, len, flags); 1377 index = pos >> PAGE_CACHE_SHIFT; 1378 from = pos & (PAGE_CACHE_SIZE - 1); 1379 to = from + len; 1380 1381 retry: 1382 handle = ext4_journal_start(inode, needed_blocks); 1383 if (IS_ERR(handle)) { 1384 ret = PTR_ERR(handle); 1385 goto out; 1386 } 1387 1388 /* We cannot recurse into the filesystem as the transaction is already 1389 * started */ 1390 flags |= AOP_FLAG_NOFS; 1391 1392 page = grab_cache_page_write_begin(mapping, index, flags); 1393 if (!page) { 1394 ext4_journal_stop(handle); 1395 ret = -ENOMEM; 1396 goto out; 1397 } 1398 *pagep = page; 1399 1400 ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 1401 ext4_get_block); 1402 1403 if (!ret && ext4_should_journal_data(inode)) { 1404 ret = walk_page_buffers(handle, page_buffers(page), 1405 from, to, NULL, do_journal_get_write_access); 1406 } 1407 1408 if (ret) { 1409 unlock_page(page); 1410 ext4_journal_stop(handle); 1411 page_cache_release(page); 1412 /* 1413 * block_write_begin may have instantiated a few blocks 1414 * outside i_size. Trim these off again. Don't need 1415 * i_size_read because we hold i_mutex. 
1416 */ 1417 if (pos + len > inode->i_size) 1418 vmtruncate(inode, inode->i_size); 1419 } 1420 1421 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 1422 goto retry; 1423 out: 1424 return ret; 1425 } 1426 1427 /* For write_end() in data=journal mode */ 1428 static int write_end_fn(handle_t *handle, struct buffer_head *bh) 1429 { 1430 if (!buffer_mapped(bh) || buffer_freed(bh)) 1431 return 0; 1432 set_buffer_uptodate(bh); 1433 return ext4_handle_dirty_metadata(handle, NULL, bh); 1434 } 1435 1436 /* 1437 * We need to pick up the new inode size which generic_commit_write gave us 1438 * `file' can be NULL - eg, when called from page_symlink(). 1439 * 1440 * ext4 never places buffers on inode->i_mapping->private_list. metadata 1441 * buffers are managed internally. 1442 */ 1443 static int ext4_ordered_write_end(struct file *file, 1444 struct address_space *mapping, 1445 loff_t pos, unsigned len, unsigned copied, 1446 struct page *page, void *fsdata) 1447 { 1448 handle_t *handle = ext4_journal_current_handle(); 1449 struct inode *inode = mapping->host; 1450 int ret = 0, ret2; 1451 1452 trace_mark(ext4_ordered_write_end, 1453 "dev %s ino %lu pos %llu len %u copied %u", 1454 inode->i_sb->s_id, inode->i_ino, 1455 (unsigned long long) pos, len, copied); 1456 ret = ext4_jbd2_file_inode(handle, inode); 1457 1458 if (ret == 0) { 1459 loff_t new_i_size; 1460 1461 new_i_size = pos + copied; 1462 if (new_i_size > EXT4_I(inode)->i_disksize) { 1463 ext4_update_i_disksize(inode, new_i_size); 1464 /* We need to mark inode dirty even if 1465 * new_i_size is less that inode->i_size 1466 * bu greater than i_disksize.(hint delalloc) 1467 */ 1468 ext4_mark_inode_dirty(handle, inode); 1469 } 1470 1471 ret2 = generic_write_end(file, mapping, pos, len, copied, 1472 page, fsdata); 1473 copied = ret2; 1474 if (ret2 < 0) 1475 ret = ret2; 1476 } 1477 ret2 = ext4_journal_stop(handle); 1478 if (!ret) 1479 ret = ret2; 1480 1481 return ret ? 
ret : copied; 1482 } 1483 1484 static int ext4_writeback_write_end(struct file *file, 1485 struct address_space *mapping, 1486 loff_t pos, unsigned len, unsigned copied, 1487 struct page *page, void *fsdata) 1488 { 1489 handle_t *handle = ext4_journal_current_handle(); 1490 struct inode *inode = mapping->host; 1491 int ret = 0, ret2; 1492 loff_t new_i_size; 1493 1494 trace_mark(ext4_writeback_write_end, 1495 "dev %s ino %lu pos %llu len %u copied %u", 1496 inode->i_sb->s_id, inode->i_ino, 1497 (unsigned long long) pos, len, copied); 1498 new_i_size = pos + copied; 1499 if (new_i_size > EXT4_I(inode)->i_disksize) { 1500 ext4_update_i_disksize(inode, new_i_size); 1501 /* We need to mark inode dirty even if 1502 * new_i_size is less that inode->i_size 1503 * bu greater than i_disksize.(hint delalloc) 1504 */ 1505 ext4_mark_inode_dirty(handle, inode); 1506 } 1507 1508 ret2 = generic_write_end(file, mapping, pos, len, copied, 1509 page, fsdata); 1510 copied = ret2; 1511 if (ret2 < 0) 1512 ret = ret2; 1513 1514 ret2 = ext4_journal_stop(handle); 1515 if (!ret) 1516 ret = ret2; 1517 1518 return ret ? 
ret : copied; 1519 } 1520 1521 static int ext4_journalled_write_end(struct file *file, 1522 struct address_space *mapping, 1523 loff_t pos, unsigned len, unsigned copied, 1524 struct page *page, void *fsdata) 1525 { 1526 handle_t *handle = ext4_journal_current_handle(); 1527 struct inode *inode = mapping->host; 1528 int ret = 0, ret2; 1529 int partial = 0; 1530 unsigned from, to; 1531 loff_t new_i_size; 1532 1533 trace_mark(ext4_journalled_write_end, 1534 "dev %s ino %lu pos %llu len %u copied %u", 1535 inode->i_sb->s_id, inode->i_ino, 1536 (unsigned long long) pos, len, copied); 1537 from = pos & (PAGE_CACHE_SIZE - 1); 1538 to = from + len; 1539 1540 if (copied < len) { 1541 if (!PageUptodate(page)) 1542 copied = 0; 1543 page_zero_new_buffers(page, from+copied, to); 1544 } 1545 1546 ret = walk_page_buffers(handle, page_buffers(page), from, 1547 to, &partial, write_end_fn); 1548 if (!partial) 1549 SetPageUptodate(page); 1550 new_i_size = pos + copied; 1551 if (new_i_size > inode->i_size) 1552 i_size_write(inode, pos+copied); 1553 EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; 1554 if (new_i_size > EXT4_I(inode)->i_disksize) { 1555 ext4_update_i_disksize(inode, new_i_size); 1556 ret2 = ext4_mark_inode_dirty(handle, inode); 1557 if (!ret) 1558 ret = ret2; 1559 } 1560 1561 unlock_page(page); 1562 ret2 = ext4_journal_stop(handle); 1563 if (!ret) 1564 ret = ret2; 1565 page_cache_release(page); 1566 1567 return ret ? 
ret : copied; 1568 } 1569 1570 static int ext4_da_reserve_space(struct inode *inode, int nrblocks) 1571 { 1572 int retries = 0; 1573 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1574 unsigned long md_needed, mdblocks, total = 0; 1575 1576 /* 1577 * recalculate the amount of metadata blocks to reserve 1578 * in order to allocate nrblocks 1579 * worse case is one extent per block 1580 */ 1581 repeat: 1582 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 1583 total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks; 1584 mdblocks = ext4_calc_metadata_amount(inode, total); 1585 BUG_ON(mdblocks < EXT4_I(inode)->i_reserved_meta_blocks); 1586 1587 md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks; 1588 total = md_needed + nrblocks; 1589 1590 /* 1591 * Make quota reservation here to prevent quota overflow 1592 * later. Real quota accounting is done at pages writeout 1593 * time. 1594 */ 1595 if (vfs_dq_reserve_block(inode, total)) { 1596 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1597 return -EDQUOT; 1598 } 1599 1600 if (ext4_claim_free_blocks(sbi, total)) { 1601 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1602 if (ext4_should_retry_alloc(inode->i_sb, &retries)) { 1603 yield(); 1604 goto repeat; 1605 } 1606 vfs_dq_release_reservation_block(inode, total); 1607 return -ENOSPC; 1608 } 1609 EXT4_I(inode)->i_reserved_data_blocks += nrblocks; 1610 EXT4_I(inode)->i_reserved_meta_blocks = mdblocks; 1611 1612 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1613 return 0; /* success */ 1614 } 1615 1616 static void ext4_da_release_space(struct inode *inode, int to_free) 1617 { 1618 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1619 int total, mdb, mdb_free, release; 1620 1621 if (!to_free) 1622 return; /* Nothing to release, exit */ 1623 1624 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 1625 1626 if (!EXT4_I(inode)->i_reserved_data_blocks) { 1627 /* 1628 * if there is no reserved blocks, but we try to free some 1629 * then 
the counter is messed up somewhere. 1630 * but since this function is called from invalidate 1631 * page, it's harmless to return without any action 1632 */ 1633 printk(KERN_INFO "ext4 delalloc try to release %d reserved " 1634 "blocks for inode %lu, but there is no reserved " 1635 "data blocks\n", to_free, inode->i_ino); 1636 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1637 return; 1638 } 1639 1640 /* recalculate the number of metablocks still need to be reserved */ 1641 total = EXT4_I(inode)->i_reserved_data_blocks - to_free; 1642 mdb = ext4_calc_metadata_amount(inode, total); 1643 1644 /* figure out how many metablocks to release */ 1645 BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); 1646 mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb; 1647 1648 release = to_free + mdb_free; 1649 1650 /* update fs dirty blocks counter for truncate case */ 1651 percpu_counter_sub(&sbi->s_dirtyblocks_counter, release); 1652 1653 /* update per-inode reservations */ 1654 BUG_ON(to_free > EXT4_I(inode)->i_reserved_data_blocks); 1655 EXT4_I(inode)->i_reserved_data_blocks -= to_free; 1656 1657 BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); 1658 EXT4_I(inode)->i_reserved_meta_blocks = mdb; 1659 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1660 1661 vfs_dq_release_reservation_block(inode, release); 1662 } 1663 1664 static void ext4_da_page_release_reservation(struct page *page, 1665 unsigned long offset) 1666 { 1667 int to_release = 0; 1668 struct buffer_head *head, *bh; 1669 unsigned int curr_off = 0; 1670 1671 head = page_buffers(page); 1672 bh = head; 1673 do { 1674 unsigned int next_off = curr_off + bh->b_size; 1675 1676 if ((offset <= curr_off) && (buffer_delay(bh))) { 1677 to_release++; 1678 clear_buffer_delay(bh); 1679 } 1680 curr_off = next_off; 1681 } while ((bh = bh->b_this_page) != head); 1682 ext4_da_release_space(page->mapping->host, to_release); 1683 } 1684 1685 /* 1686 * Delayed allocation stuff 1687 */ 1688 1689 struct 
mpage_da_data { 1690 struct inode *inode; 1691 struct buffer_head lbh; /* extent of blocks */ 1692 unsigned long first_page, next_page; /* extent of pages */ 1693 get_block_t *get_block; 1694 struct writeback_control *wbc; 1695 int io_done; 1696 int pages_written; 1697 int retval; 1698 }; 1699 1700 /* 1701 * mpage_da_submit_io - walks through extent of pages and try to write 1702 * them with writepage() call back 1703 * 1704 * @mpd->inode: inode 1705 * @mpd->first_page: first page of the extent 1706 * @mpd->next_page: page after the last page of the extent 1707 * @mpd->get_block: the filesystem's block mapper function 1708 * 1709 * By the time mpage_da_submit_io() is called we expect all blocks 1710 * to be allocated. this may be wrong if allocation failed. 1711 * 1712 * As pages are already locked by write_cache_pages(), we can't use it 1713 */ 1714 static int mpage_da_submit_io(struct mpage_da_data *mpd) 1715 { 1716 long pages_skipped; 1717 struct pagevec pvec; 1718 unsigned long index, end; 1719 int ret = 0, err, nr_pages, i; 1720 struct inode *inode = mpd->inode; 1721 struct address_space *mapping = inode->i_mapping; 1722 1723 BUG_ON(mpd->next_page <= mpd->first_page); 1724 /* 1725 * We need to start from the first_page to the next_page - 1 1726 * to make sure we also write the mapped dirty buffer_heads. 1727 * If we look at mpd->lbh.b_blocknr we would only be looking 1728 * at the currently mapped buffer_heads. 
1729 */ 1730 index = mpd->first_page; 1731 end = mpd->next_page - 1; 1732 1733 pagevec_init(&pvec, 0); 1734 while (index <= end) { 1735 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); 1736 if (nr_pages == 0) 1737 break; 1738 for (i = 0; i < nr_pages; i++) { 1739 struct page *page = pvec.pages[i]; 1740 1741 index = page->index; 1742 if (index > end) 1743 break; 1744 index++; 1745 1746 BUG_ON(!PageLocked(page)); 1747 BUG_ON(PageWriteback(page)); 1748 1749 pages_skipped = mpd->wbc->pages_skipped; 1750 err = mapping->a_ops->writepage(page, mpd->wbc); 1751 if (!err && (pages_skipped == mpd->wbc->pages_skipped)) 1752 /* 1753 * have successfully written the page 1754 * without skipping the same 1755 */ 1756 mpd->pages_written++; 1757 /* 1758 * In error case, we have to continue because 1759 * remaining pages are still locked 1760 * XXX: unlock and re-dirty them? 1761 */ 1762 if (ret == 0) 1763 ret = err; 1764 } 1765 pagevec_release(&pvec); 1766 } 1767 return ret; 1768 } 1769 1770 /* 1771 * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers 1772 * 1773 * @mpd->inode - inode to walk through 1774 * @exbh->b_blocknr - first block on a disk 1775 * @exbh->b_size - amount of space in bytes 1776 * @logical - first logical block to start assignment with 1777 * 1778 * the function goes through all passed space and put actual disk 1779 * block numbers into buffer heads, dropping BH_Delay 1780 */ 1781 static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical, 1782 struct buffer_head *exbh) 1783 { 1784 struct inode *inode = mpd->inode; 1785 struct address_space *mapping = inode->i_mapping; 1786 int blocks = exbh->b_size >> inode->i_blkbits; 1787 sector_t pblock = exbh->b_blocknr, cur_logical; 1788 struct buffer_head *head, *bh; 1789 pgoff_t index, end; 1790 struct pagevec pvec; 1791 int nr_pages, i; 1792 1793 index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits); 1794 end = (logical + blocks - 1) >> (PAGE_CACHE_SHIFT - 
inode->i_blkbits); 1795 cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); 1796 1797 pagevec_init(&pvec, 0); 1798 1799 while (index <= end) { 1800 /* XXX: optimize tail */ 1801 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); 1802 if (nr_pages == 0) 1803 break; 1804 for (i = 0; i < nr_pages; i++) { 1805 struct page *page = pvec.pages[i]; 1806 1807 index = page->index; 1808 if (index > end) 1809 break; 1810 index++; 1811 1812 BUG_ON(!PageLocked(page)); 1813 BUG_ON(PageWriteback(page)); 1814 BUG_ON(!page_has_buffers(page)); 1815 1816 bh = page_buffers(page); 1817 head = bh; 1818 1819 /* skip blocks out of the range */ 1820 do { 1821 if (cur_logical >= logical) 1822 break; 1823 cur_logical++; 1824 } while ((bh = bh->b_this_page) != head); 1825 1826 do { 1827 if (cur_logical >= logical + blocks) 1828 break; 1829 if (buffer_delay(bh)) { 1830 bh->b_blocknr = pblock; 1831 clear_buffer_delay(bh); 1832 bh->b_bdev = inode->i_sb->s_bdev; 1833 } else if (buffer_unwritten(bh)) { 1834 bh->b_blocknr = pblock; 1835 clear_buffer_unwritten(bh); 1836 set_buffer_mapped(bh); 1837 set_buffer_new(bh); 1838 bh->b_bdev = inode->i_sb->s_bdev; 1839 } else if (buffer_mapped(bh)) 1840 BUG_ON(bh->b_blocknr != pblock); 1841 1842 cur_logical++; 1843 pblock++; 1844 } while ((bh = bh->b_this_page) != head); 1845 } 1846 pagevec_release(&pvec); 1847 } 1848 } 1849 1850 1851 /* 1852 * __unmap_underlying_blocks - just a helper function to unmap 1853 * set of blocks described by @bh 1854 */ 1855 static inline void __unmap_underlying_blocks(struct inode *inode, 1856 struct buffer_head *bh) 1857 { 1858 struct block_device *bdev = inode->i_sb->s_bdev; 1859 int blocks, i; 1860 1861 blocks = bh->b_size >> inode->i_blkbits; 1862 for (i = 0; i < blocks; i++) 1863 unmap_underlying_metadata(bdev, bh->b_blocknr + i); 1864 } 1865 1866 static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd, 1867 sector_t logical, long blk_cnt) 1868 { 1869 int nr_pages, i; 1870 pgoff_t index, 
end; 1871 struct pagevec pvec; 1872 struct inode *inode = mpd->inode; 1873 struct address_space *mapping = inode->i_mapping; 1874 1875 index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits); 1876 end = (logical + blk_cnt - 1) >> 1877 (PAGE_CACHE_SHIFT - inode->i_blkbits); 1878 while (index <= end) { 1879 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); 1880 if (nr_pages == 0) 1881 break; 1882 for (i = 0; i < nr_pages; i++) { 1883 struct page *page = pvec.pages[i]; 1884 index = page->index; 1885 if (index > end) 1886 break; 1887 index++; 1888 1889 BUG_ON(!PageLocked(page)); 1890 BUG_ON(PageWriteback(page)); 1891 block_invalidatepage(page, 0); 1892 ClearPageUptodate(page); 1893 unlock_page(page); 1894 } 1895 } 1896 return; 1897 } 1898 1899 static void ext4_print_free_blocks(struct inode *inode) 1900 { 1901 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1902 printk(KERN_EMERG "Total free blocks count %lld\n", 1903 ext4_count_free_blocks(inode->i_sb)); 1904 printk(KERN_EMERG "Free/Dirty block details\n"); 1905 printk(KERN_EMERG "free_blocks=%lld\n", 1906 (long long)percpu_counter_sum(&sbi->s_freeblocks_counter)); 1907 printk(KERN_EMERG "dirty_blocks=%lld\n", 1908 (long long)percpu_counter_sum(&sbi->s_dirtyblocks_counter)); 1909 printk(KERN_EMERG "Block reservation details\n"); 1910 printk(KERN_EMERG "i_reserved_data_blocks=%u\n", 1911 EXT4_I(inode)->i_reserved_data_blocks); 1912 printk(KERN_EMERG "i_reserved_meta_blocks=%u\n", 1913 EXT4_I(inode)->i_reserved_meta_blocks); 1914 return; 1915 } 1916 1917 /* 1918 * mpage_da_map_blocks - go through given space 1919 * 1920 * @mpd->lbh - bh describing space 1921 * @mpd->get_block - the filesystem's block mapper function 1922 * 1923 * The function skips space we know is already mapped to disk blocks. 
1924 * 1925 */ 1926 static int mpage_da_map_blocks(struct mpage_da_data *mpd) 1927 { 1928 int err = 0; 1929 struct buffer_head new; 1930 struct buffer_head *lbh = &mpd->lbh; 1931 sector_t next; 1932 1933 /* 1934 * We consider only non-mapped and non-allocated blocks 1935 */ 1936 if (buffer_mapped(lbh) && !buffer_delay(lbh)) 1937 return 0; 1938 new.b_state = lbh->b_state; 1939 new.b_blocknr = 0; 1940 new.b_size = lbh->b_size; 1941 next = lbh->b_blocknr; 1942 /* 1943 * If we didn't accumulate anything 1944 * to write simply return 1945 */ 1946 if (!new.b_size) 1947 return 0; 1948 err = mpd->get_block(mpd->inode, next, &new, 1); 1949 if (err) { 1950 1951 /* If get block returns with error 1952 * we simply return. Later writepage 1953 * will redirty the page and writepages 1954 * will find the dirty page again 1955 */ 1956 if (err == -EAGAIN) 1957 return 0; 1958 1959 if (err == -ENOSPC && 1960 ext4_count_free_blocks(mpd->inode->i_sb)) { 1961 mpd->retval = err; 1962 return 0; 1963 } 1964 1965 /* 1966 * get block failure will cause us 1967 * to loop in writepages. Because 1968 * a_ops->writepage won't be able to 1969 * make progress. The page will be redirtied 1970 * by writepage and writepages will again 1971 * try to write the same. 1972 */ 1973 printk(KERN_EMERG "%s block allocation failed for inode %lu " 1974 "at logical offset %llu with max blocks " 1975 "%zd with error %d\n", 1976 __func__, mpd->inode->i_ino, 1977 (unsigned long long)next, 1978 lbh->b_size >> mpd->inode->i_blkbits, err); 1979 printk(KERN_EMERG "This should not happen.!! 
" 1980 "Data will be lost\n"); 1981 if (err == -ENOSPC) { 1982 ext4_print_free_blocks(mpd->inode); 1983 } 1984 /* invlaidate all the pages */ 1985 ext4_da_block_invalidatepages(mpd, next, 1986 lbh->b_size >> mpd->inode->i_blkbits); 1987 return err; 1988 } 1989 BUG_ON(new.b_size == 0); 1990 1991 if (buffer_new(&new)) 1992 __unmap_underlying_blocks(mpd->inode, &new); 1993 1994 /* 1995 * If blocks are delayed marked, we need to 1996 * put actual blocknr and drop delayed bit 1997 */ 1998 if (buffer_delay(lbh) || buffer_unwritten(lbh)) 1999 mpage_put_bnr_to_bhs(mpd, next, &new); 2000 2001 return 0; 2002 } 2003 2004 #define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \ 2005 (1 << BH_Delay) | (1 << BH_Unwritten)) 2006 2007 /* 2008 * mpage_add_bh_to_extent - try to add one more block to extent of blocks 2009 * 2010 * @mpd->lbh - extent of blocks 2011 * @logical - logical number of the block in the file 2012 * @bh - bh of the block (used to access block's state) 2013 * 2014 * the function is used to collect contig. blocks in same state 2015 */ 2016 static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, 2017 sector_t logical, struct buffer_head *bh) 2018 { 2019 sector_t next; 2020 size_t b_size = bh->b_size; 2021 struct buffer_head *lbh = &mpd->lbh; 2022 int nrblocks = lbh->b_size >> mpd->inode->i_blkbits; 2023 2024 /* check if thereserved journal credits might overflow */ 2025 if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) { 2026 if (nrblocks >= EXT4_MAX_TRANS_DATA) { 2027 /* 2028 * With non-extent format we are limited by the journal 2029 * credit available. Total credit needed to insert 2030 * nrblocks contiguous blocks is dependent on the 2031 * nrblocks. So limit nrblocks. 2032 */ 2033 goto flush_it; 2034 } else if ((nrblocks + (b_size >> mpd->inode->i_blkbits)) > 2035 EXT4_MAX_TRANS_DATA) { 2036 /* 2037 * Adding the new buffer_head would make it cross the 2038 * allowed limit for which we have journal credit 2039 * reserved. 
So limit the new bh->b_size 2040 */ 2041 b_size = (EXT4_MAX_TRANS_DATA - nrblocks) << 2042 mpd->inode->i_blkbits; 2043 /* we will do mpage_da_submit_io in the next loop */ 2044 } 2045 } 2046 /* 2047 * First block in the extent 2048 */ 2049 if (lbh->b_size == 0) { 2050 lbh->b_blocknr = logical; 2051 lbh->b_size = b_size; 2052 lbh->b_state = bh->b_state & BH_FLAGS; 2053 return; 2054 } 2055 2056 next = lbh->b_blocknr + nrblocks; 2057 /* 2058 * Can we merge the block to our big extent? 2059 */ 2060 if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) { 2061 lbh->b_size += b_size; 2062 return; 2063 } 2064 2065 flush_it: 2066 /* 2067 * We couldn't merge the block to our extent, so we 2068 * need to flush current extent and start new one 2069 */ 2070 if (mpage_da_map_blocks(mpd) == 0) 2071 mpage_da_submit_io(mpd); 2072 mpd->io_done = 1; 2073 return; 2074 } 2075 2076 /* 2077 * __mpage_da_writepage - finds extent of pages and blocks 2078 * 2079 * @page: page to consider 2080 * @wbc: not used, we just follow rules 2081 * @data: context 2082 * 2083 * The function finds extents of pages and scan them for all blocks. 2084 */ 2085 static int __mpage_da_writepage(struct page *page, 2086 struct writeback_control *wbc, void *data) 2087 { 2088 struct mpage_da_data *mpd = data; 2089 struct inode *inode = mpd->inode; 2090 struct buffer_head *bh, *head, fake; 2091 sector_t logical; 2092 2093 if (mpd->io_done) { 2094 /* 2095 * Rest of the page in the page_vec 2096 * redirty then and skip then. We will 2097 * try to to write them again after 2098 * starting a new transaction 2099 */ 2100 redirty_page_for_writepage(wbc, page); 2101 unlock_page(page); 2102 return MPAGE_DA_EXTENT_TAIL; 2103 } 2104 /* 2105 * Can we merge this page to current extent? 2106 */ 2107 if (mpd->next_page != page->index) { 2108 /* 2109 * Nope, we can't. 
So, we map non-allocated blocks 2110 * and start IO on them using writepage() 2111 */ 2112 if (mpd->next_page != mpd->first_page) { 2113 if (mpage_da_map_blocks(mpd) == 0) 2114 mpage_da_submit_io(mpd); 2115 /* 2116 * skip rest of the page in the page_vec 2117 */ 2118 mpd->io_done = 1; 2119 redirty_page_for_writepage(wbc, page); 2120 unlock_page(page); 2121 return MPAGE_DA_EXTENT_TAIL; 2122 } 2123 2124 /* 2125 * Start next extent of pages ... 2126 */ 2127 mpd->first_page = page->index; 2128 2129 /* 2130 * ... and blocks 2131 */ 2132 mpd->lbh.b_size = 0; 2133 mpd->lbh.b_state = 0; 2134 mpd->lbh.b_blocknr = 0; 2135 } 2136 2137 mpd->next_page = page->index + 1; 2138 logical = (sector_t) page->index << 2139 (PAGE_CACHE_SHIFT - inode->i_blkbits); 2140 2141 if (!page_has_buffers(page)) { 2142 /* 2143 * There is no attached buffer heads yet (mmap?) 2144 * we treat the page asfull of dirty blocks 2145 */ 2146 bh = &fake; 2147 bh->b_size = PAGE_CACHE_SIZE; 2148 bh->b_state = 0; 2149 set_buffer_dirty(bh); 2150 set_buffer_uptodate(bh); 2151 mpage_add_bh_to_extent(mpd, logical, bh); 2152 if (mpd->io_done) 2153 return MPAGE_DA_EXTENT_TAIL; 2154 } else { 2155 /* 2156 * Page with regular buffer heads, just add all dirty ones 2157 */ 2158 head = page_buffers(page); 2159 bh = head; 2160 do { 2161 BUG_ON(buffer_locked(bh)); 2162 /* 2163 * We need to try to allocate 2164 * unmapped blocks in the same page. 2165 * Otherwise we won't make progress 2166 * with the page in ext4_da_writepage 2167 */ 2168 if (buffer_dirty(bh) && 2169 (!buffer_mapped(bh) || buffer_delay(bh))) { 2170 mpage_add_bh_to_extent(mpd, logical, bh); 2171 if (mpd->io_done) 2172 return MPAGE_DA_EXTENT_TAIL; 2173 } else if (buffer_dirty(bh) && (buffer_mapped(bh))) { 2174 /* 2175 * mapped dirty buffer. We need to update 2176 * the b_state because we look at 2177 * b_state in mpage_da_map_blocks. 
We don't 2178 * update b_size because if we find an 2179 * unmapped buffer_head later we need to 2180 * use the b_state flag of that buffer_head. 2181 */ 2182 if (mpd->lbh.b_size == 0) 2183 mpd->lbh.b_state = 2184 bh->b_state & BH_FLAGS; 2185 } 2186 logical++; 2187 } while ((bh = bh->b_this_page) != head); 2188 } 2189 2190 return 0; 2191 } 2192 2193 /* 2194 * mpage_da_writepages - walk the list of dirty pages of the given 2195 * address space, allocates non-allocated blocks, maps newly-allocated 2196 * blocks to existing bhs and issue IO them 2197 * 2198 * @mapping: address space structure to write 2199 * @wbc: subtract the number of written pages from *@wbc->nr_to_write 2200 * @get_block: the filesystem's block mapper function. 2201 * 2202 * This is a library function, which implements the writepages() 2203 * address_space_operation. 2204 */ 2205 static int mpage_da_writepages(struct address_space *mapping, 2206 struct writeback_control *wbc, 2207 struct mpage_da_data *mpd) 2208 { 2209 int ret; 2210 2211 if (!mpd->get_block) 2212 return generic_writepages(mapping, wbc); 2213 2214 mpd->lbh.b_size = 0; 2215 mpd->lbh.b_state = 0; 2216 mpd->lbh.b_blocknr = 0; 2217 mpd->first_page = 0; 2218 mpd->next_page = 0; 2219 mpd->io_done = 0; 2220 mpd->pages_written = 0; 2221 mpd->retval = 0; 2222 2223 ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, mpd); 2224 /* 2225 * Handle last extent of pages 2226 */ 2227 if (!mpd->io_done && mpd->next_page != mpd->first_page) { 2228 if (mpage_da_map_blocks(mpd) == 0) 2229 mpage_da_submit_io(mpd); 2230 2231 mpd->io_done = 1; 2232 ret = MPAGE_DA_EXTENT_TAIL; 2233 } 2234 wbc->nr_to_write -= mpd->pages_written; 2235 return ret; 2236 } 2237 2238 /* 2239 * this is a special callback for ->write_begin() only 2240 * it's intention is to return mapped block or reserve space 2241 */ 2242 static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, 2243 struct buffer_head *bh_result, int create) 2244 { 2245 int ret = 0; 2246 
2247 BUG_ON(create == 0); 2248 BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize); 2249 2250 /* 2251 * first, we need to know whether the block is allocated already 2252 * preallocated blocks are unmapped but should treated 2253 * the same as allocated blocks. 2254 */ 2255 ret = ext4_get_blocks_wrap(NULL, inode, iblock, 1, bh_result, 0, 0, 0); 2256 if ((ret == 0) && !buffer_delay(bh_result)) { 2257 /* the block isn't (pre)allocated yet, let's reserve space */ 2258 /* 2259 * XXX: __block_prepare_write() unmaps passed block, 2260 * is it OK? 2261 */ 2262 ret = ext4_da_reserve_space(inode, 1); 2263 if (ret) 2264 /* not enough space to reserve */ 2265 return ret; 2266 2267 map_bh(bh_result, inode->i_sb, 0); 2268 set_buffer_new(bh_result); 2269 set_buffer_delay(bh_result); 2270 } else if (ret > 0) { 2271 bh_result->b_size = (ret << inode->i_blkbits); 2272 ret = 0; 2273 } 2274 2275 return ret; 2276 } 2277 #define EXT4_DELALLOC_RSVED 1 2278 static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, 2279 struct buffer_head *bh_result, int create) 2280 { 2281 int ret; 2282 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; 2283 loff_t disksize = EXT4_I(inode)->i_disksize; 2284 handle_t *handle = NULL; 2285 2286 handle = ext4_journal_current_handle(); 2287 BUG_ON(!handle); 2288 ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks, 2289 bh_result, create, 0, EXT4_DELALLOC_RSVED); 2290 if (ret > 0) { 2291 2292 bh_result->b_size = (ret << inode->i_blkbits); 2293 2294 if (ext4_should_order_data(inode)) { 2295 int retval; 2296 retval = ext4_jbd2_file_inode(handle, inode); 2297 if (retval) 2298 /* 2299 * Failed to add inode for ordered 2300 * mode. 
Don't update file size 2301 */ 2302 return retval; 2303 } 2304 2305 /* 2306 * Update on-disk size along with block allocation 2307 * we don't use 'extend_disksize' as size may change 2308 * within already allocated block -bzzz 2309 */ 2310 disksize = ((loff_t) iblock + ret) << inode->i_blkbits; 2311 if (disksize > i_size_read(inode)) 2312 disksize = i_size_read(inode); 2313 if (disksize > EXT4_I(inode)->i_disksize) { 2314 ext4_update_i_disksize(inode, disksize); 2315 ret = ext4_mark_inode_dirty(handle, inode); 2316 return ret; 2317 } 2318 ret = 0; 2319 } 2320 return ret; 2321 } 2322 2323 static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh) 2324 { 2325 /* 2326 * unmapped buffer is possible for holes. 2327 * delay buffer is possible with delayed allocation 2328 */ 2329 return ((!buffer_mapped(bh) || buffer_delay(bh)) && buffer_dirty(bh)); 2330 } 2331 2332 static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock, 2333 struct buffer_head *bh_result, int create) 2334 { 2335 int ret = 0; 2336 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; 2337 2338 /* 2339 * we don't want to do block allocation in writepage 2340 * so call get_block_wrap with create = 0 2341 */ 2342 ret = ext4_get_blocks_wrap(NULL, inode, iblock, max_blocks, 2343 bh_result, 0, 0, 0); 2344 if (ret > 0) { 2345 bh_result->b_size = (ret << inode->i_blkbits); 2346 ret = 0; 2347 } 2348 return ret; 2349 } 2350 2351 /* 2352 * get called vi ext4_da_writepages after taking page lock (have journal handle) 2353 * get called via journal_submit_inode_data_buffers (no journal handle) 2354 * get called via shrink_page_list via pdflush (no journal handle) 2355 * or grab_page_cache when doing write_begin (have journal handle) 2356 */ 2357 static int ext4_da_writepage(struct page *page, 2358 struct writeback_control *wbc) 2359 { 2360 int ret = 0; 2361 loff_t size; 2362 unsigned int len; 2363 struct buffer_head *page_bufs; 2364 struct inode *inode = 
page->mapping->host; 2365 2366 trace_mark(ext4_da_writepage, 2367 "dev %s ino %lu page_index %lu", 2368 inode->i_sb->s_id, inode->i_ino, page->index); 2369 size = i_size_read(inode); 2370 if (page->index == size >> PAGE_CACHE_SHIFT) 2371 len = size & ~PAGE_CACHE_MASK; 2372 else 2373 len = PAGE_CACHE_SIZE; 2374 2375 if (page_has_buffers(page)) { 2376 page_bufs = page_buffers(page); 2377 if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, 2378 ext4_bh_unmapped_or_delay)) { 2379 /* 2380 * We don't want to do block allocation 2381 * So redirty the page and return 2382 * We may reach here when we do a journal commit 2383 * via journal_submit_inode_data_buffers. 2384 * If we don't have mapping block we just ignore 2385 * them. We can also reach here via shrink_page_list 2386 */ 2387 redirty_page_for_writepage(wbc, page); 2388 unlock_page(page); 2389 return 0; 2390 } 2391 } else { 2392 /* 2393 * The test for page_has_buffers() is subtle: 2394 * We know the page is dirty but it lost buffers. That means 2395 * that at some moment in time after write_begin()/write_end() 2396 * has been called all buffers have been clean and thus they 2397 * must have been written at least once. So they are all 2398 * mapped and we can happily proceed with mapping them 2399 * and writing the page. 2400 * 2401 * Try to initialize the buffer_heads and check whether 2402 * all are mapped and non delay. We don't want to 2403 * do block allocation here. 
2404 */ 2405 ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, 2406 ext4_normal_get_block_write); 2407 if (!ret) { 2408 page_bufs = page_buffers(page); 2409 /* check whether all are mapped and non delay */ 2410 if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, 2411 ext4_bh_unmapped_or_delay)) { 2412 redirty_page_for_writepage(wbc, page); 2413 unlock_page(page); 2414 return 0; 2415 } 2416 } else { 2417 /* 2418 * We can't do block allocation here 2419 * so just redity the page and unlock 2420 * and return 2421 */ 2422 redirty_page_for_writepage(wbc, page); 2423 unlock_page(page); 2424 return 0; 2425 } 2426 /* now mark the buffer_heads as dirty and uptodate */ 2427 block_commit_write(page, 0, PAGE_CACHE_SIZE); 2428 } 2429 2430 if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) 2431 ret = nobh_writepage(page, ext4_normal_get_block_write, wbc); 2432 else 2433 ret = block_write_full_page(page, 2434 ext4_normal_get_block_write, 2435 wbc); 2436 2437 return ret; 2438 } 2439 2440 /* 2441 * This is called via ext4_da_writepages() to 2442 * calulate the total number of credits to reserve to fit 2443 * a single extent allocation into a single transaction, 2444 * ext4_da_writpeages() will loop calling this before 2445 * the block allocation. 2446 */ 2447 2448 static int ext4_da_writepages_trans_blocks(struct inode *inode) 2449 { 2450 int max_blocks = EXT4_I(inode)->i_reserved_data_blocks; 2451 2452 /* 2453 * With non-extent format the journal credit needed to 2454 * insert nrblocks contiguous block is dependent on 2455 * number of contiguous block. 
So we will limit 2456 * number of contiguous block to a sane value 2457 */ 2458 if (!(inode->i_flags & EXT4_EXTENTS_FL) && 2459 (max_blocks > EXT4_MAX_TRANS_DATA)) 2460 max_blocks = EXT4_MAX_TRANS_DATA; 2461 2462 return ext4_chunk_trans_blocks(inode, max_blocks); 2463 } 2464 2465 static int ext4_da_writepages(struct address_space *mapping, 2466 struct writeback_control *wbc) 2467 { 2468 pgoff_t index; 2469 int range_whole = 0; 2470 handle_t *handle = NULL; 2471 struct mpage_da_data mpd; 2472 struct inode *inode = mapping->host; 2473 int no_nrwrite_index_update; 2474 int pages_written = 0; 2475 long pages_skipped; 2476 int range_cyclic, cycled = 1, io_done = 0; 2477 int needed_blocks, ret = 0, nr_to_writebump = 0; 2478 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); 2479 2480 trace_mark(ext4_da_writepages, 2481 "dev %s ino %lu nr_t_write %ld " 2482 "pages_skipped %ld range_start %llu " 2483 "range_end %llu nonblocking %d " 2484 "for_kupdate %d for_reclaim %d " 2485 "for_writepages %d range_cyclic %d", 2486 inode->i_sb->s_id, inode->i_ino, 2487 wbc->nr_to_write, wbc->pages_skipped, 2488 (unsigned long long) wbc->range_start, 2489 (unsigned long long) wbc->range_end, 2490 wbc->nonblocking, wbc->for_kupdate, 2491 wbc->for_reclaim, wbc->for_writepages, 2492 wbc->range_cyclic); 2493 2494 /* 2495 * No pages to write? This is mainly a kludge to avoid starting 2496 * a transaction for special inodes like journal inode on last iput() 2497 * because that could violate lock ordering on umount 2498 */ 2499 if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) 2500 return 0; 2501 2502 /* 2503 * If the filesystem has aborted, it is read-only, so return 2504 * right away instead of dumping stack traces later on that 2505 * will obscure the real source of the problem. 
We test 2506 * EXT4_MOUNT_ABORT instead of sb->s_flag's MS_RDONLY because 2507 * the latter could be true if the filesystem is mounted 2508 * read-only, and in that case, ext4_da_writepages should 2509 * *never* be called, so if that ever happens, we would want 2510 * the stack trace. 2511 */ 2512 if (unlikely(sbi->s_mount_opt & EXT4_MOUNT_ABORT)) 2513 return -EROFS; 2514 2515 /* 2516 * Make sure nr_to_write is >= sbi->s_mb_stream_request 2517 * This make sure small files blocks are allocated in 2518 * single attempt. This ensure that small files 2519 * get less fragmented. 2520 */ 2521 if (wbc->nr_to_write < sbi->s_mb_stream_request) { 2522 nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write; 2523 wbc->nr_to_write = sbi->s_mb_stream_request; 2524 } 2525 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) 2526 range_whole = 1; 2527 2528 range_cyclic = wbc->range_cyclic; 2529 if (wbc->range_cyclic) { 2530 index = mapping->writeback_index; 2531 if (index) 2532 cycled = 0; 2533 wbc->range_start = index << PAGE_CACHE_SHIFT; 2534 wbc->range_end = LLONG_MAX; 2535 wbc->range_cyclic = 0; 2536 } else 2537 index = wbc->range_start >> PAGE_CACHE_SHIFT; 2538 2539 mpd.wbc = wbc; 2540 mpd.inode = mapping->host; 2541 2542 /* 2543 * we don't want write_cache_pages to update 2544 * nr_to_write and writeback_index 2545 */ 2546 no_nrwrite_index_update = wbc->no_nrwrite_index_update; 2547 wbc->no_nrwrite_index_update = 1; 2548 pages_skipped = wbc->pages_skipped; 2549 2550 retry: 2551 while (!ret && wbc->nr_to_write > 0) { 2552 2553 /* 2554 * we insert one extent at a time. So we need 2555 * credit needed for single extent allocation. 
2556 * journalled mode is currently not supported 2557 * by delalloc 2558 */ 2559 BUG_ON(ext4_should_journal_data(inode)); 2560 needed_blocks = ext4_da_writepages_trans_blocks(inode); 2561 2562 /* start a new transaction*/ 2563 handle = ext4_journal_start(inode, needed_blocks); 2564 if (IS_ERR(handle)) { 2565 ret = PTR_ERR(handle); 2566 printk(KERN_CRIT "%s: jbd2_start: " 2567 "%ld pages, ino %lu; err %d\n", __func__, 2568 wbc->nr_to_write, inode->i_ino, ret); 2569 dump_stack(); 2570 goto out_writepages; 2571 } 2572 mpd.get_block = ext4_da_get_block_write; 2573 ret = mpage_da_writepages(mapping, wbc, &mpd); 2574 2575 ext4_journal_stop(handle); 2576 2577 if ((mpd.retval == -ENOSPC) && sbi->s_journal) { 2578 /* commit the transaction which would 2579 * free blocks released in the transaction 2580 * and try again 2581 */ 2582 jbd2_journal_force_commit_nested(sbi->s_journal); 2583 wbc->pages_skipped = pages_skipped; 2584 ret = 0; 2585 } else if (ret == MPAGE_DA_EXTENT_TAIL) { 2586 /* 2587 * got one extent now try with 2588 * rest of the pages 2589 */ 2590 pages_written += mpd.pages_written; 2591 wbc->pages_skipped = pages_skipped; 2592 ret = 0; 2593 io_done = 1; 2594 } else if (wbc->nr_to_write) 2595 /* 2596 * There is no more writeout needed 2597 * or we requested for a noblocking writeout 2598 * and we found the device congested 2599 */ 2600 break; 2601 } 2602 if (!io_done && !cycled) { 2603 cycled = 1; 2604 index = 0; 2605 wbc->range_start = index << PAGE_CACHE_SHIFT; 2606 wbc->range_end = mapping->writeback_index - 1; 2607 goto retry; 2608 } 2609 if (pages_skipped != wbc->pages_skipped) 2610 printk(KERN_EMERG "This should not happen leaving %s " 2611 "with nr_to_write = %ld ret = %d\n", 2612 __func__, wbc->nr_to_write, ret); 2613 2614 /* Update index */ 2615 index += pages_written; 2616 wbc->range_cyclic = range_cyclic; 2617 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) 2618 /* 2619 * set the writeback_index so that range_cyclic 2620 * mode will 
write it back later 2621 */ 2622 mapping->writeback_index = index; 2623 2624 out_writepages: 2625 if (!no_nrwrite_index_update) 2626 wbc->no_nrwrite_index_update = 0; 2627 wbc->nr_to_write -= nr_to_writebump; 2628 trace_mark(ext4_da_writepage_result, 2629 "dev %s ino %lu ret %d pages_written %d " 2630 "pages_skipped %ld congestion %d " 2631 "more_io %d no_nrwrite_index_update %d", 2632 inode->i_sb->s_id, inode->i_ino, ret, 2633 pages_written, wbc->pages_skipped, 2634 wbc->encountered_congestion, wbc->more_io, 2635 wbc->no_nrwrite_index_update); 2636 return ret; 2637 } 2638 2639 #define FALL_BACK_TO_NONDELALLOC 1 2640 static int ext4_nonda_switch(struct super_block *sb) 2641 { 2642 s64 free_blocks, dirty_blocks; 2643 struct ext4_sb_info *sbi = EXT4_SB(sb); 2644 2645 /* 2646 * switch to non delalloc mode if we are running low 2647 * on free block. The free block accounting via percpu 2648 * counters can get slightly wrong with percpu_counter_batch getting 2649 * accumulated on each CPU without updating global counters 2650 * Delalloc need an accurate free block accounting. So switch 2651 * to non delalloc when we are near to error range. 
2652 */ 2653 free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); 2654 dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyblocks_counter); 2655 if (2 * free_blocks < 3 * dirty_blocks || 2656 free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) { 2657 /* 2658 * free block count is less that 150% of dirty blocks 2659 * or free blocks is less that watermark 2660 */ 2661 return 1; 2662 } 2663 return 0; 2664 } 2665 2666 static int ext4_da_write_begin(struct file *file, struct address_space *mapping, 2667 loff_t pos, unsigned len, unsigned flags, 2668 struct page **pagep, void **fsdata) 2669 { 2670 int ret, retries = 0; 2671 struct page *page; 2672 pgoff_t index; 2673 unsigned from, to; 2674 struct inode *inode = mapping->host; 2675 handle_t *handle; 2676 2677 index = pos >> PAGE_CACHE_SHIFT; 2678 from = pos & (PAGE_CACHE_SIZE - 1); 2679 to = from + len; 2680 2681 if (ext4_nonda_switch(inode->i_sb)) { 2682 *fsdata = (void *)FALL_BACK_TO_NONDELALLOC; 2683 return ext4_write_begin(file, mapping, pos, 2684 len, flags, pagep, fsdata); 2685 } 2686 *fsdata = (void *)0; 2687 2688 trace_mark(ext4_da_write_begin, 2689 "dev %s ino %lu pos %llu len %u flags %u", 2690 inode->i_sb->s_id, inode->i_ino, 2691 (unsigned long long) pos, len, flags); 2692 retry: 2693 /* 2694 * With delayed allocation, we don't log the i_disksize update 2695 * if there is delayed block allocation. But we still need 2696 * to journalling the i_disksize update if writes to the end 2697 * of file which has an already mapped buffer. 
2698 */ 2699 handle = ext4_journal_start(inode, 1); 2700 if (IS_ERR(handle)) { 2701 ret = PTR_ERR(handle); 2702 goto out; 2703 } 2704 /* We cannot recurse into the filesystem as the transaction is already 2705 * started */ 2706 flags |= AOP_FLAG_NOFS; 2707 2708 page = grab_cache_page_write_begin(mapping, index, flags); 2709 if (!page) { 2710 ext4_journal_stop(handle); 2711 ret = -ENOMEM; 2712 goto out; 2713 } 2714 *pagep = page; 2715 2716 ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 2717 ext4_da_get_block_prep); 2718 if (ret < 0) { 2719 unlock_page(page); 2720 ext4_journal_stop(handle); 2721 page_cache_release(page); 2722 /* 2723 * block_write_begin may have instantiated a few blocks 2724 * outside i_size. Trim these off again. Don't need 2725 * i_size_read because we hold i_mutex. 2726 */ 2727 if (pos + len > inode->i_size) 2728 vmtruncate(inode, inode->i_size); 2729 } 2730 2731 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 2732 goto retry; 2733 out: 2734 return ret; 2735 } 2736 2737 /* 2738 * Check if we should update i_disksize 2739 * when write to the end of file but not require block allocation 2740 */ 2741 static int ext4_da_should_update_i_disksize(struct page *page, 2742 unsigned long offset) 2743 { 2744 struct buffer_head *bh; 2745 struct inode *inode = page->mapping->host; 2746 unsigned int idx; 2747 int i; 2748 2749 bh = page_buffers(page); 2750 idx = offset >> inode->i_blkbits; 2751 2752 for (i = 0; i < idx; i++) 2753 bh = bh->b_this_page; 2754 2755 if (!buffer_mapped(bh) || (buffer_delay(bh))) 2756 return 0; 2757 return 1; 2758 } 2759 2760 static int ext4_da_write_end(struct file *file, 2761 struct address_space *mapping, 2762 loff_t pos, unsigned len, unsigned copied, 2763 struct page *page, void *fsdata) 2764 { 2765 struct inode *inode = mapping->host; 2766 int ret = 0, ret2; 2767 handle_t *handle = ext4_journal_current_handle(); 2768 loff_t new_i_size; 2769 unsigned long start, end; 2770 int 
write_mode = (int)(unsigned long)fsdata; 2771 2772 if (write_mode == FALL_BACK_TO_NONDELALLOC) { 2773 if (ext4_should_order_data(inode)) { 2774 return ext4_ordered_write_end(file, mapping, pos, 2775 len, copied, page, fsdata); 2776 } else if (ext4_should_writeback_data(inode)) { 2777 return ext4_writeback_write_end(file, mapping, pos, 2778 len, copied, page, fsdata); 2779 } else { 2780 BUG(); 2781 } 2782 } 2783 2784 trace_mark(ext4_da_write_end, 2785 "dev %s ino %lu pos %llu len %u copied %u", 2786 inode->i_sb->s_id, inode->i_ino, 2787 (unsigned long long) pos, len, copied); 2788 start = pos & (PAGE_CACHE_SIZE - 1); 2789 end = start + copied - 1; 2790 2791 /* 2792 * generic_write_end() will run mark_inode_dirty() if i_size 2793 * changes. So let's piggyback the i_disksize mark_inode_dirty 2794 * into that. 2795 */ 2796 2797 new_i_size = pos + copied; 2798 if (new_i_size > EXT4_I(inode)->i_disksize) { 2799 if (ext4_da_should_update_i_disksize(page, end)) { 2800 down_write(&EXT4_I(inode)->i_data_sem); 2801 if (new_i_size > EXT4_I(inode)->i_disksize) { 2802 /* 2803 * Updating i_disksize when extending file 2804 * without needing block allocation 2805 */ 2806 if (ext4_should_order_data(inode)) 2807 ret = ext4_jbd2_file_inode(handle, 2808 inode); 2809 2810 EXT4_I(inode)->i_disksize = new_i_size; 2811 } 2812 up_write(&EXT4_I(inode)->i_data_sem); 2813 /* We need to mark inode dirty even if 2814 * new_i_size is less that inode->i_size 2815 * bu greater than i_disksize.(hint delalloc) 2816 */ 2817 ext4_mark_inode_dirty(handle, inode); 2818 } 2819 } 2820 ret2 = generic_write_end(file, mapping, pos, len, copied, 2821 page, fsdata); 2822 copied = ret2; 2823 if (ret2 < 0) 2824 ret = ret2; 2825 ret2 = ext4_journal_stop(handle); 2826 if (!ret) 2827 ret = ret2; 2828 2829 return ret ? 
ret : copied; 2830 } 2831 2832 static void ext4_da_invalidatepage(struct page *page, unsigned long offset) 2833 { 2834 /* 2835 * Drop reserved blocks 2836 */ 2837 BUG_ON(!PageLocked(page)); 2838 if (!page_has_buffers(page)) 2839 goto out; 2840 2841 ext4_da_page_release_reservation(page, offset); 2842 2843 out: 2844 ext4_invalidatepage(page, offset); 2845 2846 return; 2847 } 2848 2849 2850 /* 2851 * bmap() is special. It gets used by applications such as lilo and by 2852 * the swapper to find the on-disk block of a specific piece of data. 2853 * 2854 * Naturally, this is dangerous if the block concerned is still in the 2855 * journal. If somebody makes a swapfile on an ext4 data-journaling 2856 * filesystem and enables swap, then they may get a nasty shock when the 2857 * data getting swapped to that swapfile suddenly gets overwritten by 2858 * the original zero's written out previously to the journal and 2859 * awaiting writeback in the kernel's buffer cache. 2860 * 2861 * So, if we see any bmap calls here on a modified, data-journaled file, 2862 * take extra steps to flush any blocks which might be in the cache. 2863 */ 2864 static sector_t ext4_bmap(struct address_space *mapping, sector_t block) 2865 { 2866 struct inode *inode = mapping->host; 2867 journal_t *journal; 2868 int err; 2869 2870 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && 2871 test_opt(inode->i_sb, DELALLOC)) { 2872 /* 2873 * With delalloc we want to sync the file 2874 * so that we can make sure we allocate 2875 * blocks for file 2876 */ 2877 filemap_write_and_wait(mapping); 2878 } 2879 2880 if (EXT4_JOURNAL(inode) && EXT4_I(inode)->i_state & EXT4_STATE_JDATA) { 2881 /* 2882 * This is a REALLY heavyweight approach, but the use of 2883 * bmap on dirty files is expected to be extremely rare: 2884 * only if we run lilo or swapon on a freshly made file 2885 * do we expect this to happen. 
2886 * 2887 * (bmap requires CAP_SYS_RAWIO so this does not 2888 * represent an unprivileged user DOS attack --- we'd be 2889 * in trouble if mortal users could trigger this path at 2890 * will.) 2891 * 2892 * NB. EXT4_STATE_JDATA is not set on files other than 2893 * regular files. If somebody wants to bmap a directory 2894 * or symlink and gets confused because the buffer 2895 * hasn't yet been flushed to disk, they deserve 2896 * everything they get. 2897 */ 2898 2899 EXT4_I(inode)->i_state &= ~EXT4_STATE_JDATA; 2900 journal = EXT4_JOURNAL(inode); 2901 jbd2_journal_lock_updates(journal); 2902 err = jbd2_journal_flush(journal); 2903 jbd2_journal_unlock_updates(journal); 2904 2905 if (err) 2906 return 0; 2907 } 2908 2909 return generic_block_bmap(mapping, block, ext4_get_block); 2910 } 2911 2912 static int bget_one(handle_t *handle, struct buffer_head *bh) 2913 { 2914 get_bh(bh); 2915 return 0; 2916 } 2917 2918 static int bput_one(handle_t *handle, struct buffer_head *bh) 2919 { 2920 put_bh(bh); 2921 return 0; 2922 } 2923 2924 /* 2925 * Note that we don't need to start a transaction unless we're journaling data 2926 * because we should have holes filled from ext4_page_mkwrite(). We even don't 2927 * need to file the inode to the transaction's list in ordered mode because if 2928 * we are writing back data added by write(), the inode is already there and if 2929 * we are writing back data modified via mmap(), noone guarantees in which 2930 * transaction the data will hit the disk. In case we are journaling data, we 2931 * cannot start transaction directly because transaction start ranks above page 2932 * lock so we have to do some magic. 2933 * 2934 * In all journaling modes block_write_full_page() will start the I/O. 2935 * 2936 * Problem: 2937 * 2938 * ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() -> 2939 * ext4_writepage() 2940 * 2941 * Similar for: 2942 * 2943 * ext4_file_write() -> generic_file_write() -> __alloc_pages() -> ... 
 *
 * Same applies to ext4_get_block().  We will deadlock on various things like
 * lock_journal and i_data_sem
 *
 * Setting PF_MEMALLOC here doesn't work - too many internal memory
 * allocations fail.
 *
 * 16May01: If we're reentered then journal_current_handle() will be
 *	    non-zero. We simply *return*.
 *
 * 1 July 2001: @@@ FIXME:
 *   In journalled data mode, a data buffer may be metadata against the
 *   current transaction.  But the same file is part of a shared mapping
 *   and someone does a writepage() on it.
 *
 *   We will move the buffer onto the async_data list, but *after* it has
 *   been dirtied. So there's a small window where we have dirty data on
 *   BJ_Metadata.
 *
 *   Note that this only applies to the last partial page in the file.  The
 *   bit which block_write_full_page() uses prepare/commit for.  (That's
 *   broken code anyway: it's wrong for msync()).
 *
 *   It's a rare case: affects the final partial page, for journalled data
 *   where the file is subject to both write() and writepage() in the same
 *   transaction.  To fix it we'll need a custom block_write_full_page().
 *   We'll probably need that anyway for journalling writepage() output.
 *
 * We don't honour synchronous mounts for writepage().  That would be
 * disastrous.  Any write() or metadata operation will sync the fs for
 * us.
2975 * 2976 */ 2977 static int __ext4_normal_writepage(struct page *page, 2978 struct writeback_control *wbc) 2979 { 2980 struct inode *inode = page->mapping->host; 2981 2982 if (test_opt(inode->i_sb, NOBH)) 2983 return nobh_writepage(page, 2984 ext4_normal_get_block_write, wbc); 2985 else 2986 return block_write_full_page(page, 2987 ext4_normal_get_block_write, 2988 wbc); 2989 } 2990 2991 static int ext4_normal_writepage(struct page *page, 2992 struct writeback_control *wbc) 2993 { 2994 struct inode *inode = page->mapping->host; 2995 loff_t size = i_size_read(inode); 2996 loff_t len; 2997 2998 trace_mark(ext4_normal_writepage, 2999 "dev %s ino %lu page_index %lu", 3000 inode->i_sb->s_id, inode->i_ino, page->index); 3001 J_ASSERT(PageLocked(page)); 3002 if (page->index == size >> PAGE_CACHE_SHIFT) 3003 len = size & ~PAGE_CACHE_MASK; 3004 else 3005 len = PAGE_CACHE_SIZE; 3006 3007 if (page_has_buffers(page)) { 3008 /* if page has buffers it should all be mapped 3009 * and allocated. If there are not buffers attached 3010 * to the page we know the page is dirty but it lost 3011 * buffers. That means that at some moment in time 3012 * after write_begin() / write_end() has been called 3013 * all buffers have been clean and thus they must have been 3014 * written at least once. So they are all mapped and we can 3015 * happily proceed with mapping them and writing the page. 
3016 */ 3017 BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, 3018 ext4_bh_unmapped_or_delay)); 3019 } 3020 3021 if (!ext4_journal_current_handle()) 3022 return __ext4_normal_writepage(page, wbc); 3023 3024 redirty_page_for_writepage(wbc, page); 3025 unlock_page(page); 3026 return 0; 3027 } 3028 3029 static int __ext4_journalled_writepage(struct page *page, 3030 struct writeback_control *wbc) 3031 { 3032 struct address_space *mapping = page->mapping; 3033 struct inode *inode = mapping->host; 3034 struct buffer_head *page_bufs; 3035 handle_t *handle = NULL; 3036 int ret = 0; 3037 int err; 3038 3039 ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, 3040 ext4_normal_get_block_write); 3041 if (ret != 0) 3042 goto out_unlock; 3043 3044 page_bufs = page_buffers(page); 3045 walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL, 3046 bget_one); 3047 /* As soon as we unlock the page, it can go away, but we have 3048 * references to buffers so we are safe */ 3049 unlock_page(page); 3050 3051 handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); 3052 if (IS_ERR(handle)) { 3053 ret = PTR_ERR(handle); 3054 goto out; 3055 } 3056 3057 ret = walk_page_buffers(handle, page_bufs, 0, 3058 PAGE_CACHE_SIZE, NULL, do_journal_get_write_access); 3059 3060 err = walk_page_buffers(handle, page_bufs, 0, 3061 PAGE_CACHE_SIZE, NULL, write_end_fn); 3062 if (ret == 0) 3063 ret = err; 3064 err = ext4_journal_stop(handle); 3065 if (!ret) 3066 ret = err; 3067 3068 walk_page_buffers(handle, page_bufs, 0, 3069 PAGE_CACHE_SIZE, NULL, bput_one); 3070 EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; 3071 goto out; 3072 3073 out_unlock: 3074 unlock_page(page); 3075 out: 3076 return ret; 3077 } 3078 3079 static int ext4_journalled_writepage(struct page *page, 3080 struct writeback_control *wbc) 3081 { 3082 struct inode *inode = page->mapping->host; 3083 loff_t size = i_size_read(inode); 3084 loff_t len; 3085 3086 trace_mark(ext4_journalled_writepage, 3087 "dev %s ino 
%lu page_index %lu", 3088 inode->i_sb->s_id, inode->i_ino, page->index); 3089 J_ASSERT(PageLocked(page)); 3090 if (page->index == size >> PAGE_CACHE_SHIFT) 3091 len = size & ~PAGE_CACHE_MASK; 3092 else 3093 len = PAGE_CACHE_SIZE; 3094 3095 if (page_has_buffers(page)) { 3096 /* if page has buffers it should all be mapped 3097 * and allocated. If there are not buffers attached 3098 * to the page we know the page is dirty but it lost 3099 * buffers. That means that at some moment in time 3100 * after write_begin() / write_end() has been called 3101 * all buffers have been clean and thus they must have been 3102 * written at least once. So they are all mapped and we can 3103 * happily proceed with mapping them and writing the page. 3104 */ 3105 BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, 3106 ext4_bh_unmapped_or_delay)); 3107 } 3108 3109 if (ext4_journal_current_handle()) 3110 goto no_write; 3111 3112 if (PageChecked(page)) { 3113 /* 3114 * It's mmapped pagecache. Add buffers and journal it. There 3115 * doesn't seem much point in redirtying the page here. 3116 */ 3117 ClearPageChecked(page); 3118 return __ext4_journalled_writepage(page, wbc); 3119 } else { 3120 /* 3121 * It may be a page full of checkpoint-mode buffers. We don't 3122 * really know unless we go poke around in the buffer_heads. 3123 * But block_write_full_page will do the right thing. 
3124 */ 3125 return block_write_full_page(page, 3126 ext4_normal_get_block_write, 3127 wbc); 3128 } 3129 no_write: 3130 redirty_page_for_writepage(wbc, page); 3131 unlock_page(page); 3132 return 0; 3133 } 3134 3135 static int ext4_readpage(struct file *file, struct page *page) 3136 { 3137 return mpage_readpage(page, ext4_get_block); 3138 } 3139 3140 static int 3141 ext4_readpages(struct file *file, struct address_space *mapping, 3142 struct list_head *pages, unsigned nr_pages) 3143 { 3144 return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); 3145 } 3146 3147 static void ext4_invalidatepage(struct page *page, unsigned long offset) 3148 { 3149 journal_t *journal = EXT4_JOURNAL(page->mapping->host); 3150 3151 /* 3152 * If it's a full truncate we just forget about the pending dirtying 3153 */ 3154 if (offset == 0) 3155 ClearPageChecked(page); 3156 3157 if (journal) 3158 jbd2_journal_invalidatepage(journal, page, offset); 3159 else 3160 block_invalidatepage(page, offset); 3161 } 3162 3163 static int ext4_releasepage(struct page *page, gfp_t wait) 3164 { 3165 journal_t *journal = EXT4_JOURNAL(page->mapping->host); 3166 3167 WARN_ON(PageChecked(page)); 3168 if (!page_has_buffers(page)) 3169 return 0; 3170 if (journal) 3171 return jbd2_journal_try_to_free_buffers(journal, page, wait); 3172 else 3173 return try_to_free_buffers(page); 3174 } 3175 3176 /* 3177 * If the O_DIRECT write will extend the file then add this inode to the 3178 * orphan list. So recovery will truncate it back to the original size 3179 * if the machine crashes during the write. 3180 * 3181 * If the O_DIRECT write is intantiating holes inside i_size and the machine 3182 * crashes then stale disk data _may_ be exposed inside the file. But current 3183 * VFS code falls back into buffered path in that case so we are safe. 
3184 */ 3185 static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, 3186 const struct iovec *iov, loff_t offset, 3187 unsigned long nr_segs) 3188 { 3189 struct file *file = iocb->ki_filp; 3190 struct inode *inode = file->f_mapping->host; 3191 struct ext4_inode_info *ei = EXT4_I(inode); 3192 handle_t *handle; 3193 ssize_t ret; 3194 int orphan = 0; 3195 size_t count = iov_length(iov, nr_segs); 3196 3197 if (rw == WRITE) { 3198 loff_t final_size = offset + count; 3199 3200 if (final_size > inode->i_size) { 3201 /* Credits for sb + inode write */ 3202 handle = ext4_journal_start(inode, 2); 3203 if (IS_ERR(handle)) { 3204 ret = PTR_ERR(handle); 3205 goto out; 3206 } 3207 ret = ext4_orphan_add(handle, inode); 3208 if (ret) { 3209 ext4_journal_stop(handle); 3210 goto out; 3211 } 3212 orphan = 1; 3213 ei->i_disksize = inode->i_size; 3214 ext4_journal_stop(handle); 3215 } 3216 } 3217 3218 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 3219 offset, nr_segs, 3220 ext4_get_block, NULL); 3221 3222 if (orphan) { 3223 int err; 3224 3225 /* Credits for sb + inode write */ 3226 handle = ext4_journal_start(inode, 2); 3227 if (IS_ERR(handle)) { 3228 /* This is really bad luck. We've written the data 3229 * but cannot extend i_size. Bail out and pretend 3230 * the write failed... */ 3231 ret = PTR_ERR(handle); 3232 goto out; 3233 } 3234 if (inode->i_nlink) 3235 ext4_orphan_del(handle, inode); 3236 if (ret > 0) { 3237 loff_t end = offset + ret; 3238 if (end > inode->i_size) { 3239 ei->i_disksize = end; 3240 i_size_write(inode, end); 3241 /* 3242 * We're going to return a positive `ret' 3243 * here due to non-zero-length I/O, so there's 3244 * no way of reporting error returns from 3245 * ext4_mark_inode_dirty() to userspace. So 3246 * ignore it. 
3247 */ 3248 ext4_mark_inode_dirty(handle, inode); 3249 } 3250 } 3251 err = ext4_journal_stop(handle); 3252 if (ret == 0) 3253 ret = err; 3254 } 3255 out: 3256 return ret; 3257 } 3258 3259 /* 3260 * Pages can be marked dirty completely asynchronously from ext4's journalling 3261 * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do 3262 * much here because ->set_page_dirty is called under VFS locks. The page is 3263 * not necessarily locked. 3264 * 3265 * We cannot just dirty the page and leave attached buffers clean, because the 3266 * buffers' dirty state is "definitive". We cannot just set the buffers dirty 3267 * or jbddirty because all the journalling code will explode. 3268 * 3269 * So what we do is to mark the page "pending dirty" and next time writepage 3270 * is called, propagate that into the buffers appropriately. 3271 */ 3272 static int ext4_journalled_set_page_dirty(struct page *page) 3273 { 3274 SetPageChecked(page); 3275 return __set_page_dirty_nobuffers(page); 3276 } 3277 3278 static const struct address_space_operations ext4_ordered_aops = { 3279 .readpage = ext4_readpage, 3280 .readpages = ext4_readpages, 3281 .writepage = ext4_normal_writepage, 3282 .sync_page = block_sync_page, 3283 .write_begin = ext4_write_begin, 3284 .write_end = ext4_ordered_write_end, 3285 .bmap = ext4_bmap, 3286 .invalidatepage = ext4_invalidatepage, 3287 .releasepage = ext4_releasepage, 3288 .direct_IO = ext4_direct_IO, 3289 .migratepage = buffer_migrate_page, 3290 .is_partially_uptodate = block_is_partially_uptodate, 3291 }; 3292 3293 static const struct address_space_operations ext4_writeback_aops = { 3294 .readpage = ext4_readpage, 3295 .readpages = ext4_readpages, 3296 .writepage = ext4_normal_writepage, 3297 .sync_page = block_sync_page, 3298 .write_begin = ext4_write_begin, 3299 .write_end = ext4_writeback_write_end, 3300 .bmap = ext4_bmap, 3301 .invalidatepage = ext4_invalidatepage, 3302 .releasepage = ext4_releasepage, 3303 .direct_IO = 
ext4_direct_IO, 3304 .migratepage = buffer_migrate_page, 3305 .is_partially_uptodate = block_is_partially_uptodate, 3306 }; 3307 3308 static const struct address_space_operations ext4_journalled_aops = { 3309 .readpage = ext4_readpage, 3310 .readpages = ext4_readpages, 3311 .writepage = ext4_journalled_writepage, 3312 .sync_page = block_sync_page, 3313 .write_begin = ext4_write_begin, 3314 .write_end = ext4_journalled_write_end, 3315 .set_page_dirty = ext4_journalled_set_page_dirty, 3316 .bmap = ext4_bmap, 3317 .invalidatepage = ext4_invalidatepage, 3318 .releasepage = ext4_releasepage, 3319 .is_partially_uptodate = block_is_partially_uptodate, 3320 }; 3321 3322 static const struct address_space_operations ext4_da_aops = { 3323 .readpage = ext4_readpage, 3324 .readpages = ext4_readpages, 3325 .writepage = ext4_da_writepage, 3326 .writepages = ext4_da_writepages, 3327 .sync_page = block_sync_page, 3328 .write_begin = ext4_da_write_begin, 3329 .write_end = ext4_da_write_end, 3330 .bmap = ext4_bmap, 3331 .invalidatepage = ext4_da_invalidatepage, 3332 .releasepage = ext4_releasepage, 3333 .direct_IO = ext4_direct_IO, 3334 .migratepage = buffer_migrate_page, 3335 .is_partially_uptodate = block_is_partially_uptodate, 3336 }; 3337 3338 void ext4_set_aops(struct inode *inode) 3339 { 3340 if (ext4_should_order_data(inode) && 3341 test_opt(inode->i_sb, DELALLOC)) 3342 inode->i_mapping->a_ops = &ext4_da_aops; 3343 else if (ext4_should_order_data(inode)) 3344 inode->i_mapping->a_ops = &ext4_ordered_aops; 3345 else if (ext4_should_writeback_data(inode) && 3346 test_opt(inode->i_sb, DELALLOC)) 3347 inode->i_mapping->a_ops = &ext4_da_aops; 3348 else if (ext4_should_writeback_data(inode)) 3349 inode->i_mapping->a_ops = &ext4_writeback_aops; 3350 else 3351 inode->i_mapping->a_ops = &ext4_journalled_aops; 3352 } 3353 3354 /* 3355 * ext4_block_truncate_page() zeroes out a mapping from file offset `from' 3356 * up to the end of the block which corresponds to `from'. 
3357 * This required during truncate. We need to physically zero the tail end 3358 * of that block so it doesn't yield old data if the file is later grown. 3359 */ 3360 int ext4_block_truncate_page(handle_t *handle, 3361 struct address_space *mapping, loff_t from) 3362 { 3363 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; 3364 unsigned offset = from & (PAGE_CACHE_SIZE-1); 3365 unsigned blocksize, length, pos; 3366 ext4_lblk_t iblock; 3367 struct inode *inode = mapping->host; 3368 struct buffer_head *bh; 3369 struct page *page; 3370 int err = 0; 3371 3372 page = grab_cache_page(mapping, from >> PAGE_CACHE_SHIFT); 3373 if (!page) 3374 return -EINVAL; 3375 3376 blocksize = inode->i_sb->s_blocksize; 3377 length = blocksize - (offset & (blocksize - 1)); 3378 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); 3379 3380 /* 3381 * For "nobh" option, we can only work if we don't need to 3382 * read-in the page - otherwise we create buffers to do the IO. 3383 */ 3384 if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) && 3385 ext4_should_writeback_data(inode) && PageUptodate(page)) { 3386 zero_user(page, offset, length); 3387 set_page_dirty(page); 3388 goto unlock; 3389 } 3390 3391 if (!page_has_buffers(page)) 3392 create_empty_buffers(page, blocksize, 0); 3393 3394 /* Find the buffer that contains "offset" */ 3395 bh = page_buffers(page); 3396 pos = blocksize; 3397 while (offset >= pos) { 3398 bh = bh->b_this_page; 3399 iblock++; 3400 pos += blocksize; 3401 } 3402 3403 err = 0; 3404 if (buffer_freed(bh)) { 3405 BUFFER_TRACE(bh, "freed: skip"); 3406 goto unlock; 3407 } 3408 3409 if (!buffer_mapped(bh)) { 3410 BUFFER_TRACE(bh, "unmapped"); 3411 ext4_get_block(inode, iblock, bh, 0); 3412 /* unmapped? It's a hole - nothing to do */ 3413 if (!buffer_mapped(bh)) { 3414 BUFFER_TRACE(bh, "still unmapped"); 3415 goto unlock; 3416 } 3417 } 3418 3419 /* Ok, it's mapped. 
Make sure it's up-to-date */ 3420 if (PageUptodate(page)) 3421 set_buffer_uptodate(bh); 3422 3423 if (!buffer_uptodate(bh)) { 3424 err = -EIO; 3425 ll_rw_block(READ, 1, &bh); 3426 wait_on_buffer(bh); 3427 /* Uhhuh. Read error. Complain and punt. */ 3428 if (!buffer_uptodate(bh)) 3429 goto unlock; 3430 } 3431 3432 if (ext4_should_journal_data(inode)) { 3433 BUFFER_TRACE(bh, "get write access"); 3434 err = ext4_journal_get_write_access(handle, bh); 3435 if (err) 3436 goto unlock; 3437 } 3438 3439 zero_user(page, offset, length); 3440 3441 BUFFER_TRACE(bh, "zeroed end of block"); 3442 3443 err = 0; 3444 if (ext4_should_journal_data(inode)) { 3445 err = ext4_handle_dirty_metadata(handle, inode, bh); 3446 } else { 3447 if (ext4_should_order_data(inode)) 3448 err = ext4_jbd2_file_inode(handle, inode); 3449 mark_buffer_dirty(bh); 3450 } 3451 3452 unlock: 3453 unlock_page(page); 3454 page_cache_release(page); 3455 return err; 3456 } 3457 3458 /* 3459 * Probably it should be a library function... search for first non-zero word 3460 * or memcmp with zero_page, whatever is better for particular architecture. 3461 * Linus? 3462 */ 3463 static inline int all_zeroes(__le32 *p, __le32 *q) 3464 { 3465 while (p < q) 3466 if (*p++) 3467 return 0; 3468 return 1; 3469 } 3470 3471 /** 3472 * ext4_find_shared - find the indirect blocks for partial truncation. 3473 * @inode: inode in question 3474 * @depth: depth of the affected branch 3475 * @offsets: offsets of pointers in that branch (see ext4_block_to_path) 3476 * @chain: place to store the pointers to partial indirect blocks 3477 * @top: place to the (detached) top of branch 3478 * 3479 * This is a helper function used by ext4_truncate(). 3480 * 3481 * When we do truncate() we may have to clean the ends of several 3482 * indirect blocks but leave the blocks themselves alive. 
Block is 3483 * partially truncated if some data below the new i_size is refered 3484 * from it (and it is on the path to the first completely truncated 3485 * data block, indeed). We have to free the top of that path along 3486 * with everything to the right of the path. Since no allocation 3487 * past the truncation point is possible until ext4_truncate() 3488 * finishes, we may safely do the latter, but top of branch may 3489 * require special attention - pageout below the truncation point 3490 * might try to populate it. 3491 * 3492 * We atomically detach the top of branch from the tree, store the 3493 * block number of its root in *@top, pointers to buffer_heads of 3494 * partially truncated blocks - in @chain[].bh and pointers to 3495 * their last elements that should not be removed - in 3496 * @chain[].p. Return value is the pointer to last filled element 3497 * of @chain. 3498 * 3499 * The work left to caller to do the actual freeing of subtrees: 3500 * a) free the subtree starting from *@top 3501 * b) free the subtrees whose roots are stored in 3502 * (@chain[i].p+1 .. end of @chain[i].bh->b_data) 3503 * c) free the subtrees growing from the inode past the @chain[0]. 3504 * (no partially truncated stuff there). */ 3505 3506 static Indirect *ext4_find_shared(struct inode *inode, int depth, 3507 ext4_lblk_t offsets[4], Indirect chain[4], __le32 *top) 3508 { 3509 Indirect *partial, *p; 3510 int k, err; 3511 3512 *top = 0; 3513 /* Make k index the deepest non-null offest + 1 */ 3514 for (k = depth; k > 1 && !offsets[k-1]; k--) 3515 ; 3516 partial = ext4_get_branch(inode, k, offsets, chain, &err); 3517 /* Writer: pointers */ 3518 if (!partial) 3519 partial = chain + k-1; 3520 /* 3521 * If the branch acquired continuation since we've looked at it - 3522 * fine, it should all survive and (new) top doesn't belong to us. 
3523 */ 3524 if (!partial->key && *partial->p) 3525 /* Writer: end */ 3526 goto no_top; 3527 for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--) 3528 ; 3529 /* 3530 * OK, we've found the last block that must survive. The rest of our 3531 * branch should be detached before unlocking. However, if that rest 3532 * of branch is all ours and does not grow immediately from the inode 3533 * it's easier to cheat and just decrement partial->p. 3534 */ 3535 if (p == chain + k - 1 && p > chain) { 3536 p->p--; 3537 } else { 3538 *top = *p->p; 3539 /* Nope, don't do this in ext4. Must leave the tree intact */ 3540 #if 0 3541 *p->p = 0; 3542 #endif 3543 } 3544 /* Writer: end */ 3545 3546 while (partial > p) { 3547 brelse(partial->bh); 3548 partial--; 3549 } 3550 no_top: 3551 return partial; 3552 } 3553 3554 /* 3555 * Zero a number of block pointers in either an inode or an indirect block. 3556 * If we restart the transaction we must again get write access to the 3557 * indirect block for further modification. 3558 * 3559 * We release `count' blocks on disk, but (last - first) may be greater 3560 * than `count' because there can be holes in there. 3561 */ 3562 static void ext4_clear_blocks(handle_t *handle, struct inode *inode, 3563 struct buffer_head *bh, ext4_fsblk_t block_to_free, 3564 unsigned long count, __le32 *first, __le32 *last) 3565 { 3566 __le32 *p; 3567 if (try_to_extend_transaction(handle, inode)) { 3568 if (bh) { 3569 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 3570 ext4_handle_dirty_metadata(handle, inode, bh); 3571 } 3572 ext4_mark_inode_dirty(handle, inode); 3573 ext4_journal_test_restart(handle, inode); 3574 if (bh) { 3575 BUFFER_TRACE(bh, "retaking write access"); 3576 ext4_journal_get_write_access(handle, bh); 3577 } 3578 } 3579 3580 /* 3581 * Any buffers which are on the journal will be in memory. We find 3582 * them on the hash table so jbd2_journal_revoke() will run jbd2_journal_forget() 3583 * on them. 
We've already detached each block from the file, so 3584 * bforget() in jbd2_journal_forget() should be safe. 3585 * 3586 * AKPM: turn on bforget in jbd2_journal_forget()!!! 3587 */ 3588 for (p = first; p < last; p++) { 3589 u32 nr = le32_to_cpu(*p); 3590 if (nr) { 3591 struct buffer_head *tbh; 3592 3593 *p = 0; 3594 tbh = sb_find_get_block(inode->i_sb, nr); 3595 ext4_forget(handle, 0, inode, tbh, nr); 3596 } 3597 } 3598 3599 ext4_free_blocks(handle, inode, block_to_free, count, 0); 3600 } 3601 3602 /** 3603 * ext4_free_data - free a list of data blocks 3604 * @handle: handle for this transaction 3605 * @inode: inode we are dealing with 3606 * @this_bh: indirect buffer_head which contains *@first and *@last 3607 * @first: array of block numbers 3608 * @last: points immediately past the end of array 3609 * 3610 * We are freeing all blocks refered from that array (numbers are stored as 3611 * little-endian 32-bit) and updating @inode->i_blocks appropriately. 3612 * 3613 * We accumulate contiguous runs of blocks to free. Conveniently, if these 3614 * blocks are contiguous then releasing them at one time will only affect one 3615 * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't 3616 * actually use a lot of journal space. 3617 * 3618 * @this_bh will be %NULL if @first and @last point into the inode's direct 3619 * block pointers. 
3620 */ 3621 static void ext4_free_data(handle_t *handle, struct inode *inode, 3622 struct buffer_head *this_bh, 3623 __le32 *first, __le32 *last) 3624 { 3625 ext4_fsblk_t block_to_free = 0; /* Starting block # of a run */ 3626 unsigned long count = 0; /* Number of blocks in the run */ 3627 __le32 *block_to_free_p = NULL; /* Pointer into inode/ind 3628 corresponding to 3629 block_to_free */ 3630 ext4_fsblk_t nr; /* Current block # */ 3631 __le32 *p; /* Pointer into inode/ind 3632 for current block */ 3633 int err; 3634 3635 if (this_bh) { /* For indirect block */ 3636 BUFFER_TRACE(this_bh, "get_write_access"); 3637 err = ext4_journal_get_write_access(handle, this_bh); 3638 /* Important: if we can't update the indirect pointers 3639 * to the blocks, we can't free them. */ 3640 if (err) 3641 return; 3642 } 3643 3644 for (p = first; p < last; p++) { 3645 nr = le32_to_cpu(*p); 3646 if (nr) { 3647 /* accumulate blocks to free if they're contiguous */ 3648 if (count == 0) { 3649 block_to_free = nr; 3650 block_to_free_p = p; 3651 count = 1; 3652 } else if (nr == block_to_free + count) { 3653 count++; 3654 } else { 3655 ext4_clear_blocks(handle, inode, this_bh, 3656 block_to_free, 3657 count, block_to_free_p, p); 3658 block_to_free = nr; 3659 block_to_free_p = p; 3660 count = 1; 3661 } 3662 } 3663 } 3664 3665 if (count > 0) 3666 ext4_clear_blocks(handle, inode, this_bh, block_to_free, 3667 count, block_to_free_p, p); 3668 3669 if (this_bh) { 3670 BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata"); 3671 3672 /* 3673 * The buffer head should have an attached journal head at this 3674 * point. However, if the data is corrupted and an indirect 3675 * block pointed to itself, it would have been detached when 3676 * the block was cleared. Check for this instead of OOPSing. 
3677 */ 3678 if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh)) 3679 ext4_handle_dirty_metadata(handle, inode, this_bh); 3680 else 3681 ext4_error(inode->i_sb, __func__, 3682 "circular indirect block detected, " 3683 "inode=%lu, block=%llu", 3684 inode->i_ino, 3685 (unsigned long long) this_bh->b_blocknr); 3686 } 3687 } 3688 3689 /** 3690 * ext4_free_branches - free an array of branches 3691 * @handle: JBD handle for this transaction 3692 * @inode: inode we are dealing with 3693 * @parent_bh: the buffer_head which contains *@first and *@last 3694 * @first: array of block numbers 3695 * @last: pointer immediately past the end of array 3696 * @depth: depth of the branches to free 3697 * 3698 * We are freeing all blocks refered from these branches (numbers are 3699 * stored as little-endian 32-bit) and updating @inode->i_blocks 3700 * appropriately. 3701 */ 3702 static void ext4_free_branches(handle_t *handle, struct inode *inode, 3703 struct buffer_head *parent_bh, 3704 __le32 *first, __le32 *last, int depth) 3705 { 3706 ext4_fsblk_t nr; 3707 __le32 *p; 3708 3709 if (ext4_handle_is_aborted(handle)) 3710 return; 3711 3712 if (depth--) { 3713 struct buffer_head *bh; 3714 int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); 3715 p = last; 3716 while (--p >= first) { 3717 nr = le32_to_cpu(*p); 3718 if (!nr) 3719 continue; /* A hole */ 3720 3721 /* Go read the buffer for the next level down */ 3722 bh = sb_bread(inode->i_sb, nr); 3723 3724 /* 3725 * A read failure? Report error and clear slot 3726 * (should be rare). 3727 */ 3728 if (!bh) { 3729 ext4_error(inode->i_sb, "ext4_free_branches", 3730 "Read failure, inode=%lu, block=%llu", 3731 inode->i_ino, nr); 3732 continue; 3733 } 3734 3735 /* This zaps the entire block. Bottom up. 
*/ 3736 BUFFER_TRACE(bh, "free child branches"); 3737 ext4_free_branches(handle, inode, bh, 3738 (__le32 *) bh->b_data, 3739 (__le32 *) bh->b_data + addr_per_block, 3740 depth); 3741 3742 /* 3743 * We've probably journalled the indirect block several 3744 * times during the truncate. But it's no longer 3745 * needed and we now drop it from the transaction via 3746 * jbd2_journal_revoke(). 3747 * 3748 * That's easy if it's exclusively part of this 3749 * transaction. But if it's part of the committing 3750 * transaction then jbd2_journal_forget() will simply 3751 * brelse() it. That means that if the underlying 3752 * block is reallocated in ext4_get_block(), 3753 * unmap_underlying_metadata() will find this block 3754 * and will try to get rid of it. damn, damn. 3755 * 3756 * If this block has already been committed to the 3757 * journal, a revoke record will be written. And 3758 * revoke records must be emitted *before* clearing 3759 * this block's bit in the bitmaps. 3760 */ 3761 ext4_forget(handle, 1, inode, bh, bh->b_blocknr); 3762 3763 /* 3764 * Everything below this this pointer has been 3765 * released. Now let this top-of-subtree go. 3766 * 3767 * We want the freeing of this indirect block to be 3768 * atomic in the journal with the updating of the 3769 * bitmap block which owns it. So make some room in 3770 * the journal. 3771 * 3772 * We zero the parent pointer *after* freeing its 3773 * pointee in the bitmaps, so if extend_transaction() 3774 * for some reason fails to put the bitmap changes and 3775 * the release into the same transaction, recovery 3776 * will merely complain about releasing a free block, 3777 * rather than leaking blocks. 
3778 */ 3779 if (ext4_handle_is_aborted(handle)) 3780 return; 3781 if (try_to_extend_transaction(handle, inode)) { 3782 ext4_mark_inode_dirty(handle, inode); 3783 ext4_journal_test_restart(handle, inode); 3784 } 3785 3786 ext4_free_blocks(handle, inode, nr, 1, 1); 3787 3788 if (parent_bh) { 3789 /* 3790 * The block which we have just freed is 3791 * pointed to by an indirect block: journal it 3792 */ 3793 BUFFER_TRACE(parent_bh, "get_write_access"); 3794 if (!ext4_journal_get_write_access(handle, 3795 parent_bh)){ 3796 *p = 0; 3797 BUFFER_TRACE(parent_bh, 3798 "call ext4_handle_dirty_metadata"); 3799 ext4_handle_dirty_metadata(handle, 3800 inode, 3801 parent_bh); 3802 } 3803 } 3804 } 3805 } else { 3806 /* We have reached the bottom of the tree. */ 3807 BUFFER_TRACE(parent_bh, "free data blocks"); 3808 ext4_free_data(handle, inode, parent_bh, first, last); 3809 } 3810 } 3811 3812 int ext4_can_truncate(struct inode *inode) 3813 { 3814 if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) 3815 return 0; 3816 if (S_ISREG(inode->i_mode)) 3817 return 1; 3818 if (S_ISDIR(inode->i_mode)) 3819 return 1; 3820 if (S_ISLNK(inode->i_mode)) 3821 return !ext4_inode_is_fast_symlink(inode); 3822 return 0; 3823 } 3824 3825 /* 3826 * ext4_truncate() 3827 * 3828 * We block out ext4_get_block() block instantiations across the entire 3829 * transaction, and VFS/VM ensures that ext4_truncate() cannot run 3830 * simultaneously on behalf of the same inode. 3831 * 3832 * As we work through the truncate and commmit bits of it to the journal there 3833 * is one core, guiding principle: the file's tree must always be consistent on 3834 * disk. We must be able to restart the truncate after a crash. 3835 * 3836 * The file's tree may be transiently inconsistent in memory (although it 3837 * probably isn't), but whenever we close off and commit a journal transaction, 3838 * the contents of (the filesystem + the journal) must be consistent and 3839 * restartable. 
It's pretty simple, really: bottom up, right to left (although 3840 * left-to-right works OK too). 3841 * 3842 * Note that at recovery time, journal replay occurs *before* the restart of 3843 * truncate against the orphan inode list. 3844 * 3845 * The committed inode has the new, desired i_size (which is the same as 3846 * i_disksize in this case). After a crash, ext4_orphan_cleanup() will see 3847 * that this inode's truncate did not complete and it will again call 3848 * ext4_truncate() to have another go. So there will be instantiated blocks 3849 * to the right of the truncation point in a crashed ext4 filesystem. But 3850 * that's fine - as long as they are linked from the inode, the post-crash 3851 * ext4_truncate() run will find them and release them. 3852 */ 3853 void ext4_truncate(struct inode *inode) 3854 { 3855 handle_t *handle; 3856 struct ext4_inode_info *ei = EXT4_I(inode); 3857 __le32 *i_data = ei->i_data; 3858 int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); 3859 struct address_space *mapping = inode->i_mapping; 3860 ext4_lblk_t offsets[4]; 3861 Indirect chain[4]; 3862 Indirect *partial; 3863 __le32 nr = 0; 3864 int n; 3865 ext4_lblk_t last_block; 3866 unsigned blocksize = inode->i_sb->s_blocksize; 3867 3868 if (!ext4_can_truncate(inode)) 3869 return; 3870 3871 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { 3872 ext4_ext_truncate(inode); 3873 return; 3874 } 3875 3876 handle = start_transaction(inode); 3877 if (IS_ERR(handle)) 3878 return; /* AKPM: return what? */ 3879 3880 last_block = (inode->i_size + blocksize-1) 3881 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); 3882 3883 if (inode->i_size & (blocksize - 1)) 3884 if (ext4_block_truncate_page(handle, mapping, inode->i_size)) 3885 goto out_stop; 3886 3887 n = ext4_block_to_path(inode, last_block, offsets, NULL); 3888 if (n == 0) 3889 goto out_stop; /* error */ 3890 3891 /* 3892 * OK. This truncate is going to happen. 
We add the inode to the 3893 * orphan list, so that if this truncate spans multiple transactions, 3894 * and we crash, we will resume the truncate when the filesystem 3895 * recovers. It also marks the inode dirty, to catch the new size. 3896 * 3897 * Implication: the file must always be in a sane, consistent 3898 * truncatable state while each transaction commits. 3899 */ 3900 if (ext4_orphan_add(handle, inode)) 3901 goto out_stop; 3902 3903 /* 3904 * From here we block out all ext4_get_block() callers who want to 3905 * modify the block allocation tree. 3906 */ 3907 down_write(&ei->i_data_sem); 3908 3909 ext4_discard_preallocations(inode); 3910 3911 /* 3912 * The orphan list entry will now protect us from any crash which 3913 * occurs before the truncate completes, so it is now safe to propagate 3914 * the new, shorter inode size (held for now in i_size) into the 3915 * on-disk inode. We do this via i_disksize, which is the value which 3916 * ext4 *really* writes onto the disk inode. 3917 */ 3918 ei->i_disksize = inode->i_size; 3919 3920 if (n == 1) { /* direct blocks */ 3921 ext4_free_data(handle, inode, NULL, i_data+offsets[0], 3922 i_data + EXT4_NDIR_BLOCKS); 3923 goto do_indirects; 3924 } 3925 3926 partial = ext4_find_shared(inode, n, offsets, chain, &nr); 3927 /* Kill the top of shared branch (not detached) */ 3928 if (nr) { 3929 if (partial == chain) { 3930 /* Shared branch grows from the inode */ 3931 ext4_free_branches(handle, inode, NULL, 3932 &nr, &nr+1, (chain+n-1) - partial); 3933 *partial->p = 0; 3934 /* 3935 * We mark the inode dirty prior to restart, 3936 * and prior to stop. No need for it here. 
3937 */ 3938 } else { 3939 /* Shared branch grows from an indirect block */ 3940 BUFFER_TRACE(partial->bh, "get_write_access"); 3941 ext4_free_branches(handle, inode, partial->bh, 3942 partial->p, 3943 partial->p+1, (chain+n-1) - partial); 3944 } 3945 } 3946 /* Clear the ends of indirect blocks on the shared branch */ 3947 while (partial > chain) { 3948 ext4_free_branches(handle, inode, partial->bh, partial->p + 1, 3949 (__le32*)partial->bh->b_data+addr_per_block, 3950 (chain+n-1) - partial); 3951 BUFFER_TRACE(partial->bh, "call brelse"); 3952 brelse (partial->bh); 3953 partial--; 3954 } 3955 do_indirects: 3956 /* Kill the remaining (whole) subtrees */ 3957 switch (offsets[0]) { 3958 default: 3959 nr = i_data[EXT4_IND_BLOCK]; 3960 if (nr) { 3961 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1); 3962 i_data[EXT4_IND_BLOCK] = 0; 3963 } 3964 case EXT4_IND_BLOCK: 3965 nr = i_data[EXT4_DIND_BLOCK]; 3966 if (nr) { 3967 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2); 3968 i_data[EXT4_DIND_BLOCK] = 0; 3969 } 3970 case EXT4_DIND_BLOCK: 3971 nr = i_data[EXT4_TIND_BLOCK]; 3972 if (nr) { 3973 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3); 3974 i_data[EXT4_TIND_BLOCK] = 0; 3975 } 3976 case EXT4_TIND_BLOCK: 3977 ; 3978 } 3979 3980 up_write(&ei->i_data_sem); 3981 inode->i_mtime = inode->i_ctime = ext4_current_time(inode); 3982 ext4_mark_inode_dirty(handle, inode); 3983 3984 /* 3985 * In a multi-transaction truncate, we only make the final transaction 3986 * synchronous 3987 */ 3988 if (IS_SYNC(inode)) 3989 ext4_handle_sync(handle); 3990 out_stop: 3991 /* 3992 * If this was a simple ftruncate(), and the file will remain alive 3993 * then we need to clear up the orphan record which we created above. 3994 * However, if this was a real unlink then we were called by 3995 * ext4_delete_inode(), and we allow that function to clean up the 3996 * orphan info for us. 
3997 */ 3998 if (inode->i_nlink) 3999 ext4_orphan_del(handle, inode); 4000 4001 ext4_journal_stop(handle); 4002 } 4003 4004 /* 4005 * ext4_get_inode_loc returns with an extra refcount against the inode's 4006 * underlying buffer_head on success. If 'in_mem' is true, we have all 4007 * data in memory that is needed to recreate the on-disk version of this 4008 * inode. 4009 */ 4010 static int __ext4_get_inode_loc(struct inode *inode, 4011 struct ext4_iloc *iloc, int in_mem) 4012 { 4013 struct ext4_group_desc *gdp; 4014 struct buffer_head *bh; 4015 struct super_block *sb = inode->i_sb; 4016 ext4_fsblk_t block; 4017 int inodes_per_block, inode_offset; 4018 4019 iloc->bh = NULL; 4020 if (!ext4_valid_inum(sb, inode->i_ino)) 4021 return -EIO; 4022 4023 iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb); 4024 gdp = ext4_get_group_desc(sb, iloc->block_group, NULL); 4025 if (!gdp) 4026 return -EIO; 4027 4028 /* 4029 * Figure out the offset within the block group inode table 4030 */ 4031 inodes_per_block = (EXT4_BLOCK_SIZE(sb) / EXT4_INODE_SIZE(sb)); 4032 inode_offset = ((inode->i_ino - 1) % 4033 EXT4_INODES_PER_GROUP(sb)); 4034 block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block); 4035 iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb); 4036 4037 bh = sb_getblk(sb, block); 4038 if (!bh) { 4039 ext4_error(sb, "ext4_get_inode_loc", "unable to read " 4040 "inode block - inode=%lu, block=%llu", 4041 inode->i_ino, block); 4042 return -EIO; 4043 } 4044 if (!buffer_uptodate(bh)) { 4045 lock_buffer(bh); 4046 4047 /* 4048 * If the buffer has the write error flag, we have failed 4049 * to write out another inode in the same block. In this 4050 * case, we don't have to read the block because we may 4051 * read the old inode data successfully. 
4052 */ 4053 if (buffer_write_io_error(bh) && !buffer_uptodate(bh)) 4054 set_buffer_uptodate(bh); 4055 4056 if (buffer_uptodate(bh)) { 4057 /* someone brought it uptodate while we waited */ 4058 unlock_buffer(bh); 4059 goto has_buffer; 4060 } 4061 4062 /* 4063 * If we have all information of the inode in memory and this 4064 * is the only valid inode in the block, we need not read the 4065 * block. 4066 */ 4067 if (in_mem) { 4068 struct buffer_head *bitmap_bh; 4069 int i, start; 4070 4071 start = inode_offset & ~(inodes_per_block - 1); 4072 4073 /* Is the inode bitmap in cache? */ 4074 bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp)); 4075 if (!bitmap_bh) 4076 goto make_io; 4077 4078 /* 4079 * If the inode bitmap isn't in cache then the 4080 * optimisation may end up performing two reads instead 4081 * of one, so skip it. 4082 */ 4083 if (!buffer_uptodate(bitmap_bh)) { 4084 brelse(bitmap_bh); 4085 goto make_io; 4086 } 4087 for (i = start; i < start + inodes_per_block; i++) { 4088 if (i == inode_offset) 4089 continue; 4090 if (ext4_test_bit(i, bitmap_bh->b_data)) 4091 break; 4092 } 4093 brelse(bitmap_bh); 4094 if (i == start + inodes_per_block) { 4095 /* all other inodes are free, so skip I/O */ 4096 memset(bh->b_data, 0, bh->b_size); 4097 set_buffer_uptodate(bh); 4098 unlock_buffer(bh); 4099 goto has_buffer; 4100 } 4101 } 4102 4103 make_io: 4104 /* 4105 * If we need to do any I/O, try to pre-readahead extra 4106 * blocks from the inode table. 
4107 */ 4108 if (EXT4_SB(sb)->s_inode_readahead_blks) { 4109 ext4_fsblk_t b, end, table; 4110 unsigned num; 4111 4112 table = ext4_inode_table(sb, gdp); 4113 /* Make sure s_inode_readahead_blks is a power of 2 */ 4114 while (EXT4_SB(sb)->s_inode_readahead_blks & 4115 (EXT4_SB(sb)->s_inode_readahead_blks-1)) 4116 EXT4_SB(sb)->s_inode_readahead_blks = 4117 (EXT4_SB(sb)->s_inode_readahead_blks & 4118 (EXT4_SB(sb)->s_inode_readahead_blks-1)); 4119 b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1); 4120 if (table > b) 4121 b = table; 4122 end = b + EXT4_SB(sb)->s_inode_readahead_blks; 4123 num = EXT4_INODES_PER_GROUP(sb); 4124 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, 4125 EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) 4126 num -= ext4_itable_unused_count(sb, gdp); 4127 table += num / inodes_per_block; 4128 if (end > table) 4129 end = table; 4130 while (b <= end) 4131 sb_breadahead(sb, b++); 4132 } 4133 4134 /* 4135 * There are other valid inodes in the buffer, this inode 4136 * has in-inode xattrs, or we don't have this inode in memory. 4137 * Read the block from disk. 4138 */ 4139 get_bh(bh); 4140 bh->b_end_io = end_buffer_read_sync; 4141 submit_bh(READ_META, bh); 4142 wait_on_buffer(bh); 4143 if (!buffer_uptodate(bh)) { 4144 ext4_error(sb, __func__, 4145 "unable to read inode block - inode=%lu, " 4146 "block=%llu", inode->i_ino, block); 4147 brelse(bh); 4148 return -EIO; 4149 } 4150 } 4151 has_buffer: 4152 iloc->bh = bh; 4153 return 0; 4154 } 4155 4156 int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc) 4157 { 4158 /* We have all inode data except xattrs in memory here. 
*/ 4159 return __ext4_get_inode_loc(inode, iloc, 4160 !(EXT4_I(inode)->i_state & EXT4_STATE_XATTR)); 4161 } 4162 4163 void ext4_set_inode_flags(struct inode *inode) 4164 { 4165 unsigned int flags = EXT4_I(inode)->i_flags; 4166 4167 inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); 4168 if (flags & EXT4_SYNC_FL) 4169 inode->i_flags |= S_SYNC; 4170 if (flags & EXT4_APPEND_FL) 4171 inode->i_flags |= S_APPEND; 4172 if (flags & EXT4_IMMUTABLE_FL) 4173 inode->i_flags |= S_IMMUTABLE; 4174 if (flags & EXT4_NOATIME_FL) 4175 inode->i_flags |= S_NOATIME; 4176 if (flags & EXT4_DIRSYNC_FL) 4177 inode->i_flags |= S_DIRSYNC; 4178 } 4179 4180 /* Propagate flags from i_flags to EXT4_I(inode)->i_flags */ 4181 void ext4_get_inode_flags(struct ext4_inode_info *ei) 4182 { 4183 unsigned int flags = ei->vfs_inode.i_flags; 4184 4185 ei->i_flags &= ~(EXT4_SYNC_FL|EXT4_APPEND_FL| 4186 EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL|EXT4_DIRSYNC_FL); 4187 if (flags & S_SYNC) 4188 ei->i_flags |= EXT4_SYNC_FL; 4189 if (flags & S_APPEND) 4190 ei->i_flags |= EXT4_APPEND_FL; 4191 if (flags & S_IMMUTABLE) 4192 ei->i_flags |= EXT4_IMMUTABLE_FL; 4193 if (flags & S_NOATIME) 4194 ei->i_flags |= EXT4_NOATIME_FL; 4195 if (flags & S_DIRSYNC) 4196 ei->i_flags |= EXT4_DIRSYNC_FL; 4197 } 4198 static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, 4199 struct ext4_inode_info *ei) 4200 { 4201 blkcnt_t i_blocks ; 4202 struct inode *inode = &(ei->vfs_inode); 4203 struct super_block *sb = inode->i_sb; 4204 4205 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, 4206 EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) { 4207 /* we are using combined 48 bit field */ 4208 i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 | 4209 le32_to_cpu(raw_inode->i_blocks_lo); 4210 if (ei->i_flags & EXT4_HUGE_FILE_FL) { 4211 /* i_blocks represent file system block size */ 4212 return i_blocks << (inode->i_blkbits - 9); 4213 } else { 4214 return i_blocks; 4215 } 4216 } else { 4217 return le32_to_cpu(raw_inode->i_blocks_lo); 4218 } 
4219 } 4220 4221 struct inode *ext4_iget(struct super_block *sb, unsigned long ino) 4222 { 4223 struct ext4_iloc iloc; 4224 struct ext4_inode *raw_inode; 4225 struct ext4_inode_info *ei; 4226 struct buffer_head *bh; 4227 struct inode *inode; 4228 long ret; 4229 int block; 4230 4231 inode = iget_locked(sb, ino); 4232 if (!inode) 4233 return ERR_PTR(-ENOMEM); 4234 if (!(inode->i_state & I_NEW)) 4235 return inode; 4236 4237 ei = EXT4_I(inode); 4238 #ifdef CONFIG_EXT4_FS_POSIX_ACL 4239 ei->i_acl = EXT4_ACL_NOT_CACHED; 4240 ei->i_default_acl = EXT4_ACL_NOT_CACHED; 4241 #endif 4242 4243 ret = __ext4_get_inode_loc(inode, &iloc, 0); 4244 if (ret < 0) 4245 goto bad_inode; 4246 bh = iloc.bh; 4247 raw_inode = ext4_raw_inode(&iloc); 4248 inode->i_mode = le16_to_cpu(raw_inode->i_mode); 4249 inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); 4250 inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); 4251 if (!(test_opt(inode->i_sb, NO_UID32))) { 4252 inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; 4253 inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; 4254 } 4255 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); 4256 4257 ei->i_state = 0; 4258 ei->i_dir_start_lookup = 0; 4259 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); 4260 /* We now have enough fields to check if the inode was active or not. 4261 * This is needed because nfsd might try to access dead inodes 4262 * the test is that same one that e2fsck uses 4263 * NeilBrown 1999oct15 4264 */ 4265 if (inode->i_nlink == 0) { 4266 if (inode->i_mode == 0 || 4267 !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) { 4268 /* this inode is deleted */ 4269 brelse(bh); 4270 ret = -ESTALE; 4271 goto bad_inode; 4272 } 4273 /* The only unlinked inodes we let through here have 4274 * valid i_mode and are being read by the orphan 4275 * recovery code: that's fine, we're about to complete 4276 * the process of deleting those. 
*/ 4277 } 4278 ei->i_flags = le32_to_cpu(raw_inode->i_flags); 4279 inode->i_blocks = ext4_inode_blocks(raw_inode, ei); 4280 ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo); 4281 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != 4282 cpu_to_le32(EXT4_OS_HURD)) { 4283 ei->i_file_acl |= 4284 ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; 4285 } 4286 inode->i_size = ext4_isize(raw_inode); 4287 ei->i_disksize = inode->i_size; 4288 inode->i_generation = le32_to_cpu(raw_inode->i_generation); 4289 ei->i_block_group = iloc.block_group; 4290 /* 4291 * NOTE! The in-memory inode i_data array is in little-endian order 4292 * even on big-endian machines: we do NOT byteswap the block numbers! 4293 */ 4294 for (block = 0; block < EXT4_N_BLOCKS; block++) 4295 ei->i_data[block] = raw_inode->i_block[block]; 4296 INIT_LIST_HEAD(&ei->i_orphan); 4297 4298 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { 4299 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); 4300 if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > 4301 EXT4_INODE_SIZE(inode->i_sb)) { 4302 brelse(bh); 4303 ret = -EIO; 4304 goto bad_inode; 4305 } 4306 if (ei->i_extra_isize == 0) { 4307 /* The extra space is currently unused. Use it. 
*/ 4308 ei->i_extra_isize = sizeof(struct ext4_inode) - 4309 EXT4_GOOD_OLD_INODE_SIZE; 4310 } else { 4311 __le32 *magic = (void *)raw_inode + 4312 EXT4_GOOD_OLD_INODE_SIZE + 4313 ei->i_extra_isize; 4314 if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) 4315 ei->i_state |= EXT4_STATE_XATTR; 4316 } 4317 } else 4318 ei->i_extra_isize = 0; 4319 4320 EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode); 4321 EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode); 4322 EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode); 4323 EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode); 4324 4325 inode->i_version = le32_to_cpu(raw_inode->i_disk_version); 4326 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { 4327 if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) 4328 inode->i_version |= 4329 (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; 4330 } 4331 4332 if (S_ISREG(inode->i_mode)) { 4333 inode->i_op = &ext4_file_inode_operations; 4334 inode->i_fop = &ext4_file_operations; 4335 ext4_set_aops(inode); 4336 } else if (S_ISDIR(inode->i_mode)) { 4337 inode->i_op = &ext4_dir_inode_operations; 4338 inode->i_fop = &ext4_dir_operations; 4339 } else if (S_ISLNK(inode->i_mode)) { 4340 if (ext4_inode_is_fast_symlink(inode)) { 4341 inode->i_op = &ext4_fast_symlink_inode_operations; 4342 nd_terminate_link(ei->i_data, inode->i_size, 4343 sizeof(ei->i_data) - 1); 4344 } else { 4345 inode->i_op = &ext4_symlink_inode_operations; 4346 ext4_set_aops(inode); 4347 } 4348 } else { 4349 inode->i_op = &ext4_special_inode_operations; 4350 if (raw_inode->i_block[0]) 4351 init_special_inode(inode, inode->i_mode, 4352 old_decode_dev(le32_to_cpu(raw_inode->i_block[0]))); 4353 else 4354 init_special_inode(inode, inode->i_mode, 4355 new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); 4356 } 4357 brelse(iloc.bh); 4358 ext4_set_inode_flags(inode); 4359 unlock_new_inode(inode); 4360 return inode; 4361 4362 bad_inode: 4363 iget_failed(inode); 4364 return ERR_PTR(ret); 4365 } 4366 4367 static int 
ext4_inode_blocks_set(handle_t *handle, 4368 struct ext4_inode *raw_inode, 4369 struct ext4_inode_info *ei) 4370 { 4371 struct inode *inode = &(ei->vfs_inode); 4372 u64 i_blocks = inode->i_blocks; 4373 struct super_block *sb = inode->i_sb; 4374 4375 if (i_blocks <= ~0U) { 4376 /* 4377 * i_blocks can be represnted in a 32 bit variable 4378 * as multiple of 512 bytes 4379 */ 4380 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); 4381 raw_inode->i_blocks_high = 0; 4382 ei->i_flags &= ~EXT4_HUGE_FILE_FL; 4383 return 0; 4384 } 4385 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) 4386 return -EFBIG; 4387 4388 if (i_blocks <= 0xffffffffffffULL) { 4389 /* 4390 * i_blocks can be represented in a 48 bit variable 4391 * as multiple of 512 bytes 4392 */ 4393 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); 4394 raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); 4395 ei->i_flags &= ~EXT4_HUGE_FILE_FL; 4396 } else { 4397 ei->i_flags |= EXT4_HUGE_FILE_FL; 4398 /* i_block is stored in file system block size */ 4399 i_blocks = i_blocks >> (inode->i_blkbits - 9); 4400 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); 4401 raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); 4402 } 4403 return 0; 4404 } 4405 4406 /* 4407 * Post the struct inode info into an on-disk inode location in the 4408 * buffer-cache. This gobbles the caller's reference to the 4409 * buffer_head in the inode location struct. 4410 * 4411 * The caller must have write access to iloc->bh. 4412 */ 4413 static int ext4_do_update_inode(handle_t *handle, 4414 struct inode *inode, 4415 struct ext4_iloc *iloc) 4416 { 4417 struct ext4_inode *raw_inode = ext4_raw_inode(iloc); 4418 struct ext4_inode_info *ei = EXT4_I(inode); 4419 struct buffer_head *bh = iloc->bh; 4420 int err = 0, rc, block; 4421 4422 /* For fields not not tracking in the in-memory inode, 4423 * initialise them to zero for new inodes. 
*/ 4424 if (ei->i_state & EXT4_STATE_NEW) 4425 memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); 4426 4427 ext4_get_inode_flags(ei); 4428 raw_inode->i_mode = cpu_to_le16(inode->i_mode); 4429 if (!(test_opt(inode->i_sb, NO_UID32))) { 4430 raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid)); 4431 raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid)); 4432 /* 4433 * Fix up interoperability with old kernels. Otherwise, old inodes get 4434 * re-used with the upper 16 bits of the uid/gid intact 4435 */ 4436 if (!ei->i_dtime) { 4437 raw_inode->i_uid_high = 4438 cpu_to_le16(high_16_bits(inode->i_uid)); 4439 raw_inode->i_gid_high = 4440 cpu_to_le16(high_16_bits(inode->i_gid)); 4441 } else { 4442 raw_inode->i_uid_high = 0; 4443 raw_inode->i_gid_high = 0; 4444 } 4445 } else { 4446 raw_inode->i_uid_low = 4447 cpu_to_le16(fs_high2lowuid(inode->i_uid)); 4448 raw_inode->i_gid_low = 4449 cpu_to_le16(fs_high2lowgid(inode->i_gid)); 4450 raw_inode->i_uid_high = 0; 4451 raw_inode->i_gid_high = 0; 4452 } 4453 raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); 4454 4455 EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode); 4456 EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode); 4457 EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode); 4458 EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode); 4459 4460 if (ext4_inode_blocks_set(handle, raw_inode, ei)) 4461 goto out_brelse; 4462 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); 4463 /* clear the migrate flag in the raw_inode */ 4464 raw_inode->i_flags = cpu_to_le32(ei->i_flags & ~EXT4_EXT_MIGRATE); 4465 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != 4466 cpu_to_le32(EXT4_OS_HURD)) 4467 raw_inode->i_file_acl_high = 4468 cpu_to_le16(ei->i_file_acl >> 32); 4469 raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); 4470 ext4_isize_set(raw_inode, ei->i_disksize); 4471 if (ei->i_disksize > 0x7fffffffULL) { 4472 struct super_block *sb = inode->i_sb; 4473 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, 4474 EXT4_FEATURE_RO_COMPAT_LARGE_FILE) 
|| 4475 EXT4_SB(sb)->s_es->s_rev_level == 4476 cpu_to_le32(EXT4_GOOD_OLD_REV)) { 4477 /* If this is the first large file 4478 * created, add a flag to the superblock. 4479 */ 4480 err = ext4_journal_get_write_access(handle, 4481 EXT4_SB(sb)->s_sbh); 4482 if (err) 4483 goto out_brelse; 4484 ext4_update_dynamic_rev(sb); 4485 EXT4_SET_RO_COMPAT_FEATURE(sb, 4486 EXT4_FEATURE_RO_COMPAT_LARGE_FILE); 4487 sb->s_dirt = 1; 4488 ext4_handle_sync(handle); 4489 err = ext4_handle_dirty_metadata(handle, inode, 4490 EXT4_SB(sb)->s_sbh); 4491 } 4492 } 4493 raw_inode->i_generation = cpu_to_le32(inode->i_generation); 4494 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { 4495 if (old_valid_dev(inode->i_rdev)) { 4496 raw_inode->i_block[0] = 4497 cpu_to_le32(old_encode_dev(inode->i_rdev)); 4498 raw_inode->i_block[1] = 0; 4499 } else { 4500 raw_inode->i_block[0] = 0; 4501 raw_inode->i_block[1] = 4502 cpu_to_le32(new_encode_dev(inode->i_rdev)); 4503 raw_inode->i_block[2] = 0; 4504 } 4505 } else for (block = 0; block < EXT4_N_BLOCKS; block++) 4506 raw_inode->i_block[block] = ei->i_data[block]; 4507 4508 raw_inode->i_disk_version = cpu_to_le32(inode->i_version); 4509 if (ei->i_extra_isize) { 4510 if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) 4511 raw_inode->i_version_hi = 4512 cpu_to_le32(inode->i_version >> 32); 4513 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); 4514 } 4515 4516 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 4517 rc = ext4_handle_dirty_metadata(handle, inode, bh); 4518 if (!err) 4519 err = rc; 4520 ei->i_state &= ~EXT4_STATE_NEW; 4521 4522 out_brelse: 4523 brelse(bh); 4524 ext4_std_error(inode->i_sb, err); 4525 return err; 4526 } 4527 4528 /* 4529 * ext4_write_inode() 4530 * 4531 * We are called from a few places: 4532 * 4533 * - Within generic_file_write() for O_SYNC files. 4534 * Here, there will be no transaction running. We wait for any running 4535 * trasnaction to commit. 4536 * 4537 * - Within sys_sync(), kupdate and such. 
4538 * We wait on commit, if tol to. 4539 * 4540 * - Within prune_icache() (PF_MEMALLOC == true) 4541 * Here we simply return. We can't afford to block kswapd on the 4542 * journal commit. 4543 * 4544 * In all cases it is actually safe for us to return without doing anything, 4545 * because the inode has been copied into a raw inode buffer in 4546 * ext4_mark_inode_dirty(). This is a correctness thing for O_SYNC and for 4547 * knfsd. 4548 * 4549 * Note that we are absolutely dependent upon all inode dirtiers doing the 4550 * right thing: they *must* call mark_inode_dirty() after dirtying info in 4551 * which we are interested. 4552 * 4553 * It would be a bug for them to not do this. The code: 4554 * 4555 * mark_inode_dirty(inode) 4556 * stuff(); 4557 * inode->i_size = expr; 4558 * 4559 * is in error because a kswapd-driven write_inode() could occur while 4560 * `stuff()' is running, and the new i_size will be lost. Plus the inode 4561 * will no longer be on the superblock's dirty inode list. 4562 */ 4563 int ext4_write_inode(struct inode *inode, int wait) 4564 { 4565 if (current->flags & PF_MEMALLOC) 4566 return 0; 4567 4568 if (ext4_journal_current_handle()) { 4569 jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n"); 4570 dump_stack(); 4571 return -EIO; 4572 } 4573 4574 if (!wait) 4575 return 0; 4576 4577 return ext4_force_commit(inode->i_sb); 4578 } 4579 4580 int __ext4_write_dirty_metadata(struct inode *inode, struct buffer_head *bh) 4581 { 4582 int err = 0; 4583 4584 mark_buffer_dirty(bh); 4585 if (inode && inode_needs_sync(inode)) { 4586 sync_dirty_buffer(bh); 4587 if (buffer_req(bh) && !buffer_uptodate(bh)) { 4588 ext4_error(inode->i_sb, __func__, 4589 "IO error syncing inode, " 4590 "inode=%lu, block=%llu", 4591 inode->i_ino, 4592 (unsigned long long)bh->b_blocknr); 4593 err = -EIO; 4594 } 4595 } 4596 return err; 4597 } 4598 4599 /* 4600 * ext4_setattr() 4601 * 4602 * Called from notify_change. 
4603 * 4604 * We want to trap VFS attempts to truncate the file as soon as 4605 * possible. In particular, we want to make sure that when the VFS 4606 * shrinks i_size, we put the inode on the orphan list and modify 4607 * i_disksize immediately, so that during the subsequent flushing of 4608 * dirty pages and freeing of disk blocks, we can guarantee that any 4609 * commit will leave the blocks being flushed in an unused state on 4610 * disk. (On recovery, the inode will get truncated and the blocks will 4611 * be freed, so we have a strong guarantee that no future commit will 4612 * leave these blocks visible to the user.) 4613 * 4614 * Another thing we have to assure is that if we are in ordered mode 4615 * and inode is still attached to the committing transaction, we must 4616 * we start writeout of all the dirty pages which are being truncated. 4617 * This way we are sure that all the data written in the previous 4618 * transaction are already on disk (truncate waits for pages under 4619 * writeback). 4620 * 4621 * Called with inode->i_mutex down. 4622 */ 4623 int ext4_setattr(struct dentry *dentry, struct iattr *attr) 4624 { 4625 struct inode *inode = dentry->d_inode; 4626 int error, rc = 0; 4627 const unsigned int ia_valid = attr->ia_valid; 4628 4629 error = inode_change_ok(inode, attr); 4630 if (error) 4631 return error; 4632 4633 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || 4634 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { 4635 handle_t *handle; 4636 4637 /* (user+group)*(old+new) structure, inode write (sb, 4638 * inode block, ? - but truncate inode update has it) */ 4639 handle = ext4_journal_start(inode, 2*(EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)+ 4640 EXT4_QUOTA_DEL_BLOCKS(inode->i_sb))+3); 4641 if (IS_ERR(handle)) { 4642 error = PTR_ERR(handle); 4643 goto err_out; 4644 } 4645 error = vfs_dq_transfer(inode, attr) ? 
-EDQUOT : 0; 4646 if (error) { 4647 ext4_journal_stop(handle); 4648 return error; 4649 } 4650 /* Update corresponding info in inode so that everything is in 4651 * one transaction */ 4652 if (attr->ia_valid & ATTR_UID) 4653 inode->i_uid = attr->ia_uid; 4654 if (attr->ia_valid & ATTR_GID) 4655 inode->i_gid = attr->ia_gid; 4656 error = ext4_mark_inode_dirty(handle, inode); 4657 ext4_journal_stop(handle); 4658 } 4659 4660 if (attr->ia_valid & ATTR_SIZE) { 4661 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) { 4662 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 4663 4664 if (attr->ia_size > sbi->s_bitmap_maxbytes) { 4665 error = -EFBIG; 4666 goto err_out; 4667 } 4668 } 4669 } 4670 4671 if (S_ISREG(inode->i_mode) && 4672 attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) { 4673 handle_t *handle; 4674 4675 handle = ext4_journal_start(inode, 3); 4676 if (IS_ERR(handle)) { 4677 error = PTR_ERR(handle); 4678 goto err_out; 4679 } 4680 4681 error = ext4_orphan_add(handle, inode); 4682 EXT4_I(inode)->i_disksize = attr->ia_size; 4683 rc = ext4_mark_inode_dirty(handle, inode); 4684 if (!error) 4685 error = rc; 4686 ext4_journal_stop(handle); 4687 4688 if (ext4_should_order_data(inode)) { 4689 error = ext4_begin_ordered_truncate(inode, 4690 attr->ia_size); 4691 if (error) { 4692 /* Do as much error cleanup as possible */ 4693 handle = ext4_journal_start(inode, 3); 4694 if (IS_ERR(handle)) { 4695 ext4_orphan_del(NULL, inode); 4696 goto err_out; 4697 } 4698 ext4_orphan_del(handle, inode); 4699 ext4_journal_stop(handle); 4700 goto err_out; 4701 } 4702 } 4703 } 4704 4705 rc = inode_setattr(inode, attr); 4706 4707 /* If inode_setattr's call to ext4_truncate failed to get a 4708 * transaction handle at all, we need to clean up the in-core 4709 * orphan list manually. 
*/ 4710 if (inode->i_nlink) 4711 ext4_orphan_del(NULL, inode); 4712 4713 if (!rc && (ia_valid & ATTR_MODE)) 4714 rc = ext4_acl_chmod(inode); 4715 4716 err_out: 4717 ext4_std_error(inode->i_sb, error); 4718 if (!error) 4719 error = rc; 4720 return error; 4721 } 4722 4723 int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, 4724 struct kstat *stat) 4725 { 4726 struct inode *inode; 4727 unsigned long delalloc_blocks; 4728 4729 inode = dentry->d_inode; 4730 generic_fillattr(inode, stat); 4731 4732 /* 4733 * We can't update i_blocks if the block allocation is delayed 4734 * otherwise in the case of system crash before the real block 4735 * allocation is done, we will have i_blocks inconsistent with 4736 * on-disk file blocks. 4737 * We always keep i_blocks updated together with real 4738 * allocation. But to not confuse with user, stat 4739 * will return the blocks that include the delayed allocation 4740 * blocks for this file. 4741 */ 4742 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 4743 delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks; 4744 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 4745 4746 stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9; 4747 return 0; 4748 } 4749 4750 static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks, 4751 int chunk) 4752 { 4753 int indirects; 4754 4755 /* if nrblocks are contiguous */ 4756 if (chunk) { 4757 /* 4758 * With N contiguous data blocks, it need at most 4759 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) indirect blocks 4760 * 2 dindirect blocks 4761 * 1 tindirect block 4762 */ 4763 indirects = nrblocks / EXT4_ADDR_PER_BLOCK(inode->i_sb); 4764 return indirects + 3; 4765 } 4766 /* 4767 * if nrblocks are not contiguous, worse case, each block touch 4768 * a indirect block, and each indirect block touch a double indirect 4769 * block, plus a triple indirect block 4770 */ 4771 indirects = nrblocks * 2 + 1; 4772 return indirects; 4773 } 4774 4775 static int 
ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) 4776 { 4777 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) 4778 return ext4_indirect_trans_blocks(inode, nrblocks, chunk); 4779 return ext4_ext_index_trans_blocks(inode, nrblocks, chunk); 4780 } 4781 4782 /* 4783 * Account for index blocks, block groups bitmaps and block group 4784 * descriptor blocks if modify datablocks and index blocks 4785 * worse case, the indexs blocks spread over different block groups 4786 * 4787 * If datablocks are discontiguous, they are possible to spread over 4788 * different block groups too. If they are contiugous, with flexbg, 4789 * they could still across block group boundary. 4790 * 4791 * Also account for superblock, inode, quota and xattr blocks 4792 */ 4793 int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) 4794 { 4795 int groups, gdpblocks; 4796 int idxblocks; 4797 int ret = 0; 4798 4799 /* 4800 * How many index blocks need to touch to modify nrblocks? 4801 * The "Chunk" flag indicating whether the nrblocks is 4802 * physically contiguous on disk 4803 * 4804 * For Direct IO and fallocate, they calls get_block to allocate 4805 * one single extent at a time, so they could set the "Chunk" flag 4806 */ 4807 idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk); 4808 4809 ret = idxblocks; 4810 4811 /* 4812 * Now let's see how many group bitmaps and group descriptors need 4813 * to account 4814 */ 4815 groups = idxblocks; 4816 if (chunk) 4817 groups += 1; 4818 else 4819 groups += nrblocks; 4820 4821 gdpblocks = groups; 4822 if (groups > EXT4_SB(inode->i_sb)->s_groups_count) 4823 groups = EXT4_SB(inode->i_sb)->s_groups_count; 4824 if (groups > EXT4_SB(inode->i_sb)->s_gdb_count) 4825 gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count; 4826 4827 /* bitmaps and block group descriptor blocks */ 4828 ret += groups + gdpblocks; 4829 4830 /* Blocks for super block, inode, quota and xattr blocks */ 4831 ret += EXT4_META_TRANS_BLOCKS(inode->i_sb); 
4832 4833 return ret; 4834 } 4835 4836 /* 4837 * Calulate the total number of credits to reserve to fit 4838 * the modification of a single pages into a single transaction, 4839 * which may include multiple chunks of block allocations. 4840 * 4841 * This could be called via ext4_write_begin() 4842 * 4843 * We need to consider the worse case, when 4844 * one new block per extent. 4845 */ 4846 int ext4_writepage_trans_blocks(struct inode *inode) 4847 { 4848 int bpp = ext4_journal_blocks_per_page(inode); 4849 int ret; 4850 4851 ret = ext4_meta_trans_blocks(inode, bpp, 0); 4852 4853 /* Account for data blocks for journalled mode */ 4854 if (ext4_should_journal_data(inode)) 4855 ret += bpp; 4856 return ret; 4857 } 4858 4859 /* 4860 * Calculate the journal credits for a chunk of data modification. 4861 * 4862 * This is called from DIO, fallocate or whoever calling 4863 * ext4_get_blocks_wrap() to map/allocate a chunk of contigous disk blocks. 4864 * 4865 * journal buffers for data blocks are not included here, as DIO 4866 * and fallocate do no need to journal data buffers. 4867 */ 4868 int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks) 4869 { 4870 return ext4_meta_trans_blocks(inode, nrblocks, 1); 4871 } 4872 4873 /* 4874 * The caller must have previously called ext4_reserve_inode_write(). 4875 * Give this, we know that the caller already has write access to iloc->bh. 4876 */ 4877 int ext4_mark_iloc_dirty(handle_t *handle, 4878 struct inode *inode, struct ext4_iloc *iloc) 4879 { 4880 int err = 0; 4881 4882 if (test_opt(inode->i_sb, I_VERSION)) 4883 inode_inc_iversion(inode); 4884 4885 /* the do_update_inode consumes one bh->b_count */ 4886 get_bh(iloc->bh); 4887 4888 /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */ 4889 err = ext4_do_update_inode(handle, inode, iloc); 4890 put_bh(iloc->bh); 4891 return err; 4892 } 4893 4894 /* 4895 * On success, We end up with an outstanding reference count against 4896 * iloc->bh. 
This _must_ be cleaned up later. 4897 */ 4898 4899 int 4900 ext4_reserve_inode_write(handle_t *handle, struct inode *inode, 4901 struct ext4_iloc *iloc) 4902 { 4903 int err; 4904 4905 err = ext4_get_inode_loc(inode, iloc); 4906 if (!err) { 4907 BUFFER_TRACE(iloc->bh, "get_write_access"); 4908 err = ext4_journal_get_write_access(handle, iloc->bh); 4909 if (err) { 4910 brelse(iloc->bh); 4911 iloc->bh = NULL; 4912 } 4913 } 4914 ext4_std_error(inode->i_sb, err); 4915 return err; 4916 } 4917 4918 /* 4919 * Expand an inode by new_extra_isize bytes. 4920 * Returns 0 on success or negative error number on failure. 4921 */ 4922 static int ext4_expand_extra_isize(struct inode *inode, 4923 unsigned int new_extra_isize, 4924 struct ext4_iloc iloc, 4925 handle_t *handle) 4926 { 4927 struct ext4_inode *raw_inode; 4928 struct ext4_xattr_ibody_header *header; 4929 struct ext4_xattr_entry *entry; 4930 4931 if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) 4932 return 0; 4933 4934 raw_inode = ext4_raw_inode(&iloc); 4935 4936 header = IHDR(inode, raw_inode); 4937 entry = IFIRST(header); 4938 4939 /* No extended attributes present */ 4940 if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR) || 4941 header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { 4942 memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0, 4943 new_extra_isize); 4944 EXT4_I(inode)->i_extra_isize = new_extra_isize; 4945 return 0; 4946 } 4947 4948 /* try to expand with EAs present */ 4949 return ext4_expand_extra_isize_ea(inode, new_extra_isize, 4950 raw_inode, handle); 4951 } 4952 4953 /* 4954 * What we do here is to mark the in-core inode as clean with respect to inode 4955 * dirtiness (it may still be data-dirty). 4956 * This means that the in-core inode may be reaped by prune_icache 4957 * without having to perform any I/O. This is a very good thing, 4958 * because *any* task may call prune_icache - even ones which 4959 * have a transaction open against a different journal. 4960 * 4961 * Is this cheating? 
Not really. Sure, we haven't written the 4962 * inode out, but prune_icache isn't a user-visible syncing function. 4963 * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync) 4964 * we start and wait on commits. 4965 * 4966 * Is this efficient/effective? Well, we're being nice to the system 4967 * by cleaning up our inodes proactively so they can be reaped 4968 * without I/O. But we are potentially leaving up to five seconds' 4969 * worth of inodes floating about which prune_icache wants us to 4970 * write out. One way to fix that would be to get prune_icache() 4971 * to do a write_super() to free up some memory. It has the desired 4972 * effect. 4973 */ 4974 int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) 4975 { 4976 struct ext4_iloc iloc; 4977 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 4978 static unsigned int mnt_count; 4979 int err, ret; 4980 4981 might_sleep(); 4982 err = ext4_reserve_inode_write(handle, inode, &iloc); 4983 if (ext4_handle_valid(handle) && 4984 EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && 4985 !(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) { 4986 /* 4987 * We need extra buffer credits since we may write into EA block 4988 * with this same handle. If journal_extend fails, then it will 4989 * only result in a minor loss of functionality for that inode. 4990 * If this is felt to be critical, then e2fsck should be run to 4991 * force a large enough s_min_extra_isize. 4992 */ 4993 if ((jbd2_journal_extend(handle, 4994 EXT4_DATA_TRANS_BLOCKS(inode->i_sb))) == 0) { 4995 ret = ext4_expand_extra_isize(inode, 4996 sbi->s_want_extra_isize, 4997 iloc, handle); 4998 if (ret) { 4999 EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND; 5000 if (mnt_count != 5001 le16_to_cpu(sbi->s_es->s_mnt_count)) { 5002 ext4_warning(inode->i_sb, __func__, 5003 "Unable to expand inode %lu. 
Delete" 5004 " some EAs or run e2fsck.", 5005 inode->i_ino); 5006 mnt_count = 5007 le16_to_cpu(sbi->s_es->s_mnt_count); 5008 } 5009 } 5010 } 5011 } 5012 if (!err) 5013 err = ext4_mark_iloc_dirty(handle, inode, &iloc); 5014 return err; 5015 } 5016 5017 /* 5018 * ext4_dirty_inode() is called from __mark_inode_dirty() 5019 * 5020 * We're really interested in the case where a file is being extended. 5021 * i_size has been changed by generic_commit_write() and we thus need 5022 * to include the updated inode in the current transaction. 5023 * 5024 * Also, vfs_dq_alloc_block() will always dirty the inode when blocks 5025 * are allocated to the file. 5026 * 5027 * If the inode is marked synchronous, we don't honour that here - doing 5028 * so would cause a commit on atime updates, which we don't bother doing. 5029 * We handle synchronous inodes at the highest possible level. 5030 */ 5031 void ext4_dirty_inode(struct inode *inode) 5032 { 5033 handle_t *current_handle = ext4_journal_current_handle(); 5034 handle_t *handle; 5035 5036 if (!ext4_handle_valid(current_handle)) { 5037 ext4_mark_inode_dirty(current_handle, inode); 5038 return; 5039 } 5040 5041 handle = ext4_journal_start(inode, 2); 5042 if (IS_ERR(handle)) 5043 goto out; 5044 if (current_handle && 5045 current_handle->h_transaction != handle->h_transaction) { 5046 /* This task has a transaction open against a different fs */ 5047 printk(KERN_EMERG "%s: transactions do not match!\n", 5048 __func__); 5049 } else { 5050 jbd_debug(5, "marking dirty. outer handle=%p\n", 5051 current_handle); 5052 ext4_mark_inode_dirty(handle, inode); 5053 } 5054 ext4_journal_stop(handle); 5055 out: 5056 return; 5057 } 5058 5059 #if 0 5060 /* 5061 * Bind an inode's backing buffer_head into this transaction, to prevent 5062 * it from being flushed to disk early. 
Unlike 5063 * ext4_reserve_inode_write, this leaves behind no bh reference and 5064 * returns no iloc structure, so the caller needs to repeat the iloc 5065 * lookup to mark the inode dirty later. 5066 */ 5067 static int ext4_pin_inode(handle_t *handle, struct inode *inode) 5068 { 5069 struct ext4_iloc iloc; 5070 5071 int err = 0; 5072 if (handle) { 5073 err = ext4_get_inode_loc(inode, &iloc); 5074 if (!err) { 5075 BUFFER_TRACE(iloc.bh, "get_write_access"); 5076 err = jbd2_journal_get_write_access(handle, iloc.bh); 5077 if (!err) 5078 err = ext4_handle_dirty_metadata(handle, 5079 inode, 5080 iloc.bh); 5081 brelse(iloc.bh); 5082 } 5083 } 5084 ext4_std_error(inode->i_sb, err); 5085 return err; 5086 } 5087 #endif 5088 5089 int ext4_change_inode_journal_flag(struct inode *inode, int val) 5090 { 5091 journal_t *journal; 5092 handle_t *handle; 5093 int err; 5094 5095 /* 5096 * We have to be very careful here: changing a data block's 5097 * journaling status dynamically is dangerous. If we write a 5098 * data block to the journal, change the status and then delete 5099 * that block, we risk forgetting to revoke the old log record 5100 * from the journal and so a subsequent replay can corrupt data. 5101 * So, first we make sure that the journal is empty and that 5102 * nobody is changing anything. 5103 */ 5104 5105 journal = EXT4_JOURNAL(inode); 5106 if (!journal) 5107 return 0; 5108 if (is_journal_aborted(journal)) 5109 return -EROFS; 5110 5111 jbd2_journal_lock_updates(journal); 5112 jbd2_journal_flush(journal); 5113 5114 /* 5115 * OK, there are no updates running now, and all cached data is 5116 * synced to disk. We are now in a completely consistent state 5117 * which doesn't have anything in the journal, and we know that 5118 * no filesystem updates are running, so it is safe to modify 5119 * the inode's in-core data-journaling state flag now. 
5120 */ 5121 5122 if (val) 5123 EXT4_I(inode)->i_flags |= EXT4_JOURNAL_DATA_FL; 5124 else 5125 EXT4_I(inode)->i_flags &= ~EXT4_JOURNAL_DATA_FL; 5126 ext4_set_aops(inode); 5127 5128 jbd2_journal_unlock_updates(journal); 5129 5130 /* Finally we can mark the inode as dirty. */ 5131 5132 handle = ext4_journal_start(inode, 1); 5133 if (IS_ERR(handle)) 5134 return PTR_ERR(handle); 5135 5136 err = ext4_mark_inode_dirty(handle, inode); 5137 ext4_handle_sync(handle); 5138 ext4_journal_stop(handle); 5139 ext4_std_error(inode->i_sb, err); 5140 5141 return err; 5142 } 5143 5144 static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh) 5145 { 5146 return !buffer_mapped(bh); 5147 } 5148 5149 int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page) 5150 { 5151 loff_t size; 5152 unsigned long len; 5153 int ret = -EINVAL; 5154 void *fsdata; 5155 struct file *file = vma->vm_file; 5156 struct inode *inode = file->f_path.dentry->d_inode; 5157 struct address_space *mapping = inode->i_mapping; 5158 5159 /* 5160 * Get i_alloc_sem to stop truncates messing with the inode. We cannot 5161 * get i_mutex because we are already holding mmap_sem. 5162 */ 5163 down_read(&inode->i_alloc_sem); 5164 size = i_size_read(inode); 5165 if (page->mapping != mapping || size <= page_offset(page) 5166 || !PageUptodate(page)) { 5167 /* page got truncated from under us? */ 5168 goto out_unlock; 5169 } 5170 ret = 0; 5171 if (PageMappedToDisk(page)) 5172 goto out_unlock; 5173 5174 if (page->index == size >> PAGE_CACHE_SHIFT) 5175 len = size & ~PAGE_CACHE_MASK; 5176 else 5177 len = PAGE_CACHE_SIZE; 5178 5179 if (page_has_buffers(page)) { 5180 /* return if we have all the buffers mapped */ 5181 if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, 5182 ext4_bh_unmapped)) 5183 goto out_unlock; 5184 } 5185 /* 5186 * OK, we need to fill the hole... Do write_begin write_end 5187 * to do block allocation/reservation.We are not holding 5188 * inode.i__mutex here. 
That allow * parallel write_begin, 5189 * write_end call. lock_page prevent this from happening 5190 * on the same page though 5191 */ 5192 ret = mapping->a_ops->write_begin(file, mapping, page_offset(page), 5193 len, AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata); 5194 if (ret < 0) 5195 goto out_unlock; 5196 ret = mapping->a_ops->write_end(file, mapping, page_offset(page), 5197 len, len, page, fsdata); 5198 if (ret < 0) 5199 goto out_unlock; 5200 ret = 0; 5201 out_unlock: 5202 up_read(&inode->i_alloc_sem); 5203 return ret; 5204 } 5205