/*
 *  linux/fs/ext4/inode.c
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 *  from
 *
 *  linux/fs/minix/inode.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  Goal-directed block allocation by Stephen Tweedie
 *	(sct@redhat.com), 1993, 1998
 *  Big-endian to little-endian byte-swapping/bitmaps by
 *        David S. Miller (davem@caip.rutgers.edu), 1995
 *  64-bit file support on 64-bit platforms by Jakub Jelinek
 *	(jj@sunsite.ms.mff.cuni.cz)
 *
 *  Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000
 */

#include <linux/module.h>
#include <linux/fs.h>
#include <linux/time.h>
#include <linux/jbd2.h>
#include <linux/highuid.h>
#include <linux/pagemap.h>
#include <linux/quotaops.h>
#include <linux/string.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/mpage.h>
#include <linux/namei.h>
#include <linux/uio.h>
#include <linux/bio.h>
#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"
#include "ext4_extents.h"

#define MPAGE_DA_EXTENT_TAIL 0x01

static inline int ext4_begin_ordered_truncate(struct inode *inode,
					      loff_t new_size)
{
	return jbd2_journal_begin_ordered_truncate(&EXT4_I(inode)->jinode,
						   new_size);
}

static void ext4_invalidatepage(struct page *page, unsigned long offset);

/*
 * Test whether an inode is a fast symlink.
 */
static int ext4_inode_is_fast_symlink(struct inode *inode)
{
	int ea_blocks = EXT4_I(inode)->i_file_acl ?
		(inode->i_sb->s_blocksize >> 9) : 0;

	return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);
}

/*
 * The ext4 forget function must perform a revoke if we are freeing data
 * which has been journaled.  Metadata (eg. indirect blocks) must be
 * revoked in all cases.
 *
 * "bh" may be NULL: a metadata block may have been freed from memory
 * but there may still be a record of it in the journal, and that record
 * still needs to be revoked.
 */
int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
			struct buffer_head *bh, ext4_fsblk_t blocknr)
{
	int err;

	might_sleep();

	BUFFER_TRACE(bh, "enter");

	jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
		  "data mode %lx\n",
		  bh, is_metadata, inode->i_mode,
		  test_opt(inode->i_sb, DATA_FLAGS));

	/* Never use the revoke function if we are doing full data
	 * journaling: there is no need to, and a V1 superblock won't
	 * support it.  Otherwise, only skip the revoke on un-journaled
	 * data blocks. */

	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
	    (!is_metadata && !ext4_should_journal_data(inode))) {
		if (bh) {
			BUFFER_TRACE(bh, "call jbd2_journal_forget");
			return ext4_journal_forget(handle, bh);
		}
		return 0;
	}

	/*
	 * data!=journal && (is_metadata || should_journal_data(inode))
	 */
	BUFFER_TRACE(bh, "call ext4_journal_revoke");
	err = ext4_journal_revoke(handle, blocknr, bh);
	if (err)
		ext4_abort(inode->i_sb, __func__,
			   "error %d when attempting revoke", err);
	BUFFER_TRACE(bh, "exit");
	return err;
}

/*
 * Work out how many blocks we need to proceed with the next chunk of a
 * truncate transaction.
 */
static unsigned long blocks_for_truncate(struct inode *inode)
{
	ext4_lblk_t needed;

	needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);

	/* Give ourselves just enough room to cope with inodes in which
	 * i_blocks is corrupt: we've seen disk corruptions in the past
	 * which resulted in random data in an inode which looked enough
	 * like a regular file for ext4 to try to delete it.  Things
	 * will go a bit crazy if that happens, but at least we should
	 * try not to panic the whole kernel. */
	if (needed < 2)
		needed = 2;

	/* But we need to bound the transaction so we don't overflow the
	 * journal. */
	if (needed > EXT4_MAX_TRANS_DATA)
		needed = EXT4_MAX_TRANS_DATA;

	return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed;
}

/*
 * Truncate transactions can be complex and absolutely huge.  So we need to
 * be able to restart the transaction at a convenient checkpoint to make
 * sure we don't overflow the journal.
 *
 * start_transaction gets us a new handle for a truncate transaction,
 * and extend_transaction tries to extend the existing one a bit.  If
 * extend fails, we need to propagate the failure up and restart the
 * transaction in the top-level truncate loop. --sct
 */
static handle_t *start_transaction(struct inode *inode)
{
	handle_t *result;

	result = ext4_journal_start(inode, blocks_for_truncate(inode));
	if (!IS_ERR(result))
		return result;

	ext4_std_error(inode->i_sb, PTR_ERR(result));
	return result;
}

/*
 * Try to extend this transaction for the purposes of truncation.
 *
 * Returns 0 if we managed to create more room.  If we can't create more
 * room, and the transaction must be restarted we return 1.
 */
static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
{
	if (handle->h_buffer_credits > EXT4_RESERVE_TRANS_BLOCKS)
		return 0;
	if (!ext4_journal_extend(handle, blocks_for_truncate(inode)))
		return 0;
	return 1;
}

/*
 * Restart the transaction associated with *handle.  This does a commit,
 * so before we call here everything must be consistently dirtied against
 * this transaction.
 */
static int ext4_journal_test_restart(handle_t *handle, struct inode *inode)
{
	jbd_debug(2, "restarting handle %p\n", handle);
	return ext4_journal_restart(handle, blocks_for_truncate(inode));
}

/*
 * Called at the last iput() if i_nlink is zero.
 */
void ext4_delete_inode(struct inode *inode)
{
	handle_t *handle;
	int err;

	if (ext4_should_order_data(inode))
		ext4_begin_ordered_truncate(inode, 0);
	truncate_inode_pages(&inode->i_data, 0);

	if (is_bad_inode(inode))
		goto no_delete;

	handle = ext4_journal_start(inode, blocks_for_truncate(inode)+3);
	if (IS_ERR(handle)) {
		ext4_std_error(inode->i_sb, PTR_ERR(handle));
		/*
		 * If we're going to skip the normal cleanup, we still need to
		 * make sure that the in-core orphan linked list is properly
		 * cleaned up.
		 */
		ext4_orphan_del(NULL, inode);
		goto no_delete;
	}

	if (IS_SYNC(inode))
		handle->h_sync = 1;
	inode->i_size = 0;
	err = ext4_mark_inode_dirty(handle, inode);
	if (err) {
		ext4_warning(inode->i_sb, __func__,
			     "couldn't mark inode dirty (err %d)", err);
		goto stop_handle;
	}
	if (inode->i_blocks)
		ext4_truncate(inode);

	/*
	 * ext4_ext_truncate() doesn't reserve any slop when it
	 * restarts journal transactions; therefore there may not be
	 * enough credits left in the handle to remove the inode from
	 * the orphan list and set the dtime field.
	 */
	if (handle->h_buffer_credits < 3) {
		err = ext4_journal_extend(handle, 3);
		if (err > 0)
			err = ext4_journal_restart(handle, 3);
		if (err != 0) {
			ext4_warning(inode->i_sb, __func__,
				     "couldn't extend journal (err %d)", err);
		stop_handle:
			ext4_journal_stop(handle);
			goto no_delete;
		}
	}

	/*
	 * Kill off the orphan record which ext4_truncate created.
	 * AKPM: I think this can be inside the above `if'.
	 * Note that ext4_orphan_del() has to be able to cope with the
	 * deletion of a non-existent orphan - this is because we don't
	 * know if ext4_truncate() actually created an orphan record.
	 * (Well, we could do this if we need to, but heck - it works)
	 */
	ext4_orphan_del(handle, inode);
	EXT4_I(inode)->i_dtime	= get_seconds();

	/*
	 * One subtle ordering requirement: if anything has gone wrong
	 * (transaction abort, IO errors, whatever), then we can still
	 * do these next steps (the fs will already have been marked as
	 * having errors), but we can't free the inode if the mark_dirty
	 * fails.
	 */
	if (ext4_mark_inode_dirty(handle, inode))
		/* If that failed, just do the required in-core inode clear. */
		clear_inode(inode);
	else
		ext4_free_inode(handle, inode);
	ext4_journal_stop(handle);
	return;
no_delete:
	clear_inode(inode);	/* We must guarantee clearing of inode... */
}

typedef struct {
	__le32	*p;
	__le32	key;
	struct buffer_head *bh;
} Indirect;

static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
{
	p->key = *(p->p = v);
	p->bh = bh;
}

/**
 * ext4_block_to_path - parse the block number into array of offsets
 * @inode: inode in question (we are only interested in its superblock)
 * @i_block: block number to be parsed
 * @offsets: array to store the offsets in
 * @boundary: set this non-zero if the referred-to block is likely to be
 *	followed (on disk) by an indirect block.
 *
 * To store the locations of file's data ext4 uses a data structure common
 * for UNIX filesystems - tree of pointers anchored in the inode, with
 * data blocks at leaves and indirect blocks in intermediate nodes.
 * This function translates the block number into path in that tree -
 * return value is the path length and @offsets[n] is the offset of
 * pointer to (n+1)th node in the nth one.  If @block is out of range
 * (negative or too large) warning is printed and zero returned.
 *
 * Note: function doesn't find node addresses, so no IO is needed.  All
 * we need to know is the capacity of indirect blocks (taken from the
 * inode->i_sb).
 */

/*
 * Portability note: the last comparison (check that we fit into triple
 * indirect block) is spelled differently, because otherwise on an
 * architecture with 32-bit longs and 8Kb pages we might get into trouble
 * if our filesystem had 8Kb blocks. We might use long long, but that would
 * kill us on x86. Oh, well, at least the sign propagation does not matter -
 * i_block would have to be negative in the very beginning, so we would not
 * get there at all.
 */

static int ext4_block_to_path(struct inode *inode,
			ext4_lblk_t i_block,
			ext4_lblk_t offsets[4], int *boundary)
{
	int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb);
	int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb);
	const long direct_blocks = EXT4_NDIR_BLOCKS,
		indirect_blocks = ptrs,
		double_blocks = (1 << (ptrs_bits * 2));
	int n = 0;
	int final = 0;

	if (i_block < 0) {
		ext4_warning(inode->i_sb, "ext4_block_to_path", "block < 0");
	} else if (i_block < direct_blocks) {
		offsets[n++] = i_block;
		final = direct_blocks;
	} else if ((i_block -= direct_blocks) < indirect_blocks) {
		offsets[n++] = EXT4_IND_BLOCK;
		offsets[n++] = i_block;
		final = ptrs;
	} else if ((i_block -= indirect_blocks) < double_blocks) {
		offsets[n++] = EXT4_DIND_BLOCK;
		offsets[n++] = i_block >> ptrs_bits;
		offsets[n++] = i_block & (ptrs - 1);
		final = ptrs;
	} else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
		offsets[n++] = EXT4_TIND_BLOCK;
		offsets[n++] = i_block >> (ptrs_bits * 2);
		offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
		offsets[n++] = i_block & (ptrs - 1);
		final = ptrs;
	} else {
		ext4_warning(inode->i_sb, "ext4_block_to_path",
				"block %lu > max",
				i_block + direct_blocks +
				indirect_blocks + double_blocks);
	}
	if (boundary)
		*boundary = final - 1 - (i_block & (ptrs - 1));
	return n;
}
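
/*
 * Worked example (illustrative sketch only, not used by the code): on a
 * filesystem with 4KB blocks there are 1024 pointers per indirect block
 * and EXT4_NDIR_BLOCKS == 12 direct pointers in the inode, so mapping
 * logical block 5000 falls in the double-indirect range:
 *
 *	ext4_lblk_t offsets[4];
 *	int boundary;
 *	int depth = ext4_block_to_path(inode, 5000, offsets, &boundary);
 *
 * yields depth == 3 and offsets == { EXT4_DIND_BLOCK, 3, 892 }, because
 * 5000 - 12 - 1024 = 3964 = 3 * 1024 + 892.
 */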

/**
 * ext4_get_branch - read the chain of indirect blocks leading to data
 * @inode: inode in question
 * @depth: depth of the chain (1 - direct pointer, etc.)
 * @offsets: offsets of pointers in inode/indirect blocks
 * @chain: place to store the result
 * @err: here we store the error value
 *
 * Function fills the array of triples <key, p, bh> and returns %NULL
 * if everything went OK or the pointer to the last filled triple
 * (incomplete one) otherwise. Upon the return chain[i].key contains
 * the number of (i+1)-th block in the chain (as it is stored in memory,
 * i.e. little-endian 32-bit), chain[i].p contains the address of that
 * number (it points into struct inode for i==0 and into the bh->b_data
 * for i>0) and chain[i].bh points to the buffer_head of i-th indirect
 * block for i>0 and NULL for i==0. In other words, it holds the block
 * numbers of the chain, addresses they were taken from (and where we can
 * verify that chain did not change) and buffer_heads hosting these
 * numbers.
 *
 * Function stops when it stumbles upon zero pointer (absent block)
 *	(pointer to last triple returned, *@err == 0)
 * or when it gets an IO error reading an indirect block
 *	(ditto, *@err == -EIO)
 * or when it reads all @depth-1 indirect blocks successfully and finds
 * the whole chain, all the way to the data (returns %NULL, *err == 0).
 *
 * Need to be called with
 * down_read(&EXT4_I(inode)->i_data_sem)
 */
static Indirect *ext4_get_branch(struct inode *inode, int depth,
				 ext4_lblk_t *offsets,
				 Indirect chain[4], int *err)
{
	struct super_block *sb = inode->i_sb;
	Indirect *p = chain;
	struct buffer_head *bh;

	*err = 0;
	/* i_data is not going away, no lock needed */
	add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets);
	if (!p->key)
		goto no_block;
	while (--depth) {
		bh = sb_bread(sb, le32_to_cpu(p->key));
		if (!bh)
			goto failure;
		add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets);
		/* Reader: end */
		if (!p->key)
			goto no_block;
	}
	return NULL;

failure:
	*err = -EIO;
no_block:
	return p;
}

/**
 * ext4_find_near - find a place for allocation with sufficient locality
 * @inode: owner
 * @ind: descriptor of indirect block.
 *
 * This function returns the preferred place for block allocation.
 * It is used when heuristic for sequential allocation fails.
 * Rules are:
 *   + if there is a block to the left of our position - allocate near it.
 *   + if pointer will live in indirect block - allocate near that block.
 *   + if pointer will live in inode - allocate in the same
 *     cylinder group.
 *
 * In the latter case we colour the starting block by the callers PID to
 * prevent it from clashing with concurrent allocations for a different inode
 * in the same block group.  The PID is used here so that functionally related
 * files will be close-by on-disk.
 *
 * Caller must make sure that @ind is valid and will stay that way.
 */
static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	__le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data;
	__le32 *p;
	ext4_fsblk_t bg_start;
	ext4_fsblk_t last_block;
	ext4_grpblk_t colour;

	/* Try to find previous block */
	for (p = ind->p - 1; p >= start; p--) {
		if (*p)
			return le32_to_cpu(*p);
	}

	/* No such thing, so let's try location of indirect block */
	if (ind->bh)
		return ind->bh->b_blocknr;

	/*
	 * It is going to be referred to from the inode itself? OK, just put it
	 * into the same cylinder group then.
	 */
	bg_start = ext4_group_first_block_no(inode->i_sb, ei->i_block_group);
	last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;

	if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
		colour = (current->pid % 16) *
			(EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
	else
		colour = (current->pid % 16) * ((last_block - bg_start) / 16);
	return bg_start + colour;
}

/**
 * ext4_find_goal - find a preferred place for allocation.
 * @inode: owner
 * @block: block we want
 * @partial: pointer to the last triple within a chain
 *
 * Normally this function finds the preferred place for block allocation
 * and returns it.
 */
static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
		Indirect *partial)
{
	/*
	 * XXX need to get goal block from mballoc's data structures
	 */

	return ext4_find_near(inode, partial);
}

/**
 * ext4_blks_to_allocate: Look up the block map and count the number
 * of direct blocks that need to be allocated for the given branch.
 *
 * @branch: chain of indirect blocks
 * @k: number of blocks needed for indirect blocks
 * @blks: number of data blocks to be mapped.
 * @blocks_to_boundary: the offset in the indirect block
 *
 * return the total number of blocks to be allocated, including the
 * direct and indirect blocks.
 */
static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned long blks,
		int blocks_to_boundary)
{
	unsigned long count = 0;

	/*
	 * Simple case: the [t,d]indirect block(s) has not been allocated yet,
	 * so clearly the blocks on that path have not been allocated either
	 */
	if (k > 0) {
		/* right now we don't handle cross boundary allocation */
		if (blks < blocks_to_boundary + 1)
			count += blks;
		else
			count += blocks_to_boundary + 1;
		return count;
	}

	count++;
	while (count < blks && count <= blocks_to_boundary &&
		le32_to_cpu(*(branch[0].p + count)) == 0) {
		count++;
	}
	return count;
}

/**
 * ext4_alloc_blocks: allocate the multiple blocks needed for a branch
 * @indirect_blks: the number of blocks that need to be allocated for
 *	indirect blocks
 *
 * @new_blocks: on return it will store the new block numbers for
 *	the indirect blocks(if needed) and the first direct block,
 * @blks: on return it will store the total number of allocated
 *	direct blocks
 */
static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
				ext4_lblk_t iblock, ext4_fsblk_t goal,
				int indirect_blks, int blks,
				ext4_fsblk_t new_blocks[4], int *err)
{
	int target, i;
	unsigned long count = 0, blk_allocated = 0;
	int index = 0;
	ext4_fsblk_t current_block = 0;
	int ret = 0;

	/*
	 * Here we try to allocate the requested multiple blocks at once,
	 * on a best-effort basis.
	 * To build a branch, we should allocate blocks for
	 * the indirect blocks(if not allocated yet), and at least
	 * the first direct block of this branch.
	 * That's the minimum number of blocks we need to allocate (required)
	 */
	/* first we try to allocate the indirect blocks */
	target = indirect_blks;
	while (target > 0) {
		count = target;
		/* allocating blocks for indirect blocks and direct blocks */
		current_block = ext4_new_meta_blocks(handle, inode,
							goal, &count, err);
		if (*err)
			goto failed_out;

		target -= count;
		/* allocate blocks for indirect blocks */
		while (index < indirect_blks && count) {
			new_blocks[index++] = current_block++;
			count--;
		}
		if (count > 0) {
			/*
			 * save the new block number
			 * for the first direct block
			 */
			new_blocks[index] = current_block;
			printk(KERN_INFO "%s returned more blocks than "
						"requested\n", __func__);
			WARN_ON(1);
			break;
		}
	}

	target = blks - count;
	blk_allocated = count;
	if (!target)
		goto allocated;
	/* Now allocate data blocks */
	count = target;
	/* allocating blocks for data blocks */
	current_block = ext4_new_blocks(handle, inode, iblock,
						goal, &count, err);
	if (*err && (target == blks)) {
		/*
		 * if the allocation failed and we didn't allocate
		 * any blocks before
		 */
		goto failed_out;
	}
	if (!*err) {
		if (target == blks) {
			/*
			 * save the new block number
			 * for the first direct block
			 */
			new_blocks[index] = current_block;
		}
		blk_allocated += count;
	}
allocated:
	/* total number of blocks allocated for direct blocks */
	ret = blk_allocated;
	*err = 0;
	return ret;
failed_out:
	for (i = 0; i < index; i++)
		ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
	return ret;
}

/**
 * ext4_alloc_branch - allocate and set up a chain of blocks.
 * @inode: owner
 * @indirect_blks: number of allocated indirect blocks
 * @blks: number of allocated direct blocks
 * @offsets: offsets (in the blocks) to store the pointers to next.
 * @branch: place to store the chain in.
 *
 * This function allocates blocks, zeroes out all but the last one,
 * links them into chain and (if we are synchronous) writes them to disk.
 * In other words, it prepares a branch that can be spliced onto the
 * inode. It stores the information about that chain in the branch[], in
 * the same format as ext4_get_branch() would do. We are calling it after
 * we had read the existing part of chain and partial points to the last
 * triple of that (one with zero ->key). Upon the exit we have the same
 * picture as after the successful ext4_get_block(), except that in one
 * place chain is disconnected - *branch->p is still zero (we did not
 * set the last link), but branch->key contains the number that should
 * be placed into *branch->p to fill that gap.
 *
 * If allocation fails we free all blocks we've allocated (and forget
 * their buffer_heads) and return the error value from the failed
 * ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain
 * as described above and return 0.
 */
static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
				ext4_lblk_t iblock, int indirect_blks,
				int *blks, ext4_fsblk_t goal,
				ext4_lblk_t *offsets, Indirect *branch)
{
	int blocksize = inode->i_sb->s_blocksize;
	int i, n = 0;
	int err = 0;
	struct buffer_head *bh;
	int num;
	ext4_fsblk_t new_blocks[4];
	ext4_fsblk_t current_block;

	num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks,
				*blks, new_blocks, &err);
	if (err)
		return err;

	branch[0].key = cpu_to_le32(new_blocks[0]);
	/*
	 * metadata blocks and data blocks are allocated.
	 */
	for (n = 1; n <= indirect_blks; n++) {
		/*
		 * Get buffer_head for parent block, zero it out
		 * and set the pointer to new one, then send
		 * parent to disk.
		 */
		bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
		branch[n].bh = bh;
		lock_buffer(bh);
		BUFFER_TRACE(bh, "call get_create_access");
		err = ext4_journal_get_create_access(handle, bh);
		if (err) {
			unlock_buffer(bh);
			brelse(bh);
			goto failed;
		}

		memset(bh->b_data, 0, blocksize);
		branch[n].p = (__le32 *) bh->b_data + offsets[n];
		branch[n].key = cpu_to_le32(new_blocks[n]);
		*branch[n].p = branch[n].key;
		if (n == indirect_blks) {
			current_block = new_blocks[n];
			/*
			 * End of chain, update the last new metablock of
			 * the chain to point to the new allocated
			 * data blocks numbers
			 */
			for (i = 1; i < num; i++)
				*(branch[n].p + i) = cpu_to_le32(++current_block);
		}
		BUFFER_TRACE(bh, "marking uptodate");
		set_buffer_uptodate(bh);
		unlock_buffer(bh);

		BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
		err = ext4_journal_dirty_metadata(handle, bh);
		if (err)
			goto failed;
	}
	*blks = num;
	return err;
failed:
	/* Allocation failed, free what we already allocated */
	for (i = 1; i <= n; i++) {
		BUFFER_TRACE(branch[i].bh, "call jbd2_journal_forget");
		ext4_journal_forget(handle, branch[i].bh);
	}
	for (i = 0; i < indirect_blks; i++)
		ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);

	ext4_free_blocks(handle, inode, new_blocks[i], num, 0);

	return err;
}

/**
 * ext4_splice_branch - splice the allocated branch onto inode.
 * @inode: owner
 * @block: (logical) number of block we are adding
 * @chain: chain of indirect blocks (with a missing link - see
 *	ext4_alloc_branch)
 * @where: location of missing link
 * @num:   number of indirect blocks we are adding
 * @blks:  number of direct blocks we are adding
 *
 * This function fills the missing link and does all housekeeping needed in
 * inode (->i_blocks, etc.). In case of success we end up with the full
 * chain to new block and return 0.
 */
static int ext4_splice_branch(handle_t *handle, struct inode *inode,
			ext4_lblk_t block, Indirect *where, int num, int blks)
{
	int i;
	int err = 0;
	ext4_fsblk_t current_block;

	/*
	 * If we're splicing into a [td]indirect block (as opposed to the
	 * inode) then we need to get write access to the [td]indirect block
	 * before the splice.
	 */
	if (where->bh) {
		BUFFER_TRACE(where->bh, "get_write_access");
		err = ext4_journal_get_write_access(handle, where->bh);
		if (err)
			goto err_out;
	}
	/* That's it */

	*where->p = where->key;

	/*
	 * Update the host buffer_head or inode to point to the just
	 * allocated direct blocks
	 */
	if (num == 0 && blks > 1) {
		current_block = le32_to_cpu(where->key) + 1;
		for (i = 1; i < blks; i++)
			*(where->p + i) = cpu_to_le32(current_block++);
	}

	/* We are done with atomic stuff, now do the rest of housekeeping */

	inode->i_ctime = ext4_current_time(inode);
	ext4_mark_inode_dirty(handle, inode);

	/* had we spliced it onto indirect block? */
	if (where->bh) {
		/*
		 * If we spliced it onto an indirect block, we haven't
		 * altered the inode.  Note however that if it is being spliced
		 * onto an indirect block at the very end of the file (the
		 * file is growing) then we *will* alter the inode to reflect
		 * the new i_size.  But that is not done here - it is done in
		 * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode.
		 */
		jbd_debug(5, "splicing indirect only\n");
		BUFFER_TRACE(where->bh, "call ext4_journal_dirty_metadata");
		err = ext4_journal_dirty_metadata(handle, where->bh);
		if (err)
			goto err_out;
	} else {
		/*
		 * OK, we spliced it into the inode itself on a direct block.
		 * Inode was dirtied above.
		 */
		jbd_debug(5, "splicing direct\n");
	}
	return err;

err_out:
	for (i = 1; i <= num; i++) {
		BUFFER_TRACE(where[i].bh, "call jbd2_journal_forget");
		ext4_journal_forget(handle, where[i].bh);
		ext4_free_blocks(handle, inode,
					le32_to_cpu(where[i-1].key), 1, 0);
	}
	ext4_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks, 0);

	return err;
}

/*
 * Allocation strategy is simple: if we have to allocate something, we will
 * have to go the whole way to leaf. So let's do it before attaching anything
 * to tree, set linkage between the newborn blocks, write them if sync is
 * required, recheck the path, free and repeat if check fails, otherwise
 * set the last missing link (that will protect us from any truncate-generated
 * removals - all blocks on the path are immune now) and possibly force the
 * write on the parent block.
 * That has a nice additional property: no special recovery from the failed
 * allocations is needed - we simply release blocks and do not touch anything
 * reachable from inode.
 *
 * `handle' can be NULL if create == 0.
 *
 * return > 0, # of blocks mapped or allocated.
 * return = 0, if plain lookup failed.
 * return < 0, error case.
 *
 *
 * Need to be called with
 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block
 * (ie, create is zero).
 * Otherwise down_write(&EXT4_I(inode)->i_data_sem)
 */
int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
		ext4_lblk_t iblock, unsigned long maxblocks,
		struct buffer_head *bh_result,
		int create, int extend_disksize)
{
	int err = -EIO;
	ext4_lblk_t offsets[4];
	Indirect chain[4];
	Indirect *partial;
	ext4_fsblk_t goal;
	int indirect_blks;
	int blocks_to_boundary = 0;
	int depth;
	struct ext4_inode_info *ei = EXT4_I(inode);
	int count = 0;
	ext4_fsblk_t first_block = 0;
	loff_t disksize;


	J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL));
	J_ASSERT(handle != NULL || create == 0);
	depth = ext4_block_to_path(inode, iblock, offsets,
					&blocks_to_boundary);

	if (depth == 0)
		goto out;

	partial = ext4_get_branch(inode, depth, offsets, chain, &err);

	/* Simplest case - block found, no allocation needed */
	if (!partial) {
		first_block = le32_to_cpu(chain[depth - 1].key);
		clear_buffer_new(bh_result);
		count++;
		/*map more blocks*/
		while (count < maxblocks && count <= blocks_to_boundary) {
			ext4_fsblk_t blk;

			blk = le32_to_cpu(*(chain[depth-1].p + count));

			if (blk == first_block + count)
				count++;
			else
				break;
		}
		goto got_it;
	}

	/* Next simple case - plain lookup or failed read of indirect block */
	if (!create || err == -EIO)
		goto cleanup;

	/*
	 * Okay, we need to do block allocation.
	 */
	goal = ext4_find_goal(inode, iblock, partial);

	/* the number of blocks that need to be allocated for [d,t]indirect blocks */
	indirect_blks = (chain + depth) - partial - 1;

	/*
	 * Next look up the indirect map to count the total number of
	 * direct blocks to allocate for this branch.
	 */
	count = ext4_blks_to_allocate(partial, indirect_blks,
					maxblocks, blocks_to_boundary);
	/*
	 * Block out ext4_truncate while we alter the tree
	 */
	err = ext4_alloc_branch(handle, inode, iblock, indirect_blks,
					&count, goal,
					offsets + (partial - chain), partial);

	/*
	 * The ext4_splice_branch call will free and forget any buffers
	 * on the new chain if there is a failure, but that risks using
	 * up transaction credits, especially for bitmaps where the
	 * credits cannot be returned.  Can we handle this somehow?  We
	 * may need to return -EAGAIN upwards in the worst case. --sct
	 */
	if (!err)
		err = ext4_splice_branch(handle, inode, iblock,
					partial, indirect_blks, count);
	/*
	 * i_disksize growing is protected by i_data_sem.
	 * Don't forget to protect it if you're about to implement
	 * concurrent ext4_get_block() -bzzz
	 */
	if (!err && extend_disksize) {
		disksize = ((loff_t) iblock + count) << inode->i_blkbits;
		if (disksize > i_size_read(inode))
			disksize = i_size_read(inode);
		if (disksize > ei->i_disksize)
			ei->i_disksize = disksize;
	}
	if (err)
		goto cleanup;

	set_buffer_new(bh_result);
got_it:
	map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
	if (count > blocks_to_boundary)
		set_buffer_boundary(bh_result);
	err = count;
	/* Clean up and exit */
	partial = chain + depth - 1;	/* the whole chain */
cleanup:
	while (partial > chain) {
		BUFFER_TRACE(partial->bh, "call brelse");
		brelse(partial->bh);
		partial--;
	}
	BUFFER_TRACE(bh_result, "returned");
out:
	return err;
}

/*
 * Calculate the number of metadata blocks that need to be reserved
 * to allocate @blocks for a non-extent-based file
 */
static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks)
{
	int icap = EXT4_ADDR_PER_BLOCK(inode->i_sb);
	int ind_blks, dind_blks, tind_blks;

	/* number of new indirect blocks needed */
	ind_blks = (blocks + icap - 1) / icap;

	dind_blks = (ind_blks + icap - 1) / icap;

	tind_blks = 1;

	return ind_blks + dind_blks + tind_blks;
}

/*
 * Calculate the number of metadata blocks that need to be reserved
 * to allocate a given number of blocks
 */
static int ext4_calc_metadata_amount(struct inode *inode, int blocks)
{
	if (!blocks)
		return 0;

	if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
		return ext4_ext_calc_metadata_amount(inode, blocks);

	return ext4_indirect_calc_metadata_amount(inode, blocks);
}
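
/*
 * Quick worked example of the indirect-format reservation above, assuming
 * 4KB blocks (icap == 1024): for blocks == 1000, ind_blks = (1000 + 1023) /
 * 1024 = 1, dind_blks = (1 + 1023) / 1024 = 1 and tind_blks = 1, so 3
 * metadata blocks are reserved on top of the 1000 data blocks.  This is a
 * deliberate over-estimate; the excess is handed back by
 * ext4_da_update_reserve_space() below once the real allocation is known.
 * (Illustrative sketch only.)
 */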

static void ext4_da_update_reserve_space(struct inode *inode, int used)
{
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	int total, mdb, mdb_free;

	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
	/* recalculate the number of metablocks still need to be reserved */
	total = EXT4_I(inode)->i_reserved_data_blocks - used;
	mdb = ext4_calc_metadata_amount(inode, total);

	/* figure out how many metablocks to release */
	BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
	mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;

	if (mdb_free) {
		/* Account for allocated meta_blocks */
		mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks;

		/* update fs dirty blocks counter */
		percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free);
		EXT4_I(inode)->i_allocated_meta_blocks = 0;
		EXT4_I(inode)->i_reserved_meta_blocks = mdb;
	}

	/* update per-inode reservations */
	BUG_ON(used > EXT4_I(inode)->i_reserved_data_blocks);
	EXT4_I(inode)->i_reserved_data_blocks -= used;

	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
}

/*
 * The ext4_get_blocks_wrap() function tries to look up the requested blocks,
 * and returns if the blocks are already mapped.
 *
 * Otherwise it takes the write lock of i_data_sem, allocates blocks,
 * stores the allocated blocks in the result buffer head and marks it
 * mapped.
 *
 * If the file type is extents based, it will call ext4_ext_get_blocks();
 * otherwise it calls ext4_get_blocks_handle() to handle indirect-mapping
 * based files.
 *
 * On success, it returns the number of blocks being mapped or allocated.
 * If create == 0 and the blocks are pre-allocated and uninitialized,
 * the result buffer head is unmapped. If create == 1, it will make sure
 * the buffer head is mapped.
 *
 * It returns 0 if plain look up failed (blocks have not been allocated); in
 * that case, the buffer head is unmapped.
 *
 * It returns the error in case of allocation failure.
 */
int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
			unsigned long max_blocks, struct buffer_head *bh,
			int create, int extend_disksize, int flag)
{
	int retval;

	clear_buffer_mapped(bh);

	/*
	 * Try to see if we can get the block without requesting
	 * for new file system block.
	 */
	down_read((&EXT4_I(inode)->i_data_sem));
	if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
		retval = ext4_ext_get_blocks(handle, inode, block, max_blocks,
				bh, 0, 0);
	} else {
		retval = ext4_get_blocks_handle(handle,
				inode, block, max_blocks, bh, 0, 0);
	}
	up_read((&EXT4_I(inode)->i_data_sem));

	/* If it is only a block(s) look up */
	if (!create)
		return retval;

	/*
	 * Return if the blocks have already been allocated.
	 *
	 * Note that if blocks have been preallocated
	 * ext4_ext_get_block() returns with create = 0
	 * and the buffer head unmapped.
	 */
	if (retval > 0 && buffer_mapped(bh))
		return retval;

	/*
	 * New block allocation and/or writing to an uninitialized extent
	 * will possibly result in updating i_data, so we take
	 * the write lock of i_data_sem, and call get_blocks()
	 * with create == 1 flag.
	 */
	down_write((&EXT4_I(inode)->i_data_sem));

	/*
	 * if the caller is from delayed allocation writeout path
	 * we have already reserved fs blocks for allocation
	 * let the underlying get_block() function know to
	 * avoid double accounting
	 */
	if (flag)
		EXT4_I(inode)->i_delalloc_reserved_flag = 1;
	/*
	 * We need to check for EXT4 here because migrate
	 * could have changed the inode type in between
	 */
	if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
		retval = ext4_ext_get_blocks(handle, inode, block, max_blocks,
				bh, create, extend_disksize);
	} else {
		retval = ext4_get_blocks_handle(handle, inode, block,
				max_blocks, bh, create, extend_disksize);

		if (retval > 0 && buffer_new(bh)) {
			/*
			 * We allocated new blocks which will result in
			 * i_data's format changing.  Force the migrate
			 * to fail by clearing migrate flags
			 */
			EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags &
							~EXT4_EXT_MIGRATE;
		}
	}

	if (flag) {
		EXT4_I(inode)->i_delalloc_reserved_flag = 0;
		/*
		 * Update reserved blocks/metadata blocks
		 * after successful block allocation
		 * which were deferred till now
		 */
		if ((retval > 0) && buffer_delay(bh))
			ext4_da_update_reserve_space(inode, retval);
	}

	up_write((&EXT4_I(inode)->i_data_sem));
	return retval;
}

/* Maximum number of blocks we map for direct IO at once. */
#define DIO_MAX_BLOCKS 4096

int ext4_get_block(struct inode *inode, sector_t iblock,
		   struct buffer_head *bh_result, int create)
{
	handle_t *handle = ext4_journal_current_handle();
	int ret = 0, started = 0;
	unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
	int dio_credits;

	if (create && !handle) {
		/* Direct IO write... */
		if (max_blocks > DIO_MAX_BLOCKS)
			max_blocks = DIO_MAX_BLOCKS;
		dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
		handle = ext4_journal_start(inode, dio_credits);
		if (IS_ERR(handle)) {
			ret = PTR_ERR(handle);
			goto out;
		}
		started = 1;
	}

	ret = ext4_get_blocks_wrap(handle, inode, iblock,
					max_blocks, bh_result, create, 0, 0);
	if (ret > 0) {
		bh_result->b_size = (ret << inode->i_blkbits);
		ret = 0;
	}
	if (started)
		ext4_journal_stop(handle);
out:
	return ret;
}
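
/*
 * Rough illustration (not part of the original code): an O_DIRECT write
 * that asks for 1MB of mapping on a 4KB-block filesystem arrives here with
 * bh_result->b_size == 1MB, i.e. max_blocks == 256.  That is well below
 * DIO_MAX_BLOCKS, so one handle started with
 * ext4_chunk_trans_blocks(inode, 256) credits covers the whole request;
 * the exact credit count depends on the inode's mapping format.
 */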

/*
 * `handle' can be NULL if create is zero
 */
struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
				ext4_lblk_t block, int create, int *errp)
{
	struct buffer_head dummy;
	int fatal = 0, err;

	J_ASSERT(handle != NULL || create == 0);

	dummy.b_state = 0;
	dummy.b_blocknr = -1000;
	buffer_trace_init(&dummy.b_history);
	err = ext4_get_blocks_wrap(handle, inode, block, 1,
					&dummy, create, 1, 0);
	/*
	 * ext4_get_blocks_handle() returns number of blocks
	 * mapped. 0 in case of a HOLE.
	 */
	if (err > 0) {
		if (err > 1)
			WARN_ON(1);
		err = 0;
	}
	*errp = err;
	if (!err && buffer_mapped(&dummy)) {
		struct buffer_head *bh;
		bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
		if (!bh) {
			*errp = -EIO;
			goto err;
		}
		if (buffer_new(&dummy)) {
			J_ASSERT(create != 0);
			J_ASSERT(handle != NULL);

			/*
			 * Now that we do not always journal data, we should
			 * keep in mind whether this should always journal the
			 * new buffer as metadata.  For now, regular file
			 * writes use ext4_get_block instead, so it's not a
			 * problem.
			 */
			lock_buffer(bh);
			BUFFER_TRACE(bh, "call get_create_access");
			fatal = ext4_journal_get_create_access(handle, bh);
			if (!fatal && !buffer_uptodate(bh)) {
				memset(bh->b_data, 0, inode->i_sb->s_blocksize);
				set_buffer_uptodate(bh);
			}
			unlock_buffer(bh);
			BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
			err = ext4_journal_dirty_metadata(handle, bh);
			if (!fatal)
				fatal = err;
		} else {
			BUFFER_TRACE(bh, "not a new buffer");
		}
		if (fatal) {
			*errp = fatal;
			brelse(bh);
			bh = NULL;
		}
		return bh;
	}
err:
	return NULL;
}

struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
			       ext4_lblk_t block, int create, int *err)
{
	struct buffer_head *bh;

	bh = ext4_getblk(handle, inode, block, create, err);
	if (!bh)
		return bh;
	if (buffer_uptodate(bh))
		return bh;
	ll_rw_block(READ_META, 1, &bh);
	wait_on_buffer(bh);
	if (buffer_uptodate(bh))
		return bh;
	put_bh(bh);
	*err = -EIO;
	return NULL;
}

static int walk_page_buffers(handle_t *handle,
			     struct buffer_head *head,
			     unsigned from,
			     unsigned to,
			     int *partial,
			     int (*fn)(handle_t *handle,
				       struct buffer_head *bh))
{
	struct buffer_head *bh;
	unsigned block_start, block_end;
	unsigned blocksize = head->b_size;
	int err, ret = 0;
	struct buffer_head *next;

	for (bh = head, block_start = 0;
	     ret == 0 && (bh != head || !block_start);
	     block_start = block_end, bh = next)
	{
		next = bh->b_this_page;
		block_end = block_start + blocksize;
		if (block_end <= from || block_start >= to) {
			if (partial && !buffer_uptodate(bh))
				*partial = 1;
			continue;
		}
		err = (*fn)(handle, bh);
		if (!ret)
			ret = err;
	}
	return ret;
}

/*
 * To preserve ordering, it is essential that the hole instantiation and
 * the data write be encapsulated in a single transaction.  We cannot
 * close off a transaction and start a new one between the ext4_get_block()
 * and the commit_write().  So doing the jbd2_journal_start at the start of
 * prepare_write() is the right place.
 *
 * Also, this function can nest inside ext4_writepage() ->
 * block_write_full_page(). In that case, we *know* that ext4_writepage()
 * has generated enough buffer credits to do the whole page.  So we won't
 * block on the journal in that case, which is good, because the caller may
 * be PF_MEMALLOC.
 *
 * By accident, ext4 can be reentered when a transaction is open via
 * quota file writes.  If we were to commit the transaction while thus
 * reentered, there can be a deadlock - we would be holding a quota
 * lock, and the commit would never complete if another thread had a
 * transaction open and was blocking on the quota lock - a ranking
 * violation.
 *
 * So what we do is to rely on the fact that jbd2_journal_stop/journal_start
 * will _not_ run commit under these circumstances because handle->h_ref
 * is elevated.  We'll still have enough credits for the tiny quotafile
 * write.
 */
static int do_journal_get_write_access(handle_t *handle,
					struct buffer_head *bh)
{
	if (!buffer_mapped(bh) || buffer_freed(bh))
		return 0;
	return ext4_journal_get_write_access(handle, bh);
}

static int ext4_write_begin(struct file *file, struct address_space *mapping,
				loff_t pos, unsigned len, unsigned flags,
				struct page **pagep, void **fsdata)
{
	struct inode *inode = mapping->host;
	int ret, needed_blocks = ext4_writepage_trans_blocks(inode);
	handle_t *handle;
	int retries = 0;
	struct page *page;
	pgoff_t index;
	unsigned from, to;

	index = pos >> PAGE_CACHE_SHIFT;
	from = pos & (PAGE_CACHE_SIZE - 1);
	to = from + len;

retry:
	handle = ext4_journal_start(inode, needed_blocks);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		goto out;
	}

	page = grab_cache_page_write_begin(mapping, index, flags);
	if (!page) {
		ext4_journal_stop(handle);
		ret = -ENOMEM;
		goto out;
	}
	*pagep = page;

	ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
				ext4_get_block);

	if (!ret && ext4_should_journal_data(inode)) {
		ret = walk_page_buffers(handle, page_buffers(page),
				from, to, NULL, do_journal_get_write_access);
	}

	if (ret) {
		unlock_page(page);
		ext4_journal_stop(handle);
		page_cache_release(page);
		/*
		 * block_write_begin may have instantiated a few blocks
		 * outside i_size.  Trim these off again. Don't need
		 * i_size_read because we hold i_mutex.
		 */
		if (pos + len > inode->i_size)
			vmtruncate(inode, inode->i_size);
	}

	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
		goto retry;
out:
	return ret;
}

/* For write_end() in data=journal mode */
static int write_end_fn(handle_t *handle, struct buffer_head *bh)
{
	if (!buffer_mapped(bh) || buffer_freed(bh))
		return 0;
	set_buffer_uptodate(bh);
	return ext4_journal_dirty_metadata(handle, bh);
}

/*
 * We need to pick up the new inode size which generic_commit_write gave us.
 * `file' can be NULL - eg, when called from page_symlink().
 *
 * ext4 never places buffers on inode->i_mapping->private_list.  metadata
 * buffers are managed internally.
 */
static int ext4_ordered_write_end(struct file *file,
				  struct address_space *mapping,
				  loff_t pos, unsigned len, unsigned copied,
				  struct page *page, void *fsdata)
{
	handle_t *handle = ext4_journal_current_handle();
	struct inode *inode = mapping->host;
	int ret = 0, ret2;

	ret = ext4_jbd2_file_inode(handle, inode);

	if (ret == 0) {
		loff_t new_i_size;

		new_i_size = pos + copied;
		if (new_i_size > EXT4_I(inode)->i_disksize) {
			ext4_update_i_disksize(inode, new_i_size);
			/* We need to mark inode dirty even if
			 * new_i_size is less than inode->i_size
			 * but greater than i_disksize. (hint delalloc)
			 */
			ext4_mark_inode_dirty(handle, inode);
		}

		ret2 = generic_write_end(file, mapping, pos, len, copied,
							page, fsdata);
		copied = ret2;
		if (ret2 < 0)
			ret = ret2;
	}
	ret2 = ext4_journal_stop(handle);
	if (!ret)
		ret = ret2;

	return ret ? ret : copied;
}

static int ext4_writeback_write_end(struct file *file,
				    struct address_space *mapping,
				    loff_t pos, unsigned len, unsigned copied,
				    struct page *page, void *fsdata)
{
	handle_t *handle = ext4_journal_current_handle();
	struct inode *inode = mapping->host;
	int ret = 0, ret2;
	loff_t new_i_size;

	new_i_size = pos + copied;
	if (new_i_size > EXT4_I(inode)->i_disksize) {
		ext4_update_i_disksize(inode, new_i_size);
		/* We need to mark inode dirty even if
		 * new_i_size is less than inode->i_size
		 * but greater than i_disksize. (hint delalloc)
		 */
		ext4_mark_inode_dirty(handle, inode);
	}

	ret2 = generic_write_end(file, mapping, pos, len, copied,
							page, fsdata);
	copied = ret2;
	if (ret2 < 0)
		ret = ret2;

	ret2 = ext4_journal_stop(handle);
	if (!ret)
		ret = ret2;

	return ret ? ret : copied;
}

static int ext4_journalled_write_end(struct file *file,
				     struct address_space *mapping,
				     loff_t pos, unsigned len, unsigned copied,
				     struct page *page, void *fsdata)
{
	handle_t *handle = ext4_journal_current_handle();
	struct inode *inode = mapping->host;
	int ret = 0, ret2;
	int partial = 0;
	unsigned from, to;
	loff_t new_i_size;

	from = pos & (PAGE_CACHE_SIZE - 1);
	to = from + len;

	if (copied < len) {
		if (!PageUptodate(page))
			copied = 0;
		page_zero_new_buffers(page, from+copied, to);
	}

	ret = walk_page_buffers(handle, page_buffers(page), from,
				to, &partial, write_end_fn);
	if (!partial)
		SetPageUptodate(page);
	new_i_size = pos + copied;
	if (new_i_size > inode->i_size)
		i_size_write(inode, pos+copied);
	EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
	if (new_i_size > EXT4_I(inode)->i_disksize) {
		ext4_update_i_disksize(inode, new_i_size);
		ret2 = ext4_mark_inode_dirty(handle, inode);
		if (!ret)
			ret = ret2;
	}

	unlock_page(page);
	ret2 = ext4_journal_stop(handle);
	if (!ret)
		ret = ret2;
	page_cache_release(page);

	return ret ? ret : copied;
}

static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
{
	int retries = 0;
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	unsigned long md_needed, mdblocks, total = 0;

	/*
	 * recalculate the amount of metadata blocks to reserve
	 * in order to allocate nrblocks
	 * worst case is one extent per block
	 */
repeat:
	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
	total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks;
	mdblocks = ext4_calc_metadata_amount(inode, total);
	BUG_ON(mdblocks < EXT4_I(inode)->i_reserved_meta_blocks);

	md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks;
	total = md_needed + nrblocks;

	if (ext4_claim_free_blocks(sbi, total)) {
		spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
		if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
			yield();
			goto repeat;
		}
		return -ENOSPC;
	}
	EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
	EXT4_I(inode)->i_reserved_meta_blocks = mdblocks;

	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
	return 0;       /* success */
}

static void ext4_da_release_space(struct inode *inode, int to_free)
{
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	int total, mdb, mdb_free, release;

	if (!to_free)
		return;		/* Nothing to release, exit */

	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);

	if (!EXT4_I(inode)->i_reserved_data_blocks) {
		/*
		 * if there are no reserved blocks, but we try to free some
		 * then the counter is messed up somewhere.
		 * but since this function is called from invalidate
		 * page, it's harmless to return without any action
		 */
		printk(KERN_INFO "ext4 delalloc try to release %d reserved "
			    "blocks for inode %lu, but there is no reserved "
			    "data blocks\n", to_free, inode->i_ino);
		spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
		return;
	}

	/* recalculate the number of metablocks still need to be reserved */
	total = EXT4_I(inode)->i_reserved_data_blocks - to_free;
	mdb = ext4_calc_metadata_amount(inode, total);

	/* figure out how many metablocks to release */
	BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
	mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;

	release = to_free + mdb_free;

	/* update fs dirty blocks counter for truncate case */
	percpu_counter_sub(&sbi->s_dirtyblocks_counter, release);

	/* update per-inode reservations */
	BUG_ON(to_free > EXT4_I(inode)->i_reserved_data_blocks);
	EXT4_I(inode)->i_reserved_data_blocks -= to_free;

	BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
	EXT4_I(inode)->i_reserved_meta_blocks = mdb;
	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
}

static void ext4_da_page_release_reservation(struct page *page,
						unsigned long offset)
{
	int to_release = 0;
	struct buffer_head *head, *bh;
	unsigned int curr_off = 0;

	head = page_buffers(page);
	bh = head;
	do {
		unsigned int next_off = curr_off + bh->b_size;

		if ((offset <= curr_off) && (buffer_delay(bh))) {
			to_release++;
			clear_buffer_delay(bh);
		}
		curr_off = next_off;
	} while ((bh = bh->b_this_page) != head);
	ext4_da_release_space(page->mapping->host, to_release);
}

/*
 * Delayed allocation stuff
 */

struct mpage_da_data {
	struct inode *inode;
	struct buffer_head lbh;			/* extent of blocks */
	unsigned long first_page, next_page;	/* extent of pages */
	get_block_t *get_block;
	struct writeback_control *wbc;
	int io_done;
	long pages_written;
	int retval;
};

/*
 * mpage_da_submit_io - walks through extent of pages and tries to write
 * them with the writepage() callback
 *
 * @mpd->inode: inode
 * @mpd->first_page: first page of the extent
 * @mpd->next_page: page after the last page of the extent
 * @mpd->get_block: the filesystem's block mapper function
 *
 * By the time mpage_da_submit_io() is called we expect all blocks
 * to be allocated. This may be wrong if allocation failed.
 *
 * As pages are already locked by write_cache_pages(), we can't use it
 */
static int mpage_da_submit_io(struct mpage_da_data *mpd)
{
	struct address_space *mapping = mpd->inode->i_mapping;
	int ret = 0, err, nr_pages, i;
	unsigned long index, end;
	struct pagevec pvec;
	long pages_skipped;

	BUG_ON(mpd->next_page <= mpd->first_page);
	pagevec_init(&pvec, 0);
	index = mpd->first_page;
	end = mpd->next_page - 1;

	while (index <= end) {
		/*
		 * We can use PAGECACHE_TAG_DIRTY lookup here because
		 * even though we have cleared the dirty flag on the page
		 * We still keep the page in the radix tree with tag
		 * PAGECACHE_TAG_DIRTY. See clear_page_dirty_for_io.
		 * The PAGECACHE_TAG_DIRTY is cleared in set_page_writeback
		 * which is called via the below writepage callback.
		 */
		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
					PAGECACHE_TAG_DIRTY,
					min(end - index,
					  (pgoff_t)PAGEVEC_SIZE-1) + 1);
		if (nr_pages == 0)
			break;
		for (i = 0; i < nr_pages; i++) {
			struct page *page = pvec.pages[i];

			pages_skipped = mpd->wbc->pages_skipped;
			err = mapping->a_ops->writepage(page, mpd->wbc);
			if (!err && (pages_skipped == mpd->wbc->pages_skipped))
				/*
				 * have successfully written the page
				 * without skipping the same
				 */
				mpd->pages_written++;
			/*
			 * In error case, we have to continue because
			 * remaining pages are still locked
			 * XXX: unlock and re-dirty them?
			 */
			if (ret == 0)
				ret = err;
		}
		pagevec_release(&pvec);
	}
	return ret;
}

/*
 * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers
 *
 * @mpd->inode - inode to walk through
 * @exbh->b_blocknr - first block on a disk
 * @exbh->b_size - amount of space in bytes
 * @logical - first logical block to start assignment with
 *
 * the function goes through all passed space and puts actual disk
 * block numbers into buffer heads, dropping BH_Delay
 */
static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
				 struct buffer_head *exbh)
{
	struct inode *inode = mpd->inode;
	struct address_space *mapping = inode->i_mapping;
	int blocks = exbh->b_size >> inode->i_blkbits;
	sector_t pblock = exbh->b_blocknr, cur_logical;
	struct buffer_head *head, *bh;
	pgoff_t index, end;
	struct pagevec pvec;
	int nr_pages, i;

	index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
	end = (logical + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
	cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);

	pagevec_init(&pvec, 0);

	while (index <= end) {
		/* XXX: optimize tail */
		nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
		if (nr_pages == 0)
			break;
		for (i = 0; i < nr_pages; i++) {
			struct page *page = pvec.pages[i];

			index = page->index;
			if (index > end)
				break;
			index++;

			BUG_ON(!PageLocked(page));
			BUG_ON(PageWriteback(page));
			BUG_ON(!page_has_buffers(page));

			bh = page_buffers(page);
			head = bh;

			/* skip blocks out of the range */
			do {
				if (cur_logical >= logical)
					break;
				cur_logical++;
			} while ((bh = bh->b_this_page) != head);

			do {
				if (cur_logical >= logical + blocks)
					break;
				if (buffer_delay(bh)) {
					bh->b_blocknr = pblock;
					clear_buffer_delay(bh);
					bh->b_bdev = inode->i_sb->s_bdev;
				} else if (buffer_unwritten(bh)) {
					bh->b_blocknr = pblock;
					clear_buffer_unwritten(bh);
					set_buffer_mapped(bh);
					set_buffer_new(bh);
					bh->b_bdev = inode->i_sb->s_bdev;
				} else if (buffer_mapped(bh))
					BUG_ON(bh->b_blocknr != pblock);

				cur_logical++;
				pblock++;
			} while ((bh = bh->b_this_page) != head);
		}
		pagevec_release(&pvec);
	}
}


/*
 * __unmap_underlying_blocks - just a helper function to unmap
 * set of blocks described by @bh
 */
static inline void __unmap_underlying_blocks(struct inode *inode,
					     struct buffer_head *bh)
{
	struct block_device *bdev = inode->i_sb->s_bdev;
	int blocks, i;

	blocks = bh->b_size >> inode->i_blkbits;
	for (i = 0; i < blocks; i++)
		unmap_underlying_metadata(bdev, bh->b_blocknr + i);
}

static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
			sector_t logical, long blk_cnt)
{
	int nr_pages, i;
	pgoff_t index, end;
	struct pagevec pvec;
	struct inode *inode = mpd->inode;
	struct address_space *mapping = inode->i_mapping;

	index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
	end   = (logical + blk_cnt - 1) >>
				(PAGE_CACHE_SHIFT - inode->i_blkbits);
	while (index <= end) {
		nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
		if (nr_pages == 0)
			break;
		for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i]; 1812 index = page->index; 1813 if (index > end) 1814 break; 1815 index++; 1816 1817 BUG_ON(!PageLocked(page)); 1818 BUG_ON(PageWriteback(page)); 1819 block_invalidatepage(page, 0); 1820 ClearPageUptodate(page); 1821 unlock_page(page); 1822 } 1823 } 1824 return; 1825 } 1826 1827 static void ext4_print_free_blocks(struct inode *inode) 1828 { 1829 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1830 printk(KERN_EMERG "Total free blocks count %lld\n", 1831 ext4_count_free_blocks(inode->i_sb)); 1832 printk(KERN_EMERG "Free/Dirty block details\n"); 1833 printk(KERN_EMERG "free_blocks=%lld\n", 1834 percpu_counter_sum(&sbi->s_freeblocks_counter)); 1835 printk(KERN_EMERG "dirty_blocks=%lld\n", 1836 percpu_counter_sum(&sbi->s_dirtyblocks_counter)); 1837 printk(KERN_EMERG "Block reservation details\n"); 1838 printk(KERN_EMERG "i_reserved_data_blocks=%lu\n", 1839 EXT4_I(inode)->i_reserved_data_blocks); 1840 printk(KERN_EMERG "i_reserved_meta_blocks=%lu\n", 1841 EXT4_I(inode)->i_reserved_meta_blocks); 1842 return; 1843 } 1844 1845 /* 1846 * mpage_da_map_blocks - go through given space 1847 * 1848 * @mpd->lbh - bh describing space 1849 * @mpd->get_block - the filesystem's block mapper function 1850 * 1851 * The function skips space we know is already mapped to disk blocks. 1852 * 1853 */ 1854 static int mpage_da_map_blocks(struct mpage_da_data *mpd) 1855 { 1856 int err = 0; 1857 struct buffer_head new; 1858 struct buffer_head *lbh = &mpd->lbh; 1859 sector_t next; 1860 1861 /* 1862 * We consider only non-mapped and non-allocated blocks 1863 */ 1864 if (buffer_mapped(lbh) && !buffer_delay(lbh)) 1865 return 0; 1866 new.b_state = lbh->b_state; 1867 new.b_blocknr = 0; 1868 new.b_size = lbh->b_size; 1869 next = lbh->b_blocknr; 1870 /* 1871 * If we didn't accumulate anything 1872 * to write simply return 1873 */ 1874 if (!new.b_size) 1875 return 0; 1876 err = mpd->get_block(mpd->inode, next, &new, 1); 1877 if (err) { 1878 1879 /* If get block returns with error 1880 * we simply return. Later writepage 1881 * will redirty the page and writepages 1882 * will find the dirty page again 1883 */ 1884 if (err == -EAGAIN) 1885 return 0; 1886 1887 if (err == -ENOSPC && 1888 ext4_count_free_blocks(mpd->inode->i_sb)) { 1889 mpd->retval = err; 1890 return 0; 1891 } 1892 1893 /* 1894 * get block failure will cause us 1895 * to loop in writepages. Because 1896 * a_ops->writepage won't be able to 1897 * make progress. The page will be redirtied 1898 * by writepage and writepages will again 1899 * try to write the same. 1900 */ 1901 printk(KERN_EMERG "%s block allocation failed for inode %lu " 1902 "at logical offset %llu with max blocks " 1903 "%zd with error %d\n", 1904 __func__, mpd->inode->i_ino, 1905 (unsigned long long)next, 1906 lbh->b_size >> mpd->inode->i_blkbits, err); 1907 printk(KERN_EMERG "This should not happen.!! 
" 1908 "Data will be lost\n"); 1909 if (err == -ENOSPC) { 1910 ext4_print_free_blocks(mpd->inode); 1911 } 1912 /* invlaidate all the pages */ 1913 ext4_da_block_invalidatepages(mpd, next, 1914 lbh->b_size >> mpd->inode->i_blkbits); 1915 return err; 1916 } 1917 BUG_ON(new.b_size == 0); 1918 1919 if (buffer_new(&new)) 1920 __unmap_underlying_blocks(mpd->inode, &new); 1921 1922 /* 1923 * If blocks are delayed marked, we need to 1924 * put actual blocknr and drop delayed bit 1925 */ 1926 if (buffer_delay(lbh) || buffer_unwritten(lbh)) 1927 mpage_put_bnr_to_bhs(mpd, next, &new); 1928 1929 return 0; 1930 } 1931 1932 #define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \ 1933 (1 << BH_Delay) | (1 << BH_Unwritten)) 1934 1935 /* 1936 * mpage_add_bh_to_extent - try to add one more block to extent of blocks 1937 * 1938 * @mpd->lbh - extent of blocks 1939 * @logical - logical number of the block in the file 1940 * @bh - bh of the block (used to access block's state) 1941 * 1942 * the function is used to collect contig. blocks in same state 1943 */ 1944 static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, 1945 sector_t logical, struct buffer_head *bh) 1946 { 1947 sector_t next; 1948 size_t b_size = bh->b_size; 1949 struct buffer_head *lbh = &mpd->lbh; 1950 int nrblocks = lbh->b_size >> mpd->inode->i_blkbits; 1951 1952 /* check if thereserved journal credits might overflow */ 1953 if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) { 1954 if (nrblocks >= EXT4_MAX_TRANS_DATA) { 1955 /* 1956 * With non-extent format we are limited by the journal 1957 * credit available. Total credit needed to insert 1958 * nrblocks contiguous blocks is dependent on the 1959 * nrblocks. So limit nrblocks. 1960 */ 1961 goto flush_it; 1962 } else if ((nrblocks + (b_size >> mpd->inode->i_blkbits)) > 1963 EXT4_MAX_TRANS_DATA) { 1964 /* 1965 * Adding the new buffer_head would make it cross the 1966 * allowed limit for which we have journal credit 1967 * reserved. So limit the new bh->b_size 1968 */ 1969 b_size = (EXT4_MAX_TRANS_DATA - nrblocks) << 1970 mpd->inode->i_blkbits; 1971 /* we will do mpage_da_submit_io in the next loop */ 1972 } 1973 } 1974 /* 1975 * First block in the extent 1976 */ 1977 if (lbh->b_size == 0) { 1978 lbh->b_blocknr = logical; 1979 lbh->b_size = b_size; 1980 lbh->b_state = bh->b_state & BH_FLAGS; 1981 return; 1982 } 1983 1984 next = lbh->b_blocknr + nrblocks; 1985 /* 1986 * Can we merge the block to our big extent? 1987 */ 1988 if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) { 1989 lbh->b_size += b_size; 1990 return; 1991 } 1992 1993 flush_it: 1994 /* 1995 * We couldn't merge the block to our extent, so we 1996 * need to flush current extent and start new one 1997 */ 1998 if (mpage_da_map_blocks(mpd) == 0) 1999 mpage_da_submit_io(mpd); 2000 mpd->io_done = 1; 2001 return; 2002 } 2003 2004 /* 2005 * __mpage_da_writepage - finds extent of pages and blocks 2006 * 2007 * @page: page to consider 2008 * @wbc: not used, we just follow rules 2009 * @data: context 2010 * 2011 * The function finds extents of pages and scan them for all blocks. 2012 */ 2013 static int __mpage_da_writepage(struct page *page, 2014 struct writeback_control *wbc, void *data) 2015 { 2016 struct mpage_da_data *mpd = data; 2017 struct inode *inode = mpd->inode; 2018 struct buffer_head *bh, *head, fake; 2019 sector_t logical; 2020 2021 if (mpd->io_done) { 2022 /* 2023 * Rest of the page in the page_vec 2024 * redirty then and skip then. 
We will 2025 * try to to write them again after 2026 * starting a new transaction 2027 */ 2028 redirty_page_for_writepage(wbc, page); 2029 unlock_page(page); 2030 return MPAGE_DA_EXTENT_TAIL; 2031 } 2032 /* 2033 * Can we merge this page to current extent? 2034 */ 2035 if (mpd->next_page != page->index) { 2036 /* 2037 * Nope, we can't. So, we map non-allocated blocks 2038 * and start IO on them using writepage() 2039 */ 2040 if (mpd->next_page != mpd->first_page) { 2041 if (mpage_da_map_blocks(mpd) == 0) 2042 mpage_da_submit_io(mpd); 2043 /* 2044 * skip rest of the page in the page_vec 2045 */ 2046 mpd->io_done = 1; 2047 redirty_page_for_writepage(wbc, page); 2048 unlock_page(page); 2049 return MPAGE_DA_EXTENT_TAIL; 2050 } 2051 2052 /* 2053 * Start next extent of pages ... 2054 */ 2055 mpd->first_page = page->index; 2056 2057 /* 2058 * ... and blocks 2059 */ 2060 mpd->lbh.b_size = 0; 2061 mpd->lbh.b_state = 0; 2062 mpd->lbh.b_blocknr = 0; 2063 } 2064 2065 mpd->next_page = page->index + 1; 2066 logical = (sector_t) page->index << 2067 (PAGE_CACHE_SHIFT - inode->i_blkbits); 2068 2069 if (!page_has_buffers(page)) { 2070 /* 2071 * There is no attached buffer heads yet (mmap?) 2072 * we treat the page asfull of dirty blocks 2073 */ 2074 bh = &fake; 2075 bh->b_size = PAGE_CACHE_SIZE; 2076 bh->b_state = 0; 2077 set_buffer_dirty(bh); 2078 set_buffer_uptodate(bh); 2079 mpage_add_bh_to_extent(mpd, logical, bh); 2080 if (mpd->io_done) 2081 return MPAGE_DA_EXTENT_TAIL; 2082 } else { 2083 /* 2084 * Page with regular buffer heads, just add all dirty ones 2085 */ 2086 head = page_buffers(page); 2087 bh = head; 2088 do { 2089 BUG_ON(buffer_locked(bh)); 2090 if (buffer_dirty(bh) && 2091 (!buffer_mapped(bh) || buffer_delay(bh))) { 2092 mpage_add_bh_to_extent(mpd, logical, bh); 2093 if (mpd->io_done) 2094 return MPAGE_DA_EXTENT_TAIL; 2095 } 2096 logical++; 2097 } while ((bh = bh->b_this_page) != head); 2098 } 2099 2100 return 0; 2101 } 2102 2103 /* 2104 * mpage_da_writepages - walk the list of dirty pages of the given 2105 * address space, allocates non-allocated blocks, maps newly-allocated 2106 * blocks to existing bhs and issue IO them 2107 * 2108 * @mapping: address space structure to write 2109 * @wbc: subtract the number of written pages from *@wbc->nr_to_write 2110 * @get_block: the filesystem's block mapper function. 2111 * 2112 * This is a library function, which implements the writepages() 2113 * address_space_operation. 
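 *
 * (A rough sketch of how the caller is expected to drive this helper; the
 * real details live in ext4_da_writepages() further below, this is just
 * the shape:
 *
 *	mpd.get_block = ext4_da_get_block_write;
 *	handle = ext4_journal_start(inode, needed_blocks);
 *	ret = mpage_da_writepages(mapping, wbc, &mpd);
 *	ext4_journal_stop(handle);
 *	// MPAGE_DA_EXTENT_TAIL means one extent was mapped and submitted;
 *	// loop again for the remaining dirty pages.
 *
 * Each pass maps at most one extent and writes out the pages covering it.)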
2114 */ 2115 static int mpage_da_writepages(struct address_space *mapping, 2116 struct writeback_control *wbc, 2117 struct mpage_da_data *mpd) 2118 { 2119 int ret; 2120 2121 if (!mpd->get_block) 2122 return generic_writepages(mapping, wbc); 2123 2124 mpd->lbh.b_size = 0; 2125 mpd->lbh.b_state = 0; 2126 mpd->lbh.b_blocknr = 0; 2127 mpd->first_page = 0; 2128 mpd->next_page = 0; 2129 mpd->io_done = 0; 2130 mpd->pages_written = 0; 2131 mpd->retval = 0; 2132 2133 ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, mpd); 2134 /* 2135 * Handle last extent of pages 2136 */ 2137 if (!mpd->io_done && mpd->next_page != mpd->first_page) { 2138 if (mpage_da_map_blocks(mpd) == 0) 2139 mpage_da_submit_io(mpd); 2140 2141 mpd->io_done = 1; 2142 ret = MPAGE_DA_EXTENT_TAIL; 2143 } 2144 wbc->nr_to_write -= mpd->pages_written; 2145 return ret; 2146 } 2147 2148 /* 2149 * this is a special callback for ->write_begin() only 2150 * it's intention is to return mapped block or reserve space 2151 */ 2152 static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, 2153 struct buffer_head *bh_result, int create) 2154 { 2155 int ret = 0; 2156 2157 BUG_ON(create == 0); 2158 BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize); 2159 2160 /* 2161 * first, we need to know whether the block is allocated already 2162 * preallocated blocks are unmapped but should treated 2163 * the same as allocated blocks. 2164 */ 2165 ret = ext4_get_blocks_wrap(NULL, inode, iblock, 1, bh_result, 0, 0, 0); 2166 if ((ret == 0) && !buffer_delay(bh_result)) { 2167 /* the block isn't (pre)allocated yet, let's reserve space */ 2168 /* 2169 * XXX: __block_prepare_write() unmaps passed block, 2170 * is it OK? 2171 */ 2172 ret = ext4_da_reserve_space(inode, 1); 2173 if (ret) 2174 /* not enough space to reserve */ 2175 return ret; 2176 2177 map_bh(bh_result, inode->i_sb, 0); 2178 set_buffer_new(bh_result); 2179 set_buffer_delay(bh_result); 2180 } else if (ret > 0) { 2181 bh_result->b_size = (ret << inode->i_blkbits); 2182 ret = 0; 2183 } 2184 2185 return ret; 2186 } 2187 #define EXT4_DELALLOC_RSVED 1 2188 static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, 2189 struct buffer_head *bh_result, int create) 2190 { 2191 int ret; 2192 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; 2193 loff_t disksize = EXT4_I(inode)->i_disksize; 2194 handle_t *handle = NULL; 2195 2196 handle = ext4_journal_current_handle(); 2197 BUG_ON(!handle); 2198 ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks, 2199 bh_result, create, 0, EXT4_DELALLOC_RSVED); 2200 if (ret > 0) { 2201 2202 bh_result->b_size = (ret << inode->i_blkbits); 2203 2204 if (ext4_should_order_data(inode)) { 2205 int retval; 2206 retval = ext4_jbd2_file_inode(handle, inode); 2207 if (retval) 2208 /* 2209 * Failed to add inode for ordered 2210 * mode. 
Don't update file size 2211 */ 2212 return retval; 2213 } 2214 2215 /* 2216 * Update on-disk size along with block allocation 2217 * we don't use 'extend_disksize' as size may change 2218 * within already allocated block -bzzz 2219 */ 2220 disksize = ((loff_t) iblock + ret) << inode->i_blkbits; 2221 if (disksize > i_size_read(inode)) 2222 disksize = i_size_read(inode); 2223 if (disksize > EXT4_I(inode)->i_disksize) { 2224 ext4_update_i_disksize(inode, disksize); 2225 ret = ext4_mark_inode_dirty(handle, inode); 2226 return ret; 2227 } 2228 ret = 0; 2229 } 2230 return ret; 2231 } 2232 2233 static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh) 2234 { 2235 /* 2236 * unmapped buffer is possible for holes. 2237 * delay buffer is possible with delayed allocation 2238 */ 2239 return ((!buffer_mapped(bh) || buffer_delay(bh)) && buffer_dirty(bh)); 2240 } 2241 2242 static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock, 2243 struct buffer_head *bh_result, int create) 2244 { 2245 int ret = 0; 2246 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; 2247 2248 /* 2249 * we don't want to do block allocation in writepage 2250 * so call get_block_wrap with create = 0 2251 */ 2252 ret = ext4_get_blocks_wrap(NULL, inode, iblock, max_blocks, 2253 bh_result, 0, 0, 0); 2254 if (ret > 0) { 2255 bh_result->b_size = (ret << inode->i_blkbits); 2256 ret = 0; 2257 } 2258 return ret; 2259 } 2260 2261 /* 2262 * get called vi ext4_da_writepages after taking page lock (have journal handle) 2263 * get called via journal_submit_inode_data_buffers (no journal handle) 2264 * get called via shrink_page_list via pdflush (no journal handle) 2265 * or grab_page_cache when doing write_begin (have journal handle) 2266 */ 2267 static int ext4_da_writepage(struct page *page, 2268 struct writeback_control *wbc) 2269 { 2270 int ret = 0; 2271 loff_t size; 2272 unsigned long len; 2273 struct buffer_head *page_bufs; 2274 struct inode *inode = page->mapping->host; 2275 2276 size = i_size_read(inode); 2277 if (page->index == size >> PAGE_CACHE_SHIFT) 2278 len = size & ~PAGE_CACHE_MASK; 2279 else 2280 len = PAGE_CACHE_SIZE; 2281 2282 if (page_has_buffers(page)) { 2283 page_bufs = page_buffers(page); 2284 if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, 2285 ext4_bh_unmapped_or_delay)) { 2286 /* 2287 * We don't want to do block allocation 2288 * So redirty the page and return 2289 * We may reach here when we do a journal commit 2290 * via journal_submit_inode_data_buffers. 2291 * If we don't have mapping block we just ignore 2292 * them. We can also reach here via shrink_page_list 2293 */ 2294 redirty_page_for_writepage(wbc, page); 2295 unlock_page(page); 2296 return 0; 2297 } 2298 } else { 2299 /* 2300 * The test for page_has_buffers() is subtle: 2301 * We know the page is dirty but it lost buffers. That means 2302 * that at some moment in time after write_begin()/write_end() 2303 * has been called all buffers have been clean and thus they 2304 * must have been written at least once. So they are all 2305 * mapped and we can happily proceed with mapping them 2306 * and writing the page. 2307 * 2308 * Try to initialize the buffer_heads and check whether 2309 * all are mapped and non delay. We don't want to 2310 * do block allocation here. 
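 *
 * (Worked example for the "len" computed at the top of this function,
 * assuming 4K pages: with i_size = 10000 the last page has index
 * 10000 >> PAGE_CACHE_SHIFT = 2 and len = 10000 & ~PAGE_CACHE_MASK = 1808,
 * so only the buffers covering those first 1808 bytes are checked for
 * being unmapped or delayed; any other page uses the full PAGE_CACHE_SIZE.)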
2311 */ 2312 ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, 2313 ext4_normal_get_block_write); 2314 if (!ret) { 2315 page_bufs = page_buffers(page); 2316 /* check whether all are mapped and non delay */ 2317 if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, 2318 ext4_bh_unmapped_or_delay)) { 2319 redirty_page_for_writepage(wbc, page); 2320 unlock_page(page); 2321 return 0; 2322 } 2323 } else { 2324 /* 2325 * We can't do block allocation here 2326 * so just redity the page and unlock 2327 * and return 2328 */ 2329 redirty_page_for_writepage(wbc, page); 2330 unlock_page(page); 2331 return 0; 2332 } 2333 /* now mark the buffer_heads as dirty and uptodate */ 2334 block_commit_write(page, 0, PAGE_CACHE_SIZE); 2335 } 2336 2337 if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) 2338 ret = nobh_writepage(page, ext4_normal_get_block_write, wbc); 2339 else 2340 ret = block_write_full_page(page, 2341 ext4_normal_get_block_write, 2342 wbc); 2343 2344 return ret; 2345 } 2346 2347 /* 2348 * This is called via ext4_da_writepages() to 2349 * calulate the total number of credits to reserve to fit 2350 * a single extent allocation into a single transaction, 2351 * ext4_da_writpeages() will loop calling this before 2352 * the block allocation. 2353 */ 2354 2355 static int ext4_da_writepages_trans_blocks(struct inode *inode) 2356 { 2357 int max_blocks = EXT4_I(inode)->i_reserved_data_blocks; 2358 2359 /* 2360 * With non-extent format the journal credit needed to 2361 * insert nrblocks contiguous block is dependent on 2362 * number of contiguous block. So we will limit 2363 * number of contiguous block to a sane value 2364 */ 2365 if (!(inode->i_flags & EXT4_EXTENTS_FL) && 2366 (max_blocks > EXT4_MAX_TRANS_DATA)) 2367 max_blocks = EXT4_MAX_TRANS_DATA; 2368 2369 return ext4_chunk_trans_blocks(inode, max_blocks); 2370 } 2371 2372 static int ext4_da_writepages(struct address_space *mapping, 2373 struct writeback_control *wbc) 2374 { 2375 pgoff_t index; 2376 int range_whole = 0; 2377 handle_t *handle = NULL; 2378 struct mpage_da_data mpd; 2379 struct inode *inode = mapping->host; 2380 int no_nrwrite_index_update; 2381 long pages_written = 0, pages_skipped; 2382 int needed_blocks, ret = 0, nr_to_writebump = 0; 2383 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); 2384 2385 /* 2386 * No pages to write? This is mainly a kludge to avoid starting 2387 * a transaction for special inodes like journal inode on last iput() 2388 * because that could violate lock ordering on umount 2389 */ 2390 if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) 2391 return 0; 2392 /* 2393 * Make sure nr_to_write is >= sbi->s_mb_stream_request 2394 * This make sure small files blocks are allocated in 2395 * single attempt. This ensure that small files 2396 * get less fragmented. 
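 *
 * (For example, with s_mb_stream_request = 16 and a caller that asked for
 * nr_to_write = 4, we run the loop with nr_to_write = 16 and remember
 * nr_to_writebump = 12, which is subtracted again at out_writepages so the
 * caller still sees its own accounting.)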
2397 */ 2398 if (wbc->nr_to_write < sbi->s_mb_stream_request) { 2399 nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write; 2400 wbc->nr_to_write = sbi->s_mb_stream_request; 2401 } 2402 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) 2403 range_whole = 1; 2404 2405 if (wbc->range_cyclic) 2406 index = mapping->writeback_index; 2407 else 2408 index = wbc->range_start >> PAGE_CACHE_SHIFT; 2409 2410 mpd.wbc = wbc; 2411 mpd.inode = mapping->host; 2412 2413 /* 2414 * we don't want write_cache_pages to update 2415 * nr_to_write and writeback_index 2416 */ 2417 no_nrwrite_index_update = wbc->no_nrwrite_index_update; 2418 wbc->no_nrwrite_index_update = 1; 2419 pages_skipped = wbc->pages_skipped; 2420 2421 while (!ret && wbc->nr_to_write > 0) { 2422 2423 /* 2424 * we insert one extent at a time. So we need 2425 * credit needed for single extent allocation. 2426 * journalled mode is currently not supported 2427 * by delalloc 2428 */ 2429 BUG_ON(ext4_should_journal_data(inode)); 2430 needed_blocks = ext4_da_writepages_trans_blocks(inode); 2431 2432 /* start a new transaction*/ 2433 handle = ext4_journal_start(inode, needed_blocks); 2434 if (IS_ERR(handle)) { 2435 ret = PTR_ERR(handle); 2436 printk(KERN_EMERG "%s: jbd2_start: " 2437 "%ld pages, ino %lu; err %d\n", __func__, 2438 wbc->nr_to_write, inode->i_ino, ret); 2439 dump_stack(); 2440 goto out_writepages; 2441 } 2442 mpd.get_block = ext4_da_get_block_write; 2443 ret = mpage_da_writepages(mapping, wbc, &mpd); 2444 2445 ext4_journal_stop(handle); 2446 2447 if (mpd.retval == -ENOSPC) { 2448 /* commit the transaction which would 2449 * free blocks released in the transaction 2450 * and try again 2451 */ 2452 jbd2_journal_force_commit_nested(sbi->s_journal); 2453 wbc->pages_skipped = pages_skipped; 2454 ret = 0; 2455 } else if (ret == MPAGE_DA_EXTENT_TAIL) { 2456 /* 2457 * got one extent now try with 2458 * rest of the pages 2459 */ 2460 pages_written += mpd.pages_written; 2461 wbc->pages_skipped = pages_skipped; 2462 ret = 0; 2463 } else if (wbc->nr_to_write) 2464 /* 2465 * There is no more writeout needed 2466 * or we requested for a noblocking writeout 2467 * and we found the device congested 2468 */ 2469 break; 2470 } 2471 if (pages_skipped != wbc->pages_skipped) 2472 printk(KERN_EMERG "This should not happen leaving %s " 2473 "with nr_to_write = %ld ret = %d\n", 2474 __func__, wbc->nr_to_write, ret); 2475 2476 /* Update index */ 2477 index += pages_written; 2478 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) 2479 /* 2480 * set the writeback_index so that range_cyclic 2481 * mode will write it back later 2482 */ 2483 mapping->writeback_index = index; 2484 2485 out_writepages: 2486 if (!no_nrwrite_index_update) 2487 wbc->no_nrwrite_index_update = 0; 2488 wbc->nr_to_write -= nr_to_writebump; 2489 return ret; 2490 } 2491 2492 #define FALL_BACK_TO_NONDELALLOC 1 2493 static int ext4_nonda_switch(struct super_block *sb) 2494 { 2495 s64 free_blocks, dirty_blocks; 2496 struct ext4_sb_info *sbi = EXT4_SB(sb); 2497 2498 /* 2499 * switch to non delalloc mode if we are running low 2500 * on free block. The free block accounting via percpu 2501 * counters can get slightly wrong with FBC_BATCH getting 2502 * accumulated on each CPU without updating global counters 2503 * Delalloc need an accurate free block accounting. So switch 2504 * to non delalloc when we are near to error range. 
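 *
 * (The check below reads: switch when free < 1.5 * dirty, or when free
 * drops below dirty + EXT4_FREEBLOCKS_WATERMARK.  E.g. with 100 free and
 * 80 dirty blocks, 2 * 100 < 3 * 80, so we fall back to non-delalloc.)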
2505 */ 2506 free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); 2507 dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyblocks_counter);
2508 if (2 * free_blocks < 3 * dirty_blocks || 2509 free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) { 2510 /* 2511 * free block count is less than 150% of dirty blocks 2512 * or the free block count is less than the watermark 2513 */ 2514 return 1; 2515 } 2516 return 0; 2517 } 2518
2519 static int ext4_da_write_begin(struct file *file, struct address_space *mapping, 2520 loff_t pos, unsigned len, unsigned flags, 2521 struct page **pagep, void **fsdata) 2522 {
2523 int ret, retries = 0; 2524 struct page *page; 2525 pgoff_t index; 2526 unsigned from, to; 2527 struct inode *inode = mapping->host; 2528 handle_t *handle; 2529
2530 index = pos >> PAGE_CACHE_SHIFT; 2531 from = pos & (PAGE_CACHE_SIZE - 1); 2532 to = from + len; 2533
2534 if (ext4_nonda_switch(inode->i_sb)) { 2535 *fsdata = (void *)FALL_BACK_TO_NONDELALLOC; 2536 return ext4_write_begin(file, mapping, pos, 2537 len, flags, pagep, fsdata); 2538 } 2539 *fsdata = (void *)0; 2540 retry:
2541 /* 2542 * With delayed allocation, we don't log the i_disksize update 2543 * if there is delayed block allocation. But we still need 2544 * to journal the i_disksize update if we write to the end 2545 * of a file which has an already mapped buffer. 2546 */
2547 handle = ext4_journal_start(inode, 1); 2548 if (IS_ERR(handle)) { 2549 ret = PTR_ERR(handle); 2550 goto out; 2551 } 2552
2553 page = grab_cache_page_write_begin(mapping, index, flags); 2554 if (!page) { 2555 ext4_journal_stop(handle); 2556 ret = -ENOMEM; 2557 goto out; 2558 } 2559 *pagep = page; 2560
2561 ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 2562 ext4_da_get_block_prep); 2563 if (ret < 0) { 2564 unlock_page(page); 2565 ext4_journal_stop(handle); 2566 page_cache_release(page);
2567 /* 2568 * block_write_begin may have instantiated a few blocks 2569 * outside i_size. Trim these off again. Don't need 2570 * i_size_read because we hold i_mutex.
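 *
 * (E.g. a write at pos = 5000 into a file with i_size = 4096 that fails
 * here may have created page-cache state beyond EOF; vmtruncate() back to
 * the old i_size below throws that away.  If the failure was -ENOSPC and
 * ext4_should_retry_alloc() thinks a journal commit may free space, the
 * whole sequence is retried from the "retry:" label.)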
2571 */ 2572 if (pos + len > inode->i_size) 2573 vmtruncate(inode, inode->i_size); 2574 } 2575 2576 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 2577 goto retry; 2578 out: 2579 return ret; 2580 } 2581 2582 /* 2583 * Check if we should update i_disksize 2584 * when write to the end of file but not require block allocation 2585 */ 2586 static int ext4_da_should_update_i_disksize(struct page *page, 2587 unsigned long offset) 2588 { 2589 struct buffer_head *bh; 2590 struct inode *inode = page->mapping->host; 2591 unsigned int idx; 2592 int i; 2593 2594 bh = page_buffers(page); 2595 idx = offset >> inode->i_blkbits; 2596 2597 for (i = 0; i < idx; i++) 2598 bh = bh->b_this_page; 2599 2600 if (!buffer_mapped(bh) || (buffer_delay(bh))) 2601 return 0; 2602 return 1; 2603 } 2604 2605 static int ext4_da_write_end(struct file *file, 2606 struct address_space *mapping, 2607 loff_t pos, unsigned len, unsigned copied, 2608 struct page *page, void *fsdata) 2609 { 2610 struct inode *inode = mapping->host; 2611 int ret = 0, ret2; 2612 handle_t *handle = ext4_journal_current_handle(); 2613 loff_t new_i_size; 2614 unsigned long start, end; 2615 int write_mode = (int)(unsigned long)fsdata; 2616 2617 if (write_mode == FALL_BACK_TO_NONDELALLOC) { 2618 if (ext4_should_order_data(inode)) { 2619 return ext4_ordered_write_end(file, mapping, pos, 2620 len, copied, page, fsdata); 2621 } else if (ext4_should_writeback_data(inode)) { 2622 return ext4_writeback_write_end(file, mapping, pos, 2623 len, copied, page, fsdata); 2624 } else { 2625 BUG(); 2626 } 2627 } 2628 2629 start = pos & (PAGE_CACHE_SIZE - 1); 2630 end = start + copied - 1; 2631 2632 /* 2633 * generic_write_end() will run mark_inode_dirty() if i_size 2634 * changes. So let's piggyback the i_disksize mark_inode_dirty 2635 * into that. 2636 */ 2637 2638 new_i_size = pos + copied; 2639 if (new_i_size > EXT4_I(inode)->i_disksize) { 2640 if (ext4_da_should_update_i_disksize(page, end)) { 2641 down_write(&EXT4_I(inode)->i_data_sem); 2642 if (new_i_size > EXT4_I(inode)->i_disksize) { 2643 /* 2644 * Updating i_disksize when extending file 2645 * without needing block allocation 2646 */ 2647 if (ext4_should_order_data(inode)) 2648 ret = ext4_jbd2_file_inode(handle, 2649 inode); 2650 2651 EXT4_I(inode)->i_disksize = new_i_size; 2652 } 2653 up_write(&EXT4_I(inode)->i_data_sem); 2654 /* We need to mark inode dirty even if 2655 * new_i_size is less that inode->i_size 2656 * bu greater than i_disksize.(hint delalloc) 2657 */ 2658 ext4_mark_inode_dirty(handle, inode); 2659 } 2660 } 2661 ret2 = generic_write_end(file, mapping, pos, len, copied, 2662 page, fsdata); 2663 copied = ret2; 2664 if (ret2 < 0) 2665 ret = ret2; 2666 ret2 = ext4_journal_stop(handle); 2667 if (!ret) 2668 ret = ret2; 2669 2670 return ret ? ret : copied; 2671 } 2672 2673 static void ext4_da_invalidatepage(struct page *page, unsigned long offset) 2674 { 2675 /* 2676 * Drop reserved blocks 2677 */ 2678 BUG_ON(!PageLocked(page)); 2679 if (!page_has_buffers(page)) 2680 goto out; 2681 2682 ext4_da_page_release_reservation(page, offset); 2683 2684 out: 2685 ext4_invalidatepage(page, offset); 2686 2687 return; 2688 } 2689 2690 2691 /* 2692 * bmap() is special. It gets used by applications such as lilo and by 2693 * the swapper to find the on-disk block of a specific piece of data. 2694 * 2695 * Naturally, this is dangerous if the block concerned is still in the 2696 * journal. 
If somebody makes a swapfile on an ext4 data-journaling 2697 * filesystem and enables swap, then they may get a nasty shock when the 2698 * data getting swapped to that swapfile suddenly gets overwritten by 2699 * the original zero's written out previously to the journal and 2700 * awaiting writeback in the kernel's buffer cache. 2701 * 2702 * So, if we see any bmap calls here on a modified, data-journaled file, 2703 * take extra steps to flush any blocks which might be in the cache. 2704 */ 2705 static sector_t ext4_bmap(struct address_space *mapping, sector_t block) 2706 { 2707 struct inode *inode = mapping->host; 2708 journal_t *journal; 2709 int err; 2710 2711 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && 2712 test_opt(inode->i_sb, DELALLOC)) { 2713 /* 2714 * With delalloc we want to sync the file 2715 * so that we can make sure we allocate 2716 * blocks for file 2717 */ 2718 filemap_write_and_wait(mapping); 2719 } 2720 2721 if (EXT4_I(inode)->i_state & EXT4_STATE_JDATA) { 2722 /* 2723 * This is a REALLY heavyweight approach, but the use of 2724 * bmap on dirty files is expected to be extremely rare: 2725 * only if we run lilo or swapon on a freshly made file 2726 * do we expect this to happen. 2727 * 2728 * (bmap requires CAP_SYS_RAWIO so this does not 2729 * represent an unprivileged user DOS attack --- we'd be 2730 * in trouble if mortal users could trigger this path at 2731 * will.) 2732 * 2733 * NB. EXT4_STATE_JDATA is not set on files other than 2734 * regular files. If somebody wants to bmap a directory 2735 * or symlink and gets confused because the buffer 2736 * hasn't yet been flushed to disk, they deserve 2737 * everything they get. 2738 */ 2739 2740 EXT4_I(inode)->i_state &= ~EXT4_STATE_JDATA; 2741 journal = EXT4_JOURNAL(inode); 2742 jbd2_journal_lock_updates(journal); 2743 err = jbd2_journal_flush(journal); 2744 jbd2_journal_unlock_updates(journal); 2745 2746 if (err) 2747 return 0; 2748 } 2749 2750 return generic_block_bmap(mapping, block, ext4_get_block); 2751 } 2752 2753 static int bget_one(handle_t *handle, struct buffer_head *bh) 2754 { 2755 get_bh(bh); 2756 return 0; 2757 } 2758 2759 static int bput_one(handle_t *handle, struct buffer_head *bh) 2760 { 2761 put_bh(bh); 2762 return 0; 2763 } 2764 2765 /* 2766 * Note that we don't need to start a transaction unless we're journaling data 2767 * because we should have holes filled from ext4_page_mkwrite(). We even don't 2768 * need to file the inode to the transaction's list in ordered mode because if 2769 * we are writing back data added by write(), the inode is already there and if 2770 * we are writing back data modified via mmap(), noone guarantees in which 2771 * transaction the data will hit the disk. In case we are journaling data, we 2772 * cannot start transaction directly because transaction start ranks above page 2773 * lock so we have to do some magic. 2774 * 2775 * In all journaling modes block_write_full_page() will start the I/O. 2776 * 2777 * Problem: 2778 * 2779 * ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() -> 2780 * ext4_writepage() 2781 * 2782 * Similar for: 2783 * 2784 * ext4_file_write() -> generic_file_write() -> __alloc_pages() -> ... 2785 * 2786 * Same applies to ext4_get_block(). We will deadlock on various things like 2787 * lock_journal and i_data_sem 2788 * 2789 * Setting PF_MEMALLOC here doesn't work - too many internal memory 2790 * allocations fail. 2791 * 2792 * 16May01: If we're reentered then journal_current_handle() will be 2793 * non-zero. 
We simply *return*. 2794 * 2795 * 1 July 2001: @@@ FIXME: 2796 * In journalled data mode, a data buffer may be metadata against the 2797 * current transaction. But the same file is part of a shared mapping 2798 * and someone does a writepage() on it. 2799 *
2800 * We will move the buffer onto the async_data list, but *after* it has 2801 * been dirtied. So there's a small window where we have dirty data on 2802 * BJ_Metadata. 2803 *
2804 * Note that this only applies to the last partial page in the file. The 2805 * bit which block_write_full_page() uses prepare/commit for. (That's 2806 * broken code anyway: it's wrong for msync()). 2807 *
2808 * It's a rare case: affects the final partial page, for journalled data 2809 * where the file is subject to both write() and writepage() in the same 2810 * transaction. To fix it we'll need a custom block_write_full_page(). 2811 * We'll probably need that anyway for journalling writepage() output. 2812 *
2813 * We don't honour synchronous mounts for writepage(). That would be 2814 * disastrous. Any write() or metadata operation will sync the fs for 2815 * us. 2816 * 2817 */
2818 static int __ext4_normal_writepage(struct page *page, 2819 struct writeback_control *wbc) 2820 { 2821 struct inode *inode = page->mapping->host; 2822
2823 if (test_opt(inode->i_sb, NOBH)) 2824 return nobh_writepage(page, 2825 ext4_normal_get_block_write, wbc); 2826 else 2827 return block_write_full_page(page, 2828 ext4_normal_get_block_write, 2829 wbc); 2830 } 2831
2832 static int ext4_normal_writepage(struct page *page, 2833 struct writeback_control *wbc) 2834 { 2835 struct inode *inode = page->mapping->host; 2836 loff_t size = i_size_read(inode); 2837 loff_t len; 2838
2839 J_ASSERT(PageLocked(page)); 2840 if (page->index == size >> PAGE_CACHE_SHIFT) 2841 len = size & ~PAGE_CACHE_MASK; 2842 else 2843 len = PAGE_CACHE_SIZE; 2844
2845 if (page_has_buffers(page)) { 2846 /* if the page has buffers they should all be mapped 2847 * and allocated. If there are no buffers attached 2848 * to the page we know the page is dirty but it lost 2849 * buffers. That means that at some moment in time 2850 * after write_begin() / write_end() has been called 2851 * all buffers have been clean and thus they must have been 2852 * written at least once. So they are all mapped and we can 2853 * happily proceed with mapping them and writing the page.
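 *
 * (walk_page_buffers() in the BUG_ON below applies
 * ext4_bh_unmapped_or_delay() to every buffer overlapping bytes [0, len)
 * of the page and returns non-zero if any of them does, so the assertion
 * is that no dirty buffer in the written range is still unmapped or
 * delayed at this point.)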
2854 */ 2855 BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, 2856 ext4_bh_unmapped_or_delay)); 2857 } 2858 2859 if (!ext4_journal_current_handle()) 2860 return __ext4_normal_writepage(page, wbc); 2861 2862 redirty_page_for_writepage(wbc, page); 2863 unlock_page(page); 2864 return 0; 2865 } 2866 2867 static int __ext4_journalled_writepage(struct page *page, 2868 struct writeback_control *wbc) 2869 { 2870 struct address_space *mapping = page->mapping; 2871 struct inode *inode = mapping->host; 2872 struct buffer_head *page_bufs; 2873 handle_t *handle = NULL; 2874 int ret = 0; 2875 int err; 2876 2877 ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, 2878 ext4_normal_get_block_write); 2879 if (ret != 0) 2880 goto out_unlock; 2881 2882 page_bufs = page_buffers(page); 2883 walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL, 2884 bget_one); 2885 /* As soon as we unlock the page, it can go away, but we have 2886 * references to buffers so we are safe */ 2887 unlock_page(page); 2888 2889 handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); 2890 if (IS_ERR(handle)) { 2891 ret = PTR_ERR(handle); 2892 goto out; 2893 } 2894 2895 ret = walk_page_buffers(handle, page_bufs, 0, 2896 PAGE_CACHE_SIZE, NULL, do_journal_get_write_access); 2897 2898 err = walk_page_buffers(handle, page_bufs, 0, 2899 PAGE_CACHE_SIZE, NULL, write_end_fn); 2900 if (ret == 0) 2901 ret = err; 2902 err = ext4_journal_stop(handle); 2903 if (!ret) 2904 ret = err; 2905 2906 walk_page_buffers(handle, page_bufs, 0, 2907 PAGE_CACHE_SIZE, NULL, bput_one); 2908 EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; 2909 goto out; 2910 2911 out_unlock: 2912 unlock_page(page); 2913 out: 2914 return ret; 2915 } 2916 2917 static int ext4_journalled_writepage(struct page *page, 2918 struct writeback_control *wbc) 2919 { 2920 struct inode *inode = page->mapping->host; 2921 loff_t size = i_size_read(inode); 2922 loff_t len; 2923 2924 J_ASSERT(PageLocked(page)); 2925 if (page->index == size >> PAGE_CACHE_SHIFT) 2926 len = size & ~PAGE_CACHE_MASK; 2927 else 2928 len = PAGE_CACHE_SIZE; 2929 2930 if (page_has_buffers(page)) { 2931 /* if page has buffers it should all be mapped 2932 * and allocated. If there are not buffers attached 2933 * to the page we know the page is dirty but it lost 2934 * buffers. That means that at some moment in time 2935 * after write_begin() / write_end() has been called 2936 * all buffers have been clean and thus they must have been 2937 * written at least once. So they are all mapped and we can 2938 * happily proceed with mapping them and writing the page. 2939 */ 2940 BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, 2941 ext4_bh_unmapped_or_delay)); 2942 } 2943 2944 if (ext4_journal_current_handle()) 2945 goto no_write; 2946 2947 if (PageChecked(page)) { 2948 /* 2949 * It's mmapped pagecache. Add buffers and journal it. There 2950 * doesn't seem much point in redirtying the page here. 2951 */ 2952 ClearPageChecked(page); 2953 return __ext4_journalled_writepage(page, wbc); 2954 } else { 2955 /* 2956 * It may be a page full of checkpoint-mode buffers. We don't 2957 * really know unless we go poke around in the buffer_heads. 2958 * But block_write_full_page will do the right thing. 
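 *
 * (PageChecked is set by ext4_journalled_set_page_dirty() further down
 * when the whole page is dirtied without going through
 * write_begin/write_end, e.g. via mmap.  That is why a Checked page takes
 * the __ext4_journalled_writepage() path above: its buffers must first be
 * (re)attached and then journalled one by one.)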
2959 */ 2960 return block_write_full_page(page, 2961 ext4_normal_get_block_write, 2962 wbc); 2963 } 2964 no_write: 2965 redirty_page_for_writepage(wbc, page); 2966 unlock_page(page); 2967 return 0; 2968 } 2969 2970 static int ext4_readpage(struct file *file, struct page *page) 2971 { 2972 return mpage_readpage(page, ext4_get_block); 2973 } 2974 2975 static int 2976 ext4_readpages(struct file *file, struct address_space *mapping, 2977 struct list_head *pages, unsigned nr_pages) 2978 { 2979 return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); 2980 } 2981 2982 static void ext4_invalidatepage(struct page *page, unsigned long offset) 2983 { 2984 journal_t *journal = EXT4_JOURNAL(page->mapping->host); 2985 2986 /* 2987 * If it's a full truncate we just forget about the pending dirtying 2988 */ 2989 if (offset == 0) 2990 ClearPageChecked(page); 2991 2992 jbd2_journal_invalidatepage(journal, page, offset); 2993 } 2994 2995 static int ext4_releasepage(struct page *page, gfp_t wait) 2996 { 2997 journal_t *journal = EXT4_JOURNAL(page->mapping->host); 2998 2999 WARN_ON(PageChecked(page)); 3000 if (!page_has_buffers(page)) 3001 return 0; 3002 return jbd2_journal_try_to_free_buffers(journal, page, wait); 3003 } 3004 3005 /* 3006 * If the O_DIRECT write will extend the file then add this inode to the 3007 * orphan list. So recovery will truncate it back to the original size 3008 * if the machine crashes during the write. 3009 * 3010 * If the O_DIRECT write is intantiating holes inside i_size and the machine 3011 * crashes then stale disk data _may_ be exposed inside the file. But current 3012 * VFS code falls back into buffered path in that case so we are safe. 3013 */ 3014 static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, 3015 const struct iovec *iov, loff_t offset, 3016 unsigned long nr_segs) 3017 { 3018 struct file *file = iocb->ki_filp; 3019 struct inode *inode = file->f_mapping->host; 3020 struct ext4_inode_info *ei = EXT4_I(inode); 3021 handle_t *handle; 3022 ssize_t ret; 3023 int orphan = 0; 3024 size_t count = iov_length(iov, nr_segs); 3025 3026 if (rw == WRITE) { 3027 loff_t final_size = offset + count; 3028 3029 if (final_size > inode->i_size) { 3030 /* Credits for sb + inode write */ 3031 handle = ext4_journal_start(inode, 2); 3032 if (IS_ERR(handle)) { 3033 ret = PTR_ERR(handle); 3034 goto out; 3035 } 3036 ret = ext4_orphan_add(handle, inode); 3037 if (ret) { 3038 ext4_journal_stop(handle); 3039 goto out; 3040 } 3041 orphan = 1; 3042 ei->i_disksize = inode->i_size; 3043 ext4_journal_stop(handle); 3044 } 3045 } 3046 3047 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 3048 offset, nr_segs, 3049 ext4_get_block, NULL); 3050 3051 if (orphan) { 3052 int err; 3053 3054 /* Credits for sb + inode write */ 3055 handle = ext4_journal_start(inode, 2); 3056 if (IS_ERR(handle)) { 3057 /* This is really bad luck. We've written the data 3058 * but cannot extend i_size. Bail out and pretend 3059 * the write failed... */ 3060 ret = PTR_ERR(handle); 3061 goto out; 3062 } 3063 if (inode->i_nlink) 3064 ext4_orphan_del(handle, inode); 3065 if (ret > 0) { 3066 loff_t end = offset + ret; 3067 if (end > inode->i_size) { 3068 ei->i_disksize = end; 3069 i_size_write(inode, end); 3070 /* 3071 * We're going to return a positive `ret' 3072 * here due to non-zero-length I/O, so there's 3073 * no way of reporting error returns from 3074 * ext4_mark_inode_dirty() to userspace. So 3075 * ignore it. 
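 *
 * (The orphan entry added before the write is what makes this safe to
 * ignore: if we crash after the blocks were written but before the new
 * i_size reaches the disk, recovery truncates the inode back, so at worst
 * the extending write is lost rather than stale blocks being exposed.)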
3076 */ 3077 ext4_mark_inode_dirty(handle, inode); 3078 } 3079 } 3080 err = ext4_journal_stop(handle); 3081 if (ret == 0) 3082 ret = err; 3083 } 3084 out: 3085 return ret; 3086 } 3087 3088 /* 3089 * Pages can be marked dirty completely asynchronously from ext4's journalling 3090 * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do 3091 * much here because ->set_page_dirty is called under VFS locks. The page is 3092 * not necessarily locked. 3093 * 3094 * We cannot just dirty the page and leave attached buffers clean, because the 3095 * buffers' dirty state is "definitive". We cannot just set the buffers dirty 3096 * or jbddirty because all the journalling code will explode. 3097 * 3098 * So what we do is to mark the page "pending dirty" and next time writepage 3099 * is called, propagate that into the buffers appropriately. 3100 */ 3101 static int ext4_journalled_set_page_dirty(struct page *page) 3102 { 3103 SetPageChecked(page); 3104 return __set_page_dirty_nobuffers(page); 3105 } 3106 3107 static const struct address_space_operations ext4_ordered_aops = { 3108 .readpage = ext4_readpage, 3109 .readpages = ext4_readpages, 3110 .writepage = ext4_normal_writepage, 3111 .sync_page = block_sync_page, 3112 .write_begin = ext4_write_begin, 3113 .write_end = ext4_ordered_write_end, 3114 .bmap = ext4_bmap, 3115 .invalidatepage = ext4_invalidatepage, 3116 .releasepage = ext4_releasepage, 3117 .direct_IO = ext4_direct_IO, 3118 .migratepage = buffer_migrate_page, 3119 .is_partially_uptodate = block_is_partially_uptodate, 3120 }; 3121 3122 static const struct address_space_operations ext4_writeback_aops = { 3123 .readpage = ext4_readpage, 3124 .readpages = ext4_readpages, 3125 .writepage = ext4_normal_writepage, 3126 .sync_page = block_sync_page, 3127 .write_begin = ext4_write_begin, 3128 .write_end = ext4_writeback_write_end, 3129 .bmap = ext4_bmap, 3130 .invalidatepage = ext4_invalidatepage, 3131 .releasepage = ext4_releasepage, 3132 .direct_IO = ext4_direct_IO, 3133 .migratepage = buffer_migrate_page, 3134 .is_partially_uptodate = block_is_partially_uptodate, 3135 }; 3136 3137 static const struct address_space_operations ext4_journalled_aops = { 3138 .readpage = ext4_readpage, 3139 .readpages = ext4_readpages, 3140 .writepage = ext4_journalled_writepage, 3141 .sync_page = block_sync_page, 3142 .write_begin = ext4_write_begin, 3143 .write_end = ext4_journalled_write_end, 3144 .set_page_dirty = ext4_journalled_set_page_dirty, 3145 .bmap = ext4_bmap, 3146 .invalidatepage = ext4_invalidatepage, 3147 .releasepage = ext4_releasepage, 3148 .is_partially_uptodate = block_is_partially_uptodate, 3149 }; 3150 3151 static const struct address_space_operations ext4_da_aops = { 3152 .readpage = ext4_readpage, 3153 .readpages = ext4_readpages, 3154 .writepage = ext4_da_writepage, 3155 .writepages = ext4_da_writepages, 3156 .sync_page = block_sync_page, 3157 .write_begin = ext4_da_write_begin, 3158 .write_end = ext4_da_write_end, 3159 .bmap = ext4_bmap, 3160 .invalidatepage = ext4_da_invalidatepage, 3161 .releasepage = ext4_releasepage, 3162 .direct_IO = ext4_direct_IO, 3163 .migratepage = buffer_migrate_page, 3164 .is_partially_uptodate = block_is_partially_uptodate, 3165 }; 3166 3167 void ext4_set_aops(struct inode *inode) 3168 { 3169 if (ext4_should_order_data(inode) && 3170 test_opt(inode->i_sb, DELALLOC)) 3171 inode->i_mapping->a_ops = &ext4_da_aops; 3172 else if (ext4_should_order_data(inode)) 3173 inode->i_mapping->a_ops = &ext4_ordered_aops; 3174 else if 
(ext4_should_writeback_data(inode) && 3175 test_opt(inode->i_sb, DELALLOC)) 3176 inode->i_mapping->a_ops = &ext4_da_aops; 3177 else if (ext4_should_writeback_data(inode)) 3178 inode->i_mapping->a_ops = &ext4_writeback_aops; 3179 else 3180 inode->i_mapping->a_ops = &ext4_journalled_aops; 3181 } 3182 3183 /* 3184 * ext4_block_truncate_page() zeroes out a mapping from file offset `from' 3185 * up to the end of the block which corresponds to `from'. 3186 * This required during truncate. We need to physically zero the tail end 3187 * of that block so it doesn't yield old data if the file is later grown. 3188 */ 3189 int ext4_block_truncate_page(handle_t *handle, 3190 struct address_space *mapping, loff_t from) 3191 { 3192 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; 3193 unsigned offset = from & (PAGE_CACHE_SIZE-1); 3194 unsigned blocksize, length, pos; 3195 ext4_lblk_t iblock; 3196 struct inode *inode = mapping->host; 3197 struct buffer_head *bh; 3198 struct page *page; 3199 int err = 0; 3200 3201 page = grab_cache_page(mapping, from >> PAGE_CACHE_SHIFT); 3202 if (!page) 3203 return -EINVAL; 3204 3205 blocksize = inode->i_sb->s_blocksize; 3206 length = blocksize - (offset & (blocksize - 1)); 3207 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); 3208 3209 /* 3210 * For "nobh" option, we can only work if we don't need to 3211 * read-in the page - otherwise we create buffers to do the IO. 3212 */ 3213 if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) && 3214 ext4_should_writeback_data(inode) && PageUptodate(page)) { 3215 zero_user(page, offset, length); 3216 set_page_dirty(page); 3217 goto unlock; 3218 } 3219 3220 if (!page_has_buffers(page)) 3221 create_empty_buffers(page, blocksize, 0); 3222 3223 /* Find the buffer that contains "offset" */ 3224 bh = page_buffers(page); 3225 pos = blocksize; 3226 while (offset >= pos) { 3227 bh = bh->b_this_page; 3228 iblock++; 3229 pos += blocksize; 3230 } 3231 3232 err = 0; 3233 if (buffer_freed(bh)) { 3234 BUFFER_TRACE(bh, "freed: skip"); 3235 goto unlock; 3236 } 3237 3238 if (!buffer_mapped(bh)) { 3239 BUFFER_TRACE(bh, "unmapped"); 3240 ext4_get_block(inode, iblock, bh, 0); 3241 /* unmapped? It's a hole - nothing to do */ 3242 if (!buffer_mapped(bh)) { 3243 BUFFER_TRACE(bh, "still unmapped"); 3244 goto unlock; 3245 } 3246 } 3247 3248 /* Ok, it's mapped. Make sure it's up-to-date */ 3249 if (PageUptodate(page)) 3250 set_buffer_uptodate(bh); 3251 3252 if (!buffer_uptodate(bh)) { 3253 err = -EIO; 3254 ll_rw_block(READ, 1, &bh); 3255 wait_on_buffer(bh); 3256 /* Uhhuh. Read error. Complain and punt. */ 3257 if (!buffer_uptodate(bh)) 3258 goto unlock; 3259 } 3260 3261 if (ext4_should_journal_data(inode)) { 3262 BUFFER_TRACE(bh, "get write access"); 3263 err = ext4_journal_get_write_access(handle, bh); 3264 if (err) 3265 goto unlock; 3266 } 3267 3268 zero_user(page, offset, length); 3269 3270 BUFFER_TRACE(bh, "zeroed end of block"); 3271 3272 err = 0; 3273 if (ext4_should_journal_data(inode)) { 3274 err = ext4_journal_dirty_metadata(handle, bh); 3275 } else { 3276 if (ext4_should_order_data(inode)) 3277 err = ext4_jbd2_file_inode(handle, inode); 3278 mark_buffer_dirty(bh); 3279 } 3280 3281 unlock: 3282 unlock_page(page); 3283 page_cache_release(page); 3284 return err; 3285 } 3286 3287 /* 3288 * Probably it should be a library function... search for first non-zero word 3289 * or memcmp with zero_page, whatever is better for particular architecture. 3290 * Linus? 
3291 */ 3292 static inline int all_zeroes(__le32 *p, __le32 *q) 3293 { 3294 while (p < q) 3295 if (*p++) 3296 return 0; 3297 return 1; 3298 } 3299 3300 /** 3301 * ext4_find_shared - find the indirect blocks for partial truncation. 3302 * @inode: inode in question 3303 * @depth: depth of the affected branch 3304 * @offsets: offsets of pointers in that branch (see ext4_block_to_path) 3305 * @chain: place to store the pointers to partial indirect blocks 3306 * @top: place to the (detached) top of branch 3307 * 3308 * This is a helper function used by ext4_truncate(). 3309 * 3310 * When we do truncate() we may have to clean the ends of several 3311 * indirect blocks but leave the blocks themselves alive. Block is 3312 * partially truncated if some data below the new i_size is refered 3313 * from it (and it is on the path to the first completely truncated 3314 * data block, indeed). We have to free the top of that path along 3315 * with everything to the right of the path. Since no allocation 3316 * past the truncation point is possible until ext4_truncate() 3317 * finishes, we may safely do the latter, but top of branch may 3318 * require special attention - pageout below the truncation point 3319 * might try to populate it. 3320 * 3321 * We atomically detach the top of branch from the tree, store the 3322 * block number of its root in *@top, pointers to buffer_heads of 3323 * partially truncated blocks - in @chain[].bh and pointers to 3324 * their last elements that should not be removed - in 3325 * @chain[].p. Return value is the pointer to last filled element 3326 * of @chain. 3327 * 3328 * The work left to caller to do the actual freeing of subtrees: 3329 * a) free the subtree starting from *@top 3330 * b) free the subtrees whose roots are stored in 3331 * (@chain[i].p+1 .. end of @chain[i].bh->b_data) 3332 * c) free the subtrees growing from the inode past the @chain[0]. 3333 * (no partially truncated stuff there). */ 3334 3335 static Indirect *ext4_find_shared(struct inode *inode, int depth, 3336 ext4_lblk_t offsets[4], Indirect chain[4], __le32 *top) 3337 { 3338 Indirect *partial, *p; 3339 int k, err; 3340 3341 *top = 0; 3342 /* Make k index the deepest non-null offest + 1 */ 3343 for (k = depth; k > 1 && !offsets[k-1]; k--) 3344 ; 3345 partial = ext4_get_branch(inode, k, offsets, chain, &err); 3346 /* Writer: pointers */ 3347 if (!partial) 3348 partial = chain + k-1; 3349 /* 3350 * If the branch acquired continuation since we've looked at it - 3351 * fine, it should all survive and (new) top doesn't belong to us. 3352 */ 3353 if (!partial->key && *partial->p) 3354 /* Writer: end */ 3355 goto no_top; 3356 for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--) 3357 ; 3358 /* 3359 * OK, we've found the last block that must survive. The rest of our 3360 * branch should be detached before unlocking. However, if that rest 3361 * of branch is all ours and does not grow immediately from the inode 3362 * it's easier to cheat and just decrement partial->p. 3363 */ 3364 if (p == chain + k - 1 && p > chain) { 3365 p->p--; 3366 } else { 3367 *top = *p->p; 3368 /* Nope, don't do this in ext4. Must leave the tree intact */ 3369 #if 0 3370 *p->p = 0; 3371 #endif 3372 } 3373 /* Writer: end */ 3374 3375 while (partial > p) { 3376 brelse(partial->bh); 3377 partial--; 3378 } 3379 no_top: 3380 return partial; 3381 } 3382 3383 /* 3384 * Zero a number of block pointers in either an inode or an indirect block. 
3385 * If we restart the transaction we must again get write access to the 3386 * indirect block for further modification. 3387 * 3388 * We release `count' blocks on disk, but (last - first) may be greater 3389 * than `count' because there can be holes in there. 3390 */ 3391 static void ext4_clear_blocks(handle_t *handle, struct inode *inode, 3392 struct buffer_head *bh, ext4_fsblk_t block_to_free, 3393 unsigned long count, __le32 *first, __le32 *last) 3394 { 3395 __le32 *p; 3396 if (try_to_extend_transaction(handle, inode)) { 3397 if (bh) { 3398 BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); 3399 ext4_journal_dirty_metadata(handle, bh); 3400 } 3401 ext4_mark_inode_dirty(handle, inode); 3402 ext4_journal_test_restart(handle, inode); 3403 if (bh) { 3404 BUFFER_TRACE(bh, "retaking write access"); 3405 ext4_journal_get_write_access(handle, bh); 3406 } 3407 } 3408 3409 /* 3410 * Any buffers which are on the journal will be in memory. We find 3411 * them on the hash table so jbd2_journal_revoke() will run jbd2_journal_forget() 3412 * on them. We've already detached each block from the file, so 3413 * bforget() in jbd2_journal_forget() should be safe. 3414 * 3415 * AKPM: turn on bforget in jbd2_journal_forget()!!! 3416 */ 3417 for (p = first; p < last; p++) { 3418 u32 nr = le32_to_cpu(*p); 3419 if (nr) { 3420 struct buffer_head *tbh; 3421 3422 *p = 0; 3423 tbh = sb_find_get_block(inode->i_sb, nr); 3424 ext4_forget(handle, 0, inode, tbh, nr); 3425 } 3426 } 3427 3428 ext4_free_blocks(handle, inode, block_to_free, count, 0); 3429 } 3430 3431 /** 3432 * ext4_free_data - free a list of data blocks 3433 * @handle: handle for this transaction 3434 * @inode: inode we are dealing with 3435 * @this_bh: indirect buffer_head which contains *@first and *@last 3436 * @first: array of block numbers 3437 * @last: points immediately past the end of array 3438 * 3439 * We are freeing all blocks refered from that array (numbers are stored as 3440 * little-endian 32-bit) and updating @inode->i_blocks appropriately. 3441 * 3442 * We accumulate contiguous runs of blocks to free. Conveniently, if these 3443 * blocks are contiguous then releasing them at one time will only affect one 3444 * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't 3445 * actually use a lot of journal space. 3446 * 3447 * @this_bh will be %NULL if @first and @last point into the inode's direct 3448 * block pointers. 3449 */ 3450 static void ext4_free_data(handle_t *handle, struct inode *inode, 3451 struct buffer_head *this_bh, 3452 __le32 *first, __le32 *last) 3453 { 3454 ext4_fsblk_t block_to_free = 0; /* Starting block # of a run */ 3455 unsigned long count = 0; /* Number of blocks in the run */ 3456 __le32 *block_to_free_p = NULL; /* Pointer into inode/ind 3457 corresponding to 3458 block_to_free */ 3459 ext4_fsblk_t nr; /* Current block # */ 3460 __le32 *p; /* Pointer into inode/ind 3461 for current block */ 3462 int err; 3463 3464 if (this_bh) { /* For indirect block */ 3465 BUFFER_TRACE(this_bh, "get_write_access"); 3466 err = ext4_journal_get_write_access(handle, this_bh); 3467 /* Important: if we can't update the indirect pointers 3468 * to the blocks, we can't free them. 
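 *
 * (Example of the run accumulation below: block pointers
 * {100, 101, 102, 0, 57, 58} produce two ext4_clear_blocks() calls, one
 * for the run starting at 100 with count 3 and one for the run starting
 * at 57 with count 2; the zero entry is simply a hole and is skipped.)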
*/ 3469 if (err) 3470 return; 3471 } 3472 3473 for (p = first; p < last; p++) { 3474 nr = le32_to_cpu(*p); 3475 if (nr) { 3476 /* accumulate blocks to free if they're contiguous */ 3477 if (count == 0) { 3478 block_to_free = nr; 3479 block_to_free_p = p; 3480 count = 1; 3481 } else if (nr == block_to_free + count) { 3482 count++; 3483 } else { 3484 ext4_clear_blocks(handle, inode, this_bh, 3485 block_to_free, 3486 count, block_to_free_p, p); 3487 block_to_free = nr; 3488 block_to_free_p = p; 3489 count = 1; 3490 } 3491 } 3492 } 3493 3494 if (count > 0) 3495 ext4_clear_blocks(handle, inode, this_bh, block_to_free, 3496 count, block_to_free_p, p); 3497 3498 if (this_bh) { 3499 BUFFER_TRACE(this_bh, "call ext4_journal_dirty_metadata"); 3500 3501 /* 3502 * The buffer head should have an attached journal head at this 3503 * point. However, if the data is corrupted and an indirect 3504 * block pointed to itself, it would have been detached when 3505 * the block was cleared. Check for this instead of OOPSing. 3506 */ 3507 if (bh2jh(this_bh)) 3508 ext4_journal_dirty_metadata(handle, this_bh); 3509 else 3510 ext4_error(inode->i_sb, __func__, 3511 "circular indirect block detected, " 3512 "inode=%lu, block=%llu", 3513 inode->i_ino, 3514 (unsigned long long) this_bh->b_blocknr); 3515 } 3516 } 3517 3518 /** 3519 * ext4_free_branches - free an array of branches 3520 * @handle: JBD handle for this transaction 3521 * @inode: inode we are dealing with 3522 * @parent_bh: the buffer_head which contains *@first and *@last 3523 * @first: array of block numbers 3524 * @last: pointer immediately past the end of array 3525 * @depth: depth of the branches to free 3526 * 3527 * We are freeing all blocks refered from these branches (numbers are 3528 * stored as little-endian 32-bit) and updating @inode->i_blocks 3529 * appropriately. 3530 */ 3531 static void ext4_free_branches(handle_t *handle, struct inode *inode, 3532 struct buffer_head *parent_bh, 3533 __le32 *first, __le32 *last, int depth) 3534 { 3535 ext4_fsblk_t nr; 3536 __le32 *p; 3537 3538 if (is_handle_aborted(handle)) 3539 return; 3540 3541 if (depth--) { 3542 struct buffer_head *bh; 3543 int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); 3544 p = last; 3545 while (--p >= first) { 3546 nr = le32_to_cpu(*p); 3547 if (!nr) 3548 continue; /* A hole */ 3549 3550 /* Go read the buffer for the next level down */ 3551 bh = sb_bread(inode->i_sb, nr); 3552 3553 /* 3554 * A read failure? Report error and clear slot 3555 * (should be rare). 3556 */ 3557 if (!bh) { 3558 ext4_error(inode->i_sb, "ext4_free_branches", 3559 "Read failure, inode=%lu, block=%llu", 3560 inode->i_ino, nr); 3561 continue; 3562 } 3563 3564 /* This zaps the entire block. Bottom up. */ 3565 BUFFER_TRACE(bh, "free child branches"); 3566 ext4_free_branches(handle, inode, bh, 3567 (__le32 *) bh->b_data, 3568 (__le32 *) bh->b_data + addr_per_block, 3569 depth); 3570 3571 /* 3572 * We've probably journalled the indirect block several 3573 * times during the truncate. But it's no longer 3574 * needed and we now drop it from the transaction via 3575 * jbd2_journal_revoke(). 3576 * 3577 * That's easy if it's exclusively part of this 3578 * transaction. But if it's part of the committing 3579 * transaction then jbd2_journal_forget() will simply 3580 * brelse() it. That means that if the underlying 3581 * block is reallocated in ext4_get_block(), 3582 * unmap_underlying_metadata() will find this block 3583 * and will try to get rid of it. damn, damn. 
3584 * 3585 * If this block has already been committed to the 3586 * journal, a revoke record will be written. And 3587 * revoke records must be emitted *before* clearing 3588 * this block's bit in the bitmaps. 3589 */ 3590 ext4_forget(handle, 1, inode, bh, bh->b_blocknr); 3591 3592 /* 3593 * Everything below this this pointer has been 3594 * released. Now let this top-of-subtree go. 3595 * 3596 * We want the freeing of this indirect block to be 3597 * atomic in the journal with the updating of the 3598 * bitmap block which owns it. So make some room in 3599 * the journal. 3600 * 3601 * We zero the parent pointer *after* freeing its 3602 * pointee in the bitmaps, so if extend_transaction() 3603 * for some reason fails to put the bitmap changes and 3604 * the release into the same transaction, recovery 3605 * will merely complain about releasing a free block, 3606 * rather than leaking blocks. 3607 */ 3608 if (is_handle_aborted(handle)) 3609 return; 3610 if (try_to_extend_transaction(handle, inode)) { 3611 ext4_mark_inode_dirty(handle, inode); 3612 ext4_journal_test_restart(handle, inode); 3613 } 3614 3615 ext4_free_blocks(handle, inode, nr, 1, 1); 3616 3617 if (parent_bh) { 3618 /* 3619 * The block which we have just freed is 3620 * pointed to by an indirect block: journal it 3621 */ 3622 BUFFER_TRACE(parent_bh, "get_write_access"); 3623 if (!ext4_journal_get_write_access(handle, 3624 parent_bh)){ 3625 *p = 0; 3626 BUFFER_TRACE(parent_bh, 3627 "call ext4_journal_dirty_metadata"); 3628 ext4_journal_dirty_metadata(handle, 3629 parent_bh); 3630 } 3631 } 3632 } 3633 } else { 3634 /* We have reached the bottom of the tree. */ 3635 BUFFER_TRACE(parent_bh, "free data blocks"); 3636 ext4_free_data(handle, inode, parent_bh, first, last); 3637 } 3638 } 3639 3640 int ext4_can_truncate(struct inode *inode) 3641 { 3642 if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) 3643 return 0; 3644 if (S_ISREG(inode->i_mode)) 3645 return 1; 3646 if (S_ISDIR(inode->i_mode)) 3647 return 1; 3648 if (S_ISLNK(inode->i_mode)) 3649 return !ext4_inode_is_fast_symlink(inode); 3650 return 0; 3651 } 3652 3653 /* 3654 * ext4_truncate() 3655 * 3656 * We block out ext4_get_block() block instantiations across the entire 3657 * transaction, and VFS/VM ensures that ext4_truncate() cannot run 3658 * simultaneously on behalf of the same inode. 3659 * 3660 * As we work through the truncate and commmit bits of it to the journal there 3661 * is one core, guiding principle: the file's tree must always be consistent on 3662 * disk. We must be able to restart the truncate after a crash. 3663 * 3664 * The file's tree may be transiently inconsistent in memory (although it 3665 * probably isn't), but whenever we close off and commit a journal transaction, 3666 * the contents of (the filesystem + the journal) must be consistent and 3667 * restartable. It's pretty simple, really: bottom up, right to left (although 3668 * left-to-right works OK too). 3669 * 3670 * Note that at recovery time, journal replay occurs *before* the restart of 3671 * truncate against the orphan inode list. 3672 * 3673 * The committed inode has the new, desired i_size (which is the same as 3674 * i_disksize in this case). After a crash, ext4_orphan_cleanup() will see 3675 * that this inode's truncate did not complete and it will again call 3676 * ext4_truncate() to have another go. So there will be instantiated blocks 3677 * to the right of the truncation point in a crashed ext4 filesystem. 
But 3678 * that's fine - as long as they are linked from the inode, the post-crash 3679 * ext4_truncate() run will find them and release them. 3680 */ 3681 void ext4_truncate(struct inode *inode) 3682 { 3683 handle_t *handle; 3684 struct ext4_inode_info *ei = EXT4_I(inode); 3685 __le32 *i_data = ei->i_data; 3686 int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); 3687 struct address_space *mapping = inode->i_mapping; 3688 ext4_lblk_t offsets[4]; 3689 Indirect chain[4]; 3690 Indirect *partial; 3691 __le32 nr = 0; 3692 int n; 3693 ext4_lblk_t last_block; 3694 unsigned blocksize = inode->i_sb->s_blocksize; 3695 3696 if (!ext4_can_truncate(inode)) 3697 return; 3698 3699 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { 3700 ext4_ext_truncate(inode); 3701 return; 3702 } 3703 3704 handle = start_transaction(inode); 3705 if (IS_ERR(handle)) 3706 return; /* AKPM: return what? */ 3707 3708 last_block = (inode->i_size + blocksize-1) 3709 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); 3710 3711 if (inode->i_size & (blocksize - 1)) 3712 if (ext4_block_truncate_page(handle, mapping, inode->i_size)) 3713 goto out_stop; 3714 3715 n = ext4_block_to_path(inode, last_block, offsets, NULL); 3716 if (n == 0) 3717 goto out_stop; /* error */ 3718 3719 /* 3720 * OK. This truncate is going to happen. We add the inode to the 3721 * orphan list, so that if this truncate spans multiple transactions, 3722 * and we crash, we will resume the truncate when the filesystem 3723 * recovers. It also marks the inode dirty, to catch the new size. 3724 * 3725 * Implication: the file must always be in a sane, consistent 3726 * truncatable state while each transaction commits. 3727 */ 3728 if (ext4_orphan_add(handle, inode)) 3729 goto out_stop; 3730 3731 /* 3732 * From here we block out all ext4_get_block() callers who want to 3733 * modify the block allocation tree. 3734 */ 3735 down_write(&ei->i_data_sem); 3736 3737 ext4_discard_preallocations(inode); 3738 3739 /* 3740 * The orphan list entry will now protect us from any crash which 3741 * occurs before the truncate completes, so it is now safe to propagate 3742 * the new, shorter inode size (held for now in i_size) into the 3743 * on-disk inode. We do this via i_disksize, which is the value which 3744 * ext4 *really* writes onto the disk inode. 3745 */ 3746 ei->i_disksize = inode->i_size; 3747 3748 if (n == 1) { /* direct blocks */ 3749 ext4_free_data(handle, inode, NULL, i_data+offsets[0], 3750 i_data + EXT4_NDIR_BLOCKS); 3751 goto do_indirects; 3752 } 3753 3754 partial = ext4_find_shared(inode, n, offsets, chain, &nr); 3755 /* Kill the top of shared branch (not detached) */ 3756 if (nr) { 3757 if (partial == chain) { 3758 /* Shared branch grows from the inode */ 3759 ext4_free_branches(handle, inode, NULL, 3760 &nr, &nr+1, (chain+n-1) - partial); 3761 *partial->p = 0; 3762 /* 3763 * We mark the inode dirty prior to restart, 3764 * and prior to stop. No need for it here. 
3765 */ 3766 } else { 3767 /* Shared branch grows from an indirect block */ 3768 BUFFER_TRACE(partial->bh, "get_write_access"); 3769 ext4_free_branches(handle, inode, partial->bh, 3770 partial->p, 3771 partial->p+1, (chain+n-1) - partial); 3772 } 3773 } 3774 /* Clear the ends of indirect blocks on the shared branch */ 3775 while (partial > chain) { 3776 ext4_free_branches(handle, inode, partial->bh, partial->p + 1, 3777 (__le32*)partial->bh->b_data+addr_per_block, 3778 (chain+n-1) - partial); 3779 BUFFER_TRACE(partial->bh, "call brelse"); 3780 brelse (partial->bh); 3781 partial--; 3782 } 3783 do_indirects: 3784 /* Kill the remaining (whole) subtrees */ 3785 switch (offsets[0]) { 3786 default: 3787 nr = i_data[EXT4_IND_BLOCK]; 3788 if (nr) { 3789 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1); 3790 i_data[EXT4_IND_BLOCK] = 0; 3791 } 3792 case EXT4_IND_BLOCK: 3793 nr = i_data[EXT4_DIND_BLOCK]; 3794 if (nr) { 3795 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2); 3796 i_data[EXT4_DIND_BLOCK] = 0; 3797 } 3798 case EXT4_DIND_BLOCK: 3799 nr = i_data[EXT4_TIND_BLOCK]; 3800 if (nr) { 3801 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3); 3802 i_data[EXT4_TIND_BLOCK] = 0; 3803 } 3804 case EXT4_TIND_BLOCK: 3805 ; 3806 } 3807 3808 up_write(&ei->i_data_sem); 3809 inode->i_mtime = inode->i_ctime = ext4_current_time(inode); 3810 ext4_mark_inode_dirty(handle, inode); 3811 3812 /* 3813 * In a multi-transaction truncate, we only make the final transaction 3814 * synchronous 3815 */ 3816 if (IS_SYNC(inode)) 3817 handle->h_sync = 1; 3818 out_stop: 3819 /* 3820 * If this was a simple ftruncate(), and the file will remain alive 3821 * then we need to clear up the orphan record which we created above. 3822 * However, if this was a real unlink then we were called by 3823 * ext4_delete_inode(), and we allow that function to clean up the 3824 * orphan info for us. 3825 */ 3826 if (inode->i_nlink) 3827 ext4_orphan_del(handle, inode); 3828 3829 ext4_journal_stop(handle); 3830 } 3831 3832 /* 3833 * ext4_get_inode_loc returns with an extra refcount against the inode's 3834 * underlying buffer_head on success. If 'in_mem' is true, we have all 3835 * data in memory that is needed to recreate the on-disk version of this 3836 * inode. 
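 *
 * For illustration only (this mirrors the wrapper ext4_get_inode_loc()
 * further down, it is not a new interface), a caller that knows the
 * in-core inode is complete apart from in-inode xattrs would pass:
 *
 *	in_mem = !(EXT4_I(inode)->i_state & EXT4_STATE_XATTR);
 *	err = __ext4_get_inode_loc(inode, &iloc, in_mem);
 *
 * so that the inode table block can be rebuilt from memory and the read
 * skipped when this is also the only in-use inode in the block.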
3837 */ 3838 static int __ext4_get_inode_loc(struct inode *inode, 3839 struct ext4_iloc *iloc, int in_mem) 3840 { 3841 struct ext4_group_desc *gdp; 3842 struct buffer_head *bh; 3843 struct super_block *sb = inode->i_sb; 3844 ext4_fsblk_t block; 3845 int inodes_per_block, inode_offset; 3846 3847 iloc->bh = 0; 3848 if (!ext4_valid_inum(sb, inode->i_ino)) 3849 return -EIO; 3850 3851 iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb); 3852 gdp = ext4_get_group_desc(sb, iloc->block_group, NULL); 3853 if (!gdp) 3854 return -EIO; 3855 3856 /* 3857 * Figure out the offset within the block group inode table 3858 */ 3859 inodes_per_block = (EXT4_BLOCK_SIZE(sb) / EXT4_INODE_SIZE(sb)); 3860 inode_offset = ((inode->i_ino - 1) % 3861 EXT4_INODES_PER_GROUP(sb)); 3862 block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block); 3863 iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb); 3864 3865 bh = sb_getblk(sb, block); 3866 if (!bh) { 3867 ext4_error(sb, "ext4_get_inode_loc", "unable to read " 3868 "inode block - inode=%lu, block=%llu", 3869 inode->i_ino, block); 3870 return -EIO; 3871 } 3872 if (!buffer_uptodate(bh)) { 3873 lock_buffer(bh); 3874 3875 /* 3876 * If the buffer has the write error flag, we have failed 3877 * to write out another inode in the same block. In this 3878 * case, we don't have to read the block because we may 3879 * read the old inode data successfully. 3880 */ 3881 if (buffer_write_io_error(bh) && !buffer_uptodate(bh)) 3882 set_buffer_uptodate(bh); 3883 3884 if (buffer_uptodate(bh)) { 3885 /* someone brought it uptodate while we waited */ 3886 unlock_buffer(bh); 3887 goto has_buffer; 3888 } 3889 3890 /* 3891 * If we have all information of the inode in memory and this 3892 * is the only valid inode in the block, we need not read the 3893 * block. 3894 */ 3895 if (in_mem) { 3896 struct buffer_head *bitmap_bh; 3897 int i, start; 3898 3899 start = inode_offset & ~(inodes_per_block - 1); 3900 3901 /* Is the inode bitmap in cache? */ 3902 bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp)); 3903 if (!bitmap_bh) 3904 goto make_io; 3905 3906 /* 3907 * If the inode bitmap isn't in cache then the 3908 * optimisation may end up performing two reads instead 3909 * of one, so skip it. 3910 */ 3911 if (!buffer_uptodate(bitmap_bh)) { 3912 brelse(bitmap_bh); 3913 goto make_io; 3914 } 3915 for (i = start; i < start + inodes_per_block; i++) { 3916 if (i == inode_offset) 3917 continue; 3918 if (ext4_test_bit(i, bitmap_bh->b_data)) 3919 break; 3920 } 3921 brelse(bitmap_bh); 3922 if (i == start + inodes_per_block) { 3923 /* all other inodes are free, so skip I/O */ 3924 memset(bh->b_data, 0, bh->b_size); 3925 set_buffer_uptodate(bh); 3926 unlock_buffer(bh); 3927 goto has_buffer; 3928 } 3929 } 3930 3931 make_io: 3932 /* 3933 * If we need to do any I/O, try to pre-readahead extra 3934 * blocks from the inode table. 
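 *
 * A worked example (the figure 32 is illustrative, not taken from the
 * code): with s_inode_readahead_blks == 32, the target block is rounded
 * down to a 32-block boundary, clamped so it does not start before the
 * group's inode table, and readahead is issued over that 32-block
 * window, cut off at the end of the in-use part of the inode table
 * (groups with the GDT_CSUM feature exclude bg_itable_unused inodes).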
3935 */ 3936 if (EXT4_SB(sb)->s_inode_readahead_blks) { 3937 ext4_fsblk_t b, end, table; 3938 unsigned num; 3939 3940 table = ext4_inode_table(sb, gdp); 3941 /* Make sure s_inode_readahead_blks is a power of 2 */ 3942 while (EXT4_SB(sb)->s_inode_readahead_blks & 3943 (EXT4_SB(sb)->s_inode_readahead_blks-1)) 3944 EXT4_SB(sb)->s_inode_readahead_blks = 3945 (EXT4_SB(sb)->s_inode_readahead_blks & 3946 (EXT4_SB(sb)->s_inode_readahead_blks-1)); 3947 b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1); 3948 if (table > b) 3949 b = table; 3950 end = b + EXT4_SB(sb)->s_inode_readahead_blks; 3951 num = EXT4_INODES_PER_GROUP(sb); 3952 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, 3953 EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) 3954 num -= le16_to_cpu(gdp->bg_itable_unused); 3955 table += num / inodes_per_block; 3956 if (end > table) 3957 end = table; 3958 while (b <= end) 3959 sb_breadahead(sb, b++); 3960 } 3961 3962 /* 3963 * There are other valid inodes in the buffer, this inode 3964 * has in-inode xattrs, or we don't have this inode in memory. 3965 * Read the block from disk. 3966 */ 3967 get_bh(bh); 3968 bh->b_end_io = end_buffer_read_sync; 3969 submit_bh(READ_META, bh); 3970 wait_on_buffer(bh); 3971 if (!buffer_uptodate(bh)) { 3972 ext4_error(sb, __func__, 3973 "unable to read inode block - inode=%lu, " 3974 "block=%llu", inode->i_ino, block); 3975 brelse(bh); 3976 return -EIO; 3977 } 3978 } 3979 has_buffer: 3980 iloc->bh = bh; 3981 return 0; 3982 } 3983 3984 int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc) 3985 { 3986 /* We have all inode data except xattrs in memory here. */ 3987 return __ext4_get_inode_loc(inode, iloc, 3988 !(EXT4_I(inode)->i_state & EXT4_STATE_XATTR)); 3989 } 3990 3991 void ext4_set_inode_flags(struct inode *inode) 3992 { 3993 unsigned int flags = EXT4_I(inode)->i_flags; 3994 3995 inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); 3996 if (flags & EXT4_SYNC_FL) 3997 inode->i_flags |= S_SYNC; 3998 if (flags & EXT4_APPEND_FL) 3999 inode->i_flags |= S_APPEND; 4000 if (flags & EXT4_IMMUTABLE_FL) 4001 inode->i_flags |= S_IMMUTABLE; 4002 if (flags & EXT4_NOATIME_FL) 4003 inode->i_flags |= S_NOATIME; 4004 if (flags & EXT4_DIRSYNC_FL) 4005 inode->i_flags |= S_DIRSYNC; 4006 } 4007 4008 /* Propagate flags from i_flags to EXT4_I(inode)->i_flags */ 4009 void ext4_get_inode_flags(struct ext4_inode_info *ei) 4010 { 4011 unsigned int flags = ei->vfs_inode.i_flags; 4012 4013 ei->i_flags &= ~(EXT4_SYNC_FL|EXT4_APPEND_FL| 4014 EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL|EXT4_DIRSYNC_FL); 4015 if (flags & S_SYNC) 4016 ei->i_flags |= EXT4_SYNC_FL; 4017 if (flags & S_APPEND) 4018 ei->i_flags |= EXT4_APPEND_FL; 4019 if (flags & S_IMMUTABLE) 4020 ei->i_flags |= EXT4_IMMUTABLE_FL; 4021 if (flags & S_NOATIME) 4022 ei->i_flags |= EXT4_NOATIME_FL; 4023 if (flags & S_DIRSYNC) 4024 ei->i_flags |= EXT4_DIRSYNC_FL; 4025 } 4026 static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, 4027 struct ext4_inode_info *ei) 4028 { 4029 blkcnt_t i_blocks ; 4030 struct inode *inode = &(ei->vfs_inode); 4031 struct super_block *sb = inode->i_sb; 4032 4033 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, 4034 EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) { 4035 /* we are using combined 48 bit field */ 4036 i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 | 4037 le32_to_cpu(raw_inode->i_blocks_lo); 4038 if (ei->i_flags & EXT4_HUGE_FILE_FL) { 4039 /* i_blocks represent file system block size */ 4040 return i_blocks << (inode->i_blkbits - 9); 4041 } else { 4042 return i_blocks; 4043 } 4044 } else { 4045 return 
le32_to_cpu(raw_inode->i_blocks_lo); 4046 } 4047 } 4048 4049 struct inode *ext4_iget(struct super_block *sb, unsigned long ino) 4050 { 4051 struct ext4_iloc iloc; 4052 struct ext4_inode *raw_inode; 4053 struct ext4_inode_info *ei; 4054 struct buffer_head *bh; 4055 struct inode *inode; 4056 long ret; 4057 int block; 4058 4059 inode = iget_locked(sb, ino); 4060 if (!inode) 4061 return ERR_PTR(-ENOMEM); 4062 if (!(inode->i_state & I_NEW)) 4063 return inode; 4064 4065 ei = EXT4_I(inode); 4066 #ifdef CONFIG_EXT4_FS_POSIX_ACL 4067 ei->i_acl = EXT4_ACL_NOT_CACHED; 4068 ei->i_default_acl = EXT4_ACL_NOT_CACHED; 4069 #endif 4070 4071 ret = __ext4_get_inode_loc(inode, &iloc, 0); 4072 if (ret < 0) 4073 goto bad_inode; 4074 bh = iloc.bh; 4075 raw_inode = ext4_raw_inode(&iloc); 4076 inode->i_mode = le16_to_cpu(raw_inode->i_mode); 4077 inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); 4078 inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); 4079 if (!(test_opt(inode->i_sb, NO_UID32))) { 4080 inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; 4081 inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; 4082 } 4083 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); 4084 4085 ei->i_state = 0; 4086 ei->i_dir_start_lookup = 0; 4087 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); 4088 /* We now have enough fields to check if the inode was active or not. 4089 * This is needed because nfsd might try to access dead inodes 4090 * the test is that same one that e2fsck uses 4091 * NeilBrown 1999oct15 4092 */ 4093 if (inode->i_nlink == 0) { 4094 if (inode->i_mode == 0 || 4095 !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) { 4096 /* this inode is deleted */ 4097 brelse(bh); 4098 ret = -ESTALE; 4099 goto bad_inode; 4100 } 4101 /* The only unlinked inodes we let through here have 4102 * valid i_mode and are being read by the orphan 4103 * recovery code: that's fine, we're about to complete 4104 * the process of deleting those. */ 4105 } 4106 ei->i_flags = le32_to_cpu(raw_inode->i_flags); 4107 inode->i_blocks = ext4_inode_blocks(raw_inode, ei); 4108 ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo); 4109 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != 4110 cpu_to_le32(EXT4_OS_HURD)) { 4111 ei->i_file_acl |= 4112 ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; 4113 } 4114 inode->i_size = ext4_isize(raw_inode); 4115 ei->i_disksize = inode->i_size; 4116 inode->i_generation = le32_to_cpu(raw_inode->i_generation); 4117 ei->i_block_group = iloc.block_group; 4118 /* 4119 * NOTE! The in-memory inode i_data array is in little-endian order 4120 * even on big-endian machines: we do NOT byteswap the block numbers! 4121 */ 4122 for (block = 0; block < EXT4_N_BLOCKS; block++) 4123 ei->i_data[block] = raw_inode->i_block[block]; 4124 INIT_LIST_HEAD(&ei->i_orphan); 4125 4126 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { 4127 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); 4128 if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > 4129 EXT4_INODE_SIZE(inode->i_sb)) { 4130 brelse(bh); 4131 ret = -EIO; 4132 goto bad_inode; 4133 } 4134 if (ei->i_extra_isize == 0) { 4135 /* The extra space is currently unused. Use it. 
*/ 4136 ei->i_extra_isize = sizeof(struct ext4_inode) - 4137 EXT4_GOOD_OLD_INODE_SIZE; 4138 } else { 4139 __le32 *magic = (void *)raw_inode + 4140 EXT4_GOOD_OLD_INODE_SIZE + 4141 ei->i_extra_isize; 4142 if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) 4143 ei->i_state |= EXT4_STATE_XATTR; 4144 } 4145 } else 4146 ei->i_extra_isize = 0; 4147 4148 EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode); 4149 EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode); 4150 EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode); 4151 EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode); 4152 4153 inode->i_version = le32_to_cpu(raw_inode->i_disk_version); 4154 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { 4155 if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) 4156 inode->i_version |= 4157 (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; 4158 } 4159 4160 if (S_ISREG(inode->i_mode)) { 4161 inode->i_op = &ext4_file_inode_operations; 4162 inode->i_fop = &ext4_file_operations; 4163 ext4_set_aops(inode); 4164 } else if (S_ISDIR(inode->i_mode)) { 4165 inode->i_op = &ext4_dir_inode_operations; 4166 inode->i_fop = &ext4_dir_operations; 4167 } else if (S_ISLNK(inode->i_mode)) { 4168 if (ext4_inode_is_fast_symlink(inode)) { 4169 inode->i_op = &ext4_fast_symlink_inode_operations; 4170 nd_terminate_link(ei->i_data, inode->i_size, 4171 sizeof(ei->i_data) - 1); 4172 } else { 4173 inode->i_op = &ext4_symlink_inode_operations; 4174 ext4_set_aops(inode); 4175 } 4176 } else { 4177 inode->i_op = &ext4_special_inode_operations; 4178 if (raw_inode->i_block[0]) 4179 init_special_inode(inode, inode->i_mode, 4180 old_decode_dev(le32_to_cpu(raw_inode->i_block[0]))); 4181 else 4182 init_special_inode(inode, inode->i_mode, 4183 new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); 4184 } 4185 brelse(iloc.bh); 4186 ext4_set_inode_flags(inode); 4187 unlock_new_inode(inode); 4188 return inode; 4189 4190 bad_inode: 4191 iget_failed(inode); 4192 return ERR_PTR(ret); 4193 } 4194 4195 static int ext4_inode_blocks_set(handle_t *handle, 4196 struct ext4_inode *raw_inode, 4197 struct ext4_inode_info *ei) 4198 { 4199 struct inode *inode = &(ei->vfs_inode); 4200 u64 i_blocks = inode->i_blocks; 4201 struct super_block *sb = inode->i_sb; 4202 4203 if (i_blocks <= ~0U) { 4204 /* 4205 * i_blocks can be represnted in a 32 bit variable 4206 * as multiple of 512 bytes 4207 */ 4208 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); 4209 raw_inode->i_blocks_high = 0; 4210 ei->i_flags &= ~EXT4_HUGE_FILE_FL; 4211 return 0; 4212 } 4213 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) 4214 return -EFBIG; 4215 4216 if (i_blocks <= 0xffffffffffffULL) { 4217 /* 4218 * i_blocks can be represented in a 48 bit variable 4219 * as multiple of 512 bytes 4220 */ 4221 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); 4222 raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); 4223 ei->i_flags &= ~EXT4_HUGE_FILE_FL; 4224 } else { 4225 ei->i_flags |= EXT4_HUGE_FILE_FL; 4226 /* i_block is stored in file system block size */ 4227 i_blocks = i_blocks >> (inode->i_blkbits - 9); 4228 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); 4229 raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); 4230 } 4231 return 0; 4232 } 4233 4234 /* 4235 * Post the struct inode info into an on-disk inode location in the 4236 * buffer-cache. This gobbles the caller's reference to the 4237 * buffer_head in the inode location struct. 4238 * 4239 * The caller must have write access to iloc->bh. 
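 *
 * For illustration, the calling pattern this implies (and which
 * ext4_mark_iloc_dirty() below already follows) is:
 *
 *	get_bh(iloc->bh);	take an extra reference first
 *	err = ext4_do_update_inode(handle, inode, iloc);
 *	put_bh(iloc->bh);
 *
 * because the brelse() on the way out of this function consumes the
 * reference that was handed in.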
4240 */ 4241 static int ext4_do_update_inode(handle_t *handle, 4242 struct inode *inode, 4243 struct ext4_iloc *iloc) 4244 { 4245 struct ext4_inode *raw_inode = ext4_raw_inode(iloc); 4246 struct ext4_inode_info *ei = EXT4_I(inode); 4247 struct buffer_head *bh = iloc->bh; 4248 int err = 0, rc, block; 4249 4250 /* For fields not not tracking in the in-memory inode, 4251 * initialise them to zero for new inodes. */ 4252 if (ei->i_state & EXT4_STATE_NEW) 4253 memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); 4254 4255 ext4_get_inode_flags(ei); 4256 raw_inode->i_mode = cpu_to_le16(inode->i_mode); 4257 if (!(test_opt(inode->i_sb, NO_UID32))) { 4258 raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid)); 4259 raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid)); 4260 /* 4261 * Fix up interoperability with old kernels. Otherwise, old inodes get 4262 * re-used with the upper 16 bits of the uid/gid intact 4263 */ 4264 if (!ei->i_dtime) { 4265 raw_inode->i_uid_high = 4266 cpu_to_le16(high_16_bits(inode->i_uid)); 4267 raw_inode->i_gid_high = 4268 cpu_to_le16(high_16_bits(inode->i_gid)); 4269 } else { 4270 raw_inode->i_uid_high = 0; 4271 raw_inode->i_gid_high = 0; 4272 } 4273 } else { 4274 raw_inode->i_uid_low = 4275 cpu_to_le16(fs_high2lowuid(inode->i_uid)); 4276 raw_inode->i_gid_low = 4277 cpu_to_le16(fs_high2lowgid(inode->i_gid)); 4278 raw_inode->i_uid_high = 0; 4279 raw_inode->i_gid_high = 0; 4280 } 4281 raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); 4282 4283 EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode); 4284 EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode); 4285 EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode); 4286 EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode); 4287 4288 if (ext4_inode_blocks_set(handle, raw_inode, ei)) 4289 goto out_brelse; 4290 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); 4291 /* clear the migrate flag in the raw_inode */ 4292 raw_inode->i_flags = cpu_to_le32(ei->i_flags & ~EXT4_EXT_MIGRATE); 4293 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != 4294 cpu_to_le32(EXT4_OS_HURD)) 4295 raw_inode->i_file_acl_high = 4296 cpu_to_le16(ei->i_file_acl >> 32); 4297 raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); 4298 ext4_isize_set(raw_inode, ei->i_disksize); 4299 if (ei->i_disksize > 0x7fffffffULL) { 4300 struct super_block *sb = inode->i_sb; 4301 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, 4302 EXT4_FEATURE_RO_COMPAT_LARGE_FILE) || 4303 EXT4_SB(sb)->s_es->s_rev_level == 4304 cpu_to_le32(EXT4_GOOD_OLD_REV)) { 4305 /* If this is the first large file 4306 * created, add a flag to the superblock. 
4307 */ 4308 err = ext4_journal_get_write_access(handle, 4309 EXT4_SB(sb)->s_sbh); 4310 if (err) 4311 goto out_brelse; 4312 ext4_update_dynamic_rev(sb); 4313 EXT4_SET_RO_COMPAT_FEATURE(sb, 4314 EXT4_FEATURE_RO_COMPAT_LARGE_FILE); 4315 sb->s_dirt = 1; 4316 handle->h_sync = 1; 4317 err = ext4_journal_dirty_metadata(handle, 4318 EXT4_SB(sb)->s_sbh); 4319 } 4320 } 4321 raw_inode->i_generation = cpu_to_le32(inode->i_generation); 4322 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { 4323 if (old_valid_dev(inode->i_rdev)) { 4324 raw_inode->i_block[0] = 4325 cpu_to_le32(old_encode_dev(inode->i_rdev)); 4326 raw_inode->i_block[1] = 0; 4327 } else { 4328 raw_inode->i_block[0] = 0; 4329 raw_inode->i_block[1] = 4330 cpu_to_le32(new_encode_dev(inode->i_rdev)); 4331 raw_inode->i_block[2] = 0; 4332 } 4333 } else for (block = 0; block < EXT4_N_BLOCKS; block++) 4334 raw_inode->i_block[block] = ei->i_data[block]; 4335 4336 raw_inode->i_disk_version = cpu_to_le32(inode->i_version); 4337 if (ei->i_extra_isize) { 4338 if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) 4339 raw_inode->i_version_hi = 4340 cpu_to_le32(inode->i_version >> 32); 4341 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); 4342 } 4343 4344 4345 BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); 4346 rc = ext4_journal_dirty_metadata(handle, bh); 4347 if (!err) 4348 err = rc; 4349 ei->i_state &= ~EXT4_STATE_NEW; 4350 4351 out_brelse: 4352 brelse(bh); 4353 ext4_std_error(inode->i_sb, err); 4354 return err; 4355 } 4356 4357 /* 4358 * ext4_write_inode() 4359 * 4360 * We are called from a few places: 4361 * 4362 * - Within generic_file_write() for O_SYNC files. 4363 * Here, there will be no transaction running. We wait for any running 4364 * trasnaction to commit. 4365 * 4366 * - Within sys_sync(), kupdate and such. 4367 * We wait on commit, if tol to. 4368 * 4369 * - Within prune_icache() (PF_MEMALLOC == true) 4370 * Here we simply return. We can't afford to block kswapd on the 4371 * journal commit. 4372 * 4373 * In all cases it is actually safe for us to return without doing anything, 4374 * because the inode has been copied into a raw inode buffer in 4375 * ext4_mark_inode_dirty(). This is a correctness thing for O_SYNC and for 4376 * knfsd. 4377 * 4378 * Note that we are absolutely dependent upon all inode dirtiers doing the 4379 * right thing: they *must* call mark_inode_dirty() after dirtying info in 4380 * which we are interested. 4381 * 4382 * It would be a bug for them to not do this. The code: 4383 * 4384 * mark_inode_dirty(inode) 4385 * stuff(); 4386 * inode->i_size = expr; 4387 * 4388 * is in error because a kswapd-driven write_inode() could occur while 4389 * `stuff()' is running, and the new i_size will be lost. Plus the inode 4390 * will no longer be on the superblock's dirty inode list. 4391 */ 4392 int ext4_write_inode(struct inode *inode, int wait) 4393 { 4394 if (current->flags & PF_MEMALLOC) 4395 return 0; 4396 4397 if (ext4_journal_current_handle()) { 4398 jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n"); 4399 dump_stack(); 4400 return -EIO; 4401 } 4402 4403 if (!wait) 4404 return 0; 4405 4406 return ext4_force_commit(inode->i_sb); 4407 } 4408 4409 /* 4410 * ext4_setattr() 4411 * 4412 * Called from notify_change. 4413 * 4414 * We want to trap VFS attempts to truncate the file as soon as 4415 * possible. 
In particular, we want to make sure that when the VFS 4416 * shrinks i_size, we put the inode on the orphan list and modify 4417 * i_disksize immediately, so that during the subsequent flushing of 4418 * dirty pages and freeing of disk blocks, we can guarantee that any 4419 * commit will leave the blocks being flushed in an unused state on 4420 * disk. (On recovery, the inode will get truncated and the blocks will 4421 * be freed, so we have a strong guarantee that no future commit will 4422 * leave these blocks visible to the user.) 4423 * 4424 * Another thing we have to assure is that if we are in ordered mode 4425 * and the inode is still attached to the committing transaction, we must 4426 * start writeout of all the dirty pages which are being truncated. 4427 * This way we are sure that all the data written in the previous 4428 * transaction is already on disk (truncate waits for pages under 4429 * writeback). 4430 * 4431 * Called with inode->i_mutex down. 4432 */ 4433 int ext4_setattr(struct dentry *dentry, struct iattr *attr) 4434 { 4435 struct inode *inode = dentry->d_inode; 4436 int error, rc = 0; 4437 const unsigned int ia_valid = attr->ia_valid; 4438 4439 error = inode_change_ok(inode, attr); 4440 if (error) 4441 return error; 4442 4443 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || 4444 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { 4445 handle_t *handle; 4446 4447 /* (user+group)*(old+new) structure, inode write (sb, 4448 * inode block, ? - but truncate inode update has it) */ 4449 handle = ext4_journal_start(inode, 2*(EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)+ 4450 EXT4_QUOTA_DEL_BLOCKS(inode->i_sb))+3); 4451 if (IS_ERR(handle)) { 4452 error = PTR_ERR(handle); 4453 goto err_out; 4454 } 4455 error = DQUOT_TRANSFER(inode, attr) ?
-EDQUOT : 0; 4456 if (error) { 4457 ext4_journal_stop(handle); 4458 return error; 4459 } 4460 /* Update corresponding info in inode so that everything is in 4461 * one transaction */ 4462 if (attr->ia_valid & ATTR_UID) 4463 inode->i_uid = attr->ia_uid; 4464 if (attr->ia_valid & ATTR_GID) 4465 inode->i_gid = attr->ia_gid; 4466 error = ext4_mark_inode_dirty(handle, inode); 4467 ext4_journal_stop(handle); 4468 } 4469 4470 if (attr->ia_valid & ATTR_SIZE) { 4471 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) { 4472 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 4473 4474 if (attr->ia_size > sbi->s_bitmap_maxbytes) { 4475 error = -EFBIG; 4476 goto err_out; 4477 } 4478 } 4479 } 4480 4481 if (S_ISREG(inode->i_mode) && 4482 attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) { 4483 handle_t *handle; 4484 4485 handle = ext4_journal_start(inode, 3); 4486 if (IS_ERR(handle)) { 4487 error = PTR_ERR(handle); 4488 goto err_out; 4489 } 4490 4491 error = ext4_orphan_add(handle, inode); 4492 EXT4_I(inode)->i_disksize = attr->ia_size; 4493 rc = ext4_mark_inode_dirty(handle, inode); 4494 if (!error) 4495 error = rc; 4496 ext4_journal_stop(handle); 4497 4498 if (ext4_should_order_data(inode)) { 4499 error = ext4_begin_ordered_truncate(inode, 4500 attr->ia_size); 4501 if (error) { 4502 /* Do as much error cleanup as possible */ 4503 handle = ext4_journal_start(inode, 3); 4504 if (IS_ERR(handle)) { 4505 ext4_orphan_del(NULL, inode); 4506 goto err_out; 4507 } 4508 ext4_orphan_del(handle, inode); 4509 ext4_journal_stop(handle); 4510 goto err_out; 4511 } 4512 } 4513 } 4514 4515 rc = inode_setattr(inode, attr); 4516 4517 /* If inode_setattr's call to ext4_truncate failed to get a 4518 * transaction handle at all, we need to clean up the in-core 4519 * orphan list manually. */ 4520 if (inode->i_nlink) 4521 ext4_orphan_del(NULL, inode); 4522 4523 if (!rc && (ia_valid & ATTR_MODE)) 4524 rc = ext4_acl_chmod(inode); 4525 4526 err_out: 4527 ext4_std_error(inode->i_sb, error); 4528 if (!error) 4529 error = rc; 4530 return error; 4531 } 4532 4533 int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, 4534 struct kstat *stat) 4535 { 4536 struct inode *inode; 4537 unsigned long delalloc_blocks; 4538 4539 inode = dentry->d_inode; 4540 generic_fillattr(inode, stat); 4541 4542 /* 4543 * We can't update i_blocks if the block allocation is delayed 4544 * otherwise in the case of system crash before the real block 4545 * allocation is done, we will have i_blocks inconsistent with 4546 * on-disk file blocks. 4547 * We always keep i_blocks updated together with real 4548 * allocation. But to not confuse with user, stat 4549 * will return the blocks that include the delayed allocation 4550 * blocks for this file. 
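 *
 * Illustrative arithmetic (values assumed, not from the code): with a
 * 4K block size and 8 reserved delayed-allocation blocks, stat->blocks
 * below is increased by (8 << 12) >> 9 = 64 sectors of 512 bytes.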
4551 */ 4552 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 4553 delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks; 4554 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 4555 4556 stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9; 4557 return 0; 4558 } 4559 4560 static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks, 4561 int chunk) 4562 { 4563 int indirects; 4564 4565 /* if nrblocks are contiguous */ 4566 if (chunk) { 4567 /* 4568 * With N contiguous data blocks, we need at most 4569 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) indirect blocks 4570 * 2 dindirect blocks 4571 * 1 tindirect block 4572 */ 4573 indirects = nrblocks / EXT4_ADDR_PER_BLOCK(inode->i_sb); 4574 return indirects + 3; 4575 } 4576 /* 4577 * If nrblocks are not contiguous, then, in the worst case, each block 4578 * touches an indirect block, and each indirect block touches a double 4579 * indirect block, plus a triple indirect block 4580 */ 4581 indirects = nrblocks * 2 + 1; 4582 return indirects; 4583 } 4584 4585 static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) 4586 { 4587 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) 4588 return ext4_indirect_trans_blocks(inode, nrblocks, chunk); 4589 return ext4_ext_index_trans_blocks(inode, nrblocks, chunk); 4590 } 4591 4592 /* 4593 * Account for index blocks, block group bitmaps and block group 4594 * descriptor blocks if we modify data blocks and index blocks. 4595 * In the worst case, the index blocks spread over different block groups. 4596 * 4597 * If the data blocks are discontiguous, they may spread over 4598 * different block groups too. Even if they are contiguous, with flexbg 4599 * they could still cross a block group boundary. 4600 * 4601 * Also account for superblock, inode, quota and xattr blocks 4602 */ 4603 int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) 4604 { 4605 int groups, gdpblocks; 4606 int idxblocks; 4607 int ret = 0; 4608 4609 /* 4610 * How many index blocks do we need to touch to modify nrblocks? 4611 * The "chunk" flag indicates whether the nrblocks are 4612 * physically contiguous on disk. 4613 * 4614 * Direct IO and fallocate call get_block to allocate 4615 * one single extent at a time, so they can set the "chunk" flag. 4616 */ 4617 idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk); 4618 4619 ret = idxblocks; 4620 4621 /* 4622 * Now let's see how many group bitmaps and group descriptors need 4623 * to be accounted for. 4624 */ 4625 groups = idxblocks; 4626 if (chunk) 4627 groups += 1; 4628 else 4629 groups += nrblocks; 4630 4631 gdpblocks = groups; 4632 if (groups > EXT4_SB(inode->i_sb)->s_groups_count) 4633 groups = EXT4_SB(inode->i_sb)->s_groups_count; 4634 if (groups > EXT4_SB(inode->i_sb)->s_gdb_count) 4635 gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count; 4636 4637 /* bitmaps and block group descriptor blocks */ 4638 ret += groups + gdpblocks; 4639 4640 /* Blocks for super block, inode, quota and xattr blocks */ 4641 ret += EXT4_META_TRANS_BLOCKS(inode->i_sb); 4642 4643 return ret; 4644 } 4645 4646 /* 4647 * Calculate the total number of credits to reserve so that 4648 * the modification of a single page fits into a single transaction, 4649 * which may include multiple chunks of block allocations. 4650 * 4651 * This could be called via ext4_write_begin() 4652 * 4653 * We need to consider the worst case, when 4654 * one new block is needed per extent.
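 *
 * A worked example of the estimate (figures illustrative, ignoring the
 * group-count caps in ext4_meta_trans_blocks()): for a non-extent file
 * with bpp == 4 blocks per page and chunk == 0, the index cost is
 * 4 * 2 + 1 = 9 blocks, the bitmap plus descriptor cost is
 * (9 + 4) + (9 + 4) = 26 blocks, and EXT4_META_TRANS_BLOCKS() is added
 * for superblock, inode, quota and xattr blocks; journalled data mode
 * then adds another bpp credits on top.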
4655 */ 4656 int ext4_writepage_trans_blocks(struct inode *inode) 4657 { 4658 int bpp = ext4_journal_blocks_per_page(inode); 4659 int ret; 4660 4661 ret = ext4_meta_trans_blocks(inode, bpp, 0); 4662 4663 /* Account for data blocks for journalled mode */ 4664 if (ext4_should_journal_data(inode)) 4665 ret += bpp; 4666 return ret; 4667 } 4668 4669 /* 4670 * Calculate the journal credits for a chunk of data modification. 4671 * 4672 * This is called from DIO, fallocate or whoever else calls 4673 * ext4_get_blocks_wrap() to map/allocate a chunk of contiguous disk blocks. 4674 * 4675 * Journal buffers for data blocks are not included here, as DIO 4676 * and fallocate do not need to journal data buffers. 4677 */ 4678 int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks) 4679 { 4680 return ext4_meta_trans_blocks(inode, nrblocks, 1); 4681 } 4682 4683 /* 4684 * The caller must have previously called ext4_reserve_inode_write(). 4685 * Given this, we know that the caller already has write access to iloc->bh. 4686 */ 4687 int ext4_mark_iloc_dirty(handle_t *handle, 4688 struct inode *inode, struct ext4_iloc *iloc) 4689 { 4690 int err = 0; 4691 4692 if (test_opt(inode->i_sb, I_VERSION)) 4693 inode_inc_iversion(inode); 4694 4695 /* ext4_do_update_inode() consumes one bh->b_count */ 4696 get_bh(iloc->bh); 4697 4698 /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */ 4699 err = ext4_do_update_inode(handle, inode, iloc); 4700 put_bh(iloc->bh); 4701 return err; 4702 } 4703 4704 /* 4705 * On success, we end up with an outstanding reference count against 4706 * iloc->bh. This _must_ be cleaned up later. 4707 */ 4708 4709 int 4710 ext4_reserve_inode_write(handle_t *handle, struct inode *inode, 4711 struct ext4_iloc *iloc) 4712 { 4713 int err = 0; 4714 if (handle) { 4715 err = ext4_get_inode_loc(inode, iloc); 4716 if (!err) { 4717 BUFFER_TRACE(iloc->bh, "get_write_access"); 4718 err = ext4_journal_get_write_access(handle, iloc->bh); 4719 if (err) { 4720 brelse(iloc->bh); 4721 iloc->bh = NULL; 4722 } 4723 } 4724 } 4725 ext4_std_error(inode->i_sb, err); 4726 return err; 4727 } 4728 4729 /* 4730 * Expand an inode by new_extra_isize bytes. 4731 * Returns 0 on success or negative error number on failure. 4732 */ 4733 static int ext4_expand_extra_isize(struct inode *inode, 4734 unsigned int new_extra_isize, 4735 struct ext4_iloc iloc, 4736 handle_t *handle) 4737 { 4738 struct ext4_inode *raw_inode; 4739 struct ext4_xattr_ibody_header *header; 4740 struct ext4_xattr_entry *entry; 4741 4742 if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) 4743 return 0; 4744 4745 raw_inode = ext4_raw_inode(&iloc); 4746 4747 header = IHDR(inode, raw_inode); 4748 entry = IFIRST(header); 4749 4750 /* No extended attributes present */ 4751 if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR) || 4752 header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { 4753 memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0, 4754 new_extra_isize); 4755 EXT4_I(inode)->i_extra_isize = new_extra_isize; 4756 return 0; 4757 } 4758 4759 /* try to expand with EAs present */ 4760 return ext4_expand_extra_isize_ea(inode, new_extra_isize, 4761 raw_inode, handle); 4762 } 4763 4764 /* 4765 * What we do here is to mark the in-core inode as clean with respect to inode 4766 * dirtiness (it may still be data-dirty). 4767 * This means that the in-core inode may be reaped by prune_icache 4768 * without having to perform any I/O.
This is a very good thing, 4769 * because *any* task may call prune_icache - even ones which 4770 * have a transaction open against a different journal. 4771 * 4772 * Is this cheating? Not really. Sure, we haven't written the 4773 * inode out, but prune_icache isn't a user-visible syncing function. 4774 * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync) 4775 * we start and wait on commits. 4776 * 4777 * Is this efficient/effective? Well, we're being nice to the system 4778 * by cleaning up our inodes proactively so they can be reaped 4779 * without I/O. But we are potentially leaving up to five seconds' 4780 * worth of inodes floating about which prune_icache wants us to 4781 * write out. One way to fix that would be to get prune_icache() 4782 * to do a write_super() to free up some memory. It has the desired 4783 * effect. 4784 */ 4785 int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) 4786 { 4787 struct ext4_iloc iloc; 4788 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 4789 static unsigned int mnt_count; 4790 int err, ret; 4791 4792 might_sleep(); 4793 err = ext4_reserve_inode_write(handle, inode, &iloc); 4794 if (EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && 4795 !(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) { 4796 /* 4797 * We need extra buffer credits since we may write into EA block 4798 * with this same handle. If journal_extend fails, then it will 4799 * only result in a minor loss of functionality for that inode. 4800 * If this is felt to be critical, then e2fsck should be run to 4801 * force a large enough s_min_extra_isize. 4802 */ 4803 if ((jbd2_journal_extend(handle, 4804 EXT4_DATA_TRANS_BLOCKS(inode->i_sb))) == 0) { 4805 ret = ext4_expand_extra_isize(inode, 4806 sbi->s_want_extra_isize, 4807 iloc, handle); 4808 if (ret) { 4809 EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND; 4810 if (mnt_count != 4811 le16_to_cpu(sbi->s_es->s_mnt_count)) { 4812 ext4_warning(inode->i_sb, __func__, 4813 "Unable to expand inode %lu. Delete" 4814 " some EAs or run e2fsck.", 4815 inode->i_ino); 4816 mnt_count = 4817 le16_to_cpu(sbi->s_es->s_mnt_count); 4818 } 4819 } 4820 } 4821 } 4822 if (!err) 4823 err = ext4_mark_iloc_dirty(handle, inode, &iloc); 4824 return err; 4825 } 4826 4827 /* 4828 * ext4_dirty_inode() is called from __mark_inode_dirty() 4829 * 4830 * We're really interested in the case where a file is being extended. 4831 * i_size has been changed by generic_commit_write() and we thus need 4832 * to include the updated inode in the current transaction. 4833 * 4834 * Also, DQUOT_ALLOC_SPACE() will always dirty the inode when blocks 4835 * are allocated to the file. 4836 * 4837 * If the inode is marked synchronous, we don't honour that here - doing 4838 * so would cause a commit on atime updates, which we don't bother doing. 4839 * We handle synchronous inodes at the highest possible level. 4840 */ 4841 void ext4_dirty_inode(struct inode *inode) 4842 { 4843 handle_t *current_handle = ext4_journal_current_handle(); 4844 handle_t *handle; 4845 4846 handle = ext4_journal_start(inode, 2); 4847 if (IS_ERR(handle)) 4848 goto out; 4849 if (current_handle && 4850 current_handle->h_transaction != handle->h_transaction) { 4851 /* This task has a transaction open against a different fs */ 4852 printk(KERN_EMERG "%s: transactions do not match!\n", 4853 __func__); 4854 } else { 4855 jbd_debug(5, "marking dirty. 
outer handle=%p\n", 4856 current_handle); 4857 ext4_mark_inode_dirty(handle, inode); 4858 } 4859 ext4_journal_stop(handle); 4860 out: 4861 return; 4862 } 4863 4864 #if 0 4865 /* 4866 * Bind an inode's backing buffer_head into this transaction, to prevent 4867 * it from being flushed to disk early. Unlike 4868 * ext4_reserve_inode_write, this leaves behind no bh reference and 4869 * returns no iloc structure, so the caller needs to repeat the iloc 4870 * lookup to mark the inode dirty later. 4871 */ 4872 static int ext4_pin_inode(handle_t *handle, struct inode *inode) 4873 { 4874 struct ext4_iloc iloc; 4875 4876 int err = 0; 4877 if (handle) { 4878 err = ext4_get_inode_loc(inode, &iloc); 4879 if (!err) { 4880 BUFFER_TRACE(iloc.bh, "get_write_access"); 4881 err = jbd2_journal_get_write_access(handle, iloc.bh); 4882 if (!err) 4883 err = ext4_journal_dirty_metadata(handle, 4884 iloc.bh); 4885 brelse(iloc.bh); 4886 } 4887 } 4888 ext4_std_error(inode->i_sb, err); 4889 return err; 4890 } 4891 #endif 4892 4893 int ext4_change_inode_journal_flag(struct inode *inode, int val) 4894 { 4895 journal_t *journal; 4896 handle_t *handle; 4897 int err; 4898 4899 /* 4900 * We have to be very careful here: changing a data block's 4901 * journaling status dynamically is dangerous. If we write a 4902 * data block to the journal, change the status and then delete 4903 * that block, we risk forgetting to revoke the old log record 4904 * from the journal and so a subsequent replay can corrupt data. 4905 * So, first we make sure that the journal is empty and that 4906 * nobody is changing anything. 4907 */ 4908 4909 journal = EXT4_JOURNAL(inode); 4910 if (is_journal_aborted(journal)) 4911 return -EROFS; 4912 4913 jbd2_journal_lock_updates(journal); 4914 jbd2_journal_flush(journal); 4915 4916 /* 4917 * OK, there are no updates running now, and all cached data is 4918 * synced to disk. We are now in a completely consistent state 4919 * which doesn't have anything in the journal, and we know that 4920 * no filesystem updates are running, so it is safe to modify 4921 * the inode's in-core data-journaling state flag now. 4922 */ 4923 4924 if (val) 4925 EXT4_I(inode)->i_flags |= EXT4_JOURNAL_DATA_FL; 4926 else 4927 EXT4_I(inode)->i_flags &= ~EXT4_JOURNAL_DATA_FL; 4928 ext4_set_aops(inode); 4929 4930 jbd2_journal_unlock_updates(journal); 4931 4932 /* Finally we can mark the inode as dirty. */ 4933 4934 handle = ext4_journal_start(inode, 1); 4935 if (IS_ERR(handle)) 4936 return PTR_ERR(handle); 4937 4938 err = ext4_mark_inode_dirty(handle, inode); 4939 handle->h_sync = 1; 4940 ext4_journal_stop(handle); 4941 ext4_std_error(inode->i_sb, err); 4942 4943 return err; 4944 } 4945 4946 static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh) 4947 { 4948 return !buffer_mapped(bh); 4949 } 4950 4951 int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page) 4952 { 4953 loff_t size; 4954 unsigned long len; 4955 int ret = -EINVAL; 4956 void *fsdata; 4957 struct file *file = vma->vm_file; 4958 struct inode *inode = file->f_path.dentry->d_inode; 4959 struct address_space *mapping = inode->i_mapping; 4960 4961 /* 4962 * Get i_alloc_sem to stop truncates messing with the inode. We cannot 4963 * get i_mutex because we are already holding mmap_sem. 4964 */ 4965 down_read(&inode->i_alloc_sem); 4966 size = i_size_read(inode); 4967 if (page->mapping != mapping || size <= page_offset(page) 4968 || !PageUptodate(page)) { 4969 /* page got truncated from under us? 
*/ 4970 goto out_unlock; 4971 } 4972 ret = 0; 4973 if (PageMappedToDisk(page)) 4974 goto out_unlock; 4975 4976 if (page->index == size >> PAGE_CACHE_SHIFT) 4977 len = size & ~PAGE_CACHE_MASK; 4978 else 4979 len = PAGE_CACHE_SIZE; 4980 4981 if (page_has_buffers(page)) { 4982 /* return if we have all the buffers mapped */ 4983 if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, 4984 ext4_bh_unmapped)) 4985 goto out_unlock; 4986 } 4987 /* 4988 * OK, we need to fill the hole... Do write_begin/write_end 4989 * to do the block allocation/reservation. We are not holding 4990 * inode->i_mutex here, which allows parallel write_begin and 4991 * write_end calls; lock_page prevents this from happening 4992 * on the same page, though. 4993 */ 4994 ret = mapping->a_ops->write_begin(file, mapping, page_offset(page), 4995 len, AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata); 4996 if (ret < 0) 4997 goto out_unlock; 4998 ret = mapping->a_ops->write_end(file, mapping, page_offset(page), 4999 len, len, page, fsdata); 5000 if (ret < 0) 5001 goto out_unlock; 5002 ret = 0; 5003 out_unlock: 5004 up_read(&inode->i_alloc_sem); 5005 return ret; 5006 } 5007
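#if 0
/*
 * Illustrative sketch only -- not part of the ext4 code above.  It shows
 * how the credit estimate from ext4_writepage_trans_blocks() would
 * typically be combined with a journal handle; the function name and the
 * elided "dirty the buffers" step are hypothetical.
 */
static int ext4_example_page_update(struct inode *inode)
{
	handle_t *handle;
	int err;

	/* Reserve enough credits for one page worth of modifications. */
	handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	/* ... dirty metadata/data buffers against the handle here ... */

	/* Record the inode changes in the same transaction. */
	err = ext4_mark_inode_dirty(handle, inode);
	ext4_journal_stop(handle);
	return err;
}
#endif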